email_crawler 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +12 -6
- data/bin/email-crawler +13 -3
- data/email_crawler.gemspec +1 -0
- data/lib/email_crawler/email_scanner.rb +14 -1
- data/lib/email_crawler/page_links.rb +23 -1
- data/lib/email_crawler/scraper.rb +2 -1
- data/lib/email_crawler/version.rb +1 -1
- data/lib/email_crawler.rb +58 -23
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a260b02f463c94ff01de5957eb1ec7ea95e8e150
+  data.tar.gz: aac88c2198ed95902045c7ee2fc874bf6d5b65f4
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 00f1003d1e385527d0bcceae8fdddd042e06b074f4f1d3447b1f79506fcc173aa8d7ed750a5698738a5c59a28c0e8306e4f9ea4859bc318c5697f36d6ce0b3c1
+  data.tar.gz: c53d9f2e6cede921ec98c1ce42986d628e4394124514cca1aaf0b0a2fc174d966e20abed152368545714b4e11da724e12b294f491b10c48b8d1fa5dd1d930e21
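These are the SHA1 and SHA512 digests of the two archives inside the packaged gem. As a rough illustration of what a consumer could do with them, here is a minimal Ruby sketch that recomputes and compares the SHA512 digests, assuming the gem has been unpacked so that metadata.gz and data.tar.gz sit next to checksums.yaml (the paths are hypothetical):

```ruby
require "digest"
require "yaml"

# Hypothetical layout: checksums.yaml, metadata.gz and data.tar.gz
# unpacked from the .gem archive into the current directory.
checksums = YAML.load_file("checksums.yaml")

%w(metadata.gz data.tar.gz).each do |name|
  expected = checksums["SHA512"][name]
  actual   = Digest::SHA512.file(name).hexdigest
  puts "#{name}: #{actual == expected ? 'OK' : 'MISMATCH'}"
end
```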
data/README.md
CHANGED
@@ -10,37 +10,43 @@ Email crawler: crawls the top ten Google search results looking for email addresses
 
 ## Usage
 
-
+* Ask for help
 
 ```bash
 email-crawler --help
 ```
 
-
+* Simplest Google search
 
 ```bash
 email-crawler --query "berlin walks"
 ```
 
-
+* Select which Google website to use (defaults to google.com.br)
 
 ```bash
 email-crawler --query "berlin walks" --google-website google.de
 ```
 
-
+* Specify how many search results URLs to collect (defaults to 100)
 
 ```bash
 email-crawler --query "berlin walks" --max-results 250
 ```
 
-
+* Specify how many internal links are to be scanned for email addresses (defaults to 100)
 
 ```bash
 email-crawler --query "berlin walks" --max-links 250
 ```
 
-
+* Specify how many threads to use when searching for links and email addresses (defaults to 10)
+
+```bash
+email-crawler --query "berlin walks" --concurrency 25
+```
+
+* Redirect output to a file
 
 ```bash
 email-crawler --query "berlin walks" > ~/Desktop/belin-walks-emails.csv
data/bin/email-crawler
CHANGED
@@ -11,6 +11,7 @@ class OptionsParser
     options.google_website = "google.com.br"
     options.max_results = ::EmailCrawler::Scraper::MAX_RESULTS
     options.max_links = ::EmailCrawler::PageLinks::MAX_LINKS
+    options.max_concurrency = ::EmailCrawler::Runner::MAX_CONCURRENCY
 
     opt_parser = OptionParser.new do |opts|
       opts.banner = "Usage: email-crawler [options]"
@@ -38,6 +39,12 @@ class OptionsParser
               " (per search result, defaults to 100)") do |max_links|
         options.max_links = max_links.to_i
       end
+
+      opts.on("-c", "--concurrency 25",
+              "Max # of threads to use to look for links and email addresses",
+              " (defaults to 10)") do |max_concurrency|
+        options.max_concurrency = max_concurrency.to_i
+      end
     end
 
     opt_parser.parse!(args)
@@ -50,8 +57,11 @@ if options.q.empty?
   print "The -q switch is mandatory\n"
   exit(1)
 else
-
-
-
+  runner = EmailCrawler::Runner.new(options.google_website) do |runner|
+    runner.max_results = options.max_results
+    runner.max_links = options.max_links
+    runner.max_concurrency = options.max_concurrency
+  end
+  csv = runner.run(options.q)
   $stdout << "#{csv}\n"
 end
data/email_crawler.gemspec
CHANGED
@@ -19,6 +19,7 @@ Gem::Specification.new do |spec|
 
   spec.add_runtime_dependency "mechanize"
   spec.add_runtime_dependency "dotenv"
+  spec.add_runtime_dependency "thread_safe"
 
   spec.add_development_dependency "bundler", "~> 1.5"
   spec.add_development_dependency "rake"
data/lib/email_crawler/email_scanner.rb
CHANGED
@@ -4,6 +4,7 @@ module EmailCrawler
   class EmailScanner
     EMAIL_REGEXP = /\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,4}\b/i
     SLEEP_TIME = 0.5
+    UTF_8 = "UTF-8".freeze
 
     def initialize(url)
       @url = url
@@ -17,6 +18,7 @@ module EmailCrawler
 
       links.each do |link|
         @logger.info "searching for emails on '#{link}'.."
+        retried = false
 
         html = begin
           open(link).read
@@ -31,7 +33,18 @@ module EmailCrawler
         end
         next unless html
 
-
+        begin
+          emails = html.scan(EMAIL_REGEXP)
+        rescue ArgumentError => err
+          if retried
+            emails = []
+          else
+            @logger.warn err.inspect
+            html.encode!(UTF_8, UTF_8, invalid: :replace, undef: :replace, replace: "")
+            retried = true
+            retry
+          end
+        end
         emails_by_link[link] = Set.new(emails) unless emails.empty?
         sleep(SLEEP_TIME)
       end
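The rescue block added above guards `String#scan` against response bodies that are tagged UTF-8 but contain invalid bytes: the first `ArgumentError` triggers a lossy re-encode that strips the bad bytes, then the scan is retried once. A standalone demonstration of the idiom (the sample string is made up):

```ruby
EMAIL_REGEXP = /\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,4}\b/i

html = "contact: jane@example.com \xFF".force_encoding("UTF-8") # invalid UTF-8

begin
  emails = html.scan(EMAIL_REGEXP)
rescue ArgumentError
  # Transcoding UTF-8 -> UTF-8 with :replace drops the invalid bytes,
  # which is what the gem's html.encode! call does.
  html = html.encode("UTF-8", "UTF-8", invalid: :replace, undef: :replace, replace: "")
  retry
end

p emails # => ["jane@example.com"]
```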
data/lib/email_crawler/page_links.rb
CHANGED
@@ -2,6 +2,7 @@ module EmailCrawler
   class PageLinks
     MAX_LINKS = 100
     SLEEP_TIME = 0.5
+    MAX_RETRIES = 5
 
     include MechanizeHelper
 
@@ -25,11 +26,32 @@ module EmailCrawler
 
     def fetch_links(max_links = MAX_LINKS)
       queue, links = Set.new([@url]), Set.new([@url])
+      retries = 0
 
       until queue.empty?
         current_link = queue.first
         @logger.info "current_link: #{current_link}"
-
+
+        begin
+          page = get(current_link)
+        rescue Net::HTTP::Persistent::Error => err
+          @logger.warn err.inspect
+          page = nil
+
+          if retries < MAX_RETRIES
+            retries += 1
+            @logger.debug "Retry ##{retries}"
+            agent.shutdown
+            Thread.current[:agent] = nil
+            sleep(SLEEP_TIME)
+            retry
+          else
+            @logger.error "Giving up grabbing link for '#{@url}' after #{retries} retries"
+            break
+          end
+        else
+          retries = 0
+        end
 
         if page
           new_links = page.links_with(href: @domain).map(&:href)
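The retry logic above uses the full begin/rescue/else form: each failure bumps a counter and retries with a fresh Mechanize agent, while the `else` branch, which Ruby runs only when no exception was raised, resets the counter so the retry budget applies per link rather than per crawl. A generic sketch of the same control flow, with a stand-in fetch function:

```ruby
MAX_RETRIES = 5
SLEEP_TIME = 0.5

# Stand-in for an HTTP call that fails transiently now and then.
def fetch(url)
  raise IOError, "connection reset" if rand < 0.3
  "<html>...</html>"
end

retries = 0
begin
  page = fetch("http://example.com")
rescue IOError => err
  if retries < MAX_RETRIES
    retries += 1
    warn "retry ##{retries}: #{err.message}"
    sleep(SLEEP_TIME)
    retry
  else
    page = nil # give up on this link
  end
else
  retries = 0 # success: reset the retry budget
end
```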
data/lib/email_crawler.rb
CHANGED
@@ -2,6 +2,7 @@ require "thread"
 require "logger"
 require "csv"
 require "set"
+require "thread_safe"
 
 require_relative "email_crawler/version"
 require_relative "email_crawler/mechanize_helper"
@@ -11,53 +12,87 @@ require_relative "email_crawler/email_scanner"
 
 module EmailCrawler
   class Runner
+    MAX_CONCURRENCY = 10
+
+    attr_writer :max_results, :max_links, :max_concurrency
+
     def initialize(google_website)
       @google_website = google_website
 
+      # @logger = ::Logger.new(STDOUT).tap do |logger|
       log_file = File.join(ENV["HOME"], "email-crawler.log")
       file = File.open(log_file, File::WRONLY | File::APPEND | File::CREAT)
       @logger = ::Logger.new(file).tap do |logger|
         logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
       end
+
+      yield(self)
+
+      @logger.info "max_results: #{@max_results}"
+      @logger.info "max_links: #{@max_links}"
+      @logger.info "max_concurrency: #{@max_concurrency}"
     end
 
-    def run(q
-      urls = Scraper.new(@google_website, max_results).search_result_urls_for(q)
-      urls.each { |url
+    def run(q)
+      urls = Scraper.new(@google_website, @max_results).search_result_urls_for(q)
+      urls.each { |url| @logger.info "#{url}" }
+      queue = Queue.new
+      urls.each { |url| queue.push(url) }
+      links_by_url = ThreadSafe::Array.new
 
-      threads = (1..urls.length).map do |i|
-        Thread.new(i
-
-
-
+      threads = (1..[urls.length, @max_concurrency].min).map do |i|
+        Thread.new(i) do |i|
+          url = begin
+            queue.pop(true)
+          rescue ThreadError; end
+
+          while url
+            @logger.info "[Thread ##{i}] grabbing page links for '#{url}'.."
+            links = PageLinks.for(url, @max_links)
+            links_by_url << [url, links]
+
+            url = begin
+              queue.pop(true)
+            rescue ThreadError; end
+          end
         end
       end
-
       threads.each(&:join)
-
-
-
-
-
-
-
-
+      @logger.debug "links_by_url: #{links_by_url.inspect}"
+
+      links_by_url.each { |arr| queue.push(arr) }
+      emails_by_url = ThreadSafe::Hash.new
+      threads = (1..[links_by_url.length, MAX_CONCURRENCY].min).map do |i|
+        Thread.new(i) do |i|
+          arr = begin
+            queue.pop(true)
+          rescue ThreadError; end
+
+          while arr
+            url, links = arr
+            @logger.info "[Thread ##{i}] scanning for emails on page '#{url}' (#{links.length} links)"
+            emails = EmailScanner.new(url).scan(links)
+            emails_by_url[url] = emails
+
+            arr = begin
+              queue.pop(true)
+            rescue ThreadError; end
+          end
        end
      end
-
       threads.each(&:join)
+      @logger.debug "emails_by_url: #{emails_by_url.inspect}"
 
       read_emails = Set.new
       CSV.generate do |csv|
         csv << %w(Email Domain URL)
         csv << []
 
-
-        email_count =
-        @logger.info "#{
+        emails_by_url.each do |url, emails_by_link|
+          email_count = emails_by_link.inject(0) { |sum, arr| sum += arr.last.length }
+          @logger.info "#{url} (#{email_count} emails)"
 
-
-          thread[:emails].each do |link, emails|
+          emails_by_link.each do |link, emails|
            emails.each do |email|
              csv << [email, url, link] if read_emails.add?(email)
            end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: email_crawler
 version: !ruby/object:Gem::Version
-  version: 0.0.5
+  version: 0.0.6
 platform: ruby
 authors:
 - Cristian Rasch
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-03-
+date: 2014-03-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
@@ -38,6 +38,20 @@ dependencies:
   - - ">="
   - !ruby/object:Gem::Version
     version: '0'
+- !ruby/object:Gem::Dependency
+  name: thread_safe
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement