email_crawler 0.0.5 → 0.0.6
- checksums.yaml +4 -4
- data/README.md +12 -6
- data/bin/email-crawler +13 -3
- data/email_crawler.gemspec +1 -0
- data/lib/email_crawler/email_scanner.rb +14 -1
- data/lib/email_crawler/page_links.rb +23 -1
- data/lib/email_crawler/scraper.rb +2 -1
- data/lib/email_crawler/version.rb +1 -1
- data/lib/email_crawler.rb +58 -23
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a260b02f463c94ff01de5957eb1ec7ea95e8e150
+  data.tar.gz: aac88c2198ed95902045c7ee2fc874bf6d5b65f4
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 00f1003d1e385527d0bcceae8fdddd042e06b074f4f1d3447b1f79506fcc173aa8d7ed750a5698738a5c59a28c0e8306e4f9ea4859bc318c5697f36d6ce0b3c1
+  data.tar.gz: c53d9f2e6cede921ec98c1ce42986d628e4394124514cca1aaf0b0a2fc174d966e20abed152368545714b4e11da724e12b294f491b10c48b8d1fa5dd1d930e21
data/README.md
CHANGED
@@ -10,37 +10,43 @@ Email crawler: crawls the top ten Google search results looking for email addresses

 ## Usage

-
+* Ask for help

 ```bash
 email-crawler --help
 ```

-
+* Simplest Google search

 ```bash
 email-crawler --query "berlin walks"
 ```

-
+* Select which Google website to use (defaults to google.com.br)

 ```bash
 email-crawler --query "berlin walks" --google-website google.de
 ```

-
+* Specify how many search results URLs to collect (defaults to 100)

 ```bash
 email-crawler --query "berlin walks" --max-results 250
 ```

-
+* Specify how many internal links are to be scanned for email addresses (defaults to 100)

 ```bash
 email-crawler --query "berlin walks" --max-links 250
 ```

-
+* Specify how many threads to use when searching for links and email addresses (defaults to 10)
+
+```bash
+email-crawler --query "berlin walks" --concurrency 25
+```
+
+* Redirect output to a file

 ```bash
 email-crawler --query "berlin walks" > ~/Desktop/belin-walks-emails.csv
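The same options can also be set from Ruby through the Runner API this release introduces (see the bin/ and lib/ diffs below). A minimal programmatic sketch, assuming the gem is installed; the output path is only an example:

```ruby
require "email_crawler"

# Mirrors: email-crawler --query "berlin walks" --google-website google.de \
#          --max-results 250 --max-links 250 --concurrency 25
runner = EmailCrawler::Runner.new("google.de") do |r|
  r.max_results     = 250
  r.max_links       = 250
  r.max_concurrency = 25
end

csv = runner.run("berlin walks")
File.write(File.expand_path("~/Desktop/berlin-walks-emails.csv"), csv)
```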
data/bin/email-crawler
CHANGED
@@ -11,6 +11,7 @@ class OptionsParser
     options.google_website = "google.com.br"
     options.max_results = ::EmailCrawler::Scraper::MAX_RESULTS
     options.max_links = ::EmailCrawler::PageLinks::MAX_LINKS
+    options.max_concurrency = ::EmailCrawler::Runner::MAX_CONCURRENCY

     opt_parser = OptionParser.new do |opts|
       opts.banner = "Usage: email-crawler [options]"
@@ -38,6 +39,12 @@ class OptionsParser
               " (per search result, defaults to 100)") do |max_links|
         options.max_links = max_links.to_i
       end
+
+      opts.on("-c", "--concurrency 25",
+              "Max # of threads to use to look for links and email addresses",
+              " (defaults to 10)") do |max_concurrency|
+        options.max_concurrency = max_concurrency.to_i
+      end
     end

     opt_parser.parse!(args)
@@ -50,8 +57,11 @@ if options.q.empty?
   print "The -q switch is mandatory\n"
   exit(1)
 else
-
-
-
+  runner = EmailCrawler::Runner.new(options.google_website) do |runner|
+    runner.max_results = options.max_results
+    runner.max_links = options.max_links
+    runner.max_concurrency = options.max_concurrency
+  end
+  csv = runner.run(options.q)
   $stdout << "#{csv}\n"
 end
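The new switch is plain OptionParser usage: in `opts.on("-c", "--concurrency 25", ...)` the literal `25` is only the argument placeholder shown by `--help`, and the block receives the raw string, which the script converts with `to_i`. A self-contained sketch of that pattern (the example script and option struct are illustrative, not part of the gem):

```ruby
require "optparse"
require "ostruct"

options = OpenStruct.new(max_concurrency: 10) # default mirrors Runner::MAX_CONCURRENCY

OptionParser.new do |opts|
  opts.banner = "Usage: example [options]"

  # "25" here is only the placeholder printed by --help; any integer is accepted.
  opts.on("-c", "--concurrency 25", "Max # of threads (defaults to 10)") do |value|
    options.max_concurrency = value.to_i
  end
end.parse!(ARGV)

puts options.max_concurrency # `ruby example.rb -c 25` prints 25
```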
data/email_crawler.gemspec
CHANGED
@@ -19,6 +19,7 @@ Gem::Specification.new do |spec|

   spec.add_runtime_dependency "mechanize"
   spec.add_runtime_dependency "dotenv"
+  spec.add_runtime_dependency "thread_safe"

   spec.add_development_dependency "bundler", "~> 1.5"
   spec.add_development_dependency "rake"
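The new thread_safe runtime dependency provides ThreadSafe::Array and ThreadSafe::Hash, the synchronized collections the reworked Runner (below) uses to collect results from its worker threads. A minimal sketch of those two classes, independent of the gem's code:

```ruby
require "thread_safe"

links_by_url  = ThreadSafe::Array.new
emails_by_url = ThreadSafe::Hash.new

threads = 3.times.map do |i|
  Thread.new do
    # Both collections are meant to be mutated from several threads at once.
    links_by_url << ["http://example.com/page#{i}", ["link-a", "link-b"]]
    emails_by_url["http://example.com/page#{i}"] = ["user#{i}@example.com"]
  end
end
threads.each(&:join)

puts links_by_url.length       # => 3
puts emails_by_url.keys.length # => 3
```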
data/lib/email_crawler/email_scanner.rb
CHANGED
@@ -4,6 +4,7 @@ module EmailCrawler
   class EmailScanner
     EMAIL_REGEXP = /\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,4}\b/i
     SLEEP_TIME = 0.5
+    UTF_8 = "UTF-8".freeze

     def initialize(url)
       @url = url
@@ -17,6 +18,7 @@ module EmailCrawler

       links.each do |link|
         @logger.info "searching for emails on '#{link}'.."
+        retried = false

         html = begin
           open(link).read
@@ -31,7 +33,18 @@ module EmailCrawler
         end
         next unless html

-
+        begin
+          emails = html.scan(EMAIL_REGEXP)
+        rescue ArgumentError => err
+          if retried
+            emails = []
+          else
+            @logger.warn err.inspect
+            html.encode!(UTF_8, UTF_8, invalid: :replace, undef: :replace, replace: "")
+            retried = true
+            retry
+          end
+        end
         emails_by_link[link] = Set.new(emails) unless emails.empty?
         sleep(SLEEP_TIME)
       end
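The rescue added here covers the ArgumentError ("invalid byte sequence in UTF-8") that String#scan raises when a fetched page contains bytes that are not valid UTF-8; the encode! call strips those bytes so the retried scan can succeed, and the retried flag guarantees at most one extra attempt. On Ruby 2.1+, String#scrub does the same cleanup more directly; a small sketch using it in place of the gem's encode!/retry (the sample string is made up):

```ruby
html = "contact us at info@example.com \xFF" # \xFF is not a valid UTF-8 byte
email_regexp = /\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,4}\b/i

emails = begin
  html.scan(email_regexp)
rescue ArgumentError                  # invalid byte sequence in UTF-8
  html.scrub("").scan(email_regexp)   # drop the bad bytes, then scan once more
end

p emails # => ["info@example.com"]
```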
data/lib/email_crawler/page_links.rb
CHANGED
@@ -2,6 +2,7 @@ module EmailCrawler
   class PageLinks
     MAX_LINKS = 100
     SLEEP_TIME = 0.5
+    MAX_RETRIES = 5

     include MechanizeHelper

@@ -25,11 +26,32 @@ module EmailCrawler

     def fetch_links(max_links = MAX_LINKS)
       queue, links = Set.new([@url]), Set.new([@url])
+      retries = 0

       until queue.empty?
         current_link = queue.first
         @logger.info "current_link: #{current_link}"
-
+
+        begin
+          page = get(current_link)
+        rescue Net::HTTP::Persistent::Error => err
+          @logger.warn err.inspect
+          page = nil
+
+          if retries < MAX_RETRIES
+            retries += 1
+            @logger.debug "Retry ##{retries}"
+            agent.shutdown
+            Thread.current[:agent] = nil
+            sleep(SLEEP_TIME)
+            retry
+          else
+            @logger.error "Giving up grabbing link for '#{@url}' after #{retries} retries"
+            break
+          end
+        else
+          retries = 0
+        end

         if page
           new_links = page.links_with(href: @domain).map(&:href)
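The begin/rescue/else wrapped around get(current_link) is a capped-retry idiom: the rescue branch counts consecutive failures and retries the same link up to MAX_RETRIES times (resetting the Mechanize agent between attempts), while the else branch, which runs only when no exception was raised, resets the counter. A stripped-down sketch of that control flow, with a stand-in fetch instead of Mechanize:

```ruby
MAX_RETRIES = 5
SLEEP_TIME  = 0.5

# Stand-in for the Mechanize `get` call; pretend it can raise on flaky connections.
def fetch(url)
  "<html>...</html>"
end

url     = "http://example.com/"
retries = 0

begin
  page = fetch(url)
rescue StandardError
  page = nil
  if retries < MAX_RETRIES
    retries += 1        # count the failure and retry the same URL
    sleep(SLEEP_TIME)
    retry               # jumps back to the begin block
  end                   # otherwise fall through and give up on this URL
else
  retries = 0           # no exception was raised: reset the counter
end

puts page ? "fetched #{url}" : "gave up on #{url}"
```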
data/lib/email_crawler.rb
CHANGED
@@ -2,6 +2,7 @@ require "thread"
 require "logger"
 require "csv"
 require "set"
+require "thread_safe"

 require_relative "email_crawler/version"
 require_relative "email_crawler/mechanize_helper"
@@ -11,53 +12,87 @@ require_relative "email_crawler/email_scanner"

 module EmailCrawler
   class Runner
+    MAX_CONCURRENCY = 10
+
+    attr_writer :max_results, :max_links, :max_concurrency
+
     def initialize(google_website)
       @google_website = google_website

+      # @logger = ::Logger.new(STDOUT).tap do |logger|
       log_file = File.join(ENV["HOME"], "email-crawler.log")
       file = File.open(log_file, File::WRONLY | File::APPEND | File::CREAT)
       @logger = ::Logger.new(file).tap do |logger|
         logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
       end
+
+      yield(self)
+
+      @logger.info "max_results: #{@max_results}"
+      @logger.info "max_links: #{@max_links}"
+      @logger.info "max_concurrency: #{@max_concurrency}"
     end

-    def run(q
-      urls = Scraper.new(@google_website, max_results).search_result_urls_for(q)
-      urls.each { |url
+    def run(q)
+      urls = Scraper.new(@google_website, @max_results).search_result_urls_for(q)
+      urls.each { |url| @logger.info "#{url}" }
+      queue = Queue.new
+      urls.each { |url| queue.push(url) }
+      links_by_url = ThreadSafe::Array.new

-      threads = (1..urls.length).map do |i|
-        Thread.new(i
-
-
-
+      threads = (1..[urls.length, @max_concurrency].min).map do |i|
+        Thread.new(i) do |i|
+          url = begin
+            queue.pop(true)
+          rescue ThreadError; end
+
+          while url
+            @logger.info "[Thread ##{i}] grabbing page links for '#{url}'.."
+            links = PageLinks.for(url, @max_links)
+            links_by_url << [url, links]
+
+            url = begin
+              queue.pop(true)
+            rescue ThreadError; end
+          end
         end
       end
-
       threads.each(&:join)
-
-
-
-
-
-
-
-
+      @logger.debug "links_by_url: #{links_by_url.inspect}"
+
+      links_by_url.each { |arr| queue.push(arr) }
+      emails_by_url = ThreadSafe::Hash.new
+      threads = (1..[links_by_url.length, MAX_CONCURRENCY].min).map do |i|
+        Thread.new(i) do |i|
+          arr = begin
+            queue.pop(true)
+          rescue ThreadError; end
+
+          while arr
+            url, links = arr
+            @logger.info "[Thread ##{i}] scanning for emails on page '#{url}' (#{links.length} links)"
+            emails = EmailScanner.new(url).scan(links)
+            emails_by_url[url] = emails
+
+            arr = begin
+              queue.pop(true)
+            rescue ThreadError; end
+          end
         end
       end
-
       threads.each(&:join)
+      @logger.debug "emails_by_url: #{emails_by_url.inspect}"

       read_emails = Set.new
       CSV.generate do |csv|
         csv << %w(Email Domain URL)
         csv << []

-
-        email_count =
-        @logger.info "#{
+        emails_by_url.each do |url, emails_by_link|
+          email_count = emails_by_link.inject(0) { |sum, arr| sum += arr.last.length }
+          @logger.info "#{url} (#{email_count} emails)"

-
-          thread[:emails].each do |link, emails|
+          emails_by_link.each do |link, emails|
             emails.each do |email|
               csv << [email, url, link] if read_emails.add?(email)
             end
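Both passes in the rewritten run method use the same worker-pool shape: push the work items onto a Queue, start at most max_concurrency threads, and have each worker call queue.pop(true) (non-blocking) until it raises ThreadError, which signals the queue is empty. The real code stores results in the ThreadSafe collections; the sketch below stays dependency-free by collecting results in a second Queue, which is itself thread-safe:

```ruby
require "thread"

items           = (1..25).map { |n| "https://example.com/page#{n}" }
max_concurrency = 10

queue = Queue.new
items.each { |item| queue.push(item) }
results = Queue.new

threads = (1..[items.length, max_concurrency].min).map do
  Thread.new do
    loop do
      item = begin
        queue.pop(true)   # non-blocking pop; raises ThreadError when empty
      rescue ThreadError
        break             # queue drained: this worker is done
      end
      results << [item, item.length] # placeholder for the real per-item work
    end
  end
end
threads.each(&:join)

puts results.size # => 25
```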
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: email_crawler
 version: !ruby/object:Gem::Version
-  version: 0.0.5
+  version: 0.0.6
 platform: ruby
 authors:
 - Cristian Rasch
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-03-
+date: 2014-03-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
@@ -38,6 +38,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: thread_safe
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement