email_crawler 0.0.5 → 0.0.6

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: de057cc74d307b12298c221ec86e0c6e646f3bc1
- data.tar.gz: 1b7be472da61eec61943e249486d8ecc5cd01c56
+ metadata.gz: a260b02f463c94ff01de5957eb1ec7ea95e8e150
+ data.tar.gz: aac88c2198ed95902045c7ee2fc874bf6d5b65f4
  SHA512:
- metadata.gz: 4077de2915db17beaa66786fc9ef978ea6bab0ea86076a644dd2f3d6bd16f12e63df12238c5cd91a62116c7327617eeccd172dd54201457c4b874da32ede00af
- data.tar.gz: 81996e8daad656dafde6899ceb33f55680ce7dacb4e276f3cb8f96a0f11e3cacb973ab199f2d36fe02569c90dfef2f09007ce138d5688e602b0f1e8458e17fda
+ metadata.gz: 00f1003d1e385527d0bcceae8fdddd042e06b074f4f1d3447b1f79506fcc173aa8d7ed750a5698738a5c59a28c0e8306e4f9ea4859bc318c5697f36d6ce0b3c1
+ data.tar.gz: c53d9f2e6cede921ec98c1ce42986d628e4394124514cca1aaf0b0a2fc174d966e20abed152368545714b4e11da724e12b294f491b10c48b8d1fa5dd1d930e21
data/README.md CHANGED
@@ -10,37 +10,43 @@ Email crawler: crawls the top ten Google search results looking for email addres
 
  ## Usage
 
- 1. Ask for help
+ * Ask for help
 
  ```bash
  email-crawler --help
  ```
 
- 2. Simplest Google search
+ * Simplest Google search
 
  ```bash
  email-crawler --query "berlin walks"
  ```
 
- 3. Select which Google website to use (defaults to google.com.br)
+ * Select which Google website to use (defaults to google.com.br)
 
  ```bash
  email-crawler --query "berlin walks" --google-website google.de
  ```
 
- 4. Specify how many search results URLs to collect (defaults to 100)
+ * Specify how many search results URLs to collect (defaults to 100)
 
  ```bash
  email-crawler --query "berlin walks" --max-results 250
  ```
 
- 5. Specify how many internal links are to be scanned for email addresses (defaults to 100)
+ * Specify how many internal links are to be scanned for email addresses (defaults to 100)
 
  ```bash
  email-crawler --query "berlin walks" --max-links 250
  ```
 
- 6. Redirect output to a file
+ * Specify how many threads to use when searching for links and email addresses (defaults to 10)
+
+ ```bash
+ email-crawler --query "berlin walks" --concurrency 25
+ ```
+
+ * Redirect output to a file
 
  ```bash
  email-crawler --query "berlin walks" > ~/Desktop/belin-walks-emails.csv
data/bin/email-crawler CHANGED
@@ -11,6 +11,7 @@ class OptionsParser
  options.google_website = "google.com.br"
  options.max_results = ::EmailCrawler::Scraper::MAX_RESULTS
  options.max_links = ::EmailCrawler::PageLinks::MAX_LINKS
+ options.max_concurrency = ::EmailCrawler::Runner::MAX_CONCURRENCY
 
  opt_parser = OptionParser.new do |opts|
  opts.banner = "Usage: email-crawler [options]"
@@ -38,6 +39,12 @@ class OptionsParser
  " (per search result, defaults to 100)") do |max_links|
  options.max_links = max_links.to_i
  end
+
+ opts.on("-c", "--concurrency 25",
+ "Max # of threads to use to look for links and email addresses",
+ " (defaults to 10)") do |max_concurrency|
+ options.max_concurrency = max_concurrency.to_i
+ end
  end
 
  opt_parser.parse!(args)
@@ -50,8 +57,11 @@ if options.q.empty?
  print "The -q switch is mandatory\n"
  exit(1)
  else
- csv = EmailCrawler::Runner.new(options.google_website).run(options.q,
- options.max_results,
- options.max_links)
+ runner = EmailCrawler::Runner.new(options.google_website) do |runner|
+ runner.max_results = options.max_results
+ runner.max_links = options.max_links
+ runner.max_concurrency = options.max_concurrency
+ end
+ csv = runner.run(options.q)
  $stdout << "#{csv}\n"
  end
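
As the hunk above shows, the executable now configures `EmailCrawler::Runner` through a block instead of passing limits positionally to `run`. A minimal usage sketch of that API, with illustrative values rather than the gem's defaults:

```ruby
# Usage sketch of the 0.0.6 Runner API shown in the diff above.
# The limits below are illustrative values, not the gem's defaults.
require "email_crawler"

runner = EmailCrawler::Runner.new("google.com") do |r|
  r.max_results     = 50  # how many search-result URLs to collect
  r.max_links       = 50  # internal links scanned per result
  r.max_concurrency = 5   # worker threads for link/email collection
end

csv = runner.run("berlin walks")  # returns CSV text (Email, Domain, URL)
puts csv
```
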
@@ -19,6 +19,7 @@ Gem::Specification.new do |spec|
 
  spec.add_runtime_dependency "mechanize"
  spec.add_runtime_dependency "dotenv"
+ spec.add_runtime_dependency "thread_safe"
 
  spec.add_development_dependency "bundler", "~> 1.5"
  spec.add_development_dependency "rake"
@@ -4,6 +4,7 @@ module EmailCrawler
  class EmailScanner
  EMAIL_REGEXP = /\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,4}\b/i
  SLEEP_TIME = 0.5
+ UTF_8 = "UTF-8".freeze
 
  def initialize(url)
  @url = url
@@ -17,6 +18,7 @@ module EmailCrawler
 
  links.each do |link|
  @logger.info "searching for emails on '#{link}'.."
+ retried = false
 
  html = begin
  open(link).read
@@ -31,7 +33,18 @@ module EmailCrawler
  end
  next unless html
 
- emails = html.scan(EMAIL_REGEXP)
+ begin
+ emails = html.scan(EMAIL_REGEXP)
+ rescue ArgumentError => err
+ if retried
+ emails = []
+ else
+ @logger.warn err.inspect
+ html.encode!(UTF_8, UTF_8, invalid: :replace, undef: :replace, replace: "")
+ retried = true
+ retry
+ end
+ end
  emails_by_link[link] = Set.new(emails) unless emails.empty?
  sleep(SLEEP_TIME)
  end
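
The hunk above wraps the regexp scan so that pages containing invalid byte sequences are cleaned up and scanned once more instead of aborting the crawl. A self-contained sketch of the same rescue-and-retry shape; note it uses `String#scrub!` as the cleanup step for brevity, whereas the diff re-encodes with `encode!`:

```ruby
# Sketch of the rescue/clean/retry pattern: scanning a string whose bytes
# are not valid UTF-8 raises ArgumentError, so we clean it once and retry.
EMAIL_REGEXP = /\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,4}\b/i

html = "contact us at info@example.com \xFF".dup.force_encoding("UTF-8")
retried = false

emails = begin
  html.scan(EMAIL_REGEXP)
rescue ArgumentError => err
  raise if retried    # already cleaned once; give up
  warn err.inspect    # e.g. "invalid byte sequence in UTF-8"
  html.scrub!("")     # drop the invalid bytes
  retried = true
  retry
end

p emails  # => ["info@example.com"]
```
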
@@ -2,6 +2,7 @@ module EmailCrawler
  class PageLinks
  MAX_LINKS = 100
  SLEEP_TIME = 0.5
+ MAX_RETRIES = 5
 
  include MechanizeHelper
 
@@ -25,11 +26,32 @@ module EmailCrawler
 
  def fetch_links(max_links = MAX_LINKS)
  queue, links = Set.new([@url]), Set.new([@url])
+ retries = 0
 
  until queue.empty?
  current_link = queue.first
  @logger.info "current_link: #{current_link}"
- page = get(current_link)
+
+ begin
+ page = get(current_link)
+ rescue Net::HTTP::Persistent::Error => err
+ @logger.warn err.inspect
+ page = nil
+
+ if retries < MAX_RETRIES
+ retries += 1
+ @logger.debug "Retry ##{retries}"
+ agent.shutdown
+ Thread.current[:agent] = nil
+ sleep(SLEEP_TIME)
+ retry
+ else
+ @logger.error "Giving up grabbing link for '#{@url}' after #{retries} retries"
+ break
+ end
+ else
+ retries = 0
+ end
 
  if page
  new_links = page.links_with(href: @domain).map(&:href)
@@ -40,7 +40,8 @@ module EmailCrawler
  end
 
  def agent
- @agent ||= new_agent { |agent| agent.set_proxy(Proxy.random, "8888") }
+ @agent ||= new_agent
+ # @agent ||= new_agent { |agent| agent.set_proxy(Proxy.random, "8888") }
  end
  end
  end
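
The retry logic added to `fetch_links` above caps how often a dropped persistent-HTTP connection is retried before the crawl of that URL is abandoned. A generic, self-contained sketch of that shape; the error class and `fetch_page` helper are illustrative stand-ins, not part of the gem:

```ruby
# Generic sketch of a capped-retry loop: retry a flaky call up to
# MAX_RETRIES times, pausing between attempts, then give up with nil.
MAX_RETRIES = 5
SLEEP_TIME  = 0.5

class FlakyError < StandardError; end

# Illustrative stand-in for `get(current_link)` over a persistent connection.
def fetch_page
  raise FlakyError, "too many connection resets" if rand < 0.5
  "<html>...</html>"
end

retries = 0
page = begin
  fetch_page
rescue FlakyError => err
  warn err.inspect
  if retries < MAX_RETRIES
    retries += 1
    sleep(SLEEP_TIME)   # the gem also resets its Mechanize agent at this point
    retry
  end
  nil                   # give up, as the diff does with `break`
end

puts page ? "fetched" : "gave up after #{retries} retries"
```
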
@@ -1,3 +1,3 @@
  module EmailCrawler
- VERSION = "0.0.5"
+ VERSION = "0.0.6"
  end
data/lib/email_crawler.rb CHANGED
@@ -2,6 +2,7 @@ require "thread"
  require "logger"
  require "csv"
  require "set"
+ require "thread_safe"
 
  require_relative "email_crawler/version"
  require_relative "email_crawler/mechanize_helper"
@@ -11,53 +12,87 @@ require_relative "email_crawler/email_scanner"
 
  module EmailCrawler
  class Runner
+ MAX_CONCURRENCY = 10
+
+ attr_writer :max_results, :max_links, :max_concurrency
+
  def initialize(google_website)
  @google_website = google_website
 
+ # @logger = ::Logger.new(STDOUT).tap do |logger|
  log_file = File.join(ENV["HOME"], "email-crawler.log")
  file = File.open(log_file, File::WRONLY | File::APPEND | File::CREAT)
  @logger = ::Logger.new(file).tap do |logger|
  logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
  end
+
+ yield(self)
+
+ @logger.info "max_results: #{@max_results}"
+ @logger.info "max_links: #{@max_links}"
+ @logger.info "max_concurrency: #{@max_concurrency}"
  end
 
- def run(q, max_results = Scraper::MAX_RESULTS, max_links = PageLinks::MAX_LINKS)
- urls = Scraper.new(@google_website, max_results).search_result_urls_for(q)
- urls.each { |url, links| @logger.info "#{url}" }
+ def run(q)
+ urls = Scraper.new(@google_website, @max_results).search_result_urls_for(q)
+ urls.each { |url| @logger.info "#{url}" }
+ queue = Queue.new
+ urls.each { |url| queue.push(url) }
+ links_by_url = ThreadSafe::Array.new
 
- threads = (1..urls.length).map do |i|
- Thread.new(i, urls[i-1]) do |i, url|
- @logger.info "[Thread ##{i}] grabbing page links for '#{url}'.."
- Thread.current[:url] = url
- Thread.current[:links] = PageLinks.for(url, max_links)
+ threads = (1..[urls.length, @max_concurrency].min).map do |i|
+ Thread.new(i) do |i|
+ url = begin
+ queue.pop(true)
+ rescue ThreadError; end
+
+ while url
+ @logger.info "[Thread ##{i}] grabbing page links for '#{url}'.."
+ links = PageLinks.for(url, @max_links)
+ links_by_url << [url, links]
+
+ url = begin
+ queue.pop(true)
+ rescue ThreadError; end
+ end
  end
  end
-
  threads.each(&:join)
- threads.each { |thread| @logger.info "#{thread[:url]} (#{thread[:links].length} links)" }
- links_by_url = Hash[threads.map { |thread| [thread[:url], thread[:links]] }]
-
- threads = (links_by_url).map.with_index do |arr, i|
- Thread.new(i+1, arr.first, arr.last) do |i, url, links|
- @logger.info "[Thread ##{i}] scanning for emails on page '#{url}' (#{links.length} links)"
- Thread.current[:url] = url
- Thread.current[:emails] = EmailScanner.new(url).scan(links)
+ @logger.debug "links_by_url: #{links_by_url.inspect}"
+
+ links_by_url.each { |arr| queue.push(arr) }
+ emails_by_url = ThreadSafe::Hash.new
+ threads = (1..[links_by_url.length, MAX_CONCURRENCY].min).map do |i|
+ Thread.new(i) do |i|
+ arr = begin
+ queue.pop(true)
+ rescue ThreadError; end
+
+ while arr
+ url, links = arr
+ @logger.info "[Thread ##{i}] scanning for emails on page '#{url}' (#{links.length} links)"
+ emails = EmailScanner.new(url).scan(links)
+ emails_by_url[url] = emails
+
+ arr = begin
+ queue.pop(true)
+ rescue ThreadError; end
+ end
  end
  end
-
  threads.each(&:join)
+ @logger.debug "emails_by_url: #{emails_by_url.inspect}"
 
  read_emails = Set.new
  CSV.generate do |csv|
  csv << %w(Email Domain URL)
  csv << []
 
- threads.each do |thread|
- email_count = thread[:emails].inject(0) { |sum, arr| sum += arr.last.length }
- @logger.info "#{thread[:url]} (#{email_count} emails)"
+ emails_by_url.each do |url, emails_by_link|
+ email_count = emails_by_link.inject(0) { |sum, arr| sum += arr.last.length }
+ @logger.info "#{url} (#{email_count} emails)"
 
- url = thread[:url]
- thread[:emails].each do |link, emails|
+ emails_by_link.each do |link, emails|
  emails.each do |email|
  csv << [email, url, link] if read_emails.add?(email)
  end
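
The rewritten `Runner#run` replaces one-thread-per-URL with a fixed-size pool of workers draining a shared `Queue`, so concurrency is bounded by `max_concurrency` no matter how many URLs the scraper returns. A minimal sketch of that pattern in isolation, where squaring a number stands in for the real per-URL work (`PageLinks.for` / `EmailScanner#scan`):

```ruby
# Minimal sketch of the bounded worker-pool pattern used by Runner#run:
# work items go into a Queue, a capped number of threads drain it with
# non-blocking pops, and results land in a thread-safe collection.
require "thread_safe"

MAX_CONCURRENCY = 10
items = (1..25).to_a

queue = Queue.new
items.each { |item| queue.push(item) }
results = ThreadSafe::Array.new

threads = (1..[items.length, MAX_CONCURRENCY].min).map do |i|
  Thread.new(i) do |i|
    loop do
      item = begin
        queue.pop(true)   # non-blocking; raises ThreadError when empty
      rescue ThreadError
        break             # queue drained, this worker is done
      end
      results << [i, item * item]  # stand-in for the real per-URL work
    end
  end
end
threads.each(&:join)

puts results.length  # => 25
```
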
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: email_crawler
  version: !ruby/object:Gem::Version
- version: 0.0.5
+ version: 0.0.6
  platform: ruby
  authors:
  - Cristian Rasch
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2014-03-07 00:00:00.000000000 Z
+ date: 2014-03-08 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: mechanize
@@ -38,6 +38,20 @@ dependencies:
  - - ">="
  - !ruby/object:Gem::Version
  version: '0'
+ - !ruby/object:Gem::Dependency
+ name: thread_safe
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: '0'
  - !ruby/object:Gem::Dependency
  name: bundler
  requirement: !ruby/object:Gem::Requirement