email_crawler 0.0.5 → 0.0.6

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: de057cc74d307b12298c221ec86e0c6e646f3bc1
-  data.tar.gz: 1b7be472da61eec61943e249486d8ecc5cd01c56
+  metadata.gz: a260b02f463c94ff01de5957eb1ec7ea95e8e150
+  data.tar.gz: aac88c2198ed95902045c7ee2fc874bf6d5b65f4
 SHA512:
-  metadata.gz: 4077de2915db17beaa66786fc9ef978ea6bab0ea86076a644dd2f3d6bd16f12e63df12238c5cd91a62116c7327617eeccd172dd54201457c4b874da32ede00af
-  data.tar.gz: 81996e8daad656dafde6899ceb33f55680ce7dacb4e276f3cb8f96a0f11e3cacb973ab199f2d36fe02569c90dfef2f09007ce138d5688e602b0f1e8458e17fda
+  metadata.gz: 00f1003d1e385527d0bcceae8fdddd042e06b074f4f1d3447b1f79506fcc173aa8d7ed750a5698738a5c59a28c0e8306e4f9ea4859bc318c5697f36d6ce0b3c1
+  data.tar.gz: c53d9f2e6cede921ec98c1ce42986d628e4394124514cca1aaf0b0a2fc174d966e20abed152368545714b4e11da724e12b294f491b10c48b8d1fa5dd1d930e21
data/README.md CHANGED
@@ -10,37 +10,43 @@ Email crawler: crawls the top ten Google search results looking for email addres
 
 ## Usage
 
-1. Ask for help
+* Ask for help
 
 ```bash
 email-crawler --help
 ```
 
-2. Simplest Google search
+* Simplest Google search
 
 ```bash
 email-crawler --query "berlin walks"
 ```
 
-3. Select which Google website to use (defaults to google.com.br)
+* Select which Google website to use (defaults to google.com.br)
 
 ```bash
 email-crawler --query "berlin walks" --google-website google.de
 ```
 
-4. Specify how many search results URLs to collect (defaults to 100)
+* Specify how many search results URLs to collect (defaults to 100)
 
 ```bash
 email-crawler --query "berlin walks" --max-results 250
 ```
 
-5. Specify how many internal links are to be scanned for email addresses (defaults to 100)
+* Specify how many internal links are to be scanned for email addresses (defaults to 100)
 
 ```bash
 email-crawler --query "berlin walks" --max-links 250
 ```
 
-6. Redirect output to a file
+* Specify how many threads to use when searching for links and email addresses (defaults to 10)
+
+```bash
+email-crawler --query "berlin walks" --concurrency 25
+```
+
+* Redirect output to a file
 
 ```bash
 email-crawler --query "berlin walks" > ~/Desktop/belin-walks-emails.csv
data/bin/email-crawler CHANGED
@@ -11,6 +11,7 @@ class OptionsParser
     options.google_website = "google.com.br"
     options.max_results = ::EmailCrawler::Scraper::MAX_RESULTS
     options.max_links = ::EmailCrawler::PageLinks::MAX_LINKS
+    options.max_concurrency = ::EmailCrawler::Runner::MAX_CONCURRENCY
 
     opt_parser = OptionParser.new do |opts|
       opts.banner = "Usage: email-crawler [options]"
@@ -38,6 +39,12 @@ class OptionsParser
               " (per search result, defaults to 100)") do |max_links|
         options.max_links = max_links.to_i
       end
+
+      opts.on("-c", "--concurrency 25",
+              "Max # of threads to use to look for links and email addresses",
+              " (defaults to 10)") do |max_concurrency|
+        options.max_concurrency = max_concurrency.to_i
+      end
     end
 
     opt_parser.parse!(args)
@@ -50,8 +57,11 @@ if options.q.empty?
   print "The -q switch is mandatory\n"
   exit(1)
 else
-  csv = EmailCrawler::Runner.new(options.google_website).run(options.q,
-                                                             options.max_results,
-                                                             options.max_links)
+  runner = EmailCrawler::Runner.new(options.google_website) do |runner|
+    runner.max_results = options.max_results
+    runner.max_links = options.max_links
+    runner.max_concurrency = options.max_concurrency
+  end
+  csv = runner.run(options.q)
   $stdout << "#{csv}\n"
 end
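The CLI no longer passes `max_results` and `max_links` as extra arguments to `run`; it configures the `Runner` through a block and then calls `run(q)` with just the query. A minimal sketch of driving the same 0.0.6 API from Ruby directly (the query and limits below are illustrative, not defaults):

```ruby
require "email_crawler"

# Block-style configuration introduced in 0.0.6; the values are examples only.
runner = EmailCrawler::Runner.new("google.com") do |r|
  r.max_results     = 50  # search result URLs to collect
  r.max_links       = 50  # internal links to scan per search result
  r.max_concurrency = 5   # worker threads for link and email crawling
end

csv = runner.run("berlin walks")  # returns CSV text with Email, Domain, URL columns
puts csv
```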
@@ -19,6 +19,7 @@ Gem::Specification.new do |spec|
 
   spec.add_runtime_dependency "mechanize"
   spec.add_runtime_dependency "dotenv"
+  spec.add_runtime_dependency "thread_safe"
 
   spec.add_development_dependency "bundler", "~> 1.5"
   spec.add_development_dependency "rake"
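The new `thread_safe` runtime dependency supplies the `ThreadSafe::Array` and `ThreadSafe::Hash` collections that `Runner` (further down in this diff) shares between worker threads. A tiny standalone sketch of the kind of concurrent appends it backs (the worker payload is made up):

```ruby
require "thread_safe"

results = ThreadSafe::Array.new  # synchronized Array from the thread_safe gem

threads = (1..4).map do |i|
  Thread.new(i) do |n|
    # Each worker appends a [key, value] pair, mirroring how Runner
    # collects [url, links] pairs from its crawler threads.
    results << [n, "worker-#{n} finished"]
  end
end
threads.each(&:join)

results.each { |pair| p pair }
```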
@@ -4,6 +4,7 @@ module EmailCrawler
   class EmailScanner
     EMAIL_REGEXP = /\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,4}\b/i
     SLEEP_TIME = 0.5
+    UTF_8 = "UTF-8".freeze
 
     def initialize(url)
       @url = url
@@ -17,6 +18,7 @@ module EmailCrawler
 
       links.each do |link|
         @logger.info "searching for emails on '#{link}'.."
+        retried = false
 
         html = begin
           open(link).read
@@ -31,7 +33,18 @@ module EmailCrawler
         end
         next unless html
 
-        emails = html.scan(EMAIL_REGEXP)
+        begin
+          emails = html.scan(EMAIL_REGEXP)
+        rescue ArgumentError => err
+          if retried
+            emails = []
+          else
+            @logger.warn err.inspect
+            html.encode!(UTF_8, UTF_8, invalid: :replace, undef: :replace, replace: "")
+            retried = true
+            retry
+          end
+        end
         emails_by_link[link] = Set.new(emails) unless emails.empty?
         sleep(SLEEP_TIME)
       end
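Pages served with broken encodings can make `String#scan` raise `ArgumentError` ("invalid byte sequence in UTF-8"); the scanner now scrubs the HTML with `encode!` and retries the scan once, falling back to an empty list if it fails again. A self-contained sketch of the same pattern (the sample string is fabricated; on Rubies where a same-encoding `encode` is a no-op, the second attempt simply yields `[]`, which is exactly the gem's fallback):

```ruby
EMAIL_REGEXP = /\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,4}\b/i

# A UTF-8-tagged string containing an invalid byte, as malformed HTML often does.
html = "contact us: info@example.com \xFF".dup.force_encoding("UTF-8")

retried = false
begin
  emails = html.scan(EMAIL_REGEXP)
rescue ArgumentError => err
  warn err.inspect             # the gem logs this via @logger.warn
  if retried
    emails = []                # give up after one cleanup attempt
  else
    # Drop bytes that are invalid in UTF-8, then retry the scan.
    html = html.encode("UTF-8", "UTF-8", invalid: :replace, undef: :replace, replace: "")
    retried = true
    retry
  end
end

p emails  # => ["info@example.com"] once the invalid byte has been stripped
```

On Ruby 2.1+, `html.scrub("")` is an equivalent way to remove the invalid bytes.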
@@ -2,6 +2,7 @@ module EmailCrawler
   class PageLinks
     MAX_LINKS = 100
     SLEEP_TIME = 0.5
+    MAX_RETRIES = 5
 
     include MechanizeHelper
 
@@ -25,11 +26,32 @@ module EmailCrawler
 
     def fetch_links(max_links = MAX_LINKS)
       queue, links = Set.new([@url]), Set.new([@url])
+      retries = 0
 
       until queue.empty?
         current_link = queue.first
         @logger.info "current_link: #{current_link}"
-        page = get(current_link)
+
+        begin
+          page = get(current_link)
+        rescue Net::HTTP::Persistent::Error => err
+          @logger.warn err.inspect
+          page = nil
+
+          if retries < MAX_RETRIES
+            retries += 1
+            @logger.debug "Retry ##{retries}"
+            agent.shutdown
+            Thread.current[:agent] = nil
+            sleep(SLEEP_TIME)
+            retry
+          else
+            @logger.error "Giving up grabbing link for '#{@url}' after #{retries} retries"
+            break
+          end
+        else
+          retries = 0
+        end
 
         if page
           new_links = page.links_with(href: @domain).map(&:href)
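`fetch_links` now wraps each `get` in a `begin`/`rescue`/`else` block: on `Net::HTTP::Persistent::Error` it discards the per-thread Mechanize agent, sleeps, and retries up to `MAX_RETRIES` times, while the `else` branch resets the counter after every successful fetch. A generic, self-contained sketch of that control flow (the `flaky_fetch` lambda is a made-up stand-in for `get(current_link)`):

```ruby
MAX_RETRIES = 5
SLEEP_TIME  = 0.5

# Hypothetical stand-in for the real HTTP call: fails twice, then succeeds.
attempts = 0
flaky_fetch = lambda do
  attempts += 1
  raise IOError, "connection reset" if attempts < 3
  "<html>ok</html>"
end

retries = 0
begin
  page = flaky_fetch.call
rescue IOError                 # the gem rescues Net::HTTP::Persistent::Error here
  page = nil
  if retries < MAX_RETRIES
    retries += 1
    sleep(SLEEP_TIME)
    retry                      # re-runs the begin block
  end
  # otherwise fall through with page == nil (the gem logs and breaks its loop)
else
  retries = 0                  # a successful fetch resets the retry budget
end

puts page ? "fetched after #{attempts} attempt(s)" : "gave up after #{retries} retries"
```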
@@ -40,7 +40,8 @@ module EmailCrawler
     end
 
     def agent
-      @agent ||= new_agent { |agent| agent.set_proxy(Proxy.random, "8888") }
+      @agent ||= new_agent
+      # @agent ||= new_agent { |agent| agent.set_proxy(Proxy.random, "8888") }
     end
   end
 end
@@ -1,3 +1,3 @@
 module EmailCrawler
-  VERSION = "0.0.5"
+  VERSION = "0.0.6"
 end
data/lib/email_crawler.rb CHANGED
@@ -2,6 +2,7 @@ require "thread"
 require "logger"
 require "csv"
 require "set"
+require "thread_safe"
 
 require_relative "email_crawler/version"
 require_relative "email_crawler/mechanize_helper"
@@ -11,53 +12,87 @@ require_relative "email_crawler/email_scanner"
 
 module EmailCrawler
   class Runner
+    MAX_CONCURRENCY = 10
+
+    attr_writer :max_results, :max_links, :max_concurrency
+
     def initialize(google_website)
       @google_website = google_website
 
+      # @logger = ::Logger.new(STDOUT).tap do |logger|
      log_file = File.join(ENV["HOME"], "email-crawler.log")
      file = File.open(log_file, File::WRONLY | File::APPEND | File::CREAT)
      @logger = ::Logger.new(file).tap do |logger|
        logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
      end
+
+      yield(self)
+
+      @logger.info "max_results: #{@max_results}"
+      @logger.info "max_links: #{@max_links}"
+      @logger.info "max_concurrency: #{@max_concurrency}"
     end
 
-    def run(q, max_results = Scraper::MAX_RESULTS, max_links = PageLinks::MAX_LINKS)
-      urls = Scraper.new(@google_website, max_results).search_result_urls_for(q)
-      urls.each { |url, links| @logger.info "#{url}" }
+    def run(q)
+      urls = Scraper.new(@google_website, @max_results).search_result_urls_for(q)
+      urls.each { |url| @logger.info "#{url}" }
+      queue = Queue.new
+      urls.each { |url| queue.push(url) }
+      links_by_url = ThreadSafe::Array.new
 
-      threads = (1..urls.length).map do |i|
-        Thread.new(i, urls[i-1]) do |i, url|
-          @logger.info "[Thread ##{i}] grabbing page links for '#{url}'.."
-          Thread.current[:url] = url
-          Thread.current[:links] = PageLinks.for(url, max_links)
+      threads = (1..[urls.length, @max_concurrency].min).map do |i|
+        Thread.new(i) do |i|
+          url = begin
+            queue.pop(true)
+          rescue ThreadError; end
+
+          while url
+            @logger.info "[Thread ##{i}] grabbing page links for '#{url}'.."
+            links = PageLinks.for(url, @max_links)
+            links_by_url << [url, links]
+
+            url = begin
+              queue.pop(true)
+            rescue ThreadError; end
+          end
        end
      end
-
      threads.each(&:join)
-      threads.each { |thread| @logger.info "#{thread[:url]} (#{thread[:links].length} links)" }
-      links_by_url = Hash[threads.map { |thread| [thread[:url], thread[:links]] }]
-
-      threads = (links_by_url).map.with_index do |arr, i|
-        Thread.new(i+1, arr.first, arr.last) do |i, url, links|
-          @logger.info "[Thread ##{i}] scanning for emails on page '#{url}' (#{links.length} links)"
-          Thread.current[:url] = url
-          Thread.current[:emails] = EmailScanner.new(url).scan(links)
+      @logger.debug "links_by_url: #{links_by_url.inspect}"
+
+      links_by_url.each { |arr| queue.push(arr) }
+      emails_by_url = ThreadSafe::Hash.new
+      threads = (1..[links_by_url.length, MAX_CONCURRENCY].min).map do |i|
+        Thread.new(i) do |i|
+          arr = begin
+            queue.pop(true)
+          rescue ThreadError; end
+
+          while arr
+            url, links = arr
+            @logger.info "[Thread ##{i}] scanning for emails on page '#{url}' (#{links.length} links)"
+            emails = EmailScanner.new(url).scan(links)
+            emails_by_url[url] = emails
+
+            arr = begin
+              queue.pop(true)
+            rescue ThreadError; end
+          end
        end
      end
-
      threads.each(&:join)
+      @logger.debug "emails_by_url: #{emails_by_url.inspect}"
 
      read_emails = Set.new
      CSV.generate do |csv|
        csv << %w(Email Domain URL)
        csv << []
 
-        threads.each do |thread|
-          email_count = thread[:emails].inject(0) { |sum, arr| sum += arr.last.length }
-          @logger.info "#{thread[:url]} (#{email_count} emails)"
+        emails_by_url.each do |url, emails_by_link|
+          email_count = emails_by_link.inject(0) { |sum, arr| sum += arr.last.length }
+          @logger.info "#{url} (#{email_count} emails)"
 
-          url = thread[:url]
-          thread[:emails].each do |link, emails|
+          emails_by_link.each do |link, emails|
            emails.each do |email|
              csv << [email, url, link] if read_emails.add?(email)
            end
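`Runner#run` replaces the old one-thread-per-URL fan-out with a small worker pool: URLs go into a `Queue`, at most `max_concurrency` threads drain it with a non-blocking `Queue#pop(true)` (which raises `ThreadError` once the queue is empty), and results are accumulated in thread-safe collections instead of per-thread variables. A reduced sketch of the pattern with the crawl replaced by a placeholder (the URLs and the worker body are illustrative):

```ruby
require "thread"
require "thread_safe"

MAX_CONCURRENCY = 10

urls = %w[https://example.com https://example.org https://example.net]  # illustrative
queue = Queue.new
urls.each { |url| queue.push(url) }

results = ThreadSafe::Array.new

threads = (1..[urls.length, MAX_CONCURRENCY].min).map do |i|
  Thread.new(i) do |n|
    # Non-blocking pop raises ThreadError when the queue is drained;
    # the rescue turns that into nil, ending this worker's loop.
    url = begin
      queue.pop(true)
    rescue ThreadError; end

    while url
      results << [n, url]  # stand-in for PageLinks.for(url, max_links)
      url = begin
        queue.pop(true)
      rescue ThreadError; end
    end
  end
end
threads.each(&:join)

results.each { |worker, url| puts "worker ##{worker} handled #{url}" }
```

Capping the pool at `[urls.length, max_concurrency].min` keeps the thread count bounded no matter how many search results come back, which is the point of the new `--concurrency` option.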
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: email_crawler
 version: !ruby/object:Gem::Version
-  version: 0.0.5
+  version: 0.0.6
 platform: ruby
 authors:
 - Cristian Rasch
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-03-07 00:00:00.000000000 Z
+date: 2014-03-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
@@ -38,6 +38,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: thread_safe
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement