email_crawler 0.0.8 → 0.0.9

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: aadbe92dd2c7670e25d389a5008badaae8334fc0
-  data.tar.gz: 265adbfb7bb28397ff93af922ddd670044b2f2a6
+  metadata.gz: 49d12206392f47b3417cebc922a65e38030aab31
+  data.tar.gz: 953cd1f8ab44e9fafc361b06495d9278832b72e5
 SHA512:
-  metadata.gz: 0c2eb98801a94251434c1c009a357552813c0f3734d999f5b29c740f5419756561d97ef5b30c22472c1eaacf0cc37bfde835e47502b516d4a45232cdff8a846b
-  data.tar.gz: 607749a5ce4831fcbd929721f09191f3e2d4f10d93485fc41c92577b5250c2d480b1b1138493ce18519e30969a91ab0e101c7300f31a59ef0482b598589b7935
+  metadata.gz: 21531db699176ddca60be2c5215129fc601574674b43f463da2d83b175c46693cd8d90c9fc76a246cf988224afd82cbc1ed6dcb450c938535a859376de44da68
+  data.tar.gz: 2b80fb41432a1a549b92fc7342e995e20fbf0753db51488546dddf634f4adb50e67bd2248a2efc6aa17e60d38b74e60318937d499e1651b1275330ca81982add
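
Only the archive digests change in checksums.yaml; they are a byproduct of repackaging. As a hedged illustration of what these values mean, the Ruby sketch below recomputes a SHA512 digest with the standard library and compares it to the recorded one (the local file path is a placeholder; in a real .gem these digests cover the archive's metadata.gz and data.tar.gz entries):

    require "digest"

    # Hypothetical check: recompute the digest of an extracted data.tar.gz
    # and compare it with the value recorded in checksums.yaml above.
    expected = "2b80fb41432a1a549b92fc7342e995e20fbf0753db51488546dddf634f4adb50e67bd2248a2efc6aa17e60d38b74e60318937d499e1651b1275330ca81982add"
    actual = Digest::SHA512.file("data.tar.gz").hexdigest
    puts(actual == expected ? "checksum OK" : "checksum mismatch")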
@@ -6,11 +6,8 @@ module EmailCrawler
   SLEEP_TIME = 0.5
   UTF_8 = "UTF-8".freeze
 
-  def initialize(url)
-    @url = url
-    @logger = ::Logger.new(STDOUT).tap do |logger|
-      logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
-    end
+  def initialize(url, logger = Logger.new("/dev/null"))
+    @url, @logger = url, logger
   end
 
   def scan(links)
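
The theme of this release shows up here first: EmailScanner stops constructing its own STDOUT logger and instead accepts an injected one, defaulting to a null logger that writes to /dev/null. A minimal usage sketch, assuming a placeholder URL:

    require "email_crawler"
    require "logger"

    # Verbose: inject an explicit logger.
    scanner = EmailCrawler::EmailScanner.new("http://example.com", Logger.new(STDOUT))

    # Quiet: the new default discards all log output.
    silent = EmailCrawler::EmailScanner.new("http://example.com")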
@@ -6,7 +6,7 @@ module EmailCrawler
 
   include MechanizeHelper
 
-  def initialize(url)
+  def initialize(url, logger = Logger.new("/dev/null"))
     @url = url
     uri = URI(url)
     scheme_and_host = if uri.host
@@ -15,13 +15,11 @@ module EmailCrawler
       url[%r(\A(https?://([^/]+))), 1]
     end
     @domain = Regexp.new("#{scheme_and_host}/", true)
-    @logger = ::Logger.new(STDOUT).tap do |logger|
-      logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
-    end
+    @logger = logger
   end
 
-  def self.for(url, max_links = MAX_LINKS)
-    new(url).fetch_links(max_links)
+  def self.for(url, max_links: MAX_LINKS, logger: Logger.new("/dev/null"))
+    new(url, logger).fetch_links(max_links)
   end
 
   def fetch_links(max_links = MAX_LINKS)
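
Note the breaking change to the class-level entry point: PageLinks.for now takes max_links as a keyword argument (alongside the new optional logger) rather than positionally, so 0.0.8-style calls like PageLinks.for(url, 10) no longer work. A sketch of the updated call, with a placeholder URL:

    require "email_crawler"
    require "logger"

    links = EmailCrawler::PageLinks.for("http://example.com",
                                        max_links: 10,
                                        logger: Logger.new(STDOUT))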
@@ -1,3 +1,3 @@
 module EmailCrawler
-  VERSION = "0.0.8"
+  VERSION = "0.0.9"
 end
data/lib/email_crawler.rb CHANGED
@@ -14,28 +14,16 @@ module EmailCrawler
   class Runner
     MAX_CONCURRENCY = 50
 
-    attr_writer :max_results, :max_links, :max_concurrency
+    attr_writer :max_results, :max_links, :max_concurrency, :logger
 
     def initialize(google_website)
       @google_website = google_website
-
-      # @logger = ::Logger.new(STDOUT).tap do |logger|
-      log_file = File.join(ENV["HOME"], "email-crawler.log")
-      file = File.open(log_file, File::WRONLY | File::APPEND | File::CREAT)
-      @logger = ::Logger.new(file).tap do |logger|
-        logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
-      end
-
       yield(self)
-
-      @logger.info "max_results: #{@max_results}"
-      @logger.info "max_links: #{@max_links}"
-      @logger.info "max_concurrency: #{@max_concurrency}"
     end
 
     def run(q)
       urls = Scraper.new(@google_website, @max_results).search_result_urls_for(q)
-      urls.each { |url| @logger.info "#{url}" }
+      urls.each { |url| logger.info "#{url}" }
       queue = Queue.new
       urls.each { |url| queue.push(url) }
       links_by_url = ThreadSafe::Array.new
@@ -47,8 +35,8 @@ module EmailCrawler
         rescue ThreadError; end
 
         while url
-          @logger.info "[Thread ##{i}] grabbing page links for '#{url}'.."
-          links = PageLinks.for(url, @max_links)
+          logger.info "[Thread ##{i}] grabbing page links for '#{url}'.."
+          links = PageLinks.for(url, max_links: @max_links, logger: logger)
           links_by_url << [url, links]
 
           url = begin
@@ -58,7 +46,7 @@ module EmailCrawler
         end
       end
       threads.each(&:join)
-      @logger.debug "links_by_url: #{links_by_url.inspect}"
+      logger.debug "links_by_url: #{links_by_url.inspect}"
 
       links_by_url.each { |arr| queue.push(arr) }
       emails_by_url = ThreadSafe::Hash.new
@@ -70,8 +58,8 @@ module EmailCrawler
 
         while arr
           url, links = arr
-          @logger.info "[Thread ##{i}] scanning for emails on page '#{url}' (#{links.length} links)"
-          emails = EmailScanner.new(url).scan(links)
+          logger.info "[Thread ##{i}] scanning for emails on page '#{url}' (#{links.length} links)"
+          emails = EmailScanner.new(url, logger).scan(links)
           emails_by_url[url] = emails
 
           arr = begin
@@ -81,7 +69,7 @@ module EmailCrawler
         end
       end
       threads.each(&:join)
-      @logger.debug "emails_by_url: #{emails_by_url.inspect}"
+      logger.debug "emails_by_url: #{emails_by_url.inspect}"
 
       read_emails = Set.new
       CSV.generate do |csv|
@@ -90,7 +78,7 @@ module EmailCrawler
 
       emails_by_url.each do |url, emails_by_link|
         email_count = emails_by_link.inject(0) { |sum, arr| sum += arr.last.length }
-        @logger.info "#{url} (#{email_count} emails)"
+        logger.info "#{url} (#{email_count} emails)"
 
         emails_by_link.each do |link, emails|
           emails.each do |email|
@@ -100,5 +88,17 @@ module EmailCrawler
         end
       end
     end
+
+    private
+
+    def logger
+      @logger ||= begin
+        path = File.join(ENV["HOME"], "email_crawler.log")
+        file = File.open(path, File::WRONLY | File::APPEND | File::CREAT)
+        logger = ::Logger.new(file).tap do |logger|
+          logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
+        end
+      end
+    end
   end
 end
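
Runner gains a logger writer alongside the other settings. If the caller never assigns one, the private memoized reader above falls back to a file logger in $HOME (now email_crawler.log with an underscore, where the removed constructor code used email-crawler.log). A hedged sketch of wiring it up through the existing block-style initializer; the Google host and query are placeholders:

    require "email_crawler"
    require "logger"

    runner = EmailCrawler::Runner.new("https://www.google.com") do |r|
      r.max_results = 20
      r.logger      = Logger.new(STDOUT)  # omit to fall back to the file logger
    end

    csv = runner.run("berlin tourism")  # run(q) builds CSV output via CSV.generate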
@@ -7,7 +7,7 @@ module EmailCrawler
     let(:max_links) { 25 }
 
     it "returns the first N internal links" do
-      PageLinks.for("http://www.visitberlin.de/en", max_links).length.must_equal max_links
+      PageLinks.for("http://www.visitberlin.de/en", max_links: max_links).length.must_equal max_links
     end
   end
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: email_crawler
 version: !ruby/object:Gem::Version
-  version: 0.0.8
+  version: 0.0.9
 platform: ruby
 authors:
 - Cristian Rasch