email_crawler 0.0.8 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: aadbe92dd2c7670e25d389a5008badaae8334fc0
-  data.tar.gz: 265adbfb7bb28397ff93af922ddd670044b2f2a6
+  metadata.gz: 49d12206392f47b3417cebc922a65e38030aab31
+  data.tar.gz: 953cd1f8ab44e9fafc361b06495d9278832b72e5
 SHA512:
-  metadata.gz: 0c2eb98801a94251434c1c009a357552813c0f3734d999f5b29c740f5419756561d97ef5b30c22472c1eaacf0cc37bfde835e47502b516d4a45232cdff8a846b
-  data.tar.gz: 607749a5ce4831fcbd929721f09191f3e2d4f10d93485fc41c92577b5250c2d480b1b1138493ce18519e30969a91ab0e101c7300f31a59ef0482b598589b7935
+  metadata.gz: 21531db699176ddca60be2c5215129fc601574674b43f463da2d83b175c46693cd8d90c9fc76a246cf988224afd82cbc1ed6dcb450c938535a859376de44da68
+  data.tar.gz: 2b80fb41432a1a549b92fc7342e995e20fbf0753db51488546dddf634f4adb50e67bd2248a2efc6aa17e60d38b74e60318937d499e1651b1275330ca81982add
@@ -6,11 +6,8 @@ module EmailCrawler
     SLEEP_TIME = 0.5
     UTF_8 = "UTF-8".freeze
 
-    def initialize(url)
-      @url = url
-      @logger = ::Logger.new(STDOUT).tap do |logger|
-        logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
-      end
+    def initialize(url, logger = Logger.new("/dev/null"))
+      @url, @logger = url, logger
     end
 
     def scan(links)
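
Both initializers in this release change the same way: the hard-wired STDOUT logger is removed in favor of an injectable one, defaulting to a null logger that writes to /dev/null. A minimal usage sketch under the new EmailScanner signature (the URL and logger setup here are illustrative, not from the gem):

    require "logger"
    require "email_crawler"

    # Default: the /dev/null logger silently discards scan output.
    scanner = EmailCrawler::EmailScanner.new("http://example.com")

    # Inject a logger to surface progress; STDOUT and INFO are this sketch's choices.
    noisy = Logger.new(STDOUT).tap { |l| l.level = Logger::INFO }
    scanner = EmailCrawler::EmailScanner.new("http://example.com", noisy)
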
@@ -6,7 +6,7 @@ module EmailCrawler
 
     include MechanizeHelper
 
-    def initialize(url)
+    def initialize(url, logger = Logger.new("/dev/null"))
       @url = url
       uri = URI(url)
       scheme_and_host = if uri.host
@@ -15,13 +15,11 @@ module EmailCrawler
         url[%r(\A(https?://([^/]+))), 1]
       end
       @domain = Regexp.new("#{scheme_and_host}/", true)
-      @logger = ::Logger.new(STDOUT).tap do |logger|
-        logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
-      end
+      @logger = logger
     end
 
-    def self.for(url, max_links = MAX_LINKS)
-      new(url).fetch_links(max_links)
+    def self.for(url, max_links: MAX_LINKS, logger: Logger.new("/dev/null"))
+      new(url, logger).fetch_links(max_links)
     end
 
     def fetch_links(max_links = MAX_LINKS)
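
Switching PageLinks.for from a positional max_links to keyword arguments is a breaking change for callers that passed the limit positionally (the spec update further below reflects this). A sketch of the call site before and after, with illustrative values:

    # 0.0.8 — positional limit:
    EmailCrawler::PageLinks.for("http://example.com", 50)

    # 0.0.9 — keyword arguments, plus an optional injectable logger:
    EmailCrawler::PageLinks.for("http://example.com",
                                max_links: 50,
                                logger: Logger.new(STDOUT))
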
@@ -1,3 +1,3 @@
 module EmailCrawler
-  VERSION = "0.0.8"
+  VERSION = "0.0.9"
 end
data/lib/email_crawler.rb CHANGED
@@ -14,28 +14,16 @@ module EmailCrawler
   class Runner
     MAX_CONCURRENCY = 50
 
-    attr_writer :max_results, :max_links, :max_concurrency
+    attr_writer :max_results, :max_links, :max_concurrency, :logger
 
     def initialize(google_website)
       @google_website = google_website
-
-      # @logger = ::Logger.new(STDOUT).tap do |logger|
-      log_file = File.join(ENV["HOME"], "email-crawler.log")
-      file = File.open(log_file, File::WRONLY | File::APPEND | File::CREAT)
-      @logger = ::Logger.new(file).tap do |logger|
-        logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
-      end
-
       yield(self)
-
-      @logger.info "max_results: #{@max_results}"
-      @logger.info "max_links: #{@max_links}"
-      @logger.info "max_concurrency: #{@max_concurrency}"
     end
 
     def run(q)
       urls = Scraper.new(@google_website, @max_results).search_result_urls_for(q)
-      urls.each { |url| @logger.info "#{url}" }
+      urls.each { |url| logger.info "#{url}" }
       queue = Queue.new
       urls.each { |url| queue.push(url) }
       links_by_url = ThreadSafe::Array.new
@@ -47,8 +35,8 @@ module EmailCrawler
         rescue ThreadError; end
 
         while url
-          @logger.info "[Thread ##{i}] grabbing page links for '#{url}'.."
-          links = PageLinks.for(url, @max_links)
+          logger.info "[Thread ##{i}] grabbing page links for '#{url}'.."
+          links = PageLinks.for(url, max_links: @max_links, logger: logger)
           links_by_url << [url, links]
 
           url = begin
@@ -58,7 +46,7 @@ module EmailCrawler
          end
        end
        threads.each(&:join)
-      @logger.debug "links_by_url: #{links_by_url.inspect}"
+      logger.debug "links_by_url: #{links_by_url.inspect}"
 
       links_by_url.each { |arr| queue.push(arr) }
       emails_by_url = ThreadSafe::Hash.new
@@ -70,8 +58,8 @@ module EmailCrawler
 
         while arr
           url, links = arr
-          @logger.info "[Thread ##{i}] scanning for emails on page '#{url}' (#{links.length} links)"
-          emails = EmailScanner.new(url).scan(links)
+          logger.info "[Thread ##{i}] scanning for emails on page '#{url}' (#{links.length} links)"
+          emails = EmailScanner.new(url, logger).scan(links)
           emails_by_url[url] = emails
 
           arr = begin
@@ -81,7 +69,7 @@ module EmailCrawler
          end
        end
        threads.each(&:join)
-      @logger.debug "emails_by_url: #{emails_by_url.inspect}"
+      logger.debug "emails_by_url: #{emails_by_url.inspect}"
 
       read_emails = Set.new
       CSV.generate do |csv|
@@ -90,7 +78,7 @@ module EmailCrawler
 
        emails_by_url.each do |url, emails_by_link|
          email_count = emails_by_link.inject(0) { |sum, arr| sum += arr.last.length }
-         @logger.info "#{url} (#{email_count} emails)"
+         logger.info "#{url} (#{email_count} emails)"
 
          emails_by_link.each do |link, emails|
            emails.each do |email|
@@ -100,5 +88,17 @@ module EmailCrawler
          end
        end
      end
+
+    private
+
+    def logger
+      @logger ||= begin
+        path = File.join(ENV["HOME"], "email_crawler.log")
+        file = File.open(path, File::WRONLY | File::APPEND | File::CREAT)
+        logger = ::Logger.new(file).tap do |logger|
+          logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
+        end
+      end
+    end
   end
 end
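
Runner now accepts a logger through the new attr_writer and, if none is set, lazily opens a file-backed default on first use; note the default file name also changes from email-crawler.log to email_crawler.log under $HOME. A sketch of configuring a run through the yielded block (the Google host, limits, and query are illustrative):

    runner = EmailCrawler::Runner.new("www.google.com") do |r|
      r.max_results     = 100
      r.max_links       = 50
      r.max_concurrency = 25
      r.logger          = Logger.new(STDOUT) # optional; defaults to ~/email_crawler.log
    end

    csv = runner.run("restaurants berlin") # returns the generated CSV string
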
@@ -7,7 +7,7 @@ module EmailCrawler
     let(:max_links) { 25 }
 
     it "returns the first N internal links" do
-      PageLinks.for("http://www.visitberlin.de/en", max_links).length.must_equal max_links
+      PageLinks.for("http://www.visitberlin.de/en", max_links: max_links).length.must_equal max_links
     end
   end
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: email_crawler
 version: !ruby/object:Gem::Version
-  version: 0.0.8
+  version: 0.0.9
 platform: ruby
 authors:
 - Cristian Rasch