email_crawler 0.0.8 → 0.0.9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 49d12206392f47b3417cebc922a65e38030aab31
+  data.tar.gz: 953cd1f8ab44e9fafc361b06495d9278832b72e5
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 21531db699176ddca60be2c5215129fc601574674b43f463da2d83b175c46693cd8d90c9fc76a246cf988224afd82cbc1ed6dcb450c938535a859376de44da68
+  data.tar.gz: 2b80fb41432a1a549b92fc7342e995e20fbf0753db51488546dddf634f4adb50e67bd2248a2efc6aa17e60d38b74e60318937d499e1651b1275330ca81982add
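The recorded digests can be checked against a locally fetched copy of the gem. A minimal sketch, assuming the 0.0.9 .gem file has already been downloaded and unpacked in the current directory (a .gem is a plain tar archive containing metadata.gz and data.tar.gz):

# Assumed workflow, not part of the gem itself:
#   gem fetch email_crawler -v 0.0.9
#   tar -xf email_crawler-0.0.9.gem    # extracts metadata.gz, data.tar.gz, checksums.yaml.gz
require "digest"

puts Digest::SHA1.file("metadata.gz").hexdigest    # should match the SHA1 metadata.gz entry above
puts Digest::SHA512.file("data.tar.gz").hexdigest  # should match the SHA512 data.tar.gz entry above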
@@ -6,11 +6,8 @@ module EmailCrawler
     SLEEP_TIME = 0.5
     UTF_8 = "UTF-8".freeze

-    def initialize(url)
-      @url = url
-      @logger = ::Logger.new(STDOUT).tap do |logger|
-        logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
-      end
+    def initialize(url, logger = Logger.new("/dev/null"))
+      @url, @logger = url, logger
     end

     def scan(links)
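EmailScanner no longer builds its own STDOUT logger; callers may now inject one, and the default is a null logger that discards output. A minimal usage sketch based on the signatures in this diff (the URL and log level are illustrative):

require "logger"
require "email_crawler"

# Silent by default: Logger.new("/dev/null") drops all messages.
scanner = EmailCrawler::EmailScanner.new("http://example.com")

# Inject a logger to watch the scan as it runs.
verbose = Logger.new(STDOUT).tap { |l| l.level = Logger::INFO }
scanner = EmailCrawler::EmailScanner.new("http://example.com", verbose)
# emails = scanner.scan(links)   # `links` would come from PageLinks (next diff)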
@@ -6,7 +6,7 @@ module EmailCrawler

     include MechanizeHelper

-    def initialize(url)
+    def initialize(url, logger = Logger.new("/dev/null"))
       @url = url
       uri = URI(url)
       scheme_and_host = if uri.host
@@ -15,13 +15,11 @@ module EmailCrawler
         url[%r(\A(https?://([^/]+))), 1]
       end
       @domain = Regexp.new("#{scheme_and_host}/", true)
-      @logger = ::Logger.new(STDOUT).tap do |logger|
-        logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
-      end
+      @logger = logger
     end

-    def self.for(url, max_links = MAX_LINKS)
-      new(url).fetch_links(max_links)
+    def self.for(url, max_links: MAX_LINKS, logger: Logger.new("/dev/null"))
+      new(url, logger).fetch_links(max_links)
     end

     def fetch_links(max_links = MAX_LINKS)
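PageLinks.for switches from a positional max_links argument to keyword arguments and gains an optional logger, which it forwards to the instance. A usage sketch assuming the defaults shown above (the URL and limit are illustrative):

require "logger"
require "email_crawler"

links = EmailCrawler::PageLinks.for("http://www.visitberlin.de/en",
                                    max_links: 25,
                                    logger: Logger.new(STDOUT))

# Omitting both keywords keeps the previous behaviour: up to MAX_LINKS links, no logging.
links = EmailCrawler::PageLinks.for("http://www.visitberlin.de/en")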
data/lib/email_crawler.rb
CHANGED
@@ -14,28 +14,16 @@ module EmailCrawler
   class Runner
     MAX_CONCURRENCY = 50

-    attr_writer :max_results, :max_links, :max_concurrency
+    attr_writer :max_results, :max_links, :max_concurrency, :logger

     def initialize(google_website)
       @google_website = google_website
-
-      # @logger = ::Logger.new(STDOUT).tap do |logger|
-      log_file = File.join(ENV["HOME"], "email-crawler.log")
-      file = File.open(log_file, File::WRONLY | File::APPEND | File::CREAT)
-      @logger = ::Logger.new(file).tap do |logger|
-        logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
-      end
-
       yield(self)
-
-      @logger.info "max_results: #{@max_results}"
-      @logger.info "max_links: #{@max_links}"
-      @logger.info "max_concurrency: #{@max_concurrency}"
     end

     def run(q)
       urls = Scraper.new(@google_website, @max_results).search_result_urls_for(q)
-      urls.each { |url| @logger.info "#{url}" }
+      urls.each { |url| logger.info "#{url}" }
       queue = Queue.new
       urls.each { |url| queue.push(url) }
       links_by_url = ThreadSafe::Array.new
@@ -47,8 +35,8 @@ module EmailCrawler
           rescue ThreadError; end

           while url
-
-            links = PageLinks.for(url, @max_links)
+            logger.info "[Thread ##{i}] grabbing page links for '#{url}'.."
+            links = PageLinks.for(url, max_links: @max_links, logger: logger)
             links_by_url << [url, links]

             url = begin
@@ -58,7 +46,7 @@ module EmailCrawler
           end
         end
       threads.each(&:join)
-
+      logger.debug "links_by_url: #{links_by_url.inspect}"

       links_by_url.each { |arr| queue.push(arr) }
       emails_by_url = ThreadSafe::Hash.new
@@ -70,8 +58,8 @@ module EmailCrawler

           while arr
             url, links = arr
-
-            emails = EmailScanner.new(url).scan(links)
+            logger.info "[Thread ##{i}] scanning for emails on page '#{url}' (#{links.length} links)"
+            emails = EmailScanner.new(url, logger).scan(links)
             emails_by_url[url] = emails

             arr = begin
@@ -81,7 +69,7 @@ module EmailCrawler
           end
         end
       threads.each(&:join)
-
+      logger.debug "emails_by_url: #{emails_by_url.inspect}"

       read_emails = Set.new
       CSV.generate do |csv|
@@ -90,7 +78,7 @@ module EmailCrawler

       emails_by_url.each do |url, emails_by_link|
         email_count = emails_by_link.inject(0) { |sum, arr| sum += arr.last.length }
-
+        logger.info "#{url} (#{email_count} emails)"

         emails_by_link.each do |link, emails|
           emails.each do |email|
@@ -100,5 +88,17 @@ module EmailCrawler
         end
       end
     end
+
+    private
+
+    def logger
+      @logger ||= begin
+        path = File.join(ENV["HOME"], "email_crawler.log")
+        file = File.open(path, File::WRONLY | File::APPEND | File::CREAT)
+        logger = ::Logger.new(file).tap do |logger|
+          logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
+        end
+      end
+    end
   end
 end
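Runner now exposes a logger writer and resolves its logger lazily: whatever is assigned through the setup block is kept, otherwise the private #logger method above opens ~/email_crawler.log (INFO when the DEBUG environment variable is set, ERROR otherwise). A usage sketch; the Google host, query, and limits are illustrative values, not defaults taken from the gem:

require "logger"
require "email_crawler"

runner = EmailCrawler::Runner.new("https://www.google.com") do |r|
  r.max_results     = 20
  r.max_links       = 50
  r.max_concurrency = 10
  r.logger          = Logger.new(STDOUT)   # optional; omit to fall back to ~/email_crawler.log
end

csv = runner.run("berlin tourism")   # run(q) returns the generated CSV as a string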
@@ -7,7 +7,7 @@ module EmailCrawler
     let(:max_links) { 25 }

     it "returns the first N internal links" do
-      PageLinks.for("http://www.visitberlin.de/en", max_links).length.must_equal max_links
+      PageLinks.for("http://www.visitberlin.de/en", max_links: max_links).length.must_equal max_links
     end
   end
 end