email_crawler 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 45594bfd727c7a3e1e5125755c5b4b136cee5638
|
4
|
+
data.tar.gz: 7af69b7e472e31f47f32f44ebbf975241d1041d2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 574179cf97e5a5131db4889b02d2ee939bcdcf303e0904439bf4d80370fc9a255816698f5883739f18166efcc71b088efc61a357253112d1ddacd1819a0bcc46
|
7
|
+
data.tar.gz: 404d06b126c8e6c55c0e81708d0f12a1dd63dec4069802e7d8899efbb05c0213f1f8701a19037eed1247b6345e4a965bcee0411411bddcd3adbdd13d33fab0db
|
data/lib/email_crawler.rb
CHANGED
@@ -31,7 +31,7 @@ module EmailCrawler
|
|
31
31
|
urls.each { |url| queue.push(url) }
|
32
32
|
links_by_url = ThreadSafe::Array.new
|
33
33
|
|
34
|
-
threads = (1..[
|
34
|
+
threads = (1..[queue.size, @max_concurrency].min).map do |i|
|
35
35
|
Thread.new(i) do |i|
|
36
36
|
url = begin
|
37
37
|
queue.pop(true)
|
@@ -1,10 +1,10 @@
|
|
1
|
-
require "open-uri"
|
2
|
-
|
3
1
|
module EmailCrawler
|
4
2
|
class EmailScanner
|
5
3
|
EMAIL_REGEXP = /\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,4}\b/i
|
6
4
|
UTF_8 = "UTF-8".freeze
|
7
5
|
|
6
|
+
include MechanizeHelper
|
7
|
+
|
8
8
|
def initialize(logger = Logger.new("/dev/null"))
|
9
9
|
@logger = logger
|
10
10
|
end
|
@@ -14,17 +14,12 @@ module EmailCrawler
|
|
14
14
|
@logger.info "searching for emails on '#{link}'.."
|
15
15
|
retried = false
|
16
16
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
if err.message =~ /redirection forbidden/
|
24
|
-
link = err.message.split(" ").last
|
25
|
-
retry
|
26
|
-
end
|
27
|
-
end
|
17
|
+
begin
|
18
|
+
html = get(link).body
|
19
|
+
rescue => err
|
20
|
+
@logger.warn err.inspect
|
21
|
+
nil
|
22
|
+
end
|
28
23
|
next unless html
|
29
24
|
|
30
25
|
begin
|
@@ -8,8 +8,10 @@ module EmailCrawler
|
|
8
8
|
|
9
9
|
def initialize(url, logger = Logger.new("/dev/null"))
|
10
10
|
@url = url
|
11
|
-
uri =
|
12
|
-
|
11
|
+
uri = begin
|
12
|
+
URI(url)
|
13
|
+
rescue; end
|
14
|
+
scheme_and_host = if uri && uri.host
|
13
15
|
"#{uri.scheme}://#{uri.host}"
|
14
16
|
else
|
15
17
|
url[%r(\A(https?://([^/]+))), 1]
|
@@ -47,7 +49,7 @@ module EmailCrawler
|
|
47
49
|
@logger.error "Giving up grabbing link for '#{@url}' after #{retries} retries"
|
48
50
|
break
|
49
51
|
end
|
50
|
-
rescue
|
52
|
+
rescue => err
|
51
53
|
@logger.warn err.inspect
|
52
54
|
else
|
53
55
|
retries = 0
|
@@ -72,11 +74,5 @@ module EmailCrawler
|
|
72
74
|
|
73
75
|
links.to_a
|
74
76
|
end
|
75
|
-
|
76
|
-
private
|
77
|
-
|
78
|
-
def agent
|
79
|
-
@agent ||= new_agent
|
80
|
-
end
|
81
77
|
end
|
82
78
|
end
|