email_crawler 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 45594bfd727c7a3e1e5125755c5b4b136cee5638
|
4
|
+
data.tar.gz: 7af69b7e472e31f47f32f44ebbf975241d1041d2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 574179cf97e5a5131db4889b02d2ee939bcdcf303e0904439bf4d80370fc9a255816698f5883739f18166efcc71b088efc61a357253112d1ddacd1819a0bcc46
|
7
|
+
data.tar.gz: 404d06b126c8e6c55c0e81708d0f12a1dd63dec4069802e7d8899efbb05c0213f1f8701a19037eed1247b6345e4a965bcee0411411bddcd3adbdd13d33fab0db
|
data/lib/email_crawler.rb
CHANGED
@@ -31,7 +31,7 @@ module EmailCrawler
|
|
31
31
|
urls.each { |url| queue.push(url) }
|
32
32
|
links_by_url = ThreadSafe::Array.new
|
33
33
|
|
34
|
-
threads = (1..[
|
34
|
+
threads = (1..[queue.size, @max_concurrency].min).map do |i|
|
35
35
|
Thread.new(i) do |i|
|
36
36
|
url = begin
|
37
37
|
queue.pop(true)
|
@@ -1,10 +1,10 @@
|
|
1
|
-
require "open-uri"
|
2
|
-
|
3
1
|
module EmailCrawler
|
4
2
|
class EmailScanner
|
5
3
|
EMAIL_REGEXP = /\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,4}\b/i
|
6
4
|
UTF_8 = "UTF-8".freeze
|
7
5
|
|
6
|
+
include MechanizeHelper
|
7
|
+
|
8
8
|
def initialize(logger = Logger.new("/dev/null"))
|
9
9
|
@logger = logger
|
10
10
|
end
|
@@ -14,17 +14,12 @@ module EmailCrawler
|
|
14
14
|
@logger.info "searching for emails on '#{link}'.."
|
15
15
|
retried = false
|
16
16
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
if err.message =~ /redirection forbidden/
|
24
|
-
link = err.message.split(" ").last
|
25
|
-
retry
|
26
|
-
end
|
27
|
-
end
|
17
|
+
begin
|
18
|
+
html = get(link).body
|
19
|
+
rescue => err
|
20
|
+
@logger.warn err.inspect
|
21
|
+
nil
|
22
|
+
end
|
28
23
|
next unless html
|
29
24
|
|
30
25
|
begin
|
@@ -8,8 +8,10 @@ module EmailCrawler
|
|
8
8
|
|
9
9
|
def initialize(url, logger = Logger.new("/dev/null"))
|
10
10
|
@url = url
|
11
|
-
uri =
|
12
|
-
|
11
|
+
uri = begin
|
12
|
+
URI(url)
|
13
|
+
rescue; end
|
14
|
+
scheme_and_host = if uri && uri.host
|
13
15
|
"#{uri.scheme}://#{uri.host}"
|
14
16
|
else
|
15
17
|
url[%r(\A(https?://([^/]+))), 1]
|
@@ -47,7 +49,7 @@ module EmailCrawler
|
|
47
49
|
@logger.error "Giving up grabbing link for '#{@url}' after #{retries} retries"
|
48
50
|
break
|
49
51
|
end
|
50
|
-
rescue
|
52
|
+
rescue => err
|
51
53
|
@logger.warn err.inspect
|
52
54
|
else
|
53
55
|
retries = 0
|
@@ -72,11 +74,5 @@ module EmailCrawler
|
|
72
74
|
|
73
75
|
links.to_a
|
74
76
|
end
|
75
|
-
|
76
|
-
private
|
77
|
-
|
78
|
-
def agent
|
79
|
-
@agent ||= new_agent
|
80
|
-
end
|
81
77
|
end
|
82
78
|
end
|