email_crawler 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b158c0d7a48cac6a56c44b905b833934e3e88ee2
4
- data.tar.gz: 9b13281878621e1679de17c30e5b13d4ea04bd98
3
+ metadata.gz: 45594bfd727c7a3e1e5125755c5b4b136cee5638
4
+ data.tar.gz: 7af69b7e472e31f47f32f44ebbf975241d1041d2
5
5
  SHA512:
6
- metadata.gz: 589043168d0c1a9ad72dcf3e4d7f2d32194004e76cff744e9fd9ec7187cb8e4101d22f2122f6d236a586626cdd86588ee9a676cc4afe3e91e8624fca0a557d19
7
- data.tar.gz: 1955a721beb4336cb87067460a923826e801701c7d40695f96fb73b41843ecbf58592efe7b4dbcf1053dd76ca61dfefd3a141e897e282db296323945e181c070
6
+ metadata.gz: 574179cf97e5a5131db4889b02d2ee939bcdcf303e0904439bf4d80370fc9a255816698f5883739f18166efcc71b088efc61a357253112d1ddacd1819a0bcc46
7
+ data.tar.gz: 404d06b126c8e6c55c0e81708d0f12a1dd63dec4069802e7d8899efbb05c0213f1f8701a19037eed1247b6345e4a965bcee0411411bddcd3adbdd13d33fab0db
@@ -31,7 +31,7 @@ module EmailCrawler
31
31
  urls.each { |url| queue.push(url) }
32
32
  links_by_url = ThreadSafe::Array.new
33
33
 
34
- threads = (1..[urls.length, @max_concurrency].min).map do |i|
34
+ threads = (1..[queue.size, @max_concurrency].min).map do |i|
35
35
  Thread.new(i) do |i|
36
36
  url = begin
37
37
  queue.pop(true)
@@ -1,10 +1,10 @@
1
- require "open-uri"
2
-
3
1
  module EmailCrawler
4
2
  class EmailScanner
5
3
  EMAIL_REGEXP = /\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,4}\b/i
6
4
  UTF_8 = "UTF-8".freeze
7
5
 
6
+ include MechanizeHelper
7
+
8
8
  def initialize(logger = Logger.new("/dev/null"))
9
9
  @logger = logger
10
10
  end
@@ -14,17 +14,12 @@ module EmailCrawler
14
14
  @logger.info "searching for emails on '#{link}'.."
15
15
  retried = false
16
16
 
17
- html = begin
18
- open(link).read
19
- rescue OpenURI::HTTPError => err
20
- @logger.warn(err)
21
- nil
22
- rescue => err
23
- if err.message =~ /redirection forbidden/
24
- link = err.message.split(" ").last
25
- retry
26
- end
27
- end
17
+ begin
18
+ html = get(link).body
19
+ rescue => err
20
+ @logger.warn err.inspect
21
+ nil
22
+ end
28
23
  next unless html
29
24
 
30
25
  begin
@@ -37,5 +37,11 @@ module EmailCrawler
37
37
  end
38
38
  end
39
39
  end
40
+
41
+ private
42
+
43
+ def agent
44
+ @agent ||= new_agent
45
+ end
40
46
  end
41
47
  end
@@ -8,8 +8,10 @@ module EmailCrawler
8
8
 
9
9
  def initialize(url, logger = Logger.new("/dev/null"))
10
10
  @url = url
11
- uri = URI(url)
12
- scheme_and_host = if uri.host
11
+ uri = begin
12
+ URI(url)
13
+ rescue; end
14
+ scheme_and_host = if uri && uri.host
13
15
  "#{uri.scheme}://#{uri.host}"
14
16
  else
15
17
  url[%r(\A(https?://([^/]+))), 1]
@@ -47,7 +49,7 @@ module EmailCrawler
47
49
  @logger.error "Giving up grabbing link for '#{@url}' after #{retries} retries"
48
50
  break
49
51
  end
50
- rescue URI::InvalidComponentError => err
52
+ rescue => err
51
53
  @logger.warn err.inspect
52
54
  else
53
55
  retries = 0
@@ -72,11 +74,5 @@ module EmailCrawler
72
74
 
73
75
  links.to_a
74
76
  end
75
-
76
- private
77
-
78
- def agent
79
- @agent ||= new_agent
80
- end
81
77
  end
82
78
  end
@@ -58,9 +58,5 @@ module EmailCrawler
58
58
 
59
59
  urls
60
60
  end
61
-
62
- def agent
63
- @agent ||= new_agent
64
- end
65
61
  end
66
62
  end
@@ -1,3 +1,3 @@
1
1
  module EmailCrawler
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.1"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: email_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Cristian Rasch