email_crawler 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: b158c0d7a48cac6a56c44b905b833934e3e88ee2
-  data.tar.gz: 9b13281878621e1679de17c30e5b13d4ea04bd98
+  metadata.gz: 45594bfd727c7a3e1e5125755c5b4b136cee5638
+  data.tar.gz: 7af69b7e472e31f47f32f44ebbf975241d1041d2
 SHA512:
-  metadata.gz: 589043168d0c1a9ad72dcf3e4d7f2d32194004e76cff744e9fd9ec7187cb8e4101d22f2122f6d236a586626cdd86588ee9a676cc4afe3e91e8624fca0a557d19
-  data.tar.gz: 1955a721beb4336cb87067460a923826e801701c7d40695f96fb73b41843ecbf58592efe7b4dbcf1053dd76ca61dfefd3a141e897e282db296323945e181c070
+  metadata.gz: 574179cf97e5a5131db4889b02d2ee939bcdcf303e0904439bf4d80370fc9a255816698f5883739f18166efcc71b088efc61a357253112d1ddacd1819a0bcc46
+  data.tar.gz: 404d06b126c8e6c55c0e81708d0f12a1dd63dec4069802e7d8899efbb05c0213f1f8701a19037eed1247b6345e4a965bcee0411411bddcd3adbdd13d33fab0db
@@ -31,7 +31,7 @@ module EmailCrawler
       urls.each { |url| queue.push(url) }
       links_by_url = ThreadSafe::Array.new

-      threads = (1..[urls.length, @max_concurrency].min).map do |i|
+      threads = (1..[queue.size, @max_concurrency].min).map do |i|
         Thread.new(i) do |i|
           url = begin
             queue.pop(true)
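
The change above bounds the worker pool by the queue size instead of the original URL list. A minimal stand-alone sketch of the pattern, assuming Ruby's standard Thread::Queue and a non-blocking pop (all names besides queue and max_concurrency are illustrative, not the gem's API):

```ruby
require "thread"

# Push the URLs onto a queue, spawn at most [queue.size, max_concurrency].min
# workers, and let each worker pop non-blockingly until the queue is empty.
def each_url_concurrently(urls, max_concurrency = 10)
  queue = Queue.new
  urls.each { |url| queue.push(url) }
  results = Queue.new # thread-safe collector standing in for ThreadSafe::Array

  threads = (1..[queue.size, max_concurrency].min).map do
    Thread.new do
      loop do
        url = begin
          queue.pop(true) # non-blocking pop; raises ThreadError once empty
        rescue ThreadError
          break
        end
        results.push(yield(url))
      end
    end
  end

  threads.each(&:join)
  Array.new(results.size) { results.pop }
end

# e.g. each_url_concurrently(urls) { |url| [url, scan_links(url)] }
```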
@@ -1,10 +1,10 @@
-require "open-uri"
-
 module EmailCrawler
   class EmailScanner
     EMAIL_REGEXP = /\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,4}\b/i
     UTF_8 = "UTF-8".freeze

+    include MechanizeHelper
+
     def initialize(logger = Logger.new("/dev/null"))
       @logger = logger
     end
@@ -14,17 +14,12 @@ module EmailCrawler
       @logger.info "searching for emails on '#{link}'.."
       retried = false

-      html = begin
-        open(link).read
-      rescue OpenURI::HTTPError => err
-        @logger.warn(err)
-        nil
-      rescue => err
-        if err.message =~ /redirection forbidden/
-          link = err.message.split(" ").last
-          retry
-        end
-      end
+      begin
+        html = get(link).body
+      rescue => err
+        @logger.warn err.inspect
+        nil
+      end
       next unless html

       begin
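
The fetch now goes through get(link).body with a single blanket rescue instead of open-uri's error-specific handling. The rest of the method (outside this hunk) scans the fetched body for addresses; a rough, self-contained sketch of that step, assuming the EMAIL_REGEXP and UTF_8 constants declared earlier and illustrative encoding handling:

```ruby
EMAIL_REGEXP = /\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,4}\b/i
UTF_8 = "UTF-8".freeze

# Illustrative extraction step: force the body into UTF-8 (replacing invalid
# or unmappable bytes) and collect every match of the pattern, de-duplicated.
def extract_emails(html)
  text = html.encode(UTF_8, invalid: :replace, undef: :replace, replace: "")
  text.scan(EMAIL_REGEXP).uniq
rescue EncodingError
  []
end
```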
@@ -37,5 +37,11 @@ module EmailCrawler
         end
       end
     end
+
+    private
+
+    def agent
+      @agent ||= new_agent
+    end
   end
 end
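
The new private agent method memoizes whatever new_agent returns, and the get(link) call above is expected to come from the same MechanizeHelper mixin. That module is not part of this diff; a plausible sketch, assuming it is a thin wrapper around the mechanize gem:

```ruby
require "mechanize"

# Hypothetical reconstruction of MechanizeHelper -- its source is not shown
# in this diff. It only needs to provide new_agent (used by the memoizing
# #agent in the including class) and a get(url) helper that delegates to it.
module MechanizeHelper
  def get(url)
    agent.get(url) # assumes the including class defines a private #agent
  end

  private

  def new_agent
    Mechanize.new do |agent|
      agent.user_agent_alias = "Mac Safari" # illustrative configuration
      agent.open_timeout = agent.read_timeout = 10
    end
  end
end
```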
@@ -8,8 +8,10 @@ module EmailCrawler

     def initialize(url, logger = Logger.new("/dev/null"))
       @url = url
-      uri = URI(url)
-      scheme_and_host = if uri.host
+      uri = begin
+        URI(url)
+      rescue; end
+      scheme_and_host = if uri && uri.host
         "#{uri.scheme}://#{uri.host}"
       else
         url[%r(\A(https?://([^/]+))), 1]
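
The constructor now tolerates URLs that URI() rejects: parsing is wrapped in a bare rescue, and the scheme and host fall back to a regexp capture when parsing fails. A self-contained sketch of that behaviour (the method name and the narrower rescued error class are illustrative):

```ruby
require "uri"

# Prefer URI parsing for the scheme and host, but fall back to a regexp
# capture when URI() raises on a malformed URL.
def scheme_and_host_for(url)
  uri = begin
    URI(url)
  rescue URI::InvalidURIError
    nil
  end

  if uri && uri.host
    "#{uri.scheme}://#{uri.host}"
  else
    url[%r(\A(https?://[^/]+)), 1]
  end
end

scheme_and_host_for("https://example.com/path") # => "https://example.com"
scheme_and_host_for("https://bad host/path")    # falls back to the regexp
```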
@@ -47,7 +49,7 @@ module EmailCrawler
           @logger.error "Giving up grabbing link for '#{@url}' after #{retries} retries"
           break
         end
-      rescue URI::InvalidComponentError => err
+      rescue => err
         @logger.warn err.inspect
       else
         retries = 0
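
The rescue here is widened from URI::InvalidComponentError to every error, while the surrounding (unchanged) code keeps a retry counter that resets on success and gives up after a limit. A condensed sketch of that bookkeeping, with hypothetical names and limit:

```ruby
MAX_RETRIES = 5 # illustrative; the gem's actual limit is not visible in this hunk

# Any error is now logged and skipped, empty pops count toward the retry
# budget, and a successful iteration resets the counter.
def drain(queue, logger)
  retries = 0
  loop do
    begin
      item = queue.pop(true)   # non-blocking; raises ThreadError when empty
      yield item
    rescue ThreadError         # nothing queued yet: count it as a retry
      retries += 1
      if retries > MAX_RETRIES
        logger.error "Giving up after #{retries} retries"
        break
      end
      sleep 0.1
    rescue => err              # anything else is logged and the loop moves on
      logger.warn err.inspect
    else
      retries = 0
    end
  end
end
```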
@@ -72,11 +74,5 @@ module EmailCrawler

       links.to_a
     end
-
-    private
-
-    def agent
-      @agent ||= new_agent
-    end
   end
 end
@@ -58,9 +58,5 @@ module EmailCrawler

       urls
     end
-
-    def agent
-      @agent ||= new_agent
-    end
   end
 end
@@ -1,3 +1,3 @@
 module EmailCrawler
-  VERSION = "0.1.0"
+  VERSION = "0.1.1"
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: email_crawler
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - Cristian Rasch