email_crawler 0.0.12 → 0.0.13

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8fe6913140fc9048bb7fdfc21f6e4b5557953cf0
4
- data.tar.gz: 3092d7b6880ed7e813cbbf4a34cf809e66847dea
3
+ metadata.gz: 6d6abc9f4ac1c4c0323addb2123d8795aa268cb2
4
+ data.tar.gz: e835dcd5d5b52bc30ec55831503dfa2f354a4d76
5
5
  SHA512:
6
- metadata.gz: f64caaf2f11217c196a806fc8014e03ca82900b2aa352e2aa5076ff5348a9444c6fecf265e97fb39cf9f5d9132aed0f89b78dbd2245ce3dea8380d14c7eec14d
7
- data.tar.gz: e16e7746cbe5ce00d53826b09b3e67506ecbb8252add8c1f81aadf111263c63741ab3e63b8d76186ef67a89318679e8c8fb0077964e3cf2de8fa882899f15ab7
6
+ metadata.gz: 12e7baf2369874c8f759f47d54fbebf3bdcea67c2d4065dda0e461b2f052e9a97982323d3abbc2e0d06c7c28ac1f486854c64193e0cacb12d19074821dcd03aa
7
+ data.tar.gz: b89ff4b54670fb2ef27dadefbb25476aaa6fe27b5f9374cadaf466e619e130e0c7270a6aa7ebf41dcfaac4774536f7edf18ddf8d4d39fb1caf2aaa086703ee1a
data/README.md CHANGED
@@ -1,12 +1,10 @@
1
1
  # EmailCrawler
2
2
 
3
- Email crawler: crawls the top ten Google search results looking for email addresses and exports them to CSV.
3
+ Email crawler: crawls the top N Google search results looking for email addresses and exports them to CSV.
4
4
 
5
5
  ## Installation
6
6
 
7
7
  $ gem install email_crawler
8
- $ cp .env.example .env
9
- # set your Digital Ocean credentials (@see lib/email_crawler/proxy.rb for more details)
10
8
 
11
9
  ## Usage
12
10
 
@@ -2,10 +2,12 @@ require "mechanize"
2
2
 
3
3
  module EmailCrawler
4
4
  module MechanizeHelper
5
+ READ_TIMEOUT = 15
6
+
5
7
  def new_agent
6
8
  Thread.current[:agent] ||= Mechanize.new do |agent|
7
9
  agent.user_agent_alias = "Mac Safari"
8
- agent.open_timeout = agent.read_timeout = 30
10
+ agent.open_timeout = agent.read_timeout = READ_TIMEOUT
9
11
  agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
10
12
  agent.history.max_size = 1
11
13
  yield(agent) if block_given?
@@ -16,10 +18,19 @@ module EmailCrawler
16
18
  retried = false
17
19
 
18
20
  begin
19
- page = agent.get(url)
21
+ page = begin
22
+ Timeout::timeout(READ_TIMEOUT) do
23
+ agent.get(url)
24
+ end
25
+ rescue Timeout::Error
26
+ unless retried
27
+ retried = true
28
+ retry
29
+ end
30
+ end
20
31
  page if page.is_a?(Mechanize::Page)
21
32
  rescue Mechanize::Error;
22
- rescue SocketError, Net::OpenTimeout
33
+ rescue SocketError
23
34
  unless retried
24
35
  retried = true
25
36
  retry
@@ -1,3 +1,3 @@
1
1
  module EmailCrawler
2
- VERSION = "0.0.12"
2
+ VERSION = "0.0.13"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: email_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.12
4
+ version: 0.0.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - Cristian Rasch
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-03-21 00:00:00.000000000 Z
11
+ date: 2014-03-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize