email_crawler 0.0.12 → 0.0.13

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 8fe6913140fc9048bb7fdfc21f6e4b5557953cf0
-  data.tar.gz: 3092d7b6880ed7e813cbbf4a34cf809e66847dea
+  metadata.gz: 6d6abc9f4ac1c4c0323addb2123d8795aa268cb2
+  data.tar.gz: e835dcd5d5b52bc30ec55831503dfa2f354a4d76
 SHA512:
-  metadata.gz: f64caaf2f11217c196a806fc8014e03ca82900b2aa352e2aa5076ff5348a9444c6fecf265e97fb39cf9f5d9132aed0f89b78dbd2245ce3dea8380d14c7eec14d
-  data.tar.gz: e16e7746cbe5ce00d53826b09b3e67506ecbb8252add8c1f81aadf111263c63741ab3e63b8d76186ef67a89318679e8c8fb0077964e3cf2de8fa882899f15ab7
+  metadata.gz: 12e7baf2369874c8f759f47d54fbebf3bdcea67c2d4065dda0e461b2f052e9a97982323d3abbc2e0d06c7c28ac1f486854c64193e0cacb12d19074821dcd03aa
+  data.tar.gz: b89ff4b54670fb2ef27dadefbb25476aaa6fe27b5f9374cadaf466e619e130e0c7270a6aa7ebf41dcfaac4774536f7edf18ddf8d4d39fb1caf2aaa086703ee1a
data/README.md CHANGED
@@ -1,12 +1,10 @@
 # EmailCrawler
 
-Email crawler: crawls the top ten Google search results looking for email addresses and exports them to CSV.
+Email crawler: crawls the top N Google search results looking for email addresses and exports them to CSV.
 
 ## Installation
 
     $ gem install email_crawler
-    $ cp .env.example .env
-    # set your Digital Ocean credentials (@see lib/email_crawler/proxy.rb for more details)
 
 ## Usage
 
@@ -2,10 +2,12 @@ require "mechanize"
 
 module EmailCrawler
   module MechanizeHelper
+    READ_TIMEOUT = 15
+
     def new_agent
       Thread.current[:agent] ||= Mechanize.new do |agent|
         agent.user_agent_alias = "Mac Safari"
-        agent.open_timeout = agent.read_timeout = 30
+        agent.open_timeout = agent.read_timeout = READ_TIMEOUT
         agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
         agent.history.max_size = 1
         yield(agent) if block_given?
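
The hunk above extracts the timeout into a READ_TIMEOUT constant (and lowers it from 30 to 15 seconds). Because new_agent memoizes the agent per thread and only yields it when it is first built, callers can still override these defaults through the block. A minimal usage sketch; the specific override shown is illustrative, not taken from the gem:

    # Assumes the helper has already been loaded via the gem.
    include EmailCrawler::MechanizeHelper

    # The block runs only once, when the per-thread agent is first built,
    # so overrides here win over the defaults shown in the hunk above.
    agent = new_agent { |a| a.read_timeout = 5 }  # override is illustrative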
@@ -16,10 +18,19 @@ module EmailCrawler
       retried = false
 
       begin
-        page = agent.get(url)
+        page = begin
+          Timeout::timeout(READ_TIMEOUT) do
+            agent.get(url)
+          end
+        rescue Timeout::Error
+          unless retried
+            retried = true
+            retry
+          end
+        end
         page if page.is_a?(Mechanize::Page)
       rescue Mechanize::Error;
-      rescue SocketError, Net::OpenTimeout
+      rescue SocketError
         unless retried
           retried = true
           retry
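
Read together, this hunk wraps the Mechanize call in an explicit Timeout and reuses the single-retry flag for Timeout::Error, while the separate rescue for Net::OpenTimeout is dropped (presumably because Net::OpenTimeout subclasses Timeout::Error and is now caught by the inner rescue). The sketch below is a rough reconstruction of how the fetch helper might read after this change; the method name, its signature, and the origin of `agent` are assumptions, since only the hunk bodies above appear in the diff:

    require "timeout"
    require "mechanize"

    module EmailCrawler
      module MechanizeHelper
        READ_TIMEOUT = 15

        # From the first hunk: one memoized Mechanize agent per thread.
        def new_agent
          Thread.current[:agent] ||= Mechanize.new do |agent|
            agent.user_agent_alias = "Mac Safari"
            agent.open_timeout = agent.read_timeout = READ_TIMEOUT
            agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
            agent.history.max_size = 1
            yield(agent) if block_given?
          end
        end

        # Hypothetical reconstruction -- the name, signature, and default
        # argument are guesses; only the body lines shown in the hunks
        # above come from the actual diff.
        def get(url, agent = new_agent)
          retried = false

          begin
            page = begin
              # Hard cap on the whole request, on top of Mechanize's own
              # open/read timeouts, retried at most once.
              Timeout.timeout(READ_TIMEOUT) { agent.get(url) }
            rescue Timeout::Error
              unless retried
                retried = true
                retry
              end
            end
            page if page.is_a?(Mechanize::Page)
          rescue Mechanize::Error
            # swallowed: a failed fetch simply returns nil
          rescue SocketError
            unless retried
              retried = true
              retry
            end
          end
        end
      end
    end

The likely motivation: Mechanize's read_timeout only bounds individual socket reads, so a slowly dripping response can hang well past 15 seconds, whereas the outer Timeout.timeout caps the request as a whole.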
@@ -1,3 +1,3 @@
 module EmailCrawler
-  VERSION = "0.0.12"
+  VERSION = "0.0.13"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: email_crawler
 version: !ruby/object:Gem::Version
-  version: 0.0.12
+  version: 0.0.13
 platform: ruby
 authors:
 - Cristian Rasch
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-03-21 00:00:00.000000000 Z
+date: 2014-03-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize