email_crawler 0.0.12 → 0.0.13
- checksums.yaml +4 -4
- data/README.md +1 -3
- data/lib/email_crawler/mechanize_helper.rb +14 -3
- data/lib/email_crawler/version.rb +1 -1
- metadata +2 -2
checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6d6abc9f4ac1c4c0323addb2123d8795aa268cb2
+  data.tar.gz: e835dcd5d5b52bc30ec55831503dfa2f354a4d76
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 12e7baf2369874c8f759f47d54fbebf3bdcea67c2d4065dda0e461b2f052e9a97982323d3abbc2e0d06c7c28ac1f486854c64193e0cacb12d19074821dcd03aa
+  data.tar.gz: b89ff4b54670fb2ef27dadefbb25476aaa6fe27b5f9374cadaf466e619e130e0c7270a6aa7ebf41dcfaac4774536f7edf18ddf8d4d39fb1caf2aaa086703ee1a
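The checksums.yaml bundled inside a .gem records SHA1 and SHA512 digests for the two archives that make up the package, metadata.gz and data.tar.gz. Below is a minimal Ruby sketch of how those digests could be re-checked against a locally fetched copy; the file name email_crawler-0.0.13.gem and the checksums.yaml.gz entry name are assumptions about the package layout, not part of this diff.

```ruby
require "digest"
require "rubygems/package"
require "stringio"
require "yaml"
require "zlib"

gem_path = "email_crawler-0.0.13.gem" # e.g. fetched with `gem fetch email_crawler -v 0.0.13`
actual   = {}
expected = nil

File.open(gem_path, "rb") do |io|
  Gem::Package::TarReader.new(io) do |tar|
    tar.each do |entry|
      case entry.full_name
      when "metadata.gz", "data.tar.gz"
        body = entry.read
        actual[entry.full_name] = {
          "SHA1"   => Digest::SHA1.hexdigest(body),
          "SHA512" => Digest::SHA512.hexdigest(body),
        }
      when "checksums.yaml.gz"
        expected = YAML.load(Zlib::GzipReader.new(StringIO.new(entry.read)).read)
      end
    end
  end
end

# checksums.yaml maps algorithm => { file => digest }, as shown in the diff above
expected.each do |algo, files|
  files.each do |name, digest|
    puts "#{algo} #{name}: #{actual[name][algo] == digest ? 'OK' : 'MISMATCH'}"
  end
end
```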
data/README.md CHANGED

@@ -1,12 +1,10 @@
 # EmailCrawler
 
-Email crawler: crawls the top
+Email crawler: crawls the top N Google search results looking for email addresses and exports them to CSV.
 
 ## Installation
 
     $ gem install email_crawler
-    $ cp .env.example .env
-    # set your Digital Ocean credentials (@see lib/email_crawler/proxy.rb for more details)
 
 ## Usage
 
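For context on what the updated description promises, here is an illustrative Ruby sketch (not the gem's own code or CLI) of the general idea: fetch a handful of pages, scan them for email addresses, and write the results to CSV. The URL list, regexp, and output file name are all made up for the example.

```ruby
require "csv"
require "mechanize"

EMAIL_RE = /[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}/

urls  = %w[https://example.com https://example.org] # stand-ins for search results
agent = Mechanize.new { |a| a.user_agent_alias = "Mac Safari" }

# Collect unique email-looking strings from each page body,
# ignoring pages that fail to load.
emails = urls.flat_map do |url|
  begin
    agent.get(url).body.scan(EMAIL_RE)
  rescue Mechanize::Error, SocketError
    []
  end
end.uniq

CSV.open("emails.csv", "w") do |csv|
  csv << %w[email]
  emails.each { |email| csv << [email] }
end
```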
data/lib/email_crawler/mechanize_helper.rb CHANGED

@@ -2,10 +2,12 @@ require "mechanize"
 
 module EmailCrawler
   module MechanizeHelper
+    READ_TIMEOUT = 15
+
     def new_agent
       Thread.current[:agent] ||= Mechanize.new do |agent|
         agent.user_agent_alias = "Mac Safari"
-        agent.open_timeout = agent.read_timeout =
+        agent.open_timeout = agent.read_timeout = READ_TIMEOUT
         agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
         agent.history.max_size = 1
         yield(agent) if block_given?
@@ -16,10 +18,19 @@ module EmailCrawler
       retried = false
 
       begin
-        page =
+        page = begin
+          Timeout::timeout(READ_TIMEOUT) do
+            agent.get(url)
+          end
+        rescue Timeout::Error
+          unless retried
+            retried = true
+            retry
+          end
+        end
         page if page.is_a?(Mechanize::Page)
       rescue Mechanize::Error;
-      rescue SocketError
+      rescue SocketError
         unless retried
           retried = true
           retry
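The change above adds a READ_TIMEOUT constant (15 seconds), applies it to Mechanize's open/read timeouts, and wraps agent.get(url) in Timeout::timeout with a single retry on Timeout::Error, mirroring the existing retry-once handling of SocketError. A self-contained sketch of that pattern follows; the fetch_page wrapper name is invented for illustration, and the error handling is slightly simplified.

```ruby
require "mechanize"
require "timeout"

READ_TIMEOUT = 15 # seconds, as introduced in the diff

# Fetch a URL, giving the request READ_TIMEOUT seconds and retrying once
# if it times out; returns nil for non-HTML responses or persistent errors.
def fetch_page(agent, url)
  retried = false

  page = begin
    Timeout::timeout(READ_TIMEOUT) { agent.get(url) }
  rescue Timeout::Error
    unless retried
      retried = true
      retry # re-runs the inner begin block exactly once
    end
  end

  page if page.is_a?(Mechanize::Page)
rescue Mechanize::Error, SocketError
  nil
end

agent = Mechanize.new do |a|
  a.user_agent_alias = "Mac Safari"
  a.open_timeout = a.read_timeout = READ_TIMEOUT
end

page = fetch_page(agent, "https://example.com")
puts page.title if page
```

One caveat worth noting: Timeout::timeout interrupts the block with an exception, so it bounds the whole request rather than only the socket read, which is why it complements rather than replaces the agent's own open/read timeouts.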
metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: email_crawler
 version: !ruby/object:Gem::Version
-  version: 0.0.12
+  version: 0.0.13
 platform: ruby
 authors:
 - Cristian Rasch
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-03-
+date: 2014-03-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
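Once 0.0.13 is installed, the bumped version and release date recorded in the gemspec above can be confirmed from Ruby; this short check assumes the gem is installed locally.

```ruby
require "rubygems"

# Look up the installed gem's specification and print the fields bumped in this release.
spec = Gem::Specification.find_by_name("email_crawler")
puts spec.version # expected: 0.0.13
puts spec.date    # expected: 2014-03-28 00:00:00 UTC
```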