email_crawler 0.0.12 → 0.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -3
- data/lib/email_crawler/mechanize_helper.rb +14 -3
- data/lib/email_crawler/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6d6abc9f4ac1c4c0323addb2123d8795aa268cb2
|
4
|
+
data.tar.gz: e835dcd5d5b52bc30ec55831503dfa2f354a4d76
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 12e7baf2369874c8f759f47d54fbebf3bdcea67c2d4065dda0e461b2f052e9a97982323d3abbc2e0d06c7c28ac1f486854c64193e0cacb12d19074821dcd03aa
|
7
|
+
data.tar.gz: b89ff4b54670fb2ef27dadefbb25476aaa6fe27b5f9374cadaf466e619e130e0c7270a6aa7ebf41dcfaac4774536f7edf18ddf8d4d39fb1caf2aaa086703ee1a
|
data/README.md
CHANGED
@@ -1,12 +1,10 @@
|
|
1
1
|
# EmailCrawler
|
2
2
|
|
3
|
-
Email crawler: crawls the top
|
3
|
+
Email crawler: crawls the top N Google search results looking for email addresses and exports them to CSV.
|
4
4
|
|
5
5
|
## Installation
|
6
6
|
|
7
7
|
$ gem install email_crawler
|
8
|
-
$ cp .env.example .env
|
9
|
-
# set your Digital Ocean credentials (@see lib/email_crawler/proxy.rb for more details)
|
10
8
|
|
11
9
|
## Usage
|
12
10
|
|
@@ -2,10 +2,12 @@ require "mechanize"
|
|
2
2
|
|
3
3
|
module EmailCrawler
|
4
4
|
module MechanizeHelper
|
5
|
+
READ_TIMEOUT = 15
|
6
|
+
|
5
7
|
def new_agent
|
6
8
|
Thread.current[:agent] ||= Mechanize.new do |agent|
|
7
9
|
agent.user_agent_alias = "Mac Safari"
|
8
|
-
agent.open_timeout = agent.read_timeout =
|
10
|
+
agent.open_timeout = agent.read_timeout = READ_TIMEOUT
|
9
11
|
agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
10
12
|
agent.history.max_size = 1
|
11
13
|
yield(agent) if block_given?
|
@@ -16,10 +18,19 @@ module EmailCrawler
|
|
16
18
|
retried = false
|
17
19
|
|
18
20
|
begin
|
19
|
-
page =
|
21
|
+
page = begin
|
22
|
+
Timeout::timeout(READ_TIMEOUT) do
|
23
|
+
agent.get(url)
|
24
|
+
end
|
25
|
+
rescue Timeout::Error
|
26
|
+
unless retried
|
27
|
+
retried = true
|
28
|
+
retry
|
29
|
+
end
|
30
|
+
end
|
20
31
|
page if page.is_a?(Mechanize::Page)
|
21
32
|
rescue Mechanize::Error;
|
22
|
-
rescue SocketError
|
33
|
+
rescue SocketError
|
23
34
|
unless retried
|
24
35
|
retried = true
|
25
36
|
retry
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: email_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.12
|
4
|
+
version: 0.0.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Cristian Rasch
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-03-
|
11
|
+
date: 2014-03-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|