email_crawler 0.0.13 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.ruby-version +1 -1
- data/README.md +6 -0
- data/bin/email-crawler +1 -1
- data/email_crawler.gemspec +2 -2
- data/lib/email_crawler.rb +6 -6
- data/lib/email_crawler/email_scanner.rb +15 -20
- data/lib/email_crawler/mechanize_helper.rb +1 -1
- data/lib/email_crawler/page_links.rb +1 -1
- data/lib/email_crawler/scraper.rb +28 -14
- data/lib/email_crawler/url_helper.rb +1 -0
- data/lib/email_crawler/version.rb +1 -1
- data/spec/lib/email_crawler/email_scanner_spec.rb +2 -2
- metadata +9 -10
- data/.env.example +0 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b158c0d7a48cac6a56c44b905b833934e3e88ee2
|
4
|
+
data.tar.gz: 9b13281878621e1679de17c30e5b13d4ea04bd98
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 589043168d0c1a9ad72dcf3e4d7f2d32194004e76cff744e9fd9ec7187cb8e4101d22f2122f6d236a586626cdd86588ee9a676cc4afe3e91e8624fca0a557d19
|
7
|
+
data.tar.gz: 1955a721beb4336cb87067460a923826e801701c7d40695f96fb73b41843ecbf58592efe7b4dbcf1053dd76ca61dfefd3a141e897e282db296323945e181c070
|
data/.gitignore
CHANGED
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.
|
1
|
+
2.2.3
|
data/README.md
CHANGED
@@ -44,6 +44,12 @@ email-crawler --query "berlin walks" --max-links 250
|
|
44
44
|
email-crawler --query "berlin walks" --concurrency 25
|
45
45
|
```
|
46
46
|
|
47
|
+
* Exclude certain domains from pages scanned for email addresses
|
48
|
+
|
49
|
+
```bash
|
50
|
+
email-crawler --query "berlin walks" --blacklist berlin.de --blacklist berlin.com
|
51
|
+
```
|
52
|
+
|
47
53
|
* Redirect output to a file
|
48
54
|
|
49
55
|
```bash
|
data/bin/email-crawler
CHANGED
data/email_crawler.gemspec
CHANGED
@@ -21,7 +21,7 @@ Gem::Specification.new do |spec|
|
|
21
21
|
spec.add_runtime_dependency "dotenv"
|
22
22
|
spec.add_runtime_dependency "thread_safe"
|
23
23
|
|
24
|
-
spec.add_development_dependency "bundler"
|
24
|
+
spec.add_development_dependency "bundler"
|
25
25
|
spec.add_development_dependency "rake"
|
26
|
-
spec.add_development_dependency "minitest", "~> 5.
|
26
|
+
spec.add_development_dependency "minitest", "~> 5.8"
|
27
27
|
end
|
data/lib/email_crawler.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
require "thread"
|
2
1
|
require "logger"
|
3
2
|
require "csv"
|
4
3
|
require "set"
|
@@ -14,7 +13,8 @@ module EmailCrawler
|
|
14
13
|
class Runner
|
15
14
|
MAX_CONCURRENCY = 50
|
16
15
|
|
17
|
-
attr_writer :max_results, :max_links, :max_concurrency, :logger,
|
16
|
+
attr_writer :max_results, :max_links, :max_concurrency, :logger,
|
17
|
+
:blacklisted_domains
|
18
18
|
|
19
19
|
def initialize(google_website)
|
20
20
|
@google_website = google_website
|
@@ -27,7 +27,7 @@ module EmailCrawler
|
|
27
27
|
blacklisted_domains: @blacklisted_domains).
|
28
28
|
search_result_urls_for(q)
|
29
29
|
urls.each { |url| logger.info "#{url}" }
|
30
|
-
queue = Queue.new
|
30
|
+
queue = Thread::Queue.new
|
31
31
|
urls.each { |url| queue.push(url) }
|
32
32
|
links_by_url = ThreadSafe::Array.new
|
33
33
|
|
@@ -62,7 +62,7 @@ module EmailCrawler
|
|
62
62
|
while arr
|
63
63
|
url, links = arr
|
64
64
|
logger.info "[Thread ##{i}] scanning for emails on page '#{url}' (#{links.length} links)"
|
65
|
-
emails = EmailScanner.new(
|
65
|
+
emails = EmailScanner.new(logger).scan(links)
|
66
66
|
emails_by_url[url] = emails
|
67
67
|
|
68
68
|
arr = begin
|
@@ -92,11 +92,11 @@ module EmailCrawler
|
|
92
92
|
end
|
93
93
|
end
|
94
94
|
|
95
|
-
|
95
|
+
private
|
96
96
|
|
97
97
|
def logger
|
98
98
|
@logger ||= begin
|
99
|
-
path = File.join(ENV["HOME"], "
|
99
|
+
path = File.join(ENV["HOME"], "email-crawler.log")
|
100
100
|
file = File.open(path, File::WRONLY | File::APPEND | File::CREAT)
|
101
101
|
logger = ::Logger.new(file).tap do |logger|
|
102
102
|
logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
|
data/lib/email_crawler/email_scanner.rb
CHANGED
@@ -3,31 +3,28 @@ require "open-uri"
|
|
3
3
|
module EmailCrawler
|
4
4
|
class EmailScanner
|
5
5
|
EMAIL_REGEXP = /\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,4}\b/i
|
6
|
-
SLEEP_TIME = 0.5
|
7
6
|
UTF_8 = "UTF-8".freeze
|
8
7
|
|
9
|
-
def initialize(
|
10
|
-
@
|
8
|
+
def initialize(logger = Logger.new("/dev/null"))
|
9
|
+
@logger = logger
|
11
10
|
end
|
12
11
|
|
13
12
|
def scan(links)
|
14
|
-
|
15
|
-
|
16
|
-
links.each do |link|
|
13
|
+
links.each_with_object({}) do |link, h|
|
17
14
|
@logger.info "searching for emails on '#{link}'.."
|
18
15
|
retried = false
|
19
16
|
|
20
17
|
html = begin
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
18
|
+
open(link).read
|
19
|
+
rescue OpenURI::HTTPError => err
|
20
|
+
@logger.warn(err)
|
21
|
+
nil
|
22
|
+
rescue => err
|
23
|
+
if err.message =~ /redirection forbidden/
|
24
|
+
link = err.message.split(" ").last
|
25
|
+
retry
|
26
|
+
end
|
27
|
+
end
|
31
28
|
next unless html
|
32
29
|
|
33
30
|
begin
|
@@ -42,11 +39,9 @@ module EmailCrawler
|
|
42
39
|
retry
|
43
40
|
end
|
44
41
|
end
|
45
|
-
emails_by_link[link] = Set.new(emails) unless emails.empty?
|
46
|
-
sleep(SLEEP_TIME)
|
47
|
-
end
|
48
42
|
|
49
|
-
|
43
|
+
h[link] = Set.new(emails) unless emails.empty?
|
44
|
+
end
|
50
45
|
end
|
51
46
|
end
|
52
47
|
end
|
data/lib/email_crawler/mechanize_helper.rb
CHANGED
@@ -6,7 +6,7 @@ module EmailCrawler
|
|
6
6
|
|
7
7
|
def new_agent
|
8
8
|
Thread.current[:agent] ||= Mechanize.new do |agent|
|
9
|
-
agent.user_agent_alias = "
|
9
|
+
agent.user_agent_alias = "Windows Mozilla"
|
10
10
|
agent.open_timeout = agent.read_timeout = READ_TIMEOUT
|
11
11
|
agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
12
12
|
agent.history.max_size = 1
|
data/lib/email_crawler/scraper.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require "set"
|
2
|
+
require_relative "mechanize_helper"
|
1
3
|
require_relative "url_helper"
|
2
4
|
|
3
5
|
module EmailCrawler
|
@@ -8,41 +10,53 @@ module EmailCrawler
|
|
8
10
|
include URLHelper
|
9
11
|
|
10
12
|
def initialize(google_website, max_results: MAX_RESULTS, blacklisted_domains: [])
|
11
|
-
@
|
13
|
+
@search_url = "https://www.#{google_website}/search?q="
|
12
14
|
@max_results = max_results
|
13
15
|
@blacklisted_domains = blacklisted_domains.map { |domain| /#{domain}\z/ }
|
14
16
|
end
|
15
17
|
|
16
18
|
def search_result_urls_for(q)
|
17
|
-
|
18
|
-
|
19
|
-
search_form.field_with(name: "q").value = q
|
20
|
-
search_results_page = agent.submit(search_form)
|
21
|
-
urls = search_results_on(search_results_page)
|
19
|
+
search_results_page = agent.get(@search_url + CGI.escape(q))
|
20
|
+
urls = Set.new(search_results_on(search_results_page))
|
22
21
|
|
23
22
|
page = 1
|
24
23
|
while urls.size < @max_results
|
25
24
|
next_page_link = search_results_page.link_with(href: /start=#{page*10}/)
|
26
|
-
|
25
|
+
break unless next_page_link
|
27
26
|
|
28
27
|
next_search_results_page = next_page_link.click
|
29
|
-
|
28
|
+
search_results_on(next_search_results_page).each do |url|
|
29
|
+
urls << url
|
30
|
+
end
|
31
|
+
|
30
32
|
page += 1
|
31
33
|
end
|
32
34
|
|
33
|
-
urls.first(@max_results)
|
35
|
+
urls.to_a.first(@max_results)
|
34
36
|
end
|
35
37
|
|
36
|
-
|
38
|
+
private
|
37
39
|
|
38
40
|
def search_results_on(page)
|
39
|
-
page.search("#search ol li h3.r a").
|
40
|
-
|
41
|
-
|
42
|
-
|
41
|
+
urls = page.search("#search ol li.g h3.r a").map do |a|
|
42
|
+
href = a[:href]
|
43
|
+
url = href =~ %r(/url\?q=) && $POSTMATCH
|
44
|
+
|
45
|
+
if url
|
46
|
+
url = url =~ /&sa=/ && $PREMATCH
|
47
|
+
CGI.unescape(url) if url
|
48
|
+
end
|
49
|
+
end
|
50
|
+
urls.compact!
|
51
|
+
|
52
|
+
unless @blacklisted_domains.empty?
|
53
|
+
urls.delete_if do |url|
|
43
54
|
domain = extract_domain_from(url)
|
44
55
|
@blacklisted_domains.any? { |blacklisted_domain| domain =~ blacklisted_domain }
|
45
56
|
end
|
57
|
+
end
|
58
|
+
|
59
|
+
urls
|
46
60
|
end
|
47
61
|
|
48
62
|
def agent
|
data/spec/lib/email_crawler/email_scanner_spec.rb
CHANGED
@@ -4,9 +4,9 @@ require File.expand_path("lib/email_crawler")
|
|
4
4
|
|
5
5
|
module EmailCrawler
|
6
6
|
describe EmailScanner do
|
7
|
-
subject { EmailScanner.new
|
7
|
+
subject { EmailScanner.new }
|
8
8
|
|
9
|
-
let(:link) { "
|
9
|
+
let(:link) { "https://www.mrosupply.com/page/plain/contact-us/" }
|
10
10
|
|
11
11
|
it "scans links for email addresses" do
|
12
12
|
emails_by_link = subject.scan([link])
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: email_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.13
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Cristian Rasch
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2015-10-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|
@@ -56,16 +56,16 @@ dependencies:
|
|
56
56
|
name: bundler
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- - "
|
59
|
+
- - ">="
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
61
|
+
version: '0'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- - "
|
66
|
+
- - ">="
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
68
|
+
version: '0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rake
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -86,14 +86,14 @@ dependencies:
|
|
86
86
|
requirements:
|
87
87
|
- - "~>"
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: 5.
|
89
|
+
version: '5.8'
|
90
90
|
type: :development
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
94
|
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
|
-
version: 5.
|
96
|
+
version: '5.8'
|
97
97
|
description:
|
98
98
|
email:
|
99
99
|
- cristianrasch@fastmail.fm
|
@@ -102,7 +102,6 @@ executables:
|
|
102
102
|
extensions: []
|
103
103
|
extra_rdoc_files: []
|
104
104
|
files:
|
105
|
-
- ".env.example"
|
106
105
|
- ".gitignore"
|
107
106
|
- ".ruby-version"
|
108
107
|
- Gemfile
|
@@ -142,7 +141,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
142
141
|
version: '0'
|
143
142
|
requirements: []
|
144
143
|
rubyforge_project:
|
145
|
-
rubygems_version: 2.
|
144
|
+
rubygems_version: 2.4.5.1
|
146
145
|
signing_key:
|
147
146
|
specification_version: 4
|
148
147
|
summary: 'Email crawler: crawls the top ten Google search results looking for email
|
data/.env.example
DELETED