email_crawler 0.0.13 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.ruby-version +1 -1
- data/README.md +6 -0
- data/bin/email-crawler +1 -1
- data/email_crawler.gemspec +2 -2
- data/lib/email_crawler.rb +6 -6
- data/lib/email_crawler/email_scanner.rb +15 -20
- data/lib/email_crawler/mechanize_helper.rb +1 -1
- data/lib/email_crawler/page_links.rb +1 -1
- data/lib/email_crawler/scraper.rb +28 -14
- data/lib/email_crawler/url_helper.rb +1 -0
- data/lib/email_crawler/version.rb +1 -1
- data/spec/lib/email_crawler/email_scanner_spec.rb +2 -2
- metadata +9 -10
- data/.env.example +0 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b158c0d7a48cac6a56c44b905b833934e3e88ee2
|
4
|
+
data.tar.gz: 9b13281878621e1679de17c30e5b13d4ea04bd98
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 589043168d0c1a9ad72dcf3e4d7f2d32194004e76cff744e9fd9ec7187cb8e4101d22f2122f6d236a586626cdd86588ee9a676cc4afe3e91e8624fca0a557d19
|
7
|
+
data.tar.gz: 1955a721beb4336cb87067460a923826e801701c7d40695f96fb73b41843ecbf58592efe7b4dbcf1053dd76ca61dfefd3a141e897e282db296323945e181c070
|
data/.gitignore
CHANGED
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.
|
1
|
+
2.2.3
|
data/README.md
CHANGED
@@ -44,6 +44,12 @@ email-crawler --query "berlin walks" --max-links 250
|
|
44
44
|
email-crawler --query "berlin walks" --concurrency 25
|
45
45
|
```
|
46
46
|
|
47
|
+
* Exclude certain domains from pages scanned for email addresses
|
48
|
+
|
49
|
+
```bash
|
50
|
+
email-crawler --query "berlin walks" --blacklist berlin.de --blacklist berlin.com
|
51
|
+
```
|
52
|
+
|
47
53
|
* Redirect output to a file
|
48
54
|
|
49
55
|
```bash
|
data/bin/email-crawler
CHANGED
data/email_crawler.gemspec
CHANGED
@@ -21,7 +21,7 @@ Gem::Specification.new do |spec|
|
|
21
21
|
spec.add_runtime_dependency "dotenv"
|
22
22
|
spec.add_runtime_dependency "thread_safe"
|
23
23
|
|
24
|
-
spec.add_development_dependency "bundler"
|
24
|
+
spec.add_development_dependency "bundler"
|
25
25
|
spec.add_development_dependency "rake"
|
26
|
-
spec.add_development_dependency "minitest", "~> 5.
|
26
|
+
spec.add_development_dependency "minitest", "~> 5.8"
|
27
27
|
end
|
data/lib/email_crawler.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
require "thread"
|
2
1
|
require "logger"
|
3
2
|
require "csv"
|
4
3
|
require "set"
|
@@ -14,7 +13,8 @@ module EmailCrawler
|
|
14
13
|
class Runner
|
15
14
|
MAX_CONCURRENCY = 50
|
16
15
|
|
17
|
-
attr_writer :max_results, :max_links, :max_concurrency, :logger,
|
16
|
+
attr_writer :max_results, :max_links, :max_concurrency, :logger,
|
17
|
+
:blacklisted_domains
|
18
18
|
|
19
19
|
def initialize(google_website)
|
20
20
|
@google_website = google_website
|
@@ -27,7 +27,7 @@ module EmailCrawler
|
|
27
27
|
blacklisted_domains: @blacklisted_domains).
|
28
28
|
search_result_urls_for(q)
|
29
29
|
urls.each { |url| logger.info "#{url}" }
|
30
|
-
queue = Queue.new
|
30
|
+
queue = Thread::Queue.new
|
31
31
|
urls.each { |url| queue.push(url) }
|
32
32
|
links_by_url = ThreadSafe::Array.new
|
33
33
|
|
@@ -62,7 +62,7 @@ module EmailCrawler
|
|
62
62
|
while arr
|
63
63
|
url, links = arr
|
64
64
|
logger.info "[Thread ##{i}] scanning for emails on page '#{url}' (#{links.length} links)"
|
65
|
-
emails = EmailScanner.new(
|
65
|
+
emails = EmailScanner.new(logger).scan(links)
|
66
66
|
emails_by_url[url] = emails
|
67
67
|
|
68
68
|
arr = begin
|
@@ -92,11 +92,11 @@ module EmailCrawler
|
|
92
92
|
end
|
93
93
|
end
|
94
94
|
|
95
|
-
|
95
|
+
private
|
96
96
|
|
97
97
|
def logger
|
98
98
|
@logger ||= begin
|
99
|
-
path = File.join(ENV["HOME"], "
|
99
|
+
path = File.join(ENV["HOME"], "email-crawler.log")
|
100
100
|
file = File.open(path, File::WRONLY | File::APPEND | File::CREAT)
|
101
101
|
logger = ::Logger.new(file).tap do |logger|
|
102
102
|
logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
|
@@ -3,31 +3,28 @@ require "open-uri"
|
|
3
3
|
module EmailCrawler
|
4
4
|
class EmailScanner
|
5
5
|
EMAIL_REGEXP = /\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,4}\b/i
|
6
|
-
SLEEP_TIME = 0.5
|
7
6
|
UTF_8 = "UTF-8".freeze
|
8
7
|
|
9
|
-
def initialize(
|
10
|
-
@
|
8
|
+
def initialize(logger = Logger.new("/dev/null"))
|
9
|
+
@logger = logger
|
11
10
|
end
|
12
11
|
|
13
12
|
def scan(links)
|
14
|
-
|
15
|
-
|
16
|
-
links.each do |link|
|
13
|
+
links.each_with_object({}) do |link, h|
|
17
14
|
@logger.info "searching for emails on '#{link}'.."
|
18
15
|
retried = false
|
19
16
|
|
20
17
|
html = begin
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
18
|
+
open(link).read
|
19
|
+
rescue OpenURI::HTTPError => err
|
20
|
+
@logger.warn(err)
|
21
|
+
nil
|
22
|
+
rescue => err
|
23
|
+
if err.message =~ /redirection forbidden/
|
24
|
+
link = err.message.split(" ").last
|
25
|
+
retry
|
26
|
+
end
|
27
|
+
end
|
31
28
|
next unless html
|
32
29
|
|
33
30
|
begin
|
@@ -42,11 +39,9 @@ module EmailCrawler
|
|
42
39
|
retry
|
43
40
|
end
|
44
41
|
end
|
45
|
-
emails_by_link[link] = Set.new(emails) unless emails.empty?
|
46
|
-
sleep(SLEEP_TIME)
|
47
|
-
end
|
48
42
|
|
49
|
-
|
43
|
+
h[link] = Set.new(emails) unless emails.empty?
|
44
|
+
end
|
50
45
|
end
|
51
46
|
end
|
52
47
|
end
|
@@ -6,7 +6,7 @@ module EmailCrawler
|
|
6
6
|
|
7
7
|
def new_agent
|
8
8
|
Thread.current[:agent] ||= Mechanize.new do |agent|
|
9
|
-
agent.user_agent_alias = "
|
9
|
+
agent.user_agent_alias = "Windows Mozilla"
|
10
10
|
agent.open_timeout = agent.read_timeout = READ_TIMEOUT
|
11
11
|
agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
12
12
|
agent.history.max_size = 1
|
@@ -1,3 +1,5 @@
|
|
1
|
+
require "set"
|
2
|
+
require_relative "mechanize_helper"
|
1
3
|
require_relative "url_helper"
|
2
4
|
|
3
5
|
module EmailCrawler
|
@@ -8,41 +10,53 @@ module EmailCrawler
|
|
8
10
|
include URLHelper
|
9
11
|
|
10
12
|
def initialize(google_website, max_results: MAX_RESULTS, blacklisted_domains: [])
|
11
|
-
@
|
13
|
+
@search_url = "https://www.#{google_website}/search?q="
|
12
14
|
@max_results = max_results
|
13
15
|
@blacklisted_domains = blacklisted_domains.map { |domain| /#{domain}\z/ }
|
14
16
|
end
|
15
17
|
|
16
18
|
def search_result_urls_for(q)
|
17
|
-
|
18
|
-
|
19
|
-
search_form.field_with(name: "q").value = q
|
20
|
-
search_results_page = agent.submit(search_form)
|
21
|
-
urls = search_results_on(search_results_page)
|
19
|
+
search_results_page = agent.get(@search_url + CGI.escape(q))
|
20
|
+
urls = Set.new(search_results_on(search_results_page))
|
22
21
|
|
23
22
|
page = 1
|
24
23
|
while urls.size < @max_results
|
25
24
|
next_page_link = search_results_page.link_with(href: /start=#{page*10}/)
|
26
|
-
|
25
|
+
break unless next_page_link
|
27
26
|
|
28
27
|
next_search_results_page = next_page_link.click
|
29
|
-
|
28
|
+
search_results_on(next_search_results_page).each do |url|
|
29
|
+
urls << url
|
30
|
+
end
|
31
|
+
|
30
32
|
page += 1
|
31
33
|
end
|
32
34
|
|
33
|
-
urls.first(@max_results)
|
35
|
+
urls.to_a.first(@max_results)
|
34
36
|
end
|
35
37
|
|
36
|
-
|
38
|
+
private
|
37
39
|
|
38
40
|
def search_results_on(page)
|
39
|
-
page.search("#search ol li h3.r a").
|
40
|
-
|
41
|
-
|
42
|
-
|
41
|
+
urls = page.search("#search ol li.g h3.r a").map do |a|
|
42
|
+
href = a[:href]
|
43
|
+
url = href =~ %r(/url\?q=) && $POSTMATCH
|
44
|
+
|
45
|
+
if url
|
46
|
+
url = url =~ /&sa=/ && $PREMATCH
|
47
|
+
CGI.unescape(url) if url
|
48
|
+
end
|
49
|
+
end
|
50
|
+
urls.compact!
|
51
|
+
|
52
|
+
unless @blacklisted_domains.empty?
|
53
|
+
urls.delete_if do |url|
|
43
54
|
domain = extract_domain_from(url)
|
44
55
|
@blacklisted_domains.any? { |blacklisted_domain| domain =~ blacklisted_domain }
|
45
56
|
end
|
57
|
+
end
|
58
|
+
|
59
|
+
urls
|
46
60
|
end
|
47
61
|
|
48
62
|
def agent
|
@@ -4,9 +4,9 @@ require File.expand_path("lib/email_crawler")
|
|
4
4
|
|
5
5
|
module EmailCrawler
|
6
6
|
describe EmailScanner do
|
7
|
-
subject { EmailScanner.new
|
7
|
+
subject { EmailScanner.new }
|
8
8
|
|
9
|
-
let(:link) { "
|
9
|
+
let(:link) { "https://www.mrosupply.com/page/plain/contact-us/" }
|
10
10
|
|
11
11
|
it "scans links for email addresses" do
|
12
12
|
emails_by_link = subject.scan([link])
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: email_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Cristian Rasch
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2015-10-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|
@@ -56,16 +56,16 @@ dependencies:
|
|
56
56
|
name: bundler
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- - "
|
59
|
+
- - ">="
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
61
|
+
version: '0'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- - "
|
66
|
+
- - ">="
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
68
|
+
version: '0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rake
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -86,14 +86,14 @@ dependencies:
|
|
86
86
|
requirements:
|
87
87
|
- - "~>"
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: 5.
|
89
|
+
version: '5.8'
|
90
90
|
type: :development
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
94
|
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
|
-
version: 5.
|
96
|
+
version: '5.8'
|
97
97
|
description:
|
98
98
|
email:
|
99
99
|
- cristianrasch@fastmail.fm
|
@@ -102,7 +102,6 @@ executables:
|
|
102
102
|
extensions: []
|
103
103
|
extra_rdoc_files: []
|
104
104
|
files:
|
105
|
-
- ".env.example"
|
106
105
|
- ".gitignore"
|
107
106
|
- ".ruby-version"
|
108
107
|
- Gemfile
|
@@ -142,7 +141,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
142
141
|
version: '0'
|
143
142
|
requirements: []
|
144
143
|
rubyforge_project:
|
145
|
-
rubygems_version: 2.
|
144
|
+
rubygems_version: 2.4.5.1
|
146
145
|
signing_key:
|
147
146
|
specification_version: 4
|
148
147
|
summary: 'Email crawler: crawls the top ten Google search results looking for email
|
data/.env.example
DELETED