email_crawler 0.0.2
- checksums.yaml +7 -0
- data/.env.example +2 -0
- data/.gitignore +19 -0
- data/.ruby-version +1 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +57 -0
- data/Rakefile +9 -0
- data/bin/email-crawler +47 -0
- data/email_crawler.gemspec +26 -0
- data/lib/email_crawler/email_scanner.rb +42 -0
- data/lib/email_crawler/mechanize_helper.rb +22 -0
- data/lib/email_crawler/page_links.rb +60 -0
- data/lib/email_crawler/proxy.rb +28 -0
- data/lib/email_crawler/scraper.rb +31 -0
- data/lib/email_crawler/version.rb +3 -0
- data/lib/email_crawler.rb +69 -0
- data/spec/lib/email_crawler/email_scanner_spec.rb +16 -0
- data/spec/lib/email_crawler/page_links_spec.rb +13 -0
- data/spec/lib/email_crawler/scraper_spec.rb +13 -0
- data/spec/spec_helper.rb +2 -0
- metadata +140 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 62e4b0ccd69dee8523ebdbb12e3a14c6820b7592
+  data.tar.gz: dc1c7941ed58d5de961ad7bcad9f95af16fb164f
+SHA512:
+  metadata.gz: a71df8fb24765c8acafef1da841b1dfc5c4b7ffdb1f429fc0ffddc375f2ef28107fe353eb4deb352b697d84584c666adfab4370d635487305d38d64a1f5cb1d3
+  data.tar.gz: 43de8cb5b8fa2456ffed8a3e08c04e580079170eb47593587e5abee57103ba6c0bbde70fc62bd0d86a90cce5fb2b11c76ba7c5a2c7357b1a29f9fd6ba479a770
data/.env.example
ADDED
data/.gitignore
ADDED
data/.ruby-version
ADDED
@@ -0,0 +1 @@
+2.1.0
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
+Copyright (c) 2014 Cristian Rasch
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,57 @@
+# EmailCrawler
+
+Email crawler: crawls the top ten Google search results looking for email addresses and exports them to CSV.
+
+## Installation
+
+Add this line to your application's Gemfile:
+
+    gem 'email_crawler'
+
+And then execute:
+
+    $ bundle
+
+Or install it yourself as:
+
+    $ gem install email_crawler
+
+## Usage
+
+1. Ask for help
+
+```bash
+email-crawler --help
+```
+
+2. Simplest Google search
+
+```bash
+email-crawler -q "berlin walks"
+```
+
+3. Select which Google website to use (defaults to google.com.br)
+
+```bash
+email-crawler -q "berlin walks" -g google.de
+```
+
+4. Specify how many internal links are to be scanned for email addresses (defaults to 100)
+
+```bash
+email-crawler -q "berlin walks" -g google.de -m 250
+```
+
+5. Redirect output to a file
+
+```bash
+email-crawler -q "berlin walks" -g google.de -m 250 > ~/Desktop/belin-walks-emails.csv
+```
+
+## Contributing
+
+1. Fork it ( http://github.com/cristianrasch/email_crawler/fork )
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request
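The README covers only the command line; judging from bin/email-crawler further down, the same crawl can also be driven from Ruby. A minimal sketch, assuming the gem is installed and the 0.0.2 class names are unchanged:

```ruby
require "email_crawler"

# Search google.de for the query, follow up to 250 internal links per
# result, and write the resulting CSV (Email/Domain/URL columns) to disk.
csv = EmailCrawler::Runner.new("google.de").run("berlin walks", 250)
File.write("berlin-walks-emails.csv", csv)
```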
data/Rakefile
ADDED
data/bin/email-crawler
ADDED
@@ -0,0 +1,47 @@
+#!/usr/bin/env ruby
+
+require 'optparse'
+require 'ostruct'
+
+class OptionsParser
+  def self.parse(args)
+    options = OpenStruct.new
+    options.google_website = "google.com.br"
+    options.max_links = 100
+
+    opt_parser = OptionParser.new do |opts|
+      opts.banner = "Usage: email-crawler [options]"
+      opts.separator ""
+
+      opts.on("-q", '--query "SEARCH TERM/EXPRESSION"',
+              "The term/expression you want to search for") do |q|
+        options.q = q
+      end
+
+      opts.on("-g", "--google-website google.com.au",
+              "An alternative Google website",
+              " (defaults to Google Brazil)") do |google_website|
+        options.google_website = google_website
+      end
+
+      opts.on("-m", "--max-links 250",
+              "Max # of internal links to visit searching for emails",
+              " (per search result, defaults to 100)") do |max_links|
+        options.max_links = max_links.to_i
+      end
+    end
+
+    opt_parser.parse!(args)
+    options
+  end
+end
+
+options = OptionsParser.parse(ARGV)
+if options.q.empty?
+  print "The -q switch is mandatory\n"
+  exit(1)
+else
+  require_relative "../lib/email_crawler"
+  csv = EmailCrawler::Runner.new(options.google_website).run(options.q, options.max_links)
+  $stdout << "#{csv}\n"
+end
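One thing to note about the executable above: `options.q` is never given a default, so when `-q` is omitted entirely it is `nil` and `nil.empty?` raises `NoMethodError` instead of printing the usage hint. A nil-safe guard along these lines (an illustrative tweak, not what ships in 0.0.2) would fail cleanly:

```ruby
options = OptionsParser.parse(ARGV)
# OpenStruct returns nil for unset fields, so coerce before checking.
if options.q.to_s.strip.empty?
  print "The -q switch is mandatory\n"
  exit(1)
end
```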
data/email_crawler.gemspec
ADDED
@@ -0,0 +1,26 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'email_crawler/version'
+
+Gem::Specification.new do |spec|
+  spec.name = "email_crawler"
+  spec.version = EmailCrawler::VERSION
+  spec.authors = ["Cristian Rasch"]
+  spec.email = ["cristianrasch@fastmail.fm"]
+  spec.summary = %q{Email crawler: crawls the top ten Google search results looking for email addresses and exports them to CSV.}
+  spec.homepage = "https://github.com/cristianrasch/email_crawler"
+  spec.license = "MIT"
+
+  spec.files = `git ls-files -z`.split("\x0")
+  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+
+  spec.add_runtime_dependency "mechanize"
+  spec.add_runtime_dependency "dotenv"
+
+  spec.add_development_dependency "bundler", "~> 1.5"
+  spec.add_development_dependency "rake"
+  spec.add_development_dependency "minitest", "~> 5.2.3"
+end
data/lib/email_crawler/email_scanner.rb
ADDED
@@ -0,0 +1,42 @@
+require "open-uri"
+
+module EmailCrawler
+  class EmailScanner
+    EMAIL_REGEXP = /\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,4}\b/i
+    SLEEP_TIME = 0.5
+
+    def initialize(url)
+      @url = url
+      @logger = ::Logger.new(STDOUT).tap do |logger|
+        logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
+      end
+    end
+
+    def scan(links)
+      emails_by_link = {}
+
+      links.each do |link|
+        @logger.info "searching for emails on '#{link}'.."
+
+        html = begin
+          open(link).read
+        rescue OpenURI::HTTPError => err
+          @logger.warn(err)
+          nil
+        rescue => err
+          if err.message =~ /redirection forbidden/
+            link = err.message.split(" ").last
+            retry
+          end
+        end
+        next unless html
+
+        emails = html.scan(EMAIL_REGEXP)
+        emails_by_link[link] = Set.new(emails) unless emails.empty?
+        sleep(SLEEP_TIME)
+      end
+
+      emails_by_link
+    end
+  end
+end
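A quick check of what EMAIL_REGEXP picks up (a sketch; the constant lives on the class, and addresses without a dotted host or with a top-level domain longer than four characters are ignored):

```ruby
require "email_crawler"

text = "Press: press@example.com / sales@example.co.uk / no-reply@localhost"
text.scan(EmailCrawler::EmailScanner::EMAIL_REGEXP)
# => ["press@example.com", "sales@example.co.uk"]
```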
data/lib/email_crawler/mechanize_helper.rb
ADDED
@@ -0,0 +1,22 @@
+require "mechanize"
+
+module EmailCrawler
+  module MechanizeHelper
+    def new_agent
+      Thread.current[:agent] ||= Mechanize.new do |agent|
+        agent.user_agent_alias = "Mac Safari"
+        agent.open_timeout = agent.read_timeout = 30
+        agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
+        agent.history.max_size = 1
+        yield(agent) if block_given?
+      end
+    end
+
+    def get(url)
+      begin
+        page = agent.get(url)
+        page if page.is_a?(Mechanize::Page)
+      rescue Mechanize::Error, Net::OpenTimeout; end
+    end
+  end
+end
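Note that MechanizeHelper#get calls an `agent` method the module itself never defines; each including class is expected to supply one (PageLinks and Scraper below both memoize `new_agent`). A hypothetical includer, just to show the contract:

```ruby
require "email_crawler"

class LinkChecker
  include EmailCrawler::MechanizeHelper

  # get(url) returns a Mechanize::Page or nil, swallowing Mechanize errors
  # and open timeouts, so a simple reachability check falls out of it.
  def reachable?(url)
    !get(url).nil?
  end

  private

  def agent
    @agent ||= new_agent
  end
end
```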
data/lib/email_crawler/page_links.rb
ADDED
@@ -0,0 +1,60 @@
+module EmailCrawler
+  class PageLinks
+    MAX_LINKS = 100
+    SLEEP_TIME = 0.5
+
+    include MechanizeHelper
+
+    def initialize(url)
+      @url = url
+      uri = URI(url)
+      scheme_and_host = if uri.host
+        "#{uri.scheme}://#{uri.host}"
+      else
+        url[%r(\A(https?://([^/]+))), 1]
+      end
+      @domain = Regexp.new("#{scheme_and_host}/", true)
+      @logger = ::Logger.new(STDOUT).tap do |logger|
+        logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
+      end
+    end
+
+    def self.for(url, max_links = MAX_LINKS)
+      new(url).fetch_links(max_links)
+    end
+
+    def fetch_links(max_links = MAX_LINKS)
+      queue, links = Set.new([@url]), Set.new([@url])
+
+      until queue.empty?
+        current_link = queue.first
+        @logger.info "current_link: #{current_link}"
+        page = get(current_link)
+
+        if page
+          new_links = page.links_with(href: @domain).map(&:href)
+          new_links.reject! { |link| links.include?(link) }
+          @logger.debug "found: #{new_links.length} new link(s)"
+          new_links.each { |link| queue << link }
+          links << current_link
+
+          if links.length == max_links
+            break
+          else
+            sleep(SLEEP_TIME)
+          end
+        end
+
+        queue.delete(current_link)
+      end
+
+      links.to_a
+    end
+
+    private
+
+    def agent
+      @agent ||= new_agent
+    end
+  end
+end
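PageLinks is normally driven by the Runner, but the class-level helper makes it usable on its own (the same call the spec further down exercises): it walks same-host links starting from the given page, sleeping half a second between fetches, until max_links have been collected.

```ruby
require "email_crawler"

# Collect up to 25 links on the same scheme+host as the start page.
links = EmailCrawler::PageLinks.for("http://www.visitberlin.de/en", 25)
puts links
```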
data/lib/email_crawler/proxy.rb
ADDED
@@ -0,0 +1,28 @@
+require "open-uri"
+require "json"
+require "dotenv"
+
+module EmailCrawler
+  class Proxy
+    class << self
+      def random
+        all.sample
+      end
+
+      private
+
+      def all
+        @all ||= begin
+          Dotenv.load
+
+          json = JSON.parse(open("https://api.digitalocean.com/droplets/?client_id=#{ENV['DO_CLIENT_ID']}&api_key=#{ENV['DO_API_KEY']}").read)
+          json["droplets"].
+            select{ |droplet| droplet["name"] =~ /proxy\d+/ }.
+            map { |droplet| droplet["ip_address"] }
+        end
+      end
+    end
+  end
+end
+
+
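Proxy pulls its candidate IPs from the old DigitalOcean v1 API, keeping only droplets named proxy1, proxy2, and so on; Dotenv.load supplies DO_CLIENT_ID and DO_API_KEY, presumably the two lines stubbed out in .env.example. A sketch of how it gets consumed (Scraper below does essentially the same thing):

```ruby
require "email_crawler"

# Picks a random droplet named proxyN and returns its public IP, or nil
# when none match; Scraper routes Mechanize through it on port 8888.
ip = EmailCrawler::Proxy.random
agent = Mechanize.new
agent.set_proxy(ip, "8888") if ip
```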
data/lib/email_crawler/scraper.rb
ADDED
@@ -0,0 +1,31 @@
+require_relative "proxy"
+
+module EmailCrawler
+  class Scraper
+    MAX_URLS = 10
+
+    include MechanizeHelper
+
+    def initialize(google_website)
+      @google_website = "https://www.#{google_website}/"
+    end
+
+    def top_ten_urls_for(q)
+      search_page = agent.get(@google_website)
+      search_form = search_page.form_with(action: "/search")
+      search_form.field_with(name: "q").value = q
+      search_results_page = agent.submit(search_form)
+      search_results_page.search("#search ol li h3.r a").
+        map { |a| a["href"].downcase }.
+        reject { |url| url =~ %r(\A/search[?]q=) }.
+        first(MAX_URLS)
+    end
+
+    private
+
+    def agent
+      @agent ||= new_agent { |agent| agent.set_proxy(Proxy.random, "8888") }
+      # @agent ||= new_agent
+    end
+  end
+end
data/lib/email_crawler.rb
ADDED
@@ -0,0 +1,69 @@
+require "thread"
+require "logger"
+require "csv"
+require "set"
+
+require_relative "email_crawler/version"
+require_relative "email_crawler/mechanize_helper"
+require_relative "email_crawler/scraper"
+require_relative "email_crawler/page_links"
+require_relative "email_crawler/email_scanner"
+
+module EmailCrawler
+  class Runner
+    def initialize(google_website)
+      @google_website = google_website
+
+      log_file = File.join(ENV["HOME"], "email-crawler.log")
+      file = File.open(log_file, File::WRONLY | File::APPEND | File::CREAT)
+      @logger = ::Logger.new(file).tap do |logger|
+        logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
+      end
+    end
+
+    def run(q, max_links = PageLinks::MAX_LINKS)
+      urls = Scraper.new(@google_website).top_ten_urls_for(q)
+      urls.each { |url, links| @logger.info "#{url}" }
+
+      threads = (1..urls.length).map do |i|
+        Thread.new(i, urls[i-1]) do |i, url|
+          @logger.info "[Thread ##{i}] grabbing page links for '#{url}'.."
+          Thread.current[:url] = url
+          Thread.current[:links] = PageLinks.for(url, max_links)
+        end
+      end
+
+      threads.each(&:join)
+      threads.each { |thread| @logger.info "#{thread[:url]} (#{thread[:links].length} links)" }
+      links_by_url = Hash[threads.map { |thread| [thread[:url], thread[:links]] }]
+
+      threads = (links_by_url).map.with_index do |arr, i|
+        Thread.new(i+1, arr.first, arr.last) do |i, url, links|
+          @logger.info "[Thread ##{i}] scanning for emails on page '#{url}' (#{links.length} links)"
+          Thread.current[:url] = url
+          Thread.current[:emails] = EmailScanner.new(url).scan(links)
+        end
+      end
+
+      threads.each(&:join)
+
+      read_emails = Set.new
+      CSV.generate do |csv|
+        csv << %w(Email Domain URL)
+        csv << []
+
+        threads.each do |thread|
+          email_count = thread[:emails].inject(0) { |sum, arr| sum += arr.last.length }
+          @logger.info "#{thread[:url]} (#{email_count} emails)"
+
+          url = thread[:url]
+          thread[:emails].each do |link, emails|
+            emails.each do |email|
+              csv << [email, url, link] if read_emails.add?(email)
+            end
+          end
+        end
+      end
+    end
+  end
+end
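Worth noting about Runner#run's output: the header row reads Email, Domain, URL, but each data row is actually [email, search-result URL, internal link], so the "Domain" column holds the full result URL. A small sketch of consuming the returned CSV string (the search term is invented for illustration):

```ruby
require "csv"
require "email_crawler"

csv_text = EmailCrawler::Runner.new("google.com.br").run("berlin walks")
rows = CSV.parse(csv_text)
rows.first  # => ["Email", "Domain", "URL"]
# rows[1] is the blank spacer row the Runner appends after the header;
# every following row is [email, search-result URL, link where the address was found].
```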
data/spec/lib/email_crawler/email_scanner_spec.rb
ADDED
@@ -0,0 +1,16 @@
+require_relative "../../spec_helper"
+
+require File.expand_path("lib/email_crawler")
+
+module EmailCrawler
+  describe EmailScanner do
+    subject { EmailScanner.new("google.com") }
+
+    let(:link) { "http://www.kitaylaw.com/contact.php" }
+
+    it "scans links for email addresses" do
+      emails_by_link = subject.scan([link])
+      emails_by_link[link].wont_be_empty
+    end
+  end
+end
data/spec/lib/email_crawler/page_links_spec.rb
ADDED
@@ -0,0 +1,13 @@
+require_relative "../../spec_helper"
+
+require File.expand_path("lib/email_crawler")
+
+module EmailCrawler
+  describe PageLinks do
+    let(:max_links) { 25 }
+
+    it "returns the first N internal links" do
+      PageLinks.for("http://www.visitberlin.de/en", max_links).length.must_equal max_links
+    end
+  end
+end
data/spec/lib/email_crawler/scraper_spec.rb
ADDED
@@ -0,0 +1,13 @@
+require_relative "../../spec_helper"
+
+require File.expand_path("lib/email_crawler")
+
+module EmailCrawler
+  describe Scraper do
+    subject { Scraper.new("google.de") }
+
+    it "returns the top 10 URLs for a given search term/expression" do
+      subject.top_ten_urls_for("berlin tours").length.must_equal 10
+    end
+  end
+end
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,140 @@
+--- !ruby/object:Gem::Specification
+name: email_crawler
+version: !ruby/object:Gem::Version
+  version: 0.0.2
+platform: ruby
+authors:
+- Cristian Rasch
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2014-02-25 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: mechanize
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: dotenv
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.5'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.5'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: minitest
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 5.2.3
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 5.2.3
+description:
+email:
+- cristianrasch@fastmail.fm
+executables:
+- email-crawler
+extensions: []
+extra_rdoc_files: []
+files:
+- ".env.example"
+- ".gitignore"
+- ".ruby-version"
+- Gemfile
+- LICENSE.txt
+- README.md
+- Rakefile
+- bin/email-crawler
+- email_crawler.gemspec
+- lib/email_crawler.rb
+- lib/email_crawler/email_scanner.rb
+- lib/email_crawler/mechanize_helper.rb
+- lib/email_crawler/page_links.rb
+- lib/email_crawler/proxy.rb
+- lib/email_crawler/scraper.rb
+- lib/email_crawler/version.rb
+- spec/lib/email_crawler/email_scanner_spec.rb
+- spec/lib/email_crawler/page_links_spec.rb
+- spec/lib/email_crawler/scraper_spec.rb
+- spec/spec_helper.rb
+homepage: https://github.com/cristianrasch/email_crawler
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.2.0
+signing_key:
+specification_version: 4
+summary: 'Email crawler: crawls the top ten Google search results looking for email
+  addresses and exports them to CSV.'
+test_files:
+- spec/lib/email_crawler/email_scanner_spec.rb
+- spec/lib/email_crawler/page_links_spec.rb
+- spec/lib/email_crawler/scraper_spec.rb
+- spec/spec_helper.rb