email_crawler 0.0.2

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 62e4b0ccd69dee8523ebdbb12e3a14c6820b7592
+   data.tar.gz: dc1c7941ed58d5de961ad7bcad9f95af16fb164f
+ SHA512:
+   metadata.gz: a71df8fb24765c8acafef1da841b1dfc5c4b7ffdb1f429fc0ffddc375f2ef28107fe353eb4deb352b697d84584c666adfab4370d635487305d38d64a1f5cb1d3
+   data.tar.gz: 43de8cb5b8fa2456ffed8a3e08c04e580079170eb47593587e5abee57103ba6c0bbde70fc62bd0d86a90cce5fb2b11c76ba7c5a2c7357b1a29f9fd6ba479a770
data/.env.example ADDED
@@ -0,0 +1,2 @@
+ DO_CLIENT_ID=top
+ DO_API_KEY=secret
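The two variables above are the DigitalOcean credentials consumed by the proxy lookup in lib/email_crawler/proxy.rb further down. A minimal sketch of how a real .env copied from this template gets read, shown here for illustration only:

```ruby
require "dotenv"

Dotenv.load            # reads the .env file in the current working directory
ENV["DO_CLIENT_ID"]    # => "top" with the example values above
ENV["DO_API_KEY"]      # => "secret"
```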
data/.gitignore ADDED
@@ -0,0 +1,19 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .yardoc
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
+ .rbenv-gemsets
+ .env
data/.ruby-version ADDED
@@ -0,0 +1 @@
+ 2.1.0
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in email_crawler.gemspec
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2014 Cristian Rasch
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,57 @@
+ # EmailCrawler
+
+ Email crawler: crawls the top ten Google search results looking for email addresses and exports them to CSV.
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+     gem 'email_crawler'
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install email_crawler
+
+ ## Usage
+
+ 1. Ask for help
+
+ ```bash
+ email-crawler --help
+ ```
+
+ 2. Simplest Google search
+
+ ```bash
+ email-crawler -q "berlin walks"
+ ```
+
+ 3. Select which Google website to use (defaults to google.com.br)
+
+ ```bash
+ email-crawler -q "berlin walks" -g google.de
+ ```
+
+ 4. Specify how many internal links to scan for email addresses per search result (defaults to 100)
+
+ ```bash
+ email-crawler -q "berlin walks" -g google.de -m 250
+ ```
+
+ 5. Redirect output to a file
+
+ ```bash
+ email-crawler -q "berlin walks" -g google.de -m 250 > ~/Desktop/berlin-walks-emails.csv
+ ```
+
+ ## Contributing
+
+ 1. Fork it ( http://github.com/cristianrasch/email_crawler/fork )
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
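The README only documents the CLI. For completeness, here is a minimal sketch of driving the gem from Ruby instead, based on how bin/email-crawler (shown below) invokes the library; like the CLI, it needs the DigitalOcean proxy credentials from .env, and the output path is just an example:

```ruby
require "email_crawler"

# Same pipeline the CLI runs: Google search, link crawl, email scan, CSV out.
csv = EmailCrawler::Runner.new("google.de").run("berlin walks", 250)
File.write("berlin-walks-emails.csv", csv)
```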
data/Rakefile ADDED
@@ -0,0 +1,9 @@
+ require "bundler/gem_tasks"
+ require "rake/testtask"
+
+ Rake::TestTask.new do |t|
+   t.pattern = "spec/**/*_spec.rb"
+   t.verbose = true
+ end
+
+ task default: :test
data/bin/email-crawler ADDED
@@ -0,0 +1,47 @@
+ #!/usr/bin/env ruby
+
+ require 'optparse'
+ require 'ostruct'
+
+ class OptionsParser
+   def self.parse(args)
+     options = OpenStruct.new
+     options.google_website = "google.com.br"
+     options.max_links = 100
+
+     opt_parser = OptionParser.new do |opts|
+       opts.banner = "Usage: email-crawler [options]"
+       opts.separator ""
+
+       opts.on("-q", '--query "SEARCH TERM/EXPRESSION"',
+               "The term/expression you want to search for") do |q|
+         options.q = q
+       end
+
+       opts.on("-g", "--google-website google.com.au",
+               "An alternative Google website",
+               "  (defaults to Google Brazil)") do |google_website|
+         options.google_website = google_website
+       end
+
+       opts.on("-m", "--max-links 250",
+               "Max # of internal links to visit searching for emails",
+               "  (per search result, defaults to 100)") do |max_links|
+         options.max_links = max_links.to_i
+       end
+     end
+
+     opt_parser.parse!(args)
+     options
+   end
+ end
+
+ options = OptionsParser.parse(ARGV)
+ if options.q.to_s.empty?
+   print "The -q switch is mandatory\n"
+   exit(1)
+ else
+   require_relative "../lib/email_crawler"
+   csv = EmailCrawler::Runner.new(options.google_website).run(options.q, options.max_links)
+   $stdout << "#{csv}\n"
+ end
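A quick illustration of what OptionsParser returns; the sample arguments are hypothetical:

```ruby
options = OptionsParser.parse(["-q", "berlin walks", "-m", "250"])
options.q              # => "berlin walks"
options.google_website # => "google.com.br" (the default)
options.max_links      # => 250
```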
data/email_crawler.gemspec ADDED
@@ -0,0 +1,26 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'email_crawler/version'
+
+ Gem::Specification.new do |spec|
+   spec.name          = "email_crawler"
+   spec.version       = EmailCrawler::VERSION
+   spec.authors       = ["Cristian Rasch"]
+   spec.email         = ["cristianrasch@fastmail.fm"]
+   spec.summary       = %q{Email crawler: crawls the top ten Google search results looking for email addresses and exports them to CSV.}
+   spec.homepage      = "https://github.com/cristianrasch/email_crawler"
+   spec.license       = "MIT"
+
+   spec.files         = `git ls-files -z`.split("\x0")
+   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+
+   spec.add_runtime_dependency "mechanize"
+   spec.add_runtime_dependency "dotenv"
+
+   spec.add_development_dependency "bundler", "~> 1.5"
+   spec.add_development_dependency "rake"
+   spec.add_development_dependency "minitest", "~> 5.2.3"
+ end
data/lib/email_crawler/email_scanner.rb ADDED
@@ -0,0 +1,42 @@
+ require "open-uri"
+
+ module EmailCrawler
+   class EmailScanner
+     EMAIL_REGEXP = /\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,4}\b/i
+     SLEEP_TIME = 0.5
+
+     def initialize(url)
+       @url = url
+       @logger = ::Logger.new(STDOUT).tap do |logger|
+         logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
+       end
+     end
+
+     def scan(links)
+       emails_by_link = {}
+
+       links.each do |link|
+         @logger.info "searching for emails on '#{link}'.."
+
+         html = begin
+                  open(link).read
+                rescue OpenURI::HTTPError => err
+                  @logger.warn(err)
+                  nil
+                rescue => err
+                  if err.message =~ /redirection forbidden/
+                    link = err.message.split(" ").last
+                    retry
+                  end
+                end
+         next unless html
+
+         emails = html.scan(EMAIL_REGEXP)
+         emails_by_link[link] = Set.new(emails) unless emails.empty?
+         sleep(SLEEP_TIME)
+       end
+
+       emails_by_link
+     end
+   end
+ end
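A minimal sketch of using EmailScanner on its own (outside the Runner, which normally loads Logger and Set for it); the URLs are placeholders:

```ruby
require "logger"
require "set"
require_relative "lib/email_crawler/email_scanner"

scanner = EmailCrawler::EmailScanner.new("http://example.com")  # the URL is only stored; #scan does the work
emails_by_link = scanner.scan(["http://example.com/contact"])   # => { link => Set of addresses } for pages that had any
emails_by_link.each { |link, emails| puts "#{link}: #{emails.to_a.join(', ')}" }
```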
data/lib/email_crawler/mechanize_helper.rb ADDED
@@ -0,0 +1,22 @@
+ require "mechanize"
+
+ module EmailCrawler
+   module MechanizeHelper
+     def new_agent
+       Thread.current[:agent] ||= Mechanize.new do |agent|
+         agent.user_agent_alias = "Mac Safari"
+         agent.open_timeout = agent.read_timeout = 30
+         agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
+         agent.history.max_size = 1
+         yield(agent) if block_given?
+       end
+     end
+
+     def get(url)
+       begin
+         page = agent.get(url)
+         page if page.is_a?(Mechanize::Page)
+       rescue Mechanize::Error, Net::OpenTimeout; end
+     end
+   end
+ end
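MechanizeHelper#get calls an `agent` method that the module itself does not define; each including class (PageLinks and Scraper below) supplies one, typically memoizing new_agent. A hypothetical includer, to make that contract explicit:

```ruby
require_relative "lib/email_crawler/mechanize_helper"

class TitleFetcher
  include EmailCrawler::MechanizeHelper

  def title_for(url)
    page = get(url)      # nil on Mechanize errors or timeouts
    page && page.title
  end

  private

  # the memoized agent that MechanizeHelper#get relies on
  def agent
    @agent ||= new_agent
  end
end
```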
data/lib/email_crawler/page_links.rb ADDED
@@ -0,0 +1,60 @@
+ module EmailCrawler
+   class PageLinks
+     MAX_LINKS = 100
+     SLEEP_TIME = 0.5
+
+     include MechanizeHelper
+
+     def initialize(url)
+       @url = url
+       uri = URI(url)
+       scheme_and_host = if uri.host
+                           "#{uri.scheme}://#{uri.host}"
+                         else
+                           url[%r(\A(https?://([^/]+))), 1]
+                         end
+       @domain = Regexp.new("#{scheme_and_host}/", true)
+       @logger = ::Logger.new(STDOUT).tap do |logger|
+         logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
+       end
+     end
+
+     def self.for(url, max_links = MAX_LINKS)
+       new(url).fetch_links(max_links)
+     end
+
+     def fetch_links(max_links = MAX_LINKS)
+       queue, links = Set.new([@url]), Set.new([@url])
+
+       until queue.empty?
+         current_link = queue.first
+         @logger.info "current_link: #{current_link}"
+         page = get(current_link)
+
+         if page
+           new_links = page.links_with(href: @domain).map(&:href)
+           new_links.reject! { |link| links.include?(link) }
+           @logger.debug "found: #{new_links.length} new link(s)"
+           new_links.each { |link| queue << link }
+           links << current_link
+
+           if links.length == max_links
+             break
+           else
+             sleep(SLEEP_TIME)
+           end
+         end
+
+         queue.delete(current_link)
+       end
+
+       links.to_a
+     end
+
+     private
+
+     def agent
+       @agent ||= new_agent
+     end
+   end
+ end
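PageLinks does a breadth-first walk of same-host links starting from the seed URL and stops at max_links. A minimal usage sketch (the site and cap mirror the spec further down):

```ruby
require "logger"
require "set"
require_relative "lib/email_crawler/mechanize_helper"
require_relative "lib/email_crawler/page_links"

# Up to 25 URLs on the same scheme/host, beginning with the seed URL itself.
links = EmailCrawler::PageLinks.for("http://www.visitberlin.de/en", 25)
puts links
```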
data/lib/email_crawler/proxy.rb ADDED
@@ -0,0 +1,28 @@
+ require "open-uri"
+ require "json"
+ require "dotenv"
+
+ module EmailCrawler
+   class Proxy
+     class << self
+       def random
+         all.sample
+       end
+
+       private
+
+       def all
+         @all ||= begin
+           Dotenv.load
+
+           json = JSON.parse(open("https://api.digitalocean.com/droplets/?client_id=#{ENV['DO_CLIENT_ID']}&api_key=#{ENV['DO_API_KEY']}").read)
+           json["droplets"].
+             select { |droplet| droplet["name"] =~ /proxy\d+/ }.
+             map { |droplet| droplet["ip_address"] }
+         end
+       end
+     end
+   end
+ end
+
+
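Proxy.random picks one droplet IP from the DigitalOcean account whose name matches /proxy\d+/; the Scraper below routes its Google traffic through it on port 8888. A minimal sketch, assuming the .env credentials are in place:

```ruby
require_relative "lib/email_crawler/proxy"

ip = EmailCrawler::Proxy.random   # one of the proxyNN droplets' ip_address values
puts "using proxy #{ip}:8888"
```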
data/lib/email_crawler/scraper.rb ADDED
@@ -0,0 +1,31 @@
+ require_relative "proxy"
+
+ module EmailCrawler
+   class Scraper
+     MAX_URLS = 10
+
+     include MechanizeHelper
+
+     def initialize(google_website)
+       @google_website = "https://www.#{google_website}/"
+     end
+
+     def top_ten_urls_for(q)
+       search_page = agent.get(@google_website)
+       search_form = search_page.form_with(action: "/search")
+       search_form.field_with(name: "q").value = q
+       search_results_page = agent.submit(search_form)
+       search_results_page.search("#search ol li h3.r a").
+         map { |a| a["href"].downcase }.
+         reject { |url| url =~ %r(\A/search[?]q=) }.
+         first(MAX_URLS)
+     end
+
+     private
+
+     def agent
+       @agent ||= new_agent { |agent| agent.set_proxy(Proxy.random, "8888") }
+       # @agent ||= new_agent
+     end
+   end
+ end
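Scraper#top_ten_urls_for submits the query through Google's regular search form and keeps at most MAX_URLS result links. A minimal sketch (note the agent goes through Proxy.random; without those droplets you would switch to the commented-out plain agent):

```ruby
require_relative "lib/email_crawler/mechanize_helper"
require_relative "lib/email_crawler/scraper"

urls = EmailCrawler::Scraper.new("google.de").top_ten_urls_for("berlin tours")
puts urls   # at most 10 result URLs, lower-cased
```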
data/lib/email_crawler/version.rb ADDED
@@ -0,0 +1,3 @@
+ module EmailCrawler
+   VERSION = "0.0.2"
+ end
data/lib/email_crawler.rb ADDED
@@ -0,0 +1,69 @@
+ require "thread"
+ require "logger"
+ require "csv"
+ require "set"
+
+ require_relative "email_crawler/version"
+ require_relative "email_crawler/mechanize_helper"
+ require_relative "email_crawler/scraper"
+ require_relative "email_crawler/page_links"
+ require_relative "email_crawler/email_scanner"
+
+ module EmailCrawler
+   class Runner
+     def initialize(google_website)
+       @google_website = google_website
+
+       log_file = File.join(ENV["HOME"], "email-crawler.log")
+       file = File.open(log_file, File::WRONLY | File::APPEND | File::CREAT)
+       @logger = ::Logger.new(file).tap do |logger|
+         logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
+       end
+     end
+
+     def run(q, max_links = PageLinks::MAX_LINKS)
+       urls = Scraper.new(@google_website).top_ten_urls_for(q)
+       urls.each { |url| @logger.info(url) }
+
+       threads = (1..urls.length).map do |i|
+         Thread.new(i, urls[i-1]) do |i, url|
+           @logger.info "[Thread ##{i}] grabbing page links for '#{url}'.."
+           Thread.current[:url] = url
+           Thread.current[:links] = PageLinks.for(url, max_links)
+         end
+       end
+
+       threads.each(&:join)
+       threads.each { |thread| @logger.info "#{thread[:url]} (#{thread[:links].length} links)" }
+       links_by_url = Hash[threads.map { |thread| [thread[:url], thread[:links]] }]
+
+       threads = links_by_url.map.with_index do |arr, i|
+         Thread.new(i+1, arr.first, arr.last) do |i, url, links|
+           @logger.info "[Thread ##{i}] scanning for emails on page '#{url}' (#{links.length} links)"
+           Thread.current[:url] = url
+           Thread.current[:emails] = EmailScanner.new(url).scan(links)
+         end
+       end
+
+       threads.each(&:join)
+
+       read_emails = Set.new
+       CSV.generate do |csv|
+         csv << %w(Email Domain URL)
+         csv << []
+
+         threads.each do |thread|
+           email_count = thread[:emails].inject(0) { |sum, arr| sum + arr.last.length }
+           @logger.info "#{thread[:url]} (#{email_count} emails)"
+
+           url = thread[:url]
+           thread[:emails].each do |link, emails|
+             emails.each do |email|
+               csv << [email, url, link] if read_emails.add?(email)
+             end
+           end
+         end
+       end
+     end
+   end
+ end
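Runner#run fans out one thread per search result to collect links, then one per result to scan for emails, and serializes everything to CSV. Note the column layout: the header reads Email,Domain,URL, and each data row holds the address, the search-result URL, and the page the address was found on, in that order. A minimal sketch of inspecting the output:

```ruby
require "csv"
require "email_crawler"

csv = EmailCrawler::Runner.new("google.com.br").run("berlin walks", 50)
rows = CSV.parse(csv)
puts rows.first.inspect   # => ["Email", "Domain", "URL"]
```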
data/spec/lib/email_crawler/email_scanner_spec.rb ADDED
@@ -0,0 +1,16 @@
+ require_relative "../../spec_helper"
+
+ require File.expand_path("lib/email_crawler")
+
+ module EmailCrawler
+   describe EmailScanner do
+     subject { EmailScanner.new("google.com") }
+
+     let(:link) { "http://www.kitaylaw.com/contact.php" }
+
+     it "scans links for email addresses" do
+       emails_by_link = subject.scan([link])
+       emails_by_link[link].wont_be_empty
+     end
+   end
+ end
data/spec/lib/email_crawler/page_links_spec.rb ADDED
@@ -0,0 +1,13 @@
+ require_relative "../../spec_helper"
+
+ require File.expand_path("lib/email_crawler")
+
+ module EmailCrawler
+   describe PageLinks do
+     let(:max_links) { 25 }
+
+     it "returns the first N internal links" do
+       PageLinks.for("http://www.visitberlin.de/en", max_links).length.must_equal max_links
+     end
+   end
+ end
data/spec/lib/email_crawler/scraper_spec.rb ADDED
@@ -0,0 +1,13 @@
+ require_relative "../../spec_helper"
+
+ require File.expand_path("lib/email_crawler")
+
+ module EmailCrawler
+   describe Scraper do
+     subject { Scraper.new("google.de") }
+
+     it "returns the top 10 URLs for a given search term/expression" do
+       subject.top_ten_urls_for("berlin tours").length.must_equal 10
+     end
+   end
+ end
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,2 @@
+ require "minitest/autorun"
+ require "minitest/pride"
metadata ADDED
@@ -0,0 +1,140 @@
+ --- !ruby/object:Gem::Specification
+ name: email_crawler
+ version: !ruby/object:Gem::Version
+   version: 0.0.2
+ platform: ruby
+ authors:
+ - Cristian Rasch
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2014-02-25 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: mechanize
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: dotenv
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.5'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.5'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: minitest
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 5.2.3
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 5.2.3
+ description:
+ email:
+ - cristianrasch@fastmail.fm
+ executables:
+ - email-crawler
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - ".env.example"
+ - ".gitignore"
+ - ".ruby-version"
+ - Gemfile
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - bin/email-crawler
+ - email_crawler.gemspec
+ - lib/email_crawler.rb
+ - lib/email_crawler/email_scanner.rb
+ - lib/email_crawler/mechanize_helper.rb
+ - lib/email_crawler/page_links.rb
+ - lib/email_crawler/proxy.rb
+ - lib/email_crawler/scraper.rb
+ - lib/email_crawler/version.rb
+ - spec/lib/email_crawler/email_scanner_spec.rb
+ - spec/lib/email_crawler/page_links_spec.rb
+ - spec/lib/email_crawler/scraper_spec.rb
+ - spec/spec_helper.rb
+ homepage: https://github.com/cristianrasch/email_crawler
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.2.0
+ signing_key:
+ specification_version: 4
+ summary: 'Email crawler: crawls the top ten Google search results looking for email
+   addresses and exports them to CSV.'
+ test_files:
+ - spec/lib/email_crawler/email_scanner_spec.rb
+ - spec/lib/email_crawler/page_links_spec.rb
+ - spec/lib/email_crawler/scraper_spec.rb
+ - spec/spec_helper.rb