email_crawler 0.0.13 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6d6abc9f4ac1c4c0323addb2123d8795aa268cb2
4
- data.tar.gz: e835dcd5d5b52bc30ec55831503dfa2f354a4d76
3
+ metadata.gz: b158c0d7a48cac6a56c44b905b833934e3e88ee2
4
+ data.tar.gz: 9b13281878621e1679de17c30e5b13d4ea04bd98
5
5
  SHA512:
6
- metadata.gz: 12e7baf2369874c8f759f47d54fbebf3bdcea67c2d4065dda0e461b2f052e9a97982323d3abbc2e0d06c7c28ac1f486854c64193e0cacb12d19074821dcd03aa
7
- data.tar.gz: b89ff4b54670fb2ef27dadefbb25476aaa6fe27b5f9374cadaf466e619e130e0c7270a6aa7ebf41dcfaac4774536f7edf18ddf8d4d39fb1caf2aaa086703ee1a
6
+ metadata.gz: 589043168d0c1a9ad72dcf3e4d7f2d32194004e76cff744e9fd9ec7187cb8e4101d22f2122f6d236a586626cdd86588ee9a676cc4afe3e91e8624fca0a557d19
7
+ data.tar.gz: 1955a721beb4336cb87067460a923826e801701c7d40695f96fb73b41843ecbf58592efe7b4dbcf1053dd76ca61dfefd3a141e897e282db296323945e181c070
data/.gitignore CHANGED
@@ -17,3 +17,4 @@ test/version_tmp
17
17
  tmp
18
18
  .rbenv-gemsets
19
19
  .env
20
+ bin/*
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 2.1.0
1
+ 2.2.3
data/README.md CHANGED
@@ -44,6 +44,12 @@ email-crawler --query "berlin walks" --max-links 250
44
44
  email-crawler --query "berlin walks" --concurrency 25
45
45
  ```
46
46
 
47
+ * Exclude certain domains from pages scanned for email addresses
48
+
49
+ ```bash
50
+ email-crawler --query "berlin walks" --blacklist berlin.de --blacklist berlin.com
51
+ ```
52
+
47
53
  * Redirect output to a file
48
54
 
49
55
  ```bash
data/bin/email-crawler CHANGED
@@ -70,5 +70,5 @@ else
70
70
  runner.blacklisted_domains = options.blacklisted_domains
71
71
  end
72
72
  csv = runner.run(options.q)
73
- $stdout << "#{csv}\n"
73
+ STDOUT << "#{csv}\n"
74
74
  end
@@ -21,7 +21,7 @@ Gem::Specification.new do |spec|
21
21
  spec.add_runtime_dependency "dotenv"
22
22
  spec.add_runtime_dependency "thread_safe"
23
23
 
24
- spec.add_development_dependency "bundler", "~> 1.5"
24
+ spec.add_development_dependency "bundler"
25
25
  spec.add_development_dependency "rake"
26
- spec.add_development_dependency "minitest", "~> 5.2.3"
26
+ spec.add_development_dependency "minitest", "~> 5.8"
27
27
  end
data/lib/email_crawler.rb CHANGED
@@ -1,4 +1,3 @@
1
- require "thread"
2
1
  require "logger"
3
2
  require "csv"
4
3
  require "set"
@@ -14,7 +13,8 @@ module EmailCrawler
14
13
  class Runner
15
14
  MAX_CONCURRENCY = 50
16
15
 
17
- attr_writer :max_results, :max_links, :max_concurrency, :logger, :blacklisted_domains
16
+ attr_writer :max_results, :max_links, :max_concurrency, :logger,
17
+ :blacklisted_domains
18
18
 
19
19
  def initialize(google_website)
20
20
  @google_website = google_website
@@ -27,7 +27,7 @@ module EmailCrawler
27
27
  blacklisted_domains: @blacklisted_domains).
28
28
  search_result_urls_for(q)
29
29
  urls.each { |url| logger.info "#{url}" }
30
- queue = Queue.new
30
+ queue = Thread::Queue.new
31
31
  urls.each { |url| queue.push(url) }
32
32
  links_by_url = ThreadSafe::Array.new
33
33
 
@@ -62,7 +62,7 @@ module EmailCrawler
62
62
  while arr
63
63
  url, links = arr
64
64
  logger.info "[Thread ##{i}] scanning for emails on page '#{url}' (#{links.length} links)"
65
- emails = EmailScanner.new(url, logger).scan(links)
65
+ emails = EmailScanner.new(logger).scan(links)
66
66
  emails_by_url[url] = emails
67
67
 
68
68
  arr = begin
@@ -92,11 +92,11 @@ module EmailCrawler
92
92
  end
93
93
  end
94
94
 
95
- private
95
+ private
96
96
 
97
97
  def logger
98
98
  @logger ||= begin
99
- path = File.join(ENV["HOME"], "email_crawler.log")
99
+ path = File.join(ENV["HOME"], "email-crawler.log")
100
100
  file = File.open(path, File::WRONLY | File::APPEND | File::CREAT)
101
101
  logger = ::Logger.new(file).tap do |logger|
102
102
  logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
@@ -3,31 +3,28 @@ require "open-uri"
3
3
  module EmailCrawler
4
4
  class EmailScanner
5
5
  EMAIL_REGEXP = /\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,4}\b/i
6
- SLEEP_TIME = 0.5
7
6
  UTF_8 = "UTF-8".freeze
8
7
 
9
- def initialize(url, logger = Logger.new("/dev/null"))
10
- @url, @logger = url, logger
8
+ def initialize(logger = Logger.new("/dev/null"))
9
+ @logger = logger
11
10
  end
12
11
 
13
12
  def scan(links)
14
- emails_by_link = {}
15
-
16
- links.each do |link|
13
+ links.each_with_object({}) do |link, h|
17
14
  @logger.info "searching for emails on '#{link}'.."
18
15
  retried = false
19
16
 
20
17
  html = begin
21
- open(link).read
22
- rescue OpenURI::HTTPError => err
23
- @logger.warn(err)
24
- nil
25
- rescue => err
26
- if err.message =~ /redirection forbidden/
27
- link = err.message.split(" ").last
28
- retry
29
- end
30
- end
18
+ open(link).read
19
+ rescue OpenURI::HTTPError => err
20
+ @logger.warn(err)
21
+ nil
22
+ rescue => err
23
+ if err.message =~ /redirection forbidden/
24
+ link = err.message.split(" ").last
25
+ retry
26
+ end
27
+ end
31
28
  next unless html
32
29
 
33
30
  begin
@@ -42,11 +39,9 @@ module EmailCrawler
42
39
  retry
43
40
  end
44
41
  end
45
- emails_by_link[link] = Set.new(emails) unless emails.empty?
46
- sleep(SLEEP_TIME)
47
- end
48
42
 
49
- emails_by_link
43
+ h[link] = Set.new(emails) unless emails.empty?
44
+ end
50
45
  end
51
46
  end
52
47
  end
@@ -6,7 +6,7 @@ module EmailCrawler
6
6
 
7
7
  def new_agent
8
8
  Thread.current[:agent] ||= Mechanize.new do |agent|
9
- agent.user_agent_alias = "Mac Safari"
9
+ agent.user_agent_alias = "Windows Mozilla"
10
10
  agent.open_timeout = agent.read_timeout = READ_TIMEOUT
11
11
  agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
12
12
  agent.history.max_size = 1
@@ -73,7 +73,7 @@ module EmailCrawler
73
73
  links.to_a
74
74
  end
75
75
 
76
- private
76
+ private
77
77
 
78
78
  def agent
79
79
  @agent ||= new_agent
@@ -1,3 +1,5 @@
1
+ require "set"
2
+ require_relative "mechanize_helper"
1
3
  require_relative "url_helper"
2
4
 
3
5
  module EmailCrawler
@@ -8,41 +10,53 @@ module EmailCrawler
8
10
  include URLHelper
9
11
 
10
12
  def initialize(google_website, max_results: MAX_RESULTS, blacklisted_domains: [])
11
- @google_website = "https://www.#{google_website}/"
13
+ @search_url = "https://www.#{google_website}/search?q="
12
14
  @max_results = max_results
13
15
  @blacklisted_domains = blacklisted_domains.map { |domain| /#{domain}\z/ }
14
16
  end
15
17
 
16
18
  def search_result_urls_for(q)
17
- search_page = agent.get(@google_website)
18
- search_form = search_page.form_with(action: "/search")
19
- search_form.field_with(name: "q").value = q
20
- search_results_page = agent.submit(search_form)
21
- urls = search_results_on(search_results_page)
19
+ search_results_page = agent.get(@search_url + CGI.escape(q))
20
+ urls = Set.new(search_results_on(search_results_page))
22
21
 
23
22
  page = 1
24
23
  while urls.size < @max_results
25
24
  next_page_link = search_results_page.link_with(href: /start=#{page*10}/)
26
- return urls unless next_page_link
25
+ break unless next_page_link
27
26
 
28
27
  next_search_results_page = next_page_link.click
29
- urls.concat(search_results_on(next_search_results_page)).uniq!
28
+ search_results_on(next_search_results_page).each do |url|
29
+ urls << url
30
+ end
31
+
30
32
  page += 1
31
33
  end
32
34
 
33
- urls.first(@max_results)
35
+ urls.to_a.first(@max_results)
34
36
  end
35
37
 
36
- private
38
+ private
37
39
 
38
40
  def search_results_on(page)
39
- page.search("#search ol li h3.r a").
40
- map { |a| a["href"].downcase }.
41
- reject { |url| url =~ %r(\A/search[?]q=) }.
42
- reject do |url|
41
+ urls = page.search("#search ol li.g h3.r a").map do |a|
42
+ href = a[:href]
43
+ url = href =~ %r(/url\?q=) && $POSTMATCH
44
+
45
+ if url
46
+ url = url =~ /&sa=/ && $PREMATCH
47
+ CGI.unescape(url) if url
48
+ end
49
+ end
50
+ urls.compact!
51
+
52
+ unless @blacklisted_domains.empty?
53
+ urls.delete_if do |url|
43
54
  domain = extract_domain_from(url)
44
55
  @blacklisted_domains.any? { |blacklisted_domain| domain =~ blacklisted_domain }
45
56
  end
57
+ end
58
+
59
+ urls
46
60
  end
47
61
 
48
62
  def agent
@@ -11,6 +11,7 @@ module URLHelper
11
11
  return
12
12
  end
13
13
  host = uri.host || url[DOMAIN_REGEXP, 1].to_s
14
+
14
15
  if www || host !~ WWW_REGEXP
15
16
  host.downcase
16
17
  else
@@ -1,3 +1,3 @@
1
1
  module EmailCrawler
2
- VERSION = "0.0.13"
2
+ VERSION = "0.1.0"
3
3
  end
@@ -4,9 +4,9 @@ require File.expand_path("lib/email_crawler")
4
4
 
5
5
  module EmailCrawler
6
6
  describe EmailScanner do
7
- subject { EmailScanner.new("google.com") }
7
+ subject { EmailScanner.new }
8
8
 
9
- let(:link) { "http://www.kitaylaw.com/contact.php" }
9
+ let(:link) { "https://www.mrosupply.com/page/plain/contact-us/" }
10
10
 
11
11
  it "scans links for email addresses" do
12
12
  emails_by_link = subject.scan([link])
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: email_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.13
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Cristian Rasch
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-03-28 00:00:00.000000000 Z
11
+ date: 2015-10-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -56,16 +56,16 @@ dependencies:
56
56
  name: bundler
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - "~>"
59
+ - - ">="
60
60
  - !ruby/object:Gem::Version
61
- version: '1.5'
61
+ version: '0'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - "~>"
66
+ - - ">="
67
67
  - !ruby/object:Gem::Version
68
- version: '1.5'
68
+ version: '0'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: rake
71
71
  requirement: !ruby/object:Gem::Requirement
@@ -86,14 +86,14 @@ dependencies:
86
86
  requirements:
87
87
  - - "~>"
88
88
  - !ruby/object:Gem::Version
89
- version: 5.2.3
89
+ version: '5.8'
90
90
  type: :development
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
94
  - - "~>"
95
95
  - !ruby/object:Gem::Version
96
- version: 5.2.3
96
+ version: '5.8'
97
97
  description:
98
98
  email:
99
99
  - cristianrasch@fastmail.fm
@@ -102,7 +102,6 @@ executables:
102
102
  extensions: []
103
103
  extra_rdoc_files: []
104
104
  files:
105
- - ".env.example"
106
105
  - ".gitignore"
107
106
  - ".ruby-version"
108
107
  - Gemfile
@@ -142,7 +141,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
142
141
  version: '0'
143
142
  requirements: []
144
143
  rubyforge_project:
145
- rubygems_version: 2.2.0
144
+ rubygems_version: 2.4.5.1
146
145
  signing_key:
147
146
  specification_version: 4
148
147
  summary: 'Email crawler: crawls the top ten Google search results looking for email
data/.env.example DELETED
@@ -1,2 +0,0 @@
1
- DO_CLIENT_ID=top
2
- DO_API_KEY=secret