email_crawler 0.0.13 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6d6abc9f4ac1c4c0323addb2123d8795aa268cb2
4
- data.tar.gz: e835dcd5d5b52bc30ec55831503dfa2f354a4d76
3
+ metadata.gz: b158c0d7a48cac6a56c44b905b833934e3e88ee2
4
+ data.tar.gz: 9b13281878621e1679de17c30e5b13d4ea04bd98
5
5
  SHA512:
6
- metadata.gz: 12e7baf2369874c8f759f47d54fbebf3bdcea67c2d4065dda0e461b2f052e9a97982323d3abbc2e0d06c7c28ac1f486854c64193e0cacb12d19074821dcd03aa
7
- data.tar.gz: b89ff4b54670fb2ef27dadefbb25476aaa6fe27b5f9374cadaf466e619e130e0c7270a6aa7ebf41dcfaac4774536f7edf18ddf8d4d39fb1caf2aaa086703ee1a
6
+ metadata.gz: 589043168d0c1a9ad72dcf3e4d7f2d32194004e76cff744e9fd9ec7187cb8e4101d22f2122f6d236a586626cdd86588ee9a676cc4afe3e91e8624fca0a557d19
7
+ data.tar.gz: 1955a721beb4336cb87067460a923826e801701c7d40695f96fb73b41843ecbf58592efe7b4dbcf1053dd76ca61dfefd3a141e897e282db296323945e181c070
data/.gitignore CHANGED
@@ -17,3 +17,4 @@ test/version_tmp
17
17
  tmp
18
18
  .rbenv-gemsets
19
19
  .env
20
+ bin/*
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 2.1.0
1
+ 2.2.3
data/README.md CHANGED
@@ -44,6 +44,12 @@ email-crawler --query "berlin walks" --max-links 250
44
44
  email-crawler --query "berlin walks" --concurrency 25
45
45
  ```
46
46
 
47
+ * Exclude certain domains from pages scanned for email addresses
48
+
49
+ ```bash
50
+ email-crawler --query "berlin walks" --blacklist berlin.de --blacklist berlin.com
51
+ ```
52
+
47
53
  * Redirect output to a file
48
54
 
49
55
  ```bash
data/bin/email-crawler CHANGED
@@ -70,5 +70,5 @@ else
70
70
  runner.blacklisted_domains = options.blacklisted_domains
71
71
  end
72
72
  csv = runner.run(options.q)
73
- $stdout << "#{csv}\n"
73
+ STDOUT << "#{csv}\n"
74
74
  end
@@ -21,7 +21,7 @@ Gem::Specification.new do |spec|
21
21
  spec.add_runtime_dependency "dotenv"
22
22
  spec.add_runtime_dependency "thread_safe"
23
23
 
24
- spec.add_development_dependency "bundler", "~> 1.5"
24
+ spec.add_development_dependency "bundler"
25
25
  spec.add_development_dependency "rake"
26
- spec.add_development_dependency "minitest", "~> 5.2.3"
26
+ spec.add_development_dependency "minitest", "~> 5.8"
27
27
  end
data/lib/email_crawler.rb CHANGED
@@ -1,4 +1,3 @@
1
- require "thread"
2
1
  require "logger"
3
2
  require "csv"
4
3
  require "set"
@@ -14,7 +13,8 @@ module EmailCrawler
14
13
  class Runner
15
14
  MAX_CONCURRENCY = 50
16
15
 
17
- attr_writer :max_results, :max_links, :max_concurrency, :logger, :blacklisted_domains
16
+ attr_writer :max_results, :max_links, :max_concurrency, :logger,
17
+ :blacklisted_domains
18
18
 
19
19
  def initialize(google_website)
20
20
  @google_website = google_website
@@ -27,7 +27,7 @@ module EmailCrawler
27
27
  blacklisted_domains: @blacklisted_domains).
28
28
  search_result_urls_for(q)
29
29
  urls.each { |url| logger.info "#{url}" }
30
- queue = Queue.new
30
+ queue = Thread::Queue.new
31
31
  urls.each { |url| queue.push(url) }
32
32
  links_by_url = ThreadSafe::Array.new
33
33
 
@@ -62,7 +62,7 @@ module EmailCrawler
62
62
  while arr
63
63
  url, links = arr
64
64
  logger.info "[Thread ##{i}] scanning for emails on page '#{url}' (#{links.length} links)"
65
- emails = EmailScanner.new(url, logger).scan(links)
65
+ emails = EmailScanner.new(logger).scan(links)
66
66
  emails_by_url[url] = emails
67
67
 
68
68
  arr = begin
@@ -92,11 +92,11 @@ module EmailCrawler
92
92
  end
93
93
  end
94
94
 
95
- private
95
+ private
96
96
 
97
97
  def logger
98
98
  @logger ||= begin
99
- path = File.join(ENV["HOME"], "email_crawler.log")
99
+ path = File.join(ENV["HOME"], "email-crawler.log")
100
100
  file = File.open(path, File::WRONLY | File::APPEND | File::CREAT)
101
101
  logger = ::Logger.new(file).tap do |logger|
102
102
  logger.level = ENV["DEBUG"] ? Logger::INFO : Logger::ERROR
@@ -3,31 +3,28 @@ require "open-uri"
3
3
  module EmailCrawler
4
4
  class EmailScanner
5
5
  EMAIL_REGEXP = /\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,4}\b/i
6
- SLEEP_TIME = 0.5
7
6
  UTF_8 = "UTF-8".freeze
8
7
 
9
- def initialize(url, logger = Logger.new("/dev/null"))
10
- @url, @logger = url, logger
8
+ def initialize(logger = Logger.new("/dev/null"))
9
+ @logger = logger
11
10
  end
12
11
 
13
12
  def scan(links)
14
- emails_by_link = {}
15
-
16
- links.each do |link|
13
+ links.each_with_object({}) do |link, h|
17
14
  @logger.info "searching for emails on '#{link}'.."
18
15
  retried = false
19
16
 
20
17
  html = begin
21
- open(link).read
22
- rescue OpenURI::HTTPError => err
23
- @logger.warn(err)
24
- nil
25
- rescue => err
26
- if err.message =~ /redirection forbidden/
27
- link = err.message.split(" ").last
28
- retry
29
- end
30
- end
18
+ open(link).read
19
+ rescue OpenURI::HTTPError => err
20
+ @logger.warn(err)
21
+ nil
22
+ rescue => err
23
+ if err.message =~ /redirection forbidden/
24
+ link = err.message.split(" ").last
25
+ retry
26
+ end
27
+ end
31
28
  next unless html
32
29
 
33
30
  begin
@@ -42,11 +39,9 @@ module EmailCrawler
42
39
  retry
43
40
  end
44
41
  end
45
- emails_by_link[link] = Set.new(emails) unless emails.empty?
46
- sleep(SLEEP_TIME)
47
- end
48
42
 
49
- emails_by_link
43
+ h[link] = Set.new(emails) unless emails.empty?
44
+ end
50
45
  end
51
46
  end
52
47
  end
@@ -6,7 +6,7 @@ module EmailCrawler
6
6
 
7
7
  def new_agent
8
8
  Thread.current[:agent] ||= Mechanize.new do |agent|
9
- agent.user_agent_alias = "Mac Safari"
9
+ agent.user_agent_alias = "Windows Mozilla"
10
10
  agent.open_timeout = agent.read_timeout = READ_TIMEOUT
11
11
  agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
12
12
  agent.history.max_size = 1
@@ -73,7 +73,7 @@ module EmailCrawler
73
73
  links.to_a
74
74
  end
75
75
 
76
- private
76
+ private
77
77
 
78
78
  def agent
79
79
  @agent ||= new_agent
@@ -1,3 +1,5 @@
1
+ require "set"
2
+ require_relative "mechanize_helper"
1
3
  require_relative "url_helper"
2
4
 
3
5
  module EmailCrawler
@@ -8,41 +10,53 @@ module EmailCrawler
8
10
  include URLHelper
9
11
 
10
12
  def initialize(google_website, max_results: MAX_RESULTS, blacklisted_domains: [])
11
- @google_website = "https://www.#{google_website}/"
13
+ @search_url = "https://www.#{google_website}/search?q="
12
14
  @max_results = max_results
13
15
  @blacklisted_domains = blacklisted_domains.map { |domain| /#{domain}\z/ }
14
16
  end
15
17
 
16
18
  def search_result_urls_for(q)
17
- search_page = agent.get(@google_website)
18
- search_form = search_page.form_with(action: "/search")
19
- search_form.field_with(name: "q").value = q
20
- search_results_page = agent.submit(search_form)
21
- urls = search_results_on(search_results_page)
19
+ search_results_page = agent.get(@search_url + CGI.escape(q))
20
+ urls = Set.new(search_results_on(search_results_page))
22
21
 
23
22
  page = 1
24
23
  while urls.size < @max_results
25
24
  next_page_link = search_results_page.link_with(href: /start=#{page*10}/)
26
- return urls unless next_page_link
25
+ break unless next_page_link
27
26
 
28
27
  next_search_results_page = next_page_link.click
29
- urls.concat(search_results_on(next_search_results_page)).uniq!
28
+ search_results_on(next_search_results_page).each do |url|
29
+ urls << url
30
+ end
31
+
30
32
  page += 1
31
33
  end
32
34
 
33
- urls.first(@max_results)
35
+ urls.to_a.first(@max_results)
34
36
  end
35
37
 
36
- private
38
+ private
37
39
 
38
40
  def search_results_on(page)
39
- page.search("#search ol li h3.r a").
40
- map { |a| a["href"].downcase }.
41
- reject { |url| url =~ %r(\A/search[?]q=) }.
42
- reject do |url|
41
+ urls = page.search("#search ol li.g h3.r a").map do |a|
42
+ href = a[:href]
43
+ url = href =~ %r(/url\?q=) && $POSTMATCH
44
+
45
+ if url
46
+ url = url =~ /&sa=/ && $PREMATCH
47
+ CGI.unescape(url) if url
48
+ end
49
+ end
50
+ urls.compact!
51
+
52
+ unless @blacklisted_domains.empty?
53
+ urls.delete_if do |url|
43
54
  domain = extract_domain_from(url)
44
55
  @blacklisted_domains.any? { |blacklisted_domain| domain =~ blacklisted_domain }
45
56
  end
57
+ end
58
+
59
+ urls
46
60
  end
47
61
 
48
62
  def agent
@@ -11,6 +11,7 @@ module URLHelper
11
11
  return
12
12
  end
13
13
  host = uri.host || url[DOMAIN_REGEXP, 1].to_s
14
+
14
15
  if www || host !~ WWW_REGEXP
15
16
  host.downcase
16
17
  else
@@ -1,3 +1,3 @@
1
1
  module EmailCrawler
2
- VERSION = "0.0.13"
2
+ VERSION = "0.1.0"
3
3
  end
@@ -4,9 +4,9 @@ require File.expand_path("lib/email_crawler")
4
4
 
5
5
  module EmailCrawler
6
6
  describe EmailScanner do
7
- subject { EmailScanner.new("google.com") }
7
+ subject { EmailScanner.new }
8
8
 
9
- let(:link) { "http://www.kitaylaw.com/contact.php" }
9
+ let(:link) { "https://www.mrosupply.com/page/plain/contact-us/" }
10
10
 
11
11
  it "scans links for email addresses" do
12
12
  emails_by_link = subject.scan([link])
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: email_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.13
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Cristian Rasch
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-03-28 00:00:00.000000000 Z
11
+ date: 2015-10-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -56,16 +56,16 @@ dependencies:
56
56
  name: bundler
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - "~>"
59
+ - - ">="
60
60
  - !ruby/object:Gem::Version
61
- version: '1.5'
61
+ version: '0'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - "~>"
66
+ - - ">="
67
67
  - !ruby/object:Gem::Version
68
- version: '1.5'
68
+ version: '0'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: rake
71
71
  requirement: !ruby/object:Gem::Requirement
@@ -86,14 +86,14 @@ dependencies:
86
86
  requirements:
87
87
  - - "~>"
88
88
  - !ruby/object:Gem::Version
89
- version: 5.2.3
89
+ version: '5.8'
90
90
  type: :development
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
94
  - - "~>"
95
95
  - !ruby/object:Gem::Version
96
- version: 5.2.3
96
+ version: '5.8'
97
97
  description:
98
98
  email:
99
99
  - cristianrasch@fastmail.fm
@@ -102,7 +102,6 @@ executables:
102
102
  extensions: []
103
103
  extra_rdoc_files: []
104
104
  files:
105
- - ".env.example"
106
105
  - ".gitignore"
107
106
  - ".ruby-version"
108
107
  - Gemfile
@@ -142,7 +141,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
142
141
  version: '0'
143
142
  requirements: []
144
143
  rubyforge_project:
145
- rubygems_version: 2.2.0
144
+ rubygems_version: 2.4.5.1
146
145
  signing_key:
147
146
  specification_version: 4
148
147
  summary: 'Email crawler: crawls the top ten Google search results looking for email
data/.env.example DELETED
@@ -1,2 +0,0 @@
1
- DO_CLIENT_ID=top
2
- DO_API_KEY=secret