email_crawler 0.0.9 → 0.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 49d12206392f47b3417cebc922a65e38030aab31
-  data.tar.gz: 953cd1f8ab44e9fafc361b06495d9278832b72e5
+  metadata.gz: 6c7248b8e8688fb03bc939e0690a87920bd43f74
+  data.tar.gz: d125e3a9ad7b554e7e98d994c9b158ecfaf93b6b
 SHA512:
-  metadata.gz: 21531db699176ddca60be2c5215129fc601574674b43f463da2d83b175c46693cd8d90c9fc76a246cf988224afd82cbc1ed6dcb450c938535a859376de44da68
-  data.tar.gz: 2b80fb41432a1a549b92fc7342e995e20fbf0753db51488546dddf634f4adb50e67bd2248a2efc6aa17e60d38b74e60318937d499e1651b1275330ca81982add
+  metadata.gz: e238ffdaabe400c7d70ee323a6769aece2b8b141c530aacbbc7c822b6952f44f6379d77a744b45c2e2e1061e8ee3f4f06def65a34299442d149055b591f7f665
+  data.tar.gz: 79a2d7d3e0c7f60312e81f72a6b9dcd7b1d2b5b8de672ec3b3fdf823021b57514d77577f01f48cc7609a4650b7bcf1646d312900553d8f9ca55c716668b183ea
data/lib/email_crawler/scraper.rb CHANGED
@@ -1,14 +1,16 @@
-require_relative "proxy"
+require_relative "url_helper"
 
 module EmailCrawler
   class Scraper
     MAX_RESULTS = 100
 
     include MechanizeHelper
+    include URLHelper
 
-    def initialize(google_website, max_results = MAX_RESULTS)
+    def initialize(google_website, max_results: MAX_RESULTS, blacklisted_domains: [])
       @google_website = "https://www.#{google_website}/"
       @max_results = max_results
+      @blacklisted_domains = blacklisted_domains.map { |domain| /#{domain}\z/ }
     end
 
     def search_result_urls_for(q)
@@ -36,12 +38,15 @@ module EmailCrawler
     def search_results_on(page)
       page.search("#search ol li h3.r a").
         map { |a| a["href"].downcase }.
-        reject { |url| url =~ %r(\A/search[?]q=) }
+        reject { |url| url =~ %r(\A/search[?]q=) }.
+        reject do |url|
+          domain = extract_domain_from(url)
+          @blacklisted_domains.any? { |blacklisted_domain| domain =~ blacklisted_domain }
+        end
     end
 
     def agent
       @agent ||= new_agent
-      # @agent ||= new_agent { |agent| agent.set_proxy(Proxy.random, "8888") }
     end
   end
 end
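
A minimal usage sketch of the reworked constructor (illustrative only; the query and domain values below are invented, not taken from the diff). max_results moves from a positional to a keyword argument, and each blacklisted_domains entry is compiled into an end-anchored regexp, so hosts ending in a blacklisted domain, subdomains included, are rejected by the new filter in search_results_on:

  require "email_crawler"

  # Hypothetical call site -- values are made up for illustration.
  scraper = EmailCrawler::Scraper.new(
    "google.de",
    max_results: 25,
    blacklisted_domains: %w[facebook.com yelp.com]
  )

  # search_results_on now drops any result whose extracted domain ends
  # in one of the blacklisted domains (e.g. m.facebook.com as well).
  urls = scraper.search_result_urls_for("berlin tours")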
data/lib/email_crawler/url_helper.rb ADDED
@@ -0,0 +1,20 @@
+require "English"
+
+module URLHelper
+  DOMAIN_REGEXP = %r(https://([^/]+))i
+  WWW_REGEXP = /\Awww[.]/i
+
+  def extract_domain_from(url, www = false)
+    uri = begin
+      URI(url)
+    rescue URI::InvalidURIError
+      return
+    end
+    host = uri.host || url[DOMAIN_REGEXP, 1].to_s
+    if www || host !~ WWW_REGEXP
+      host.downcase
+    else
+      $POSTMATCH.downcase
+    end
+  end
+end
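
Illustrative behavior of the new URLHelper#extract_domain_from, based on a reading of the code above (DomainDemo is a made-up includer; the gem itself mixes the helper into Scraper). The optional second argument keeps a leading "www.", which is otherwise stripped via $POSTMATCH, and unparseable URLs return nil:

  require "uri"
  require "email_crawler/url_helper"

  # Throwaway class just to reach the mixin's instance method.
  class DomainDemo
    include URLHelper
  end

  demo = DomainDemo.new
  demo.extract_domain_from("https://www.example.com/path")       # => "example.com"
  demo.extract_domain_from("https://www.example.com/path", true) # => "www.example.com"
  demo.extract_domain_from("https://blog.example.com/")          # => "blog.example.com"
  demo.extract_domain_from("http://exa mple.com")                # => nil (URI::InvalidURIError rescued)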
data/lib/email_crawler/version.rb CHANGED
@@ -1,3 +1,3 @@
 module EmailCrawler
-  VERSION = "0.0.9"
+  VERSION = "0.0.10"
 end
data/lib/email_crawler.rb CHANGED
@@ -22,7 +22,7 @@ module EmailCrawler
   end
 
   def run(q)
-    urls = Scraper.new(@google_website, @max_results).search_result_urls_for(q)
+    urls = Scraper.new(@google_website, max_results: @max_results).search_result_urls_for(q)
     urls.each { |url| logger.info "#{url}" }
     queue = Queue.new
     urls.each { |url| queue.push(url) }
data/spec/lib/email_crawler/scraper_spec.rb CHANGED
@@ -6,7 +6,7 @@ module EmailCrawler
   describe Scraper do
     let(:max_results) { 10 }
 
-    subject { Scraper.new("google.de", max_results) }
+    subject { Scraper.new("google.de", max_results: max_results) }
 
     it "returns the top 10 URLs for a given search term/expression" do
       subject.search_result_urls_for("berlin tours").length.must_equal max_results
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: email_crawler
 version: !ruby/object:Gem::Version
-  version: 0.0.9
+  version: 0.0.10
 platform: ruby
 authors:
 - Cristian Rasch
@@ -115,8 +115,8 @@ files:
 - lib/email_crawler/email_scanner.rb
 - lib/email_crawler/mechanize_helper.rb
 - lib/email_crawler/page_links.rb
-- lib/email_crawler/proxy.rb
 - lib/email_crawler/scraper.rb
+- lib/email_crawler/url_helper.rb
 - lib/email_crawler/version.rb
 - spec/lib/email_crawler/email_scanner_spec.rb
 - spec/lib/email_crawler/page_links_spec.rb
data/lib/email_crawler/proxy.rb REMOVED
@@ -1,30 +0,0 @@
-require "openssl"
-require "open-uri"
-require "json"
-require "dotenv"
-
-module EmailCrawler
-  class Proxy
-    class << self
-      def random
-        all.sample
-      end
-
-      private
-
-      def all
-        @all ||= begin
-          Dotenv.load
-
-          json = JSON.parse(open("https://api.digitalocean.com/droplets/?client_id=#{ENV['DO_CLIENT_ID']}&api_key=#{ENV['DO_API_KEY']}",
-                                 ssl_verify_mode: ::OpenSSL::SSL::VERIFY_NONE).read)
-          json["droplets"].
-            select { |droplet| droplet["name"] =~ /proxy\d+/ }.
-            map { |droplet| droplet["ip_address"] }
-        end
-      end
-    end
-  end
-end
-
-