email_crawler 0.0.9 → 0.0.10

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: 49d12206392f47b3417cebc922a65e38030aab31
-   data.tar.gz: 953cd1f8ab44e9fafc361b06495d9278832b72e5
+   metadata.gz: 6c7248b8e8688fb03bc939e0690a87920bd43f74
+   data.tar.gz: d125e3a9ad7b554e7e98d994c9b158ecfaf93b6b
  SHA512:
-   metadata.gz: 21531db699176ddca60be2c5215129fc601574674b43f463da2d83b175c46693cd8d90c9fc76a246cf988224afd82cbc1ed6dcb450c938535a859376de44da68
-   data.tar.gz: 2b80fb41432a1a549b92fc7342e995e20fbf0753db51488546dddf634f4adb50e67bd2248a2efc6aa17e60d38b74e60318937d499e1651b1275330ca81982add
+   metadata.gz: e238ffdaabe400c7d70ee323a6769aece2b8b141c530aacbbc7c822b6952f44f6379d77a744b45c2e2e1061e8ee3f4f06def65a34299442d149055b591f7f665
+   data.tar.gz: 79a2d7d3e0c7f60312e81f72a6b9dcd7b1d2b5b8de672ec3b3fdf823021b57514d77577f01f48cc7609a4650b7bcf1646d312900553d8f9ca55c716668b183ea
data/lib/email_crawler/scraper.rb CHANGED
@@ -1,14 +1,16 @@
- require_relative "proxy"
+ require_relative "url_helper"
 
  module EmailCrawler
    class Scraper
      MAX_RESULTS = 100
 
      include MechanizeHelper
+     include URLHelper
 
-     def initialize(google_website, max_results = MAX_RESULTS)
+     def initialize(google_website, max_results: MAX_RESULTS, blacklisted_domains: [])
        @google_website = "https://www.#{google_website}/"
        @max_results = max_results
+       @blacklisted_domains = blacklisted_domains.map { |domain| /#{domain}\z/ }
      end
 
      def search_result_urls_for(q)
@@ -36,12 +38,15 @@ module EmailCrawler
      def search_results_on(page)
        page.search("#search ol li h3.r a").
          map { |a| a["href"].downcase }.
-         reject { |url| url =~ %r(\A/search[?]q=) }
+         reject { |url| url =~ %r(\A/search[?]q=) }.
+         reject do |url|
+           domain = extract_domain_from(url)
+           @blacklisted_domains.any? { |blacklisted_domain| domain =~ blacklisted_domain }
+         end
      end
 
      def agent
        @agent ||= new_agent
-       # @agent ||= new_agent { |agent| agent.set_proxy(Proxy.random, "8888") }
      end
    end
  end
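The Scraper constructor now takes keyword arguments and an optional domain blacklist instead of a lone positional max_results. A minimal usage sketch of the new API (the query and the blacklisted domains below are made up for illustration):

    scraper = EmailCrawler::Scraper.new("google.de",
                                        max_results: 25,
                                        blacklisted_domains: %w(yelp.de tripadvisor.de))
    urls = scraper.search_result_urls_for("berlin tours")

Because each blacklisted entry is compiled into an end-anchored regexp (/#{domain}\z/), any result whose host ends in one of those domains, subdomains included, is dropped from the search results.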
data/lib/email_crawler/url_helper.rb ADDED
@@ -0,0 +1,20 @@
+ require "English"
+
+ module URLHelper
+   DOMAIN_REGEXP = %r(https://([^/]+))i
+   WWW_REGEXP = /\Awww[.]/i
+
+   def extract_domain_from(url, www = false)
+     uri = begin
+             URI(url)
+           rescue URI::InvalidURIError
+             return
+           end
+     host = uri.host || url[DOMAIN_REGEXP, 1].to_s
+     if www || host !~ WWW_REGEXP
+       host.downcase
+     else
+       $POSTMATCH.downcase
+     end
+   end
+ end
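The new URLHelper module backs the blacklist check above. A rough sketch of how extract_domain_from behaves when mixed into a class (the including class here is hypothetical, and URI is assumed to be loaded elsewhere in the gem):

    class LinkInspector
      include URLHelper  # hypothetical consumer, mirroring Scraper's include
    end

    inspector = LinkInspector.new
    inspector.extract_domain_from("https://www.example.com/contact")        # => "example.com"
    inspector.extract_domain_from("https://www.example.com/contact", true)  # => "www.example.com"
    inspector.extract_domain_from("not a url")                              # => nil (invalid URI)

The second argument keeps the leading "www." when true; require "English" is what provides the $POSTMATCH alias used to strip it otherwise.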
data/lib/email_crawler/version.rb CHANGED
@@ -1,3 +1,3 @@
  module EmailCrawler
-   VERSION = "0.0.9"
+   VERSION = "0.0.10"
  end
data/lib/email_crawler.rb CHANGED
@@ -22,7 +22,7 @@ module EmailCrawler
    end
 
    def run(q)
-     urls = Scraper.new(@google_website, @max_results).search_result_urls_for(q)
+     urls = Scraper.new(@google_website, max_results: @max_results).search_result_urls_for(q)
      urls.each { |url| logger.info "#{url}" }
      queue = Queue.new
      urls.each { |url| queue.push(url) }
spec/lib/email_crawler/scraper_spec.rb CHANGED
@@ -6,7 +6,7 @@ module EmailCrawler
    describe Scraper do
      let(:max_results) { 10 }
 
-     subject { Scraper.new("google.de", max_results) }
+     subject { Scraper.new("google.de", max_results: max_results) }
 
      it "returns the top 10 URLs for a given search term/expression" do
        subject.search_result_urls_for("berlin tours").length.must_equal max_results
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: email_crawler
  version: !ruby/object:Gem::Version
-   version: 0.0.9
+   version: 0.0.10
  platform: ruby
  authors:
  - Cristian Rasch
@@ -115,8 +115,8 @@ files:
  - lib/email_crawler/email_scanner.rb
  - lib/email_crawler/mechanize_helper.rb
  - lib/email_crawler/page_links.rb
- - lib/email_crawler/proxy.rb
  - lib/email_crawler/scraper.rb
+ - lib/email_crawler/url_helper.rb
  - lib/email_crawler/version.rb
  - spec/lib/email_crawler/email_scanner_spec.rb
  - spec/lib/email_crawler/page_links_spec.rb
data/lib/email_crawler/proxy.rb DELETED
@@ -1,30 +0,0 @@
- require "openssl"
- require "open-uri"
- require "json"
- require "dotenv"
-
- module EmailCrawler
-   class Proxy
-     class << self
-       def random
-         all.sample
-       end
-
-       private
-
-       def all
-         @all ||= begin
-           Dotenv.load
-
-           json = JSON.parse(open("https://api.digitalocean.com/droplets/?client_id=#{ENV['DO_CLIENT_ID']}&api_key=#{ENV['DO_API_KEY']}",
-                                  ssl_verify_mode: ::OpenSSL::SSL::VERIFY_NONE).read)
-           json["droplets"].
-             select{ |droplet| droplet["name"] =~ /proxy\d+/ }.
-             map { |droplet| droplet["ip_address"] }
-         end
-       end
-     end
-   end
- end
-
-