email_crawler 0.0.9 → 0.0.10
- checksums.yaml +4 -4
- data/lib/email_crawler/scraper.rb +9 -4
- data/lib/email_crawler/url_helper.rb +20 -0
- data/lib/email_crawler/version.rb +1 -1
- data/lib/email_crawler.rb +1 -1
- data/spec/lib/email_crawler/scraper_spec.rb +1 -1
- metadata +2 -2
- data/lib/email_crawler/proxy.rb +0 -30
checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6c7248b8e8688fb03bc939e0690a87920bd43f74
+  data.tar.gz: d125e3a9ad7b554e7e98d994c9b158ecfaf93b6b
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e238ffdaabe400c7d70ee323a6769aece2b8b141c530aacbbc7c822b6952f44f6379d77a744b45c2e2e1061e8ee3f4f06def65a34299442d149055b591f7f665
+  data.tar.gz: 79a2d7d3e0c7f60312e81f72a6b9dcd7b1d2b5b8de672ec3b3fdf823021b57514d77577f01f48cc7609a4650b7bcf1646d312900553d8f9ca55c716668b183ea
data/lib/email_crawler/scraper.rb CHANGED

@@ -1,14 +1,16 @@
-require_relative "
+require_relative "url_helper"
 
 module EmailCrawler
   class Scraper
     MAX_RESULTS = 100
 
     include MechanizeHelper
+    include URLHelper
 
-    def initialize(google_website, max_results
+    def initialize(google_website, max_results: MAX_RESULTS, blacklisted_domains: [])
       @google_website = "https://www.#{google_website}/"
       @max_results = max_results
+      @blacklisted_domains = blacklisted_domains.map { |domain| /#{domain}\z/ }
     end
 
     def search_result_urls_for(q)
@@ -36,12 +38,15 @@ module EmailCrawler
     def search_results_on(page)
       page.search("#search ol li h3.r a").
         map { |a| a["href"].downcase }.
-        reject { |url| url =~ %r(\A/search[?]q=) }
+        reject { |url| url =~ %r(\A/search[?]q=) }.
+        reject do |url|
+          domain = extract_domain_from(url)
+          @blacklisted_domains.any? { |blacklisted_domain| domain =~ blacklisted_domain }
+        end
     end
 
     def agent
       @agent ||= new_agent
-      # @agent ||= new_agent { |agent| agent.set_proxy(Proxy.random, "8888") }
    end
  end
end
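The new keyword arguments change how a Scraper is constructed. A minimal usage sketch follows; the query and domain list are invented for illustration, and only the signature shown in the diff is confirmed by the source:

require "email_crawler"

# Each blacklisted domain is interpolated into an end-anchored regexp
# (/#{domain}\z/), so any search-result URL whose domain ends in a
# blacklisted entry is dropped from the results.
scraper = EmailCrawler::Scraper.new("google.de",
                                    max_results: 50,
                                    blacklisted_domains: %w(yelp.de tripadvisor.de))
urls = scraper.search_result_urls_for("berlin tours")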
data/lib/email_crawler/url_helper.rb ADDED

@@ -0,0 +1,20 @@
+require "English"
+
+module URLHelper
+  DOMAIN_REGEXP = %r(https://([^/]+))i
+  WWW_REGEXP = /\Awww[.]/i
+
+  def extract_domain_from(url, www = false)
+    uri = begin
+      URI(url)
+    rescue URI::InvalidURIError
+      return
+    end
+    host = uri.host || url[DOMAIN_REGEXP, 1].to_s
+    if www || host !~ WWW_REGEXP
+      host.downcase
+    else
+      $POSTMATCH.downcase
+    end
+  end
+end
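The helper's behaviour can be sketched as below. The DomainDemo class and the example URLs are hypothetical, and the expected values are inferred from the code above, not from the gem's documentation:

require "English"  # provides $POSTMATCH, which the helper uses after matching WWW_REGEXP

class DomainDemo
  include URLHelper
end

demo = DomainDemo.new
demo.extract_domain_from("https://www.example.com/page")       # => "example.com" (leading "www." stripped)
demo.extract_domain_from("https://www.example.com/page", true) # => "www.example.com" (www kept when requested)
demo.extract_domain_from("https://blog.example.com/")          # => "blog.example.com"
demo.extract_domain_from("http://bad uri")                     # => nil (URI::InvalidURIError is rescued)

When URI() parses the URL but reports no host, the method falls back to capturing everything between "https://" and the next "/" via DOMAIN_REGEXP.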
data/lib/email_crawler.rb CHANGED

@@ -22,7 +22,7 @@ module EmailCrawler
     end
 
     def run(q)
-      urls = Scraper.new(@google_website, @max_results).search_result_urls_for(q)
+      urls = Scraper.new(@google_website, max_results: @max_results).search_result_urls_for(q)
       urls.each { |url| logger.info "#{url}" }
       queue = Queue.new
       urls.each { |url| queue.push(url) }
data/spec/lib/email_crawler/scraper_spec.rb CHANGED

@@ -6,7 +6,7 @@ module EmailCrawler
   describe Scraper do
     let(:max_results) { 10 }
 
-    subject { Scraper.new("google.de", max_results) }
+    subject { Scraper.new("google.de", max_results: max_results) }
 
     it "returns the top 10 URLs for a given search term/expression" do
       subject.search_result_urls_for("berlin tours").length.must_equal max_results
metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: email_crawler
 version: !ruby/object:Gem::Version
-  version: 0.0.9
+  version: 0.0.10
 platform: ruby
 authors:
 - Cristian Rasch
@@ -115,8 +115,8 @@ files:
 - lib/email_crawler/email_scanner.rb
 - lib/email_crawler/mechanize_helper.rb
 - lib/email_crawler/page_links.rb
-- lib/email_crawler/proxy.rb
 - lib/email_crawler/scraper.rb
+- lib/email_crawler/url_helper.rb
 - lib/email_crawler/version.rb
 - spec/lib/email_crawler/email_scanner_spec.rb
 - spec/lib/email_crawler/page_links_spec.rb
data/lib/email_crawler/proxy.rb DELETED

@@ -1,30 +0,0 @@
-require "openssl"
-require "open-uri"
-require "json"
-require "dotenv"
-
-module EmailCrawler
-  class Proxy
-    class << self
-      def random
-        all.sample
-      end
-
-      private
-
-      def all
-        @all ||= begin
-          Dotenv.load
-
-          json = JSON.parse(open("https://api.digitalocean.com/droplets/?client_id=#{ENV['DO_CLIENT_ID']}&api_key=#{ENV['DO_API_KEY']}",
-                                 ssl_verify_mode: ::OpenSSL::SSL::VERIFY_NONE).read)
-          json["droplets"].
-            select{ |droplet| droplet["name"] =~ /proxy\d+/ }.
-            map { |droplet| droplet["ip_address"] }
-        end
-      end
-    end
-  end
-end
-
-