email_crawler 0.0.10 → 0.0.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/email-crawler +7 -0
- data/lib/email_crawler/version.rb +1 -1
- data/lib/email_crawler.rb +5 -2
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c1d4c2e2b7fa2781a050534296a5de7783100c3a
|
4
|
+
data.tar.gz: 5fd71c35a51c903284312217e1195e807f1e2d40
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2ac69d6d4042db503c947b303e0b1cb28ea3421e0c11879f18652d734f865adee341dcc3cfae2bc98ff7f46c8e7389c55c5b6884aa6690e63c2b1163c69c50c3
|
7
|
+
data.tar.gz: ce1d9e1f24b7488aac88f68fa75003d28065cd5c12917435c7221cb2901d6b99507613eb473f719929f314a4e9c5b8ace850fef3fb19f1fa58619f99721741e0
|
data/bin/email-crawler
CHANGED
@@ -12,6 +12,7 @@ class OptionsParser
|
|
12
12
|
options.max_results = ::EmailCrawler::Scraper::MAX_RESULTS
|
13
13
|
options.max_links = ::EmailCrawler::PageLinks::MAX_LINKS
|
14
14
|
options.max_concurrency = ::EmailCrawler::Runner::MAX_CONCURRENCY
|
15
|
+
options.blacklisted_domains = []
|
15
16
|
|
16
17
|
opt_parser = OptionParser.new do |opts|
|
17
18
|
opts.banner = "Usage: email-crawler [options]"
|
@@ -45,6 +46,11 @@ class OptionsParser
|
|
45
46
|
" (defaults to 10)") do |max_concurrency|
|
46
47
|
options.max_concurrency = max_concurrency.to_i
|
47
48
|
end
|
49
|
+
|
50
|
+
opts.on("-b", "--blacklist DOMAIN",
|
51
|
+
"Blacklist URLs under this domain from Google's search results") do |domain|
|
52
|
+
options.blacklisted_domains << domain
|
53
|
+
end
|
48
54
|
end
|
49
55
|
|
50
56
|
opt_parser.parse!(args)
|
@@ -61,6 +67,7 @@ else
|
|
61
67
|
runner.max_results = options.max_results
|
62
68
|
runner.max_links = options.max_links
|
63
69
|
runner.max_concurrency = options.max_concurrency
|
70
|
+
runner.blacklisted_domains = options.blacklisted_domains
|
64
71
|
end
|
65
72
|
csv = runner.run(options.q)
|
66
73
|
$stdout << "#{csv}\n"
|
data/lib/email_crawler.rb
CHANGED
@@ -14,7 +14,7 @@ module EmailCrawler
|
|
14
14
|
class Runner
|
15
15
|
MAX_CONCURRENCY = 50
|
16
16
|
|
17
|
-
attr_writer :max_results, :max_links, :max_concurrency, :logger
|
17
|
+
attr_writer :max_results, :max_links, :max_concurrency, :logger, :blacklisted_domains
|
18
18
|
|
19
19
|
def initialize(google_website)
|
20
20
|
@google_website = google_website
|
@@ -22,7 +22,10 @@ module EmailCrawler
|
|
22
22
|
end
|
23
23
|
|
24
24
|
def run(q)
|
25
|
-
urls = Scraper.new(@google_website,
|
25
|
+
urls = Scraper.new(@google_website,
|
26
|
+
max_results: @max_results,
|
27
|
+
blacklisted_domains: @blacklisted_domains).
|
28
|
+
search_result_urls_for(q)
|
26
29
|
urls.each { |url| logger.info "#{url}" }
|
27
30
|
queue = Queue.new
|
28
31
|
urls.each { |url| queue.push(url) }
|