email_crawler 0.0.10 → 0.0.11

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6c7248b8e8688fb03bc939e0690a87920bd43f74
4
- data.tar.gz: d125e3a9ad7b554e7e98d994c9b158ecfaf93b6b
3
+ metadata.gz: c1d4c2e2b7fa2781a050534296a5de7783100c3a
4
+ data.tar.gz: 5fd71c35a51c903284312217e1195e807f1e2d40
5
5
  SHA512:
6
- metadata.gz: e238ffdaabe400c7d70ee323a6769aece2b8b141c530aacbbc7c822b6952f44f6379d77a744b45c2e2e1061e8ee3f4f06def65a34299442d149055b591f7f665
7
- data.tar.gz: 79a2d7d3e0c7f60312e81f72a6b9dcd7b1d2b5b8de672ec3b3fdf823021b57514d77577f01f48cc7609a4650b7bcf1646d312900553d8f9ca55c716668b183ea
6
+ metadata.gz: 2ac69d6d4042db503c947b303e0b1cb28ea3421e0c11879f18652d734f865adee341dcc3cfae2bc98ff7f46c8e7389c55c5b6884aa6690e63c2b1163c69c50c3
7
+ data.tar.gz: ce1d9e1f24b7488aac88f68fa75003d28065cd5c12917435c7221cb2901d6b99507613eb473f719929f314a4e9c5b8ace850fef3fb19f1fa58619f99721741e0
data/bin/email-crawler CHANGED
@@ -12,6 +12,7 @@ class OptionsParser
12
12
  options.max_results = ::EmailCrawler::Scraper::MAX_RESULTS
13
13
  options.max_links = ::EmailCrawler::PageLinks::MAX_LINKS
14
14
  options.max_concurrency = ::EmailCrawler::Runner::MAX_CONCURRENCY
15
+ options.blacklisted_domains = []
15
16
 
16
17
  opt_parser = OptionParser.new do |opts|
17
18
  opts.banner = "Usage: email-crawler [options]"
@@ -45,6 +46,11 @@ class OptionsParser
45
46
  " (defaults to 10)") do |max_concurrency|
46
47
  options.max_concurrency = max_concurrency.to_i
47
48
  end
49
+
50
+ opts.on("-b", "--blacklist DOMAIN",
51
+ "Blacklist URLs under this domain from Google's search results") do |domain|
52
+ options.blacklisted_domains << domain
53
+ end
48
54
  end
49
55
 
50
56
  opt_parser.parse!(args)
@@ -61,6 +67,7 @@ else
61
67
  runner.max_results = options.max_results
62
68
  runner.max_links = options.max_links
63
69
  runner.max_concurrency = options.max_concurrency
70
+ runner.blacklisted_domains = options.blacklisted_domains
64
71
  end
65
72
  csv = runner.run(options.q)
66
73
  $stdout << "#{csv}\n"
data/lib/email_crawler/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module EmailCrawler
2
- VERSION = "0.0.10"
2
+ VERSION = "0.0.11"
3
3
  end
data/lib/email_crawler.rb CHANGED
@@ -14,7 +14,7 @@ module EmailCrawler
14
14
  class Runner
15
15
  MAX_CONCURRENCY = 50
16
16
 
17
- attr_writer :max_results, :max_links, :max_concurrency, :logger
17
+ attr_writer :max_results, :max_links, :max_concurrency, :logger, :blacklisted_domains
18
18
 
19
19
  def initialize(google_website)
20
20
  @google_website = google_website
@@ -22,7 +22,10 @@ module EmailCrawler
22
22
  end
23
23
 
24
24
  def run(q)
25
- urls = Scraper.new(@google_website, max_results: @max_results).search_result_urls_for(q)
25
+ urls = Scraper.new(@google_website,
26
+ max_results: @max_results,
27
+ blacklisted_domains: @blacklisted_domains).
28
+ search_result_urls_for(q)
26
29
  urls.each { |url| logger.info "#{url}" }
27
30
  queue = Queue.new
28
31
  urls.each { |url| queue.push(url) }
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: email_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.10
4
+ version: 0.0.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - Cristian Rasch