email_crawler 0.0.10 → 0.0.11

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6c7248b8e8688fb03bc939e0690a87920bd43f74
4
- data.tar.gz: d125e3a9ad7b554e7e98d994c9b158ecfaf93b6b
3
+ metadata.gz: c1d4c2e2b7fa2781a050534296a5de7783100c3a
4
+ data.tar.gz: 5fd71c35a51c903284312217e1195e807f1e2d40
5
5
  SHA512:
6
- metadata.gz: e238ffdaabe400c7d70ee323a6769aece2b8b141c530aacbbc7c822b6952f44f6379d77a744b45c2e2e1061e8ee3f4f06def65a34299442d149055b591f7f665
7
- data.tar.gz: 79a2d7d3e0c7f60312e81f72a6b9dcd7b1d2b5b8de672ec3b3fdf823021b57514d77577f01f48cc7609a4650b7bcf1646d312900553d8f9ca55c716668b183ea
6
+ metadata.gz: 2ac69d6d4042db503c947b303e0b1cb28ea3421e0c11879f18652d734f865adee341dcc3cfae2bc98ff7f46c8e7389c55c5b6884aa6690e63c2b1163c69c50c3
7
+ data.tar.gz: ce1d9e1f24b7488aac88f68fa75003d28065cd5c12917435c7221cb2901d6b99507613eb473f719929f314a4e9c5b8ace850fef3fb19f1fa58619f99721741e0
data/bin/email-crawler CHANGED
@@ -12,6 +12,7 @@ class OptionsParser
12
12
  options.max_results = ::EmailCrawler::Scraper::MAX_RESULTS
13
13
  options.max_links = ::EmailCrawler::PageLinks::MAX_LINKS
14
14
  options.max_concurrency = ::EmailCrawler::Runner::MAX_CONCURRENCY
15
+ options.blacklisted_domains = []
15
16
 
16
17
  opt_parser = OptionParser.new do |opts|
17
18
  opts.banner = "Usage: email-crawler [options]"
@@ -45,6 +46,11 @@ class OptionsParser
45
46
  " (defaults to 10)") do |max_concurrency|
46
47
  options.max_concurrency = max_concurrency.to_i
47
48
  end
49
+
50
+ opts.on("-b", "--blacklist DOMAIN",
51
+ "Blacklist URLs under this domain from Google's search results") do |domain|
52
+ options.blacklisted_domains << domain
53
+ end
48
54
  end
49
55
 
50
56
  opt_parser.parse!(args)
@@ -61,6 +67,7 @@ else
61
67
  runner.max_results = options.max_results
62
68
  runner.max_links = options.max_links
63
69
  runner.max_concurrency = options.max_concurrency
70
+ runner.blacklisted_domains = options.blacklisted_domains
64
71
  end
65
72
  csv = runner.run(options.q)
66
73
  $stdout << "#{csv}\n"
@@ -1,3 +1,3 @@
1
1
  module EmailCrawler
2
- VERSION = "0.0.10"
2
+ VERSION = "0.0.11"
3
3
  end
data/lib/email_crawler.rb CHANGED
@@ -14,7 +14,7 @@ module EmailCrawler
14
14
  class Runner
15
15
  MAX_CONCURRENCY = 50
16
16
 
17
- attr_writer :max_results, :max_links, :max_concurrency, :logger
17
+ attr_writer :max_results, :max_links, :max_concurrency, :logger, :blacklisted_domains
18
18
 
19
19
  def initialize(google_website)
20
20
  @google_website = google_website
@@ -22,7 +22,10 @@ module EmailCrawler
22
22
  end
23
23
 
24
24
  def run(q)
25
- urls = Scraper.new(@google_website, max_results: @max_results).search_result_urls_for(q)
25
+ urls = Scraper.new(@google_website,
26
+ max_results: @max_results,
27
+ blacklisted_domains: @blacklisted_domains).
28
+ search_result_urls_for(q)
26
29
  urls.each { |url| logger.info "#{url}" }
27
30
  queue = Queue.new
28
31
  urls.each { |url| queue.push(url) }
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: email_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.10
4
+ version: 0.0.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - Cristian Rasch