email_crawler 0.0.4 → 0.0.5

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 2de69de34df458d935de9deaed59993e9b66e26b
-  data.tar.gz: ac1e1063e63f708ed663183b4bb3b6331b686520
+  metadata.gz: de057cc74d307b12298c221ec86e0c6e646f3bc1
+  data.tar.gz: 1b7be472da61eec61943e249486d8ecc5cd01c56
 SHA512:
-  metadata.gz: de5977cd80e45368403850c964e4e50e56c4efef0f17d16ecb5cf942f612aff49d00542ae462f4f6b477b8d9d91201c8364e0995cbed88e696a32c1dea230304
-  data.tar.gz: 4d5b2008c4d23886f45ebeae9d46391d3b22ac6e5eab74a4b93b9c6aee43af7898978e225482818bb81b6d822b16caeb53bb686433b09af79e560eb5a65655dc
+  metadata.gz: 4077de2915db17beaa66786fc9ef978ea6bab0ea86076a644dd2f3d6bd16f12e63df12238c5cd91a62116c7327617eeccd172dd54201457c4b874da32ede00af
+  data.tar.gz: 81996e8daad656dafde6899ceb33f55680ce7dacb4e276f3cb8f96a0f11e3cacb973ab199f2d36fe02569c90dfef2f09007ce138d5688e602b0f1e8458e17fda
data/README.md CHANGED
@@ -19,25 +19,31 @@ email-crawler --help
 2. Simplest Google search
 
 ```bash
-email-crawler -q "berlin walks"
+email-crawler --query "berlin walks"
 ```
 
 3. Select which Google website to use (defaults to google.com.br)
 
 ```bash
-email-crawler -q "berlin walks" -g google.de
+email-crawler --query "berlin walks" --google-website google.de
 ```
 
-4. Specify how many internal links are to be scanned for email addresses (defaults to 100)
+4. Specify how many search result URLs to collect (defaults to 100)
 
 ```bash
-email-crawler -q "berlin walks" -g google.de -m 250
+email-crawler --query "berlin walks" --max-results 250
 ```
 
-5. Redirect output to a file
+5. Specify how many internal links are to be scanned for email addresses (defaults to 100)
 
 ```bash
-email-crawler -q "berlin walks" -g google.de -m 250 > ~/Desktop/belin-walks-emails.csv
+email-crawler --query "berlin walks" --max-links 250
+```
+
+6. Redirect output to a file
+
+```bash
+email-crawler --query "berlin walks" > ~/Desktop/berlin-walks-emails.csv
 ```
 
 ## Contributing
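
Aside: the CLI is a thin wrapper, so the same pipeline can be scripted. A minimal sketch, assuming the gem is installed and that `Runner#run` returns the CSV text, as the updated `bin/email-crawler` below suggests; the query, limits, and output path are illustrative:

```ruby
require "email_crawler"

# Mirrors the CLI switches: --google-website, --query, --max-results, --max-links.
# run(q, max_results, max_links) caps the collected search-result URLs and the
# internal links crawled per result, respectively.
csv = EmailCrawler::Runner.new("google.de").run("berlin walks", 50, 25)
File.write(File.expand_path("~/Desktop/berlin-walks-emails.csv"), csv)
```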
data/bin/email-crawler CHANGED
@@ -3,11 +3,14 @@
 require 'optparse'
 require 'ostruct'
 
+require_relative "../lib/email_crawler"
+
 class OptionsParser
   def self.parse(args)
     options = OpenStruct.new
     options.google_website = "google.com.br"
-    options.max_links = 100
+    options.max_results = ::EmailCrawler::Scraper::MAX_RESULTS
+    options.max_links = ::EmailCrawler::PageLinks::MAX_LINKS
 
     opt_parser = OptionParser.new do |opts|
       opts.banner = "Usage: email-crawler [options]"
@@ -24,7 +27,13 @@ class OptionsParser
         options.google_website = google_website
       end
 
-      opts.on("-m", "--max-links 250",
+      opts.on("-r", "--max-results 250",
+              "Max # of search result URLs to collect before crawling each one for email addresses",
+              " (defaults to 100)") do |max_results|
+        options.max_results = max_results.to_i
+      end
+
+      opts.on("-l", "--max-links 250",
               "Max # of internal links to visit searching for emails",
               " (per search result, defaults to 100)") do |max_links|
         options.max_links = max_links.to_i
@@ -41,7 +50,8 @@ if options.q.empty?
   print "The -q switch is mandatory\n"
   exit(1)
 else
-  require_relative "../lib/email_crawler"
-  csv = EmailCrawler::Runner.new(options.google_website).run(options.q, options.max_links)
+  csv = EmailCrawler::Runner.new(options.google_website).run(options.q,
+                                                             options.max_results,
+                                                             options.max_links)
   $stdout << "#{csv}\n"
 end
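
Note the short-switch change: `-m` is gone, `-r` now caps search results and `-l` caps per-result links. A self-contained sketch of the parsing pattern the script uses, OpenStruct defaults plus `opts.on` blocks; the defaults are inlined here since the `EmailCrawler` constants aren't loaded:

```ruby
require "optparse"
require "ostruct"

options = OpenStruct.new(max_results: 100, max_links: 100)

OptionParser.new do |opts|
  opts.banner = "Usage: email-crawler [options]"
  # Integer tells OptionParser to coerce the argument for us.
  opts.on("-r", "--max-results N", Integer, "Max # of search result URLs") do |n|
    options.max_results = n
  end
  opts.on("-l", "--max-links N", Integer, "Max # of internal links per result") do |n|
    options.max_links = n
  end
end.parse!(%w[-r 250 -l 50])

p options.max_results # => 250
p options.max_links   # => 50
```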
data/lib/email_crawler.rb CHANGED
@@ -21,8 +21,8 @@ module EmailCrawler
       end
     end
 
-    def run(q, max_links = PageLinks::MAX_LINKS)
-      urls = Scraper.new(@google_website).top_ten_urls_for(q)
+    def run(q, max_results = Scraper::MAX_RESULTS, max_links = PageLinks::MAX_LINKS)
+      urls = Scraper.new(@google_website, max_results).search_result_urls_for(q)
       urls.each { |url, links| @logger.info "#{url}" }
 
       threads = (1..urls.length).map do |i|
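
The trailing context line hints at what follows `run`: one thread per collected URL. A sketch of that fan-out pattern with the per-site crawl stubbed out; the `[url, emails]` pair shape is an assumption for illustration, not the gem's actual return value:

```ruby
require "uri"

urls = %w[http://a.example http://b.example http://c.example]

# One worker thread per URL; results are gathered with Thread#value.
threads = urls.map do |url|
  Thread.new(url) do |u|
    # Stand-in for the real scan of a site's internal links.
    [u, ["info@#{URI(u).host}"]]
  end
end

p threads.map(&:value).to_h
# => {"http://a.example"=>["info@a.example"], ...}
```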
data/lib/email_crawler/scraper.rb CHANGED
@@ -2,30 +2,45 @@ require_relative "proxy"
 
 module EmailCrawler
   class Scraper
-    MAX_URLS = 10
+    MAX_RESULTS = 100
 
     include MechanizeHelper
 
-    def initialize(google_website)
+    def initialize(google_website, max_results = MAX_RESULTS)
       @google_website = "https://www.#{google_website}/"
+      @max_results = max_results
     end
 
-    def top_ten_urls_for(q)
+    def search_result_urls_for(q)
       search_page = agent.get(@google_website)
       search_form = search_page.form_with(action: "/search")
       search_form.field_with(name: "q").value = q
       search_results_page = agent.submit(search_form)
-      search_results_page.search("#search ol li h3.r a").
-        map { |a| a["href"].downcase }.
-        reject { |url| url =~ %r(\A/search[?]q=) }.
-        first(MAX_URLS)
+      urls = search_results_on(search_results_page)
+
+      page = 1
+      while urls.size < @max_results
+        next_page_link = search_results_page.link_with(href: /start=#{page*10}/)
+        return urls unless next_page_link
+
+        next_search_results_page = next_page_link.click
+        urls.concat(search_results_on(next_search_results_page)).uniq!
+        page += 1
+      end
+
+      urls.first(@max_results)
     end
 
     private
 
+    def search_results_on(page)
+      page.search("#search ol li h3.r a").
+        map { |a| a["href"].downcase }.
+        reject { |url| url =~ %r(\A/search[?]q=) }
+    end
+
     def agent
       @agent ||= new_agent { |agent| agent.set_proxy(Proxy.random, "8888") }
-      # @agent ||= new_agent
     end
   end
 end
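
This is the headline change of 0.0.5: instead of taking the first ten hits (`MAX_URLS = 10`), the scraper follows Google's `start=10`, `start=20`, … pagination links until it has `@max_results` URLs or runs out of next pages. The same accumulate-until-full loop, reduced to plain Ruby over stubbed batches; fetching and parsing are faked, only the control flow matches the diff:

```ruby
# Each entry stands in for search_results_on(page) on one results page.
pages = [
  %w[http://a.example http://b.example],
  %w[http://b.example http://c.example], # overlap is dropped by uniq!
  %w[http://d.example],
]

max_results = 4
urls = pages.first.dup
page = 1
while urls.size < max_results
  next_batch = pages[page] or break # no next-page link => return what we have
  urls.concat(next_batch).uniq!     # merge and dedupe, as the diff does
  page += 1
end

p urls.first(max_results)
# => ["http://a.example", "http://b.example", "http://c.example", "http://d.example"]
```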
data/lib/email_crawler/version.rb CHANGED
@@ -1,3 +1,3 @@
 module EmailCrawler
-  VERSION = "0.0.4"
+  VERSION = "0.0.5"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: email_crawler
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.5
 platform: ruby
 authors:
 - Cristian Rasch
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-03-05 00:00:00.000000000 Z
+date: 2014-03-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize