email_crawler 0.0.4 → 0.0.5

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 2de69de34df458d935de9deaed59993e9b66e26b
-  data.tar.gz: ac1e1063e63f708ed663183b4bb3b6331b686520
+  metadata.gz: de057cc74d307b12298c221ec86e0c6e646f3bc1
+  data.tar.gz: 1b7be472da61eec61943e249486d8ecc5cd01c56
 SHA512:
-  metadata.gz: de5977cd80e45368403850c964e4e50e56c4efef0f17d16ecb5cf942f612aff49d00542ae462f4f6b477b8d9d91201c8364e0995cbed88e696a32c1dea230304
-  data.tar.gz: 4d5b2008c4d23886f45ebeae9d46391d3b22ac6e5eab74a4b93b9c6aee43af7898978e225482818bb81b6d822b16caeb53bb686433b09af79e560eb5a65655dc
+  metadata.gz: 4077de2915db17beaa66786fc9ef978ea6bab0ea86076a644dd2f3d6bd16f12e63df12238c5cd91a62116c7327617eeccd172dd54201457c4b874da32ede00af
+  data.tar.gz: 81996e8daad656dafde6899ceb33f55680ce7dacb4e276f3cb8f96a0f11e3cacb973ab199f2d36fe02569c90dfef2f09007ce138d5688e602b0f1e8458e17fda
data/README.md CHANGED
@@ -19,25 +19,31 @@ email-crawler --help
 2. Simplest Google search
 
 ```bash
-email-crawler -q "berlin walks"
+email-crawler --query "berlin walks"
 ```
 
 3. Select which Google website to use (defaults to google.com.br)
 
 ```bash
-email-crawler -q "berlin walks" -g google.de
+email-crawler --query "berlin walks" --google-website google.de
 ```
 
-4. Specify how many internal links are to be scanned for email addresses (defaults to 100)
+4. Specify how many search results URLs to collect (defaults to 100)
 
 ```bash
-email-crawler -q "berlin walks" -g google.de -m 250
+email-crawler --query "berlin walks" --max-results 250
 ```
 
-5. Redirect output to a file
+5. Specify how many internal links are to be scanned for email addresses (defaults to 100)
 
 ```bash
-email-crawler -q "berlin walks" -g google.de -m 250 > ~/Desktop/belin-walks-emails.csv
+email-crawler --query "berlin walks" --max-links 250
+```
+
+6. Redirect output to a file
+
+```bash
+email-crawler --query "berlin walks" > ~/Desktop/belin-walks-emails.csv
 ```
 
 ## Contributing
data/bin/email-crawler CHANGED
@@ -3,11 +3,14 @@
 require 'optparse'
 require 'ostruct'
 
+require_relative "../lib/email_crawler"
+
 class OptionsParser
   def self.parse(args)
     options = OpenStruct.new
     options.google_website = "google.com.br"
-    options.max_links = 100
+    options.max_results = ::EmailCrawler::Scraper::MAX_RESULTS
+    options.max_links = ::EmailCrawler::PageLinks::MAX_LINKS
 
     opt_parser = OptionParser.new do |opts|
       opts.banner = "Usage: email-crawler [options]"
@@ -24,7 +27,13 @@ class OptionsParser
       options.google_website = google_website
     end
 
-    opts.on("-m", "--max-links 250",
+    opts.on("-r", "--max-results 250",
+            "Max # of search result URLs to collect before crawling each one for email addresses",
+            " (defaults to 100)") do |max_results|
+      options.max_results = max_results.to_i
+    end
+
+    opts.on("-l", "--max-links 250",
             "Max # of internal links to visit searching for emails",
             " (per search result, defaults to 100)") do |max_links|
       options.max_links = max_links.to_i
@@ -41,7 +50,8 @@ if options.q.empty?
   print "The -q switch is mandatory\n"
   exit(1)
 else
-  require_relative "../lib/email_crawler"
-  csv = EmailCrawler::Runner.new(options.google_website).run(options.q, options.max_links)
+  csv = EmailCrawler::Runner.new(options.google_website).run(options.q,
+                                                             options.max_results,
+                                                             options.max_links)
   $stdout << "#{csv}\n"
 end
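
With 0.0.5 the executable forwards both limits to the runner. For reference, a minimal sketch of the equivalent programmatic call, assuming the gem is installed so that `require "email_crawler"` loads the library (constant and method names are taken from the diff above):

```ruby
require "email_crawler"

# Same call the updated bin/email-crawler makes after parsing options:
# query, max search-result URLs to collect, max internal links per result.
csv = EmailCrawler::Runner.new("google.com.br").run(
  "berlin walks",
  EmailCrawler::Scraper::MAX_RESULTS,  # defaults to 100
  EmailCrawler::PageLinks::MAX_LINKS   # per-result internal-link cap
)
$stdout << "#{csv}\n"
```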
data/lib/email_crawler.rb CHANGED
@@ -21,8 +21,8 @@ module EmailCrawler
     end
   end
 
-  def run(q, max_links = PageLinks::MAX_LINKS)
-    urls = Scraper.new(@google_website).top_ten_urls_for(q)
+  def run(q, max_results = Scraper::MAX_RESULTS, max_links = PageLinks::MAX_LINKS)
+    urls = Scraper.new(@google_website, max_results).search_result_urls_for(q)
     urls.each { |url, links| @logger.info "#{url}" }
 
     threads = (1..urls.length).map do |i|
data/lib/email_crawler/scraper.rb CHANGED
@@ -2,30 +2,45 @@ require_relative "proxy"
 
 module EmailCrawler
   class Scraper
-    MAX_URLS = 10
+    MAX_RESULTS = 100
 
     include MechanizeHelper
 
-    def initialize(google_website)
+    def initialize(google_website, max_results = MAX_RESULTS)
       @google_website = "https://www.#{google_website}/"
+      @max_results = max_results
     end
 
-    def top_ten_urls_for(q)
+    def search_result_urls_for(q)
       search_page = agent.get(@google_website)
       search_form = search_page.form_with(action: "/search")
       search_form.field_with(name: "q").value = q
       search_results_page = agent.submit(search_form)
-      search_results_page.search("#search ol li h3.r a").
-        map { |a| a["href"].downcase }.
-        reject { |url| url =~ %r(\A/search[?]q=) }.
-        first(MAX_URLS)
+      urls = search_results_on(search_results_page)
+
+      page = 1
+      while urls.size < @max_results
+        next_page_link = search_results_page.link_with(href: /start=#{page*10}/)
+        return urls unless next_page_link
+
+        next_search_results_page = next_page_link.click
+        urls.concat(search_results_on(next_search_results_page)).uniq!
+        page += 1
+      end
+
+      urls.first(@max_results)
     end
 
     private
 
+    def search_results_on(page)
+      page.search("#search ol li h3.r a").
+        map { |a| a["href"].downcase }.
+        reject { |url| url =~ %r(\A/search[?]q=) }
+    end
+
     def agent
       @agent ||= new_agent { |agent| agent.set_proxy(Proxy.random, "8888") }
-      # @agent ||= new_agent
     end
   end
 end
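
The scraper now keeps following Google's next-page links until it has collected `@max_results` URLs (or no further page link exists), instead of stopping at the first ten. A minimal sketch of driving it directly, assuming the gem's proxy setup (`Proxy.random` on port 8888, per the `agent` method above) works in your environment:

```ruby
require "email_crawler"

# Collect up to 30 search-result URLs instead of the default MAX_RESULTS (100).
scraper = EmailCrawler::Scraper.new("google.de", 30)
urls = scraper.search_result_urls_for("berlin walks")
urls.each { |url| puts url }
```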
data/lib/email_crawler/version.rb CHANGED
@@ -1,3 +1,3 @@
 module EmailCrawler
-  VERSION = "0.0.4"
+  VERSION = "0.0.5"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: email_crawler
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.5
 platform: ruby
 authors:
 - Cristian Rasch
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-03-05 00:00:00.000000000 Z
+date: 2014-03-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize