email_crawler 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +12 -6
- data/bin/email-crawler +14 -4
- data/lib/email_crawler.rb +2 -2
- data/lib/email_crawler/scraper.rb +23 -8
- data/lib/email_crawler/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: de057cc74d307b12298c221ec86e0c6e646f3bc1
|
4
|
+
data.tar.gz: 1b7be472da61eec61943e249486d8ecc5cd01c56
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4077de2915db17beaa66786fc9ef978ea6bab0ea86076a644dd2f3d6bd16f12e63df12238c5cd91a62116c7327617eeccd172dd54201457c4b874da32ede00af
|
7
|
+
data.tar.gz: 81996e8daad656dafde6899ceb33f55680ce7dacb4e276f3cb8f96a0f11e3cacb973ab199f2d36fe02569c90dfef2f09007ce138d5688e602b0f1e8458e17fda
|
data/README.md
CHANGED
@@ -19,25 +19,31 @@ email-crawler --help
|
|
19
19
|
2. Simplest Google search
|
20
20
|
|
21
21
|
```bash
|
22
|
-
email-crawler
|
22
|
+
email-crawler --query "berlin walks"
|
23
23
|
```
|
24
24
|
|
25
25
|
3. Select which Google website to use (defaults to google.com.br)
|
26
26
|
|
27
27
|
```bash
|
28
|
-
email-crawler
|
28
|
+
email-crawler --query "berlin walks" --google-website google.de
|
29
29
|
```
|
30
30
|
|
31
|
-
4. Specify how many
|
31
|
+
4. Specify how many search results URLs to collect (defaults to 100)
|
32
32
|
|
33
33
|
```bash
|
34
|
-
email-crawler
|
34
|
+
email-crawler --query "berlin walks" --max-results 250
|
35
35
|
```
|
36
36
|
|
37
|
-
5.
|
37
|
+
5. Specify how many internal links are to be scanned for email addresses (defaults to 100)
|
38
38
|
|
39
39
|
```bash
|
40
|
-
email-crawler
|
40
|
+
email-crawler --query "berlin walks" --max-links 250
|
41
|
+
```
|
42
|
+
|
43
|
+
6. Redirect output to a file
|
44
|
+
|
45
|
+
```bash
|
46
|
+
email-crawler --query "berlin walks" > ~/Desktop/berlin-walks-emails.csv
|
41
47
|
```
|
42
48
|
|
43
49
|
## Contributing
|
data/bin/email-crawler
CHANGED
@@ -3,11 +3,14 @@
|
|
3
3
|
require 'optparse'
|
4
4
|
require 'ostruct'
|
5
5
|
|
6
|
+
require_relative "../lib/email_crawler"
|
7
|
+
|
6
8
|
class OptionsParser
|
7
9
|
def self.parse(args)
|
8
10
|
options = OpenStruct.new
|
9
11
|
options.google_website = "google.com.br"
|
10
|
-
options.
|
12
|
+
options.max_results = ::EmailCrawler::Scraper::MAX_RESULTS
|
13
|
+
options.max_links = ::EmailCrawler::PageLinks::MAX_LINKS
|
11
14
|
|
12
15
|
opt_parser = OptionParser.new do |opts|
|
13
16
|
opts.banner = "Usage: email-crawler [options]"
|
@@ -24,7 +27,13 @@ class OptionsParser
|
|
24
27
|
options.google_website = google_website
|
25
28
|
end
|
26
29
|
|
27
|
-
opts.on("-
|
30
|
+
opts.on("-r", "--max-results 250",
|
31
|
+
"Max # of search result URLs to collect before crawling each one for email addresses",
|
32
|
+
" (defaults to 100)") do |max_results|
|
33
|
+
options.max_results = max_results.to_i
|
34
|
+
end
|
35
|
+
|
36
|
+
opts.on("-l", "--max-links 250",
|
28
37
|
"Max # of internal links to visit searching for emails",
|
29
38
|
" (per search result, defaults to 100)") do |max_links|
|
30
39
|
options.max_links = max_links.to_i
|
@@ -41,7 +50,8 @@ if options.q.empty?
|
|
41
50
|
print "The -q switch is mandatory\n"
|
42
51
|
exit(1)
|
43
52
|
else
|
44
|
-
|
45
|
-
|
53
|
+
csv = EmailCrawler::Runner.new(options.google_website).run(options.q,
|
54
|
+
options.max_results,
|
55
|
+
options.max_links)
|
46
56
|
$stdout << "#{csv}\n"
|
47
57
|
end
|
data/lib/email_crawler.rb
CHANGED
@@ -21,8 +21,8 @@ module EmailCrawler
|
|
21
21
|
end
|
22
22
|
end
|
23
23
|
|
24
|
-
def run(q, max_links = PageLinks::MAX_LINKS)
|
25
|
-
urls = Scraper.new(@google_website).
|
24
|
+
def run(q, max_results = Scraper::MAX_RESULTS, max_links = PageLinks::MAX_LINKS)
|
25
|
+
urls = Scraper.new(@google_website, max_results).search_result_urls_for(q)
|
26
26
|
urls.each { |url, links| @logger.info "#{url}" }
|
27
27
|
|
28
28
|
threads = (1..urls.length).map do |i|
|
@@ -2,30 +2,45 @@ require_relative "proxy"
|
|
2
2
|
|
3
3
|
module EmailCrawler
|
4
4
|
class Scraper
|
5
|
-
|
5
|
+
MAX_RESULTS = 100
|
6
6
|
|
7
7
|
include MechanizeHelper
|
8
8
|
|
9
|
-
def initialize(google_website)
|
9
|
+
def initialize(google_website, max_results = MAX_RESULTS)
|
10
10
|
@google_website = "https://www.#{google_website}/"
|
11
|
+
@max_results = max_results
|
11
12
|
end
|
12
13
|
|
13
|
-
def
|
14
|
+
def search_result_urls_for(q)
|
14
15
|
search_page = agent.get(@google_website)
|
15
16
|
search_form = search_page.form_with(action: "/search")
|
16
17
|
search_form.field_with(name: "q").value = q
|
17
18
|
search_results_page = agent.submit(search_form)
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
19
|
+
urls = search_results_on(search_results_page)
|
20
|
+
|
21
|
+
page = 1
|
22
|
+
while urls.size < @max_results
|
23
|
+
next_page_link = search_results_page.link_with(href: /start=#{page*10}/)
|
24
|
+
return urls unless next_page_link
|
25
|
+
|
26
|
+
next_search_results_page = next_page_link.click
|
27
|
+
urls.concat(search_results_on(next_search_results_page)).uniq!
|
28
|
+
page += 1
|
29
|
+
end
|
30
|
+
|
31
|
+
urls.first(@max_results)
|
22
32
|
end
|
23
33
|
|
24
34
|
private
|
25
35
|
|
36
|
+
def search_results_on(page)
|
37
|
+
page.search("#search ol li h3.r a").
|
38
|
+
map { |a| a["href"].downcase }.
|
39
|
+
reject { |url| url =~ %r(\A/search[?]q=) }
|
40
|
+
end
|
41
|
+
|
26
42
|
def agent
|
27
43
|
@agent ||= new_agent { |agent| agent.set_proxy(Proxy.random, "8888") }
|
28
|
-
# @agent ||= new_agent
|
29
44
|
end
|
30
45
|
end
|
31
46
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: email_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Cristian Rasch
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-03-
|
11
|
+
date: 2014-03-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|