email_crawler 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +12 -6
- data/bin/email-crawler +14 -4
- data/lib/email_crawler.rb +2 -2
- data/lib/email_crawler/scraper.rb +23 -8
- data/lib/email_crawler/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
```diff
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: de057cc74d307b12298c221ec86e0c6e646f3bc1
+  data.tar.gz: 1b7be472da61eec61943e249486d8ecc5cd01c56
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4077de2915db17beaa66786fc9ef978ea6bab0ea86076a644dd2f3d6bd16f12e63df12238c5cd91a62116c7327617eeccd172dd54201457c4b874da32ede00af
+  data.tar.gz: 81996e8daad656dafde6899ceb33f55680ce7dacb4e276f3cb8f96a0f11e3cacb973ab199f2d36fe02569c90dfef2f09007ce138d5688e602b0f1e8458e17fda
```
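These are the standard RubyGems digests of the two archives packed inside the `.gem` file. As a sanity check, here is a minimal Ruby sketch of how one might verify them locally; it assumes the gem has been fetched and unpacked first (`gem fetch email_crawler -v 0.0.5 && tar -xf email_crawler-0.0.5.gem`), which should yield `metadata.gz`, `data.tar.gz`, and `checksums.yaml.gz`:

```ruby
# Hedged sketch: verify the digests above against a locally unpacked gem.
require "digest"
require "yaml"
require "zlib"

# checksums.yaml ships gzipped inside the .gem archive.
checksums = YAML.safe_load(Zlib::GzipReader.open("checksums.yaml.gz", &:read))

%w[metadata.gz data.tar.gz].each do |file|
  sha1_ok   = Digest::SHA1.file(file).hexdigest   == checksums.dig("SHA1", file)
  sha512_ok = Digest::SHA512.file(file).hexdigest == checksums.dig("SHA512", file)
  puts "#{file}: #{sha1_ok && sha512_ok ? 'OK' : 'MISMATCH'}"
end
```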
data/README.md
CHANGED
````diff
@@ -19,25 +19,31 @@ email-crawler --help
 2. Simplest Google search
 
 ```bash
-email-crawler
+email-crawler --query "berlin walks"
 ```
 
 3. Select which Google website to use (defaults to google.com.br)
 
 ```bash
-email-crawler
+email-crawler --query "berlin walks" --google-website google.de
 ```
 
-4. Specify how many
+4. Specify how many search results URLs to collect (defaults to 100)
 
 ```bash
-email-crawler
+email-crawler --query "berlin walks" --max-results 250
 ```
 
-5.
+5. Specify how many internal links are to be scanned for email addresses (defaults to 100)
 
 ```bash
-email-crawler
+email-crawler --query "berlin walks" --max-links 250
+```
+
+6. Redirect output to a file
+
+```bash
+email-crawler --query "berlin walks" > ~/Desktop/belin-walks-emails.csv
 ```
 
 ## Contributing
````
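Every example above goes through the executable, but the CLI is only a thin wrapper around the library (see the `bin/email-crawler` diff below). A rough sketch of the programmatic equivalent, with the constructor and argument order taken from the updated `Runner` call in this diff; the `require` name and output path are assumptions:

```ruby
require "email_crawler"

# Equivalent of: email-crawler --query "berlin walks" --max-results 250 --max-links 250
runner = EmailCrawler::Runner.new("google.com.br") # google_website, the CLI default
csv    = runner.run("berlin walks", 250, 250)      # q, max_results, max_links

File.write("berlin-walks-emails.csv", csv)         # the CLI prints the CSV to stdout instead
```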
data/bin/email-crawler
CHANGED
```diff
@@ -3,11 +3,14 @@
 require 'optparse'
 require 'ostruct'
 
+require_relative "../lib/email_crawler"
+
 class OptionsParser
   def self.parse(args)
     options = OpenStruct.new
     options.google_website = "google.com.br"
-    options.
+    options.max_results = ::EmailCrawler::Scraper::MAX_RESULTS
+    options.max_links = ::EmailCrawler::PageLinks::MAX_LINKS
 
     opt_parser = OptionParser.new do |opts|
       opts.banner = "Usage: email-crawler [options]"
@@ -24,7 +27,13 @@ class OptionsParser
         options.google_website = google_website
       end
 
-      opts.on("-
+      opts.on("-r", "--max-results 250",
+              "Max # of search result URLs to collect before crawling each one for email addresses",
+              " (defaults to 100)") do |max_results|
+        options.max_results = max_results.to_i
+      end
+
+      opts.on("-l", "--max-links 250",
               "Max # of internal links to visit searching for emails",
               " (per search result, defaults to 100)") do |max_links|
         options.max_links = max_links.to_i
@@ -41,7 +50,8 @@ if options.q.empty?
   print "The -q switch is mandatory\n"
   exit(1)
 else
-
-
+  csv = EmailCrawler::Runner.new(options.google_website).run(options.q,
+                                                             options.max_results,
+                                                             options.max_links)
   $stdout << "#{csv}\n"
 end
```
data/lib/email_crawler.rb
CHANGED
```diff
@@ -21,8 +21,8 @@ module EmailCrawler
       end
     end
 
-    def run(q, max_links = PageLinks::MAX_LINKS)
-      urls = Scraper.new(@google_website).
+    def run(q, max_results = Scraper::MAX_RESULTS, max_links = PageLinks::MAX_LINKS)
+      urls = Scraper.new(@google_website, max_results).search_result_urls_for(q)
       urls.each { |url, links| @logger.info "#{url}" }
 
       threads = (1..urls.length).map do |i|
```
@@ -2,30 +2,45 @@ require_relative "proxy"
|
|
2
2
|
|
3
3
|
module EmailCrawler
|
4
4
|
class Scraper
|
5
|
-
|
5
|
+
MAX_RESULTS = 100
|
6
6
|
|
7
7
|
include MechanizeHelper
|
8
8
|
|
9
|
-
def initialize(google_website)
|
9
|
+
def initialize(google_website, max_results = MAX_RESULTS)
|
10
10
|
@google_website = "https://www.#{google_website}/"
|
11
|
+
@max_results = max_results
|
11
12
|
end
|
12
13
|
|
13
|
-
def
|
14
|
+
def search_result_urls_for(q)
|
14
15
|
search_page = agent.get(@google_website)
|
15
16
|
search_form = search_page.form_with(action: "/search")
|
16
17
|
search_form.field_with(name: "q").value = q
|
17
18
|
search_results_page = agent.submit(search_form)
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
19
|
+
urls = search_results_on(search_results_page)
|
20
|
+
|
21
|
+
page = 1
|
22
|
+
while urls.size < @max_results
|
23
|
+
next_page_link = search_results_page.link_with(href: /start=#{page*10}/)
|
24
|
+
return urls unless next_page_link
|
25
|
+
|
26
|
+
next_search_results_page = next_page_link.click
|
27
|
+
urls.concat(search_results_on(next_search_results_page)).uniq!
|
28
|
+
page += 1
|
29
|
+
end
|
30
|
+
|
31
|
+
urls.first(@max_results)
|
22
32
|
end
|
23
33
|
|
24
34
|
private
|
25
35
|
|
36
|
+
def search_results_on(page)
|
37
|
+
page.search("#search ol li h3.r a").
|
38
|
+
map { |a| a["href"].downcase }.
|
39
|
+
reject { |url| url =~ %r(\A/search[?]q=) }
|
40
|
+
end
|
41
|
+
|
26
42
|
def agent
|
27
43
|
@agent ||= new_agent { |agent| agent.set_proxy(Proxy.random, "8888") }
|
28
|
-
# @agent ||= new_agent
|
29
44
|
end
|
30
45
|
end
|
31
46
|
end
|
metadata
CHANGED
```diff
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: email_crawler
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.5
 platform: ruby
 authors:
 - Cristian Rasch
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-03-
+date: 2014-03-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
```
|