socializer-scraper 0.1.0 → 0.1.1
- checksums.yaml +4 -4
- data/bin/socializer-scraper +6 -11
- data/lib/socializer/scraper/extractor.rb +14 -1
- data/lib/socializer/scraper/version.rb +1 -1
- data/lib/socializer/scraper.rb +54 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b7ab615984c9d9cc1c63b1f3525677f3cea346ab
+  data.tar.gz: 301234730dc644a79f14cde68630c581d7f11212
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2b9b2249243883434e2958fc8e2e42275359893955cea4badccfc2d56ed9f23023d492d02193968a59968b3fb7280d89f7bdf6d692cf6c551c1ce124943bd663
+  data.tar.gz: 64236e958b29d358a15103343c19aab8859e7cc80220c1fd69feaed5c7fd9589047d9075a32d17c3a94b90ba2ef8ad2ca8d578e30bd16cd947e776b732ca297b
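These digests can be re-checked locally with Ruby's standard Digest module. A minimal sketch, assuming the artifact sits in the current directory under the name data.tar.gz (that local path is an assumption, not part of the release):

    require "digest"

    # Compare a local artifact against the SHA512 recorded in checksums.yaml above.
    # The filename "data.tar.gz" is an assumed local path.
    expected = "64236e958b29d358a15103343c19aab8859e7cc80220c1fd69feaed5c7fd9589047d9075a32d17c3a94b90ba2ef8ad2ca8d578e30bd16cd947e776b732ca297b"
    actual   = Digest::SHA512.file("data.tar.gz").hexdigest
    puts(actual == expected ? "checksum OK" : "checksum mismatch")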
data/bin/socializer-scraper
CHANGED
@@ -10,6 +10,7 @@ STDOUT.sync = true
 class Socializer::Scraper::CLI < Thor
 
   desc "emails [URLs]", "scrape emails for a given URL and all subsequently found URLs"
+  method_options pattern: :string, desc: "Comma separate list of patterns that selects which links to follow"
   def emails(*urls)
     extractor = Socializer::Scraper::Extractor.new collectors: [:email]
     urls.each do |website|
@@ -19,13 +20,15 @@ class Socializer::Scraper::CLI < Thor
       puts "Scraping website: #{website}"
       puts "=" * 100
 
-      website = URI.parse("http://#{website}")
+      website = URI.parse(website.start_with?("http") ? website : "http://#{website}")
       file = File.join(Dir.pwd, "#{website.host}.yml")
       counter, list = 0, (File.exists?(file) ? YAML.load_file(file) : [])
 
+      patterns = options.has_key?("pattern") ? options["pattern"].split(",").map{|a| Regexp.new a} : []
+
       extractor.url = website.to_s
-      extractor.run do |page, collector, found|
-        found = found.map{ |email| email.strip }.accumulate - list
+      extractor.run(*patterns) do |page, collector, found|
+        found = found.map{ |email| email.strip.downcase }.accumulate - list
         list |= found
 
         found = found.count
@@ -51,11 +54,3 @@ class Socializer::Scraper::CLI < Thor
 end
 
 Socializer::Scraper::CLI.start ARGV
-
-websites = %w[
-  www.thegearpage.net
-  www.hugeracksin.com
-  www.rig-talk.com
-  www.guitariste.com
-  www.tonequest.com
-]
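The new method_options line feeds a --pattern flag into the emails command, where it is split on commas and compiled into Regexp objects before being passed to Extractor#run. A standalone sketch of that round trip, with a hypothetical option string and link list in place of the Thor plumbing:

    # Hypothetical values; the real code runs inside the Thor command above.
    pattern_option = "forum/,thread-\\d+"    # as it would arrive via --pattern
    patterns = pattern_option.split(",").map { |p| Regexp.new(p) }

    links = %w[http://example.com/forum/12 http://example.com/about]
    puts links.select { |l| patterns.any? { |rx| l =~ rx } }
    # => http://example.com/forum/12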
data/lib/socializer/scraper/extractor.rb
CHANGED
@@ -79,13 +79,26 @@ module Socializer
        patterns.push(/.*/) if patterns.empty?
 
        Anemone.crawl(@url, options) do |anemone|
+         anemone.threads = 2
+         anemone.verbose = true
+         anemone.obey_robots_txt = true
+         anemone.accept_cookies = true
+         anemone.user_agent = "Googlebot"
          anemone.storage = Anemone::Storage.MongoDB
-         anemone.
+         anemone.focus_crawl{|page| links_matching(page.links, patterns) }
+         anemone.on_every_page do |page|
            @page, @html, @current_url = page, nil, page.url
            yield(page)
          end
        end
      end
+
+     def links_matching links, patterns = []
+       return links if patterns.empty?
+       links.select do |link|
+         patterns.detect{|p| link.to_s =~ p}
+       end
+     end
    end
  end
 end
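The added settings configure a two-thread crawl that obeys robots.txt, accepts cookies, and identifies as Googlebot, while focus_crawl hands each page's links through the new links_matching helper so only pattern-matched links are enqueued. A self-contained sketch of the same hook against a hypothetical site (storage is left at Anemone's in-memory default so the example needs no MongoDB):

    require "anemone"

    # Hedged sketch of the focus_crawl filtering wired up in the hunk above;
    # the seed URL and patterns are placeholders.
    patterns = [/members/, /profile/]

    Anemone.crawl("http://example.com", threads: 2, verbose: true) do |anemone|
      anemone.focus_crawl { |page| page.links.select { |l| patterns.any? { |rx| l.to_s =~ rx } } }
      anemone.on_every_page { |page| puts page.url }
    end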
data/lib/socializer/scraper.rb
CHANGED
@@ -12,3 +12,57 @@ module Socializer
     # Your code goes here...
   end
 end
+
+module Anemone
+  class Core
+    #
+    # Perform the crawl
+    #
+    def run
+      process_options
+
+      @urls.delete_if { |url| !visit_link?(url) }
+      return if @urls.empty?
+
+      link_queue = Queue.new
+      page_queue = Queue.new
+
+      @opts[:threads].times do
+        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
+      end
+
+      @urls.each{ |url| link_queue.enq(url) }
+
+      loop do
+        page = page_queue.deq
+        @pages.touch_key page.url
+        print "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
+        do_page_blocks page
+        page.discard_doc! if @opts[:discard_page_bodies]
+
+        links = links_to_follow page
+        links.each do |link|
+          link_queue << [link, page.url.dup, page.depth + 1]
+        end
+        @pages.touch_keys links
+
+        @pages[page.url] = page
+
+        # if we are done with the crawl, tell the threads to end
+        if link_queue.empty? and page_queue.empty?
+          until link_queue.num_waiting == @tentacles.size
+            Thread.pass
+          end
+          if page_queue.empty?
+            @tentacles.size.times { link_queue << :END }
+            break
+          end
+        end
+      end
+
+      @tentacles.each { |thread| thread.join }
+      do_after_crawl_blocks
+      self
+    end
+  end
+end
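This monkey-patch replaces Anemone's Core#run with a near-copy of its crawl loop, presumably to tweak the progress reporting. Structurally it is a producer/consumer handshake over two queues: workers drain link_queue and the main loop drains page_queue until both are idle. A stripped-down sketch of just that shutdown protocol, with a trivial worker standing in for Anemone's Tentacle:

    # Minimal producer/consumer handshake mirroring the loop above.
    link_queue, page_queue = Queue.new, Queue.new

    workers = Array.new(2) do
      Thread.new do
        while (url = link_queue.deq) != :END
          page_queue << "fetched #{url}"   # stand-in for a real page fetch
        end
      end
    end

    link_queue << "http://example.com"
    loop do
      puts page_queue.deq
      if link_queue.empty? && page_queue.empty?
        # wait until every worker is blocked on link_queue, then signal shutdown
        Thread.pass until link_queue.num_waiting == workers.size
        if page_queue.empty?
          workers.size.times { link_queue << :END }
          break
        end
      end
    end
    workers.each(&:join)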
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: socializer-scraper
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - Nikhil Gupta
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-04-
+date: 2014-04-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
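The version and date fields are the release bump itself; they trace back to Socializer::Scraper::VERSION, which the gemspec reads at build time. The matching +1/-1 change to version.rb is listed in the file summary but not shown in this diff; under the conventional bundler-generated layout it would plausibly be:

    # Plausible reconstruction of data/lib/socializer/scraper/version.rb
    # (assumption: conventional layout; the actual diff is not shown above).
    module Socializer
      module Scraper
        VERSION = "0.1.1"   # was "0.1.0"
      end
    end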
|