socializer-scraper 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/socializer-scraper +6 -11
- data/lib/socializer/scraper/extractor.rb +14 -1
- data/lib/socializer/scraper/version.rb +1 -1
- data/lib/socializer/scraper.rb +54 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b7ab615984c9d9cc1c63b1f3525677f3cea346ab
+  data.tar.gz: 301234730dc644a79f14cde68630c581d7f11212
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2b9b2249243883434e2958fc8e2e42275359893955cea4badccfc2d56ed9f23023d492d02193968a59968b3fb7280d89f7bdf6d692cf6c551c1ce124943bd663
+  data.tar.gz: 64236e958b29d358a15103343c19aab8859e7cc80220c1fd69feaed5c7fd9589047d9075a32d17c3a94b90ba2ef8ad2ca8d578e30bd16cd947e776b732ca297b
data/bin/socializer-scraper
CHANGED
@@ -10,6 +10,7 @@ STDOUT.sync = true
 class Socializer::Scraper::CLI < Thor
 
   desc "emails [URLs]", "scrape emails for a given URL and all subsequently found URLs"
+  method_options pattern: :string, desc: "Comma separate list of patterns that selects which links to follow"
   def emails(*urls)
     extractor = Socializer::Scraper::Extractor.new collectors: [:email]
     urls.each do |website|
@@ -19,13 +20,15 @@ class Socializer::Scraper::CLI < Thor
       puts "Scraping website: #{website}"
       puts "=" * 100
 
-      website = URI.parse("http://#{website}")
+      website = URI.parse(website.start_with?("http") ? website : "http://#{website}")
       file = File.join(Dir.pwd, "#{website.host}.yml")
       counter, list = 0, (File.exists?(file) ? YAML.load_file(file) : [])
 
+      patterns = options.has_key?("pattern") ? options["pattern"].split(",").map{|a| Regexp.new a} : []
+
       extractor.url = website.to_s
-      extractor.run do |page, collector, found|
-        found = found.map{ |email| email.strip }.accumulate - list
+      extractor.run(*patterns) do |page, collector, found|
+        found = found.map{ |email| email.strip.downcase }.accumulate - list
         list |= found
 
         found = found.count
@@ -51,11 +54,3 @@ class Socializer::Scraper::CLI < Thor
 end
 
 Socializer::Scraper::CLI.start ARGV
-
-websites = %w[
-  www.thegearpage.net
-  www.hugeracksin.com
-  www.rig-talk.com
-  www.guitariste.com
-  www.tonequest.com
-]
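The new pattern option narrows which links the crawler follows. A minimal sketch of how the flag's value is turned into regular expressions, mirroring the logic above (the value "forum,thread" is illustrative):

    # Thor exposes the flag as options["pattern"], e.g. "forum,thread"
    patterns = "forum,thread".split(",").map { |a| Regexp.new(a) }
    patterns                                        # => [/forum/, /thread/]
    "http://example.com/forum/42" =~ patterns.first # => 19 (truthy, so the link is followed)

From the shell this would look like socializer-scraper emails example.com --pattern forum,thread; without the flag, patterns stays empty and every link is followed.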
data/lib/socializer/scraper/extractor.rb
CHANGED
@@ -79,13 +79,26 @@ module Socializer
         patterns.push(/.*/) if patterns.empty?
 
         Anemone.crawl(@url, options) do |anemone|
+          anemone.threads = 2
+          anemone.verbose = true
+          anemone.obey_robots_txt = true
+          anemone.accept_cookies = true
+          anemone.user_agent = "Googlebot"
           anemone.storage = Anemone::Storage.MongoDB
-          anemone.
+          anemone.focus_crawl{|page| links_matching(page.links, patterns) }
+          anemone.on_every_page do |page|
             @page, @html, @current_url = page, nil, page.url
             yield(page)
           end
         end
       end
+
+      def links_matching links, patterns = []
+        return links if patterns.empty?
+        links.select do |link|
+          patterns.detect{|p| link.to_s =~ p}
+        end
+      end
     end
   end
 end
data/lib/socializer/scraper/version.rb
CHANGED
-    VERSION = "0.1.0"
+    VERSION = "0.1.1"
data/lib/socializer/scraper.rb
CHANGED
@@ -12,3 +12,57 @@ module Socializer
     # Your code goes here...
   end
 end
+
+module Anemone
+  class Core
+    #
+    # Perform the crawl
+    #
+    def run
+      process_options
+
+      @urls.delete_if { |url| !visit_link?(url) }
+      return if @urls.empty?
+
+      link_queue = Queue.new
+      page_queue = Queue.new
+
+      @opts[:threads].times do
+        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
+      end
+
+      @urls.each{ |url| link_queue.enq(url) }
+
+      loop do
+        page = page_queue.deq
+        @pages.touch_key page.url
+        print "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
+        do_page_blocks page
+        page.discard_doc! if @opts[:discard_page_bodies]
+
+        links = links_to_follow page
+        links.each do |link|
+          link_queue << [link, page.url.dup, page.depth + 1]
+        end
+        @pages.touch_keys links
+
+        @pages[page.url] = page
+
+        # if we are done with the crawl, tell the threads to end
+        if link_queue.empty? and page_queue.empty?
+          until link_queue.num_waiting == @tentacles.size
+            Thread.pass
+          end
+          if page_queue.empty?
+            @tentacles.size.times { link_queue << :END }
+            break
+          end
+        end
+      end
+
+      @tentacles.each { |thread| thread.join }
+      do_after_crawl_blocks
+      self
+    end
+  end
+end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: socializer-scraper
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - Nikhil Gupta
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-04-
+date: 2014-04-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler