spider 0.4.1 → 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGES +5 -0
- data/README +28 -2
- data/lib/spider.rb +1 -1
- data/lib/spider/included_in_memcached.rb +2 -1
- data/lib/spider/next_urls_in_sqs.rb +66 -0
- data/lib/spider/spider_instance.rb +38 -7
- data/lib/test.rb +27 -0
- data/spider.gemspec +4 -3
- metadata +61 -52
data/CHANGES
CHANGED
data/README
CHANGED
@@ -75,6 +75,31 @@ scraping, collecting, and looping so that you can just handle the data.
|
|
75
75
|
s.check_already_seen_with ExpireLinks.new
|
76
76
|
end
|
77
77
|
|
78
|
+
=== Store nodes to visit with Amazon SQS
|
79
|
+
|
80
|
+
require 'spider'
|
81
|
+
require 'spider/next_urls_in_sqs'
|
82
|
+
Spider.start_at('http://mike-burns.com') do |s|
|
83
|
+
s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)
|
84
|
+
end
|
85
|
+
|
86
|
+
=== Store nodes to visit with a custom object
|
87
|
+
|
88
|
+
require 'spider'
|
89
|
+
class MyArray < Array
|
90
|
+
def pop
|
91
|
+
super
|
92
|
+
end
|
93
|
+
|
94
|
+
def push(a_msg)
|
95
|
+
super(a_msg)
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
Spider.start_at('http://mike-burns.com') do |s|
|
100
|
+
s.store_next_urls_with MyArray.new
|
101
|
+
end
|
102
|
+
|
78
103
|
=== Create a URL graph
|
79
104
|
|
80
105
|
require 'spider'
|
@@ -106,9 +131,10 @@ scraping, collecting, and looping so that you can just handle the data.
|
|
106
131
|
|
107
132
|
== Author
|
108
133
|
|
109
|
-
|
134
|
+
John Nagro john.nagro@gmail.com
|
135
|
+
Mike Burns http://mike-burns.com mike@mike-burns.com (original author)
|
110
136
|
|
111
|
-
Help from Matt Horan,
|
137
|
+
Help from Matt Horan and Henri Cook.
|
112
138
|
|
113
139
|
With `robot_rules' from James Edward Gray II via
|
114
140
|
http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589
|
data/lib/spider.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
# Use memcached to track cycles.
|
2
|
-
|
2
|
+
#
|
3
|
+
# Copyright 2007 Mike Burns
|
3
4
|
# Redistribution and use in source and binary forms, with or without
|
4
5
|
# modification, are permitted provided that the following conditions are met:
|
5
6
|
# * Redistributions of source code must retain the above copyright
|
@@ -0,0 +1,66 @@
|
|
1
|
+
# Use AmazonSQS to track nodes to visit.
|
2
|
+
#
|
3
|
+
# Copyright 2008 John Nagro
|
4
|
+
# Redistribution and use in source and binary forms, with or without
|
5
|
+
# modification, are permitted provided that the following conditions are met:
|
6
|
+
# * Redistributions of source code must retain the above copyright
|
7
|
+
# notice, this list of conditions and the following disclaimer.
|
8
|
+
# * Redistributions in binary form must reproduce the above copyright
|
9
|
+
# notice, this list of conditions and the following disclaimer in the
|
10
|
+
# documentation and/or other materials provided with the distribution.
|
11
|
+
# * Neither the name Mike Burns nor the
|
12
|
+
# names of his contributors may be used to endorse or promote products
|
13
|
+
# derived from this software without specific prior written permission.
|
14
|
+
#
|
15
|
+
# THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
|
16
|
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
17
|
+
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
18
|
+
# DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
|
19
|
+
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
20
|
+
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
21
|
+
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
22
|
+
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
23
|
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
24
|
+
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
25
|
+
|
26
|
+
require 'rubygems'
|
27
|
+
require 'right_aws'
|
28
|
+
require 'yaml'
|
29
|
+
|
30
|
+
# A store of nodes still to be walked, backed by an AmazonSQS queue. It
# supports +push+ and +pop+ to add items to the queue and pull them back
# off, plus <tt>empty?</tt> so that a crawl loop can tell when no work is
# left.
#
# This is useful if you want multiple Spider processes crawling the same
# data set.
#
# To use it with Spider use the store_next_urls_with method:
#
#  Spider.start_at('http://example.com/') do |s|
#    s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, queue_name)
#  end
class NextUrlsInSQS
  # Seconds to wait before asking the queue again when it returned nothing.
  POLL_INTERVAL = 5

  # Construct a new NextUrlsInSQS instance.
  #
  # aws_access_key::        passed to RightAws::SqsGen2 (right_aws gem).
  # aws_secret_access_key:: passed to RightAws::SqsGen2 (right_aws gem).
  # queue_name::            name of the AmazonSQS queue to use (optional,
  #                         defaults to 'ruby-spider').
  def initialize(aws_access_key, aws_secret_access_key, queue_name = 'ruby-spider')
    @sqs = RightAws::SqsGen2.new(aws_access_key, aws_secret_access_key)
    @queue = @sqs.queue(queue_name)
  end

  # Pull an item off the queue and return it decoded from YAML. When the
  # queue returns nil this sleeps POLL_INTERVAL seconds and tries again, so
  # the call does not return until a message is available.
  #
  # NOTE(review): YAML.load deserializes whatever is on the queue; anyone who
  # can write to the queue controls that input. Consider YAML.safe_load if
  # the queue is shared with untrusted writers.
  def pop
    message = @queue.pop
    while message.nil?
      sleep POLL_INTERVAL
      message = @queue.pop
    end
    YAML.load(message.to_s)
  end

  # Put data on the queue, encoded with YAML so that pop can decode it back
  # into the original object. Returns whatever the queue's push returns.
  def push(a_msg)
    @queue.push(YAML.dump(a_msg))
  end

  # True when the queue reports a size of zero.
  #
  # NOTE(review): assumes the queue object's +size+ returns an Integer
  # message count (presumably approximate for SQS) -- confirm against the
  # right_aws documentation.
  def empty?
    @queue.size.zero?
  end
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
# Specialized spidering rules.
|
2
2
|
|
3
|
-
# Copyright 2007 Mike Burns
|
3
|
+
# Copyright 2007-2008 Mike Burns & John Nagro
|
4
4
|
# Redistribution and use in source and binary forms, with or without
|
5
5
|
# modification, are permitted provided that the following conditions are met:
|
6
6
|
# * Redistributions of source code must retain the above copyright
|
@@ -51,7 +51,7 @@ class SpiderInstance
|
|
51
51
|
@url_checks = []
|
52
52
|
@cache = :memory
|
53
53
|
@callbacks = {}
|
54
|
-
@next_urls = next_urls
|
54
|
+
@next_urls = [next_urls]
|
55
55
|
@seen = seen
|
56
56
|
@rules = rules || RobotRules.new('Ruby Spider 1.0')
|
57
57
|
@robots_seen = robots_seen
|
@@ -96,6 +96,30 @@ class SpiderInstance
|
|
96
96
|
end
|
97
97
|
end
|
98
98
|
|
99
|
+
# The Web is a really, really, really big graph; as such, this list
|
100
|
+
# of nodes to visit grows really, really, really big.
|
101
|
+
#
|
102
|
+
# Change the object used to store nodes we have yet to walk. The default
|
103
|
+
# object is an instance of Array. Available with Spider is a wrapper of
|
104
|
+
# AmazonSQS.
|
105
|
+
#
|
106
|
+
# You can implement a custom class for this; any object passed to
|
107
|
+
# store_next_urls_with must understand push and pop, plus empty? (start! checks it to decide when to stop).
|
108
|
+
#
|
109
|
+
# # default
|
110
|
+
# store_next_urls_with Array.new
|
111
|
+
#
|
112
|
+
# # AmazonSQS
|
113
|
+
# require 'spider/next_urls_in_sqs'
|
114
|
+
# store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, queue_name)
|
115
|
+
# Replace the object holding the nodes still to be walked with +a_store+,
# moving every entry from the previous holder into it with +push+.
#
# a_store:: the new holder; entries are added to it through +push+.
#
# Returns the holder that was in use before the call.
def store_next_urls_with(a_store)
  previous_store = @next_urls
  # Handing back the holder already in use must be a no-op: iterating an
  # Array while pushing onto that same Array would never terminate.
  return previous_store if a_store.equal?(previous_store)

  @next_urls = a_store
  previous_store.each { |a_url_hash| @next_urls.push(a_url_hash) }
end
|
122
|
+
|
99
123
|
# Add a response handler. A response handler's trigger can be :every,
|
100
124
|
# :success, :failure, or any HTTP status code. The handler itself can be
|
101
125
|
# either a Proc or a block.
|
@@ -159,9 +183,11 @@ class SpiderInstance
|
|
159
183
|
@headers = {}
|
160
184
|
end
|
161
185
|
|
162
|
-
def start! #:nodoc:
|
163
|
-
|
186
|
+
def start! #:nodoc:
|
187
|
+
interrupted = false
|
188
|
+
trap("SIGINT") { interrupted = true }
|
164
189
|
begin
|
190
|
+
next_urls = @next_urls.pop
|
165
191
|
tmp_n_u = {}
|
166
192
|
next_urls.each do |prior_url, urls|
|
167
193
|
urls.map do |a_url|
|
@@ -172,13 +198,18 @@ class SpiderInstance
|
|
172
198
|
@setup.call(a_url) unless @setup.nil?
|
173
199
|
get_page(parsed_url) do |response|
|
174
200
|
do_callbacks(a_url, response, prior_url)
|
175
|
-
tmp_n_u[a_url] = generate_next_urls(a_url, response)
|
201
|
+
#tmp_n_u[a_url] = generate_next_urls(a_url, response)
|
202
|
+
#@next_urls.push tmp_n_u
|
203
|
+
generate_next_urls(a_url, response).each do |a_next_url|
|
204
|
+
@next_urls.push a_url => a_next_url
|
205
|
+
end
|
206
|
+
#exit if interrupted
|
176
207
|
end
|
177
208
|
@teardown.call(a_url) unless @teardown.nil?
|
209
|
+
exit if interrupted
|
178
210
|
end
|
179
211
|
end
|
180
|
-
|
181
|
-
end while !next_urls.empty?
|
212
|
+
end while !@next_urls.empty?
|
182
213
|
end
|
183
214
|
|
184
215
|
def success_or_failure(code) #:nodoc:
|
data/lib/test.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'spider.rb'
|
2
|
+
require 'spider/next_urls_in_sqs.rb'
|
3
|
+
|
4
|
+
# Array that logs every push and pop to stdout, so the order in which Spider
# queues and takes URLs can be watched while the script runs.
class MyArray < Array
  # Remove and return the last element, printing it first.
  def pop
    a_msg = super
    puts "pop: #{a_msg.inspect}"
    a_msg
  end

  # Print the element, then append it.
  def push(a_msg)
    puts "push: #{a_msg.inspect}"
    super(a_msg)
  end
end

# AWS credentials are read from the environment instead of being committed
# with the source. Both are nil when the variables are not set.
# NOTE(review): the key pair previously hard-coded here was published with
# the gem and should be treated as compromised -- revoke it.
AWS_ACCESS_KEY = ENV['AWS_ACCESS_KEY']
AWS_SECRET_ACCESS_KEY = ENV['AWS_SECRET_ACCESS_KEY']

#Spider.start_at("http://docs.huihoo.com/ruby/ruby-man-1.4/function.html") do |s|
Spider.start_at("http://www.google.com") do |s|
  #s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)
  s.store_next_urls_with MyArray.new
  # Print every URL as it is visited.
  s.on(:every) do |a_url, resp, prior_url|
    puts a_url
  end
end
|
data/spider.gemspec
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
|
3
3
|
spec = Gem::Specification.new do |s|
|
4
|
-
s.author = '
|
5
|
-
s.email = '
|
4
|
+
s.author = 'John Nagro'
|
5
|
+
s.email = 'john.nagro@gmail.com'
|
6
6
|
s.has_rdoc = true
|
7
7
|
s.homepage = 'http://spider.rubyforge.org/'
|
8
8
|
s.name = 'spider'
|
9
|
+
s.rubyforge_project = 'spider'
|
9
10
|
s.summary = 'A Web spidering library'
|
10
11
|
s.files = Dir['**/*'].delete_if { |f| f =~ /(cvs|gem|svn)$/i }
|
11
12
|
s.require_path = 'lib'
|
@@ -13,5 +14,5 @@ spec = Gem::Specification.new do |s|
|
|
13
14
|
A Web spidering library: handles robots.txt, scraping, finding more
|
14
15
|
links, and doing it all over again.
|
15
16
|
EOF
|
16
|
-
s.version = '0.4.
|
17
|
+
s.version = '0.4.2'
|
17
18
|
end
|
metadata
CHANGED
@@ -1,78 +1,87 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.9.4
|
3
|
-
specification_version: 1
|
4
2
|
name: spider
|
5
3
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.4.
|
7
|
-
date: 2007-11-10 00:00:00 -05:00
|
8
|
-
summary: A Web spidering library
|
9
|
-
require_paths:
|
10
|
-
- lib
|
11
|
-
email: mike@mike-burns.com
|
12
|
-
homepage: http://spider.rubyforge.org/
|
13
|
-
rubyforge_project:
|
14
|
-
description: "A Web spidering library: handles robots.txt, scraping, finding more links, and doing it all over again."
|
15
|
-
autorequire:
|
16
|
-
default_executable:
|
17
|
-
bindir: bin
|
18
|
-
has_rdoc: true
|
19
|
-
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
-
requirements:
|
21
|
-
- - ">"
|
22
|
-
- !ruby/object:Gem::Version
|
23
|
-
version: 0.0.0
|
24
|
-
version:
|
4
|
+
version: 0.4.2
|
25
5
|
platform: ruby
|
26
|
-
signing_key:
|
27
|
-
cert_chain:
|
28
|
-
post_install_message:
|
29
6
|
authors:
|
30
|
-
-
|
7
|
+
- John Nagro
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-07-06 00:00:00 -04:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: "A Web spidering library: handles robots.txt, scraping, finding more links, and doing it all over again."
|
17
|
+
email: john.nagro@gmail.com
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files: []
|
23
|
+
|
31
24
|
files:
|
25
|
+
- CHANGES
|
32
26
|
- doc
|
33
|
-
- doc/
|
27
|
+
- doc/classes
|
28
|
+
- doc/classes/IncludedInMemcached.html
|
29
|
+
- doc/classes/Spider.html
|
30
|
+
- doc/classes/SpiderInstance.html
|
31
|
+
- doc/created.rid
|
34
32
|
- doc/files
|
35
33
|
- doc/files/lib
|
36
|
-
- doc/files/lib/spider_rb.html
|
37
34
|
- doc/files/lib/spider
|
38
|
-
- doc/files/lib/spider/spider_instance_rb.html
|
39
35
|
- doc/files/lib/spider/included_in_memcached_rb.html
|
36
|
+
- doc/files/lib/spider/spider_instance_rb.html
|
37
|
+
- doc/files/lib/spider_rb.html
|
40
38
|
- doc/files/README.html
|
41
|
-
- doc/classes
|
42
|
-
- doc/classes/IncludedInMemcached.html
|
43
|
-
- doc/classes/SpiderInstance.html
|
44
|
-
- doc/classes/Spider.html
|
45
|
-
- doc/fr_file_index.html
|
46
39
|
- doc/fr_class_index.html
|
40
|
+
- doc/fr_file_index.html
|
47
41
|
- doc/fr_method_index.html
|
48
42
|
- doc/index.html
|
49
|
-
- doc/
|
43
|
+
- doc/rdoc-style.css
|
44
|
+
- lib
|
45
|
+
- lib/spider
|
46
|
+
- lib/spider/included_in_memcached.rb
|
47
|
+
- lib/spider/next_urls_in_sqs.rb
|
48
|
+
- lib/spider/robot_rules.rb
|
49
|
+
- lib/spider/spider_instance.rb
|
50
|
+
- lib/spider.rb
|
51
|
+
- lib/test.rb
|
52
|
+
- README
|
50
53
|
- spec
|
54
|
+
- spec/spec_helper.rb
|
51
55
|
- spec/spider
|
52
56
|
- spec/spider/included_in_memcached_spec.rb
|
53
57
|
- spec/spider/spider_instance_spec.rb
|
54
58
|
- spec/spider_spec.rb
|
55
|
-
- spec/spec_helper.rb
|
56
|
-
- README
|
57
59
|
- spider.gemspec
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
- lib/spider
|
62
|
-
- lib/spider/included_in_memcached.rb
|
63
|
-
- lib/spider/robot_rules.rb
|
64
|
-
- lib/spider/spider_instance.rb
|
65
|
-
test_files: []
|
66
|
-
|
60
|
+
has_rdoc: true
|
61
|
+
homepage: http://spider.rubyforge.org/
|
62
|
+
post_install_message:
|
67
63
|
rdoc_options: []
|
68
64
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
65
|
+
require_paths:
|
66
|
+
- lib
|
67
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
68
|
+
requirements:
|
69
|
+
- - ">="
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: "0"
|
72
|
+
version:
|
73
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
74
|
+
requirements:
|
75
|
+
- - ">="
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: "0"
|
78
|
+
version:
|
75
79
|
requirements: []
|
76
80
|
|
77
|
-
|
81
|
+
rubyforge_project: spider
|
82
|
+
rubygems_version: 1.0.1
|
83
|
+
signing_key:
|
84
|
+
specification_version: 2
|
85
|
+
summary: A Web spidering library
|
86
|
+
test_files: []
|
78
87
|
|