spider 0.4.1 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +5 -0
- data/README +28 -2
- data/lib/spider.rb +1 -1
- data/lib/spider/included_in_memcached.rb +2 -1
- data/lib/spider/next_urls_in_sqs.rb +66 -0
- data/lib/spider/spider_instance.rb +38 -7
- data/lib/test.rb +27 -0
- data/spider.gemspec +4 -3
- metadata +61 -52
data/CHANGES
CHANGED
data/README
CHANGED
@@ -75,6 +75,31 @@ scraping, collecting, and looping so that you can just handle the data.
     s.check_already_seen_with ExpireLinks.new
   end
 
+=== Store nodes to visit with Amazon SQS
+
+  require 'spider'
+  require 'spider/next_urls_in_sqs'
+  Spider.start_at('http://mike-burns.com') do |s|
+    s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)
+  end
+
+==== Store nodes to visit with a custom object
+
+  require 'spider'
+  class MyArray < Array
+    def pop
+      super
+    end
+
+    def push(a_msg)
+      super(a_msg)
+    end
+  end
+
+  Spider.start_at('http://mike-burns.com') do |s|
+    s.store_next_urls_with MyArray.new
+  end
+
 === Create a URL graph
 
   require 'spider'
@@ -106,9 +131,10 @@ scraping, collecting, and looping so that you can just handle the data.
 
 == Author
 
-
+John Nagro john.nagro@gmail.com
+Mike Burns http://mike-burns.com mike@mike-burns.com (original author)
 
-Help from Matt Horan,
+Help from Matt Horan, and Henri Cook.
 
 With `robot_rules' from James Edward Gray II via
 http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589
data/lib/spider.rb
CHANGED
data/lib/spider/included_in_memcached.rb
CHANGED
@@ -1,5 +1,6 @@
 # Use memcached to track cycles.
-
+#
+# Copyright 2007 Mike Burns
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
 #     * Redistributions of source code must retain the above copyright
data/lib/spider/next_urls_in_sqs.rb
ADDED
@@ -0,0 +1,66 @@
+# Use AmazonSQS to track nodes to visit.
+#
+# Copyright 2008 John Nagro
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name Mike Burns nor the
+#       names of his contributors may be used to endorse or promote products
+#       derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+require 'rubygems'
+require 'right_aws'
+require 'yaml'
+
+# A specialized class using AmazonSQS to track nodes to walk. It supports
+# two operations: push and pop. Together these can be used to
+# add items to the queue, then pull items off the queue.
+#
+# This is useful if you want multiple Spider processes crawling the same
+# data set.
+#
+# To use it with Spider use the store_next_urls_with method:
+#
+#  Spider.start_at('http://example.com/') do |s|
+#    s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, queue_name)
+#  end
+class NextUrlsInSQS
+  # Construct a new NextUrlsInSQS instance. All arguments here are
+  # passed to RightAWS::SqsGen2 (part of the right_aws gem) or used
+  # to set the AmazonSQS queue name (optional).
+  def initialize(aws_access_key, aws_secret_access_key, queue_name = 'ruby-spider')
+    @sqs = RightAws::SqsGen2.new(aws_access_key, aws_secret_access_key)
+    @queue = @sqs.queue(queue_name)
+  end
+
+  # Pull an item off the queue, loop until data is found. Data is
+  # encoded with YAML.
+  def pop
+    while true
+      message = @queue.pop
+      return YAML::load(message.to_s) unless message.nil?
+      sleep 5
+    end
+  end
+
+  # Put data on the queue. Data is encoded with YAML.
+  def push(a_msg)
+    encoded_message = YAML::dump(a_msg)
+    @queue.push(a_msg)
+  end
+end
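Two details of NextUrlsInSQS above matter when writing a compatible store: push computes encoded_message but then enqueues the raw a_msg, and the reworked start! loop further down also calls empty? on the store, not only push and pop. As an illustration of that contract, here is a minimal in-memory stand-in (NextUrlsInMemory is a hypothetical name, not part of the gem) that keeps the same interface and the same YAML round-trip:

  require 'yaml'

  # Hypothetical stand-in for NextUrlsInSQS: same push/pop/empty?
  # surface, same YAML round-trip, no SQS involved.
  class NextUrlsInMemory
    def initialize
      @messages = []
    end

    # Encode on the way in, mirroring NextUrlsInSQS#push.
    def push(a_msg)
      @messages.push(YAML::dump(a_msg))
    end

    # Decode on the way out, mirroring NextUrlsInSQS#pop (without the polling loop).
    def pop
      message = @messages.pop
      YAML::load(message.to_s) unless message.nil?
    end

    # The 0.4.2 crawl loop ends with `end while !@next_urls.empty?`,
    # so a custom store should answer empty? as well.
    def empty?
      @messages.empty?
    end
  end

  # Usage, analogous to the SQS example in the README above:
  #   Spider.start_at('http://example.com/') do |s|
  #     s.store_next_urls_with NextUrlsInMemory.new
  #   end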
data/lib/spider/spider_instance.rb
CHANGED
@@ -1,6 +1,6 @@
 # Specialized spidering rules.
 
-# Copyright 2007 Mike Burns
+# Copyright 2007-2008 Mike Burns & John Nagro
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
 #     * Redistributions of source code must retain the above copyright
@@ -51,7 +51,7 @@ class SpiderInstance
     @url_checks = []
     @cache = :memory
     @callbacks = {}
-    @next_urls = next_urls
+    @next_urls = [next_urls]
     @seen = seen
     @rules = rules || RobotRules.new('Ruby Spider 1.0')
     @robots_seen = robots_seen
@@ -96,6 +96,30 @@ class SpiderInstance
     end
   end
 
+  # The Web is a really, really, really big graph; as such, this list
+  # of nodes to visit grows really, really, really big.
+  #
+  # Change the object used to store nodes we have yet to walk. The default
+  # object is an instance of Array. Available with Spider is a wrapper of
+  # AmazonSQS.
+  #
+  # You can implement a custom class for this; any object passed to
+  # check_already_seen_with must understand just push and pop.
+  #
+  #  # default
+  #  store_next_urls_with Array.new
+  #
+  #  # AmazonSQS
+  #  require 'spider/next_urls_in_sqs'
+  #  store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, queue_name)
+  def store_next_urls_with(a_store)
+    tmp_next_urls = @next_urls
+    @next_urls = a_store
+    tmp_next_urls.each do |a_url_hash|
+      @next_urls.push a_url_hash
+    end
+  end
+
   # Add a response handler. A response handler's trigger can be :every,
   # :success, :failure, or any HTTP status code. The handler itself can be
   # either a Proc or a block.
@@ -159,9 +183,11 @@ class SpiderInstance
     @headers = {}
   end
 
-  def start! #:nodoc:
-
+  def start! #:nodoc:
+    interrupted = false
+    trap("SIGINT") { interrupted = true }
     begin
+      next_urls = @next_urls.pop
       tmp_n_u = {}
      next_urls.each do |prior_url, urls|
        urls.map do |a_url|
@@ -172,13 +198,18 @@ class SpiderInstance
          @setup.call(a_url) unless @setup.nil?
          get_page(parsed_url) do |response|
            do_callbacks(a_url, response, prior_url)
-           tmp_n_u[a_url] = generate_next_urls(a_url, response)
+           #tmp_n_u[a_url] = generate_next_urls(a_url, response)
+           #@next_urls.push tmp_n_u
+           generate_next_urls(a_url, response).each do |a_next_url|
+             @next_urls.push a_url => a_next_url
+           end
+           #exit if interrupted
          end
          @teardown.call(a_url) unless @teardown.nil?
+         exit if interrupted
        end
      end
-
-    end while !next_urls.empty?
+    end while !@next_urls.empty?
   end
 
   def success_or_failure(code) #:nodoc:
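The interrupt handling added to start! above is a stock Ruby pattern: trap("SIGINT") only flips a flag, and the crawl loop checks it at a safe point between URLs rather than dying mid-request. A standalone sketch of the same idea (illustrative only, not code from the gem):

  interrupted = false
  trap("SIGINT") { interrupted = true }   # Ctrl-C just sets the flag

  queue = %w[one two three four five]
  begin
    item = queue.pop
    puts "working on #{item}"
    sleep 1                  # stand-in for fetching and parsing a page
    exit if interrupted      # bail out between items, as start! does per URL
  end while !queue.empty?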
data/lib/test.rb
ADDED
@@ -0,0 +1,27 @@
+require 'spider.rb'
+require 'spider/next_urls_in_sqs.rb'
+
+class MyArray < Array
+  def pop
+    a_msg = super
+    puts "pop: #{a_msg.inspect}"
+    return a_msg
+  end
+
+  def push(a_msg)
+    puts "push: #{a_msg.inspect}"
+    super(a_msg)
+  end
+end
+
+AWS_ACCESS_KEY = '0YA99M8Y09J2D4FEC602'
+AWS_SECRET_ACCESS_KEY = 'Sc9R9uiwbFYz7XhQqkPvSK3Bbq4tPYPVMWyDlF+a'
+
+#Spider.start_at("http://docs.huihoo.com/ruby/ruby-man-1.4/function.html") do |s|
+Spider.start_at("http://www.google.com") do |s|
+  #s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)
+  s.store_next_urls_with MyArray.new
+  s.on(:every) do |a_url, resp, prior_url|
+    puts a_url
+  end
+end
data/spider.gemspec
CHANGED
@@ -1,11 +1,12 @@
 require 'rubygems'
 
 spec = Gem::Specification.new do |s|
-  s.author = '
-  s.email = '
+  s.author = 'John Nagro'
+  s.email = 'john.nagro@gmail.com'
   s.has_rdoc = true
   s.homepage = 'http://spider.rubyforge.org/'
   s.name = 'spider'
+  s.rubyforge_project = 'spider'
   s.summary = 'A Web spidering library'
   s.files = Dir['**/*'].delete_if { |f| f =~ /(cvs|gem|svn)$/i }
   s.require_path = 'lib'
@@ -13,5 +14,5 @@ spec = Gem::Specification.new do |s|
 A Web spidering library: handles robots.txt, scraping, finding more
 links, and doing it all over again.
 EOF
-  s.version = '0.4.
+  s.version = '0.4.2'
 end
metadata
CHANGED
@@ -1,78 +1,87 @@
 --- !ruby/object:Gem::Specification
-rubygems_version: 0.9.4
-specification_version: 1
 name: spider
 version: !ruby/object:Gem::Version
-  version: 0.4.
-date: 2007-11-10 00:00:00 -05:00
-summary: A Web spidering library
-require_paths:
-- lib
-email: mike@mike-burns.com
-homepage: http://spider.rubyforge.org/
-rubyforge_project:
-description: "A Web spidering library: handles robots.txt, scraping, finding more links, and doing it all over again."
-autorequire:
-default_executable:
-bindir: bin
-has_rdoc: true
-required_ruby_version: !ruby/object:Gem::Version::Requirement
-  requirements:
-  - - ">"
-    - !ruby/object:Gem::Version
-      version: 0.0.0
-  version:
+  version: 0.4.2
 platform: ruby
-signing_key:
-cert_chain:
-post_install_message:
 authors:
--
+- John Nagro
+autorequire:
+bindir: bin
+cert_chain: []
+
+date: 2008-07-06 00:00:00 -04:00
+default_executable:
+dependencies: []
+
+description: "A Web spidering library: handles robots.txt, scraping, finding more links, and doing it all over again."
+email: john.nagro@gmail.com
+executables: []
+
+extensions: []
+
+extra_rdoc_files: []
+
 files:
+- CHANGES
 - doc
-- doc/
+- doc/classes
+- doc/classes/IncludedInMemcached.html
+- doc/classes/Spider.html
+- doc/classes/SpiderInstance.html
+- doc/created.rid
 - doc/files
 - doc/files/lib
-- doc/files/lib/spider_rb.html
 - doc/files/lib/spider
-- doc/files/lib/spider/spider_instance_rb.html
 - doc/files/lib/spider/included_in_memcached_rb.html
+- doc/files/lib/spider/spider_instance_rb.html
+- doc/files/lib/spider_rb.html
 - doc/files/README.html
-- doc/classes
-- doc/classes/IncludedInMemcached.html
-- doc/classes/SpiderInstance.html
-- doc/classes/Spider.html
-- doc/fr_file_index.html
 - doc/fr_class_index.html
+- doc/fr_file_index.html
 - doc/fr_method_index.html
 - doc/index.html
-- doc/
+- doc/rdoc-style.css
+- lib
+- lib/spider
+- lib/spider/included_in_memcached.rb
+- lib/spider/next_urls_in_sqs.rb
+- lib/spider/robot_rules.rb
+- lib/spider/spider_instance.rb
+- lib/spider.rb
+- lib/test.rb
+- README
 - spec
+- spec/spec_helper.rb
 - spec/spider
 - spec/spider/included_in_memcached_spec.rb
 - spec/spider/spider_instance_spec.rb
 - spec/spider_spec.rb
-- spec/spec_helper.rb
-- README
 - spider.gemspec
-
-
-
-- lib/spider
-- lib/spider/included_in_memcached.rb
-- lib/spider/robot_rules.rb
-- lib/spider/spider_instance.rb
-test_files: []
-
+has_rdoc: true
+homepage: http://spider.rubyforge.org/
+post_install_message:
 rdoc_options: []
 
-
-
-
-
-
-
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
 requirements: []
 
-
+rubyforge_project: spider
+rubygems_version: 1.0.1
+signing_key:
+specification_version: 2
+summary: A Web spidering library
+test_files: []
 