spider 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGES +6 -0
- data/README +4 -4
- data/doc/classes/Net/HTTPRedirection.html +144 -0
- data/doc/classes/Net/HTTPResponse.html +166 -0
- data/doc/classes/Net/HTTPSuccess.html +144 -0
- data/doc/classes/NilClass.html +144 -0
- data/doc/classes/Spider.html +12 -12
- data/doc/classes/SpiderInstance.html +109 -32
- data/doc/created.rid +1 -1
- data/doc/files/README.html +5 -5
- data/doc/files/lib/spider_rb.html +5 -5
- data/doc/fr_class_index.html +0 -1
- data/doc/fr_file_index.html +1 -0
- data/doc/fr_method_index.html +5 -2
- data/lib/spider.rb +100 -58
- data/spec/spider_instance_spec.rb +115 -30
- data/spider.gemspec +1 -1
- data/test_server/client.rb +4 -4
- metadata +7 -2
data/doc/fr_class_index.html
CHANGED
data/doc/fr_file_index.html
CHANGED
data/doc/fr_method_index.html
CHANGED
@@ -21,9 +21,12 @@
|
|
21
21
|
<h1 class="section-bar">Methods</h1>
|
22
22
|
<div id="index-entries">
|
23
23
|
<a href="classes/SpiderInstance.html#M000001">add_url_check (SpiderInstance)</a><br />
|
24
|
+
<a href="classes/SpiderInstance.html#M000006">clear_headers (SpiderInstance)</a><br />
|
25
|
+
<a href="classes/SpiderInstance.html#M000005">headers (SpiderInstance)</a><br />
|
24
26
|
<a href="classes/SpiderInstance.html#M000002">on (SpiderInstance)</a><br />
|
25
|
-
<a href="classes/SpiderInstance.html#M000003">
|
26
|
-
<a href="classes/Spider.html#
|
27
|
+
<a href="classes/SpiderInstance.html#M000003">setup (SpiderInstance)</a><br />
|
28
|
+
<a href="classes/Spider.html#M000007">start_at (Spider)</a><br />
|
29
|
+
<a href="classes/SpiderInstance.html#M000004">teardown (SpiderInstance)</a><br />
|
27
30
|
</div>
|
28
31
|
</div>
|
29
32
|
</body>
|
data/lib/spider.rb
CHANGED
@@ -29,15 +29,21 @@ require 'uri'
|
|
29
29
|
require 'net/http'
|
30
30
|
require 'net/https'
|
31
31
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
end
|
36
|
-
|
37
|
-
|
32
|
+
module Net #:nodoc:
|
33
|
+
class HTTPResponse #:nodoc:
|
34
|
+
def success?; false; end
|
35
|
+
def redirect?; false; end
|
36
|
+
end
|
37
|
+
class HTTPSuccess #:nodoc:
|
38
|
+
def success?; true; end
|
39
|
+
end
|
40
|
+
class HTTPRedirection #:nodoc:
|
41
|
+
def redirect?; true; end
|
42
|
+
end
|
38
43
|
end
|
39
|
-
|
40
|
-
|
44
|
+
|
45
|
+
class NilClass #:nodoc:
|
46
|
+
def merge(h); h; end
|
41
47
|
end
|
42
48
|
|
43
49
|
# A spidering library for Ruby. Handles robots.txt, scraping, finding more
|
@@ -52,22 +58,22 @@ class Spider
|
|
52
58
|
# a_url =~ %r{^http://mike-burns.com.*}
|
53
59
|
# end
|
54
60
|
#
|
55
|
-
# s.on 404 do |a_url,
|
61
|
+
# s.on 404 do |a_url, resp, prior_url|
|
56
62
|
# puts "URL not found: #{a_url}"
|
57
63
|
# end
|
58
64
|
#
|
59
|
-
# s.on :success do |a_url,
|
60
|
-
# puts "body: #{body}"
|
65
|
+
# s.on :success do |a_url, resp, prior_url|
|
66
|
+
# puts "body: #{resp.body}"
|
61
67
|
# end
|
62
68
|
#
|
63
|
-
# s.on :
|
69
|
+
# s.on :every do |a_url, resp, prior_url|
|
64
70
|
# puts "URL returned anything: #{a_url} with this code #{resp.code}"
|
65
71
|
# end
|
66
72
|
# end
|
67
73
|
|
68
74
|
def self.start_at(a_url, &block)
|
69
75
|
rules = RobotRules.new('Ruby Spider 1.0')
|
70
|
-
a_spider = SpiderInstance.new(
|
76
|
+
a_spider = SpiderInstance.new({nil => a_url}, [], rules, [])
|
71
77
|
block.call(a_spider)
|
72
78
|
a_spider.start!
|
73
79
|
end
|
@@ -77,11 +83,14 @@ class SpiderInstance
|
|
77
83
|
def initialize(next_urls, seen = [], rules = nil, robots_seen = []) #:nodoc:
|
78
84
|
@url_checks = []
|
79
85
|
@cache = :memory
|
80
|
-
@callbacks = {
|
86
|
+
@callbacks = {}
|
81
87
|
@next_urls = next_urls
|
82
88
|
@seen = seen
|
83
89
|
@rules = rules || RobotRules.new('Ruby Spider 1.0')
|
84
90
|
@robots_seen = robots_seen
|
91
|
+
@headers = {}
|
92
|
+
@setup = nil
|
93
|
+
@teardown = nil
|
85
94
|
end
|
86
95
|
|
87
96
|
# Add a predicate that determines whether to continue down this URL's path.
|
@@ -99,62 +108,88 @@ class SpiderInstance
|
|
99
108
|
@cache = cache_type
|
100
109
|
end
|
101
110
|
|
102
|
-
# Add a response handler. A response handler's trigger can be :
|
103
|
-
# :failure, or any HTTP status code. The handler itself can be
|
104
|
-
#
|
111
|
+
# Add a response handler. A response handler's trigger can be :every,
|
112
|
+
# :success, :failure, or any HTTP status code. The handler itself can be
|
113
|
+
# either a Proc or a block.
|
105
114
|
#
|
106
|
-
#
|
107
|
-
#
|
115
|
+
# The arguments to the block are: the URL as a string, an instance of
|
116
|
+
# Net::HTTPResponse, and the prior URL as a string.
|
108
117
|
#
|
109
|
-
# If the trigger is :success or any HTTP status code that represents a
|
110
|
-
# successful result, the arguments are the URL as a string, the HTTP status
|
111
|
-
# code, an instance of Net::HTTPSuccess, and the body of the result as a
|
112
|
-
# string.
|
113
|
-
#
|
114
|
-
# If the trigger is :failure or any HTTP status code that represents a failed
|
115
|
-
# result, the arguments are the URL as a string and the HTTP status code.
|
116
118
|
#
|
117
119
|
# For example:
|
118
120
|
#
|
119
|
-
# on 404 do |a_url,
|
121
|
+
# on 404 do |a_url, resp, prior_url|
|
120
122
|
# puts "URL not found: #{a_url}"
|
121
123
|
# end
|
122
124
|
#
|
123
|
-
# on :success do |a_url,
|
125
|
+
# on :success do |a_url, resp, prior_url|
|
124
126
|
# puts a_url
|
125
|
-
# puts body
|
127
|
+
# puts resp.body
|
126
128
|
# end
|
127
129
|
#
|
128
|
-
# on :
|
130
|
+
# on :every do |a_url, resp, prior_url|
|
129
131
|
# puts "Given this code: #{resp.code}"
|
130
132
|
# end
|
131
133
|
def on(code, p = nil, &block)
|
132
134
|
f = p ? p : block
|
133
135
|
case code
|
134
136
|
when Fixnum
|
135
|
-
@callbacks[
|
137
|
+
@callbacks[code] = f
|
136
138
|
else
|
137
|
-
|
138
|
-
@callbacks[:any] = f
|
139
|
-
else
|
140
|
-
@callbacks[code.to_sym][:any] = f
|
141
|
-
end
|
139
|
+
@callbacks[code.to_sym] = f
|
142
140
|
end
|
143
141
|
end
|
144
142
|
|
143
|
+
# Run before the HTTP request. Given the URL as a string.
|
144
|
+
# setup do |a_url|
|
145
|
+
# headers['Cookie'] = 'user_id=1;admin=true'
|
146
|
+
# end
|
147
|
+
def setup(p = nil, &block)
|
148
|
+
@setup = p ? p : block
|
149
|
+
end
|
150
|
+
|
151
|
+
# Run last, once for each page. Given the URL as a string.
|
152
|
+
def teardown(p = nil, &block)
|
153
|
+
@teardown = p ? p : block
|
154
|
+
end
|
155
|
+
|
156
|
+
# Use like a hash:
|
157
|
+
# headers['Cookie'] = 'user_id=1;password=btrross3'
|
158
|
+
def headers
|
159
|
+
HeaderSetter.new(self)
|
160
|
+
end
|
161
|
+
|
162
|
+
def raw_headers #:nodoc:
|
163
|
+
@headers
|
164
|
+
end
|
165
|
+
def raw_headers=(v) #:nodoc:
|
166
|
+
@headers = v
|
167
|
+
end
|
168
|
+
|
169
|
+
# Reset the headers hash.
|
170
|
+
def clear_headers
|
171
|
+
@headers = {}
|
172
|
+
end
|
173
|
+
|
145
174
|
def start! #:nodoc:
|
146
175
|
next_urls = @next_urls
|
147
176
|
begin
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
177
|
+
tmp_n_u = {}
|
178
|
+
next_urls.each do |prior_url, urls|
|
179
|
+
urls.map do |a_url|
|
180
|
+
[a_url, (URI.parse(a_url) rescue nil)]
|
181
|
+
end.select do |a_url, parsed_url|
|
182
|
+
allowable_url?(a_url, parsed_url)
|
183
|
+
end.each do |a_url, parsed_url|
|
184
|
+
@setup.call(a_url) unless @setup.nil?
|
185
|
+
get_page(parsed_url) do |response|
|
186
|
+
do_callbacks(a_url, response, prior_url)
|
187
|
+
tmp_n_u[a_url] = generate_next_urls(a_url, response)
|
188
|
+
end
|
189
|
+
@teardown.call(a_url) unless @teardown.nil?
|
190
|
+
end
|
191
|
+
end
|
192
|
+
next_urls = tmp_n_u
|
158
193
|
end while !next_urls.empty?
|
159
194
|
end
|
160
195
|
|
@@ -196,7 +231,8 @@ class SpiderInstance
|
|
196
231
|
http = Net::HTTP.new(parsed_url.host, parsed_url.port)
|
197
232
|
http.use_ssl = parsed_url.scheme == 'https'
|
198
233
|
# Uses start because http.finish cannot be called.
|
199
|
-
r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri
|
234
|
+
r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri,
|
235
|
+
@headers))}
|
200
236
|
if r.redirect?
|
201
237
|
get_page(URI.parse(r['Location']), &block)
|
202
238
|
else
|
@@ -208,16 +244,13 @@ class SpiderInstance
|
|
208
244
|
end
|
209
245
|
end
|
210
246
|
|
211
|
-
def do_callbacks(a_url, resp) #:nodoc:
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
cb_branch = @callbacks[:failure]
|
219
|
-
cb_branch[:any].call(a_url, resp.code) if cb_branch[:any]
|
220
|
-
cb_branch[resp.code].call(a_url, resp.code) if cb_branch[resp.code]
|
247
|
+
def do_callbacks(a_url, resp, prior_url) #:nodoc:
|
248
|
+
cbs = [@callbacks[:every],
|
249
|
+
resp.success? ? @callbacks[:success] : @callbacks[:failure],
|
250
|
+
@callbacks[resp.code]]
|
251
|
+
|
252
|
+
cbs.each do |cb|
|
253
|
+
cb.call(a_url, resp, prior_url) if cb
|
221
254
|
end
|
222
255
|
end
|
223
256
|
|
@@ -254,7 +287,16 @@ class SpiderInstance
|
|
254
287
|
end.compact
|
255
288
|
end
|
256
289
|
|
257
|
-
def remove_trailing_slash(s)
|
290
|
+
def remove_trailing_slash(s) #:nodoc:
|
258
291
|
s.sub(%r{/*$},'')
|
259
292
|
end
|
293
|
+
|
294
|
+
class HeaderSetter #:nodoc:
|
295
|
+
def initialize(si)
|
296
|
+
@si = si
|
297
|
+
end
|
298
|
+
def []=(k,v)
|
299
|
+
@si.raw_headers = @si.raw_headers.merge({k => v})
|
300
|
+
end
|
301
|
+
end
|
260
302
|
end
|
@@ -23,6 +23,80 @@ def null_logger
|
|
23
23
|
end
|
24
24
|
|
25
25
|
describe 'SpiderInstance' do
|
26
|
+
it 'should call the "setup" callback before loading the Web page' do
|
27
|
+
mock_successful_http
|
28
|
+
@on_called = false
|
29
|
+
@before_called = false
|
30
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
31
|
+
si.stubs(:allowed?).returns(true)
|
32
|
+
si.stubs(:generate_next_urls).returns([])
|
33
|
+
si.setup { |*a| @before_called = Time.now }
|
34
|
+
si.on(:every) { |*a| @on_called = Time.now }
|
35
|
+
si.start!
|
36
|
+
@on_called.should_not be_false
|
37
|
+
@before_called.should_not be_false
|
38
|
+
@before_called.should_not be_false
|
39
|
+
@before_called.should < @on_called
|
40
|
+
end
|
41
|
+
|
42
|
+
it 'should call the "teardown" callback after running all other callbacks' do
|
43
|
+
mock_successful_http
|
44
|
+
@on_called = false
|
45
|
+
@after_called = false
|
46
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
47
|
+
si.stubs(:allowed?).returns(true)
|
48
|
+
si.stubs(:generate_next_urls).returns([])
|
49
|
+
si.on(:every) { |*a| @on_called = Time.now }
|
50
|
+
si.teardown { |*a| @after_called = Time.now }
|
51
|
+
si.start!
|
52
|
+
@on_called.should_not be_false
|
53
|
+
@after_called.should_not be_false
|
54
|
+
@after_called.should_not be_false
|
55
|
+
@after_called.should > @on_called
|
56
|
+
end
|
57
|
+
|
58
|
+
it 'should pass headers set by a setup handler to the HTTP request' do
|
59
|
+
mock_successful_http
|
60
|
+
Net::HTTP::Get.expects(:new).with('/foo',{'X-Header-Set' => 'True'})
|
61
|
+
si = SpiderInstance.new(nil => ['http://example.com/foo'])
|
62
|
+
si.stubs(:allowable_url?).returns(true)
|
63
|
+
si.stubs(:generate_next_urls).returns([])
|
64
|
+
si.setup do |a_url|
|
65
|
+
si.headers['X-Header-Set'] = 'True'
|
66
|
+
end
|
67
|
+
si.teardown do |a_url|
|
68
|
+
si.clear_headers
|
69
|
+
end
|
70
|
+
si.start!
|
71
|
+
end
|
72
|
+
|
73
|
+
it 'should allow for a proxy' # fill in more
|
74
|
+
|
75
|
+
it 'should call the :every callback with the current URL, the response, and the prior URL' do
|
76
|
+
mock_successful_http
|
77
|
+
callback_arguments_on(:every)
|
78
|
+
end
|
79
|
+
|
80
|
+
it 'should call the :success callback with the current URL, the request, and the prior URL' do
|
81
|
+
mock_successful_http
|
82
|
+
callback_arguments_on(:success)
|
83
|
+
end
|
84
|
+
|
85
|
+
it 'should call the :failure callback with the current URL, the request, and the prior URL' do
|
86
|
+
mock_failed_http
|
87
|
+
callback_arguments_on(:failure)
|
88
|
+
end
|
89
|
+
|
90
|
+
it 'should call the HTTP status error code callback with the current URL, the request, and the prior URL' do
|
91
|
+
mock_failed_http
|
92
|
+
callback_arguments_on(404)
|
93
|
+
end
|
94
|
+
|
95
|
+
it 'should call the HTTP status success code callback with the current URL, the request, and the prior URL' do
|
96
|
+
mock_successful_http
|
97
|
+
callback_arguments_on(200)
|
98
|
+
end
|
99
|
+
|
26
100
|
# Bug reported by John Nagro, using the example source http://eons.com/
|
27
101
|
# had to change line 192; uses request_uri now instead of path.
|
28
102
|
it 'should handle query URLs without a path' do
|
@@ -33,7 +107,7 @@ describe 'SpiderInstance' do
|
|
33
107
|
:AccessLog => [])
|
34
108
|
server.mount('/', QueryServlet)
|
35
109
|
Thread.new {server.start}
|
36
|
-
si = SpiderInstance.new([u])
|
110
|
+
si = SpiderInstance.new({nil => [u]})
|
37
111
|
si.get_page(u_p) do
|
38
112
|
@block_called = true
|
39
113
|
end
|
@@ -47,7 +121,7 @@ describe 'SpiderInstance' do
|
|
47
121
|
u_p = URI.parse(u)
|
48
122
|
@redirect_handled = false
|
49
123
|
mock_redirect_http
|
50
|
-
si = SpiderInstance.new([u])
|
124
|
+
si = SpiderInstance.new({nil => [u]})
|
51
125
|
si.get_page(u_p) do
|
52
126
|
@redirect_handled = true
|
53
127
|
end
|
@@ -66,7 +140,7 @@ describe 'SpiderInstance' do
|
|
66
140
|
:SSLComment => 'Comment of some sort')
|
67
141
|
server.mount('/', QueryServlet)
|
68
142
|
Thread.new {server.start}
|
69
|
-
si = SpiderInstance.new([u])
|
143
|
+
si = SpiderInstance.new({nil => [u]})
|
70
144
|
si.get_page(u_p) { @page_called = true }
|
71
145
|
server.shutdown
|
72
146
|
@page_called.should be_true
|
@@ -79,7 +153,7 @@ describe 'SpiderInstance' do
|
|
79
153
|
u_p = URI.parse(u)
|
80
154
|
http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
|
81
155
|
Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
|
82
|
-
si = SpiderInstance.new([u])
|
156
|
+
si = SpiderInstance.new({nil => [u]})
|
83
157
|
si.expects(:allowable_url?).with(u, u_p).returns(false)
|
84
158
|
si.expects(:get_page).times(0)
|
85
159
|
si.start!
|
@@ -90,9 +164,8 @@ describe 'SpiderInstance' do
|
|
90
164
|
u_p = URI.parse(u)
|
91
165
|
http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
|
92
166
|
Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
|
93
|
-
si = SpiderInstance.new([u])
|
167
|
+
si = SpiderInstance.new({nil => [u]})
|
94
168
|
si.expects(:allowable_url?).with(u, u_p).returns(true)
|
95
|
-
si.expects(:allowable_url?).with(nil, nil).returns(false)
|
96
169
|
si.expects(:get_page).with(URI.parse(u))
|
97
170
|
si.start!
|
98
171
|
end
|
@@ -106,25 +179,25 @@ describe 'SpiderInstance' do
|
|
106
179
|
robot_rules.expects(:parse).with('http://example.com:80/robots.txt',
|
107
180
|
'robots.txt content')
|
108
181
|
robot_rules.expects(:allowed?).with('http://example.com/').returns(false)
|
109
|
-
si = SpiderInstance.new(['http://example.com/'], [], robot_rules, [])
|
182
|
+
si = SpiderInstance.new({nil => ['http://example.com/']}, [], robot_rules, [])
|
110
183
|
allowable = si.allowable_url?('http://example.com/',
|
111
184
|
URI.parse('http://example.com/'))
|
112
|
-
allowable.should
|
185
|
+
allowable.should be_false
|
113
186
|
end
|
114
187
|
|
115
188
|
it 'should disallow URLs when they fail any url_check' do
|
116
|
-
si = SpiderInstance.new(['http://example.com/'])
|
189
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
117
190
|
si.stubs(:allowed?).returns(true)
|
118
191
|
si.add_url_check { |a_url| false }
|
119
192
|
allowable = si.allowable_url?('http://example.com/',
|
120
193
|
URI.parse('http://example.com/'))
|
121
|
-
allowable.should
|
194
|
+
allowable.should be_false
|
122
195
|
end
|
123
196
|
|
124
197
|
it 'should support multiple url_checks' do
|
125
198
|
@first_url_check = false
|
126
199
|
@second_url_check = false
|
127
|
-
si = SpiderInstance.new(['http://example.com/'])
|
200
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
128
201
|
si.stubs(:allowed?).returns(true)
|
129
202
|
si.add_url_check do |a_url|
|
130
203
|
@first_url_check = true
|
@@ -136,7 +209,7 @@ describe 'SpiderInstance' do
|
|
136
209
|
end
|
137
210
|
allowable = si.allowable_url?('http://example.com/',
|
138
211
|
URI.parse('http://example.com/'))
|
139
|
-
allowable.should
|
212
|
+
allowable.should be_false
|
140
213
|
@first_url_check.should be_true
|
141
214
|
@second_url_check.should be_true
|
142
215
|
end
|
@@ -144,17 +217,17 @@ describe 'SpiderInstance' do
|
|
144
217
|
it 'should avoid cycles' do
|
145
218
|
u = 'http://example.com/'
|
146
219
|
u_p = URI.parse(u)
|
147
|
-
si = SpiderInstance.new([u], [u_p])
|
220
|
+
si = SpiderInstance.new({nil => [u]}, [u_p])
|
148
221
|
si.stubs(:allowed?).returns(true)
|
149
222
|
allowable = si.allowable_url?(u, u_p)
|
150
|
-
allowable.should
|
223
|
+
allowable.should be_false
|
151
224
|
u_p.should_not be_nil
|
152
225
|
end
|
153
226
|
|
154
227
|
it 'should call the 404 handler for 404s' do
|
155
228
|
@proc_called = false
|
156
229
|
mock_failed_http
|
157
|
-
si = SpiderInstance.new(['http://example.com/'])
|
230
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
158
231
|
si.stubs(:allowed?).returns(true)
|
159
232
|
si.stubs(:generate_next_urls).returns([])
|
160
233
|
si.on(404) {|*a| @proc_called = true}
|
@@ -165,7 +238,7 @@ describe 'SpiderInstance' do
|
|
165
238
|
it 'should call the :success handler on success' do
|
166
239
|
@proc_called = false
|
167
240
|
mock_successful_http
|
168
|
-
si = SpiderInstance.new(['http://example.com/'])
|
241
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
169
242
|
si.stubs(:allowed?).returns(true)
|
170
243
|
si.stubs(:generate_next_urls).returns([])
|
171
244
|
si.on(:success) {|*a| @proc_called = true}
|
@@ -176,19 +249,19 @@ describe 'SpiderInstance' do
|
|
176
249
|
it 'should not call the :success handler on failure' do
|
177
250
|
@proc_called = false
|
178
251
|
mock_failed_http
|
179
|
-
si = SpiderInstance.new(['http://example.com/'])
|
252
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
180
253
|
si.stubs(:allowed?).returns(true)
|
181
254
|
si.stubs(:generate_next_urls).returns([])
|
182
255
|
si.on(:success) {|*a| @proc_called = true}
|
183
256
|
si.start!
|
184
|
-
@proc_called.should
|
257
|
+
@proc_called.should be_false
|
185
258
|
end
|
186
259
|
|
187
260
|
it 'should call the :success handler and the 200 handler on 200' do
|
188
261
|
@proc_200_called = false
|
189
262
|
@proc_success_called = false
|
190
263
|
mock_successful_http
|
191
|
-
si = SpiderInstance.new(['http://example.com/'])
|
264
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
192
265
|
si.stubs(:allowed?).returns(true)
|
193
266
|
si.stubs(:generate_next_urls).returns([])
|
194
267
|
si.on(:success) {|*a| @proc_success_called = true}
|
@@ -201,18 +274,18 @@ describe 'SpiderInstance' do
|
|
201
274
|
it 'should not call the :failure handler on success' do
|
202
275
|
@proc_called = false
|
203
276
|
mock_successful_http
|
204
|
-
si = SpiderInstance.new(['http://example.com/'])
|
277
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
205
278
|
si.stubs(:allowed?).returns(true)
|
206
279
|
si.stubs(:generate_next_urls).returns([])
|
207
280
|
si.on(:failure) {|*a| @proc_called = true}
|
208
281
|
si.start!
|
209
|
-
@proc_called.should
|
282
|
+
@proc_called.should be_false
|
210
283
|
end
|
211
284
|
|
212
285
|
it 'should call the :failure handler on failure' do
|
213
286
|
@proc_called = false
|
214
287
|
mock_failed_http
|
215
|
-
si = SpiderInstance.new(['http://example.com/'])
|
288
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
216
289
|
si.stubs(:allowed?).returns(true)
|
217
290
|
si.stubs(:generate_next_urls).returns([])
|
218
291
|
si.on(:failure) {|*a| @proc_called = true}
|
@@ -224,7 +297,7 @@ describe 'SpiderInstance' do
|
|
224
297
|
@proc_404_called = false
|
225
298
|
@proc_failure_called = false
|
226
299
|
mock_failed_http
|
227
|
-
si = SpiderInstance.new(['http://example.com/'])
|
300
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
228
301
|
si.stubs(:allowed?).returns(true)
|
229
302
|
si.stubs(:generate_next_urls).returns([])
|
230
303
|
si.on(:failure) {|*a| @proc_failure_called = true}
|
@@ -234,13 +307,13 @@ describe 'SpiderInstance' do
|
|
234
307
|
@proc_failure_called.should be_true
|
235
308
|
end
|
236
309
|
|
237
|
-
it 'should call the :
|
310
|
+
it 'should call the :every handler even when a handler for the error code is defined' do
|
238
311
|
@any_called = false
|
239
312
|
mock_successful_http
|
240
|
-
si = SpiderInstance.new(['http://example.com/'])
|
313
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
241
314
|
si.stubs(:allowed?).returns(true)
|
242
315
|
si.stubs(:generate_next_urls).returns([])
|
243
|
-
si.on(:
|
316
|
+
si.on(:every) { |*a| @any_called = true }
|
244
317
|
si.on(202) {|*a|}
|
245
318
|
si.start!
|
246
319
|
@any_called.should be_true
|
@@ -249,10 +322,10 @@ describe 'SpiderInstance' do
|
|
249
322
|
it 'should support a block as a response handler' do
|
250
323
|
@proc_called = false
|
251
324
|
mock_successful_http
|
252
|
-
si = SpiderInstance.new(['http://example.com/'])
|
325
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
253
326
|
si.stubs(:allowed?).returns(true)
|
254
327
|
si.stubs(:generate_next_urls).returns([])
|
255
|
-
si.on(:
|
328
|
+
si.on(:every) { |*a| @proc_called = true }
|
256
329
|
si.start!
|
257
330
|
@proc_called.should be_true
|
258
331
|
end
|
@@ -260,10 +333,10 @@ describe 'SpiderInstance' do
|
|
260
333
|
it 'should support a proc as a response handler' do
|
261
334
|
@proc_called = false
|
262
335
|
mock_successful_http
|
263
|
-
si = SpiderInstance.new(['http://example.com/'])
|
336
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
264
337
|
si.stubs(:allowed?).returns(true)
|
265
338
|
si.stubs(:generate_next_urls).returns([])
|
266
|
-
si.on(:
|
339
|
+
si.on(:every, Proc.new { |*a| @proc_called = true })
|
267
340
|
si.start!
|
268
341
|
@proc_called.should be_true
|
269
342
|
end
|
@@ -297,4 +370,16 @@ describe 'SpiderInstance' do
|
|
297
370
|
yields(mock(:request => http_req2)).returns(http_req2)
|
298
371
|
Net::HTTP.expects(:new).times(2).returns(http_obj).then.returns(http_obj2)
|
299
372
|
end
|
373
|
+
|
374
|
+
def callback_arguments_on(code)
|
375
|
+
si = SpiderInstance.new('http://foo.com/' => ['http://example.com/'])
|
376
|
+
si.stubs(:allowed?).returns(true)
|
377
|
+
si.stubs(:generate_next_urls).returns([])
|
378
|
+
si.on(code) do |a_url, resp, prior_url|
|
379
|
+
a_url.should == 'http://example.com/'
|
380
|
+
resp.should_not be_nil
|
381
|
+
prior_url.should == 'http://foo.com/'
|
382
|
+
end
|
383
|
+
si.start!
|
384
|
+
end
|
300
385
|
end
|