spider 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,7 +20,6 @@
 <div id="index">
 <h1 class="section-bar">Classes</h1>
 <div id="index-entries">
-<a href="classes/Net.html">Net</a><br />
 <a href="classes/Spider.html">Spider</a><br />
 <a href="classes/SpiderInstance.html">SpiderInstance</a><br />
 </div>
@@ -20,6 +20,7 @@
 <div id="index">
 <h1 class="section-bar">Files</h1>
 <div id="index-entries">
+<a href="files/README.html">README</a><br />
 <a href="files/lib/spider_rb.html">lib/spider.rb</a><br />
 </div>
 </div>
@@ -21,9 +21,12 @@
 <h1 class="section-bar">Methods</h1>
 <div id="index-entries">
 <a href="classes/SpiderInstance.html#M000001">add_url_check (SpiderInstance)</a><br />
+<a href="classes/SpiderInstance.html#M000006">clear_headers (SpiderInstance)</a><br />
+<a href="classes/SpiderInstance.html#M000005">headers (SpiderInstance)</a><br />
 <a href="classes/SpiderInstance.html#M000002">on (SpiderInstance)</a><br />
-<a href="classes/SpiderInstance.html#M000003">remove_trailing_slash (SpiderInstance)</a><br />
-<a href="classes/Spider.html#M000004">start_at (Spider)</a><br />
+<a href="classes/SpiderInstance.html#M000003">setup (SpiderInstance)</a><br />
+<a href="classes/Spider.html#M000007">start_at (Spider)</a><br />
+<a href="classes/SpiderInstance.html#M000004">teardown (SpiderInstance)</a><br />
 </div>
 </div>
 </body>
@@ -29,15 +29,21 @@ require 'uri'
 require 'net/http'
 require 'net/https'
 
-class Net::HTTPResponse #:nodoc:
-  def success?; false; end
-  def redirect?; false; end
-end
-class Net::HTTPSuccess #:nodoc:
-  def success?; true; end
+module Net #:nodoc:
+  class HTTPResponse #:nodoc:
+    def success?; false; end
+    def redirect?; false; end
+  end
+  class HTTPSuccess #:nodoc:
+    def success?; true; end
+  end
+  class HTTPRedirection #:nodoc:
+    def redirect?; true; end
+  end
 end
-class Net::HTTPRedirection #:nodoc:
-  def redirect?; true; end
+
+class NilClass #:nodoc:
+  def merge(h); h; end
 end
 
 # A spidering library for Ruby. Handles robots.txt, scraping, finding more
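
The restructured patches above do two things: every Net::HTTPResponse gains success? and redirect? predicates (false by default, overridden to true on HTTPSuccess and HTTPRedirection respectively), and NilClass gains a merge that simply returns its argument, so a headers hash that has not been set yet can be merged into without a nil guard. A minimal sketch of the resulting behaviour, assuming the 0.3.0 lib/spider.rb is loaded:

    require 'spider'

    # NilClass#merge hands back the other hash unchanged.
    nil.merge({'User-Agent' => 'Ruby Spider 1.0'})
    # => {'User-Agent' => 'Ruby Spider 1.0'}

    # The predicates come from the reopened Net classes; the response here is
    # built by hand purely for illustration.
    ok = Net::HTTPOK.new('1.1', '200', 'OK')
    ok.success?    # => true  (Net::HTTPOK < Net::HTTPSuccess)
    ok.redirect?   # => false (default from Net::HTTPResponse)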
@@ -52,22 +58,22 @@ class Spider
   #     a_url =~ %r{^http://mike-burns.com.*}
   #   end
   #
-  #   s.on 404 do |a_url, err_code|
+  #   s.on 404 do |a_url, resp, prior_url|
   #     puts "URL not found: #{a_url}"
   #   end
   #
-  #   s.on :success do |a_url, code, headers, body|
-  #     puts "body: #{body}"
+  #   s.on :success do |a_url, resp, prior_url|
+  #     puts "body: #{resp.body}"
   #   end
   #
-  #   s.on :any do |a_url, resp|
+  #   s.on :every do |a_url, resp, prior_url|
   #     puts "URL returned anything: #{a_url} with this code #{resp.code}"
   #   end
   # end
 
   def self.start_at(a_url, &block)
     rules = RobotRules.new('Ruby Spider 1.0')
-    a_spider = SpiderInstance.new([a_url], [], rules, [])
+    a_spider = SpiderInstance.new({nil => a_url}, [], rules, [])
     block.call(a_spider)
     a_spider.start!
   end
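
As the updated RDoc example shows, 0.3.0 renames the :any trigger to :every and gives every handler the same three block arguments: the current URL, the Net::HTTPResponse, and the URL the link was found on. The prior URL is nil for the starting page, which is why start_at now seeds the work queue with {nil => a_url} instead of [a_url]. A usage sketch against a placeholder host:

    require 'spider'

    Spider.start_at('http://example.com/') do |s|
      s.add_url_check do |a_url|
        a_url =~ %r{^http://example\.com.*}   # stay on the one site
      end

      s.on(:every) do |a_url, resp, prior_url|
        puts "#{resp.code} #{a_url} (found on #{prior_url.inspect})"
      end

      s.on(404) do |a_url, resp, prior_url|
        puts "broken link to #{a_url} on #{prior_url}"
      end
    end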
@@ -77,11 +83,14 @@ class SpiderInstance
   def initialize(next_urls, seen = [], rules = nil, robots_seen = []) #:nodoc:
     @url_checks = []
     @cache = :memory
-    @callbacks = {:any => lambda {}, :success => {}, :failure => {}}
+    @callbacks = {}
     @next_urls = next_urls
     @seen = seen
     @rules = rules || RobotRules.new('Ruby Spider 1.0')
     @robots_seen = robots_seen
+    @headers = {}
+    @setup = nil
+    @teardown = nil
   end
 
   # Add a predicate that determines whether to continue down this URL's path.
@@ -99,62 +108,88 @@ class SpiderInstance
     @cache = cache_type
   end
 
-  # Add a response handler. A response handler's trigger can be :any, :success,
-  # :failure, or any HTTP status code. The handler itself can be either a Proc
-  # or a block. The arguments to the block depends on the trigger:
+  # Add a response handler. A response handler's trigger can be :every,
+  # :success, :failure, or any HTTP status code. The handler itself can be
+  # either a Proc or a block.
   #
-  # If the trigger is :any, the arguments are the URL as a string and an
-  # instance of Net::HTTPResponse.
+  # The arguments to the block are: the URL as a string, an instance of
+  # Net::HTTPResponse, and the prior URL as a string.
   #
-  # If the trigger is :success or any HTTP status code that represents a
-  # successful result, the arguments are the URL as a string, the HTTP status
-  # code, an instance of Net::HTTPSuccess, and the body of the result as a
-  # string.
-  #
-  # If the trigger is :failure or any HTTP status code that represents a failed
-  # result, the arguments are the URL as a string and the HTTP status code.
   #
   # For example:
   #
-  #   on 404 do |a_url, code|
+  #   on 404 do |a_url, resp, prior_url|
   #     puts "URL not found: #{a_url}"
   #   end
   #
-  #   on :success do |a_url, code, resp, body|
+  #   on :success do |a_url, resp, prior_url|
   #     puts a_url
-  #     puts body
+  #     puts resp.body
   #   end
   #
-  #   on :any do |a_url, resp|
+  #   on :every do |a_url, resp, prior_url|
   #     puts "Given this code: #{resp.code}"
   #   end
   def on(code, p = nil, &block)
     f = p ? p : block
     case code
     when Fixnum
-      @callbacks[success_or_failure(code)][code] = f
+      @callbacks[code] = f
     else
-      if :any == code.to_sym
-        @callbacks[:any] = f
-      else
-        @callbacks[code.to_sym][:any] = f
-      end
+      @callbacks[code.to_sym] = f
     end
   end
 
+  # Run before the HTTP request. Given the URL as a string.
+  #   setup do |a_url|
+  #     headers['Cookies'] = 'user_id=1;admin=true'
+  #   end
+  def setup(p = nil, &block)
+    @setup = p ? p : block
+  end
+
+  # Run last, once for each page. Given the URL as a string.
+  def teardown(p = nil, &block)
+    @teardown = p ? p : block
+  end
+
+  # Use like a hash:
+  #   headers['Cookies'] = 'user_id=1;password=btrross3'
+  def headers
+    HeaderSetter.new(self)
+  end
+
+  def raw_headers #:nodoc:
+    @headers
+  end
+  def raw_headers=(v) #:nodoc:
+    @headers = v
+  end
+
+  # Reset the headers hash.
+  def clear_headers
+    @headers = {}
+  end
+
   def start! #:nodoc:
     next_urls = @next_urls
     begin
-      next_urls = next_urls.map do |a_url|
-        [a_url, (URI.parse(a_url) rescue nil)]
-      end.select do |a_url, parsed_url|
-        allowable_url?(a_url, parsed_url)
-      end.map do |a_url, parsed_url|
-        get_page(parsed_url) do |response|
-          do_callbacks(a_url, response)
-          generate_next_urls(a_url, response)
-        end
-      end.flatten
+      tmp_n_u = {}
+      next_urls.each do |prior_url, urls|
+        urls.map do |a_url|
+          [a_url, (URI.parse(a_url) rescue nil)]
+        end.select do |a_url, parsed_url|
+          allowable_url?(a_url, parsed_url)
+        end.each do |a_url, parsed_url|
+          @setup.call(a_url) unless @setup.nil?
+          get_page(parsed_url) do |response|
+            do_callbacks(a_url, response, prior_url)
+            tmp_n_u[a_url] = generate_next_urls(a_url, response)
+          end
+          @teardown.call(a_url) unless @teardown.nil?
+        end
+      end
+      next_urls = tmp_n_u
     end while !next_urls.empty?
   end
 
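
start! now walks a hash of {prior_url => [urls]} rather than a flat array, and for each allowed URL it runs the setup hook, performs the request, fires the matching callbacks, and finally runs the teardown hook. Because headers returns a write-through setter, headers assigned inside setup are sent with that page's request. A sketch of the per-page lifecycle, again using a placeholder host:

    require 'spider'

    Spider.start_at('http://example.com/') do |s|
      # Runs before each HTTP request; headers set here ride along with it.
      s.setup do |a_url|
        s.headers['User-Agent'] = 'Ruby Spider 1.0'
      end

      s.on(:every) { |a_url, resp, prior_url| puts "#{resp.code} #{a_url}" }

      # Runs last, once per page; a convenient place to reset per-page state.
      s.teardown do |a_url|
        s.clear_headers
      end
    end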
@@ -196,7 +231,8 @@ class SpiderInstance
       http = Net::HTTP.new(parsed_url.host, parsed_url.port)
       http.use_ssl = parsed_url.scheme == 'https'
       # Uses start because http.finish cannot be called.
-      r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri))}
+      r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri,
+                                                       @headers))}
       if r.redirect?
         get_page(URI.parse(r['Location']), &block)
       else
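
The only change to get_page is that the accumulated @headers hash is now passed as the optional second argument (the initial header hash) of Net::HTTP::Get.new, so whatever setup put into headers is sent with the request. Outside the spider, the same stock Net::HTTP idiom looks roughly like this, with a placeholder URL:

    require 'net/http'
    require 'uri'

    url = URI.parse('http://example.com/index.html')
    headers = {'User-Agent' => 'Ruby Spider 1.0'}   # e.g. filled in by a setup hook

    resp = Net::HTTP.new(url.host, url.port).start do |http|
      http.request(Net::HTTP::Get.new(url.request_uri, headers))
    end
    puts resp.code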
@@ -208,16 +244,13 @@ class SpiderInstance
     end
   end
 
-  def do_callbacks(a_url, resp) #:nodoc:
-    @callbacks[:any].call(a_url, resp) if @callbacks[:any]
-    if resp.success?
-      cb_branch = @callbacks[:success]
-      cb_branch[:any].call(a_url, resp.code, resp, resp.body) if cb_branch[:any]
-      cb_branch[resp.code].call(a_url, resp.code, resp, resp.body) if cb_branch[resp.code]
-    else
-      cb_branch = @callbacks[:failure]
-      cb_branch[:any].call(a_url, resp.code) if cb_branch[:any]
-      cb_branch[resp.code].call(a_url, resp.code) if cb_branch[resp.code]
+  def do_callbacks(a_url, resp, prior_url) #:nodoc:
+    cbs = [@callbacks[:every],
+           resp.success? ? @callbacks[:success] : @callbacks[:failure],
+           @callbacks[resp.code]]
+
+    cbs.each do |cb|
+      cb.call(a_url, resp, prior_url) if cb
     end
   end
 
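
With the flattened @callbacks hash, at most three handlers run per response, always in the same order: :every, then :success or :failure depending on resp.success?, then the handler registered for that exact status code, each receiving the same three arguments. For a 404, handlers registered like this inside a Spider.start_at block (s being the yielded SpiderInstance) would fire top to bottom:

    s.on(:every)   { |a_url, resp, prior_url| puts "fetched #{a_url}" }
    s.on(:failure) { |a_url, resp, prior_url| puts "failed  #{a_url}" }
    s.on(404)      { |a_url, resp, prior_url| puts "missing #{a_url}" }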
@@ -254,7 +287,16 @@ class SpiderInstance
     end.compact
   end
 
-  def remove_trailing_slash(s)
+  def remove_trailing_slash(s) #:nodoc:
     s.sub(%r{/*$},'')
   end
+
+  class HeaderSetter #:nodoc:
+    def initialize(si)
+      @si = si
+    end
+    def []=(k,v)
+      @si.raw_headers = @si.raw_headers.merge({k => v})
+    end
+  end
 end
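
HeaderSetter is the small write-through object returned by SpiderInstance#headers: its []= merges each new pair into the instance's raw headers (NilClass#merge covers a hash that is still unset). A rough illustration, bearing in mind that a SpiderInstance is normally constructed for you by Spider.start_at and the seed URL here is a placeholder:

    require 'spider'

    si = SpiderInstance.new({nil => ['http://example.com/']})
    si.headers['User-Agent'] = 'Ruby Spider 1.0'
    si.headers['Accept'] = 'text/html'
    si.raw_headers    # => {'User-Agent' => 'Ruby Spider 1.0', 'Accept' => 'text/html'}

    si.clear_headers
    si.raw_headers    # => {}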
@@ -23,6 +23,80 @@ def null_logger
 end
 
 describe 'SpiderInstance' do
+  it 'should call the "setup" callback before loading the Web page' do
+    mock_successful_http
+    @on_called = false
+    @before_called = false
+    si = SpiderInstance.new({nil => ['http://example.com/']})
+    si.stubs(:allowed?).returns(true)
+    si.stubs(:generate_next_urls).returns([])
+    si.setup { |*a| @before_called = Time.now }
+    si.on(:every) { |*a| @on_called = Time.now }
+    si.start!
+    @on_called.should_not be_false
+    @before_called.should_not be_false
+    @before_called.should_not be_false
+    @before_called.should < @on_called
+  end
+
+  it 'should call the "teardown" callback after running all other callbacks' do
+    mock_successful_http
+    @on_called = false
+    @after_called = false
+    si = SpiderInstance.new({nil => ['http://example.com/']})
+    si.stubs(:allowed?).returns(true)
+    si.stubs(:generate_next_urls).returns([])
+    si.on(:every) { |*a| @on_called = Time.now }
+    si.teardown { |*a| @after_called = Time.now }
+    si.start!
+    @on_called.should_not be_false
+    @after_called.should_not be_false
+    @after_called.should_not be_false
+    @after_called.should > @on_called
+  end
+
+  it 'should pass headers set by a setup handler to the HTTP request' do
+    mock_successful_http
+    Net::HTTP::Get.expects(:new).with('/foo',{'X-Header-Set' => 'True'})
+    si = SpiderInstance.new(nil => ['http://example.com/foo'])
+    si.stubs(:allowable_url?).returns(true)
+    si.stubs(:generate_next_urls).returns([])
+    si.setup do |a_url|
+      si.headers['X-Header-Set'] = 'True'
+    end
+    si.teardown do |a_url|
+      si.clear_headers
+    end
+    si.start!
+  end
+
+  it 'should allow for a proxy' # fill in more
+
+  it 'should call the :every callback with the current URL, the response, and the prior URL' do
+    mock_successful_http
+    callback_arguments_on(:every)
+  end
+
+  it 'should call the :success callback with the current URL, the request, and the prior URL' do
+    mock_successful_http
+    callback_arguments_on(:success)
+  end
+
+  it 'should call the :failure callback with the current URL, the request, and the prior URL' do
+    mock_failed_http
+    callback_arguments_on(:failure)
+  end
+
+  it 'should call the HTTP status error code callback with the current URL, the request, and the prior URL' do
+    mock_failed_http
+    callback_arguments_on(404)
+  end
+
+  it 'should call the HTTP status success code callback with the current URL, the request, and the prior URL' do
+    mock_successful_http
+    callback_arguments_on(200)
+  end
+
   # Bug reported by John Nagro, using the example source http://eons.com/
   # had to change line 192; uses request_uri now instead of path.
   it 'should handle query URLs without a path' do
@@ -33,7 +107,7 @@ describe 'SpiderInstance' do
                                     :AccessLog => [])
     server.mount('/', QueryServlet)
     Thread.new {server.start}
-    si = SpiderInstance.new([u])
+    si = SpiderInstance.new({nil => [u]})
     si.get_page(u_p) do
       @block_called = true
     end
@@ -47,7 +121,7 @@ describe 'SpiderInstance' do
     u_p = URI.parse(u)
     @redirect_handled = false
     mock_redirect_http
-    si = SpiderInstance.new([u])
+    si = SpiderInstance.new({nil => [u]})
     si.get_page(u_p) do
       @redirect_handled = true
     end
@@ -66,7 +140,7 @@ describe 'SpiderInstance' do
                                     :SSLComment => 'Comment of some sort')
     server.mount('/', QueryServlet)
     Thread.new {server.start}
-    si = SpiderInstance.new([u])
+    si = SpiderInstance.new({nil => [u]})
     si.get_page(u_p) { @page_called = true }
     server.shutdown
     @page_called.should be_true
@@ -79,7 +153,7 @@ describe 'SpiderInstance' do
     u_p = URI.parse(u)
     http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
     Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
-    si = SpiderInstance.new([u])
+    si = SpiderInstance.new({nil => [u]})
     si.expects(:allowable_url?).with(u, u_p).returns(false)
     si.expects(:get_page).times(0)
     si.start!
@@ -90,9 +164,8 @@ describe 'SpiderInstance' do
     u_p = URI.parse(u)
     http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
     Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
-    si = SpiderInstance.new([u])
+    si = SpiderInstance.new({nil => [u]})
     si.expects(:allowable_url?).with(u, u_p).returns(true)
-    si.expects(:allowable_url?).with(nil, nil).returns(false)
     si.expects(:get_page).with(URI.parse(u))
     si.start!
   end
@@ -106,25 +179,25 @@ describe 'SpiderInstance' do
     robot_rules.expects(:parse).with('http://example.com:80/robots.txt',
                                      'robots.txt content')
     robot_rules.expects(:allowed?).with('http://example.com/').returns(false)
-    si = SpiderInstance.new(['http://example.com/'], [], robot_rules, [])
+    si = SpiderInstance.new({nil => ['http://example.com/']}, [], robot_rules, [])
     allowable = si.allowable_url?('http://example.com/',
                                   URI.parse('http://example.com/'))
-    allowable.should == false
+    allowable.should be_false
   end
 
   it 'should disallow URLs when they fail any url_check' do
-    si = SpiderInstance.new(['http://example.com/'])
+    si = SpiderInstance.new({nil => ['http://example.com/']})
     si.stubs(:allowed?).returns(true)
     si.add_url_check { |a_url| false }
     allowable = si.allowable_url?('http://example.com/',
                                   URI.parse('http://example.com/'))
-    allowable.should == false
+    allowable.should be_false
   end
 
   it 'should support multiple url_checks' do
     @first_url_check = false
     @second_url_check = false
-    si = SpiderInstance.new(['http://example.com/'])
+    si = SpiderInstance.new({nil => ['http://example.com/']})
     si.stubs(:allowed?).returns(true)
     si.add_url_check do |a_url|
       @first_url_check = true
@@ -136,7 +209,7 @@ describe 'SpiderInstance' do
     end
     allowable = si.allowable_url?('http://example.com/',
                                   URI.parse('http://example.com/'))
-    allowable.should == false
+    allowable.should be_false
     @first_url_check.should be_true
     @second_url_check.should be_true
   end
@@ -144,17 +217,17 @@ describe 'SpiderInstance' do
   it 'should avoid cycles' do
     u = 'http://example.com/'
     u_p = URI.parse(u)
-    si = SpiderInstance.new([u], [u_p])
+    si = SpiderInstance.new({nil => [u]}, [u_p])
     si.stubs(:allowed?).returns(true)
     allowable = si.allowable_url?(u, u_p)
-    allowable.should == false
+    allowable.should be_false
     u_p.should_not be_nil
   end
 
   it 'should call the 404 handler for 404s' do
     @proc_called = false
     mock_failed_http
-    si = SpiderInstance.new(['http://example.com/'])
+    si = SpiderInstance.new({nil => ['http://example.com/']})
     si.stubs(:allowed?).returns(true)
     si.stubs(:generate_next_urls).returns([])
     si.on(404) {|*a| @proc_called = true}
165
238
  it 'should call the :success handler on success' do
166
239
  @proc_called = false
167
240
  mock_successful_http
168
- si = SpiderInstance.new(['http://example.com/'])
241
+ si = SpiderInstance.new({nil => ['http://example.com/']})
169
242
  si.stubs(:allowed?).returns(true)
170
243
  si.stubs(:generate_next_urls).returns([])
171
244
  si.on(:success) {|*a| @proc_called = true}
@@ -176,19 +249,19 @@ describe 'SpiderInstance' do
176
249
  it 'should not call the :success handler on failure' do
177
250
  @proc_called = false
178
251
  mock_failed_http
179
- si = SpiderInstance.new(['http://example.com/'])
252
+ si = SpiderInstance.new({nil => ['http://example.com/']})
180
253
  si.stubs(:allowed?).returns(true)
181
254
  si.stubs(:generate_next_urls).returns([])
182
255
  si.on(:success) {|*a| @proc_called = true}
183
256
  si.start!
184
- @proc_called.should == false
257
+ @proc_called.should be_false
185
258
  end
186
259
 
187
260
  it 'should call the :success handler and the 200 handler on 200' do
188
261
  @proc_200_called = false
189
262
  @proc_success_called = false
190
263
  mock_successful_http
191
- si = SpiderInstance.new(['http://example.com/'])
264
+ si = SpiderInstance.new({nil => ['http://example.com/']})
192
265
  si.stubs(:allowed?).returns(true)
193
266
  si.stubs(:generate_next_urls).returns([])
194
267
  si.on(:success) {|*a| @proc_success_called = true}
@@ -201,18 +274,18 @@ describe 'SpiderInstance' do
201
274
  it 'should not call the :failure handler on success' do
202
275
  @proc_called = false
203
276
  mock_successful_http
204
- si = SpiderInstance.new(['http://example.com/'])
277
+ si = SpiderInstance.new({nil => ['http://example.com/']})
205
278
  si.stubs(:allowed?).returns(true)
206
279
  si.stubs(:generate_next_urls).returns([])
207
280
  si.on(:failure) {|*a| @proc_called = true}
208
281
  si.start!
209
- @proc_called.should == false
282
+ @proc_called.should be_false
210
283
  end
211
284
 
212
285
  it 'should call the :failure handler on failure' do
213
286
  @proc_called = false
214
287
  mock_failed_http
215
- si = SpiderInstance.new(['http://example.com/'])
288
+ si = SpiderInstance.new({nil => ['http://example.com/']})
216
289
  si.stubs(:allowed?).returns(true)
217
290
  si.stubs(:generate_next_urls).returns([])
218
291
  si.on(:failure) {|*a| @proc_called = true}
@@ -224,7 +297,7 @@ describe 'SpiderInstance' do
224
297
  @proc_404_called = false
225
298
  @proc_failure_called = false
226
299
  mock_failed_http
227
- si = SpiderInstance.new(['http://example.com/'])
300
+ si = SpiderInstance.new({nil => ['http://example.com/']})
228
301
  si.stubs(:allowed?).returns(true)
229
302
  si.stubs(:generate_next_urls).returns([])
230
303
  si.on(:failure) {|*a| @proc_failure_called = true}
@@ -234,13 +307,13 @@ describe 'SpiderInstance' do
234
307
  @proc_failure_called.should be_true
235
308
  end
236
309
 
237
- it 'should call the :any handler even when a handler for the error code is defined' do
310
+ it 'should call the :every handler even when a handler for the error code is defined' do
238
311
  @any_called = false
239
312
  mock_successful_http
240
- si = SpiderInstance.new(['http://example.com/'])
313
+ si = SpiderInstance.new({nil => ['http://example.com/']})
241
314
  si.stubs(:allowed?).returns(true)
242
315
  si.stubs(:generate_next_urls).returns([])
243
- si.on(:any) { |*a| @any_called = true }
316
+ si.on(:every) { |*a| @any_called = true }
244
317
  si.on(202) {|*a|}
245
318
  si.start!
246
319
  @any_called.should be_true
@@ -249,10 +322,10 @@ describe 'SpiderInstance' do
249
322
  it 'should support a block as a response handler' do
250
323
  @proc_called = false
251
324
  mock_successful_http
252
- si = SpiderInstance.new(['http://example.com/'])
325
+ si = SpiderInstance.new({nil => ['http://example.com/']})
253
326
  si.stubs(:allowed?).returns(true)
254
327
  si.stubs(:generate_next_urls).returns([])
255
- si.on(:any) { |*a| @proc_called = true }
328
+ si.on(:every) { |*a| @proc_called = true }
256
329
  si.start!
257
330
  @proc_called.should be_true
258
331
  end
@@ -260,10 +333,10 @@ describe 'SpiderInstance' do
260
333
  it 'should support a proc as a response handler' do
261
334
  @proc_called = false
262
335
  mock_successful_http
263
- si = SpiderInstance.new(['http://example.com/'])
336
+ si = SpiderInstance.new({nil => ['http://example.com/']})
264
337
  si.stubs(:allowed?).returns(true)
265
338
  si.stubs(:generate_next_urls).returns([])
266
- si.on(:any, Proc.new { |*a| @proc_called = true })
339
+ si.on(:every, Proc.new { |*a| @proc_called = true })
267
340
  si.start!
268
341
  @proc_called.should be_true
269
342
  end
@@ -297,4 +370,16 @@ describe 'SpiderInstance' do
297
370
  yields(mock(:request => http_req2)).returns(http_req2)
298
371
  Net::HTTP.expects(:new).times(2).returns(http_obj).then.returns(http_obj2)
299
372
  end
373
+
374
+ def callback_arguments_on(code)
375
+ si = SpiderInstance.new('http://foo.com/' => ['http://example.com/'])
376
+ si.stubs(:allowed?).returns(true)
377
+ si.stubs(:generate_next_urls).returns([])
378
+ si.on(code) do |a_url, resp, prior_url|
379
+ a_url.should == 'http://example.com/'
380
+ resp.should_not be_nil
381
+ prior_url.should == 'http://foo.com/'
382
+ end
383
+ si.start!
384
+ end
300
385
  end