spider 0.2.1 → 0.3.0

@@ -20,7 +20,6 @@
  <div id="index">
  <h1 class="section-bar">Classes</h1>
  <div id="index-entries">
- <a href="classes/Net.html">Net</a><br />
  <a href="classes/Spider.html">Spider</a><br />
  <a href="classes/SpiderInstance.html">SpiderInstance</a><br />
  </div>
@@ -20,6 +20,7 @@
  <div id="index">
  <h1 class="section-bar">Files</h1>
  <div id="index-entries">
+ <a href="files/README.html">README</a><br />
  <a href="files/lib/spider_rb.html">lib/spider.rb</a><br />
  </div>
  </div>
@@ -21,9 +21,12 @@
  <h1 class="section-bar">Methods</h1>
  <div id="index-entries">
  <a href="classes/SpiderInstance.html#M000001">add_url_check (SpiderInstance)</a><br />
+ <a href="classes/SpiderInstance.html#M000006">clear_headers (SpiderInstance)</a><br />
+ <a href="classes/SpiderInstance.html#M000005">headers (SpiderInstance)</a><br />
  <a href="classes/SpiderInstance.html#M000002">on (SpiderInstance)</a><br />
- <a href="classes/SpiderInstance.html#M000003">remove_trailing_slash (SpiderInstance)</a><br />
- <a href="classes/Spider.html#M000004">start_at (Spider)</a><br />
+ <a href="classes/SpiderInstance.html#M000003">setup (SpiderInstance)</a><br />
+ <a href="classes/Spider.html#M000007">start_at (Spider)</a><br />
+ <a href="classes/SpiderInstance.html#M000004">teardown (SpiderInstance)</a><br />
  </div>
  </div>
  </body>
@@ -29,15 +29,21 @@ require 'uri'
  require 'net/http'
  require 'net/https'

- class Net::HTTPResponse #:nodoc:
- def success?; false; end
- def redirect?; false; end
- end
- class Net::HTTPSuccess #:nodoc:
- def success?; true; end
+ module Net #:nodoc:
+ class HTTPResponse #:nodoc:
+ def success?; false; end
+ def redirect?; false; end
+ end
+ class HTTPSuccess #:nodoc:
+ def success?; true; end
+ end
+ class HTTPRedirection #:nodoc:
+ def redirect?; true; end
+ end
  end
- class Net::HTTPRedirection #:nodoc:
- def redirect?; true; end
+
+ class NilClass #:nodoc:
+ def merge(h); h; end
  end

  # A spidering library for Ruby. Handles robots.txt, scraping, finding more
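The monkeypatches are the same queries 0.2.1 relied on, now gathered under a single module Net block and joined by NilClass#merge for the new header plumbing. A sketch of the behaviour they provide (placeholder URL; not additional library API):

    require 'net/http'
    require 'uri'

    resp = Net::HTTP.get_response(URI.parse('http://example.com/'))
    resp.success?    # true only for Net::HTTPSuccess responses (2xx), otherwise false
    resp.redirect?   # true only for Net::HTTPRedirection responses (3xx), otherwise false
    nil.merge({'X-Header' => '1'})   # => {'X-Header' => '1'}, via the NilClass#merge patch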
@@ -52,22 +58,22 @@ class Spider
  # a_url =~ %r{^http://mike-burns.com.*}
  # end
  #
- # s.on 404 do |a_url, err_code|
+ # s.on 404 do |a_url, resp, prior_url|
  # puts "URL not found: #{a_url}"
  # end
  #
- # s.on :success do |a_url, code, headers, body|
- # puts "body: #{body}"
+ # s.on :success do |a_url, resp, prior_url|
+ # puts "body: #{resp.body}"
  # end
  #
- # s.on :any do |a_url, resp|
+ # s.on :every do |a_url, resp, prior_url|
  # puts "URL returned anything: #{a_url} with this code #{resp.code}"
  # end
  # end

  def self.start_at(a_url, &block)
  rules = RobotRules.new('Ruby Spider 1.0')
- a_spider = SpiderInstance.new([a_url], [], rules, [])
+ a_spider = SpiderInstance.new({nil => a_url}, [], rules, [])
  block.call(a_spider)
  a_spider.start!
  end
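The documentation changes above amount to one rule: in 0.3.0 every handler, whatever its trigger, receives the same three arguments: the URL as a string, the Net::HTTPResponse, and the prior (referring) URL. A single response can fire up to three handlers, in order: :every, then :success or :failure, then the exact status code (see do_callbacks further down). A minimal sketch of a 0.3.0-style crawl assembled from the documented calls above; the start URL and output are illustrative only:

    require 'spider'

    Spider.start_at('http://mike-burns.com/') do |s|
      # Stay on the same site.
      s.add_url_check { |a_url| a_url =~ %r{^http://mike-burns.com.*} }

      # Numeric triggers fire for that exact status code.
      s.on(404) { |a_url, resp, prior_url| puts "#{prior_url} links to missing #{a_url}" }

      # :success fires for successful responses; the body now comes from resp.
      s.on(:success) { |a_url, resp, prior_url| puts "#{a_url}: #{resp.body.length} bytes" }

      # :every replaces 0.2.1's :any and fires for every page fetched.
      s.on(:every) { |a_url, resp, prior_url| puts "#{resp.code} #{a_url}" }
    end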
@@ -77,11 +83,14 @@ class SpiderInstance
  def initialize(next_urls, seen = [], rules = nil, robots_seen = []) #:nodoc:
  @url_checks = []
  @cache = :memory
- @callbacks = {:any => lambda {}, :success => {}, :failure => {}}
+ @callbacks = {}
  @next_urls = next_urls
  @seen = seen
  @rules = rules || RobotRules.new('Ruby Spider 1.0')
  @robots_seen = robots_seen
+ @headers = {}
+ @setup = nil
+ @teardown = nil
  end

  # Add a predicate that determines whether to continue down this URL's path.
@@ -99,62 +108,88 @@ class SpiderInstance
  @cache = cache_type
  end

- # Add a response handler. A response handler's trigger can be :any, :success,
- # :failure, or any HTTP status code. The handler itself can be either a Proc
- # or a block. The arguments to the block depends on the trigger:
+ # Add a response handler. A response handler's trigger can be :every,
+ # :success, :failure, or any HTTP status code. The handler itself can be
+ # either a Proc or a block.
  #
- # If the trigger is :any, the arguments are the URL as a string and an
- # instance of Net::HTTPResponse.
+ # The arguments to the block are: the URL as a string, an instance of
+ # Net::HTTPResponse, and the prior URL as a string.
  #
- # If the trigger is :success or any HTTP status code that represents a
- # successful result, the arguments are the URL as a string, the HTTP status
- # code, an instance of Net::HTTPSuccess, and the body of the result as a
- # string.
- #
- # If the trigger is :failure or any HTTP status code that represents a failed
- # result, the arguments are the URL as a string and the HTTP status code.
  #
  # For example:
  #
- # on 404 do |a_url, code|
+ # on 404 do |a_url, resp, prior_url|
  # puts "URL not found: #{a_url}"
  # end
  #
- # on :success do |a_url, code, resp, body|
+ # on :success do |a_url, resp, prior_url|
  # puts a_url
- # puts body
+ # puts resp.body
  # end
  #
- # on :any do |a_url, resp|
+ # on :every do |a_url, resp, prior_url|
  # puts "Given this code: #{resp.code}"
  # end
  def on(code, p = nil, &block)
  f = p ? p : block
  case code
  when Fixnum
- @callbacks[success_or_failure(code)][code] = f
+ @callbacks[code] = f
  else
- if :any == code.to_sym
- @callbacks[:any] = f
- else
- @callbacks[code.to_sym][:any] = f
- end
+ @callbacks[code.to_sym] = f
  end
  end

+ # Run before the HTTP request. Given the URL as a string.
+ # setup do |a_url|
+ # headers['Cookies'] = 'user_id=1;admin=true'
+ # end
+ def setup(p = nil, &block)
+ @setup = p ? p : block
+ end
+
+ # Run last, once for each page. Given the URL as a string.
+ def teardown(p = nil, &block)
+ @teardown = p ? p : block
+ end
+
+ # Use like a hash:
+ # headers['Cookies'] = 'user_id=1;password=btrross3'
+ def headers
+ HeaderSetter.new(self)
+ end
+
+ def raw_headers #:nodoc:
+ @headers
+ end
+ def raw_headers=(v) #:nodoc:
+ @headers = v
+ end
+
+ # Reset the headers hash.
+ def clear_headers
+ @headers = {}
+ end
+
  def start! #:nodoc:
  next_urls = @next_urls
  begin
- next_urls = next_urls.map do |a_url|
- [a_url, (URI.parse(a_url) rescue nil)]
- end.select do |a_url, parsed_url|
- allowable_url?(a_url, parsed_url)
- end.map do |a_url, parsed_url|
- get_page(parsed_url) do |response|
- do_callbacks(a_url, response)
- generate_next_urls(a_url, response)
- end
- end.flatten
+ tmp_n_u = {}
+ next_urls.each do |prior_url, urls|
+ urls.map do |a_url|
+ [a_url, (URI.parse(a_url) rescue nil)]
+ end.select do |a_url, parsed_url|
+ allowable_url?(a_url, parsed_url)
+ end.each do |a_url, parsed_url|
+ @setup.call(a_url) unless @setup.nil?
+ get_page(parsed_url) do |response|
+ do_callbacks(a_url, response, prior_url)
+ tmp_n_u[a_url] = generate_next_urls(a_url, response)
+ end
+ @teardown.call(a_url) unless @teardown.nil?
+ end
+ end
+ next_urls = tmp_n_u
  end while !next_urls.empty?
  end

@@ -196,7 +231,8 @@ class SpiderInstance
  http = Net::HTTP.new(parsed_url.host, parsed_url.port)
  http.use_ssl = parsed_url.scheme == 'https'
  # Uses start because http.finish cannot be called.
- r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri))}
+ r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri,
+ @headers))}
  if r.redirect?
  get_page(URI.parse(r['Location']), &block)
  else
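With get_page now handing @headers to Net::HTTP::Get, the setup and teardown hooks added above give each request its own headers: setup runs before the fetch, teardown runs last for each page, and headers behaves like a hash whose contents go out with the GET. A short sketch of the intended use, based on the documented methods and the specs further down; the header name and value are placeholders:

    Spider.start_at('http://example.com/') do |s|
      s.setup do |a_url|
        # Runs before the request for a_url; anything set here is sent with the GET.
        s.headers['Cookie'] = 'user_id=1'
      end

      s.teardown do |a_url|
        # Runs after the page is processed; clearing avoids leaking headers into later requests.
        s.clear_headers
      end
    end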
@@ -208,16 +244,13 @@ class SpiderInstance
  end
  end

- def do_callbacks(a_url, resp) #:nodoc:
- @callbacks[:any].call(a_url, resp) if @callbacks[:any]
- if resp.success?
- cb_branch = @callbacks[:success]
- cb_branch[:any].call(a_url, resp.code, resp, resp.body) if cb_branch[:any]
- cb_branch[resp.code].call(a_url, resp.code, resp, resp.body) if cb_branch[resp.code]
- else
- cb_branch = @callbacks[:failure]
- cb_branch[:any].call(a_url, resp.code) if cb_branch[:any]
- cb_branch[resp.code].call(a_url, resp.code) if cb_branch[resp.code]
+ def do_callbacks(a_url, resp, prior_url) #:nodoc:
+ cbs = [@callbacks[:every],
+ resp.success? ? @callbacks[:success] : @callbacks[:failure],
+ @callbacks[resp.code]]
+
+ cbs.each do |cb|
+ cb.call(a_url, resp, prior_url) if cb
  end
  end

@@ -254,7 +287,16 @@ class SpiderInstance
  end.compact
  end

- def remove_trailing_slash(s)
+ def remove_trailing_slash(s) #:nodoc:
  s.sub(%r{/*$},'')
  end
+
+ class HeaderSetter #:nodoc:
+ def initialize(si)
+ @si = si
+ end
+ def []=(k,v)
+ @si.raw_headers = @si.raw_headers.merge({k => v})
+ end
+ end
  end
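Note that headers never hands out the underlying hash: it returns a fresh HeaderSetter proxy whose []= rebuilds the hash through raw_headers= and merge (the NilClass#merge patch near the top of the diff makes nil.merge(h) return h, so a missing hash merges cleanly). In effect, under the code shown here:

    si.headers['X-Header-Set'] = 'True'
    # behaves like:
    si.raw_headers = si.raw_headers.merge({'X-Header-Set' => 'True'})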
@@ -23,6 +23,80 @@ def null_logger
  end

  describe 'SpiderInstance' do
+ it 'should call the "setup" callback before loading the Web page' do
+ mock_successful_http
+ @on_called = false
+ @before_called = false
+ si = SpiderInstance.new({nil => ['http://example.com/']})
+ si.stubs(:allowed?).returns(true)
+ si.stubs(:generate_next_urls).returns([])
+ si.setup { |*a| @before_called = Time.now }
+ si.on(:every) { |*a| @on_called = Time.now }
+ si.start!
+ @on_called.should_not be_false
+ @before_called.should_not be_false
+ @before_called.should_not be_false
+ @before_called.should < @on_called
+ end
+
+ it 'should call the "teardown" callback after running all other callbacks' do
+ mock_successful_http
+ @on_called = false
+ @after_called = false
+ si = SpiderInstance.new({nil => ['http://example.com/']})
+ si.stubs(:allowed?).returns(true)
+ si.stubs(:generate_next_urls).returns([])
+ si.on(:every) { |*a| @on_called = Time.now }
+ si.teardown { |*a| @after_called = Time.now }
+ si.start!
+ @on_called.should_not be_false
+ @after_called.should_not be_false
+ @after_called.should_not be_false
+ @after_called.should > @on_called
+ end
+
+ it 'should pass headers set by a setup handler to the HTTP request' do
+ mock_successful_http
+ Net::HTTP::Get.expects(:new).with('/foo',{'X-Header-Set' => 'True'})
+ si = SpiderInstance.new(nil => ['http://example.com/foo'])
+ si.stubs(:allowable_url?).returns(true)
+ si.stubs(:generate_next_urls).returns([])
+ si.setup do |a_url|
+ si.headers['X-Header-Set'] = 'True'
+ end
+ si.teardown do |a_url|
+ si.clear_headers
+ end
+ si.start!
+ end
+
+ it 'should allow for a proxy' # fill in more
+
+ it 'should call the :every callback with the current URL, the response, and the prior URL' do
+ mock_successful_http
+ callback_arguments_on(:every)
+ end
+
+ it 'should call the :success callback with the current URL, the request, and the prior URL' do
+ mock_successful_http
+ callback_arguments_on(:success)
+ end
+
+ it 'should call the :failure callback with the current URL, the request, and the prior URL' do
+ mock_failed_http
+ callback_arguments_on(:failure)
+ end
+
+ it 'should call the HTTP status error code callback with the current URL, the request, and the prior URL' do
+ mock_failed_http
+ callback_arguments_on(404)
+ end
+
+ it 'should call the HTTP status success code callback with the current URL, the request, and the prior URL' do
+ mock_successful_http
+ callback_arguments_on(200)
+ end
+
  # Bug reported by John Nagro, using the example source http://eons.com/
  # had to change line 192; uses request_uri now instead of path.
  it 'should handle query URLs without a path' do
@@ -33,7 +107,7 @@ describe 'SpiderInstance' do
  :AccessLog => [])
  server.mount('/', QueryServlet)
  Thread.new {server.start}
- si = SpiderInstance.new([u])
+ si = SpiderInstance.new({nil => [u]})
  si.get_page(u_p) do
  @block_called = true
  end
@@ -47,7 +121,7 @@ describe 'SpiderInstance' do
  u_p = URI.parse(u)
  @redirect_handled = false
  mock_redirect_http
- si = SpiderInstance.new([u])
+ si = SpiderInstance.new({nil => [u]})
  si.get_page(u_p) do
  @redirect_handled = true
  end
@@ -66,7 +140,7 @@ describe 'SpiderInstance' do
  :SSLComment => 'Comment of some sort')
  server.mount('/', QueryServlet)
  Thread.new {server.start}
- si = SpiderInstance.new([u])
+ si = SpiderInstance.new({nil => [u]})
  si.get_page(u_p) { @page_called = true }
  server.shutdown
  @page_called.should be_true
@@ -79,7 +153,7 @@ describe 'SpiderInstance' do
  u_p = URI.parse(u)
  http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
  Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
- si = SpiderInstance.new([u])
+ si = SpiderInstance.new({nil => [u]})
  si.expects(:allowable_url?).with(u, u_p).returns(false)
  si.expects(:get_page).times(0)
  si.start!
@@ -90,9 +164,8 @@ describe 'SpiderInstance' do
  u_p = URI.parse(u)
  http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
  Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
- si = SpiderInstance.new([u])
+ si = SpiderInstance.new({nil => [u]})
  si.expects(:allowable_url?).with(u, u_p).returns(true)
- si.expects(:allowable_url?).with(nil, nil).returns(false)
  si.expects(:get_page).with(URI.parse(u))
  si.start!
  end
@@ -106,25 +179,25 @@ describe 'SpiderInstance' do
  robot_rules.expects(:parse).with('http://example.com:80/robots.txt',
  'robots.txt content')
  robot_rules.expects(:allowed?).with('http://example.com/').returns(false)
- si = SpiderInstance.new(['http://example.com/'], [], robot_rules, [])
+ si = SpiderInstance.new({nil => ['http://example.com/']}, [], robot_rules, [])
  allowable = si.allowable_url?('http://example.com/',
  URI.parse('http://example.com/'))
- allowable.should == false
+ allowable.should be_false
  end

  it 'should disallow URLs when they fail any url_check' do
- si = SpiderInstance.new(['http://example.com/'])
+ si = SpiderInstance.new({nil => ['http://example.com/']})
  si.stubs(:allowed?).returns(true)
  si.add_url_check { |a_url| false }
  allowable = si.allowable_url?('http://example.com/',
  URI.parse('http://example.com/'))
- allowable.should == false
+ allowable.should be_false
  end

  it 'should support multiple url_checks' do
  @first_url_check = false
  @second_url_check = false
- si = SpiderInstance.new(['http://example.com/'])
+ si = SpiderInstance.new({nil => ['http://example.com/']})
  si.stubs(:allowed?).returns(true)
  si.add_url_check do |a_url|
  @first_url_check = true
@@ -136,7 +209,7 @@ describe 'SpiderInstance' do
  end
  allowable = si.allowable_url?('http://example.com/',
  URI.parse('http://example.com/'))
- allowable.should == false
+ allowable.should be_false
  @first_url_check.should be_true
  @second_url_check.should be_true
  end
@@ -144,17 +217,17 @@ describe 'SpiderInstance' do
  it 'should avoid cycles' do
  u = 'http://example.com/'
  u_p = URI.parse(u)
- si = SpiderInstance.new([u], [u_p])
+ si = SpiderInstance.new({nil => [u]}, [u_p])
  si.stubs(:allowed?).returns(true)
  allowable = si.allowable_url?(u, u_p)
- allowable.should == false
+ allowable.should be_false
  u_p.should_not be_nil
  end

  it 'should call the 404 handler for 404s' do
  @proc_called = false
  mock_failed_http
- si = SpiderInstance.new(['http://example.com/'])
+ si = SpiderInstance.new({nil => ['http://example.com/']})
  si.stubs(:allowed?).returns(true)
  si.stubs(:generate_next_urls).returns([])
  si.on(404) {|*a| @proc_called = true}
@@ -165,7 +238,7 @@ describe 'SpiderInstance' do
  it 'should call the :success handler on success' do
  @proc_called = false
  mock_successful_http
- si = SpiderInstance.new(['http://example.com/'])
+ si = SpiderInstance.new({nil => ['http://example.com/']})
  si.stubs(:allowed?).returns(true)
  si.stubs(:generate_next_urls).returns([])
  si.on(:success) {|*a| @proc_called = true}
@@ -176,19 +249,19 @@ describe 'SpiderInstance' do
  it 'should not call the :success handler on failure' do
  @proc_called = false
  mock_failed_http
- si = SpiderInstance.new(['http://example.com/'])
+ si = SpiderInstance.new({nil => ['http://example.com/']})
  si.stubs(:allowed?).returns(true)
  si.stubs(:generate_next_urls).returns([])
  si.on(:success) {|*a| @proc_called = true}
  si.start!
- @proc_called.should == false
+ @proc_called.should be_false
  end

  it 'should call the :success handler and the 200 handler on 200' do
  @proc_200_called = false
  @proc_success_called = false
  mock_successful_http
- si = SpiderInstance.new(['http://example.com/'])
+ si = SpiderInstance.new({nil => ['http://example.com/']})
  si.stubs(:allowed?).returns(true)
  si.stubs(:generate_next_urls).returns([])
  si.on(:success) {|*a| @proc_success_called = true}
@@ -201,18 +274,18 @@ describe 'SpiderInstance' do
  it 'should not call the :failure handler on success' do
  @proc_called = false
  mock_successful_http
- si = SpiderInstance.new(['http://example.com/'])
+ si = SpiderInstance.new({nil => ['http://example.com/']})
  si.stubs(:allowed?).returns(true)
  si.stubs(:generate_next_urls).returns([])
  si.on(:failure) {|*a| @proc_called = true}
  si.start!
- @proc_called.should == false
+ @proc_called.should be_false
  end

  it 'should call the :failure handler on failure' do
  @proc_called = false
  mock_failed_http
- si = SpiderInstance.new(['http://example.com/'])
+ si = SpiderInstance.new({nil => ['http://example.com/']})
  si.stubs(:allowed?).returns(true)
  si.stubs(:generate_next_urls).returns([])
  si.on(:failure) {|*a| @proc_called = true}
@@ -224,7 +297,7 @@ describe 'SpiderInstance' do
  @proc_404_called = false
  @proc_failure_called = false
  mock_failed_http
- si = SpiderInstance.new(['http://example.com/'])
+ si = SpiderInstance.new({nil => ['http://example.com/']})
  si.stubs(:allowed?).returns(true)
  si.stubs(:generate_next_urls).returns([])
  si.on(:failure) {|*a| @proc_failure_called = true}
@@ -234,13 +307,13 @@ describe 'SpiderInstance' do
  @proc_failure_called.should be_true
  end

- it 'should call the :any handler even when a handler for the error code is defined' do
+ it 'should call the :every handler even when a handler for the error code is defined' do
  @any_called = false
  mock_successful_http
- si = SpiderInstance.new(['http://example.com/'])
+ si = SpiderInstance.new({nil => ['http://example.com/']})
  si.stubs(:allowed?).returns(true)
  si.stubs(:generate_next_urls).returns([])
- si.on(:any) { |*a| @any_called = true }
+ si.on(:every) { |*a| @any_called = true }
  si.on(202) {|*a|}
  si.start!
  @any_called.should be_true
@@ -249,10 +322,10 @@ describe 'SpiderInstance' do
  it 'should support a block as a response handler' do
  @proc_called = false
  mock_successful_http
- si = SpiderInstance.new(['http://example.com/'])
+ si = SpiderInstance.new({nil => ['http://example.com/']})
  si.stubs(:allowed?).returns(true)
  si.stubs(:generate_next_urls).returns([])
- si.on(:any) { |*a| @proc_called = true }
+ si.on(:every) { |*a| @proc_called = true }
  si.start!
  @proc_called.should be_true
  end
@@ -260,10 +333,10 @@ describe 'SpiderInstance' do
  it 'should support a proc as a response handler' do
  @proc_called = false
  mock_successful_http
- si = SpiderInstance.new(['http://example.com/'])
+ si = SpiderInstance.new({nil => ['http://example.com/']})
  si.stubs(:allowed?).returns(true)
  si.stubs(:generate_next_urls).returns([])
- si.on(:any, Proc.new { |*a| @proc_called = true })
+ si.on(:every, Proc.new { |*a| @proc_called = true })
  si.start!
  @proc_called.should be_true
  end
@@ -297,4 +370,16 @@ describe 'SpiderInstance' do
  yields(mock(:request => http_req2)).returns(http_req2)
  Net::HTTP.expects(:new).times(2).returns(http_obj).then.returns(http_obj2)
  end
+
+ def callback_arguments_on(code)
+ si = SpiderInstance.new('http://foo.com/' => ['http://example.com/'])
+ si.stubs(:allowed?).returns(true)
+ si.stubs(:generate_next_urls).returns([])
+ si.on(code) do |a_url, resp, prior_url|
+ a_url.should == 'http://example.com/'
+ resp.should_not be_nil
+ prior_url.should == 'http://foo.com/'
+ end
+ si.start!
+ end
  end