spider 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +6 -0
- data/README +4 -4
- data/doc/classes/Net/HTTPRedirection.html +144 -0
- data/doc/classes/Net/HTTPResponse.html +166 -0
- data/doc/classes/Net/HTTPSuccess.html +144 -0
- data/doc/classes/NilClass.html +144 -0
- data/doc/classes/Spider.html +12 -12
- data/doc/classes/SpiderInstance.html +109 -32
- data/doc/created.rid +1 -1
- data/doc/files/README.html +5 -5
- data/doc/files/lib/spider_rb.html +5 -5
- data/doc/fr_class_index.html +0 -1
- data/doc/fr_file_index.html +1 -0
- data/doc/fr_method_index.html +5 -2
- data/lib/spider.rb +100 -58
- data/spec/spider_instance_spec.rb +115 -30
- data/spider.gemspec +1 -1
- data/test_server/client.rb +4 -4
- metadata +7 -2
data/doc/fr_class_index.html
CHANGED
data/doc/fr_file_index.html
CHANGED
data/doc/fr_method_index.html
CHANGED
@@ -21,9 +21,12 @@
|
|
21
21
|
<h1 class="section-bar">Methods</h1>
|
22
22
|
<div id="index-entries">
|
23
23
|
<a href="classes/SpiderInstance.html#M000001">add_url_check (SpiderInstance)</a><br />
|
24
|
+
<a href="classes/SpiderInstance.html#M000006">clear_headers (SpiderInstance)</a><br />
|
25
|
+
<a href="classes/SpiderInstance.html#M000005">headers (SpiderInstance)</a><br />
|
24
26
|
<a href="classes/SpiderInstance.html#M000002">on (SpiderInstance)</a><br />
|
25
|
-
<a href="classes/SpiderInstance.html#M000003">
|
26
|
-
<a href="classes/Spider.html#
|
27
|
+
<a href="classes/SpiderInstance.html#M000003">setup (SpiderInstance)</a><br />
|
28
|
+
<a href="classes/Spider.html#M000007">start_at (Spider)</a><br />
|
29
|
+
<a href="classes/SpiderInstance.html#M000004">teardown (SpiderInstance)</a><br />
|
27
30
|
</div>
|
28
31
|
</div>
|
29
32
|
</body>
|
data/lib/spider.rb
CHANGED
@@ -29,15 +29,21 @@ require 'uri'
|
|
29
29
|
require 'net/http'
|
30
30
|
require 'net/https'
|
31
31
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
end
|
36
|
-
|
37
|
-
|
32
|
+
module Net #:nodoc:
|
33
|
+
class HTTPResponse #:nodoc:
|
34
|
+
def success?; false; end
|
35
|
+
def redirect?; false; end
|
36
|
+
end
|
37
|
+
class HTTPSuccess #:nodoc:
|
38
|
+
def success?; true; end
|
39
|
+
end
|
40
|
+
class HTTPRedirection #:nodoc:
|
41
|
+
def redirect?; true; end
|
42
|
+
end
|
38
43
|
end
|
39
|
-
|
40
|
-
|
44
|
+
|
45
|
+
class NilClass #:nodoc:
|
46
|
+
def merge(h); h; end
|
41
47
|
end
|
42
48
|
|
43
49
|
# A spidering library for Ruby. Handles robots.txt, scraping, finding more
|
@@ -52,22 +58,22 @@ class Spider
|
|
52
58
|
# a_url =~ %r{^http://mike-burns.com.*}
|
53
59
|
# end
|
54
60
|
#
|
55
|
-
# s.on 404 do |a_url,
|
61
|
+
# s.on 404 do |a_url, resp, prior_url|
|
56
62
|
# puts "URL not found: #{a_url}"
|
57
63
|
# end
|
58
64
|
#
|
59
|
-
# s.on :success do |a_url,
|
60
|
-
# puts "body: #{body}"
|
65
|
+
# s.on :success do |a_url, resp, prior_url|
|
66
|
+
# puts "body: #{resp.body}"
|
61
67
|
# end
|
62
68
|
#
|
63
|
-
# s.on :
|
69
|
+
# s.on :every do |a_url, resp, prior_url|
|
64
70
|
# puts "URL returned anything: #{a_url} with this code #{resp.code}"
|
65
71
|
# end
|
66
72
|
# end
|
67
73
|
|
68
74
|
def self.start_at(a_url, &block)
|
69
75
|
rules = RobotRules.new('Ruby Spider 1.0')
|
70
|
-
a_spider = SpiderInstance.new(
|
76
|
+
a_spider = SpiderInstance.new({nil => a_url}, [], rules, [])
|
71
77
|
block.call(a_spider)
|
72
78
|
a_spider.start!
|
73
79
|
end
|
@@ -77,11 +83,14 @@ class SpiderInstance
|
|
77
83
|
def initialize(next_urls, seen = [], rules = nil, robots_seen = []) #:nodoc:
|
78
84
|
@url_checks = []
|
79
85
|
@cache = :memory
|
80
|
-
@callbacks = {
|
86
|
+
@callbacks = {}
|
81
87
|
@next_urls = next_urls
|
82
88
|
@seen = seen
|
83
89
|
@rules = rules || RobotRules.new('Ruby Spider 1.0')
|
84
90
|
@robots_seen = robots_seen
|
91
|
+
@headers = {}
|
92
|
+
@setup = nil
|
93
|
+
@teardown = nil
|
85
94
|
end
|
86
95
|
|
87
96
|
# Add a predicate that determines whether to continue down this URL's path.
|
@@ -99,62 +108,88 @@ class SpiderInstance
|
|
99
108
|
@cache = cache_type
|
100
109
|
end
|
101
110
|
|
102
|
-
# Add a response handler. A response handler's trigger can be :
|
103
|
-
# :failure, or any HTTP status code. The handler itself can be
|
104
|
-
#
|
111
|
+
# Add a response handler. A response handler's trigger can be :every,
|
112
|
+
# :success, :failure, or any HTTP status code. The handler itself can be
|
113
|
+
# either a Proc or a block.
|
105
114
|
#
|
106
|
-
#
|
107
|
-
#
|
115
|
+
# The arguments to the block are: the URL as a string, an instance of
|
116
|
+
# Net::HTTPResponse, and the prior URL as a string.
|
108
117
|
#
|
109
|
-
# If the trigger is :success or any HTTP status code that represents a
|
110
|
-
# successful result, the arguments are the URL as a string, the HTTP status
|
111
|
-
# code, an instance of Net::HTTPSuccess, and the body of the result as a
|
112
|
-
# string.
|
113
|
-
#
|
114
|
-
# If the trigger is :failure or any HTTP status code that represents a failed
|
115
|
-
# result, the arguments are the URL as a string and the HTTP status code.
|
116
118
|
#
|
117
119
|
# For example:
|
118
120
|
#
|
119
|
-
# on 404 do |a_url,
|
121
|
+
# on 404 do |a_url, resp, prior_url|
|
120
122
|
# puts "URL not found: #{a_url}"
|
121
123
|
# end
|
122
124
|
#
|
123
|
-
# on :success do |a_url,
|
125
|
+
# on :success do |a_url, resp, prior_url|
|
124
126
|
# puts a_url
|
125
|
-
# puts body
|
127
|
+
# puts resp.body
|
126
128
|
# end
|
127
129
|
#
|
128
|
-
# on :
|
130
|
+
# on :every do |a_url, resp, prior_url|
|
129
131
|
# puts "Given this code: #{resp.code}"
|
130
132
|
# end
|
131
133
|
def on(code, p = nil, &block)
|
132
134
|
f = p ? p : block
|
133
135
|
case code
|
134
136
|
when Fixnum
|
135
|
-
@callbacks[
|
137
|
+
@callbacks[code] = f
|
136
138
|
else
|
137
|
-
|
138
|
-
@callbacks[:any] = f
|
139
|
-
else
|
140
|
-
@callbacks[code.to_sym][:any] = f
|
141
|
-
end
|
139
|
+
@callbacks[code.to_sym] = f
|
142
140
|
end
|
143
141
|
end
|
144
142
|
|
143
|
+
# Run before the HTTP request. Given the URL as a string.
|
144
|
+
# setup do |a_url|
|
145
|
+
# headers['Cookies'] = 'user_id=1;admin=true'
|
146
|
+
# end
|
147
|
+
def setup(p = nil, &block)
|
148
|
+
@setup = p ? p : block
|
149
|
+
end
|
150
|
+
|
151
|
+
# Run last, once for each page. Given the URL as a string.
|
152
|
+
def teardown(p = nil, &block)
|
153
|
+
@teardown = p ? p : block
|
154
|
+
end
|
155
|
+
|
156
|
+
# Use like a hash:
|
157
|
+
# headers['Cookies'] = 'user_id=1;password=btrross3'
|
158
|
+
def headers
|
159
|
+
HeaderSetter.new(self)
|
160
|
+
end
|
161
|
+
|
162
|
+
def raw_headers #:nodoc:
|
163
|
+
@headers
|
164
|
+
end
|
165
|
+
def raw_headers=(v) #:nodoc:
|
166
|
+
@headers = v
|
167
|
+
end
|
168
|
+
|
169
|
+
# Reset the headers hash.
|
170
|
+
def clear_headers
|
171
|
+
@headers = {}
|
172
|
+
end
|
173
|
+
|
145
174
|
def start! #:nodoc:
|
146
175
|
next_urls = @next_urls
|
147
176
|
begin
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
177
|
+
tmp_n_u = {}
|
178
|
+
next_urls.each do |prior_url, urls|
|
179
|
+
urls.map do |a_url|
|
180
|
+
[a_url, (URI.parse(a_url) rescue nil)]
|
181
|
+
end.select do |a_url, parsed_url|
|
182
|
+
allowable_url?(a_url, parsed_url)
|
183
|
+
end.each do |a_url, parsed_url|
|
184
|
+
@setup.call(a_url) unless @setup.nil?
|
185
|
+
get_page(parsed_url) do |response|
|
186
|
+
do_callbacks(a_url, response, prior_url)
|
187
|
+
tmp_n_u[a_url] = generate_next_urls(a_url, response)
|
188
|
+
end
|
189
|
+
@teardown.call(a_url) unless @teardown.nil?
|
190
|
+
end
|
191
|
+
end
|
192
|
+
next_urls = tmp_n_u
|
158
193
|
end while !next_urls.empty?
|
159
194
|
end
|
160
195
|
|
@@ -196,7 +231,8 @@ class SpiderInstance
|
|
196
231
|
http = Net::HTTP.new(parsed_url.host, parsed_url.port)
|
197
232
|
http.use_ssl = parsed_url.scheme == 'https'
|
198
233
|
# Uses start because http.finish cannot be called.
|
199
|
-
r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri
|
234
|
+
r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri,
|
235
|
+
@headers))}
|
200
236
|
if r.redirect?
|
201
237
|
get_page(URI.parse(r['Location']), &block)
|
202
238
|
else
|
@@ -208,16 +244,13 @@ class SpiderInstance
|
|
208
244
|
end
|
209
245
|
end
|
210
246
|
|
211
|
-
def do_callbacks(a_url, resp) #:nodoc:
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
cb_branch = @callbacks[:failure]
|
219
|
-
cb_branch[:any].call(a_url, resp.code) if cb_branch[:any]
|
220
|
-
cb_branch[resp.code].call(a_url, resp.code) if cb_branch[resp.code]
|
247
|
+
def do_callbacks(a_url, resp, prior_url) #:nodoc:
|
248
|
+
cbs = [@callbacks[:every],
|
249
|
+
resp.success? ? @callbacks[:success] : @callbacks[:failure],
|
250
|
+
@callbacks[resp.code]]
|
251
|
+
|
252
|
+
cbs.each do |cb|
|
253
|
+
cb.call(a_url, resp, prior_url) if cb
|
221
254
|
end
|
222
255
|
end
|
223
256
|
|
@@ -254,7 +287,16 @@ class SpiderInstance
|
|
254
287
|
end.compact
|
255
288
|
end
|
256
289
|
|
257
|
-
def remove_trailing_slash(s)
|
290
|
+
def remove_trailing_slash(s) #:nodoc:
|
258
291
|
s.sub(%r{/*$},'')
|
259
292
|
end
|
293
|
+
|
294
|
+
class HeaderSetter #:nodoc:
|
295
|
+
def initialize(si)
|
296
|
+
@si = si
|
297
|
+
end
|
298
|
+
def []=(k,v)
|
299
|
+
@si.raw_headers = @si.raw_headers.merge({k => v})
|
300
|
+
end
|
301
|
+
end
|
260
302
|
end
|
@@ -23,6 +23,80 @@ def null_logger
|
|
23
23
|
end
|
24
24
|
|
25
25
|
describe 'SpiderInstance' do
|
26
|
+
it 'should call the "setup" callback before loading the Web page' do
|
27
|
+
mock_successful_http
|
28
|
+
@on_called = false
|
29
|
+
@before_called = false
|
30
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
31
|
+
si.stubs(:allowed?).returns(true)
|
32
|
+
si.stubs(:generate_next_urls).returns([])
|
33
|
+
si.setup { |*a| @before_called = Time.now }
|
34
|
+
si.on(:every) { |*a| @on_called = Time.now }
|
35
|
+
si.start!
|
36
|
+
@on_called.should_not be_false
|
37
|
+
@before_called.should_not be_false
|
38
|
+
@before_called.should_not be_false
|
39
|
+
@before_called.should < @on_called
|
40
|
+
end
|
41
|
+
|
42
|
+
it 'should call the "teardown" callback after running all other callbacks' do
|
43
|
+
mock_successful_http
|
44
|
+
@on_called = false
|
45
|
+
@after_called = false
|
46
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
47
|
+
si.stubs(:allowed?).returns(true)
|
48
|
+
si.stubs(:generate_next_urls).returns([])
|
49
|
+
si.on(:every) { |*a| @on_called = Time.now }
|
50
|
+
si.teardown { |*a| @after_called = Time.now }
|
51
|
+
si.start!
|
52
|
+
@on_called.should_not be_false
|
53
|
+
@after_called.should_not be_false
|
54
|
+
@after_called.should_not be_false
|
55
|
+
@after_called.should > @on_called
|
56
|
+
end
|
57
|
+
|
58
|
+
it 'should pass headers set by a setup handler to the HTTP request' do
|
59
|
+
mock_successful_http
|
60
|
+
Net::HTTP::Get.expects(:new).with('/foo',{'X-Header-Set' => 'True'})
|
61
|
+
si = SpiderInstance.new(nil => ['http://example.com/foo'])
|
62
|
+
si.stubs(:allowable_url?).returns(true)
|
63
|
+
si.stubs(:generate_next_urls).returns([])
|
64
|
+
si.setup do |a_url|
|
65
|
+
si.headers['X-Header-Set'] = 'True'
|
66
|
+
end
|
67
|
+
si.teardown do |a_url|
|
68
|
+
si.clear_headers
|
69
|
+
end
|
70
|
+
si.start!
|
71
|
+
end
|
72
|
+
|
73
|
+
it 'should allow for a proxy' # fill in more
|
74
|
+
|
75
|
+
it 'should call the :every callback with the current URL, the response, and the prior URL' do
|
76
|
+
mock_successful_http
|
77
|
+
callback_arguments_on(:every)
|
78
|
+
end
|
79
|
+
|
80
|
+
it 'should call the :success callback with the current URL, the request, and the prior URL' do
|
81
|
+
mock_successful_http
|
82
|
+
callback_arguments_on(:success)
|
83
|
+
end
|
84
|
+
|
85
|
+
it 'should call the :failure callback with the current URL, the request, and the prior URL' do
|
86
|
+
mock_failed_http
|
87
|
+
callback_arguments_on(:failure)
|
88
|
+
end
|
89
|
+
|
90
|
+
it 'should call the HTTP status error code callback with the current URL, the request, and the prior URL' do
|
91
|
+
mock_failed_http
|
92
|
+
callback_arguments_on(404)
|
93
|
+
end
|
94
|
+
|
95
|
+
it 'should call the HTTP status success code callback with the current URL, the request, and the prior URL' do
|
96
|
+
mock_successful_http
|
97
|
+
callback_arguments_on(200)
|
98
|
+
end
|
99
|
+
|
26
100
|
# Bug reported by John Nagro, using the example source http://eons.com/
|
27
101
|
# had to change line 192; uses request_uri now instead of path.
|
28
102
|
it 'should handle query URLs without a path' do
|
@@ -33,7 +107,7 @@ describe 'SpiderInstance' do
|
|
33
107
|
:AccessLog => [])
|
34
108
|
server.mount('/', QueryServlet)
|
35
109
|
Thread.new {server.start}
|
36
|
-
si = SpiderInstance.new([u])
|
110
|
+
si = SpiderInstance.new({nil => [u]})
|
37
111
|
si.get_page(u_p) do
|
38
112
|
@block_called = true
|
39
113
|
end
|
@@ -47,7 +121,7 @@ describe 'SpiderInstance' do
|
|
47
121
|
u_p = URI.parse(u)
|
48
122
|
@redirect_handled = false
|
49
123
|
mock_redirect_http
|
50
|
-
si = SpiderInstance.new([u])
|
124
|
+
si = SpiderInstance.new({nil => [u]})
|
51
125
|
si.get_page(u_p) do
|
52
126
|
@redirect_handled = true
|
53
127
|
end
|
@@ -66,7 +140,7 @@ describe 'SpiderInstance' do
|
|
66
140
|
:SSLComment => 'Comment of some sort')
|
67
141
|
server.mount('/', QueryServlet)
|
68
142
|
Thread.new {server.start}
|
69
|
-
si = SpiderInstance.new([u])
|
143
|
+
si = SpiderInstance.new({nil => [u]})
|
70
144
|
si.get_page(u_p) { @page_called = true }
|
71
145
|
server.shutdown
|
72
146
|
@page_called.should be_true
|
@@ -79,7 +153,7 @@ describe 'SpiderInstance' do
|
|
79
153
|
u_p = URI.parse(u)
|
80
154
|
http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
|
81
155
|
Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
|
82
|
-
si = SpiderInstance.new([u])
|
156
|
+
si = SpiderInstance.new({nil => [u]})
|
83
157
|
si.expects(:allowable_url?).with(u, u_p).returns(false)
|
84
158
|
si.expects(:get_page).times(0)
|
85
159
|
si.start!
|
@@ -90,9 +164,8 @@ describe 'SpiderInstance' do
|
|
90
164
|
u_p = URI.parse(u)
|
91
165
|
http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
|
92
166
|
Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
|
93
|
-
si = SpiderInstance.new([u])
|
167
|
+
si = SpiderInstance.new({nil => [u]})
|
94
168
|
si.expects(:allowable_url?).with(u, u_p).returns(true)
|
95
|
-
si.expects(:allowable_url?).with(nil, nil).returns(false)
|
96
169
|
si.expects(:get_page).with(URI.parse(u))
|
97
170
|
si.start!
|
98
171
|
end
|
@@ -106,25 +179,25 @@ describe 'SpiderInstance' do
|
|
106
179
|
robot_rules.expects(:parse).with('http://example.com:80/robots.txt',
|
107
180
|
'robots.txt content')
|
108
181
|
robot_rules.expects(:allowed?).with('http://example.com/').returns(false)
|
109
|
-
si = SpiderInstance.new(['http://example.com/'], [], robot_rules, [])
|
182
|
+
si = SpiderInstance.new({nil => ['http://example.com/']}, [], robot_rules, [])
|
110
183
|
allowable = si.allowable_url?('http://example.com/',
|
111
184
|
URI.parse('http://example.com/'))
|
112
|
-
allowable.should
|
185
|
+
allowable.should be_false
|
113
186
|
end
|
114
187
|
|
115
188
|
it 'should disallow URLs when they fail any url_check' do
|
116
|
-
si = SpiderInstance.new(['http://example.com/'])
|
189
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
117
190
|
si.stubs(:allowed?).returns(true)
|
118
191
|
si.add_url_check { |a_url| false }
|
119
192
|
allowable = si.allowable_url?('http://example.com/',
|
120
193
|
URI.parse('http://example.com/'))
|
121
|
-
allowable.should
|
194
|
+
allowable.should be_false
|
122
195
|
end
|
123
196
|
|
124
197
|
it 'should support multiple url_checks' do
|
125
198
|
@first_url_check = false
|
126
199
|
@second_url_check = false
|
127
|
-
si = SpiderInstance.new(['http://example.com/'])
|
200
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
128
201
|
si.stubs(:allowed?).returns(true)
|
129
202
|
si.add_url_check do |a_url|
|
130
203
|
@first_url_check = true
|
@@ -136,7 +209,7 @@ describe 'SpiderInstance' do
|
|
136
209
|
end
|
137
210
|
allowable = si.allowable_url?('http://example.com/',
|
138
211
|
URI.parse('http://example.com/'))
|
139
|
-
allowable.should
|
212
|
+
allowable.should be_false
|
140
213
|
@first_url_check.should be_true
|
141
214
|
@second_url_check.should be_true
|
142
215
|
end
|
@@ -144,17 +217,17 @@ describe 'SpiderInstance' do
|
|
144
217
|
it 'should avoid cycles' do
|
145
218
|
u = 'http://example.com/'
|
146
219
|
u_p = URI.parse(u)
|
147
|
-
si = SpiderInstance.new([u], [u_p])
|
220
|
+
si = SpiderInstance.new({nil => [u]}, [u_p])
|
148
221
|
si.stubs(:allowed?).returns(true)
|
149
222
|
allowable = si.allowable_url?(u, u_p)
|
150
|
-
allowable.should
|
223
|
+
allowable.should be_false
|
151
224
|
u_p.should_not be_nil
|
152
225
|
end
|
153
226
|
|
154
227
|
it 'should call the 404 handler for 404s' do
|
155
228
|
@proc_called = false
|
156
229
|
mock_failed_http
|
157
|
-
si = SpiderInstance.new(['http://example.com/'])
|
230
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
158
231
|
si.stubs(:allowed?).returns(true)
|
159
232
|
si.stubs(:generate_next_urls).returns([])
|
160
233
|
si.on(404) {|*a| @proc_called = true}
|
@@ -165,7 +238,7 @@ describe 'SpiderInstance' do
|
|
165
238
|
it 'should call the :success handler on success' do
|
166
239
|
@proc_called = false
|
167
240
|
mock_successful_http
|
168
|
-
si = SpiderInstance.new(['http://example.com/'])
|
241
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
169
242
|
si.stubs(:allowed?).returns(true)
|
170
243
|
si.stubs(:generate_next_urls).returns([])
|
171
244
|
si.on(:success) {|*a| @proc_called = true}
|
@@ -176,19 +249,19 @@ describe 'SpiderInstance' do
|
|
176
249
|
it 'should not call the :success handler on failure' do
|
177
250
|
@proc_called = false
|
178
251
|
mock_failed_http
|
179
|
-
si = SpiderInstance.new(['http://example.com/'])
|
252
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
180
253
|
si.stubs(:allowed?).returns(true)
|
181
254
|
si.stubs(:generate_next_urls).returns([])
|
182
255
|
si.on(:success) {|*a| @proc_called = true}
|
183
256
|
si.start!
|
184
|
-
@proc_called.should
|
257
|
+
@proc_called.should be_false
|
185
258
|
end
|
186
259
|
|
187
260
|
it 'should call the :success handler and the 200 handler on 200' do
|
188
261
|
@proc_200_called = false
|
189
262
|
@proc_success_called = false
|
190
263
|
mock_successful_http
|
191
|
-
si = SpiderInstance.new(['http://example.com/'])
|
264
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
192
265
|
si.stubs(:allowed?).returns(true)
|
193
266
|
si.stubs(:generate_next_urls).returns([])
|
194
267
|
si.on(:success) {|*a| @proc_success_called = true}
|
@@ -201,18 +274,18 @@ describe 'SpiderInstance' do
|
|
201
274
|
it 'should not call the :failure handler on success' do
|
202
275
|
@proc_called = false
|
203
276
|
mock_successful_http
|
204
|
-
si = SpiderInstance.new(['http://example.com/'])
|
277
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
205
278
|
si.stubs(:allowed?).returns(true)
|
206
279
|
si.stubs(:generate_next_urls).returns([])
|
207
280
|
si.on(:failure) {|*a| @proc_called = true}
|
208
281
|
si.start!
|
209
|
-
@proc_called.should
|
282
|
+
@proc_called.should be_false
|
210
283
|
end
|
211
284
|
|
212
285
|
it 'should call the :failure handler on failure' do
|
213
286
|
@proc_called = false
|
214
287
|
mock_failed_http
|
215
|
-
si = SpiderInstance.new(['http://example.com/'])
|
288
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
216
289
|
si.stubs(:allowed?).returns(true)
|
217
290
|
si.stubs(:generate_next_urls).returns([])
|
218
291
|
si.on(:failure) {|*a| @proc_called = true}
|
@@ -224,7 +297,7 @@ describe 'SpiderInstance' do
|
|
224
297
|
@proc_404_called = false
|
225
298
|
@proc_failure_called = false
|
226
299
|
mock_failed_http
|
227
|
-
si = SpiderInstance.new(['http://example.com/'])
|
300
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
228
301
|
si.stubs(:allowed?).returns(true)
|
229
302
|
si.stubs(:generate_next_urls).returns([])
|
230
303
|
si.on(:failure) {|*a| @proc_failure_called = true}
|
@@ -234,13 +307,13 @@ describe 'SpiderInstance' do
|
|
234
307
|
@proc_failure_called.should be_true
|
235
308
|
end
|
236
309
|
|
237
|
-
it 'should call the :
|
310
|
+
it 'should call the :every handler even when a handler for the error code is defined' do
|
238
311
|
@any_called = false
|
239
312
|
mock_successful_http
|
240
|
-
si = SpiderInstance.new(['http://example.com/'])
|
313
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
241
314
|
si.stubs(:allowed?).returns(true)
|
242
315
|
si.stubs(:generate_next_urls).returns([])
|
243
|
-
si.on(:
|
316
|
+
si.on(:every) { |*a| @any_called = true }
|
244
317
|
si.on(202) {|*a|}
|
245
318
|
si.start!
|
246
319
|
@any_called.should be_true
|
@@ -249,10 +322,10 @@ describe 'SpiderInstance' do
|
|
249
322
|
it 'should support a block as a response handler' do
|
250
323
|
@proc_called = false
|
251
324
|
mock_successful_http
|
252
|
-
si = SpiderInstance.new(['http://example.com/'])
|
325
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
253
326
|
si.stubs(:allowed?).returns(true)
|
254
327
|
si.stubs(:generate_next_urls).returns([])
|
255
|
-
si.on(:
|
328
|
+
si.on(:every) { |*a| @proc_called = true }
|
256
329
|
si.start!
|
257
330
|
@proc_called.should be_true
|
258
331
|
end
|
@@ -260,10 +333,10 @@ describe 'SpiderInstance' do
|
|
260
333
|
it 'should support a proc as a response handler' do
|
261
334
|
@proc_called = false
|
262
335
|
mock_successful_http
|
263
|
-
si = SpiderInstance.new(['http://example.com/'])
|
336
|
+
si = SpiderInstance.new({nil => ['http://example.com/']})
|
264
337
|
si.stubs(:allowed?).returns(true)
|
265
338
|
si.stubs(:generate_next_urls).returns([])
|
266
|
-
si.on(:
|
339
|
+
si.on(:every, Proc.new { |*a| @proc_called = true })
|
267
340
|
si.start!
|
268
341
|
@proc_called.should be_true
|
269
342
|
end
|
@@ -297,4 +370,16 @@ describe 'SpiderInstance' do
|
|
297
370
|
yields(mock(:request => http_req2)).returns(http_req2)
|
298
371
|
Net::HTTP.expects(:new).times(2).returns(http_obj).then.returns(http_obj2)
|
299
372
|
end
|
373
|
+
|
374
|
+
def callback_arguments_on(code)
|
375
|
+
si = SpiderInstance.new('http://foo.com/' => ['http://example.com/'])
|
376
|
+
si.stubs(:allowed?).returns(true)
|
377
|
+
si.stubs(:generate_next_urls).returns([])
|
378
|
+
si.on(code) do |a_url, resp, prior_url|
|
379
|
+
a_url.should == 'http://example.com/'
|
380
|
+
resp.should_not be_nil
|
381
|
+
prior_url.should == 'http://foo.com/'
|
382
|
+
end
|
383
|
+
si.start!
|
384
|
+
end
|
300
385
|
end
|