spider 0.1.0 → 0.2.0
- data/CHANGES +10 -0
- data/README +29 -25
- data/doc/classes/Net.html +101 -0
- data/doc/classes/Spider.html +180 -0
- data/doc/classes/SpiderInstance.html +229 -0
- data/doc/created.rid +1 -0
- data/doc/files/README.html +149 -0
- data/doc/files/lib/spider_rb.html +159 -0
- data/doc/fr_class_index.html +29 -0
- data/doc/fr_file_index.html +28 -0
- data/doc/fr_method_index.html +29 -0
- data/doc/index.html +24 -0
- data/doc/rdoc-style.css +208 -0
- data/lib/spider.rb +208 -80
- data/spec/spider_instance_spec.rb +219 -0
- data/spec/spider_spec.rb +10 -0
- data/spider.gemspec +2 -2
- data/test_server/client.rb +22 -0
- data/test_server/server1/page1.html +1 -0
- data/test_server/server1/page2.html +3 -0
- data/test_server/server2/page1.html +1 -0
- data/test_server/server2/page2.html +2 -0
- data/test_server/servers.rb +24 -0
- metadata +32 -6
- data/LICENSE +0 -339
data/lib/spider.rb
CHANGED
@@ -1,126 +1,254 @@
 # Copyright 2007 Mike Burns
+# :include: README
 
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name Mike Burns nor the
+#       names of its contributors may be used to endorse or promote products
+#       derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 require 'robot_rules'
 require 'open-uri'
 require 'uri'
+require 'net/http'
+
+class Net::HTTPResponse #:nodoc:
+  def success?; false; end
+end
+class Net::HTTPSuccess #:nodoc:
+  def success?; true; end
+end
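These monkeypatches give every response object a success? predicate: a plain Net::HTTPResponse answers false, while the Net::HTTPSuccess subtree (all 2xx statuses) answers true. A minimal sketch of the resulting behavior (the URL is illustrative):

    require 'net/http'
    require 'uri'

    resp = Net::HTTP.get_response(URI.parse('http://example.com/'))
    resp.success?  # => true for any 2xx response, false otherwise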
 
 # A spidering library for Ruby. Handles robots.txt, scraping, finding more
 # links, and doing it all over again.
-
+class Spider
+  # Runs the spider starting at the given URL. Also takes a block that is given
+  # the SpiderInstance. Use the block to define the rules and handlers for
+  # the discovered Web pages.
+  #
+  #  Spider.start_at('http://mike-burns.com/') do |s|
+  #    s.add_url_check do |a_url|
+  #      a_url =~ %r{^http://mike-burns.com.*}
+  #    end
+  #
+  #    s.on 404 do |a_url, err_code|
+  #      puts "URL not found: #{a_url}"
+  #    end
+  #
+  #    s.on :success do |a_url, code, headers, body|
+  #      puts "body: #{body}"
+  #    end
+  #
+  #    s.on :any do |a_url, resp|
+  #      puts "URL returned anything: #{a_url} with this code #{resp.code}"
+  #    end
+  #  end
+
+  def self.start_at(a_url, &block)
+    rules = RobotRules.new('Ruby Spider 1.0')
+    a_spider = SpiderInstance.new([a_url], [], rules, [])
+    block.call(a_spider)
+    a_spider.start!
+  end
+end
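Spider.start_at is only a convenience wrapper; building the instance by hand is equivalent. A sketch using the names from the diff (the URL and handler are illustrative):

    rules = RobotRules.new('Ruby Spider 1.0')
    s = SpiderInstance.new(['http://mike-burns.com/'], [], rules, [])
    s.on(:success) { |a_url, code, resp, body| puts a_url }
    s.start!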
 
-
-
-
+class SpiderInstance
+  def initialize(next_urls, seen = [], rules = nil, robots_seen = []) #:nodoc:
+    @url_checks = []
+    @cache = :memory
+    @callbacks = {:any => lambda {}, :success => {}, :failure => {}}
+    @next_urls = next_urls
+    @seen = seen
+    @rules = rules || RobotRules.new('Ruby Spider 1.0')
+    @robots_seen = robots_seen
+  end
+
+  # Add a predicate that determines whether to continue down this URL's path.
+  # All predicates must be true in order for a URL to proceed.
   #
-#
+  # Takes a block that takes a string and produces a boolean. For example, this
+  # will ensure that the URL starts with 'http://mike-burns.com':
   #
-#
-
+  #  add_url_check { |a_url| a_url =~ %r{^http://mike-burns.com.*} }
+  def add_url_check(&block)
+    @url_checks << block
+  end
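Every registered check must return a true value for a URL to be crawled, so the checks compose as a logical AND. A sketch combining two predicates (the host filter and PDF filter are illustrative):

    Spider.start_at('http://mike-burns.com/') do |s|
      s.add_url_check { |a_url| a_url =~ %r{^http://mike-burns.com.*} }
      s.add_url_check { |a_url| a_url !~ /\.pdf$/ }  # also skip PDF links
    end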
+
+  def use_cache(cache_type) #:nodoc:
+    @cache = cache_type
+  end
+
+  # Add a response handler. A response handler's trigger can be :any, :success,
+  # :failure, or any HTTP status code. The handler itself can be either a Proc
+  # or a block. The arguments to the block depend on the trigger:
+  #
+  # If the trigger is :any, the arguments are the URL as a string and an
+  # instance of Net::HTTPResponse.
+  #
+  # If the trigger is :success or any HTTP status code that represents a
+  # successful result, the arguments are the URL as a string, the HTTP status
+  # code, an instance of Net::HTTPSuccess, and the body of the result as a
+  # string.
+  #
+  # If the trigger is :failure or any HTTP status code that represents a failed
+  # result, the arguments are the URL as a string and the HTTP status code.
+  #
+  # For example:
+  #
+  #  on 404 do |a_url, code|
+  #    puts "URL not found: #{a_url}"
   #  end
   #
-#
-#
+  #  on :success do |a_url, code, resp, body|
+  #    puts a_url
+  #    puts body
   #  end
-
-
-
-
-
-
-
+  #
+  #  on :any do |a_url, resp|
+  #    puts "Given this code: #{resp.code}"
+  #  end
+  def on(code, p = nil, &block)
+    f = p ? p : block
+    case code
+    when Fixnum
+      @callbacks[success_or_failure(code)][code] = f
+    else
+      if :any == code.to_sym
+        @callbacks[:any] = f
+      else
+        @callbacks[code.to_sym][:any] = f
+      end
+    end
+  end
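Both registration styles route through the same table: numeric codes are filed under :success or :failure by success_or_failure, and :any replaces the catch-all lambda. A sketch of the two call forms, given s, the SpiderInstance yielded by Spider.start_at (handlers are illustrative):

    s.on(404) { |a_url, code| puts "URL not found: #{a_url}" }  # block form
    s.on(:any, Proc.new { |a_url, resp| puts resp.code })       # Proc form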
+
+  def start! #:nodoc:
+    next_urls = @next_urls
     begin
       next_urls = next_urls.map do |a_url|
         [a_url, (URI.parse(a_url) rescue nil)]
       end.select do |a_url, parsed_url|
-
-        allowed?(a_url, parsed_url, rules, robots_seen)
+        allowable_url?(a_url, parsed_url)
       end.map do |a_url, parsed_url|
-
-
-
+        get_page(parsed_url) do |response|
+          do_callbacks(a_url, response)
+          generate_next_urls(a_url, response)
         end
-      end.flatten
+      end.flatten
     end while !next_urls.empty?
   end
 
-
+  def success_or_failure(code) #:nodoc:
+    if code > 199 && code < 300
+      :success
+    else
+      :failure
+    end
+  end
+
+  def allowable_url?(a_url, parsed_url) #:nodoc:
+    !parsed_url.nil? && !@seen.include?(parsed_url) && allowed?(a_url, parsed_url) &&
+      @url_checks.map{|url_check| url_check.call(a_url)}.all?
+  end
 
   # True if the robots.txt for that URL allows access to it.
-  def allowed?(a_url, parsed_url, rules, robots_seen)
+  def allowed?(a_url, parsed_url) # :nodoc:
     u = "#{parsed_url.scheme}://#{parsed_url.host}:#{parsed_url.port}/robots.txt"
     begin
-      unless robots_seen.include?(u)
+      unless @robots_seen.include?(u)
         open(u, 'User-Agent' => 'Ruby Spider',
              'Accept' => 'text/html,text/xml,application/xml,text/plain') do |url|
-          rules.parse(u, url.read)
+          @rules.parse(u, url.read)
         end
-        robots_seen << u
+        @robots_seen << u
       end
-      rules.allowed?(a_url)
+      @rules.allowed?(a_url)
     rescue OpenURI::HTTPError
-      true
-    rescue Timeout::Error # to keep it from crashing
-      false
-    rescue
+      true # No robots.txt
+    rescue Exception, Timeout::Error # to keep it from crashing
       false
     end
   end
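The robots.txt fetch is cached per host in @robots_seen, and any error other than a missing robots.txt blocks the URL. The RobotRules calls above (parse with the robots.txt URL and its content, allowed? with a URL string) can be exercised on their own; a sketch, with illustrative robots.txt content:

    require 'robot_rules'

    rules = RobotRules.new('Ruby Spider 1.0')
    rules.parse('http://example.com:80/robots.txt',
                "User-agent: *\nDisallow: /private")
    rules.allowed?('http://example.com/private/secret.html')  # => false
    rules.allowed?('http://example.com/index.html')           # => true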
 
-
-
+  def get_page(parsed_url, &block) #:nodoc:
+    @seen << parsed_url
     begin
-
-
-
-
-
-
-
-
-
-
+      Net::HTTP.start(parsed_url.host, parsed_url.port) do |http|
+        r = http.request(Net::HTTP::Get.new(parsed_url.path))
+        if r.is_a?(Net::HTTPRedirection)
+          get_page(URI.parse(r['Location']), &block)
+        else
+          block.call(r)
+        end
+      end
+    rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError, Exception => e
+      p e
+      nil
+    end
+  end
+
+  def do_callbacks(a_url, resp) #:nodoc:
+    @callbacks[:any].call(a_url, resp) if @callbacks[:any]
+    if resp.success?
+      cb_branch = @callbacks[:success]
+      cb_branch[:any].call(a_url, resp.code, resp, resp.body) if cb_branch[:any]
+      cb_branch[resp.code].call(a_url, resp.code, resp.headers, resp.body) if cb_branch[resp.code]
+    else
+      cb_branch = @callbacks[:failure]
+      cb_branch[:any].call(a_url, resp.code) if cb_branch[:any]
+      cb_branch[resp.code].call(a_url, resp.code) if cb_branch[resp.code]
+    end
+  end
+
+  def generate_next_urls(a_url, resp) #:nodoc:
+    web_page = resp.body
+    base_url = (web_page.scan(/base\s+href="(.*?)"/i).flatten +
+                [a_url[0,a_url.rindex('/')]])[0]
+    base_url = remove_trailing_slash(base_url)
+    web_page.scan(/href="(.*?)"/i).flatten.map do |link|
+      begin
+        parsed_link = URI.parse(link)
+        if parsed_link.fragment == '#'
+          nil
+        else
+          case parsed_link.scheme
+          when 'http'
+            link
+          when nil
+            u = URI.parse(base_url)
+            if link[0].chr == '/'
+              "#{u.scheme}://#{u.host}:#{u.port}#{link}"
+            elsif u.path.nil? || u.path == ''
+              "#{u.scheme}://#{u.host}:#{u.port}/#{link}"
           else
-
-            when 'http'
-              link
-            when nil
-              u = URI.parse(base_url)
-              if link[0].chr == '/'
-                "#{u.scheme}://#{u.host}:#{u.port}#{link}"
-              else
-                "#{u.scheme}://#{u.host}:#{u.port}/#{u.path}/#{link}"
-              end
-            else
-              nil
-            end
+              "#{u.scheme}://#{u.host}:#{u.port}/#{u.path}/#{link}"
            end
-
+          else
            nil
          end
-      end
-
-
-      links
        end
-
-      []
-    rescue
-      []
-    end
+      rescue
+        nil
+      end
+    end.compact
  end
 
+  def remove_trailing_slash(s)
+    s.sub(%r{/*$},'')
+  end
 end
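The rewritten generate_next_urls resolves each href against the page's base URL and drops anything it cannot crawl. A worked sketch of the common link shapes, traced through the code above (base URL and hrefs are illustrative):

    # with base_url = 'http://example.com' (no path component):
    #   href="http://other.com/x"  => 'http://other.com/x'  (absolute http, kept)
    #   href="/top.html"           => 'http://example.com:80/top.html'
    #   href="page2.html"          => 'http://example.com:80/page2.html'
    #   href="mailto:x@y.com"      => dropped (scheme neither 'http' nor nil)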
data/spec/spider_instance_spec.rb
ADDED
@@ -0,0 +1,219 @@
+require 'rubygems'
+require 'spec'
+require File.dirname(__FILE__)+'/../lib/spider'
+
+Spec::Runner.configure { |c| c.mock_with :mocha }
+
+describe 'SpiderInstance' do
+  it 'should skip URLs when allowable_url? is false' do
+    u = 'http://example.com/'
+    u_p = URI.parse(u)
+    http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
+    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    si = SpiderInstance.new([u])
+    si.expects(:allowable_url?).with(u, u_p).returns(false)
+    si.expects(:get_page).times(0)
+    si.start!
+  end
+
+  it 'should not skip URLs when allowable_url? is true' do
+    u = 'http://example.com/'
+    u_p = URI.parse(u)
+    http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
+    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    si = SpiderInstance.new([u])
+    si.expects(:allowable_url?).with(u, u_p).returns(true)
+    si.expects(:allowable_url?).with(nil, nil).returns(false)
+    si.expects(:get_page).with(URI.parse(u))
+    si.start!
+  end
+
+  it 'should disallow URLs when the robots.txt says to' do
+    robot_rules = stub
+    SpiderInstance.any_instance.expects(:open).
+      with('http://example.com:80/robots.txt', 'User-Agent' => 'Ruby Spider',
+           'Accept' => 'text/html,text/xml,application/xml,text/plain').
+      yields(stub(:read => 'robots.txt content'))
+    robot_rules.expects(:parse).with('http://example.com:80/robots.txt',
+                                     'robots.txt content')
+    robot_rules.expects(:allowed?).with('http://example.com/').returns(false)
+    si = SpiderInstance.new(['http://example.com/'], [], robot_rules, [])
+    allowable = si.allowable_url?('http://example.com/',
+                                  URI.parse('http://example.com/'))
+    allowable.should == false
+  end
+
+  it 'should disallow URLs when they fail any url_check' do
+    si = SpiderInstance.new(['http://example.com/'])
+    si.stubs(:allowed?).returns(true)
+    si.add_url_check { |a_url| false }
+    allowable = si.allowable_url?('http://example.com/',
+                                  URI.parse('http://example.com/'))
+    allowable.should == false
+  end
+
+  it 'should support multiple url_checks' do
+    @first_url_check = false
+    @second_url_check = false
+    si = SpiderInstance.new(['http://example.com/'])
+    si.stubs(:allowed?).returns(true)
+    si.add_url_check do |a_url|
+      @first_url_check = true
+      true
+    end
+    si.add_url_check do |a_url|
+      @second_url_check = true
+      false
+    end
+    allowable = si.allowable_url?('http://example.com/',
+                                  URI.parse('http://example.com/'))
+    allowable.should == false
+    @first_url_check.should == true
+    @second_url_check.should == true
+  end
+
+  it 'should support memcached'
+  it 'should avoid cycles using memcached'
+
+  it 'should support memory' do
+    si = SpiderInstance.new(['http://example.com/'])
+    si.use_cache :memory # No exn
+  end
+
+  it 'should avoid cycles using memory' do
+    u = 'http://example.com/'
+    u_p = URI.parse(u)
+    si = SpiderInstance.new([u], [u_p])
+    si.stubs(:allowed?).returns(true)
+    allowable = si.allowable_url?(u, u_p)
+    allowable.should == false
+    u_p.should_not be_nil
+  end
+
+  it 'should call the 404 handler for 404s' do
+    @proc_called = false
+    http_resp = stub(:success? => false, :code => 404)
+    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    si = SpiderInstance.new(['http://example.com/'])
+    si.stubs(:allowed?).returns(true)
+    si.stubs(:generate_next_urls).returns([])
+    si.on(404) {|*a| @proc_called = true}
+    si.start!
+    @proc_called.should == true
+  end
+
+  it 'should call the :success handler on success' do
+    @proc_called = false
+    http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
+    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    si = SpiderInstance.new(['http://example.com/'])
+    si.stubs(:allowed?).returns(true)
+    si.stubs(:generate_next_urls).returns([])
+    si.on(:success) {|*a| @proc_called = true}
+    si.start!
+    @proc_called.should == true
+  end
+
+  it 'should not call the :success handler on failure' do
+    @proc_called = false
+    http_resp = stub(:success? => false, :code => 404)
+    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    si = SpiderInstance.new(['http://example.com/'])
+    si.stubs(:allowed?).returns(true)
+    si.stubs(:generate_next_urls).returns([])
+    si.on(:success) {|*a| @proc_called = true}
+    si.start!
+    @proc_called.should == false
+  end
+
+  it 'should call the :success handler and the 200 handler on 200' do
+    @proc_200_called = false
+    @proc_success_called = false
+    http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
+    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    si = SpiderInstance.new(['http://example.com/'])
+    si.stubs(:allowed?).returns(true)
+    si.stubs(:generate_next_urls).returns([])
+    si.on(:success) {|*a| @proc_success_called = true}
+    si.on(200) {|*a| @proc_200_called = true}
+    si.start!
+    @proc_200_called.should == true
+    @proc_success_called.should == true
+  end
+
+  it 'should not call the :failure handler on success' do
+    @proc_called = false
+    http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
+    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    si = SpiderInstance.new(['http://example.com/'])
+    si.stubs(:allowed?).returns(true)
+    si.stubs(:generate_next_urls).returns([])
+    si.on(:failure) {|*a| @proc_called = true}
+    si.start!
+    @proc_called.should == false
+  end
+
+  it 'should call the :failure handler on failure' do
+    @proc_called = false
+    http_resp = stub(:success? => false, :code => 404)
+    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    si = SpiderInstance.new(['http://example.com/'])
+    si.stubs(:allowed?).returns(true)
+    si.stubs(:generate_next_urls).returns([])
+    si.on(:failure) {|*a| @proc_called = true}
+    si.start!
+    @proc_called.should == true
+  end
+
+  it 'should call the :failure handler and the 404 handler on 404' do
+    @proc_404_called = false
+    @proc_failure_called = false
+    http_resp = stub(:success? => false, :code => 404)
+    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    si = SpiderInstance.new(['http://example.com/'])
+    si.stubs(:allowed?).returns(true)
+    si.stubs(:generate_next_urls).returns([])
+    si.on(:failure) {|*a| @proc_failure_called = true}
+    si.on(404) {|*a| @proc_404_called = true}
+    si.start!
+    @proc_404_called.should == true
+    @proc_failure_called.should == true
+  end
+
+  it 'should call the :any handler even when a handler for the error code is defined' do
+    @any_called = false
+    http_resp = stub(:success? => true, :code => 200)
+    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    si = SpiderInstance.new(['http://example.com/'])
+    si.stubs(:allowed?).returns(true)
+    si.stubs(:generate_next_urls).returns([])
+    si.on(:any) { |*a| @any_called = true }
+    si.on(202) {|*a|}
+    si.start!
+    @any_called.should == true
+  end
+
+  it 'should support a block as a response handler' do
+    @proc_called = false
+    http_resp = stub(:success? => true, :code => 200)
+    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    si = SpiderInstance.new(['http://example.com/'])
+    si.stubs(:allowed?).returns(true)
+    si.stubs(:generate_next_urls).returns([])
+    si.on(:any) { |*a| @proc_called = true }
+    si.start!
+    @proc_called.should == true
+  end
+
+  it 'should support a proc as a response handler' do
+    @proc_called = false
+    http_resp = stub(:success? => true, :code => 200)
+    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    si = SpiderInstance.new(['http://example.com/'])
+    si.stubs(:allowed?).returns(true)
+    si.stubs(:generate_next_urls).returns([])
+    si.on(:any, Proc.new { |*a| @proc_called = true })
+    si.start!
+    @proc_called.should == true
+  end
+end