spider 0.1.0 → 0.2.0
This diff compares the publicly released contents of the two package versions as they appear in their public registry, and is provided for informational purposes only.
- data/CHANGES +10 -0
- data/README +29 -25
- data/doc/classes/Net.html +101 -0
- data/doc/classes/Spider.html +180 -0
- data/doc/classes/SpiderInstance.html +229 -0
- data/doc/created.rid +1 -0
- data/doc/files/README.html +149 -0
- data/doc/files/lib/spider_rb.html +159 -0
- data/doc/fr_class_index.html +29 -0
- data/doc/fr_file_index.html +28 -0
- data/doc/fr_method_index.html +29 -0
- data/doc/index.html +24 -0
- data/doc/rdoc-style.css +208 -0
- data/lib/spider.rb +208 -80
- data/spec/spider_instance_spec.rb +219 -0
- data/spec/spider_spec.rb +10 -0
- data/spider.gemspec +2 -2
- data/test_server/client.rb +22 -0
- data/test_server/server1/page1.html +1 -0
- data/test_server/server1/page2.html +3 -0
- data/test_server/server2/page1.html +1 -0
- data/test_server/server2/page2.html +2 -0
- data/test_server/servers.rb +24 -0
- metadata +32 -6
- data/LICENSE +0 -339
data/lib/spider.rb
CHANGED
@@ -1,126 +1,254 @@
 # Copyright 2007 Mike Burns
+# :include: README
 
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name Mike Burns nor the
+#       names of its contributors may be used to endorse or promote products
+#       derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 require 'robot_rules'
 require 'open-uri'
 require 'uri'
+require 'net/http'
+
+class Net::HTTPResponse #:nodoc:
+  def success?; false; end
+end
+class Net::HTTPSuccess #:nodoc:
+  def success?; true; end
+end
 
 # A spidering library for Ruby. Handles robots.txt, scraping, finding more
 # links, and doing it all over again.
-
+class Spider
+  # Runs the spider starting at the given URL. Also takes a block that is given
+  # the SpiderInstance. Use the block to define the rules and handlers for
+  # the discovered Web pages.
+  #
+  #  Spider.start_at('http://mike-burns.com/') do |s|
+  #    s.add_url_check do |a_url|
+  #      a_url =~ %r{^http://mike-burns.com.*}
+  #    end
+  #
+  #    s.on 404 do |a_url, err_code|
+  #      puts "URL not found: #{a_url}"
+  #    end
+  #
+  #    s.on :success do |a_url, code, headers, body|
+  #      puts "body: #{body}"
+  #    end
+  #
+  #    s.on :any do |a_url, resp|
+  #      puts "URL returned anything: #{a_url} with this code #{resp.code}"
+  #    end
+  #  end
+
+  def self.start_at(a_url, &block)
+    rules = RobotRules.new('Ruby Spider 1.0')
+    a_spider = SpiderInstance.new([a_url], [], rules, [])
+    block.call(a_spider)
+    a_spider.start!
+  end
+end
 
-
-
-
+class SpiderInstance
+  def initialize(next_urls, seen = [], rules = nil, robots_seen = []) #:nodoc:
+    @url_checks = []
+    @cache = :memory
+    @callbacks = {:any => lambda {}, :success => {}, :failure => {}}
+    @next_urls = next_urls
+    @seen = seen
+    @rules = rules || RobotRules.new('Ruby Spider 1.0')
+    @robots_seen = robots_seen
+  end
+
+  # Add a predicate that determines whether to continue down this URL's path.
+  # All predicates must be true in order for a URL to proceed.
   #
-  #
+  # Takes a block that takes a string and produces a boolean. For example, this
+  # will ensure that the URL starts with 'http://mike-burns.com':
   #
-  #
-
+  #  add_url_check { |a_url| a_url =~ %r{^http://mike-burns.com.*}
+  def add_url_check(&block)
+    @url_checks << block
+  end
+
+  def use_cache(cache_type) #:nodoc:
+    @cache = cache_type
+  end
+
+  # Add a response handler. A response handler's trigger can be :any, :success,
+  # :failure, or any HTTP status code. The handler itself can be either a Proc
+  # or a block. The arguments to the block depends on the trigger:
+  #
+  # If the trigger is :any, the arguments are the URL as a string and an
+  # instance of Net::HTTPResponse.
+  #
+  # If the trigger is :success or any HTTP status code that represents a
+  # successful result, the arguments are the URL as a string, the HTTP status
+  # code, an instance of Net::HTTPSuccess, and the body of the result as a
+  # string.
+  #
+  # If the trigger is :failure or any HTTP status code that represents a failed
+  # result, the arguments are the URL as a string and the HTTP status code.
+  #
+  # For example:
+  #
+  #  on 404 do |a_url, code|
+  #    puts "URL not found: #{a_url}"
   #  end
   #
-  #
-  #
+  #  on :success do |a_url, code, resp, body|
+  #    puts a_url
+  #    puts body
   #  end
-
-
-
-
-
-
-
+  #
+  #  on :any do |a_url, resp|
+  #    puts "Given this code: #{resp.code}"
+  #  end
+  def on(code, p = nil, &block)
+    f = p ? p : block
+    case code
+    when Fixnum
+      @callbacks[success_or_failure(code)][code] = f
+    else
+      if :any == code.to_sym
+        @callbacks[:any] = f
+      else
+        @callbacks[code.to_sym][:any] = f
+      end
+    end
+  end
+
+  def start! #:nodoc:
+    next_urls = @next_urls
     begin
       next_urls = next_urls.map do |a_url|
         [a_url, (URI.parse(a_url) rescue nil)]
       end.select do |a_url, parsed_url|
-
-        allowed?(a_url, parsed_url, rules, robots_seen)
+        allowable_url?(a_url, parsed_url)
       end.map do |a_url, parsed_url|
-
-
-
+        get_page(parsed_url) do |response|
+          do_callbacks(a_url, response)
+          generate_next_urls(a_url, response)
         end
-        end.flatten
+      end.flatten
     end while !next_urls.empty?
   end
 
-
+  def success_or_failure(code) #:nodoc:
+    if code > 199 && code < 300
+      :success
+    else
+      :failure
+    end
+  end
+
+  def allowable_url?(a_url, parsed_url) #:nodoc:
+    !parsed_url.nil? && !@seen.include?(parsed_url) && allowed?(a_url, parsed_url) &&
+      @url_checks.map{|url_check|url_check.call(a_url)}.all?
+  end
 
   # True if the robots.txt for that URL allows access to it.
-  def allowed?(a_url, parsed_url, rules, robots_seen)
+  def allowed?(a_url, parsed_url) # :nodoc:
     u = "#{parsed_url.scheme}://#{parsed_url.host}:#{parsed_url.port}/robots.txt"
     begin
-      unless robots_seen.include?(u)
+      unless @robots_seen.include?(u)
         open(u, 'User-Agent' => 'Ruby Spider',
              'Accept' => 'text/html,text/xml,application/xml,text/plain') do |url|
-          rules.parse(u, url.read)
+          @rules.parse(u, url.read)
         end
-        robots_seen << u
+        @robots_seen << u
       end
-      rules.allowed?(a_url)
+      @rules.allowed?(a_url)
     rescue OpenURI::HTTPError
-      true
-    rescue Timeout::Error # to keep it from crashing
-      false
-    rescue
+      true # No robots.txt
+    rescue Exception, Timeout::Error # to keep it from crashing
       false
     end
   end
 
-
-
+  def get_page(parsed_url, &block) #:nodoc:
+    @seen << parsed_url
     begin
-
-
-
-
-
-
-
-
-
-
+      Net::HTTP.start(parsed_url.host, parsed_url.port) do |http|
+        r = http.request(Net::HTTP::Get.new(parsed_url.path))
+        if r.is_a?(Net::HTTPRedirection)
+          get_page(URI.parse(r['Location']), block)
+        else
+          block.call(r)
+        end
+      end
+    rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError, Exception =>e
+      p e
+      nil
+    end
+  end
+
+  def do_callbacks(a_url, resp) #:nodoc:
+    @callbacks[:any].call(a_url, resp) if @callbacks[:any]
+    if resp.success?
+      cb_branch = @callbacks[:success]
+      cb_branch[:any].call(a_url, resp.code, resp, resp.body) if cb_branch[:any]
+      cb_branch[resp.code].call(a_url, resp.code, resp.headers, resp.body) if cb_branch[resp.code]
+    else
+      cb_branch = @callbacks[:failure]
+      cb_branch[:any].call(a_url, resp.code) if cb_branch[:any]
+      cb_branch[resp.code].call(a_url, resp.code) if cb_branch[resp.code]
+    end
+  end
+
+  def generate_next_urls(a_url, resp) #:nodoc:
+    web_page = resp.body
+    base_url = (web_page.scan(/base\s+href="(.*?)"/i).flatten +
+                [a_url[0,a_url.rindex('/')]])[0]
+    base_url = remove_trailing_slash(base_url)
+    web_page.scan(/href="(.*?)"/i).flatten.map do |link|
+      begin
+        parsed_link = URI.parse(link)
+        if parsed_link.fragment == '#'
+          nil
+        else
+          case parsed_link.scheme
+          when 'http'
+            link
+          when nil
+            u = URI.parse(base_url)
+            if link[0].chr == '/'
+              "#{u.scheme}://#{u.host}:#{u.port}#{link}"
+            elsif u.path.nil? || u.path == ''
+              "#{u.scheme}://#{u.host}:#{u.port}/#{link}"
            else
-
-          when 'http'
-            link
-          when nil
-            u = URI.parse(base_url)
-            if link[0].chr == '/'
-              "#{u.scheme}://#{u.host}:#{u.port}#{link}"
-            else
-              "#{u.scheme}://#{u.host}:#{u.port}/#{u.path}/#{link}"
-            end
-          else
-            nil
-          end
+              "#{u.scheme}://#{u.host}:#{u.port}/#{u.path}/#{link}"
            end
-
+          else
            nil
          end
-        end
-
-
-        links
+        end
+      rescue
+        nil
      end
-
-      []
-    rescue
-      []
-    end
+    end.compact
  end
 
+  def remove_trailing_slash(s)
+    s.sub(%r{/*$},'')
+  end
 end
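For orientation, the new 0.2.0 entry point added above can be driven as follows. This is a minimal sketch assembled from the RDoc comments in the diff, not code shipped in the gem; the host example.com is a placeholder, and the robot_rules dependency declared by the gemspec must be installed:

    require 'spider'

    # Crawl one site, staying on a single host.
    Spider.start_at('http://example.com/') do |s|
      # Every registered check must return true for a URL to be followed.
      s.add_url_check { |a_url| a_url =~ %r{^http://example\.com.*} }

      # :success handlers receive the URL, status code, response, and body.
      s.on :success do |a_url, code, resp, body|
        puts "#{code} #{a_url} (#{body.length} bytes)"
      end

      # Numeric triggers fire only for that exact status code.
      s.on 404 do |a_url, code|
        puts "URL not found: #{a_url}"
      end
    end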
data/spec/spider_instance_spec.rb
ADDED
@@ -0,0 +1,219 @@
+require 'rubygems'
+require 'spec'
+require File.dirname(__FILE__)+'/../lib/spider'
+
+Spec::Runner.configure { |c| c.mock_with :mocha }
+
+describe 'SpiderInstance' do
+  it 'should skip URLs when allowable_url? is false' do
+    u = 'http://example.com/'
+    u_p = URI.parse(u)
+    http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
+    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    si = SpiderInstance.new([u])
+    si.expects(:allowable_url?).with(u, u_p).returns(false)
+    si.expects(:get_page).times(0)
+    si.start!
+  end
+
+  it 'should not skip URLs when allowable_url? is true' do
+    u = 'http://example.com/'
+    u_p = URI.parse(u)
+    http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
+    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    si = SpiderInstance.new([u])
+    si.expects(:allowable_url?).with(u, u_p).returns(true)
+    si.expects(:allowable_url?).with(nil, nil).returns(false)
+    si.expects(:get_page).with(URI.parse(u))
+    si.start!
+  end
+
+  it 'should disallow URLs when the robots.txt says to' do
+    robot_rules = stub
+    SpiderInstance.any_instance.expects(:open).
+      with('http://example.com:80/robots.txt', 'User-Agent' => 'Ruby Spider',
+           'Accept' => 'text/html,text/xml,application/xml,text/plain').
+      yields(stub(:read => 'robots.txt content'))
+    robot_rules.expects(:parse).with('http://example.com:80/robots.txt',
+                                     'robots.txt content')
+    robot_rules.expects(:allowed?).with('http://example.com/').returns(false)
+    si = SpiderInstance.new(['http://example.com/'], [], robot_rules, [])
+    allowable = si.allowable_url?('http://example.com/',
+                                  URI.parse('http://example.com/'))
+    allowable.should == false
+  end
+
+  it 'should disallow URLs when they fail any url_check' do
+    si = SpiderInstance.new(['http://example.com/'])
+    si.stubs(:allowed?).returns(true)
+    si.add_url_check { |a_url| false }
+    allowable = si.allowable_url?('http://example.com/',
+                                  URI.parse('http://example.com/'))
+    allowable.should == false
+  end
+
+  it 'should support multiple url_checks' do
+    @first_url_check = false
+    @second_url_check = false
+    si = SpiderInstance.new(['http://example.com/'])
+    si.stubs(:allowed?).returns(true)
+    si.add_url_check do |a_url|
+      @first_url_check = true
+      true
+    end
+    si.add_url_check do |a_url|
+      @second_url_check = true
+      false
+    end
+    allowable = si.allowable_url?('http://example.com/',
+                                  URI.parse('http://example.com/'))
+    allowable.should == false
+    @first_url_check == true
+    @second_url_check == true
+  end
+
+  it 'should support memcached'
+  it 'should avoid cycles using memcached'
+
+  it 'should support memory' do
+    si = SpiderInstance.new(['http://example.com/'])
+    si.use_cache :memory # No exn
+  end
+
+  it 'should avoid cycles using memory' do
+    u = 'http://example.com/'
+    u_p = URI.parse(u)
+    si = SpiderInstance.new([u], [u_p])
+    si.stubs(:allowed?).returns(true)
+    allowable = si.allowable_url?(u, u_p)
+    allowable.should == false
+    u_p.should_not be_nil
+  end
+
+  it 'should call the 404 handler for 404s' do
+    @proc_called = false
+    http_resp = stub(:success? => false, :code => 404)
+    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    si = SpiderInstance.new(['http://example.com/'])
+    si.stubs(:allowed?).returns(true)
+    si.stubs(:generate_next_urls).returns([])
+    si.on(404) {|*a| @proc_called = true}
+    si.start!
+    @proc_called.should == true
+  end
+
+  it 'should call the :success handler on success' do
+    @proc_called = false
+    http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
+    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    si = SpiderInstance.new(['http://example.com/'])
+    si.stubs(:allowed?).returns(true)
+    si.stubs(:generate_next_urls).returns([])
+    si.on(:success) {|*a| @proc_called = true}
+    si.start!
+    @proc_called.should == true
+  end
+
+  it 'should not call the :success handler on failure' do
+    @proc_called = false
+    http_resp = stub(:success? => false, :code => 404)
+    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    si = SpiderInstance.new(['http://example.com/'])
+    si.stubs(:allowed?).returns(true)
+    si.stubs(:generate_next_urls).returns([])
+    si.on(:success) {|*a| @proc_called = true}
+    si.start!
+    @proc_called.should == false
+  end
+
+  it 'should call the :success handler and the 200 handler on 200' do
+    @proc_200_called = false
+    @proc_success_called = false
+    http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
+    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    si = SpiderInstance.new(['http://example.com/'])
+    si.stubs(:allowed?).returns(true)
+    si.stubs(:generate_next_urls).returns([])
+    si.on(:success) {|*a| @proc_success_called = true}
+    si.on(200) {|*a| @proc_200_called = true}
+    si.start!
+    @proc_200_called.should == true
+    @proc_success_called.should == true
+  end
+
+  it 'should not call the :failure handler on success' do
+    @proc_called = false
+    http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
+    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    si = SpiderInstance.new(['http://example.com/'])
+    si.stubs(:allowed?).returns(true)
+    si.stubs(:generate_next_urls).returns([])
+    si.on(:failure) {|*a| @proc_called = true}
+    si.start!
+    @proc_called.should == false
+  end
+
+  it 'should call the :failure handler on failure' do
+    @proc_called = false
+    http_resp = stub(:success? => false, :code => 404)
+    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    si = SpiderInstance.new(['http://example.com/'])
+    si.stubs(:allowed?).returns(true)
+    si.stubs(:generate_next_urls).returns([])
+    si.on(:failure) {|*a| @proc_called = true}
+    si.start!
+    @proc_called.should == true
+  end
+
+  it 'should call the :failure handler and the 404 handler on 404' do
+    @proc_404_called = false
+    @proc_failure_called = false
+    http_resp = stub(:success? => false, :code => 404)
+    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    si = SpiderInstance.new(['http://example.com/'])
+    si.stubs(:allowed?).returns(true)
+    si.stubs(:generate_next_urls).returns([])
+    si.on(:failure) {|*a| @proc_failure_called = true}
+    si.on(404) {|*a| @proc_404_called = true}
+    si.start!
+    @proc_404_called.should == true
+    @proc_failure_called.should == true
+  end
+
+  it 'should call the :any handler even when a handler for the error code is defined' do
+    @any_called = false
+    http_resp = stub(:success? => true, :code => 200)
+    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    si = SpiderInstance.new(['http://example.com/'])
+    si.stubs(:allowed?).returns(true)
+    si.stubs(:generate_next_urls).returns([])
+    si.on(:any) { |*a| @any_called = true }
+    si.on(202) {|*a|}
+    si.start!
+    @any_called.should == true
+  end
+
+  it 'should support a block as a response handler' do
+    @proc_called = false
+    http_resp = stub(:success? => true, :code => 200)
+    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    si = SpiderInstance.new(['http://example.com/'])
+    si.stubs(:allowed?).returns(true)
+    si.stubs(:generate_next_urls).returns([])
+    si.on(:any) { |*a| @proc_called = true }
+    si.start!
+    @proc_called.should == true
+  end
+
+  it 'should support a proc as a response handler' do
+    @proc_called = false
+    http_resp = stub(:success? => true, :code => 200)
+    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    si = SpiderInstance.new(['http://example.com/'])
+    si.stubs(:allowed?).returns(true)
+    si.stubs(:generate_next_urls).returns([])
+    si.on(:any, Proc.new { |*a| @proc_called = true })
+    si.start!
+    @proc_called.should == true
+  end
+end
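The last two specs pin down that a response handler may be registered either as a trailing block or as an explicit Proc passed as the second argument to on. A hypothetical caller could mix the two forms freely, for example:

    logger = Proc.new { |a_url, resp| puts "fetched: #{a_url}" }

    Spider.start_at('http://example.com/') do |s|
      s.on(:any, logger)                                    # Proc form
      s.on(404) { |a_url, code| warn "missing: #{a_url}" }  # block form
    end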