spider 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +8 -0
- data/doc/classes/Spider.html +7 -7
- data/doc/classes/SpiderInstance.html +26 -2
- data/doc/created.rid +1 -1
- data/doc/files/lib/spider_rb.html +2 -1
- data/doc/fr_file_index.html +0 -1
- data/doc/fr_method_index.html +2 -1
- data/lib/spider.rb +15 -9
- data/spec/spider_instance_spec.rb +127 -46
- data/spider.gemspec +1 -1
- data/test_server/client.rb +4 -0
- metadata +2 -2
data/CHANGES
CHANGED
@@ -1,9 +1,17 @@
|
|
1
|
+
2007-10-23:
|
2
|
+
* URLs without a page component but with a query component.
|
3
|
+
* HTTP Redirect.
|
4
|
+
* HTTPS.
|
5
|
+
* Version 0.2.1 .
|
6
|
+
|
1
7
|
2007-10-22:
|
2
8
|
* Use RSpec to ensure that it mostly works.
|
3
9
|
* Use WEBrick to create a small test server for additional testing.
|
4
10
|
* Completely re-do the API to prepare for future expansion.
|
5
11
|
* Add the ability to apply each URL to a series of custom allowed?-like
|
6
12
|
matchers.
|
13
|
+
* BSD license.
|
14
|
+
* Version 0.2.0 .
|
7
15
|
|
8
16
|
2007-03-30:
|
9
17
|
* Clean up the documentation.
|
data/doc/classes/Spider.html
CHANGED
@@ -93,7 +93,7 @@ links, and doing it all over again.
|
|
93
93
|
<h3 class="section-bar">Methods</h3>
|
94
94
|
|
95
95
|
<div class="name-list">
|
96
|
-
<a href="#
|
96
|
+
<a href="#M000004">start_at</a>
|
97
97
|
</div>
|
98
98
|
</div>
|
99
99
|
|
@@ -115,11 +115,11 @@ links, and doing it all over again.
|
|
115
115
|
<div id="methods">
|
116
116
|
<h3 class="section-bar">Public Class methods</h3>
|
117
117
|
|
118
|
-
<div id="method-
|
119
|
-
<a name="
|
118
|
+
<div id="method-M000004" class="method-detail">
|
119
|
+
<a name="M000004"></a>
|
120
120
|
|
121
121
|
<div class="method-heading">
|
122
|
-
<a href="#
|
122
|
+
<a href="#M000004" class="method-signature">
|
123
123
|
<span class="method-name">start_at</span><span class="method-args">(a_url, &block)</span>
|
124
124
|
</a>
|
125
125
|
</div>
|
@@ -150,10 +150,10 @@ define the rules and handlers for the discovered Web pages.
|
|
150
150
|
end
|
151
151
|
</pre>
|
152
152
|
<p><a class="source-toggle" href="#"
|
153
|
-
onclick="toggleCode('
|
154
|
-
<div class="method-source-code" id="
|
153
|
+
onclick="toggleCode('M000004-source');return false;">[Source]</a></p>
|
154
|
+
<div class="method-source-code" id="M000004-source">
|
155
155
|
<pre>
|
156
|
-
<span class="ruby-comment cmt"># File lib/spider.rb, line
|
156
|
+
<span class="ruby-comment cmt"># File lib/spider.rb, line 68</span>
|
157
157
|
<span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">start_at</span>(<span class="ruby-identifier">a_url</span>, <span class="ruby-operator">&</span><span class="ruby-identifier">block</span>)
|
158
158
|
<span class="ruby-identifier">rules</span> = <span class="ruby-constant">RobotRules</span>.<span class="ruby-identifier">new</span>(<span class="ruby-value str">'Ruby Spider 1.0'</span>)
|
159
159
|
<span class="ruby-identifier">a_spider</span> = <span class="ruby-constant">SpiderInstance</span>.<span class="ruby-identifier">new</span>([<span class="ruby-identifier">a_url</span>], [], <span class="ruby-identifier">rules</span>, [])
|
@@ -88,6 +88,7 @@
|
|
88
88
|
<div class="name-list">
|
89
89
|
<a href="#M000001">add_url_check</a>
|
90
90
|
<a href="#M000002">on</a>
|
91
|
+
<a href="#M000003">remove_trailing_slash</a>
|
91
92
|
</div>
|
92
93
|
</div>
|
93
94
|
|
@@ -135,7 +136,7 @@ href="http://mike-burns.com">mike-burns.com</a>’:
|
|
135
136
|
onclick="toggleCode('M000001-source');return false;">[Source]</a></p>
|
136
137
|
<div class="method-source-code" id="M000001-source">
|
137
138
|
<pre>
|
138
|
-
<span class="ruby-comment cmt"># File lib/spider.rb, line
|
139
|
+
<span class="ruby-comment cmt"># File lib/spider.rb, line 94</span>
|
139
140
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_url_check</span>(<span class="ruby-operator">&</span><span class="ruby-identifier">block</span>)
|
140
141
|
<span class="ruby-ivar">@url_checks</span> <span class="ruby-operator"><<</span> <span class="ruby-identifier">block</span>
|
141
142
|
<span class="ruby-keyword kw">end</span>
|
@@ -195,7 +196,7 @@ For example:
|
|
195
196
|
onclick="toggleCode('M000002-source');return false;">[Source]</a></p>
|
196
197
|
<div class="method-source-code" id="M000002-source">
|
197
198
|
<pre>
|
198
|
-
<span class="ruby-comment cmt"># File lib/spider.rb, line
|
199
|
+
<span class="ruby-comment cmt"># File lib/spider.rb, line 131</span>
|
199
200
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">on</span>(<span class="ruby-identifier">code</span>, <span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&</span><span class="ruby-identifier">block</span>)
|
200
201
|
<span class="ruby-identifier">f</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
|
201
202
|
<span class="ruby-keyword kw">case</span> <span class="ruby-identifier">code</span>
|
@@ -214,6 +215,29 @@ For example:
|
|
214
215
|
</div>
|
215
216
|
</div>
|
216
217
|
|
218
|
+
<div id="method-M000003" class="method-detail">
|
219
|
+
<a name="M000003"></a>
|
220
|
+
|
221
|
+
<div class="method-heading">
|
222
|
+
<a href="#M000003" class="method-signature">
|
223
|
+
<span class="method-name">remove_trailing_slash</span><span class="method-args">(s)</span>
|
224
|
+
</a>
|
225
|
+
</div>
|
226
|
+
|
227
|
+
<div class="method-description">
|
228
|
+
<p><a class="source-toggle" href="#"
|
229
|
+
onclick="toggleCode('M000003-source');return false;">[Source]</a></p>
|
230
|
+
<div class="method-source-code" id="M000003-source">
|
231
|
+
<pre>
|
232
|
+
<span class="ruby-comment cmt"># File lib/spider.rb, line 257</span>
|
233
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">remove_trailing_slash</span>(<span class="ruby-identifier">s</span>)
|
234
|
+
<span class="ruby-identifier">s</span>.<span class="ruby-identifier">sub</span>(<span class="ruby-regexp re">%r{/*$}</span>,<span class="ruby-value str">''</span>)
|
235
|
+
<span class="ruby-keyword kw">end</span>
|
236
|
+
</pre>
|
237
|
+
</div>
|
238
|
+
</div>
|
239
|
+
</div>
|
240
|
+
|
217
241
|
|
218
242
|
</div>
|
219
243
|
|
data/doc/created.rid
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
Tue, 23 Oct 2007 23:14:46 -0400
|
@@ -56,7 +56,7 @@
|
|
56
56
|
</tr>
|
57
57
|
<tr class="top-aligned-row">
|
58
58
|
<td><strong>Last Update:</strong></td>
|
59
|
-
<td>
|
59
|
+
<td>Tue Oct 23 23:11:42 -0400 2007</td>
|
60
60
|
</tr>
|
61
61
|
</table>
|
62
62
|
</div>
|
@@ -125,6 +125,7 @@ href="http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589">blade.
|
|
125
125
|
open-uri
|
126
126
|
uri
|
127
127
|
net/http
|
128
|
+
net/https
|
128
129
|
</div>
|
129
130
|
</div>
|
130
131
|
|
data/doc/fr_file_index.html
CHANGED
data/doc/fr_method_index.html
CHANGED
@@ -22,7 +22,8 @@
|
|
22
22
|
<div id="index-entries">
|
23
23
|
<a href="classes/SpiderInstance.html#M000001">add_url_check (SpiderInstance)</a><br />
|
24
24
|
<a href="classes/SpiderInstance.html#M000002">on (SpiderInstance)</a><br />
|
25
|
-
<a href="classes/
|
25
|
+
<a href="classes/SpiderInstance.html#M000003">remove_trailing_slash (SpiderInstance)</a><br />
|
26
|
+
<a href="classes/Spider.html#M000004">start_at (Spider)</a><br />
|
26
27
|
</div>
|
27
28
|
</div>
|
28
29
|
</body>
|
data/lib/spider.rb
CHANGED
@@ -27,13 +27,18 @@ require 'robot_rules'
|
|
27
27
|
require 'open-uri'
|
28
28
|
require 'uri'
|
29
29
|
require 'net/http'
|
30
|
+
require 'net/https'
|
30
31
|
|
31
32
|
class Net::HTTPResponse #:nodoc:
|
32
33
|
def success?; false; end
|
34
|
+
def redirect?; false; end
|
33
35
|
end
|
34
36
|
class Net::HTTPSuccess #:nodoc:
|
35
37
|
def success?; true; end
|
36
38
|
end
|
39
|
+
class Net::HTTPRedirection #:nodoc:
|
40
|
+
def redirect?; true; end
|
41
|
+
end
|
37
42
|
|
38
43
|
# A spidering library for Ruby. Handles robots.txt, scraping, finding more
|
39
44
|
# links, and doing it all over again.
|
@@ -188,15 +193,16 @@ class SpiderInstance
|
|
188
193
|
def get_page(parsed_url, &block) #:nodoc:
|
189
194
|
@seen << parsed_url
|
190
195
|
begin
|
191
|
-
Net::HTTP.
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
196
|
+
http = Net::HTTP.new(parsed_url.host, parsed_url.port)
|
197
|
+
http.use_ssl = parsed_url.scheme == 'https'
|
198
|
+
# Uses start because http.finish cannot be called.
|
199
|
+
r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri))}
|
200
|
+
if r.redirect?
|
201
|
+
get_page(URI.parse(r['Location']), &block)
|
202
|
+
else
|
203
|
+
block.call(r)
|
198
204
|
end
|
199
|
-
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError
|
205
|
+
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError => e
|
200
206
|
p e
|
201
207
|
nil
|
202
208
|
end
|
@@ -207,7 +213,7 @@ class SpiderInstance
|
|
207
213
|
if resp.success?
|
208
214
|
cb_branch = @callbacks[:success]
|
209
215
|
cb_branch[:any].call(a_url, resp.code, resp, resp.body) if cb_branch[:any]
|
210
|
-
cb_branch[resp.code].call(a_url, resp.code, resp
|
216
|
+
cb_branch[resp.code].call(a_url, resp.code, resp, resp.body) if cb_branch[resp.code]
|
211
217
|
else
|
212
218
|
cb_branch = @callbacks[:failure]
|
213
219
|
cb_branch[:any].call(a_url, resp.code) if cb_branch[:any]
|
@@ -1,15 +1,84 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'spec'
|
3
|
+
require 'webrick'
|
4
|
+
require 'webrick/https'
|
3
5
|
require File.dirname(__FILE__)+'/../lib/spider'
|
4
6
|
|
5
7
|
Spec::Runner.configure { |c| c.mock_with :mocha }
|
6
8
|
|
9
|
+
class QueryServlet < WEBrick::HTTPServlet::AbstractServlet
|
10
|
+
def do_GET(req, res)
|
11
|
+
res['Content-type'] = 'text/plain'
|
12
|
+
res.body = "response\n"
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
def null_logger
|
17
|
+
l = stub
|
18
|
+
[:log, :fatal, :error, :warn , :info, :debug].each do |k|
|
19
|
+
l.stubs(k)
|
20
|
+
l.stubs("#{k}?".to_sym)
|
21
|
+
end
|
22
|
+
l
|
23
|
+
end
|
24
|
+
|
7
25
|
describe 'SpiderInstance' do
|
26
|
+
# Bug reported by John Nagro, using the example source http://eons.com/
|
27
|
+
# had to change line 192; uses request_uri now instead of path.
|
28
|
+
it 'should handle query URLs without a path' do
|
29
|
+
u = 'http://localhost:8888?s=1'
|
30
|
+
u_p = URI.parse(u)
|
31
|
+
@block_called = false
|
32
|
+
server = WEBrick::HTTPServer.new(:Port => 8888, :Logger => null_logger,
|
33
|
+
:AccessLog => [])
|
34
|
+
server.mount('/', QueryServlet)
|
35
|
+
Thread.new {server.start}
|
36
|
+
si = SpiderInstance.new([u])
|
37
|
+
si.get_page(u_p) do
|
38
|
+
@block_called = true
|
39
|
+
end
|
40
|
+
server.shutdown
|
41
|
+
@block_called.should be_true
|
42
|
+
end
|
43
|
+
|
44
|
+
# This solves a problem reported by John Nagro.
|
45
|
+
it 'should handle redirects' do
|
46
|
+
u = 'http://example.com/'
|
47
|
+
u_p = URI.parse(u)
|
48
|
+
@redirect_handled = false
|
49
|
+
mock_redirect_http
|
50
|
+
si = SpiderInstance.new([u])
|
51
|
+
si.get_page(u_p) do
|
52
|
+
@redirect_handled = true
|
53
|
+
end
|
54
|
+
@redirect_handled.should be_true
|
55
|
+
end
|
56
|
+
|
57
|
+
it 'should handle HTTPS' do
|
58
|
+
u = 'https://localhost:10443/'
|
59
|
+
u_p = URI.parse(u)
|
60
|
+
@page_called = false
|
61
|
+
server = WEBrick::HTTPServer.new(:Port => 10443,
|
62
|
+
:Logger => null_logger,
|
63
|
+
:AccessLog => [],
|
64
|
+
:SSLEnable => true,
|
65
|
+
:SSLCertName => [["O", "ruby-lang.org"], ["OU", "sample"], ["CN", WEBrick::Utils::getservername]],
|
66
|
+
:SSLComment => 'Comment of some sort')
|
67
|
+
server.mount('/', QueryServlet)
|
68
|
+
Thread.new {server.start}
|
69
|
+
si = SpiderInstance.new([u])
|
70
|
+
si.get_page(u_p) { @page_called = true }
|
71
|
+
server.shutdown
|
72
|
+
@page_called.should be_true
|
73
|
+
end
|
74
|
+
|
75
|
+
it 'should maintain the entire graph within some external object (or memory, or memcached)'
|
76
|
+
|
8
77
|
it 'should skip URLs when allowable_url? is false' do
|
9
78
|
u = 'http://example.com/'
|
10
79
|
u_p = URI.parse(u)
|
11
|
-
http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
|
12
|
-
Net::HTTP.stubs(:
|
80
|
+
http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
|
81
|
+
Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
|
13
82
|
si = SpiderInstance.new([u])
|
14
83
|
si.expects(:allowable_url?).with(u, u_p).returns(false)
|
15
84
|
si.expects(:get_page).times(0)
|
@@ -19,8 +88,8 @@ describe 'SpiderInstance' do
|
|
19
88
|
it 'should not skip URLs when allowable_url? is true' do
|
20
89
|
u = 'http://example.com/'
|
21
90
|
u_p = URI.parse(u)
|
22
|
-
http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
|
23
|
-
Net::HTTP.stubs(:
|
91
|
+
http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
|
92
|
+
Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
|
24
93
|
si = SpiderInstance.new([u])
|
25
94
|
si.expects(:allowable_url?).with(u, u_p).returns(true)
|
26
95
|
si.expects(:allowable_url?).with(nil, nil).returns(false)
|
@@ -68,19 +137,11 @@ describe 'SpiderInstance' do
|
|
68
137
|
allowable = si.allowable_url?('http://example.com/',
|
69
138
|
URI.parse('http://example.com/'))
|
70
139
|
allowable.should == false
|
71
|
-
@first_url_check
|
72
|
-
@second_url_check
|
73
|
-
end
|
74
|
-
|
75
|
-
it 'should support memcached'
|
76
|
-
it 'should avoid cycles using memcached'
|
77
|
-
|
78
|
-
it 'should support memory' do
|
79
|
-
si = SpiderInstance.new(['http://example.com/'])
|
80
|
-
si.use_cache :memory # No exn
|
140
|
+
@first_url_check.should be_true
|
141
|
+
@second_url_check.should be_true
|
81
142
|
end
|
82
143
|
|
83
|
-
it 'should avoid cycles
|
144
|
+
it 'should avoid cycles' do
|
84
145
|
u = 'http://example.com/'
|
85
146
|
u_p = URI.parse(u)
|
86
147
|
si = SpiderInstance.new([u], [u_p])
|
@@ -92,32 +153,29 @@ describe 'SpiderInstance' do
|
|
92
153
|
|
93
154
|
it 'should call the 404 handler for 404s' do
|
94
155
|
@proc_called = false
|
95
|
-
|
96
|
-
Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
|
156
|
+
mock_failed_http
|
97
157
|
si = SpiderInstance.new(['http://example.com/'])
|
98
158
|
si.stubs(:allowed?).returns(true)
|
99
159
|
si.stubs(:generate_next_urls).returns([])
|
100
160
|
si.on(404) {|*a| @proc_called = true}
|
101
161
|
si.start!
|
102
|
-
@proc_called.should
|
162
|
+
@proc_called.should be_true
|
103
163
|
end
|
104
164
|
|
105
165
|
it 'should call the :success handler on success' do
|
106
166
|
@proc_called = false
|
107
|
-
|
108
|
-
Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
|
167
|
+
mock_successful_http
|
109
168
|
si = SpiderInstance.new(['http://example.com/'])
|
110
169
|
si.stubs(:allowed?).returns(true)
|
111
170
|
si.stubs(:generate_next_urls).returns([])
|
112
171
|
si.on(:success) {|*a| @proc_called = true}
|
113
172
|
si.start!
|
114
|
-
@proc_called.should
|
173
|
+
@proc_called.should be_true
|
115
174
|
end
|
116
175
|
|
117
176
|
it 'should not call the :success handler on failure' do
|
118
177
|
@proc_called = false
|
119
|
-
|
120
|
-
Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
|
178
|
+
mock_failed_http
|
121
179
|
si = SpiderInstance.new(['http://example.com/'])
|
122
180
|
si.stubs(:allowed?).returns(true)
|
123
181
|
si.stubs(:generate_next_urls).returns([])
|
@@ -129,22 +187,20 @@ describe 'SpiderInstance' do
|
|
129
187
|
it 'should call the :success handler and the 200 handler on 200' do
|
130
188
|
@proc_200_called = false
|
131
189
|
@proc_success_called = false
|
132
|
-
|
133
|
-
Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
|
190
|
+
mock_successful_http
|
134
191
|
si = SpiderInstance.new(['http://example.com/'])
|
135
192
|
si.stubs(:allowed?).returns(true)
|
136
193
|
si.stubs(:generate_next_urls).returns([])
|
137
194
|
si.on(:success) {|*a| @proc_success_called = true}
|
138
|
-
si.on(200)
|
195
|
+
si.on(200) {|*a| @proc_200_called = true}
|
139
196
|
si.start!
|
140
|
-
@proc_200_called.should
|
141
|
-
@proc_success_called.should
|
197
|
+
@proc_200_called.should be_true
|
198
|
+
@proc_success_called.should be_true
|
142
199
|
end
|
143
200
|
|
144
201
|
it 'should not call the :failure handler on success' do
|
145
202
|
@proc_called = false
|
146
|
-
|
147
|
-
Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
|
203
|
+
mock_successful_http
|
148
204
|
si = SpiderInstance.new(['http://example.com/'])
|
149
205
|
si.stubs(:allowed?).returns(true)
|
150
206
|
si.stubs(:generate_next_urls).returns([])
|
@@ -155,65 +211,90 @@ describe 'SpiderInstance' do
|
|
155
211
|
|
156
212
|
it 'should call the :failure handler on failure' do
|
157
213
|
@proc_called = false
|
158
|
-
|
159
|
-
Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
|
214
|
+
mock_failed_http
|
160
215
|
si = SpiderInstance.new(['http://example.com/'])
|
161
216
|
si.stubs(:allowed?).returns(true)
|
162
217
|
si.stubs(:generate_next_urls).returns([])
|
163
218
|
si.on(:failure) {|*a| @proc_called = true}
|
164
219
|
si.start!
|
165
|
-
@proc_called.should
|
220
|
+
@proc_called.should be_true
|
166
221
|
end
|
167
222
|
|
168
223
|
it 'should call the :failure handler and the 404 handler on 404' do
|
169
224
|
@proc_404_called = false
|
170
225
|
@proc_failure_called = false
|
171
|
-
|
172
|
-
Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
|
226
|
+
mock_failed_http
|
173
227
|
si = SpiderInstance.new(['http://example.com/'])
|
174
228
|
si.stubs(:allowed?).returns(true)
|
175
229
|
si.stubs(:generate_next_urls).returns([])
|
176
230
|
si.on(:failure) {|*a| @proc_failure_called = true}
|
177
231
|
si.on(404) {|*a| @proc_404_called = true}
|
178
232
|
si.start!
|
179
|
-
@proc_404_called.should
|
180
|
-
@proc_failure_called.should
|
233
|
+
@proc_404_called.should be_true
|
234
|
+
@proc_failure_called.should be_true
|
181
235
|
end
|
182
236
|
|
183
237
|
it 'should call the :any handler even when a handler for the error code is defined' do
|
184
238
|
@any_called = false
|
185
|
-
|
186
|
-
Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
|
239
|
+
mock_successful_http
|
187
240
|
si = SpiderInstance.new(['http://example.com/'])
|
188
241
|
si.stubs(:allowed?).returns(true)
|
189
242
|
si.stubs(:generate_next_urls).returns([])
|
190
243
|
si.on(:any) { |*a| @any_called = true }
|
191
244
|
si.on(202) {|*a|}
|
192
245
|
si.start!
|
193
|
-
@any_called.should
|
246
|
+
@any_called.should be_true
|
194
247
|
end
|
195
248
|
|
196
249
|
it 'should support a block as a response handler' do
|
197
250
|
@proc_called = false
|
198
|
-
|
199
|
-
Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
|
251
|
+
mock_successful_http
|
200
252
|
si = SpiderInstance.new(['http://example.com/'])
|
201
253
|
si.stubs(:allowed?).returns(true)
|
202
254
|
si.stubs(:generate_next_urls).returns([])
|
203
255
|
si.on(:any) { |*a| @proc_called = true }
|
204
256
|
si.start!
|
205
|
-
@proc_called.should
|
257
|
+
@proc_called.should be_true
|
206
258
|
end
|
207
259
|
|
208
260
|
it 'should support a proc as a response handler' do
|
209
261
|
@proc_called = false
|
210
|
-
|
211
|
-
Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
|
262
|
+
mock_successful_http
|
212
263
|
si = SpiderInstance.new(['http://example.com/'])
|
213
264
|
si.stubs(:allowed?).returns(true)
|
214
265
|
si.stubs(:generate_next_urls).returns([])
|
215
266
|
si.on(:any, Proc.new { |*a| @proc_called = true })
|
216
267
|
si.start!
|
217
|
-
@proc_called.should
|
268
|
+
@proc_called.should be_true
|
269
|
+
end
|
270
|
+
|
271
|
+
def mock_http(http_req)
|
272
|
+
http_obj = mock(:use_ssl= => true)
|
273
|
+
http_obj.expects(:start).
|
274
|
+
yields(mock(:request => http_req)).returns(http_req)
|
275
|
+
Net::HTTP.expects(:new).returns(http_obj)
|
276
|
+
end
|
277
|
+
|
278
|
+
def mock_successful_http
|
279
|
+
http_req = stub(:redirect? => false, :success? => true, :code => 200, :body => 'body')
|
280
|
+
mock_http(http_req)
|
281
|
+
end
|
282
|
+
|
283
|
+
def mock_failed_http
|
284
|
+
http_req = stub(:redirect? => false, :success? => false, :code => 404)
|
285
|
+
mock_http(http_req)
|
286
|
+
end
|
287
|
+
|
288
|
+
def mock_redirect_http
|
289
|
+
http_req = stub(:redirect? => true, :success? => false, :code => 404)
|
290
|
+
http_req.expects(:[]).with('Location').returns('http://example.com/')
|
291
|
+
http_req2 = stub(:redirect? => false, :success? => true, :code => 200)
|
292
|
+
http_obj = mock(:use_ssl= => true)
|
293
|
+
http_obj.expects(:start).
|
294
|
+
yields(mock(:request => http_req)).returns(http_req)
|
295
|
+
http_obj2 = mock(:use_ssl= => true)
|
296
|
+
http_obj2.expects(:start).
|
297
|
+
yields(mock(:request => http_req2)).returns(http_req2)
|
298
|
+
Net::HTTP.expects(:new).times(2).returns(http_obj).then.returns(http_obj2)
|
218
299
|
end
|
219
300
|
end
|
data/spider.gemspec
CHANGED
data/test_server/client.rb
CHANGED
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.4
|
|
3
3
|
specification_version: 1
|
4
4
|
name: spider
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.2.
|
7
|
-
date: 2007-10-
|
6
|
+
version: 0.2.1
|
7
|
+
date: 2007-10-23 00:00:00 -04:00
|
8
8
|
summary: A Web spidering library
|
9
9
|
require_paths:
|
10
10
|
- lib
|