spider 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGES +8 -0
- data/doc/classes/Spider.html +7 -7
- data/doc/classes/SpiderInstance.html +26 -2
- data/doc/created.rid +1 -1
- data/doc/files/lib/spider_rb.html +2 -1
- data/doc/fr_file_index.html +0 -1
- data/doc/fr_method_index.html +2 -1
- data/lib/spider.rb +15 -9
- data/spec/spider_instance_spec.rb +127 -46
- data/spider.gemspec +1 -1
- data/test_server/client.rb +4 -0
- metadata +2 -2
data/CHANGES
CHANGED
@@ -1,9 +1,17 @@
|
|
1
|
+
2007-10-23:
|
2
|
+
* URLs without a page component but with a query component.
|
3
|
+
* HTTP Redirect.
|
4
|
+
* HTTPS.
|
5
|
+
* Version 0.2.1 .
|
6
|
+
|
1
7
|
2007-10-22:
|
2
8
|
* Use RSpec to ensure that it mostly works.
|
3
9
|
* Use WEBrick to create a small test server for additional testing.
|
4
10
|
* Completely re-do the API to prepare for future expansion.
|
5
11
|
* Add the ability to apply each URL to a series of custom allowed?-like
|
6
12
|
matchers.
|
13
|
+
* BSD license.
|
14
|
+
* Version 0.2.0 .
|
7
15
|
|
8
16
|
2007-03-30:
|
9
17
|
* Clean up the documentation.
|
data/doc/classes/Spider.html
CHANGED
@@ -93,7 +93,7 @@ links, and doing it all over again.
|
|
93
93
|
<h3 class="section-bar">Methods</h3>
|
94
94
|
|
95
95
|
<div class="name-list">
|
96
|
-
<a href="#
|
96
|
+
<a href="#M000004">start_at</a>
|
97
97
|
</div>
|
98
98
|
</div>
|
99
99
|
|
@@ -115,11 +115,11 @@ links, and doing it all over again.
|
|
115
115
|
<div id="methods">
|
116
116
|
<h3 class="section-bar">Public Class methods</h3>
|
117
117
|
|
118
|
-
<div id="method-
|
119
|
-
<a name="
|
118
|
+
<div id="method-M000004" class="method-detail">
|
119
|
+
<a name="M000004"></a>
|
120
120
|
|
121
121
|
<div class="method-heading">
|
122
|
-
<a href="#
|
122
|
+
<a href="#M000004" class="method-signature">
|
123
123
|
<span class="method-name">start_at</span><span class="method-args">(a_url, &block)</span>
|
124
124
|
</a>
|
125
125
|
</div>
|
@@ -150,10 +150,10 @@ define the rules and handlers for the discovered Web pages.
|
|
150
150
|
end
|
151
151
|
</pre>
|
152
152
|
<p><a class="source-toggle" href="#"
|
153
|
-
onclick="toggleCode('
|
154
|
-
<div class="method-source-code" id="
|
153
|
+
onclick="toggleCode('M000004-source');return false;">[Source]</a></p>
|
154
|
+
<div class="method-source-code" id="M000004-source">
|
155
155
|
<pre>
|
156
|
-
<span class="ruby-comment cmt"># File lib/spider.rb, line
|
156
|
+
<span class="ruby-comment cmt"># File lib/spider.rb, line 68</span>
|
157
157
|
<span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">start_at</span>(<span class="ruby-identifier">a_url</span>, <span class="ruby-operator">&</span><span class="ruby-identifier">block</span>)
|
158
158
|
<span class="ruby-identifier">rules</span> = <span class="ruby-constant">RobotRules</span>.<span class="ruby-identifier">new</span>(<span class="ruby-value str">'Ruby Spider 1.0'</span>)
|
159
159
|
<span class="ruby-identifier">a_spider</span> = <span class="ruby-constant">SpiderInstance</span>.<span class="ruby-identifier">new</span>([<span class="ruby-identifier">a_url</span>], [], <span class="ruby-identifier">rules</span>, [])
|
@@ -88,6 +88,7 @@
|
|
88
88
|
<div class="name-list">
|
89
89
|
<a href="#M000001">add_url_check</a>
|
90
90
|
<a href="#M000002">on</a>
|
91
|
+
<a href="#M000003">remove_trailing_slash</a>
|
91
92
|
</div>
|
92
93
|
</div>
|
93
94
|
|
@@ -135,7 +136,7 @@ href="http://mike-burns.com">mike-burns.com</a>’:
|
|
135
136
|
onclick="toggleCode('M000001-source');return false;">[Source]</a></p>
|
136
137
|
<div class="method-source-code" id="M000001-source">
|
137
138
|
<pre>
|
138
|
-
<span class="ruby-comment cmt"># File lib/spider.rb, line
|
139
|
+
<span class="ruby-comment cmt"># File lib/spider.rb, line 94</span>
|
139
140
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_url_check</span>(<span class="ruby-operator">&</span><span class="ruby-identifier">block</span>)
|
140
141
|
<span class="ruby-ivar">@url_checks</span> <span class="ruby-operator"><<</span> <span class="ruby-identifier">block</span>
|
141
142
|
<span class="ruby-keyword kw">end</span>
|
@@ -195,7 +196,7 @@ For example:
|
|
195
196
|
onclick="toggleCode('M000002-source');return false;">[Source]</a></p>
|
196
197
|
<div class="method-source-code" id="M000002-source">
|
197
198
|
<pre>
|
198
|
-
<span class="ruby-comment cmt"># File lib/spider.rb, line
|
199
|
+
<span class="ruby-comment cmt"># File lib/spider.rb, line 131</span>
|
199
200
|
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">on</span>(<span class="ruby-identifier">code</span>, <span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&</span><span class="ruby-identifier">block</span>)
|
200
201
|
<span class="ruby-identifier">f</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
|
201
202
|
<span class="ruby-keyword kw">case</span> <span class="ruby-identifier">code</span>
|
@@ -214,6 +215,29 @@ For example:
|
|
214
215
|
</div>
|
215
216
|
</div>
|
216
217
|
|
218
|
+
<div id="method-M000003" class="method-detail">
|
219
|
+
<a name="M000003"></a>
|
220
|
+
|
221
|
+
<div class="method-heading">
|
222
|
+
<a href="#M000003" class="method-signature">
|
223
|
+
<span class="method-name">remove_trailing_slash</span><span class="method-args">(s)</span>
|
224
|
+
</a>
|
225
|
+
</div>
|
226
|
+
|
227
|
+
<div class="method-description">
|
228
|
+
<p><a class="source-toggle" href="#"
|
229
|
+
onclick="toggleCode('M000003-source');return false;">[Source]</a></p>
|
230
|
+
<div class="method-source-code" id="M000003-source">
|
231
|
+
<pre>
|
232
|
+
<span class="ruby-comment cmt"># File lib/spider.rb, line 257</span>
|
233
|
+
<span class="ruby-keyword kw">def</span> <span class="ruby-identifier">remove_trailing_slash</span>(<span class="ruby-identifier">s</span>)
|
234
|
+
<span class="ruby-identifier">s</span>.<span class="ruby-identifier">sub</span>(<span class="ruby-regexp re">%r{/*$}</span>,<span class="ruby-value str">''</span>)
|
235
|
+
<span class="ruby-keyword kw">end</span>
|
236
|
+
</pre>
|
237
|
+
</div>
|
238
|
+
</div>
|
239
|
+
</div>
|
240
|
+
|
217
241
|
|
218
242
|
</div>
|
219
243
|
|
data/doc/created.rid
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
Tue, 23 Oct 2007 23:14:46 -0400
|
@@ -56,7 +56,7 @@
|
|
56
56
|
</tr>
|
57
57
|
<tr class="top-aligned-row">
|
58
58
|
<td><strong>Last Update:</strong></td>
|
59
|
-
<td>
|
59
|
+
<td>Tue Oct 23 23:11:42 -0400 2007</td>
|
60
60
|
</tr>
|
61
61
|
</table>
|
62
62
|
</div>
|
@@ -125,6 +125,7 @@ href="http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589">blade.
|
|
125
125
|
open-uri
|
126
126
|
uri
|
127
127
|
net/http
|
128
|
+
net/https
|
128
129
|
</div>
|
129
130
|
</div>
|
130
131
|
|
data/doc/fr_file_index.html
CHANGED
data/doc/fr_method_index.html
CHANGED
@@ -22,7 +22,8 @@
|
|
22
22
|
<div id="index-entries">
|
23
23
|
<a href="classes/SpiderInstance.html#M000001">add_url_check (SpiderInstance)</a><br />
|
24
24
|
<a href="classes/SpiderInstance.html#M000002">on (SpiderInstance)</a><br />
|
25
|
-
<a href="classes/
|
25
|
+
<a href="classes/SpiderInstance.html#M000003">remove_trailing_slash (SpiderInstance)</a><br />
|
26
|
+
<a href="classes/Spider.html#M000004">start_at (Spider)</a><br />
|
26
27
|
</div>
|
27
28
|
</div>
|
28
29
|
</body>
|
data/lib/spider.rb
CHANGED
@@ -27,13 +27,18 @@ require 'robot_rules'
|
|
27
27
|
require 'open-uri'
|
28
28
|
require 'uri'
|
29
29
|
require 'net/http'
|
30
|
+
require 'net/https'
|
30
31
|
|
31
32
|
class Net::HTTPResponse #:nodoc:
|
32
33
|
def success?; false; end
|
34
|
+
def redirect?; false; end
|
33
35
|
end
|
34
36
|
class Net::HTTPSuccess #:nodoc:
|
35
37
|
def success?; true; end
|
36
38
|
end
|
39
|
+
class Net::HTTPRedirection #:nodoc:
|
40
|
+
def redirect?; true; end
|
41
|
+
end
|
37
42
|
|
38
43
|
# A spidering library for Ruby. Handles robots.txt, scraping, finding more
|
39
44
|
# links, and doing it all over again.
|
@@ -188,15 +193,16 @@ class SpiderInstance
|
|
188
193
|
def get_page(parsed_url, &block) #:nodoc:
|
189
194
|
@seen << parsed_url
|
190
195
|
begin
|
191
|
-
Net::HTTP.
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
196
|
+
http = Net::HTTP.new(parsed_url.host, parsed_url.port)
|
197
|
+
http.use_ssl = parsed_url.scheme == 'https'
|
198
|
+
# Uses start because http.finish cannot be called.
|
199
|
+
r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri))}
|
200
|
+
if r.redirect?
|
201
|
+
get_page(URI.parse(r['Location']), &block)
|
202
|
+
else
|
203
|
+
block.call(r)
|
198
204
|
end
|
199
|
-
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError
|
205
|
+
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError => e
|
200
206
|
p e
|
201
207
|
nil
|
202
208
|
end
|
@@ -207,7 +213,7 @@ class SpiderInstance
|
|
207
213
|
if resp.success?
|
208
214
|
cb_branch = @callbacks[:success]
|
209
215
|
cb_branch[:any].call(a_url, resp.code, resp, resp.body) if cb_branch[:any]
|
210
|
-
cb_branch[resp.code].call(a_url, resp.code, resp
|
216
|
+
cb_branch[resp.code].call(a_url, resp.code, resp, resp.body) if cb_branch[resp.code]
|
211
217
|
else
|
212
218
|
cb_branch = @callbacks[:failure]
|
213
219
|
cb_branch[:any].call(a_url, resp.code) if cb_branch[:any]
|
@@ -1,15 +1,84 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'spec'
|
3
|
+
require 'webrick'
|
4
|
+
require 'webrick/https'
|
3
5
|
require File.dirname(__FILE__)+'/../lib/spider'
|
4
6
|
|
5
7
|
Spec::Runner.configure { |c| c.mock_with :mocha }
|
6
8
|
|
9
|
+
class QueryServlet < WEBrick::HTTPServlet::AbstractServlet
|
10
|
+
def do_GET(req, res)
|
11
|
+
res['Content-type'] = 'text/plain'
|
12
|
+
res.body = "response\n"
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
def null_logger
|
17
|
+
l = stub
|
18
|
+
[:log, :fatal, :error, :warn , :info, :debug].each do |k|
|
19
|
+
l.stubs(k)
|
20
|
+
l.stubs("#{k}?".to_sym)
|
21
|
+
end
|
22
|
+
l
|
23
|
+
end
|
24
|
+
|
7
25
|
describe 'SpiderInstance' do
|
26
|
+
# Bug reported by John Nagro, using the example source http://eons.com/
|
27
|
+
# had to change line 192; uses request_uri now instead of path.
|
28
|
+
it 'should handle query URLs without a path' do
|
29
|
+
u = 'http://localhost:8888?s=1'
|
30
|
+
u_p = URI.parse(u)
|
31
|
+
@block_called = false
|
32
|
+
server = WEBrick::HTTPServer.new(:Port => 8888, :Logger => null_logger,
|
33
|
+
:AccessLog => [])
|
34
|
+
server.mount('/', QueryServlet)
|
35
|
+
Thread.new {server.start}
|
36
|
+
si = SpiderInstance.new([u])
|
37
|
+
si.get_page(u_p) do
|
38
|
+
@block_called = true
|
39
|
+
end
|
40
|
+
server.shutdown
|
41
|
+
@block_called.should be_true
|
42
|
+
end
|
43
|
+
|
44
|
+
# This solves a problem reported by John Nagro.
|
45
|
+
it 'should handle redirects' do
|
46
|
+
u = 'http://example.com/'
|
47
|
+
u_p = URI.parse(u)
|
48
|
+
@redirect_handled = false
|
49
|
+
mock_redirect_http
|
50
|
+
si = SpiderInstance.new([u])
|
51
|
+
si.get_page(u_p) do
|
52
|
+
@redirect_handled = true
|
53
|
+
end
|
54
|
+
@redirect_handled.should be_true
|
55
|
+
end
|
56
|
+
|
57
|
+
it 'should handle HTTPS' do
|
58
|
+
u = 'https://localhost:10443/'
|
59
|
+
u_p = URI.parse(u)
|
60
|
+
@page_called = false
|
61
|
+
server = WEBrick::HTTPServer.new(:Port => 10443,
|
62
|
+
:Logger => null_logger,
|
63
|
+
:AccessLog => [],
|
64
|
+
:SSLEnable => true,
|
65
|
+
:SSLCertName => [["O", "ruby-lang.org"], ["OU", "sample"], ["CN", WEBrick::Utils::getservername]],
|
66
|
+
:SSLComment => 'Comment of some sort')
|
67
|
+
server.mount('/', QueryServlet)
|
68
|
+
Thread.new {server.start}
|
69
|
+
si = SpiderInstance.new([u])
|
70
|
+
si.get_page(u_p) { @page_called = true }
|
71
|
+
server.shutdown
|
72
|
+
@page_called.should be_true
|
73
|
+
end
|
74
|
+
|
75
|
+
it 'should maintain the entire graph within some external object (or memory, or memcached)'
|
76
|
+
|
8
77
|
it 'should skip URLs when allowable_url? is false' do
|
9
78
|
u = 'http://example.com/'
|
10
79
|
u_p = URI.parse(u)
|
11
|
-
http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
|
12
|
-
Net::HTTP.stubs(:
|
80
|
+
http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
|
81
|
+
Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
|
13
82
|
si = SpiderInstance.new([u])
|
14
83
|
si.expects(:allowable_url?).with(u, u_p).returns(false)
|
15
84
|
si.expects(:get_page).times(0)
|
@@ -19,8 +88,8 @@ describe 'SpiderInstance' do
|
|
19
88
|
it 'should not skip URLs when allowable_url? is true' do
|
20
89
|
u = 'http://example.com/'
|
21
90
|
u_p = URI.parse(u)
|
22
|
-
http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
|
23
|
-
Net::HTTP.stubs(:
|
91
|
+
http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
|
92
|
+
Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
|
24
93
|
si = SpiderInstance.new([u])
|
25
94
|
si.expects(:allowable_url?).with(u, u_p).returns(true)
|
26
95
|
si.expects(:allowable_url?).with(nil, nil).returns(false)
|
@@ -68,19 +137,11 @@ describe 'SpiderInstance' do
|
|
68
137
|
allowable = si.allowable_url?('http://example.com/',
|
69
138
|
URI.parse('http://example.com/'))
|
70
139
|
allowable.should == false
|
71
|
-
@first_url_check
|
72
|
-
@second_url_check
|
73
|
-
end
|
74
|
-
|
75
|
-
it 'should support memcached'
|
76
|
-
it 'should avoid cycles using memcached'
|
77
|
-
|
78
|
-
it 'should support memory' do
|
79
|
-
si = SpiderInstance.new(['http://example.com/'])
|
80
|
-
si.use_cache :memory # No exn
|
140
|
+
@first_url_check.should be_true
|
141
|
+
@second_url_check.should be_true
|
81
142
|
end
|
82
143
|
|
83
|
-
it 'should avoid cycles
|
144
|
+
it 'should avoid cycles' do
|
84
145
|
u = 'http://example.com/'
|
85
146
|
u_p = URI.parse(u)
|
86
147
|
si = SpiderInstance.new([u], [u_p])
|
@@ -92,32 +153,29 @@ describe 'SpiderInstance' do
|
|
92
153
|
|
93
154
|
it 'should call the 404 handler for 404s' do
|
94
155
|
@proc_called = false
|
95
|
-
|
96
|
-
Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
|
156
|
+
mock_failed_http
|
97
157
|
si = SpiderInstance.new(['http://example.com/'])
|
98
158
|
si.stubs(:allowed?).returns(true)
|
99
159
|
si.stubs(:generate_next_urls).returns([])
|
100
160
|
si.on(404) {|*a| @proc_called = true}
|
101
161
|
si.start!
|
102
|
-
@proc_called.should
|
162
|
+
@proc_called.should be_true
|
103
163
|
end
|
104
164
|
|
105
165
|
it 'should call the :success handler on success' do
|
106
166
|
@proc_called = false
|
107
|
-
|
108
|
-
Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
|
167
|
+
mock_successful_http
|
109
168
|
si = SpiderInstance.new(['http://example.com/'])
|
110
169
|
si.stubs(:allowed?).returns(true)
|
111
170
|
si.stubs(:generate_next_urls).returns([])
|
112
171
|
si.on(:success) {|*a| @proc_called = true}
|
113
172
|
si.start!
|
114
|
-
@proc_called.should
|
173
|
+
@proc_called.should be_true
|
115
174
|
end
|
116
175
|
|
117
176
|
it 'should not call the :success handler on failure' do
|
118
177
|
@proc_called = false
|
119
|
-
|
120
|
-
Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
|
178
|
+
mock_failed_http
|
121
179
|
si = SpiderInstance.new(['http://example.com/'])
|
122
180
|
si.stubs(:allowed?).returns(true)
|
123
181
|
si.stubs(:generate_next_urls).returns([])
|
@@ -129,22 +187,20 @@ describe 'SpiderInstance' do
|
|
129
187
|
it 'should call the :success handler and the 200 handler on 200' do
|
130
188
|
@proc_200_called = false
|
131
189
|
@proc_success_called = false
|
132
|
-
|
133
|
-
Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
|
190
|
+
mock_successful_http
|
134
191
|
si = SpiderInstance.new(['http://example.com/'])
|
135
192
|
si.stubs(:allowed?).returns(true)
|
136
193
|
si.stubs(:generate_next_urls).returns([])
|
137
194
|
si.on(:success) {|*a| @proc_success_called = true}
|
138
|
-
si.on(200)
|
195
|
+
si.on(200) {|*a| @proc_200_called = true}
|
139
196
|
si.start!
|
140
|
-
@proc_200_called.should
|
141
|
-
@proc_success_called.should
|
197
|
+
@proc_200_called.should be_true
|
198
|
+
@proc_success_called.should be_true
|
142
199
|
end
|
143
200
|
|
144
201
|
it 'should not call the :failure handler on success' do
|
145
202
|
@proc_called = false
|
146
|
-
|
147
|
-
Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
|
203
|
+
mock_successful_http
|
148
204
|
si = SpiderInstance.new(['http://example.com/'])
|
149
205
|
si.stubs(:allowed?).returns(true)
|
150
206
|
si.stubs(:generate_next_urls).returns([])
|
@@ -155,65 +211,90 @@ describe 'SpiderInstance' do
|
|
155
211
|
|
156
212
|
it 'should call the :failure handler on failure' do
|
157
213
|
@proc_called = false
|
158
|
-
|
159
|
-
Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
|
214
|
+
mock_failed_http
|
160
215
|
si = SpiderInstance.new(['http://example.com/'])
|
161
216
|
si.stubs(:allowed?).returns(true)
|
162
217
|
si.stubs(:generate_next_urls).returns([])
|
163
218
|
si.on(:failure) {|*a| @proc_called = true}
|
164
219
|
si.start!
|
165
|
-
@proc_called.should
|
220
|
+
@proc_called.should be_true
|
166
221
|
end
|
167
222
|
|
168
223
|
it 'should call the :failure handler and the 404 handler on 404' do
|
169
224
|
@proc_404_called = false
|
170
225
|
@proc_failure_called = false
|
171
|
-
|
172
|
-
Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
|
226
|
+
mock_failed_http
|
173
227
|
si = SpiderInstance.new(['http://example.com/'])
|
174
228
|
si.stubs(:allowed?).returns(true)
|
175
229
|
si.stubs(:generate_next_urls).returns([])
|
176
230
|
si.on(:failure) {|*a| @proc_failure_called = true}
|
177
231
|
si.on(404) {|*a| @proc_404_called = true}
|
178
232
|
si.start!
|
179
|
-
@proc_404_called.should
|
180
|
-
@proc_failure_called.should
|
233
|
+
@proc_404_called.should be_true
|
234
|
+
@proc_failure_called.should be_true
|
181
235
|
end
|
182
236
|
|
183
237
|
it 'should call the :any handler even when a handler for the error code is defined' do
|
184
238
|
@any_called = false
|
185
|
-
|
186
|
-
Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
|
239
|
+
mock_successful_http
|
187
240
|
si = SpiderInstance.new(['http://example.com/'])
|
188
241
|
si.stubs(:allowed?).returns(true)
|
189
242
|
si.stubs(:generate_next_urls).returns([])
|
190
243
|
si.on(:any) { |*a| @any_called = true }
|
191
244
|
si.on(202) {|*a|}
|
192
245
|
si.start!
|
193
|
-
@any_called.should
|
246
|
+
@any_called.should be_true
|
194
247
|
end
|
195
248
|
|
196
249
|
it 'should support a block as a response handler' do
|
197
250
|
@proc_called = false
|
198
|
-
|
199
|
-
Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
|
251
|
+
mock_successful_http
|
200
252
|
si = SpiderInstance.new(['http://example.com/'])
|
201
253
|
si.stubs(:allowed?).returns(true)
|
202
254
|
si.stubs(:generate_next_urls).returns([])
|
203
255
|
si.on(:any) { |*a| @proc_called = true }
|
204
256
|
si.start!
|
205
|
-
@proc_called.should
|
257
|
+
@proc_called.should be_true
|
206
258
|
end
|
207
259
|
|
208
260
|
it 'should support a proc as a response handler' do
|
209
261
|
@proc_called = false
|
210
|
-
|
211
|
-
Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
|
262
|
+
mock_successful_http
|
212
263
|
si = SpiderInstance.new(['http://example.com/'])
|
213
264
|
si.stubs(:allowed?).returns(true)
|
214
265
|
si.stubs(:generate_next_urls).returns([])
|
215
266
|
si.on(:any, Proc.new { |*a| @proc_called = true })
|
216
267
|
si.start!
|
217
|
-
@proc_called.should
|
268
|
+
@proc_called.should be_true
|
269
|
+
end
|
270
|
+
|
271
|
+
def mock_http(http_req)
|
272
|
+
http_obj = mock(:use_ssl= => true)
|
273
|
+
http_obj.expects(:start).
|
274
|
+
yields(mock(:request => http_req)).returns(http_req)
|
275
|
+
Net::HTTP.expects(:new).returns(http_obj)
|
276
|
+
end
|
277
|
+
|
278
|
+
def mock_successful_http
|
279
|
+
http_req = stub(:redirect? => false, :success? => true, :code => 200, :body => 'body')
|
280
|
+
mock_http(http_req)
|
281
|
+
end
|
282
|
+
|
283
|
+
def mock_failed_http
|
284
|
+
http_req = stub(:redirect? => false, :success? => false, :code => 404)
|
285
|
+
mock_http(http_req)
|
286
|
+
end
|
287
|
+
|
288
|
+
def mock_redirect_http
|
289
|
+
http_req = stub(:redirect? => true, :success? => false, :code => 404)
|
290
|
+
http_req.expects(:[]).with('Location').returns('http://example.com/')
|
291
|
+
http_req2 = stub(:redirect? => false, :success? => true, :code => 200)
|
292
|
+
http_obj = mock(:use_ssl= => true)
|
293
|
+
http_obj.expects(:start).
|
294
|
+
yields(mock(:request => http_req)).returns(http_req)
|
295
|
+
http_obj2 = mock(:use_ssl= => true)
|
296
|
+
http_obj2.expects(:start).
|
297
|
+
yields(mock(:request => http_req2)).returns(http_req2)
|
298
|
+
Net::HTTP.expects(:new).times(2).returns(http_obj).then.returns(http_obj2)
|
218
299
|
end
|
219
300
|
end
|
data/spider.gemspec
CHANGED
data/test_server/client.rb
CHANGED
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.4
|
|
3
3
|
specification_version: 1
|
4
4
|
name: spider
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.2.
|
7
|
-
date: 2007-10-
|
6
|
+
version: 0.2.1
|
7
|
+
date: 2007-10-23 00:00:00 -04:00
|
8
8
|
summary: A Web spidering library
|
9
9
|
require_paths:
|
10
10
|
- lib
|