spider 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGES CHANGED
@@ -1,9 +1,17 @@
1
+ 2007-10-23:
2
+ * URLs without a page component but with a query component.
3
+ * HTTP Redirect.
4
+ * HTTPS.
5
+ * Version 0.2.1.
6
+
1
7
  2007-10-22:
2
8
  * Use RSpec to ensure that it mostly works.
3
9
  * Use WEBrick to create a small test server for additional testing.
4
10
  * Completely re-do the API to prepare for future expansion.
5
11
  * Add the ability to apply each URL to a series of custom allowed?-like
6
12
  matchers.
13
+ * BSD license.
14
+ * Version 0.2.0.
7
15
 
8
16
  2007-03-30:
9
17
  * Clean up the documentation.
@@ -93,7 +93,7 @@ links, and doing it all over again.
93
93
  <h3 class="section-bar">Methods</h3>
94
94
 
95
95
  <div class="name-list">
96
- <a href="#M000003">start_at</a>&nbsp;&nbsp;
96
+ <a href="#M000004">start_at</a>&nbsp;&nbsp;
97
97
  </div>
98
98
  </div>
99
99
 
@@ -115,11 +115,11 @@ links, and doing it all over again.
115
115
  <div id="methods">
116
116
  <h3 class="section-bar">Public Class methods</h3>
117
117
 
118
- <div id="method-M000003" class="method-detail">
119
- <a name="M000003"></a>
118
+ <div id="method-M000004" class="method-detail">
119
+ <a name="M000004"></a>
120
120
 
121
121
  <div class="method-heading">
122
- <a href="#M000003" class="method-signature">
122
+ <a href="#M000004" class="method-signature">
123
123
  <span class="method-name">start_at</span><span class="method-args">(a_url, &amp;block)</span>
124
124
  </a>
125
125
  </div>
@@ -150,10 +150,10 @@ define the rules and handlers for the discovered Web pages.
150
150
  end
151
151
  </pre>
152
152
  <p><a class="source-toggle" href="#"
153
- onclick="toggleCode('M000003-source');return false;">[Source]</a></p>
154
- <div class="method-source-code" id="M000003-source">
153
+ onclick="toggleCode('M000004-source');return false;">[Source]</a></p>
154
+ <div class="method-source-code" id="M000004-source">
155
155
  <pre>
156
- <span class="ruby-comment cmt"># File lib/spider.rb, line 55</span>
156
+ <span class="ruby-comment cmt"># File lib/spider.rb, line 68</span>
157
157
  <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">start_at</span>(<span class="ruby-identifier">a_url</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
158
158
  <span class="ruby-identifier">rules</span> = <span class="ruby-constant">RobotRules</span>.<span class="ruby-identifier">new</span>(<span class="ruby-value str">'Ruby Spider 1.0'</span>)
159
159
  <span class="ruby-identifier">a_spider</span> = <span class="ruby-constant">SpiderInstance</span>.<span class="ruby-identifier">new</span>([<span class="ruby-identifier">a_url</span>], [], <span class="ruby-identifier">rules</span>, [])
@@ -88,6 +88,7 @@
88
88
  <div class="name-list">
89
89
  <a href="#M000001">add_url_check</a>&nbsp;&nbsp;
90
90
  <a href="#M000002">on</a>&nbsp;&nbsp;
91
+ <a href="#M000003">remove_trailing_slash</a>&nbsp;&nbsp;
91
92
  </div>
92
93
  </div>
93
94
 
@@ -135,7 +136,7 @@ href="http://mike-burns.com">mike-burns.com</a>&#8217;:
135
136
  onclick="toggleCode('M000001-source');return false;">[Source]</a></p>
136
137
  <div class="method-source-code" id="M000001-source">
137
138
  <pre>
138
- <span class="ruby-comment cmt"># File lib/spider.rb, line 81</span>
139
+ <span class="ruby-comment cmt"># File lib/spider.rb, line 94</span>
139
140
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_url_check</span>(<span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
140
141
  <span class="ruby-ivar">@url_checks</span> <span class="ruby-operator">&lt;&lt;</span> <span class="ruby-identifier">block</span>
141
142
  <span class="ruby-keyword kw">end</span>
@@ -195,7 +196,7 @@ For example:
195
196
  onclick="toggleCode('M000002-source');return false;">[Source]</a></p>
196
197
  <div class="method-source-code" id="M000002-source">
197
198
  <pre>
198
- <span class="ruby-comment cmt"># File lib/spider.rb, line 118</span>
199
+ <span class="ruby-comment cmt"># File lib/spider.rb, line 131</span>
199
200
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">on</span>(<span class="ruby-identifier">code</span>, <span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
200
201
  <span class="ruby-identifier">f</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
201
202
  <span class="ruby-keyword kw">case</span> <span class="ruby-identifier">code</span>
@@ -214,6 +215,29 @@ For example:
214
215
  </div>
215
216
  </div>
216
217
 
218
+ <div id="method-M000003" class="method-detail">
219
+ <a name="M000003"></a>
220
+
221
+ <div class="method-heading">
222
+ <a href="#M000003" class="method-signature">
223
+ <span class="method-name">remove_trailing_slash</span><span class="method-args">(s)</span>
224
+ </a>
225
+ </div>
226
+
227
+ <div class="method-description">
228
+ <p><a class="source-toggle" href="#"
229
+ onclick="toggleCode('M000003-source');return false;">[Source]</a></p>
230
+ <div class="method-source-code" id="M000003-source">
231
+ <pre>
232
+ <span class="ruby-comment cmt"># File lib/spider.rb, line 257</span>
233
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">remove_trailing_slash</span>(<span class="ruby-identifier">s</span>)
234
+ <span class="ruby-identifier">s</span>.<span class="ruby-identifier">sub</span>(<span class="ruby-regexp re">%r{/*$}</span>,<span class="ruby-value str">''</span>)
235
+ <span class="ruby-keyword kw">end</span>
236
+ </pre>
237
+ </div>
238
+ </div>
239
+ </div>
240
+
217
241
 
218
242
  </div>
219
243
 
@@ -1 +1 @@
1
- Mon, 22 Oct 2007 07:35:00 -0400
1
+ Tue, 23 Oct 2007 23:14:46 -0400
@@ -56,7 +56,7 @@
56
56
  </tr>
57
57
  <tr class="top-aligned-row">
58
58
  <td><strong>Last Update:</strong></td>
59
- <td>Mon Oct 22 07:19:31 -0400 2007</td>
59
+ <td>Tue Oct 23 23:11:42 -0400 2007</td>
60
60
  </tr>
61
61
  </table>
62
62
  </div>
@@ -125,6 +125,7 @@ href="http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589">blade.
125
125
  open-uri&nbsp;&nbsp;
126
126
  uri&nbsp;&nbsp;
127
127
  net/http&nbsp;&nbsp;
128
+ net/https&nbsp;&nbsp;
128
129
  </div>
129
130
  </div>
130
131
 
@@ -20,7 +20,6 @@
20
20
  <div id="index">
21
21
  <h1 class="section-bar">Files</h1>
22
22
  <div id="index-entries">
23
- <a href="files/README.html">README</a><br />
24
23
  <a href="files/lib/spider_rb.html">lib/spider.rb</a><br />
25
24
  </div>
26
25
  </div>
@@ -22,7 +22,8 @@
22
22
  <div id="index-entries">
23
23
  <a href="classes/SpiderInstance.html#M000001">add_url_check (SpiderInstance)</a><br />
24
24
  <a href="classes/SpiderInstance.html#M000002">on (SpiderInstance)</a><br />
25
- <a href="classes/Spider.html#M000003">start_at (Spider)</a><br />
25
+ <a href="classes/SpiderInstance.html#M000003">remove_trailing_slash (SpiderInstance)</a><br />
26
+ <a href="classes/Spider.html#M000004">start_at (Spider)</a><br />
26
27
  </div>
27
28
  </div>
28
29
  </body>
@@ -27,13 +27,18 @@ require 'robot_rules'
27
27
  require 'open-uri'
28
28
  require 'uri'
29
29
  require 'net/http'
30
+ require 'net/https'
30
31
 
31
32
  class Net::HTTPResponse #:nodoc:
32
33
  def success?; false; end
34
+ def redirect?; false; end
33
35
  end
34
36
  class Net::HTTPSuccess #:nodoc:
35
37
  def success?; true; end
36
38
  end
39
+ class Net::HTTPRedirection #:nodoc:
40
+ def redirect?; true; end
41
+ end
37
42
 
38
43
  # A spidering library for Ruby. Handles robots.txt, scraping, finding more
39
44
  # links, and doing it all over again.
@@ -188,15 +193,16 @@ class SpiderInstance
188
193
  def get_page(parsed_url, &block) #:nodoc:
189
194
  @seen << parsed_url
190
195
  begin
191
- Net::HTTP.start(parsed_url.host, parsed_url.port) do |http|
192
- r = http.request(Net::HTTP::Get.new(parsed_url.path))
193
- if r.is_a?(Net::HTTPRedirection)
194
- get_page(URI.parse(r['Location']), block)
195
- else
196
- block.call(r)
197
- end
196
+ http = Net::HTTP.new(parsed_url.host, parsed_url.port)
197
+ http.use_ssl = parsed_url.scheme == 'https'
198
+ # Uses start because http.finish cannot be called.
199
+ r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri))}
200
+ if r.redirect?
201
+ get_page(URI.parse(r['Location']), &block)
202
+ else
203
+ block.call(r)
198
204
  end
199
- rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError, Exception =>e
205
+ rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError => e
200
206
  p e
201
207
  nil
202
208
  end
@@ -207,7 +213,7 @@ class SpiderInstance
207
213
  if resp.success?
208
214
  cb_branch = @callbacks[:success]
209
215
  cb_branch[:any].call(a_url, resp.code, resp, resp.body) if cb_branch[:any]
210
- cb_branch[resp.code].call(a_url, resp.code, resp.headers, resp.body) if cb_branch[resp.code]
216
+ cb_branch[resp.code].call(a_url, resp.code, resp, resp.body) if cb_branch[resp.code]
211
217
  else
212
218
  cb_branch = @callbacks[:failure]
213
219
  cb_branch[:any].call(a_url, resp.code) if cb_branch[:any]
@@ -1,15 +1,84 @@
1
1
  require 'rubygems'
2
2
  require 'spec'
3
+ require 'webrick'
4
+ require 'webrick/https'
3
5
  require File.dirname(__FILE__)+'/../lib/spider'
4
6
 
5
7
  Spec::Runner.configure { |c| c.mock_with :mocha }
6
8
 
9
+ class QueryServlet < WEBrick::HTTPServlet::AbstractServlet
10
+ def do_GET(req, res)
11
+ res['Content-type'] = 'text/plain'
12
+ res.body = "response\n"
13
+ end
14
+ end
15
+
16
+ def null_logger
17
+ l = stub
18
+ [:log, :fatal, :error, :warn , :info, :debug].each do |k|
19
+ l.stubs(k)
20
+ l.stubs("#{k}?".to_sym)
21
+ end
22
+ l
23
+ end
24
+
7
25
  describe 'SpiderInstance' do
26
+ # Bug reported by John Nagro, using the example source http://eons.com/
27
+ # Had to change line 192; get_page now uses request_uri instead of path.
28
+ it 'should handle query URLs without a path' do
29
+ u = 'http://localhost:8888?s=1'
30
+ u_p = URI.parse(u)
31
+ @block_called = false
32
+ server = WEBrick::HTTPServer.new(:Port => 8888, :Logger => null_logger,
33
+ :AccessLog => [])
34
+ server.mount('/', QueryServlet)
35
+ Thread.new {server.start}
36
+ si = SpiderInstance.new([u])
37
+ si.get_page(u_p) do
38
+ @block_called = true
39
+ end
40
+ server.shutdown
41
+ @block_called.should be_true
42
+ end
43
+
44
+ # This solves a problem reported by John Nagro.
45
+ it 'should handle redirects' do
46
+ u = 'http://example.com/'
47
+ u_p = URI.parse(u)
48
+ @redirect_handled = false
49
+ mock_redirect_http
50
+ si = SpiderInstance.new([u])
51
+ si.get_page(u_p) do
52
+ @redirect_handled = true
53
+ end
54
+ @redirect_handled.should be_true
55
+ end
56
+
57
+ it 'should handle HTTPS' do
58
+ u = 'https://localhost:10443/'
59
+ u_p = URI.parse(u)
60
+ @page_called = false
61
+ server = WEBrick::HTTPServer.new(:Port => 10443,
62
+ :Logger => null_logger,
63
+ :AccessLog => [],
64
+ :SSLEnable => true,
65
+ :SSLCertName => [["O", "ruby-lang.org"], ["OU", "sample"], ["CN", WEBrick::Utils::getservername]],
66
+ :SSLComment => 'Comment of some sort')
67
+ server.mount('/', QueryServlet)
68
+ Thread.new {server.start}
69
+ si = SpiderInstance.new([u])
70
+ si.get_page(u_p) { @page_called = true }
71
+ server.shutdown
72
+ @page_called.should be_true
73
+ end
74
+
75
+ it 'should maintain the entire graph within some external object (or memory, or memcached)'
76
+
8
77
  it 'should skip URLs when allowable_url? is false' do
9
78
  u = 'http://example.com/'
10
79
  u_p = URI.parse(u)
11
- http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
12
- Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
80
+ http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
81
+ Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
13
82
  si = SpiderInstance.new([u])
14
83
  si.expects(:allowable_url?).with(u, u_p).returns(false)
15
84
  si.expects(:get_page).times(0)
@@ -19,8 +88,8 @@ describe 'SpiderInstance' do
19
88
  it 'should not skip URLs when allowable_url? is true' do
20
89
  u = 'http://example.com/'
21
90
  u_p = URI.parse(u)
22
- http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
23
- Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
91
+ http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
92
+ Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
24
93
  si = SpiderInstance.new([u])
25
94
  si.expects(:allowable_url?).with(u, u_p).returns(true)
26
95
  si.expects(:allowable_url?).with(nil, nil).returns(false)
@@ -68,19 +137,11 @@ describe 'SpiderInstance' do
68
137
  allowable = si.allowable_url?('http://example.com/',
69
138
  URI.parse('http://example.com/'))
70
139
  allowable.should == false
71
- @first_url_check == true
72
- @second_url_check == true
73
- end
74
-
75
- it 'should support memcached'
76
- it 'should avoid cycles using memcached'
77
-
78
- it 'should support memory' do
79
- si = SpiderInstance.new(['http://example.com/'])
80
- si.use_cache :memory # No exn
140
+ @first_url_check.should be_true
141
+ @second_url_check.should be_true
81
142
  end
82
143
 
83
- it 'should avoid cycles using memory' do
144
+ it 'should avoid cycles' do
84
145
  u = 'http://example.com/'
85
146
  u_p = URI.parse(u)
86
147
  si = SpiderInstance.new([u], [u_p])
@@ -92,32 +153,29 @@ describe 'SpiderInstance' do
92
153
 
93
154
  it 'should call the 404 handler for 404s' do
94
155
  @proc_called = false
95
- http_resp = stub(:success? => false, :code => 404)
96
- Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
156
+ mock_failed_http
97
157
  si = SpiderInstance.new(['http://example.com/'])
98
158
  si.stubs(:allowed?).returns(true)
99
159
  si.stubs(:generate_next_urls).returns([])
100
160
  si.on(404) {|*a| @proc_called = true}
101
161
  si.start!
102
- @proc_called.should == true
162
+ @proc_called.should be_true
103
163
  end
104
164
 
105
165
  it 'should call the :success handler on success' do
106
166
  @proc_called = false
107
- http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
108
- Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
167
+ mock_successful_http
109
168
  si = SpiderInstance.new(['http://example.com/'])
110
169
  si.stubs(:allowed?).returns(true)
111
170
  si.stubs(:generate_next_urls).returns([])
112
171
  si.on(:success) {|*a| @proc_called = true}
113
172
  si.start!
114
- @proc_called.should == true
173
+ @proc_called.should be_true
115
174
  end
116
175
 
117
176
  it 'should not call the :success handler on failure' do
118
177
  @proc_called = false
119
- http_resp = stub(:success? => false, :code => 404)
120
- Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
178
+ mock_failed_http
121
179
  si = SpiderInstance.new(['http://example.com/'])
122
180
  si.stubs(:allowed?).returns(true)
123
181
  si.stubs(:generate_next_urls).returns([])
@@ -129,22 +187,20 @@ describe 'SpiderInstance' do
129
187
  it 'should call the :success handler and the 200 handler on 200' do
130
188
  @proc_200_called = false
131
189
  @proc_success_called = false
132
- http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
133
- Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
190
+ mock_successful_http
134
191
  si = SpiderInstance.new(['http://example.com/'])
135
192
  si.stubs(:allowed?).returns(true)
136
193
  si.stubs(:generate_next_urls).returns([])
137
194
  si.on(:success) {|*a| @proc_success_called = true}
138
- si.on(200) {|*a| @proc_200_called = true}
195
+ si.on(200) {|*a| @proc_200_called = true}
139
196
  si.start!
140
- @proc_200_called.should == true
141
- @proc_success_called.should == true
197
+ @proc_200_called.should be_true
198
+ @proc_success_called.should be_true
142
199
  end
143
200
 
144
201
  it 'should not call the :failure handler on success' do
145
202
  @proc_called = false
146
- http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
147
- Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
203
+ mock_successful_http
148
204
  si = SpiderInstance.new(['http://example.com/'])
149
205
  si.stubs(:allowed?).returns(true)
150
206
  si.stubs(:generate_next_urls).returns([])
@@ -155,65 +211,90 @@ describe 'SpiderInstance' do
155
211
 
156
212
  it 'should call the :failure handler on failure' do
157
213
  @proc_called = false
158
- http_resp = stub(:success? => false, :code => 404)
159
- Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
214
+ mock_failed_http
160
215
  si = SpiderInstance.new(['http://example.com/'])
161
216
  si.stubs(:allowed?).returns(true)
162
217
  si.stubs(:generate_next_urls).returns([])
163
218
  si.on(:failure) {|*a| @proc_called = true}
164
219
  si.start!
165
- @proc_called.should == true
220
+ @proc_called.should be_true
166
221
  end
167
222
 
168
223
  it 'should call the :failure handler and the 404 handler on 404' do
169
224
  @proc_404_called = false
170
225
  @proc_failure_called = false
171
- http_resp = stub(:success? => false, :code => 404)
172
- Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
226
+ mock_failed_http
173
227
  si = SpiderInstance.new(['http://example.com/'])
174
228
  si.stubs(:allowed?).returns(true)
175
229
  si.stubs(:generate_next_urls).returns([])
176
230
  si.on(:failure) {|*a| @proc_failure_called = true}
177
231
  si.on(404) {|*a| @proc_404_called = true}
178
232
  si.start!
179
- @proc_404_called.should == true
180
- @proc_failure_called.should == true
233
+ @proc_404_called.should be_true
234
+ @proc_failure_called.should be_true
181
235
  end
182
236
 
183
237
  it 'should call the :any handler even when a handler for the error code is defined' do
184
238
  @any_called = false
185
- http_resp = stub(:success? => true, :code => 200)
186
- Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
239
+ mock_successful_http
187
240
  si = SpiderInstance.new(['http://example.com/'])
188
241
  si.stubs(:allowed?).returns(true)
189
242
  si.stubs(:generate_next_urls).returns([])
190
243
  si.on(:any) { |*a| @any_called = true }
191
244
  si.on(202) {|*a|}
192
245
  si.start!
193
- @any_called.should == true
246
+ @any_called.should be_true
194
247
  end
195
248
 
196
249
  it 'should support a block as a response handler' do
197
250
  @proc_called = false
198
- http_resp = stub(:success? => true, :code => 200)
199
- Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
251
+ mock_successful_http
200
252
  si = SpiderInstance.new(['http://example.com/'])
201
253
  si.stubs(:allowed?).returns(true)
202
254
  si.stubs(:generate_next_urls).returns([])
203
255
  si.on(:any) { |*a| @proc_called = true }
204
256
  si.start!
205
- @proc_called.should == true
257
+ @proc_called.should be_true
206
258
  end
207
259
 
208
260
  it 'should support a proc as a response handler' do
209
261
  @proc_called = false
210
- http_resp = stub(:success? => true, :code => 200)
211
- Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
262
+ mock_successful_http
212
263
  si = SpiderInstance.new(['http://example.com/'])
213
264
  si.stubs(:allowed?).returns(true)
214
265
  si.stubs(:generate_next_urls).returns([])
215
266
  si.on(:any, Proc.new { |*a| @proc_called = true })
216
267
  si.start!
217
- @proc_called.should == true
268
+ @proc_called.should be_true
269
+ end
270
+
271
+ def mock_http(http_req)
272
+ http_obj = mock(:use_ssl= => true)
273
+ http_obj.expects(:start).
274
+ yields(mock(:request => http_req)).returns(http_req)
275
+ Net::HTTP.expects(:new).returns(http_obj)
276
+ end
277
+
278
+ def mock_successful_http
279
+ http_req = stub(:redirect? => false, :success? => true, :code => 200, :body => 'body')
280
+ mock_http(http_req)
281
+ end
282
+
283
+ def mock_failed_http
284
+ http_req = stub(:redirect? => false, :success? => false, :code => 404)
285
+ mock_http(http_req)
286
+ end
287
+
288
+ def mock_redirect_http
289
+ http_req = stub(:redirect? => true, :success? => false, :code => 404)
290
+ http_req.expects(:[]).with('Location').returns('http://example.com/')
291
+ http_req2 = stub(:redirect? => false, :success? => true, :code => 200)
292
+ http_obj = mock(:use_ssl= => true)
293
+ http_obj.expects(:start).
294
+ yields(mock(:request => http_req)).returns(http_req)
295
+ http_obj2 = mock(:use_ssl= => true)
296
+ http_obj2.expects(:start).
297
+ yields(mock(:request => http_req2)).returns(http_req2)
298
+ Net::HTTP.expects(:new).times(2).returns(http_obj).then.returns(http_obj2)
218
299
  end
219
300
  end
@@ -13,5 +13,5 @@ spec = Gem::Specification.new do |s|
13
13
  A Web spidering library: handles robots.txt, scraping, finding more
14
14
  links, and doing it all over again.
15
15
  EOF
16
- s.version = '0.2.0'
16
+ s.version = '0.2.1'
17
17
  end
@@ -20,3 +20,7 @@ Spider.start_at('http://localhost:8880/page1.html') do |s|
20
20
  puts "URL returned anything: #{a_url} with this code #{resp.code}"
21
21
  end
22
22
  end
23
+
24
+ %w(INT TERM).each do |signal|
25
+ trap(signal) { exit }
26
+ end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.4
3
3
  specification_version: 1
4
4
  name: spider
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.2.0
7
- date: 2007-10-22 00:00:00 -04:00
6
+ version: 0.2.1
7
+ date: 2007-10-23 00:00:00 -04:00
8
8
  summary: A Web spidering library
9
9
  require_paths:
10
10
  - lib