spider 0.4.0 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGES CHANGED
@@ -1,3 +1,10 @@
1
+ 2007-11-09:
2
+ * Handle redirects that assume a base URL.
3
+
4
+ 2007-11-08:
5
+ * Move spider_instance.rb, robot_rules.rb, and included_in_memcached.rb into
6
+ spider subdirectory.
7
+
1
8
  2007-11-02:
2
9
  * Memcached support.
3
10
 
data/README CHANGED
@@ -108,7 +108,7 @@ scraping, collecting, and looping so that you can just handle the data.
108
108
 
109
109
  Mike Burns http://mike-burns.com mike@mike-burns.com
110
110
 
111
- Help from Matt Horan and John Nagro.
111
+ Help from Matt Horan, John Nagro, and Henri Cook.
112
112
 
113
113
  With `robot_rules' from James Edward Gray II via
114
114
  http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589
@@ -55,8 +55,8 @@
55
55
  <tr class="top-aligned-row">
56
56
  <td><strong>In:</strong></td>
57
57
  <td>
58
- <a href="../files/lib/included_in_memcached_rb.html">
59
- lib/included_in_memcached.rb
58
+ <a href="../files/lib/spider/included_in_memcached_rb.html">
59
+ lib/spider/included_in_memcached.rb
60
60
  </a>
61
61
  <br />
62
62
  </td>
@@ -86,6 +86,15 @@ three operations: <a href="IncludedInMemcached.html#M000001">new</a>,
86
86
  Together these can be used to add items to the memcache, then determine
87
87
  whether the item has been added.
88
88
  </p>
89
+ <p>
90
+ To use it with <a href="Spider.html">Spider</a> use the
91
+ check_already_seen_with method:
92
+ </p>
93
+ <pre>
94
+ Spider.start_at('http://example.com/') do |s|
95
+ s.check_already_seen_with IncludedInMemcached.new('localhost:11211')
96
+ end
97
+ </pre>
89
98
 
90
99
  </div>
91
100
 
@@ -139,7 +148,7 @@ arguments here are passed to MemCache (part of the memcache-client gem).
139
148
  onclick="toggleCode('M000001-source');return false;">[Source]</a></p>
140
149
  <div class="method-source-code" id="M000001-source">
141
150
  <pre>
142
- <span class="ruby-comment cmt"># File lib/included_in_memcached.rb, line 9</span>
151
+ <span class="ruby-comment cmt"># File lib/spider/included_in_memcached.rb, line 39</span>
143
152
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-operator">*</span><span class="ruby-identifier">a</span>)
144
153
  <span class="ruby-ivar">@c</span> = <span class="ruby-constant">MemCache</span>.<span class="ruby-identifier">new</span>(<span class="ruby-operator">*</span><span class="ruby-identifier">a</span>)
145
154
  <span class="ruby-keyword kw">end</span>
@@ -167,7 +176,7 @@ Add an item to the memcache.
167
176
  onclick="toggleCode('M000002-source');return false;">[Source]</a></p>
168
177
  <div class="method-source-code" id="M000002-source">
169
178
  <pre>
170
- <span class="ruby-comment cmt"># File lib/included_in_memcached.rb, line 14</span>
179
+ <span class="ruby-comment cmt"># File lib/spider/included_in_memcached.rb, line 44</span>
171
180
  <span class="ruby-keyword kw">def</span> <span class="ruby-operator">&lt;&lt;</span>(<span class="ruby-identifier">v</span>)
172
181
  <span class="ruby-ivar">@c</span>.<span class="ruby-identifier">add</span>(<span class="ruby-identifier">v</span>.<span class="ruby-identifier">to_s</span>, <span class="ruby-identifier">v</span>)
173
182
  <span class="ruby-keyword kw">end</span>
@@ -193,7 +202,7 @@ True if the item is in the memcache.
193
202
  onclick="toggleCode('M000003-source');return false;">[Source]</a></p>
194
203
  <div class="method-source-code" id="M000003-source">
195
204
  <pre>
196
- <span class="ruby-comment cmt"># File lib/included_in_memcached.rb, line 19</span>
205
+ <span class="ruby-comment cmt"># File lib/spider/included_in_memcached.rb, line 49</span>
197
206
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">include?</span>(<span class="ruby-identifier">v</span>)
198
207
  <span class="ruby-ivar">@c</span>.<span class="ruby-identifier">get</span>(<span class="ruby-identifier">v</span>.<span class="ruby-identifier">to_s</span>) <span class="ruby-operator">==</span> <span class="ruby-identifier">v</span>
199
208
  <span class="ruby-keyword kw">end</span>
@@ -55,8 +55,8 @@
55
55
  <tr class="top-aligned-row">
56
56
  <td><strong>In:</strong></td>
57
57
  <td>
58
- <a href="../files/lib/spider_instance_rb.html">
59
- lib/spider_instance.rb
58
+ <a href="../files/lib/spider/spider_instance_rb.html">
59
+ lib/spider/spider_instance.rb
60
60
  </a>
61
61
  <br />
62
62
  </td>
@@ -140,7 +140,7 @@ href="http://mike-burns.com">mike-burns.com</a>&#8217;:
140
140
  onclick="toggleCode('M000004-source');return false;">[Source]</a></p>
141
141
  <div class="method-source-code" id="M000004-source">
142
142
  <pre>
143
- <span class="ruby-comment cmt"># File lib/spider_instance.rb, line 69</span>
143
+ <span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 70</span>
144
144
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_url_check</span>(<span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
145
145
  <span class="ruby-ivar">@url_checks</span> <span class="ruby-operator">&lt;&lt;</span> <span class="ruby-identifier">block</span>
146
146
  <span class="ruby-keyword kw">end</span>
@@ -186,7 +186,7 @@ understand just &lt;&lt; and included? .
186
186
  onclick="toggleCode('M000005-source');return false;">[Source]</a></p>
187
187
  <div class="method-source-code" id="M000005-source">
188
188
  <pre>
189
- <span class="ruby-comment cmt"># File lib/spider_instance.rb, line 90</span>
189
+ <span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 91</span>
190
190
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">check_already_seen_with</span>(<span class="ruby-identifier">cacher</span>)
191
191
  <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">cacher</span>.<span class="ruby-identifier">respond_to?</span>(<span class="ruby-identifier">:&lt;&lt;</span>) <span class="ruby-operator">&amp;&amp;</span> <span class="ruby-identifier">cacher</span>.<span class="ruby-identifier">respond_to?</span>(<span class="ruby-identifier">:include?</span>)
192
192
  <span class="ruby-ivar">@seen</span> = <span class="ruby-identifier">cacher</span>
@@ -216,7 +216,7 @@ Reset the <a href="SpiderInstance.html#M000009">headers</a> hash.
216
216
  onclick="toggleCode('M000010-source');return false;">[Source]</a></p>
217
217
  <div class="method-source-code" id="M000010-source">
218
218
  <pre>
219
- <span class="ruby-comment cmt"># File lib/spider_instance.rb, line 157</span>
219
+ <span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 158</span>
220
220
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">clear_headers</span>
221
221
  <span class="ruby-ivar">@headers</span> = {}
222
222
  <span class="ruby-keyword kw">end</span>
@@ -245,7 +245,7 @@ Use like a hash:
245
245
  onclick="toggleCode('M000009-source');return false;">[Source]</a></p>
246
246
  <div class="method-source-code" id="M000009-source">
247
247
  <pre>
248
- <span class="ruby-comment cmt"># File lib/spider_instance.rb, line 145</span>
248
+ <span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 146</span>
249
249
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">headers</span>
250
250
  <span class="ruby-constant">HeaderSetter</span>.<span class="ruby-identifier">new</span>(<span class="ruby-keyword kw">self</span>)
251
251
  <span class="ruby-keyword kw">end</span>
@@ -294,7 +294,7 @@ For example:
294
294
  onclick="toggleCode('M000006-source');return false;">[Source]</a></p>
295
295
  <div class="method-source-code" id="M000006-source">
296
296
  <pre>
297
- <span class="ruby-comment cmt"># File lib/spider_instance.rb, line 120</span>
297
+ <span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 121</span>
298
298
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">on</span>(<span class="ruby-identifier">code</span>, <span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
299
299
  <span class="ruby-identifier">f</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
300
300
  <span class="ruby-keyword kw">case</span> <span class="ruby-identifier">code</span>
@@ -331,7 +331,7 @@ Run before the HTTP request. Given the URL as a string.
331
331
  onclick="toggleCode('M000007-source');return false;">[Source]</a></p>
332
332
  <div class="method-source-code" id="M000007-source">
333
333
  <pre>
334
- <span class="ruby-comment cmt"># File lib/spider_instance.rb, line 134</span>
334
+ <span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 135</span>
335
335
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">setup</span>(<span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
336
336
  <span class="ruby-ivar">@setup</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
337
337
  <span class="ruby-keyword kw">end</span>
@@ -357,7 +357,7 @@ Run last, once for each page. Given the URL as a string.
357
357
  onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
358
358
  <div class="method-source-code" id="M000008-source">
359
359
  <pre>
360
- <span class="ruby-comment cmt"># File lib/spider_instance.rb, line 139</span>
360
+ <span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 140</span>
361
361
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">teardown</span>(<span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
362
362
  <span class="ruby-ivar">@teardown</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
363
363
  <span class="ruby-keyword kw">end</span>
@@ -1 +1 @@
1
- Fri, 02 Nov 2007 17:20:02 -0400
1
+ Sat, 10 Nov 2007 00:25:19 -0500
@@ -56,7 +56,7 @@
56
56
  </tr>
57
57
  <tr class="top-aligned-row">
58
58
  <td><strong>Last Update:</strong></td>
59
- <td>Fri Nov 02 17:19:47 -0400 2007</td>
59
+ <td>Thu Nov 08 17:51:17 -0500 2007</td>
60
60
  </tr>
61
61
  </table>
62
62
  </div>
@@ -182,7 +182,7 @@ Mike Burns <a href="http://mike-burns.com">mike-burns.com</a>
182
182
  mike@mike-burns.com
183
183
  </p>
184
184
  <p>
185
- Help from Matt Horan and John Nagro.
185
+ Help from Matt Horan, John Nagro, and Henri Cook.
186
186
  </p>
187
187
  <p>
188
188
  With `robot_rules&#8217; from James Edward Gray II via <a
@@ -8,7 +8,7 @@
8
8
  <title>File: included_in_memcached.rb</title>
9
9
  <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
10
  <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
- <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
11
+ <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
12
12
  <script type="text/javascript">
13
13
  // <![CDATA[
14
14
 
@@ -51,12 +51,12 @@
51
51
  <table class="header-table">
52
52
  <tr class="top-aligned-row">
53
53
  <td><strong>Path:</strong></td>
54
- <td>lib/included_in_memcached.rb
54
+ <td>lib/spider/included_in_memcached.rb
55
55
  </td>
56
56
  </tr>
57
57
  <tr class="top-aligned-row">
58
58
  <td><strong>Last Update:</strong></td>
59
- <td>Fri Nov 02 15:04:14 -0400 2007</td>
59
+ <td>Sat Nov 10 00:24:11 -0500 2007</td>
60
60
  </tr>
61
61
  </table>
62
62
  </div>
@@ -68,6 +68,12 @@
68
68
 
69
69
  <div id="contextContent">
70
70
 
71
+ <div id="description">
72
+ <p>
73
+ Use memcached to track cycles.
74
+ </p>
75
+
76
+ </div>
71
77
 
72
78
  <div id="requires-list">
73
79
  <h3 class="section-bar">Required files</h3>
@@ -8,7 +8,7 @@
8
8
  <title>File: spider_instance.rb</title>
9
9
  <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
10
  <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
- <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
11
+ <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
12
12
  <script type="text/javascript">
13
13
  // <![CDATA[
14
14
 
@@ -51,12 +51,12 @@
51
51
  <table class="header-table">
52
52
  <tr class="top-aligned-row">
53
53
  <td><strong>Path:</strong></td>
54
- <td>lib/spider_instance.rb
54
+ <td>lib/spider/spider_instance.rb
55
55
  </td>
56
56
  </tr>
57
57
  <tr class="top-aligned-row">
58
58
  <td><strong>Last Update:</strong></td>
59
- <td>Fri Nov 02 17:05:49 -0400 2007</td>
59
+ <td>Sat Nov 10 00:25:04 -0500 2007</td>
60
60
  </tr>
61
61
  </table>
62
62
  </div>
@@ -70,7 +70,7 @@
70
70
 
71
71
  <div id="description">
72
72
  <p>
73
- Copyright 2007 Mike Burns
73
+ Specialized spidering rules.
74
74
  </p>
75
75
 
76
76
  </div>
@@ -56,7 +56,7 @@
56
56
  </tr>
57
57
  <tr class="top-aligned-row">
58
58
  <td><strong>Last Update:</strong></td>
59
- <td>Fri Nov 02 12:32:39 -0400 2007</td>
59
+ <td>Thu Nov 08 17:29:01 -0500 2007</td>
60
60
  </tr>
61
61
  </table>
62
62
  </div>
@@ -182,7 +182,7 @@ Mike Burns <a href="http://mike-burns.com">mike-burns.com</a>
182
182
  mike@mike-burns.com
183
183
  </p>
184
184
  <p>
185
- Help from Matt Horan and John Nagro.
185
+ Help from Matt Horan, John Nagro, and Henri Cook.
186
186
  </p>
187
187
  <p>
188
188
  With `robot_rules&#8217; from James Edward Gray II via <a
@@ -21,9 +21,9 @@
21
21
  <h1 class="section-bar">Files</h1>
22
22
  <div id="index-entries">
23
23
  <a href="files/README.html">README</a><br />
24
- <a href="files/lib/included_in_memcached_rb.html">lib/included_in_memcached.rb</a><br />
25
24
  <a href="files/lib/spider_rb.html">lib/spider.rb</a><br />
26
- <a href="files/lib/spider_instance_rb.html">lib/spider_instance.rb</a><br />
25
+ <a href="files/lib/spider/included_in_memcached_rb.html">lib/spider/included_in_memcached.rb</a><br />
26
+ <a href="files/lib/spider/spider_instance_rb.html">lib/spider/spider_instance.rb</a><br />
27
27
  </div>
28
28
  </div>
29
29
  </body>
@@ -23,7 +23,7 @@
23
23
  # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24
24
  # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
25
 
26
- require File.dirname(__FILE__)+'/spider_instance'
26
+ require File.dirname(__FILE__)+'/spider/spider_instance'
27
27
 
28
28
  # A spidering library for Ruby. Handles robots.txt, scraping, finding more
29
29
  # links, and doing it all over again.
@@ -0,0 +1,52 @@
1
+ # Use memcached to track cycles.
2
+
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions are met:
5
+ # * Redistributions of source code must retain the above copyright
6
+ # notice, this list of conditions and the following disclaimer.
7
+ # * Redistributions in binary form must reproduce the above copyright
8
+ # notice, this list of conditions and the following disclaimer in the
9
+ # documentation and/or other materials provided with the distribution.
10
+ # * Neither the name Mike Burns nor the
11
+ # names of his contributors may be used to endorse or promote products
12
+ # derived from this software without specific prior written permission.
13
+ #
14
+ # THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
15
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16
+ # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17
+ # DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
18
+ # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19
+ # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20
+ # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
21
+ # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24
+
25
+ require 'memcache'
26
+
27
+ # A specialized class using memcached to track items stored. It supports
28
+ # three operations: new, <<, and include? . Together these can be used to
29
+ # add items to the memcache, then determine whether the item has been added.
30
+ #
31
+ # To use it with Spider use the check_already_seen_with method:
32
+ #
33
+ # Spider.start_at('http://example.com/') do |s|
34
+ # s.check_already_seen_with IncludedInMemcached.new('localhost:11211')
35
+ # end
36
+ class IncludedInMemcached
37
+ # Construct a new IncludedInMemcached instance. All arguments here are
38
+ # passed to MemCache (part of the memcache-client gem).
39
+ def initialize(*a)
40
+ @c = MemCache.new(*a)
41
+ end
42
+
43
+ # Add an item to the memcache.
44
+ def <<(v)
45
+ @c.add(v.to_s, v)
46
+ end
47
+
48
+ # True if the item is in the memcache.
49
+ def include?(v)
50
+ @c.get(v.to_s) == v
51
+ end
52
+ end
@@ -1,3 +1,5 @@
1
+ # Understand robots.txt.
2
+
1
3
  # Created by James Edward Gray II on 2006-01-31.
2
4
  # Copyright 2006 Gray Productions. All rights reserved.
3
5
 
@@ -1,5 +1,6 @@
1
- # Copyright 2007 Mike Burns
1
+ # Specialized spidering rules.
2
2
 
3
+ # Copyright 2007 Mike Burns
3
4
  # Redistribution and use in source and binary forms, with or without
4
5
  # modification, are permitted provided that the following conditions are met:
5
6
  # * Redistributions of source code must retain the above copyright
@@ -22,7 +23,7 @@
22
23
  # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23
24
  # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24
25
 
25
- require 'robot_rules'
26
+ require File.dirname(__FILE__)+'/robot_rules.rb'
26
27
  require 'open-uri'
27
28
  require 'uri'
28
29
  require 'net/http'
@@ -221,7 +222,7 @@ class SpiderInstance
221
222
  r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri,
222
223
  @headers))}
223
224
  if r.redirect?
224
- get_page(URI.parse(r['Location']), &block)
225
+ get_page(URI.parse(construct_complete_url(parsed_url,r['Location'])), &block)
225
226
  else
226
227
  block.call(r)
227
228
  end
@@ -252,21 +253,7 @@ class SpiderInstance
252
253
  if parsed_link.fragment == '#'
253
254
  nil
254
255
  else
255
- case parsed_link.scheme
256
- when 'http'
257
- link
258
- when nil
259
- u = URI.parse(base_url)
260
- if link[0].chr == '/'
261
- "#{u.scheme}://#{u.host}:#{u.port}#{link}"
262
- elsif u.path.nil? || u.path == ''
263
- "#{u.scheme}://#{u.host}:#{u.port}/#{link}"
264
- else
265
- "#{u.scheme}://#{u.host}:#{u.port}/#{u.path}/#{link}"
266
- end
267
- else
268
- nil
269
- end
256
+ construct_complete_url(base_url, link, parsed_link)
270
257
  end
271
258
  rescue
272
259
  nil
@@ -274,6 +261,23 @@ class SpiderInstance
274
261
  end.compact
275
262
  end
276
263
 
264
+ def construct_complete_url(base_url, additional_url, parsed_additional_url = nil) #:nodoc:
265
+ parsed_additional_url ||= URI.parse(additional_url)
266
+ case parsed_additional_url.scheme
267
+ when nil
268
+ u = base_url.is_a?(URI) ? base_url : URI.parse(base_url)
269
+ if additional_url[0].chr == '/'
270
+ "#{u.scheme}://#{u.host}:#{u.port}#{additional_url}"
271
+ elsif u.path.nil? || u.path == ''
272
+ "#{u.scheme}://#{u.host}:#{u.port}/#{additional_url}"
273
+ else
274
+ "#{u.scheme}://#{u.host}:#{u.port}/#{u.path}/#{additional_url}"
275
+ end
276
+ else
277
+ additional_url
278
+ end
279
+ end
280
+
277
281
  def remove_trailing_slash(s) #:nodoc:
278
282
  s.sub(%r{/*$},'')
279
283
  end
@@ -0,0 +1,90 @@
1
+ require 'rubygems'
2
+ require 'webrick'
3
+ require 'spec'
4
+
5
+ Spec::Runner.configure { |c| c.mock_with :mocha }
6
+
7
+ def local_require(*files)
8
+ files.each do |file|
9
+ require File.dirname(__FILE__)+'/../lib/'+file
10
+ end
11
+ end
12
+
13
+ class BeStaticServerPages
14
+ def initialize
15
+ @pages = ['http://localhost:8888/', 'http://localhost:8888/foo']
16
+ @actual = nil
17
+ end
18
+
19
+ attr :actual, true
20
+
21
+ def matches?(actual)
22
+ @actual = actual
23
+ actual == @pages
24
+ end
25
+
26
+ def failure_message
27
+ "expected #{@pages.inspect}, got #{@actual.inspect}"
28
+ end
29
+
30
+ def description
31
+ "be the pages returned by the static server (#{@pages.inspect})"
32
+ end
33
+ end
34
+
35
+ def with_web_server(svlt)
36
+ server = WEBrick::HTTPServer.new(:Port => 8888, :Logger => null_logger,
37
+ :AccessLog => [])
38
+ server.mount('/', svlt)
39
+ Thread.new {server.start}
40
+ begin
41
+ yield
42
+ ensure
43
+ server.shutdown
44
+ end
45
+ end
46
+
47
+ def with_memcached
48
+ system('memcached -d -P /tmp/spider-memcached.pid')
49
+ cacher = IncludedInMemcached.new('localhost:11211')
50
+ begin
51
+ yield
52
+ ensure
53
+ system('kill -KILL `cat /tmp/spider-memcached.pid`')
54
+ end
55
+ end
56
+
57
+ def be_static_server_pages
58
+ BeStaticServerPages.new
59
+ end
60
+
61
+ class QueryServlet < WEBrick::HTTPServlet::AbstractServlet
62
+ def do_GET(req, res)
63
+ res['Content-type'] = 'text/plain'
64
+ res.body = "response\n"
65
+ end
66
+ end
67
+
68
+ class LoopingServlet < WEBrick::HTTPServlet::AbstractServlet
69
+ def do_GET(req, res)
70
+ res['Content-type'] = 'text/html'
71
+ if req.path == '/foo'
72
+ res.body = <<-END
73
+ <a href="/">a</a>
74
+ END
75
+ else
76
+ res.body = <<-END
77
+ <a href="/foo">b</a>
78
+ END
79
+ end
80
+ end
81
+ end
82
+
83
+ def null_logger
84
+ l = stub
85
+ [:log, :fatal, :error, :warn , :info, :debug].each do |k|
86
+ l.stubs(k)
87
+ l.stubs("#{k}?".to_sym)
88
+ end
89
+ l
90
+ end
@@ -1,8 +1,7 @@
1
- require 'rubygems'
2
- require 'spec'
1
+ require File.dirname(__FILE__)+'/../spec_helper'
3
2
 
4
3
  def before_specing_memcached
5
- require File.dirname(__FILE__)+'/../lib/included_in_memcached'
4
+ local_require 'spider/included_in_memcached'
6
5
  system('memcached -d -P /tmp/spider-memcached.pid')
7
6
  end
8
7
 
@@ -1,49 +1,35 @@
1
- require 'rubygems'
2
- require 'spec'
1
+ require File.dirname(__FILE__)+'/../spec_helper'
3
2
  require 'webrick'
4
3
  require 'webrick/https'
5
- require File.dirname(__FILE__)+'/../lib/spider'
6
- require File.dirname(__FILE__)+'/../lib/included_in_memcached'
4
+ local_require 'spider', 'spider/included_in_memcached'
7
5
 
8
- Spec::Runner.configure { |c| c.mock_with :mocha }
9
-
10
- class QueryServlet < WEBrick::HTTPServlet::AbstractServlet
11
- def do_GET(req, res)
12
- res['Content-type'] = 'text/plain'
13
- res.body = "response\n"
14
- end
15
- end
16
-
17
- class LoopingServlet < WEBrick::HTTPServlet::AbstractServlet
18
- def do_GET(req, res)
19
- res['Content-type'] = 'text/html'
20
- if req.path == '/foo'
21
- res.body = <<-END
22
- <a href="/">a</a>
23
- END
24
- else
25
- res.body = <<-END
26
- <a href="/foo">b</a>
27
- END
6
+ describe 'SpiderInstance' do
7
+ # http://www.rcuk.ac.uk/ redirects to /default.htm, which isn't a complete
8
+ # URL. Bug reported by Henri Cook.
9
+ it 'should construct a complete redirect URL' do
10
+ @response_called = false
11
+ redirected_resp = stub(:redirect? => true,
12
+ :[] => '/default.htm')
13
+ success_resp = stub(:redirect? => false)
14
+ http_req = stub(:request => true)
15
+ http_mock_redir = stub(:use_ssl= => true)
16
+ http_mock_redir.stubs(:start).yields(http_req).returns(redirected_resp)
17
+ http_mock_success = stub(:use_ssl= => true)
18
+ http_mock_success.stubs(:start).yields(http_req).returns(success_resp)
19
+ Net::HTTP.expects(:new).times(2).returns(http_mock_redir).then.
20
+ returns(http_mock_success)
21
+ si = SpiderInstance.new({nil => ['http://www.rcuk.ac.uk/']})
22
+ si.get_page(URI.parse('http://www.rcuk.ac.uk/')) do |resp|
23
+ @response_called = true
28
24
  end
25
+ @response_called.should be_true
29
26
  end
30
- end
31
-
32
- def null_logger
33
- l = stub
34
- [:log, :fatal, :error, :warn , :info, :debug].each do |k|
35
- l.stubs(k)
36
- l.stubs("#{k}?".to_sym)
37
- end
38
- l
39
- end
40
27
 
41
- describe 'SpiderInstance' do
42
28
  it 'should prevent cycles with an IncludedInMemcached' do
43
- system('memcached -d -P /tmp/spider-memcached.pid')
44
- cacher = IncludedInMemcached.new('localhost:11211')
45
- it_should_prevent_cycles_with(cacher)
46
- system('kill -KILL `cat /tmp/spider-memcached.pid`')
29
+ with_memcached do
30
+ cacher = IncludedInMemcached.new('localhost:11211')
31
+ it_should_prevent_cycles_with(cacher)
32
+ end
47
33
  end
48
34
 
49
35
  it 'should prevent cycles with an Array' do
@@ -129,15 +115,12 @@ describe 'SpiderInstance' do
129
115
  u = 'http://localhost:8888?s=1'
130
116
  u_p = URI.parse(u)
131
117
  @block_called = false
132
- server = WEBrick::HTTPServer.new(:Port => 8888, :Logger => null_logger,
133
- :AccessLog => [])
134
- server.mount('/', QueryServlet)
135
- Thread.new {server.start}
136
- si = SpiderInstance.new({nil => [u]})
137
- si.get_page(u_p) do
138
- @block_called = true
118
+ with_web_server(QueryServlet) do
119
+ si = SpiderInstance.new({nil => [u]})
120
+ si.get_page(u_p) do
121
+ @block_called = true
122
+ end
139
123
  end
140
- server.shutdown
141
124
  @block_called.should be_true
142
125
  end
143
126
 
@@ -413,15 +396,10 @@ describe 'SpiderInstance' do
413
396
  u2 = 'http://localhost:8888/foo'
414
397
  u_p2 = URI.parse(u2)
415
398
 
416
- server = WEBrick::HTTPServer.new(:Port => 8888, :Logger => null_logger,
417
- :AccessLog => [])
418
- server.mount('/', LoopingServlet)
419
- Thread.new {server.start}
420
-
421
- si = SpiderInstance.new(nil => [u])
422
- si.check_already_seen_with cacher
423
- si.start!
424
-
425
- server.shutdown
399
+ with_web_server(LoopingServlet) do
400
+ si = SpiderInstance.new(nil => [u])
401
+ si.check_already_seen_with cacher
402
+ si.start!
403
+ end
426
404
  end
427
405
  end
@@ -1,10 +1,33 @@
1
- require 'rubygems'
2
- require 'spec'
3
- require File.dirname(__FILE__)+'/../lib/spider'
1
+ require File.dirname(__FILE__)+'/spec_helper'
2
+ local_require 'spider', 'spider/included_in_memcached'
4
3
 
5
4
  describe 'Spider' do
6
- it 'should start at the given URL when given a string' do
7
- #Spider.start_at('http://example.com/') {}
8
- pending 'this will be a while'
5
+ it 'should find two pages without cycles using defaults' do
6
+ u = []
7
+ with_web_server(LoopingServlet) do
8
+ u = find_pages_with_static_server
9
+ end
10
+ u.should be_static_server_pages
11
+ end
12
+
13
+ it 'should find two pages without cycles using memcached' do
14
+ u = []
15
+ with_web_server(LoopingServlet) do
16
+ with_memcached do
17
+ u = find_pages_with_static_server do |s|
18
+ s.check_already_seen_with IncludedInMemcached.new('localhost:11211')
19
+ end
20
+ end
21
+ end
22
+ u.should be_static_server_pages
23
+ end
24
+
25
+ def find_pages_with_static_server(&block)
26
+ pages = []
27
+ Spider.start_at('http://localhost:8888/') do |s|
28
+ block.call(s) unless block.nil?
29
+ s.on(:every){ |u,r,p| pages << u }
30
+ end
31
+ pages
9
32
  end
10
33
  end
@@ -13,5 +13,5 @@ spec = Gem::Specification.new do |s|
13
13
  A Web spidering library: handles robots.txt, scraping, finding more
14
14
  links, and doing it all over again.
15
15
  EOF
16
- s.version = '0.4.0'
16
+ s.version = '0.4.1'
17
17
  end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.4
3
3
  specification_version: 1
4
4
  name: spider
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.4.0
7
- date: 2007-11-02 00:00:00 -04:00
6
+ version: 0.4.1
7
+ date: 2007-11-10 00:00:00 -05:00
8
8
  summary: A Web spidering library
9
9
  require_paths:
10
10
  - lib
@@ -34,8 +34,9 @@ files:
34
34
  - doc/files
35
35
  - doc/files/lib
36
36
  - doc/files/lib/spider_rb.html
37
- - doc/files/lib/spider_instance_rb.html
38
- - doc/files/lib/included_in_memcached_rb.html
37
+ - doc/files/lib/spider
38
+ - doc/files/lib/spider/spider_instance_rb.html
39
+ - doc/files/lib/spider/included_in_memcached_rb.html
39
40
  - doc/files/README.html
40
41
  - doc/classes
41
42
  - doc/classes/IncludedInMemcached.html
@@ -47,26 +48,20 @@ files:
47
48
  - doc/index.html
48
49
  - doc/created.rid
49
50
  - spec
51
+ - spec/spider
52
+ - spec/spider/included_in_memcached_spec.rb
53
+ - spec/spider/spider_instance_spec.rb
50
54
  - spec/spider_spec.rb
51
- - spec/included_in_memcached_spec.rb
52
- - spec/spider_instance_spec.rb
55
+ - spec/spec_helper.rb
53
56
  - README
54
57
  - spider.gemspec
55
58
  - CHANGES
56
59
  - lib
57
60
  - lib/spider.rb
58
- - lib/robot_rules.rb
59
- - lib/spider_instance.rb
60
- - lib/included_in_memcached.rb
61
- - test_server
62
- - test_server/server1
63
- - test_server/server1/page1.html
64
- - test_server/server1/page2.html
65
- - test_server/server2
66
- - test_server/server2/page1.html
67
- - test_server/server2/page2.html
68
- - test_server/servers.rb
69
- - test_server/client.rb
61
+ - lib/spider
62
+ - lib/spider/included_in_memcached.rb
63
+ - lib/spider/robot_rules.rb
64
+ - lib/spider/spider_instance.rb
70
65
  test_files: []
71
66
 
72
67
  rdoc_options: []
@@ -1,22 +0,0 @@
1
- require 'memcache'
2
-
3
- # A specialized class using memcached to track items stored. It supports
4
- # three operations: new, <<, and include? . Together these can be used to
5
- # add items to the memcache, then determine whether the item has been added.
6
- class IncludedInMemcached
7
- # Construct a new IncludedInMemcached instance. All arguments here are
8
- # passed to MemCache (part of the memcache-client gem).
9
- def initialize(*a)
10
- @c = MemCache.new(*a)
11
- end
12
-
13
- # Add an item to the memcache.
14
- def <<(v)
15
- @c.add(v.to_s, v)
16
- end
17
-
18
- # True if the item is in the memcache.
19
- def include?(v)
20
- @c.get(v.to_s) == v
21
- end
22
- end
@@ -1,26 +0,0 @@
1
- #!/usr/local/bin/ruby -w
2
-
3
- require 'rubygems'
4
- require 'spider'
5
-
6
- Spider.start_at('http://localhost:8880/page1.html') do |s|
7
- s.add_url_check do |a_url|
8
- a_url =~ %r{^http://localhost:8880.*}
9
- end
10
-
11
- s.on 404 do |a_url, resp, prior|
12
- puts "URL not found: #{a_url}"
13
- end
14
-
15
- s.on :success do |a_url, resp, prior|
16
- puts "body: #{resp.body}"
17
- end
18
-
19
- s.on :any do |a_url, resp, prior|
20
- puts "URL returned anything: #{a_url} with this code #{resp.code}"
21
- end
22
- end
23
-
24
- %w(INT TERM).each do |signal|
25
- trap(signal) { exit }
26
- end
@@ -1 +0,0 @@
1
- <a href="page2.html">See page two!</a>
@@ -1,3 +0,0 @@
1
- <a href="page1.html">See page one!</a>
2
- <a href="http://localhost:8881/page1.html">See server two!</a>
3
-
@@ -1 +0,0 @@
1
- <a href="page2.html">See page two!</a>
@@ -1,2 +0,0 @@
1
- <a href="page1.html">See page one!</a>
2
- <a href="http://localhost:8880/page1.html">See server one!</a>
@@ -1,24 +0,0 @@
1
- #!/usr/local/bin/ruby -w
2
- # Two Web servers, on different ports of localhost, serving two pages each.
3
- # One page links to the next; the next links to both the first and to the
4
- # first on the other server.
5
- # This is used to test: cycles, domain restrictions.
6
-
7
- require 'webrick'
8
-
9
- server1 = WEBrick::HTTPServer.new(:Port => 8880)
10
- server1.mount('/', WEBrick::HTTPServlet::FileHandler,
11
- File.dirname(__FILE__)+'/server1')
12
- server2 = WEBrick::HTTPServer.new(:Port => 8881)
13
- server2.mount('/', WEBrick::HTTPServlet::FileHandler,
14
- File.dirname(__FILE__)+'/server2')
15
-
16
- %w(INT TERM).each do |signal|
17
- trap(signal) { [server1,server2].each { |server| server.shutdown } }
18
- end
19
-
20
- threads = []
21
- [server1,server2].each do |server|
22
- threads << Thread.new { server.start }
23
- end
24
- threads.each { |t| t.join }