spider 0.4.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGES CHANGED
@@ -1,3 +1,10 @@
1
+ 2007-11-09:
2
+ * Handle redirects that assume a base URL.
3
+
4
+ 2007-11-08:
5
+ * Move spider_instance.rb, robot_rules.rb, and included_in_memcached.rb into
6
+ spider subdirectory.
7
+
1
8
  2007-11-02:
2
9
  * Memcached support.
3
10
 
data/README CHANGED
@@ -108,7 +108,7 @@ scraping, collecting, and looping so that you can just handle the data.
108
108
 
109
109
  Mike Burns http://mike-burns.com mike@mike-burns.com
110
110
 
111
- Help from Matt Horan and John Nagro.
111
+ Help from Matt Horan, John Nagro, and Henri Cook.
112
112
 
113
113
  With `robot_rules' from James Edward Gray II via
114
114
  http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589
@@ -55,8 +55,8 @@
55
55
  <tr class="top-aligned-row">
56
56
  <td><strong>In:</strong></td>
57
57
  <td>
58
- <a href="../files/lib/included_in_memcached_rb.html">
59
- lib/included_in_memcached.rb
58
+ <a href="../files/lib/spider/included_in_memcached_rb.html">
59
+ lib/spider/included_in_memcached.rb
60
60
  </a>
61
61
  <br />
62
62
  </td>
@@ -86,6 +86,15 @@ three operations: <a href="IncludedInMemcached.html#M000001">new</a>,
86
86
  Together these can be used to add items to the memcache, then determine
87
87
  whether the item has been added.
88
88
  </p>
89
+ <p>
90
+ To use it with <a href="Spider.html">Spider</a> use the
91
+ check_already_seen_with method:
92
+ </p>
93
+ <pre>
94
+ Spider.start_at('http://example.com/') do |s|
95
+ s.check_already_seen_with IncludedInMemcached.new('localhost:11211')
96
+ end
97
+ </pre>
89
98
 
90
99
  </div>
91
100
 
@@ -139,7 +148,7 @@ arguments here are passed to MemCache (part of the memcache-client gem).
139
148
  onclick="toggleCode('M000001-source');return false;">[Source]</a></p>
140
149
  <div class="method-source-code" id="M000001-source">
141
150
  <pre>
142
- <span class="ruby-comment cmt"># File lib/included_in_memcached.rb, line 9</span>
151
+ <span class="ruby-comment cmt"># File lib/spider/included_in_memcached.rb, line 39</span>
143
152
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-operator">*</span><span class="ruby-identifier">a</span>)
144
153
  <span class="ruby-ivar">@c</span> = <span class="ruby-constant">MemCache</span>.<span class="ruby-identifier">new</span>(<span class="ruby-operator">*</span><span class="ruby-identifier">a</span>)
145
154
  <span class="ruby-keyword kw">end</span>
@@ -167,7 +176,7 @@ Add an item to the memcache.
167
176
  onclick="toggleCode('M000002-source');return false;">[Source]</a></p>
168
177
  <div class="method-source-code" id="M000002-source">
169
178
  <pre>
170
- <span class="ruby-comment cmt"># File lib/included_in_memcached.rb, line 14</span>
179
+ <span class="ruby-comment cmt"># File lib/spider/included_in_memcached.rb, line 44</span>
171
180
  <span class="ruby-keyword kw">def</span> <span class="ruby-operator">&lt;&lt;</span>(<span class="ruby-identifier">v</span>)
172
181
  <span class="ruby-ivar">@c</span>.<span class="ruby-identifier">add</span>(<span class="ruby-identifier">v</span>.<span class="ruby-identifier">to_s</span>, <span class="ruby-identifier">v</span>)
173
182
  <span class="ruby-keyword kw">end</span>
@@ -193,7 +202,7 @@ True if the item is in the memcache.
193
202
  onclick="toggleCode('M000003-source');return false;">[Source]</a></p>
194
203
  <div class="method-source-code" id="M000003-source">
195
204
  <pre>
196
- <span class="ruby-comment cmt"># File lib/included_in_memcached.rb, line 19</span>
205
+ <span class="ruby-comment cmt"># File lib/spider/included_in_memcached.rb, line 49</span>
197
206
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">include?</span>(<span class="ruby-identifier">v</span>)
198
207
  <span class="ruby-ivar">@c</span>.<span class="ruby-identifier">get</span>(<span class="ruby-identifier">v</span>.<span class="ruby-identifier">to_s</span>) <span class="ruby-operator">==</span> <span class="ruby-identifier">v</span>
199
208
  <span class="ruby-keyword kw">end</span>
@@ -55,8 +55,8 @@
55
55
  <tr class="top-aligned-row">
56
56
  <td><strong>In:</strong></td>
57
57
  <td>
58
- <a href="../files/lib/spider_instance_rb.html">
59
- lib/spider_instance.rb
58
+ <a href="../files/lib/spider/spider_instance_rb.html">
59
+ lib/spider/spider_instance.rb
60
60
  </a>
61
61
  <br />
62
62
  </td>
@@ -140,7 +140,7 @@ href="http://mike-burns.com">mike-burns.com</a>&#8217;:
140
140
  onclick="toggleCode('M000004-source');return false;">[Source]</a></p>
141
141
  <div class="method-source-code" id="M000004-source">
142
142
  <pre>
143
- <span class="ruby-comment cmt"># File lib/spider_instance.rb, line 69</span>
143
+ <span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 70</span>
144
144
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_url_check</span>(<span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
145
145
  <span class="ruby-ivar">@url_checks</span> <span class="ruby-operator">&lt;&lt;</span> <span class="ruby-identifier">block</span>
146
146
  <span class="ruby-keyword kw">end</span>
@@ -186,7 +186,7 @@ understand just &lt;&lt; and included? .
186
186
  onclick="toggleCode('M000005-source');return false;">[Source]</a></p>
187
187
  <div class="method-source-code" id="M000005-source">
188
188
  <pre>
189
- <span class="ruby-comment cmt"># File lib/spider_instance.rb, line 90</span>
189
+ <span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 91</span>
190
190
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">check_already_seen_with</span>(<span class="ruby-identifier">cacher</span>)
191
191
  <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">cacher</span>.<span class="ruby-identifier">respond_to?</span>(<span class="ruby-identifier">:&lt;&lt;</span>) <span class="ruby-operator">&amp;&amp;</span> <span class="ruby-identifier">cacher</span>.<span class="ruby-identifier">respond_to?</span>(<span class="ruby-identifier">:include?</span>)
192
192
  <span class="ruby-ivar">@seen</span> = <span class="ruby-identifier">cacher</span>
@@ -216,7 +216,7 @@ Reset the <a href="SpiderInstance.html#M000009">headers</a> hash.
216
216
  onclick="toggleCode('M000010-source');return false;">[Source]</a></p>
217
217
  <div class="method-source-code" id="M000010-source">
218
218
  <pre>
219
- <span class="ruby-comment cmt"># File lib/spider_instance.rb, line 157</span>
219
+ <span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 158</span>
220
220
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">clear_headers</span>
221
221
  <span class="ruby-ivar">@headers</span> = {}
222
222
  <span class="ruby-keyword kw">end</span>
@@ -245,7 +245,7 @@ Use like a hash:
245
245
  onclick="toggleCode('M000009-source');return false;">[Source]</a></p>
246
246
  <div class="method-source-code" id="M000009-source">
247
247
  <pre>
248
- <span class="ruby-comment cmt"># File lib/spider_instance.rb, line 145</span>
248
+ <span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 146</span>
249
249
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">headers</span>
250
250
  <span class="ruby-constant">HeaderSetter</span>.<span class="ruby-identifier">new</span>(<span class="ruby-keyword kw">self</span>)
251
251
  <span class="ruby-keyword kw">end</span>
@@ -294,7 +294,7 @@ For example:
294
294
  onclick="toggleCode('M000006-source');return false;">[Source]</a></p>
295
295
  <div class="method-source-code" id="M000006-source">
296
296
  <pre>
297
- <span class="ruby-comment cmt"># File lib/spider_instance.rb, line 120</span>
297
+ <span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 121</span>
298
298
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">on</span>(<span class="ruby-identifier">code</span>, <span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
299
299
  <span class="ruby-identifier">f</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
300
300
  <span class="ruby-keyword kw">case</span> <span class="ruby-identifier">code</span>
@@ -331,7 +331,7 @@ Run before the HTTP request. Given the URL as a string.
331
331
  onclick="toggleCode('M000007-source');return false;">[Source]</a></p>
332
332
  <div class="method-source-code" id="M000007-source">
333
333
  <pre>
334
- <span class="ruby-comment cmt"># File lib/spider_instance.rb, line 134</span>
334
+ <span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 135</span>
335
335
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">setup</span>(<span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
336
336
  <span class="ruby-ivar">@setup</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
337
337
  <span class="ruby-keyword kw">end</span>
@@ -357,7 +357,7 @@ Run last, once for each page. Given the URL as a string.
357
357
  onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
358
358
  <div class="method-source-code" id="M000008-source">
359
359
  <pre>
360
- <span class="ruby-comment cmt"># File lib/spider_instance.rb, line 139</span>
360
+ <span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 140</span>
361
361
  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">teardown</span>(<span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
362
362
  <span class="ruby-ivar">@teardown</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
363
363
  <span class="ruby-keyword kw">end</span>
@@ -1 +1 @@
1
- Fri, 02 Nov 2007 17:20:02 -0400
1
+ Sat, 10 Nov 2007 00:25:19 -0500
@@ -56,7 +56,7 @@
56
56
  </tr>
57
57
  <tr class="top-aligned-row">
58
58
  <td><strong>Last Update:</strong></td>
59
- <td>Fri Nov 02 17:19:47 -0400 2007</td>
59
+ <td>Thu Nov 08 17:51:17 -0500 2007</td>
60
60
  </tr>
61
61
  </table>
62
62
  </div>
@@ -182,7 +182,7 @@ Mike Burns <a href="http://mike-burns.com">mike-burns.com</a>
182
182
  mike@mike-burns.com
183
183
  </p>
184
184
  <p>
185
- Help from Matt Horan and John Nagro.
185
+ Help from Matt Horan, John Nagro, and Henri Cook.
186
186
  </p>
187
187
  <p>
188
188
  With `robot_rules&#8217; from James Edward Gray II via <a
@@ -8,7 +8,7 @@
8
8
  <title>File: included_in_memcached.rb</title>
9
9
  <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
10
  <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
- <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
11
+ <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
12
12
  <script type="text/javascript">
13
13
  // <![CDATA[
14
14
 
@@ -51,12 +51,12 @@
51
51
  <table class="header-table">
52
52
  <tr class="top-aligned-row">
53
53
  <td><strong>Path:</strong></td>
54
- <td>lib/included_in_memcached.rb
54
+ <td>lib/spider/included_in_memcached.rb
55
55
  </td>
56
56
  </tr>
57
57
  <tr class="top-aligned-row">
58
58
  <td><strong>Last Update:</strong></td>
59
- <td>Fri Nov 02 15:04:14 -0400 2007</td>
59
+ <td>Sat Nov 10 00:24:11 -0500 2007</td>
60
60
  </tr>
61
61
  </table>
62
62
  </div>
@@ -68,6 +68,12 @@
68
68
 
69
69
  <div id="contextContent">
70
70
 
71
+ <div id="description">
72
+ <p>
73
+ Use memcached to track cycles.
74
+ </p>
75
+
76
+ </div>
71
77
 
72
78
  <div id="requires-list">
73
79
  <h3 class="section-bar">Required files</h3>
@@ -8,7 +8,7 @@
8
8
  <title>File: spider_instance.rb</title>
9
9
  <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
10
  <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
- <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
11
+ <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
12
12
  <script type="text/javascript">
13
13
  // <![CDATA[
14
14
 
@@ -51,12 +51,12 @@
51
51
  <table class="header-table">
52
52
  <tr class="top-aligned-row">
53
53
  <td><strong>Path:</strong></td>
54
- <td>lib/spider_instance.rb
54
+ <td>lib/spider/spider_instance.rb
55
55
  </td>
56
56
  </tr>
57
57
  <tr class="top-aligned-row">
58
58
  <td><strong>Last Update:</strong></td>
59
- <td>Fri Nov 02 17:05:49 -0400 2007</td>
59
+ <td>Sat Nov 10 00:25:04 -0500 2007</td>
60
60
  </tr>
61
61
  </table>
62
62
  </div>
@@ -70,7 +70,7 @@
70
70
 
71
71
  <div id="description">
72
72
  <p>
73
- Copyright 2007 Mike Burns
73
+ Specialized spidering rules.
74
74
  </p>
75
75
 
76
76
  </div>
@@ -56,7 +56,7 @@
56
56
  </tr>
57
57
  <tr class="top-aligned-row">
58
58
  <td><strong>Last Update:</strong></td>
59
- <td>Fri Nov 02 12:32:39 -0400 2007</td>
59
+ <td>Thu Nov 08 17:29:01 -0500 2007</td>
60
60
  </tr>
61
61
  </table>
62
62
  </div>
@@ -182,7 +182,7 @@ Mike Burns <a href="http://mike-burns.com">mike-burns.com</a>
182
182
  mike@mike-burns.com
183
183
  </p>
184
184
  <p>
185
- Help from Matt Horan and John Nagro.
185
+ Help from Matt Horan, John Nagro, and Henri Cook.
186
186
  </p>
187
187
  <p>
188
188
  With `robot_rules&#8217; from James Edward Gray II via <a
@@ -21,9 +21,9 @@
21
21
  <h1 class="section-bar">Files</h1>
22
22
  <div id="index-entries">
23
23
  <a href="files/README.html">README</a><br />
24
- <a href="files/lib/included_in_memcached_rb.html">lib/included_in_memcached.rb</a><br />
25
24
  <a href="files/lib/spider_rb.html">lib/spider.rb</a><br />
26
- <a href="files/lib/spider_instance_rb.html">lib/spider_instance.rb</a><br />
25
+ <a href="files/lib/spider/included_in_memcached_rb.html">lib/spider/included_in_memcached.rb</a><br />
26
+ <a href="files/lib/spider/spider_instance_rb.html">lib/spider/spider_instance.rb</a><br />
27
27
  </div>
28
28
  </div>
29
29
  </body>
@@ -23,7 +23,7 @@
23
23
  # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24
24
  # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
25
 
26
- require File.dirname(__FILE__)+'/spider_instance'
26
+ require File.dirname(__FILE__)+'/spider/spider_instance'
27
27
 
28
28
  # A spidering library for Ruby. Handles robots.txt, scraping, finding more
29
29
  # links, and doing it all over again.
@@ -0,0 +1,52 @@
1
+ # Use memcached to track cycles.
2
+
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions are met:
5
+ # * Redistributions of source code must retain the above copyright
6
+ # notice, this list of conditions and the following disclaimer.
7
+ # * Redistributions in binary form must reproduce the above copyright
8
+ # notice, this list of conditions and the following disclaimer in the
9
+ # documentation and/or other materials provided with the distribution.
10
+ # * Neither the name Mike Burns nor the
11
+ # names of his contributors may be used to endorse or promote products
12
+ # derived from this software without specific prior written permission.
13
+ #
14
+ # THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
15
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16
+ # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17
+ # DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
18
+ # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19
+ # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20
+ # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
21
+ # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24
+
25
+ require 'memcache'
26
+
27
+ # A specialized class using memcached to track items stored. It supports
28
+ # three operations: new, <<, and include? . Together these can be used to
29
+ # add items to the memcache, then determine whether the item has been added.
30
+ #
31
+ # To use it with Spider use the check_already_seen_with method:
32
+ #
33
+ # Spider.start_at('http://example.com/') do |s|
34
+ # s.check_already_seen_with IncludedInMemcached.new('localhost:11211')
35
+ # end
36
+ class IncludedInMemcached
37
+ # Construct a new IncludedInMemcached instance. All arguments here are
38
+ # passed to MemCache (part of the memcache-client gem).
39
+ def initialize(*a)
40
+ @c = MemCache.new(*a)
41
+ end
42
+
43
+ # Add an item to the memcache.
44
+ def <<(v)
45
+ @c.add(v.to_s, v)
46
+ end
47
+
48
+ # True if the item is in the memcache.
49
+ def include?(v)
50
+ @c.get(v.to_s) == v
51
+ end
52
+ end
@@ -1,3 +1,5 @@
1
+ # Understand robots.txt.
2
+
1
3
  # Created by James Edward Gray II on 2006-01-31.
2
4
  # Copyright 2006 Gray Productions. All rights reserved.
3
5
 
@@ -1,5 +1,6 @@
1
- # Copyright 2007 Mike Burns
1
+ # Specialized spidering rules.
2
2
 
3
+ # Copyright 2007 Mike Burns
3
4
  # Redistribution and use in source and binary forms, with or without
4
5
  # modification, are permitted provided that the following conditions are met:
5
6
  # * Redistributions of source code must retain the above copyright
@@ -22,7 +23,7 @@
22
23
  # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23
24
  # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24
25
 
25
- require 'robot_rules'
26
+ require File.dirname(__FILE__)+'/robot_rules.rb'
26
27
  require 'open-uri'
27
28
  require 'uri'
28
29
  require 'net/http'
@@ -221,7 +222,7 @@ class SpiderInstance
221
222
  r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri,
222
223
  @headers))}
223
224
  if r.redirect?
224
- get_page(URI.parse(r['Location']), &block)
225
+ get_page(URI.parse(construct_complete_url(parsed_url,r['Location'])), &block)
225
226
  else
226
227
  block.call(r)
227
228
  end
@@ -252,21 +253,7 @@ class SpiderInstance
252
253
  if parsed_link.fragment == '#'
253
254
  nil
254
255
  else
255
- case parsed_link.scheme
256
- when 'http'
257
- link
258
- when nil
259
- u = URI.parse(base_url)
260
- if link[0].chr == '/'
261
- "#{u.scheme}://#{u.host}:#{u.port}#{link}"
262
- elsif u.path.nil? || u.path == ''
263
- "#{u.scheme}://#{u.host}:#{u.port}/#{link}"
264
- else
265
- "#{u.scheme}://#{u.host}:#{u.port}/#{u.path}/#{link}"
266
- end
267
- else
268
- nil
269
- end
256
+ construct_complete_url(base_url, link, parsed_link)
270
257
  end
271
258
  rescue
272
259
  nil
@@ -274,6 +261,23 @@ class SpiderInstance
274
261
  end.compact
275
262
  end
276
263
 
264
+ def construct_complete_url(base_url, additional_url, parsed_additional_url = nil) #:nodoc:
265
+ parsed_additional_url ||= URI.parse(additional_url)
266
+ case parsed_additional_url.scheme
267
+ when nil
268
+ u = base_url.is_a?(URI) ? base_url : URI.parse(base_url)
269
+ if additional_url[0].chr == '/'
270
+ "#{u.scheme}://#{u.host}:#{u.port}#{additional_url}"
271
+ elsif u.path.nil? || u.path == ''
272
+ "#{u.scheme}://#{u.host}:#{u.port}/#{additional_url}"
273
+ else
274
+ "#{u.scheme}://#{u.host}:#{u.port}/#{u.path}/#{additional_url}"
275
+ end
276
+ else
277
+ additional_url
278
+ end
279
+ end
280
+
277
281
  def remove_trailing_slash(s) #:nodoc:
278
282
  s.sub(%r{/*$},'')
279
283
  end
@@ -0,0 +1,90 @@
1
+ require 'rubygems'
2
+ require 'webrick'
3
+ require 'spec'
4
+
5
+ Spec::Runner.configure { |c| c.mock_with :mocha }
6
+
7
+ def local_require(*files)
8
+ files.each do |file|
9
+ require File.dirname(__FILE__)+'/../lib/'+file
10
+ end
11
+ end
12
+
13
+ class BeStaticServerPages
14
+ def initialize
15
+ @pages = ['http://localhost:8888/', 'http://localhost:8888/foo']
16
+ @actual = nil
17
+ end
18
+
19
+ attr :actual, true
20
+
21
+ def matches?(actual)
22
+ @actual = actual
23
+ actual == @pages
24
+ end
25
+
26
+ def failure_message
27
+ "expected #{@pages.inspect}, got #{@actual.inspect}"
28
+ end
29
+
30
+ def description
31
+ "be the pages returned by the static server (#{@pages.inspect})"
32
+ end
33
+ end
34
+
35
+ def with_web_server(svlt)
36
+ server = WEBrick::HTTPServer.new(:Port => 8888, :Logger => null_logger,
37
+ :AccessLog => [])
38
+ server.mount('/', svlt)
39
+ Thread.new {server.start}
40
+ begin
41
+ yield
42
+ ensure
43
+ server.shutdown
44
+ end
45
+ end
46
+
47
+ def with_memcached
48
+ system('memcached -d -P /tmp/spider-memcached.pid')
49
+ cacher = IncludedInMemcached.new('localhost:11211')
50
+ begin
51
+ yield
52
+ ensure
53
+ system('kill -KILL `cat /tmp/spider-memcached.pid`')
54
+ end
55
+ end
56
+
57
+ def be_static_server_pages
58
+ BeStaticServerPages.new
59
+ end
60
+
61
+ class QueryServlet < WEBrick::HTTPServlet::AbstractServlet
62
+ def do_GET(req, res)
63
+ res['Content-type'] = 'text/plain'
64
+ res.body = "response\n"
65
+ end
66
+ end
67
+
68
+ class LoopingServlet < WEBrick::HTTPServlet::AbstractServlet
69
+ def do_GET(req, res)
70
+ res['Content-type'] = 'text/html'
71
+ if req.path == '/foo'
72
+ res.body = <<-END
73
+ <a href="/">a</a>
74
+ END
75
+ else
76
+ res.body = <<-END
77
+ <a href="/foo">b</a>
78
+ END
79
+ end
80
+ end
81
+ end
82
+
83
+ def null_logger
84
+ l = stub
85
+ [:log, :fatal, :error, :warn , :info, :debug].each do |k|
86
+ l.stubs(k)
87
+ l.stubs("#{k}?".to_sym)
88
+ end
89
+ l
90
+ end
@@ -1,8 +1,7 @@
1
- require 'rubygems'
2
- require 'spec'
1
+ require File.dirname(__FILE__)+'/../spec_helper'
3
2
 
4
3
  def before_specing_memcached
5
- require File.dirname(__FILE__)+'/../lib/included_in_memcached'
4
+ local_require 'spider/included_in_memcached'
6
5
  system('memcached -d -P /tmp/spider-memcached.pid')
7
6
  end
8
7
 
@@ -1,49 +1,35 @@
1
- require 'rubygems'
2
- require 'spec'
1
+ require File.dirname(__FILE__)+'/../spec_helper'
3
2
  require 'webrick'
4
3
  require 'webrick/https'
5
- require File.dirname(__FILE__)+'/../lib/spider'
6
- require File.dirname(__FILE__)+'/../lib/included_in_memcached'
4
+ local_require 'spider', 'spider/included_in_memcached'
7
5
 
8
- Spec::Runner.configure { |c| c.mock_with :mocha }
9
-
10
- class QueryServlet < WEBrick::HTTPServlet::AbstractServlet
11
- def do_GET(req, res)
12
- res['Content-type'] = 'text/plain'
13
- res.body = "response\n"
14
- end
15
- end
16
-
17
- class LoopingServlet < WEBrick::HTTPServlet::AbstractServlet
18
- def do_GET(req, res)
19
- res['Content-type'] = 'text/html'
20
- if req.path == '/foo'
21
- res.body = <<-END
22
- <a href="/">a</a>
23
- END
24
- else
25
- res.body = <<-END
26
- <a href="/foo">b</a>
27
- END
6
+ describe 'SpiderInstance' do
7
+ # http://www.rcuk.ac.uk/ redirects to /default.htm, which isn't a complete
8
+ # URL. Bug reported by Henri Cook.
9
+ it 'should construct a complete redirect URL' do
10
+ @response_called = false
11
+ redirected_resp = stub(:redirect? => true,
12
+ :[] => '/default.htm')
13
+ success_resp = stub(:redirect? => false)
14
+ http_req = stub(:request => true)
15
+ http_mock_redir = stub(:use_ssl= => true)
16
+ http_mock_redir.stubs(:start).yields(http_req).returns(redirected_resp)
17
+ http_mock_success = stub(:use_ssl= => true)
18
+ http_mock_success.stubs(:start).yields(http_req).returns(success_resp)
19
+ Net::HTTP.expects(:new).times(2).returns(http_mock_redir).then.
20
+ returns(http_mock_success)
21
+ si = SpiderInstance.new({nil => ['http://www.rcuk.ac.uk/']})
22
+ si.get_page(URI.parse('http://www.rcuk.ac.uk/')) do |resp|
23
+ @response_called = true
28
24
  end
25
+ @response_called.should be_true
29
26
  end
30
- end
31
-
32
- def null_logger
33
- l = stub
34
- [:log, :fatal, :error, :warn , :info, :debug].each do |k|
35
- l.stubs(k)
36
- l.stubs("#{k}?".to_sym)
37
- end
38
- l
39
- end
40
27
 
41
- describe 'SpiderInstance' do
42
28
  it 'should prevent cycles with an IncludedInMemcached' do
43
- system('memcached -d -P /tmp/spider-memcached.pid')
44
- cacher = IncludedInMemcached.new('localhost:11211')
45
- it_should_prevent_cycles_with(cacher)
46
- system('kill -KILL `cat /tmp/spider-memcached.pid`')
29
+ with_memcached do
30
+ cacher = IncludedInMemcached.new('localhost:11211')
31
+ it_should_prevent_cycles_with(cacher)
32
+ end
47
33
  end
48
34
 
49
35
  it 'should prevent cycles with an Array' do
@@ -129,15 +115,12 @@ describe 'SpiderInstance' do
129
115
  u = 'http://localhost:8888?s=1'
130
116
  u_p = URI.parse(u)
131
117
  @block_called = false
132
- server = WEBrick::HTTPServer.new(:Port => 8888, :Logger => null_logger,
133
- :AccessLog => [])
134
- server.mount('/', QueryServlet)
135
- Thread.new {server.start}
136
- si = SpiderInstance.new({nil => [u]})
137
- si.get_page(u_p) do
138
- @block_called = true
118
+ with_web_server(QueryServlet) do
119
+ si = SpiderInstance.new({nil => [u]})
120
+ si.get_page(u_p) do
121
+ @block_called = true
122
+ end
139
123
  end
140
- server.shutdown
141
124
  @block_called.should be_true
142
125
  end
143
126
 
@@ -413,15 +396,10 @@ describe 'SpiderInstance' do
413
396
  u2 = 'http://localhost:8888/foo'
414
397
  u_p2 = URI.parse(u2)
415
398
 
416
- server = WEBrick::HTTPServer.new(:Port => 8888, :Logger => null_logger,
417
- :AccessLog => [])
418
- server.mount('/', LoopingServlet)
419
- Thread.new {server.start}
420
-
421
- si = SpiderInstance.new(nil => [u])
422
- si.check_already_seen_with cacher
423
- si.start!
424
-
425
- server.shutdown
399
+ with_web_server(LoopingServlet) do
400
+ si = SpiderInstance.new(nil => [u])
401
+ si.check_already_seen_with cacher
402
+ si.start!
403
+ end
426
404
  end
427
405
  end
@@ -1,10 +1,33 @@
1
- require 'rubygems'
2
- require 'spec'
3
- require File.dirname(__FILE__)+'/../lib/spider'
1
+ require File.dirname(__FILE__)+'/spec_helper'
2
+ local_require 'spider', 'spider/included_in_memcached'
4
3
 
5
4
  describe 'Spider' do
6
- it 'should start at the given URL when given a string' do
7
- #Spider.start_at('http://example.com/') {}
8
- pending 'this will be a while'
5
+ it 'should find two pages without cycles using defaults' do
6
+ u = []
7
+ with_web_server(LoopingServlet) do
8
+ u = find_pages_with_static_server
9
+ end
10
+ u.should be_static_server_pages
11
+ end
12
+
13
+ it 'should find two pages without cycles using memcached' do
14
+ u = []
15
+ with_web_server(LoopingServlet) do
16
+ with_memcached do
17
+ u = find_pages_with_static_server do |s|
18
+ s.check_already_seen_with IncludedInMemcached.new('localhost:11211')
19
+ end
20
+ end
21
+ end
22
+ u.should be_static_server_pages
23
+ end
24
+
25
+ def find_pages_with_static_server(&block)
26
+ pages = []
27
+ Spider.start_at('http://localhost:8888/') do |s|
28
+ block.call(s) unless block.nil?
29
+ s.on(:every){ |u,r,p| pages << u }
30
+ end
31
+ pages
9
32
  end
10
33
  end
@@ -13,5 +13,5 @@ spec = Gem::Specification.new do |s|
13
13
  A Web spidering library: handles robots.txt, scraping, finding more
14
14
  links, and doing it all over again.
15
15
  EOF
16
- s.version = '0.4.0'
16
+ s.version = '0.4.1'
17
17
  end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.4
3
3
  specification_version: 1
4
4
  name: spider
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.4.0
7
- date: 2007-11-02 00:00:00 -04:00
6
+ version: 0.4.1
7
+ date: 2007-11-10 00:00:00 -05:00
8
8
  summary: A Web spidering library
9
9
  require_paths:
10
10
  - lib
@@ -34,8 +34,9 @@ files:
34
34
  - doc/files
35
35
  - doc/files/lib
36
36
  - doc/files/lib/spider_rb.html
37
- - doc/files/lib/spider_instance_rb.html
38
- - doc/files/lib/included_in_memcached_rb.html
37
+ - doc/files/lib/spider
38
+ - doc/files/lib/spider/spider_instance_rb.html
39
+ - doc/files/lib/spider/included_in_memcached_rb.html
39
40
  - doc/files/README.html
40
41
  - doc/classes
41
42
  - doc/classes/IncludedInMemcached.html
@@ -47,26 +48,20 @@ files:
47
48
  - doc/index.html
48
49
  - doc/created.rid
49
50
  - spec
51
+ - spec/spider
52
+ - spec/spider/included_in_memcached_spec.rb
53
+ - spec/spider/spider_instance_spec.rb
50
54
  - spec/spider_spec.rb
51
- - spec/included_in_memcached_spec.rb
52
- - spec/spider_instance_spec.rb
55
+ - spec/spec_helper.rb
53
56
  - README
54
57
  - spider.gemspec
55
58
  - CHANGES
56
59
  - lib
57
60
  - lib/spider.rb
58
- - lib/robot_rules.rb
59
- - lib/spider_instance.rb
60
- - lib/included_in_memcached.rb
61
- - test_server
62
- - test_server/server1
63
- - test_server/server1/page1.html
64
- - test_server/server1/page2.html
65
- - test_server/server2
66
- - test_server/server2/page1.html
67
- - test_server/server2/page2.html
68
- - test_server/servers.rb
69
- - test_server/client.rb
61
+ - lib/spider
62
+ - lib/spider/included_in_memcached.rb
63
+ - lib/spider/robot_rules.rb
64
+ - lib/spider/spider_instance.rb
70
65
  test_files: []
71
66
 
72
67
  rdoc_options: []
@@ -1,22 +0,0 @@
1
- require 'memcache'
2
-
3
- # A specialized class using memcached to track items stored. It supports
4
- # three operations: new, <<, and include? . Together these can be used to
5
- # add items to the memcache, then determine whether the item has been added.
6
- class IncludedInMemcached
7
- # Construct a new IncludedInMemcached instance. All arguments here are
8
- # passed to MemCache (part of the memcache-client gem).
9
- def initialize(*a)
10
- @c = MemCache.new(*a)
11
- end
12
-
13
- # Add an item to the memcache.
14
- def <<(v)
15
- @c.add(v.to_s, v)
16
- end
17
-
18
- # True if the item is in the memcache.
19
- def include?(v)
20
- @c.get(v.to_s) == v
21
- end
22
- end
@@ -1,26 +0,0 @@
1
- #!/usr/local/bin/ruby -w
2
-
3
- require 'rubygems'
4
- require 'spider'
5
-
6
- Spider.start_at('http://localhost:8880/page1.html') do |s|
7
- s.add_url_check do |a_url|
8
- a_url =~ %r{^http://localhost:8880.*}
9
- end
10
-
11
- s.on 404 do |a_url, resp, prior|
12
- puts "URL not found: #{a_url}"
13
- end
14
-
15
- s.on :success do |a_url, resp, prior|
16
- puts "body: #{resp.body}"
17
- end
18
-
19
- s.on :any do |a_url, resp, prior|
20
- puts "URL returned anything: #{a_url} with this code #{resp.code}"
21
- end
22
- end
23
-
24
- %w(INT TERM).each do |signal|
25
- trap(signal) { exit }
26
- end
@@ -1 +0,0 @@
1
- <a href="page2.html">See page two!</a>
@@ -1,3 +0,0 @@
1
- <a href="page1.html">See page one!</a>
2
- <a href="http://localhost:8881/page1.html">See server two!</a>
3
-
@@ -1 +0,0 @@
1
- <a href="page2.html">See page two!</a>
@@ -1,2 +0,0 @@
1
- <a href="page1.html">See page one!</a>
2
- <a href="http://localhost:8880/page1.html">See server one!</a>
@@ -1,24 +0,0 @@
1
- #!/usr/local/bin/ruby -w
2
- # Two Web servers, on different ports of localhost, serving two pages each.
3
- # One page links to the next; the next links to both the first and to the
4
- # first on the other server.
5
- # This is used to test: cycles, domain restrictions.
6
-
7
- require 'webrick'
8
-
9
- server1 = WEBrick::HTTPServer.new(:Port => 8880)
10
- server1.mount('/', WEBrick::HTTPServlet::FileHandler,
11
- File.dirname(__FILE__)+'/server1')
12
- server2 = WEBrick::HTTPServer.new(:Port => 8881)
13
- server2.mount('/', WEBrick::HTTPServlet::FileHandler,
14
- File.dirname(__FILE__)+'/server2')
15
-
16
- %w(INT TERM).each do |signal|
17
- trap(signal) { [server1,server2].each { |server| server.shutdown } }
18
- end
19
-
20
- threads = []
21
- [server1,server2].each do |server|
22
- threads << Thread.new { server.start }
23
- end
24
- threads.each { |t| t.join }