RubyGems - spider - Versions diffs - 0.4.0 → 0.4.1 - Mend

spider 0.4.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

data/CHANGES +7 -0
data/README +1 -1
data/doc/classes/IncludedInMemcached.html +14 -5
data/doc/classes/SpiderInstance.html +9 -9
data/doc/created.rid +1 -1
data/doc/files/README.html +2 -2
data/doc/files/lib/{included_in_memcached_rb.html → spider/included_in_memcached_rb.html} +9 -3
data/doc/files/lib/{spider_instance_rb.html → spider/spider_instance_rb.html} +4 -4
data/doc/files/lib/spider_rb.html +2 -2
data/doc/fr_file_index.html +2 -2
data/lib/spider.rb +1 -1
data/lib/spider/included_in_memcached.rb +52 -0
data/lib/{robot_rules.rb → spider/robot_rules.rb} +2 -0
data/lib/{spider_instance.rb → spider/spider_instance.rb} +22 -18
data/spec/spec_helper.rb +90 -0
data/spec/{included_in_memcached_spec.rb → spider/included_in_memcached_spec.rb} +2 -3
data/spec/{spider_instance_spec.rb → spider/spider_instance_spec.rb} +35 -57
data/spec/spider_spec.rb +29 -6
data/spider.gemspec +1 -1
metadata +13 -18
data/lib/included_in_memcached.rb +0 -22
data/test_server/client.rb +0 -26
data/test_server/server1/page1.html +0 -1
data/test_server/server1/page2.html +0 -3
data/test_server/server2/page1.html +0 -1
data/test_server/server2/page2.html +0 -2
data/test_server/servers.rb +0 -24

data/CHANGES CHANGED

@@ -1,3 +1,10 @@
+2007-11-09:
+* Handle redirects that assume a base URL.
+2007-11-08:
+* Move spider_instance.rb, robot_rules.rb, and included_in_memcached.rb into
+  spider subdirectory.
 2007-11-02:
 * Memcached support.

data/README CHANGED

@@ -108,7 +108,7 @@ scraping, collecting, and looping so that you can just handle the data.
 Mike Burns http://mike-burns.com mike@mike-burns.com
-Help from Matt Horan and John Nagro.
+Help from Matt Horan, John Nagro, and Henri Cook.
 With `robot_rules' from James Edward Gray II via
 http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589

data/doc/classes/IncludedInMemcached.html CHANGED

@@ -55,8 +55,8 @@
         <tr class="top-aligned-row">
             <td><strong>In:</strong></td>
             <td>
-                <a href="../files/lib/included_in_memcached_rb.html">
-                lib/included_in_memcached.rb
+                <a href="../files/lib/spider/included_in_memcached_rb.html">
+                lib/spider/included_in_memcached.rb
                 </a>
         <br />
             </td>
@@ -86,6 +86,15 @@ three operations: <a href="IncludedInMemcached.html#M000001">new</a>,
 Together these can be used to add items to the memcache, then determine
 whether the item has been added.
 </p>
+<p>
+To use it with <a href="Spider.html">Spider</a> use the
+check_already_seen_with method:
+</p>
+<pre>
+ Spider.start_at('http://example.com/') do |s|
+   s.check_already_seen_with IncludedInMemcached.new('localhost:11211')
+ end
+</pre>
     </div>
@@ -139,7 +148,7 @@ arguments here are passed to MemCache (part of the memcache-client gem).
             onclick="toggleCode('M000001-source');return false;">[Source]</a></p>
           <div class="method-source-code" id="M000001-source">
 <pre>
-<span class="ruby-comment cmt"># File lib/included_in_memcached.rb, line 9</span>
+<span class="ruby-comment cmt"># File lib/spider/included_in_memcached.rb, line 39</span>
   <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-operator">*</span><span class="ruby-identifier">a</span>)
     <span class="ruby-ivar">@c</span> = <span class="ruby-constant">MemCache</span>.<span class="ruby-identifier">new</span>(<span class="ruby-operator">*</span><span class="ruby-identifier">a</span>)
   <span class="ruby-keyword kw">end</span>
@@ -167,7 +176,7 @@ Add an item to the memcache.
             onclick="toggleCode('M000002-source');return false;">[Source]</a></p>
           <div class="method-source-code" id="M000002-source">
 <pre>
-<span class="ruby-comment cmt"># File lib/included_in_memcached.rb, line 14</span>
+<span class="ruby-comment cmt"># File lib/spider/included_in_memcached.rb, line 44</span>
   <span class="ruby-keyword kw">def</span> <span class="ruby-operator">&lt;&lt;</span>(<span class="ruby-identifier">v</span>)
     <span class="ruby-ivar">@c</span>.<span class="ruby-identifier">add</span>(<span class="ruby-identifier">v</span>.<span class="ruby-identifier">to_s</span>, <span class="ruby-identifier">v</span>)
   <span class="ruby-keyword kw">end</span>
@@ -193,7 +202,7 @@ True if the item is in the memcache.
             onclick="toggleCode('M000003-source');return false;">[Source]</a></p>
           <div class="method-source-code" id="M000003-source">
 <pre>
-<span class="ruby-comment cmt"># File lib/included_in_memcached.rb, line 19</span>
+<span class="ruby-comment cmt"># File lib/spider/included_in_memcached.rb, line 49</span>
   <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">include?</span>(<span class="ruby-identifier">v</span>)
     <span class="ruby-ivar">@c</span>.<span class="ruby-identifier">get</span>(<span class="ruby-identifier">v</span>.<span class="ruby-identifier">to_s</span>) <span class="ruby-operator">==</span> <span class="ruby-identifier">v</span>
   <span class="ruby-keyword kw">end</span>

data/doc/classes/SpiderInstance.html CHANGED

@@ -55,8 +55,8 @@
         <tr class="top-aligned-row">
             <td><strong>In:</strong></td>
             <td>
-                <a href="../files/lib/spider_instance_rb.html">
-                lib/spider_instance.rb
+                <a href="../files/lib/spider/spider_instance_rb.html">
+                lib/spider/spider_instance.rb
                 </a>
         <br />
             </td>
@@ -140,7 +140,7 @@ href="http://mike-burns.com">mike-burns.com</a>&#8217;:
             onclick="toggleCode('M000004-source');return false;">[Source]</a></p>
           <div class="method-source-code" id="M000004-source">
 <pre>
-<span class="ruby-comment cmt"># File lib/spider_instance.rb, line 69</span>
+<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 70</span>
   <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_url_check</span>(<span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
     <span class="ruby-ivar">@url_checks</span> <span class="ruby-operator">&lt;&lt;</span> <span class="ruby-identifier">block</span>
   <span class="ruby-keyword kw">end</span>
@@ -186,7 +186,7 @@ understand just &lt;&lt; and included? .
             onclick="toggleCode('M000005-source');return false;">[Source]</a></p>
           <div class="method-source-code" id="M000005-source">
 <pre>
-<span class="ruby-comment cmt"># File lib/spider_instance.rb, line 90</span>
+<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 91</span>
   <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">check_already_seen_with</span>(<span class="ruby-identifier">cacher</span>)
     <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">cacher</span>.<span class="ruby-identifier">respond_to?</span>(<span class="ruby-identifier">:&lt;&lt;</span>) <span class="ruby-operator">&amp;&amp;</span> <span class="ruby-identifier">cacher</span>.<span class="ruby-identifier">respond_to?</span>(<span class="ruby-identifier">:include?</span>)
       <span class="ruby-ivar">@seen</span> = <span class="ruby-identifier">cacher</span>
@@ -216,7 +216,7 @@ Reset the <a href="SpiderInstance.html#M000009">headers</a> hash.
             onclick="toggleCode('M000010-source');return false;">[Source]</a></p>
           <div class="method-source-code" id="M000010-source">
 <pre>
-<span class="ruby-comment cmt"># File lib/spider_instance.rb, line 157</span>
+<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 158</span>
   <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">clear_headers</span>
     <span class="ruby-ivar">@headers</span> = {}
   <span class="ruby-keyword kw">end</span>
@@ -245,7 +245,7 @@ Use like a hash:
             onclick="toggleCode('M000009-source');return false;">[Source]</a></p>
           <div class="method-source-code" id="M000009-source">
 <pre>
-<span class="ruby-comment cmt"># File lib/spider_instance.rb, line 145</span>
+<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 146</span>
   <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">headers</span>
     <span class="ruby-constant">HeaderSetter</span>.<span class="ruby-identifier">new</span>(<span class="ruby-keyword kw">self</span>)
   <span class="ruby-keyword kw">end</span>
@@ -294,7 +294,7 @@ For example:
             onclick="toggleCode('M000006-source');return false;">[Source]</a></p>
           <div class="method-source-code" id="M000006-source">
 <pre>
-<span class="ruby-comment cmt"># File lib/spider_instance.rb, line 120</span>
+<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 121</span>
   <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">on</span>(<span class="ruby-identifier">code</span>, <span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
     <span class="ruby-identifier">f</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
     <span class="ruby-keyword kw">case</span> <span class="ruby-identifier">code</span>
@@ -331,7 +331,7 @@ Run before the HTTP request. Given the URL as a string.
             onclick="toggleCode('M000007-source');return false;">[Source]</a></p>
           <div class="method-source-code" id="M000007-source">
 <pre>
-<span class="ruby-comment cmt"># File lib/spider_instance.rb, line 134</span>
+<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 135</span>
   <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">setup</span>(<span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
     <span class="ruby-ivar">@setup</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
   <span class="ruby-keyword kw">end</span>
@@ -357,7 +357,7 @@ Run last, once for each page. Given the URL as a string.
             onclick="toggleCode('M000008-source');return false;">[Source]</a></p>
           <div class="method-source-code" id="M000008-source">
 <pre>
-<span class="ruby-comment cmt"># File lib/spider_instance.rb, line 139</span>
+<span class="ruby-comment cmt"># File lib/spider/spider_instance.rb, line 140</span>
   <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">teardown</span>(<span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
     <span class="ruby-ivar">@teardown</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
   <span class="ruby-keyword kw">end</span>

data/doc/created.rid CHANGED

@@ -1 +1 @@
-Fri, 02 Nov 2007 17:20:02 -0400
+Sat, 10 Nov 2007 00:25:19 -0500

data/doc/files/README.html CHANGED

@@ -56,7 +56,7 @@
     </tr>
     <tr class="top-aligned-row">
       <td><strong>Last Update:</strong></td>
-      <td>Fri Nov 02 17:19:47 -0400 2007</td>
+      <td>Thu Nov 08 17:51:17 -0500 2007</td>
     </tr>
     </table>
   </div>
@@ -182,7 +182,7 @@ Mike Burns <a href="http://mike-burns.com">mike-burns.com</a>
 mike@mike-burns.com
 </p>
 <p>
-Help from Matt Horan and John Nagro.
+Help from Matt Horan, John Nagro, and Henri Cook.
 </p>
 <p>
 With `robot_rules&#8217; from James Edward Gray II via <a

data/doc/files/lib/{included_in_memcached_rb.html → spider/included_in_memcached_rb.html} RENAMED

@@ -8,7 +8,7 @@
   <title>File: included_in_memcached.rb</title>
   <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
   <meta http-equiv="Content-Script-Type" content="text/javascript" />
-  <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
+  <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
   <script type="text/javascript">
   // <![CDATA[
@@ -51,12 +51,12 @@
     <table class="header-table">
     <tr class="top-aligned-row">
       <td><strong>Path:</strong></td>
-      <td>lib/included_in_memcached.rb
+      <td>lib/spider/included_in_memcached.rb
       </td>
     </tr>
     <tr class="top-aligned-row">
       <td><strong>Last Update:</strong></td>
-      <td>Fri Nov 02 15:04:14 -0400 2007</td>
+      <td>Sat Nov 10 00:24:11 -0500 2007</td>
     </tr>
     </table>
   </div>
@@ -68,6 +68,12 @@
   <div id="contextContent">
+    <div id="description">
+      <p>
+Use memcached to track cycles.
+</p>
+    </div>
     <div id="requires-list">
       <h3 class="section-bar">Required files</h3>

data/doc/files/lib/{spider_instance_rb.html → spider/spider_instance_rb.html} RENAMED

@@ -8,7 +8,7 @@
   <title>File: spider_instance.rb</title>
   <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
   <meta http-equiv="Content-Script-Type" content="text/javascript" />
-  <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
+  <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
   <script type="text/javascript">
   // <![CDATA[
@@ -51,12 +51,12 @@
     <table class="header-table">
     <tr class="top-aligned-row">
       <td><strong>Path:</strong></td>
-      <td>lib/spider_instance.rb
+      <td>lib/spider/spider_instance.rb
       </td>
     </tr>
     <tr class="top-aligned-row">
       <td><strong>Last Update:</strong></td>
-      <td>Fri Nov 02 17:05:49 -0400 2007</td>
+      <td>Sat Nov 10 00:25:04 -0500 2007</td>
     </tr>
     </table>
   </div>
@@ -70,7 +70,7 @@
     <div id="description">
       <p>
-Copyright 2007 Mike Burns
+Specialized spidering rules.
 </p>
     </div>

data/doc/files/lib/spider_rb.html CHANGED

@@ -56,7 +56,7 @@
     </tr>
     <tr class="top-aligned-row">
       <td><strong>Last Update:</strong></td>
-      <td>Fri Nov 02 12:32:39 -0400 2007</td>
+      <td>Thu Nov 08 17:29:01 -0500 2007</td>
     </tr>
     </table>
   </div>
@@ -182,7 +182,7 @@ Mike Burns <a href="http://mike-burns.com">mike-burns.com</a>
 mike@mike-burns.com
 </p>
 <p>
-Help from Matt Horan and John Nagro.
+Help from Matt Horan, John Nagro, and Henri Cook.
 </p>
 <p>
 With `robot_rules&#8217; from James Edward Gray II via <a

data/doc/fr_file_index.html CHANGED

@@ -21,9 +21,9 @@
   <h1 class="section-bar">Files</h1>
   <div id="index-entries">
     <a href="files/README.html">README</a><br />
-    <a href="files/lib/included_in_memcached_rb.html">lib/included_in_memcached.rb</a><br />
     <a href="files/lib/spider_rb.html">lib/spider.rb</a><br />
-    <a href="files/lib/spider_instance_rb.html">lib/spider_instance.rb</a><br />
+    <a href="files/lib/spider/included_in_memcached_rb.html">lib/spider/included_in_memcached.rb</a><br />
+    <a href="files/lib/spider/spider_instance_rb.html">lib/spider/spider_instance.rb</a><br />
   </div>
 </div>
 </body>

data/lib/spider.rb CHANGED

@@ -23,7 +23,7 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-require File.dirname(__FILE__)+'/spider_instance'
+require File.dirname(__FILE__)+'/spider/spider_instance'
 # A spidering library for Ruby. Handles robots.txt, scraping, finding more
 # links, and doing it all over again.

data/lib/spider/included_in_memcached.rb ADDED

@@ -0,0 +1,52 @@
+# Use memcached to track cycles.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#      * Redistributions of source code must retain the above copyright
+#      notice, this list of conditions and the following disclaimer.
+#      * Redistributions in binary form must reproduce the above copyright
+#      notice, this list of conditions and the following disclaimer in the
+#      documentation and/or other materials provided with the distribution.
+#      * Neither the name Mike Burns nor the
+#      names of his contributors may be used to endorse or promote products
+#      derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+require 'memcache'
+# A specialized class using memcached to track items stored. It supports
+# three operations: new, <<, and include? . Together these can be used to
+# add items to the memcache, then determine whether the item has been added.
+#
+# To use it with Spider use the check_already_seen_with method:
+#
+#  Spider.start_at('http://example.com/') do |s|
+#    s.check_already_seen_with IncludedInMemcached.new('localhost:11211')
+#  end
+class IncludedInMemcached
+  # Construct a new IncludedInMemcached instance. All arguments here are
+  # passed to MemCache (part of the memcache-client gem).
+  def initialize(*a)
+    @c = MemCache.new(*a)
+  end
+  # Add an item to the memcache.
+  def <<(v)
+    @c.add(v.to_s, v)
+  end
+  # True if the item is in the memcache.
+  def include?(v)
+    @c.get(v.to_s) == v
+  end
+end

data/lib/{robot_rules.rb → spider/robot_rules.rb} RENAMED

@@ -1,3 +1,5 @@
+# Understand robots.txt.
 #  Created by James Edward Gray II on 2006-01-31.
 #  Copyright 2006 Gray Productions. All rights reserved.

data/lib/{spider_instance.rb → spider/spider_instance.rb} RENAMED

@@ -1,5 +1,6 @@
-# Copyright 2007 Mike Burns
+# Specialized spidering rules.
+# Copyright 2007 Mike Burns
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
 #      * Redistributions of source code must retain the above copyright
@@ -22,7 +23,7 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-require 'robot_rules'
+require File.dirname(__FILE__)+'/robot_rules.rb'
 require 'open-uri'
 require 'uri'
 require 'net/http'
@@ -221,7 +222,7 @@ class SpiderInstance
       r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri,
                                                        @headers))}
       if r.redirect?
-        get_page(URI.parse(r['Location']), &block)
+        get_page(URI.parse(construct_complete_url(parsed_url,r['Location'])), &block)
       else
         block.call(r)
       end
@@ -252,21 +253,7 @@ class SpiderInstance
         if parsed_link.fragment == '#'
           nil
         else
-          case parsed_link.scheme
-          when 'http'
-            link
-          when nil
-            u = URI.parse(base_url)
-            if link[0].chr == '/'
-              "#{u.scheme}://#{u.host}:#{u.port}#{link}"
-            elsif u.path.nil? || u.path == ''
-              "#{u.scheme}://#{u.host}:#{u.port}/#{link}"
-            else
-              "#{u.scheme}://#{u.host}:#{u.port}/#{u.path}/#{link}"
-            end
-          else
-            nil
-          end
+          construct_complete_url(base_url, link, parsed_link)
         end
       rescue
         nil
@@ -274,6 +261,23 @@ class SpiderInstance
     end.compact
   end
+  def construct_complete_url(base_url, additional_url, parsed_additional_url = nil) #:nodoc:
+    parsed_additional_url ||= URI.parse(additional_url)
+    case parsed_additional_url.scheme
+    when nil
+      u = base_url.is_a?(URI) ? base_url : URI.parse(base_url)
+      if additional_url[0].chr == '/'
+        "#{u.scheme}://#{u.host}:#{u.port}#{additional_url}"
+      elsif u.path.nil? || u.path == ''
+        "#{u.scheme}://#{u.host}:#{u.port}/#{additional_url}"
+      else
+        "#{u.scheme}://#{u.host}:#{u.port}/#{u.path}/#{additional_url}"
+      end
+    else
+      additional_url
+    end
+  end
   def remove_trailing_slash(s) #:nodoc:
     s.sub(%r{/*$},'')
   end

data/spec/spec_helper.rb ADDED

@@ -0,0 +1,90 @@
+require 'rubygems'
+require 'webrick'
+require 'spec'
+Spec::Runner.configure { |c| c.mock_with :mocha }
+def local_require(*files)
+  files.each do |file|
+    require File.dirname(__FILE__)+'/../lib/'+file
+  end
+end
+class BeStaticServerPages
+  def initialize
+    @pages = ['http://localhost:8888/', 'http://localhost:8888/foo']
+    @actual = nil
+  end
+  attr :actual, true
+  def matches?(actual)
+    @actual = actual
+    actual == @pages
+  end
+  def failure_message
+    "expected #{@pages.inspect}, got #{@actual.inspect}"
+  end
+  def description
+    "be the pages returned by the static server (#{@pages.inspect})"
+  end
+end
+def with_web_server(svlt)
+  server = WEBrick::HTTPServer.new(:Port => 8888, :Logger => null_logger,
+                                   :AccessLog => [])
+  server.mount('/', svlt)
+  Thread.new {server.start}
+  begin
+    yield
+  ensure
+    server.shutdown
+  end
+end
+def with_memcached
+  system('memcached -d -P /tmp/spider-memcached.pid')
+  cacher = IncludedInMemcached.new('localhost:11211')
+  begin
+    yield
+  ensure
+    system('kill -KILL `cat /tmp/spider-memcached.pid`')
+  end
+end
+def be_static_server_pages
+  BeStaticServerPages.new
+end
+class QueryServlet < WEBrick::HTTPServlet::AbstractServlet
+  def do_GET(req, res)
+    res['Content-type'] = 'text/plain'
+    res.body = "response\n"
+  end
+end
+class LoopingServlet < WEBrick::HTTPServlet::AbstractServlet
+  def do_GET(req, res)
+    res['Content-type'] = 'text/html'
+    if req.path == '/foo'
+      res.body = <<-END
+      <a href="/">a</a>
+      END
+    else
+      res.body = <<-END
+      <a href="/foo">b</a>
+      END
+    end
+  end
+end
+def null_logger
+  l = stub
+  [:log, :fatal, :error, :warn , :info, :debug].each do |k|
+    l.stubs(k)
+    l.stubs("#{k}?".to_sym)
+  end
+  l
+end

data/spec/{included_in_memcached_spec.rb → spider/included_in_memcached_spec.rb} RENAMED

@@ -1,8 +1,7 @@
-require 'rubygems'
-require 'spec'
+require File.dirname(__FILE__)+'/../spec_helper'
 def before_specing_memcached
-  require File.dirname(__FILE__)+'/../lib/included_in_memcached'
+  local_require 'spider/included_in_memcached'
   system('memcached -d -P /tmp/spider-memcached.pid')
 end

data/spec/{spider_instance_spec.rb → spider/spider_instance_spec.rb} RENAMED

@@ -1,49 +1,35 @@
-require 'rubygems'
-require 'spec'
+require File.dirname(__FILE__)+'/../spec_helper'
 require 'webrick'
 require 'webrick/https'
-require File.dirname(__FILE__)+'/../lib/spider'
-require File.dirname(__FILE__)+'/../lib/included_in_memcached'
+local_require 'spider', 'spider/included_in_memcached'
-Spec::Runner.configure { |c| c.mock_with :mocha }
-class QueryServlet < WEBrick::HTTPServlet::AbstractServlet
-  def do_GET(req, res)
-    res['Content-type'] = 'text/plain'
-    res.body = "response\n"
-  end
-end
-class LoopingServlet < WEBrick::HTTPServlet::AbstractServlet
-  def do_GET(req, res)
-    res['Content-type'] = 'text/html'
-    if req.path == '/foo'
-      res.body = <<-END
-      <a href="/">a</a>
-      END
-    else
-      res.body = <<-END
-      <a href="/foo">b</a>
-      END
+describe 'SpiderInstance' do
+  # http://www.rcuk.ac.uk/ redirects to /default.htm, which isn't a complete
+  # URL. Bug reported by Henri Cook.
+  it 'should construct a complete redirect URL' do
+    @response_called = false
+    redirected_resp = stub(:redirect? => true,
+                          :[] => '/default.htm')
+    success_resp = stub(:redirect? => false)
+    http_req = stub(:request => true)
+    http_mock_redir = stub(:use_ssl= => true)
+    http_mock_redir.stubs(:start).yields(http_req).returns(redirected_resp)
+    http_mock_success = stub(:use_ssl= => true)
+    http_mock_success.stubs(:start).yields(http_req).returns(success_resp)
+    Net::HTTP.expects(:new).times(2).returns(http_mock_redir).then.
+      returns(http_mock_success)
+    si = SpiderInstance.new({nil => ['http://www.rcuk.ac.uk/']})
+    si.get_page(URI.parse('http://www.rcuk.ac.uk/')) do |resp|
+      @response_called = true
     end
+    @response_called.should be_true
   end
-end
-def null_logger
-  l = stub
-  [:log, :fatal, :error, :warn , :info, :debug].each do |k|
-    l.stubs(k)
-    l.stubs("#{k}?".to_sym)
-  end
-  l
-end
-describe 'SpiderInstance' do
   it 'should prevent cycles with an IncludedInMemcached' do
-    system('memcached -d -P /tmp/spider-memcached.pid')
-    cacher = IncludedInMemcached.new('localhost:11211')
-    it_should_prevent_cycles_with(cacher)
-    system('kill -KILL `cat /tmp/spider-memcached.pid`')
+    with_memcached do
+      cacher = IncludedInMemcached.new('localhost:11211')
+      it_should_prevent_cycles_with(cacher)
+    end
   end
   it 'should prevent cycles with an Array' do
@@ -129,15 +115,12 @@ describe 'SpiderInstance' do
     u = 'http://localhost:8888?s=1'
     u_p = URI.parse(u)
     @block_called = false
-    server = WEBrick::HTTPServer.new(:Port => 8888, :Logger => null_logger,
-                                     :AccessLog => [])
-    server.mount('/', QueryServlet)
-    Thread.new {server.start}
-    si = SpiderInstance.new({nil => [u]})
-    si.get_page(u_p) do
-      @block_called = true
+    with_web_server(QueryServlet) do
+      si = SpiderInstance.new({nil => [u]})
+      si.get_page(u_p) do
+        @block_called = true
+      end
     end
-    server.shutdown
     @block_called.should be_true
   end
@@ -413,15 +396,10 @@ describe 'SpiderInstance' do
     u2 = 'http://localhost:8888/foo'
     u_p2 = URI.parse(u2)
-    server = WEBrick::HTTPServer.new(:Port => 8888, :Logger => null_logger,
-                                     :AccessLog => [])
-    server.mount('/', LoopingServlet)
-    Thread.new {server.start}
-    si = SpiderInstance.new(nil => [u])
-    si.check_already_seen_with cacher
-    si.start!
-    server.shutdown
+    with_web_server(LoopingServlet) do
+      si = SpiderInstance.new(nil => [u])
+      si.check_already_seen_with cacher
+      si.start!
+    end
   end
 end

data/spec/spider_spec.rb CHANGED

@@ -1,10 +1,33 @@
-require 'rubygems'
-require 'spec'
-require File.dirname(__FILE__)+'/../lib/spider'
+require File.dirname(__FILE__)+'/spec_helper'
+local_require 'spider', 'spider/included_in_memcached'
 describe 'Spider' do
-  it 'should start at the given URL when given a string' do
-    #Spider.start_at('http://example.com/') {}
-    pending 'this will be a while'
+  it 'should find two pages without cycles using defaults' do
+    u = []
+    with_web_server(LoopingServlet) do
+      u = find_pages_with_static_server
+    end
+    u.should be_static_server_pages
+  end
+  it 'should find two pages without cycles using memcached' do
+    u = []
+    with_web_server(LoopingServlet) do
+      with_memcached do
+        u = find_pages_with_static_server do |s|
+          s.check_already_seen_with IncludedInMemcached.new('localhost:11211')
+        end
+      end
+    end
+    u.should be_static_server_pages
+  end
+  def find_pages_with_static_server(&block)
+    pages = []
+    Spider.start_at('http://localhost:8888/') do |s|
+      block.call(s) unless block.nil?
+      s.on(:every){ |u,r,p| pages << u }
+    end
+    pages
   end
 end

data/spider.gemspec CHANGED

@@ -13,5 +13,5 @@ spec = Gem::Specification.new do |s|
 A Web spidering library: handles robots.txt, scraping, finding more
 links, and doing it all over again.
 EOF
-  s.version = '0.4.0'
+  s.version = '0.4.1'
 end

metadata CHANGED

@@ -3,8 +3,8 @@ rubygems_version: 0.9.4
 specification_version: 1
 name: spider
 version: !ruby/object:Gem::Version
-  version: 0.4.0
-date: 2007-11-02 00:00:00 -04:00
+  version: 0.4.1
+date: 2007-11-10 00:00:00 -05:00
 summary: A Web spidering library
 require_paths:
 - lib
@@ -34,8 +34,9 @@ files:
 - doc/files
 - doc/files/lib
 - doc/files/lib/spider_rb.html
-- doc/files/lib/spider_instance_rb.html
-- doc/files/lib/included_in_memcached_rb.html
+- doc/files/lib/spider
+- doc/files/lib/spider/spider_instance_rb.html
+- doc/files/lib/spider/included_in_memcached_rb.html
 - doc/files/README.html
 - doc/classes
 - doc/classes/IncludedInMemcached.html
@@ -47,26 +48,20 @@ files:
 - doc/index.html
 - doc/created.rid
 - spec
+- spec/spider
+- spec/spider/included_in_memcached_spec.rb
+- spec/spider/spider_instance_spec.rb
 - spec/spider_spec.rb
-- spec/included_in_memcached_spec.rb
-- spec/spider_instance_spec.rb
+- spec/spec_helper.rb
 - README
 - spider.gemspec
 - CHANGES
 - lib
 - lib/spider.rb
-- lib/robot_rules.rb
-- lib/spider_instance.rb
-- lib/included_in_memcached.rb
-- test_server
-- test_server/server1
-- test_server/server1/page1.html
-- test_server/server1/page2.html
-- test_server/server2
-- test_server/server2/page1.html
-- test_server/server2/page2.html
-- test_server/servers.rb
-- test_server/client.rb
+- lib/spider
+- lib/spider/included_in_memcached.rb
+- lib/spider/robot_rules.rb
+- lib/spider/spider_instance.rb
 test_files: []
 rdoc_options: []

data/lib/included_in_memcached.rb DELETED

@@ -1,22 +0,0 @@
-require 'memcache'
-# A specialized class using memcached to track items stored. It supports
-# three operations: new, <<, and include? . Together these can be used to
-# add items to the memcache, then determine whether the item has been added.
-class IncludedInMemcached
-  # Construct a new IncludedInMemcached instance. All arguments here are
-  # passed to MemCache (part of the memcache-client gem).
-  def initialize(*a)
-    @c = MemCache.new(*a)
-  end
-  # Add an item to the memcache.
-  def <<(v)
-    @c.add(v.to_s, v)
-  end
-  # True if the item is in the memcache.
-  def include?(v)
-    @c.get(v.to_s) == v
-  end
-end

data/test_server/client.rb DELETED

@@ -1,26 +0,0 @@
-#!/usr/local/bin/ruby -w
-require 'rubygems'
-require 'spider'
-Spider.start_at('http://localhost:8880/page1.html') do |s|
-  s.add_url_check do |a_url|
-    a_url =~ %r{^http://localhost:8880.*}
-  end
-  s.on 404 do |a_url, resp, prior|
-    puts "URL not found: #{a_url}"
-  end
-  s.on :success do |a_url, resp, prior|
-    puts "body: #{resp.body}"
-  end
-  s.on :any do |a_url, resp, prior|
-    puts "URL returned anything: #{a_url} with this code #{resp.code}"
-  end
-end
-%w(INT TERM).each do |signal|
-  trap(signal) { exit }
-end

data/test_server/server1/page1.html DELETED

@@ -1 +0,0 @@
-<a href="page2.html">See page two!</a>

data/test_server/server1/page2.html DELETED

@@ -1,3 +0,0 @@
-<a href="page1.html">See page one!</a>
-<a href="http://localhost:8881/page1.html">See server two!</a>

data/test_server/server2/page1.html DELETED

@@ -1 +0,0 @@
-<a href="page2.html">See page two!</a>

data/test_server/server2/page2.html DELETED

@@ -1,2 +0,0 @@
-<a href="page1.html">See page one!</a>
-<a href="http://localhost:8880/page1.html">See server one!</a>

data/test_server/servers.rb DELETED

@@ -1,24 +0,0 @@
-#!/usr/local/bin/ruby -w
-# Two Web servers, on different ports of localhost, serving two pages each.
-# One page links to the next; the next links to both the first and to the the
-# first on the other server.
-# This is used to test: cycles, domain restrictions.
-require 'webrick'
-server1 = WEBrick::HTTPServer.new(:Port => 8880)
-server1.mount('/', WEBrick::HTTPServlet::FileHandler,
-              File.dirname(__FILE__)+'/server1')
-server2 = WEBrick::HTTPServer.new(:Port => 8881)
-server2.mount('/', WEBrick::HTTPServlet::FileHandler,
-              File.dirname(__FILE__)+'/server2')
-%w(INT TERM).each do |signal|
-  trap(signal) { [server1,server2].each { |server| server.shutdown  } }
-end
-threads = []
-[server1,server2].each do |server|
-  threads << Thread.new { server.start }
-end
-threads.each { |t| t.join }