RubyGems - spider - Versions diffs - 0.2.0 → 0.2.1 - Mend

spider 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

data/CHANGES +8 -0
data/doc/classes/Spider.html +7 -7
data/doc/classes/SpiderInstance.html +26 -2
data/doc/created.rid +1 -1
data/doc/files/lib/spider_rb.html +2 -1
data/doc/fr_file_index.html +0 -1
data/doc/fr_method_index.html +2 -1
data/lib/spider.rb +15 -9
data/spec/spider_instance_spec.rb +127 -46
data/spider.gemspec +1 -1
data/test_server/client.rb +4 -0
metadata +2 -2

data/CHANGES CHANGED

@@ -1,9 +1,17 @@
+2007-10-23:
+* URLs without a page component but with a query component.
+* HTTP Redirect.
+* HTTPS.
+* Version 0.2.1 .
 2007-10-22:
 * Use RSpec to ensure that it mostly works.
 * Use WEBrick to create a small test server for additional testing.
 * Completely re-do the API to prepare for future expansion.
 * Add the ability to apply each URL to a series of custom allowed?-like
   matchers.
+* BSD license.
+* Version 0.2.0 .
 2007-03-30:
 * Clean up the documentation.

data/doc/classes/Spider.html CHANGED

@@ -93,7 +93,7 @@ links, and doing it all over again.
       <h3 class="section-bar">Methods</h3>
       <div class="name-list">
-      <a href="#M000003">start_at</a>&nbsp;&nbsp;
+      <a href="#M000004">start_at</a>&nbsp;&nbsp;
       </div>
     </div>
@@ -115,11 +115,11 @@ links, and doing it all over again.
     <div id="methods">
       <h3 class="section-bar">Public Class methods</h3>
-      <div id="method-M000003" class="method-detail">
-        <a name="M000003"></a>
+      <div id="method-M000004" class="method-detail">
+        <a name="M000004"></a>
         <div class="method-heading">
-          <a href="#M000003" class="method-signature">
+          <a href="#M000004" class="method-signature">
           <span class="method-name">start_at</span><span class="method-args">(a_url, &amp;block)</span>
           </a>
         </div>
@@ -150,10 +150,10 @@ define the rules and handlers for the discovered Web pages.
  end
 </pre>
           <p><a class="source-toggle" href="#"
-            onclick="toggleCode('M000003-source');return false;">[Source]</a></p>
-          <div class="method-source-code" id="M000003-source">
+            onclick="toggleCode('M000004-source');return false;">[Source]</a></p>
+          <div class="method-source-code" id="M000004-source">
 <pre>
-<span class="ruby-comment cmt"># File lib/spider.rb, line 55</span>
+<span class="ruby-comment cmt"># File lib/spider.rb, line 68</span>
   <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">start_at</span>(<span class="ruby-identifier">a_url</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
     <span class="ruby-identifier">rules</span>    = <span class="ruby-constant">RobotRules</span>.<span class="ruby-identifier">new</span>(<span class="ruby-value str">'Ruby Spider 1.0'</span>)
     <span class="ruby-identifier">a_spider</span> = <span class="ruby-constant">SpiderInstance</span>.<span class="ruby-identifier">new</span>([<span class="ruby-identifier">a_url</span>], [], <span class="ruby-identifier">rules</span>, [])

data/doc/classes/SpiderInstance.html CHANGED

@@ -88,6 +88,7 @@
       <div class="name-list">
       <a href="#M000001">add_url_check</a>&nbsp;&nbsp;
       <a href="#M000002">on</a>&nbsp;&nbsp;
+      <a href="#M000003">remove_trailing_slash</a>&nbsp;&nbsp;
       </div>
     </div>
@@ -135,7 +136,7 @@ href="http://mike-burns.com">mike-burns.com</a>&#8217;:
             onclick="toggleCode('M000001-source');return false;">[Source]</a></p>
           <div class="method-source-code" id="M000001-source">
 <pre>
-<span class="ruby-comment cmt"># File lib/spider.rb, line 81</span>
+<span class="ruby-comment cmt"># File lib/spider.rb, line 94</span>
   <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">add_url_check</span>(<span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
     <span class="ruby-ivar">@url_checks</span> <span class="ruby-operator">&lt;&lt;</span> <span class="ruby-identifier">block</span>
   <span class="ruby-keyword kw">end</span>
@@ -195,7 +196,7 @@ For example:
             onclick="toggleCode('M000002-source');return false;">[Source]</a></p>
           <div class="method-source-code" id="M000002-source">
 <pre>
-<span class="ruby-comment cmt"># File lib/spider.rb, line 118</span>
+<span class="ruby-comment cmt"># File lib/spider.rb, line 131</span>
   <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">on</span>(<span class="ruby-identifier">code</span>, <span class="ruby-identifier">p</span> = <span class="ruby-keyword kw">nil</span>, <span class="ruby-operator">&amp;</span><span class="ruby-identifier">block</span>)
     <span class="ruby-identifier">f</span> = <span class="ruby-identifier">p</span> <span class="ruby-value">? </span><span class="ruby-identifier">p</span> <span class="ruby-operator">:</span> <span class="ruby-identifier">block</span>
     <span class="ruby-keyword kw">case</span> <span class="ruby-identifier">code</span>
@@ -214,6 +215,29 @@ For example:
         </div>
       </div>
+      <div id="method-M000003" class="method-detail">
+        <a name="M000003"></a>
+        <div class="method-heading">
+          <a href="#M000003" class="method-signature">
+          <span class="method-name">remove_trailing_slash</span><span class="method-args">(s)</span>
+          </a>
+        </div>
+        <div class="method-description">
+          <p><a class="source-toggle" href="#"
+            onclick="toggleCode('M000003-source');return false;">[Source]</a></p>
+          <div class="method-source-code" id="M000003-source">
+<pre>
+<span class="ruby-comment cmt"># File lib/spider.rb, line 257</span>
+  <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">remove_trailing_slash</span>(<span class="ruby-identifier">s</span>)
+    <span class="ruby-identifier">s</span>.<span class="ruby-identifier">sub</span>(<span class="ruby-regexp re">%r{/*$}</span>,<span class="ruby-value str">''</span>)
+  <span class="ruby-keyword kw">end</span>
+</pre>
+          </div>
+        </div>
+      </div>
     </div>

data/doc/created.rid CHANGED

@@ -1 +1 @@
-Mon, 22 Oct 2007 07:35:00 -0400
+Tue, 23 Oct 2007 23:14:46 -0400

data/doc/files/lib/spider_rb.html CHANGED

@@ -56,7 +56,7 @@
     </tr>
     <tr class="top-aligned-row">
       <td><strong>Last Update:</strong></td>
-      <td>Mon Oct 22 07:19:31 -0400 2007</td>
+      <td>Tue Oct 23 23:11:42 -0400 2007</td>
     </tr>
     </table>
   </div>
@@ -125,6 +125,7 @@ href="http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589">blade.
       open-uri&nbsp;&nbsp;
       uri&nbsp;&nbsp;
       net/http&nbsp;&nbsp;
+      net/https&nbsp;&nbsp;
       </div>
     </div>

data/doc/fr_file_index.html CHANGED

@@ -20,7 +20,6 @@
 <div id="index">
   <h1 class="section-bar">Files</h1>
   <div id="index-entries">
-    <a href="files/README.html">README</a><br />
     <a href="files/lib/spider_rb.html">lib/spider.rb</a><br />
   </div>
 </div>

data/doc/fr_method_index.html CHANGED

@@ -22,7 +22,8 @@
   <div id="index-entries">
     <a href="classes/SpiderInstance.html#M000001">add_url_check (SpiderInstance)</a><br />
     <a href="classes/SpiderInstance.html#M000002">on (SpiderInstance)</a><br />
-    <a href="classes/Spider.html#M000003">start_at (Spider)</a><br />
+    <a href="classes/SpiderInstance.html#M000003">remove_trailing_slash (SpiderInstance)</a><br />
+    <a href="classes/Spider.html#M000004">start_at (Spider)</a><br />
   </div>
 </div>
 </body>

data/lib/spider.rb CHANGED

@@ -27,13 +27,18 @@ require 'robot_rules'
 require 'open-uri'
 require 'uri'
 require 'net/http'
+require 'net/https'
 class Net::HTTPResponse #:nodoc:
   def success?; false; end
+  def redirect?; false; end
 end
 class Net::HTTPSuccess #:nodoc:
   def success?; true; end
 end
+class Net::HTTPRedirection #:nodoc:
+  def redirect?; true; end
+end
 # A spidering library for Ruby. Handles robots.txt, scraping, finding more
 # links, and doing it all over again.
@@ -188,15 +193,16 @@ class SpiderInstance
   def get_page(parsed_url, &block) #:nodoc:
     @seen << parsed_url
     begin
-      Net::HTTP.start(parsed_url.host, parsed_url.port) do |http|
-        r = http.request(Net::HTTP::Get.new(parsed_url.path))
-        if r.is_a?(Net::HTTPRedirection)
-          get_page(URI.parse(r['Location']), block)
-        else
-          block.call(r)
-        end
+      http = Net::HTTP.new(parsed_url.host, parsed_url.port)
+      http.use_ssl = parsed_url.scheme == 'https'
+      # Uses start because http.finish cannot be called.
+      r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri))}
+      if r.redirect?
+        get_page(URI.parse(r['Location']), &block)
+      else
+        block.call(r)
       end
-    rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError, Exception =>e
+    rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError => e
       p e
       nil
     end
@@ -207,7 +213,7 @@ class SpiderInstance
     if resp.success?
       cb_branch = @callbacks[:success]
       cb_branch[:any].call(a_url, resp.code, resp, resp.body) if cb_branch[:any]
-      cb_branch[resp.code].call(a_url, resp.code, resp.headers, resp.body) if cb_branch[resp.code]
+      cb_branch[resp.code].call(a_url, resp.code, resp, resp.body) if cb_branch[resp.code]
     else
       cb_branch = @callbacks[:failure]
       cb_branch[:any].call(a_url, resp.code) if cb_branch[:any]

data/spec/spider_instance_spec.rb CHANGED

@@ -1,15 +1,84 @@
 require 'rubygems'
 require 'spec'
+require 'webrick'
+require 'webrick/https'
 require File.dirname(__FILE__)+'/../lib/spider'
 Spec::Runner.configure { |c| c.mock_with :mocha }
+class QueryServlet < WEBrick::HTTPServlet::AbstractServlet
+  def do_GET(req, res)
+    res['Content-type'] = 'text/plain'
+    res.body = "response\n"
+  end
+end
+def null_logger
+  l = stub
+  [:log, :fatal, :error, :warn , :info, :debug].each do |k|
+    l.stubs(k)
+    l.stubs("#{k}?".to_sym)
+  end
+  l
+end
 describe 'SpiderInstance' do
+  # Bug reported by John Nagro, using the example source http://eons.com/
+  # had to change line 192; uses request_uri now instead of path.
+  it 'should handle query URLs without a path' do
+    u = 'http://localhost:8888?s=1'
+    u_p = URI.parse(u)
+    @block_called = false
+    server = WEBrick::HTTPServer.new(:Port => 8888, :Logger => null_logger,
+                                     :AccessLog => [])
+    server.mount('/', QueryServlet)
+    Thread.new {server.start}
+    si = SpiderInstance.new([u])
+    si.get_page(u_p) do
+      @block_called = true
+    end
+    server.shutdown
+    @block_called.should be_true
+  end
+  # This solves a problem reported by John Nagro.
+  it 'should handle redirects' do
+    u = 'http://example.com/'
+    u_p = URI.parse(u)
+    @redirect_handled = false
+    mock_redirect_http
+    si = SpiderInstance.new([u])
+    si.get_page(u_p) do
+      @redirect_handled = true
+    end
+    @redirect_handled.should be_true
+  end
+  it 'should handle HTTPS' do
+    u = 'https://localhost:10443/'
+    u_p = URI.parse(u)
+    @page_called = false
+    server = WEBrick::HTTPServer.new(:Port => 10443,
+                                     :Logger => null_logger,
+                                     :AccessLog => [],
+                                     :SSLEnable => true,
+                                     :SSLCertName => [["O", "ruby-lang.org"], ["OU", "sample"], ["CN", WEBrick::Utils::getservername]],
+                                     :SSLComment => 'Comment of some sort')
+    server.mount('/', QueryServlet)
+    Thread.new {server.start}
+    si = SpiderInstance.new([u])
+    si.get_page(u_p) { @page_called = true }
+    server.shutdown
+    @page_called.should be_true
+  end
+  it 'should maintain the entire graph within some external object (or memory, or memcached)'
   it 'should skip URLs when allowable_url? is false' do
     u = 'http://example.com/'
     u_p = URI.parse(u)
-    http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
-    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
+    Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
     si = SpiderInstance.new([u])
     si.expects(:allowable_url?).with(u, u_p).returns(false)
     si.expects(:get_page).times(0)
@@ -19,8 +88,8 @@ describe 'SpiderInstance' do
   it 'should not skip URLs when allowable_url? is true' do
     u = 'http://example.com/'
     u_p = URI.parse(u)
-    http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
-    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
+    Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
     si = SpiderInstance.new([u])
     si.expects(:allowable_url?).with(u, u_p).returns(true)
     si.expects(:allowable_url?).with(nil, nil).returns(false)
@@ -68,19 +137,11 @@ describe 'SpiderInstance' do
     allowable = si.allowable_url?('http://example.com/',
                                   URI.parse('http://example.com/'))
     allowable.should == false
-    @first_url_check == true
-    @second_url_check == true
-  end
-  it 'should support memcached'
-  it 'should avoid cycles using memcached'
-  it 'should support memory' do
-    si = SpiderInstance.new(['http://example.com/'])
-    si.use_cache :memory # No exn
+    @first_url_check.should be_true
+    @second_url_check.should be_true
   end
-  it 'should avoid cycles using memory' do
+  it 'should avoid cycles' do
     u = 'http://example.com/'
     u_p = URI.parse(u)
     si = SpiderInstance.new([u], [u_p])
@@ -92,32 +153,29 @@ describe 'SpiderInstance' do
   it 'should call the 404 handler for 404s' do
     @proc_called = false
-    http_resp = stub(:success? => false, :code => 404)
-    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    mock_failed_http
     si = SpiderInstance.new(['http://example.com/'])
     si.stubs(:allowed?).returns(true)
     si.stubs(:generate_next_urls).returns([])
     si.on(404) {|*a| @proc_called = true}
     si.start!
-    @proc_called.should == true
+    @proc_called.should be_true
   end
   it 'should call the :success handler on success' do
     @proc_called = false
-    http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
-    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    mock_successful_http
     si = SpiderInstance.new(['http://example.com/'])
     si.stubs(:allowed?).returns(true)
     si.stubs(:generate_next_urls).returns([])
     si.on(:success) {|*a| @proc_called = true}
     si.start!
-    @proc_called.should == true
+    @proc_called.should be_true
   end
   it 'should not call the :success handler on failure' do
     @proc_called = false
-    http_resp = stub(:success? => false, :code => 404)
-    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    mock_failed_http
     si = SpiderInstance.new(['http://example.com/'])
     si.stubs(:allowed?).returns(true)
     si.stubs(:generate_next_urls).returns([])
@@ -129,22 +187,20 @@ describe 'SpiderInstance' do
   it 'should call the :success handler and the 200 handler on 200' do
     @proc_200_called = false
     @proc_success_called = false
-    http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
-    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    mock_successful_http
     si = SpiderInstance.new(['http://example.com/'])
     si.stubs(:allowed?).returns(true)
     si.stubs(:generate_next_urls).returns([])
     si.on(:success) {|*a| @proc_success_called = true}
-    si.on(200) {|*a| @proc_200_called = true}
+    si.on(200)      {|*a| @proc_200_called     = true}
     si.start!
-    @proc_200_called.should == true
-    @proc_success_called.should == true
+    @proc_200_called.should be_true
+    @proc_success_called.should be_true
   end
   it 'should not call the :failure handler on success' do
     @proc_called = false
-    http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
-    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    mock_successful_http
     si = SpiderInstance.new(['http://example.com/'])
     si.stubs(:allowed?).returns(true)
     si.stubs(:generate_next_urls).returns([])
@@ -155,65 +211,90 @@ describe 'SpiderInstance' do
   it 'should call the :failure handler on failure' do
     @proc_called = false
-    http_resp = stub(:success? => false, :code => 404)
-    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    mock_failed_http
     si = SpiderInstance.new(['http://example.com/'])
     si.stubs(:allowed?).returns(true)
     si.stubs(:generate_next_urls).returns([])
     si.on(:failure) {|*a| @proc_called = true}
     si.start!
-    @proc_called.should == true
+    @proc_called.should be_true
   end
   it 'should call the :failure handler and the 404 handler on 404' do
     @proc_404_called = false
     @proc_failure_called = false
-    http_resp = stub(:success? => false, :code => 404)
-    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    mock_failed_http
     si = SpiderInstance.new(['http://example.com/'])
     si.stubs(:allowed?).returns(true)
     si.stubs(:generate_next_urls).returns([])
     si.on(:failure) {|*a| @proc_failure_called = true}
     si.on(404) {|*a| @proc_404_called = true}
     si.start!
-    @proc_404_called.should == true
-    @proc_failure_called.should == true
+    @proc_404_called.should be_true
+    @proc_failure_called.should be_true
   end
   it 'should call the :any handler even when a handler for the error code is defined' do
     @any_called = false
-    http_resp = stub(:success? => true, :code => 200)
-    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    mock_successful_http
     si = SpiderInstance.new(['http://example.com/'])
     si.stubs(:allowed?).returns(true)
     si.stubs(:generate_next_urls).returns([])
     si.on(:any) { |*a| @any_called = true }
     si.on(202) {|*a|}
     si.start!
-    @any_called.should == true
+    @any_called.should be_true
   end
   it 'should support a block as a response handler' do
     @proc_called = false
-    http_resp = stub(:success? => true, :code => 200)
-    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    mock_successful_http
     si = SpiderInstance.new(['http://example.com/'])
     si.stubs(:allowed?).returns(true)
     si.stubs(:generate_next_urls).returns([])
     si.on(:any) { |*a| @proc_called = true }
     si.start!
-    @proc_called.should == true
+    @proc_called.should be_true
   end
   it 'should support a proc as a response handler' do
     @proc_called = false
-    http_resp = stub(:success? => true, :code => 200)
-    Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+    mock_successful_http
     si = SpiderInstance.new(['http://example.com/'])
     si.stubs(:allowed?).returns(true)
     si.stubs(:generate_next_urls).returns([])
     si.on(:any, Proc.new { |*a| @proc_called = true })
     si.start!
-    @proc_called.should == true
+    @proc_called.should be_true
+  end
+  def mock_http(http_req)
+    http_obj = mock(:use_ssl= => true)
+    http_obj.expects(:start).
+      yields(mock(:request => http_req)).returns(http_req)
+    Net::HTTP.expects(:new).returns(http_obj)
+  end
+  def mock_successful_http
+    http_req = stub(:redirect? => false, :success? => true, :code => 200, :body => 'body')
+    mock_http(http_req)
+  end
+  def mock_failed_http
+    http_req = stub(:redirect? => false, :success? => false, :code => 404)
+    mock_http(http_req)
+  end
+  def mock_redirect_http
+    http_req = stub(:redirect? => true, :success? => false, :code => 404)
+    http_req.expects(:[]).with('Location').returns('http://example.com/')
+    http_req2 = stub(:redirect? => false, :success? => true, :code => 200)
+    http_obj = mock(:use_ssl= => true)
+    http_obj.expects(:start).
+      yields(mock(:request => http_req)).returns(http_req)
+    http_obj2 = mock(:use_ssl= => true)
+    http_obj2.expects(:start).
+      yields(mock(:request => http_req2)).returns(http_req2)
+    Net::HTTP.expects(:new).times(2).returns(http_obj).then.returns(http_obj2)
   end
 end

data/spider.gemspec CHANGED

@@ -13,5 +13,5 @@ spec = Gem::Specification.new do |s|
 A Web spidering library: handles robots.txt, scraping, finding more
 links, and doing it all over again.
 EOF
-  s.version = '0.2.0'
+  s.version = '0.2.1'
 end

data/test_server/client.rb CHANGED

@@ -20,3 +20,7 @@ Spider.start_at('http://localhost:8880/page1.html') do |s|
     puts "URL returned anything: #{a_url} with this code #{resp.code}"
   end
 end
+%w(INT TERM).each do |signal|
+  trap(signal) { exit }
+end

metadata CHANGED

@@ -3,8 +3,8 @@ rubygems_version: 0.9.4
 specification_version: 1
 name: spider
 version: !ruby/object:Gem::Version
-  version: 0.2.0
-date: 2007-10-22 00:00:00 -04:00
+  version: 0.2.1
+date: 2007-10-23 00:00:00 -04:00
 summary: A Web spidering library
 require_paths:
 - lib