RubyGems - spider - Versions diffs - 0.2.1 → 0.3.0 - Mend

spider 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

data/CHANGES +6 -0
data/README +4 -4
data/doc/classes/Net/HTTPRedirection.html +144 -0
data/doc/classes/Net/HTTPResponse.html +166 -0
data/doc/classes/Net/HTTPSuccess.html +144 -0
data/doc/classes/NilClass.html +144 -0
data/doc/classes/Spider.html +12 -12
data/doc/classes/SpiderInstance.html +109 -32
data/doc/created.rid +1 -1
data/doc/files/README.html +5 -5
data/doc/files/lib/spider_rb.html +5 -5
data/doc/fr_class_index.html +0 -1
data/doc/fr_file_index.html +1 -0
data/doc/fr_method_index.html +5 -2
data/lib/spider.rb +100 -58
data/spec/spider_instance_spec.rb +115 -30
data/spider.gemspec +1 -1
data/test_server/client.rb +4 -4
metadata +7 -2

data/doc/fr_class_index.html CHANGED

@@ -20,7 +20,6 @@
 <div id="index">
   <h1 class="section-bar">Classes</h1>
   <div id="index-entries">
-    <a href="classes/Net.html">Net</a><br />
     <a href="classes/Spider.html">Spider</a><br />
     <a href="classes/SpiderInstance.html">SpiderInstance</a><br />
   </div>

data/doc/fr_file_index.html CHANGED

@@ -20,6 +20,7 @@
 <div id="index">
   <h1 class="section-bar">Files</h1>
   <div id="index-entries">
+    <a href="files/README.html">README</a><br />
     <a href="files/lib/spider_rb.html">lib/spider.rb</a><br />
   </div>
 </div>

data/doc/fr_method_index.html CHANGED

@@ -21,9 +21,12 @@
   <h1 class="section-bar">Methods</h1>
   <div id="index-entries">
     <a href="classes/SpiderInstance.html#M000001">add_url_check (SpiderInstance)</a><br />
+    <a href="classes/SpiderInstance.html#M000006">clear_headers (SpiderInstance)</a><br />
+    <a href="classes/SpiderInstance.html#M000005">headers (SpiderInstance)</a><br />
     <a href="classes/SpiderInstance.html#M000002">on (SpiderInstance)</a><br />
-    <a href="classes/SpiderInstance.html#M000003">remove_trailing_slash (SpiderInstance)</a><br />
-    <a href="classes/Spider.html#M000004">start_at (Spider)</a><br />
+    <a href="classes/SpiderInstance.html#M000003">setup (SpiderInstance)</a><br />
+    <a href="classes/Spider.html#M000007">start_at (Spider)</a><br />
+    <a href="classes/SpiderInstance.html#M000004">teardown (SpiderInstance)</a><br />
   </div>
 </div>
 </body>

data/lib/spider.rb CHANGED

@@ -29,15 +29,21 @@ require 'uri'
 require 'net/http'
 require 'net/https'
-class Net::HTTPResponse #:nodoc:
-  def success?; false; end
-  def redirect?; false; end
-end
-class Net::HTTPSuccess #:nodoc:
-  def success?; true; end
+module Net #:nodoc:
+  class HTTPResponse #:nodoc:
+    def success?; false; end
+    def redirect?; false; end
+  end
+  class HTTPSuccess #:nodoc:
+    def success?; true; end
+  end
+  class HTTPRedirection #:nodoc:
+    def redirect?; true; end
+  end
 end
-class Net::HTTPRedirection #:nodoc:
-  def redirect?; true; end
+class NilClass #:nodoc:
+  def merge(h); h; end
 end
 # A spidering library for Ruby. Handles robots.txt, scraping, finding more
@@ -52,22 +58,22 @@ class Spider
   #      a_url =~ %r{^http://mike-burns.com.*}
   #    end
   #
-  #    s.on 404 do |a_url, err_code|
+  #    s.on 404 do |a_url, resp, prior_url|
   #      puts "URL not found: #{a_url}"
   #    end
   #
-  #    s.on :success do |a_url, code, headers, body|
-  #      puts "body: #{body}"
+  #    s.on :success do |a_url, resp, prior_url|
+  #      puts "body: #{resp.body}"
   #    end
   #
-  #    s.on :any do |a_url, resp|
+  #    s.on :every do |a_url, resp, prior_url|
   #      puts "URL returned anything: #{a_url} with this code #{resp.code}"
   #    end
   #  end
   def self.start_at(a_url, &block)
     rules    = RobotRules.new('Ruby Spider 1.0')
-    a_spider = SpiderInstance.new([a_url], [], rules, [])
+    a_spider = SpiderInstance.new({nil => a_url}, [], rules, [])
     block.call(a_spider)
     a_spider.start!
   end
@@ -77,11 +83,14 @@ class SpiderInstance
   def initialize(next_urls, seen = [], rules = nil, robots_seen = []) #:nodoc:
     @url_checks  = []
     @cache       = :memory
-    @callbacks   = {:any => lambda {}, :success => {}, :failure => {}}
+    @callbacks   = {}
     @next_urls   = next_urls
     @seen        = seen
     @rules       = rules || RobotRules.new('Ruby Spider 1.0')
     @robots_seen = robots_seen
+    @headers     = {}
+    @setup       = nil
+    @teardown    = nil
   end
   # Add a predicate that determines whether to continue down this URL's path.
@@ -99,62 +108,88 @@ class SpiderInstance
     @cache = cache_type
   end
-  # Add a response handler. A response handler's trigger can be :any, :success,
-  # :failure, or any HTTP status code. The handler itself can be either a Proc
-  # or a block. The arguments to the block depends on the trigger:
+  # Add a response handler. A response handler's trigger can be :every,
+  # :success, :failure, or any HTTP status code. The handler itself can be
+  # either a Proc or a block.
   #
-  # If the trigger is :any, the arguments are the URL as a string and an
-  # instance of Net::HTTPResponse.
+  # The arguments to the block are: the URL as a string, an instance of
+  # Net::HTTPResponse, and the prior URL as a string.
   #
-  # If the trigger is :success or any HTTP status code that represents a
-  # successful result, the arguments are the URL as a string, the HTTP status
-  # code, an instance of Net::HTTPSuccess, and the body of the result as a
-  # string.
-  #
-  # If the trigger is :failure or any HTTP status code that represents a failed
-  # result, the arguments are the URL as a string and the HTTP status code.
   #
   # For example:
   #
-  #  on 404 do |a_url, code|
+  #  on 404 do |a_url, resp, prior_url|
   #    puts "URL not found: #{a_url}"
   #  end
   #
-  #  on :success do |a_url, code, resp, body|
+  #  on :success do |a_url, resp, prior_url|
   #    puts a_url
-  #    puts body
+  #    puts resp.body
   #  end
   #
-  #  on :any do |a_url, resp|
+  #  on :every do |a_url, resp, prior_url|
   #    puts "Given this code: #{resp.code}"
   #  end
   def on(code, p = nil, &block)
     f = p ? p : block
     case code
     when Fixnum
-      @callbacks[success_or_failure(code)][code] = f
+      @callbacks[code] = f
     else
-      if :any == code.to_sym
-        @callbacks[:any] = f
-      else
-        @callbacks[code.to_sym][:any] = f
-      end
+      @callbacks[code.to_sym] = f
     end
   end
+  # Run before the HTTP request. Given the URL as a string.
+  #  setup do |a_url|
+  #    headers['Cookies'] = 'user_id=1;admin=true'
+  #  end
+  def setup(p = nil, &block)
+    @setup = p ? p : block
+  end
+  # Run last, once for each page. Given the URL as a string.
+  def teardown(p = nil, &block)
+    @teardown = p ? p : block
+  end
+  # Use like a hash:
+  #  headers['Cookies'] = 'user_id=1;password=btrross3'
+  def headers
+    HeaderSetter.new(self)
+  end
+  def raw_headers #:nodoc:
+    @headers
+  end
+  def raw_headers=(v) #:nodoc:
+    @headers = v
+  end
+  # Reset the headers hash.
+  def clear_headers
+    @headers = {}
+  end
   def start! #:nodoc:
     next_urls = @next_urls
     begin
-      next_urls = next_urls.map do |a_url|
-                    [a_url, (URI.parse(a_url) rescue nil)]
-                  end.select do |a_url, parsed_url|
-                    allowable_url?(a_url, parsed_url)
-                  end.map do |a_url, parsed_url|
-                    get_page(parsed_url) do |response|
-                      do_callbacks(a_url, response)
-                      generate_next_urls(a_url, response)
-                    end
-                  end.flatten
+      tmp_n_u = {}
+      next_urls.each do |prior_url, urls|
+        urls.map do |a_url|
+          [a_url, (URI.parse(a_url) rescue nil)]
+        end.select do |a_url, parsed_url|
+          allowable_url?(a_url, parsed_url)
+        end.each do |a_url, parsed_url|
+          @setup.call(a_url) unless @setup.nil?
+          get_page(parsed_url) do |response|
+            do_callbacks(a_url, response, prior_url)
+            tmp_n_u[a_url] = generate_next_urls(a_url, response)
+          end
+          @teardown.call(a_url) unless @teardown.nil?
+        end
+      end
+      next_urls = tmp_n_u
     end while !next_urls.empty?
   end
@@ -196,7 +231,8 @@ class SpiderInstance
       http = Net::HTTP.new(parsed_url.host, parsed_url.port)
       http.use_ssl = parsed_url.scheme == 'https'
       # Uses start because http.finish cannot be called.
-      r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri))}
+      r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri,
+                                                       @headers))}
       if r.redirect?
         get_page(URI.parse(r['Location']), &block)
       else
@@ -208,16 +244,13 @@ class SpiderInstance
     end
   end
-  def do_callbacks(a_url, resp) #:nodoc:
-    @callbacks[:any].call(a_url, resp) if @callbacks[:any]
-    if resp.success?
-      cb_branch = @callbacks[:success]
-      cb_branch[:any].call(a_url, resp.code, resp, resp.body) if cb_branch[:any]
-      cb_branch[resp.code].call(a_url, resp.code, resp, resp.body) if cb_branch[resp.code]
-    else
-      cb_branch = @callbacks[:failure]
-      cb_branch[:any].call(a_url, resp.code) if cb_branch[:any]
-      cb_branch[resp.code].call(a_url, resp.code) if cb_branch[resp.code]
+  def do_callbacks(a_url, resp, prior_url) #:nodoc:
+    cbs = [@callbacks[:every],
+      resp.success? ?  @callbacks[:success] : @callbacks[:failure],
+      @callbacks[resp.code]]
+    cbs.each do |cb|
+      cb.call(a_url, resp, prior_url) if cb
     end
   end
@@ -254,7 +287,16 @@ class SpiderInstance
     end.compact
   end
-  def remove_trailing_slash(s)
+  def remove_trailing_slash(s) #:nodoc:
     s.sub(%r{/*$},'')
   end
+  class HeaderSetter #:nodoc:
+    def initialize(si)
+      @si = si
+    end
+    def []=(k,v)
+      @si.raw_headers = @si.raw_headers.merge({k => v})
+    end
+  end
 end

data/spec/spider_instance_spec.rb CHANGED

@@ -23,6 +23,80 @@ def null_logger
 end
 describe 'SpiderInstance' do
+  it 'should call the "setup" callback before loading the Web page' do
+    mock_successful_http
+    @on_called = false
+    @before_called = false
+    si = SpiderInstance.new({nil => ['http://example.com/']})
+    si.stubs(:allowed?).returns(true)
+    si.stubs(:generate_next_urls).returns([])
+    si.setup       { |*a| @before_called = Time.now }
+    si.on(:every)  { |*a| @on_called = Time.now }
+    si.start!
+    @on_called.should_not be_false
+    @before_called.should_not be_false
+    @before_called.should_not be_false
+    @before_called.should < @on_called
+  end
+  it 'should call the "teardown" callback after running all other callbacks' do
+    mock_successful_http
+    @on_called = false
+    @after_called = false
+    si = SpiderInstance.new({nil => ['http://example.com/']})
+    si.stubs(:allowed?).returns(true)
+    si.stubs(:generate_next_urls).returns([])
+    si.on(:every)  { |*a| @on_called = Time.now }
+    si.teardown    { |*a| @after_called = Time.now }
+    si.start!
+    @on_called.should_not be_false
+    @after_called.should_not be_false
+    @after_called.should_not be_false
+    @after_called.should > @on_called
+  end
+  it 'should pass headers set by a setup handler to the HTTP request' do
+    mock_successful_http
+    Net::HTTP::Get.expects(:new).with('/foo',{'X-Header-Set' => 'True'})
+    si = SpiderInstance.new(nil => ['http://example.com/foo'])
+    si.stubs(:allowable_url?).returns(true)
+    si.stubs(:generate_next_urls).returns([])
+    si.setup do |a_url|
+      si.headers['X-Header-Set'] = 'True'
+    end
+    si.teardown do |a_url|
+      si.clear_headers
+    end
+    si.start!
+  end
+  it 'should allow for a proxy' # fill in more
+  it 'should call the :every callback with the current URL, the response, and the prior URL' do
+    mock_successful_http
+    callback_arguments_on(:every)
+  end
+  it 'should call the :success callback with the current URL, the request, and the prior URL' do
+    mock_successful_http
+    callback_arguments_on(:success)
+  end
+  it 'should call the :failure callback with the current URL, the request, and the prior URL' do
+    mock_failed_http
+    callback_arguments_on(:failure)
+  end
+  it 'should call the HTTP status error code callback with the current URL, the request, and the prior URL' do
+    mock_failed_http
+    callback_arguments_on(404)
+  end
+  it 'should call the HTTP status success code callback with the current URL, the request, and the prior URL' do
+    mock_successful_http
+    callback_arguments_on(200)
+  end
   # Bug reported by John Nagro, using the example source http://eons.com/
   # had to change line 192; uses request_uri now instead of path.
   it 'should handle query URLs without a path' do
@@ -33,7 +107,7 @@ describe 'SpiderInstance' do
                                      :AccessLog => [])
     server.mount('/', QueryServlet)
     Thread.new {server.start}
-    si = SpiderInstance.new([u])
+    si = SpiderInstance.new({nil => [u]})
     si.get_page(u_p) do
       @block_called = true
     end
@@ -47,7 +121,7 @@ describe 'SpiderInstance' do
     u_p = URI.parse(u)
     @redirect_handled = false
     mock_redirect_http
-    si = SpiderInstance.new([u])
+    si = SpiderInstance.new({nil => [u]})
     si.get_page(u_p) do
       @redirect_handled = true
     end
@@ -66,7 +140,7 @@ describe 'SpiderInstance' do
                                      :SSLComment => 'Comment of some sort')
     server.mount('/', QueryServlet)
     Thread.new {server.start}
-    si = SpiderInstance.new([u])
+    si = SpiderInstance.new({nil => [u]})
     si.get_page(u_p) { @page_called = true }
     server.shutdown
     @page_called.should be_true
@@ -79,7 +153,7 @@ describe 'SpiderInstance' do
     u_p = URI.parse(u)
     http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
     Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
-    si = SpiderInstance.new([u])
+    si = SpiderInstance.new({nil => [u]})
     si.expects(:allowable_url?).with(u, u_p).returns(false)
     si.expects(:get_page).times(0)
     si.start!
@@ -90,9 +164,8 @@ describe 'SpiderInstance' do
     u_p = URI.parse(u)
     http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
     Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
-    si = SpiderInstance.new([u])
+    si = SpiderInstance.new({nil => [u]})
     si.expects(:allowable_url?).with(u, u_p).returns(true)
-    si.expects(:allowable_url?).with(nil, nil).returns(false)
     si.expects(:get_page).with(URI.parse(u))
     si.start!
   end
@@ -106,25 +179,25 @@ describe 'SpiderInstance' do
     robot_rules.expects(:parse).with('http://example.com:80/robots.txt',
                                      'robots.txt content')
     robot_rules.expects(:allowed?).with('http://example.com/').returns(false)
-    si = SpiderInstance.new(['http://example.com/'], [], robot_rules, [])
+    si = SpiderInstance.new({nil => ['http://example.com/']}, [], robot_rules, [])
     allowable = si.allowable_url?('http://example.com/',
                                   URI.parse('http://example.com/'))
-    allowable.should == false
+    allowable.should be_false
   end
   it 'should disallow URLs when they fail any url_check' do
-    si = SpiderInstance.new(['http://example.com/'])
+    si = SpiderInstance.new({nil => ['http://example.com/']})
     si.stubs(:allowed?).returns(true)
     si.add_url_check { |a_url| false }
     allowable = si.allowable_url?('http://example.com/',
                                   URI.parse('http://example.com/'))
-    allowable.should == false
+    allowable.should be_false
   end
   it 'should support multiple url_checks' do
     @first_url_check = false
     @second_url_check = false
-    si = SpiderInstance.new(['http://example.com/'])
+    si = SpiderInstance.new({nil => ['http://example.com/']})
     si.stubs(:allowed?).returns(true)
     si.add_url_check do |a_url|
       @first_url_check = true
@@ -136,7 +209,7 @@ describe 'SpiderInstance' do
     end
     allowable = si.allowable_url?('http://example.com/',
                                   URI.parse('http://example.com/'))
-    allowable.should == false
+    allowable.should be_false
     @first_url_check.should be_true
     @second_url_check.should be_true
   end
@@ -144,17 +217,17 @@ describe 'SpiderInstance' do
   it 'should avoid cycles' do
     u = 'http://example.com/'
     u_p = URI.parse(u)
-    si = SpiderInstance.new([u], [u_p])
+    si = SpiderInstance.new({nil => [u]}, [u_p])
     si.stubs(:allowed?).returns(true)
     allowable = si.allowable_url?(u, u_p)
-    allowable.should == false
+    allowable.should be_false
     u_p.should_not be_nil
   end
   it 'should call the 404 handler for 404s' do
     @proc_called = false
     mock_failed_http
-    si = SpiderInstance.new(['http://example.com/'])
+    si = SpiderInstance.new({nil => ['http://example.com/']})
     si.stubs(:allowed?).returns(true)
     si.stubs(:generate_next_urls).returns([])
     si.on(404) {|*a| @proc_called = true}
@@ -165,7 +238,7 @@ describe 'SpiderInstance' do
   it 'should call the :success handler on success' do
     @proc_called = false
     mock_successful_http
-    si = SpiderInstance.new(['http://example.com/'])
+    si = SpiderInstance.new({nil => ['http://example.com/']})
     si.stubs(:allowed?).returns(true)
     si.stubs(:generate_next_urls).returns([])
     si.on(:success) {|*a| @proc_called = true}
@@ -176,19 +249,19 @@ describe 'SpiderInstance' do
   it 'should not call the :success handler on failure' do
     @proc_called = false
     mock_failed_http
-    si = SpiderInstance.new(['http://example.com/'])
+    si = SpiderInstance.new({nil => ['http://example.com/']})
     si.stubs(:allowed?).returns(true)
     si.stubs(:generate_next_urls).returns([])
     si.on(:success) {|*a| @proc_called = true}
     si.start!
-    @proc_called.should == false
+    @proc_called.should be_false
   end
   it 'should call the :success handler and the 200 handler on 200' do
     @proc_200_called = false
     @proc_success_called = false
     mock_successful_http
-    si = SpiderInstance.new(['http://example.com/'])
+    si = SpiderInstance.new({nil => ['http://example.com/']})
     si.stubs(:allowed?).returns(true)
     si.stubs(:generate_next_urls).returns([])
     si.on(:success) {|*a| @proc_success_called = true}
@@ -201,18 +274,18 @@ describe 'SpiderInstance' do
   it 'should not call the :failure handler on success' do
     @proc_called = false
     mock_successful_http
-    si = SpiderInstance.new(['http://example.com/'])
+    si = SpiderInstance.new({nil => ['http://example.com/']})
     si.stubs(:allowed?).returns(true)
     si.stubs(:generate_next_urls).returns([])
     si.on(:failure) {|*a| @proc_called = true}
     si.start!
-    @proc_called.should == false
+    @proc_called.should be_false
   end
   it 'should call the :failure handler on failure' do
     @proc_called = false
     mock_failed_http
-    si = SpiderInstance.new(['http://example.com/'])
+    si = SpiderInstance.new({nil => ['http://example.com/']})
     si.stubs(:allowed?).returns(true)
     si.stubs(:generate_next_urls).returns([])
     si.on(:failure) {|*a| @proc_called = true}
@@ -224,7 +297,7 @@ describe 'SpiderInstance' do
     @proc_404_called = false
     @proc_failure_called = false
     mock_failed_http
-    si = SpiderInstance.new(['http://example.com/'])
+    si = SpiderInstance.new({nil => ['http://example.com/']})
     si.stubs(:allowed?).returns(true)
     si.stubs(:generate_next_urls).returns([])
     si.on(:failure) {|*a| @proc_failure_called = true}
@@ -234,13 +307,13 @@ describe 'SpiderInstance' do
     @proc_failure_called.should be_true
   end
-  it 'should call the :any handler even when a handler for the error code is defined' do
+  it 'should call the :every handler even when a handler for the error code is defined' do
     @any_called = false
     mock_successful_http
-    si = SpiderInstance.new(['http://example.com/'])
+    si = SpiderInstance.new({nil => ['http://example.com/']})
     si.stubs(:allowed?).returns(true)
     si.stubs(:generate_next_urls).returns([])
-    si.on(:any) { |*a| @any_called = true }
+    si.on(:every) { |*a| @any_called = true }
     si.on(202) {|*a|}
     si.start!
     @any_called.should be_true
@@ -249,10 +322,10 @@ describe 'SpiderInstance' do
   it 'should support a block as a response handler' do
     @proc_called = false
     mock_successful_http
-    si = SpiderInstance.new(['http://example.com/'])
+    si = SpiderInstance.new({nil => ['http://example.com/']})
     si.stubs(:allowed?).returns(true)
     si.stubs(:generate_next_urls).returns([])
-    si.on(:any) { |*a| @proc_called = true }
+    si.on(:every) { |*a| @proc_called = true }
     si.start!
     @proc_called.should be_true
   end
@@ -260,10 +333,10 @@ describe 'SpiderInstance' do
   it 'should support a proc as a response handler' do
     @proc_called = false
     mock_successful_http
-    si = SpiderInstance.new(['http://example.com/'])
+    si = SpiderInstance.new({nil => ['http://example.com/']})
     si.stubs(:allowed?).returns(true)
     si.stubs(:generate_next_urls).returns([])
-    si.on(:any, Proc.new { |*a| @proc_called = true })
+    si.on(:every, Proc.new { |*a| @proc_called = true })
     si.start!
     @proc_called.should be_true
   end
@@ -297,4 +370,16 @@ describe 'SpiderInstance' do
       yields(mock(:request => http_req2)).returns(http_req2)
     Net::HTTP.expects(:new).times(2).returns(http_obj).then.returns(http_obj2)
   end
+  def callback_arguments_on(code)
+    si = SpiderInstance.new('http://foo.com/' => ['http://example.com/'])
+    si.stubs(:allowed?).returns(true)
+    si.stubs(:generate_next_urls).returns([])
+    si.on(code) do |a_url, resp, prior_url|
+      a_url.should == 'http://example.com/'
+      resp.should_not be_nil
+      prior_url.should == 'http://foo.com/'
+    end
+    si.start!
+  end
 end