spider 0.1.0 → 0.2.0

data/lib/spider.rb CHANGED
@@ -1,126 +1,254 @@
  # Copyright 2007 Mike Burns
+ # :include: README

- # This program is free software; you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation; either version 2 of the License, or
- # (at your option) any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with this program; if not, write to the Free Software
- # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted provided that the following conditions are met:
+ #     * Redistributions of source code must retain the above copyright
+ #       notice, this list of conditions and the following disclaimer.
+ #     * Redistributions in binary form must reproduce the above copyright
+ #       notice, this list of conditions and the following disclaimer in the
+ #       documentation and/or other materials provided with the distribution.
+ #     * Neither the name Mike Burns nor the
+ #       names of its contributors may be used to endorse or promote products
+ #       derived from this software without specific prior written permission.
+ #
+ # THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ # DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
+ # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

  require 'robot_rules'
  require 'open-uri'
  require 'uri'
+ require 'net/http'
+
+ class Net::HTTPResponse #:nodoc:
+   def success?; false; end
+ end
+ class Net::HTTPSuccess #:nodoc:
+   def success?; true; end
+ end

  # A spidering library for Ruby. Handles robots.txt, scraping, finding more
  # links, and doing it all over again.
- module Spider
+ class Spider
+   # Runs the spider starting at the given URL. Also takes a block that is given
+   # the SpiderInstance. Use the block to define the rules and handlers for
+   # the discovered Web pages.
+   #
+   #  Spider.start_at('http://mike-burns.com/') do |s|
+   #    s.add_url_check do |a_url|
+   #      a_url =~ %r{^http://mike-burns.com.*}
+   #    end
+   #
+   #    s.on 404 do |a_url, err_code|
+   #      puts "URL not found: #{a_url}"
+   #    end
+   #
+   #    s.on :success do |a_url, code, headers, body|
+   #      puts "body: #{body}"
+   #    end
+   #
+   #    s.on :any do |a_url, resp|
+   #      puts "URL returned anything: #{a_url} with this code #{resp.code}"
+   #    end
+   #  end
+
+   def self.start_at(a_url, &block)
+     rules = RobotRules.new('Ruby Spider 1.0')
+     a_spider = SpiderInstance.new([a_url], [], rules, [])
+     block.call(a_spider)
+     a_spider.start!
+   end
+ end

-   # [String] (String String -> a) -> omega
-   # The only function worth calling. Takes a list of seed URLs and a block.
-   # This block is passed each URL and its Web page.
+ class SpiderInstance
+   def initialize(next_urls, seen = [], rules = nil, robots_seen = []) #:nodoc:
+     @url_checks = []
+     @cache = :memory
+     @callbacks = {:any => lambda {}, :success => {}, :failure => {}}
+     @next_urls = next_urls
+     @seen = seen
+     @rules = rules || RobotRules.new('Ruby Spider 1.0')
+     @robots_seen = robots_seen
+   end
+
+   # Add a predicate that determines whether to continue down this URL's path.
+   # All predicates must be true in order for a URL to proceed.
    #
-   # Examples:
+   # Takes a block that takes a string and produces a boolean. For example, this
+   # will ensure that the URL starts with 'http://mike-burns.com':
    #
-   #  spider(['http://yahoo.com']) do |a_url, web_page|
-   #    puts "At #{a_url}"
+   #  add_url_check { |a_url| a_url =~ %r{^http://mike-burns.com.*} }
+   def add_url_check(&block)
+     @url_checks << block
+   end
+
+   def use_cache(cache_type) #:nodoc:
+     @cache = cache_type
+   end
+
+   # Add a response handler. A response handler's trigger can be :any, :success,
+   # :failure, or any HTTP status code. The handler itself can be either a Proc
+   # or a block. The arguments to the block depend on the trigger:
+   #
+   # If the trigger is :any, the arguments are the URL as a string and an
+   # instance of Net::HTTPResponse.
+   #
+   # If the trigger is :success or any HTTP status code that represents a
+   # successful result, the arguments are the URL as a string, the HTTP status
+   # code, an instance of Net::HTTPSuccess, and the body of the result as a
+   # string.
+   #
+   # If the trigger is :failure or any HTTP status code that represents a failed
+   # result, the arguments are the URL as a string and the HTTP status code.
+   #
+   # For example:
+   #
+   #  on 404 do |a_url, code|
+   #    puts "URL not found: #{a_url}"
    #  end
    #
-   #  spider(['http://mike-burns.com','http://matthoran.com']) do |u, page|
-   #    scrape_images(page).each { |img| store_image!(img) }
+   #  on :success do |a_url, code, resp, body|
+   #    puts a_url
+   #    puts body
    #  end
-   def spider(urls)
-     rules = RobotRules.new('Ruby Spider 1.0')
-     next_urls = (urls.is_a?(Array) ? urls : [urls])
-     seen = []
-     robots_seen = []
-     # This used to be (tail) recursive, but Ruby doesn't optimize that.
-     # I have no idea if this iterative version is correct, but it seems it.
+   #
+   #  on :any do |a_url, resp|
+   #    puts "Given this code: #{resp.code}"
+   #  end
+   def on(code, p = nil, &block)
+     f = p ? p : block
+     case code
+     when Fixnum
+       @callbacks[success_or_failure(code)][code] = f
+     else
+       if :any == code.to_sym
+         @callbacks[:any] = f
+       else
+         @callbacks[code.to_sym][:any] = f
+       end
+     end
+   end
+
+   def start! #:nodoc:
+     next_urls = @next_urls
      begin
        next_urls = next_urls.map do |a_url|
          [a_url, (URI.parse(a_url) rescue nil)]
        end.select do |a_url, parsed_url|
-         !parsed_url.nil? && !seen.include?(a_url) &&
-           allowed?(a_url, parsed_url, rules, robots_seen)
+         allowable_url?(a_url, parsed_url)
        end.map do |a_url, parsed_url|
-         scrape_links(a_url, parsed_url) do |a_url,web_page|
-           seen << a_url
-           yield(a_url,web_page)
+         get_page(parsed_url) do |response|
+           do_callbacks(a_url, response)
+           generate_next_urls(a_url, response)
          end
-       end.flatten.map { |a_url,parsed_url| a_url }
+       end.flatten
      end while !next_urls.empty?
    end

-   private
+   def success_or_failure(code) #:nodoc:
+     if code > 199 && code < 300
+       :success
+     else
+       :failure
+     end
+   end
+
+   def allowable_url?(a_url, parsed_url) #:nodoc:
+     !parsed_url.nil? && !@seen.include?(parsed_url) && allowed?(a_url, parsed_url) &&
+       @url_checks.map{|url_check|url_check.call(a_url)}.all?
+   end

    # True if the robots.txt for that URL allows access to it.
-   def allowed?(a_url, parsed_url, rules, robots_seen) # :nodoc:
+   def allowed?(a_url, parsed_url) # :nodoc:
      u = "#{parsed_url.scheme}://#{parsed_url.host}:#{parsed_url.port}/robots.txt"
      begin
-       unless robots_seen.include?(u)
+       unless @robots_seen.include?(u)
          open(u, 'User-Agent' => 'Ruby Spider',
            'Accept' => 'text/html,text/xml,application/xml,text/plain') do |url|
-           rules.parse(u, url.read)
+           @rules.parse(u, url.read)
          end
-         robots_seen << u
+         @robots_seen << u
        end
-       rules.allowed?(a_url)
+       @rules.allowed?(a_url)
      rescue OpenURI::HTTPError
-       true
-     rescue Timeout::Error # to keep it from crashing
-       false
-     rescue
+       true # No robots.txt
+     rescue Exception, Timeout::Error # to keep it from crashing
        false
      end
    end

-   # Produce all the links on the page.
-   def scrape_links(a_url, parsed_url) # :nodoc:
+   def get_page(parsed_url, &block) #:nodoc:
+     @seen << parsed_url
      begin
-       open(a_url, 'User-Agent' => 'Ruby Spider',
-         'Accept' => 'text/html,text/xml,application/xml,text/plain') do |data|
-         web_page = data.read
-         base_url = (web_page.scan(/base\s+href="(.*?)"/i).flatten +
-           [a_url[0,a_url.rindex('/')]])[0]
-         links = web_page.scan(/href="(.*?)"/i).flatten.map do |link|
-           begin
-             parsed_link = URI.parse(link)
-             if parsed_link.fragment == '#'
-               nil
+       Net::HTTP.start(parsed_url.host, parsed_url.port) do |http|
+         r = http.request(Net::HTTP::Get.new(parsed_url.path))
+         if r.is_a?(Net::HTTPRedirection)
+           get_page(URI.parse(r['Location']), block)
+         else
+           block.call(r)
+         end
+       end
+     rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError, Exception =>e
+       p e
+       nil
+     end
+   end
+
+   def do_callbacks(a_url, resp) #:nodoc:
+     @callbacks[:any].call(a_url, resp) if @callbacks[:any]
+     if resp.success?
+       cb_branch = @callbacks[:success]
+       cb_branch[:any].call(a_url, resp.code, resp, resp.body) if cb_branch[:any]
+       cb_branch[resp.code].call(a_url, resp.code, resp.headers, resp.body) if cb_branch[resp.code]
+     else
+       cb_branch = @callbacks[:failure]
+       cb_branch[:any].call(a_url, resp.code) if cb_branch[:any]
+       cb_branch[resp.code].call(a_url, resp.code) if cb_branch[resp.code]
+     end
+   end
+
+   def generate_next_urls(a_url, resp) #:nodoc:
+     web_page = resp.body
+     base_url = (web_page.scan(/base\s+href="(.*?)"/i).flatten +
+       [a_url[0,a_url.rindex('/')]])[0]
+     base_url = remove_trailing_slash(base_url)
+     web_page.scan(/href="(.*?)"/i).flatten.map do |link|
+       begin
+         parsed_link = URI.parse(link)
+         if parsed_link.fragment == '#'
+           nil
+         else
+           case parsed_link.scheme
+           when 'http'
+             link
+           when nil
+             u = URI.parse(base_url)
+             if link[0].chr == '/'
+               "#{u.scheme}://#{u.host}:#{u.port}#{link}"
+             elsif u.path.nil? || u.path == ''
+               "#{u.scheme}://#{u.host}:#{u.port}/#{link}"
              else
-               case parsed_link.scheme
-               when 'http'
-                 link
-               when nil
-                 u = URI.parse(base_url)
-                 if link[0].chr == '/'
-                   "#{u.scheme}://#{u.host}:#{u.port}#{link}"
-                 else
-                   "#{u.scheme}://#{u.host}:#{u.port}/#{u.path}/#{link}"
-                 end
-               else
-                 nil
-               end
+               "#{u.scheme}://#{u.host}:#{u.port}/#{u.path}/#{link}"
              end
-           rescue
+         else
              nil
            end
-       end.reject{|link|link.nil?}
-
-       yield(a_url,web_page)
-       links
+       end
+     rescue
+       nil
        end
-     rescue Timeout::Error # to keep it from crashing
-       []
-     rescue
-       []
-     end
+     end.compact
    end

+   def remove_trailing_slash(s)
+     s.sub(%r{/*$},'')
+   end
  end
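
In 0.1.0 the whole public API was the module-level spider(urls) function removed above; in 0.2.0 the same crawl is driven through Spider.start_at, URL checks, and per-status response handlers. The following is a minimal usage sketch assembled from the documentation comments in this diff; the start URL, the regexp, and the handler bodies are illustrative placeholders, not code from the gem.

    require 'spider'

    # Crawl one site, following only links that stay on the same host.
    Spider.start_at('http://example.com/') do |s|
      # Every registered check must return true for a URL to be fetched.
      s.add_url_check { |a_url| a_url =~ %r{^http://example\.com} }

      # :success handlers receive the URL, the status code, the response
      # object, and the body.
      s.on :success do |a_url, code, resp, body|
        puts "#{code} #{a_url} (#{body.length} bytes)"
      end

      # A numeric trigger fires only for that exact status code.
      s.on 404 do |a_url, code|
        puts "URL not found: #{a_url}"
      end
    end

The second hunk below adds a new RSpec suite that exercises SpiderInstance directly, stubbing Net::HTTP with mocha.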
@@ -0,0 +1,219 @@
+ require 'rubygems'
+ require 'spec'
+ require File.dirname(__FILE__)+'/../lib/spider'
+
+ Spec::Runner.configure { |c| c.mock_with :mocha }
+
+ describe 'SpiderInstance' do
+   it 'should skip URLs when allowable_url? is false' do
+     u = 'http://example.com/'
+     u_p = URI.parse(u)
+     http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new([u])
+     si.expects(:allowable_url?).with(u, u_p).returns(false)
+     si.expects(:get_page).times(0)
+     si.start!
+   end
+
+   it 'should not skip URLs when allowable_url? is true' do
+     u = 'http://example.com/'
+     u_p = URI.parse(u)
+     http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new([u])
+     si.expects(:allowable_url?).with(u, u_p).returns(true)
+     si.expects(:allowable_url?).with(nil, nil).returns(false)
+     si.expects(:get_page).with(URI.parse(u))
+     si.start!
+   end
+
+   it 'should disallow URLs when the robots.txt says to' do
+     robot_rules = stub
+     SpiderInstance.any_instance.expects(:open).
+       with('http://example.com:80/robots.txt', 'User-Agent' => 'Ruby Spider',
+         'Accept' => 'text/html,text/xml,application/xml,text/plain').
+       yields(stub(:read => 'robots.txt content'))
+     robot_rules.expects(:parse).with('http://example.com:80/robots.txt',
+       'robots.txt content')
+     robot_rules.expects(:allowed?).with('http://example.com/').returns(false)
+     si = SpiderInstance.new(['http://example.com/'], [], robot_rules, [])
+     allowable = si.allowable_url?('http://example.com/',
+       URI.parse('http://example.com/'))
+     allowable.should == false
+   end
+
+   it 'should disallow URLs when they fail any url_check' do
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.add_url_check { |a_url| false }
+     allowable = si.allowable_url?('http://example.com/',
+       URI.parse('http://example.com/'))
+     allowable.should == false
+   end
+
+   it 'should support multiple url_checks' do
+     @first_url_check = false
+     @second_url_check = false
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.add_url_check do |a_url|
+       @first_url_check = true
+       true
+     end
+     si.add_url_check do |a_url|
+       @second_url_check = true
+       false
+     end
+     allowable = si.allowable_url?('http://example.com/',
+       URI.parse('http://example.com/'))
+     allowable.should == false
+     @first_url_check == true
+     @second_url_check == true
+   end
+
+   it 'should support memcached'
+   it 'should avoid cycles using memcached'
+
+   it 'should support memory' do
+     si = SpiderInstance.new(['http://example.com/'])
+     si.use_cache :memory # No exn
+   end
+
+   it 'should avoid cycles using memory' do
+     u = 'http://example.com/'
+     u_p = URI.parse(u)
+     si = SpiderInstance.new([u], [u_p])
+     si.stubs(:allowed?).returns(true)
+     allowable = si.allowable_url?(u, u_p)
+     allowable.should == false
+     u_p.should_not be_nil
+   end
+
+   it 'should call the 404 handler for 404s' do
+     @proc_called = false
+     http_resp = stub(:success? => false, :code => 404)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(404) {|*a| @proc_called = true}
+     si.start!
+     @proc_called.should == true
+   end
+
+   it 'should call the :success handler on success' do
+     @proc_called = false
+     http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(:success) {|*a| @proc_called = true}
+     si.start!
+     @proc_called.should == true
+   end
+
+   it 'should not call the :success handler on failure' do
+     @proc_called = false
+     http_resp = stub(:success? => false, :code => 404)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(:success) {|*a| @proc_called = true}
+     si.start!
+     @proc_called.should == false
+   end
+
+   it 'should call the :success handler and the 200 handler on 200' do
+     @proc_200_called = false
+     @proc_success_called = false
+     http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(:success) {|*a| @proc_success_called = true}
+     si.on(200) {|*a| @proc_200_called = true}
+     si.start!
+     @proc_200_called.should == true
+     @proc_success_called.should == true
+   end
+
+   it 'should not call the :failure handler on success' do
+     @proc_called = false
+     http_resp = stub(:success? => true, :code => 200, :headers => 1, :body => 1)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(:failure) {|*a| @proc_called = true}
+     si.start!
+     @proc_called.should == false
+   end
+
+   it 'should call the :failure handler on failure' do
+     @proc_called = false
+     http_resp = stub(:success? => false, :code => 404)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(:failure) {|*a| @proc_called = true}
+     si.start!
+     @proc_called.should == true
+   end
+
+   it 'should call the :failure handler and the 404 handler on 404' do
+     @proc_404_called = false
+     @proc_failure_called = false
+     http_resp = stub(:success? => false, :code => 404)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(:failure) {|*a| @proc_failure_called = true}
+     si.on(404) {|*a| @proc_404_called = true}
+     si.start!
+     @proc_404_called.should == true
+     @proc_failure_called.should == true
+   end
+
+   it 'should call the :any handler even when a handler for the error code is defined' do
+     @any_called = false
+     http_resp = stub(:success? => true, :code => 200)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(:any) { |*a| @any_called = true }
+     si.on(202) {|*a|}
+     si.start!
+     @any_called.should == true
+   end
+
+   it 'should support a block as a response handler' do
+     @proc_called = false
+     http_resp = stub(:success? => true, :code => 200)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(:any) { |*a| @proc_called = true }
+     si.start!
+     @proc_called.should == true
+   end
+
+   it 'should support a proc as a response handler' do
+     @proc_called = false
+     http_resp = stub(:success? => true, :code => 200)
+     Net::HTTP.stubs(:start).yields(stub(:request => http_resp))
+     si = SpiderInstance.new(['http://example.com/'])
+     si.stubs(:allowed?).returns(true)
+     si.stubs(:generate_next_urls).returns([])
+     si.on(:any, Proc.new { |*a| @proc_called = true })
+     si.start!
+     @proc_called.should == true
+   end
+ end
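
The handler dispatch that these examples assert can also be observed in isolation by calling do_callbacks directly. do_callbacks is an internal (#:nodoc:) method, so the sketch below only illustrates the dispatch rules, not a supported API; FakeResponse is a hypothetical stand-in for Net::HTTPResponse and is not part of the gem or its specs.

    require 'spider'

    # Minimal fake exposing the three methods do_callbacks relies on:
    # success?, code, and body.
    FakeResponse = Struct.new(:code, :body) do
      def success?
        code > 199 && code < 300
      end
    end

    si = SpiderInstance.new(['http://example.com/'])
    si.on(:any)     { |a_url, resp| puts "saw #{a_url} (#{resp.code})" }
    si.on(:failure) { |a_url, code| puts "failed: #{a_url}" }
    si.on(404)      { |a_url, code| puts "missing: #{a_url}" }

    # A 404 response fires the :any handler, the :failure handler, and the
    # 404 handler, mirroring the ':failure handler and the 404 handler' and
    # ':any handler' examples above.
    si.do_callbacks('http://example.com/missing', FakeResponse.new(404, ''))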