RubyGems - spidr - Versions diffs - 0.2.1 → 0.2.2 - Mend

spidr 0.2.1 → 0.2.2

Files changed (56) hide show

data.tar.gz.sig +0 -0
data/History.rdoc +191 -0
data/Manifest.txt +10 -34
data/{README.txt → README.rdoc} +3 -1
data/Rakefile +6 -4
data/lib/spidr/agent.rb +137 -97
data/lib/spidr/auth_credential.rb +25 -0
data/lib/spidr/auth_store.rb +157 -0
data/lib/spidr/cookie_jar.rb +166 -0
data/lib/spidr/filters.rb +2 -0
data/lib/spidr/page.rb +75 -11
data/lib/spidr/sanitizers.rb +59 -0
data/lib/spidr/session_cache.rb +119 -0
data/lib/spidr/version.rb +1 -1
data/spec/agent_spec.rb +2 -2
data/spec/helpers/history.rb +34 -0
data/spec/helpers/wsoc.rb +83 -0
data/spec/page_examples.rb +5 -1
data/spec/page_spec.rb +30 -0
data/spec/sanitizers_spec.rb +67 -0
data/tasks/yard.rb +1 -1
metadata +24 -40
metadata.gz.sig +0 -0
data/History.txt +0 -167
data/spec/helpers/course.rb +0 -95
data/static/course/absolute/index.html +0 -10
data/static/course/absolute/next.html +0 -9
data/static/course/absolute/start.html +0 -19
data/static/course/empty/index.html +0 -10
data/static/course/empty/start.html +0 -23
data/static/course/fail.html +0 -14
data/static/course/frames/frame.html +0 -15
data/static/course/frames/frame_next.html +0 -9
data/static/course/frames/iframe.html +0 -15
data/static/course/frames/iframe_next.html +0 -9
data/static/course/frames/index.html +0 -10
data/static/course/frames/start.html +0 -15
data/static/course/index.html +0 -10
data/static/course/javascript/index.html +0 -10
data/static/course/javascript/start.html +0 -19
data/static/course/loop/index.html +0 -10
data/static/course/loop/next.html +0 -13
data/static/course/loop/start.html +0 -19
data/static/course/relative/current_directory.html +0 -9
data/static/course/relative/index.html +0 -10
data/static/course/relative/normal.html +0 -9
data/static/course/relative/same_directory.html +0 -9
data/static/course/relative/start.html +0 -27
data/static/course/remote/index.html +0 -10
data/static/course/remote/next.html +0 -9
data/static/course/remote/start.html +0 -27
data/static/course/scripts/course.js +0 -29
data/static/course/scripts/jquery-1.2.6.min.js +0 -32
data/static/course/specs.json +0 -1
data/static/course/start.html +0 -27
data/tasks/course.rb +0 -63

data.tar.gz.sig CHANGED Viewed

Binary file

data/History.rdoc ADDED Viewed

@@ -0,0 +1,191 @@
+=== 0.2.2 / 2010-01-06
+* Require Web Spider Obstacle Course (WSOC) >= 0.1.1.
+* Integrated the new WSOC into the specs.
+* Removed the built-in Web Spider Obstacle Course.
+* Added {Spidr::Page#content_types}.
+* Added {Spidr::Page#cookie}.
+* Added {Spidr::Page#cookies}.
+* Added {Spidr::Page#cookie_params}.
+* Added {Spidr::Sanitizers}.
+* Added {Spidr::SessionCache}.
+* Added {Spidr::CookieJar} (thanks Nick Plante).
+* Added {Spidr::AuthStore} (thanks Nick Plante).
+* Added {Spidr::Agent#post_page} (thanks Nick Plante).
+* Renamed Spidr::Agent#get_session to {Spidr::SessionCache#[]}.
+* Renamed Spidr::Agent#kill_session to {Spidr::SessionCache#kill!}.
+=== 0.2.1 / 2009-11-25
+* Added {Spidr::Events#every_ok_page}.
+* Added {Spidr::Events#every_redirect_page}.
+* Added {Spidr::Events#every_timedout_page}.
+* Added {Spidr::Events#every_bad_request_page}.
+* Added {Spidr::Events#every_unauthorized_page}.
+* Added {Spidr::Events#every_forbidden_page}.
+* Added {Spidr::Events#every_missing_page}.
+* Added {Spidr::Events#every_internal_server_error_page}.
+* Added {Spidr::Events#every_txt_page}.
+* Added {Spidr::Events#every_html_page}.
+* Added {Spidr::Events#every_xml_page}.
+* Added {Spidr::Events#every_xsl_page}.
+* Added {Spidr::Events#every_doc}.
+* Added {Spidr::Events#every_html_doc}.
+* Added {Spidr::Events#every_xml_doc}.
+* Added {Spidr::Events#every_xsl_doc}.
+* Added {Spidr::Events#every_rss_doc}.
+* Added {Spidr::Events#every_atom_doc}.
+* Added {Spidr::Events#every_javascript_page}.
+* Added {Spidr::Events#every_css_page}.
+* Added {Spidr::Events#every_rss_page}.
+* Added {Spidr::Events#every_atom_page}.
+* Added {Spidr::Events#every_ms_word_page}.
+* Added {Spidr::Events#every_pdf_page}.
+* Added {Spidr::Events#every_zip_page}.
+* Fixed a bug where {Spidr::Agent#delay} was not being used to delay
+  requesting pages.
+* Spider +link+ and +script+ tags in HTML pages (thanks Nick Plante).
+=== 0.2.0 / 2009-10-10
+* Added {URI.expand_path}.
+* Added {Spidr::Page#search}.
+* Added {Spidr::Page#at}.
+* Added {Spidr::Page#title}.
+* Added {Spidr::Agent#failures=}.
+* Added an HTTP session cache to {Spidr::Agent}, per suggestion of falter.
+  * Added Spidr::Agent#get_session.
+  * Added Spidr::Agent#kill_session.
+* Added {Spidr.proxy=}.
+* Added {Spidr.disable_proxy!}.
+* Aliased Spidr::Page#txt? to {Spidr::Page#plain_text?}.
+* Aliased Spidr::Page#ok? to {Spidr::Page#is_ok?}.
+* Aliased Spidr::Page#redirect? to {Spidr::Page#is_redirect?}.
+* Aliased Spidr::Page#unauthorized? to {Spidr::Page#is_unauthorized?}.
+* Aliased Spidr::Page#forbidden? to {Spidr::Page#is_forbidden?}.
+* Aliased Spidr::Page#missing? to {Spidr::Page#is_missing?}.
+* Split URL filtering code out of {Spidr::Agent} and into
+  {Spidr::Filters}.
+* Split URL / Page event code out of {Spidr::Agent} and into
+  {Spidr::Events}.
+* Split pause! / continue! / skip_link! / skip_page! methods out of
+  {Spidr::Agent} and into {Spidr::Actions}.
+* Fixed a bug in {Spidr::Page#code}, where it was not returning an Integer.
+* Make sure {Spidr::Page#doc} returns Nokogiri::XML::Document objects for
+  RSS/RDF/Atom pages as well.
+* Fixed the handling of the Location header in {Spidr::Page#links}
+  (thanks falter).
+* Fixed a bug in {Spidr::Page#to_absolute} where trailing '/' characters on
+  URI paths were not being preserved (thanks falter).
+* Fixed a bug where the URI query was not being sent with the request
+  in {Spidr::Agent#get_page} (thanks Damian Steer).
+* Fixed a bug where SSL sessions were not being properly set up
+  (thanks falter).
+* Switched {Spidr::Agent#history} to be a Set, to improve search-time
+  of the history (thanks falter).
+* Switched {Spidr::Agent#failures} to a Set.
+* Allow a block to be passed to {Spidr::Agent#run}, which will receive all
+  pages visited.
+* Allow Spidr::Agent#start_at and Spidr::Agent#continue! to pass blocks
+  to {Spidr::Agent#run}.
+* Made {Spidr::Agent#visit_page} public.
+* Moved to YARD based documentation.
+=== 0.1.9 / 2009-06-13
+* Upgraded to Hoe 2.0.0.
+  * Use Hoe.spec instead of Hoe.new.
+  * Use the Hoe signing task for signed gems.
+* Added the Spidr::Agent#schemes and Spidr::Agent#schemes= methods.
+* Added a warning message if 'net/https' cannot be loaded.
+* Allow the list of acceptable URL schemes to be passed into
+  {Spidr::Agent#initialize}.
+* Allow history and queue information to be passed into
+  {Spidr::Agent#initialize}.
+* {Spidr::Agent#start_at} no longer clears the history or the queue.
+* Fixed a bug in the sanitization of semi-escaped URLs.
+* Fixed a bug where https URLs would be followed even if 'net/https'
+  could not be loaded.
+* Removed Spidr::Agent::SCHEMES.
+=== 0.1.8 / 2009-05-27
+* Added the Spidr::Agent#pause! and Spidr::Agent#continue! methods.
+* Added the Spidr::Agent#running? and Spidr::Agent#paused? methods.
+* Added an alias for pending_urls to the queue methods.
+* Added {Spidr::Agent#queue} to provide read access to the queue.
+* Added {Spidr::Agent#queue=} and {Spidr::Agent#history=} for setting the
+  queue and history.
+* Added {Spidr::Agent#to_hash} which returns a Hash of the agent's queue and
+  history.
+* Made {Spidr::Agent#enqueue} and {Spidr::Agent#queued?} public.
+* Added more specs.
+=== 0.1.7 / 2009-04-24
+* Added Spidr::Agent#all_headers.
+* Fixed a bug where Page#headers was always +nil+.
+* {Spidr::Agent} will now follow the Location header in HTTP 300,
+  301, 302, 303 and 307 Redirects.
+* {Spidr::Agent} will now follow iframe and frame tags.
+=== 0.1.6 / 2009-04-14
+* Added {Spidr::Agent#failures}, a list of URLs which could not be visited.
+* Added {Spidr::Agent#failed?}.
+* Added Spidr::Agent#every_failed_url.
+* Added {Spidr::Agent#clear}, which clears the history and failures URL
+  lists.
+* Improved fault tolerance in {Spidr::Agent#get_page}.
+  * If a Network or HTTP error is encountered, the URL will be added to
+    the failures list and the next URL will be visited.
+* Fixed a typo in Spidr::Agent#ignore_exts_like.
+* Updated the Web Spider Obstacle Course with links that always fail to be
+  visited.
+=== 0.1.5 / 2009-03-22
+* Catch malformed URIs in {Spidr::Page#to_absolute} and return +nil+.
+* Filter out +nil+ URIs in {Spidr::Page#urls}.
+=== 0.1.4 / 2009-01-15
+* Use Nokogiri for HTML and XML parsing.
+=== 0.1.3 / 2009-01-10
+* Added the :host option to {Spidr::Agent#initialize}.
+* Added the Web Spider Obstacle Course files to the Manifest.
+* Aliased {Spidr::Agent#visited_urls} to {Spidr::Agent#history}.
+=== 0.1.2 / 2008-11-06
+* Fixed a bug in {Spidr::Page#to_absolute} where URLs with no path were not
+  receiving a default path of <tt>/</tt>.
+* Fixed a bug in {Spidr::Page#to_absolute} where URL paths were not being
+  expanded, in order to remove <tt>..</tt> and <tt>.</tt> directories.
+* Fixed a bug where absolute URLs could have a blank path, thus causing
+  {Spidr::Agent#get_page} to crash when it performed the HTTP request.
+* Added RSpec spec tests.
+* Created a Web-Spider Obstacle Course
+  (http://spidr.rubyforge.org/course/start.html) which is used in the spec
+  tests.
+=== 0.1.1 / 2008-10-04
+* Added a reader method for the response instance variable in Page.
+* Fixed a bug in {Spidr::Page#method_missing}.
+=== 0.1.0 / 2008-05-23
+* Initial release.
+  * Black-list or white-list URLs based upon:
+    * Host name
+    * Port number
+    * Full link
+    * URL extension
+  * Provides call-backs for:
+    * Every visited Page.
+    * Every visited URL.
+    * Every visited URL that matches a specified pattern.

data/Manifest.txt CHANGED Viewed

@@ -1,11 +1,12 @@
-History.txt
+History.rdoc
 Manifest.txt
-README.txt
+README.rdoc
 Rakefile
 lib/spidr.rb
 lib/spidr/extensions.rb
 lib/spidr/extensions/uri.rb
 lib/spidr/page.rb
+lib/spidr/sanitizers.rb
 lib/spidr/rules.rb
 lib/spidr/filters.rb
 lib/spidr/events.rb
@@ -16,50 +17,25 @@ lib/spidr/actions/exceptions/paused.rb
 lib/spidr/actions/exceptions/skip_link.rb
 lib/spidr/actions/exceptions/skip_page.rb
 lib/spidr/actions/actions.rb
+lib/spidr/session_cache.rb
+lib/spidr/cookie_jar.rb
+lib/spidr/auth_credential.rb
+lib/spidr/auth_store.rb
 lib/spidr/agent.rb
 lib/spidr/spidr.rb
 lib/spidr/version.rb
 tasks/spec.rb
 tasks/yard.rb
-tasks/course.rb
 spec/spec_helper.rb
-spec/helpers/course.rb
+spec/helpers/history.rb
+spec/helpers/wsoc.rb
 spec/helpers/page.rb
 spec/extensions/uri_spec.rb
 spec/page_examples.rb
 spec/page_spec.rb
 spec/rules_spec.rb
+spec/sanitizers_spec.rb
 spec/filters_spec.rb
 spec/actions_spec.rb
 spec/agent_spec.rb
 spec/spidr_spec.rb
-static/course/index.html
-static/course/start.html
-static/course/fail.html
-static/course/scripts/jquery-1.2.6.min.js
-static/course/scripts/course.js
-static/course/empty/index.html
-static/course/empty/start.html
-static/course/javascript/index.html
-static/course/javascript/start.html
-static/course/loop/index.html
-static/course/loop/start.html
-static/course/loop/next.html
-static/course/relative/index.html
-static/course/relative/start.html
-static/course/relative/normal.html
-static/course/relative/current_directory.html
-static/course/relative/same_directory.html
-static/course/absolute/index.html
-static/course/absolute/start.html
-static/course/absolute/next.html
-static/course/remote/index.html
-static/course/remote/start.html
-static/course/remote/next.html
-static/course/frames/index.html
-static/course/frames/start.html
-static/course/frames/iframe.html
-static/course/frames/iframe_next.html
-static/course/frames/frame.html
-static/course/frames/frame_next.html
-static/course/specs.json

data/{README.txt → README.rdoc} RENAMED Viewed

@@ -18,7 +18,9 @@ and easy to use.
   * a tags.
   * iframe tags.
   * frame tags.
+  * Cookie protected links.
   * HTTP 300, 301, 302, 303 and 307 Redirects.
+  * HTTP Basic Auth protected links.
 * Black-list or white-list URLs based upon:
   * URL scheme.
   * Host name
@@ -156,7 +158,7 @@ and easy to use.
 The MIT License
-Copyright (c) 2008-2009 Hal Brodigan
+Copyright (c) 2008-2010 Hal Brodigan
 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the

data/Rakefile CHANGED Viewed

@@ -5,20 +5,22 @@ require 'hoe'
 require 'hoe/signing'
 require './tasks/spec.rb'
 require './tasks/yard.rb'
-require './tasks/course.rb'
-require './lib/spidr/version.rb'
 Hoe.spec('spidr') do
-  self.rubyforge_name = 'spidr'
   self.developer('Postmodern', 'postmodern.mod3@gmail.com')
+  self.readme_file = 'README.rdoc'
+  self.history_file = 'History.rdoc'
   self.remote_rdoc_dir = 'docs'
   self.extra_deps = [
     ['nokogiri', '>=1.2.0']
   ]
   self.extra_dev_deps = [
     ['rspec', '>=1.2.8'],
-    ['yard', '>=0.4.0']
+    ['yard', '>=0.4.0'],
+    ['wsoc', '>=0.1.1']
   ]
   self.spec_extras = {:has_rdoc => 'yard'}

data/lib/spidr/agent.rb CHANGED Viewed

@@ -1,7 +1,11 @@
+require 'spidr/sanitizers'
 require 'spidr/filters'
 require 'spidr/events'
 require 'spidr/actions'
 require 'spidr/page'
+require 'spidr/session_cache'
+require 'spidr/cookie_jar'
+require 'spidr/auth_store'
 require 'spidr/spidr'
 require 'net/http'
@@ -10,16 +14,17 @@ require 'set'
 module Spidr
   class Agent
+    include Sanitizers
     include Filters
     include Events
     include Actions
-    # Proxy to use
-    attr_accessor :proxy
     # User-Agent to use
     attr_accessor :user_agent
+    # HTTP Authentication credentials
+    attr_accessor :authorized
     # Referer to use
     attr_accessor :referer
@@ -35,6 +40,9 @@ module Spidr
     # Queue of URLs to visit
     attr_reader :queue
+    # Cached cookies
+    attr_reader :cookies
     #
     # Creates a new Agent object.
     #
@@ -79,18 +87,19 @@ module Spidr
     #   The newly created agent.
     #
     def initialize(options={},&block)
-      @proxy = (options[:proxy] || Spidr.proxy)
       @user_agent = (options[:user_agent] || Spidr.user_agent)
       @referer = options[:referer]
+      @sessions = SessionCache.new(options[:proxy] || Spidr.proxy)
+      @cookies = CookieJar.new
+      @authorized = AuthStore.new
       @running = false
       @delay = (options[:delay] || 0)
       @history = Set[]
       @failures = Set[]
       @queue = []
-      @sessions = {}
       super(options)
       block.call(self) if block
@@ -222,14 +231,6 @@ module Spidr
       @running = false
-      @sessions.each_value do |sess|
-        begin
-          sess.finish
-        rescue IOError
-          nil
-        end
-      end
       @sessions.clear
       return self
     end
@@ -244,6 +245,37 @@ module Spidr
       @running == true
     end
+    #
+    # The proxy information the agent uses.
+    #
+    # @return [Hash]
+    #   The proxy information.
+    #
+    # @see SessionCache#proxy
+    #
+    # @since 0.2.2
+    #
+    def proxy
+      @sessions.proxy
+    end
+    #
+    # Sets the proxy information that the agent uses.
+    #
+    # @param [Hash] new_proxy
+    #   The new proxy information.
+    #
+    # @return [Hash]
+    #   The new proxy information.
+    #
+    # @see SessionCache#proxy=
+    #
+    # @since 0.2.2
+    #
+    def proxy=(new_proxy)
+      @sessions.proxy = new_proxy
+    end
     #
     # Sets the history of URLs that were previously visited.
     #
@@ -400,10 +432,11 @@ module Spidr
     #   Specifies whether the URL was enqueued, or ignored.
     #
     def enqueue(url)
-      link = url.to_s
-      url = URI(link) unless url.kind_of?(URI)
+      url = sanitize_url(url)
       if (!(queued?(url)) && visit?(url))
+        link = url.to_s
         begin
           @every_url_blocks.each { |block| block.call(url) }
@@ -443,37 +476,51 @@ module Spidr
     #   The page for the response, or +nil+ if the request failed.
     #
     def get_page(url,&block)
-      url = URI(url.to_s) unless url.kind_of?(URI)
+      url = URI(url.to_s)
-      host = url.host
-      port = url.port
+      prepare_request(url) do |session,path,headers|
+        new_page = Page.new(url,session.get(path,headers))
-      unless url.path.empty?
-        path = url.path
-      else
-        path = '/'
-      end
+        # save any new cookies
+        @cookies.from_page(new_page)
-      # append the URL query to the path
-      path += "?#{url.query}" if url.query
+        block.call(new_page) if block
+        return new_page
+      end
+    end
-      begin
-        sleep(@delay) if @delay > 0
+    #
+    # Posts supplied form data and creates a new Page object from a given URL.
+    #
+    # @param [URI::HTTP] url
+    #   The URL to request.
+    #
+    # @param [String] post_data
+    #   The form data to send as the body of the POST request.
+    #
+    # @yield [page]
+    #   If a block is given, it will be passed the page that represents the
+    #   response.
+    #
+    # @yieldparam [Page] page
+    #   The page for the response.
+    #
+    # @return [Page, nil]
+    #   The page for the response, or +nil+ if the request failed.
+    #
+    # @since 0.2.2
+    #
+    def post_page(url,post_data='',&block)
+      url = URI(url.to_s)
-        get_session(url.scheme,host,port) do |sess|
-          headers = {}
-          headers['User-Agent'] = @user_agent if @user_agent
-          headers['Referer'] = @referer if @referer
+      prepare_request(url) do |session,path,headers|
+        new_page = Page.new(url,session.post(path,post_data,headers))
-          new_page = Page.new(url,sess.get(path,headers))
+        # save any new cookies
+        @cookies.from_page(new_page)
-          block.call(new_page) if block
-          return new_page
-        end
-      rescue SystemCallError, Timeout::Error, Net::HTTPBadResponse, IOError
-        failed(url)
-        kill_session(url.scheme,host,port)
-        return nil
+        block.call(new_page) if block
+        return new_page
       end
     end
@@ -529,73 +576,66 @@ module Spidr
     protected
     #
-    # Provides an active HTTP session for the given scheme, host
-    # and port.
-    #
-    # @param [String] scheme
-    #   The scheme of the URL, which will be requested later.
-    #
-    # @param [String] host
-    #   The host that the session is needed with.
+    # Normalizes the request path and grabs a session to handle page
+    # get and post requests.
     #
-    # @param [Integer] port
-    #   The port that the session is needed for.
+    # @param [URI::HTTP] url
+    #   The URL to request.
     #
-    # @yield [session]
-    #   If a block is given, it will be passed the active HTTP session.
+    # @yield [session, path, headers]
+    #   A block whose purpose is to make a page request.
     #
     # @yieldparam [Net::HTTP] session
-    #   The active HTTP session object.
-    #
-    def get_session(scheme,host,port,&block)
-      key = [scheme,host,port]
-      unless @sessions[key]
-        session = Net::HTTP::Proxy(
-          @proxy[:host],
-          @proxy[:port],
-          @proxy[:user],
-          @proxy[:password]
-        ).new(host,port)
-        if scheme == 'https'
-          session.use_ssl = true
-          session.verify_mode = OpenSSL::SSL::VERIFY_NONE
-        end
-        @sessions[key] = session
-      end
-      session = @sessions[key]
-      block.call(session) if block
-      return session
-    end
-    #
-    # Destroys an HTTP session for the given scheme, host and port.
+    #   An HTTP session object.
     #
-    # @param [String] scheme
-    #   The scheme of the URL, which was requested through the session.
+    # @yieldparam [String] path
+    #   The normalized request path, including any query string.
     #
-    # @param [String] host
-    #   The host that the session was connected with.
+    # @yieldparam [Hash] headers
+    #   A Hash of request header options.
     #
-    # @param [Integer] port
-    #   The port that the session was connected to.
+    # @since 0.2.2
     #
-    def kill_session(scheme,host,port,&block)
-      key = [scheme,host,port]
-      sess = @sessions[key]
+    def prepare_request(url,&block)
+      host = url.host
+      port = url.port
-      begin
-        sess.finish
-      rescue IOError
-        nil
+      unless url.path.empty?
+        path = url.path
+      else
+        path = '/'
       end
-      @sessions.delete(key)
-      block.call if block
-      return nil
+      # append the URL query to the path
+      path += "?#{url.query}" if url.query
+      begin
+        sleep(@delay) if @delay > 0
+        headers = {}
+        headers['User-Agent'] = @user_agent if @user_agent
+        headers['Referer'] = @referer if @referer
+        if (authorization = @authorized.for_url(url))
+          headers['Authorization'] = "Basic #{authorization}"
+        end
+        if (header_cookies = @cookies.for_host(url.host))
+          headers['Cookie'] = header_cookies
+        end
+        block.call(@sessions[url],path,headers)
+      rescue SystemCallError,
+             Timeout::Error,
+             SocketError,
+             Net::HTTPBadResponse,
+             IOError
+        @sessions.kill!(url)
+        failed(url)
+        return nil
+      end
     end
     #
@@ -633,8 +673,8 @@ module Spidr
     #   The URL to add to the failures list.
     #
     def failed(url)
-      @every_failed_url_blocks.each { |block| block.call(url) }
       @failures << url
+      @every_failed_url_blocks.each { |block| block.call(url) }
       return true
     end