RubyGems - spidr - Versions diffs - 0.2.1 → 0.2.2 - Mend

spidr 0.2.1 → 0.2.2

Files changed (56) hide show

data.tar.gz.sig +0 -0
data/History.rdoc +191 -0
data/Manifest.txt +10 -34
data/{README.txt → README.rdoc} +3 -1
data/Rakefile +6 -4
data/lib/spidr/agent.rb +137 -97
data/lib/spidr/auth_credential.rb +25 -0
data/lib/spidr/auth_store.rb +157 -0
data/lib/spidr/cookie_jar.rb +166 -0
data/lib/spidr/filters.rb +2 -0
data/lib/spidr/page.rb +75 -11
data/lib/spidr/sanitizers.rb +59 -0
data/lib/spidr/session_cache.rb +119 -0
data/lib/spidr/version.rb +1 -1
data/spec/agent_spec.rb +2 -2
data/spec/helpers/history.rb +34 -0
data/spec/helpers/wsoc.rb +83 -0
data/spec/page_examples.rb +5 -1
data/spec/page_spec.rb +30 -0
data/spec/sanitizers_spec.rb +67 -0
data/tasks/yard.rb +1 -1
metadata +24 -40
metadata.gz.sig +0 -0
data/History.txt +0 -167
data/spec/helpers/course.rb +0 -95
data/static/course/absolute/index.html +0 -10
data/static/course/absolute/next.html +0 -9
data/static/course/absolute/start.html +0 -19
data/static/course/empty/index.html +0 -10
data/static/course/empty/start.html +0 -23
data/static/course/fail.html +0 -14
data/static/course/frames/frame.html +0 -15
data/static/course/frames/frame_next.html +0 -9
data/static/course/frames/iframe.html +0 -15
data/static/course/frames/iframe_next.html +0 -9
data/static/course/frames/index.html +0 -10
data/static/course/frames/start.html +0 -15
data/static/course/index.html +0 -10
data/static/course/javascript/index.html +0 -10
data/static/course/javascript/start.html +0 -19
data/static/course/loop/index.html +0 -10
data/static/course/loop/next.html +0 -13
data/static/course/loop/start.html +0 -19
data/static/course/relative/current_directory.html +0 -9
data/static/course/relative/index.html +0 -10
data/static/course/relative/normal.html +0 -9
data/static/course/relative/same_directory.html +0 -9
data/static/course/relative/start.html +0 -27
data/static/course/remote/index.html +0 -10
data/static/course/remote/next.html +0 -9
data/static/course/remote/start.html +0 -27
data/static/course/scripts/course.js +0 -29
data/static/course/scripts/jquery-1.2.6.min.js +0 -32
data/static/course/specs.json +0 -1
data/static/course/start.html +0 -27
data/tasks/course.rb +0 -63

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: spidr
 version: !ruby/object:Gem::Version
-  version: 0.2.1
+  version: 0.2.2
 platform: ruby
 authors:
 - Postmodern
@@ -30,7 +30,7 @@ cert_chain:
   pDj+ws7QjtH/Qcrr1l9jfN0ehDs=
   -----END CERTIFICATE-----
-date: 2009-11-25 00:00:00 -08:00
+date: 2010-01-06 00:00:00 -08:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -63,6 +63,16 @@ dependencies:
       - !ruby/object:Gem::Version
         version: 0.4.0
     version:
+- !ruby/object:Gem::Dependency
+  name: wsoc
+  type: :development
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.1.1
+    version:
 - !ruby/object:Gem::Dependency
   name: hoe
   type: :development
@@ -71,7 +81,7 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 2.3.3
+        version: 2.4.0
     version:
 description: |-
   Spidr is a versatile Ruby web spidering library that can spider a site,
@@ -84,18 +94,17 @@ executables: []
 extensions: []
 extra_rdoc_files:
-- History.txt
 - Manifest.txt
-- README.txt
 files:
-- History.txt
+- History.rdoc
 - Manifest.txt
-- README.txt
+- README.rdoc
 - Rakefile
 - lib/spidr.rb
 - lib/spidr/extensions.rb
 - lib/spidr/extensions/uri.rb
 - lib/spidr/page.rb
+- lib/spidr/sanitizers.rb
 - lib/spidr/rules.rb
 - lib/spidr/filters.rb
 - lib/spidr/events.rb
@@ -106,53 +115,28 @@ files:
 - lib/spidr/actions/exceptions/skip_link.rb
 - lib/spidr/actions/exceptions/skip_page.rb
 - lib/spidr/actions/actions.rb
+- lib/spidr/session_cache.rb
+- lib/spidr/cookie_jar.rb
+- lib/spidr/auth_credential.rb
+- lib/spidr/auth_store.rb
 - lib/spidr/agent.rb
 - lib/spidr/spidr.rb
 - lib/spidr/version.rb
 - tasks/spec.rb
 - tasks/yard.rb
-- tasks/course.rb
 - spec/spec_helper.rb
-- spec/helpers/course.rb
+- spec/helpers/history.rb
+- spec/helpers/wsoc.rb
 - spec/helpers/page.rb
 - spec/extensions/uri_spec.rb
 - spec/page_examples.rb
 - spec/page_spec.rb
 - spec/rules_spec.rb
+- spec/sanitizers_spec.rb
 - spec/filters_spec.rb
 - spec/actions_spec.rb
 - spec/agent_spec.rb
 - spec/spidr_spec.rb
-- static/course/index.html
-- static/course/start.html
-- static/course/fail.html
-- static/course/scripts/jquery-1.2.6.min.js
-- static/course/scripts/course.js
-- static/course/empty/index.html
-- static/course/empty/start.html
-- static/course/javascript/index.html
-- static/course/javascript/start.html
-- static/course/loop/index.html
-- static/course/loop/start.html
-- static/course/loop/next.html
-- static/course/relative/index.html
-- static/course/relative/start.html
-- static/course/relative/normal.html
-- static/course/relative/current_directory.html
-- static/course/relative/same_directory.html
-- static/course/absolute/index.html
-- static/course/absolute/start.html
-- static/course/absolute/next.html
-- static/course/remote/index.html
-- static/course/remote/start.html
-- static/course/remote/next.html
-- static/course/frames/index.html
-- static/course/frames/start.html
-- static/course/frames/iframe.html
-- static/course/frames/iframe_next.html
-- static/course/frames/frame.html
-- static/course/frames/frame_next.html
-- static/course/specs.json
 has_rdoc: yard
 homepage: http://spidr.rubyforge.org
 licenses: []
@@ -160,7 +144,7 @@ licenses: []
 post_install_message:
 rdoc_options:
 - --main
-- README.txt
+- README.rdoc
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement

metadata.gz.sig CHANGED Viewed

Binary file

data/History.txt DELETED Viewed

@@ -1,167 +0,0 @@
-=== 0.2.1 / 2009-11-25
-* Added Spidr::Events#every_ok_page.
-* Added Spidr::Events#every_redirect_page.
-* Added Spidr::Events#every_timedout_page.
-* Added Spidr::Events#every_bad_request_page.
-* Added Spidr::Events#every_unauthorized_page.
-* Added Spidr::Events#every_forbidden_page.
-* Added Spidr::Events#every_missing_page.
-* Added Spidr::Events#every_internal_server_error_page.
-* Added Spidr::Events#every_txt_page.
-* Added Spidr::Events#every_html_page.
-* Added Spidr::Events#every_xml_page.
-* Added Spidr::Events#every_xsl_page.
-* Added Spidr::Events#every_doc.
-* Added Spidr::Events#every_html_doc.
-* Added Spidr::Events#every_xml_doc.
-* Added Spidr::Events#every_xsl_doc.
-* Added Spidr::Events#every_rss_doc.
-* Added Spidr::Events#every_atom_doc.
-* Added Spidr::Events#every_javascript_page.
-* Added Spidr::Events#every_css_page.
-* Added Spidr::Events#every_rss_page.
-* Added Spidr::Events#every_atom_page.
-* Added Spidr::Events#every_ms_word_page.
-* Added Spidr::Events#every_pdf_page.
-* Added Spidr::Events#every_zip_page.
-* Fixed a bug where Spidr::Agent#delay was not being used to delay
-  requesting pages.
-* Spider +link+ and +script+ tags in HTML pages (thanks Nick Plante).
-=== 0.2.0 / 2009-10-10
-* Added URI.expand_path.
-* Added Spidr::Page#search.
-* Added Spidr::Page#at.
-* Added Spidr::Page#title.
-* Added Spidr::Agent#failures=.
-* Added a HTTP session cache to Spidr::Agent, per suggestion of falter.
-  * Added Spidr::Agent#get_session.
-  * Added Spidr::Agent#kill_session.
-* Added Spidr.proxy=.
-* Added Spidr.disable_proxy!.
-* Aliased Spidr::Page#txt? to Spidr::Page#plain_text?.
-* Aliased Spidr::Page#ok? to Spidr::Page#is_ok?.
-* Aliased Spidr::Page#redirect? to Spidr::Page#is_redirect?.
-* Aliased Spidr::Page#unauthorized? to Spidr::Page#is_unauthorized?.
-* Aliased Spidr::Page#forbidden? to Spidr::Page#is_forbidden?.
-* Aliased Spidr::Page#missing? to Spidr::Page#is_missing?.
-* Split URL filtering code out of Spidr::Agent and into Spidr::Filtering.
-* Split URL / Page event code out of Spidr::Agent and into Spidr::Events.
-* Split pause! / continue! / skip_link! / skip_page! methods out of
-  Spidr::Agent and into Spidr::Actions.
-* Fixed a bug in Spidr::Page#code, where it was not returning an Integer.
-* Make sure Spidr::Page#doc returns Nokogiri::XML::Document objects for
-  RSS/RDF/Atom pages as well.
-* Fixed the handling of the Location header in Spidr::Page#links
-  (thanks falter).
-* Fixed a bug in Spidr::Page#to_absolute where trailing '/' characters on
-  URI paths were not being preserved (thanks falter).
-* Fixed a bug where the URI query was not being sent with the request
-  in Spidr::Agent#get_page (thanks Damian Steer).
-* Fixed a bug where SSL sessions were not being properly setup
-  (thanks falter).
-* Switched Spidr::Agent#history to be a Set, to improve search-time
-  of the history (thanks falter).
-* Switched Spidr::Agent#failures to a Set.
-* Allow a block to be passed to Spidr::Agent#run, which will receive all
-  pages visited.
-* Allow Spidr::Agent#start_at and Spidr::Agent#continue! to pass blocks to
-  Spidr::Agent#run.
-* Made Spidr::Agent#visit_page public.
-* Moved to YARD based documentation.
-=== 0.1.9 / 2009-06-13
-* Upgraded to Hoe 2.0.0.
-  * Use Hoe.spec instead of Hoe.new.
-  * Use the Hoe signing task for signed gems.
-* Added the Agent#schemes and Agent#schemes= methods.
-* Added a warning message if 'net/https' cannot be loaded.
-* Allow the list of acceptable URL schemes to be passed into Agent.new.
-* Allow history and queue information to be passed into Agent.new.
-* Agent#start_at no longer clears the history or the queue.
-* Fixed a bug in the sanitization of semi-escaped URLs.
-* Fixed a bug where https URLs would be followed even if 'net/https'
-  could not be loaded.
-* Removed Agent::SCHEMES.
-=== 0.1.8 / 2009-05-27
-* Added the Agent#pause! and Agent#continue! methods.
-* Added the Agent#running? and Agent#paused? methods.
-* Added an alias for pending_urls to the queue methods.
-* Added Agent#queue to provide read access to the queue.
-* Added Agent#queue= and Agent#history= for setting the queue and history.
-* Added Agent#to_hash which returns a Hash of the agents queue and history.
-* Made Agent#enqueue and Agent#queued? public.
-* Added more specs.
-=== 0.1.7 / 2009-04-24
-* Added Agent#all_headers.
-* Fixed a bug where Page#headers was always +nil+.
-* Spidr::Agent will now follow the Location header in HTTP 300, 301, 302,
-  303 and 307 Redirects.
-* Spidr::Agent will now follow iframe and frame tags.
-=== 0.1.6 / 2009-04-14
-* Added Agent#failures, a list of URLs which could not be visited.
-* Added Agent#failed?.
-* Added Agent#every_failed_url.
-* Added Agent#clear, which clears the history and failures URL lists.
-* Improved fault tolerance in Agent#get_page.
-  * If a Network or HTTP error is encountered, the URL will be added to
-    the failures list and the next URL will be visited.
-* Fixed a typo in Agent#ignore_exts_like.
-* Updated the Web Spider Obstacle Course with links that always fail to be
-  visited.
-=== 0.1.5 / 2009-03-22
-* Catch malformed URIs in Page#to_absolute and return +nil+.
-* Filter out +nil+ URIs in Page#urls.
-=== 0.1.4 / 2009-01-15
-* Use Nokogiri for HTML and XML parsing.
-=== 0.1.3 / 2009-01-10
-* Added the :host options to Spidr::Agent#initialize.
-* Added the Web Spider Obstacle Course files to the Manifest.
-* Aliased Spidr::Agent#visited_urls to Spidr::Agent#history.
-=== 0.1.2 / 2008-11-06
-* Fixed a bug in Page#to_absolute where URLs with no path were not
-  receiving a default path of <tt>/</tt>.
-* Fixed a bug in Page#to_absolute where URL paths were not being
-  expanded, in order to remove <tt>..</tt> and <tt>.</tt> directories.
-* Fixed a bug where absolute URLs could have a blank path, thus causing
-  Agent#get_page to crash when it performed the HTTP request.
-* Added RSpec spec tests.
-* Created a Web-Spider Obstacle Course
-  (http://spidr.rubyforge.org/course/start.html) which is used in the spec
-  tests.
-=== 0.1.1 / 2008-10-04
-* Added a reader method for the response instance variable in Page.
-* Fixed a bug in Page#method_missing.
-=== 0.1.0 / 2008-05-23
-* Initial release.
-  * Black-list or white-list URLs based upon:
-    * Host name
-    * Port number
-    * Full link
-    * URL extension
-  * Provides call-backs for:
-    * Every visited Page.
-    * Every visited URL.
-    * Every visited URL that matches a specified pattern.

data/spec/helpers/course.rb DELETED Viewed

@@ -1,95 +0,0 @@
-require 'open-uri'
-require 'json'
-module Helpers
-  module Course
-    COURSE_URL = URI('http://spidr.rubyforge.org/course/start.html')
-    SPECS_URL = 'http://spidr.rubyforge.org/course/specs.json'
-    def self.included(base)
-      specs = JSON.parse(open(SPECS_URL).read)
-      if specs.kind_of?(Array)
-        specs.each do |spec|
-          message = spec['message'].to_s.dump
-          url = spec['url'].to_s.dump
-          case spec['behavior']
-          when 'follow'
-            base.module_eval %{
-              it #{message} do
-                should_visit_link(#{url})
-              end
-            }
-          when 'nofollow'
-            base.module_eval %{
-              it #{message} do
-                should_visit_once(#{url})
-              end
-            }
-          when 'fail'
-            base.module_eval %{
-              it #{message} do
-                should_fail_link(#{url})
-              end
-            }
-          else
-            link = spec['link'].to_s.dump
-            base.module_eval %{
-              it #{message} do
-                should_ignore_link(#{link})
-                should_ignore_link(#{url})
-              end
-            }
-          end
-        end
-      end
-    end
-    def run_course
-      Agent.start_at(COURSE_URL,:hosts => [COURSE_URL.host]) do |agent|
-        agent.every_failed_url { |url| puts "[FAILED] #{url}" }
-        agent.every_url { |url| puts url }
-      end
-    end
-    def visited_once?(link)
-      url = COURSE_URL.merge(URI.encode(link))
-      return @agent.visited_urls.select { |visited_url|
-        visited_url == url
-      }.length == 1
-    end
-    #
-    # Returns +true+ if the agent has visited the specified _link_, returns
-    # +false+ otherwise.
-    #
-    def visited_link?(link)
-      @agent.visited?(COURSE_URL.merge(URI.encode(link)))
-    end
-    def visit_failed?(link)
-      @agent.failed?(COURSE_URL.merge(URI.encode(link)))
-    end
-    def should_visit_link(link)
-      visited_link?(link).should == true
-    end
-    def should_ignore_link(link)
-      visited_link?(link).should == false
-    end
-    def should_visit_once(link)
-      visited_once?(link).should == true
-    end
-    def should_fail_link(link)
-      visited_link?(link).should == false
-      visit_failed?(link).should == true
-    end
-  end
-end

data/static/course/absolute/index.html DELETED Viewed

@@ -1,10 +0,0 @@
-<html>
-  <head>
-    <title>Spidr :: Web-Spider Obstacle Course :: Empty Links</title>
-    <script type="text/javascript" src="../scripts/jquery-1.2.6.min.js"></script>
-    <script type="text/javascript" src="../scripts/course.js"></script>
-    <script type="text/javascript">
-      fail();
-    </script>
-  </head>
-</html>

data/static/course/absolute/next.html DELETED Viewed

@@ -1,9 +0,0 @@
-<html>
-  <head>
-    <title>Spidr :: Web-Spider Obstacle Course :: Absolute Links</title>
-  </head>
-  <body>
-    <p>Absolute links to an unvisited page</p>
-  </body>
-</html>

data/static/course/absolute/start.html DELETED Viewed

@@ -1,19 +0,0 @@
-<html>
-  <head>
-    <title>Spidr :: Web-Spider Obstacle Course :: Absolute Links</title>
-  </head>
-  <body>
-    <p>Absolute links</p>
-    <ul>
-      <li class="nofollow">
-        <a href="/course/absolute/start.html">should not follow absolute links to the current page</a>
-      </li>
-      <li class="follow">
-        <a href="/course/absolute/next.html">should follow absolute links to unvisited pages</a>
-      </li>
-    </ul>
-  </body>
-</html>

data/static/course/empty/index.html DELETED Viewed

@@ -1,10 +0,0 @@
-<html>
-  <head>
-    <title>Spidr :: Web-Spider Obstacle Course :: Empty Links</title>
-    <script type="text/javascript" src="../scripts/jquery-1.2.6.min.js"></script>
-    <script type="text/javascript" src="../scripts/course.js"></script>
-    <script type="text/javascript">
-      fail();
-    </script>
-  </head>
-</html>