RubyGems - spidr_epg - Versions diffs - 1.0.0 - Mend

spidr_epg 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

checksums.yaml +15 -0
data/.gitignore +10 -0
data/.rspec +1 -0
data/.yardopts +1 -0
data/ChangeLog.md +291 -0
data/ChangeLog.md~ +291 -0
data/Gemfile +16 -0
data/Gemfile.lock +49 -0
data/Gemfile~ +16 -0
data/LICENSE.txt +20 -0
data/README.md +193 -0
data/README.md~ +190 -0
data/Rakefile +29 -0
data/gemspec.yml +19 -0
data/lib/spidr/actions/actions.rb +83 -0
data/lib/spidr/actions/exceptions/action.rb +9 -0
data/lib/spidr/actions/exceptions/paused.rb +11 -0
data/lib/spidr/actions/exceptions/skip_link.rb +12 -0
data/lib/spidr/actions/exceptions/skip_page.rb +12 -0
data/lib/spidr/actions/exceptions.rb +4 -0
data/lib/spidr/actions.rb +2 -0
data/lib/spidr/agent.rb +866 -0
data/lib/spidr/auth_credential.rb +28 -0
data/lib/spidr/auth_store.rb +161 -0
data/lib/spidr/body.rb +98 -0
data/lib/spidr/cookie_jar.rb +202 -0
data/lib/spidr/events.rb +537 -0
data/lib/spidr/extensions/uri.rb +52 -0
data/lib/spidr/extensions.rb +1 -0
data/lib/spidr/filters.rb +539 -0
data/lib/spidr/headers.rb +370 -0
data/lib/spidr/links.rb +229 -0
data/lib/spidr/page.rb +108 -0
data/lib/spidr/rules.rb +79 -0
data/lib/spidr/sanitizers.rb +56 -0
data/lib/spidr/session_cache.rb +145 -0
data/lib/spidr/spidr.rb +107 -0
data/lib/spidr/version.rb +4 -0
data/lib/spidr/version.rb~ +4 -0
data/lib/spidr.rb +3 -0
data/pkg/spidr-1.0.0.gem +0 -0
data/spec/actions_spec.rb +59 -0
data/spec/agent_spec.rb +81 -0
data/spec/auth_store_spec.rb +85 -0
data/spec/cookie_jar_spec.rb +144 -0
data/spec/extensions/uri_spec.rb +43 -0
data/spec/filters_spec.rb +61 -0
data/spec/helpers/history.rb +34 -0
data/spec/helpers/page.rb +8 -0
data/spec/helpers/wsoc.rb +83 -0
data/spec/page_examples.rb +21 -0
data/spec/page_spec.rb +125 -0
data/spec/rules_spec.rb +45 -0
data/spec/sanitizers_spec.rb +61 -0
data/spec/session_cache.rb +58 -0
data/spec/spec_helper.rb +4 -0
data/spec/spidr_spec.rb +39 -0
data/spidr.gemspec +133 -0
data/spidr.gemspec~ +131 -0
metadata +158 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,15 @@
+---
+!binary "U0hBMQ==":
+  metadata.gz: !binary |-
+    NTcxZjE1YTc5OTY1MTQyM2NmYTRjM2VlODE5ZWIwZjBiMjc1OGVmMg==
+  data.tar.gz: !binary |-
+    ZWIyNzI5NGMwMGMwNzJmYTA4Nzg2YzI4OTk3YTIyNjYyZGU4ZDQzYw==
+!binary "U0hBNTEy":
+  metadata.gz: !binary |-
+    ODdiYzJjYWM3NjM3NzBjZjExMzI1YzJmN2I2YzA2YTQ5Y2I5N2IwNTRhMzE3
+    YWE0NDI5MGMwZTM1M2U2YTFkNWNiOWNjY2Q1Y2ExYjYzY2M4MDI1MTliODhi
+    NTM3Y2UyNDUxZDkxZjIwNTRmNzI1NzZjZjliNzE0YzcwNTUyMjk=
+  data.tar.gz: !binary |-
+    MDZkYjMwYTBkZTUyMGMwMGRkZTgxZWM4ZWY3YTgwNWJlNTQ4NmVlNjVkNDli
+    ZWQ3M2FiYjkwYjJkNjI1NGQyOTJhMmM5ZTk2NTZjMzk4ZTgxNmYyNmIxNTA2
+    YTZlNmVmYTg0ZGVmYWFjODMzOTBjMmE3MTAzY2NiYTAxYTQ5ZTU=

data/.gitignore ADDED Viewed

@@ -0,0 +1,10 @@
+pkg
+doc
+web
+tmp
+Gemfile.lock
+.DS_Store
+.bundle
+.yardoc
+*.swp
+*~

data/.rspec ADDED Viewed

	@@ -0,0 +1 @@
1	+ --colour --format documentation

data/.yardopts ADDED Viewed

	@@ -0,0 +1 @@
1	+ --markup markdown --title 'Spidr Documentation' --protected --files ChangeLog.md,LICENSE.txt

data/ChangeLog.md ADDED Viewed

@@ -0,0 +1,291 @@
+### 1.4.2 / 2012-04-12
+* 对此gem进行了修改.
+### 0.4.1 / 2011-12-08
+* Catch `OpenSSL::SSL::SSLError` exceptions when initiating HTTPS Sessions.
+### 0.4.0 / 2011-08-07
+* Added {Spidr::Headers#content_charset}.
+* Pass the Page `url` and `content_charset` to Nokogiri in {Spidr::Body#doc}.
+  This ensures that Nokogiri will preserve the body encoding.
+* Made {Spidr::Headers#is_content_type?} public.
+* Allow {Spidr::Headers#is_content_type?} to match the full Content-Type
+  or the sub-type.
+### 0.3.2 / 2011-06-20
+* Added separate initialize methods for {Spidr::Actions}, {Spidr::Events},
+  {Spidr::Filters} and {Spidr::Sanitizers}.
+* Aliased {Spidr::Events#urls_like} to {Spidr::Events#every_url_like}.
+* Reduce usage of `self.included` and `module_eval`.
+* Reduce usage of nested-blocks.
+* Reduce usage of `return`.
+### 0.3.1 / 2011-04-22
+* Require `set` in `spidr/headers.rb`.
+### 0.3.0 / 2011-04-14
+* Switched from Jeweler to [Ore](http://github.com/ruby-ore/ore).
+* Split all header related methods out of {Spidr::Page} and into
+  {Spidr::Headers}.
+* Split all body related methods out of {Spidr::Page} and into
+  {Spidr::Body}.
+* Split all link related methods out of {Spidr::Page} and into
+  {Spidr::Links}.
+* Added {Spidr::Headers#directory?}.
+* Added {Spidr::Headers#json?}.
+* Added {Spidr::Links#each_url}.
+* Added {Spidr::Links#each_link}.
+* Added {Spidr::Links#each_redirect}.
+* Added {Spidr::Links#each_meta_redirect}.
+* Aliased {Spidr::Headers#raw_cookie} to {Spidr::Headers#cookie}.
+* Aliased {Spidr::Body#to_s} to {Spidr::Body#body}.
+* Also check for `application/xml` in {Spidr::Headers#xml?}.
+* Catch all exceptions when merging URIs in {Spidr::Links#to_absolute}.
+* Always prepend a `/` to all FTP URI paths. Fixes a Ruby 1.8 specific
+  bug, where it expects an absolute path for all FTP URIs.
+* Refactored {URI.expand_path}.
+* Start the session in {Spidr::SessionCache#[]} to prevent multiple
+  `CONNECT` commands being sent to HTTP Proxies (thanks falaise).
+### 0.2.7 / 2010-08-17
+* Added {Spidr::CookieJar#cookies_for_host} (thanks zapnap).
+* Renamed `Spidr::Page#cookie` to `Spidr::Page#raw_cookie`.
+* Rescue `URI::InvalidComponentError` exceptions in
+  `Spidr::Page#to_absolute` (thanks zapnap).
+### 0.2.6 / 2010-07-05
+* Fixed a bug in `Spidr::Page#meta_redirect`, by calling
+  `Nokogiri::XML::Element#get_attribute` instead of `attr`.
+### 0.2.5 / 2010-07-02
+* Added `Spidr::Page#meta_redirect`.
+* Added `Spidr::Page#meta_redirect?`.
+* Manage development dependencies with Bundler.
+* Support following "old-school" meta-refresh redirects (thanks zapnap).
+* Allow {Spidr::CookieJar} to inherit cookies set by a parent domain.
+* Fixed a constant lookup issue in {Spidr::Agent}.
+* Use `yield` instead of `block.call` when necessary.
+### 0.2.4 / 2010-05-05
+* Added {Spidr::Filters#visit_urls}.
+* Added {Spidr::Filters#visit_urls_like}.
+* Added {Spidr::Filters#ignore_urls}.
+* Added {Spidr::Filters#ignore_urls_like}.
+* Added `Spidr::Page#is_content_type?`.
+* Default `Spidr::Page#body` to an empty String.
+* Default `Spidr::Page#content_type` to an empty String.
+* Default `Spidr::Page#content_types` to an empty Array.
+* Improved reliability of {Spidr::Page#is_redirect?}.
+* Improved content type detection in {Spidr::Page} to handle `Content-Type`
+  headers containing charsets (thanks Josh Lindsey).
+### 0.2.3 / 2010-02-27
+* Migrated to Jeweler, for the packaging and releasing RubyGems.
+* Switched to MarkDown formatted YARD documentation.
+* Added {Spidr::Events#every_link}.
+* Added {Spidr::SessionCache#active?}.
+* Added specs for {Spidr::SessionCache}.
+### 0.2.2 / 2010-01-06
+* Require Web Spider Obstacle Course (WSOC) >= 0.1.1.
+* Integrated the new WSOC into the specs.
+* Removed the built-in Web Spider Obstacle Course.
+* Added `Spidr::Page#content_types`.
+* Added `Spidr::Page#cookie`.
+* Added `Spidr::Page#cookies`.
+* Added `Spidr::Page#cookie_params`.
+* Added {Spidr::Sanitizers}.
+* Added {Spidr::SessionCache}.
+* Added {Spidr::CookieJar} (thanks Nick Plante).
+* Added {Spidr::AuthStore} (thanks Nick Plante).
+* Added {Spidr::Agent#post_page} (thanks Nick Plante).
+* Renamed `Spidr::Agent#get_session` to {Spidr::SessionCache#[]}.
+* Renamed `Spidr::Agent#kill_session` to {Spidr::SessionCache#kill!}.
+### 0.2.1 / 2009-11-25
+* Added {Spidr::Events#every_ok_page}.
+* Added {Spidr::Events#every_redirect_page}.
+* Added {Spidr::Events#every_timedout_page}.
+* Added {Spidr::Events#every_bad_request_page}.
+* Added {Spidr::Events#every_unauthorized_page}.
+* Added {Spidr::Events#every_forbidden_page}.
+* Added {Spidr::Events#every_missing_page}.
+* Added {Spidr::Events#every_internal_server_error_page}.
+* Added {Spidr::Events#every_txt_page}.
+* Added {Spidr::Events#every_html_page}.
+* Added {Spidr::Events#every_xml_page}.
+* Added {Spidr::Events#every_xsl_page}.
+* Added {Spidr::Events#every_doc}.
+* Added {Spidr::Events#every_html_doc}.
+* Added {Spidr::Events#every_xml_doc}.
+* Added {Spidr::Events#every_xsl_doc}.
+* Added {Spidr::Events#every_rss_doc}.
+* Added {Spidr::Events#every_atom_doc}.
+* Added {Spidr::Events#every_javascript_page}.
+* Added {Spidr::Events#every_css_page}.
+* Added {Spidr::Events#every_rss_page}.
+* Added {Spidr::Events#every_atom_page}.
+* Added {Spidr::Events#every_ms_word_page}.
+* Added {Spidr::Events#every_pdf_page}.
+* Added {Spidr::Events#every_zip_page}.
+* Fixed a bug where {Spidr::Agent#delay} was not being used to delay
+  requesting pages.
+* Spider `link` and `script` tags in HTML pages (thanks Nick Plante).
+### 0.2.0 / 2009-10-10
+* Added {URI.expand_path}.
+* Added `Spidr::Page#search`.
+* Added `Spidr::Page#at`.
+* Added `Spidr::Page#title`.
+* Added {Spidr::Agent#failures=}.
+* Added a HTTP session cache to {Spidr::Agent}, per suggestion of falter.
+  * Added `Spidr::Agent#get_session`.
+  * Added `Spidr::Agent#kill_session`.
+* Added {Spidr.proxy=}.
+* Added {Spidr.disable_proxy!}.
+* Aliased `Spidr::Page#txt?` to `Spidr::Page#plain_text?`.
+* Aliased `Spidr::Page#ok?` to `Spidr::Page#is_ok?`.
+* Aliased `Spidr::Page#redirect?` to `Spidr::Page#is_redirect?`.
+* Aliased `Spidr::Page#unauthorized?` to `Spidr::Page#is_unauthorized?`.
+* Aliased `Spidr::Page#forbidden?` to `Spidr::Page#is_forbidden?`.
+* Aliased `Spidr::Page#missing?` to `Spidr::Page#is_missing?`.
+* Split URL filtering code out of {Spidr::Agent} and into
+  {Spidr::Filters}.
+* Split URL / Page event code out of {Spidr::Agent} and into
+  {Spidr::Events}.
+* Split pause! / continue! / skip_link! / skip_page! methods out of
+  {Spidr::Agent} and into {Spidr::Actions}.
+* Fixed a bug in `Spidr::Page#code`, where it was not returning an Integer.
+* Make sure `Spidr::Page#doc` returns `Nokogiri::XML::Document` objects for
+  RSS/RDF/Atom pages as well.
+* Fixed the handling of the Location header in `Spidr::Page#links`
+  (thanks falter).
+* Fixed a bug in `Spidr::Page#to_absolute` where trailing `/` characters on
+  URI paths were not being preserved (thanks falter).
+* Fixed a bug where the URI query was not being sent with the request
+  in {Spidr::Agent#get_page} (thanks Damian Steer).
+* Fixed a bug where SSL sessions were not being properly set up
+  (thanks falter).
+* Switched {Spidr::Agent#history} to be a Set, to improve search-time
+  of the history (thanks falter).
+* Switched {Spidr::Agent#failures} to a Set.
+* Allow a block to be passed to {Spidr::Agent#run}, which will receive all
+  pages visited.
+* Allow `Spidr::Agent#start_at` and `Spidr::Agent#continue!` to pass blocks
+  to {Spidr::Agent#run}.
+* Made {Spidr::Agent#visit_page} public.
+* Moved to YARD based documentation.
+### 0.1.9 / 2009-06-13
+* Upgraded to Hoe 2.0.0.
+  * Use Hoe.spec instead of Hoe.new.
+  * Use the Hoe signing task for signed gems.
+* Added the `Spidr::Agent#schemes` and `Spidr::Agent#schemes=` methods.
+* Added a warning message if 'net/https' cannot be loaded.
+* Allow the list of acceptable URL schemes to be passed into
+  {Spidr::Agent#initialize}.
+* Allow history and queue information to be passed into
+  {Spidr::Agent#initialize}.
+* {Spidr::Agent#start_at} no longer clears the history or the queue.
+* Fixed a bug in the sanitization of semi-escaped URLs.
+* Fixed a bug where https URLs would be followed even if 'net/https'
+  could not be loaded.
+* Removed Spidr::Agent::SCHEMES.
+### 0.1.8 / 2009-05-27
+* Added the `Spidr::Agent#pause!` and `Spidr::Agent#continue!` methods.
+* Added the `Spidr::Agent#running?` and `Spidr::Agent#paused?` methods.
+* Added an alias for pending_urls to the queue methods.
+* Added {Spidr::Agent#queue} to provide read access to the queue.
+* Added {Spidr::Agent#queue=} and {Spidr::Agent#history=} for setting the
+  queue and history.
+* Added {Spidr::Agent#to_hash} which returns a Hash of the agent's queue and
+  history.
+* Made {Spidr::Agent#enqueue} and {Spidr::Agent#queued?} public.
+* Added more specs.
+### 0.1.7 / 2009-04-24
+* Added `Spidr::Agent#all_headers`.
+* Fixed a bug where {Spidr::Page#headers} was always `nil`.
+* {Spidr::Agent} will now follow the Location header in HTTP 300,
+  301, 302, 303 and 307 Redirects.
+* {Spidr::Agent} will now follow iframe and frame tags.
+### 0.1.6 / 2009-04-14
+* Added {Spidr::Agent#failures}, a list of URLs which could not be visited.
+* Added {Spidr::Agent#failed?}.
+* Added `Spidr::Agent#every_failed_url`.
+* Added {Spidr::Agent#clear}, which clears the history and failures URL
+  lists.
+* Improved fault tolerance in {Spidr::Agent#get_page}.
+  * If a Network or HTTP error is encountered, the URL will be added to
+    the failures list and the next URL will be visited.
+* Fixed a typo in `Spidr::Agent#ignore_exts_like`.
+* Updated the Web Spider Obstacle Course with links that always fail to be
+  visited.
+### 0.1.5 / 2009-03-22
+* Catch malformed URIs in `Spidr::Page#to_absolute` and return `nil`.
+* Filter out `nil` URIs in `Spidr::Page#urls`.
+### 0.1.4 / 2009-01-15
+* Use Nokogiri for HTML and XML parsing.
+### 0.1.3 / 2009-01-10
+* Added the `:host` options to {Spidr::Agent#initialize}.
+* Added the Web Spider Obstacle Course files to the Manifest.
+* Aliased {Spidr::Agent#visited_urls} to {Spidr::Agent#history}.
+### 0.1.2 / 2008-11-06
+* Fixed a bug in `Spidr::Page#to_absolute` where URLs with no path were not
+  receiving a default path of `/`.
+* Fixed a bug in `Spidr::Page#to_absolute` where URL paths were not being
+  expanded, in order to remove `..` and `.` directories.
+* Fixed a bug where absolute URLs could have a blank path, thus causing
+  {Spidr::Agent#get_page} to crash when it performed the HTTP request.
+* Added RSpec spec tests.
+* Created a Web-Spider Obstacle Course
+  (http://spidr.rubyforge.org/course/start.html) which is used in the spec
+  tests.
+### 0.1.1 / 2008-10-04
+* Added a reader method for the response instance variable in Page.
+* Fixed a bug in {Spidr::Page#method_missing}.
+### 0.1.0 / 2008-05-23
+* Initial release.
+  * Black-list or white-list URLs based upon:
+    * Host name
+    * Port number
+    * Full link
+    * URL extension
+  * Provides call-backs for:
+    * Every visited Page.
+    * Every visited URL.
+    * Every visited URL that matches a specified pattern.

data/ChangeLog.md~ ADDED Viewed

@@ -0,0 +1,291 @@
+### 0.4.2 / 2012-04-12
+* 对此gem进行了修改.
+### 0.4.1 / 2011-12-08
+* Catch `OpenSSL::SSL::SSLError` exceptions when initiating HTTPS Sessions.
+### 0.4.0 / 2011-08-07
+* Added {Spidr::Headers#content_charset}.
+* Pass the Page `url` and `content_charset` to Nokogiri in {Spidr::Body#doc}.
+  This ensures that Nokogiri will preserve the body encoding.
+* Made {Spidr::Headers#is_content_type?} public.
+* Allow {Spidr::Headers#is_content_type?} to match the full Content-Type
+  or the sub-type.
+### 0.3.2 / 2011-06-20
+* Added separate initialize methods for {Spidr::Actions}, {Spidr::Events},
+  {Spidr::Filters} and {Spidr::Sanitizers}.
+* Aliased {Spidr::Events#urls_like} to {Spidr::Events#every_url_like}.
+* Reduce usage of `self.included` and `module_eval`.
+* Reduce usage of nested-blocks.
+* Reduce usage of `return`.
+### 0.3.1 / 2011-04-22
+* Require `set` in `spidr/headers.rb`.
+### 0.3.0 / 2011-04-14
+* Switched from Jeweler to [Ore](http://github.com/ruby-ore/ore).
+* Split all header related methods out of {Spidr::Page} and into
+  {Spidr::Headers}.
+* Split all body related methods out of {Spidr::Page} and into
+  {Spidr::Body}.
+* Split all link related methods out of {Spidr::Page} and into
+  {Spidr::Links}.
+* Added {Spidr::Headers#directory?}.
+* Added {Spidr::Headers#json?}.
+* Added {Spidr::Links#each_url}.
+* Added {Spidr::Links#each_link}.
+* Added {Spidr::Links#each_redirect}.
+* Added {Spidr::Links#each_meta_redirect}.
+* Aliased {Spidr::Headers#raw_cookie} to {Spidr::Headers#cookie}.
+* Aliased {Spidr::Body#to_s} to {Spidr::Body#body}.
+* Also check for `application/xml` in {Spidr::Headers#xml?}.
+* Catch all exceptions when merging URIs in {Spidr::Links#to_absolute}.
+* Always prepend a `/` to all FTP URI paths. Fixes a Ruby 1.8 specific
+  bug, where it expects an absolute path for all FTP URIs.
+* Refactored {URI.expand_path}.
+* Start the session in {Spidr::SessionCache#[]} to prevent multiple
+  `CONNECT` commands being sent to HTTP Proxies (thanks falaise).
+### 0.2.7 / 2010-08-17
+* Added {Spidr::CookieJar#cookies_for_host} (thanks zapnap).
+* Renamed `Spidr::Page#cookie` to `Spidr::Page#raw_cookie`.
+* Rescue `URI::InvalidComponentError` exceptions in
+  `Spidr::Page#to_absolute` (thanks zapnap).
+### 0.2.6 / 2010-07-05
+* Fixed a bug in `Spidr::Page#meta_redirect`, by calling
+  `Nokogiri::XML::Element#get_attribute` instead of `attr`.
+### 0.2.5 / 2010-07-02
+* Added `Spidr::Page#meta_redirect`.
+* Added `Spidr::Page#meta_redirect?`.
+* Manage development dependencies with Bundler.
+* Support following "old-school" meta-refresh redirects (thanks zapnap).
+* Allow {Spidr::CookieJar} to inherit cookies set by a parent domain.
+* Fixed a constant lookup issue in {Spidr::Agent}.
+* Use `yield` instead of `block.call` when necessary.
+### 0.2.4 / 2010-05-05
+* Added {Spidr::Filters#visit_urls}.
+* Added {Spidr::Filters#visit_urls_like}.
+* Added {Spidr::Filters#ignore_urls}.
+* Added {Spidr::Filters#ignore_urls_like}.
+* Added `Spidr::Page#is_content_type?`.
+* Default `Spidr::Page#body` to an empty String.
+* Default `Spidr::Page#content_type` to an empty String.
+* Default `Spidr::Page#content_types` to an empty Array.
+* Improved reliability of {Spidr::Page#is_redirect?}.
+* Improved content type detection in {Spidr::Page} to handle `Content-Type`
+  headers containing charsets (thanks Josh Lindsey).
+### 0.2.3 / 2010-02-27
+* Migrated to Jeweler, for the packaging and releasing RubyGems.
+* Switched to MarkDown formatted YARD documentation.
+* Added {Spidr::Events#every_link}.
+* Added {Spidr::SessionCache#active?}.
+* Added specs for {Spidr::SessionCache}.
+### 0.2.2 / 2010-01-06
+* Require Web Spider Obstacle Course (WSOC) >= 0.1.1.
+* Integrated the new WSOC into the specs.
+* Removed the built-in Web Spider Obstacle Course.
+* Added `Spidr::Page#content_types`.
+* Added `Spidr::Page#cookie`.
+* Added `Spidr::Page#cookies`.
+* Added `Spidr::Page#cookie_params`.
+* Added {Spidr::Sanitizers}.
+* Added {Spidr::SessionCache}.
+* Added {Spidr::CookieJar} (thanks Nick Plante).
+* Added {Spidr::AuthStore} (thanks Nick Plante).
+* Added {Spidr::Agent#post_page} (thanks Nick Plante).
+* Renamed `Spidr::Agent#get_session` to {Spidr::SessionCache#[]}.
+* Renamed `Spidr::Agent#kill_session` to {Spidr::SessionCache#kill!}.
+### 0.2.1 / 2009-11-25
+* Added {Spidr::Events#every_ok_page}.
+* Added {Spidr::Events#every_redirect_page}.
+* Added {Spidr::Events#every_timedout_page}.
+* Added {Spidr::Events#every_bad_request_page}.
+* Added {Spidr::Events#every_unauthorized_page}.
+* Added {Spidr::Events#every_forbidden_page}.
+* Added {Spidr::Events#every_missing_page}.
+* Added {Spidr::Events#every_internal_server_error_page}.
+* Added {Spidr::Events#every_txt_page}.
+* Added {Spidr::Events#every_html_page}.
+* Added {Spidr::Events#every_xml_page}.
+* Added {Spidr::Events#every_xsl_page}.
+* Added {Spidr::Events#every_doc}.
+* Added {Spidr::Events#every_html_doc}.
+* Added {Spidr::Events#every_xml_doc}.
+* Added {Spidr::Events#every_xsl_doc}.
+* Added {Spidr::Events#every_rss_doc}.
+* Added {Spidr::Events#every_atom_doc}.
+* Added {Spidr::Events#every_javascript_page}.
+* Added {Spidr::Events#every_css_page}.
+* Added {Spidr::Events#every_rss_page}.
+* Added {Spidr::Events#every_atom_page}.
+* Added {Spidr::Events#every_ms_word_page}.
+* Added {Spidr::Events#every_pdf_page}.
+* Added {Spidr::Events#every_zip_page}.
+* Fixed a bug where {Spidr::Agent#delay} was not being used to delay
+  requesting pages.
+* Spider `link` and `script` tags in HTML pages (thanks Nick Plante).
+### 0.2.0 / 2009-10-10
+* Added {URI.expand_path}.
+* Added `Spidr::Page#search`.
+* Added `Spidr::Page#at`.
+* Added `Spidr::Page#title`.
+* Added {Spidr::Agent#failures=}.
+* Added a HTTP session cache to {Spidr::Agent}, per suggestion of falter.
+  * Added `Spidr::Agent#get_session`.
+  * Added `Spidr::Agent#kill_session`.
+* Added {Spidr.proxy=}.
+* Added {Spidr.disable_proxy!}.
+* Aliased `Spidr::Page#txt?` to `Spidr::Page#plain_text?`.
+* Aliased `Spidr::Page#ok?` to `Spidr::Page#is_ok?`.
+* Aliased `Spidr::Page#redirect?` to `Spidr::Page#is_redirect?`.
+* Aliased `Spidr::Page#unauthorized?` to `Spidr::Page#is_unauthorized?`.
+* Aliased `Spidr::Page#forbidden?` to `Spidr::Page#is_forbidden?`.
+* Aliased `Spidr::Page#missing?` to `Spidr::Page#is_missing?`.
+* Split URL filtering code out of {Spidr::Agent} and into
+  {Spidr::Filters}.
+* Split URL / Page event code out of {Spidr::Agent} and into
+  {Spidr::Events}.
+* Split pause! / continue! / skip_link! / skip_page! methods out of
+  {Spidr::Agent} and into {Spidr::Actions}.
+* Fixed a bug in `Spidr::Page#code`, where it was not returning an Integer.
+* Make sure `Spidr::Page#doc` returns `Nokogiri::XML::Document` objects for
+  RSS/RDF/Atom pages as well.
+* Fixed the handling of the Location header in `Spidr::Page#links`
+  (thanks falter).
+* Fixed a bug in `Spidr::Page#to_absolute` where trailing `/` characters on
+  URI paths were not being preserved (thanks falter).
+* Fixed a bug where the URI query was not being sent with the request
+  in {Spidr::Agent#get_page} (thanks Damian Steer).
+* Fixed a bug where SSL sessions were not being properly set up
+  (thanks falter).
+* Switched {Spidr::Agent#history} to be a Set, to improve search-time
+  of the history (thanks falter).
+* Switched {Spidr::Agent#failures} to a Set.
+* Allow a block to be passed to {Spidr::Agent#run}, which will receive all
+  pages visited.
+* Allow `Spidr::Agent#start_at` and `Spidr::Agent#continue!` to pass blocks
+  to {Spidr::Agent#run}.
+* Made {Spidr::Agent#visit_page} public.
+* Moved to YARD based documentation.
+### 0.1.9 / 2009-06-13
+* Upgraded to Hoe 2.0.0.
+  * Use Hoe.spec instead of Hoe.new.
+  * Use the Hoe signing task for signed gems.
+* Added the `Spidr::Agent#schemes` and `Spidr::Agent#schemes=` methods.
+* Added a warning message if 'net/https' cannot be loaded.
+* Allow the list of acceptable URL schemes to be passed into
+  {Spidr::Agent#initialize}.
+* Allow history and queue information to be passed into
+  {Spidr::Agent#initialize}.
+* {Spidr::Agent#start_at} no longer clears the history or the queue.
+* Fixed a bug in the sanitization of semi-escaped URLs.
+* Fixed a bug where https URLs would be followed even if 'net/https'
+  could not be loaded.
+* Removed Spidr::Agent::SCHEMES.
+### 0.1.8 / 2009-05-27
+* Added the `Spidr::Agent#pause!` and `Spidr::Agent#continue!` methods.
+* Added the `Spidr::Agent#running?` and `Spidr::Agent#paused?` methods.
+* Added an alias for pending_urls to the queue methods.
+* Added {Spidr::Agent#queue} to provide read access to the queue.
+* Added {Spidr::Agent#queue=} and {Spidr::Agent#history=} for setting the
+  queue and history.
+* Added {Spidr::Agent#to_hash} which returns a Hash of the agent's queue and
+  history.
+* Made {Spidr::Agent#enqueue} and {Spidr::Agent#queued?} public.
+* Added more specs.
+### 0.1.7 / 2009-04-24
+* Added `Spidr::Agent#all_headers`.
+* Fixed a bug where {Spidr::Page#headers} was always `nil`.
+* {Spidr::Agent} will now follow the Location header in HTTP 300,
+  301, 302, 303 and 307 Redirects.
+* {Spidr::Agent} will now follow iframe and frame tags.
+### 0.1.6 / 2009-04-14
+* Added {Spidr::Agent#failures}, a list of URLs which could not be visited.
+* Added {Spidr::Agent#failed?}.
+* Added `Spidr::Agent#every_failed_url`.
+* Added {Spidr::Agent#clear}, which clears the history and failures URL
+  lists.
+* Improved fault tolerance in {Spidr::Agent#get_page}.
+  * If a Network or HTTP error is encountered, the URL will be added to
+    the failures list and the next URL will be visited.
+* Fixed a typo in `Spidr::Agent#ignore_exts_like`.
+* Updated the Web Spider Obstacle Course with links that always fail to be
+  visited.
+### 0.1.5 / 2009-03-22
+* Catch malformed URIs in `Spidr::Page#to_absolute` and return `nil`.
+* Filter out `nil` URIs in `Spidr::Page#urls`.
+### 0.1.4 / 2009-01-15
+* Use Nokogiri for HTML and XML parsing.
+### 0.1.3 / 2009-01-10
+* Added the `:host` options to {Spidr::Agent#initialize}.
+* Added the Web Spider Obstacle Course files to the Manifest.
+* Aliased {Spidr::Agent#visited_urls} to {Spidr::Agent#history}.
+### 0.1.2 / 2008-11-06
+* Fixed a bug in `Spidr::Page#to_absolute` where URLs with no path were not
+  receiving a default path of `/`.
+* Fixed a bug in `Spidr::Page#to_absolute` where URL paths were not being
+  expanded, in order to remove `..` and `.` directories.
+* Fixed a bug where absolute URLs could have a blank path, thus causing
+  {Spidr::Agent#get_page} to crash when it performed the HTTP request.
+* Added RSpec spec tests.
+* Created a Web-Spider Obstacle Course
+  (http://spidr.rubyforge.org/course/start.html) which is used in the spec
+  tests.
+### 0.1.1 / 2008-10-04
+* Added a reader method for the response instance variable in Page.
+* Fixed a bug in {Spidr::Page#method_missing}.
+### 0.1.0 / 2008-05-23
+* Initial release.
+  * Black-list or white-list URLs based upon:
+    * Host name
+    * Port number
+    * Full link
+    * URL extension
+  * Provides call-backs for:
+    * Every visited Page.
+    * Every visited URL.
+    * Every visited URL that matches a specified pattern.

data/Gemfile ADDED Viewed

@@ -0,0 +1,16 @@
+source 'http://ruby.taobao.org'
+platform :jruby do
+  gem 'jruby-openssl'
+end
+gemspec
+group :development do
+  gem 'rake',           '~> 10.0'
+  gem 'rubygems-tasks', '~> 0.1'
+  gem 'rspec',          '~> 2.4'
+  gem 'wsoc',     '~> 0.1.3'
+  gem 'kramdown', '~> 0.12'
+end

data/Gemfile.lock ADDED Viewed

@@ -0,0 +1,49 @@
+PATH
+  remote: .
+  specs:
+    spidr_epg (1.0.0)
+      nokogiri (~> 1.3)
+GEM
+  remote: http://ruby.taobao.org/
+  specs:
+    diff-lcs (1.2.3)
+    json (1.7.7)
+    kramdown (0.14.2)
+    nokogiri (1.5.9)
+    rack (1.5.2)
+    rack-protection (1.5.0)
+      rack
+    rake (10.0.4)
+    rspec (2.13.0)
+      rspec-core (~> 2.13.0)
+      rspec-expectations (~> 2.13.0)
+      rspec-mocks (~> 2.13.0)
+    rspec-core (2.13.1)
+    rspec-expectations (2.13.0)
+      diff-lcs (>= 1.1.3, < 2.0)
+    rspec-mocks (2.13.1)
+    rubygems-tasks (0.2.4)
+    sinatra (1.4.2)
+      rack (~> 1.5, >= 1.5.2)
+      rack-protection (~> 1.4)
+      tilt (~> 1.3, >= 1.3.4)
+    tilt (1.3.7)
+    wsoc (0.1.4)
+      json (~> 1.4)
+      sinatra (~> 1.0)
+    yard (0.8.5.2)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  bundler (~> 1.0)
+  jruby-openssl
+  kramdown (~> 0.12)
+  rake (~> 10.0)
+  rspec (~> 2.4)
+  rubygems-tasks (~> 0.1)
+  spidr_epg!
+  wsoc (~> 0.1.3)
+  yard (~> 0.7)

data/Gemfile~ ADDED Viewed

@@ -0,0 +1,16 @@
+source 'http://ruby.taobao.org'
+platform :jruby do
+  gem 'jruby-openssl'
+end
+gemspec
+group :development do
+  gem 'rake',           '~> 10.0'
+  gem 'rubygems-tasks', '~> 0.1'
+  gem 'rspec',          '~> 2.4'
+  gem 'wsoc',     '~> 0.1.3'
+  gem 'kramdown', '~> 0.12'
+end