spidr 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/ChangeLog.md +10 -0
- data/Gemfile +27 -0
- data/README.md +1 -0
- data/Rakefile +23 -30
- data/lib/spidr/agent.rb +27 -20
- data/lib/spidr/cookie_jar.rb +16 -0
- data/lib/spidr/events.rb +58 -58
- data/lib/spidr/page.rb +63 -12
- data/lib/spidr/version.rb +1 -1
- data/spec/cookie_jar_spec.rb +14 -1
- data/spec/helpers/wsoc.rb +1 -1
- data/spec/page_spec.rb +15 -0
- data/spec/spec_helper.rb +10 -2
- data/spidr.gemspec +73 -73
- metadata +39 -31
    
        data/.gitignore
    CHANGED
    
    
    
        data/ChangeLog.md
    CHANGED
    
    | @@ -1,3 +1,13 @@ | |
| 1 | 
            +
            ### 0.2.5 / 2010-07-02
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            * Added {Spidr::Page#meta_redirect}.
         | 
| 4 | 
            +
            * Added {Spidr::Page#meta_redirect?}.
         | 
| 5 | 
            +
            * Manage development dependencies with Bundler.
         | 
| 6 | 
            +
            * Support following "old-school" meta-refresh redirects (thanks zapnap).
         | 
| 7 | 
            +
            * Allow {Spidr::CookieJar} to inherit cookies set by a parent domain.
         | 
| 8 | 
            +
            * Fixed a constant lookup issue in {Spidr::Agent}.
         | 
| 9 | 
            +
            * Use `yield` instead of `block.call` when necessary.
         | 
| 10 | 
            +
             | 
| 1 11 | 
             
            ### 0.2.4 / 2010-05-05
         | 
| 2 12 |  | 
| 3 13 | 
             
            * Added {Spidr::Filters#visit_urls}.
         | 
    
        data/Gemfile
    ADDED
    
    | @@ -0,0 +1,27 @@ | |
| 1 | 
            +
            source 'https://rubygems.org'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            group(:runtime) do
         | 
| 4 | 
            +
              gem 'nokogiri',	'>= 1.3.0'
         | 
| 5 | 
            +
            end
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            group(:development) do
         | 
| 8 | 
            +
              gem 'rake',			'~> 0.8.7'
         | 
| 9 | 
            +
              gem 'jeweler',		'~> 1.4.0', :git => 'git://github.com/technicalpickles/jeweler.git'
         | 
| 10 | 
            +
            end
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            group(:doc) do
         | 
| 13 | 
            +
              case RUBY_PLATFORM
         | 
| 14 | 
            +
              when 'java'
         | 
| 15 | 
            +
                gem 'maruku',	'~> 0.6.0'
         | 
| 16 | 
            +
              else
         | 
| 17 | 
            +
                gem 'rdiscount',	'~> 1.6.3'
         | 
| 18 | 
            +
              end
         | 
| 19 | 
            +
             | 
| 20 | 
            +
              gem 'yard',		'~> 0.5.3'
         | 
| 21 | 
            +
            end
         | 
| 22 | 
            +
             | 
| 23 | 
            +
            group(:test) do
         | 
| 24 | 
            +
              gem 'wsoc',	'~> 0.1.3'
         | 
| 25 | 
            +
            end
         | 
| 26 | 
            +
             | 
| 27 | 
            +
            gem 'rspec',	'~> 1.3.0', :group => [:development, :test]
         | 
    
        data/README.md
    CHANGED
    
    
    
        data/Rakefile
    CHANGED
    
    | @@ -1,27 +1,28 @@ | |
| 1 1 | 
             
            require 'rubygems'
         | 
| 2 | 
            +
            require 'bundler'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            begin
         | 
| 5 | 
            +
              Bundler.setup(:development, :doc)
         | 
| 6 | 
            +
            rescue Bundler::BundlerError => e
         | 
| 7 | 
            +
              STDERR.puts e.message
         | 
| 8 | 
            +
              STDERR.puts "Run `bundle install` to install missing gems"
         | 
| 9 | 
            +
              exit e.status_code
         | 
| 10 | 
            +
            end
         | 
| 11 | 
            +
             | 
| 2 12 | 
             
            require 'rake'
         | 
| 13 | 
            +
            require 'jeweler'
         | 
| 3 14 | 
             
            require './lib/spidr/version.rb'
         | 
| 4 15 |  | 
| 5 | 
            -
             | 
| 6 | 
            -
               | 
| 7 | 
            -
               | 
| 8 | 
            -
             | 
| 9 | 
            -
             | 
| 10 | 
            -
             | 
| 11 | 
            -
             | 
| 12 | 
            -
             | 
| 13 | 
            -
             | 
| 14 | 
            -
             | 
| 15 | 
            -
                gem.authors = ['Postmodern']
         | 
| 16 | 
            -
                gem.add_dependency 'nokogiri', '>= 1.3.0'
         | 
| 17 | 
            -
                gem.add_development_dependency 'rspec', '~> 1.3.0'
         | 
| 18 | 
            -
                gem.add_development_dependency 'yard', '~> 0.5.3'
         | 
| 19 | 
            -
                gem.add_development_dependency 'wsoc', '~> 0.1.1'
         | 
| 20 | 
            -
                gem.has_rdoc = 'yard'
         | 
| 21 | 
            -
              end
         | 
| 22 | 
            -
              Jeweler::GemcutterTasks.new
         | 
| 23 | 
            -
            rescue LoadError
         | 
| 24 | 
            -
              puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
         | 
| 16 | 
            +
            Jeweler::Tasks.new do |gem|
         | 
| 17 | 
            +
              gem.name = 'spidr'
         | 
| 18 | 
            +
              gem.version = Spidr::VERSION
         | 
| 19 | 
            +
              gem.license = 'MIT'
         | 
| 20 | 
            +
              gem.summary = %Q{A versatile Ruby web spidering library}
         | 
| 21 | 
            +
              gem.description = %Q{Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.}
         | 
| 22 | 
            +
              gem.email = 'postmodern.mod3@gmail.com'
         | 
| 23 | 
            +
              gem.homepage = 'http://github.com/postmodern/spidr'
         | 
| 24 | 
            +
              gem.authors = ['Postmodern']
         | 
| 25 | 
            +
              gem.has_rdoc = 'yard'
         | 
| 25 26 | 
             
            end
         | 
| 26 27 |  | 
| 27 28 | 
             
            require 'spec/rake/spectask'
         | 
| @@ -31,15 +32,7 @@ Spec::Rake::SpecTask.new(:spec) do |spec| | |
| 31 32 | 
             
              spec.spec_opts = ['--options', '.specopts']
         | 
| 32 33 | 
             
            end
         | 
| 33 34 |  | 
| 34 | 
            -
            task :spec => :check_dependencies
         | 
| 35 35 | 
             
            task :default => :spec
         | 
| 36 36 |  | 
| 37 | 
            -
             | 
| 38 | 
            -
             | 
| 39 | 
            -
             | 
| 40 | 
            -
              YARD::Rake::YardocTask.new
         | 
| 41 | 
            -
            rescue LoadError
         | 
| 42 | 
            -
              task :yard do
         | 
| 43 | 
            -
                abort "YARD is not available. In order to run yard, you must: gem install yard"
         | 
| 44 | 
            -
              end
         | 
| 45 | 
            -
            end
         | 
| 37 | 
            +
            require 'yard'
         | 
| 38 | 
            +
            YARD::Rake::YardocTask.new
         | 
    
        data/lib/spidr/agent.rb
    CHANGED
    
    | @@ -98,7 +98,7 @@ module Spidr | |
| 98 98 | 
             
                # @yieldparam [Agent] agent
         | 
| 99 99 | 
             
                #   The newly created agent.
         | 
| 100 100 | 
             
                #
         | 
| 101 | 
            -
                def initialize(options={} | 
| 101 | 
            +
                def initialize(options={})
         | 
| 102 102 | 
             
                  @host_header = options[:host_header]
         | 
| 103 103 | 
             
                  @host_headers = {}
         | 
| 104 104 |  | 
| @@ -121,7 +121,7 @@ module Spidr | |
| 121 121 |  | 
| 122 122 | 
             
                  super(options)
         | 
| 123 123 |  | 
| 124 | 
            -
                   | 
| 124 | 
            +
                  yield self if block_given?
         | 
| 125 125 | 
             
                end
         | 
| 126 126 |  | 
| 127 127 | 
             
                #
         | 
| @@ -140,9 +140,9 @@ module Spidr | |
| 140 140 | 
             
                # @yieldparam [Agent] agent
         | 
| 141 141 | 
             
                #   The newly created agent.
         | 
| 142 142 | 
             
                #
         | 
| 143 | 
            -
                def self.start_at(url,options={} | 
| 143 | 
            +
                def self.start_at(url,options={})
         | 
| 144 144 | 
             
                  self.new(options) do |spider|
         | 
| 145 | 
            -
                     | 
| 145 | 
            +
                    yield spider if block_given?
         | 
| 146 146 |  | 
| 147 147 | 
             
                    spider.start_at(url)
         | 
| 148 148 | 
             
                  end
         | 
| @@ -164,9 +164,9 @@ module Spidr | |
| 164 164 | 
             
                # @yieldparam [Agent] agent
         | 
| 165 165 | 
             
                #   The newly created agent.
         | 
| 166 166 | 
             
                #
         | 
| 167 | 
            -
                def self.host(name,options={} | 
| 167 | 
            +
                def self.host(name,options={})
         | 
| 168 168 | 
             
                  self.new(options.merge(:host => name)) do |spider|
         | 
| 169 | 
            -
                     | 
| 169 | 
            +
                    yield spider if block_given?
         | 
| 170 170 |  | 
| 171 171 | 
             
                    spider.start_at("http://#{name}/")
         | 
| 172 172 | 
             
                  end
         | 
| @@ -188,11 +188,11 @@ module Spidr | |
| 188 188 | 
             
                # @yieldparam [Agent] agent
         | 
| 189 189 | 
             
                #   The newly created agent.
         | 
| 190 190 | 
             
                #
         | 
| 191 | 
            -
                def self.site(url,options={} | 
| 191 | 
            +
                def self.site(url,options={})
         | 
| 192 192 | 
             
                  url = URI(url.to_s)
         | 
| 193 193 |  | 
| 194 194 | 
             
                  return self.new(options.merge(:host => url.host)) do |spider|
         | 
| 195 | 
            -
                     | 
| 195 | 
            +
                    yield spider if block_given?
         | 
| 196 196 |  | 
| 197 197 | 
             
                    spider.start_at(url)
         | 
| 198 198 | 
             
                  end
         | 
| @@ -457,11 +457,18 @@ module Spidr | |
| 457 457 | 
             
                    link = url.to_s
         | 
| 458 458 |  | 
| 459 459 | 
             
                    begin
         | 
| 460 | 
            -
                      @every_url_blocks.each { | | 
| 460 | 
            +
                      @every_url_blocks.each { |url_block| url_block.call(url) }
         | 
| 461 461 |  | 
| 462 | 
            -
                      @urls_like_blocks.each do |pattern, | 
| 463 | 
            -
                         | 
| 464 | 
            -
             | 
| 462 | 
            +
                      @urls_like_blocks.each do |pattern,url_blocks|
         | 
| 463 | 
            +
                        match = case pattern
         | 
| 464 | 
            +
                                when Regexp
         | 
| 465 | 
            +
                                  link =~ pattern
         | 
| 466 | 
            +
                                else
         | 
| 467 | 
            +
                                  (pattern == link) || (pattern == url)
         | 
| 468 | 
            +
                                end
         | 
| 469 | 
            +
             | 
| 470 | 
            +
                        if match
         | 
| 471 | 
            +
                          url_blocks.each { |url_block| url_block.call(url) }
         | 
| 465 472 | 
             
                        end
         | 
| 466 473 | 
             
                      end
         | 
| 467 474 | 
             
                    rescue Actions::Paused => action
         | 
| @@ -494,7 +501,7 @@ module Spidr | |
| 494 501 | 
             
                # @return [Page, nil]
         | 
| 495 502 | 
             
                #   The page for the response, or `nil` if the request failed.
         | 
| 496 503 | 
             
                #
         | 
| 497 | 
            -
                def get_page(url | 
| 504 | 
            +
                def get_page(url)
         | 
| 498 505 | 
             
                  url = URI(url.to_s)
         | 
| 499 506 |  | 
| 500 507 | 
             
                  prepare_request(url) do |session,path,headers|
         | 
| @@ -503,7 +510,7 @@ module Spidr | |
| 503 510 | 
             
                    # save any new cookies
         | 
| 504 511 | 
             
                    @cookies.from_page(new_page)
         | 
| 505 512 |  | 
| 506 | 
            -
                     | 
| 513 | 
            +
                    yield new_page if block_given?
         | 
| 507 514 | 
             
                    return new_page
         | 
| 508 515 | 
             
                  end
         | 
| 509 516 | 
             
                end
         | 
| @@ -529,7 +536,7 @@ module Spidr | |
| 529 536 | 
             
                #
         | 
| 530 537 | 
             
                # @since 0.2.2
         | 
| 531 538 | 
             
                #
         | 
| 532 | 
            -
                def post_page(url,post_data='' | 
| 539 | 
            +
                def post_page(url,post_data='')
         | 
| 533 540 | 
             
                  url = URI(url.to_s)
         | 
| 534 541 |  | 
| 535 542 | 
             
                  prepare_request(url) do |session,path,headers|
         | 
| @@ -538,7 +545,7 @@ module Spidr | |
| 538 545 | 
             
                    # save any new cookies
         | 
| 539 546 | 
             
                    @cookies.from_page(new_page)
         | 
| 540 547 |  | 
| 541 | 
            -
                     | 
| 548 | 
            +
                    yield new_page if block_given?
         | 
| 542 549 | 
             
                    return new_page
         | 
| 543 550 | 
             
                  end
         | 
| 544 551 | 
             
                end
         | 
| @@ -560,7 +567,7 @@ module Spidr | |
| 560 567 | 
             
                #   The page that was visited. If `nil` is returned, either the request
         | 
| 561 568 | 
             
                #   for the page failed, or the page was skipped.
         | 
| 562 569 | 
             
                #
         | 
| 563 | 
            -
                def visit_page(url | 
| 570 | 
            +
                def visit_page(url)
         | 
| 564 571 | 
             
                  url = URI(url.to_s) unless url.kind_of?(URI)
         | 
| 565 572 |  | 
| 566 573 | 
             
                  get_page(url) do |page|
         | 
| @@ -569,7 +576,7 @@ module Spidr | |
| 569 576 | 
             
                    begin
         | 
| 570 577 | 
             
                      @every_page_blocks.each { |page_block| page_block.call(page) }
         | 
| 571 578 |  | 
| 572 | 
            -
                       | 
| 579 | 
            +
                      yield page if block_given?
         | 
| 573 580 | 
             
                    rescue Actions::Paused => action
         | 
| 574 581 | 
             
                      raise(action)
         | 
| 575 582 | 
             
                    rescue Actions::SkipPage
         | 
| @@ -668,7 +675,7 @@ module Spidr | |
| 668 675 | 
             
                  begin
         | 
| 669 676 | 
             
                    sleep(@delay) if @delay > 0
         | 
| 670 677 |  | 
| 671 | 
            -
                     | 
| 678 | 
            +
                    yield @sessions[url], path, headers
         | 
| 672 679 | 
             
                  rescue SystemCallError,
         | 
| 673 680 | 
             
                         Timeout::Error,
         | 
| 674 681 | 
             
                         SocketError,
         | 
| @@ -719,7 +726,7 @@ module Spidr | |
| 719 726 | 
             
                #
         | 
| 720 727 | 
             
                def failed(url)
         | 
| 721 728 | 
             
                  @failures << url
         | 
| 722 | 
            -
                  @every_failed_url_blocks.each { | | 
| 729 | 
            +
                  @every_failed_url_blocks.each { |fail_block| fail_block.call(url) }
         | 
| 723 730 | 
             
                  return true
         | 
| 724 731 | 
             
                end
         | 
| 725 732 |  | 
    
        data/lib/spidr/cookie_jar.rb
    CHANGED
    
    | @@ -130,6 +130,22 @@ module Spidr | |
| 130 130 | 
             
                    @dirty.delete(host)
         | 
| 131 131 | 
             
                  end
         | 
| 132 132 |  | 
| 133 | 
            +
                  hdomain = host.split('.')
         | 
| 134 | 
            +
             | 
| 135 | 
            +
                  if hdomain.length > 2
         | 
| 136 | 
            +
                    parent_cookies = for_host(hdomain[1..-1].join('.'))
         | 
| 137 | 
            +
             | 
| 138 | 
            +
                    unless (parent_cookies.nil? || parent_cookies.empty?)
         | 
| 139 | 
            +
                      @cookies[host] = if @cookies[host].nil?
         | 
| 140 | 
            +
                                         # inherit the parent cookies
         | 
| 141 | 
            +
                                         parent_cookies
         | 
| 142 | 
            +
                                       else
         | 
| 143 | 
            +
                                         # merge the parent cookies with any host-specific cookies
         | 
| 144 | 
            +
                                         "#{parent_cookies}; #{@cookies[host]}"
         | 
| 145 | 
            +
                                       end
         | 
| 146 | 
            +
                    end
         | 
| 147 | 
            +
                  end
         | 
| 148 | 
            +
             | 
| 133 149 | 
             
                  return @cookies[host]
         | 
| 134 150 | 
             
                end
         | 
| 135 151 |  | 
    
        data/lib/spidr/events.rb
    CHANGED
    
    | @@ -72,8 +72,8 @@ module Spidr | |
| 72 72 | 
             
                # @yieldparam [Hash] headers
         | 
| 73 73 | 
             
                #   The headers from a response.
         | 
| 74 74 | 
             
                #
         | 
| 75 | 
            -
                def all_headers | 
| 76 | 
            -
                  every_page { |page|  | 
| 75 | 
            +
                def all_headers
         | 
| 76 | 
            +
                  every_page { |page| yield page.headers }
         | 
| 77 77 | 
             
                end
         | 
| 78 78 |  | 
| 79 79 | 
             
                #
         | 
| @@ -99,9 +99,9 @@ module Spidr | |
| 99 99 | 
             
                # @yieldparam [Page] page
         | 
| 100 100 | 
             
                #   A visited page.
         | 
| 101 101 | 
             
                #
         | 
| 102 | 
            -
                def every_ok_page | 
| 102 | 
            +
                def every_ok_page
         | 
| 103 103 | 
             
                  every_page do |page|
         | 
| 104 | 
            -
                     | 
| 104 | 
            +
                    yield page if (block_given? && page.ok?)
         | 
| 105 105 | 
             
                  end
         | 
| 106 106 | 
             
                end
         | 
| 107 107 |  | 
| @@ -114,9 +114,9 @@ module Spidr | |
| 114 114 | 
             
                # @yieldparam [Page] page
         | 
| 115 115 | 
             
                #   A visited page.
         | 
| 116 116 | 
             
                #
         | 
| 117 | 
            -
                def every_redirect_page | 
| 117 | 
            +
                def every_redirect_page
         | 
| 118 118 | 
             
                  every_page do |page|
         | 
| 119 | 
            -
                     | 
| 119 | 
            +
                    yield page if (block_given? && page.redirect?)
         | 
| 120 120 | 
             
                  end
         | 
| 121 121 | 
             
                end
         | 
| 122 122 |  | 
| @@ -129,9 +129,9 @@ module Spidr | |
| 129 129 | 
             
                # @yieldparam [Page] page
         | 
| 130 130 | 
             
                #   A visited page.
         | 
| 131 131 | 
             
                #
         | 
| 132 | 
            -
                def every_timedout_page | 
| 132 | 
            +
                def every_timedout_page
         | 
| 133 133 | 
             
                  every_page do |page|
         | 
| 134 | 
            -
                     | 
| 134 | 
            +
                    yield page if (block_given? && page.timedout?)
         | 
| 135 135 | 
             
                  end
         | 
| 136 136 | 
             
                end
         | 
| 137 137 |  | 
| @@ -144,9 +144,9 @@ module Spidr | |
| 144 144 | 
             
                # @yieldparam [Page] page
         | 
| 145 145 | 
             
                #   A visited page.
         | 
| 146 146 | 
             
                #
         | 
| 147 | 
            -
                def every_bad_request_page | 
| 147 | 
            +
                def every_bad_request_page
         | 
| 148 148 | 
             
                  every_page do |page|
         | 
| 149 | 
            -
                     | 
| 149 | 
            +
                    yield page if (block_given? && page.bad_request?)
         | 
| 150 150 | 
             
                  end
         | 
| 151 151 | 
             
                end
         | 
| 152 152 |  | 
| @@ -159,9 +159,9 @@ module Spidr | |
| 159 159 | 
             
                # @yieldparam [Page] page
         | 
| 160 160 | 
             
                #   A visited page.
         | 
| 161 161 | 
             
                #
         | 
| 162 | 
            -
                def every_unauthorized_page | 
| 162 | 
            +
                def every_unauthorized_page
         | 
| 163 163 | 
             
                  every_page do |page|
         | 
| 164 | 
            -
                     | 
| 164 | 
            +
                    yield page if (block_given? && page.unauthorized?)
         | 
| 165 165 | 
             
                  end
         | 
| 166 166 | 
             
                end
         | 
| 167 167 |  | 
| @@ -174,9 +174,9 @@ module Spidr | |
| 174 174 | 
             
                # @yieldparam [Page] page
         | 
| 175 175 | 
             
                #   A visited page.
         | 
| 176 176 | 
             
                #
         | 
| 177 | 
            -
                def every_forbidden_page | 
| 177 | 
            +
                def every_forbidden_page
         | 
| 178 178 | 
             
                  every_page do |page|
         | 
| 179 | 
            -
                     | 
| 179 | 
            +
                    yield page if (block_given? && page.forbidden?)
         | 
| 180 180 | 
             
                  end
         | 
| 181 181 | 
             
                end
         | 
| 182 182 |  | 
| @@ -189,9 +189,9 @@ module Spidr | |
| 189 189 | 
             
                # @yieldparam [Page] page
         | 
| 190 190 | 
             
                #   A visited page.
         | 
| 191 191 | 
             
                #
         | 
| 192 | 
            -
                def every_missing_page | 
| 192 | 
            +
                def every_missing_page
         | 
| 193 193 | 
             
                  every_page do |page|
         | 
| 194 | 
            -
                     | 
| 194 | 
            +
                    yield page if (block_given? && page.missing?)
         | 
| 195 195 | 
             
                  end
         | 
| 196 196 | 
             
                end
         | 
| 197 197 |  | 
| @@ -205,9 +205,9 @@ module Spidr | |
| 205 205 | 
             
                # @yieldparam [Page] page
         | 
| 206 206 | 
             
                #   A visited page.
         | 
| 207 207 | 
             
                #
         | 
| 208 | 
            -
                def every_internal_server_error_page | 
| 208 | 
            +
                def every_internal_server_error_page
         | 
| 209 209 | 
             
                  every_page do |page|
         | 
| 210 | 
            -
                     | 
| 210 | 
            +
                    yield page if (block_given? && page.had_internal_server_error?)
         | 
| 211 211 | 
             
                  end
         | 
| 212 212 | 
             
                end
         | 
| 213 213 |  | 
| @@ -220,9 +220,9 @@ module Spidr | |
| 220 220 | 
             
                # @yieldparam [Page] page
         | 
| 221 221 | 
             
                #   A visited page.
         | 
| 222 222 | 
             
                #
         | 
| 223 | 
            -
                def every_txt_page | 
| 223 | 
            +
                def every_txt_page
         | 
| 224 224 | 
             
                  every_page do |page|
         | 
| 225 | 
            -
                     | 
| 225 | 
            +
                    yield page if (block_given? && page.txt?)
         | 
| 226 226 | 
             
                  end
         | 
| 227 227 | 
             
                end
         | 
| 228 228 |  | 
| @@ -235,9 +235,9 @@ module Spidr | |
| 235 235 | 
             
                # @yieldparam [Page] page
         | 
| 236 236 | 
             
                #   A visited page.
         | 
| 237 237 | 
             
                #
         | 
| 238 | 
            -
                def every_html_page | 
| 238 | 
            +
                def every_html_page
         | 
| 239 239 | 
             
                  every_page do |page|
         | 
| 240 | 
            -
                     | 
| 240 | 
            +
                    yield page if (block_given? && page.html?)
         | 
| 241 241 | 
             
                  end
         | 
| 242 242 | 
             
                end
         | 
| 243 243 |  | 
| @@ -250,9 +250,9 @@ module Spidr | |
| 250 250 | 
             
                # @yieldparam [Page] page
         | 
| 251 251 | 
             
                #   A visited page.
         | 
| 252 252 | 
             
                #
         | 
| 253 | 
            -
                def every_xml_page | 
| 253 | 
            +
                def every_xml_page
         | 
| 254 254 | 
             
                  every_page do |page|
         | 
| 255 | 
            -
                     | 
| 255 | 
            +
                    yield page if (block_given? && page.xml?)
         | 
| 256 256 | 
             
                  end
         | 
| 257 257 | 
             
                end
         | 
| 258 258 |  | 
| @@ -266,9 +266,9 @@ module Spidr | |
| 266 266 | 
             
                # @yieldparam [Page] page
         | 
| 267 267 | 
             
                #   A visited page.
         | 
| 268 268 | 
             
                #
         | 
| 269 | 
            -
                def every_xsl_page | 
| 269 | 
            +
                def every_xsl_page
         | 
| 270 270 | 
             
                  every_page do |page|
         | 
| 271 | 
            -
                     | 
| 271 | 
            +
                    yield page if (block_given? && page.xsl?)
         | 
| 272 272 | 
             
                  end
         | 
| 273 273 | 
             
                end
         | 
| 274 274 |  | 
| @@ -285,11 +285,11 @@ module Spidr | |
| 285 285 | 
             
                # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
         | 
| 286 286 | 
             
                # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
         | 
| 287 287 | 
             
                #
         | 
| 288 | 
            -
                def every_doc | 
| 288 | 
            +
                def every_doc
         | 
| 289 289 | 
             
                  every_page do |page|
         | 
| 290 | 
            -
                    if  | 
| 290 | 
            +
                    if block_given?
         | 
| 291 291 | 
             
                      if (doc = page.doc)
         | 
| 292 | 
            -
                         | 
| 292 | 
            +
                        yield doc
         | 
| 293 293 | 
             
                      end
         | 
| 294 294 | 
             
                    end
         | 
| 295 295 | 
             
                  end
         | 
| @@ -306,11 +306,11 @@ module Spidr | |
| 306 306 | 
             
                #
         | 
| 307 307 | 
             
                # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
         | 
| 308 308 | 
             
                #
         | 
| 309 | 
            -
                def every_html_doc | 
| 309 | 
            +
                def every_html_doc
         | 
| 310 310 | 
             
                  every_page do |page|
         | 
| 311 | 
            -
                    if ( | 
| 311 | 
            +
                    if (block_given? && page.html?)
         | 
| 312 312 | 
             
                      if (doc = page.doc)
         | 
| 313 | 
            -
                         | 
| 313 | 
            +
                        yield doc
         | 
| 314 314 | 
             
                      end
         | 
| 315 315 | 
             
                    end
         | 
| 316 316 | 
             
                  end
         | 
| @@ -327,11 +327,11 @@ module Spidr | |
| 327 327 | 
             
                #
         | 
| 328 328 | 
             
                # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
         | 
| 329 329 | 
             
                #
         | 
| 330 | 
            -
                def every_xml_doc | 
| 330 | 
            +
                def every_xml_doc
         | 
| 331 331 | 
             
                  every_page do |page|
         | 
| 332 | 
            -
                    if ( | 
| 332 | 
            +
                    if (block_given? && page.xml?)
         | 
| 333 333 | 
             
                      if (doc = page.doc)
         | 
| 334 | 
            -
                         | 
| 334 | 
            +
                        yield doc
         | 
| 335 335 | 
             
                      end
         | 
| 336 336 | 
             
                    end
         | 
| 337 337 | 
             
                  end
         | 
| @@ -349,11 +349,11 @@ module Spidr | |
| 349 349 | 
             
                #
         | 
| 350 350 | 
             
                # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
         | 
| 351 351 | 
             
                #
         | 
| 352 | 
            -
                def every_xsl_doc | 
| 352 | 
            +
                def every_xsl_doc
         | 
| 353 353 | 
             
                  every_page do |page|
         | 
| 354 | 
            -
                    if ( | 
| 354 | 
            +
                    if (block_given? && page.xsl?)
         | 
| 355 355 | 
             
                      if (doc = page.doc)
         | 
| 356 | 
            -
                         | 
| 356 | 
            +
                        yield doc
         | 
| 357 357 | 
             
                      end
         | 
| 358 358 | 
             
                    end
         | 
| 359 359 | 
             
                  end
         | 
| @@ -370,11 +370,11 @@ module Spidr | |
| 370 370 | 
             
                #
         | 
| 371 371 | 
             
                # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
         | 
| 372 372 | 
             
                #
         | 
| 373 | 
            -
                def every_rss_doc | 
| 373 | 
            +
                def every_rss_doc
         | 
| 374 374 | 
             
                  every_page do |page|
         | 
| 375 | 
            -
                    if ( | 
| 375 | 
            +
                    if (block_given? && page.rss?)
         | 
| 376 376 | 
             
                      if (doc = page.doc)
         | 
| 377 | 
            -
                         | 
| 377 | 
            +
                        yield doc
         | 
| 378 378 | 
             
                      end
         | 
| 379 379 | 
             
                    end
         | 
| 380 380 | 
             
                  end
         | 
| @@ -391,11 +391,11 @@ module Spidr | |
| 391 391 | 
             
                #
         | 
| 392 392 | 
             
                # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
         | 
| 393 393 | 
             
                #
         | 
| 394 | 
            -
                def every_atom_doc | 
| 394 | 
            +
                def every_atom_doc
         | 
| 395 395 | 
             
                  every_page do |page|
         | 
| 396 | 
            -
                    if ( | 
| 396 | 
            +
                    if (block_given? && page.atom?)
         | 
| 397 397 | 
             
                      if (doc = page.doc)
         | 
| 398 | 
            -
                         | 
| 398 | 
            +
                        yield doc
         | 
| 399 399 | 
             
                      end
         | 
| 400 400 | 
             
                    end
         | 
| 401 401 | 
             
                  end
         | 
| @@ -410,9 +410,9 @@ module Spidr | |
| 410 410 | 
             
                # @yieldparam [Page] page
         | 
| 411 411 | 
             
                #   A visited page.
         | 
| 412 412 | 
             
                #
         | 
| 413 | 
            -
                def every_javascript_page | 
| 413 | 
            +
                def every_javascript_page
         | 
| 414 414 | 
             
                  every_page do |page|
         | 
| 415 | 
            -
                     | 
| 415 | 
            +
                    yield page if (block_given? && page.javascript?)
         | 
| 416 416 | 
             
                  end
         | 
| 417 417 | 
             
                end
         | 
| 418 418 |  | 
| @@ -425,9 +425,9 @@ module Spidr | |
| 425 425 | 
             
                # @yieldparam [Page] page
         | 
| 426 426 | 
             
                #   A visited page.
         | 
| 427 427 | 
             
                #
         | 
| 428 | 
            -
                def every_css_page | 
| 428 | 
            +
                def every_css_page
         | 
| 429 429 | 
             
                  every_page do |page|
         | 
| 430 | 
            -
                     | 
| 430 | 
            +
                    yield page if (block_given? && page.css?)
         | 
| 431 431 | 
             
                  end
         | 
| 432 432 | 
             
                end
         | 
| 433 433 |  | 
| @@ -440,9 +440,9 @@ module Spidr | |
| 440 440 | 
             
                # @yieldparam [Page] feed
         | 
| 441 441 | 
             
                #   A visited page.
         | 
| 442 442 | 
             
                #
         | 
| 443 | 
            -
                def every_rss_page | 
| 443 | 
            +
                def every_rss_page
         | 
| 444 444 | 
             
                  every_page do |page|
         | 
| 445 | 
            -
                     | 
| 445 | 
            +
                    yield page if (block_given? && page.rss?)
         | 
| 446 446 | 
             
                  end
         | 
| 447 447 | 
             
                end
         | 
| 448 448 |  | 
| @@ -455,9 +455,9 @@ module Spidr | |
| 455 455 | 
             
                # @yieldparam [Page] feed
         | 
| 456 456 | 
             
                #   A visited page.
         | 
| 457 457 | 
             
                #
         | 
| 458 | 
            -
                def every_atom_page | 
| 458 | 
            +
                def every_atom_page
         | 
| 459 459 | 
             
                  every_page do |page|
         | 
| 460 | 
            -
                     | 
| 460 | 
            +
                    yield page if (block_given? && page.atom?)
         | 
| 461 461 | 
             
                  end
         | 
| 462 462 | 
             
                end
         | 
| 463 463 |  | 
| @@ -470,9 +470,9 @@ module Spidr | |
| 470 470 | 
             
                # @yieldparam [Page] page
         | 
| 471 471 | 
             
                #   A visited page.
         | 
| 472 472 | 
             
                #
         | 
| 473 | 
            -
                def every_ms_word_page | 
| 473 | 
            +
                def every_ms_word_page
         | 
| 474 474 | 
             
                  every_page do |page|
         | 
| 475 | 
            -
                     | 
| 475 | 
            +
                    yield page if (block_given? && page.ms_word?)
         | 
| 476 476 | 
             
                  end
         | 
| 477 477 | 
             
                end
         | 
| 478 478 |  | 
| @@ -485,9 +485,9 @@ module Spidr | |
| 485 485 | 
             
                # @yieldparam [Page] page
         | 
| 486 486 | 
             
                #   A visited page.
         | 
| 487 487 | 
             
                #
         | 
| 488 | 
            -
                def every_pdf_page | 
| 488 | 
            +
                def every_pdf_page
         | 
| 489 489 | 
             
                  every_page do |page|
         | 
| 490 | 
            -
                     | 
| 490 | 
            +
                    yield page if (block_given? && page.pdf?)
         | 
| 491 491 | 
             
                  end
         | 
| 492 492 | 
             
                end
         | 
| 493 493 |  | 
| @@ -500,9 +500,9 @@ module Spidr | |
| 500 500 | 
             
                # @yieldparam [Page] page
         | 
| 501 501 | 
             
                #   A visited page.
         | 
| 502 502 | 
             
                #
         | 
| 503 | 
            -
                def every_zip_page | 
| 503 | 
            +
                def every_zip_page
         | 
| 504 504 | 
             
                  every_page do |page|
         | 
| 505 | 
            -
                     | 
| 505 | 
            +
                    yield page if (block_given? && page.zip?)
         | 
| 506 506 | 
             
                  end
         | 
| 507 507 | 
             
                end
         | 
| 508 508 |  | 
    
        data/lib/spidr/page.rb
    CHANGED
    
    | @@ -62,7 +62,8 @@ module Spidr | |
| 62 62 |  | 
| 63 63 | 
             
                #
         | 
| 64 64 | 
             
                # Determines if the response code is `300`, `301`, `302`, `303`
         | 
| 65 | 
            -
                # or `307`.
         | 
| 65 | 
            +
                # or `307`. Also checks for "soft" redirects added at the page 
         | 
| 66 | 
            +
                # level by a meta refresh tag.
         | 
| 66 67 | 
             
                #
         | 
| 67 68 | 
             
                # @return [Boolean]
         | 
| 68 69 | 
             
                #   Specifies whether the response code is a HTTP Redirect code.
         | 
| @@ -71,6 +72,8 @@ module Spidr | |
| 71 72 | 
             
                  case code
         | 
| 72 73 | 
             
                  when 300..303, 307
         | 
| 73 74 | 
             
                    true
         | 
| 75 | 
            +
                  when 200
         | 
| 76 | 
            +
                    meta_redirect?
         | 
| 74 77 | 
             
                  else
         | 
| 75 78 | 
             
                    false
         | 
| 76 79 | 
             
                  end
         | 
| @@ -434,17 +437,7 @@ module Spidr | |
| 434 437 | 
             
                    urls << url unless (url.nil? || url.empty?)
         | 
| 435 438 | 
             
                  }
         | 
| 436 439 |  | 
| 437 | 
            -
                  if self.is_redirect?
         | 
| 438 | 
            -
                    location = @headers['location']
         | 
| 439 | 
            -
             | 
| 440 | 
            -
                    if location.kind_of?(Array)
         | 
| 441 | 
            -
                      # handle multiple location URLs
         | 
| 442 | 
            -
                      location.each(&add_url)
         | 
| 443 | 
            -
                    else
         | 
| 444 | 
            -
                      # usually the location header contains a single String
         | 
| 445 | 
            -
                      add_url.call(location)
         | 
| 446 | 
            -
                    end
         | 
| 447 | 
            -
                  end
         | 
| 440 | 
            +
                  self.redirects_to.each(&add_url) if self.is_redirect?
         | 
| 448 441 |  | 
| 449 442 | 
             
                  if (html? && doc)
         | 
| 450 443 | 
             
                    doc.search('a[@href]').each do |a|
         | 
| @@ -471,6 +464,27 @@ module Spidr | |
| 471 464 | 
             
                  return urls
         | 
| 472 465 | 
             
                end
         | 
| 473 466 |  | 
| 467 | 
            +
                #
         | 
| 468 | 
            +
                # URL(s) that this document redirects to.
         | 
| 469 | 
            +
                #
         | 
| 470 | 
            +
                # @return [Array<String>]
         | 
| 471 | 
            +
                #   The links that this page redirects to (usually found in a
         | 
| 472 | 
            +
                #   location header or by way of a page-level meta redirect).
         | 
| 473 | 
            +
                #
         | 
| 474 | 
            +
                def redirects_to
         | 
| 475 | 
            +
                  location = @headers['location']
         | 
| 476 | 
            +
             | 
| 477 | 
            +
                  if location.nil?
         | 
| 478 | 
            +
                    # check page-level meta redirects if there isn't a location header
         | 
| 479 | 
            +
                    meta_redirect
         | 
| 480 | 
            +
                  elsif location.kind_of?(Array)
         | 
| 481 | 
            +
                    location
         | 
| 482 | 
            +
                  else
         | 
| 483 | 
            +
                    # usually the location header contains a single String
         | 
| 484 | 
            +
                    [location]
         | 
| 485 | 
            +
                  end
         | 
| 486 | 
            +
                end
         | 
| 487 | 
            +
             | 
| 474 488 | 
             
                #
         | 
| 475 489 | 
             
                # Absolute URIs from within the page.
         | 
| 476 490 | 
             
                #
         | 
| @@ -507,6 +521,43 @@ module Spidr | |
| 507 521 | 
             
                  return url
         | 
| 508 522 | 
             
                end
         | 
| 509 523 |  | 
| 524 | 
            +
                #
         | 
| 525 | 
            +
                # Determines if a page-level "soft" redirect is present. If yes,
         | 
| 526 | 
            +
                # returns an array of those redirects (usually a single URL).
         | 
| 527 | 
            +
                # Otherwise, returns an empty Array.
         | 
| 528 | 
            +
                #
         | 
| 529 | 
            +
                # @return [Array<String>]
         | 
| 530 | 
            +
                #   An array of redirect URLs
         | 
| 531 | 
            +
                #
         | 
| 532 | 
            +
                def meta_redirect
         | 
| 533 | 
            +
                  redirects = []
         | 
| 534 | 
            +
             | 
| 535 | 
            +
                  if (html? && doc)
         | 
| 536 | 
            +
                    search('//meta[@http-equiv and @content]').each do |node|
         | 
| 537 | 
            +
                      if node.attr('http-equiv') =~ /refresh/i
         | 
| 538 | 
            +
                        content = node.attr('content')
         | 
| 539 | 
            +
             | 
| 540 | 
            +
                        if (redirect = content.match(/url=(\S+)$/))
         | 
| 541 | 
            +
                          redirects << redirect[1]
         | 
| 542 | 
            +
                        end
         | 
| 543 | 
            +
                      end
         | 
| 544 | 
            +
                    end
         | 
| 545 | 
            +
                  end
         | 
| 546 | 
            +
             | 
| 547 | 
            +
                  return redirects.uniq
         | 
| 548 | 
            +
                end
         | 
| 549 | 
            +
             | 
| 550 | 
            +
                #
         | 
| 551 | 
            +
                # Returns a boolean indicating whether or not page-level meta
         | 
| 552 | 
            +
                # redirects are present in this page.
         | 
| 553 | 
            +
                #
         | 
| 554 | 
            +
                # @return [Boolean]
         | 
| 555 | 
            +
                #   Specifies whether the page includes page-level redirects.
         | 
| 556 | 
            +
                #
         | 
| 557 | 
            +
                def meta_redirect?
         | 
| 558 | 
            +
                  !meta_redirect.empty?
         | 
| 559 | 
            +
                end
         | 
| 560 | 
            +
             | 
| 510 561 | 
             
                protected
         | 
| 511 562 |  | 
| 512 563 | 
             
                #
         | 
    
        data/lib/spidr/version.rb
    CHANGED
    
    
    
        data/spec/cookie_jar_spec.rb
    CHANGED
    
    | @@ -101,8 +101,21 @@ describe CookieJar do | |
| 101 101 | 
             
                it "should encode multiple cookie params" do
         | 
| 102 102 | 
             
                  @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
         | 
| 103 103 | 
             
                  @cookie_jar['zerosum.org'] = {'other' => '1'}
         | 
| 104 | 
            +
                  cookie = @cookie_jar.for_host('zerosum.org')
         | 
| 104 105 |  | 
| 105 | 
            -
                   | 
| 106 | 
            +
                  cookie.should include('admin=ofcourseiam')
         | 
| 107 | 
            +
                  cookie.should include('; ')
         | 
| 108 | 
            +
                  cookie.should include('other=1')
         | 
| 109 | 
            +
                end
         | 
| 110 | 
            +
             | 
| 111 | 
            +
                it "should include cookies for the parent domain" do
         | 
| 112 | 
            +
                  @cookie_jar['zerosum.org'] = {'admin' => 'ofcourseiam'}
         | 
| 113 | 
            +
                  @cookie_jar['sub.zerosum.org'] = {'other' => '1'}
         | 
| 114 | 
            +
                  cookie = @cookie_jar.for_host('sub.zerosum.org')
         | 
| 115 | 
            +
             | 
| 116 | 
            +
                  cookie.should include('admin=ofcourseiam')
         | 
| 117 | 
            +
                  cookie.should include('; ')
         | 
| 118 | 
            +
                  cookie.should include('other=1')
         | 
| 106 119 | 
             
                end
         | 
| 107 120 | 
             
              end
         | 
| 108 121 | 
             
            end
         | 
    
        data/spec/helpers/wsoc.rb
    CHANGED
    
    
    
        data/spec/page_spec.rb
    CHANGED
    
    | @@ -79,6 +79,21 @@ describe Page do | |
| 79 79 | 
             
                end
         | 
| 80 80 | 
             
              end
         | 
| 81 81 |  | 
| 82 | 
            +
              describe "redirects" do
         | 
| 83 | 
            +
                before(:all) do
         | 
| 84 | 
            +
                  @page = get_page('http://spidr.rubyforge.org/course/start.html')
         | 
| 85 | 
            +
                  @page.stub!(:body).and_return('<meta HTTP-EQUIV="REFRESH" content="0; url=http://spidr.rubyforge.org/redirected">')
         | 
| 86 | 
            +
                end
         | 
| 87 | 
            +
             | 
| 88 | 
            +
                it "should provide access to page-level redirects" do
         | 
| 89 | 
            +
                  @page.redirects_to.should == ['http://spidr.rubyforge.org/redirected']
         | 
| 90 | 
            +
                end 
         | 
| 91 | 
            +
             | 
| 92 | 
            +
                it "should include meta refresh redirects in the list of links" do
         | 
| 93 | 
            +
                  @page.links.should include('http://spidr.rubyforge.org/redirected')
         | 
| 94 | 
            +
                end
         | 
| 95 | 
            +
              end
         | 
| 96 | 
            +
             | 
| 82 97 | 
             
              describe "cookies" do
         | 
| 83 98 | 
             
                before(:all) do
         | 
| 84 99 | 
             
                  @page = get_page('http://twitter.com/login')
         | 
    
        data/spec/spec_helper.rb
    CHANGED
    
    | @@ -1,7 +1,15 @@ | |
| 1 1 | 
             
            require 'rubygems'
         | 
| 2 | 
            -
             | 
| 3 | 
            -
             | 
| 2 | 
            +
            require 'bundler'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            begin
         | 
| 5 | 
            +
              Bundler.setup(:runtime, :test)
         | 
| 6 | 
            +
            rescue Bundler::BundlerError => e
         | 
| 7 | 
            +
              STDERR.puts e.message
         | 
| 8 | 
            +
              STDERR.puts "Run `bundle install` to install missing gems"
         | 
| 9 | 
            +
              exit e.status_code
         | 
| 10 | 
            +
            end
         | 
| 4 11 |  | 
| 12 | 
            +
            require 'spec'
         | 
| 5 13 | 
             
            require 'spidr/version'
         | 
| 6 14 |  | 
| 7 15 | 
             
            include Spidr
         | 
    
        data/spidr.gemspec
    CHANGED
    
    | @@ -5,112 +5,112 @@ | |
| 5 5 |  | 
| 6 6 | 
             
            Gem::Specification.new do |s|
         | 
| 7 7 | 
             
              s.name = %q{spidr}
         | 
| 8 | 
            -
              s.version = "0.2. | 
| 8 | 
            +
              s.version = "0.2.5"
         | 
| 9 9 |  | 
| 10 10 | 
             
              s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
         | 
| 11 11 | 
             
              s.authors = ["Postmodern"]
         | 
| 12 | 
            -
              s.date = %q{2010- | 
| 12 | 
            +
              s.date = %q{2010-07-02}
         | 
| 13 13 | 
             
              s.description = %q{Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.}
         | 
| 14 14 | 
             
              s.email = %q{postmodern.mod3@gmail.com}
         | 
| 15 15 | 
             
              s.extra_rdoc_files = [
         | 
| 16 16 | 
             
                "ChangeLog.md",
         | 
| 17 | 
            -
             | 
| 18 | 
            -
             | 
| 17 | 
            +
                "LICENSE.txt",
         | 
| 18 | 
            +
                "README.md"
         | 
| 19 19 | 
             
              ]
         | 
| 20 20 | 
             
              s.files = [
         | 
| 21 21 | 
             
                ".gitignore",
         | 
| 22 | 
            -
             | 
| 23 | 
            -
             | 
| 24 | 
            -
             | 
| 25 | 
            -
             | 
| 26 | 
            -
             | 
| 27 | 
            -
             | 
| 28 | 
            -
             | 
| 29 | 
            -
             | 
| 30 | 
            -
             | 
| 31 | 
            -
             | 
| 32 | 
            -
             | 
| 33 | 
            -
             | 
| 34 | 
            -
             | 
| 35 | 
            -
             | 
| 36 | 
            -
             | 
| 37 | 
            -
             | 
| 38 | 
            -
             | 
| 39 | 
            -
             | 
| 40 | 
            -
             | 
| 41 | 
            -
             | 
| 42 | 
            -
             | 
| 43 | 
            -
             | 
| 44 | 
            -
             | 
| 45 | 
            -
             | 
| 46 | 
            -
             | 
| 47 | 
            -
             | 
| 48 | 
            -
             | 
| 49 | 
            -
             | 
| 50 | 
            -
             | 
| 51 | 
            -
             | 
| 52 | 
            -
             | 
| 53 | 
            -
             | 
| 54 | 
            -
             | 
| 55 | 
            -
             | 
| 56 | 
            -
             | 
| 57 | 
            -
             | 
| 58 | 
            -
             | 
| 59 | 
            -
             | 
| 60 | 
            -
             | 
| 61 | 
            -
             | 
| 62 | 
            -
             | 
| 63 | 
            -
             | 
| 64 | 
            -
             | 
| 65 | 
            -
             | 
| 66 | 
            -
             | 
| 22 | 
            +
                ".specopts",
         | 
| 23 | 
            +
                ".yardopts",
         | 
| 24 | 
            +
                "ChangeLog.md",
         | 
| 25 | 
            +
                "Gemfile",
         | 
| 26 | 
            +
                "LICENSE.txt",
         | 
| 27 | 
            +
                "README.md",
         | 
| 28 | 
            +
                "Rakefile",
         | 
| 29 | 
            +
                "lib/spidr.rb",
         | 
| 30 | 
            +
                "lib/spidr/actions.rb",
         | 
| 31 | 
            +
                "lib/spidr/actions/actions.rb",
         | 
| 32 | 
            +
                "lib/spidr/actions/exceptions.rb",
         | 
| 33 | 
            +
                "lib/spidr/actions/exceptions/action.rb",
         | 
| 34 | 
            +
                "lib/spidr/actions/exceptions/paused.rb",
         | 
| 35 | 
            +
                "lib/spidr/actions/exceptions/skip_link.rb",
         | 
| 36 | 
            +
                "lib/spidr/actions/exceptions/skip_page.rb",
         | 
| 37 | 
            +
                "lib/spidr/agent.rb",
         | 
| 38 | 
            +
                "lib/spidr/auth_credential.rb",
         | 
| 39 | 
            +
                "lib/spidr/auth_store.rb",
         | 
| 40 | 
            +
                "lib/spidr/cookie_jar.rb",
         | 
| 41 | 
            +
                "lib/spidr/events.rb",
         | 
| 42 | 
            +
                "lib/spidr/extensions.rb",
         | 
| 43 | 
            +
                "lib/spidr/extensions/uri.rb",
         | 
| 44 | 
            +
                "lib/spidr/filters.rb",
         | 
| 45 | 
            +
                "lib/spidr/page.rb",
         | 
| 46 | 
            +
                "lib/spidr/rules.rb",
         | 
| 47 | 
            +
                "lib/spidr/sanitizers.rb",
         | 
| 48 | 
            +
                "lib/spidr/session_cache.rb",
         | 
| 49 | 
            +
                "lib/spidr/spidr.rb",
         | 
| 50 | 
            +
                "lib/spidr/version.rb",
         | 
| 51 | 
            +
                "spec/actions_spec.rb",
         | 
| 52 | 
            +
                "spec/agent_spec.rb",
         | 
| 53 | 
            +
                "spec/auth_store_spec.rb",
         | 
| 54 | 
            +
                "spec/cookie_jar_spec.rb",
         | 
| 55 | 
            +
                "spec/extensions/uri_spec.rb",
         | 
| 56 | 
            +
                "spec/filters_spec.rb",
         | 
| 57 | 
            +
                "spec/helpers/history.rb",
         | 
| 58 | 
            +
                "spec/helpers/page.rb",
         | 
| 59 | 
            +
                "spec/helpers/wsoc.rb",
         | 
| 60 | 
            +
                "spec/page_examples.rb",
         | 
| 61 | 
            +
                "spec/page_spec.rb",
         | 
| 62 | 
            +
                "spec/rules_spec.rb",
         | 
| 63 | 
            +
                "spec/sanitizers_spec.rb",
         | 
| 64 | 
            +
                "spec/session_cache.rb",
         | 
| 65 | 
            +
                "spec/spec_helper.rb",
         | 
| 66 | 
            +
                "spec/spidr_spec.rb",
         | 
| 67 | 
            +
                "spidr.gemspec"
         | 
| 67 68 | 
             
              ]
         | 
| 68 69 | 
             
              s.has_rdoc = %q{yard}
         | 
| 69 70 | 
             
              s.homepage = %q{http://github.com/postmodern/spidr}
         | 
| 70 71 | 
             
              s.licenses = ["MIT"]
         | 
| 71 | 
            -
              s.rdoc_options = ["--charset=UTF-8"]
         | 
| 72 72 | 
             
              s.require_paths = ["lib"]
         | 
| 73 | 
            -
              s.rubygems_version = %q{1.3. | 
| 73 | 
            +
              s.rubygems_version = %q{1.3.7}
         | 
| 74 74 | 
             
              s.summary = %q{A versatile Ruby web spidering library}
         | 
| 75 75 | 
             
              s.test_files = [
         | 
| 76 | 
            +
                "spec/actions_spec.rb",
         | 
| 77 | 
            +
                "spec/agent_spec.rb",
         | 
| 76 78 | 
             
                "spec/auth_store_spec.rb",
         | 
| 77 | 
            -
             | 
| 78 | 
            -
             | 
| 79 | 
            -
             | 
| 80 | 
            -
             | 
| 81 | 
            -
             | 
| 82 | 
            -
             | 
| 83 | 
            -
             | 
| 84 | 
            -
             | 
| 85 | 
            -
             | 
| 86 | 
            -
             | 
| 87 | 
            -
             | 
| 88 | 
            -
             | 
| 89 | 
            -
             | 
| 90 | 
            -
                 "spec/page_examples.rb",
         | 
| 91 | 
            -
                 "spec/actions_spec.rb"
         | 
| 79 | 
            +
                "spec/cookie_jar_spec.rb",
         | 
| 80 | 
            +
                "spec/extensions/uri_spec.rb",
         | 
| 81 | 
            +
                "spec/filters_spec.rb",
         | 
| 82 | 
            +
                "spec/helpers/history.rb",
         | 
| 83 | 
            +
                "spec/helpers/page.rb",
         | 
| 84 | 
            +
                "spec/helpers/wsoc.rb",
         | 
| 85 | 
            +
                "spec/page_examples.rb",
         | 
| 86 | 
            +
                "spec/page_spec.rb",
         | 
| 87 | 
            +
                "spec/rules_spec.rb",
         | 
| 88 | 
            +
                "spec/sanitizers_spec.rb",
         | 
| 89 | 
            +
                "spec/session_cache.rb",
         | 
| 90 | 
            +
                "spec/spec_helper.rb",
         | 
| 91 | 
            +
                "spec/spidr_spec.rb"
         | 
| 92 92 | 
             
              ]
         | 
| 93 93 |  | 
| 94 94 | 
             
              if s.respond_to? :specification_version then
         | 
| 95 95 | 
             
                current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
         | 
| 96 96 | 
             
                s.specification_version = 3
         | 
| 97 97 |  | 
| 98 | 
            -
                if Gem::Version.new(Gem:: | 
| 98 | 
            +
                if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
         | 
| 99 99 | 
             
                  s.add_runtime_dependency(%q<nokogiri>, [">= 1.3.0"])
         | 
| 100 | 
            +
                  s.add_development_dependency(%q<rake>, ["~> 0.8.7"])
         | 
| 101 | 
            +
                  s.add_development_dependency(%q<jeweler>, ["~> 1.4.0"])
         | 
| 100 102 | 
             
                  s.add_development_dependency(%q<rspec>, ["~> 1.3.0"])
         | 
| 101 | 
            -
                  s.add_development_dependency(%q<yard>, ["~> 0.5.3"])
         | 
| 102 | 
            -
                  s.add_development_dependency(%q<wsoc>, ["~> 0.1.1"])
         | 
| 103 103 | 
             
                else
         | 
| 104 104 | 
             
                  s.add_dependency(%q<nokogiri>, [">= 1.3.0"])
         | 
| 105 | 
            +
                  s.add_dependency(%q<rake>, ["~> 0.8.7"])
         | 
| 106 | 
            +
                  s.add_dependency(%q<jeweler>, ["~> 1.4.0"])
         | 
| 105 107 | 
             
                  s.add_dependency(%q<rspec>, ["~> 1.3.0"])
         | 
| 106 | 
            -
                  s.add_dependency(%q<yard>, ["~> 0.5.3"])
         | 
| 107 | 
            -
                  s.add_dependency(%q<wsoc>, ["~> 0.1.1"])
         | 
| 108 108 | 
             
                end
         | 
| 109 109 | 
             
              else
         | 
| 110 110 | 
             
                s.add_dependency(%q<nokogiri>, [">= 1.3.0"])
         | 
| 111 | 
            +
                s.add_dependency(%q<rake>, ["~> 0.8.7"])
         | 
| 112 | 
            +
                s.add_dependency(%q<jeweler>, ["~> 1.4.0"])
         | 
| 111 113 | 
             
                s.add_dependency(%q<rspec>, ["~> 1.3.0"])
         | 
| 112 | 
            -
                s.add_dependency(%q<yard>, ["~> 0.5.3"])
         | 
| 113 | 
            -
                s.add_dependency(%q<wsoc>, ["~> 0.1.1"])
         | 
| 114 114 | 
             
              end
         | 
| 115 115 | 
             
            end
         | 
| 116 116 |  | 
    
        metadata
    CHANGED
    
    | @@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version | |
| 5 5 | 
             
              segments: 
         | 
| 6 6 | 
             
              - 0
         | 
| 7 7 | 
             
              - 2
         | 
| 8 | 
            -
              -  | 
| 9 | 
            -
              version: 0.2. | 
| 8 | 
            +
              - 5
         | 
| 9 | 
            +
              version: 0.2.5
         | 
| 10 10 | 
             
            platform: ruby
         | 
| 11 11 | 
             
            authors: 
         | 
| 12 12 | 
             
            - Postmodern
         | 
| @@ -14,13 +14,13 @@ autorequire: | |
| 14 14 | 
             
            bindir: bin
         | 
| 15 15 | 
             
            cert_chain: []
         | 
| 16 16 |  | 
| 17 | 
            -
            date: 2010- | 
| 17 | 
            +
            date: 2010-07-02 00:00:00 -07:00
         | 
| 18 18 | 
             
            default_executable: 
         | 
| 19 19 | 
             
            dependencies: 
         | 
| 20 20 | 
             
            - !ruby/object:Gem::Dependency 
         | 
| 21 21 | 
             
              name: nokogiri
         | 
| 22 | 
            -
              prerelease: false
         | 
| 23 22 | 
             
              requirement: &id001 !ruby/object:Gem::Requirement 
         | 
| 23 | 
            +
                none: false
         | 
| 24 24 | 
             
                requirements: 
         | 
| 25 25 | 
             
                - - ">="
         | 
| 26 26 | 
             
                  - !ruby/object:Gem::Version 
         | 
| @@ -30,48 +30,52 @@ dependencies: | |
| 30 30 | 
             
                    - 0
         | 
| 31 31 | 
             
                    version: 1.3.0
         | 
| 32 32 | 
             
              type: :runtime
         | 
| 33 | 
            +
              prerelease: false
         | 
| 33 34 | 
             
              version_requirements: *id001
         | 
| 34 35 | 
             
            - !ruby/object:Gem::Dependency 
         | 
| 35 | 
            -
              name:  | 
| 36 | 
            -
              prerelease: false
         | 
| 36 | 
            +
              name: rake
         | 
| 37 37 | 
             
              requirement: &id002 !ruby/object:Gem::Requirement 
         | 
| 38 | 
            +
                none: false
         | 
| 38 39 | 
             
                requirements: 
         | 
| 39 40 | 
             
                - - ~>
         | 
| 40 41 | 
             
                  - !ruby/object:Gem::Version 
         | 
| 41 42 | 
             
                    segments: 
         | 
| 42 | 
            -
                    - 1
         | 
| 43 | 
            -
                    - 3
         | 
| 44 43 | 
             
                    - 0
         | 
| 45 | 
            -
                     | 
| 44 | 
            +
                    - 8
         | 
| 45 | 
            +
                    - 7
         | 
| 46 | 
            +
                    version: 0.8.7
         | 
| 46 47 | 
             
              type: :development
         | 
| 48 | 
            +
              prerelease: false
         | 
| 47 49 | 
             
              version_requirements: *id002
         | 
| 48 50 | 
             
            - !ruby/object:Gem::Dependency 
         | 
| 49 | 
            -
              name:  | 
| 50 | 
            -
              prerelease: false
         | 
| 51 | 
            +
              name: jeweler
         | 
| 51 52 | 
             
              requirement: &id003 !ruby/object:Gem::Requirement 
         | 
| 53 | 
            +
                none: false
         | 
| 52 54 | 
             
                requirements: 
         | 
| 53 55 | 
             
                - - ~>
         | 
| 54 56 | 
             
                  - !ruby/object:Gem::Version 
         | 
| 55 57 | 
             
                    segments: 
         | 
| 58 | 
            +
                    - 1
         | 
| 59 | 
            +
                    - 4
         | 
| 56 60 | 
             
                    - 0
         | 
| 57 | 
            -
                     | 
| 58 | 
            -
                    - 3
         | 
| 59 | 
            -
                    version: 0.5.3
         | 
| 61 | 
            +
                    version: 1.4.0
         | 
| 60 62 | 
             
              type: :development
         | 
| 63 | 
            +
              prerelease: false
         | 
| 61 64 | 
             
              version_requirements: *id003
         | 
| 62 65 | 
             
            - !ruby/object:Gem::Dependency 
         | 
| 63 | 
            -
              name:  | 
| 64 | 
            -
              prerelease: false
         | 
| 66 | 
            +
              name: rspec
         | 
| 65 67 | 
             
              requirement: &id004 !ruby/object:Gem::Requirement 
         | 
| 68 | 
            +
                none: false
         | 
| 66 69 | 
             
                requirements: 
         | 
| 67 70 | 
             
                - - ~>
         | 
| 68 71 | 
             
                  - !ruby/object:Gem::Version 
         | 
| 69 72 | 
             
                    segments: 
         | 
| 70 | 
            -
                    - 0
         | 
| 71 73 | 
             
                    - 1
         | 
| 72 | 
            -
                    -  | 
| 73 | 
            -
                     | 
| 74 | 
            +
                    - 3
         | 
| 75 | 
            +
                    - 0
         | 
| 76 | 
            +
                    version: 1.3.0
         | 
| 74 77 | 
             
              type: :development
         | 
| 78 | 
            +
              prerelease: false
         | 
| 75 79 | 
             
              version_requirements: *id004
         | 
| 76 80 | 
             
            description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
         | 
| 77 81 | 
             
            email: postmodern.mod3@gmail.com
         | 
| @@ -88,6 +92,7 @@ files: | |
| 88 92 | 
             
            - .specopts
         | 
| 89 93 | 
             
            - .yardopts
         | 
| 90 94 | 
             
            - ChangeLog.md
         | 
| 95 | 
            +
            - Gemfile
         | 
| 91 96 | 
             
            - LICENSE.txt
         | 
| 92 97 | 
             
            - README.md
         | 
| 93 98 | 
             
            - Rakefile
         | 
| @@ -135,18 +140,21 @@ homepage: http://github.com/postmodern/spidr | |
| 135 140 | 
             
            licenses: 
         | 
| 136 141 | 
             
            - MIT
         | 
| 137 142 | 
             
            post_install_message: 
         | 
| 138 | 
            -
            rdoc_options: 
         | 
| 139 | 
            -
             | 
| 143 | 
            +
            rdoc_options: []
         | 
| 144 | 
            +
             | 
| 140 145 | 
             
            require_paths: 
         | 
| 141 146 | 
             
            - lib
         | 
| 142 147 | 
             
            required_ruby_version: !ruby/object:Gem::Requirement 
         | 
| 148 | 
            +
              none: false
         | 
| 143 149 | 
             
              requirements: 
         | 
| 144 150 | 
             
              - - ">="
         | 
| 145 151 | 
             
                - !ruby/object:Gem::Version 
         | 
| 152 | 
            +
                  hash: 740918287
         | 
| 146 153 | 
             
                  segments: 
         | 
| 147 154 | 
             
                  - 0
         | 
| 148 155 | 
             
                  version: "0"
         | 
| 149 156 | 
             
            required_rubygems_version: !ruby/object:Gem::Requirement 
         | 
| 157 | 
            +
              none: false
         | 
| 150 158 | 
             
              requirements: 
         | 
| 151 159 | 
             
              - - ">="
         | 
| 152 160 | 
             
                - !ruby/object:Gem::Version 
         | 
| @@ -156,24 +164,24 @@ required_rubygems_version: !ruby/object:Gem::Requirement | |
| 156 164 | 
             
            requirements: []
         | 
| 157 165 |  | 
| 158 166 | 
             
            rubyforge_project: 
         | 
| 159 | 
            -
            rubygems_version: 1.3. | 
| 167 | 
            +
            rubygems_version: 1.3.7
         | 
| 160 168 | 
             
            signing_key: 
         | 
| 161 169 | 
             
            specification_version: 3
         | 
| 162 170 | 
             
            summary: A versatile Ruby web spidering library
         | 
| 163 171 | 
             
            test_files: 
         | 
| 164 | 
            -
            - spec/ | 
| 165 | 
            -
            - spec/rules_spec.rb
         | 
| 166 | 
            -
            - spec/session_cache.rb
         | 
| 167 | 
            -
            - spec/spec_helper.rb
         | 
| 168 | 
            -
            - spec/sanitizers_spec.rb
         | 
| 169 | 
            -
            - spec/filters_spec.rb
         | 
| 170 | 
            -
            - spec/page_spec.rb
         | 
| 171 | 
            -
            - spec/spidr_spec.rb
         | 
| 172 | 
            +
            - spec/actions_spec.rb
         | 
| 172 173 | 
             
            - spec/agent_spec.rb
         | 
| 174 | 
            +
            - spec/auth_store_spec.rb
         | 
| 173 175 | 
             
            - spec/cookie_jar_spec.rb
         | 
| 174 176 | 
             
            - spec/extensions/uri_spec.rb
         | 
| 177 | 
            +
            - spec/filters_spec.rb
         | 
| 175 178 | 
             
            - spec/helpers/history.rb
         | 
| 176 179 | 
             
            - spec/helpers/page.rb
         | 
| 177 180 | 
             
            - spec/helpers/wsoc.rb
         | 
| 178 181 | 
             
            - spec/page_examples.rb
         | 
| 179 | 
            -
            - spec/ | 
| 182 | 
            +
            - spec/page_spec.rb
         | 
| 183 | 
            +
            - spec/rules_spec.rb
         | 
| 184 | 
            +
            - spec/sanitizers_spec.rb
         | 
| 185 | 
            +
            - spec/session_cache.rb
         | 
| 186 | 
            +
            - spec/spec_helper.rb
         | 
| 187 | 
            +
            - spec/spidr_spec.rb
         |