spider 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +6 -0
- data/README +4 -4
- data/doc/classes/Net/HTTPRedirection.html +144 -0
- data/doc/classes/Net/HTTPResponse.html +166 -0
- data/doc/classes/Net/HTTPSuccess.html +144 -0
- data/doc/classes/NilClass.html +144 -0
- data/doc/classes/Spider.html +12 -12
- data/doc/classes/SpiderInstance.html +109 -32
- data/doc/created.rid +1 -1
- data/doc/files/README.html +5 -5
- data/doc/files/lib/spider_rb.html +5 -5
- data/doc/fr_class_index.html +0 -1
- data/doc/fr_file_index.html +1 -0
- data/doc/fr_method_index.html +5 -2
- data/lib/spider.rb +100 -58
- data/spec/spider_instance_spec.rb +115 -30
- data/spider.gemspec +1 -1
- data/test_server/client.rb +4 -4
- metadata +7 -2
    
        data/doc/fr_class_index.html
    CHANGED
    
    
    
        data/doc/fr_file_index.html
    CHANGED
    
    
    
        data/doc/fr_method_index.html
    CHANGED
    
    | @@ -21,9 +21,12 @@ | |
| 21 21 | 
             
              <h1 class="section-bar">Methods</h1>
         | 
| 22 22 | 
             
              <div id="index-entries">
         | 
| 23 23 | 
             
                <a href="classes/SpiderInstance.html#M000001">add_url_check (SpiderInstance)</a><br />
         | 
| 24 | 
            +
                <a href="classes/SpiderInstance.html#M000006">clear_headers (SpiderInstance)</a><br />
         | 
| 25 | 
            +
                <a href="classes/SpiderInstance.html#M000005">headers (SpiderInstance)</a><br />
         | 
| 24 26 | 
             
                <a href="classes/SpiderInstance.html#M000002">on (SpiderInstance)</a><br />
         | 
| 25 | 
            -
                <a href="classes/SpiderInstance.html#M000003"> | 
| 26 | 
            -
                <a href="classes/Spider.html# | 
| 27 | 
            +
                <a href="classes/SpiderInstance.html#M000003">setup (SpiderInstance)</a><br />
         | 
| 28 | 
            +
                <a href="classes/Spider.html#M000007">start_at (Spider)</a><br />
         | 
| 29 | 
            +
                <a href="classes/SpiderInstance.html#M000004">teardown (SpiderInstance)</a><br />
         | 
| 27 30 | 
             
              </div>
         | 
| 28 31 | 
             
            </div>
         | 
| 29 32 | 
             
            </body>
         | 
    
        data/lib/spider.rb
    CHANGED
    
    | @@ -29,15 +29,21 @@ require 'uri' | |
| 29 29 | 
             
            require 'net/http'
         | 
| 30 30 | 
             
            require 'net/https'
         | 
| 31 31 |  | 
| 32 | 
            -
             | 
| 33 | 
            -
               | 
| 34 | 
            -
             | 
| 35 | 
            -
            end
         | 
| 36 | 
            -
             | 
| 37 | 
            -
               | 
| 32 | 
            +
            module Net #:nodoc:
         | 
| 33 | 
            +
              class HTTPResponse #:nodoc:
         | 
| 34 | 
            +
                def success?; false; end
         | 
| 35 | 
            +
                def redirect?; false; end
         | 
| 36 | 
            +
              end
         | 
| 37 | 
            +
              class HTTPSuccess #:nodoc:
         | 
| 38 | 
            +
                def success?; true; end
         | 
| 39 | 
            +
              end
         | 
| 40 | 
            +
              class HTTPRedirection #:nodoc:
         | 
| 41 | 
            +
                def redirect?; true; end
         | 
| 42 | 
            +
              end
         | 
| 38 43 | 
             
            end
         | 
| 39 | 
            -
             | 
| 40 | 
            -
             | 
| 44 | 
            +
             | 
| 45 | 
            +
            class NilClass #:nodoc:
         | 
| 46 | 
            +
              def merge(h); h; end
         | 
| 41 47 | 
             
            end
         | 
| 42 48 |  | 
| 43 49 | 
             
            # A spidering library for Ruby. Handles robots.txt, scraping, finding more
         | 
| @@ -52,22 +58,22 @@ class Spider | |
| 52 58 | 
             
              #      a_url =~ %r{^http://mike-burns.com.*}
         | 
| 53 59 | 
             
              #    end
         | 
| 54 60 | 
             
              #
         | 
| 55 | 
            -
              #    s.on 404 do |a_url,  | 
| 61 | 
            +
              #    s.on 404 do |a_url, resp, prior_url|
         | 
| 56 62 | 
             
              #      puts "URL not found: #{a_url}"
         | 
| 57 63 | 
             
              #    end
         | 
| 58 64 | 
             
              #
         | 
| 59 | 
            -
              #    s.on :success do |a_url,  | 
| 60 | 
            -
              #      puts "body: #{body}"
         | 
| 65 | 
            +
              #    s.on :success do |a_url, resp, prior_url|
         | 
| 66 | 
            +
              #      puts "body: #{resp.body}"
         | 
| 61 67 | 
             
              #    end
         | 
| 62 68 | 
             
              #
         | 
| 63 | 
            -
              #    s.on : | 
| 69 | 
            +
              #    s.on :every do |a_url, resp, prior_url|
         | 
| 64 70 | 
             
              #      puts "URL returned anything: #{a_url} with this code #{resp.code}"
         | 
| 65 71 | 
             
              #    end
         | 
| 66 72 | 
             
              #  end
         | 
| 67 73 |  | 
| 68 74 | 
             
              def self.start_at(a_url, &block)
         | 
| 69 75 | 
             
                rules    = RobotRules.new('Ruby Spider 1.0')
         | 
| 70 | 
            -
                a_spider = SpiderInstance.new( | 
| 76 | 
            +
                a_spider = SpiderInstance.new({nil => a_url}, [], rules, [])
         | 
| 71 77 | 
             
                block.call(a_spider)
         | 
| 72 78 | 
             
                a_spider.start!
         | 
| 73 79 | 
             
              end
         | 
| @@ -77,11 +83,14 @@ class SpiderInstance | |
| 77 83 | 
             
              def initialize(next_urls, seen = [], rules = nil, robots_seen = []) #:nodoc:
         | 
| 78 84 | 
             
                @url_checks  = []
         | 
| 79 85 | 
             
                @cache       = :memory
         | 
| 80 | 
            -
                @callbacks   = { | 
| 86 | 
            +
                @callbacks   = {}
         | 
| 81 87 | 
             
                @next_urls   = next_urls
         | 
| 82 88 | 
             
                @seen        = seen
         | 
| 83 89 | 
             
                @rules       = rules || RobotRules.new('Ruby Spider 1.0')
         | 
| 84 90 | 
             
                @robots_seen = robots_seen
         | 
| 91 | 
            +
                @headers     = {}
         | 
| 92 | 
            +
                @setup       = nil
         | 
| 93 | 
            +
                @teardown    = nil
         | 
| 85 94 | 
             
              end
         | 
| 86 95 |  | 
| 87 96 | 
             
              # Add a predicate that determines whether to continue down this URL's path.
         | 
| @@ -99,62 +108,88 @@ class SpiderInstance | |
| 99 108 | 
             
                @cache = cache_type
         | 
| 100 109 | 
             
              end
         | 
| 101 110 |  | 
| 102 | 
            -
              # Add a response handler. A response handler's trigger can be : | 
| 103 | 
            -
              # :failure, or any HTTP status code. The handler itself can be | 
| 104 | 
            -
              #  | 
| 111 | 
            +
              # Add a response handler. A response handler's trigger can be :every,
         | 
| 112 | 
            +
              # :success, :failure, or any HTTP status code. The handler itself can be
         | 
| 113 | 
            +
              # either a Proc or a block.
         | 
| 105 114 | 
             
              #
         | 
| 106 | 
            -
              #  | 
| 107 | 
            -
              #  | 
| 115 | 
            +
              # The arguments to the block are: the URL as a string, an instance of
         | 
| 116 | 
            +
              # Net::HTTPResponse, and the prior URL as a string.
         | 
| 108 117 | 
             
              # 
         | 
| 109 | 
            -
              # If the trigger is :success or any HTTP status code that represents a
         | 
| 110 | 
            -
              # successful result, the arguments are the URL as a string, the HTTP status
         | 
| 111 | 
            -
              # code, an instance of Net::HTTPSuccess, and the body of the result as a
         | 
| 112 | 
            -
              # string.
         | 
| 113 | 
            -
              #
         | 
| 114 | 
            -
              # If the trigger is :failure or any HTTP status code that represents a failed
         | 
| 115 | 
            -
              # result, the arguments are the URL as a string and the HTTP status code.
         | 
| 116 118 | 
             
              #
         | 
| 117 119 | 
             
              # For example:
         | 
| 118 120 | 
             
              #
         | 
| 119 | 
            -
              #  on 404 do |a_url,  | 
| 121 | 
            +
              #  on 404 do |a_url, resp, prior_url|
         | 
| 120 122 | 
             
              #    puts "URL not found: #{a_url}"
         | 
| 121 123 | 
             
              #  end
         | 
| 122 124 | 
             
              #
         | 
| 123 | 
            -
              #  on :success do |a_url,  | 
| 125 | 
            +
              #  on :success do |a_url, resp, prior_url|
         | 
| 124 126 | 
             
              #    puts a_url
         | 
| 125 | 
            -
              #    puts body
         | 
| 127 | 
            +
              #    puts resp.body
         | 
| 126 128 | 
             
              #  end
         | 
| 127 129 | 
             
              #
         | 
| 128 | 
            -
              #  on : | 
| 130 | 
            +
              #  on :every do |a_url, resp, prior_url|
         | 
| 129 131 | 
             
              #    puts "Given this code: #{resp.code}"
         | 
| 130 132 | 
             
              #  end
         | 
| 131 133 | 
             
              def on(code, p = nil, &block)
         | 
| 132 134 | 
             
                f = p ? p : block
         | 
| 133 135 | 
             
                case code
         | 
| 134 136 | 
             
                when Fixnum
         | 
| 135 | 
            -
                  @callbacks[ | 
| 137 | 
            +
                  @callbacks[code] = f
         | 
| 136 138 | 
             
                else
         | 
| 137 | 
            -
                   | 
| 138 | 
            -
                    @callbacks[:any] = f
         | 
| 139 | 
            -
                  else
         | 
| 140 | 
            -
                    @callbacks[code.to_sym][:any] = f
         | 
| 141 | 
            -
                  end
         | 
| 139 | 
            +
                  @callbacks[code.to_sym] = f
         | 
| 142 140 | 
             
                end
         | 
| 143 141 | 
             
              end
         | 
| 144 142 |  | 
| 143 | 
            +
              # Run before the HTTP request. Given the URL as a string.
         | 
| 144 | 
            +
              #  setup do |a_url|
         | 
| 145 | 
            +
              #    headers['Cookies'] = 'user_id=1;admin=true'
         | 
| 146 | 
            +
              #  end
         | 
| 147 | 
            +
              def setup(p = nil, &block)
         | 
| 148 | 
            +
                @setup = p ? p : block
         | 
| 149 | 
            +
              end
         | 
| 150 | 
            +
             | 
| 151 | 
            +
              # Run last, once for each page. Given the URL as a string.
         | 
| 152 | 
            +
              def teardown(p = nil, &block)
         | 
| 153 | 
            +
                @teardown = p ? p : block
         | 
| 154 | 
            +
              end
         | 
| 155 | 
            +
             | 
| 156 | 
            +
              # Use like a hash:
         | 
| 157 | 
            +
              #  headers['Cookies'] = 'user_id=1;password=btrross3'
         | 
| 158 | 
            +
              def headers
         | 
| 159 | 
            +
                HeaderSetter.new(self)
         | 
| 160 | 
            +
              end
         | 
| 161 | 
            +
             | 
| 162 | 
            +
              def raw_headers #:nodoc:
         | 
| 163 | 
            +
                @headers
         | 
| 164 | 
            +
              end
         | 
| 165 | 
            +
              def raw_headers=(v) #:nodoc:
         | 
| 166 | 
            +
                @headers = v
         | 
| 167 | 
            +
              end
         | 
| 168 | 
            +
             | 
| 169 | 
            +
              # Reset the headers hash.
         | 
| 170 | 
            +
              def clear_headers
         | 
| 171 | 
            +
                @headers = {}
         | 
| 172 | 
            +
              end
         | 
| 173 | 
            +
             | 
| 145 174 | 
             
              def start! #:nodoc:
         | 
| 146 175 | 
             
                next_urls = @next_urls
         | 
| 147 176 | 
             
                begin
         | 
| 148 | 
            -
                   | 
| 149 | 
            -
             | 
| 150 | 
            -
             | 
| 151 | 
            -
             | 
| 152 | 
            -
             | 
| 153 | 
            -
             | 
| 154 | 
            -
             | 
| 155 | 
            -
             | 
| 156 | 
            -
             | 
| 157 | 
            -
             | 
| 177 | 
            +
                  tmp_n_u = {}
         | 
| 178 | 
            +
                  next_urls.each do |prior_url, urls|
         | 
| 179 | 
            +
                    urls.map do |a_url|
         | 
| 180 | 
            +
                      [a_url, (URI.parse(a_url) rescue nil)]
         | 
| 181 | 
            +
                    end.select do |a_url, parsed_url|
         | 
| 182 | 
            +
                      allowable_url?(a_url, parsed_url)
         | 
| 183 | 
            +
                    end.each do |a_url, parsed_url|
         | 
| 184 | 
            +
                      @setup.call(a_url) unless @setup.nil?
         | 
| 185 | 
            +
                      get_page(parsed_url) do |response|
         | 
| 186 | 
            +
                        do_callbacks(a_url, response, prior_url)
         | 
| 187 | 
            +
                        tmp_n_u[a_url] = generate_next_urls(a_url, response)
         | 
| 188 | 
            +
                      end
         | 
| 189 | 
            +
                      @teardown.call(a_url) unless @teardown.nil?
         | 
| 190 | 
            +
                    end
         | 
| 191 | 
            +
                  end
         | 
| 192 | 
            +
                  next_urls = tmp_n_u
         | 
| 158 193 | 
             
                end while !next_urls.empty?
         | 
| 159 194 | 
             
              end
         | 
| 160 195 |  | 
| @@ -196,7 +231,8 @@ class SpiderInstance | |
| 196 231 | 
             
                  http = Net::HTTP.new(parsed_url.host, parsed_url.port)
         | 
| 197 232 | 
             
                  http.use_ssl = parsed_url.scheme == 'https'
         | 
| 198 233 | 
             
                  # Uses start because http.finish cannot be called.
         | 
| 199 | 
            -
                  r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri | 
| 234 | 
            +
                  r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri,
         | 
| 235 | 
            +
                                                                   @headers))}
         | 
| 200 236 | 
             
                  if r.redirect?
         | 
| 201 237 | 
             
                    get_page(URI.parse(r['Location']), &block)
         | 
| 202 238 | 
             
                  else
         | 
| @@ -208,16 +244,13 @@ class SpiderInstance | |
| 208 244 | 
             
                end
         | 
| 209 245 | 
             
              end
         | 
| 210 246 |  | 
| 211 | 
            -
              def do_callbacks(a_url, resp) #:nodoc:
         | 
| 212 | 
            -
                 | 
| 213 | 
            -
             | 
| 214 | 
            -
                   | 
| 215 | 
            -
             | 
| 216 | 
            -
             | 
| 217 | 
            -
             | 
| 218 | 
            -
                  cb_branch = @callbacks[:failure]
         | 
| 219 | 
            -
                  cb_branch[:any].call(a_url, resp.code) if cb_branch[:any]
         | 
| 220 | 
            -
                  cb_branch[resp.code].call(a_url, resp.code) if cb_branch[resp.code]
         | 
| 247 | 
            +
              def do_callbacks(a_url, resp, prior_url) #:nodoc:
         | 
| 248 | 
            +
                cbs = [@callbacks[:every],
         | 
| 249 | 
            +
                  resp.success? ?  @callbacks[:success] : @callbacks[:failure],
         | 
| 250 | 
            +
                  @callbacks[resp.code]]
         | 
| 251 | 
            +
             | 
| 252 | 
            +
                cbs.each do |cb|
         | 
| 253 | 
            +
                  cb.call(a_url, resp, prior_url) if cb
         | 
| 221 254 | 
             
                end
         | 
| 222 255 | 
             
              end
         | 
| 223 256 |  | 
| @@ -254,7 +287,16 @@ class SpiderInstance | |
| 254 287 | 
             
                end.compact
         | 
| 255 288 | 
             
              end
         | 
| 256 289 |  | 
| 257 | 
            -
              def remove_trailing_slash(s)
         | 
| 290 | 
            +
              def remove_trailing_slash(s) #:nodoc:
         | 
| 258 291 | 
             
                s.sub(%r{/*$},'')
         | 
| 259 292 | 
             
              end
         | 
| 293 | 
            +
             | 
| 294 | 
            +
              class HeaderSetter #:nodoc:
         | 
| 295 | 
            +
                def initialize(si)
         | 
| 296 | 
            +
                  @si = si
         | 
| 297 | 
            +
                end
         | 
| 298 | 
            +
                def []=(k,v)
         | 
| 299 | 
            +
                  @si.raw_headers = @si.raw_headers.merge({k => v})
         | 
| 300 | 
            +
                end
         | 
| 301 | 
            +
              end
         | 
| 260 302 | 
             
            end
         | 
| @@ -23,6 +23,80 @@ def null_logger | |
| 23 23 | 
             
            end
         | 
| 24 24 |  | 
| 25 25 | 
             
            describe 'SpiderInstance' do
         | 
| 26 | 
            +
              it 'should call the "setup" callback before loading the Web page' do
         | 
| 27 | 
            +
                mock_successful_http
         | 
| 28 | 
            +
                @on_called = false
         | 
| 29 | 
            +
                @before_called = false
         | 
| 30 | 
            +
                si = SpiderInstance.new({nil => ['http://example.com/']})
         | 
| 31 | 
            +
                si.stubs(:allowed?).returns(true)
         | 
| 32 | 
            +
                si.stubs(:generate_next_urls).returns([])
         | 
| 33 | 
            +
                si.setup       { |*a| @before_called = Time.now }
         | 
| 34 | 
            +
                si.on(:every)  { |*a| @on_called = Time.now }
         | 
| 35 | 
            +
                si.start!
         | 
| 36 | 
            +
                @on_called.should_not be_false
         | 
| 37 | 
            +
                @before_called.should_not be_false
         | 
| 38 | 
            +
                @before_called.should_not be_false
         | 
| 39 | 
            +
                @before_called.should < @on_called
         | 
| 40 | 
            +
              end
         | 
| 41 | 
            +
             | 
| 42 | 
            +
              it 'should call the "teardown" callback after running all other callbacks' do
         | 
| 43 | 
            +
                mock_successful_http
         | 
| 44 | 
            +
                @on_called = false
         | 
| 45 | 
            +
                @after_called = false
         | 
| 46 | 
            +
                si = SpiderInstance.new({nil => ['http://example.com/']})
         | 
| 47 | 
            +
                si.stubs(:allowed?).returns(true)
         | 
| 48 | 
            +
                si.stubs(:generate_next_urls).returns([])
         | 
| 49 | 
            +
                si.on(:every)  { |*a| @on_called = Time.now }
         | 
| 50 | 
            +
                si.teardown    { |*a| @after_called = Time.now }
         | 
| 51 | 
            +
                si.start!
         | 
| 52 | 
            +
                @on_called.should_not be_false
         | 
| 53 | 
            +
                @after_called.should_not be_false
         | 
| 54 | 
            +
                @after_called.should_not be_false
         | 
| 55 | 
            +
                @after_called.should > @on_called
         | 
| 56 | 
            +
              end
         | 
| 57 | 
            +
             | 
| 58 | 
            +
              it 'should pass headers set by a setup handler to the HTTP request' do
         | 
| 59 | 
            +
                mock_successful_http
         | 
| 60 | 
            +
                Net::HTTP::Get.expects(:new).with('/foo',{'X-Header-Set' => 'True'})
         | 
| 61 | 
            +
                si = SpiderInstance.new(nil => ['http://example.com/foo'])
         | 
| 62 | 
            +
                si.stubs(:allowable_url?).returns(true)
         | 
| 63 | 
            +
                si.stubs(:generate_next_urls).returns([])
         | 
| 64 | 
            +
                si.setup do |a_url|
         | 
| 65 | 
            +
                  si.headers['X-Header-Set'] = 'True'
         | 
| 66 | 
            +
                end
         | 
| 67 | 
            +
                si.teardown do |a_url|
         | 
| 68 | 
            +
                  si.clear_headers
         | 
| 69 | 
            +
                end
         | 
| 70 | 
            +
                si.start!
         | 
| 71 | 
            +
              end
         | 
| 72 | 
            +
             | 
| 73 | 
            +
              it 'should allow for a proxy' # fill in more
         | 
| 74 | 
            +
             | 
| 75 | 
            +
              it 'should call the :every callback with the current URL, the response, and the prior URL' do
         | 
| 76 | 
            +
                mock_successful_http
         | 
| 77 | 
            +
                callback_arguments_on(:every)
         | 
| 78 | 
            +
              end
         | 
| 79 | 
            +
             | 
| 80 | 
            +
              it 'should call the :success callback with the current URL, the request, and the prior URL' do
         | 
| 81 | 
            +
                mock_successful_http
         | 
| 82 | 
            +
                callback_arguments_on(:success)
         | 
| 83 | 
            +
              end
         | 
| 84 | 
            +
             | 
| 85 | 
            +
              it 'should call the :failure callback with the current URL, the request, and the prior URL' do
         | 
| 86 | 
            +
                mock_failed_http
         | 
| 87 | 
            +
                callback_arguments_on(:failure)
         | 
| 88 | 
            +
              end
         | 
| 89 | 
            +
             | 
| 90 | 
            +
              it 'should call the HTTP status error code callback with the current URL, the request, and the prior URL' do
         | 
| 91 | 
            +
                mock_failed_http
         | 
| 92 | 
            +
                callback_arguments_on(404)
         | 
| 93 | 
            +
              end
         | 
| 94 | 
            +
             | 
| 95 | 
            +
              it 'should call the HTTP status success code callback with the current URL, the request, and the prior URL' do
         | 
| 96 | 
            +
                mock_successful_http
         | 
| 97 | 
            +
                callback_arguments_on(200)
         | 
| 98 | 
            +
              end
         | 
| 99 | 
            +
             | 
| 26 100 | 
             
              # Bug reported by John Nagro, using the example source http://eons.com/
         | 
| 27 101 | 
             
              # had to change line 192; uses request_uri now instead of path.
         | 
| 28 102 | 
             
              it 'should handle query URLs without a path' do
         | 
| @@ -33,7 +107,7 @@ describe 'SpiderInstance' do | |
| 33 107 | 
             
                                                 :AccessLog => [])
         | 
| 34 108 | 
             
                server.mount('/', QueryServlet)
         | 
| 35 109 | 
             
                Thread.new {server.start}
         | 
| 36 | 
            -
                si = SpiderInstance.new([u])
         | 
| 110 | 
            +
                si = SpiderInstance.new({nil => [u]})
         | 
| 37 111 | 
             
                si.get_page(u_p) do
         | 
| 38 112 | 
             
                  @block_called = true
         | 
| 39 113 | 
             
                end
         | 
| @@ -47,7 +121,7 @@ describe 'SpiderInstance' do | |
| 47 121 | 
             
                u_p = URI.parse(u)
         | 
| 48 122 | 
             
                @redirect_handled = false
         | 
| 49 123 | 
             
                mock_redirect_http
         | 
| 50 | 
            -
                si = SpiderInstance.new([u])
         | 
| 124 | 
            +
                si = SpiderInstance.new({nil => [u]})
         | 
| 51 125 | 
             
                si.get_page(u_p) do
         | 
| 52 126 | 
             
                  @redirect_handled = true
         | 
| 53 127 | 
             
                end
         | 
| @@ -66,7 +140,7 @@ describe 'SpiderInstance' do | |
| 66 140 | 
             
                                                 :SSLComment => 'Comment of some sort')
         | 
| 67 141 | 
             
                server.mount('/', QueryServlet)
         | 
| 68 142 | 
             
                Thread.new {server.start}
         | 
| 69 | 
            -
                si = SpiderInstance.new([u])
         | 
| 143 | 
            +
                si = SpiderInstance.new({nil => [u]})
         | 
| 70 144 | 
             
                si.get_page(u_p) { @page_called = true }
         | 
| 71 145 | 
             
                server.shutdown
         | 
| 72 146 | 
             
                @page_called.should be_true
         | 
| @@ -79,7 +153,7 @@ describe 'SpiderInstance' do | |
| 79 153 | 
             
                u_p = URI.parse(u)
         | 
| 80 154 | 
             
                http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
         | 
| 81 155 | 
             
                Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
         | 
| 82 | 
            -
                si = SpiderInstance.new([u])
         | 
| 156 | 
            +
                si = SpiderInstance.new({nil => [u]})
         | 
| 83 157 | 
             
                si.expects(:allowable_url?).with(u, u_p).returns(false)
         | 
| 84 158 | 
             
                si.expects(:get_page).times(0)
         | 
| 85 159 | 
             
                si.start!
         | 
| @@ -90,9 +164,8 @@ describe 'SpiderInstance' do | |
| 90 164 | 
             
                u_p = URI.parse(u)
         | 
| 91 165 | 
             
                http_resp = stub(:redirect? => false, :success? => true, :code => 200, :headers => 1, :body => 1)
         | 
| 92 166 | 
             
                Net::HTTP.stubs(:new).returns(stub(:request => http_resp, :finish => nil))
         | 
| 93 | 
            -
                si = SpiderInstance.new([u])
         | 
| 167 | 
            +
                si = SpiderInstance.new({nil => [u]})
         | 
| 94 168 | 
             
                si.expects(:allowable_url?).with(u, u_p).returns(true)
         | 
| 95 | 
            -
                si.expects(:allowable_url?).with(nil, nil).returns(false)
         | 
| 96 169 | 
             
                si.expects(:get_page).with(URI.parse(u))
         | 
| 97 170 | 
             
                si.start!
         | 
| 98 171 | 
             
              end
         | 
| @@ -106,25 +179,25 @@ describe 'SpiderInstance' do | |
| 106 179 | 
             
                robot_rules.expects(:parse).with('http://example.com:80/robots.txt',
         | 
| 107 180 | 
             
                                                 'robots.txt content')
         | 
| 108 181 | 
             
                robot_rules.expects(:allowed?).with('http://example.com/').returns(false)
         | 
| 109 | 
            -
                si = SpiderInstance.new(['http://example.com/'], [], robot_rules, [])
         | 
| 182 | 
            +
                si = SpiderInstance.new({nil => ['http://example.com/']}, [], robot_rules, [])
         | 
| 110 183 | 
             
                allowable = si.allowable_url?('http://example.com/',
         | 
| 111 184 | 
             
                                              URI.parse('http://example.com/'))
         | 
| 112 | 
            -
                allowable.should  | 
| 185 | 
            +
                allowable.should be_false
         | 
| 113 186 | 
             
              end
         | 
| 114 187 |  | 
| 115 188 | 
             
              it 'should disallow URLs when they fail any url_check' do
         | 
| 116 | 
            -
                si = SpiderInstance.new(['http://example.com/'])
         | 
| 189 | 
            +
                si = SpiderInstance.new({nil => ['http://example.com/']})
         | 
| 117 190 | 
             
                si.stubs(:allowed?).returns(true)
         | 
| 118 191 | 
             
                si.add_url_check { |a_url| false }
         | 
| 119 192 | 
             
                allowable = si.allowable_url?('http://example.com/',
         | 
| 120 193 | 
             
                                              URI.parse('http://example.com/'))
         | 
| 121 | 
            -
                allowable.should  | 
| 194 | 
            +
                allowable.should be_false
         | 
| 122 195 | 
             
              end
         | 
| 123 196 |  | 
| 124 197 | 
             
              it 'should support multiple url_checks' do
         | 
| 125 198 | 
             
                @first_url_check = false
         | 
| 126 199 | 
             
                @second_url_check = false
         | 
| 127 | 
            -
                si = SpiderInstance.new(['http://example.com/'])
         | 
| 200 | 
            +
                si = SpiderInstance.new({nil => ['http://example.com/']})
         | 
| 128 201 | 
             
                si.stubs(:allowed?).returns(true)
         | 
| 129 202 | 
             
                si.add_url_check do |a_url|
         | 
| 130 203 | 
             
                  @first_url_check = true
         | 
| @@ -136,7 +209,7 @@ describe 'SpiderInstance' do | |
| 136 209 | 
             
                end
         | 
| 137 210 | 
             
                allowable = si.allowable_url?('http://example.com/',
         | 
| 138 211 | 
             
                                              URI.parse('http://example.com/'))
         | 
| 139 | 
            -
                allowable.should  | 
| 212 | 
            +
                allowable.should be_false
         | 
| 140 213 | 
             
                @first_url_check.should be_true
         | 
| 141 214 | 
             
                @second_url_check.should be_true
         | 
| 142 215 | 
             
              end
         | 
| @@ -144,17 +217,17 @@ describe 'SpiderInstance' do | |
| 144 217 | 
             
              it 'should avoid cycles' do
         | 
| 145 218 | 
             
                u = 'http://example.com/'
         | 
| 146 219 | 
             
                u_p = URI.parse(u)
         | 
| 147 | 
            -
                si = SpiderInstance.new([u], [u_p])
         | 
| 220 | 
            +
                si = SpiderInstance.new({nil => [u]}, [u_p])
         | 
| 148 221 | 
             
                si.stubs(:allowed?).returns(true)
         | 
| 149 222 | 
             
                allowable = si.allowable_url?(u, u_p)
         | 
| 150 | 
            -
                allowable.should  | 
| 223 | 
            +
                allowable.should be_false
         | 
| 151 224 | 
             
                u_p.should_not be_nil
         | 
| 152 225 | 
             
              end
         | 
| 153 226 |  | 
| 154 227 | 
             
              it 'should call the 404 handler for 404s' do
         | 
| 155 228 | 
             
                @proc_called = false
         | 
| 156 229 | 
             
                mock_failed_http
         | 
| 157 | 
            -
                si = SpiderInstance.new(['http://example.com/'])
         | 
| 230 | 
            +
                si = SpiderInstance.new({nil => ['http://example.com/']})
         | 
| 158 231 | 
             
                si.stubs(:allowed?).returns(true)
         | 
| 159 232 | 
             
                si.stubs(:generate_next_urls).returns([])
         | 
| 160 233 | 
             
                si.on(404) {|*a| @proc_called = true}
         | 
| @@ -165,7 +238,7 @@ describe 'SpiderInstance' do | |
| 165 238 | 
             
              it 'should call the :success handler on success' do
         | 
| 166 239 | 
             
                @proc_called = false
         | 
| 167 240 | 
             
                mock_successful_http
         | 
| 168 | 
            -
                si = SpiderInstance.new(['http://example.com/'])
         | 
| 241 | 
            +
                si = SpiderInstance.new({nil => ['http://example.com/']})
         | 
| 169 242 | 
             
                si.stubs(:allowed?).returns(true)
         | 
| 170 243 | 
             
                si.stubs(:generate_next_urls).returns([])
         | 
| 171 244 | 
             
                si.on(:success) {|*a| @proc_called = true}
         | 
| @@ -176,19 +249,19 @@ describe 'SpiderInstance' do | |
| 176 249 | 
             
              it 'should not call the :success handler on failure' do
         | 
| 177 250 | 
             
                @proc_called = false
         | 
| 178 251 | 
             
                mock_failed_http
         | 
| 179 | 
            -
                si = SpiderInstance.new(['http://example.com/'])
         | 
| 252 | 
            +
                si = SpiderInstance.new({nil => ['http://example.com/']})
         | 
| 180 253 | 
             
                si.stubs(:allowed?).returns(true)
         | 
| 181 254 | 
             
                si.stubs(:generate_next_urls).returns([])
         | 
| 182 255 | 
             
                si.on(:success) {|*a| @proc_called = true}
         | 
| 183 256 | 
             
                si.start!
         | 
| 184 | 
            -
                @proc_called.should  | 
| 257 | 
            +
                @proc_called.should be_false
         | 
| 185 258 | 
             
              end
         | 
| 186 259 |  | 
| 187 260 | 
             
              it 'should call the :success handler and the 200 handler on 200' do
         | 
| 188 261 | 
             
                @proc_200_called = false
         | 
| 189 262 | 
             
                @proc_success_called = false
         | 
| 190 263 | 
             
                mock_successful_http
         | 
| 191 | 
            -
                si = SpiderInstance.new(['http://example.com/'])
         | 
| 264 | 
            +
                si = SpiderInstance.new({nil => ['http://example.com/']})
         | 
| 192 265 | 
             
                si.stubs(:allowed?).returns(true)
         | 
| 193 266 | 
             
                si.stubs(:generate_next_urls).returns([])
         | 
| 194 267 | 
             
                si.on(:success) {|*a| @proc_success_called = true}
         | 
| @@ -201,18 +274,18 @@ describe 'SpiderInstance' do | |
| 201 274 | 
             
              it 'should not call the :failure handler on success' do
         | 
| 202 275 | 
             
                @proc_called = false
         | 
| 203 276 | 
             
                mock_successful_http
         | 
| 204 | 
            -
                si = SpiderInstance.new(['http://example.com/'])
         | 
| 277 | 
            +
                si = SpiderInstance.new({nil => ['http://example.com/']})
         | 
| 205 278 | 
             
                si.stubs(:allowed?).returns(true)
         | 
| 206 279 | 
             
                si.stubs(:generate_next_urls).returns([])
         | 
| 207 280 | 
             
                si.on(:failure) {|*a| @proc_called = true}
         | 
| 208 281 | 
             
                si.start!
         | 
| 209 | 
            -
                @proc_called.should  | 
| 282 | 
            +
                @proc_called.should be_false
         | 
| 210 283 | 
             
              end
         | 
| 211 284 |  | 
| 212 285 | 
             
              it 'should call the :failure handler on failure' do
         | 
| 213 286 | 
             
                @proc_called = false
         | 
| 214 287 | 
             
                mock_failed_http
         | 
| 215 | 
            -
                si = SpiderInstance.new(['http://example.com/'])
         | 
| 288 | 
            +
                si = SpiderInstance.new({nil => ['http://example.com/']})
         | 
| 216 289 | 
             
                si.stubs(:allowed?).returns(true)
         | 
| 217 290 | 
             
                si.stubs(:generate_next_urls).returns([])
         | 
| 218 291 | 
             
                si.on(:failure) {|*a| @proc_called = true}
         | 
| @@ -224,7 +297,7 @@ describe 'SpiderInstance' do | |
| 224 297 | 
             
                @proc_404_called = false
         | 
| 225 298 | 
             
                @proc_failure_called = false
         | 
| 226 299 | 
             
                mock_failed_http
         | 
| 227 | 
            -
                si = SpiderInstance.new(['http://example.com/'])
         | 
| 300 | 
            +
                si = SpiderInstance.new({nil => ['http://example.com/']})
         | 
| 228 301 | 
             
                si.stubs(:allowed?).returns(true)
         | 
| 229 302 | 
             
                si.stubs(:generate_next_urls).returns([])
         | 
| 230 303 | 
             
                si.on(:failure) {|*a| @proc_failure_called = true}
         | 
| @@ -234,13 +307,13 @@ describe 'SpiderInstance' do | |
| 234 307 | 
             
                @proc_failure_called.should be_true
         | 
| 235 308 | 
             
              end
         | 
| 236 309 |  | 
| 237 | 
            -
              it 'should call the : | 
| 310 | 
            +
              it 'should call the :every handler even when a handler for the error code is defined' do
         | 
| 238 311 | 
             
                @any_called = false
         | 
| 239 312 | 
             
                mock_successful_http
         | 
| 240 | 
            -
                si = SpiderInstance.new(['http://example.com/'])
         | 
| 313 | 
            +
                si = SpiderInstance.new({nil => ['http://example.com/']})
         | 
| 241 314 | 
             
                si.stubs(:allowed?).returns(true)
         | 
| 242 315 | 
             
                si.stubs(:generate_next_urls).returns([])
         | 
| 243 | 
            -
                si.on(: | 
| 316 | 
            +
                si.on(:every) { |*a| @any_called = true }
         | 
| 244 317 | 
             
                si.on(202) {|*a|}
         | 
| 245 318 | 
             
                si.start!
         | 
| 246 319 | 
             
                @any_called.should be_true
         | 
| @@ -249,10 +322,10 @@ describe 'SpiderInstance' do | |
| 249 322 | 
             
              it 'should support a block as a response handler' do
         | 
| 250 323 | 
             
                @proc_called = false
         | 
| 251 324 | 
             
                mock_successful_http
         | 
| 252 | 
            -
                si = SpiderInstance.new(['http://example.com/'])
         | 
| 325 | 
            +
                si = SpiderInstance.new({nil => ['http://example.com/']})
         | 
| 253 326 | 
             
                si.stubs(:allowed?).returns(true)
         | 
| 254 327 | 
             
                si.stubs(:generate_next_urls).returns([])
         | 
| 255 | 
            -
                si.on(: | 
| 328 | 
            +
                si.on(:every) { |*a| @proc_called = true }
         | 
| 256 329 | 
             
                si.start!
         | 
| 257 330 | 
             
                @proc_called.should be_true
         | 
| 258 331 | 
             
              end
         | 
| @@ -260,10 +333,10 @@ describe 'SpiderInstance' do | |
| 260 333 | 
             
              it 'should support a proc as a response handler' do
         | 
| 261 334 | 
             
                @proc_called = false
         | 
| 262 335 | 
             
                mock_successful_http
         | 
| 263 | 
            -
                si = SpiderInstance.new(['http://example.com/'])
         | 
| 336 | 
            +
                si = SpiderInstance.new({nil => ['http://example.com/']})
         | 
| 264 337 | 
             
                si.stubs(:allowed?).returns(true)
         | 
| 265 338 | 
             
                si.stubs(:generate_next_urls).returns([])
         | 
| 266 | 
            -
                si.on(: | 
| 339 | 
            +
                si.on(:every, Proc.new { |*a| @proc_called = true })
         | 
| 267 340 | 
             
                si.start!
         | 
| 268 341 | 
             
                @proc_called.should be_true
         | 
| 269 342 | 
             
              end
         | 
| @@ -297,4 +370,16 @@ describe 'SpiderInstance' do | |
| 297 370 | 
             
                  yields(mock(:request => http_req2)).returns(http_req2)
         | 
| 298 371 | 
             
                Net::HTTP.expects(:new).times(2).returns(http_obj).then.returns(http_obj2)
         | 
| 299 372 | 
             
              end
         | 
| 373 | 
            +
             | 
| 374 | 
            +
              def callback_arguments_on(code)
         | 
| 375 | 
            +
                si = SpiderInstance.new('http://foo.com/' => ['http://example.com/'])
         | 
| 376 | 
            +
                si.stubs(:allowed?).returns(true)
         | 
| 377 | 
            +
                si.stubs(:generate_next_urls).returns([])
         | 
| 378 | 
            +
                si.on(code) do |a_url, resp, prior_url|
         | 
| 379 | 
            +
                  a_url.should == 'http://example.com/'
         | 
| 380 | 
            +
                  resp.should_not be_nil
         | 
| 381 | 
            +
                  prior_url.should == 'http://foo.com/'
         | 
| 382 | 
            +
                end
         | 
| 383 | 
            +
                si.start!
         | 
| 384 | 
            +
              end
         | 
| 300 385 | 
             
            end
         |