spidr 0.1.8 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +15 -0
- data/README.txt +1 -0
- data/Rakefile +2 -1
- data/lib/spidr/agent.rb +47 -6
- data/lib/spidr/page.rb +2 -2
- data/lib/spidr/version.rb +1 -1
- data.tar.gz.sig +0 -0
- metadata +33 -7
- metadata.gz.sig +0 -0
    
        data/History.txt
    CHANGED
    
    | @@ -1,3 +1,18 @@ | |
| 1 | 
            +
            === 0.1.9 / 2009-06-13
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            * Upgraded to Hoe 2.0.0.
         | 
| 4 | 
            +
              * Use Hoe.spec instead of Hoe.new.
         | 
| 5 | 
            +
              * Use the Hoe signing task for signed gems.
         | 
| 6 | 
            +
            * Added the Agent#schemes and Agent#schemes= methods.
         | 
| 7 | 
            +
            * Added a warning message if 'net/https' cannot be loaded.
         | 
| 8 | 
            +
            * Allow the list of acceptable URL schemes to be passed into Agent.new.
         | 
| 9 | 
            +
            * Allow history and queue information to be passed into Agent.new.
         | 
| 10 | 
            +
            * Agent#start_at no longer clears the history or the queue.
         | 
| 11 | 
            +
            * Fixed a bug in the sanitization of semi-escaped URLs.
         | 
| 12 | 
            +
            * Fixed a bug where https URLs would be followed even if 'net/https'
         | 
| 13 | 
            +
              could not be loaded.
         | 
| 14 | 
            +
            * Removed Agent::SCHEMES.
         | 
| 15 | 
            +
             | 
| 1 16 | 
             
            === 0.1.8 / 2009-05-27
         | 
| 2 17 |  | 
| 3 18 | 
             
            * Added the Agent#pause! and Agent#continue! methods.
         | 
    
        data/README.txt
    CHANGED
    
    
    
        data/Rakefile
    CHANGED
    
    | @@ -2,11 +2,12 @@ | |
| 2 2 |  | 
| 3 3 | 
             
            require 'rubygems'
         | 
| 4 4 | 
             
            require 'hoe'
         | 
| 5 | 
            +
            require 'hoe/signing'
         | 
| 5 6 | 
             
            require './tasks/spec.rb'
         | 
| 6 7 | 
             
            require './tasks/course.rb'
         | 
| 7 8 | 
             
            require './lib/spidr/version.rb'
         | 
| 8 9 |  | 
| 9 | 
            -
            Hoe. | 
| 10 | 
            +
            Hoe.spec('spidr') do |p|
         | 
| 10 11 | 
             
              p.rubyforge_name = 'spidr'
         | 
| 11 12 | 
             
              p.developer('Postmodern', 'postmodern.mod3@gmail.com')
         | 
| 12 13 | 
             
              p.remote_rdoc_dir = 'docs'
         | 
    
        data/lib/spidr/agent.rb
    CHANGED
    
    | @@ -7,9 +7,6 @@ require 'net/http' | |
| 7 7 | 
             
            module Spidr
         | 
| 8 8 | 
             
              class Agent
         | 
| 9 9 |  | 
| 10 | 
            -
                # URL schemes to visit
         | 
| 11 | 
            -
                SCHEMES = ['http', 'https']
         | 
| 12 | 
            -
             | 
| 13 10 | 
             
                # Proxy to use
         | 
| 14 11 | 
             
                attr_accessor :proxy
         | 
| 15 12 |  | 
| @@ -22,6 +19,9 @@ module Spidr | |
| 22 19 | 
             
                # Delay in between fetching pages
         | 
| 23 20 | 
             
                attr_accessor :delay
         | 
| 24 21 |  | 
| 22 | 
            +
                # List of acceptable URL schemes to follow
         | 
| 23 | 
            +
                attr_reader :schemes
         | 
| 24 | 
            +
             | 
| 25 25 | 
             
                # History containing visited URLs
         | 
| 26 26 | 
             
                attr_reader :history
         | 
| 27 27 |  | 
| @@ -42,6 +42,10 @@ module Spidr | |
| 42 42 | 
             
                # <tt>:referer</tt>:: The referer URL to send.
         | 
| 43 43 | 
             
                # <tt>:delay</tt>:: Duration in seconds to pause between spidering each
         | 
| 44 44 | 
             
                #                   link. Defaults to 0.
         | 
| 45 | 
            +
                # <tt>:schemes</tt>:: The list of acceptable URL schemes to follow.
         | 
| 46 | 
            +
                #                     Defaults to +http+ and +https+. +https+ URL
         | 
| 47 | 
            +
                #                     schemes will be ignored if <tt>net/https</tt>
         | 
| 48 | 
            +
                #                     cannot be loaded.
         | 
| 45 49 | 
             
                # <tt>:host</tt>:: The host-name to visit.
         | 
| 46 50 | 
             
                # <tt>:hosts</tt>:: An +Array+ of host patterns to visit.
         | 
| 47 51 | 
             
                # <tt>:ignore_hosts</tt>:: An +Array+ of host patterns to not visit.
         | 
| @@ -52,12 +56,32 @@ module Spidr | |
| 52 56 | 
             
                # <tt>:exts</tt>:: An +Array+ of File extension patterns to visit.
         | 
| 53 57 | 
             
                # <tt>:ignore_exts</tt>:: An +Array+ of File extension patterns to not
         | 
| 54 58 | 
             
                #                         visit.
         | 
| 59 | 
            +
                # <tt>:queue</tt>:: An initial queue of URLs to visit.
         | 
| 60 | 
            +
                # <tt>:history</tt>:: An initial list of visited URLs.
         | 
| 55 61 | 
             
                #
         | 
| 56 62 | 
             
                def initialize(options={},&block)
         | 
| 57 63 | 
             
                  @proxy = (options[:proxy] || Spidr.proxy)
         | 
| 58 64 | 
             
                  @user_agent = (options[:user_agent] || Spidr.user_agent)
         | 
| 59 65 | 
             
                  @referer = options[:referer]
         | 
| 60 66 |  | 
| 67 | 
            +
                  @schemes = []
         | 
| 68 | 
            +
             | 
| 69 | 
            +
                  if options[:schemes]
         | 
| 70 | 
            +
                    @schemes += options[:schemes]
         | 
| 71 | 
            +
                  else
         | 
| 72 | 
            +
                    @schemes << 'http'
         | 
| 73 | 
            +
             | 
| 74 | 
            +
                    begin
         | 
| 75 | 
            +
                      require 'net/https'
         | 
| 76 | 
            +
             | 
| 77 | 
            +
                      @schemes << 'https'
         | 
| 78 | 
            +
                    rescue Gem::LoadError => e
         | 
| 79 | 
            +
                      raise(e)
         | 
| 80 | 
            +
                    rescue ::LoadError
         | 
| 81 | 
            +
                      STDERR.puts "Warning: cannot load 'net/https', https support disabled"
         | 
| 82 | 
            +
                    end
         | 
| 83 | 
            +
                  end
         | 
| 84 | 
            +
             | 
| 61 85 | 
             
                  @host_rules = Rules.new(
         | 
| 62 86 | 
             
                    :accept => options[:hosts],
         | 
| 63 87 | 
             
                    :reject => options[:ignore_hosts]
         | 
| @@ -91,6 +115,14 @@ module Spidr | |
| 91 115 | 
             
                    visit_hosts_like(options[:host])
         | 
| 92 116 | 
             
                  end
         | 
| 93 117 |  | 
| 118 | 
            +
                  if options[:queue]
         | 
| 119 | 
            +
                    self.queue = options[:queue]
         | 
| 120 | 
            +
                  end
         | 
| 121 | 
            +
             | 
| 122 | 
            +
                  if options[:history]
         | 
| 123 | 
            +
                    self.history = options[:history]
         | 
| 124 | 
            +
                  end
         | 
| 125 | 
            +
             | 
| 94 126 | 
             
                  block.call(self) if block
         | 
| 95 127 | 
             
                end
         | 
| 96 128 |  | 
| @@ -361,10 +393,9 @@ module Spidr | |
| 361 393 | 
             
                end
         | 
| 362 394 |  | 
| 363 395 | 
             
                #
         | 
| 364 | 
            -
                #  | 
| 396 | 
            +
                # Start spidering at the specified _url_.
         | 
| 365 397 | 
             
                #
         | 
| 366 398 | 
             
                def start_at(url)
         | 
| 367 | 
            -
                  clear
         | 
| 368 399 | 
             
                  enqueue(url)
         | 
| 369 400 |  | 
| 370 401 | 
             
                  return continue!
         | 
| @@ -413,6 +444,16 @@ module Spidr | |
| 413 444 | 
             
                  return self
         | 
| 414 445 | 
             
                end
         | 
| 415 446 |  | 
| 447 | 
            +
                #
         | 
| 448 | 
            +
                # Sets the list of acceptable URL schemes to follow to the
         | 
| 449 | 
            +
                # _new_schemes_.
         | 
| 450 | 
            +
                #
         | 
| 451 | 
            +
                #   agent.schemes = ['http']
         | 
| 452 | 
            +
                #
         | 
| 453 | 
            +
                def schemes=(new_schemes)
         | 
| 454 | 
            +
                  @schemes = new_schemes.map { |scheme| scheme.to_s }
         | 
| 455 | 
            +
                end
         | 
| 456 | 
            +
             | 
| 416 457 | 
             
                #
         | 
| 417 458 | 
             
                # Sets the history of links that were previously visited to the
         | 
| 418 459 | 
             
                # specified _new_history_.
         | 
| @@ -575,7 +616,7 @@ module Spidr | |
| 575 616 | 
             
                #
         | 
| 576 617 | 
             
                def visit_scheme?(url)
         | 
| 577 618 | 
             
                  if url.scheme
         | 
| 578 | 
            -
                    return  | 
| 619 | 
            +
                    return @schemes.include?(url.scheme)
         | 
| 579 620 | 
             
                  else
         | 
| 580 621 | 
             
                    return true
         | 
| 581 622 | 
             
                  end
         | 
    
        data/lib/spidr/page.rb
    CHANGED
    
    | @@ -252,8 +252,8 @@ module Spidr | |
| 252 252 | 
             
                # based on the url of the page.
         | 
| 253 253 | 
             
                #
         | 
| 254 254 | 
             
                def to_absolute(link)
         | 
| 255 | 
            -
                  # clean the  | 
| 256 | 
            -
                  link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
         | 
| 255 | 
            +
                  # decode, clean then re-encode the URL
         | 
| 256 | 
            +
                  link = URI.encode(URI.decode(link.to_s).gsub(/#[a-zA-Z0-9_-]*$/,''))
         | 
| 257 257 |  | 
| 258 258 | 
             
                  begin
         | 
| 259 259 | 
             
                    relative = URI(link)
         | 
    
        data/lib/spidr/version.rb
    CHANGED
    
    
    
        data.tar.gz.sig
    ADDED
    
    | Binary file | 
    
        metadata
    CHANGED
    
    | @@ -1,15 +1,36 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification 
         | 
| 2 2 | 
             
            name: spidr
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version 
         | 
| 4 | 
            -
              version: 0.1. | 
| 4 | 
            +
              version: 0.1.9
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors: 
         | 
| 7 7 | 
             
            - Postmodern
         | 
| 8 8 | 
             
            autorequire: 
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 | 
            -
            cert_chain:  | 
| 10 | 
            +
            cert_chain: 
         | 
| 11 | 
            +
            - |
         | 
| 12 | 
            +
              -----BEGIN CERTIFICATE-----
         | 
| 13 | 
            +
              MIIDQDCCAiigAwIBAgIBADANBgkqhkiG9w0BAQUFADBGMRgwFgYDVQQDDA9wb3N0
         | 
| 14 | 
            +
              bW9kZXJuLm1vZDMxFTATBgoJkiaJk/IsZAEZFgVnbWFpbDETMBEGCgmSJomT8ixk
         | 
| 15 | 
            +
              ARkWA2NvbTAeFw0wOTA2MDMwNDU5MDNaFw0xMDA2MDMwNDU5MDNaMEYxGDAWBgNV
         | 
| 16 | 
            +
              BAMMD3Bvc3Rtb2Rlcm4ubW9kMzEVMBMGCgmSJomT8ixkARkWBWdtYWlsMRMwEQYK
         | 
| 17 | 
            +
              CZImiZPyLGQBGRYDY29tMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA
         | 
| 18 | 
            +
              1wvANkTDHFgVih5XLjuTwTZjgBq1lBGybXJiH6Id1lY2JOMqM5FB1DDHVvvij94i
         | 
| 19 | 
            +
              mJabN0zkzu6VKWC70y0IwOxY7CPokr0eFdK/D0y7mCq1P8QITv76i2YqAl0eYqIt
         | 
| 20 | 
            +
              W+IhIkANQ7E6uMZIZcdnfadC6lPAtlKkqtd9crvRbFgr6e3kyflmohbRnTEJHoRd
         | 
| 21 | 
            +
              7SHHsybE6DSn7oTDs6XBTNrNIn5VfZA0z01eeos/+zBm1zKJOK2+/7xtLLDuDU9G
         | 
| 22 | 
            +
              +Rd+ltUBbvxUrMNZmDG29pnmN2xTRH+Q8HxD2AxlvM5SRpK6OeZaHV7PaCCAVZ4L
         | 
| 23 | 
            +
              T9BFl1sfMvRlABeGEkSyuQIDAQABozkwNzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIE
         | 
| 24 | 
            +
              sDAdBgNVHQ4EFgQUKwsd+PqEYmBvyaTyoL+uRuk+PhEwDQYJKoZIhvcNAQEFBQAD
         | 
| 25 | 
            +
              ggEBAB4TvHsrlbcXcKg6gX5BIb9tI+zGkpzo0Z7jnxMEcNO7NGGwmzafDBI/xZYv
         | 
| 26 | 
            +
              xkRH3/HXbGGYDOi6Q6gWt5GujSx0bOImDtYTJTH8jnzN92HzEK5WdScm1QpZKF1e
         | 
| 27 | 
            +
              cezArMbxbSPaosxTCtG6LQTkE28lFQsmFZ5xzouugS4h5+LVJiVMmiP+l3EfkjFa
         | 
| 28 | 
            +
              GOURU+rNEMPWo8MCWivGW7jes6BMzWHcW7DQ0scNVmIcCIgdyMmpscuAEOSeghy9
         | 
| 29 | 
            +
              /fFs57Ey2OXBL55nDOyvN/ZQ2Vab05UH4t+GCxjAPeirzL/29FBtePT6VD44c38j
         | 
| 30 | 
            +
              pDj+ws7QjtH/Qcrr1l9jfN0ehDs=
         | 
| 31 | 
            +
              -----END CERTIFICATE-----
         | 
| 11 32 |  | 
| 12 | 
            -
            date: 2009- | 
| 33 | 
            +
            date: 2009-06-13 00:00:00 -07:00
         | 
| 13 34 | 
             
            default_executable: 
         | 
| 14 35 | 
             
            dependencies: 
         | 
| 15 36 | 
             
            - !ruby/object:Gem::Dependency 
         | 
| @@ -30,9 +51,12 @@ dependencies: | |
| 30 51 | 
             
                requirements: 
         | 
| 31 52 | 
             
                - - ">="
         | 
| 32 53 | 
             
                  - !ruby/object:Gem::Version 
         | 
| 33 | 
            -
                    version:  | 
| 54 | 
            +
                    version: 2.0.0
         | 
| 34 55 | 
             
                version: 
         | 
| 35 | 
            -
            description:  | 
| 56 | 
            +
            description: |-
         | 
| 57 | 
            +
              Spidr is a versatile Ruby web spidering library that can spider a site,
         | 
| 58 | 
            +
              multiple domains, certain links or infinitely. Spidr is designed to be fast
         | 
| 59 | 
            +
              and easy to use.
         | 
| 36 60 | 
             
            email: 
         | 
| 37 61 | 
             
            - postmodern.mod3@gmail.com
         | 
| 38 62 | 
             
            executables: []
         | 
| @@ -92,6 +116,8 @@ files: | |
| 92 116 | 
             
            - static/course/specs.json
         | 
| 93 117 | 
             
            has_rdoc: true
         | 
| 94 118 | 
             
            homepage: http://spidr.rubyforge.org/
         | 
| 119 | 
            +
            licenses: []
         | 
| 120 | 
            +
             | 
| 95 121 | 
             
            post_install_message: 
         | 
| 96 122 | 
             
            rdoc_options: 
         | 
| 97 123 | 
             
            - --main
         | 
| @@ -113,9 +139,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement | |
| 113 139 | 
             
            requirements: []
         | 
| 114 140 |  | 
| 115 141 | 
             
            rubyforge_project: spidr
         | 
| 116 | 
            -
            rubygems_version: 1.3. | 
| 142 | 
            +
            rubygems_version: 1.3.4
         | 
| 117 143 | 
             
            signing_key: 
         | 
| 118 | 
            -
            specification_version:  | 
| 144 | 
            +
            specification_version: 3
         | 
| 119 145 | 
             
            summary: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely
         | 
| 120 146 | 
             
            test_files: []
         | 
| 121 147 |  | 
    
        metadata.gz.sig
    ADDED
    
    | Binary file |