RubyGems - spidr - Versions diffs - 0.1.8 → 0.1.9 - Mend

spidr 0.1.8 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

data/History.txt CHANGED Viewed

@@ -1,3 +1,18 @@
+=== 0.1.9 / 2009-06-13
+* Upgraded to Hoe 2.0.0.
+  * Use Hoe.spec instead of Hoe.new.
+  * Use the Hoe signing task for signed gems.
+* Added the Agent#schemes and Agent#schemes= methods.
+* Added a warning message if 'net/https' cannot be loaded.
+* Allow the list of acceptable URL schemes to be passed into Agent.new.
+* Allow history and queue information to be passed into Agent.new.
+* Agent#start_at no longer clears the history or the queue.
+* Fixed a bug in the sanitization of semi-escaped URLs.
+* Fixed a bug where https URLs would be followed even if 'net/https'
+  could not be loaded.
+* Removed Agent::SCHEMES.
 === 0.1.8 / 2009-05-27
 * Added the Agent#pause! and Agent#continue! methods.

data/README.txt CHANGED Viewed

@@ -18,6 +18,7 @@ and easy to use.
   * frame tags.
   * HTTP 300, 301, 302, 303 and 307 Redirects.
 * Black-list or white-list URLs based upon:
+  * URL scheme
   * Host name
   * Port number
   * Full link

data/Rakefile CHANGED Viewed

@@ -2,11 +2,12 @@
 require 'rubygems'
 require 'hoe'
+require 'hoe/signing'
 require './tasks/spec.rb'
 require './tasks/course.rb'
 require './lib/spidr/version.rb'
-Hoe.new('spidr', Spidr::VERSION) do |p|
+Hoe.spec('spidr') do |p|
   p.rubyforge_name = 'spidr'
   p.developer('Postmodern', 'postmodern.mod3@gmail.com')
   p.remote_rdoc_dir = 'docs'

data/lib/spidr/agent.rb CHANGED Viewed

@@ -7,9 +7,6 @@ require 'net/http'
 module Spidr
   class Agent
-    # URL schemes to visit
-    SCHEMES = ['http', 'https']
     # Proxy to use
     attr_accessor :proxy
@@ -22,6 +19,9 @@ module Spidr
     # Delay in between fetching pages
     attr_accessor :delay
+    # List of acceptable URL schemes to follow
+    attr_reader :schemes
     # History containing visited URLs
     attr_reader :history
@@ -42,6 +42,10 @@ module Spidr
     # <tt>:referer</tt>:: The referer URL to send.
     # <tt>:delay</tt>:: Duration in seconds to pause between spidering each
     #                   link. Defaults to 0.
+    # <tt>:schemes</tt>:: The list of acceptable URL schemes to follow.
+    #                     Defaults to +http+ and +https+. +https+ URL
+    #                     schemes will be ignored if <tt>net/https</tt>
+    #                     cannot be loaded.
     # <tt>:host</tt>:: The host-name to visit.
     # <tt>:hosts</tt>:: An +Array+ of host patterns to visit.
     # <tt>:ignore_hosts</tt>:: An +Array+ of host patterns to not visit.
@@ -52,12 +56,32 @@ module Spidr
     # <tt>:exts</tt>:: An +Array+ of File extension patterns to visit.
     # <tt>:ignore_exts</tt>:: An +Array+ of File extension patterns to not
     #                         visit.
+    # <tt>:queue</tt>:: An initial queue of URLs to visit.
+    # <tt>:history</tt>:: An initial list of visited URLs.
     #
     def initialize(options={},&block)
       @proxy = (options[:proxy] || Spidr.proxy)
       @user_agent = (options[:user_agent] || Spidr.user_agent)
       @referer = options[:referer]
+      @schemes = []
+      if options[:schemes]
+        @schemes += options[:schemes]
+      else
+        @schemes << 'http'
+        begin
+          require 'net/https'
+          @schemes << 'https'
+        rescue Gem::LoadError => e
+          raise(e)
+        rescue ::LoadError
+          STDERR.puts "Warning: cannot load 'net/https', https support disabled"
+        end
+      end
       @host_rules = Rules.new(
         :accept => options[:hosts],
         :reject => options[:ignore_hosts]
@@ -91,6 +115,14 @@ module Spidr
         visit_hosts_like(options[:host])
       end
+      if options[:queue]
+        self.queue = options[:queue]
+      end
+      if options[:history]
+        self.history = options[:history]
+      end
       block.call(self) if block
     end
@@ -361,10 +393,9 @@ module Spidr
     end
     #
-    # Clear the history and start spidering at the specified _url_.
+    # Start spidering at the specified _url_.
     #
     def start_at(url)
-      clear
       enqueue(url)
       return continue!
@@ -413,6 +444,16 @@ module Spidr
       return self
     end
+    #
+    # Sets the list of acceptable URL schemes to follow to the
+    # specified _new_schemes_.
+    #
+    #   agent.schemes = ['http']
+    #
+    def schemes=(new_schemes)
+      @schemes = new_schemes.map { |scheme| scheme.to_s }
+    end
     #
     # Sets the history of links that were previously visited to the
     # specified _new_history_.
@@ -575,7 +616,7 @@ module Spidr
     #
     def visit_scheme?(url)
       if url.scheme
-        return SCHEMES.include?(url.scheme)
+        return @schemes.include?(url.scheme)
       else
         return true
       end

data/lib/spidr/page.rb CHANGED Viewed

@@ -252,8 +252,8 @@ module Spidr
     # based on the url of the page.
     #
     def to_absolute(link)
-      # clean the link
-      link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
+      # decode, clean then re-encode the URL
+      link = URI.encode(URI.decode(link.to_s).gsub(/#[a-zA-Z0-9_-]*$/,''))
       begin
         relative = URI(link)

data/lib/spidr/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Spidr
-  VERSION = '0.1.8'
+  VERSION = '0.1.9'
 end

data.tar.gz.sig ADDED Viewed

Binary file

metadata CHANGED Viewed

@@ -1,15 +1,36 @@
 --- !ruby/object:Gem::Specification
 name: spidr
 version: !ruby/object:Gem::Version
-  version: 0.1.8
+  version: 0.1.9
 platform: ruby
 authors:
 - Postmodern
 autorequire:
 bindir: bin
-cert_chain: []
+cert_chain:
+- |
+  -----BEGIN CERTIFICATE-----
+  MIIDQDCCAiigAwIBAgIBADANBgkqhkiG9w0BAQUFADBGMRgwFgYDVQQDDA9wb3N0
+  bW9kZXJuLm1vZDMxFTATBgoJkiaJk/IsZAEZFgVnbWFpbDETMBEGCgmSJomT8ixk
+  ARkWA2NvbTAeFw0wOTA2MDMwNDU5MDNaFw0xMDA2MDMwNDU5MDNaMEYxGDAWBgNV
+  BAMMD3Bvc3Rtb2Rlcm4ubW9kMzEVMBMGCgmSJomT8ixkARkWBWdtYWlsMRMwEQYK
+  CZImiZPyLGQBGRYDY29tMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA
+  1wvANkTDHFgVih5XLjuTwTZjgBq1lBGybXJiH6Id1lY2JOMqM5FB1DDHVvvij94i
+  mJabN0zkzu6VKWC70y0IwOxY7CPokr0eFdK/D0y7mCq1P8QITv76i2YqAl0eYqIt
+  W+IhIkANQ7E6uMZIZcdnfadC6lPAtlKkqtd9crvRbFgr6e3kyflmohbRnTEJHoRd
+  7SHHsybE6DSn7oTDs6XBTNrNIn5VfZA0z01eeos/+zBm1zKJOK2+/7xtLLDuDU9G
+  +Rd+ltUBbvxUrMNZmDG29pnmN2xTRH+Q8HxD2AxlvM5SRpK6OeZaHV7PaCCAVZ4L
+  T9BFl1sfMvRlABeGEkSyuQIDAQABozkwNzAJBgNVHRMEAjAAMAsGA1UdDwQEAwIE
+  sDAdBgNVHQ4EFgQUKwsd+PqEYmBvyaTyoL+uRuk+PhEwDQYJKoZIhvcNAQEFBQAD
+  ggEBAB4TvHsrlbcXcKg6gX5BIb9tI+zGkpzo0Z7jnxMEcNO7NGGwmzafDBI/xZYv
+  xkRH3/HXbGGYDOi6Q6gWt5GujSx0bOImDtYTJTH8jnzN92HzEK5WdScm1QpZKF1e
+  cezArMbxbSPaosxTCtG6LQTkE28lFQsmFZ5xzouugS4h5+LVJiVMmiP+l3EfkjFa
+  GOURU+rNEMPWo8MCWivGW7jes6BMzWHcW7DQ0scNVmIcCIgdyMmpscuAEOSeghy9
+  /fFs57Ey2OXBL55nDOyvN/ZQ2Vab05UH4t+GCxjAPeirzL/29FBtePT6VD44c38j
+  pDj+ws7QjtH/Qcrr1l9jfN0ehDs=
+  -----END CERTIFICATE-----
-date: 2009-05-27 00:00:00 -07:00
+date: 2009-06-13 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -30,9 +51,12 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 1.12.2
+        version: 2.0.0
     version:
-description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
+description: |-
+  Spidr is a versatile Ruby web spidering library that can spider a site,
+  multiple domains, certain links or infinitely. Spidr is designed to be fast
+  and easy to use.
 email:
 - postmodern.mod3@gmail.com
 executables: []
@@ -92,6 +116,8 @@ files:
 - static/course/specs.json
 has_rdoc: true
 homepage: http://spidr.rubyforge.org/
+licenses: []
 post_install_message:
 rdoc_options:
 - --main
@@ -113,9 +139,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project: spidr
-rubygems_version: 1.3.1
+rubygems_version: 1.3.4
 signing_key:
-specification_version: 2
+specification_version: 3
 summary: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely
 test_files: []

metadata.gz.sig ADDED Viewed

Binary file