spidr 0.1.0 → 0.1.1
This diff compares the contents of two publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their public registries.
- data/History.txt +5 -0
- data/README.txt +29 -7
- data/Rakefile +1 -0
- data/lib/spidr/agent.rb +26 -22
- data/lib/spidr/page.rb +103 -17
- data/lib/spidr/version.rb +1 -1
- metadata +6 -4
data/History.txt
CHANGED
data/README.txt
CHANGED
@@ -12,14 +12,16 @@ and easy to use.
 == FEATURES/PROBLEMS:
 
 * Black-list or white-list URLs based upon:
-
-
-
-
+  * Host name
+  * Port number
+  * Full link
+  * URL extension
 * Provides call-backs for:
-
-
-
+  * Every visited Page.
+  * Every visited URL.
+  * Every visited URL that matches a specified pattern.
+* Custom User-Agent strings.
+* Custom proxy settings.
 
 == REQUIREMENTS:
 
@@ -29,6 +31,26 @@ and easy to use.
 
   $ sudo gem install spidr
 
+== EXAMPLES:
+
+* Start spidering from a URL:
+
+    Spidr.start_at('http://tenderlovemaking.com/')
+
+* Spider a host:
+
+    Spidr.host('www.0x000000.com')
+
+* Spider a site:
+
+    Spidr.site('http://hackety.org/')
+
+* Print out visited URLs:
+
+    Spidr.site('http://rubyinside.org/') do |spider|
+      spider.every_url { |url| puts url }
+    end
+
 == LICENSE:
 
 The MIT License
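The black-list/white-list support described in the FEATURES list can be combined with the block form shown above. A minimal sketch, assuming the spider accepts filtering options named :hosts, :ignore_links and :ignore_exts (these option names are an assumption and are not confirmed by this diff):

    require 'spidr'

    # Hypothetical option names: only spider example.com, skip logout links
    # and .zip downloads.
    Spidr.site('http://www.example.com/',
               :hosts        => [/example\.com$/],
               :ignore_links => [/logout/],
               :ignore_exts  => ['zip']) do |spider|
      spider.every_url { |url| puts url }
    end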
data/Rakefile
CHANGED
data/lib/spidr/agent.rb
CHANGED
@@ -366,6 +366,32 @@ module Spidr
       end
     end
 
+    #
+    # Creates a new Page object from the specified _url_. If a _block_ is
+    # given, it will be passed the newly created Page object.
+    #
+    def get_page(url,&block)
+      host = url.host
+      port = url.port
+
+      proxy_host = @proxy[:host]
+      proxy_port = @proxy[:port]
+      proxy_user = @proxy[:user]
+      proxy_password = @proxy[:password]
+
+      Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess|
+        headers = {}
+
+        headers['User-Agent'] = @user_agent if @user_agent
+        headers['Referer'] = @referer if @referer
+
+        new_page = Page.new(url,sess.get(url.path,headers))
+
+        block.call(new_page) if block
+        return new_page
+      end
+    end
+
     protected
 
     #
@@ -464,27 +490,5 @@ module Spidr
       @ext_rules.accept?(File.extname(url.path)[1..-1])
     end
 
-    def get_page(url,&block)
-      host = url.host
-      port = url.port
-
-      proxy_host = @proxy[:host]
-      proxy_port = @proxy[:port]
-      proxy_user = @proxy[:user]
-      proxy_password = @proxy[:password]
-
-      Net::HTTP::Proxy(proxy_host,proxy_port,proxy_user,proxy_password).start(host,port) do |sess|
-        headers = {}
-
-        headers['User-Agent'] = @user_agent if @user_agent
-        headers['Referer'] = @referer if @referer
-
-        new_page = Page.new(url,sess.get(url.path,headers))
-
-        block.call(new_page) if block
-        return new_page
-      end
-    end
-
   end
 end
data/lib/spidr/page.rb
CHANGED
@@ -7,6 +7,9 @@ module Spidr
     # URL of the page
     attr_reader :url
 
+    # HTTP Response
+    attr_reader :response
+
     # Body returned for the page
     attr_reader :body
 
@@ -23,6 +26,70 @@ module Spidr
       @doc = nil
     end
 
+    #
+    # Returns the response code from the page.
+    #
+    def code
+      @response.code
+    end
+
+    #
+    # Returns +true+ if the response code is 200, returns +false+ otherwise.
+    #
+    def is_ok?
+      code == 200
+    end
+
+    #
+    # Returns +true+ if the response code is 301 or 307, returns +false+
+    # otherwise.
+    #
+    def is_redirect?
+      (code == 301 || code == 307)
+    end
+
+    #
+    # Returns +true+ if the response code is 308, returns +false+ otherwise.
+    #
+    def timedout?
+      code == 308
+    end
+
+    #
+    # Returns +true+ if the response code is 400, returns +false+ otherwise.
+    #
+    def bad_request?
+      code == 400
+    end
+
+    #
+    # Returns +true+ if the response code is 401, returns +false+ otherwise.
+    #
+    def is_unauthorized?
+      code == 401
+    end
+
+    #
+    # Returns +true+ if the response code is 403, returns +false+ otherwise.
+    #
+    def is_forbidden?
+      code == 403
+    end
+
+    #
+    # Returns +true+ if the response code is 404, returns +false+ otherwise.
+    #
+    def is_missing?
+      code == 404
+    end
+
+    #
+    # Returns +true+ if the response code is 500, returns +false+ otherwise.
+    #
+    def had_internal_server_error?
+      code == 500
+    end
+
     #
     # Returns the content-type of the page.
     #
@@ -30,6 +97,14 @@ module Spidr
       @response['Content-Type']
     end
 
+    #
+    # Returns +true+ if the page is a plain text document, returns +false+
+    # otherwise.
+    #
+    def plain_text?
+      (content_type =~ /text\/plain/) == 0
+    end
+
     #
     # Returns +true+ if the page is a HTML document, returns +false+
     # otherwise.
@@ -78,6 +153,30 @@ module Spidr
       (content_type =~ /application\/atom\+xml/) == 0
     end
 
+    #
+    # Returns +true+ if the page is a MS Word document, returns +false+
+    # otherwise.
+    #
+    def ms_word?
+      (content_type =~ /application\/msword/) == 0
+    end
+
+    #
+    # Returns +true+ if the page is a PDF document, returns +false+
+    # otherwise.
+    #
+    def pdf?
+      (content_type =~ /application\/pdf/) == 0
+    end
+
+    #
+    # Returns +true+ if the page is a ZIP archive, returns +false+
+    # otherwise.
+    #
+    def zip?
+      (content_type =~ /application\/zip/) == 0
+    end
+
     #
     # Returns the body of the page in +String+ form.
     #
@@ -122,24 +221,11 @@ module Spidr
     # based on the url of the page.
     #
     def to_absolute(link)
+      # clean the link
       link = URI.encode(link.to_s.gsub(/#.*$/,''))
-      relative = URI(link)
-
-      if relative.scheme.nil?
-        new_url = @url.clone
-
-        if relative.path[0..0] == '/'
-          new_url.path = relative.path
-        elsif relative.path[-1..-1] == '/'
-          new_url.path = File.expand_path(File.join(new_url.path,relative.path))
-        elsif !(relative.path.empty?)
-          new_url.path = File.expand_path(File.join(File.dirname(new_url.path),relative.path))
-        end
-
-        return new_url
-      end
 
-
+      relative = URI(link)
+      return @url.merge(relative)
     end
 
     #
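The hand-rolled path handling is replaced by URI#merge, which performs standard relative-reference resolution against the page's own URL. A standalone illustration of what @url.merge(relative) does (the URLs are illustrative):

    require 'uri'

    base = URI('http://www.example.com/blog/2008/10/index.html')

    # URI#merge resolves a relative reference against the base URL, covering
    # the cases the removed branches handled by hand.
    puts base.merge('archive.html')  # => http://www.example.com/blog/2008/10/archive.html
    puts base.merge('../../about/')  # => http://www.example.com/blog/about/
    puts base.merge('/contact')      # => http://www.example.com/contact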
@@ -149,7 +235,7 @@ module Spidr
       if (args.empty? && block.nil?)
         name = sym.id2name.sub('_','-')
 
-        return @response[name] if @response.
+        return @response[name] if @response.key?(name)
       end
 
       return super(sym,*args,&block)
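With this change, method_missing only exposes headers that are actually present in the response (an underscore in the method name maps to a dash in the header name). A minimal sketch of reading headers this way; the agent construction and URL are illustrative:

    require 'spidr'
    require 'uri'

    page = Spidr::Agent.new.get_page(URI('http://www.example.com/'))

    # Header names pass through method_missing: last_modified -> "last-modified".
    puts page.server        if page.response.key?('server')
    puts page.last_modified if page.response.key?('last-modified')
    # page.etag would raise NoMethodError if no ETag header were present.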
data/lib/spidr/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: spidr
 version: !ruby/object:Gem::Version
-  version: 0.1.
+  version: 0.1.1
 platform: ruby
 authors:
 - Postmodern Modulus III
@@ -9,11 +9,12 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2008-
+date: 2008-10-04 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
   name: hpricot
+  type: :runtime
   version_requirement:
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
@@ -23,12 +24,13 @@ dependencies:
     version:
 - !ruby/object:Gem::Dependency
   name: hoe
+  type: :development
   version_requirement:
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 1.
+        version: 1.7.0
     version:
 description: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely. Spidr is designed to be fast and easy to use.
 email:
@@ -76,7 +78,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 
 rubyforge_project: spidr
-rubygems_version: 1.
+rubygems_version: 1.2.0
 signing_key:
 specification_version: 2
 summary: Spidr is a versatile Ruby web spidering library that can spider a site, multiple domains, certain links or infinitely