RubyGems - spidr - Versions diffs - 0.5.0 → 0.6.0 - Mend

spidr 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47)

checksums.yaml +4 -4
data/.travis.yml +14 -0
data/ChangeLog.md +20 -2
data/Gemfile +2 -2
data/README.md +4 -2
data/Rakefile +1 -0
data/gemspec.yml +1 -1
data/lib/spidr/agent.rb +145 -85
data/lib/spidr/agent/filters.rb +1 -9
data/lib/spidr/agent/robots.rb +36 -0
data/lib/spidr/page.rb +76 -28
data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
data/lib/spidr/page/cookies.rb +60 -0
data/lib/spidr/page/{links.rb → html.rb} +47 -23
data/lib/spidr/page/status_codes.rb +112 -0
data/lib/spidr/proxy.rb +56 -0
data/lib/spidr/session_cache.rb +60 -24
data/lib/spidr/settings.rb +3 -0
data/lib/spidr/settings/proxy.rb +61 -0
data/lib/spidr/settings/timeouts.rb +33 -0
data/lib/spidr/settings/user_agent.rb +14 -0
data/lib/spidr/spidr.rb +15 -79
data/lib/spidr/version.rb +1 -1
data/spec/agent/actions_spec.rb +158 -32
data/spec/agent/filters_spec.rb +46 -29
data/spec/agent/sanitizers_spec.rb +25 -31
data/spec/agent_spec.rb +772 -50
data/spec/example_app.rb +27 -0
data/spec/example_page.rb +33 -0
data/spec/page/content_types_spec.rb +150 -0
data/spec/page/cookies_spec.rb +58 -0
data/spec/page/html_spec.rb +524 -0
data/spec/page/status_codes_spec.rb +87 -0
data/spec/page_spec.rb +114 -78
data/spec/proxy_spec.rb +45 -0
data/spec/session_cache.rb +103 -2
data/spec/settings/proxy_examples.rb +82 -0
data/spec/settings/timeouts_examples.rb +93 -0
data/spec/settings/user_agent_examples.rb +25 -0
data/spec/spidr_spec.rb +6 -29
data/spidr.gemspec +38 -109
metadata +35 -31
data/lib/spidr/page/body.rb +0 -98
data/spec/helpers/history.rb +0 -34
data/spec/helpers/page.rb +0 -8
data/spec/helpers/wsoc.rb +0 -83
data/spec/page_examples.rb +0 -21

data/lib/spidr/agent/filters.rb CHANGED

@@ -400,7 +400,7 @@ module Spidr
       @schemes = []
       if options[:schemes]
-        @schemes += options[:schemes]
+        self.schemes = options[:schemes]
       else
         @schemes << 'http'
@@ -439,14 +439,6 @@ module Spidr
       if options[:host]
         visit_hosts_like(options[:host])
       end
-      if options[:queue]
-        self.queue = options[:queue]
-      end
-      if options[:history]
-        self.history = options[:history]
-      end
     end
     #

data/lib/spidr/agent/robots.rb ADDED

@@ -0,0 +1,36 @@
+begin
+  require 'robots'
+rescue LoadError
+end
+module Spidr
+  class Agent
+    #
+    # Initializes the robots filter.
+    #
+    def initialize_robots
+      unless Object.const_defined?(:Robots)
+        raise(ArgumentError,":robots option given but unable to require 'robots' gem")
+      end
+      @robots = Robots.new(@user_agent)
+    end
+    #
+    # Determines whether a URL is allowed by the robot policy.
+    #
+    # @param [URI::HTTP, String] url
+    #   The URL to check.
+    #
+    # @return [Boolean]
+    #   Specifies whether a URL is allowed by the robot policy.
+    #
+    def robot_allowed?(url)
+      if @robots
+        @robots.allowed?(url)
+      else
+        true
+      end
+    end
+  end
+end

data/lib/spidr/page.rb CHANGED

@@ -1,7 +1,3 @@
-require 'spidr/page/headers'
-require 'spidr/page/body'
-require 'spidr/page/links'
 module Spidr
   #
   # Represents a requested page from a website.
@@ -34,42 +30,89 @@ module Spidr
     end
     #
-    # The meta-redirect links of the page.
+    # The body of the response.
+    #
+    # @return [String]
+    #   The body of the response.
     #
-    # @return [Array<String>]
-    #   All meta-redirect links in the page.
+    def body
+      (response.body || '')
+    end
+    alias to_s body
     #
-    # @deprecated
-    #   Deprecated in 0.3.0 and will be removed in 0.4.0.
-    #   Use {#meta_redirects} instead.
+    # Returns a parsed document object for HTML, XML, RSS and Atom pages.
     #
-    def meta_redirect
-      STDERR.puts 'DEPRECATION: Spidr::Page#meta_redirect will be removed in 0.3.0'
-      STDERR.puts 'DEPRECATION: Use Spidr::Page#meta_redirects instead'
+    # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
+    #   The document that represents HTML or XML pages.
+    #   Returns `nil` if the page is not HTML, XML, RSS or Atom, or if
+    #   the page could not be parsed properly.
+    #
+    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
+    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
+    #
+    def doc
+      unless body.empty?
+        doc_class = if html?
+                      Nokogiri::HTML::Document
+                    elsif rss? || atom? || xml? || xsl?
+                      Nokogiri::XML::Document
+                    end
-      meta_redirects
+        if doc_class
+          begin
+            @doc ||= doc_class.parse(body, @url.to_s, content_charset)
+          rescue
+          end
+        end
+      end
     end
     #
-    # Determines if the response code is `300`, `301`, `302`, `303`
-    # or `307`. Also checks for "soft" redirects added at the page
-    # level by a meta refresh tag.
+    # Searches the document for XPath or CSS Path paths.
+    #
+    # @param [Array<String>] paths
+    #   CSS or XPath expressions to search the document with.
     #
-    # @return [Boolean]
-    #   Specifies whether the response code is a HTTP Redirect code.
+    # @return [Array]
+    #   The matched nodes from the document.
+    #   Returns an empty Array if no nodes were matched, or if the page
+    #   is not an HTML or XML document.
     #
-    def is_redirect?
-      case code
-      when 300..303, 307
-        true
-      when 200
-        meta_redirect?
+    # @example
+    #   page.search('//a[@href]')
+    #
+    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000239
+    #
+    def search(*paths)
+      if doc
+        doc.search(*paths)
       else
-        false
+        []
       end
     end
-    alias redirect? is_redirect?
+    #
+    # Searches for the first occurrence of an XPath or CSS Path expression.
+    #
+    # @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
+    #   The first matched node. Returns `nil` if no nodes could be matched,
+    #   or if the page is not a HTML or XML document.
+    #
+    # @example
+    #   page.at('//title')
+    #
+    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000251
+    #
+    def at(*arguments)
+      if doc
+        doc.at(*arguments)
+      end
+    end
+    alias / search
+    alias % at
     protected
@@ -90,7 +133,7 @@ module Spidr
     #
     def method_missing(name,*arguments,&block)
       if (arguments.empty? && block.nil?)
-        header_name = name.to_s.sub('_','-')
+        header_name = name.to_s.tr('_','-')
         if @response.key?(header_name)
           return @response[header_name]
@@ -102,3 +145,8 @@ module Spidr
   end
 end
+require 'spidr/page/status_codes'
+require 'spidr/page/content_types'
+require 'spidr/page/cookies'
+require 'spidr/page/html'

data/lib/spidr/page/{headers.rb → content_types.rb} RENAMED

@@ -1,98 +1,5 @@
-require 'set'
 module Spidr
   class Page
-    # Reserved names used within Cookie strings
-    RESERVED_COOKIE_NAMES = Set['path', 'expires', 'domain']
-    #
-    # The response code from the page.
-    #
-    # @return [Integer]
-    #   Response code from the page.
-    #
-    def code
-      response.code.to_i
-    end
-    #
-    # Determines if the response code is `200`.
-    #
-    # @return [Boolean]
-    #   Specifies whether the response code is `200`.
-    #
-    def is_ok?
-      code == 200
-    end
-    alias ok? is_ok?
-    #
-    # Determines if the response code is `308`.
-    #
-    # @return [Boolean]
-    #   Specifies whether the response code is `308`.
-    #
-    def timedout?
-      code == 308
-    end
-    #
-    # Determines if the response code is `400`.
-    #
-    # @return [Boolean]
-    #   Specifies whether the response code is `400`.
-    #
-    def bad_request?
-      code == 400
-    end
-    #
-    # Determines if the response code is `401`.
-    #
-    # @return [Boolean]
-    #   Specifies whether the response code is `401`.
-    #
-    def is_unauthorized?
-      code == 401
-    end
-    alias unauthorized? is_unauthorized?
-    #
-    # Determines if the response code is `403`.
-    #
-    # @return [Boolean]
-    #   Specifies whether the response code is `403`.
-    #
-    def is_forbidden?
-      code == 403
-    end
-    alias forbidden? is_forbidden?
-    #
-    # Determines if the response code is `404`.
-    #
-    # @return [Boolean]
-    #   Specifies whether the response code is `404`.
-    #
-    def is_missing?
-      code == 404
-    end
-    alias missing? is_missing?
-    #
-    # Determines if the response code is `500`.
-    #
-    # @return [Boolean]
-    #   Specifies whether the response code is `500`.
-    #
-    def had_internal_server_error?
-      code == 500
-    end
     #
     # The Content-Type of the page.
     #
@@ -100,7 +7,7 @@ module Spidr
     #   The Content-Type of the page.
     #
     def content_type
-      (response['Content-Type'] || '')
+      @response['Content-Type'] || ''
     end
     #
@@ -112,7 +19,7 @@ module Spidr
     # @since 0.2.2
     #
     def content_types
-      (headers['content-type'] || [])
+      @response.get_fields('content-type') || []
     end
     #
@@ -314,57 +221,5 @@ module Spidr
     def zip?
       is_content_type?('application/zip')
     end
-    #
-    # The raw Cookie String sent along with the page.
-    #
-    # @return [String]
-    #   The raw Cookie from the response.
-    #
-    # @since 0.2.7
-    #
-    def cookie
-      (response['Set-Cookie'] || '')
-    end
-    alias raw_cookie cookie
-    #
-    # The Cookie values sent along with the page.
-    #
-    # @return [Array<String>]
-    #   The Cookies from the response.
-    #
-    # @since 0.2.2
-    #
-    def cookies
-      (headers['set-cookie'] || [])
-    end
-    #
-    # The Cookie key -> value pairs returned with the response.
-    #
-    # @return [Hash{String => String}]
-    #   The cookie keys and values.
-    #
-    # @since 0.2.2
-    #
-    def cookie_params
-      params = {}
-      cookies.each do |value|
-        value.split(';').each do |param|
-          param.strip!
-          name, value = param.split('=',2)
-          unless RESERVED_COOKIE_NAMES.include?(name)
-            params[name] = (value || '')
-          end
-        end
-      end
-      return params
-    end
   end
 end

data/lib/spidr/page/cookies.rb ADDED

@@ -0,0 +1,60 @@
+require 'set'
+module Spidr
+  class Page
+    # Reserved names used within Cookie strings
+    RESERVED_COOKIE_NAMES = /^(?:Path|Expires|Domain|Secure|HTTPOnly)$/i
+    #
+    # The raw Cookie String sent along with the page.
+    #
+    # @return [String]
+    #   The raw Cookie from the response.
+    #
+    # @since 0.2.7
+    #
+    def cookie
+      @response['Set-Cookie'] || ''
+    end
+    alias raw_cookie cookie
+    #
+    # The Cookie values sent along with the page.
+    #
+    # @return [Array<String>]
+    #   The Cookies from the response.
+    #
+    # @since 0.2.2
+    #
+    def cookies
+      (@response.get_fields('Set-Cookie') || [])
+    end
+    #
+    # The Cookie key -> value pairs returned with the response.
+    #
+    # @return [Hash{String => String}]
+    #   The cookie keys and values.
+    #
+    # @since 0.2.2
+    #
+    def cookie_params
+      params = {}
+      cookies.each do |value|
+        value.split(';').each do |param|
+          param.strip!
+          name, value = param.split('=',2)
+          unless name =~ RESERVED_COOKIE_NAMES
+            params[name] = (value || '')
+          end
+        end
+      end
+      return params
+    end
+  end
+end

data/lib/spidr/page/{links.rb → html.rb} RENAMED

@@ -1,10 +1,22 @@
+require 'nokogiri'
 require 'spidr/extensions/uri'
-require 'uri'
 module Spidr
   class Page
     include Enumerable
+    #
+    # The title of the HTML page.
+    #
+    # @return [String]
+    #   The inner-text of the title element of the page.
+    #
+    def title
+      if (node = at('//title'))
+        node.inner_text
+      end
+    end
     #
     # Enumerates over the meta-redirect links in the page.
     #
@@ -21,7 +33,7 @@ module Spidr
     # @since 0.3.0
     #
     def each_meta_redirect
-      return enum_for(:each_meta_redirect) unless block_given?
+      return enum_for(__method__) unless block_given?
       if (html? && doc)
         search('//meta[@http-equiv and @content]').each do |node|
@@ -44,7 +56,7 @@ module Spidr
     #   Specifies whether the page includes page-level redirects.
     #
     def meta_redirect?
-      !(each_meta_redirect.first.nil?)
+      !each_meta_redirect.first.nil?
     end
     #
@@ -59,6 +71,23 @@ module Spidr
       each_meta_redirect.to_a
     end
+    #
+    # The meta-redirect links of the page.
+    #
+    # @return [Array<String>]
+    #   All meta-redirect links in the page.
+    #
+    # @deprecated
+    #   Deprecated in 0.3.0 and will be removed in 0.4.0.
+    #   Use {#meta_redirects} instead.
+    #
+    def meta_redirect
+      warn 'DEPRECATION: Spidr::Page#meta_redirect will be removed in 0.3.0'
+      warn 'DEPRECATION: Use Spidr::Page#meta_redirects instead'
+      meta_redirects
+    end
     #
     # Enumerates over every HTTP or meta-redirect link in the page.
     #
@@ -74,18 +103,14 @@ module Spidr
     # @since 0.3.0
     #
     def each_redirect(&block)
-      return enum_for(:each_redirect) unless block
+      return enum_for(__method__) unless block
-      location = headers['location']
-      if location.nil?
+      if (locations = @response.get_fields('Location'))
+        # Location headers override any meta-refresh redirects in the HTML
+        locations.each(&block)
+      else
         # check page-level meta redirects if there isn't a location header
         each_meta_redirect(&block)
-      elsif location.kind_of?(Array)
-        location.each(&block)
-      else
-        # usually the location header contains a single String
-        yield location
       end
     end
@@ -115,7 +140,7 @@ module Spidr
     # @since 0.5.0
     #
     def each_mailto
-      return enum_for(:each_mailto) unless block_given?
+      return enum_for(__method__) unless block_given?
       if (html? && doc)
         doc.search('//a[starts-with(@href,"mailto:")]').each do |a|
@@ -151,7 +176,7 @@ module Spidr
     # @since 0.3.0
     #
     def each_link
-      return enum_for(:each_link) unless block_given?
+      return enum_for(__method__) unless block_given?
       filter = lambda { |url|
         yield url unless (url.nil? || url.empty?)
@@ -208,7 +233,7 @@ module Spidr
     # @since 0.3.0
     #
     def each_url
-      return enum_for(:each_url) unless block_given?
+      return enum_for(__method__) unless block_given?
       each_link do |link|
         if (url = to_absolute(link))
@@ -239,15 +264,14 @@ module Spidr
     #   The normalized URI.
     #
     def to_absolute(link)
-      begin
-        new_url = url.merge(link.to_s)
-      rescue Exception
-        return nil
-      end
-      if new_url.path
-        path = new_url.path
+      link    = link.to_s
+      new_url = begin
+                  url.merge(link)
+                rescue Exception
+                  return
+                end
+      if (path = new_url.path)
         # ensure that paths begin with a leading '/' for URI::FTP
         if (new_url.scheme == 'ftp' && !path.start_with?('/'))
           path.insert(0,'/')