RubyGems - spidr - Versions diffs - 0.1.9 → 0.2.0 - Mend

spidr 0.1.9 → 0.2.0

Files changed (37) hide show

data.tar.gz.sig +0 -0
data/History.txt +43 -0
data/Manifest.txt +19 -0
data/README.txt +100 -11
data/Rakefile +15 -5
data/lib/spidr/actions.rb +2 -0
data/lib/spidr/actions/actions.rb +79 -0
data/lib/spidr/actions/exceptions.rb +4 -0
data/lib/spidr/actions/exceptions/action.rb +6 -0
data/lib/spidr/actions/exceptions/paused.rb +8 -0
data/lib/spidr/actions/exceptions/skip_link.rb +8 -0
data/lib/spidr/actions/exceptions/skip_page.rb +8 -0
data/lib/spidr/agent.rb +385 -444
data/lib/spidr/events.rb +87 -0
data/lib/spidr/extensions.rb +1 -0
data/lib/spidr/extensions/uri.rb +45 -0
data/lib/spidr/filters.rb +438 -0
data/lib/spidr/page.rb +211 -70
data/lib/spidr/rules.rb +40 -18
data/lib/spidr/spidr.rb +57 -7
data/lib/spidr/version.rb +2 -1
data/spec/actions_spec.rb +61 -0
data/spec/agent_spec.rb +24 -31
data/spec/extensions/uri_spec.rb +39 -0
data/spec/filters_spec.rb +53 -0
data/spec/helpers/page.rb +8 -0
data/spec/page_examples.rb +17 -0
data/spec/page_spec.rb +81 -0
data/spec/rules_spec.rb +43 -0
data/spec/spec_helper.rb +1 -1
data/spec/spidr_spec.rb +30 -0
data/static/course/specs.json +1 -1
data/tasks/course.rb +8 -1
data/tasks/spec.rb +1 -0
data/tasks/yard.rb +12 -0
metadata +45 -6
metadata.gz.sig +0 -0

data/lib/spidr/page.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+require 'spidr/extensions/uri'
 require 'uri'
 require 'nokogiri'
@@ -10,15 +12,17 @@ module Spidr
     # HTTP Response
     attr_reader :response
-    # Body returned for the page
-    attr_reader :body
     # Headers returned with the body
     attr_reader :headers
     #
-    # Creates a new Page object from the specified _url_ and HTTP
-    # _response_.
+    # Creates a new Page object.
+    #
+    # @param [URI::HTTP] url
+    #   The URL of the page.
+    #
+    # @param [Net::HTTP::Response] response
+    #   The response from the request for the page.
     #
     def initialize(url,response)
       @url = url
@@ -28,169 +32,234 @@ module Spidr
     end
     #
-    # Returns the response code from the page.
+    # The response code from the page.
+    #
+    # @return [Integer]
+    #   Response code from the page.
     #
     def code
-      @response.code
+      @response.code.to_i
     end
     #
-    # Returns +true+ if the response code is 200, returns +false+ otherwise.
+    # Determines if the response code is +200+.
+    #
+    # @return [Boolean]
+    #   Specifies whether the response code is +200+.
     #
     def is_ok?
       code == 200
     end
+    alias ok? is_ok?
+    #
+    # Determines if the response code is +301+ or +307+.
     #
-    # Returns +true+ if the response code is 301 or 307, returns +false+
-    # otherwise.
+    # @return [Boolean]
+    #   Specifies whether the response code is +301+ or +307+.
     #
     def is_redirect?
       (code == 301 || code == 307)
     end
+    alias redirect? is_redirect?
+    #
+    # Determines if the response code is +308+.
     #
-    # Returns +true+ if the response code is 308, returns +false+ otherwise.
+    # @return [Boolean]
+    #   Specifies whether the response code is +308+.
     #
     def timedout?
       code == 308
     end
     #
-    # Returns +true+ if the response code is 400, returns +false+ otherwise.
+    # Determines if the response code is +400+.
+    #
+    # @return [Boolean]
+    #   Specifies whether the response code is +400+.
     #
     def bad_request?
       code == 400
     end
     #
-    # Returns +true+ if the response code is 401, returns +false+ otherwise.
+    # Determines if the response code is +401+.
+    #
+    # @return [Boolean]
+    #   Specifies whether the response code is +401+.
     #
     def is_unauthorized?
       code == 401
     end
+    alias unauthorized? is_unauthorized?
     #
-    # Returns +true+ if the response code is 403, returns +false+ otherwise.
+    # Determines if the response code is +403+.
+    #
+    # @return [Boolean]
+    #   Specifies whether the response code is +403+.
     #
     def is_forbidden?
       code == 403
     end
+    alias forbidden? is_forbidden?
     #
-    # Returns +true+ if the response code is 404, returns +false+ otherwise.
+    # Determines if the response code is +404+.
+    #
+    # @return [Boolean]
+    #   Specifies whether the response code is +404+.
     #
     def is_missing?
       code == 404
     end
+    alias missing? is_missing?
     #
-    # Returns +true+ if the response code is 500, returns +false+ otherwise.
+    # Determines if the response code is +500+.
+    #
+    # @return [Boolean]
+    #   Specifies whether the response code is +500+.
     #
     def had_internal_server_error?
       code == 500
     end
     #
-    # Returns the content-type of the page.
+    # The Content-Type of the page.
+    #
+    # @return [String]
+    #   The Content-Type of the page.
     #
     def content_type
       @response['Content-Type']
     end
     #
-    # Returns +true+ if the page is a plain text document, returns +false+
-    # otherwise.
+    # Determines if the page is plain-text.
+    #
+    # @return [Boolean]
+    #   Specifies whether the page is plain-text.
     #
     def plain_text?
       (content_type =~ /text\/plain/) == 0
     end
+    alias txt? plain_text?
+    #
+    # Determines if the page is an HTML document.
     #
-    # Returns +true+ if the page is a HTML document, returns +false+
-    # otherwise.
+    # @return [Boolean]
+    #   Specifies whether the page is an HTML document.
     #
     def html?
       (content_type =~ /text\/html/) == 0
     end
     #
-    # Returns +true+ if the page is a XML document, returns +false+
-    # otherwise.
+    # Determines if the page is an XML document.
+    #
+    # @return [Boolean]
+    #   Specifies whether the page is an XML document.
     #
     def xml?
       (content_type =~ /text\/xml/) == 0
     end
     #
-    # Returns +true+ if the page is a Javascript file, returns +false+
-    # otherwise.
+    # Determines if the page is JavaScript.
+    #
+    # @return [Boolean]
+    #   Specifies whether the page is JavaScript.
     #
     def javascript?
       (content_type =~ /(text|application)\/javascript/) == 0
     end
     #
-    # Returns +true+ if the page is a CSS file, returns +false+
-    # otherwise.
+    # Determines if the page is a CSS stylesheet.
+    #
+    # @return [Boolean]
+    #   Specifies whether the page is a CSS stylesheet.
     #
     def css?
       (content_type =~ /text\/css/) == 0
     end
     #
-    # Returns +true+ if the page is a RSS/RDF feed, returns +false+
-    # otherwise.
+    # Determines if the page is an RSS feed.
+    #
+    # @return [Boolean]
+    #   Specifies whether the page is an RSS feed.
     #
     def rss?
       (content_type =~ /application\/(rss|rdf)\+xml/) == 0
     end
     #
-    # Returns +true+ if the page is a Atom feed, returns +false+
-    # otherwise.
+    # Determines if the page is an Atom feed.
+    #
+    # @return [Boolean]
+    #   Specifies whether the page is an Atom feed.
     #
     def atom?
       (content_type =~ /application\/atom\+xml/) == 0
     end
     #
-    # Returns +true+ if the page is a MS Word document, returns +false+
-    # otherwise.
+    # Determines if the page is an MS Word document.
+    #
+    # @return [Boolean]
+    #   Specifies whether the page is an MS Word document.
     #
     def ms_word?
       (content_type =~ /application\/msword/) == 0
     end
     #
-    # Returns +true+ if the page is a PDF document, returns +false+
-    # otherwise.
+    # Determines if the page is a PDF document.
+    #
+    # @return [Boolean]
+    #   Specifies whether the page is a PDF document.
     #
     def pdf?
       (content_type =~ /application\/pdf/) == 0
     end
     #
-    # Returns +true+ if the page is a ZIP archive, returns +false+
-    # otherwise.
+    # Determines if the page is a ZIP archive.
+    #
+    # @return [Boolean]
+    #   Specifies whether the page is a ZIP archive.
     #
     def zip?
       (content_type =~ /application\/zip/) == 0
     end
     #
-    # Returns the body of the page in +String+ form.
+    # The body of the response.
+    #
+    # @return [String]
+    #   The body of the response.
     #
     def body
       @response.body
     end
     #
-    # If the page has a <tt>text/html</tt> content-type, a
-    # Nokogiri::HTML::Document object will be returned. If the page has a
-    # <tt>text/xml</tt> content-type, a Nokogiri::XML::Document object
-    # will be returned. Other content-types will cause +nil+ to be
-    # returned.
+    # Returns a parsed document object for HTML, XML, RSS and Atom pages.
+    #
+    # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
+    #   The document that represents HTML or XML pages.
+    #   Returns +nil+ if the page is not HTML, XML, RSS or Atom, or if
+    #   the page could not be parsed properly.
     #
     def doc
       return nil if (body.nil? || body.empty?)
@@ -198,7 +267,7 @@ module Spidr
       begin
         if html?
           return @doc ||= Nokogiri::HTML(body)
-        elsif xml?
+        elsif (xml? || rss? || atom?)
           return @doc ||= Nokogiri::XML(body)
         end
       rescue
@@ -207,7 +276,70 @@ module Spidr
     end
     #
-    # Returns all links from the HTML page.
+    # Searches the document using XPath or CSS Path expressions.
+    #
+    # @param [Array<String>] paths
+    #   CSS or XPath expressions to search the document with.
+    #
+    # @return [Array]
+    #   The matched nodes from the document.
+    #   Returns an empty Array if no nodes were matched, or if the page
+    #   is not an HTML or XML document.
+    #
+    # @example
+    #   page.search('//a[@href]')
+    #
+    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000239
+    #
+    def search(*paths)
+      if doc
+        return doc.search(*paths)
+      end
+      return []
+    end
+    #
+    # Searches for the first occurrence of an XPath or CSS Path expression.
+    #
+    # @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
+    #   The first matched node. Returns +nil+ if no nodes could be matched,
+    #   or if the page is not an HTML or XML document.
+    #
+    # @example
+    #   page.at('//title')
+    #
+    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000251
+    #
+    def at(*arguments)
+      if doc
+        return doc.at(*arguments)
+      end
+      return nil
+    end
+    alias / search
+    alias % at
+    #
+    # The title of the HTML page.
+    #
+    # @return [String]
+    #   The inner-text of the title element of the page.
+    #
+    def title
+      if (node = at('//title'))
+        return node.inner_text
+      end
+    end
+    #
+    # The links from within the page.
+    #
+    # @return [Array<String>]
+    #   All links within the HTML page, frame/iframe source URLs and any
+    #   links in the +Location+ header.
     #
     def links
       urls = []
@@ -218,7 +350,15 @@ module Spidr
       case code
       when 300..303, 307
-        add_url.call(@headers['location'])
+        location = @headers['location']
+        if location.kind_of?(Array)
+          # handle multiple location URLs
+          location.each(&add_url)
+        else
+          # usually the location header contains a single String
+          add_url.call(location)
+        end
       end
       if (html? && doc)
@@ -239,44 +379,45 @@ module Spidr
     end
     #
-    # Returns all links from the HtML page as absolute URLs.
+    # Absolute URIs from within the page.
+    #
+    # @return [Array<URI::HTTP>]
+    #   The links from within the page, converted to absolute URIs.
     #
     def urls
       links.map { |link| to_absolute(link) }.compact
     end
-    protected
     #
-    # Converts the specified _link_ into an absolute URL
-    # based on the url of the page.
+    # Normalizes and expands a given link into a proper URI.
+    #
+    # @param [String] link
+    #   The link to normalize and expand.
+    #
+    # @return [URI::HTTP]
+    #   The normalized URI.
     #
     def to_absolute(link)
-      # decode, clean then re-encode the URL
-      link = URI.encode(URI.decode(link.to_s).gsub(/#[a-zA-Z0-9_-]*$/,''))
       begin
-        relative = URI(link)
-        absolute = @url.merge(relative)
-        if absolute.path
-          if absolute.path.empty?
-            # default the absolute path to '/'
-            absolute.path = '/'
-          else
-            # make sure the path does not contain any .. or . directories.
-            absolute.path = File.expand_path(absolute.path)
-          end
-        end
-        return absolute
-      rescue URI::InvalidURIError => e
+        url = @url.merge(link.to_s)
+      rescue URI::InvalidURIError
         return nil
       end
+      unless (url.path.nil? || url.path.empty?)
+        # make sure the path does not contain any .. or . directories,
+        # since URI::Generic#merge cannot normalize paths such as
+        # "/stuff/../"
+        url.path = URI.expand_path(url.path)
+      end
+      return url
     end
+    protected
     #
-    # Provides transparent access to the values in the +headers+ +Hash+.
+    # Provides transparent access to the values in +headers+.
     #
     def method_missing(sym,*args,&block)
       if (args.empty? && block.nil?)