RubyGems - mechanize - Versions diffs - 2.1.1 → 2.2 - Mend

mechanize 2.1.1 → 2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mechanize might be problematic. Click here for more details.

Files changed (30) hide show

data.tar.gz.sig +0 -0
data/CHANGELOG.rdoc +36 -1
data/EXAMPLES.rdoc +23 -18
data/GUIDE.rdoc +10 -5
data/Manifest.txt +4 -0
data/Rakefile +2 -1
data/lib/mechanize.rb +88 -18
data/lib/mechanize/file_request.rb +4 -0
data/lib/mechanize/file_saver.rb +3 -3
data/lib/mechanize/http/agent.rb +155 -114
data/lib/mechanize/image.rb +6 -0
data/lib/mechanize/page.rb +38 -3
data/lib/mechanize/page/image.rb +160 -10
data/lib/mechanize/page/link.rb +5 -0
data/lib/mechanize/page/meta_refresh.rb +28 -25
data/lib/mechanize/pluggable_parsers.rb +28 -14
data/lib/mechanize/util.rb +6 -0
data/test/htdocs/tc_links.html +2 -0
data/test/test_mechanize.rb +39 -10
data/test/test_mechanize_directory_saver.rb +49 -0
data/test/test_mechanize_file_request.rb +14 -8
data/test/test_mechanize_http_agent.rb +391 -370
data/test/test_mechanize_image.rb +8 -0
data/test/test_mechanize_link.rb +8 -0
data/test/test_mechanize_page.rb +11 -10
data/test/test_mechanize_page_image.rb +183 -0
data/test/test_mechanize_page_meta_refresh.rb +20 -4
data/test/test_mechanize_pluggable_parser.rb +15 -0
metadata +56 -27
metadata.gz.sig +0 -0

data/lib/mechanize/image.rb ADDED Viewed

@@ -0,0 +1,6 @@
+##
+# An Image holds downloaded data for an image/* response.
+class Mechanize::Image < Mechanize::Download
+end

data/lib/mechanize/page.rb CHANGED Viewed

@@ -186,9 +186,26 @@ class Mechanize::Page < Mechanize::File
     @meta_content_type || response['content-type']
   end
-  # Search through the page like HPricot
+  ##
+  # :method: search
+  #
+  # Search for +paths+ in the page using Nokogiri's #search.  The +paths+ can
+  # be XPath or CSS and an optional Hash of namespaces may be appended.
+  #
+  # See Nokogiri::XML::Node#search for further details.
   def_delegator :parser, :search, :search
-  def_delegator :parser, :/, :/
+  alias / search
+  ##
+  # :method: at
+  #
+  # Search through the page for +path+ under +namespace+ using Nokogiri's #at.
+  # The +path+ may be either a CSS or XPath expression.
+  #
+  # See also Nokogiri::XML::Node#at
   def_delegator :parser, :at, :at
   ##
@@ -283,6 +300,24 @@ class Mechanize::Page < Mechanize::File
   elements_with :iframe
+  ##
+  # :method: image_with(criteria)
+  #
+  # Find a single image matching +criteria+.
+  # Example:
+  #   page.image_with(:alt => /main/).fetch.save
+  ##
+  # :method: images_with(criteria)
+  #
+  # Find all images matching +criteria+.
+  # Example:
+  #   page.images_with(:src => /jpg\Z/).each do |img|
+  #     img.fetch.save
+  #   end
+  elements_with :image
   ##
   # Return a list of all link and area tags
   def links
@@ -310,7 +345,7 @@ class Mechanize::Page < Mechanize::File
     query = @mech.follow_meta_refresh == :anywhere ? 'meta' : 'head > meta'
     @meta_refresh ||= search(query).map do |node|
-      MetaRefresh.from_node node, self, uri
+      MetaRefresh.from_node node, self
     end.compact
   end

data/lib/mechanize/page/image.rb CHANGED Viewed

@@ -2,29 +2,179 @@
 # An image element on an HTML page
 class Mechanize::Page::Image
   attr_reader :node
-  attr_reader :page
+  attr_accessor :page
+  attr_accessor :mech
+  ##
+  # Creates a new Mechanize::Page::Image from an image +node+ and source
+  # +page+.
-  def initialize(node, page)
+  def initialize node, page
     @node = node
     @page = page
+    @mech = page.mech
+  end
+  ##
+  # The alt attribute of the image
+  def alt
+    node['alt']
+  end
+  ##
+  # The caption of the image.  In order of preference, the #title, #alt, or
+  # empty string "".
+  def caption
+    title || alt || ''
+  end
+  alias :text :caption
+  ##
+  # The class attribute of the image
+  def dom_class
+    node['class']
+  end
+  ##
+  # The id attribute of the image
+  def dom_id
+    node['id']
+  end
+  ##
+  # The suffix of the #url. The dot is a part of suffix, not a delimiter.
+  #
+  #   p image.url     # => "http://example/test.jpg"
+  #   p image.extname # => ".jpg"
+  #
+  # Returns an empty string if #url has no suffix:
+  #
+  #   p image.url     # => "http://example/sampleimage"
+  #   p image.extname # => ""
+  def extname
+    return nil unless src
+    File.extname url.path
+  end
+  ##
+  # Downloads the image.
+  #
+  #   agent.page.image_with(:src => /logo/).fetch.save
+  #
+  # The referer is:
+  #
+  # #page("parent") ::
+  #   all images on http html, relative #src images on https html
+  # (no referer)    ::
+  #   absolute #src images on https html
+  # user specified  ::
+  #   img.fetch(nil, my_referer_uri_or_page)
+  def fetch parameters = [], referer = nil, headers = {}
+    mech.get src, parameters, referer || image_referer, headers
+  end
+  ##
+  # The height attribute of the image
+  def height
+    node['height']
+  end
+  def image_referer # :nodoc:
+    http_page  = page.uri && page.uri.scheme == 'http'
+    https_page = page.uri && page.uri.scheme == 'https'
+    case
+    when http_page               then page
+    when https_page && relative? then page
+    else
+      Mechanize::File.new(nil, { 'content-type' => 'text/plain' }, '', 200)
+    end
+  end
+  ##
+  # MIME type guessed from the image url suffix
+  #
+  #   p image.extname   # => ".jpg"
+  #   p image.mime_type # => "image/jpeg"
+  #   page.images_with(:mime_type => /gif|jpeg|png/).each do ...
+  #
+  # Returns nil if url has no (well-known) suffix:
+  #
+  #   p image.url       # => "http://example/sampleimage"
+  #   p image.mime_type # => nil
+  def mime_type
+    suffix_without_dot = extname ? extname.sub(/\A\./){''}.downcase : nil
+    Mechanize::Util::DefaultMimeTypes[suffix_without_dot]
   end
+  def pretty_print(q) # :nodoc:
+    q.object_group(self) {
+      q.breakable; q.pp url
+      q.breakable; q.pp caption
+    }
+  end
+  alias inspect pretty_inspect # :nodoc:
+  def relative? # :nodoc:
+    %r{^https?://} !~ src
+  end
+  ##
+  # The src attribute of the image
   def src
-    @node['src']
+    node['src']
   end
+  ##
+  # The title attribute of the image
+  def title
+    node['title']
+  end
+  ##
+  # The URL string of this image
+  def to_s
+    url.to_s
+  end
+  ##
+  # URI for this image
   def url
-    case src
-    when %r{^https?://}
-      src
-    else
-      if page.bases[0]
-        (page.bases[0].href + src).to_s
+    if relative? then
+      if page.bases[0] then
+         page.bases[0].href + src
       else
-        (page.uri + src).to_s
+        page.uri + src
       end
+    else
+      src
     end
   end
+  ##
+  # The width attribute of the image
+  def width
+    node['width']
+  end
 end

data/lib/mechanize/page/link.rb CHANGED Viewed

@@ -63,6 +63,11 @@ class Mechanize::Page::Link
     rel.include? kind
   end
+  # Test if this link should not be traced.
+  def noreferrer?
+    rel?('noreferrer')
+  end
   # The text content of this link
   def text
     return @text if @text

data/lib/mechanize/page/meta_refresh.rb CHANGED Viewed

@@ -22,7 +22,7 @@ class Mechanize::Page::MetaRefresh < Mechanize::Page::Link
   #   $1:: delay
   #   $3:: url
-  CONTENT_REGEXP = /^\s*(\d+\.?\d*)(;|;\s*url=\s*['"]?(\S*?)['"]?)?\s*$/i
+  CONTENT_REGEXP = /^\s*(\d+\.?\d*)\s*(?:;(?:\s*url\s*=\s*(['"]?)(\S*)\2)?\s*)?$/i
   ##
   # Regexp of unsafe URI characters that excludes % for Issue #177
@@ -30,46 +30,49 @@ class Mechanize::Page::MetaRefresh < Mechanize::Page::Link
   UNSAFE = /[^\-_.!~*'()a-zA-Z\d;\/?:@&%=+$,\[\]]/
   ##
-  # Parses the delay and url from the content attribute of a meta refresh
-  # element.  Parse requires the uri of the current page to infer a url when
-  # no url is specified.
+  # Parses the delay and url from the content attribute of a meta
+  # refresh element.
   #
-  # Returns an array of [delay, url]. (both in string)
-  #
-  # Returns nil if the delay and url cannot be parsed.
-  def self.parse content, base_uri
-    return unless content =~ CONTENT_REGEXP
-    link_self = $3.nil? || $3.empty?
-    delay = $1
-    refresh_uri = $3
-    refresh_uri = Mechanize::Util.uri_escape refresh_uri, UNSAFE if refresh_uri
-    dest = base_uri
-    dest += refresh_uri if refresh_uri
-    return delay, dest, link_self
+  # Returns an array of [delay, url, link_self], where the first two
+  # are strings containing the respective parts of the refresh value,
+  # and link_self is a boolean value that indicates whether the url
+  # part is missing or empty.  If base_uri, the URI of the current
+  # page, is given, the value of url becomes an absolute URI.
+  def self.parse content, base_uri = nil
+    m = CONTENT_REGEXP.match(content) or return
+    delay, url = m[1], m[3]
+    url &&= url.empty? ? nil : Mechanize::Util.uri_escape(url, UNSAFE)
+    link_self = url.nil?
+    if base_uri
+      url = url ? base_uri + url : base_uri
+    end
+    return delay, url, link_self
   end
-  def self.from_node node, page, uri
-    http_equiv = node['http-equiv']
-    return unless http_equiv and http_equiv.downcase == 'refresh'
+  def self.from_node node, page, uri = nil
+    http_equiv = node['http-equiv'] and
+      /\ARefresh\z/i =~ http_equiv or return
     delay, uri, link_self = parse node['content'], uri
     return unless delay
-    new node, page, delay, uri.to_s, link_self
+    new node, page, delay, uri, link_self
   end
   def initialize node, page, delay, href, link_self = false
     super node, page.mech, page
-    @delay     = delay =~ /\./ ? delay.to_f : delay.to_i
+    @delay     = delay.include?(?.) ? delay.to_f : delay.to_i
     @href      = href
     @link_self = link_self
   end
+  def noreferrer?
+    true
+  end
 end

data/lib/mechanize/pluggable_parsers.rb CHANGED Viewed

@@ -3,13 +3,10 @@ require 'mechanize/file_saver'
 require 'mechanize/page'
 ##
-# This class is used to register and maintain pluggable parsers for Mechanize
-# to use.
-#
 # Mechanize allows different parsers for different content types.  Mechanize
 # uses PluggableParser to determine which parser to use for any content type.
-# To use your own pluggable parser or to change the default pluggable parsers,
-# register them with this class.
+# To use your own parser or to change the default parsers, register them with
+# this class through Mechanize#pluggable_parser.
 #
 # The default parser for unregistered content types is Mechanize::File.
 #
@@ -22,8 +19,8 @@ require 'mechanize/page'
 # == Example
 #
 # To create your own parser, just create a class that takes four parameters in
-# the constructor.  Here is an example of registering a pluggable parser that
-# handles CSV files:
+# the constructor.  Here is an example of registering a parser that handles
+# CSV files:
 #
 #   require 'csv'
 #
@@ -43,8 +40,8 @@ require 'mechanize/page'
 # Now any response with a content type of 'text/csv' will initialize a
 # CSVParser and return that object to the caller.
 #
-# To register a pluggable parser for a content type that pluggable parser does
-# not know about, use the hash syntax:
+# To register a parser for a content type that Mechanize does not know about,
+# use the hash syntax:
 #
 #   agent.pluggable_parser['text/something'] = SomeClass
 #
@@ -73,6 +70,7 @@ class Mechanize::PluggableParser
       CONTENT_TYPES[:html]  => Mechanize::Page,
       CONTENT_TYPES[:xhtml] => Mechanize::Page,
       CONTENT_TYPES[:wap]   => Mechanize::Page,
+      'image'               => Mechanize::Image
     }
     @default = Mechanize::File
@@ -81,11 +79,24 @@ class Mechanize::PluggableParser
   ##
   # Returns the parser registered for the given +content_type+
-  def parser(content_type)
-    content_type.nil? ? default : @parsers[content_type] || default
+  def parser content_type
+    return default unless content_type
+    parser = @parsers[content_type]
+    return parser if parser
+    mime_type = MIME::Type.new content_type
+    parser = @parsers[mime_type.to_s] ||
+             @parsers[mime_type.simplified] ||
+             @parsers[mime_type.media_type] ||
+             default
+  rescue MIME::InvalidContentType
+    default
   end
-  def register_parser(content_type, klass) # :nodoc:
+  def register_parser content_type, klass # :nodoc:
     @parsers[content_type] = klass
   end
@@ -135,9 +146,12 @@ class Mechanize::PluggableParser
   ##
   # Sets the parser for +content_type+ content to +klass+
+  #
+  # The +content_type+ may either be a full MIME type, a simplified MIME type
+  # ('text/x-csv' simplifies to 'text/csv'), or a media type like 'image'.
-  def []=(content_type, klass)
-    @parsers[content_type] = klass
+  def []= content_type, klass
+    register_parser content_type, klass
   end
 end

data/lib/mechanize/util.rb CHANGED Viewed

@@ -21,6 +21,12 @@ class Mechanize::Util
                       [Iconv::InvalidEncoding, Iconv::IllegalSequence]
                     end
+  # default mime type data for Page::Image#mime_type.
+  # You can use another Apache-compatible mimetab.
+  #   mimetab = WEBrick::HTTPUtils.load_mime_types('/etc/mime.types')
+  #   Mechanize::Util::DefaultMimeTypes.replace(mimetab)
+  DefaultMimeTypes = WEBrick::HTTPUtils::DefaultMimeTypes
   def self.build_query_string(parameters, enc=nil)
     parameters.map { |k,v|
       # WEBrick::HTTP.escape* has some problems about m17n on ruby-1.9.*.