RubyGems - mechanize - Versions diffs - 2.0.pre.2 → 2.0 - Mend

mechanize 2.0.pre.2 → 2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mechanize might be problematic. Click here for more details.

Files changed (46) hide show

data.tar.gz.sig +0 -0
data/CHANGELOG.rdoc +22 -0
data/Manifest.txt +11 -8
data/Rakefile +2 -2
data/examples/flickr_upload.rb +6 -7
data/examples/mech-dump.rb +0 -2
data/examples/proxy_req.rb +0 -2
data/examples/rubyforge.rb +1 -3
data/examples/spider.rb +2 -3
data/lib/mechanize.rb +228 -680
data/lib/mechanize/form/field.rb +1 -1
data/lib/mechanize/history.rb +23 -5
data/lib/mechanize/http.rb +3 -0
data/lib/mechanize/http/agent.rb +738 -0
data/lib/mechanize/inspect.rb +2 -2
data/lib/mechanize/page.rb +101 -42
data/lib/mechanize/page/frame.rb +24 -17
data/lib/mechanize/page/link.rb +72 -54
data/lib/mechanize/page/meta_refresh.rb +56 -0
data/lib/mechanize/response_read_error.rb +27 -0
data/test/htdocs/frame_referer_test.html +10 -0
data/test/htdocs/tc_referer.html +4 -0
data/test/test_frames.rb +9 -0
data/test/test_history.rb +74 -98
data/test/test_mechanize.rb +334 -812
data/test/test_mechanize_form.rb +32 -3
data/test/{test_textarea.rb → test_mechanize_form_textarea.rb} +1 -1
data/test/test_mechanize_http_agent.rb +697 -0
data/test/test_mechanize_link.rb +83 -0
data/test/test_mechanize_page_encoding.rb +147 -0
data/test/test_mechanize_page_link.rb +379 -0
data/test/test_mechanize_page_meta_refresh.rb +115 -0
data/test/test_pretty_print.rb +1 -1
data/test/test_referer.rb +29 -5
data/test/test_response_code.rb +21 -20
data/test/test_robots.rb +13 -17
data/test/test_scheme.rb +1 -1
metadata +30 -31
metadata.gz.sig +0 -0
data/lib/mechanize/page/meta.rb +0 -48
data/test/test_form_no_inputname.rb +0 -15
data/test/test_links.rb +0 -146
data/test/test_mechanize_page.rb +0 -224
data/test/test_meta.rb +0 -67
data/test/test_upload.rb +0 -109
data/test/test_verbs.rb +0 -25

data/lib/mechanize/inspect.rb CHANGED

@@ -17,8 +17,8 @@ class Mechanize
         q.breakable
         q.group(1, '{url', '}') {q.breakable; q.pp uri }
         q.breakable
-        q.group(1, '{meta', '}') {
-          meta.each { |link| q.breakable; q.pp link }
+        q.group(1, '{meta_refresh', '}') {
+          meta_refresh.each { |link| q.breakable; q.pp link }
         }
         q.breakable
         q.group(1, '{title', '}') { q.breakable; q.pp title }

data/lib/mechanize/page.rb CHANGED

@@ -24,36 +24,32 @@ class Mechanize::Page < Mechanize::File
     raise Mechanize::ContentTypeError, response['content-type'] unless
       response['content-type'] =~ /^(text\/html)|(application\/xhtml\+xml)/i
+    @meta_content_type = nil
     @encoding = nil
     @encodings = [nil]
+    raise 'no' if mech and not Mechanize === mech
     @mech = mech
     reset
     @encodings << Mechanize::Util.detect_charset(body) if body
-    response.each do |header, value|
-      next unless value =~ /charset/i
-      @encodings << charset(value)
-    end
+    @encodings.concat self.class.response_header_charset(response)
     if body
       # Force the encoding to be 8BIT so we can perform regular expressions.
       # We'll set it to the detected encoding later
-      body.force_encoding('ASCII-8BIT') if body.respond_to?(:force_encoding)
+      body.force_encoding 'ASCII-8BIT' if body.respond_to? :force_encoding
-      body.scan(/<meta .*?>/i) do |meta|
-        next unless meta =~ /http-equiv\s*=\s*(["'])?content-type\1/i
+      @encodings.concat self.class.meta_charset body
-        meta =~ /content=(["'])?(.*?)\1/i
-        encoding = charset $2
-        @encodings << encoding if encoding
-      end
+      meta_content_type = self.class.meta_content_type body
+      @meta_content_type = meta_content_type if meta_content_type
     end
-    super(uri, response, body, code)
+    @encodings << mech.default_encoding if mech and mech.default_encoding
+    super uri, response, body, code
   end
   def title
@@ -64,10 +60,16 @@ class Mechanize::Page < Mechanize::File
       end
   end
-  def charset content_type
-    charset = content_type[/charset=([^; ]+)/i, 1]
-    return nil if charset == 'none'
-    charset
+  def response_header_charset
+    self.class.response_header_charset(response)
+  end
+  def meta_charset
+    self.class.meta_charset(body)
+  end
+  def detected_encoding
+    Mechanize::Util.detect_charset(body)
   end
   def encoding=(encoding)
@@ -90,23 +92,31 @@ class Mechanize::Page < Mechanize::File
     parser.respond_to?(:encoding) ? parser.encoding : nil
   end
+  # Return whether parser result has errors related to encoding or not.
+  # false indicates only that the parser has no encoding errors, not that the encoding is valid.
+  def encoding_error?(parser=nil)
+    parser = self.parser unless parser
+    return false if parser.errors.empty?
+    parser.errors.any? do |error|
+      error.message =~ /(indicate\ encoding)|
+                        (Invalid\ char)|
+                        (input\ conversion\ failed)/x
+    end
+  end
   def parser
     return @parser if @parser
     return nil unless @body
     if @encoding then
-      @parser = mech.html_parser.parse(html_body, nil, @encoding)
+      @parser = @mech.html_parser.parse html_body, nil, @encoding
+    elsif mech.force_default_encoding then
+      @parser = @mech.html_parser.parse html_body, nil, @mech.default_encoding
     else
       @encodings.reverse_each do |encoding|
-        @parser = mech.html_parser.parse(html_body, nil, encoding)
+        @parser = @mech.html_parser.parse html_body, nil, encoding
-        break if @parser.errors.empty?
-        break unless @parser.errors.any? do |error|
-          error.message =~ /(indicate\ encoding)|
-                            (Invalid\ char)|
-                            (input\ conversion failed)/x
-        end
+        break unless encoding_error? @parser
       end
     end
@@ -123,7 +133,7 @@ class Mechanize::Page < Mechanize::File
     @links = nil
     @labels = nil
     @labels_hash = nil
-    @meta = nil
+    @meta_refresh = nil
     @parser = nil
     @title = nil
   end
@@ -142,7 +152,7 @@ class Mechanize::Page < Mechanize::File
   # Get the content type
   def content_type
-    response['content-type']
+    @meta_content_type || response['content-type']
   end
   # Search through the page like HPricot
@@ -263,18 +273,13 @@ class Mechanize::Page < Mechanize::File
   end
   ##
-  # Return a list of all meta tags
-  def meta
-    @meta ||= search('head > meta').map do |node|
-      next unless node['http-equiv'] && node['content']
-      (equiv, content) = node['http-equiv'], node['content']
-      if equiv && equiv.downcase == 'refresh'
-        Meta.parse(content, uri) do |delay, href|
-          node['delay'] = delay
-          node['href'] = href
-          Meta.new(node, @mech, self)
-        end
-      end
+  # Return a list of all meta refresh elements
+  def meta_refresh
+    query = @mech.follow_meta_refresh == :anywhere ? 'meta' : 'head > meta'
+    @meta_refresh ||= search(query).map do |node|
+      MetaRefresh.from_node node, self, uri
     end.compact
   end
@@ -328,6 +333,54 @@ class Mechanize::Page < Mechanize::File
     return @labels_hash
   end
+  def self.charset content_type
+    charset = content_type[/charset=([^; ]+)/i, 1]
+    return nil if charset == 'none'
+    charset
+  end
+  def self.response_header_charset response
+    charsets = []
+    response.each do |header, value|
+      next unless value =~ /charset/i
+      charsets << charset(value)
+    end
+    charsets
+  end
+  ##
+  # Retrieves all charsets from +meta+ tags in +body+
+  def self.meta_charset body
+    # HACK use .map
+    body.scan(/<meta .*?>/i).map do |meta|
+      if meta =~ /charset\s*=\s*(["'])?\s*(.+)\s*\1/i then
+        $2
+      elsif meta =~ /http-equiv\s*=\s*(["'])?content-type\1/i then
+        meta =~ /content=(["'])?(.*?)\1/i
+        m_charset = charset $2
+        m_charset if m_charset
+      end
+    end.compact
+  end
+  ##
+  # Retrieves the last <tt>content-type</tt> set by a +meta+ tag in +body+
+  def self.meta_content_type body
+    body.scan(/<meta .*?>/i).reverse.map do |meta|
+      if meta =~ /http-equiv\s*=\s*(["'])?content-type\1/i then
+        meta =~ /content=(["'])?(.*?)\1/i
+        return $2
+      end
+    end
+    nil
+  end
   private
   def html_body
@@ -337,6 +390,12 @@ class Mechanize::Page < Mechanize::File
       ''
     end
   end
+  def self.charset_from_content_type content_type
+    charset = content_type[/charset=([^; ]+)/i, 1]
+    return nil if charset == 'none'
+    charset
+  end
 end
 require 'mechanize/headers'
@@ -345,5 +404,5 @@ require 'mechanize/page/label'
 require 'mechanize/page/link'
 require 'mechanize/page/base'
 require 'mechanize/page/frame'
-require 'mechanize/page/meta'
+require 'mechanize/page/meta_refresh'

data/lib/mechanize/page/frame.rb CHANGED

@@ -1,20 +1,27 @@
-class Mechanize
-  class Page < Mechanize::File
-    # This class encapsulates a 'frame' tag.  Frame objects can be treated
-    # just like Link objects.  They contain src, the link they refer to,
-    # name, the name of the frame.  'src' and 'name' are aliased to 'href'
-    # and 'text' respectively so that a Frame object can be treated just
-    # like a Link.
-    class Frame < Link
-      alias :src :href
-      alias :name :text
+# This class encapsulates a 'frame' tag.  Frame objects can be treated just
+# like Link objects.  They contain #src, the #link they refer to and a #name,
+# the name of the frame they refer to.  #src and #name are aliased to #href
+# and #text respectively so that a Frame object can be treated just like a
+# Link.
-      def initialize(node, mech, referer)
-        super(node, mech, referer)
-        @node = node
-        @text = node['name']
-        @href = node['src']
-      end
-    end
+class Mechanize::Page::Frame < Mechanize::Page::Link
+  alias :src :href
+  attr_reader :text
+  alias :name :text
+  def initialize(node, mech, referer)
+    super(node, mech, referer)
+    @node = node
+    @text = node['name']
+    @href = node['src']
+    @content = nil
   end
+  def content
+    @content ||= @mech.get @href, [], page
+  end
 end

data/lib/mechanize/page/link.rb CHANGED

@@ -1,64 +1,82 @@
-class Mechanize
-  class Page < Mechanize::File
-    # This class encapsulates links.  It contains the text and the URI for
-    # 'a' tags parsed out of an HTML page.  If the link contains an image,
-    # the alt text will be used for that image.
-    #
-    # For example, the text for the following links with both be 'Hello World':
-    #
-    # <a href="http://rubyforge.org">Hello World</a>
-    # <a href="http://rubyforge.org"><img src="test.jpg" alt="Hello World"></a>
-    class Link
-      attr_reader :node
-      attr_reader :href
-      attr_reader :text
-      attr_reader :attributes
-      attr_reader :page
-      alias :to_s :text
-      alias :referer :page
+##
+# This class encapsulates links.  It contains the text and the URI for
+# 'a' tags parsed out of an HTML page.  If the link contains an image,
+# the alt text will be used for that image.
+#
+# For example, the text for the following links will both be 'Hello World':
+#
+#   <a href="http://example">Hello World</a>
+#   <a href="http://example"><img src="test.jpg" alt="Hello World"></a>
-      def initialize(node, mech, page)
-        @node = node
-        @href = node['href']
-        @text = node.inner_text
-        @page = page
-        @mech = mech
-        @attributes = node
+class Mechanize::Page::Link
+  attr_reader :node
+  attr_reader :href
+  attr_reader :attributes
+  attr_reader :page
+  alias :referer :page
-        # If there is no text, try to find an image and use it's alt text
-        if (@text.nil? || @text.length == 0) && node.search('img').length > 0
-          @text = ''
-          node.search('img').each do |e|
-            @text << ( e['alt'] || '')
-          end
-        end
+  def initialize(node, mech, page)
+    @node       = node
+    @attributes = node
+    @href       = node['href']
+    @mech       = mech
+    @page       = page
+    @text       = nil
+    @uri        = nil
+  end
+  # Click on this link
+  def click
+    @mech.click self
+  end
-      end
+  # This method is a shorthand to get link's DOM id.
+  # Common usage:
+  #   page.link_with(:dom_id => "links_exact_id")
+  def dom_id
+    node['id']
+  end
-      def uri
-        @href && URI.parse(WEBrick::HTTPUtils.escape(@href))
-      end
+  # A list of words in the rel attribute, all lower-cased.
+  def rel
+    @rel ||= (val = attributes['rel']) ? val.downcase.split(' ') : []
+  end
+  # Test if the rel attribute includes +kind+.
+  def rel? kind
+    rel.include? kind
+  end
-      # A list of words in the rel attribute, all lower-cased.
-      def rel
-        @rel ||= (val = attributes['rel']) ? val.downcase.split(' ') : []
-      end
+  # The text content of this link
+  def text
+    return @text if @text
-      # Test if the rel attribute includes +kind+.
-      def rel?(kind)
-        rel.include?(kind)
-      end
+    @text = @node.inner_text
-      # Click on this link
-      def click
-        @mech.click self
-      end
-      # This method is a shorthand to get link's DOM id.
-      # Common usage: page.link_with(:dom_id => "links_exact_id")
-      def dom_id
-        node['id']
-      end
+    # If there is no text, try to find an image and use its alt text
+    if (@text.nil? or @text.empty?) and imgs = @node.search('img') then
+      @text = imgs.map do |e|
+        e['alt']
+      end.join
     end
+    @text
   end
+  alias :to_s :text
+  # A URI for the #href for this link.  The link is first parsed as a raw
+  # link.  If that fails, parsing an escaped link is attempted.
+  def uri
+    @uri ||= if @href then
+               begin
+                 URI.parse @href
+               rescue URI::InvalidURIError
+                 URI.parse WEBrick::HTTPUtils.escape @href
+               end
+             end
+  end
 end

data/lib/mechanize/page/meta_refresh.rb ADDED

@@ -0,0 +1,56 @@
+##
+# This class encapsulates a meta element with a refresh http-equiv.  Mechanize
+# treats meta refresh elements just like 'a' tags.  MetaRefresh objects will
+# contain links, but most likely will have no text.
+class Mechanize::Page::MetaRefresh < Mechanize::Page::Link
+  attr_reader :delay
+  ##
+  # Matches the content attribute of a meta refresh element.  After the match:
+  #
+  #   $1:: delay
+  #   $3:: url
+  CONTENT_REGEXP = /^\s*(\d+\.?\d*)(;|;\s*url=\s*['"]?(\S*?)['"]?)?\s*$/i
+  ##
+  # Parses the delay and url from the content attribute of a meta refresh
+  # element.  Parse requires the uri of the current page to infer a url when
+  # no url is specified.
+  #
+  # Returns a MetaRefresh instance.
+  #
+  # Returns nil if the delay and url cannot be parsed.
+  def self.parse content, base_uri
+    return unless content =~ CONTENT_REGEXP
+    delay, refresh_uri = $1, $3
+    dest = base_uri
+    dest += refresh_uri if refresh_uri
+    return delay, dest
+  end
+  def self.from_node node, page, uri
+    http_equiv = node['http-equiv']
+    return unless http_equiv and http_equiv.downcase == 'refresh'
+    delay, uri = parse node['content'], uri
+    return unless delay
+    new node, page, delay, uri.to_s
+  end
+  def initialize node, page, delay, href
+    super node, page.mech, page
+    @delay = delay.to_i
+    @href  = href
+  end
+end