RubyGems - mechanize - Versions diffs - 0.6.5 → 0.6.6 - Mend

mechanize 0.6.5 → 0.6.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mechanize might be problematic. More details are linked from the original diff page.

Files changed (17)

data/CHANGELOG.txt +10 -0
data/Manifest.txt +2 -1
data/lib/mechanize.rb +99 -92
data/lib/mechanize/cookie.rb +5 -3
data/lib/mechanize/form.rb +1 -1
data/lib/mechanize/form_elements.rb +1 -1
data/lib/mechanize/history.rb +62 -0
data/lib/mechanize/page_elements.rb +6 -2
data/lib/mechanize/rexml.rb +3 -3
data/test/htdocs/alt_text.html +1 -0
data/test/tc_cookie_class.rb +1 -1
data/test/tc_history.rb +125 -0
data/test/tc_links.rb +2 -1
data/test/tc_mech.rb +14 -1
data/test/test_all.rb +1 -0
metadata +4 -3
data/lib/mechanize/hpricot.rb +0 -9

data/CHANGELOG.txt CHANGED

@@ -1,5 +1,15 @@
 = Mechanize CHANGELOG
+== 0.6.6
+* Removing hpricot overrides
+* Fixed a bug where alt text can be nil.  Thanks Yannick!
+* Unparseable expiration dates in cookies are now treated as session cookies
+* Caching connections
+* Requests now default to keep alive
+* [#9434] Fixed bug where html entities weren't decoded
+* [#9150] Updated mechanize history to deal with redirects
 == 0.6.5
 * Copying headers to a hash to prevent memory leaks

data/Manifest.txt CHANGED

@@ -16,7 +16,7 @@ lib/mechanize/cookie.rb
 lib/mechanize/errors.rb
 lib/mechanize/form.rb
 lib/mechanize/form_elements.rb
-lib/mechanize/hpricot.rb
+lib/mechanize/history.rb
 lib/mechanize/inspect.rb
 lib/mechanize/list.rb
 lib/mechanize/net-overrides/net/http.rb
@@ -83,6 +83,7 @@ test/tc_form_no_inputname.rb
 test/tc_forms.rb
 test/tc_frames.rb
 test/tc_gzipping.rb
+test/tc_history.rb
 test/tc_html_unscape_forms.rb
 test/tc_if_modified_since.rb
 test/tc_links.rb

data/lib/mechanize.rb CHANGED

@@ -28,12 +28,12 @@ require 'uri'
 require 'webrick/httputils'
 require 'zlib'
 require 'stringio'
-require 'mechanize/hpricot'
 require 'mechanize/cookie'
 require 'mechanize/errors'
 require 'mechanize/pluggable_parsers'
 require 'mechanize/form'
 require 'mechanize/form_elements'
+require 'mechanize/history'
 require 'mechanize/list'
 require 'mechanize/page'
 require 'mechanize/page_elements'
@@ -62,7 +62,7 @@ class Mechanize
   ##
   # The version of Mechanize you are using.
-  VERSION = '0.6.5'
+  VERSION = '0.6.6'
   ##
   # User Agent aliases
@@ -80,7 +80,6 @@ class Mechanize
   attr_accessor :cookie_jar
   attr_accessor :log
-  attr_accessor :max_history
   attr_accessor :open_timeout, :read_timeout
   attr_accessor :user_agent
   attr_accessor :watch_for_set
@@ -89,6 +88,7 @@ class Mechanize
   attr_accessor :cert
   attr_accessor :pass
   attr_accessor :redirect_ok
+  attr_accessor :keep_alive_time
   attr_reader :history
   attr_reader :pluggable_parser
@@ -97,9 +97,8 @@ class Mechanize
   def initialize
     # attr_accessors
-    @cookie_jar = CookieJar.new
+    @cookie_jar     = CookieJar.new
     @log            = nil
-    @max_history    = nil
     @open_timeout   = nil
     @read_timeout   = nil
     @user_agent     = AGENT_ALIASES['Mechanize']
@@ -111,7 +110,7 @@ class Mechanize
     @redirect_ok    = true # Should we follow redirects?
     # attr_readers
-    @history        = []
+    @history        = WWW::Mechanize::History.new
     @pluggable_parser = PluggableParser.new
     # Basic Auth variables
@@ -124,9 +123,16 @@ class Mechanize
     @proxy_port     = nil
     @proxy_user     = nil
+    # Connection Cache & Keep alive
+    @connection_cache = {}
+    @keep_alive_time  = 300
     yield self if block_given?
   end
+  def max_history=(length); @history.max_size = length; end
+  def max_history; @history.max_size; end
   # Sets the proxy address, port, user, and password
   def set_proxy(addr, port, user = nil, pass = nil)
     @proxy_addr, @proxy_port, @proxy_user, @proxy_pass = addr, port, user, pass
@@ -241,9 +247,10 @@ class Mechanize
   # Returns a visited page for the url passed in, otherwise nil
   def visited_page(url)
-    url = url.uri if url.respond_to? :uri
-    uri = to_absolute_uri(url).to_s
-    @history.reverse.find { |h| h.uri.to_s == uri }
+    if url.respond_to? :href
+      url = url.href
+    end
+    @history.visited_page(to_absolute_uri(url))
   end
   # Runs given block, then resets the page history as it was before. self is
@@ -261,6 +268,8 @@ class Mechanize
   protected
   def set_headers(uri, request, cur_page)
+    request.add_field('Connection', 'keep-alive')
+    request.add_field('Keep-Alive', keep_alive_time.to_s)
     request.add_field('Accept-Encoding', 'gzip,identity')
     request.add_field('Accept-Language', 'en-us,en;q0.5')
     request.add_field('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7')
@@ -304,8 +313,8 @@ class Mechanize
                   url.scan(/%[0-9A-Fa-f]{2}/)
                 ).map { |x,y|
                   "#{URI.escape(x)}#{y}"
-                }.join('')
-              ).gsub(/%23/, '#')
+                }.join('').gsub(/%23/, '#')
+              )
             )
     end
@@ -357,15 +366,19 @@ class Mechanize
     page = nil
-    http_obj = Net::HTTP.new( uri.host,
-                          uri.port,
-                          @proxy_addr,
-                          @proxy_port,
-                          @proxy_user,
-                          @proxy_pass
-                        )
+    http_obj = @connection_cache["#{uri.host}:#{uri.port}"]
+    if http_obj.nil? || ! http_obj.started?
+      http_obj = @connection_cache["#{uri.host}:#{uri.port}"] =
+          Net::HTTP.new( uri.host,
+                  uri.port,
+                  @proxy_addr,
+                  @proxy_port,
+                  @proxy_user,
+                  @proxy_pass
+                )
+    end
-    if uri.scheme == 'https'
+    if uri.scheme == 'https' && ! http_obj.started?
       http_obj.use_ssl = true
       http_obj.verify_mode = OpenSSL::SSL::VERIFY_NONE
       if @ca_file
@@ -387,86 +400,86 @@ class Mechanize
       end
     end
-    http_obj.start { |http|
-      # Specify timeouts if given
-      http.open_timeout = @open_timeout if @open_timeout
-      http.read_timeout = @read_timeout if @read_timeout
-      # Send the request
-      http.request(request, *request_data) {|response|
-        if log
-          response.each_header {|k,v|
-            log.debug("response-header: #{ k } => #{ v }")
-          }
-        end
+    # Specify timeouts if given
+    http_obj.open_timeout = @open_timeout if @open_timeout
+    http_obj.read_timeout = @read_timeout if @read_timeout
-        (response.get_fields('Set-Cookie')||[]).each do |cookie|
-          Cookie::parse(uri, cookie, log) { |c|
-            log.debug("saved cookie: #{c}") if log
-            @cookie_jar.add(uri, c)
-          }
-        end
+    # Send the request
+    http_obj.request(request, *request_data) {|response|
+      if log
+        response.each_header {|k,v|
+          log.debug("response-header: #{ k } => #{ v }")
+        }
+      end
-        body = StringIO.new
-        total = 0
-        response.read_body { |part|
-          total += part.length
-          body.write(part)
-          log.debug("Read #{total} bytes") if log
+      (response.get_fields('Set-Cookie')||[]).each do |cookie|
+        Cookie::parse(uri, cookie, log) { |c|
+          log.debug("saved cookie: #{c}") if log
+          @cookie_jar.add(uri, c)
         }
-        body.rewind
+      end
-        content_type = nil
-        unless response['Content-Type'].nil?
-          data = response['Content-Type'].match(/^([^;]*)/)
-          content_type = data[1].downcase unless data.nil?
-        end
+      body = StringIO.new
+      total = 0
+      response.read_body { |part|
+        total += part.length
+        body.write(part)
+        log.debug("Read #{total} bytes") if log
+      }
+      body.rewind
-        response_body =
-        if encoding = response['Content-Encoding']
-          case encoding.downcase
-          when 'gzip'
-            log.debug('gunzip body') if log
-            Zlib::GzipReader.new(body).read
-          else
-            raise 'Unsupported content encoding'
-          end
+      content_type = nil
+      unless response['Content-Type'].nil?
+        data = response['Content-Type'].match(/^([^;]*)/)
+        content_type = data[1].downcase unless data.nil?
+      end
+      response_body =
+      if encoding = response['Content-Encoding']
+        case encoding.downcase
+        when 'gzip'
+          log.debug('gunzip body') if log
+          Zlib::GzipReader.new(body).read
         else
-          body.read
+          raise 'Unsupported content encoding'
         end
+      else
+        body.read
+      end
-        # Find our pluggable parser
-        page = @pluggable_parser.parser(content_type).new(
-          uri,
-          response,
-          response_body,
-          response.code
-        ) { |parser|
-          parser.mech = self if parser.respond_to? :mech=
-          if parser.respond_to?(:watch_for_set=) && @watch_for_set
-            parser.watch_for_set = @watch_for_set
-          end
-        }
+      # Find our pluggable parser
+      page = @pluggable_parser.parser(content_type).new(
+        uri,
+        response,
+        response_body,
+        response.code
+      ) { |parser|
+        parser.mech = self if parser.respond_to? :mech=
+        if parser.respond_to?(:watch_for_set=) && @watch_for_set
+          parser.watch_for_set = @watch_for_set
+        end
+      }
-        log.info("status: #{ page.code }") if log
+      log.info("status: #{ page.code }") if log
-        res_klass = Net::HTTPResponse::CODE_TO_OBJ[page.code.to_s]
+      res_klass = Net::HTTPResponse::CODE_TO_OBJ[page.code.to_s]
-        return page if res_klass <= Net::HTTPSuccess
+      return page if res_klass <= Net::HTTPSuccess
-        if res_klass == Net::HTTPNotModified
-          log.debug("Got cached page") if log
-          return visited_page(uri)
-        elsif res_klass <= Net::HTTPRedirection
-          return page unless follow_redirect?
-          log.info("follow redirect to: #{ response['Location'] }") if log
-          abs_uri = to_absolute_uri(response['Location'].to_s, page)
-          request = fetch_request(abs_uri)
-          return fetch_page(abs_uri, request, page)
-        end
+      if res_klass == Net::HTTPNotModified
+        log.debug("Got cached page") if log
+        return visited_page(uri)
+      elsif res_klass <= Net::HTTPRedirection
+        return page unless follow_redirect?
+        log.info("follow redirect to: #{ response['Location'] }") if log
+        from_uri  = page.uri
+        abs_uri   = to_absolute_uri(response['Location'].to_s, page)
+        page = fetch_page(abs_uri, fetch_request(abs_uri), page)
+        @history.push(page, from_uri)
+        return page
+      end
-        raise ResponseCodeError.new(page), "Unhandled response", caller
-      }
+      raise ResponseCodeError.new(page), "Unhandled response", caller
     }
   end
@@ -484,12 +497,6 @@ class Mechanize
   def add_to_history(page)
     @history.push(page)
-    if @max_history and @history.length > @max_history
-      while @history.length > @max_history
-        @history[0] = nil
-        @history.shift
-      end
-    end
   end
   # :stopdoc:

data/lib/mechanize/cookie.rb CHANGED

@@ -23,10 +23,12 @@ module WWW
             when "domain"  then cookie.domain  = value.sub(/^\./, '')
             when "path"    then cookie.path    = value
             when 'expires'
-              cookie.expires = begin
-                Time::parse(value)
+              begin
+                cookie.expires = Time::parse(value)
               rescue
-                Time.now
+                if log
+                  log.warn("Couldn't parse expires: #{value}")
+                end
               end
             when "max-age" then
               begin

data/lib/mechanize/form.rb CHANGED

@@ -148,7 +148,7 @@ module WWW
         (@elements_node/'textarea').each do |node|
           next if node.attributes.nil?
           next if node.attributes['name'].nil?
-          @fields << Field.new(node.attributes['name'], node.all_text)
+          @fields << Field.new(node.attributes['name'], node.inner_text)
         end
         # Find all select tags

data/lib/mechanize/form_elements.rb CHANGED

@@ -218,7 +218,7 @@ module WWW
     def initialize(node, select_list)
       node.attributes ||= {}
-      @text     = node.all_text
+      @text     = node.inner_text
       @value    = Util.html_unescape(node.attributes['value'])
       @selected = node.attributes.has_key?('selected') ? true : false
       @select_list = select_list # The select list this option belongs to

data/lib/mechanize/history.rb ADDED

@@ -0,0 +1,62 @@
+module WWW
+  class Mechanize
+    ##
+    # This class manages history for your mechanize object.
+    class History < Array
+      attr_accessor :max_size
+      def initialize(max_size = nil)
+        @max_size       = max_size
+        @history_index  = {}
+      end
+      def push(page, uri = nil)
+        super(page)
+        @history_index[(uri ? uri : page.uri).to_s] = page
+        if @max_size && self.length > @max_size
+          while self.length > @max_size
+            self.shift
+          end
+        end
+        self
+      end
+      alias :<< :push
+      def visited?(url)
+        ! visited_page(url).nil?
+      end
+      def visited_page(url)
+        @history_index[(url.respond_to?(:uri) ? url.uri : url).to_s]
+      end
+      def clear
+        @history_index.clear
+        super
+      end
+      def shift
+        return nil if length == 0
+        page    = self[0]
+        self[0] = nil
+        super
+        remove_from_index(page)
+        page
+      end
+      def pop
+        return nil if length == 0
+        page = super
+        remove_from_index(page)
+        page
+      end
+      private
+      def remove_from_index(page)
+        @history_index.each do |k,v|
+          @history_index.delete(k) if v == page
+        end
+      end
+    end
+  end
+end

data/lib/mechanize/page_elements.rb CHANGED

@@ -21,7 +21,7 @@ module WWW
         node.attributes ||= {}
         @node = node
         @href = node.attributes['href']
-        @text = node.all_text
+        @text = node.inner_text
         @page = page
         @mech = mech
         @attributes = node.attributes
@@ -31,7 +31,11 @@ module WWW
           @text = ''
           (node/'img').each do |e|
             e.attributes ||= {}
-            @text << (e.attributes.has_key?('alt') ? e.attributes['alt'] : '')
+            @text << (
+                      (e.attributes.has_key?('alt') && e.attributes['alt']) ?
+                        e.attributes['alt'] :
+                        ''
+                     )
           end
         end

data/lib/mechanize/rexml.rb CHANGED

@@ -90,11 +90,11 @@ end
   #
   #   collect_text_recursively.flatten.join("")
-  def all_text
+  def inner_text
     collect_text_recursively.flatten.join("")
   end
-  alias :text :all_text
+  alias :text :inner_text
 end
@@ -163,7 +163,7 @@ def extract_from_table(root_node, headers, header_tags = %w(td th))
   header_nodes = headers.collect { |header|
     root_node.find_first_recursive {|node|
-      header_tags.include?(node.name.downcase) and header === node.all_text
+      header_tags.include?(node.name.downcase) and header === node.inner_text
     }
   }

data/test/htdocs/alt_text.html CHANGED

@@ -3,6 +3,7 @@
   <body>
     <a href="alt_text.html"><img alt="alt text" src="hello"></a>
     <a href="no_alt_text.html"><img src="hello"></a>
+    <a href="nil_alt_text.html"><img alt src="hello"></a>
     <a href="no_image.html">no image</a>
     <a href="no_text.html"></a>
   </body>

data/test/tc_cookie_class.rb CHANGED

@@ -90,7 +90,7 @@ class CookieClassTest < Test::Unit::TestCase
       dates.each do |date|
         cookie = "PREF=1; expires=#{date}"
         WWW::Mechanize::Cookie.parse(url, cookie) { |cookie|
-          assert_equal(true, cookie.expires > (Time.now - 86400))
+          assert_equal(true, cookie.expires.nil?)
         }
       end
     end

data/test/tc_history.rb ADDED

@@ -0,0 +1,125 @@
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+require 'test/unit'
+require 'rubygems'
+require 'mechanize'
+require 'test_includes'
+class TestHistory < Test::Unit::TestCase
+  include TestMethods
+  def setup
+    @agent    = WWW::Mechanize.new
+    @history  = WWW::Mechanize::History.new
+  end
+  def test_push
+    assert_equal(0, @history.length)
+    page = @agent.get("http://localhost/tc_bad_links.html")
+    x = @history.push(page)
+    assert_equal(x, @history)
+    assert_equal(1, @history.length)
+    assert(@history.visited?(page))
+    assert(@history.visited?(page.uri))
+    assert(@history.visited?(page.uri.to_s))
+    assert_equal(page, @history.visited_page(page))
+    assert_equal(page, @history.visited_page(page.uri))
+    assert_equal(page, @history.visited_page(page.uri.to_s))
+    @history.push(@agent.get("/tc_bad_links.html"))
+    assert_equal(2, @history.length)
+  end
+  def test_shift
+    assert_equal(0, @history.length)
+    page = @agent.get("http://localhost/tc_bad_links.html")
+    @history.push(page)
+    assert_equal(1, @history.length)
+    @history.push(@agent.get("/tc_bad_links.html"))
+    assert_equal(2, @history.length)
+    @history.push(@agent.get("/index.html"))
+    assert_equal(3, @history.length)
+    page2 = @history.shift
+    assert_equal(page, page2)
+    assert_equal(2, @history.length)
+    @history.shift
+    assert_equal(1, @history.length)
+    assert_equal(false, @history.visited?(page))
+    @history.shift
+    assert_equal(0, @history.length)
+    assert_nil(@history.shift)
+    assert_equal(0, @history.length)
+  end
+  def test_pop
+    assert_equal(0, @history.length)
+    page = @agent.get("http://localhost/tc_bad_links.html")
+    @history.push(page)
+    assert_equal(1, @history.length)
+    page2 = @agent.get("/index.html")
+    @history.push(page2)
+    assert_equal(2, @history.length)
+    assert_equal(page2, @history.pop)
+    assert_equal(1, @history.length)
+    assert_equal(true, @history.visited?(page))
+    assert_equal(false, @history.visited?(page2))
+    assert_equal(page, @history.pop)
+    assert_equal(0, @history.length)
+    assert_equal(false, @history.visited?(page))
+    assert_equal(false, @history.visited?(page2))
+    assert_nil(@history.pop)
+  end
+  def test_max_size
+    @history  = WWW::Mechanize::History.new(10)
+    1.upto(20) do |i|
+      page = @agent.get('http://localhost/index.html')
+      @history.push page
+      assert_equal(true, @history.visited?(page))
+      if i < 10
+        assert_equal(i, @history.length)
+      else
+        assert_equal(10, @history.length)
+      end
+    end
+    @history.clear
+    @history.max_size = 5
+    1.upto(20) do |i|
+      page = @agent.get('http://localhost/index.html')
+      @history.push page
+      assert_equal(true, @history.visited?(page))
+      if i < 5
+        assert_equal(i, @history.length)
+      else
+        assert_equal(5, @history.length)
+      end
+    end
+    @history.max_size = 0
+    1.upto(20) do |i|
+      page = @agent.get('http://localhost/index.html')
+      @history.push page
+      assert_equal(false, @history.visited?(page))
+      assert_equal(0, @history.length)
+    end
+  end
+  def test_clear
+    page = nil
+    20.times { @history.push(page = @agent.get('http://localhost/index.html')) }
+    assert_equal(20, @history.length)
+    assert_equal(true, @history.visited?(page))
+    @history.clear
+    assert_equal(0, @history.length)
+    assert_equal(false, @history.visited?(page))
+  end
+end

data/test/tc_links.rb CHANGED

@@ -26,7 +26,7 @@ class LinksMechTest < Test::Unit::TestCase
   def test_alt_text
     page = @agent.get("http://localhost:#{PORT}/alt_text.html")
-    assert_equal(4, page.links.length)
+    assert_equal(5, page.links.length)
     assert_equal(1, page.meta.length)
     assert_equal('', page.meta.first.text)
@@ -34,6 +34,7 @@ class LinksMechTest < Test::Unit::TestCase
     assert_equal('', page.links.href('no_alt_text.html').first.text)
     assert_equal('no image', page.links.href('no_image.html').first.text)
     assert_equal('', page.links.href('no_text.html').first.text)
+    assert_equal('', page.links.href('nil_alt_text.html').first.text)
   end
   def test_click_link

data/test/tc_mech.rb CHANGED

@@ -14,7 +14,12 @@ class TestMechMethods < Test::Unit::TestCase
   end
   def test_weird_url
-    @agent.get('http://localhost/?action=bing&bang=boom=1|a=|b=|c=')
+    assert_nothing_raised {
+      @agent.get('http://localhost/?action=bing&bang=boom=1|a=|b=|c=')
+    }
+    assert_nothing_raised {
+      @agent.get('http://localhost/?a=b&#038;b=c&#038;c=d')
+    }
   end
   def test_history
@@ -48,6 +53,14 @@ class TestMechMethods < Test::Unit::TestCase
       @agent.visited?("http://localhost/content_type_test?ct=text/html"))
   end
+  def test_visited_after_redirect
+    @agent.get("http://localhost/response_code?code=302")
+    assert_equal("http://localhost/index.html",
+      @agent.current_page.uri.to_s)
+    assert_equal(true,
+                 @agent.visited?('http://localhost/response_code?code=302'))
+  end
   def test_max_history
     @agent.max_history = 10
     0.upto(10) do |i|

data/test/test_all.rb CHANGED

@@ -13,6 +13,7 @@ require 'tc_form_button'
 require 'tc_form_no_inputname'
 require 'tc_forms'
 require 'tc_gzipping'
+require 'tc_history'
 require 'tc_html_unscape_forms'
 require 'tc_if_modified_since'
 require 'tc_links'

metadata CHANGED

@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
 specification_version: 1
 name: mechanize
 version: !ruby/object:Gem::Version
-  version: 0.6.5
-date: 2007-02-26 00:00:00 -08:00
+  version: 0.6.6
+date: 2007-03-24 00:00:00 -07:00
 summary: Mechanize provides automated web-browsing
 require_paths:
   - lib
@@ -51,7 +51,7 @@ files:
   - lib/mechanize/errors.rb
   - lib/mechanize/form.rb
   - lib/mechanize/form_elements.rb
-  - lib/mechanize/hpricot.rb
+  - lib/mechanize/history.rb
   - lib/mechanize/inspect.rb
   - lib/mechanize/list.rb
   - lib/mechanize/net-overrides/net/http.rb
@@ -118,6 +118,7 @@ files:
   - test/tc_forms.rb
   - test/tc_frames.rb
   - test/tc_gzipping.rb
+  - test/tc_history.rb
   - test/tc_html_unscape_forms.rb
   - test/tc_if_modified_since.rb
   - test/tc_links.rb

data/lib/mechanize/hpricot.rb DELETED

@@ -1,9 +0,0 @@
-# :enddoc:
-require 'hpricot'
-class Hpricot::Elem
-  def all_text
-    text = ''
-    traverse_text { |t| text << t.content }
-    text
-  end
-end