RubyGems - mechanize - Versions diffs - 0.4.7 → 0.5.0 - Mend

mechanize 0.4.7 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

data/CHANGELOG +17 -0
data/EXAMPLES +23 -44
data/NOTES +49 -0
data/lib/mechanize.rb +95 -80
data/lib/mechanize/cookie.rb +147 -148
data/lib/mechanize/cookie.rb.rej +16 -0
data/lib/mechanize/errors.rb +29 -0
data/lib/mechanize/form.rb +211 -186
data/lib/mechanize/form_elements.rb +31 -71
data/lib/mechanize/list.rb +34 -0
data/lib/mechanize/mech_version.rb +3 -1
data/lib/mechanize/module.rb +1 -1
data/lib/mechanize/page.rb +162 -180
data/lib/mechanize/page_elements.rb +53 -40
data/lib/mechanize/parsing.rb +11 -3
data/lib/mechanize/pluggable_parsers.rb +147 -0
data/test/data/server.crt +14 -0
data/test/data/server.csr +11 -0
data/test/data/server.key +18 -0
data/test/data/server.pem +15 -0
data/test/htdocs/no_title_test.html +6 -0
data/test/parse.rb +39 -0
data/test/proxy.rb +30 -0
data/test/server.rb +2 -0
data/test/servlets.rb +8 -0
data/test/ssl_server.rb +49 -0
data/test/tc_authenticate.rb +8 -6
data/test/tc_cookie_class.rb +28 -18
data/test/tc_cookie_jar.rb +88 -27
data/test/tc_cookies.rb +41 -44
data/test/tc_errors.rb +9 -23
data/test/tc_forms.rb +36 -32
data/test/tc_frames.rb +6 -4
data/test/tc_links.rb +7 -6
data/test/tc_mech.rb +43 -46
data/test/tc_page.rb +24 -0
data/test/tc_pluggable_parser.rb +103 -0
data/test/tc_post_form.rb +41 -0
data/test/tc_proxy.rb +25 -0
data/test/tc_response_code.rb +13 -10
data/test/tc_save_file.rb +25 -0
data/test/tc_ssl_server.rb +27 -0
data/test/tc_upload.rb +8 -6
data/test/tc_watches.rb +5 -2
data/test/test_includes.rb +3 -3
data/test/ts_mech.rb +11 -2
metadata +100 -86
data/test/tc_filter.rb +0 -34

data/CHANGELOG CHANGED Viewed

@@ -1,3 +1,20 @@
+= Mechanize CHANGELOG
+== 0.5.0
+* Added pluggable parsers. (Thanks to Eric Kolve for the idea)
+* Changed namespace so all classes are under WWW::Mechanize.
+* Updating Forms so that fields can be used as accessors (Thanks Gregory Brown)
+* Added WWW::Mechanize::File as default object used for unknown content types.
+* Added 'save_as' method to Mechanize::File, so any page can be saved.
+* Adding 'save_as' and 'load' to CookieJar so that cookies can be saved
+  between sessions.
+* Added WWW::Mechanize::FileSaver pluggable parser to automatically save files.
+* Added WWW::Mechanize::Page#title for page titles
+* Added OpenSSL certificate support (Thanks Mike Dalessio)
+* Removed support for body filters in favor of pluggable parsers.
+* Fixed cookie bug adding a '/' when the url is missing one (Thanks Nick Dainty)
 == 0.4.7
 * Fixed bug with no action in forms.  Thanks to Adam Wiggins

data/EXAMPLES CHANGED Viewed

@@ -3,26 +3,25 @@
 == Google
   require 'rubygems'
   require 'mechanize'
-  require 'logger'
-  agent = WWW::Mechanize.new { |a| a.log = Logger.new("mech.log") }
+  agent = WWW::Mechanize.new
   agent.user_agent_alias = 'Mac Safari'
   page = agent.get("http://www.google.com/")
   search_form = page.forms.with.name("f").first
-  search_form.fields.name("q").first.value = "Hello"
+  search_form.q = "Hello"
   search_results = agent.submit(search_form)
   puts search_results.body
 == Rubyforge
   require 'mechanize'
-  agent = WWW::Mechanize.new {|a| a.log = Logger.new(STDERR) }
+  agent = WWW::Mechanize.new
   page = agent.get('http://rubyforge.org/')
   link = page.links.text(/Log In/).first
   page = agent.click(link)
   form = page.forms[1]
-  form.fields.find {|f| f.name == 'form_loginname'}.value = ARGV[0]
-  form.fields.find {|f| f.name == 'form_pw'}.value = ARGV[1]
+  form.form_loginname = ARGV[0]
+  form.form_pw = ARGV[1]
   page = agent.submit(form, form.buttons.first)
   puts page.body
@@ -36,8 +35,8 @@ This example uploads one image as two different images to flickr.
   agent = WWW::Mechanize.new
   page = agent.get('http://flickr.com/signin/flickr/')
   form = page.forms.first
-  form.fields.name('email').first.value = ARGV[0]
-  form.fields.name('password').first.value = ARGV[1]
+  form.email = ARGV[0]
+  form.password = ARGV[1]
   page = agent.submit(form)
   page = agent.click page.links.text('Upload').first
   form = page.forms.first
@@ -53,48 +52,28 @@ This example uploads one image as two different images to flickr.
   agent.submit(form)
-== Page Body Filter
-This example shows how to preprocess a body before mechanize parses it.  The
-body filter sends the page body to the code block, and parses what the code
-block returns.  The filter on WWW::Page#body_filter is a "per-page" filter,
-meaning that it is only applied to one page object.
+== Pluggable Parsers
+Let's say you want HTML pages to automatically be parsed with Rubyful Soup.
+This example shows you how:
   require 'rubygems'
   require 'mechanize'
-  agent = WWW::Mechanize.new
-  page = agent.get('http://google.com/')
-  page.body_filter = lambda { |body|
-      body.gsub(/google/i, "Net::DAAP::Client")
-  }
-  puts page.body
-  page = agent.get('http://google.com/')
-  puts page.body
+  require 'rubyful_soup'
-== Global Body Filter
-The body filter can be set on the WWW::Mechanize object for use as a global
-filter.  The filter set will be applied to every page that is requested.  The
-following example shows the global filter being used, then being set back to
-the original filter.
+  class SoupParser < WWW::Mechanize::Page
+    attr_reader :soup
+    def initialize(uri = nil, response = nil, body = nil, code = nil)
+      @soup = BeautifulSoup.new(body)
+      super(uri, response, body, code)
+    end
+  end
-  require 'rubygems'
-  require 'mechanize'
   agent = WWW::Mechanize.new
-  old_filter = agent.body_filter
-  agent.body_filter = lambda { |body|
-    body.gsub(/(<a[^>]*>)[^<]*(<\/a[^>]*>)/i, "#{$1}Net::DAAP::Client#{$2}")
-  }
-  page = agent.get('http://google.com/')
-  page.links.each { |l| puts l.text }
-  agent.body_filter = old_filter
-  page = agent.get('http://google.com/')
-  page.links.each { |l| puts l.text }
+  agent.pluggable_parser.html = SoupParser
+Now all HTML pages will be parsed with the SoupParser class, and each page
+will automatically have a method called 'soup' that gives you access to the
+Beautiful Soup object for that page.
 == Using a proxy

data/NOTES CHANGED Viewed

@@ -1,5 +1,54 @@
 = Mechanize Release Notes
+== 0.5.0
+Good News first:
+This release has many new great features!  Mechanize has been updated to
+handle any content type a web server returns using a system called "Pluggable
+Parsers".  Mechanize has always been able to handle any content type
+(sort of), but the pluggable parser system lets us cleanly handle any
+content type by instantiating a class for the content type returned from the
+server.  For example, if a web server returns type 'text/html', mechanize asks
+the pluggable parser for a class to instantiate for 'text/html'.  Mechanize
+then instantiates that class and returns it.  Users can define their own
+parsers, and register them with the Pluggable Parser so that mechanize will
+instantiate your class when the content type you specify is returned.  This
+allows you to easily preprocess your HTML, or even use other HTML parsers.
+Content types that the pluggable parser doesn't know how to handle will
+return WWW::Mechanize::File which has basic functionality like a 'save_as'
+method.  For more information, see the RDoc for
+WWW::Mechanize::PluggableParser; also see the EXAMPLES file.
+A 'save_as' method has been added so that any page downloaded can be easily
+saved to a file.
+The cookie jar for mechanize can now be saved to disk and loaded back up at
+another time.  If your script needs to save cookie state between executions,
+you can now use the 'save_as' and 'load' methods on WWW::Mechanize::CookieJar.
+Form fields can now be treated as accessors.  This means that if you have a
+form with the fields 'username' and 'password', you could manipulate them like
+this:
+  form.username = 'test'
+  form.password = 'testing'
+  puts "username: #{form.username}"
+  puts "password: #{form.password}"
+Form fields can still be accessed in the usual way in case there are multiple
+input fields with the same name.
+Bad news second:
+In this release, the namespace has been altered to be more consistent.  Many
+classes used to be under WWW directly; they are now all under WWW::Mechanize.
+For example, in 0.4.7 Page was WWW::Page, in this release it is now
+WWW::Mechanize::Page.  This may break your code, but if you aren't using
+class names directly, everything should be fine.
+Body filters have been removed in favor of Pluggable Parsers.
 == 0.4.7
 This release of mechanize comes with a few bug fixes including fixing a

data/lib/mechanize.rb CHANGED Viewed

@@ -17,27 +17,20 @@ require 'net/https'
 require 'uri'
 require 'logger'
 require 'webrick'
-require 'date'
 require 'web/htmltools/xmltree'   # narf
 require 'mechanize/module'
-require 'mechanize/list'
-require 'mechanize/parsing'
+require 'mechanize/mech_version'
 require 'mechanize/cookie'
+require 'mechanize/errors'
+require 'mechanize/pluggable_parsers'
 require 'mechanize/form'
 require 'mechanize/form_elements'
+require 'mechanize/list'
 require 'mechanize/page'
 require 'mechanize/page_elements'
+require 'mechanize/parsing'
 module WWW
-  require 'mechanize/mech_version.rb'
-class ResponseCodeError < RuntimeError
-  attr_reader :response_code
-  def initialize(response_code)
-    @response_code = response_code
-  end
-end
 # = Synopsis
 # The Mechanize library is used for automating interaction with a website.  It
@@ -57,7 +50,6 @@ end
 #  search_results = agent.submit(search_form)
 #  puts search_results.body
 class Mechanize
   AGENT_ALIASES = {
     'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
     'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6',
@@ -66,34 +58,51 @@ class Mechanize
     'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401',
     'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
     'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)',
-    'Mechanize' => "WWW-Mechanize/#{WWW::MechVersion} (http://rubyforge.org/projects/mechanize/)"
+    'Mechanize' => "WWW-Mechanize/#{Version} (http://rubyforge.org/projects/mechanize/)"
   }
-  attr_accessor :log
-  attr_accessor :user_agent
   attr_accessor :cookie_jar
+  attr_accessor :log
+  attr_accessor :max_history
   attr_accessor :open_timeout, :read_timeout
+  attr_accessor :user_agent
   attr_accessor :watch_for_set
-  attr_accessor :max_history
   attr_accessor :ca_file
-  attr_accessor :body_filter
+  attr_accessor :key
+  attr_accessor :cert
+  attr_accessor :pass
   attr_reader :history
+  attr_reader :pluggable_parser
   def initialize
-    @history        = []
-    @user_agent     = AGENT_ALIASES['Mechanize']
-    @user           = nil
+    # attr_accessors
+    @cookie_jar = CookieJar.new
+    @log = Logger.new(nil)
+    @max_history    = nil
     @open_timeout   = nil
     @read_timeout   = nil
+    @user_agent     = AGENT_ALIASES['Mechanize']
     @watch_for_set  = nil
-    @max_history    = nil
-    @body_filter    = lambda { |body| body }
-    @cookie_jar = CookieJar.new
-    @log = Logger.new(nil)
+    @ca_file        = nil
+    @cert           = nil # OpenSSL Certificate
+    @key            = nil # OpenSSL Private Key
+    @pass           = nil # OpenSSL Password
+    # attr_readers
+    @history        = []
+    @pluggable_parser = PluggableParser.new
+    # Basic Auth variables
+    @user           = nil # Basic Auth User
+    @password       = nil # Basic Auth Password
+    # Proxy settings
     @proxy_addr     = nil
+    @proxy_pass     = nil
     @proxy_port     = nil
     @proxy_user     = nil
-    @proxy_pass     = nil
     yield self if block_given?
   end
@@ -110,13 +119,7 @@ class Mechanize
   # Returns a list of cookies stored in the cookie jar.
   def cookies
-    cookies = []
-    @cookie_jar.jar.each_key do |domain|
-      @cookie_jar.jar[domain].each_key do |name|
-        cookies << @cookie_jar.jar[domain][name]
-      end
-    end
-    cookies
+    @cookie_jar.to_a
   end
   # Sets the user and password to be used for basic authentication.
@@ -125,12 +128,7 @@ class Mechanize
     @password = password
   end
-  def basic_authetication(user, password)
-    $stderr.puts "This method will be deprecated, please change to 'basic_auth'"
-    basic_auth(user, password)
-  end
-  # Fetches the URL passed in.
+  # Fetches the URL passed in and returns a page.
   def get(url)
     cur_page = current_page() || Page.new
@@ -140,31 +138,14 @@ class Mechanize
     page
   end
-  # Fetch a file and return the contents
+  # Fetch a file and return the contents of the file.
   def get_file(url)
     get(url).body
   end
-  # Posts to the given URL wht the query parameters passed in.
-  def post(url, query={})
-    cur_page = current_page() || Page.new
-    request_data = [WWW::Mechanize.build_query_string(query)]
-    # this is called before the request is sent
-    pre_request_hook = proc {|request|
-      log.debug("query: #{ query.inspect }")
-      request.add_field('Content-Type', 'application/x-www-form-urlencoded')
-      request.add_field('Content-Length', request_data[0].size.to_s)
-    }
-    # fetch the page
-    page = fetch_page(to_absolute_uri(url, cur_page), :post, cur_page, pre_request_hook, request_data)
-    add_to_history(page)
-    page
-  end
-  # Clicks the WWW::Link object passed in.
+  # Clicks the WWW::Mechanize::Link object passed in and returns the
+  # page fetched.
   def click(link)
     uri = to_absolute_uri(link.href)
     get(uri)
@@ -176,23 +157,48 @@ class Mechanize
     @history.pop
   end
+  # Posts to the given URL with the query parameters passed in.  Query
+  # parameters can be passed as a hash, or as an array of arrays.
+  # Example:
+  #  agent.post('http://example.com/', "foo" => "bar")
+  # or
+  #  agent.post('http://example.com/', [ ["foo", "bar"] ])
+  def post(url, query={})
+    cur_page = current_page() || Page.new
+    node = REXML::Element.new
+    node.add_attribute('method', 'POST')
+    node.add_attribute('enctype', 'application/x-www-form-urlencoded')
+    form = Form.new(node)
+    query.each { |k,v|
+      form.fields << Field.new(k,v)
+    }
+    post_form(url, form)
+  end
+  # Submit a form with an optional button.
+  # Without a button:
+  #  page = agent.get('http://example.com')
+  #  agent.submit(page.forms.first)
+  # With a button
+  #  agent.submit(page.forms.first, page.forms.first.buttons.first)
   def submit(form, button=nil)
     form.add_button_to_query(button) if button
-    query = form.build_query
     uri = to_absolute_uri(form.action)
     case form.method.upcase
     when 'POST'
       post_form(uri, form)
     when 'GET'
       if uri.query.nil?
-        uri.query = WWW::Mechanize.build_query_string(query)
+        uri.query = WWW::Mechanize.build_query_string(form.build_query)
       else
-        uri.query = uri.query + "&" + WWW::Mechanize.build_query_string(query)
+        uri.query = uri.query + "&" +
+          WWW::Mechanize.build_query_string(form.build_query)
       end
       get(uri)
     else
-      raise 'unsupported method'
+      raise "unsupported method: #{form.method.upcase}"
     end
   end
@@ -203,14 +209,14 @@ class Mechanize
   # Returns whether or not a url has been visited
   def visited?(url)
-    if url.is_a?(WWW::Link)
+    if url.is_a?(Link)
       url = url.uri
     end
     uri = to_absolute_uri(url)
     ! @history.find { |h| h.uri.to_s == uri.to_s }.nil?
   end
-  alias page current_page
+  alias :page :current_page
   private
@@ -269,10 +275,15 @@ class Mechanize
     if uri.scheme == 'https'
       http.use_ssl = true
+      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
       if @ca_file
         http.ca_file = @ca_file
         http.verify_mode = OpenSSL::SSL::VERIFY_PEER
       end
+      if @cert && @key
+        http.cert = OpenSSL::X509::Certificate.new(::File.read(@cert))
+        http.key  = OpenSSL::PKey::RSA.new(::File.read(@key), @pass)
+      end
     end
@@ -334,15 +345,28 @@ class Mechanize
           log.debug("header: #{ k } : #{ v }")
         }
-        page.response = response
-        page.code = response.code
         response.read_body
-        page.body = body_filter.call(response.body)
+        content_type = nil
+        unless response['Content-Type'].nil?
+          data = response['Content-Type'].match(/^([^;]*)/)
+          content_type = data[1].downcase unless data.nil?
+        end
+        # Find our pluggable parser
+        page = @pluggable_parser.parser(content_type).new(
+          uri,
+          response,
+          response.body,
+          response.code
+        )
         log.info("status: #{ page.code }")
-        page.watch_for_set = @watch_for_set
+        if page.respond_to? :watch_for_set
+          page.watch_for_set = @watch_for_set
+        end
         case page.code
         when "200"
@@ -375,15 +399,6 @@ class Mechanize
       @history = @history[@history.size - @max_history, @max_history]
     end
   end
-  class ContentTypeError < RuntimeError
-    attr_reader :content_type
-    def initialize(content_type)
-      @content_type = content_type
-    end
-  end
 end
 end # module WWW