RubyGems - ruby-web - Versions diffs - 1.1.1 - Mend

ruby-web 1.1.1

Files changed (190) hide show

data/ChangeLog +474 -0
data/INSTALL.txt +9 -0
data/InstalledFiles +180 -0
data/LICENSE.txt +74 -0
data/Rakefile +529 -0
data/TODO +65 -0
data/doc/additional.xml +149 -0
data/doc/core.xml +652 -0
data/doc/credits/index.xml +52 -0
data/doc/credits/php.contributors.xml +118 -0
data/doc/credits/php.language-snippets.ent +622 -0
data/doc/install/index.xml +136 -0
data/doc/install/mac/index.xml +21 -0
data/doc/install/ruby-web.install.rb.instructions.xml +7 -0
data/doc/install/unix/index.xml +46 -0
data/doc/install/win/apache1.xml +166 -0
data/doc/install/win/apache2.xml +141 -0
data/doc/install/win/iis.xml +162 -0
data/doc/install/win/index.xml +24 -0
data/doc/install/win/installer.xml +31 -0
data/doc/install/win/manual.xml +43 -0
data/doc/manual.xml +69 -0
data/doc/old/apache_cgi.txt +23 -0
data/doc/old/fastcgi.txt +23 -0
data/doc/old/mod_ruby.txt +21 -0
data/doc/old/snippets.rdoc +183 -0
data/doc/old/webrick.txt +23 -0
data/doc/old/windows_cgi.txt +9 -0
data/doc/tutorial.xml +14 -0
data/doc/xsl/manual-multi.xsl +10 -0
data/doc/xsl/manual-pdf.xsl +6 -0
data/doc/xsl/manual-single.xsl +6 -0
data/doc/xsl/manual.css +22 -0
data/install.rb +1022 -0
data/lib/formatter.rb +314 -0
data/lib/html-parser.rb +429 -0
data/lib/htmlrepair.rb +113 -0
data/lib/htmlsplit.rb +842 -0
data/lib/sgml-parser.rb +332 -0
data/lib/web.rb +68 -0
data/lib/web/assertinclude.rb +129 -0
data/lib/web/config.rb +50 -0
data/lib/web/connection.rb +1070 -0
data/lib/web/convenience.rb +154 -0
data/lib/web/formreader.rb +318 -0
data/lib/web/htmlparser/html-parser.rb +429 -0
data/lib/web/htmlparser/sgml-parser.rb +332 -0
data/lib/web/htmltools/element.rb +296 -0
data/lib/web/htmltools/stparser.rb +276 -0
data/lib/web/htmltools/tags.rb +286 -0
data/lib/web/htmltools/tree.rb +139 -0
data/lib/web/htmltools/xmltree.rb +160 -0
data/lib/web/htmltools/xpath.rb +71 -0
data/lib/web/info.rb +63 -0
data/lib/web/load.rb +210 -0
data/lib/web/mime.rb +87 -0
data/lib/web/phprb.rb +340 -0
data/lib/web/resources/test/cookie.rb +33 -0
data/lib/web/resources/test/counter.rb +20 -0
data/lib/web/resources/test/multipart.rb +14 -0
data/lib/web/resources/test/redirect.rb +8 -0
data/lib/web/resources/test/stock.rb +33 -0
data/lib/web/sapi/apache.rb +129 -0
data/lib/web/sapi/fastcgi.rb +22 -0
data/lib/web/sapi/install/apache.rb +180 -0
data/lib/web/sapi/install/iis.rb +93 -0
data/lib/web/sapi/install/macosx.rb +90 -0
data/lib/web/sapi/webrick.rb +86 -0
data/lib/web/session.rb +83 -0
data/lib/web/shim/cgi.rb +129 -0
data/lib/web/shim/rails.rb +175 -0
data/lib/web/stringio.rb +78 -0
data/lib/web/strscanparser.rb +24 -0
data/lib/web/tagparser.rb +96 -0
data/lib/web/testing.rb +666 -0
data/lib/web/traceoutput.rb +75 -0
data/lib/web/unit.rb +56 -0
data/lib/web/upload.rb +59 -0
data/lib/web/validate.rb +52 -0
data/lib/web/wiki.rb +557 -0
data/lib/web/wiki/linker.rb +72 -0
data/lib/web/wiki/page.rb +201 -0
data/lib/webunit.rb +27 -0
data/lib/webunit/assert.rb +152 -0
data/lib/webunit/converter.rb +154 -0
data/lib/webunit/cookie.rb +118 -0
data/lib/webunit/domwalker.rb +185 -0
data/lib/webunit/exception.rb +14 -0
data/lib/webunit/form.rb +116 -0
data/lib/webunit/frame.rb +37 -0
data/lib/webunit/htmlelem.rb +122 -0
data/lib/webunit/image.rb +26 -0
data/lib/webunit/jscript.rb +31 -0
data/lib/webunit/link.rb +33 -0
data/lib/webunit/params.rb +321 -0
data/lib/webunit/parser.rb +229 -0
data/lib/webunit/response.rb +464 -0
data/lib/webunit/runtest.rb +41 -0
data/lib/webunit/table.rb +148 -0
data/lib/webunit/testcase.rb +45 -0
data/lib/webunit/ui/cui/testrunner.rb +50 -0
data/lib/webunit/utils.rb +68 -0
data/lib/webunit/webunit.rb +28 -0
data/test/dev/action.rb +83 -0
data/test/dev/forms.rb +104 -0
data/test/dev/forms2.rb +104 -0
data/test/dev/parser.rb +17 -0
data/test/dev/scripts/dump.rb +24 -0
data/test/dev/scripts/makedist.rb +62 -0
data/test/dev/scripts/uri.rb +41 -0
data/test/dev/scripts/uri/common.rb +432 -0
data/test/dev/scripts/uri/ftp.rb +149 -0
data/test/dev/scripts/uri/generic.rb +1106 -0
data/test/dev/scripts/uri/http.rb +76 -0
data/test/dev/scripts/uri/https.rb +26 -0
data/test/dev/scripts/uri/ldap.rb +238 -0
data/test/dev/scripts/uri/mailto.rb +260 -0
data/test/dev/scripts/urireg.rb +174 -0
data/test/dev/simpledispatcher.rb +156 -0
data/test/dev/test.action.rb +146 -0
data/test/dev/test.formreader.rb +463 -0
data/test/dev/test.simpledispatcher.rb +186 -0
data/test/dev/webunit/conv/digit-0.rb +21 -0
data/test/dev/webunit/conv/digit-1.rb +17 -0
data/test/dev/webunit/conv/digit.rb +23 -0
data/test/dev/webunit/conv/test_digit-0.rb +16 -0
data/test/dev/webunit/conv/test_digit-1.rb +19 -0
data/test/dev/webunit/conv/test_digit.rb +26 -0
data/test/dev/webunit/conv/test_digit_view-0.rb +76 -0
data/test/dev/webunit/conv/test_digit_view-1.rb +102 -0
data/test/dev/webunit/conv/test_digit_view.rb +134 -0
data/test/installation/htdocs/cgi_test.rb +296 -0
data/test/installation/htdocs/test_install.rb +4 -0
data/test/installation/runwebtest.rb +5 -0
data/test/installation/test_cookie.rb +128 -0
data/test/installation/test_form.rb +47 -0
data/test/installation/test_multipart.rb +51 -0
data/test/installation/test_request.rb +24 -0
data/test/installation/test_response.rb +35 -0
data/test/unit/htdocs/cookie.rb +32 -0
data/test/unit/htdocs/multipart.rb +28 -0
data/test/unit/htdocs/redirect.rb +12 -0
data/test/unit/htdocs/simple.rb +13 -0
data/test/unit/htdocs/stock.rb +33 -0
data/test/unit/test_assert.rb +162 -0
data/test/unit/test_cookie.rb +114 -0
data/test/unit/test_domwalker.rb +77 -0
data/test/unit/test_form.rb +42 -0
data/test/unit/test_frame.rb +40 -0
data/test/unit/test_htmlelem.rb +74 -0
data/test/unit/test_image.rb +45 -0
data/test/unit/test_jscript.rb +57 -0
data/test/unit/test_link.rb +85 -0
data/test/unit/test_multipart.rb +51 -0
data/test/unit/test_params.rb +210 -0
data/test/unit/test_parser.rb +53 -0
data/test/unit/test_response.rb +150 -0
data/test/unit/test_table.rb +70 -0
data/test/unit/test_utils.rb +106 -0
data/test/unit/test_webunit.rb +28 -0
data/test/web/mod_ruby_stub.rb +39 -0
data/test/web/test.assertinclude.rb +109 -0
data/test/web/test.buffer.rb +182 -0
data/test/web/test.code.loader.rb +78 -0
data/test/web/test.config.rb +31 -0
data/test/web/test.error.handling.rb +91 -0
data/test/web/test.formreader-2.0.rb +352 -0
data/test/web/test.load.rb +125 -0
data/test/web/test.mime-type.rb +23 -0
data/test/web/test.narf.cgi.rb +106 -0
data/test/web/test.phprb.rb +239 -0
data/test/web/test.request.rb +368 -0
data/test/web/test.response.rb +637 -0
data/test/web/test.ruby-web.rb +10 -0
data/test/web/test.session.rb +50 -0
data/test/web/test.shim.cgi.rb +96 -0
data/test/web/test.tagparser.rb +65 -0
data/test/web/test.template2.rb +297 -0
data/test/web/test.testing2.rb +318 -0
data/test/web/test.upload.rb +45 -0
data/test/web/test.validate.rb +46 -0
data/test/web/test.web.test.rb +495 -0
data/test/wiki/test.history.rb +297 -0
data/test/wiki/test.illustration_page.rb +287 -0
data/test/wiki/test.linker.rb +197 -0
data/test/wiki/test.tarpit.rb +56 -0
data/test/wiki/test.wiki.rb +300 -0
data/test/wikitestroot/admin.rb +7 -0
data/test/wikitestroot/wiki.rb +6 -0
metadata +234 -0

@@ -0,0 +1,286 @@
+# This encodes the knowledge of HTML 4.0 tags for a parser.
+# It knows about block vs. inline tags, empty tags, and optionally
+# omitted end tags.
+#
+# Copyright:: Copyright(C) 2002 Ned Konz <ned@bike-nomad.com>
+# License::   Ruby's license
+# CVS ID::    $Id: tags.rb,v 1.7 2002/06/04 01:55:59 ned Exp $
+# Error raised by <tt>HTML::Tag.named()</tt> when no tag is registered in
+# the tag lookup table under the requested name.
+class NoSuchHTMLTagError < RuntimeError #:nodoc:
+end
+# This is the base class for all the HTML tag classes.
+module HTML #:nodoc: all
+  class Tag
+    # tag_name:: a String, the name of the tag
+    # can_omit:: a Boolean, true if end tag is optional
+    def initialize(tag_name, can_omit)
+      @name = tag_name.downcase
+      @can_omit_end = can_omit
+    end
+    # Return my tag name.
+    def name; @name; end
+    # Return true if my end tag can be omitted.
+    def can_omit_end_tag; @can_omit_end; end
+    # Return true if I am a block element.
+    def is_block_element; false; end
+    # Return true if I am an inline element.
+    def is_inline_element; false; end
+    # Return true if I am an empty element.
+    def is_empty_element; false; end
+    # Return true if I can contain <tt>tag</tt> if my parent is of type <tt>parent</tt>.
+    # tag:: tag name, a String
+    # parent:: parent tag name, a String.
+    def can_contain(tag, parent); false; end
+    # Return true if whitespace within me can be omitted (ignoring browser
+    # bugs)
+    def can_ignore_whitespace; true; end
+  end
+  # This represents an HTML block element.
+  class BlockTag < Tag
+    def is_block_element; true; end
+    # Blocks can contain anything, so return true.
+    def can_contain(tag, parent); true; end
+  end
+  # This represents an HTML inline element.
+  class InlineTag < Tag
+    def is_inline_element; true; end
+    # Inlines can only contain other inlines.
+    def can_contain(tag, parent)
+      Tag.named(tag).is_inline_element
+    end
+  end
+  # This represents an HTML element that can be regarded as either a block
+  # or an inline element..
+  class BlockOrInlineTag < InlineTag
+    def is_block_element; true; end
+    # If used as inline elements (e.g., within another inline element or a P),
+    # these elements should not contain any block-level elements.
+    def can_contain(tag, parent)
+      return ((parent.downcase == 'p' \
+        or Tag.named(parent).is_inline_element) \
+          and ! Tag.named(tag).is_block_element)
+    end
+  end
+  # This represents an HTML tag that never has an end tag.
+  class EmptyTag < Tag
+    def is_empty_element; true; end
+    def is_inline_element; true; end
+    def can_contain(tag, parent); false; end
+  end
+  # This block initializes the tag lookup table.
+  class Tag
+    @table = Hash.new
+    # Add the given tag to the tag lookup table.
+    #
+    # This can be called by user code to add otherwise unknown tags to the
+    # table.
+    #
+    # name::      the tag name, a String.
+    # is_block::  true if I am a block element.
+    # is_inline:: true if I am an inline element.
+    # is_empty::  true if I am an empty element.
+    # can_omit::  true if my end tag can be omitted.
+    def Tag.add_tag(name, is_block, is_inline, is_empty, can_omit)
+      @table[ name.upcase ] = @table[ name.downcase ] = \
+      if is_empty
+        EmptyTag.new(name, true)
+      elsif is_block
+        if is_inline
+          BlockOrInlineTag.new(name, can_omit)
+        else
+          BlockTag.new(name, can_omit)
+        end
+      else
+        InlineTag.new(name, can_omit)
+      end
+    end
+    # Return an Tag with the given name, or raise a
+    # NoSuchHTMLTagError.
+    def Tag.named(tagname)
+      @table[ tagname ] || raise(NoSuchHTMLTagError.exception(tagname))
+    end
+    # The HTML 4 tag table. Each row is the argument list for add_tag:
+    # name, is_block, is_inline, is_empty, can_omit. A row flagged as empty
+    # always becomes an EmptyTag, whatever its block/inline flags say.
+    #               Block Inline Empty can_omit_end
+    [
+    [ 'A',          false, true, false, false ], # Anchor
+    [ 'ABBR',       false, true, false, false ], # Abbreviation
+    [ 'ACRONYM',    false, true, false, false ], # Acronym
+    [ 'ADDRESS',    true, false, false, false ], # Address
+    [ 'APPLET',     true,  true, false, false ], # Java applet
+    [ 'AREA',       true, false, true, true ], # Image map region
+    [ 'B',          false, true, false, false ], # Bold text
+    [ 'BASE',       false, false, true, true ], # Document base URI
+    [ 'BASEFONT',   false, true, true,  true  ], # Base font change
+    [ 'BDO',        false, true, false, false ], # Bi_di override
+    [ 'BIG',        false, true, false, false ], # Large text
+    [ 'BLOCKQUOTE', true, false, false, false ], # Block quotation
+    [ 'BODY',       true, false, false, false ], # Document body
+    [ 'BR',         false, true,  true, true ], # Line break
+    [ 'BUTTON',     true,  true,  false, false ], # Button
+    [ 'CAPTION',    false, true, false, false ], # Table caption
+    # CENTER is a block-level element (shorthand for DIV align=center).
+    [ 'CENTER',     true, false, false, false ], # Centered block
+    [ 'CITE',       false, true, false, false ], # Citation
+    [ 'CODE',       false, true, false, false ], # Computer code
+    [ 'COL',        false, false, true, true ], # Table column
+    [ 'COLGROUP',   true, false, false, true ], # Table column group
+    [ 'DD',         true, false, false, true ], # Definition description
+    [ 'DEL',        true,  true,  false, false ], # Deleted text
+    [ 'DFN',        false, true, false, false ], # Defined term
+    [ 'DIR',        true, false, false, false ], # Directory list
+    [ 'DIV',        true, false, false, false ], # Generic block-level container
+    [ 'DL',         true, false, false, false ], # Definition list
+    [ 'DT',         false, true, false, true ], # Definition term
+    [ 'EM',         false, true, false, false ], # Emphasis
+    [ 'FIELDSET',   true, false, false, false ], # Form control group
+    [ 'FONT',       false, true, false, false ], # Font change
+    [ 'FORM',       true, false, false, false ], # Interactive form
+    [ 'FRAME',      false, false, true, true ], # Frame
+    [ 'FRAMESET',   true, false, false, false ], # Frameset
+    [ 'H1',         true, false, false, false ], # Level-one heading
+    [ 'H2',         true, false, false, false ], # Level-two heading
+    [ 'H3',         true, false, false, false ], # Level-three heading
+    [ 'H4',         true, false, false, false ], # Level-four heading
+    [ 'H5',         true, false, false, false ], # Level-five heading
+    [ 'H6',         true, false, false, false ], # Level-six heading
+    [ 'HEAD',       true, false, false, false ], # Document head
+    [ 'HR',         false, true, true, true ], # Horizontal rule
+    [ 'HTML',       true, false, false, false ], # HTML document
+    [ 'I',          false, true, false, false ], # Italic text
+    [ 'IFRAME',     true,  true,  false, false ], # Inline frame
+    [ 'IMG',        false, true, true, true ], # Inline image
+    [ 'INPUT',      false, true, true, true ], # Form input
+    [ 'INS',        true,  true, false, false ], # Inserted text
+    [ 'ISINDEX',    false, true, true,  true ], # Input prompt
+    [ 'KBD',        false, true, false, false ], # Text to be input
+    [ 'LABEL',      false, true, false, false ], # Form field label
+    [ 'LEGEND',     false, true, false, false ], # Fieldset caption
+    [ 'LI',         true, false, false, true ], # List item
+    # LINK is an empty element: HTML 4 forbids its end tag.
+    [ 'LINK',       false, false, true, true ], # Document relationship
+    [ 'MAP',        true,  true, false, false ], # Image map
+    [ 'MENU',       true, false, false, false ], # Menu list
+    [ 'META',       false, true,  true, true ], # Metadata
+    [ 'NOFRAMES',   true, false, false, false ], # Frames alternate content
+    [ 'NOSCRIPT',   true, false, false, false ], # Alternate script content
+    [ 'OBJECT',     true,  true,  false, false ], # Object
+    [ 'OL',         true, false, false, false ], # Ordered list
+    [ 'OPTGROUP',   true, false, false, false ], # Option group
+    [ 'OPTION',     true, false, false, false ], # Menu option
+    [ 'P',          true, false, false, true ], # Paragraph
+    [ 'PARAM',      false, true, true,  true ], # Object parameter
+    [ 'PRE',        true, false, false, false ], # Preformatted text
+    [ 'Q',          false, true, false, false ], # Short quotation
+    [ 'S',          false, true, false, false ], # Strike-through text
+    [ 'SAMP',       false, true, false, false ], # Sample output
+    [ 'SCRIPT',     true,  true, false, false ], # Client-side script
+    [ 'SELECT',     true, false, false, false ], # Option selector
+    [ 'SMALL',      false, true, false, false ], # Small text
+    [ 'SPAN',       false, true, false, false ], # Generic inline container
+    [ 'STRIKE',     false, true, false, false ], # Strike-through text
+    [ 'STRONG',     false, true, false, false ], # Strong emphasis
+    [ 'STYLE',      true, false, false, false ], # Embedded style sheet
+    [ 'SUB',        false, true, false, false ], # Subscript
+    [ 'SUP',        false, true, false, false ], # Superscript
+    [ 'TABLE',      true, false, false, false ], # Table
+    [ 'TBODY',      true, false, false, false ], # Table body
+    [ 'TD',         true, false, false, true ], # Table data cell
+    [ 'TEXTAREA',   false, true, false, false ], # Multi-line text input
+    [ 'TFOOT',      true, false, false, true ], # Table foot
+    [ 'TH',         true, false, false, true ], # Table header cell
+    [ 'THEAD',      true, false, false, true ], # Table head
+    [ 'TITLE',      true, false, false, false ], # Document title
+    [ 'TR',         true, false, false, true ], # Table row
+    [ 'TT',         false, true, false, false ], # Teletype text
+    [ 'U',          false, true, false, false ], # Underlined text
+    [ 'UL',         true, false, false, false ], # Unordered list
+    [ 'VAR',        false, true, false, false ], # Variable
+    ].each { |a| add_tag(*a) }
+    # EXCEPTIONS TODO
+    # A, LABEL can't contain itself
+    # several things (fonts, etc) can't be in PRE
+    # SELECT can only have OPTGROUP or OPTION
+    # TEXTAREA, OPTION only contains plain text
+    # APPLET and OBJECT have PARAM+ followed by block and/or inline
+    # BUTTON can't contain:
+    #  A, INPUT, SELECT, TEXTAREA, LABEL, BUTTON, or IFRAME
+    #  nor FORM, ISINDEX, and FIELDSET
+    # IFRAME can only contain block elems if parent can
+    # MAP can contain block+ *xor* AREA+
+    # SCRIPT only contains a SCRIPT (that is, until /<\/[A-Za-z]/)
+    # BODY must be in HTML or NOFRAMES
+    # COL can only be in COLGROUP or TABLE
+    # COLGROUP has only COL*, and can only be in TABLE
+    # DIR, MENU can only contain LI+, none of which may contain block elems
+    # DL must contain (DT|DD)+
+    # DT and DD are only allowed in DL
+    # FIELDSET contains LEGEND, (block|inline)*
+    # FRAMESET contains (FRAMESET|FRAME), plus NOFRAMES and must be in HTML
+    # H# can only be contained in block elems, but only contain inlines.
+    # HEAD must only contain TITLE, BASE?, ISINDEX?, SCRIPT* STYLE* META* LINK*
+    #   OBJECT* HEAD must be in HTML
+    # HTML is top-level and can only contain HEAD, BODY, or HEAD, FRAMESET
+    # LI can contain blocks except when inside DIR or MENU
+    # LI can only be inside OL, UL, DIR, MENU
+    # OL, UL can only contain LI+
+    # OPTGROUP contains OPTION+
+    # P can only contain inlines. However, it is a block-level elem.
+    # PRE can only contain inlines except IMG, OBJECT, APPLET, BIG, SMALL, SUB,
+    #   SUP, FONT, BASEFONT
+    # tags with optional omitted endtags and their allowed contents:
+    # anchor matches at beginning and end
+      {
+          'AREA'      => '(?!AREA)[A-Z]+',
+          'COLGROUP'  => 'COL',
+          'DD'        => '(?!D[DT]$)[A-Z]+',
+          'DT'        => '(?!D[DT]$)[A-Z]+',
+          'MAP'       => 'AREA',
+          'P'         => '(?!P$)[A-Z]+',
+          'TD'        => '(?!T[HDR]$)[A-Z]+',
+          'TFOOT'     => 'TR',
+          'TH'        => '(?!T[HDR]$)[A-Z]+',
+          'THEAD'     => 'TR',
+          'TR'        => 'T[HD]',
+      }.each_pair { |tagname, pattern|
+      eval <<EOM
+      class << named(tagname)   # :nodoc:
+        def can_contain(tag, parent)
+          (/\\A#{pattern}\\z/i =~ tag) == 0
+        end
+      end
+EOM
+    }
+    class << named('TEXTAREA') # :nodoc:
+      def can_ignore_whitespace; false; end
+    end
+    class << named('PRE') # :nodoc:
+      def can_ignore_whitespace; false; end
+    end
+    class << named('OPTION') # :nodoc:
+      def can_ignore_whitespace; false; end
+    end
+  end
+end

data/lib/web/htmltools/tree.rb ADDED

@@ -0,0 +1,139 @@
+#!/usr/bin/ruby
+# This is an HTML parser that builds an element tree for further
+# processing. Attributes and data are also stored.
+#
+# Typical usage is:
+#   parser = HTMLTree::Parser.new(false, false)
+#   parser.parse_file_named('whatever.html')
+#   # then you have the tree built..
+#   parser.tree.dump
+#
+# Copyright:: Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
+# License::   Ruby's
+# CVS ID::    $Id: tree.rb,v 1.14 2002/06/04 01:55:59 ned Exp $
+require 'web/htmltools/tags'
+require 'web/htmltools/stparser'
+require 'web/htmltools/element'
+# This is a tree building HTML parser.
+module HTMLTree #:nodoc: all
+  class Parser < HTML::StackingParser
+    # verbose::  if true, will warn to $stderr on unknown
+    # tags/entities/characters, as well as missing end tags and extra end
+    # tags.
+    # strip_white:: if true, remove all non-essential whitespace. Note
+    # that there are browser bugs that may cause this to change the
+    # appearance of HTML (even though it shouldn't by the standard).
+    def initialize(verbose=false, strip_white=true)
+      super
+      reset
+    end
+    # Reset this parser so that it can parse a new document.
+    def reset
+      super
+      @rootNode = @currentNode = Document.new
+    end
+    # Return the tree that was built. This will be an HTMLTree::Element that
+    # represents the whole document. The \<html> node is a child of this.
+    def tree
+      @rootNode
+    end
+    # Return the <html> node, if any.
+    def html
+      @rootNode.html_node()
+    end
+    # no user-serviceable parts inside...
+    # though you can subclass carefully.
+    private
+    def add_child_to_current(tag, attrs)
+      node = Element.new(@currentNode, tag)
+      attrs.each { |a| node.add_attribute(*a) }
+      node
+    end
+    # callbacks
+    # add a child to the current node and descend
+    def handle_start_tag(tag, attrs)
+      node = add_child_to_current(tag, attrs)
+      @rootNode = node unless @rootNode
+      @currentNode = node
+    end
+    # go up to parent
+    def handle_end_tag(tag)
+      @currentNode = @currentNode.parent
+    end
+    # add a child to the current node
+    def handle_empty_tag(tag, attrs)
+      add_child_to_current(tag, attrs)
+    end
+    # Add a child to the current node and descend
+    # Assume that the unknown tag has an end tag.
+    def handle_unknown_tag(tag, attrs)
+      super
+      handle_start_tag(tag, attrs)
+    end
+    # go up to parent
+    def handle_missing_end_tag(tag)
+      super
+      handle_end_tag(tag)
+    end
+    # ignore
+    def handle_extra_end_tag(tag)
+      super
+    end
+    def handle_cdata(data)
+      node = Data.new(@currentNode, data)
+    end
+    def handle_script(data)
+      node = Data.new(@currentNode, data)
+    end
+    def handle_unknown_character(name)
+      super
+    end
+    def handle_unknown_entity(name)
+      super
+    end
+    def handle_comment(data)
+      super # make sure and strip whitespace.
+      node = Comment.new(@currentNode, data)
+    end
+    def handle_special(data)
+      node = HTMLTree::Special.new(@currentNode, data)
+      $stderr.print('special ', node, ' discarded') unless @currentNode
+    end
+  end
+end
+if $0 == __FILE__
+  $stdout.sync = true
+  class TestStackingParser < HTMLTree::Parser #:nodoc: all
+    $DEBUG = false
+    p = TestStackingParser.new(true, false)
+    p.parse_file_named(ARGV[0] || 'ebay.html')
+    File.open('xx.html', 'w') { |of|
+      p.tree.write(of)
+    }
+    p.tree.dump
+  end
+end

data/lib/web/htmltools/xmltree.rb ADDED

@@ -0,0 +1,160 @@
+#!/usr/bin/ruby
+# This is an HTML parser that builds an element tree for further
+# processing. Attributes and data are also stored.
+# The storage is that of REXML, which is required.
+#
+# Typical usage is:
+#   parser = HTMLTree::XMLParser.new(false, false)
+#   parser.parse_file_named('whatever.html')
+#   # then you have the tree built..
+#   parser.tree # is a REXML::Document
+#
+# Copyright:: Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
+# License::   Ruby's
+# CVS ID::    $Id: tree.rb,v 1.14 2002/06/04 01:55:59 ned Exp $
+require 'web/htmltools/tags'
+require 'web/htmltools/stparser'
+require 'rexml/element'
+require 'rexml/document'
+# REXML::Child
+#   REXML::XMLDecl
+#   REXML::Instruction
+#   REXML::Text
+#   REXML::Comment
+#   REXML::Entity
+#   REXML::Parent
+#     REXML::Element (+REXML::Namespace)
+#       REXML::Document
+#     REXML::DocType
+#
+# This is a tree building HTML parser that makes XML.
+module HTMLTree #:nodoc: all
+  class XMLParser < HTML::StackingParser
+    # verbose::  if true, will warn to $stderr on unknown
+    # tags/entities/characters, as well as missing end tags and extra end
+    # tags.
+    # strip_white:: if true, remove all non-essential whitespace. Note
+    # that there are browser bugs that may cause this to change the
+    # appearance of HTML (even though it shouldn't by the standard).
+    def initialize(verbose=false, strip_white=true)
+      super
+      reset
+    end
+    # Reset this parser so that it can parse a new document.
+    def reset
+      super
+      @rootNode = @currentNode = REXML::Document.new()
+    end
+    # Return the document that was built. This will be an
+    # REXML::Document that represents the whole document. The \<html>
+    # node is a child of this.
+    def document
+      @rootNode
+    end
+    def tree
+      document()
+    end
+    # Return the root of the document, if any.
+    def root
+      @rootNode.root()
+    end
+    # Return the <html> node, if any.
+    def html
+      @rootNode.root.elements['html']
+    end
+    # no user-serviceable parts inside...
+    # though you can subclass carefully.
+    private
+    def add_child_to_current(tag, attrs)
+      node = REXML::Element.new(tag, @currentNode)
+      attrs.each { |a| node.attributes[a[0]] = a[1] }
+      node
+    end
+    # callbacks
+    # add a child to the current node and descend
+    def handle_start_tag(tag, attrs)
+      node = add_child_to_current(tag, attrs)
+      @rootNode = node unless @rootNode
+      @currentNode = node
+    end
+    # go up to parent
+    def handle_end_tag(tag)
+      @currentNode = @currentNode.parent
+    end
+    # add a child to the current node
+    def handle_empty_tag(tag, attrs)
+      add_child_to_current(tag, attrs)
+    end
+    # Add a child to the current node and descend
+    # Assume that the unknown tag has an end tag.
+    def handle_unknown_tag(tag, attrs)
+      super
+      handle_start_tag(tag, attrs)
+    end
+    # go up to parent
+    def handle_missing_end_tag(tag)
+      super
+      handle_end_tag(tag)
+    end
+    # ignore
+    def handle_extra_end_tag(tag)
+      super
+    end
+    def handle_cdata(data)
+      REXML::Text.new(data, !@stripWhitespace, @currentNode)
+    end
+    def handle_script(data)
+      REXML::Comment.new(data, @currentNode)
+    end
+    def handle_unknown_character(name)
+      super # that is, do nothing
+    end
+    def handle_unknown_entity(name)
+      super # that is, do nothing
+    end
+    def handle_comment(data)
+      super # strip white
+      REXML::Comment.new(data, @currentNode)
+    end
+    def handle_special(data)
+      REXML::DocType.new(data, @currentNode)  # TODO
+    end
+  end
+end
+if $0 == __FILE__
+  $stdout.sync = true
+  class TestStackingParser < HTMLTree::XMLParser #:nodoc: all
+    $DEBUG = false
+    p = TestStackingParser.new(true, false)
+    p.parse_file_named(ARGV[0] || 'ebay.html')
+    File.open('xx.html', 'w') { |of|
+      p.document.write(of)
+    }
+  end
+end