RubyGems - htmltools - Versions diffs - 1.10 - Mend

htmltools 1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

data/lib/html/stparser.rb ADDED

@@ -0,0 +1,280 @@
+# This is an SGMLParser subclass that knows about HTML 4.0 rules
+# and can spot empty tags and deal with tags that may have omitted endtags.
+#
+# Copyright::   Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>,
+#               Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
+# License::   Ruby's License
+# CVS ID::    $Id: stparser.rb,v 1.4 2004/09/24 23:28:55 jhannes Exp $
+require 'html/sgml-parser'
+require 'html/tags'
+module HTML
+  class StackingParser < SGMLParser
+    # accessors
+    def stack; @tagStack; end
+    def last_tag; @tagStack[-1] || 'html'; end
+    def parent_tag; @tagStack[-2] || 'html'; end
+    def strip_whitespace=(flag); @stripWhitespace = flag; end
+    # input methods
+    # Open and parse the given file.
+    def parse_file_named(name)
+      File.open(name) { |f|
+        while bytes = f.read(65536)
+          feed(bytes)
+        end
+      }
+    end
+    # Feed some more data to the parser.
+    def feed(string)
+      super
+      while @saved.size > 0
+        saved = @saved
+        @saved = ''
+        super(saved)
+      end
+    end
+    # available only to subclasses
+    private
+    if $DEBUG
+      def dprint(*stuff)
+        print(("  " * @tagStack.size), stuff) if @verbose
+      end
+    else
+      def dprint(*stuff); end
+    end
+    def warn(msg)
+      $stderr.print(msg) if @verbose
+    end
+    def initialize(verbose=false, strip_white=false)
+      super(verbose)
+      @tagStack = []
+      @saved = ''
+      @stripWhitespace = strip_white
+    end
+    # handle_data will call this.
+    def skip_script(data)
+      # is the end of the script in this buffer?
+      if m = data.index(%r{</[A-Za-z]})
+        @nomoretags = false
+        @saved = data[m..-1]
+        handle_script(data[0,m]) # call user handler
+      else
+        handle_script(data)
+      end
+    end
+    # Unfortunately, sgml-parser calls this and there's important work to do in
+    # it. So the user handler has to be named something different.
+    def handle_data(data)
+      # need to handle scripts
+      if last_tag() == 'script' && @nomoretags
+        skip_script(data)
+      else
+        if @stripWhitespace
+          begin
+            data.strip! if HTML::Tag.named(last_tag()).can_ignore_whitespace
+          rescue NoSuchHTMLTagError
+            data.strip!
+          end
+        end
+        handle_cdata(data)  if data.size > 0 # call user handler
+      end
+    end
+    def finish_starttag(tag, attrs)
+      dprint "*START* #{tag} #{attrs.inspect}\n"
+      # dprint "-START- #{tag}\n"
+      begin
+        unless HTML::Tag.named(last_tag()).can_contain(tag, parent_tag())
+          dprint "-INSERT-\n"
+          finish_endtag(last_tag())
+        end
+      rescue NoSuchHTMLTagError
+        # hmm.. last_tag was unknown.
+        # Assume it doesn't have an optional endtag.
+      end
+      push(tag)
+      begin
+        if HTML::Tag.named(tag).is_empty_element
+          dprint "-EMPTY-\n"
+          handle_empty_tag(tag, attrs)  # call user handler
+          drop_to_tag(tag)
+        else
+          handle_start_tag(tag, attrs)  # call user handler
+        end
+        if tag.downcase == 'script'
+          @nomoretags = true
+        end
+      rescue NoSuchHTMLTagError
+        # hmm... the start tag is unknown.
+        # And we pushed it.
+        # If it's empty, we'll get rid of it at the next end tag.
+        handle_unknown_tag(tag, attrs)
+      end
+    end
+    # return true if tag is not extra
+    def drop_to_tag(tag)
+      dropped = @tagStack.size - (@tagStack.rindex(tag.downcase) || @tagStack.size)
+      if dropped == 0   # got an end tag but we haven't seen start tag?
+        handle_extra_end_tag(tag)  # call user handler
+        return false
+      end
+      dropped.times do
+        begin
+          # detect missing end tag
+          if last_tag != tag and ! HTML::Tag.named(last_tag).can_omit_end_tag
+            handle_missing_end_tag(last_tag)  # call user handler
+          elsif last_tag != tag
+            handle_end_tag(last_tag)
+          end
+        rescue NoSuchHTMLTagError
+          # oops, don't recognize last_tag.
+        end
+        pop
+      end
+      return true
+    end
+    def finish_endtag(tag)
+      dprint "*END* #{tag}\n"
+      if drop_to_tag(tag)
+        dprint "-END- #{tag} #{@tagStack.inspect}\n"
+        handle_end_tag(tag) # call user handler
+      end
+    end
+    def push(tag)
+      @tagStack.push(tag.downcase)
+      dprint "*PUSH* #{tag} => #{@tagStack.inspect}\n"
+    end
+    def pop
+      tag = @tagStack.pop
+      dprint "*POP*  #{tag} => #{@tagStack.inspect}\n"
+      tag
+    end
+    # Forward an unknown character reference to the user handler.
+    def unknown_charref(name); handle_unknown_character(name); end
+    # Forward an unknown entity reference to the user handler.
+    def unknown_entityref(name); handle_unknown_entity(name); end
+    # callbacks: can be overridden in subclasses
+    def handle_start_tag(tag, attrs)
+    end
+    def handle_end_tag(tag)
+    end
+    # by default, an empty tag is handled as a start tag
+    # with an inserted end tag.
+    def handle_empty_tag(tag, attrs)
+      handle_start_tag(tag, attrs)
+      handle_end_tag(tag)
+    end
+    def handle_unknown_tag(tag, attrs)
+      warn("warning: unknown tag #{tag}\n")
+    end
+    def handle_missing_end_tag(tag)
+      warn("warning: missing end tag </#{tag}>\n")
+    end
+    def handle_extra_end_tag(tag)
+      warn("warning: extra end tag </#{tag}>\n")
+    end
+    def handle_cdata(data)
+    end
+    def handle_script(data)
+    end
+    def handle_unknown_character(name)
+    end
+    def handle_unknown_entity(name)
+    end
+    # call super if you want the data stripped
+    def handle_comment(data)
+      data.strip! if @stripWhitespace
+    end
+    def handle_special(data)
+    end
+  end
+end
+# test script
+if $0 == __FILE__
+  $stdout.sync = true
+  class TestStackingParser < HTML::StackingParser
+    def dump_stack
+      stack.each { |ea| print ea, '/' }
+    end
+    def handle_start_tag(tag, attrs)
+      print("START: #{tag} #{attrs.inspect}\n")
+    end
+    def handle_end_tag(tag)
+      # print("END: #{tag}\n")
+    end
+    def handle_empty_tag(tag, attrs)
+      # print("EMPTY: #{tag} #{attrs.inspect}\n")
+    end
+    def handle_cdata(data)
+      # print("DATA: #{data.size} chars\n")
+      if last_tag() != 'style'
+        str = data.strip
+        if str.size > 0
+          dump_stack
+          print(str.inspect, "\n")
+        end
+      end
+    end
+    def handle_script(data)
+      # print("SCRIPT: #{data.size} chars\n")
+    end
+    def handle_unknown_character(name)
+      print("UNKC: #{name}\n")
+    end
+    def handle_unknown_entity(name)
+      print("UNKE: #{name}\n")
+    end
+    def handle_comment(data)
+      super
+      print("COMMENT: #{data}\n")
+    end
+    def handle_special(data)
+      print("SPECIAL: #{data}\n")
+    end
+  end
+  $DEBUG = false
+  p = TestStackingParser.new(true)
+  p.parse_file_named(ARGV[0] || 'ebay.html')
+end

data/lib/html/tags.rb ADDED

@@ -0,0 +1,288 @@
+# This encodes the knowledge of HTML 4.0 tags for a parser.
+# It knows about block vs. inline tags, empty tags, and optionally
+# omitted end tags.
+#
+# Copyright::   Copyright (C) 2003, Johannes Brodwall <johannes@brodwall.com>,
+#               Copyright (C) 2002, Ned Konz <ned@bike-nomad.com>
+# License::   Ruby's license
+# CVS ID::    $Id: tags.rb,v 1.4 2004/09/24 23:28:55 jhannes Exp $
+# This is an error raised by <tt>HTML::Tag.named()</tt> when a tag doesn't exist.
+class NoSuchHTMLTagError < RuntimeError; end
+# This is the base class for all the HTML tag classes.
+module HTML
+  class Tag
+    # tag_name:: a String, the name of the tag
+    # can_omit:: a Boolean, true if end tag is optional
+    def initialize(tag_name, can_omit)
+      @name = tag_name.downcase
+      @can_omit_end = can_omit
+    end
+    # Return my tag name.
+    def name; @name; end
+    # Return true if my end tag can be omitted.
+    def can_omit_end_tag; @can_omit_end; end
+    # Return true if I am a block element.
+    def is_block_element; false; end
+    # Return true if I am an inline element.
+    def is_inline_element; false; end
+    # Return true if I am an empty element.
+    def is_empty_element; false; end
+    # Return true if I can contain <tt>tag</tt> if my parent is of type <tt>parent</tt>.
+    # tag:: tag name, a String
+    # parent:: parent tag name, a String.
+    def can_contain(tag, parent); false; end
+    # Return true if whitespace within me can be omitted (ignoring browser
+    # bugs)
+    def can_ignore_whitespace; true; end
+  end
+  # This represents an HTML block element.
+  class BlockTag < Tag
+    def is_block_element; true; end
+    # Blocks can contain anything, so return true.
+    def can_contain(tag, parent); true; end
+  end
+  # This represents an HTML inline element.
+  class InlineTag < Tag
+    def is_inline_element; true; end
+    # Inlines can only contain other inlines.
+    def can_contain(tag, parent)
+      Tag.named(tag).is_inline_element
+    end
+  end
+  # This represents an HTML element that can be regarded as either a block
+  # or an inline element.
+  class BlockOrInlineTag < InlineTag
+    def is_block_element; true; end
+    # If used as inline elements (e.g., within another inline element or a P),
+    # these elements should not contain any block-level elements.
+    def can_contain(tag, parent)
+      return ((parent.downcase == 'p' \
+        or Tag.named(parent).is_inline_element) \
+          and ! Tag.named(tag).is_block_element)
+    end
+  end
+  # This represents an HTML tag that never has an end tag.
+  class EmptyTag < Tag
+    def is_empty_element; true; end
+    def is_inline_element; true; end
+    def can_contain(tag, parent); false; end
+  end
+  # This block initializes the tag lookup table.
+  class Tag
+    @table = Hash.new
+    # Add the given tag to the tag lookup table.
+    #
+    # This can be called by user code to add otherwise unknown tags to the
+    # table.
+    #
+    # name::      the tag name, a String.
+    # is_block::  true if I am a block element.
+    # is_inline:: true if I am an inline element.
+    # is_empty::  true if I am an empty element.
+    # can_omit::  true if my end tag can be omitted.
+    def Tag.add_tag(name, is_block, is_inline, is_empty, can_omit)
+      @table[ name.upcase ] = @table[ name.downcase ] = \
+      if is_empty
+        EmptyTag.new(name, true)
+      elsif is_block
+        if is_inline
+          BlockOrInlineTag.new(name, can_omit)
+        else
+          BlockTag.new(name, can_omit)
+        end
+      else
+        InlineTag.new(name, can_omit)
+      end
+    end
+    # Return a Tag with the given name, or raise a
+    # NoSuchHTMLTagError.
+    def Tag.named(tagname)
+      @table[ tagname ] || raise(NoSuchHTMLTagError.exception(tagname))
+    end
+    #               Block Inline Empty can_omit_end
+    [
+    [ 'A',          false, true, false, false ], # Anchor
+    [ 'ABBR',       false, true, false, false ], # Abbreviation
+    [ 'ACRONYM',    false, true, false, false ], # Acronym
+    [ 'ADDRESS',    true, false, false, false ], # Address
+    [ 'APPLET',     true,  true, false, false ], # Java applet
+    [ 'AREA',       true, false, true, true ], # Image map region
+    [ 'B',          false, true, false, false ], # Bold text
+    [ 'BASE',       false, false, true, true ], # Document base URI
+    [ 'BASEFONT',   false, true, true,  true  ], # Base font change
+    [ 'BDO',        false, true, false, false ], # Bi_di override
+    [ 'BIG',        false, true, false, false ], # Large text
+    [ 'BLOCKQUOTE', true, false, false, false ], # Block quotation
+    [ 'BODY',       true, false, false, false ], # Document body
+    [ 'BR',         false, true,  true, true ], # Line break
+    [ 'BUTTON',     true,  true,  false, false ], # Button
+    [ 'CAPTION',    false, true, false, false ], # Table caption
+    [ 'CENTER',     false, true, false, false ], # Centered block
+    [ 'CITE',       false, true, false, false ], # Citation
+    [ 'CODE',       false, true, false, false ], # Computer code
+    [ 'COL',        false, false, true, true ], # Table column
+    [ 'COLGROUP',   true, false, false, true ], # Table column group
+    [ 'DD',         true, false, false, true ], # Definition description
+    [ 'DEL',        true,  true,  false, false ], # Deleted text
+    [ 'DFN',        false, true, false, false ], # Defined term
+    [ 'DIR',        true, false, false, false ], # Directory list
+    [ 'DIV',        true, false, false, false ], # Generic block-level container
+    [ 'DL',         true, false, false, false ], # Definition list
+    [ 'DT',         false, true, false, true ], # Definition term
+    [ 'EM',         false, true, false, false ], # Emphasis
+    [ 'FIELDSET',   true, false, false, false ], # Form control group
+    [ 'FONT',       false, true, false, false ], # Font change
+    [ 'FORM',       true, false, false, false ], # Interactive form
+    [ 'FRAME',      false, false, true, true ], # Frame
+    [ 'FRAMESET',   true, false, false, false ], # Frameset
+    [ 'H1',         true, false, false, false ], # Level-one heading
+    [ 'H2',         true, false, false, false ], # Level-two heading
+    [ 'H3',         true, false, false, false ], # Level-three heading
+    [ 'H4',         true, false, false, false ], # Level-four heading
+    [ 'H5',         true, false, false, false ], # Level-five heading
+    [ 'H6',         true, false, false, false ], # Level-six heading
+    [ 'HEAD',       true, false, false, false ], # Document head
+    [ 'HR',         false, true, true, true ], # Horizontal rule
+    [ 'HTML',       true, false, false, false ], # HTML document
+    [ 'I',          false, true, false, false ], # Italic text
+    [ 'IFRAME',     true,  true,  false, false ], # Inline frame
+    [ 'IMG',        false, true, true, true ], # Inline image
+    [ 'INPUT',      false, true, true, true ], # Form input
+    [ 'INS',        true,  true, false, false ], # Inserted text
+    [ 'ISINDEX',    false, true, true,  true ], # Input prompt
+    [ 'KBD',        false, true, false, false ], # Text to be input
+    [ 'LABEL',      false, true, false, false ], # Form field label
+    [ 'LEGEND',     false, true, false, false ], # Fieldset caption
+    [ 'LI',         true, false, false, true ], # List item
+    # LINK is an EMPTY element in HTML 4.01 (like BASE and META); as a
+    # non-empty block it would swallow everything that follows it.
+    [ 'LINK',       false, false, true, true ], # Document relationship
+    [ 'MAP',        true,  true, false, false ], # Image map
+    [ 'MENU',       true, false, false, false ], # Menu list
+    [ 'META',       false, true,  true, true ], # Metadata
+    [ 'NOFRAMES',   true, false, false, false ], # Frames alternate content
+    [ 'NOSCRIPT',   true, false, false, false ], # Alternate script content
+    [ 'OBJECT',     true,  true,  false, false ], # Object
+    [ 'OL',         true, false, false, false ], # Ordered list
+    [ 'OPTGROUP',   true, false, false, false ], # Option group
+    [ 'OPTION',     true, false, false, false ], # Menu option
+    [ 'P',          true, false, false, true ], # Paragraph
+    [ 'PARAM',      false, true, true,  true ], # Object parameter
+    [ 'PRE',        true, false, false, false ], # Preformatted text
+    [ 'Q',          false, true, false, false ], # Short quotation
+    [ 'S',          false, true, false, false ], # Strike-through text
+    [ 'SAMP',       false, true, false, false ], # Sample output
+    [ 'SCRIPT',     true,  true, false, false ], # Client-side script
+    [ 'SELECT',     true, false, false, false ], # Option selector
+    [ 'SMALL',      false, true, false, false ], # Small text
+    [ 'SPAN',       false, true, false, false ], # Generic inline container
+    [ 'STRIKE',     false, true, false, false ], # Strike-through text
+    [ 'STRONG',     false, true, false, false ], # Strong emphasis
+    [ 'STYLE',      true, false, false, false ], # Embedded style sheet
+    [ 'SUB',        false, true, false, false ], # Subscript
+    [ 'SUP',        false, true, false, false ], # Superscript
+    [ 'TABLE',      true, false, false, false ], # Table
+    [ 'TBODY',      true, false, false, false ], # Table body
+    [ 'TD',         true, false, false, true ], # Table data cell
+    [ 'TEXTAREA',   false, true, false, false ], # Multi-line text input
+    [ 'TFOOT',      true, false, false, true ], # Table foot
+    [ 'TH',         true, false, false, true ], # Table header cell
+    [ 'THEAD',      true, false, false, true ], # Table head
+    [ 'TITLE',      true, false, false, false ], # Document title
+    [ 'TR',         true, false, false, true ], # Table row
+    [ 'TT',         false, true, false, false ], # Teletype text
+    [ 'U',          false, true, false, false ], # Underlined text
+    [ 'UL',         true, false, false, false ], # Unordered list
+    [ 'VAR',        false, true, false, false ], # Variable
+    ].each { |a| add_tag(*a) }
+    # EXCEPTIONS TODO
+    # A, LABEL can't contain itself
+    # several things (fonts, etc) can't be in PRE
+    # SELECT can only have OPTGROUP or OPTION
+    # TEXTAREA, OPTION only contains plain text
+    # APPLET and OBJECT has PARAM+ followed by block and/or inline
+    # BUTTON can't contain:
+    #  A, INPUT, SELECT, TEXTAREA, LABEL, BUTTON, or IFRAME
+    #  nor FORM, ISINDEX, and FIELDSET
+    # IFRAME can only contain block elems if parent can
+    # MAP can contain block+ *xor* AREA+
+    # SCRIPT only contains a SCRIPT (that is, until /<\/[A-Za-z]/)
+    # BODY must be in HTML or NOFRAMES
+    # COL can only be in COLGROUP or TABLE
+    # COLGROUP has only COL*, and can only be in TABLE
+    # DIR, MENU can only contain LI+, none of which may contain block elems
+    # DL must contain (DT|DD)+
+    # DT and DD are only allowed in DL
+    # FIELDSET contains LEGEND, (block|inline)*
+    # FRAMESET contains (FRAMESET|FRAME), plus NOFRAMES and must be in HTML
+    # H# can only be contained in block elems, but only contain inlines.
+    # HEAD must only contain TITLE, BASE?, ISINDEX?, SCRIPT* STYLE* META* LINK*
+    #   OBJECT* HEAD must be in HTML
+    # HTML is top-level and can only contain HEAD, BODY, or HEAD, FRAMESET
+    # LI can contain blocks except when inside DIR or MENU
+    # LI can only be inside OL, UL, DIR, MENU
+    # OL, UL can only contain LI+
+    # OPTGROUP contains OPTION+
+    # P can only contain inlines. However, it is a block-level elem.
+    # PRE can only contain inlines except IMG, OBJECT, APPLET, BIG, SMALL, SUB,
+    #   SUP, FONT, BASEFONT
+    # tags with optional omitted endtags and their allowed contents:
+    # anchor matches at beginning and end
+      {
+          'AREA'      => '(?!AREA)[A-Z]+',
+          'COLGROUP'  => 'COL',
+          'DD'        => '(?!D[DT]$)[A-Z]+',
+          'DT'        => '(?!D[DT]$)[A-Z]+',
+          'LI'        => '(?!LI$)[A-Z]+',
+          'MAP'       => 'AREA',
+          'P'         => '(?!P$)[A-Z]+',
+          'TD'        => '(?!T[HDR]$)[A-Z]+',
+          'TFOOT'     => 'TR',
+          'TH'        => '(?!T[HDR]$)[A-Z]+',
+          'THEAD'     => 'TR',
+          'TR'        => 'T[HD]',
+      }.each_pair { |tagname, pattern|
+      eval <<EOM
+      class << named(tagname)   # :nodoc:
+        def can_contain(tag, parent)
+          (/\\A#{pattern}\\z/i =~ tag) == 0
+        end
+      end
+EOM
+    }
+    class << named('TEXTAREA') # :nodoc:
+      def can_ignore_whitespace; false; end
+    end
+    class << named('PRE') # :nodoc:
+      def can_ignore_whitespace; false; end
+    end
+    class << named('OPTION') # :nodoc:
+      def can_ignore_whitespace; false; end
+    end
+  end
+end