spk-html5 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. data/History.txt +10 -0
  2. data/Manifest.txt +73 -0
  3. data/README +45 -0
  4. data/Rakefile.rb +33 -0
  5. data/bin/html5 +7 -0
  6. data/lib/html5.rb +13 -0
  7. data/lib/html5/cli.rb +248 -0
  8. data/lib/html5/constants.rb +1061 -0
  9. data/lib/html5/filters/base.rb +10 -0
  10. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  11. data/lib/html5/filters/iso639codes.rb +755 -0
  12. data/lib/html5/filters/optionaltags.rb +198 -0
  13. data/lib/html5/filters/rfc2046.rb +31 -0
  14. data/lib/html5/filters/rfc3987.rb +91 -0
  15. data/lib/html5/filters/sanitizer.rb +15 -0
  16. data/lib/html5/filters/validator.rb +834 -0
  17. data/lib/html5/filters/whitespace.rb +36 -0
  18. data/lib/html5/html5parser.rb +247 -0
  19. data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
  20. data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
  21. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  22. data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  23. data/lib/html5/html5parser/after_head_phase.rb +55 -0
  24. data/lib/html5/html5parser/before_head_phase.rb +44 -0
  25. data/lib/html5/html5parser/before_html_phase.rb +41 -0
  26. data/lib/html5/html5parser/in_body_phase.rb +636 -0
  27. data/lib/html5/html5parser/in_caption_phase.rb +69 -0
  28. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  29. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  30. data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
  31. data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
  32. data/lib/html5/html5parser/in_head_phase.rb +143 -0
  33. data/lib/html5/html5parser/in_row_phase.rb +96 -0
  34. data/lib/html5/html5parser/in_select_phase.rb +90 -0
  35. data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
  36. data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
  37. data/lib/html5/html5parser/in_table_phase.rb +177 -0
  38. data/lib/html5/html5parser/initial_phase.rb +133 -0
  39. data/lib/html5/html5parser/phase.rb +171 -0
  40. data/lib/html5/inputstream.rb +735 -0
  41. data/lib/html5/liberalxmlparser.rb +158 -0
  42. data/lib/html5/sanitizer.rb +209 -0
  43. data/lib/html5/serializer.rb +2 -0
  44. data/lib/html5/serializer/htmlserializer.rb +179 -0
  45. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  46. data/lib/html5/sniffer.rb +45 -0
  47. data/lib/html5/tokenizer.rb +1059 -0
  48. data/lib/html5/treebuilders.rb +24 -0
  49. data/lib/html5/treebuilders/base.rb +339 -0
  50. data/lib/html5/treebuilders/hpricot.rb +231 -0
  51. data/lib/html5/treebuilders/rexml.rb +215 -0
  52. data/lib/html5/treebuilders/simpletree.rb +191 -0
  53. data/lib/html5/treewalkers.rb +26 -0
  54. data/lib/html5/treewalkers/base.rb +162 -0
  55. data/lib/html5/treewalkers/hpricot.rb +48 -0
  56. data/lib/html5/treewalkers/rexml.rb +48 -0
  57. data/lib/html5/treewalkers/simpletree.rb +48 -0
  58. data/lib/html5/version.rb +3 -0
  59. data/test/preamble.rb +69 -0
  60. data/test/test_cli.rb +16 -0
  61. data/test/test_encoding.rb +35 -0
  62. data/test/test_input_stream.rb +26 -0
  63. data/test/test_lxp.rb +283 -0
  64. data/test/test_parser.rb +63 -0
  65. data/test/test_sanitizer.rb +173 -0
  66. data/test/test_serializer.rb +67 -0
  67. data/test/test_sniffer.rb +27 -0
  68. data/test/test_stream.rb +71 -0
  69. data/test/test_tokenizer.rb +95 -0
  70. data/test/test_treewalkers.rb +135 -0
  71. data/test/test_validator.rb +31 -0
  72. data/test/tokenizer_test_parser.rb +67 -0
  73. data/test19.rb +38 -0
  74. metadata +198 -0
@@ -0,0 +1,20 @@
1
+ require 'html5/serializer/htmlserializer'
2
+
3
+ module HTML5
4
+
5
+ class XHTMLSerializer < HTMLSerializer
6
+ DEFAULTS = {
7
+ :quote_attr_values => true,
8
+ :minimize_boolean_attributes => false,
9
+ :use_trailing_solidus => true,
10
+ :escape_lt_in_attrs => true,
11
+ :omit_optional_tags => false,
12
+ :escape_rcdata => true
13
+ }
14
+
15
+ def initialize(options={})
16
+ super(DEFAULTS.clone.update(options))
17
+ end
18
+ end
19
+
20
+ end
@@ -0,0 +1,45 @@
1
+ module HTML5
2
+ module Sniffer
3
+ # 4.7.4
4
+ def html_or_feed str
5
+ s = str[0, 512] # steps 1, 2
6
+ pos = 0
7
+
8
+ while pos < s.length
9
+ case s[pos]
10
+ when ?\t, ?\ , ?\n, ?\r # 0x09, 0x20, 0x0A, 0x0D == tab, space, LF, CR
11
+ pos += 1
12
+ when ?< # 0x3C
13
+ pos += 1
14
+ if s[pos..pos+2] == "!--" # [0x21, 0x2D, 0x2D]
15
+ pos += 3
16
+ until s[pos..pos+2] == "-->" or pos >= s.length
17
+ pos += 1
18
+ end
19
+ pos += 3
20
+ elsif s[pos] == ?! # 0x21
21
+ pos += 1
22
+ until s[pos] == ?> or pos >= s.length # 0x3E
23
+ pos += 1
24
+ end
25
+ pos += 1
26
+ elsif s[pos] == ?? # 0x3F
27
+ until s[pos..pos+1] == "?>" or pos >= s.length # [0x3F, 0x3E]
28
+ pos += 1
29
+ end
30
+ pos += 2
31
+ elsif s[pos..pos+2] == "rss" # [0x72, 0x73, 0x73]
32
+ return "application/rss+xml"
33
+ elsif s[pos..pos+3] == "feed" # [0x66, 0x65, 0x65, 0x64]
34
+ return "application/atom+xml"
35
+ elsif s[pos..pos+6] == "rdf:RDF" # [0x72, 0x64, 0x66, 0x3A, 0x52, 0x44, 0x46]
36
+ raise NotImplementedError
37
+ end
38
+ else
39
+ break
40
+ end
41
+ end
42
+ "text/html"
43
+ end
44
+ end
45
+ end
@@ -0,0 +1,1059 @@
1
+ require 'html5/constants'
2
+ require 'html5/inputstream'
3
+
4
+ module HTML5
5
+
6
+ # This class takes care of tokenizing HTML.
7
+ #
8
+ # * @current_token
9
+ # Holds the token that is currently being processed.
10
+ #
11
+ # * @state
12
+ # Holds a reference to the method to be invoked... XXX
13
+ #
14
+ # * @states
15
+ # Holds a mapping between states and methods that implement the state.
16
+ #
17
+ # * @stream
18
+ # Points to HTMLInputStream object.
19
+
20
+ class HTMLTokenizer
21
+ attr_accessor :content_model_flag, :current_token
22
+ attr_reader :stream
23
+
24
+ # XXX need to fix documentation
25
+
26
+ def initialize(stream, options = {})
27
+ @stream = HTMLInputStream.new(stream, options)
28
+
29
+ # Setup the initial tokenizer state
30
+ @content_model_flag = :PCDATA
31
+ @state = :data_state
32
+ @escapeFlag = false
33
+ @lastFourChars = []
34
+
35
+ # The current token being created
36
+ @current_token = nil
37
+
38
+ # Tokens to be processed.
39
+ @token_queue = []
40
+ @lowercase_element_name = options[:lowercase_element_name] != false
41
+ @lowercase_attr_name = options[:lowercase_attr_name] != false
42
+ end
43
+
44
+ # This is where the magic happens.
45
+ #
46
+ # We do our usually processing through the states and when we have a token
47
+ # to return we yield the token which pauses processing until the next token
48
+ # is requested.
49
+ def each
50
+ @token_queue = []
51
+ # Start processing. When EOF is reached @state will return false
52
+ # instead of true and the loop will terminate.
53
+ while send @state
54
+ yield :type => :ParseError, :data => @stream.errors.shift until @stream.errors.empty?
55
+ yield @token_queue.shift until @token_queue.empty?
56
+ end
57
+ end
58
+
59
+ # Below are various helper functions the tokenizer states use worked out.
60
+
61
+ # If the next character is a '>', convert the current_token into
62
+ # an EmptyTag
63
+
64
+ def process_solidus_in_tag
65
+
66
+ # We need to consume another character to make sure it's a ">"
67
+ data = @stream.char
68
+ rv = false
69
+ if @current_token[:type] == :StartTag and data == ">"
70
+ @current_token[:type] = :EmptyTag
71
+ elsif data == :EOF
72
+ @token_queue << ({:type => :ParseError, :data => "eof-following-solidus"})
73
+ @state = :data_state
74
+ emit_current_token
75
+ rv = true
76
+ else
77
+ @token_queue << {:type => :ParseError, :data => "incorrectly-placed-solidus"}
78
+ end
79
+
80
+ # The character we just consumed need to be put back on the stack so it
81
+ # doesn't get lost...
82
+ @stream.unget(data)
83
+ rv
84
+ end
85
+
86
+ # This function returns either U+FFFD or the character based on the
87
+ # decimal or hexadecimal representation. It also discards ";" if present.
88
+ # If not present @token_queue << {:type => :ParseError}" is invoked.
89
+
90
+ def consume_number_entity(isHex)
91
+
92
+ # XXX More need to be done here. For instance, #13 should prolly be
93
+ # converted to #10 so we don't get \r (#13 is \r right?) in the DOM and
94
+ # such. Thoughts on this appreciated.
95
+ allowed = DIGITS
96
+ radix = 10
97
+ if isHex
98
+ allowed = HEX_DIGITS
99
+ radix = 16
100
+ end
101
+
102
+ char_stack = []
103
+
104
+ # Consume all the characters that are in range while making sure we
105
+ # don't hit an EOF.
106
+ c = @stream.char
107
+ while allowed.include?(c) and c != :EOF
108
+ char_stack.push(c)
109
+ c = @stream.char
110
+ end
111
+
112
+ # Convert the set of characters consumed to an int.
113
+ charAsInt = char_stack.join('').to_i(radix)
114
+
115
+ if charAsInt == 13
116
+ @token_queue << {:type => :ParseError, :data => "incorrect-cr-newline-entity"}
117
+ charAsInt = 10
118
+ elsif (128..159).include? charAsInt
119
+ # If the integer is between 127 and 160 (so 128 and bigger and 159
120
+ # and smaller) we need to do the "windows trick".
121
+ @token_queue << {:type => :ParseError, :data => "illegal-windows-1252-entity"}
122
+
123
+ charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
124
+ end
125
+
126
+ if 0 < charAsInt && charAsInt <= 1114111 && !(55296 <= charAsInt && charAsInt <= 57343) &&
127
+ ![0x10FFFF].include?(charAsInt) # TODO add more entity replacements here
128
+ if String.method_defined? :force_encoding
129
+ char = charAsInt.chr('utf-8')
130
+ else
131
+ char = [charAsInt].pack('U')
132
+ end
133
+ else
134
+ char = [0xFFFD].pack('U')
135
+ @token_queue << {:type => :ParseError, :data => "cant-convert-numeric-entity", :datavars => {"charAsInt" => charAsInt}}
136
+ end
137
+
138
+ # Discard the ; if present. Otherwise, put it back on the queue and
139
+ # invoke parse_error on parser.
140
+ if c != ";"
141
+ @token_queue << {:type => :ParseError, :data => "numeric-entity-without-semicolon"}
142
+ @stream.unget(c)
143
+ end
144
+
145
+ return char
146
+ end
147
+
148
+ def consume_entity(allowed_char=nil, from_attribute=false)
149
+ char = nil
150
+ char_stack = [@stream.char]
151
+ if SPACE_CHARACTERS.include?(char_stack[0]) or [:EOF, '<', '&'].include?(char_stack[0]) ||
152
+ (allowed_char && allowed_char == char_stack[0])
153
+ @stream.unget(char_stack)
154
+ elsif char_stack[0] == '#'
155
+ # We might have a number entity here.
156
+ char_stack += [@stream.char, @stream.char]
157
+ if char_stack[0 .. 1].include? :EOF
158
+ # If we reach the end of the file put everything up to :EOF
159
+ # back in the queue
160
+ char_stack = char_stack[0...char_stack.index(:EOF)]
161
+ @stream.unget(char_stack)
162
+ @token_queue << {:type => :ParseError, :data => "expected-numeric-entity-but-got-eof"}
163
+ else
164
+ if char_stack[1].downcase == "x" and HEX_DIGITS.include? char_stack[2]
165
+ # Hexadecimal entity detected.
166
+ @stream.unget(char_stack[2])
167
+ char = consume_number_entity(true)
168
+ elsif DIGITS.include? char_stack[1]
169
+ # Decimal entity detected.
170
+ @stream.unget(char_stack[1..-1])
171
+ char = consume_number_entity(false)
172
+ else
173
+ # No number entity detected.
174
+ @stream.unget(char_stack)
175
+ @token_queue << {:type => :ParseError, :data => "expected-numeric-entity"}
176
+ end
177
+ end
178
+ else
179
+ # At this point in the process might have named entity. Entities
180
+ # are stored in the global variable "entities".
181
+ #
182
+ # Consume characters and compare to these to a substring of the
183
+ # entity names in the list until the substring no longer matches.
184
+ filteredEntityList = ENTITIES.keys
185
+ filteredEntityList.reject! {|e| e[0].chr != char_stack[0]}
186
+ entityName = nil
187
+
188
+ # Try to find the longest entity the string will match to take care
189
+ # of &noti for instance.
190
+ while char_stack.last != :EOF
191
+ name = char_stack.join('')
192
+ if filteredEntityList.any? {|e| e[0...name.length] == name}
193
+ filteredEntityList.reject! {|e| e[0...name.length] != name}
194
+ char_stack.push(@stream.char)
195
+ else
196
+ break
197
+ end
198
+
199
+ if ENTITIES.include? name
200
+ entityName = name
201
+ break if entityName[-1] == ';'
202
+ end
203
+ end
204
+
205
+ if entityName != nil
206
+ char = ENTITIES[entityName]
207
+
208
+ # Check whether or not the last character returned can be
209
+ # discarded or needs to be put back.
210
+ if entityName[-1] != ?;
211
+ @token_queue << {:type => :ParseError, :data => "named-entity-without-semicolon"}
212
+ end
213
+
214
+ if entityName[-1] != ";" and from_attribute and
215
+ (ASCII_LETTERS.include?(char_stack[entityName.length]) or
216
+ DIGITS.include?(char_stack[entityName.length]))
217
+ @stream.unget(char_stack)
218
+ char = '&'
219
+ else
220
+ @stream.unget(char_stack[entityName.length..-1])
221
+ end
222
+ else
223
+ @token_queue << {:type => :ParseError, :data => "expected-named-entity"}
224
+ @stream.unget(char_stack)
225
+ end
226
+ end
227
+ return char
228
+ end
229
+
230
+ # This method replaces the need for "entityInAttributeValueState".
231
+ def process_entity_in_attribute allowed_char
232
+ entity = consume_entity(allowed_char, true)
233
+ if entity
234
+ @current_token[:data][-1][1] += entity
235
+ else
236
+ @current_token[:data][-1][1] += "&"
237
+ end
238
+ end
239
+
240
+ # This method is a generic handler for emitting the tags. It also sets
241
+ # the state to "data" because that's what's needed after a token has been
242
+ # emitted.
243
+ def emit_current_token
244
+ # Add token to the queue to be yielded
245
+ token = @current_token
246
+ if [:StartTag, :EndTag, :EmptyTag].include?(token[:type])
247
+ if @lowercase_element_name
248
+ token[:name] = token[:name].downcase
249
+ end
250
+
251
+ if token[:type] == :EndTag && token[:self_closing]
252
+ @token_queue << {:type => :ParseError, :data => "self-closing-end-tag"}
253
+ end
254
+ @token_queue << token
255
+ @state = :data_state
256
+ end
257
+
258
+ end
259
+
260
+ # Below are the various tokenizer states worked out.
261
+
262
+ # XXX AT Perhaps we should have Hixie run some evaluation on billions of
263
+ # documents to figure out what the order of the various if and elsif
264
+ # statements should be.
265
+ def data_state
266
+ data = @stream.char
267
+
268
+ if @content_model_flag == :CDATA or @content_model_flag == :RCDATA
269
+ @lastFourChars.shift if @lastFourChars.length == 4
270
+ @lastFourChars << data
271
+ end
272
+
273
+ if data == "&" and [:PCDATA,:RCDATA].include?(@content_model_flag) and !@escapeFlag
274
+ @state = :entity_data_state
275
+ elsif data == "-" && [:CDATA, :RCDATA].include?(@content_model_flag) && !@escapeFlag && @lastFourChars.join('') == "<!--"
276
+ @escapeFlag = true
277
+ @token_queue << {:type => :Characters, :data => data}
278
+ elsif data == "<" and !@escapeFlag and
279
+ [:PCDATA,:CDATA,:RCDATA].include?(@content_model_flag)
280
+ @state = :tag_open_state
281
+ elsif data == ">" and @escapeFlag and
282
+ [:CDATA,:RCDATA].include?(@content_model_flag) and
283
+ @lastFourChars[1..-1].join('') == "-->"
284
+ @escapeFlag = false
285
+ @token_queue << {:type => :Characters, :data => data}
286
+
287
+ elsif data == :EOF
288
+ # Tokenization ends.
289
+ return false
290
+
291
+ elsif SPACE_CHARACTERS.include? data
292
+ # Directly after emitting a token you switch back to the "data
293
+ # state". At that point SPACE_CHARACTERS are important so they are
294
+ # emitted separately.
295
+ # XXX need to check if we don't need a special "spaces" flag on
296
+ # characters.
297
+ @token_queue << {:type => :SpaceCharacters, :data => data + @stream.chars_until(SPACE_CHARACTERS, true)}
298
+ else
299
+ chars = @stream.chars_until(["&", "<", ">", "-"])
300
+ @token_queue << {:type => :Characters, :data => data + chars}
301
+ @lastFourChars += (chars[chars.length > 4 ? -4 : -chars.length, 4] || '').scan(/./)
302
+ @lastFourChars = @lastFourChars[(@lastFourChars.length > 4 ? -4 : -@lastFourChars.length), 4] || []
303
+ end
304
+ return true
305
+ end
306
+
307
+ def entity_data_state
308
+ entity = consume_entity
309
+ if entity
310
+ @token_queue << {:type => :Characters, :data => entity}
311
+ else
312
+ @token_queue << {:type => :Characters, :data => "&"}
313
+ end
314
+ @state = :data_state
315
+ return true
316
+ end
317
+
318
+ def tag_open_state
319
+ data = @stream.char
320
+
321
+ if @content_model_flag == :PCDATA
322
+ if data == "!"
323
+ @state = :markup_declaration_open_state
324
+ elsif data == "/"
325
+ @state = :close_tag_open_state
326
+ elsif data != :EOF and ASCII_LETTERS.include? data
327
+ @current_token = {:type => :StartTag, :name => data, :data => []}
328
+ @state = :tag_name_state
329
+ elsif data == ">"
330
+ # XXX In theory it could be something besides a tag name. But
331
+ # do we really care?
332
+ @token_queue << {:type => :ParseError, :data => "expected-tag-name-but-got-right-bracket"}
333
+ @token_queue << {:type => :Characters, :data => "<>"}
334
+ @state = :data_state
335
+ elsif data == "?"
336
+ # XXX In theory it could be something besides a tag name. But
337
+ # do we really care?
338
+ @token_queue.push({:type => :ParseError, :data => "expected-tag-name-but-got-question-mark"})
339
+ @stream.unget(data)
340
+ @state = :bogus_comment_state
341
+ else
342
+ # XXX
343
+ @token_queue << {:type => :ParseError, :data => "expected-tag-name"}
344
+ @token_queue << {:type => :Characters, :data => "<"}
345
+ @stream.unget(data)
346
+ @state = :data_state
347
+ end
348
+ else
349
+ # We know the content model flag is set to either RCDATA or CDATA
350
+ # now because this state can never be entered with the PLAINTEXT
351
+ # flag.
352
+ if data == "/"
353
+ @state = :close_tag_open_state
354
+ else
355
+ @token_queue << {:type => :Characters, :data => "<"}
356
+ @stream.unget(data)
357
+ @state = :data_state
358
+ end
359
+ end
360
+ return true
361
+ end
362
+
363
+ def close_tag_open_state
364
+ if (@content_model_flag == :RCDATA or @content_model_flag == :CDATA)
365
+ if @current_token
366
+ char_stack = []
367
+
368
+ # So far we know that "</" has been consumed. We now need to know
369
+ # whether the next few characters match the name of last emitted
370
+ # start tag which also happens to be the current_token. We also need
371
+ # to have the character directly after the characters that could
372
+ # match the start tag name.
373
+ (@current_token[:name].length + 1).times do
374
+ char_stack.push(@stream.char)
375
+ # Make sure we don't get hit by :EOF
376
+ break if char_stack[-1] == :EOF
377
+ end
378
+
379
+ # Since this is just for checking. We put the characters back on
380
+ # the stack.
381
+ @stream.unget(char_stack)
382
+ end
383
+
384
+ if @current_token and
385
+ @current_token[:name].downcase ==
386
+ char_stack[0...-1].join('').downcase and
387
+ (SPACE_CHARACTERS + [">", "/", "<", :EOF]).include? char_stack[-1]
388
+ # Because the characters are correct we can safely switch to
389
+ # PCDATA mode now. This also means we don't have to do it when
390
+ # emitting the end tag token.
391
+ @content_model_flag = :PCDATA
392
+ else
393
+ @token_queue << {:type => :Characters, :data => "</"}
394
+ @state = :data_state
395
+
396
+ # Need to return here since we don't want the rest of the
397
+ # method to be walked through.
398
+ return true
399
+ end
400
+ end
401
+
402
+ data = @stream.char
403
+ if data == :EOF
404
+ @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-eof"}
405
+ @token_queue << {:type => :Characters, :data => "</"}
406
+ @state = :data_state
407
+ elsif ASCII_LETTERS.include? data
408
+ @current_token = {:type => :EndTag, :name => data, :data => []}
409
+ @state = :tag_name_state
410
+ elsif data == ">"
411
+ @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-right-bracket"}
412
+ @state = :data_state
413
+ else
414
+ # XXX data can be _'_...
415
+ @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-char", :datavars => {:data => data}}
416
+ @stream.unget(data)
417
+ @state = :bogus_comment_state
418
+ end
419
+
420
+ return true
421
+ end
422
+
423
+ def tag_name_state
424
+ data = @stream.char
425
+ if SPACE_CHARACTERS.include? data
426
+ @state = :before_attribute_name_state
427
+ elsif data == :EOF
428
+ @token_queue << {:type => :ParseError, :data => "eof-in-tag-name"}
429
+ emit_current_token
430
+ elsif ASCII_LETTERS.include? data
431
+ @current_token[:name] += data + @stream.chars_until(ASCII_LETTERS, true)
432
+ elsif data == ">"
433
+ emit_current_token
434
+ elsif data == "/"
435
+ @state = :self_closing_tag_state
436
+ else
437
+ @current_token[:name] += data
438
+ end
439
+ return true
440
+ end
441
+
442
+ def before_attribute_name_state
443
+ data = @stream.char
444
+ if SPACE_CHARACTERS.include? data
445
+ @stream.chars_until(SPACE_CHARACTERS, true)
446
+ elsif data == :EOF
447
+ @token_queue << {:type => :ParseError, :data => "expected-attribute-name-but-got-eof"}
448
+ emit_current_token
449
+ elsif ASCII_LETTERS.include? data
450
+ @current_token[:data].push([data, ""])
451
+ @state = :attribute_name_state
452
+ elsif data == ">"
453
+ emit_current_token
454
+ elsif data == "/"
455
+ @state = :self_closing_tag_state
456
+ elsif data == "'" || data == '"' || data == "="
457
+ @token_queue.push({:type => :ParseError, :data => "invalid-character-in-attribute-name"})
458
+ @current_token[:data].push([data, ""])
459
+ @state = :attribute_name_state
460
+ else
461
+ @current_token[:data].push([data, ""])
462
+ @state = :attribute_name_state
463
+ end
464
+ return true
465
+ end
466
+
467
+ def attribute_name_state
468
+ data = @stream.char
469
+ leavingThisState = true
470
+ emitToken = false
471
+ if data == "="
472
+ @state = :before_attribute_value_state
473
+ elsif data == :EOF
474
+ @token_queue << {:type => :ParseError, :data => "eof-in-attribute-name"}
475
+ @state = :data_state
476
+ emitToken = true
477
+ elsif ASCII_LETTERS.include? data
478
+ @current_token[:data][-1][0] += data + @stream.chars_until(ASCII_LETTERS, true)
479
+ leavingThisState = false
480
+ elsif data == ">"
481
+ # XXX If we emit here the attributes are converted to a dict
482
+ # without being checked and when the code below runs we error
483
+ # because data is a dict not a list
484
+ emitToken = true
485
+ elsif SPACE_CHARACTERS.include? data
486
+ @state = :after_attribute_name_state
487
+ elsif data == "/"
488
+ if !process_solidus_in_tag
489
+ @state = :before_attribute_name_state
490
+ end
491
+ elsif data == "'" or data == '"'
492
+ @token_queue.push({:type => :ParseError, :data => "invalid-character-in-attribute-name"})
493
+ @current_token[:data][-1][0] += data
494
+ leavingThisState = false
495
+ else
496
+ @current_token[:data][-1][0] += data
497
+ leavingThisState = false
498
+ end
499
+
500
+ if leavingThisState
501
+ # Attributes are not dropped at this stage. That happens when the
502
+ # start tag token is emitted so values can still be safely appended
503
+ # to attributes, but we do want to report the parse error in time.
504
+ if @lowercase_attr_name
505
+ @current_token[:data][-1][0] = @current_token[:data].last.first.downcase
506
+ end
507
+ @current_token[:data][0...-1].each {|name,value|
508
+ if @current_token[:data].last.first == name
509
+ @token_queue << {:type => :ParseError, :data => "duplicate-attribute"}
510
+ break # don't report an error more than once
511
+ end
512
+ }
513
+ # XXX Fix for above XXX
514
+ emit_current_token if emitToken
515
+ end
516
+ return true
517
+ end
518
+
519
+ def after_attribute_name_state
520
+ data = @stream.char
521
+ if SPACE_CHARACTERS.include? data
522
+ @stream.chars_until(SPACE_CHARACTERS, true)
523
+ elsif data == "="
524
+ @state = :before_attribute_value_state
525
+ elsif data == ">"
526
+ emit_current_token
527
+ elsif data == :EOF
528
+ @token_queue << {:type => :ParseError, :data => "expected-end-of-tag-but-got-eof"}
529
+ emit_current_token
530
+ elsif ASCII_LETTERS.include? data
531
+ @current_token[:data].push([data, ""])
532
+ @state = :attribute_name_state
533
+ elsif data == "/"
534
+ @state = :self_closing_tag_state
535
+ else
536
+ @current_token[:data].push([data, ""])
537
+ @state = :attribute_name_state
538
+ end
539
+ return true
540
+ end
541
+
542
+ def before_attribute_value_state
543
+ data = @stream.char
544
+ if SPACE_CHARACTERS.include? data
545
+ @stream.chars_until(SPACE_CHARACTERS, true)
546
+ elsif data == "\""
547
+ @state = :attribute_value_double_quoted_state
548
+ elsif data == "&"
549
+ @state = :attribute_value_unquoted_state
550
+ @stream.unget(data);
551
+ elsif data == "'"
552
+ @state = :attribute_value_single_quoted_state
553
+ elsif data == ">"
554
+ emit_current_token
555
+ elsif data == "="
556
+ @token_queue.push({:type => :ParseError, :data => "equals-in-unquoted-attribute-value"})
557
+ @current_token[:data][-1][1] += data
558
+ @state = :attribute_value_unquoted_state
559
+ elsif data == :EOF
560
+ @token_queue << {:type => :ParseError, :data => "expected-attribute-value-but-got-eof"}
561
+ emit_current_token
562
+ else
563
+ @current_token[:data][-1][1] += data
564
+ @state = :attribute_value_unquoted_state
565
+ end
566
+ return true
567
+ end
568
+
569
+ def attribute_value_double_quoted_state
570
+ data = @stream.char
571
+ if data == "\""
572
+ @state = :after_attribute_value_state
573
+ elsif data == "&"
574
+ process_entity_in_attribute('"')
575
+ elsif data == :EOF
576
+ @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-double-quote"}
577
+ emit_current_token
578
+ else
579
+ @current_token[:data][-1][1] += data + @stream.chars_until(["\"", "&"])
580
+ end
581
+ return true
582
+ end
583
+
584
+ def attribute_value_single_quoted_state
585
+ data = @stream.char
586
+ if data == "'"
587
+ @state = :after_attribute_value_state
588
+ elsif data == "&"
589
+ process_entity_in_attribute("'")
590
+ elsif data == :EOF
591
+ @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-single-quote"}
592
+ emit_current_token
593
+ else
594
+ @current_token[:data][-1][1] += data + @stream.chars_until(["'", "&"])
595
+ end
596
+ return true
597
+ end
598
+
599
+ def attribute_value_unquoted_state
600
+ data = @stream.char
601
+ if SPACE_CHARACTERS.include? data
602
+ @state = :before_attribute_name_state
603
+ elsif data == "&"
604
+ process_entity_in_attribute ''
605
+ elsif data == ">"
606
+ emit_current_token
607
+ elsif data == '"' || data == "'" || data == "="
608
+ @token_queue.push({:type => :ParseError, :data => "unexpected-character-in-unquoted-attribute-value"})
609
+ @current_token[:data][-1][1] += data
610
+ elsif data == :EOF
611
+ @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-no-quotes"}
612
+ emit_current_token
613
+ else
614
+ @current_token[:data][-1][1] += data + @stream.chars_until(["&", ">","<"] + SPACE_CHARACTERS)
615
+ end
616
+ return true
617
+ end
618
+
619
+ def after_attribute_value_state
620
+ data = self.stream.char()
621
+ if SPACE_CHARACTERS.include? data
622
+ @state = :before_attribute_name_state
623
+ elsif data == ">"
624
+ emit_current_token
625
+ @state = :data_state
626
+ elsif data == "/"
627
+ @state = :self_closing_tag_state
628
+ elsif data == :EOF
629
+ @token_queue << {:type => :ParseError, :data => "unexpected-EOF-after-attribute-value"}
630
+ emit_current_token
631
+ @stream.unget(data)
632
+ @state = :data_state
633
+ else
634
+ @token_queue.push({:type => :ParseError, :data => "unexpected-character-after-attribute-value"})
635
+ @stream.unget(data)
636
+ @state = :before_attribute_name_state
637
+ end
638
+ true
639
+ end
640
+
641
+ def self_closing_tag_state
642
+ c = @stream.char
643
+ case c
644
+ when ">"
645
+ @current_token[:self_closing] = true
646
+ emit_current_token
647
+ @state = :data_state
648
+ when :EOF
649
+ @token_queue << {:type => :ParseError, :data => "eof-in-tag-name"}
650
+ @stream.unget(c)
651
+ @state = :data_state
652
+ else
653
+ @token_queue << {:type => :ParseError, :data => "expected-self-closing-tag"}
654
+ @stream.unget(c)
655
+ @state = :before_attribute_name_state
656
+ end
657
+ true
658
+ end
659
+
660
+ def bogus_comment_state
661
+ # Make a new comment token and give it as value all the characters
662
+ # until the first > or :EOF (chars_until checks for :EOF automatically)
663
+ # and emit it.
664
+ @token_queue << {:type => :Comment, :data => @stream.chars_until([">"])}
665
+
666
+ # Eat the character directly after the bogus comment which is either a
667
+ # ">" or an :EOF.
668
+ @stream.char
669
+ @state = :data_state
670
+ return true
671
+ end
672
+
673
+ def markup_declaration_open_state
674
+ char_stack = [@stream.char, @stream.char]
675
+ if char_stack == ["-", "-"]
676
+ @current_token = {:type => :Comment, :data => ""}
677
+ @state = :comment_start_state
678
+ else
679
+ 5.times { char_stack.push(@stream.char) }
680
+ # Put in explicit :EOF check
681
+ if !char_stack.include?(:EOF) && char_stack.join("").upcase == "DOCTYPE"
682
+ @current_token = {:type => :Doctype, :name => "", :publicId => nil, :systemId => nil, :correct => true}
683
+ @state = :doctype_state
684
+ else
685
+ @token_queue << {:type => :ParseError, :data => "expected-dashes-or-doctype"}
686
+ @stream.unget(char_stack)
687
+ @state = :bogus_comment_state
688
+ end
689
+ end
690
+ return true
691
+ end
692
+
693
+ def comment_start_state
694
+ data = @stream.char
695
+ if data == "-"
696
+ @state = :comment_start_dash_state
697
+ elsif data == ">"
698
+ @token_queue << {:type => :ParseError, :data => "incorrect-comment"}
699
+ @token_queue << @current_token
700
+ @state = :data_state
701
+ elsif data == :EOF
702
+ @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
703
+ @token_queue << @current_token
704
+ @state = :data_state
705
+ else
706
+ @current_token[:data] += data + @stream.chars_until("-")
707
+ @state = :comment_state
708
+ end
709
+ return true
710
+ end
711
+
712
+ def comment_start_dash_state
713
+ data = @stream.char
714
+ if data == "-"
715
+ @state = :comment_end_state
716
+ elsif data == ">"
717
+ @token_queue << {:type => :ParseError, :data => "incorrect-comment"}
718
+ @token_queue << @current_token
719
+ @state = :data_state
720
+ elsif data == :EOF
721
+ @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
722
+ @token_queue << @current_token
723
+ @state = :data_state
724
+ else
725
+ @current_token[:data] += '-' + data + @stream.chars_until("-")
726
+ @state = :comment_state
727
+ end
728
+ return true
729
+ end
730
+
731
+ def comment_state
732
+ data = @stream.char
733
+ if data == "-"
734
+ @state = :comment_end_dash_state
735
+ elsif data == :EOF
736
+ @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
737
+ @token_queue << @current_token
738
+ @state = :data_state
739
+ else
740
+ @current_token[:data] += data + @stream.chars_until("-")
741
+ end
742
+ return true
743
+ end
744
+
745
+ def comment_end_dash_state
746
+ data = @stream.char
747
+ if data == "-"
748
+ @state = :comment_end_state
749
+ elsif data == :EOF
750
+ @token_queue << {:type => :ParseError, :data => "eof-in-comment-end-dash"}
751
+ @token_queue << @current_token
752
+ @state = :data_state
753
+ else
754
+ @current_token[:data] += "-" + data + @stream.chars_until("-")
755
+ # Consume the next character which is either a "-" or an :EOF as
756
+ # well so if there's a "-" directly after the "-" we go nicely to
757
+ # the "comment end state" without emitting a ParseError there.
758
+ @stream.char
759
+ end
760
+ return true
761
+ end
762
+
763
# Tokenizer state: saw "--" inside a comment; a ">" would close it.
def comment_end_state
  c = @stream.char
  case c
  when ">"
    # Proper "-->" terminator: emit the comment token.
    @token_queue << @current_token
    @state = :data_state
  when "-"
    # Extra dash after "--": record it and stay in this state.
    @token_queue << {:type => :ParseError, :data => "unexpected-dash-after-double-dash-in-comment"}
    @current_token[:data] += c
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-comment-double-dash"}
    @token_queue << @current_token
    @state = :data_state
  else
    # "--" was not the end of the comment after all; fold it back in.
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-comment"}
    @current_token[:data] += "--" + c
    @state = :comment_state
  end
  true
end
783
+
784
# Tokenizer state: right after "<!DOCTYPE"; whitespace must follow.
# Either way we continue in before_doctype_name_state; a missing space
# is reported and the offending character is pushed back.
def doctype_state
  c = @stream.char
  unless SPACE_CHARACTERS.include?(c)
    @token_queue << {:type => :ParseError, :data => "need-space-after-doctype"}
    @stream.unget(c)
  end
  @state = :before_doctype_name_state
  true
end
795
+
796
# Tokenizer state: skipping whitespace before the DOCTYPE name.
def before_doctype_name_state
  c = @stream.char
  case c
  when *SPACE_CHARACTERS
    # More leading whitespace: stay in this state.
  when ">"
    @token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-right-bracket"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-eof"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    # First name character seen.
    @current_token[:name] = c
    @state = :doctype_name_state
  end
  true
end
815
+
816
# Tokenizer state: accumulating the DOCTYPE name character by character.
def doctype_name_state
  c = @stream.char
  case c
  when *SPACE_CHARACTERS
    @state = :after_doctype_name_state
  when ">"
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype-name"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:name] += c
  end
  true
end
834
+
835
# Tokenizer state: after the DOCTYPE name; look ahead for the PUBLIC or
# SYSTEM keyword (matched case-insensitively) or the end of the doctype.
def after_doctype_name_state
  data = @stream.char
  if SPACE_CHARACTERS.include? data
    # Skip whitespace between the name and any keyword.
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @current_token[:correct] = false
    @stream.unget(data)
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @token_queue << @current_token
    @state = :data_state
  else
    # Collect five more characters so we have a six-char candidate keyword.
    char_stack = [data]
    # BUG FIX: was `stream.char` (undefined local); must read from @stream.
    5.times { char_stack << @stream.char }
    token = char_stack.join('').tr(ASCII_UPPERCASE, ASCII_LOWERCASE)
    if token == "public" and !char_stack.include?(:EOF)
      @state = :before_doctype_public_identifier_state
    elsif token == "system" and !char_stack.include?(:EOF)
      @state = :before_doctype_system_identifier_state
    else
      # Not a recognized keyword: push everything back and go bogus.
      @stream.unget(char_stack)
      @token_queue << {:type => :ParseError, :data => "expected-space-or-right-bracket-in-doctype", "datavars" => {"data" => data}}
      @current_token[:correct] = false
      @state = :bogus_doctype_state
    end
  end
  return true
end
864
+
865
# Tokenizer state: after the PUBLIC keyword; expect a quoted identifier.
def before_doctype_public_identifier_state
  c = @stream.char
  case c
  when *SPACE_CHARACTERS
    # Skip whitespace before the opening quote.
  when "\""
    @current_token[:publicId] = ""
    @state = :doctype_public_identifier_double_quoted_state
  when "'"
    @current_token[:publicId] = ""
    @state = :doctype_public_identifier_single_quoted_state
  when ">"
    @token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @current_token[:correct] = false
    @state = :bogus_doctype_state
  end
  true
end
893
+
894
# Tokenizer state: inside a double-quoted DOCTYPE public identifier.
def doctype_public_identifier_double_quoted_state
  c = @stream.char
  case c
  when "\""
    @state = :after_doctype_public_identifier_state
  when ">"
    @token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:publicId] += c
  end
  true
end
913
+
914
# Tokenizer state: inside a single-quoted DOCTYPE public identifier.
def doctype_public_identifier_single_quoted_state
  c = @stream.char
  case c
  when "'"
    @state = :after_doctype_public_identifier_state
  when ">"
    @token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:publicId] += c
  end
  true
end
933
+
934
# Tokenizer state: after the public identifier; a quoted system
# identifier or the closing ">" may follow.
def after_doctype_public_identifier_state
  data = @stream.char
  if SPACE_CHARACTERS.include?(data)
    # Skip whitespace between the identifiers.
  elsif data == "\""
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_double_quoted_state
  elsif data == "'"
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_single_quoted_state
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    # BUG FIX: this branch handles a stray character, not EOF, so it must
    # report "unexpected-char-in-doctype" (was wrongly "eof-in-doctype",
    # a copy-paste from the EOF branch above).
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @current_token[:correct] = false
    @state = :bogus_doctype_state
  end
  return true
end
958
+
959
# Tokenizer state: after the SYSTEM keyword; expect a quoted identifier.
def before_doctype_system_identifier_state
  c = @stream.char
  case c
  when *SPACE_CHARACTERS
    # Skip whitespace before the opening quote.
  when "\""
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_double_quoted_state
  when "'"
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_single_quoted_state
  when ">"
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @current_token[:correct] = false
    @state = :bogus_doctype_state
  end
  true
end
985
+
986
# Tokenizer state: inside a double-quoted DOCTYPE system identifier.
def doctype_system_identifier_double_quoted_state
  c = @stream.char
  case c
  when "\""
    @state = :after_doctype_system_identifier_state
  when ">"
    @token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:systemId] += c
  end
  true
end
1005
+
1006
# Tokenizer state: inside a single-quoted DOCTYPE system identifier.
def doctype_system_identifier_single_quoted_state
  c = @stream.char
  case c
  when "'"
    @state = :after_doctype_system_identifier_state
  when ">"
    @token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:systemId] += c
  end
  true
end
1025
+
1026
# Tokenizer state: after the system identifier; only whitespace and the
# closing ">" are legitimate here.
def after_doctype_system_identifier_state
  data = @stream.char
  if SPACE_CHARACTERS.include?(data)
    # Skip trailing whitespace.
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    # BUG FIX: this branch handles a stray character, not EOF, so it must
    # report "unexpected-char-in-doctype" (was wrongly "eof-in-doctype",
    # a copy-paste from the EOF branch above).
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @state = :bogus_doctype_state
  end
  return true
end
1043
+
1044
# Tokenizer state: discard characters until ">" closes the bogus doctype.
def bogus_doctype_state
  c = @stream.char
  case c
  when ">"
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    # Push the EOF back so the data state sees it too, then emit.
    @stream.unget(c)
    @token_queue << @current_token
    @state = :data_state
  end
  true
end
1056
+
1057
+ end
1058
+
1059
+ end