RubyGems - html5 - Versions diffs - 0.1.0 → 0.10.0 - Mend

html5 0.1.0 → 0.10.0

Files changed (98) hide show

data/History.txt +9 -2
data/Manifest.txt +61 -2
data/README +41 -5
data/Rakefile.rb +22 -6
data/{parse.rb → bin/html5} +11 -11
data/lib/core_ext/string.rb +17 -0
data/lib/html5/constants.rb +228 -0
data/lib/html5/filters/iso639codes.rb +752 -0
data/lib/html5/filters/rfc2046.rb +30 -0
data/lib/html5/filters/rfc3987.rb +89 -0
data/lib/html5/filters/validator.rb +830 -0
data/lib/html5/html5parser.rb +25 -25
data/lib/html5/html5parser/after_body_phase.rb +3 -3
data/lib/html5/html5parser/after_frameset_phase.rb +3 -4
data/lib/html5/html5parser/after_head_phase.rb +6 -6
data/lib/html5/html5parser/before_head_phase.rb +1 -1
data/lib/html5/html5parser/in_body_phase.rb +54 -48
data/lib/html5/html5parser/in_caption_phase.rb +7 -6
data/lib/html5/html5parser/in_cell_phase.rb +3 -3
data/lib/html5/html5parser/in_column_group_phase.rb +1 -1
data/lib/html5/html5parser/in_frameset_phase.rb +5 -5
data/lib/html5/html5parser/in_head_phase.rb +10 -10
data/lib/html5/html5parser/in_row_phase.rb +4 -2
data/lib/html5/html5parser/in_select_phase.rb +7 -6
data/lib/html5/html5parser/in_table_body_phase.rb +8 -5
data/lib/html5/html5parser/in_table_phase.rb +12 -7
data/lib/html5/html5parser/initial_phase.rb +5 -6
data/lib/html5/html5parser/phase.rb +5 -9
data/lib/html5/html5parser/root_element_phase.rb +1 -2
data/lib/html5/html5parser/trailing_end_phase.rb +3 -3
data/lib/html5/inputstream.rb +25 -31
data/lib/html5/liberalxmlparser.rb +2 -2
data/lib/html5/sanitizer.rb +6 -6
data/lib/html5/serializer/htmlserializer.rb +2 -3
data/lib/html5/sniffer.rb +45 -0
data/lib/html5/tokenizer.rb +57 -59
data/lib/html5/treebuilders/rexml.rb +7 -6
data/lib/html5/treebuilders/simpletree.rb +1 -1
data/lib/html5/treewalkers/base.rb +8 -0
data/lib/html5/version.rb +3 -0
data/testdata/encoding/chardet/test_big5.txt +51 -0
data/testdata/encoding/test-yahoo-jp.dat +10 -0
data/testdata/encoding/tests1.dat +394 -0
data/testdata/encoding/tests2.dat +81 -0
data/testdata/sanitizer/tests1.dat +416 -0
data/testdata/serializer/core.test +104 -0
data/testdata/serializer/injectmeta.test +65 -0
data/testdata/serializer/optionaltags.test +900 -0
data/testdata/serializer/options.test +60 -0
data/testdata/serializer/whitespace.test +51 -0
data/testdata/sites/google-results.htm +1 -0
data/testdata/sites/python-ref-import.htm +1 -0
data/testdata/sites/web-apps-old.htm +1 -0
data/testdata/sites/web-apps.htm +34275 -0
data/testdata/sniffer/htmlOrFeed.json +43 -0
data/testdata/tokenizer/contentModelFlags.test +48 -0
data/testdata/tokenizer/entities.test +2339 -0
data/testdata/tokenizer/escapeFlag.test +21 -0
data/testdata/tokenizer/test1.test +172 -0
data/testdata/tokenizer/test2.test +129 -0
data/testdata/tokenizer/test3.test +367 -0
data/testdata/tokenizer/test4.test +198 -0
data/testdata/tree-construction/tests1.dat +1950 -0
data/testdata/tree-construction/tests2.dat +773 -0
data/testdata/tree-construction/tests3.dat +270 -0
data/testdata/tree-construction/tests4.dat +60 -0
data/testdata/tree-construction/tests5.dat +175 -0
data/testdata/tree-construction/tests6.dat +196 -0
data/testdata/validator/attributes.test +1035 -0
data/testdata/validator/base-href-attribute.test +787 -0
data/testdata/validator/base-target-attribute.test +35 -0
data/testdata/validator/blockquote-cite-attribute.test +7 -0
data/testdata/validator/classattribute.test +152 -0
data/testdata/validator/contenteditableattribute.test +59 -0
data/testdata/validator/contextmenuattribute.test +115 -0
data/testdata/validator/dirattribute.test +59 -0
data/testdata/validator/draggableattribute.test +63 -0
data/testdata/validator/html-xmlns-attribute.test +23 -0
data/testdata/validator/idattribute.test +115 -0
data/testdata/validator/inputattributes.test +2795 -0
data/testdata/validator/irrelevantattribute.test +63 -0
data/testdata/validator/langattribute.test +5579 -0
data/testdata/validator/li-value-attribute.test +7 -0
data/testdata/validator/link-href-attribute.test +7 -0
data/testdata/validator/link-hreflang-attribute.test +7 -0
data/testdata/validator/link-rel-attribute.test +271 -0
data/testdata/validator/ol-start-attribute.test +7 -0
data/testdata/validator/starttags.test +375 -0
data/testdata/validator/style-scoped-attribute.test +7 -0
data/testdata/validator/tabindexattribute.test +79 -0
data/tests/preamble.rb +7 -17
data/tests/test_encoding.rb +1 -1
data/tests/test_lxp.rb +16 -0
data/tests/test_parser.rb +2 -2
data/tests/test_sniffer.rb +27 -0
data/tests/test_treewalkers.rb +41 -22
data/tests/test_validator.rb +31 -0
metadata +65 -6

data/lib/html5/html5parser/root_element_phase.rb CHANGED

@@ -33,10 +33,9 @@ module HTML5
     def insert_html_element
       element = @tree.createElement('html', {})
-      @tree.open_elements.push(element)
+      @tree.open_elements << element
       @tree.document.appendChild(element)
       @parser.phase = @parser.phases[:beforeHead]
     end
   end
 end

data/lib/html5/html5parser/trailing_end_phase.rb CHANGED

@@ -15,19 +15,19 @@ module HTML5
     end
     def processCharacters(data)
-      parse_error(_('Unexpected non-space characters. Expected end of file.'))
+      parse_error("expected-eof-but-got-char")
       @parser.phase = @parser.last_phase
       @parser.phase.processCharacters(data)
     end
     def processStartTag(name, attributes)
-      parse_error(_('Unexpected start tag (#{name}). Expected end of file.'))
+      parse_error("expected-eof-but-got-start-tag", {"name" => name})
       @parser.phase = @parser.last_phase
       @parser.phase.processStartTag(name, attributes)
     end
     def processEndTag(name)
-      parse_error(_('Unexpected end tag (#{name}). Expected end of file.'))
+      parse_error("expected-eof-but-got-end-tag", {"name" => name})
       @parser.phase = @parser.last_phase
       @parser.phase.processEndTag(name)
     end

data/lib/html5/inputstream.rb CHANGED

@@ -60,15 +60,11 @@ module HTML5
       if @char_encoding == 'windows-1252'
         @win1252 = true
       elsif @char_encoding != 'utf-8'
+        require 'iconv'
         begin
-          require 'iconv'
-          begin
-            @buffer << @raw_stream.read unless @raw_stream.eof?
-            @buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first
-          rescue
-            @win1252 = true
-          end
-        rescue LoadError
+          @buffer << @raw_stream.read unless @raw_stream.eof?
+          @buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first
+        rescue
           @win1252 = true
         end
       end
@@ -88,12 +84,11 @@ module HTML5
     def open_stream(source)
       # Already an IO like object
       if source.respond_to?(:read)
-        @stream = source
+        source
       else
         # Treat source as a string and wrap in StringIO
-        @stream = StringIO.new(source)
+        StringIO.new(source)
       end
-      return @stream
     end
     def detect_encoding
@@ -138,14 +133,12 @@ module HTML5
         encoding = @DEFAULT_ENCODING
       end
-      #Substitute for equivalent encodings
-      encoding_sub = {'iso-8859-1' => 'windows-1252'}
-      if encoding_sub.has_key?(encoding.downcase)
-        encoding = encoding_sub[encoding.downcase]
+      #Substitute for equivalent encoding
+      if 'iso-8859-1' == encoding.downcase
+        encoding = 'windows-1252'
       end
-      return encoding
+      encoding
     end
     # Attempts to detect at BOM at the start of the stream. If
@@ -153,9 +146,9 @@ module HTML5
     # encoding otherwise return nil
     def detect_bom
       bom_dict = {
-        "\xef\xbb\xbf" => 'utf-8',
-        "\xff\xfe" => 'utf-16le',
-        "\xfe\xff" => 'utf-16be',
+        "\xef\xbb\xbf"     => 'utf-8',
+        "\xff\xfe"         => 'utf-16le',
+        "\xfe\xff"         => 'utf-16be',
         "\xff\xfe\x00\x00" => 'utf-32le',
         "\x00\x00\xfe\xff" => 'utf-32be'
       }
@@ -198,6 +191,7 @@ module HTML5
         end
       end
+      #TODO: huh?
       require 'delegate'
       @raw_stream = SimpleDelegator.new(@raw_stream)
@@ -250,7 +244,7 @@ module HTML5
           col -= 1
         end
       end
-      return [line+1, col]
+      return [line + 1, col]
     end
     # Read one character from the stream or queue if available. Return
@@ -259,9 +253,9 @@ module HTML5
       unless @queue.empty?
         return @queue.shift
       else
-        if @tell + 3 > @buffer.length and !@raw_stream.eof?
+        if @tell + 3 > @buffer.length && !@raw_stream.eof?
           # read next block
-          @buffer = @buffer[@tell .. -1] + @raw_stream.read(@NUM_BYTES_BUFFER)
+          @buffer = @buffer[@tell..-1] + @raw_stream.read(@NUM_BYTES_BUFFER)
           @tell = 0
         end
@@ -269,7 +263,7 @@ module HTML5
         @tell += 1
         case c
-        when 0x01 .. 0x7F
+        when 0x01..0x7F
           if c == 0x0D
             # normalize newlines
             @tell += 1 if @buffer[@tell] == 0x0A
@@ -287,7 +281,7 @@ module HTML5
           c.chr
-        when 0x80 .. 0xBF
+        when 0x80..0xBF
           if !@win1252
             [0xFFFD].pack('U') # invalid utf-8
           elsif c <= 0x9f
@@ -296,10 +290,11 @@ module HTML5
             "\xC2" + c.chr # convert to utf-8
           end
-        when 0xC0 .. 0xFF
-          if instance_variable_defined?(:@win1252) && @win1252
-            "\xC3" + (c-64).chr # convert to utf-8
-          elsif @buffer[@tell-1 .. @tell+3] =~ /^
+        when 0xC0..0xFF
+          if instance_variables.include?("@win1252") && @win1252
+            "\xC3" + (c - 64).chr # convert to utf-8
+          # from http://www.w3.org/International/questions/qa-forms-utf-8.en.php
+          elsif @buffer[@tell - 1..@tell + 3] =~ /^
                 ( [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
                 |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
                 | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
@@ -315,8 +310,7 @@ module HTML5
           end
         when 0x00
-          @errors.push('null character found in input stream, ' +
-            'replaced with U+FFFD')
+          @errors.push("null-character")
           [0xFFFD].pack('U') # null characters are invalid
         else

data/lib/html5/liberalxmlparser.rb CHANGED

@@ -50,7 +50,7 @@ module HTML5
       when :EndTag
         if token[:data]
-           parse_error(_("End tag contains unexpected attributes."))
+           parse_error("attributes-in-end-tag")
         end
       when :Comment
@@ -81,7 +81,7 @@ module HTML5
       # open and close tags are emitted
       if token[:type]  == :EndTag
         if VOID_ELEMENTS.include? token[:name]
-          if @tree.open_elements[-1].name != token["name"]:
+          if @tree.open_elements[-1].name != token["name"]
             token[:type] = :EmptyTag
             token["data"] ||= {}
           end

data/lib/html5/sanitizer.rb CHANGED

@@ -110,13 +110,13 @@ module HTML5
     def sanitize_token(token)
         case token[:type]
         when :StartTag, :EndTag, :EmptyTag
-          if ALLOWED_ELEMENTS.include?(token[:name])
+          if self.class.const_get("ALLOWED_ELEMENTS").include?(token[:name])
             if token.has_key? :data
               attrs = Hash[*token[:data].flatten]
-              attrs.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) }
+              attrs.delete_if { |attr,v| !self.class.const_get("ALLOWED_ATTRIBUTES").include?(attr) }
               ATTR_VAL_IS_URI.each do |attr|
                 val_unescaped = CGI.unescapeHTML(attrs[attr].to_s).gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
-                if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
+                if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !self.class.const_get("ALLOWED_PROTOCOLS").include?(val_unescaped.split(':')[0])
                   attrs.delete attr
                 end
               end
@@ -160,14 +160,14 @@ module HTML5
       style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
         next if val.empty?
         prop.downcase!
-        if ALLOWED_CSS_PROPERTIES.include?(prop)
+        if self.class.const_get("ALLOWED_CSS_PROPERTIES").include?(prop)
           clean << "#{prop}: #{val};"
         elsif %w[background border margin padding].include?(prop.split('-')[0])
           clean << "#{prop}: #{val};" unless val.split().any? do |keyword|
-            !ALLOWED_CSS_KEYWORDS.include?(keyword) and
+            !self.class.const_get("ALLOWED_CSS_KEYWORDS").include?(keyword) and
             keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
           end
-        elsif ALLOWED_SVG_PROPERTIES.include?(prop)
+        elsif self.class.const_get("ALLOWED_SVG_PROPERTIES").include?(prop)
           clean << "#{prop}: #{val};"
         end
       end

data/lib/html5/serializer/htmlserializer.rb CHANGED

@@ -31,7 +31,7 @@ module HTML5
       @inject_meta_charset = true
       options.each do |name, value|
-        next unless instance_variable_defined?("@#{name}")
+        next unless instance_variables.include?("@#{name}")
         @use_best_quote_char = false if name.to_s == 'quote_char'
         instance_variable_set("@#{name}", value)
       end
@@ -73,7 +73,7 @@ module HTML5
         elsif [:Characters, :SpaceCharacters].include? type
           if type == :SpaceCharacters or in_cdata
             if in_cdata and token[:data].include?("</")
-              serialize_error(_("Unexpected </ in CDATA"))
+              serialize_error("Unexpected </ in CDATA")
             end
             result << token[:data]
           else
@@ -171,7 +171,6 @@ module HTML5
       end
     end
-    def _(string); string; end
   end
   # Error in serialized tree

data/lib/html5/sniffer.rb ADDED

@@ -0,0 +1,45 @@
+module HTML5
+module Sniffer
+  # 4.7.4
+  def html_or_feed str
+    s = str[0, 512] # steps 1, 2
+    pos = 0
+    while pos < s.length
+      case s[pos]
+      when 0x09, 0x20, 0x0A, 0x0D # tab, space, LF, CR
+        pos += 1
+      when  0x3C # "<"
+        pos += 1
+        if s[pos..pos+2] == "!--" # [0x21, 0x2D, 0x2D]
+          pos += 3
+          until s[pos..pos+2] == "-->" or pos >= s.length
+            pos += 1
+          end
+          pos += 3
+        elsif s[pos] == 0x21 # "!"
+          pos += 1
+          until s[pos] == 0x3E or pos >= s.length # ">"
+            pos += 1
+          end
+          pos += 1
+        elsif s[pos] == 0x3F # "?"
+          until s[pos..pos+1] == "?>" or pos >= s.length # [0x3F, 0x3E]
+            pos +=  1
+          end
+          pos += 2
+        elsif s[pos..pos+2] == "rss"   # [0x72, 0x73, 0x73]
+          return "application/rss+xml"
+        elsif s[pos..pos+3] == "feed"  # [0x66, 0x65, 0x65, 0x64]
+          return "application/atom+xml"
+        elsif s[pos..pos+6] == "rdf:RDF" # [0x72, 0x64, 0x66, 0x3A, 0x52, 0x44, 0x46]
+          raise NotImplementedError
+        end
+      else
+        break
+      end
+    end
+    "text/html"
+  end
+end
+end

data/lib/html5/tokenizer.rb CHANGED

@@ -69,7 +69,7 @@ module HTML5
       if @current_token[:type] == :StartTag and data == ">"
         @current_token[:type] = :EmptyTag
       else
-        @token_queue << {:type => :ParseError, :data => _("Solidus (/) incorrectly placed in tag.")}
+        @token_queue << {:type => :ParseError, :data => "incorrectly-placed-solidus"}
       end
       # The character we just consumed need to be put back on the stack so it
@@ -107,12 +107,12 @@ module HTML5
       charAsInt = char_stack.join('').to_i(radix)
       if charAsInt == 13
-        @token_queue << {:type => :ParseError, :data => _("Incorrect CR newline entity. Replaced with LF.")}
+        @token_queue << {:type => :ParseError, :data => "incorrect-cr-newline-entity"}
         charAsInt = 10
       elsif (128..159).include? charAsInt
         # If the integer is between 127 and 160 (so 128 and bigger and 159
         # and smaller) we need to do the "windows trick".
-        @token_queue << {:type => :ParseError, :data => _("Entity used with illegal number (windows-1252 reference).")}
+        @token_queue << {:type => :ParseError, :data => "illegal-windows-1252-entity"}
         charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
       end
@@ -121,13 +121,13 @@ module HTML5
         char = [charAsInt].pack('U')
       else
         char = [0xFFFD].pack('U')
-        @token_queue << {:type => :ParseError, :data => _("Numeric entity represents an illegal codepoint.")}
+        @token_queue << {:type => :ParseError, :data => "cant-convert-numeric-entity", :datavars => {"charAsInt" => charAsInt}}
       end
       # Discard the ; if present. Otherwise, put it back on the queue and
       # invoke parse_error on parser.
       if c != ";"
-        @token_queue << {:type => :ParseError, :data => _("Numeric entity didn't end with ';'.")}
+        @token_queue << {:type => :ParseError, :data => "numeric-entity-without-semicolon"}
         @stream.unget(c)
       end
@@ -147,7 +147,7 @@ module HTML5
           # back in the queue
           char_stack = char_stack[0...char_stack.index(:EOF)]
           @stream.unget(char_stack)
-          @token_queue << {:type => :ParseError, :data => _("Numeric entity expected. Got end of file instead.")}
+          @token_queue << {:type => :ParseError, :data => "expected-numeric-entity-but-got-eof"}
         else
           if char_stack[1].downcase == "x" and HEX_DIGITS.include? char_stack[2]
             # Hexadecimal entity detected.
@@ -160,7 +160,7 @@ module HTML5
           else
             # No number entity detected.
             @stream.unget(char_stack)
-            @token_queue << {:type => :ParseError, :data => _("Numeric entity expected but none found.")}
+            @token_queue << {:type => :ParseError, :data => "expected-numeric-entity"}
           end
         end
       else
@@ -196,10 +196,10 @@ module HTML5
           # Check whether or not the last character returned can be
           # discarded or needs to be put back.
           if entityName[-1] != ?;
-            @token_queue << {:type => :ParseError, :data => _("Named entity didn't end with ';'.")}
+            @token_queue << {:type => :ParseError, :data => "named-entity-without-semicolon"}
           end
-          if char_stack[-1] != ";" and from_attribute and
+          if entityName[-1] != ";" and from_attribute and
              (ASCII_LETTERS.include?(char_stack[entityName.length]) or
               DIGITS.include?(char_stack[entityName.length]))
             @stream.unget(char_stack)
@@ -208,7 +208,7 @@ module HTML5
             @stream.unget(char_stack[entityName.length..-1])
           end
         else
-          @token_queue << {:type => :ParseError, :data => _("Named entity expected. Got none.")}
+          @token_queue << {:type => :ParseError, :data => "expected-named-entity"}
           @stream.unget(char_stack)
         end
       end
@@ -217,7 +217,7 @@ module HTML5
     # This method replaces the need for "entityInAttributeValueState".
     def process_entity_in_attribute
-      entity = consume_entity(true)
+      entity = consume_entity()
       if entity
         @current_token[:data][-1][1] += entity
       else
@@ -309,19 +309,18 @@ module HTML5
         elsif data == ">"
           # XXX In theory it could be something besides a tag name. But
           # do we really care?
-          @token_queue << {:type => :ParseError, :data =>       _("Expected tag name. Got '>' instead.")}
+          @token_queue << {:type => :ParseError, :data =>       "expected-tag-name-but-got-right-bracket"}
           @token_queue << {:type => :Characters, :data => "<>"}
           @state = :data_state
         elsif data == "?"
           # XXX In theory it could be something besides a tag name. But
           # do we really care?
-          @token_queue.push({:type => :ParseError, :data => _("Expected tag name. Got '?' instead (HTML doesn't " +
-            "support processing instructions).")})
+          @token_queue.push({:type => :ParseError, :data => "expected-tag-name-but-got-question-mark"})
           @stream.unget(data)
           @state = :bogus_comment_state
         else
           # XXX
-          @token_queue << {:type => :ParseError, :data => _("Expected tag name. Got something else instead")}
+          @token_queue << {:type => :ParseError, :data => "expected-tag-name"}
           @token_queue << {:type => :Characters, :data => "<"}
           @stream.unget(data)
           @state = :data_state
@@ -382,18 +381,18 @@ module HTML5
       data = @stream.char
       if data == :EOF
-        @token_queue << {:type => :ParseError, :data => _("Expected closing tag. Unexpected end of file.")}
+        @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-eof"}
         @token_queue << {:type => :Characters, :data => "</"}
         @state = :data_state
       elsif ASCII_LETTERS.include? data
         @current_token = {:type => :EndTag, :name => data, :data => []}
         @state = :tag_name_state
       elsif data == ">"
-        @token_queue << {:type => :ParseError, :data => _("Expected closing tag. Got '>' instead. Ignoring '</>'.")}
+        @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-right-bracket"}
         @state = :data_state
       else
         # XXX data can be _'_...
-        @token_queue << {:type => :ParseError, :data => _("Expected closing tag. Unexpected character '#{data}' found.")}
+        @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-char", :datavars => {:data => data}}
         @stream.unget(data)
         @state = :bogus_comment_state
       end
@@ -406,7 +405,7 @@ module HTML5
       if SPACE_CHARACTERS.include? data
         @state = :before_attribute_name_state
       elsif data == :EOF
-        @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in the tag name.")}
+        @token_queue << {:type => :ParseError, :data => "eof-in-tag-name"}
         emit_current_token
       elsif ASCII_LETTERS.include? data
         @current_token[:name] += data + @stream.chars_until(ASCII_LETTERS, true)
@@ -426,7 +425,7 @@ module HTML5
       if SPACE_CHARACTERS.include? data
         @stream.chars_until(SPACE_CHARACTERS, true)
       elsif data == :EOF
-        @token_queue << {:type => :ParseError, :data => _("Unexpected end of file. Expected attribute name instead.")}
+        @token_queue << {:type => :ParseError, :data => "expected-attribute-name-but-got-eof"}
         emit_current_token
       elsif ASCII_LETTERS.include? data
         @current_token[:data].push([data, ""])
@@ -449,7 +448,7 @@ module HTML5
       if data == "="
         @state = :before_attribute_value_state
       elsif data == :EOF
-        @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in attribute name.")}
+        @token_queue << {:type => :ParseError, :data => "eof-in-attribute-name"}
         @state = :data_state
         emitToken = true
       elsif ASCII_LETTERS.include? data
@@ -479,7 +478,7 @@ module HTML5
         end
         @current_token[:data][0...-1].each {|name,value|
           if @current_token[:data].last.first == name
-            @token_queue << {:type => :ParseError, :data =>_("Dropped duplicate attribute on tag.")}
+            @token_queue << {:type => :ParseError, :data => "duplicate-attribute"}
             break # don't report an error more than once
           end
         }
@@ -498,7 +497,7 @@ module HTML5
       elsif data == ">"
         emit_current_token
       elsif data == :EOF
-        @token_queue << {:type => :ParseError, :data => _("Unexpected end of file. Expected = or end of tag.")}
+        @token_queue << {:type => :ParseError, :data => "expected-end-of-tag-but-got-eof"}
         emit_current_token
       elsif ASCII_LETTERS.include? data
         @current_token[:data].push([data, ""])
@@ -527,7 +526,7 @@ module HTML5
       elsif data == ">"
         emit_current_token
       elsif data == :EOF
-        @token_queue << {:type => :ParseError, :data => _("Unexpected end of file. Expected attribute value.")}
+        @token_queue << {:type => :ParseError, :data => "expected-attribute-value-but-got-eof"}
         emit_current_token
       else
         @current_token[:data][-1][1] += data
@@ -543,7 +542,7 @@ module HTML5
       elsif data == "&"
         process_entity_in_attribute
       elsif data == :EOF
-        @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in attribute value (\").")}
+        @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-double-quote"}
         emit_current_token
       else
         @current_token[:data][-1][1] += data + @stream.chars_until(["\"", "&"])
@@ -558,7 +557,7 @@ module HTML5
       elsif data == "&"
         process_entity_in_attribute
       elsif data == :EOF
-        @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in attribute value (').")}
+        @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-single-quote"}
         emit_current_token
       else
         @current_token[:data][-1][1] += data +\
@@ -576,7 +575,7 @@ module HTML5
       elsif data == ">"
         emit_current_token
       elsif data == :EOF
-        @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in attribute value.")}
+        @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-no-quotes"}
         emit_current_token
       else
         @current_token[:data][-1][1] += data +  @stream.chars_until(["&", ">","<"] + SPACE_CHARACTERS)
@@ -609,7 +608,7 @@ module HTML5
           @current_token = {:type => :Doctype, :name => "", :publicId => nil, :systemId => nil, :correct => true}
           @state = :doctype_state
         else
-          @token_queue << {:type => :ParseError, :data => _("Expected '--' or 'DOCTYPE'. Not found.")}
+          @token_queue << {:type => :ParseError, :data => "expected-dashes-or-doctype"}
           @stream.unget(char_stack)
           @state = :bogus_comment_state
         end
@@ -622,11 +621,11 @@ module HTML5
         if data == "-"
             @state = :comment_start_dash_state
         elsif data == ">"
-            @token_queue << {:type => :ParseError, :data => _("Incorrect comment.")}
+            @token_queue << {:type => :ParseError, :data => "incorrect-comment"}
             @token_queue << @current_token
             @state = :data_state
         elsif data == :EOF
-            @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment.")}
+            @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
             @token_queue << @current_token
             @state = :data_state
         else
@@ -641,11 +640,11 @@ module HTML5
         if data == "-"
             @state = :comment_end_state
         elsif data == ">"
-            @token_queue << {:type => :ParseError, :data => _("Incorrect comment.")}
+            @token_queue << {:type => :ParseError, :data => "incorrect-comment"}
             @token_queue << @current_token
             @state = :data_state
         elsif data == :EOF
-            @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment.")}
+            @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
             @token_queue << @current_token
             @state = :data_state
         else
@@ -660,7 +659,7 @@ module HTML5
       if data == "-"
         @state = :comment_end_dash_state
       elsif data == :EOF
-        @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment.")}
+        @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
         @token_queue << @current_token
         @state = :data_state
       else
@@ -674,7 +673,7 @@ module HTML5
       if data == "-"
         @state = :comment_end_state
       elsif data == :EOF
-        @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment (-)")}
+        @token_queue << {:type => :ParseError, :data => "eof-in-comment-end-dash"}
         @token_queue << @current_token
         @state = :data_state
       else
@@ -694,15 +693,15 @@ module HTML5
         @token_queue << @current_token
         @state = :data_state
       elsif data == "-"
-        @token_queue << {:type => :ParseError, :data => _("Unexpected '-' after '--' found in comment.")}
+        @token_queue << {:type => :ParseError, :data => "unexpected-dash-after-double-dash-in-comment"}
         @current_token[:data] += data
       elsif data == :EOF
-        @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment (--).")}
+        @token_queue << {:type => :ParseError, :data => "eof-in-comment-double-dash"}
         @token_queue << @current_token
         @state = :data_state
       else
         # XXX
-        @token_queue << {:type => :ParseError, :data => _("Unexpected character in comment found.")}
+        @token_queue << {:type => :ParseError, :data => "unexpected-char-in-comment"}
         @current_token[:data] += "--" + data
         @state = :comment_state
       end
@@ -714,7 +713,7 @@ module HTML5
       if SPACE_CHARACTERS.include? data
         @state = :before_doctype_name_state
       else
-        @token_queue << {:type => :ParseError, :data => _("No space after literal string 'DOCTYPE'.")}
+        @token_queue << {:type => :ParseError, :data => "need-space-after-doctype"}
         @stream.unget(data)
         @state = :before_doctype_name_state
       end
@@ -725,12 +724,12 @@ module HTML5
       data = @stream.char
       if SPACE_CHARACTERS.include? data
       elsif data == ">"
-        @token_queue << {:type => :ParseError, :data => _("Unexpected > character. Expected DOCTYPE name.")}
+        @token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-right-bracket"}
         @current_token[:correct] = false
         @token_queue << @current_token
         @state = :data_state
       elsif data == :EOF
-        @token_queue << {:type => :ParseError, :data =>          _("Unexpected end of file. Expected DOCTYPE name.")}
+        @token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-eof"}
         @current_token[:correct] = false
         @token_queue << @current_token
         @state = :data_state
@@ -749,7 +748,7 @@ module HTML5
         @token_queue << @current_token
         @state = :data_state
       elsif data == :EOF
-        @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE name.")}
+        @token_queue << {:type => :ParseError, :data => "eof-in-doctype-name"}
         @current_token[:correct] = false
         @token_queue << @current_token
         @state = :data_state
@@ -769,7 +768,7 @@ module HTML5
       elsif data == :EOF
         @current_token[:correct] = false
         @stream.unget(data)
-        @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
+        @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
         @token_queue << @current_token
         @state = :data_state
       else
@@ -782,7 +781,7 @@ module HTML5
           @state = :before_doctype_system_identifier_state
         else
           @stream.unget(char_stack)
-          @token_queue << {:type => :ParseError, :data => _("Expected 'public' or 'system'. Got '#{token}'")}
+          @token_queue << {:type => :ParseError, :data => "expected-space-or-right-bracket-in-doctype", "datavars" => {"data" => data}}
           @state = :bogus_doctype_state
         end
       end
@@ -800,17 +799,17 @@ module HTML5
         @current_token[:publicId] = ""
         @state = :doctype_public_identifier_single_quoted_state
       elsif data == ">"
-        @token_queue << {:type => :ParseError, :data => _("Unexpected end of DOCTYPE.")}
+        @token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
         @current_token[:correct] = false
         @token_queue << @current_token
         @state = :data_state
       elsif data == :EOF
-        @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
+        @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
         @current_token[:correct] = false
         @token_queue << @current_token
         @state = :data_state
       else
-        @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
+        @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
         @state = :bogus_doctype_state
       end
@@ -822,7 +821,7 @@ module HTML5
       if data == "\""
         @state = :after_doctype_public_identifier_state
       elsif data == :EOF
-        @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
+        @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
         @current_token[:correct] = false
         @token_queue << @current_token
         @state = :data_state
@@ -837,7 +836,7 @@ module HTML5
       if data == "'"
         @state = :after_doctype_public_identifier_state
       elsif data == :EOF
-        @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
+        @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
         @current_token[:correct] = false
         @token_queue << @current_token
         @state = :data_state
@@ -860,12 +859,12 @@ module HTML5
         @token_queue << @current_token
         @state = :data_state
       elsif data == :EOF
-        @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
+        @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
         @current_token[:correct] = false
         @token_queue << @current_token
         @state = :data_state
       else
-        @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
+        @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
         @state = :bogus_doctype_state
       end
       return true
@@ -881,17 +880,17 @@ module HTML5
         @current_token[:systemId] = ""
         @state = :doctype_system_identifier_single_quoted_state
       elsif data == ">"
-        @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
+        @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
         @current_token[:correct] = false
         @token_queue << @current_token
         @state = :data_state
       elsif data == :EOF
-        @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
+        @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
         @current_token[:correct] = false
         @token_queue << @current_token
         @state = :data_state
       else
-        @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
+        @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
         @state = :bogus_doctype_state
       end
       return true
@@ -902,7 +901,7 @@ module HTML5
       if data == "\""
         @state = :after_doctype_system_identifier_state
       elsif data == :EOF
-        @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
+        @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
         @current_token[:correct] = false
         @token_queue << @current_token
         @state = :data_state
@@ -917,7 +916,7 @@ module HTML5
       if data == "'"
         @state = :after_doctype_system_identifier_state
       elsif data == :EOF
-        @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
+        @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
         @current_token[:correct] = false
         @token_queue << @current_token
         @state = :data_state
@@ -934,12 +933,12 @@ module HTML5
         @token_queue << @current_token
         @state = :data_state
       elsif data == :EOF
-        @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
+        @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
         @current_token[:correct] = false
         @token_queue << @current_token
         @state = :data_state
       else
-        @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
+        @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
         @state = :bogus_doctype_state
       end
       return true
@@ -954,7 +953,7 @@ module HTML5
       elsif data == :EOF
         # XXX EMIT
         @stream.unget(data)
-        @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in bogus doctype.")}
+        @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
         @current_token[:correct] = false
         @token_queue << @current_token
         @state = :data_state
@@ -962,7 +961,6 @@ module HTML5
       return true
     end
-    def _(string); string; end
   end
 end