html5 0.1.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. data/History.txt +9 -2
  2. data/Manifest.txt +61 -2
  3. data/README +41 -5
  4. data/Rakefile.rb +22 -6
  5. data/{parse.rb → bin/html5} +11 -11
  6. data/lib/core_ext/string.rb +17 -0
  7. data/lib/html5/constants.rb +228 -0
  8. data/lib/html5/filters/iso639codes.rb +752 -0
  9. data/lib/html5/filters/rfc2046.rb +30 -0
  10. data/lib/html5/filters/rfc3987.rb +89 -0
  11. data/lib/html5/filters/validator.rb +830 -0
  12. data/lib/html5/html5parser.rb +25 -25
  13. data/lib/html5/html5parser/after_body_phase.rb +3 -3
  14. data/lib/html5/html5parser/after_frameset_phase.rb +3 -4
  15. data/lib/html5/html5parser/after_head_phase.rb +6 -6
  16. data/lib/html5/html5parser/before_head_phase.rb +1 -1
  17. data/lib/html5/html5parser/in_body_phase.rb +54 -48
  18. data/lib/html5/html5parser/in_caption_phase.rb +7 -6
  19. data/lib/html5/html5parser/in_cell_phase.rb +3 -3
  20. data/lib/html5/html5parser/in_column_group_phase.rb +1 -1
  21. data/lib/html5/html5parser/in_frameset_phase.rb +5 -5
  22. data/lib/html5/html5parser/in_head_phase.rb +10 -10
  23. data/lib/html5/html5parser/in_row_phase.rb +4 -2
  24. data/lib/html5/html5parser/in_select_phase.rb +7 -6
  25. data/lib/html5/html5parser/in_table_body_phase.rb +8 -5
  26. data/lib/html5/html5parser/in_table_phase.rb +12 -7
  27. data/lib/html5/html5parser/initial_phase.rb +5 -6
  28. data/lib/html5/html5parser/phase.rb +5 -9
  29. data/lib/html5/html5parser/root_element_phase.rb +1 -2
  30. data/lib/html5/html5parser/trailing_end_phase.rb +3 -3
  31. data/lib/html5/inputstream.rb +25 -31
  32. data/lib/html5/liberalxmlparser.rb +2 -2
  33. data/lib/html5/sanitizer.rb +6 -6
  34. data/lib/html5/serializer/htmlserializer.rb +2 -3
  35. data/lib/html5/sniffer.rb +45 -0
  36. data/lib/html5/tokenizer.rb +57 -59
  37. data/lib/html5/treebuilders/rexml.rb +7 -6
  38. data/lib/html5/treebuilders/simpletree.rb +1 -1
  39. data/lib/html5/treewalkers/base.rb +8 -0
  40. data/lib/html5/version.rb +3 -0
  41. data/testdata/encoding/chardet/test_big5.txt +51 -0
  42. data/testdata/encoding/test-yahoo-jp.dat +10 -0
  43. data/testdata/encoding/tests1.dat +394 -0
  44. data/testdata/encoding/tests2.dat +81 -0
  45. data/testdata/sanitizer/tests1.dat +416 -0
  46. data/testdata/serializer/core.test +104 -0
  47. data/testdata/serializer/injectmeta.test +65 -0
  48. data/testdata/serializer/optionaltags.test +900 -0
  49. data/testdata/serializer/options.test +60 -0
  50. data/testdata/serializer/whitespace.test +51 -0
  51. data/testdata/sites/google-results.htm +1 -0
  52. data/testdata/sites/python-ref-import.htm +1 -0
  53. data/testdata/sites/web-apps-old.htm +1 -0
  54. data/testdata/sites/web-apps.htm +34275 -0
  55. data/testdata/sniffer/htmlOrFeed.json +43 -0
  56. data/testdata/tokenizer/contentModelFlags.test +48 -0
  57. data/testdata/tokenizer/entities.test +2339 -0
  58. data/testdata/tokenizer/escapeFlag.test +21 -0
  59. data/testdata/tokenizer/test1.test +172 -0
  60. data/testdata/tokenizer/test2.test +129 -0
  61. data/testdata/tokenizer/test3.test +367 -0
  62. data/testdata/tokenizer/test4.test +198 -0
  63. data/testdata/tree-construction/tests1.dat +1950 -0
  64. data/testdata/tree-construction/tests2.dat +773 -0
  65. data/testdata/tree-construction/tests3.dat +270 -0
  66. data/testdata/tree-construction/tests4.dat +60 -0
  67. data/testdata/tree-construction/tests5.dat +175 -0
  68. data/testdata/tree-construction/tests6.dat +196 -0
  69. data/testdata/validator/attributes.test +1035 -0
  70. data/testdata/validator/base-href-attribute.test +787 -0
  71. data/testdata/validator/base-target-attribute.test +35 -0
  72. data/testdata/validator/blockquote-cite-attribute.test +7 -0
  73. data/testdata/validator/classattribute.test +152 -0
  74. data/testdata/validator/contenteditableattribute.test +59 -0
  75. data/testdata/validator/contextmenuattribute.test +115 -0
  76. data/testdata/validator/dirattribute.test +59 -0
  77. data/testdata/validator/draggableattribute.test +63 -0
  78. data/testdata/validator/html-xmlns-attribute.test +23 -0
  79. data/testdata/validator/idattribute.test +115 -0
  80. data/testdata/validator/inputattributes.test +2795 -0
  81. data/testdata/validator/irrelevantattribute.test +63 -0
  82. data/testdata/validator/langattribute.test +5579 -0
  83. data/testdata/validator/li-value-attribute.test +7 -0
  84. data/testdata/validator/link-href-attribute.test +7 -0
  85. data/testdata/validator/link-hreflang-attribute.test +7 -0
  86. data/testdata/validator/link-rel-attribute.test +271 -0
  87. data/testdata/validator/ol-start-attribute.test +7 -0
  88. data/testdata/validator/starttags.test +375 -0
  89. data/testdata/validator/style-scoped-attribute.test +7 -0
  90. data/testdata/validator/tabindexattribute.test +79 -0
  91. data/tests/preamble.rb +7 -17
  92. data/tests/test_encoding.rb +1 -1
  93. data/tests/test_lxp.rb +16 -0
  94. data/tests/test_parser.rb +2 -2
  95. data/tests/test_sniffer.rb +27 -0
  96. data/tests/test_treewalkers.rb +41 -22
  97. data/tests/test_validator.rb +31 -0
  98. metadata +65 -6
@@ -33,10 +33,9 @@ module HTML5
33
33
 
34
34
  def insert_html_element
35
35
  element = @tree.createElement('html', {})
36
- @tree.open_elements.push(element)
36
+ @tree.open_elements << element
37
37
  @tree.document.appendChild(element)
38
38
  @parser.phase = @parser.phases[:beforeHead]
39
39
  end
40
-
41
40
  end
42
41
  end
@@ -15,19 +15,19 @@ module HTML5
15
15
  end
16
16
 
17
17
  def processCharacters(data)
18
- parse_error(_('Unexpected non-space characters. Expected end of file.'))
18
+ parse_error("expected-eof-but-got-char")
19
19
  @parser.phase = @parser.last_phase
20
20
  @parser.phase.processCharacters(data)
21
21
  end
22
22
 
23
23
  def processStartTag(name, attributes)
24
- parse_error(_('Unexpected start tag (#{name}). Expected end of file.'))
24
+ parse_error("expected-eof-but-got-start-tag", {"name" => name})
25
25
  @parser.phase = @parser.last_phase
26
26
  @parser.phase.processStartTag(name, attributes)
27
27
  end
28
28
 
29
29
  def processEndTag(name)
30
- parse_error(_('Unexpected end tag (#{name}). Expected end of file.'))
30
+ parse_error("expected-eof-but-got-end-tag", {"name" => name})
31
31
  @parser.phase = @parser.last_phase
32
32
  @parser.phase.processEndTag(name)
33
33
  end
@@ -60,15 +60,11 @@ module HTML5
60
60
  if @char_encoding == 'windows-1252'
61
61
  @win1252 = true
62
62
  elsif @char_encoding != 'utf-8'
63
+ require 'iconv'
63
64
  begin
64
- require 'iconv'
65
- begin
66
- @buffer << @raw_stream.read unless @raw_stream.eof?
67
- @buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first
68
- rescue
69
- @win1252 = true
70
- end
71
- rescue LoadError
65
+ @buffer << @raw_stream.read unless @raw_stream.eof?
66
+ @buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first
67
+ rescue
72
68
  @win1252 = true
73
69
  end
74
70
  end
@@ -88,12 +84,11 @@ module HTML5
88
84
  def open_stream(source)
89
85
  # Already an IO like object
90
86
  if source.respond_to?(:read)
91
- @stream = source
87
+ source
92
88
  else
93
89
  # Treat source as a string and wrap in StringIO
94
- @stream = StringIO.new(source)
90
+ StringIO.new(source)
95
91
  end
96
- return @stream
97
92
  end
98
93
 
99
94
  def detect_encoding
@@ -138,14 +133,12 @@ module HTML5
138
133
  encoding = @DEFAULT_ENCODING
139
134
  end
140
135
 
141
- #Substitute for equivalent encodings
142
- encoding_sub = {'iso-8859-1' => 'windows-1252'}
143
-
144
- if encoding_sub.has_key?(encoding.downcase)
145
- encoding = encoding_sub[encoding.downcase]
136
+ #Substitute for equivalent encoding
137
+ if 'iso-8859-1' == encoding.downcase
138
+ encoding = 'windows-1252'
146
139
  end
147
140
 
148
- return encoding
141
+ encoding
149
142
  end
150
143
 
151
144
  # Attempts to detect at BOM at the start of the stream. If
@@ -153,9 +146,9 @@ module HTML5
153
146
  # encoding otherwise return nil
154
147
  def detect_bom
155
148
  bom_dict = {
156
- "\xef\xbb\xbf" => 'utf-8',
157
- "\xff\xfe" => 'utf-16le',
158
- "\xfe\xff" => 'utf-16be',
149
+ "\xef\xbb\xbf" => 'utf-8',
150
+ "\xff\xfe" => 'utf-16le',
151
+ "\xfe\xff" => 'utf-16be',
159
152
  "\xff\xfe\x00\x00" => 'utf-32le',
160
153
  "\x00\x00\xfe\xff" => 'utf-32be'
161
154
  }
@@ -198,6 +191,7 @@ module HTML5
198
191
  end
199
192
  end
200
193
 
194
+ #TODO: huh?
201
195
  require 'delegate'
202
196
  @raw_stream = SimpleDelegator.new(@raw_stream)
203
197
 
@@ -250,7 +244,7 @@ module HTML5
250
244
  col -= 1
251
245
  end
252
246
  end
253
- return [line+1, col]
247
+ return [line + 1, col]
254
248
  end
255
249
 
256
250
  # Read one character from the stream or queue if available. Return
@@ -259,9 +253,9 @@ module HTML5
259
253
  unless @queue.empty?
260
254
  return @queue.shift
261
255
  else
262
- if @tell + 3 > @buffer.length and !@raw_stream.eof?
256
+ if @tell + 3 > @buffer.length && !@raw_stream.eof?
263
257
  # read next block
264
- @buffer = @buffer[@tell .. -1] + @raw_stream.read(@NUM_BYTES_BUFFER)
258
+ @buffer = @buffer[@tell..-1] + @raw_stream.read(@NUM_BYTES_BUFFER)
265
259
  @tell = 0
266
260
  end
267
261
 
@@ -269,7 +263,7 @@ module HTML5
269
263
  @tell += 1
270
264
 
271
265
  case c
272
- when 0x01 .. 0x7F
266
+ when 0x01..0x7F
273
267
  if c == 0x0D
274
268
  # normalize newlines
275
269
  @tell += 1 if @buffer[@tell] == 0x0A
@@ -287,7 +281,7 @@ module HTML5
287
281
 
288
282
  c.chr
289
283
 
290
- when 0x80 .. 0xBF
284
+ when 0x80..0xBF
291
285
  if !@win1252
292
286
  [0xFFFD].pack('U') # invalid utf-8
293
287
  elsif c <= 0x9f
@@ -296,10 +290,11 @@ module HTML5
296
290
  "\xC2" + c.chr # convert to utf-8
297
291
  end
298
292
 
299
- when 0xC0 .. 0xFF
300
- if instance_variable_defined?(:@win1252) && @win1252
301
- "\xC3" + (c-64).chr # convert to utf-8
302
- elsif @buffer[@tell-1 .. @tell+3] =~ /^
293
+ when 0xC0..0xFF
294
+ if instance_variables.include?("@win1252") && @win1252
295
+ "\xC3" + (c - 64).chr # convert to utf-8
296
+ # from http://www.w3.org/International/questions/qa-forms-utf-8.en.php
297
+ elsif @buffer[@tell - 1..@tell + 3] =~ /^
303
298
  ( [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
304
299
  | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
305
300
  | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
@@ -315,8 +310,7 @@ module HTML5
315
310
  end
316
311
 
317
312
  when 0x00
318
- @errors.push('null character found in input stream, ' +
319
- 'replaced with U+FFFD')
313
+ @errors.push("null-character")
320
314
  [0xFFFD].pack('U') # null characters are invalid
321
315
 
322
316
  else
@@ -50,7 +50,7 @@ module HTML5
50
50
 
51
51
  when :EndTag
52
52
  if token[:data]
53
- parse_error(_("End tag contains unexpected attributes."))
53
+ parse_error("attributes-in-end-tag")
54
54
  end
55
55
 
56
56
  when :Comment
@@ -81,7 +81,7 @@ module HTML5
81
81
  # open and close tags are emitted
82
82
  if token[:type] == :EndTag
83
83
  if VOID_ELEMENTS.include? token[:name]
84
- if @tree.open_elements[-1].name != token["name"]:
84
+ if @tree.open_elements[-1].name != token["name"]
85
85
  token[:type] = :EmptyTag
86
86
  token["data"] ||= {}
87
87
  end
@@ -110,13 +110,13 @@ module HTML5
110
110
  def sanitize_token(token)
111
111
  case token[:type]
112
112
  when :StartTag, :EndTag, :EmptyTag
113
- if ALLOWED_ELEMENTS.include?(token[:name])
113
+ if self.class.const_get("ALLOWED_ELEMENTS").include?(token[:name])
114
114
  if token.has_key? :data
115
115
  attrs = Hash[*token[:data].flatten]
116
- attrs.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) }
116
+ attrs.delete_if { |attr,v| !self.class.const_get("ALLOWED_ATTRIBUTES").include?(attr) }
117
117
  ATTR_VAL_IS_URI.each do |attr|
118
118
  val_unescaped = CGI.unescapeHTML(attrs[attr].to_s).gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
119
- if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
119
+ if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !self.class.const_get("ALLOWED_PROTOCOLS").include?(val_unescaped.split(':')[0])
120
120
  attrs.delete attr
121
121
  end
122
122
  end
@@ -160,14 +160,14 @@ module HTML5
160
160
  style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
161
161
  next if val.empty?
162
162
  prop.downcase!
163
- if ALLOWED_CSS_PROPERTIES.include?(prop)
163
+ if self.class.const_get("ALLOWED_CSS_PROPERTIES").include?(prop)
164
164
  clean << "#{prop}: #{val};"
165
165
  elsif %w[background border margin padding].include?(prop.split('-')[0])
166
166
  clean << "#{prop}: #{val};" unless val.split().any? do |keyword|
167
- !ALLOWED_CSS_KEYWORDS.include?(keyword) and
167
+ !self.class.const_get("ALLOWED_CSS_KEYWORDS").include?(keyword) and
168
168
  keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
169
169
  end
170
- elsif ALLOWED_SVG_PROPERTIES.include?(prop)
170
+ elsif self.class.const_get("ALLOWED_SVG_PROPERTIES").include?(prop)
171
171
  clean << "#{prop}: #{val};"
172
172
  end
173
173
  end
@@ -31,7 +31,7 @@ module HTML5
31
31
  @inject_meta_charset = true
32
32
 
33
33
  options.each do |name, value|
34
- next unless instance_variable_defined?("@#{name}")
34
+ next unless instance_variables.include?("@#{name}")
35
35
  @use_best_quote_char = false if name.to_s == 'quote_char'
36
36
  instance_variable_set("@#{name}", value)
37
37
  end
@@ -73,7 +73,7 @@ module HTML5
73
73
  elsif [:Characters, :SpaceCharacters].include? type
74
74
  if type == :SpaceCharacters or in_cdata
75
75
  if in_cdata and token[:data].include?("</")
76
- serialize_error(_("Unexpected </ in CDATA"))
76
+ serialize_error("Unexpected </ in CDATA")
77
77
  end
78
78
  result << token[:data]
79
79
  else
@@ -171,7 +171,6 @@ module HTML5
171
171
  end
172
172
  end
173
173
 
174
- def _(string); string; end
175
174
  end
176
175
 
177
176
  # Error in serialized tree
@@ -0,0 +1,45 @@
1
+ module HTML5
2
+ module Sniffer
3
+ # 4.7.4
4
+ def html_or_feed str
5
+ s = str[0, 512] # steps 1, 2
6
+ pos = 0
7
+
8
+ while pos < s.length
9
+ case s[pos]
10
+ when 0x09, 0x20, 0x0A, 0x0D # tab, space, LF, CR
11
+ pos += 1
12
+ when 0x3C # "<"
13
+ pos += 1
14
+ if s[pos..pos+2] == "!--" # [0x21, 0x2D, 0x2D]
15
+ pos += 3
16
+ until s[pos..pos+2] == "-->" or pos >= s.length
17
+ pos += 1
18
+ end
19
+ pos += 3
20
+ elsif s[pos] == 0x21 # "!"
21
+ pos += 1
22
+ until s[pos] == 0x3E or pos >= s.length # ">"
23
+ pos += 1
24
+ end
25
+ pos += 1
26
+ elsif s[pos] == 0x3F # "?"
27
+ until s[pos..pos+1] == "?>" or pos >= s.length # [0x3F, 0x3E]
28
+ pos += 1
29
+ end
30
+ pos += 2
31
+ elsif s[pos..pos+2] == "rss" # [0x72, 0x73, 0x73]
32
+ return "application/rss+xml"
33
+ elsif s[pos..pos+3] == "feed" # [0x66, 0x65, 0x65, 0x64]
34
+ return "application/atom+xml"
35
+ elsif s[pos..pos+6] == "rdf:RDF" # [0x72, 0x64, 0x66, 0x3A, 0x52, 0x44, 0x46]
36
+ raise NotImplementedError
37
+ end
38
+ else
39
+ break
40
+ end
41
+ end
42
+ "text/html"
43
+ end
44
+ end
45
+ end
@@ -69,7 +69,7 @@ module HTML5
69
69
  if @current_token[:type] == :StartTag and data == ">"
70
70
  @current_token[:type] = :EmptyTag
71
71
  else
72
- @token_queue << {:type => :ParseError, :data => _("Solidus (/) incorrectly placed in tag.")}
72
+ @token_queue << {:type => :ParseError, :data => "incorrectly-placed-solidus"}
73
73
  end
74
74
 
75
75
  # The character we just consumed need to be put back on the stack so it
@@ -107,12 +107,12 @@ module HTML5
107
107
  charAsInt = char_stack.join('').to_i(radix)
108
108
 
109
109
  if charAsInt == 13
110
- @token_queue << {:type => :ParseError, :data => _("Incorrect CR newline entity. Replaced with LF.")}
110
+ @token_queue << {:type => :ParseError, :data => "incorrect-cr-newline-entity"}
111
111
  charAsInt = 10
112
112
  elsif (128..159).include? charAsInt
113
113
  # If the integer is between 127 and 160 (so 128 and bigger and 159
114
114
  # and smaller) we need to do the "windows trick".
115
- @token_queue << {:type => :ParseError, :data => _("Entity used with illegal number (windows-1252 reference).")}
115
+ @token_queue << {:type => :ParseError, :data => "illegal-windows-1252-entity"}
116
116
 
117
117
  charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
118
118
  end
@@ -121,13 +121,13 @@ module HTML5
121
121
  char = [charAsInt].pack('U')
122
122
  else
123
123
  char = [0xFFFD].pack('U')
124
- @token_queue << {:type => :ParseError, :data => _("Numeric entity represents an illegal codepoint.")}
124
+ @token_queue << {:type => :ParseError, :data => "cant-convert-numeric-entity", :datavars => {"charAsInt" => charAsInt}}
125
125
  end
126
126
 
127
127
  # Discard the ; if present. Otherwise, put it back on the queue and
128
128
  # invoke parse_error on parser.
129
129
  if c != ";"
130
- @token_queue << {:type => :ParseError, :data => _("Numeric entity didn't end with ';'.")}
130
+ @token_queue << {:type => :ParseError, :data => "numeric-entity-without-semicolon"}
131
131
  @stream.unget(c)
132
132
  end
133
133
 
@@ -147,7 +147,7 @@ module HTML5
147
147
  # back in the queue
148
148
  char_stack = char_stack[0...char_stack.index(:EOF)]
149
149
  @stream.unget(char_stack)
150
- @token_queue << {:type => :ParseError, :data => _("Numeric entity expected. Got end of file instead.")}
150
+ @token_queue << {:type => :ParseError, :data => "expected-numeric-entity-but-got-eof"}
151
151
  else
152
152
  if char_stack[1].downcase == "x" and HEX_DIGITS.include? char_stack[2]
153
153
  # Hexadecimal entity detected.
@@ -160,7 +160,7 @@ module HTML5
160
160
  else
161
161
  # No number entity detected.
162
162
  @stream.unget(char_stack)
163
- @token_queue << {:type => :ParseError, :data => _("Numeric entity expected but none found.")}
163
+ @token_queue << {:type => :ParseError, :data => "expected-numeric-entity"}
164
164
  end
165
165
  end
166
166
  else
@@ -196,10 +196,10 @@ module HTML5
196
196
  # Check whether or not the last character returned can be
197
197
  # discarded or needs to be put back.
198
198
  if entityName[-1] != ?;
199
- @token_queue << {:type => :ParseError, :data => _("Named entity didn't end with ';'.")}
199
+ @token_queue << {:type => :ParseError, :data => "named-entity-without-semicolon"}
200
200
  end
201
201
 
202
- if char_stack[-1] != ";" and from_attribute and
202
+ if entityName[-1] != ";" and from_attribute and
203
203
  (ASCII_LETTERS.include?(char_stack[entityName.length]) or
204
204
  DIGITS.include?(char_stack[entityName.length]))
205
205
  @stream.unget(char_stack)
@@ -208,7 +208,7 @@ module HTML5
208
208
  @stream.unget(char_stack[entityName.length..-1])
209
209
  end
210
210
  else
211
- @token_queue << {:type => :ParseError, :data => _("Named entity expected. Got none.")}
211
+ @token_queue << {:type => :ParseError, :data => "expected-named-entity"}
212
212
  @stream.unget(char_stack)
213
213
  end
214
214
  end
@@ -217,7 +217,7 @@ module HTML5
217
217
 
218
218
  # This method replaces the need for "entityInAttributeValueState".
219
219
  def process_entity_in_attribute
220
- entity = consume_entity(true)
220
+ entity = consume_entity()
221
221
  if entity
222
222
  @current_token[:data][-1][1] += entity
223
223
  else
@@ -309,19 +309,18 @@ module HTML5
309
309
  elsif data == ">"
310
310
  # XXX In theory it could be something besides a tag name. But
311
311
  # do we really care?
312
- @token_queue << {:type => :ParseError, :data => _("Expected tag name. Got '>' instead.")}
312
+ @token_queue << {:type => :ParseError, :data => "expected-tag-name-but-got-right-bracket"}
313
313
  @token_queue << {:type => :Characters, :data => "<>"}
314
314
  @state = :data_state
315
315
  elsif data == "?"
316
316
  # XXX In theory it could be something besides a tag name. But
317
317
  # do we really care?
318
- @token_queue.push({:type => :ParseError, :data => _("Expected tag name. Got '?' instead (HTML doesn't " +
319
- "support processing instructions).")})
318
+ @token_queue.push({:type => :ParseError, :data => "expected-tag-name-but-got-question-mark"})
320
319
  @stream.unget(data)
321
320
  @state = :bogus_comment_state
322
321
  else
323
322
  # XXX
324
- @token_queue << {:type => :ParseError, :data => _("Expected tag name. Got something else instead")}
323
+ @token_queue << {:type => :ParseError, :data => "expected-tag-name"}
325
324
  @token_queue << {:type => :Characters, :data => "<"}
326
325
  @stream.unget(data)
327
326
  @state = :data_state
@@ -382,18 +381,18 @@ module HTML5
382
381
 
383
382
  data = @stream.char
384
383
  if data == :EOF
385
- @token_queue << {:type => :ParseError, :data => _("Expected closing tag. Unexpected end of file.")}
384
+ @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-eof"}
386
385
  @token_queue << {:type => :Characters, :data => "</"}
387
386
  @state = :data_state
388
387
  elsif ASCII_LETTERS.include? data
389
388
  @current_token = {:type => :EndTag, :name => data, :data => []}
390
389
  @state = :tag_name_state
391
390
  elsif data == ">"
392
- @token_queue << {:type => :ParseError, :data => _("Expected closing tag. Got '>' instead. Ignoring '</>'.")}
391
+ @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-right-bracket"}
393
392
  @state = :data_state
394
393
  else
395
394
  # XXX data can be _'_...
396
- @token_queue << {:type => :ParseError, :data => _("Expected closing tag. Unexpected character '#{data}' found.")}
395
+ @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-char", :datavars => {:data => data}}
397
396
  @stream.unget(data)
398
397
  @state = :bogus_comment_state
399
398
  end
@@ -406,7 +405,7 @@ module HTML5
406
405
  if SPACE_CHARACTERS.include? data
407
406
  @state = :before_attribute_name_state
408
407
  elsif data == :EOF
409
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in the tag name.")}
408
+ @token_queue << {:type => :ParseError, :data => "eof-in-tag-name"}
410
409
  emit_current_token
411
410
  elsif ASCII_LETTERS.include? data
412
411
  @current_token[:name] += data + @stream.chars_until(ASCII_LETTERS, true)
@@ -426,7 +425,7 @@ module HTML5
426
425
  if SPACE_CHARACTERS.include? data
427
426
  @stream.chars_until(SPACE_CHARACTERS, true)
428
427
  elsif data == :EOF
429
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file. Expected attribute name instead.")}
428
+ @token_queue << {:type => :ParseError, :data => "expected-attribute-name-but-got-eof"}
430
429
  emit_current_token
431
430
  elsif ASCII_LETTERS.include? data
432
431
  @current_token[:data].push([data, ""])
@@ -449,7 +448,7 @@ module HTML5
449
448
  if data == "="
450
449
  @state = :before_attribute_value_state
451
450
  elsif data == :EOF
452
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in attribute name.")}
451
+ @token_queue << {:type => :ParseError, :data => "eof-in-attribute-name"}
453
452
  @state = :data_state
454
453
  emitToken = true
455
454
  elsif ASCII_LETTERS.include? data
@@ -479,7 +478,7 @@ module HTML5
479
478
  end
480
479
  @current_token[:data][0...-1].each {|name,value|
481
480
  if @current_token[:data].last.first == name
482
- @token_queue << {:type => :ParseError, :data =>_("Dropped duplicate attribute on tag.")}
481
+ @token_queue << {:type => :ParseError, :data => "duplicate-attribute"}
483
482
  break # don't report an error more than once
484
483
  end
485
484
  }
@@ -498,7 +497,7 @@ module HTML5
498
497
  elsif data == ">"
499
498
  emit_current_token
500
499
  elsif data == :EOF
501
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file. Expected = or end of tag.")}
500
+ @token_queue << {:type => :ParseError, :data => "expected-end-of-tag-but-got-eof"}
502
501
  emit_current_token
503
502
  elsif ASCII_LETTERS.include? data
504
503
  @current_token[:data].push([data, ""])
@@ -527,7 +526,7 @@ module HTML5
527
526
  elsif data == ">"
528
527
  emit_current_token
529
528
  elsif data == :EOF
530
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file. Expected attribute value.")}
529
+ @token_queue << {:type => :ParseError, :data => "expected-attribute-value-but-got-eof"}
531
530
  emit_current_token
532
531
  else
533
532
  @current_token[:data][-1][1] += data
@@ -543,7 +542,7 @@ module HTML5
543
542
  elsif data == "&"
544
543
  process_entity_in_attribute
545
544
  elsif data == :EOF
546
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in attribute value (\").")}
545
+ @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-double-quote"}
547
546
  emit_current_token
548
547
  else
549
548
  @current_token[:data][-1][1] += data + @stream.chars_until(["\"", "&"])
@@ -558,7 +557,7 @@ module HTML5
558
557
  elsif data == "&"
559
558
  process_entity_in_attribute
560
559
  elsif data == :EOF
561
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in attribute value (').")}
560
+ @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-single-quote"}
562
561
  emit_current_token
563
562
  else
564
563
  @current_token[:data][-1][1] += data +\
@@ -576,7 +575,7 @@ module HTML5
576
575
  elsif data == ">"
577
576
  emit_current_token
578
577
  elsif data == :EOF
579
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in attribute value.")}
578
+ @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-no-quotes"}
580
579
  emit_current_token
581
580
  else
582
581
  @current_token[:data][-1][1] += data + @stream.chars_until(["&", ">","<"] + SPACE_CHARACTERS)
@@ -609,7 +608,7 @@ module HTML5
609
608
  @current_token = {:type => :Doctype, :name => "", :publicId => nil, :systemId => nil, :correct => true}
610
609
  @state = :doctype_state
611
610
  else
612
- @token_queue << {:type => :ParseError, :data => _("Expected '--' or 'DOCTYPE'. Not found.")}
611
+ @token_queue << {:type => :ParseError, :data => "expected-dashes-or-doctype"}
613
612
  @stream.unget(char_stack)
614
613
  @state = :bogus_comment_state
615
614
  end
@@ -622,11 +621,11 @@ module HTML5
622
621
  if data == "-"
623
622
  @state = :comment_start_dash_state
624
623
  elsif data == ">"
625
- @token_queue << {:type => :ParseError, :data => _("Incorrect comment.")}
624
+ @token_queue << {:type => :ParseError, :data => "incorrect-comment"}
626
625
  @token_queue << @current_token
627
626
  @state = :data_state
628
627
  elsif data == :EOF
629
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment.")}
628
+ @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
630
629
  @token_queue << @current_token
631
630
  @state = :data_state
632
631
  else
@@ -641,11 +640,11 @@ module HTML5
641
640
  if data == "-"
642
641
  @state = :comment_end_state
643
642
  elsif data == ">"
644
- @token_queue << {:type => :ParseError, :data => _("Incorrect comment.")}
643
+ @token_queue << {:type => :ParseError, :data => "incorrect-comment"}
645
644
  @token_queue << @current_token
646
645
  @state = :data_state
647
646
  elsif data == :EOF
648
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment.")}
647
+ @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
649
648
  @token_queue << @current_token
650
649
  @state = :data_state
651
650
  else
@@ -660,7 +659,7 @@ module HTML5
660
659
  if data == "-"
661
660
  @state = :comment_end_dash_state
662
661
  elsif data == :EOF
663
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment.")}
662
+ @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
664
663
  @token_queue << @current_token
665
664
  @state = :data_state
666
665
  else
@@ -674,7 +673,7 @@ module HTML5
674
673
  if data == "-"
675
674
  @state = :comment_end_state
676
675
  elsif data == :EOF
677
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment (-)")}
676
+ @token_queue << {:type => :ParseError, :data => "eof-in-comment-end-dash"}
678
677
  @token_queue << @current_token
679
678
  @state = :data_state
680
679
  else
@@ -694,15 +693,15 @@ module HTML5
694
693
  @token_queue << @current_token
695
694
  @state = :data_state
696
695
  elsif data == "-"
697
- @token_queue << {:type => :ParseError, :data => _("Unexpected '-' after '--' found in comment.")}
696
+ @token_queue << {:type => :ParseError, :data => "unexpected-dash-after-double-dash-in-comment"}
698
697
  @current_token[:data] += data
699
698
  elsif data == :EOF
700
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment (--).")}
699
+ @token_queue << {:type => :ParseError, :data => "eof-in-comment-double-dash"}
701
700
  @token_queue << @current_token
702
701
  @state = :data_state
703
702
  else
704
703
  # XXX
705
- @token_queue << {:type => :ParseError, :data => _("Unexpected character in comment found.")}
704
+ @token_queue << {:type => :ParseError, :data => "unexpected-char-in-comment"}
706
705
  @current_token[:data] += "--" + data
707
706
  @state = :comment_state
708
707
  end
@@ -714,7 +713,7 @@ module HTML5
714
713
  if SPACE_CHARACTERS.include? data
715
714
  @state = :before_doctype_name_state
716
715
  else
717
- @token_queue << {:type => :ParseError, :data => _("No space after literal string 'DOCTYPE'.")}
716
+ @token_queue << {:type => :ParseError, :data => "need-space-after-doctype"}
718
717
  @stream.unget(data)
719
718
  @state = :before_doctype_name_state
720
719
  end
@@ -725,12 +724,12 @@ module HTML5
725
724
  data = @stream.char
726
725
  if SPACE_CHARACTERS.include? data
727
726
  elsif data == ">"
728
- @token_queue << {:type => :ParseError, :data => _("Unexpected > character. Expected DOCTYPE name.")}
727
+ @token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-right-bracket"}
729
728
  @current_token[:correct] = false
730
729
  @token_queue << @current_token
731
730
  @state = :data_state
732
731
  elsif data == :EOF
733
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file. Expected DOCTYPE name.")}
732
+ @token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-eof"}
734
733
  @current_token[:correct] = false
735
734
  @token_queue << @current_token
736
735
  @state = :data_state
@@ -749,7 +748,7 @@ module HTML5
749
748
  @token_queue << @current_token
750
749
  @state = :data_state
751
750
  elsif data == :EOF
752
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE name.")}
751
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype-name"}
753
752
  @current_token[:correct] = false
754
753
  @token_queue << @current_token
755
754
  @state = :data_state
@@ -769,7 +768,7 @@ module HTML5
769
768
  elsif data == :EOF
770
769
  @current_token[:correct] = false
771
770
  @stream.unget(data)
772
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
771
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
773
772
  @token_queue << @current_token
774
773
  @state = :data_state
775
774
  else
@@ -782,7 +781,7 @@ module HTML5
782
781
  @state = :before_doctype_system_identifier_state
783
782
  else
784
783
  @stream.unget(char_stack)
785
- @token_queue << {:type => :ParseError, :data => _("Expected 'public' or 'system'. Got '#{token}'")}
784
+ @token_queue << {:type => :ParseError, :data => "expected-space-or-right-bracket-in-doctype", "datavars" => {"data" => data}}
786
785
  @state = :bogus_doctype_state
787
786
  end
788
787
  end
@@ -800,17 +799,17 @@ module HTML5
800
799
  @current_token[:publicId] = ""
801
800
  @state = :doctype_public_identifier_single_quoted_state
802
801
  elsif data == ">"
803
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of DOCTYPE.")}
802
+ @token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
804
803
  @current_token[:correct] = false
805
804
  @token_queue << @current_token
806
805
  @state = :data_state
807
806
  elsif data == :EOF
808
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
807
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
809
808
  @current_token[:correct] = false
810
809
  @token_queue << @current_token
811
810
  @state = :data_state
812
811
  else
813
- @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
812
+ @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
814
813
  @state = :bogus_doctype_state
815
814
  end
816
815
 
@@ -822,7 +821,7 @@ module HTML5
822
821
  if data == "\""
823
822
  @state = :after_doctype_public_identifier_state
824
823
  elsif data == :EOF
825
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
824
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
826
825
  @current_token[:correct] = false
827
826
  @token_queue << @current_token
828
827
  @state = :data_state
@@ -837,7 +836,7 @@ module HTML5
837
836
  if data == "'"
838
837
  @state = :after_doctype_public_identifier_state
839
838
  elsif data == :EOF
840
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
839
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
841
840
  @current_token[:correct] = false
842
841
  @token_queue << @current_token
843
842
  @state = :data_state
@@ -860,12 +859,12 @@ module HTML5
860
859
  @token_queue << @current_token
861
860
  @state = :data_state
862
861
  elsif data == :EOF
863
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
862
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
864
863
  @current_token[:correct] = false
865
864
  @token_queue << @current_token
866
865
  @state = :data_state
867
866
  else
868
- @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
867
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
869
868
  @state = :bogus_doctype_state
870
869
  end
871
870
  return true
@@ -881,17 +880,17 @@ module HTML5
881
880
  @current_token[:systemId] = ""
882
881
  @state = :doctype_system_identifier_single_quoted_state
883
882
  elsif data == ">"
884
- @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
883
+ @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
885
884
  @current_token[:correct] = false
886
885
  @token_queue << @current_token
887
886
  @state = :data_state
888
887
  elsif data == :EOF
889
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
888
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
890
889
  @current_token[:correct] = false
891
890
  @token_queue << @current_token
892
891
  @state = :data_state
893
892
  else
894
- @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
893
+ @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
895
894
  @state = :bogus_doctype_state
896
895
  end
897
896
  return true
@@ -902,7 +901,7 @@ module HTML5
902
901
  if data == "\""
903
902
  @state = :after_doctype_system_identifier_state
904
903
  elsif data == :EOF
905
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
904
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
906
905
  @current_token[:correct] = false
907
906
  @token_queue << @current_token
908
907
  @state = :data_state
@@ -917,7 +916,7 @@ module HTML5
917
916
  if data == "'"
918
917
  @state = :after_doctype_system_identifier_state
919
918
  elsif data == :EOF
920
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
919
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
921
920
  @current_token[:correct] = false
922
921
  @token_queue << @current_token
923
922
  @state = :data_state
@@ -934,12 +933,12 @@ module HTML5
934
933
  @token_queue << @current_token
935
934
  @state = :data_state
936
935
  elsif data == :EOF
937
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
936
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
938
937
  @current_token[:correct] = false
939
938
  @token_queue << @current_token
940
939
  @state = :data_state
941
940
  else
942
- @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
941
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
943
942
  @state = :bogus_doctype_state
944
943
  end
945
944
  return true
@@ -954,7 +953,7 @@ module HTML5
954
953
  elsif data == :EOF
955
954
  # XXX EMIT
956
955
  @stream.unget(data)
957
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in bogus doctype.")}
956
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
958
957
  @current_token[:correct] = false
959
958
  @token_queue << @current_token
960
959
  @state = :data_state
@@ -962,7 +961,6 @@ module HTML5
962
961
  return true
963
962
  end
964
963
 
965
- def _(string); string; end
966
964
  end
967
965
 
968
966
  end