html5 0.1.0 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (98) hide show
  1. data/History.txt +9 -2
  2. data/Manifest.txt +61 -2
  3. data/README +41 -5
  4. data/Rakefile.rb +22 -6
  5. data/{parse.rb → bin/html5} +11 -11
  6. data/lib/core_ext/string.rb +17 -0
  7. data/lib/html5/constants.rb +228 -0
  8. data/lib/html5/filters/iso639codes.rb +752 -0
  9. data/lib/html5/filters/rfc2046.rb +30 -0
  10. data/lib/html5/filters/rfc3987.rb +89 -0
  11. data/lib/html5/filters/validator.rb +830 -0
  12. data/lib/html5/html5parser.rb +25 -25
  13. data/lib/html5/html5parser/after_body_phase.rb +3 -3
  14. data/lib/html5/html5parser/after_frameset_phase.rb +3 -4
  15. data/lib/html5/html5parser/after_head_phase.rb +6 -6
  16. data/lib/html5/html5parser/before_head_phase.rb +1 -1
  17. data/lib/html5/html5parser/in_body_phase.rb +54 -48
  18. data/lib/html5/html5parser/in_caption_phase.rb +7 -6
  19. data/lib/html5/html5parser/in_cell_phase.rb +3 -3
  20. data/lib/html5/html5parser/in_column_group_phase.rb +1 -1
  21. data/lib/html5/html5parser/in_frameset_phase.rb +5 -5
  22. data/lib/html5/html5parser/in_head_phase.rb +10 -10
  23. data/lib/html5/html5parser/in_row_phase.rb +4 -2
  24. data/lib/html5/html5parser/in_select_phase.rb +7 -6
  25. data/lib/html5/html5parser/in_table_body_phase.rb +8 -5
  26. data/lib/html5/html5parser/in_table_phase.rb +12 -7
  27. data/lib/html5/html5parser/initial_phase.rb +5 -6
  28. data/lib/html5/html5parser/phase.rb +5 -9
  29. data/lib/html5/html5parser/root_element_phase.rb +1 -2
  30. data/lib/html5/html5parser/trailing_end_phase.rb +3 -3
  31. data/lib/html5/inputstream.rb +25 -31
  32. data/lib/html5/liberalxmlparser.rb +2 -2
  33. data/lib/html5/sanitizer.rb +6 -6
  34. data/lib/html5/serializer/htmlserializer.rb +2 -3
  35. data/lib/html5/sniffer.rb +45 -0
  36. data/lib/html5/tokenizer.rb +57 -59
  37. data/lib/html5/treebuilders/rexml.rb +7 -6
  38. data/lib/html5/treebuilders/simpletree.rb +1 -1
  39. data/lib/html5/treewalkers/base.rb +8 -0
  40. data/lib/html5/version.rb +3 -0
  41. data/testdata/encoding/chardet/test_big5.txt +51 -0
  42. data/testdata/encoding/test-yahoo-jp.dat +10 -0
  43. data/testdata/encoding/tests1.dat +394 -0
  44. data/testdata/encoding/tests2.dat +81 -0
  45. data/testdata/sanitizer/tests1.dat +416 -0
  46. data/testdata/serializer/core.test +104 -0
  47. data/testdata/serializer/injectmeta.test +65 -0
  48. data/testdata/serializer/optionaltags.test +900 -0
  49. data/testdata/serializer/options.test +60 -0
  50. data/testdata/serializer/whitespace.test +51 -0
  51. data/testdata/sites/google-results.htm +1 -0
  52. data/testdata/sites/python-ref-import.htm +1 -0
  53. data/testdata/sites/web-apps-old.htm +1 -0
  54. data/testdata/sites/web-apps.htm +34275 -0
  55. data/testdata/sniffer/htmlOrFeed.json +43 -0
  56. data/testdata/tokenizer/contentModelFlags.test +48 -0
  57. data/testdata/tokenizer/entities.test +2339 -0
  58. data/testdata/tokenizer/escapeFlag.test +21 -0
  59. data/testdata/tokenizer/test1.test +172 -0
  60. data/testdata/tokenizer/test2.test +129 -0
  61. data/testdata/tokenizer/test3.test +367 -0
  62. data/testdata/tokenizer/test4.test +198 -0
  63. data/testdata/tree-construction/tests1.dat +1950 -0
  64. data/testdata/tree-construction/tests2.dat +773 -0
  65. data/testdata/tree-construction/tests3.dat +270 -0
  66. data/testdata/tree-construction/tests4.dat +60 -0
  67. data/testdata/tree-construction/tests5.dat +175 -0
  68. data/testdata/tree-construction/tests6.dat +196 -0
  69. data/testdata/validator/attributes.test +1035 -0
  70. data/testdata/validator/base-href-attribute.test +787 -0
  71. data/testdata/validator/base-target-attribute.test +35 -0
  72. data/testdata/validator/blockquote-cite-attribute.test +7 -0
  73. data/testdata/validator/classattribute.test +152 -0
  74. data/testdata/validator/contenteditableattribute.test +59 -0
  75. data/testdata/validator/contextmenuattribute.test +115 -0
  76. data/testdata/validator/dirattribute.test +59 -0
  77. data/testdata/validator/draggableattribute.test +63 -0
  78. data/testdata/validator/html-xmlns-attribute.test +23 -0
  79. data/testdata/validator/idattribute.test +115 -0
  80. data/testdata/validator/inputattributes.test +2795 -0
  81. data/testdata/validator/irrelevantattribute.test +63 -0
  82. data/testdata/validator/langattribute.test +5579 -0
  83. data/testdata/validator/li-value-attribute.test +7 -0
  84. data/testdata/validator/link-href-attribute.test +7 -0
  85. data/testdata/validator/link-hreflang-attribute.test +7 -0
  86. data/testdata/validator/link-rel-attribute.test +271 -0
  87. data/testdata/validator/ol-start-attribute.test +7 -0
  88. data/testdata/validator/starttags.test +375 -0
  89. data/testdata/validator/style-scoped-attribute.test +7 -0
  90. data/testdata/validator/tabindexattribute.test +79 -0
  91. data/tests/preamble.rb +7 -17
  92. data/tests/test_encoding.rb +1 -1
  93. data/tests/test_lxp.rb +16 -0
  94. data/tests/test_parser.rb +2 -2
  95. data/tests/test_sniffer.rb +27 -0
  96. data/tests/test_treewalkers.rb +41 -22
  97. data/tests/test_validator.rb +31 -0
  98. metadata +65 -6
@@ -33,10 +33,9 @@ module HTML5
33
33
 
34
34
  def insert_html_element
35
35
  element = @tree.createElement('html', {})
36
- @tree.open_elements.push(element)
36
+ @tree.open_elements << element
37
37
  @tree.document.appendChild(element)
38
38
  @parser.phase = @parser.phases[:beforeHead]
39
39
  end
40
-
41
40
  end
42
41
  end
@@ -15,19 +15,19 @@ module HTML5
15
15
  end
16
16
 
17
17
  def processCharacters(data)
18
- parse_error(_('Unexpected non-space characters. Expected end of file.'))
18
+ parse_error("expected-eof-but-got-char")
19
19
  @parser.phase = @parser.last_phase
20
20
  @parser.phase.processCharacters(data)
21
21
  end
22
22
 
23
23
  def processStartTag(name, attributes)
24
- parse_error(_('Unexpected start tag (#{name}). Expected end of file.'))
24
+ parse_error("expected-eof-but-got-start-tag", {"name" => name})
25
25
  @parser.phase = @parser.last_phase
26
26
  @parser.phase.processStartTag(name, attributes)
27
27
  end
28
28
 
29
29
  def processEndTag(name)
30
- parse_error(_('Unexpected end tag (#{name}). Expected end of file.'))
30
+ parse_error("expected-eof-but-got-end-tag", {"name" => name})
31
31
  @parser.phase = @parser.last_phase
32
32
  @parser.phase.processEndTag(name)
33
33
  end
@@ -60,15 +60,11 @@ module HTML5
60
60
  if @char_encoding == 'windows-1252'
61
61
  @win1252 = true
62
62
  elsif @char_encoding != 'utf-8'
63
+ require 'iconv'
63
64
  begin
64
- require 'iconv'
65
- begin
66
- @buffer << @raw_stream.read unless @raw_stream.eof?
67
- @buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first
68
- rescue
69
- @win1252 = true
70
- end
71
- rescue LoadError
65
+ @buffer << @raw_stream.read unless @raw_stream.eof?
66
+ @buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first
67
+ rescue
72
68
  @win1252 = true
73
69
  end
74
70
  end
@@ -88,12 +84,11 @@ module HTML5
88
84
  def open_stream(source)
89
85
  # Already an IO like object
90
86
  if source.respond_to?(:read)
91
- @stream = source
87
+ source
92
88
  else
93
89
  # Treat source as a string and wrap in StringIO
94
- @stream = StringIO.new(source)
90
+ StringIO.new(source)
95
91
  end
96
- return @stream
97
92
  end
98
93
 
99
94
  def detect_encoding
@@ -138,14 +133,12 @@ module HTML5
138
133
  encoding = @DEFAULT_ENCODING
139
134
  end
140
135
 
141
- #Substitute for equivalent encodings
142
- encoding_sub = {'iso-8859-1' => 'windows-1252'}
143
-
144
- if encoding_sub.has_key?(encoding.downcase)
145
- encoding = encoding_sub[encoding.downcase]
136
+ #Substitute for equivalent encoding
137
+ if 'iso-8859-1' == encoding.downcase
138
+ encoding = 'windows-1252'
146
139
  end
147
140
 
148
- return encoding
141
+ encoding
149
142
  end
150
143
 
151
144
  # Attempts to detect a BOM at the start of the stream. If
@@ -153,9 +146,9 @@ module HTML5
153
146
  # encoding otherwise return nil
154
147
  def detect_bom
155
148
  bom_dict = {
156
- "\xef\xbb\xbf" => 'utf-8',
157
- "\xff\xfe" => 'utf-16le',
158
- "\xfe\xff" => 'utf-16be',
149
+ "\xef\xbb\xbf" => 'utf-8',
150
+ "\xff\xfe" => 'utf-16le',
151
+ "\xfe\xff" => 'utf-16be',
159
152
  "\xff\xfe\x00\x00" => 'utf-32le',
160
153
  "\x00\x00\xfe\xff" => 'utf-32be'
161
154
  }
@@ -198,6 +191,7 @@ module HTML5
198
191
  end
199
192
  end
200
193
 
194
+ #TODO: huh?
201
195
  require 'delegate'
202
196
  @raw_stream = SimpleDelegator.new(@raw_stream)
203
197
 
@@ -250,7 +244,7 @@ module HTML5
250
244
  col -= 1
251
245
  end
252
246
  end
253
- return [line+1, col]
247
+ return [line + 1, col]
254
248
  end
255
249
 
256
250
  # Read one character from the stream or queue if available. Return
@@ -259,9 +253,9 @@ module HTML5
259
253
  unless @queue.empty?
260
254
  return @queue.shift
261
255
  else
262
- if @tell + 3 > @buffer.length and !@raw_stream.eof?
256
+ if @tell + 3 > @buffer.length && !@raw_stream.eof?
263
257
  # read next block
264
- @buffer = @buffer[@tell .. -1] + @raw_stream.read(@NUM_BYTES_BUFFER)
258
+ @buffer = @buffer[@tell..-1] + @raw_stream.read(@NUM_BYTES_BUFFER)
265
259
  @tell = 0
266
260
  end
267
261
 
@@ -269,7 +263,7 @@ module HTML5
269
263
  @tell += 1
270
264
 
271
265
  case c
272
- when 0x01 .. 0x7F
266
+ when 0x01..0x7F
273
267
  if c == 0x0D
274
268
  # normalize newlines
275
269
  @tell += 1 if @buffer[@tell] == 0x0A
@@ -287,7 +281,7 @@ module HTML5
287
281
 
288
282
  c.chr
289
283
 
290
- when 0x80 .. 0xBF
284
+ when 0x80..0xBF
291
285
  if !@win1252
292
286
  [0xFFFD].pack('U') # invalid utf-8
293
287
  elsif c <= 0x9f
@@ -296,10 +290,11 @@ module HTML5
296
290
  "\xC2" + c.chr # convert to utf-8
297
291
  end
298
292
 
299
- when 0xC0 .. 0xFF
300
- if instance_variable_defined?(:@win1252) && @win1252
301
- "\xC3" + (c-64).chr # convert to utf-8
302
- elsif @buffer[@tell-1 .. @tell+3] =~ /^
293
+ when 0xC0..0xFF
294
+ if instance_variables.include?("@win1252") && @win1252
295
+ "\xC3" + (c - 64).chr # convert to utf-8
296
+ # from http://www.w3.org/International/questions/qa-forms-utf-8.en.php
297
+ elsif @buffer[@tell - 1..@tell + 3] =~ /^
303
298
  ( [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
304
299
  | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
305
300
  | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
@@ -315,8 +310,7 @@ module HTML5
315
310
  end
316
311
 
317
312
  when 0x00
318
- @errors.push('null character found in input stream, ' +
319
- 'replaced with U+FFFD')
313
+ @errors.push("null-character")
320
314
  [0xFFFD].pack('U') # null characters are invalid
321
315
 
322
316
  else
@@ -50,7 +50,7 @@ module HTML5
50
50
 
51
51
  when :EndTag
52
52
  if token[:data]
53
- parse_error(_("End tag contains unexpected attributes."))
53
+ parse_error("attributes-in-end-tag")
54
54
  end
55
55
 
56
56
  when :Comment
@@ -81,7 +81,7 @@ module HTML5
81
81
  # open and close tags are emitted
82
82
  if token[:type] == :EndTag
83
83
  if VOID_ELEMENTS.include? token[:name]
84
- if @tree.open_elements[-1].name != token["name"]:
84
+ if @tree.open_elements[-1].name != token["name"]
85
85
  token[:type] = :EmptyTag
86
86
  token["data"] ||= {}
87
87
  end
@@ -110,13 +110,13 @@ module HTML5
110
110
  def sanitize_token(token)
111
111
  case token[:type]
112
112
  when :StartTag, :EndTag, :EmptyTag
113
- if ALLOWED_ELEMENTS.include?(token[:name])
113
+ if self.class.const_get("ALLOWED_ELEMENTS").include?(token[:name])
114
114
  if token.has_key? :data
115
115
  attrs = Hash[*token[:data].flatten]
116
- attrs.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) }
116
+ attrs.delete_if { |attr,v| !self.class.const_get("ALLOWED_ATTRIBUTES").include?(attr) }
117
117
  ATTR_VAL_IS_URI.each do |attr|
118
118
  val_unescaped = CGI.unescapeHTML(attrs[attr].to_s).gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
119
- if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
119
+ if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !self.class.const_get("ALLOWED_PROTOCOLS").include?(val_unescaped.split(':')[0])
120
120
  attrs.delete attr
121
121
  end
122
122
  end
@@ -160,14 +160,14 @@ module HTML5
160
160
  style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
161
161
  next if val.empty?
162
162
  prop.downcase!
163
- if ALLOWED_CSS_PROPERTIES.include?(prop)
163
+ if self.class.const_get("ALLOWED_CSS_PROPERTIES").include?(prop)
164
164
  clean << "#{prop}: #{val};"
165
165
  elsif %w[background border margin padding].include?(prop.split('-')[0])
166
166
  clean << "#{prop}: #{val};" unless val.split().any? do |keyword|
167
- !ALLOWED_CSS_KEYWORDS.include?(keyword) and
167
+ !self.class.const_get("ALLOWED_CSS_KEYWORDS").include?(keyword) and
168
168
  keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
169
169
  end
170
- elsif ALLOWED_SVG_PROPERTIES.include?(prop)
170
+ elsif self.class.const_get("ALLOWED_SVG_PROPERTIES").include?(prop)
171
171
  clean << "#{prop}: #{val};"
172
172
  end
173
173
  end
@@ -31,7 +31,7 @@ module HTML5
31
31
  @inject_meta_charset = true
32
32
 
33
33
  options.each do |name, value|
34
- next unless instance_variable_defined?("@#{name}")
34
+ next unless instance_variables.include?("@#{name}")
35
35
  @use_best_quote_char = false if name.to_s == 'quote_char'
36
36
  instance_variable_set("@#{name}", value)
37
37
  end
@@ -73,7 +73,7 @@ module HTML5
73
73
  elsif [:Characters, :SpaceCharacters].include? type
74
74
  if type == :SpaceCharacters or in_cdata
75
75
  if in_cdata and token[:data].include?("</")
76
- serialize_error(_("Unexpected </ in CDATA"))
76
+ serialize_error("Unexpected </ in CDATA")
77
77
  end
78
78
  result << token[:data]
79
79
  else
@@ -171,7 +171,6 @@ module HTML5
171
171
  end
172
172
  end
173
173
 
174
- def _(string); string; end
175
174
  end
176
175
 
177
176
  # Error in serialized tree
@@ -0,0 +1,45 @@
1
+ module HTML5
2
+ module Sniffer
3
+ # 4.7.4
4
+ def html_or_feed str
5
+ s = str[0, 512] # steps 1, 2
6
+ pos = 0
7
+
8
+ while pos < s.length
9
+ case s[pos]
10
+ when 0x09, 0x20, 0x0A, 0x0D # tab, space, LF, CR
11
+ pos += 1
12
+ when 0x3C # "<"
13
+ pos += 1
14
+ if s[pos..pos+2] == "!--" # [0x21, 0x2D, 0x2D]
15
+ pos += 3
16
+ until s[pos..pos+2] == "-->" or pos >= s.length
17
+ pos += 1
18
+ end
19
+ pos += 3
20
+ elsif s[pos] == 0x21 # "!"
21
+ pos += 1
22
+ until s[pos] == 0x3E or pos >= s.length # ">"
23
+ pos += 1
24
+ end
25
+ pos += 1
26
+ elsif s[pos] == 0x3F # "?"
27
+ until s[pos..pos+1] == "?>" or pos >= s.length # [0x3F, 0x3E]
28
+ pos += 1
29
+ end
30
+ pos += 2
31
+ elsif s[pos..pos+2] == "rss" # [0x72, 0x73, 0x73]
32
+ return "application/rss+xml"
33
+ elsif s[pos..pos+3] == "feed" # [0x66, 0x65, 0x65, 0x64]
34
+ return "application/atom+xml"
35
+ elsif s[pos..pos+6] == "rdf:RDF" # [0x72, 0x64, 0x66, 0x3A, 0x52, 0x44, 0x46]
36
+ raise NotImplementedError
37
+ end
38
+ else
39
+ break
40
+ end
41
+ end
42
+ "text/html"
43
+ end
44
+ end
45
+ end
@@ -69,7 +69,7 @@ module HTML5
69
69
  if @current_token[:type] == :StartTag and data == ">"
70
70
  @current_token[:type] = :EmptyTag
71
71
  else
72
- @token_queue << {:type => :ParseError, :data => _("Solidus (/) incorrectly placed in tag.")}
72
+ @token_queue << {:type => :ParseError, :data => "incorrectly-placed-solidus"}
73
73
  end
74
74
 
75
75
  # The character we just consumed needs to be put back on the stack so it
@@ -107,12 +107,12 @@ module HTML5
107
107
  charAsInt = char_stack.join('').to_i(radix)
108
108
 
109
109
  if charAsInt == 13
110
- @token_queue << {:type => :ParseError, :data => _("Incorrect CR newline entity. Replaced with LF.")}
110
+ @token_queue << {:type => :ParseError, :data => "incorrect-cr-newline-entity"}
111
111
  charAsInt = 10
112
112
  elsif (128..159).include? charAsInt
113
113
  # If the integer is between 127 and 160 (so 128 and bigger and 159
114
114
  # and smaller) we need to do the "windows trick".
115
- @token_queue << {:type => :ParseError, :data => _("Entity used with illegal number (windows-1252 reference).")}
115
+ @token_queue << {:type => :ParseError, :data => "illegal-windows-1252-entity"}
116
116
 
117
117
  charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
118
118
  end
@@ -121,13 +121,13 @@ module HTML5
121
121
  char = [charAsInt].pack('U')
122
122
  else
123
123
  char = [0xFFFD].pack('U')
124
- @token_queue << {:type => :ParseError, :data => _("Numeric entity represents an illegal codepoint.")}
124
+ @token_queue << {:type => :ParseError, :data => "cant-convert-numeric-entity", :datavars => {"charAsInt" => charAsInt}}
125
125
  end
126
126
 
127
127
  # Discard the ; if present. Otherwise, put it back on the queue and
128
128
  # invoke parse_error on parser.
129
129
  if c != ";"
130
- @token_queue << {:type => :ParseError, :data => _("Numeric entity didn't end with ';'.")}
130
+ @token_queue << {:type => :ParseError, :data => "numeric-entity-without-semicolon"}
131
131
  @stream.unget(c)
132
132
  end
133
133
 
@@ -147,7 +147,7 @@ module HTML5
147
147
  # back in the queue
148
148
  char_stack = char_stack[0...char_stack.index(:EOF)]
149
149
  @stream.unget(char_stack)
150
- @token_queue << {:type => :ParseError, :data => _("Numeric entity expected. Got end of file instead.")}
150
+ @token_queue << {:type => :ParseError, :data => "expected-numeric-entity-but-got-eof"}
151
151
  else
152
152
  if char_stack[1].downcase == "x" and HEX_DIGITS.include? char_stack[2]
153
153
  # Hexadecimal entity detected.
@@ -160,7 +160,7 @@ module HTML5
160
160
  else
161
161
  # No number entity detected.
162
162
  @stream.unget(char_stack)
163
- @token_queue << {:type => :ParseError, :data => _("Numeric entity expected but none found.")}
163
+ @token_queue << {:type => :ParseError, :data => "expected-numeric-entity"}
164
164
  end
165
165
  end
166
166
  else
@@ -196,10 +196,10 @@ module HTML5
196
196
  # Check whether or not the last character returned can be
197
197
  # discarded or needs to be put back.
198
198
  if entityName[-1] != ?;
199
- @token_queue << {:type => :ParseError, :data => _("Named entity didn't end with ';'.")}
199
+ @token_queue << {:type => :ParseError, :data => "named-entity-without-semicolon"}
200
200
  end
201
201
 
202
- if char_stack[-1] != ";" and from_attribute and
202
+ if entityName[-1] != ";" and from_attribute and
203
203
  (ASCII_LETTERS.include?(char_stack[entityName.length]) or
204
204
  DIGITS.include?(char_stack[entityName.length]))
205
205
  @stream.unget(char_stack)
@@ -208,7 +208,7 @@ module HTML5
208
208
  @stream.unget(char_stack[entityName.length..-1])
209
209
  end
210
210
  else
211
- @token_queue << {:type => :ParseError, :data => _("Named entity expected. Got none.")}
211
+ @token_queue << {:type => :ParseError, :data => "expected-named-entity"}
212
212
  @stream.unget(char_stack)
213
213
  end
214
214
  end
@@ -217,7 +217,7 @@ module HTML5
217
217
 
218
218
  # This method replaces the need for "entityInAttributeValueState".
219
219
  def process_entity_in_attribute
220
- entity = consume_entity(true)
220
+ entity = consume_entity()
221
221
  if entity
222
222
  @current_token[:data][-1][1] += entity
223
223
  else
@@ -309,19 +309,18 @@ module HTML5
309
309
  elsif data == ">"
310
310
  # XXX In theory it could be something besides a tag name. But
311
311
  # do we really care?
312
- @token_queue << {:type => :ParseError, :data => _("Expected tag name. Got '>' instead.")}
312
+ @token_queue << {:type => :ParseError, :data => "expected-tag-name-but-got-right-bracket"}
313
313
  @token_queue << {:type => :Characters, :data => "<>"}
314
314
  @state = :data_state
315
315
  elsif data == "?"
316
316
  # XXX In theory it could be something besides a tag name. But
317
317
  # do we really care?
318
- @token_queue.push({:type => :ParseError, :data => _("Expected tag name. Got '?' instead (HTML doesn't " +
319
- "support processing instructions).")})
318
+ @token_queue.push({:type => :ParseError, :data => "expected-tag-name-but-got-question-mark"})
320
319
  @stream.unget(data)
321
320
  @state = :bogus_comment_state
322
321
  else
323
322
  # XXX
324
- @token_queue << {:type => :ParseError, :data => _("Expected tag name. Got something else instead")}
323
+ @token_queue << {:type => :ParseError, :data => "expected-tag-name"}
325
324
  @token_queue << {:type => :Characters, :data => "<"}
326
325
  @stream.unget(data)
327
326
  @state = :data_state
@@ -382,18 +381,18 @@ module HTML5
382
381
 
383
382
  data = @stream.char
384
383
  if data == :EOF
385
- @token_queue << {:type => :ParseError, :data => _("Expected closing tag. Unexpected end of file.")}
384
+ @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-eof"}
386
385
  @token_queue << {:type => :Characters, :data => "</"}
387
386
  @state = :data_state
388
387
  elsif ASCII_LETTERS.include? data
389
388
  @current_token = {:type => :EndTag, :name => data, :data => []}
390
389
  @state = :tag_name_state
391
390
  elsif data == ">"
392
- @token_queue << {:type => :ParseError, :data => _("Expected closing tag. Got '>' instead. Ignoring '</>'.")}
391
+ @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-right-bracket"}
393
392
  @state = :data_state
394
393
  else
395
394
  # XXX data can be _'_...
396
- @token_queue << {:type => :ParseError, :data => _("Expected closing tag. Unexpected character '#{data}' found.")}
395
+ @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-char", :datavars => {:data => data}}
397
396
  @stream.unget(data)
398
397
  @state = :bogus_comment_state
399
398
  end
@@ -406,7 +405,7 @@ module HTML5
406
405
  if SPACE_CHARACTERS.include? data
407
406
  @state = :before_attribute_name_state
408
407
  elsif data == :EOF
409
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in the tag name.")}
408
+ @token_queue << {:type => :ParseError, :data => "eof-in-tag-name"}
410
409
  emit_current_token
411
410
  elsif ASCII_LETTERS.include? data
412
411
  @current_token[:name] += data + @stream.chars_until(ASCII_LETTERS, true)
@@ -426,7 +425,7 @@ module HTML5
426
425
  if SPACE_CHARACTERS.include? data
427
426
  @stream.chars_until(SPACE_CHARACTERS, true)
428
427
  elsif data == :EOF
429
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file. Expected attribute name instead.")}
428
+ @token_queue << {:type => :ParseError, :data => "expected-attribute-name-but-got-eof"}
430
429
  emit_current_token
431
430
  elsif ASCII_LETTERS.include? data
432
431
  @current_token[:data].push([data, ""])
@@ -449,7 +448,7 @@ module HTML5
449
448
  if data == "="
450
449
  @state = :before_attribute_value_state
451
450
  elsif data == :EOF
452
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in attribute name.")}
451
+ @token_queue << {:type => :ParseError, :data => "eof-in-attribute-name"}
453
452
  @state = :data_state
454
453
  emitToken = true
455
454
  elsif ASCII_LETTERS.include? data
@@ -479,7 +478,7 @@ module HTML5
479
478
  end
480
479
  @current_token[:data][0...-1].each {|name,value|
481
480
  if @current_token[:data].last.first == name
482
- @token_queue << {:type => :ParseError, :data =>_("Dropped duplicate attribute on tag.")}
481
+ @token_queue << {:type => :ParseError, :data => "duplicate-attribute"}
483
482
  break # don't report an error more than once
484
483
  end
485
484
  }
@@ -498,7 +497,7 @@ module HTML5
498
497
  elsif data == ">"
499
498
  emit_current_token
500
499
  elsif data == :EOF
501
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file. Expected = or end of tag.")}
500
+ @token_queue << {:type => :ParseError, :data => "expected-end-of-tag-but-got-eof"}
502
501
  emit_current_token
503
502
  elsif ASCII_LETTERS.include? data
504
503
  @current_token[:data].push([data, ""])
@@ -527,7 +526,7 @@ module HTML5
527
526
  elsif data == ">"
528
527
  emit_current_token
529
528
  elsif data == :EOF
530
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file. Expected attribute value.")}
529
+ @token_queue << {:type => :ParseError, :data => "expected-attribute-value-but-got-eof"}
531
530
  emit_current_token
532
531
  else
533
532
  @current_token[:data][-1][1] += data
@@ -543,7 +542,7 @@ module HTML5
543
542
  elsif data == "&"
544
543
  process_entity_in_attribute
545
544
  elsif data == :EOF
546
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in attribute value (\").")}
545
+ @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-double-quote"}
547
546
  emit_current_token
548
547
  else
549
548
  @current_token[:data][-1][1] += data + @stream.chars_until(["\"", "&"])
@@ -558,7 +557,7 @@ module HTML5
558
557
  elsif data == "&"
559
558
  process_entity_in_attribute
560
559
  elsif data == :EOF
561
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in attribute value (').")}
560
+ @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-single-quote"}
562
561
  emit_current_token
563
562
  else
564
563
  @current_token[:data][-1][1] += data +\
@@ -576,7 +575,7 @@ module HTML5
576
575
  elsif data == ">"
577
576
  emit_current_token
578
577
  elsif data == :EOF
579
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in attribute value.")}
578
+ @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-no-quotes"}
580
579
  emit_current_token
581
580
  else
582
581
  @current_token[:data][-1][1] += data + @stream.chars_until(["&", ">","<"] + SPACE_CHARACTERS)
@@ -609,7 +608,7 @@ module HTML5
609
608
  @current_token = {:type => :Doctype, :name => "", :publicId => nil, :systemId => nil, :correct => true}
610
609
  @state = :doctype_state
611
610
  else
612
- @token_queue << {:type => :ParseError, :data => _("Expected '--' or 'DOCTYPE'. Not found.")}
611
+ @token_queue << {:type => :ParseError, :data => "expected-dashes-or-doctype"}
613
612
  @stream.unget(char_stack)
614
613
  @state = :bogus_comment_state
615
614
  end
@@ -622,11 +621,11 @@ module HTML5
622
621
  if data == "-"
623
622
  @state = :comment_start_dash_state
624
623
  elsif data == ">"
625
- @token_queue << {:type => :ParseError, :data => _("Incorrect comment.")}
624
+ @token_queue << {:type => :ParseError, :data => "incorrect-comment"}
626
625
  @token_queue << @current_token
627
626
  @state = :data_state
628
627
  elsif data == :EOF
629
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment.")}
628
+ @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
630
629
  @token_queue << @current_token
631
630
  @state = :data_state
632
631
  else
@@ -641,11 +640,11 @@ module HTML5
641
640
  if data == "-"
642
641
  @state = :comment_end_state
643
642
  elsif data == ">"
644
- @token_queue << {:type => :ParseError, :data => _("Incorrect comment.")}
643
+ @token_queue << {:type => :ParseError, :data => "incorrect-comment"}
645
644
  @token_queue << @current_token
646
645
  @state = :data_state
647
646
  elsif data == :EOF
648
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment.")}
647
+ @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
649
648
  @token_queue << @current_token
650
649
  @state = :data_state
651
650
  else
@@ -660,7 +659,7 @@ module HTML5
660
659
  if data == "-"
661
660
  @state = :comment_end_dash_state
662
661
  elsif data == :EOF
663
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment.")}
662
+ @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
664
663
  @token_queue << @current_token
665
664
  @state = :data_state
666
665
  else
@@ -674,7 +673,7 @@ module HTML5
674
673
  if data == "-"
675
674
  @state = :comment_end_state
676
675
  elsif data == :EOF
677
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment (-)")}
676
+ @token_queue << {:type => :ParseError, :data => "eof-in-comment-end-dash"}
678
677
  @token_queue << @current_token
679
678
  @state = :data_state
680
679
  else
@@ -694,15 +693,15 @@ module HTML5
694
693
  @token_queue << @current_token
695
694
  @state = :data_state
696
695
  elsif data == "-"
697
- @token_queue << {:type => :ParseError, :data => _("Unexpected '-' after '--' found in comment.")}
696
+ @token_queue << {:type => :ParseError, :data => "unexpected-dash-after-double-dash-in-comment"}
698
697
  @current_token[:data] += data
699
698
  elsif data == :EOF
700
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in comment (--).")}
699
+ @token_queue << {:type => :ParseError, :data => "eof-in-comment-double-dash"}
701
700
  @token_queue << @current_token
702
701
  @state = :data_state
703
702
  else
704
703
  # XXX
705
- @token_queue << {:type => :ParseError, :data => _("Unexpected character in comment found.")}
704
+ @token_queue << {:type => :ParseError, :data => "unexpected-char-in-comment"}
706
705
  @current_token[:data] += "--" + data
707
706
  @state = :comment_state
708
707
  end
@@ -714,7 +713,7 @@ module HTML5
714
713
  if SPACE_CHARACTERS.include? data
715
714
  @state = :before_doctype_name_state
716
715
  else
717
- @token_queue << {:type => :ParseError, :data => _("No space after literal string 'DOCTYPE'.")}
716
+ @token_queue << {:type => :ParseError, :data => "need-space-after-doctype"}
718
717
  @stream.unget(data)
719
718
  @state = :before_doctype_name_state
720
719
  end
@@ -725,12 +724,12 @@ module HTML5
725
724
  data = @stream.char
726
725
  if SPACE_CHARACTERS.include? data
727
726
  elsif data == ">"
728
- @token_queue << {:type => :ParseError, :data => _("Unexpected > character. Expected DOCTYPE name.")}
727
+ @token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-right-bracket"}
729
728
  @current_token[:correct] = false
730
729
  @token_queue << @current_token
731
730
  @state = :data_state
732
731
  elsif data == :EOF
733
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file. Expected DOCTYPE name.")}
732
+ @token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-eof"}
734
733
  @current_token[:correct] = false
735
734
  @token_queue << @current_token
736
735
  @state = :data_state
@@ -749,7 +748,7 @@ module HTML5
749
748
  @token_queue << @current_token
750
749
  @state = :data_state
751
750
  elsif data == :EOF
752
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE name.")}
751
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype-name"}
753
752
  @current_token[:correct] = false
754
753
  @token_queue << @current_token
755
754
  @state = :data_state
@@ -769,7 +768,7 @@ module HTML5
769
768
  elsif data == :EOF
770
769
  @current_token[:correct] = false
771
770
  @stream.unget(data)
772
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
771
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
773
772
  @token_queue << @current_token
774
773
  @state = :data_state
775
774
  else
@@ -782,7 +781,7 @@ module HTML5
782
781
  @state = :before_doctype_system_identifier_state
783
782
  else
784
783
  @stream.unget(char_stack)
785
- @token_queue << {:type => :ParseError, :data => _("Expected 'public' or 'system'. Got '#{token}'")}
784
+ @token_queue << {:type => :ParseError, :data => "expected-space-or-right-bracket-in-doctype", "datavars" => {"data" => data}}
786
785
  @state = :bogus_doctype_state
787
786
  end
788
787
  end
@@ -800,17 +799,17 @@ module HTML5
800
799
  @current_token[:publicId] = ""
801
800
  @state = :doctype_public_identifier_single_quoted_state
802
801
  elsif data == ">"
803
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of DOCTYPE.")}
802
+ @token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
804
803
  @current_token[:correct] = false
805
804
  @token_queue << @current_token
806
805
  @state = :data_state
807
806
  elsif data == :EOF
808
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
807
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
809
808
  @current_token[:correct] = false
810
809
  @token_queue << @current_token
811
810
  @state = :data_state
812
811
  else
813
- @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
812
+ @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
814
813
  @state = :bogus_doctype_state
815
814
  end
816
815
 
@@ -822,7 +821,7 @@ module HTML5
822
821
  if data == "\""
823
822
  @state = :after_doctype_public_identifier_state
824
823
  elsif data == :EOF
825
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
824
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
826
825
  @current_token[:correct] = false
827
826
  @token_queue << @current_token
828
827
  @state = :data_state
@@ -837,7 +836,7 @@ module HTML5
837
836
  if data == "'"
838
837
  @state = :after_doctype_public_identifier_state
839
838
  elsif data == :EOF
840
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
839
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
841
840
  @current_token[:correct] = false
842
841
  @token_queue << @current_token
843
842
  @state = :data_state
@@ -860,12 +859,12 @@ module HTML5
860
859
  @token_queue << @current_token
861
860
  @state = :data_state
862
861
  elsif data == :EOF
863
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
862
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
864
863
  @current_token[:correct] = false
865
864
  @token_queue << @current_token
866
865
  @state = :data_state
867
866
  else
868
- @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
867
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
869
868
  @state = :bogus_doctype_state
870
869
  end
871
870
  return true
@@ -881,17 +880,17 @@ module HTML5
881
880
  @current_token[:systemId] = ""
882
881
  @state = :doctype_system_identifier_single_quoted_state
883
882
  elsif data == ">"
884
- @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
883
+ @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
885
884
  @current_token[:correct] = false
886
885
  @token_queue << @current_token
887
886
  @state = :data_state
888
887
  elsif data == :EOF
889
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
888
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
890
889
  @current_token[:correct] = false
891
890
  @token_queue << @current_token
892
891
  @state = :data_state
893
892
  else
894
- @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
893
+ @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
895
894
  @state = :bogus_doctype_state
896
895
  end
897
896
  return true
@@ -902,7 +901,7 @@ module HTML5
902
901
  if data == "\""
903
902
  @state = :after_doctype_system_identifier_state
904
903
  elsif data == :EOF
905
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
904
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
906
905
  @current_token[:correct] = false
907
906
  @token_queue << @current_token
908
907
  @state = :data_state
@@ -917,7 +916,7 @@ module HTML5
917
916
  if data == "'"
918
917
  @state = :after_doctype_system_identifier_state
919
918
  elsif data == :EOF
920
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
919
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
921
920
  @current_token[:correct] = false
922
921
  @token_queue << @current_token
923
922
  @state = :data_state
@@ -934,12 +933,12 @@ module HTML5
934
933
  @token_queue << @current_token
935
934
  @state = :data_state
936
935
  elsif data == :EOF
937
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in DOCTYPE.")}
936
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
938
937
  @current_token[:correct] = false
939
938
  @token_queue << @current_token
940
939
  @state = :data_state
941
940
  else
942
- @token_queue << {:type => :ParseError, :data => _("Unexpected character in DOCTYPE.")}
941
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
943
942
  @state = :bogus_doctype_state
944
943
  end
945
944
  return true
@@ -954,7 +953,7 @@ module HTML5
954
953
  elsif data == :EOF
955
954
  # XXX EMIT
956
955
  @stream.unget(data)
957
- @token_queue << {:type => :ParseError, :data => _("Unexpected end of file in bogus doctype.")}
956
+ @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
958
957
  @current_token[:correct] = false
959
958
  @token_queue << @current_token
960
959
  @state = :data_state
@@ -962,7 +961,6 @@ module HTML5
962
961
  return true
963
962
  end
964
963
 
965
- def _(string); string; end
966
964
  end
967
965
 
968
966
  end