html5 0.1.0 → 0.10.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +9 -2
- data/Manifest.txt +61 -2
- data/README +41 -5
- data/Rakefile.rb +22 -6
- data/{parse.rb → bin/html5} +11 -11
- data/lib/core_ext/string.rb +17 -0
- data/lib/html5/constants.rb +228 -0
- data/lib/html5/filters/iso639codes.rb +752 -0
- data/lib/html5/filters/rfc2046.rb +30 -0
- data/lib/html5/filters/rfc3987.rb +89 -0
- data/lib/html5/filters/validator.rb +830 -0
- data/lib/html5/html5parser.rb +25 -25
- data/lib/html5/html5parser/after_body_phase.rb +3 -3
- data/lib/html5/html5parser/after_frameset_phase.rb +3 -4
- data/lib/html5/html5parser/after_head_phase.rb +6 -6
- data/lib/html5/html5parser/before_head_phase.rb +1 -1
- data/lib/html5/html5parser/in_body_phase.rb +54 -48
- data/lib/html5/html5parser/in_caption_phase.rb +7 -6
- data/lib/html5/html5parser/in_cell_phase.rb +3 -3
- data/lib/html5/html5parser/in_column_group_phase.rb +1 -1
- data/lib/html5/html5parser/in_frameset_phase.rb +5 -5
- data/lib/html5/html5parser/in_head_phase.rb +10 -10
- data/lib/html5/html5parser/in_row_phase.rb +4 -2
- data/lib/html5/html5parser/in_select_phase.rb +7 -6
- data/lib/html5/html5parser/in_table_body_phase.rb +8 -5
- data/lib/html5/html5parser/in_table_phase.rb +12 -7
- data/lib/html5/html5parser/initial_phase.rb +5 -6
- data/lib/html5/html5parser/phase.rb +5 -9
- data/lib/html5/html5parser/root_element_phase.rb +1 -2
- data/lib/html5/html5parser/trailing_end_phase.rb +3 -3
- data/lib/html5/inputstream.rb +25 -31
- data/lib/html5/liberalxmlparser.rb +2 -2
- data/lib/html5/sanitizer.rb +6 -6
- data/lib/html5/serializer/htmlserializer.rb +2 -3
- data/lib/html5/sniffer.rb +45 -0
- data/lib/html5/tokenizer.rb +57 -59
- data/lib/html5/treebuilders/rexml.rb +7 -6
- data/lib/html5/treebuilders/simpletree.rb +1 -1
- data/lib/html5/treewalkers/base.rb +8 -0
- data/lib/html5/version.rb +3 -0
- data/testdata/encoding/chardet/test_big5.txt +51 -0
- data/testdata/encoding/test-yahoo-jp.dat +10 -0
- data/testdata/encoding/tests1.dat +394 -0
- data/testdata/encoding/tests2.dat +81 -0
- data/testdata/sanitizer/tests1.dat +416 -0
- data/testdata/serializer/core.test +104 -0
- data/testdata/serializer/injectmeta.test +65 -0
- data/testdata/serializer/optionaltags.test +900 -0
- data/testdata/serializer/options.test +60 -0
- data/testdata/serializer/whitespace.test +51 -0
- data/testdata/sites/google-results.htm +1 -0
- data/testdata/sites/python-ref-import.htm +1 -0
- data/testdata/sites/web-apps-old.htm +1 -0
- data/testdata/sites/web-apps.htm +34275 -0
- data/testdata/sniffer/htmlOrFeed.json +43 -0
- data/testdata/tokenizer/contentModelFlags.test +48 -0
- data/testdata/tokenizer/entities.test +2339 -0
- data/testdata/tokenizer/escapeFlag.test +21 -0
- data/testdata/tokenizer/test1.test +172 -0
- data/testdata/tokenizer/test2.test +129 -0
- data/testdata/tokenizer/test3.test +367 -0
- data/testdata/tokenizer/test4.test +198 -0
- data/testdata/tree-construction/tests1.dat +1950 -0
- data/testdata/tree-construction/tests2.dat +773 -0
- data/testdata/tree-construction/tests3.dat +270 -0
- data/testdata/tree-construction/tests4.dat +60 -0
- data/testdata/tree-construction/tests5.dat +175 -0
- data/testdata/tree-construction/tests6.dat +196 -0
- data/testdata/validator/attributes.test +1035 -0
- data/testdata/validator/base-href-attribute.test +787 -0
- data/testdata/validator/base-target-attribute.test +35 -0
- data/testdata/validator/blockquote-cite-attribute.test +7 -0
- data/testdata/validator/classattribute.test +152 -0
- data/testdata/validator/contenteditableattribute.test +59 -0
- data/testdata/validator/contextmenuattribute.test +115 -0
- data/testdata/validator/dirattribute.test +59 -0
- data/testdata/validator/draggableattribute.test +63 -0
- data/testdata/validator/html-xmlns-attribute.test +23 -0
- data/testdata/validator/idattribute.test +115 -0
- data/testdata/validator/inputattributes.test +2795 -0
- data/testdata/validator/irrelevantattribute.test +63 -0
- data/testdata/validator/langattribute.test +5579 -0
- data/testdata/validator/li-value-attribute.test +7 -0
- data/testdata/validator/link-href-attribute.test +7 -0
- data/testdata/validator/link-hreflang-attribute.test +7 -0
- data/testdata/validator/link-rel-attribute.test +271 -0
- data/testdata/validator/ol-start-attribute.test +7 -0
- data/testdata/validator/starttags.test +375 -0
- data/testdata/validator/style-scoped-attribute.test +7 -0
- data/testdata/validator/tabindexattribute.test +79 -0
- data/tests/preamble.rb +7 -17
- data/tests/test_encoding.rb +1 -1
- data/tests/test_lxp.rb +16 -0
- data/tests/test_parser.rb +2 -2
- data/tests/test_sniffer.rb +27 -0
- data/tests/test_treewalkers.rb +41 -22
- data/tests/test_validator.rb +31 -0
- metadata +65 -6
@@ -33,10 +33,9 @@ module HTML5
|
|
33
33
|
|
34
34
|
def insert_html_element
|
35
35
|
element = @tree.createElement('html', {})
|
36
|
-
@tree.open_elements
|
36
|
+
@tree.open_elements << element
|
37
37
|
@tree.document.appendChild(element)
|
38
38
|
@parser.phase = @parser.phases[:beforeHead]
|
39
39
|
end
|
40
|
-
|
41
40
|
end
|
42
41
|
end
|
@@ -15,19 +15,19 @@ module HTML5
|
|
15
15
|
end
|
16
16
|
|
17
17
|
def processCharacters(data)
|
18
|
-
parse_error(
|
18
|
+
parse_error("expected-eof-but-got-char")
|
19
19
|
@parser.phase = @parser.last_phase
|
20
20
|
@parser.phase.processCharacters(data)
|
21
21
|
end
|
22
22
|
|
23
23
|
def processStartTag(name, attributes)
|
24
|
-
parse_error(
|
24
|
+
parse_error("expected-eof-but-got-start-tag", {"name" => name})
|
25
25
|
@parser.phase = @parser.last_phase
|
26
26
|
@parser.phase.processStartTag(name, attributes)
|
27
27
|
end
|
28
28
|
|
29
29
|
def processEndTag(name)
|
30
|
-
parse_error(
|
30
|
+
parse_error("expected-eof-but-got-end-tag", {"name" => name})
|
31
31
|
@parser.phase = @parser.last_phase
|
32
32
|
@parser.phase.processEndTag(name)
|
33
33
|
end
|
data/lib/html5/inputstream.rb
CHANGED
@@ -60,15 +60,11 @@ module HTML5
|
|
60
60
|
if @char_encoding == 'windows-1252'
|
61
61
|
@win1252 = true
|
62
62
|
elsif @char_encoding != 'utf-8'
|
63
|
+
require 'iconv'
|
63
64
|
begin
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
@buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first
|
68
|
-
rescue
|
69
|
-
@win1252 = true
|
70
|
-
end
|
71
|
-
rescue LoadError
|
65
|
+
@buffer << @raw_stream.read unless @raw_stream.eof?
|
66
|
+
@buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first
|
67
|
+
rescue
|
72
68
|
@win1252 = true
|
73
69
|
end
|
74
70
|
end
|
@@ -88,12 +84,11 @@ module HTML5
|
|
88
84
|
def open_stream(source)
|
89
85
|
# Already an IO like object
|
90
86
|
if source.respond_to?(:read)
|
91
|
-
|
87
|
+
source
|
92
88
|
else
|
93
89
|
# Treat source as a string and wrap in StringIO
|
94
|
-
|
90
|
+
StringIO.new(source)
|
95
91
|
end
|
96
|
-
return @stream
|
97
92
|
end
|
98
93
|
|
99
94
|
def detect_encoding
|
@@ -138,14 +133,12 @@ module HTML5
|
|
138
133
|
encoding = @DEFAULT_ENCODING
|
139
134
|
end
|
140
135
|
|
141
|
-
#Substitute for equivalent
|
142
|
-
|
143
|
-
|
144
|
-
if encoding_sub.has_key?(encoding.downcase)
|
145
|
-
encoding = encoding_sub[encoding.downcase]
|
136
|
+
#Substitute for equivalent encoding
|
137
|
+
if 'iso-8859-1' == encoding.downcase
|
138
|
+
encoding = 'windows-1252'
|
146
139
|
end
|
147
140
|
|
148
|
-
|
141
|
+
encoding
|
149
142
|
end
|
150
143
|
|
151
144
|
# Attempts to detect at BOM at the start of the stream. If
|
@@ -153,9 +146,9 @@ module HTML5
|
|
153
146
|
# encoding otherwise return nil
|
154
147
|
def detect_bom
|
155
148
|
bom_dict = {
|
156
|
-
"\xef\xbb\xbf"
|
157
|
-
"\xff\xfe"
|
158
|
-
"\xfe\xff"
|
149
|
+
"\xef\xbb\xbf" => 'utf-8',
|
150
|
+
"\xff\xfe" => 'utf-16le',
|
151
|
+
"\xfe\xff" => 'utf-16be',
|
159
152
|
"\xff\xfe\x00\x00" => 'utf-32le',
|
160
153
|
"\x00\x00\xfe\xff" => 'utf-32be'
|
161
154
|
}
|
@@ -198,6 +191,7 @@ module HTML5
|
|
198
191
|
end
|
199
192
|
end
|
200
193
|
|
194
|
+
#TODO: huh?
|
201
195
|
require 'delegate'
|
202
196
|
@raw_stream = SimpleDelegator.new(@raw_stream)
|
203
197
|
|
@@ -250,7 +244,7 @@ module HTML5
|
|
250
244
|
col -= 1
|
251
245
|
end
|
252
246
|
end
|
253
|
-
return [line+1, col]
|
247
|
+
return [line + 1, col]
|
254
248
|
end
|
255
249
|
|
256
250
|
# Read one character from the stream or queue if available. Return
|
@@ -259,9 +253,9 @@ module HTML5
|
|
259
253
|
unless @queue.empty?
|
260
254
|
return @queue.shift
|
261
255
|
else
|
262
|
-
if @tell + 3 > @buffer.length
|
256
|
+
if @tell + 3 > @buffer.length && !@raw_stream.eof?
|
263
257
|
# read next block
|
264
|
-
@buffer = @buffer[@tell
|
258
|
+
@buffer = @buffer[@tell..-1] + @raw_stream.read(@NUM_BYTES_BUFFER)
|
265
259
|
@tell = 0
|
266
260
|
end
|
267
261
|
|
@@ -269,7 +263,7 @@ module HTML5
|
|
269
263
|
@tell += 1
|
270
264
|
|
271
265
|
case c
|
272
|
-
when 0x01
|
266
|
+
when 0x01..0x7F
|
273
267
|
if c == 0x0D
|
274
268
|
# normalize newlines
|
275
269
|
@tell += 1 if @buffer[@tell] == 0x0A
|
@@ -287,7 +281,7 @@ module HTML5
|
|
287
281
|
|
288
282
|
c.chr
|
289
283
|
|
290
|
-
when 0x80
|
284
|
+
when 0x80..0xBF
|
291
285
|
if !@win1252
|
292
286
|
[0xFFFD].pack('U') # invalid utf-8
|
293
287
|
elsif c <= 0x9f
|
@@ -296,10 +290,11 @@ module HTML5
|
|
296
290
|
"\xC2" + c.chr # convert to utf-8
|
297
291
|
end
|
298
292
|
|
299
|
-
when 0xC0
|
300
|
-
if
|
301
|
-
"\xC3" + (c-64).chr # convert to utf-8
|
302
|
-
|
293
|
+
when 0xC0..0xFF
|
294
|
+
if instance_variables.include?("@win1252") && @win1252
|
295
|
+
"\xC3" + (c - 64).chr # convert to utf-8
|
296
|
+
# from http://www.w3.org/International/questions/qa-forms-utf-8.en.php
|
297
|
+
elsif @buffer[@tell - 1..@tell + 3] =~ /^
|
303
298
|
( [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
|
304
299
|
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
|
305
300
|
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
|
@@ -315,8 +310,7 @@ module HTML5
|
|
315
310
|
end
|
316
311
|
|
317
312
|
when 0x00
|
318
|
-
@errors.push(
|
319
|
-
'replaced with U+FFFD')
|
313
|
+
@errors.push("null-character")
|
320
314
|
[0xFFFD].pack('U') # null characters are invalid
|
321
315
|
|
322
316
|
else
|
@@ -50,7 +50,7 @@ module HTML5
|
|
50
50
|
|
51
51
|
when :EndTag
|
52
52
|
if token[:data]
|
53
|
-
parse_error(
|
53
|
+
parse_error("attributes-in-end-tag")
|
54
54
|
end
|
55
55
|
|
56
56
|
when :Comment
|
@@ -81,7 +81,7 @@ module HTML5
|
|
81
81
|
# open and close tags are emitted
|
82
82
|
if token[:type] == :EndTag
|
83
83
|
if VOID_ELEMENTS.include? token[:name]
|
84
|
-
if @tree.open_elements[-1].name != token["name"]
|
84
|
+
if @tree.open_elements[-1].name != token["name"]
|
85
85
|
token[:type] = :EmptyTag
|
86
86
|
token["data"] ||= {}
|
87
87
|
end
|
data/lib/html5/sanitizer.rb
CHANGED
@@ -110,13 +110,13 @@ module HTML5
|
|
110
110
|
def sanitize_token(token)
|
111
111
|
case token[:type]
|
112
112
|
when :StartTag, :EndTag, :EmptyTag
|
113
|
-
if ALLOWED_ELEMENTS.include?(token[:name])
|
113
|
+
if self.class.const_get("ALLOWED_ELEMENTS").include?(token[:name])
|
114
114
|
if token.has_key? :data
|
115
115
|
attrs = Hash[*token[:data].flatten]
|
116
|
-
attrs.delete_if { |attr,v| !ALLOWED_ATTRIBUTES.include?(attr) }
|
116
|
+
attrs.delete_if { |attr,v| !self.class.const_get("ALLOWED_ATTRIBUTES").include?(attr) }
|
117
117
|
ATTR_VAL_IS_URI.each do |attr|
|
118
118
|
val_unescaped = CGI.unescapeHTML(attrs[attr].to_s).gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
|
119
|
-
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
|
119
|
+
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and !self.class.const_get("ALLOWED_PROTOCOLS").include?(val_unescaped.split(':')[0])
|
120
120
|
attrs.delete attr
|
121
121
|
end
|
122
122
|
end
|
@@ -160,14 +160,14 @@ module HTML5
|
|
160
160
|
style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
|
161
161
|
next if val.empty?
|
162
162
|
prop.downcase!
|
163
|
-
if ALLOWED_CSS_PROPERTIES.include?(prop)
|
163
|
+
if self.class.const_get("ALLOWED_CSS_PROPERTIES").include?(prop)
|
164
164
|
clean << "#{prop}: #{val};"
|
165
165
|
elsif %w[background border margin padding].include?(prop.split('-')[0])
|
166
166
|
clean << "#{prop}: #{val};" unless val.split().any? do |keyword|
|
167
|
-
!ALLOWED_CSS_KEYWORDS.include?(keyword) and
|
167
|
+
!self.class.const_get("ALLOWED_CSS_KEYWORDS").include?(keyword) and
|
168
168
|
keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
|
169
169
|
end
|
170
|
-
elsif ALLOWED_SVG_PROPERTIES.include?(prop)
|
170
|
+
elsif self.class.const_get("ALLOWED_SVG_PROPERTIES").include?(prop)
|
171
171
|
clean << "#{prop}: #{val};"
|
172
172
|
end
|
173
173
|
end
|
@@ -31,7 +31,7 @@ module HTML5
|
|
31
31
|
@inject_meta_charset = true
|
32
32
|
|
33
33
|
options.each do |name, value|
|
34
|
-
next unless
|
34
|
+
next unless instance_variables.include?("@#{name}")
|
35
35
|
@use_best_quote_char = false if name.to_s == 'quote_char'
|
36
36
|
instance_variable_set("@#{name}", value)
|
37
37
|
end
|
@@ -73,7 +73,7 @@ module HTML5
|
|
73
73
|
elsif [:Characters, :SpaceCharacters].include? type
|
74
74
|
if type == :SpaceCharacters or in_cdata
|
75
75
|
if in_cdata and token[:data].include?("</")
|
76
|
-
serialize_error(
|
76
|
+
serialize_error("Unexpected </ in CDATA")
|
77
77
|
end
|
78
78
|
result << token[:data]
|
79
79
|
else
|
@@ -171,7 +171,6 @@ module HTML5
|
|
171
171
|
end
|
172
172
|
end
|
173
173
|
|
174
|
-
def _(string); string; end
|
175
174
|
end
|
176
175
|
|
177
176
|
# Error in serialized tree
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module HTML5
|
2
|
+
module Sniffer
|
3
|
+
# 4.7.4
|
4
|
+
def html_or_feed str
|
5
|
+
s = str[0, 512] # steps 1, 2
|
6
|
+
pos = 0
|
7
|
+
|
8
|
+
while pos < s.length
|
9
|
+
case s[pos]
|
10
|
+
when 0x09, 0x20, 0x0A, 0x0D # tab, space, LF, CR
|
11
|
+
pos += 1
|
12
|
+
when 0x3C # "<"
|
13
|
+
pos += 1
|
14
|
+
if s[pos..pos+2] == "!--" # [0x21, 0x2D, 0x2D]
|
15
|
+
pos += 3
|
16
|
+
until s[pos..pos+2] == "-->" or pos >= s.length
|
17
|
+
pos += 1
|
18
|
+
end
|
19
|
+
pos += 3
|
20
|
+
elsif s[pos] == 0x21 # "!"
|
21
|
+
pos += 1
|
22
|
+
until s[pos] == 0x3E or pos >= s.length # ">"
|
23
|
+
pos += 1
|
24
|
+
end
|
25
|
+
pos += 1
|
26
|
+
elsif s[pos] == 0x3F # "?"
|
27
|
+
until s[pos..pos+1] == "?>" or pos >= s.length # [0x3F, 0x3E]
|
28
|
+
pos += 1
|
29
|
+
end
|
30
|
+
pos += 2
|
31
|
+
elsif s[pos..pos+2] == "rss" # [0x72, 0x73, 0x73]
|
32
|
+
return "application/rss+xml"
|
33
|
+
elsif s[pos..pos+3] == "feed" # [0x66, 0x65, 0x65, 0x64]
|
34
|
+
return "application/atom+xml"
|
35
|
+
elsif s[pos..pos+6] == "rdf:RDF" # [0x72, 0x64, 0x66, 0x3A, 0x52, 0x44, 0x46]
|
36
|
+
raise NotImplementedError
|
37
|
+
end
|
38
|
+
else
|
39
|
+
break
|
40
|
+
end
|
41
|
+
end
|
42
|
+
"text/html"
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
data/lib/html5/tokenizer.rb
CHANGED
@@ -69,7 +69,7 @@ module HTML5
|
|
69
69
|
if @current_token[:type] == :StartTag and data == ">"
|
70
70
|
@current_token[:type] = :EmptyTag
|
71
71
|
else
|
72
|
-
@token_queue << {:type => :ParseError, :data =>
|
72
|
+
@token_queue << {:type => :ParseError, :data => "incorrectly-placed-solidus"}
|
73
73
|
end
|
74
74
|
|
75
75
|
# The character we just consumed need to be put back on the stack so it
|
@@ -107,12 +107,12 @@ module HTML5
|
|
107
107
|
charAsInt = char_stack.join('').to_i(radix)
|
108
108
|
|
109
109
|
if charAsInt == 13
|
110
|
-
@token_queue << {:type => :ParseError, :data =>
|
110
|
+
@token_queue << {:type => :ParseError, :data => "incorrect-cr-newline-entity"}
|
111
111
|
charAsInt = 10
|
112
112
|
elsif (128..159).include? charAsInt
|
113
113
|
# If the integer is between 127 and 160 (so 128 and bigger and 159
|
114
114
|
# and smaller) we need to do the "windows trick".
|
115
|
-
@token_queue << {:type => :ParseError, :data =>
|
115
|
+
@token_queue << {:type => :ParseError, :data => "illegal-windows-1252-entity"}
|
116
116
|
|
117
117
|
charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
|
118
118
|
end
|
@@ -121,13 +121,13 @@ module HTML5
|
|
121
121
|
char = [charAsInt].pack('U')
|
122
122
|
else
|
123
123
|
char = [0xFFFD].pack('U')
|
124
|
-
@token_queue << {:type => :ParseError, :data =>
|
124
|
+
@token_queue << {:type => :ParseError, :data => "cant-convert-numeric-entity", :datavars => {"charAsInt" => charAsInt}}
|
125
125
|
end
|
126
126
|
|
127
127
|
# Discard the ; if present. Otherwise, put it back on the queue and
|
128
128
|
# invoke parse_error on parser.
|
129
129
|
if c != ";"
|
130
|
-
@token_queue << {:type => :ParseError, :data =>
|
130
|
+
@token_queue << {:type => :ParseError, :data => "numeric-entity-without-semicolon"}
|
131
131
|
@stream.unget(c)
|
132
132
|
end
|
133
133
|
|
@@ -147,7 +147,7 @@ module HTML5
|
|
147
147
|
# back in the queue
|
148
148
|
char_stack = char_stack[0...char_stack.index(:EOF)]
|
149
149
|
@stream.unget(char_stack)
|
150
|
-
@token_queue << {:type => :ParseError, :data =>
|
150
|
+
@token_queue << {:type => :ParseError, :data => "expected-numeric-entity-but-got-eof"}
|
151
151
|
else
|
152
152
|
if char_stack[1].downcase == "x" and HEX_DIGITS.include? char_stack[2]
|
153
153
|
# Hexadecimal entity detected.
|
@@ -160,7 +160,7 @@ module HTML5
|
|
160
160
|
else
|
161
161
|
# No number entity detected.
|
162
162
|
@stream.unget(char_stack)
|
163
|
-
@token_queue << {:type => :ParseError, :data =>
|
163
|
+
@token_queue << {:type => :ParseError, :data => "expected-numeric-entity"}
|
164
164
|
end
|
165
165
|
end
|
166
166
|
else
|
@@ -196,10 +196,10 @@ module HTML5
|
|
196
196
|
# Check whether or not the last character returned can be
|
197
197
|
# discarded or needs to be put back.
|
198
198
|
if entityName[-1] != ?;
|
199
|
-
@token_queue << {:type => :ParseError, :data =>
|
199
|
+
@token_queue << {:type => :ParseError, :data => "named-entity-without-semicolon"}
|
200
200
|
end
|
201
201
|
|
202
|
-
if
|
202
|
+
if entityName[-1] != ";" and from_attribute and
|
203
203
|
(ASCII_LETTERS.include?(char_stack[entityName.length]) or
|
204
204
|
DIGITS.include?(char_stack[entityName.length]))
|
205
205
|
@stream.unget(char_stack)
|
@@ -208,7 +208,7 @@ module HTML5
|
|
208
208
|
@stream.unget(char_stack[entityName.length..-1])
|
209
209
|
end
|
210
210
|
else
|
211
|
-
@token_queue << {:type => :ParseError, :data =>
|
211
|
+
@token_queue << {:type => :ParseError, :data => "expected-named-entity"}
|
212
212
|
@stream.unget(char_stack)
|
213
213
|
end
|
214
214
|
end
|
@@ -217,7 +217,7 @@ module HTML5
|
|
217
217
|
|
218
218
|
# This method replaces the need for "entityInAttributeValueState".
|
219
219
|
def process_entity_in_attribute
|
220
|
-
entity = consume_entity(
|
220
|
+
entity = consume_entity()
|
221
221
|
if entity
|
222
222
|
@current_token[:data][-1][1] += entity
|
223
223
|
else
|
@@ -309,19 +309,18 @@ module HTML5
|
|
309
309
|
elsif data == ">"
|
310
310
|
# XXX In theory it could be something besides a tag name. But
|
311
311
|
# do we really care?
|
312
|
-
@token_queue << {:type => :ParseError, :data =>
|
312
|
+
@token_queue << {:type => :ParseError, :data => "expected-tag-name-but-got-right-bracket"}
|
313
313
|
@token_queue << {:type => :Characters, :data => "<>"}
|
314
314
|
@state = :data_state
|
315
315
|
elsif data == "?"
|
316
316
|
# XXX In theory it could be something besides a tag name. But
|
317
317
|
# do we really care?
|
318
|
-
@token_queue.push({:type => :ParseError, :data =>
|
319
|
-
"support processing instructions).")})
|
318
|
+
@token_queue.push({:type => :ParseError, :data => "expected-tag-name-but-got-question-mark"})
|
320
319
|
@stream.unget(data)
|
321
320
|
@state = :bogus_comment_state
|
322
321
|
else
|
323
322
|
# XXX
|
324
|
-
@token_queue << {:type => :ParseError, :data =>
|
323
|
+
@token_queue << {:type => :ParseError, :data => "expected-tag-name"}
|
325
324
|
@token_queue << {:type => :Characters, :data => "<"}
|
326
325
|
@stream.unget(data)
|
327
326
|
@state = :data_state
|
@@ -382,18 +381,18 @@ module HTML5
|
|
382
381
|
|
383
382
|
data = @stream.char
|
384
383
|
if data == :EOF
|
385
|
-
@token_queue << {:type => :ParseError, :data =>
|
384
|
+
@token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-eof"}
|
386
385
|
@token_queue << {:type => :Characters, :data => "</"}
|
387
386
|
@state = :data_state
|
388
387
|
elsif ASCII_LETTERS.include? data
|
389
388
|
@current_token = {:type => :EndTag, :name => data, :data => []}
|
390
389
|
@state = :tag_name_state
|
391
390
|
elsif data == ">"
|
392
|
-
@token_queue << {:type => :ParseError, :data =>
|
391
|
+
@token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-right-bracket"}
|
393
392
|
@state = :data_state
|
394
393
|
else
|
395
394
|
# XXX data can be _'_...
|
396
|
-
@token_queue << {:type => :ParseError, :data =>
|
395
|
+
@token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-char", :datavars => {:data => data}}
|
397
396
|
@stream.unget(data)
|
398
397
|
@state = :bogus_comment_state
|
399
398
|
end
|
@@ -406,7 +405,7 @@ module HTML5
|
|
406
405
|
if SPACE_CHARACTERS.include? data
|
407
406
|
@state = :before_attribute_name_state
|
408
407
|
elsif data == :EOF
|
409
|
-
@token_queue << {:type => :ParseError, :data =>
|
408
|
+
@token_queue << {:type => :ParseError, :data => "eof-in-tag-name"}
|
410
409
|
emit_current_token
|
411
410
|
elsif ASCII_LETTERS.include? data
|
412
411
|
@current_token[:name] += data + @stream.chars_until(ASCII_LETTERS, true)
|
@@ -426,7 +425,7 @@ module HTML5
|
|
426
425
|
if SPACE_CHARACTERS.include? data
|
427
426
|
@stream.chars_until(SPACE_CHARACTERS, true)
|
428
427
|
elsif data == :EOF
|
429
|
-
@token_queue << {:type => :ParseError, :data =>
|
428
|
+
@token_queue << {:type => :ParseError, :data => "expected-attribute-name-but-got-eof"}
|
430
429
|
emit_current_token
|
431
430
|
elsif ASCII_LETTERS.include? data
|
432
431
|
@current_token[:data].push([data, ""])
|
@@ -449,7 +448,7 @@ module HTML5
|
|
449
448
|
if data == "="
|
450
449
|
@state = :before_attribute_value_state
|
451
450
|
elsif data == :EOF
|
452
|
-
@token_queue << {:type => :ParseError, :data =>
|
451
|
+
@token_queue << {:type => :ParseError, :data => "eof-in-attribute-name"}
|
453
452
|
@state = :data_state
|
454
453
|
emitToken = true
|
455
454
|
elsif ASCII_LETTERS.include? data
|
@@ -479,7 +478,7 @@ module HTML5
|
|
479
478
|
end
|
480
479
|
@current_token[:data][0...-1].each {|name,value|
|
481
480
|
if @current_token[:data].last.first == name
|
482
|
-
@token_queue << {:type => :ParseError, :data =>
|
481
|
+
@token_queue << {:type => :ParseError, :data => "duplicate-attribute"}
|
483
482
|
break # don't report an error more than once
|
484
483
|
end
|
485
484
|
}
|
@@ -498,7 +497,7 @@ module HTML5
|
|
498
497
|
elsif data == ">"
|
499
498
|
emit_current_token
|
500
499
|
elsif data == :EOF
|
501
|
-
@token_queue << {:type => :ParseError, :data =>
|
500
|
+
@token_queue << {:type => :ParseError, :data => "expected-end-of-tag-but-got-eof"}
|
502
501
|
emit_current_token
|
503
502
|
elsif ASCII_LETTERS.include? data
|
504
503
|
@current_token[:data].push([data, ""])
|
@@ -527,7 +526,7 @@ module HTML5
|
|
527
526
|
elsif data == ">"
|
528
527
|
emit_current_token
|
529
528
|
elsif data == :EOF
|
530
|
-
@token_queue << {:type => :ParseError, :data =>
|
529
|
+
@token_queue << {:type => :ParseError, :data => "expected-attribute-value-but-got-eof"}
|
531
530
|
emit_current_token
|
532
531
|
else
|
533
532
|
@current_token[:data][-1][1] += data
|
@@ -543,7 +542,7 @@ module HTML5
|
|
543
542
|
elsif data == "&"
|
544
543
|
process_entity_in_attribute
|
545
544
|
elsif data == :EOF
|
546
|
-
@token_queue << {:type => :ParseError, :data =>
|
545
|
+
@token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-double-quote"}
|
547
546
|
emit_current_token
|
548
547
|
else
|
549
548
|
@current_token[:data][-1][1] += data + @stream.chars_until(["\"", "&"])
|
@@ -558,7 +557,7 @@ module HTML5
|
|
558
557
|
elsif data == "&"
|
559
558
|
process_entity_in_attribute
|
560
559
|
elsif data == :EOF
|
561
|
-
@token_queue << {:type => :ParseError, :data =>
|
560
|
+
@token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-single-quote"}
|
562
561
|
emit_current_token
|
563
562
|
else
|
564
563
|
@current_token[:data][-1][1] += data +\
|
@@ -576,7 +575,7 @@ module HTML5
|
|
576
575
|
elsif data == ">"
|
577
576
|
emit_current_token
|
578
577
|
elsif data == :EOF
|
579
|
-
@token_queue << {:type => :ParseError, :data =>
|
578
|
+
@token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-no-quotes"}
|
580
579
|
emit_current_token
|
581
580
|
else
|
582
581
|
@current_token[:data][-1][1] += data + @stream.chars_until(["&", ">","<"] + SPACE_CHARACTERS)
|
@@ -609,7 +608,7 @@ module HTML5
|
|
609
608
|
@current_token = {:type => :Doctype, :name => "", :publicId => nil, :systemId => nil, :correct => true}
|
610
609
|
@state = :doctype_state
|
611
610
|
else
|
612
|
-
@token_queue << {:type => :ParseError, :data =>
|
611
|
+
@token_queue << {:type => :ParseError, :data => "expected-dashes-or-doctype"}
|
613
612
|
@stream.unget(char_stack)
|
614
613
|
@state = :bogus_comment_state
|
615
614
|
end
|
@@ -622,11 +621,11 @@ module HTML5
|
|
622
621
|
if data == "-"
|
623
622
|
@state = :comment_start_dash_state
|
624
623
|
elsif data == ">"
|
625
|
-
@token_queue << {:type => :ParseError, :data =>
|
624
|
+
@token_queue << {:type => :ParseError, :data => "incorrect-comment"}
|
626
625
|
@token_queue << @current_token
|
627
626
|
@state = :data_state
|
628
627
|
elsif data == :EOF
|
629
|
-
@token_queue << {:type => :ParseError, :data =>
|
628
|
+
@token_queue << {:type => :ParseError, :data => "eof-in-comment"}
|
630
629
|
@token_queue << @current_token
|
631
630
|
@state = :data_state
|
632
631
|
else
|
@@ -641,11 +640,11 @@ module HTML5
|
|
641
640
|
if data == "-"
|
642
641
|
@state = :comment_end_state
|
643
642
|
elsif data == ">"
|
644
|
-
@token_queue << {:type => :ParseError, :data =>
|
643
|
+
@token_queue << {:type => :ParseError, :data => "incorrect-comment"}
|
645
644
|
@token_queue << @current_token
|
646
645
|
@state = :data_state
|
647
646
|
elsif data == :EOF
|
648
|
-
@token_queue << {:type => :ParseError, :data =>
|
647
|
+
@token_queue << {:type => :ParseError, :data => "eof-in-comment"}
|
649
648
|
@token_queue << @current_token
|
650
649
|
@state = :data_state
|
651
650
|
else
|
@@ -660,7 +659,7 @@ module HTML5
|
|
660
659
|
if data == "-"
|
661
660
|
@state = :comment_end_dash_state
|
662
661
|
elsif data == :EOF
|
663
|
-
@token_queue << {:type => :ParseError, :data =>
|
662
|
+
@token_queue << {:type => :ParseError, :data => "eof-in-comment"}
|
664
663
|
@token_queue << @current_token
|
665
664
|
@state = :data_state
|
666
665
|
else
|
@@ -674,7 +673,7 @@ module HTML5
|
|
674
673
|
if data == "-"
|
675
674
|
@state = :comment_end_state
|
676
675
|
elsif data == :EOF
|
677
|
-
@token_queue << {:type => :ParseError, :data =>
|
676
|
+
@token_queue << {:type => :ParseError, :data => "eof-in-comment-end-dash"}
|
678
677
|
@token_queue << @current_token
|
679
678
|
@state = :data_state
|
680
679
|
else
|
@@ -694,15 +693,15 @@ module HTML5
|
|
694
693
|
@token_queue << @current_token
|
695
694
|
@state = :data_state
|
696
695
|
elsif data == "-"
|
697
|
-
@token_queue << {:type => :ParseError, :data =>
|
696
|
+
@token_queue << {:type => :ParseError, :data => "unexpected-dash-after-double-dash-in-comment"}
|
698
697
|
@current_token[:data] += data
|
699
698
|
elsif data == :EOF
|
700
|
-
@token_queue << {:type => :ParseError, :data =>
|
699
|
+
@token_queue << {:type => :ParseError, :data => "eof-in-comment-double-dash"}
|
701
700
|
@token_queue << @current_token
|
702
701
|
@state = :data_state
|
703
702
|
else
|
704
703
|
# XXX
|
705
|
-
@token_queue << {:type => :ParseError, :data =>
|
704
|
+
@token_queue << {:type => :ParseError, :data => "unexpected-char-in-comment"}
|
706
705
|
@current_token[:data] += "--" + data
|
707
706
|
@state = :comment_state
|
708
707
|
end
|
@@ -714,7 +713,7 @@ module HTML5
|
|
714
713
|
if SPACE_CHARACTERS.include? data
|
715
714
|
@state = :before_doctype_name_state
|
716
715
|
else
|
717
|
-
@token_queue << {:type => :ParseError, :data =>
|
716
|
+
@token_queue << {:type => :ParseError, :data => "need-space-after-doctype"}
|
718
717
|
@stream.unget(data)
|
719
718
|
@state = :before_doctype_name_state
|
720
719
|
end
|
@@ -725,12 +724,12 @@ module HTML5
|
|
725
724
|
data = @stream.char
|
726
725
|
if SPACE_CHARACTERS.include? data
|
727
726
|
elsif data == ">"
|
728
|
-
@token_queue << {:type => :ParseError, :data =>
|
727
|
+
@token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-right-bracket"}
|
729
728
|
@current_token[:correct] = false
|
730
729
|
@token_queue << @current_token
|
731
730
|
@state = :data_state
|
732
731
|
elsif data == :EOF
|
733
|
-
@token_queue << {:type => :ParseError, :data =>
|
732
|
+
@token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-eof"}
|
734
733
|
@current_token[:correct] = false
|
735
734
|
@token_queue << @current_token
|
736
735
|
@state = :data_state
|
@@ -749,7 +748,7 @@ module HTML5
|
|
749
748
|
@token_queue << @current_token
|
750
749
|
@state = :data_state
|
751
750
|
elsif data == :EOF
|
752
|
-
@token_queue << {:type => :ParseError, :data =>
|
751
|
+
@token_queue << {:type => :ParseError, :data => "eof-in-doctype-name"}
|
753
752
|
@current_token[:correct] = false
|
754
753
|
@token_queue << @current_token
|
755
754
|
@state = :data_state
|
@@ -769,7 +768,7 @@ module HTML5
|
|
769
768
|
elsif data == :EOF
|
770
769
|
@current_token[:correct] = false
|
771
770
|
@stream.unget(data)
|
772
|
-
@token_queue << {:type => :ParseError, :data =>
|
771
|
+
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
|
773
772
|
@token_queue << @current_token
|
774
773
|
@state = :data_state
|
775
774
|
else
|
@@ -782,7 +781,7 @@ module HTML5
|
|
782
781
|
@state = :before_doctype_system_identifier_state
|
783
782
|
else
|
784
783
|
@stream.unget(char_stack)
|
785
|
-
@token_queue << {:type => :ParseError, :data =>
|
784
|
+
@token_queue << {:type => :ParseError, :data => "expected-space-or-right-bracket-in-doctype", "datavars" => {"data" => data}}
|
786
785
|
@state = :bogus_doctype_state
|
787
786
|
end
|
788
787
|
end
|
@@ -800,17 +799,17 @@ module HTML5
|
|
800
799
|
@current_token[:publicId] = ""
|
801
800
|
@state = :doctype_public_identifier_single_quoted_state
|
802
801
|
elsif data == ">"
|
803
|
-
@token_queue << {:type => :ParseError, :data =>
|
802
|
+
@token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
|
804
803
|
@current_token[:correct] = false
|
805
804
|
@token_queue << @current_token
|
806
805
|
@state = :data_state
|
807
806
|
elsif data == :EOF
|
808
|
-
@token_queue << {:type => :ParseError, :data =>
|
807
|
+
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
|
809
808
|
@current_token[:correct] = false
|
810
809
|
@token_queue << @current_token
|
811
810
|
@state = :data_state
|
812
811
|
else
|
813
|
-
@token_queue << {:type => :ParseError, :data =>
|
812
|
+
@token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
|
814
813
|
@state = :bogus_doctype_state
|
815
814
|
end
|
816
815
|
|
@@ -822,7 +821,7 @@ module HTML5
|
|
822
821
|
if data == "\""
|
823
822
|
@state = :after_doctype_public_identifier_state
|
824
823
|
elsif data == :EOF
|
825
|
-
@token_queue << {:type => :ParseError, :data =>
|
824
|
+
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
|
826
825
|
@current_token[:correct] = false
|
827
826
|
@token_queue << @current_token
|
828
827
|
@state = :data_state
|
@@ -837,7 +836,7 @@ module HTML5
|
|
837
836
|
if data == "'"
|
838
837
|
@state = :after_doctype_public_identifier_state
|
839
838
|
elsif data == :EOF
|
840
|
-
@token_queue << {:type => :ParseError, :data =>
|
839
|
+
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
|
841
840
|
@current_token[:correct] = false
|
842
841
|
@token_queue << @current_token
|
843
842
|
@state = :data_state
|
@@ -860,12 +859,12 @@ module HTML5
|
|
860
859
|
@token_queue << @current_token
|
861
860
|
@state = :data_state
|
862
861
|
elsif data == :EOF
|
863
|
-
@token_queue << {:type => :ParseError, :data =>
|
862
|
+
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
|
864
863
|
@current_token[:correct] = false
|
865
864
|
@token_queue << @current_token
|
866
865
|
@state = :data_state
|
867
866
|
else
|
868
|
-
@token_queue << {:type => :ParseError, :data =>
|
867
|
+
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
|
869
868
|
@state = :bogus_doctype_state
|
870
869
|
end
|
871
870
|
return true
|
@@ -881,17 +880,17 @@ module HTML5
|
|
881
880
|
@current_token[:systemId] = ""
|
882
881
|
@state = :doctype_system_identifier_single_quoted_state
|
883
882
|
elsif data == ">"
|
884
|
-
@token_queue << {:type => :ParseError, :data =>
|
883
|
+
@token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
|
885
884
|
@current_token[:correct] = false
|
886
885
|
@token_queue << @current_token
|
887
886
|
@state = :data_state
|
888
887
|
elsif data == :EOF
|
889
|
-
@token_queue << {:type => :ParseError, :data =>
|
888
|
+
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
|
890
889
|
@current_token[:correct] = false
|
891
890
|
@token_queue << @current_token
|
892
891
|
@state = :data_state
|
893
892
|
else
|
894
|
-
@token_queue << {:type => :ParseError, :data =>
|
893
|
+
@token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
|
895
894
|
@state = :bogus_doctype_state
|
896
895
|
end
|
897
896
|
return true
|
@@ -902,7 +901,7 @@ module HTML5
|
|
902
901
|
if data == "\""
|
903
902
|
@state = :after_doctype_system_identifier_state
|
904
903
|
elsif data == :EOF
|
905
|
-
@token_queue << {:type => :ParseError, :data =>
|
904
|
+
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
|
906
905
|
@current_token[:correct] = false
|
907
906
|
@token_queue << @current_token
|
908
907
|
@state = :data_state
|
@@ -917,7 +916,7 @@ module HTML5
|
|
917
916
|
if data == "'"
|
918
917
|
@state = :after_doctype_system_identifier_state
|
919
918
|
elsif data == :EOF
|
920
|
-
@token_queue << {:type => :ParseError, :data =>
|
919
|
+
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
|
921
920
|
@current_token[:correct] = false
|
922
921
|
@token_queue << @current_token
|
923
922
|
@state = :data_state
|
@@ -934,12 +933,12 @@ module HTML5
|
|
934
933
|
@token_queue << @current_token
|
935
934
|
@state = :data_state
|
936
935
|
elsif data == :EOF
|
937
|
-
@token_queue << {:type => :ParseError, :data =>
|
936
|
+
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
|
938
937
|
@current_token[:correct] = false
|
939
938
|
@token_queue << @current_token
|
940
939
|
@state = :data_state
|
941
940
|
else
|
942
|
-
@token_queue << {:type => :ParseError, :data =>
|
941
|
+
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
|
943
942
|
@state = :bogus_doctype_state
|
944
943
|
end
|
945
944
|
return true
|
@@ -954,7 +953,7 @@ module HTML5
|
|
954
953
|
elsif data == :EOF
|
955
954
|
# XXX EMIT
|
956
955
|
@stream.unget(data)
|
957
|
-
@token_queue << {:type => :ParseError, :data =>
|
956
|
+
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
|
958
957
|
@current_token[:correct] = false
|
959
958
|
@token_queue << @current_token
|
960
959
|
@state = :data_state
|
@@ -962,7 +961,6 @@ module HTML5
|
|
962
961
|
return true
|
963
962
|
end
|
964
963
|
|
965
|
-
def _(string); string; end
|
966
964
|
end
|
967
965
|
|
968
966
|
end
|