spk-html5 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +10 -0
- data/Manifest.txt +73 -0
- data/README +45 -0
- data/Rakefile.rb +33 -0
- data/bin/html5 +7 -0
- data/lib/html5.rb +13 -0
- data/lib/html5/cli.rb +248 -0
- data/lib/html5/constants.rb +1061 -0
- data/lib/html5/filters/base.rb +10 -0
- data/lib/html5/filters/inject_meta_charset.rb +82 -0
- data/lib/html5/filters/iso639codes.rb +755 -0
- data/lib/html5/filters/optionaltags.rb +198 -0
- data/lib/html5/filters/rfc2046.rb +31 -0
- data/lib/html5/filters/rfc3987.rb +91 -0
- data/lib/html5/filters/sanitizer.rb +15 -0
- data/lib/html5/filters/validator.rb +834 -0
- data/lib/html5/filters/whitespace.rb +36 -0
- data/lib/html5/html5parser.rb +247 -0
- data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
- data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
- data/lib/html5/html5parser/after_body_phase.rb +46 -0
- data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
- data/lib/html5/html5parser/after_head_phase.rb +55 -0
- data/lib/html5/html5parser/before_head_phase.rb +44 -0
- data/lib/html5/html5parser/before_html_phase.rb +41 -0
- data/lib/html5/html5parser/in_body_phase.rb +636 -0
- data/lib/html5/html5parser/in_caption_phase.rb +69 -0
- data/lib/html5/html5parser/in_cell_phase.rb +78 -0
- data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
- data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
- data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
- data/lib/html5/html5parser/in_head_phase.rb +143 -0
- data/lib/html5/html5parser/in_row_phase.rb +96 -0
- data/lib/html5/html5parser/in_select_phase.rb +90 -0
- data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
- data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
- data/lib/html5/html5parser/in_table_phase.rb +177 -0
- data/lib/html5/html5parser/initial_phase.rb +133 -0
- data/lib/html5/html5parser/phase.rb +171 -0
- data/lib/html5/inputstream.rb +735 -0
- data/lib/html5/liberalxmlparser.rb +158 -0
- data/lib/html5/sanitizer.rb +209 -0
- data/lib/html5/serializer.rb +2 -0
- data/lib/html5/serializer/htmlserializer.rb +179 -0
- data/lib/html5/serializer/xhtmlserializer.rb +20 -0
- data/lib/html5/sniffer.rb +45 -0
- data/lib/html5/tokenizer.rb +1059 -0
- data/lib/html5/treebuilders.rb +24 -0
- data/lib/html5/treebuilders/base.rb +339 -0
- data/lib/html5/treebuilders/hpricot.rb +231 -0
- data/lib/html5/treebuilders/rexml.rb +215 -0
- data/lib/html5/treebuilders/simpletree.rb +191 -0
- data/lib/html5/treewalkers.rb +26 -0
- data/lib/html5/treewalkers/base.rb +162 -0
- data/lib/html5/treewalkers/hpricot.rb +48 -0
- data/lib/html5/treewalkers/rexml.rb +48 -0
- data/lib/html5/treewalkers/simpletree.rb +48 -0
- data/lib/html5/version.rb +3 -0
- data/test/preamble.rb +69 -0
- data/test/test_cli.rb +16 -0
- data/test/test_encoding.rb +35 -0
- data/test/test_input_stream.rb +26 -0
- data/test/test_lxp.rb +283 -0
- data/test/test_parser.rb +63 -0
- data/test/test_sanitizer.rb +173 -0
- data/test/test_serializer.rb +67 -0
- data/test/test_sniffer.rb +27 -0
- data/test/test_stream.rb +71 -0
- data/test/test_tokenizer.rb +95 -0
- data/test/test_treewalkers.rb +135 -0
- data/test/test_validator.rb +31 -0
- data/test/tokenizer_test_parser.rb +67 -0
- data/test19.rb +38 -0
- metadata +198 -0
@@ -0,0 +1,198 @@
|
|
1
|
+
require 'html5/constants'
|
2
|
+
require 'html5/filters/base'
|
3
|
+
|
4
|
+
module HTML5
|
5
|
+
module Filters
|
6
|
+
|
7
|
+
class OptionalTagFilter < Base
|
8
|
+
# Walks the wrapped token stream (reached through +__getobj__+) with a
# three-token sliding window.
#
# Yields +previous, current, following+ once for every token of the
# stream. +previous+ is nil for the first token and +following+ is nil
# for the last one.
#
# An empty stream yields nothing. Without the guard on the final yield the
# block would receive nil as its "current" token.
def slider
  previous1 = previous2 = nil
  __getobj__.each do |token|
    # The window is only complete once one token has been buffered.
    yield previous2, previous1, token unless previous1.nil?
    previous2 = previous1
    previous1 = token
  end
  # Flush the last buffered token, if the stream produced any token at all.
  yield previous2, previous1, nil unless previous1.nil?
end
|
17
|
+
|
18
|
+
# Re-emits the token stream, dropping start and end tags that may be
# omitted.
#
# A start tag is dropped only when it carries no attribute data and
# +is_optional_start+ agrees; an end tag is dropped when +is_optional_end+
# agrees. Every other token is yielded untouched.
def each
  slider do |before, current, after|
    omit =
      case current[:type]
      when :StartTag
        # A tag with attributes is always kept, so the predicate is not consulted.
        current[:data].empty? && is_optional_start(current[:name], before, after)
      when :EndTag
        is_optional_end(current[:name], after)
      else
        false
      end
    yield current unless omit
  end
end
|
30
|
+
|
31
|
+
# Decides whether the start tag of +tagname+ may be left out.
#
# tagname  - name of the element whose start tag is being considered
# previous - token that came immediately before the start tag (or nil)
# nexttok  - token that comes immediately after the start tag (or nil)
#
# Returns true when the tag may be omitted, false otherwise. Only html,
# head, body, colgroup and tbody are ever reported as optional.
def is_optional_start(tagname, previous, nexttok)
  following = nexttok[:type] if nexttok
  opens_element = following == :StartTag
  blank_or_comment = [:Comment, :SpaceCharacters].include?(following)

  case tagname
  when 'html'
    # Optional unless the first thing inside is a space character or a comment.
    !blank_or_comment
  when 'head'
    # Optional only when the first thing inside is an element.
    opens_element
  when 'body'
    # Optional unless the first thing inside is a space character or a
    # comment. XXX: the preceding event is not inspected, so the tag is
    # never omitted in front of a script or style element.
    if blank_or_comment
      false
    elsif opens_element
      !%w[script style].include?(nexttok[:name])
    else
      true
    end
  when 'colgroup'
    # Optional when the first child is a col element. XXX: the preceding
    # event is not inspected; is_optional_end compensates by keeping a
    # colgroup end tag that is directly followed by another colgroup.
    opens_element && nexttok[:name] == "col"
  when 'tbody'
    # Optional when the first child is a tr element and the tbody is not
    # immediately preceded by a tbody, thead or tfoot end tag.
    return false unless opens_element

    after_table_section = previous && previous[:type] == :EndTag &&
                          %w(tbody thead tfoot).include?(previous[:name])
    !after_table_section && nexttok[:name] == 'tr'
  else
    false
  end
end
|
89
|
+
|
90
|
+
# Decides whether the end tag of +tagname+ may be left out.
#
# tagname - name of the element whose end tag is being considered
# nexttok - token that comes immediately after the end tag (or nil)
#
# Returns true when the tag may be omitted, false otherwise.
def is_optional_end(tagname, nexttok)
  following = nexttok[:type] if nexttok
  starts_element = following == :StartTag
  next_name = nexttok[:name] if starts_element
  # "No more content in the parent element": an end tag or the end of input.
  parent_done = following == :EndTag || following.nil?

  case tagname
  when 'html', 'head', 'body'
    # Optional unless followed by a space character or a comment.
    ![:Comment, :SpaceCharacters].include?(following)
  when 'li', 'optgroup', 'option', 'tr'
    # Optional before a sibling of the same kind, or when the parent ends.
    starts_element ? next_name == tagname : parent_done
  when 'dt', 'dd'
    # Optional before another dt or dd; only dd may also be dropped when
    # the parent ends.
    if starts_element
      %w(dt dd).include?(next_name)
    else
      tagname == 'dd' && parent_done
    end
  when 'p'
    # Optional before the listed block-level elements, or when the parent ends.
    if starts_element
      %w(address blockquote dl fieldset form h1 h2 h3 h4 h5
         h6 hr menu ol p pre table ul).include?(next_name)
    else
      parent_done
    end
  when 'colgroup'
    # Optional unless followed by a space character or a comment.
    # XXX: also kept in front of another colgroup; see is_optional_start.
    if starts_element
      next_name != 'colgroup'
    else
      ![:Comment, :SpaceCharacters].include?(following)
    end
  when 'thead', 'tbody'
    # Optional before a tbody or tfoot; only tbody may also be dropped
    # when the parent ends.
    if starts_element
      %w(tbody tfoot).include?(next_name)
    else
      tagname == 'tbody' && parent_done
    end
  when 'tfoot'
    # Optional before a tbody, or when the parent ends.
    starts_element ? next_name == 'tbody' : parent_done
  when 'td', 'th'
    # Optional before another cell, or when the parent ends.
    starts_element ? %w(td th).include?(next_name) : parent_done
  else
    false
  end
end
|
196
|
+
end
|
197
|
+
end
|
198
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# adapted from feedvalidator, original copyright license is
|
2
|
+
#
|
3
|
+
# Copyright (c) 2002-2006, Sam Ruby, Mark Pilgrim, Joseph Walton, and Phil Ringnalda
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
# of this software and associated documentation files (the "Software"), to deal
|
7
|
+
# in the Software without restriction, including without limitation the rights
|
8
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
# copies of the Software, and to permit persons to whom the Software is
|
10
|
+
# furnished to do so, subject to the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be included in all
|
13
|
+
# copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
# SOFTWARE.
|
22
|
+
|
23
|
+
|
24
|
+
# MIME media-type syntax check, adapted from feedvalidator.
module RFC2046
  # One token: a run of characters that are neither whitespace nor one of
  # the separator characters ( ) < > , ; : " / [ ] ? =
  # (the same set the original feedvalidator pattern excluded).
  MIME_TOKEN = %r{[^\s()<>,;:"/\[\]?=]+}

  # type "/" subtype, followed by any number of ";"-separated
  # attribute=value parameters whose value is a token or a quoted string.
  MIME_TYPE = %r{\A#{MIME_TOKEN}/#{MIME_TOKEN}(?:\s*;\s*#{MIME_TOKEN}=(?:"(?:\\"|[^"])*"|#{MIME_TOKEN}))*\z}

  # Reports whether +value+ is shaped like a MIME media type.
  #
  # value - the string to check (nil is treated as invalid)
  #
  # Returns true or false. This used to return true unconditionally, which
  # meant an invalid type was never reported.
  def is_valid_mime_type(value)
    !!MIME_TYPE.match(value)
  end
end
|
31
|
+
|
@@ -0,0 +1,91 @@
|
|
1
|
+
# adapted from feedvalidator, original copyright license is
|
2
|
+
#
|
3
|
+
# Copyright (c) 2002-2006, Sam Ruby, Mark Pilgrim, Joseph Walton, and Phil Ringnalda
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
# of this software and associated documentation files (the "Software"), to deal
|
7
|
+
# in the Software without restriction, including without limitation the rights
|
8
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
# copies of the Software, and to permit persons to whom the Software is
|
10
|
+
# furnished to do so, subject to the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be included in all
|
13
|
+
# copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
# SOFTWARE.
|
22
|
+
|
23
|
+
# URI / IRI syntax checks, adapted from feedvalidator.
module RFC3987
  iana_schemes = [ # http://www.iana.org/assignments/uri-schemes.html
    "ftp", "http", "gopher", "mailto", "news", "nntp", "telnet", "wais",
    "file", "prospero", "z39.50s", "z39.50r", "cid", "mid", "vemmi",
    "service", "imap", "nfs", "acap", "rtsp", "tip", "pop", "data", "dav",
    "opaquelocktoken", "sip", "sips", "tel", "fax", "modem", "ldap",
    "https", "soap.beep", "soap.beeps", "xmlrpc.beep", "xmlrpc.beeps",
    "urn", "go", "h323", "ipp", "tftp", "mupdate", "pres", "im", "mtqp",
    "iris.beep", "dict", "snmp", "crid", "tag", "dns", "info"
  ]
  ALLOWED_SCHEMES = iana_schemes + ['javascript']

  # Relative or absolute reference (the scheme is optional).
  RFC2396 = Regexp.new("^([a-zA-Z][0-9a-zA-Z+\\-\\.]*:)?/{0,2}[0-9a-zA-Z;/?:@&=+$\\.\\-_!~*'()%,#]*$", Regexp::MULTILINE)
  # Reference that must carry a scheme. This has to be a constant: a
  # module-level local variable is not visible inside the methods below.
  RFC2396_FULL = Regexp.new("[a-zA-Z][0-9a-zA-Z+\\-\\.]*:(//)?[0-9a-zA-Z;/?:@&=+$\\.\\-_!~*'()%,#]+$")
  # URN and TAG are regexp literals so that escapes such as \d reach the
  # regexp engine; inside a double-quoted string "\d" is just "d". They are
  # anchored with \A and \z so the whole value has to match, not one line.
  URN = %r{\A[Uu][Rr][Nn]:[a-zA-Z0-9][a-zA-Z0-9-]{1,31}:([a-zA-Z0-9()+,.:=@;$_!*'\-]|%[0-9A-Fa-f]{2})+\z}
  TAG = %r{\Atag:([a-z0-9\-._]+?@)?[a-z0-9.\-]+?,\d{4}(-\d{2}(-\d{2})?)?:[0-9a-zA-Z;/?:@&=+$.\-_!~*'()%,]*(#[0-9a-zA-Z;/?:@&=+$.\-_!~*'()%,]*)?\z}

  # Checks +value+ against +uri_pattern+ plus scheme-specific rules.
  #
  # value       - the candidate URI string
  # uri_pattern - Regexp the whole value has to match (default RFC2396)
  #
  # Returns a two-element array: [true, ""] when the value is accepted,
  # otherwise [false, error_code] where error_code is one of
  # "invalid-tag-uri", "invalid-urn", "invalid-uri-char", "uri-not-iri",
  # "invalid-uri", "invalid-http-or-ftp-uri" or "invalid-scheme".
  def is_valid_uri(value, uri_pattern = RFC2396)
    # split can return an empty array (e.g. for ":"), hence the to_s.
    scheme = value.split(':').first.to_s.downcase
    match = uri_pattern.match(value)

    if scheme == 'tag'
      return false, "invalid-tag-uri" unless TAG.match(value)
    elsif scheme == "urn"
      return false, "invalid-urn" unless URN.match(value)
    elsif match.to_a.reject { |i| i == '' }.compact.empty? || match[0] != value
      # The pattern did not cover the whole value; find out why.
      urichars = Regexp.new("^[0-9a-zA-Z;/?:@&=+$\\.\\-_!~*'()%,#]$", Regexp::MULTILINE)
      if value.length > 0
        non_ascii = false
        value.each_byte do |b|
          if b >= 128
            non_ascii = true
          elsif !urichars.match([b].pack('c*'))
            return false, "invalid-uri-char"
          end
        end
        # Pure-ASCII input made of legal characters that still fails the
        # pattern is malformed. Input containing non-ASCII bytes keeps the
        # previous lenient behaviour and falls through as accepted.
        return false, "invalid-uri" unless non_ascii
      else
        begin
          # NOTE(review): 'idna' looks like a carry-over from the Python
          # original; if the conversion is unavailable the error is
          # swallowed on purpose and the value is reported as invalid.
          if uri_pattern.match(value.encode('idna'))
            return false, "uri-not-iri"
          end
        rescue
        end
        return false, "invalid-uri"
      end
    elsif ['http', 'ftp'].include?(scheme)
      return false, "invalid-http-or-ftp-uri" unless value.match(%r{^\w+://[^/].*})
    elsif value.index(':') && scheme.match(/^[a-z]+$/) && !ALLOWED_SCHEMES.include?(scheme)
      return false, "invalid-scheme"
    end
    return true, ""
  end

  # Validates +value+ as an IRI: attempts an 'idna' conversion first
  # (best effort, failures are ignored) and then applies is_valid_uri.
  #
  # Returns the same [ok, error_code] pair as is_valid_uri.
  def is_valid_iri(value)
    begin
      if value.length > 0
        value = value.encode('idna')
      end
    rescue
    end
    is_valid_uri(value)
  end

  # Like is_valid_uri, but the value must include a scheme.
  #
  # Returns the same [ok, error_code] pair as is_valid_uri.
  def is_valid_fully_qualified_uri(value)
    is_valid_uri(value, RFC2396_FULL)
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'html5/filters/base'
|
2
|
+
require 'html5/sanitizer'
|
3
|
+
|
4
|
+
module HTML5
|
5
|
+
module Filters
|
6
|
+
class HTMLSanitizeFilter < Base
|
7
|
+
include HTMLSanitizeModule
|
8
|
+
# Yields each token of the wrapped stream after passing it through
# +sanitize_token+.
def each
  __getobj__.each { |raw| yield(sanitize_token(raw)) }
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,834 @@
|
|
1
|
+
# HTML 5 conformance checker
|
2
|
+
#
|
3
|
+
# Warning: this module is experimental, incomplete, and subject to removal at any time.
|
4
|
+
#
|
5
|
+
# Usage:
|
6
|
+
# >>> from html5lib.html5parser import HTMLParser
|
7
|
+
# >>> from html5lib.filters.validator import HTMLConformanceChecker
|
8
|
+
# >>> p = HTMLParser(tokenizer=HTMLConformanceChecker)
|
9
|
+
# >>> p.parse('<!doctype html>\n<html foo=bar></html>')
|
10
|
+
# <<class 'html5lib.treebuilders.simpletree.Document'> nil>
|
11
|
+
# >>> p.errors
|
12
|
+
# [((2, 14), 'unknown-attribute', {'attributeName' => u'foo', 'tagName' => u'html'})]
|
13
|
+
|
14
|
+
require 'html5/constants'
|
15
|
+
require 'html5/filters/base'
|
16
|
+
require 'html5/filters/iso639codes'
|
17
|
+
require 'html5/filters/rfc3987'
|
18
|
+
require 'html5/filters/rfc2046'
|
19
|
+
|
20
|
+
# Marker wrapped around the message strings below; hands +str+ back unchanged.
def _(str)
  str
end
|
21
|
+
|
22
|
+
class String
  # lifted from rails
  #
  # Returns a snake_case copy of the receiver: "::" becomes "/", a "_" is
  # inserted at CamelCase word boundaries, "-" becomes "_" and the result
  # is lower-cased.
  def underscore()
    snake = gsub(/::/, '/')
    # Split an acronym from the capitalised word that follows it.
    snake = snake.gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2')
    # Split a lower-case letter or digit from the capital that follows it.
    snake = snake.gsub(/([a-z\d])([A-Z])/, '\1_\2')
    snake.tr("-", "_").downcase
  end
end
|
32
|
+
|
33
|
+
# Registers the conformance checker's error messages in HTML5::E, keyed by
# error code. Placeholders are written as %(name).
# NOTE(review): the code that expands the placeholders is not in this
# file; every entry uses the same %(name) spelling so that it can.
HTML5::E.update({
  "unknown-start-tag" =>
    _("Unknown start tag <%(tagName)>."),
  "unknown-attribute" =>
    _("Unknown '%(attributeName)' attribute on <%(tagName)>."),
  "missing-required-attribute" =>
    _("The '%(attributeName)' attribute is required on <%(tagName)>."),
  "unknown-input-type" =>
    _("Illegal value for attribute on <input type='%(inputType)'>."),
  "attribute-not-allowed-on-this-input-type" =>
    _("The '%(attributeName)' attribute is not allowed on <input type=%(inputType)>."),
  "deprecated-attribute" =>
    _("This attribute is deprecated: '%(attributeName)' attribute on <%(tagName)>."),
  "duplicate-value-in-token-list" =>
    _("Duplicate value in token list: '%(attributeValue)' in '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-attribute-value" =>
    _("Invalid attribute value: '%(attributeName)' attribute on <%(tagName)>."),
  "space-in-id" =>
    _("Whitespace is not allowed here: '%(attributeName)' attribute on <%(tagName)>."),
  "duplicate-id" =>
    _("This ID was already defined earlier: 'id' attribute on <%(tagName)>."),
  "attribute-value-can-not-be-blank" =>
    _("This value can not be blank: '%(attributeName)' attribute on <%(tagName)>."),
  "id-does-not-exist" =>
    _("This value refers to a non-existent ID: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-enumerated-value" =>
    _("Value must be one of %(enumeratedValues): '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-boolean-value" =>
    _("Value must be one of %(enumeratedValues): '%(attributeName)' attribute on <%(tagName)>."),
  "contextmenu-must-point-to-menu" =>
    _("The contextmenu attribute must point to an ID defined on a <menu> element."),
  "invalid-lang-code" =>
    _("Invalid language code: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-integer-value" =>
    _("Value must be an integer: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-root-namespace" =>
    _("Root namespace must be 'http://www.w3.org/1999/xhtml', or omitted."),
  "invalid-browsing-context" =>
    _("Value must be one of ('_self', '_parent', '_top'), or a name that does not start with '_' => '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-tag-uri" =>
    _("Invalid URI: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-urn" =>
    _("Invalid URN: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-uri-char" =>
    _("Illegal character in URI: '%(attributeName)' attribute on <%(tagName)>."),
  "uri-not-iri" =>
    _("Expected a URI but found an IRI: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-uri" =>
    _("Invalid URI: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-http-or-ftp-uri" =>
    _("Invalid URI: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-scheme" =>
    _("Unregistered URI scheme: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-rel" =>
    _("Invalid link relation: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-mime-type" =>
    _("Invalid MIME type: '%(attributeName)' attribute on <%(tagName)>."),
})
|
91
|
+
|
92
|
+
|
93
|
+
class HTMLConformanceChecker < HTML5::Filters::Base
|
94
|
+
|
95
|
+
include RFC3987
|
96
|
+
include ISO639Codes
|
97
|
+
include RFC2046
|
98
|
+
|
99
|
+
@@global_attributes = %w[class contenteditable contextmenu dir
|
100
|
+
draggable id irrelevant lang ref tabindex template
|
101
|
+
title onabort onbeforeunload onblur onchange onclick
|
102
|
+
oncontextmenu ondblclick ondrag ondragend ondragenter
|
103
|
+
ondragleave ondragover ondragstart ondrop onerror
|
104
|
+
onfocus onkeydown onkeypress onkeyup onload onmessage
|
105
|
+
onmousedown onmousemove onmouseout onmouseover onmouseup
|
106
|
+
onmousewheel onresize onscroll onselect onsubmit onunload]
|
107
|
+
# XXX lang in HTML only, xml:lang in XHTML only
|
108
|
+
# XXX validate ref, template
|
109
|
+
|
110
|
+
@@allowed_attribute_map = {
|
111
|
+
'html' => %w[xmlns],
|
112
|
+
'head' => [],
|
113
|
+
'title' => [],
|
114
|
+
'base' => %w[href target],
|
115
|
+
'link' => %w[href rel media hreflang type],
|
116
|
+
'meta' => %w[name http-equiv content charset], # XXX charset in HTML only
|
117
|
+
'style' => %w[media type scoped],
|
118
|
+
'body' => [],
|
119
|
+
'section' => [],
|
120
|
+
'nav' => [],
|
121
|
+
'article' => [],
|
122
|
+
'blockquote' => %w[cite],
|
123
|
+
'aside' => [],
|
124
|
+
'h1' => [],
|
125
|
+
'h2' => [],
|
126
|
+
'h3' => [],
|
127
|
+
'h4' => [],
|
128
|
+
'h5' => [],
|
129
|
+
'h6' => [],
|
130
|
+
'header' => [],
|
131
|
+
'footer' => [],
|
132
|
+
'address' => [],
|
133
|
+
'p' => [],
|
134
|
+
'hr' => [],
|
135
|
+
'br' => [],
|
136
|
+
'dialog' => [],
|
137
|
+
'pre' => [],
|
138
|
+
'ol' => %w[start],
|
139
|
+
'ul' => [],
|
140
|
+
'li' => %w[value], # XXX depends on parent
|
141
|
+
'dl' => [],
|
142
|
+
'dt' => [],
|
143
|
+
'dd' => [],
|
144
|
+
'a' => %w[href target ping rel media hreflang type],
|
145
|
+
'q' => %w[cite],
|
146
|
+
'cite' => [],
|
147
|
+
'em' => [],
|
148
|
+
'strong' => [],
|
149
|
+
'small' => [],
|
150
|
+
'm' => [],
|
151
|
+
'dfn' => [],
|
152
|
+
'abbr' => [],
|
153
|
+
'time' => %w[datetime],
|
154
|
+
'meter' => %w[value min low high max optimum],
|
155
|
+
'progress' => %w[value max],
|
156
|
+
'code' => [],
|
157
|
+
'var' => [],
|
158
|
+
'samp' => [],
|
159
|
+
'kbd' => [],
|
160
|
+
'sup' => [],
|
161
|
+
'sub' => [],
|
162
|
+
'span' => [],
|
163
|
+
'i' => [],
|
164
|
+
'b' => [],
|
165
|
+
'bdo' => [],
|
166
|
+
'ins' => %w[cite datetime],
|
167
|
+
'del' => %w[cite datetime],
|
168
|
+
'figure' => [],
|
169
|
+
'img' => %w[alt src usemap ismap height width], # XXX ismap depends on parent
|
170
|
+
'iframe' => %w[src],
|
171
|
+
# <embed> handled separately
|
172
|
+
'object' => %w[data type usemap height width],
|
173
|
+
'param' => %w[name value],
|
174
|
+
'video' => %w[src autoplay start loopstart loopend end loopcount controls],
|
175
|
+
'audio' => %w[src autoplay start loopstart loopend end loopcount controls],
|
176
|
+
'source' => %w[src type media],
|
177
|
+
'canvas' => %w[height width],
|
178
|
+
'map' => [],
|
179
|
+
'area' => %w[alt coords shape href target ping rel media hreflang type],
|
180
|
+
'table' => [],
|
181
|
+
'caption' => [],
|
182
|
+
'colgroup' => %w[span], # XXX only if element contains no <col> elements
|
183
|
+
'col' => %w[span],
|
184
|
+
'tbody' => [],
|
185
|
+
'thead' => [],
|
186
|
+
'tfoot' => [],
|
187
|
+
'tr' => [],
|
188
|
+
'td' => %w[colspan rowspan],
|
189
|
+
'th' => %w[colspan rowspan scope],
|
190
|
+
# all possible <input> attributes are listed here but <input> is really handled separately
|
191
|
+
'input' => %w[accept accesskey action alt autocomplete autofocus checked
|
192
|
+
disabled enctype form inputmode list maxlength method min
|
193
|
+
max name pattern step readonly replace required size src
|
194
|
+
tabindex target template value
|
195
|
+
],
|
196
|
+
'form' => %w[action method enctype accept name onsubmit onreset accept-charset
|
197
|
+
data replace
|
198
|
+
],
|
199
|
+
'button' => %w[action enctype method replace template name value type disabled form autofocus], # XXX may need matrix of acceptable attributes based on value of type attribute (like input)
|
200
|
+
'select' => %w[name size multiple disabled data accesskey form autofocus],
|
201
|
+
'optgroup' => %w[disabled label],
|
202
|
+
'option' => %w[selected disabled label value],
|
203
|
+
'textarea' => %w[maxlength name rows cols disabled readonly required form autofocus wrap accept],
|
204
|
+
'label' => %w[for accesskey form],
|
205
|
+
'fieldset' => %w[disabled form],
|
206
|
+
'output' => %w[form name for onforminput onformchange],
|
207
|
+
'datalist' => %w[data],
|
208
|
+
# XXX repetition model for repeating form controls
|
209
|
+
'script' => %w[src defer async type],
|
210
|
+
'noscript' => [],
|
211
|
+
'noembed' => [],
|
212
|
+
'event-source' => %w[src],
|
213
|
+
'details' => %w[open],
|
214
|
+
'datagrid' => %w[multiple disabled],
|
215
|
+
'command' => %w[type label icon hidden disabled checked radiogroup default],
|
216
|
+
'menu' => %w[type label autosubmit],
|
217
|
+
'datatemplate' => [],
|
218
|
+
'rule' => [],
|
219
|
+
'nest' => [],
|
220
|
+
'legend' => [],
|
221
|
+
'div' => [],
|
222
|
+
'font' => %w[style]
|
223
|
+
}
|
224
|
+
|
225
|
+
@@required_attribute_map = {
|
226
|
+
'link' => %w[href rel],
|
227
|
+
'bdo' => %w[dir],
|
228
|
+
'img' => %w[src],
|
229
|
+
'embed' => %w[src],
|
230
|
+
'object' => [], # XXX one of 'data' or 'type' is required
|
231
|
+
'param' => %w[name value],
|
232
|
+
'source' => %w[src],
|
233
|
+
'map' => %w[id]
|
234
|
+
}
|
235
|
+
|
236
|
+
@@input_type_allowed_attribute_map = {
|
237
|
+
'text' => %w[accesskey autocomplete autofocus disabled form inputmode list maxlength name pattern readonly required size tabindex value],
|
238
|
+
'password' => %w[accesskey autocomplete autofocus disabled form inputmode maxlength name pattern readonly required size tabindex value],
|
239
|
+
'checkbox' => %w[accesskey autofocus checked disabled form name required tabindex value],
|
240
|
+
'radio' => %w[accesskey autofocus checked disabled form name required tabindex value],
|
241
|
+
'button' => %w[accesskey autofocus disabled form name tabindex value],
|
242
|
+
'submit' => %w[accesskey action autofocus disabled enctype form method name replace tabindex target value],
|
243
|
+
'reset' => %w[accesskey autofocus disabled form name tabindex value],
|
244
|
+
'add' => %w[accesskey autofocus disabled form name tabindex template value],
|
245
|
+
'remove' => %w[accesskey autofocus disabled form name tabindex value],
|
246
|
+
'move-up' => %w[accesskey autofocus disabled form name tabindex value],
|
247
|
+
'move-down' => %w[accesskey autofocus disabled form name tabindex value],
|
248
|
+
'file' => %w[accept accesskey autofocus disabled form min max name required tabindex],
|
249
|
+
'hidden' => %w[disabled form name value],
|
250
|
+
'image' => %w[accesskey action alt autofocus disabled enctype form method name replace src tabindex target],
|
251
|
+
'datetime' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
|
252
|
+
'datetime-local' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
|
253
|
+
'date' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
|
254
|
+
'month' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
|
255
|
+
'week' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
|
256
|
+
'time' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
|
257
|
+
'number' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
|
258
|
+
'range' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
|
259
|
+
'email' => %w[accesskey autocomplete autofocus disabled form inputmode list maxlength name pattern readonly required tabindex value],
|
260
|
+
'url' => %w[accesskey autocomplete autofocus disabled form inputmode list maxlength name pattern readonly required tabindex value],
|
261
|
+
}
|
262
|
+
|
263
|
+
@@input_type_deprecated_attribute_map = {
|
264
|
+
'text' => ['size'],
|
265
|
+
'password' => ['size']
|
266
|
+
}
|
267
|
+
|
268
|
+
# rel keywords accepted by check_link_relation when the tag is <link>.
@@link_rel_values = %w[
  alternate archive archives author contact feed first begin start help icon
  index top contents toc last end license copyright next pingback prefetch
  prev previous search stylesheet sidebar tag up
]
# rel keywords accepted by check_link_relation for every other tag.
@@a_rel_values = %w[
  alternate archive archives author contact feed first begin start help
  index top contents toc last end license copyright next prev previous
  search sidebar tag up bookmark external nofollow
]
|
270
|
+
|
271
|
+
# Builds an HTML5::HTMLTokenizer from +stream+ and +args+, hands it to the
# superclass constructor, and resets the ID bookkeeping used by the
# whole-document checks in #eof.
def initialize(stream, *args)
  tokenizer = HTML5::HTMLTokenizer.new(stream, *args)
  super(tokenizer)
  # Tokens carrying an id attribute (see validate_attribute_value_id).
  @things_that_define_an_id = []
  # Tokens referring to an id (see validate_attribute_value_contextmenu).
  @things_that_point_to_an_id = []
  # Every id value seen so far, used for duplicate detection.
  @ids_we_have_known_and_loved = []
end
|
277
|
+
|
278
|
+
# Iterates over the wrapped token stream. For each token the most specific
# available validator is run (validate_<type>_<name>, falling back to
# validate_<type>); whatever it yields is passed on first, then the token
# itself. After the stream ends, the whole-document checks in #eof run.
def each
  __getobj__.each do |token|
    type_part = token.fetch(:type, '-').to_s.underscore
    name_part = token.fetch(:name, '-').to_s.underscore
    handler = ["validate_#{type_part}_#{name_part}",
               "validate_#{type_part}"].detect { |candidate| respond_to?(candidate) }
    send(handler, token) { |t| yield t } if handler
    yield token
  end
  eof { |t| yield t }
end
|
297
|
+
|
298
|
+
##########################################################################
|
299
|
+
# Start tag validation
|
300
|
+
##########################################################################
|
301
|
+
|
302
|
+
# Generic start-tag validation: runs the tag-name, required-attribute,
# unknown-attribute and attribute-value checks in that order, forwarding
# every ParseError they yield to the caller's block.
def validate_start_tag(token)
  check_unknown_start_tag(token) { |t| yield t }
  check_start_tag_required_attributes(token) { |t| yield t }
  check_start_tag_unknown_attributes(token) { |t| yield t }
  check_attribute_values(token) { |t| yield t }
end
|
314
|
+
|
315
|
+
# Start-tag validation for <embed>. The spec says "any attributes w/o
# namespace" are allowed, so check_start_tag_unknown_attributes is
# deliberately not called here.
def validate_start_tag_embed(token)
  check_start_tag_required_attributes(token) { |t| yield t }
  check_attribute_values(token) { |t| yield t }
end
|
325
|
+
|
326
|
+
# Start-tag validation for <input>. After the usual attribute-value checks
# it looks at the (lower-cased) attribute names and reports:
# * "unknown-input-type" when the type attribute (default "text") is not a
#   key of @@input_type_allowed_attribute_map,
# * "unknown-attribute" for names not allowed on <input> at all,
# * "attribute-not-allowed-on-this-input-type" for names allowed on <input>
#   but not on this type,
# * "deprecated-attribute" for names listed as deprecated for this type.
def validate_start_tag_input(token)
  check_attribute_values(token) { |t| yield t }

  attr_dict = Hash[*token[:data].map { |(name, value)| [name.downcase, value] }.flatten]
  input_type = attr_dict.fetch('type', "text")
  unless @@input_type_allowed_attribute_map.key?(input_type)
    yield({:type => "ParseError",
           :data => "unknown-input-type",
           :datavars => {:attrValue => input_type}})
  end

  allowed_attributes = @@input_type_allowed_attribute_map.fetch(input_type, [])
  deprecated_attributes = @@input_type_deprecated_attribute_map.fetch(input_type, [])
  attr_dict.each do |attr_name, _attr_value|
    if !@@allowed_attribute_map['input'].include?(attr_name)
      yield({:type => "ParseError",
             :data => "unknown-attribute",
             :datavars => {"tagName" => "input",
                           "attributeName" => attr_name}})
    elsif !allowed_attributes.include?(attr_name)
      yield({:type => "ParseError",
             :data => "attribute-not-allowed-on-this-input-type",
             :datavars => {"attributeName" => attr_name,
                           "inputType" => input_type}})
    end

    next unless deprecated_attributes.include?(attr_name)
    yield({:type => "ParseError",
           :data => "deprecated-attribute",
           :datavars => {"attributeName" => attr_name,
                         "inputType" => input_type}})
  end
end
|
358
|
+
|
359
|
+
##########################################################################
|
360
|
+
# Start tag validation helpers
|
361
|
+
##########################################################################
|
362
|
+
|
363
|
+
# Yields an "unknown-start-tag" ParseError when the lower-cased tag name is
# not a key of @@allowed_attribute_map.
def check_unknown_start_tag(token)
  name = (token[:name] || "").downcase
  return if @@allowed_attribute_map.key?(name)

  yield({:type => "ParseError",
         :data => "unknown-start-tag",
         :datavars => {"tagName" => name}})
end
|
372
|
+
|
373
|
+
# Yields one "missing-required-attribute" ParseError for every attribute
# that @@required_attribute_map lists for this tag but that is absent from
# the token's attribute data. Attribute names are compared as given.
def check_start_tag_required_attributes(token)
  name = (token[:name] || "").downcase
  return unless @@required_attribute_map.key?(name)

  attrs_present = (token[:data] || []).map { |pair| pair[0] }
  @@required_attribute_map[name].each do |attr_name|
    next if attrs_present.include?(attr_name)

    yield({:type => "ParseError",
           :data => "missing-required-attribute",
           :datavars => {"tagName" => name,
                         "attributeName" => attr_name}})
  end
end
|
388
|
+
|
389
|
+
# Yields an "unknown-attribute" ParseError for every attribute whose
# lower-cased name is neither a global attribute nor listed for this tag
# in @@allowed_attribute_map. The reported name keeps its original case.
def check_start_tag_unknown_attributes(token)
  name = token[:name].downcase
  allowed_attributes = @@global_attributes | @@allowed_attribute_map.fetch(name, [])
  token.fetch(:data, []).each do |attr_name, _attr_value|
    next if allowed_attributes.include?(attr_name.downcase)

    yield({:type => "ParseError",
           :data => "unknown-attribute",
           :datavars => {"tagName" => name,
                         "attributeName" => attr_name}})
  end
end
|
402
|
+
|
403
|
+
##########################################################################
|
404
|
+
# Attribute validation helpers
|
405
|
+
##########################################################################
|
406
|
+
|
407
|
+
# def checkURI(token, tag_name, attr_name, attr_value)
|
408
|
+
# is_valid, error_code = rfc3987.is_valid_uri(attr_value)
|
409
|
+
# if not is_valid
|
410
|
+
# yield {:type => "ParseError",
|
411
|
+
# :data => error_code,
|
412
|
+
# :datavars => {"tagName" => tag_name,
|
413
|
+
# "attributeName" => attr_name}}
|
414
|
+
# yield {:type => "ParseError",
|
415
|
+
# :data => "invalid-attribute-value",
|
416
|
+
# :datavars => {"tagName" => tag_name,
|
417
|
+
# "attributeName" => attr_name}}
|
418
|
+
|
419
|
+
# Validates +attr_value+ with is_valid_iri. When it is rejected, two
# ParseErrors are yielded: one carrying the specific error code returned
# by is_valid_iri, then a generic "invalid-attribute-value".
def check_iri(token, tag_name, attr_name, attr_value)
  is_valid, error_code = is_valid_iri(attr_value)
  return if is_valid

  [error_code, "invalid-attribute-value"].each do |code|
    yield({:type => "ParseError",
           :data => code,
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
  end
end
|
432
|
+
|
433
|
+
# Validates an ID-valued attribute.
#
# Yields "attribute-value-can-not-be-blank" when the value is nil or empty.
# Otherwise, if the value contains any character from
# HTML5::SPACE_CHARACTERS, yields "space-in-id" followed by
# "invalid-attribute-value" (once, no matter how many spaces there are).
#
# token     - the tokenizer token being validated (unused here).
# tag_name  - tag name reported in the error's datavars.
# attr_name - attribute name reported in the error's datavars.
def check_id(token, tag_name, attr_name, attr_value)
  if !attr_value || attr_value.length == 0
    yield({:type => "ParseError",
           :data => "attribute-value-can-not-be-blank",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
    # Nothing left to scan; without this a nil value crashed on each_byte.
    return
  end

  attr_value.each_byte do |b|
    c = [b].pack('c*')
    next unless HTML5::SPACE_CHARACTERS.include?(c)

    yield({:type => "ParseError",
           :data => "space-in-id",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
    yield({:type => "ParseError",
           :data => "invalid-attribute-value",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
    break
  end
end
|
455
|
+
|
456
|
+
# Splits +value+ into the tokens separated by characters from
# HTML5::SPACE_CHARACTERS and returns them as an Array of Strings.
# Leading, trailing and repeated separators produce no empty tokens.
# A nil value is treated as an empty string.
def parse_token_list(value)
  source = value.to_s
  tokens = []
  current = ''
  source.each_byte do |b|
    c = [b].pack('c*')
    if HTML5::SPACE_CHARACTERS.include?(c)
      tokens << current if current.length > 0
      current = ''
    else
      current += c
    end
  end
  tokens << current if current.length > 0

  # Rebuilding from bytes loses the string encoding on Rubies that track
  # one; restore it so non-ASCII tokens compare equal to the input text.
  if source.respond_to?(:encoding)
    tokens.each { |token| token.force_encoding(source.encoding) }
  end
  tokens
end
|
475
|
+
|
476
|
+
# Checks a space-separated token list attribute value
# (http://www.whatwg.org/specs/web-apps/current-work/#set-of) for
# repeated entries. "Token" here means an entry of the attribute value,
# not a tokenizer token; unlike the other check_* helpers this method
# does not receive the tokenizer token at all.
#
# Yields a single "duplicate-value-in-token-list" ParseError for the first
# repeated entry and stops.
def check_token_list(tag_name, attr_name, attr_value)
  seen = {}
  parse_token_list(attr_value).each do |entry|
    if seen.has_key?(entry)
      yield({:type => "ParseError",
             :data => "duplicate-value-in-token-list",
             :datavars => {"tagName" => tag_name,
                           "attributeName" => attr_name,
                           "attributeValue" => entry}})
      break
    end
    seen[entry] = 1
  end
end
|
495
|
+
|
496
|
+
# Validates an enumerated attribute.
#
# Yields "attribute-value-can-not-be-blank" when the value is nil or empty.
# Otherwise the value is compared case-insensitively against
# +enumerated_values+; on a mismatch "invalid-enumerated-value" and then
# "invalid-attribute-value" are yielded.
#
# The caller's +attr_value+ string is never modified.
def check_enumerated_value(token, tag_name, attr_name, attr_value, enumerated_values)
  if !attr_value || attr_value.length == 0
    yield({:type => "ParseError",
           :data => "attribute-value-can-not-be-blank",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
    return
  end

  # Compare a lower-cased copy: downcase! would rewrite the attribute value
  # inside the token being passed through, and raises on frozen strings.
  return if enumerated_values.include?(attr_value.downcase)

  yield({:type => "ParseError",
         :data => "invalid-enumerated-value",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name,
                       "enumeratedValues" => enumerated_values}})
  yield({:type => "ParseError",
         :data => "invalid-attribute-value",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
end
|
517
|
+
|
518
|
+
# Validates a boolean attribute: the value must be the empty string or the
# attribute's own name. The name comparison ignores case, matching how
# check_enumerated_value treats enumerated values. A nil value is invalid.
#
# On failure yields "invalid-boolean-value" (with the accepted values in
# datavars) followed by "invalid-attribute-value".
def check_boolean(token, tag_name, attr_name, attr_value)
  enumerated_values = [attr_name, '']
  if attr_value
    normalized = attr_value.downcase
    return if normalized == '' || normalized == attr_name.to_s.downcase
  end

  yield({:type => "ParseError",
         :data => "invalid-boolean-value",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name,
                       "enumeratedValues" => enumerated_values}})
  yield({:type => "ParseError",
         :data => "invalid-attribute-value",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
end
|
532
|
+
|
533
|
+
# Validates an integer-valued attribute with a small state machine:
# leading space characters are skipped, an optional '-' may follow, then
# at least one digit is required. Anything after the digits is ignored.
#
# Yields "invalid-integer-value" when a non-digit appears where a digit
# (or sign) is required, and "attribute-value-can-not-be-blank" when no
# digit was found at all (including a nil value).
def check_integer(token, tag_name, attr_name, attr_value)
  number_string = ''
  state = 'begin' # one of 'begin', 'initial-number', 'in-number', 'trailing-junk'
  error = {:type => "ParseError",
           :data => "invalid-integer-value",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name,
                         "attributeValue" => attr_value}}

  # /./m so newlines are examined too instead of being silently skipped.
  attr_value.to_s.scan(/./m) do |c|
    case state
    when 'begin'
      if HTML5::SPACE_CHARACTERS.include?(c)
        next
      elsif c == '-'
        state = 'initial-number'
      elsif HTML5::DIGITS.include?(c)
        number_string += c
        state = 'in-number'
      else
        yield error
        return
      end
    when 'initial-number'
      # A sign must be followed directly by a digit.
      unless HTML5::DIGITS.include?(c)
        yield error
        return
      end
      number_string += c
      state = 'in-number'
    when 'in-number'
      if HTML5::DIGITS.include?(c)
        number_string += c
      else
        state = 'trailing-junk'
      end
    end
    # Trailing junk is tolerated; nothing after it can change the outcome.
    break if state == 'trailing-junk'
  end

  if number_string.length == 0
    yield({:type => "ParseError",
           :data => "attribute-value-can-not-be-blank",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
  end
end
|
580
|
+
|
581
|
+
# Placeholder for floating-point attribute validation. Not implemented:
# every value is accepted and nothing is yielded.
def check_floating_point_number(token, tag_name, attr_name, attr_value)
  # XXX not implemented yet
end
|
584
|
+
|
585
|
+
# Validates a browsing-context name (e.g. a target attribute). Values that
# do not start with "_" are accepted; values that do must be one of
# _self, _parent, _top or _blank (compared case-insensitively), otherwise
# an "invalid-browsing-context" ParseError is yielded.
#
# The caller's +attr_value+ string is never modified.
def check_browsing_context(token, tag_name, attr_name, attr_value)
  return unless attr_value
  return unless attr_value[0, 1] == '_'
  # Lower-case a copy: downcase! would rewrite the value inside the token
  # being passed through, and raises on frozen strings.
  return if %w[_self _parent _top _blank].include?(attr_value.downcase)

  yield({:type => "ParseError",
         :data => "invalid-browsing-context",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
end
|
595
|
+
|
596
|
+
# Validates a language code with is_valid_lang_code. A nil or empty value
# is accepted; any other rejected value yields an "invalid-lang-code"
# ParseError carrying the offending value.
def check_lang_code(token, tag_name, attr_name, attr_value)
  return if !attr_value || attr_value == '' # blank is OK
  return if is_valid_lang_code(attr_value)

  yield({:type => "ParseError",
         :data => "invalid-lang-code",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name,
                       "attributeValue" => attr_value}})
end
|
606
|
+
|
607
|
+
# Validates a MIME type attribute.
#
# Yields "attribute-value-can-not-be-blank" for a nil or empty value and
# stops there; otherwise yields "invalid-mime-type" (with the offending
# value) when is_valid_mime_type rejects it.
def check_mime_type(token, tag_name, attr_name, attr_value)
  # XXX needs tests
  if !attr_value || attr_value.length == 0
    yield({:type => "ParseError",
           :data => "attribute-value-can-not-be-blank",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
    # A blank value has already been reported; don't hand it to the
    # MIME validator as well.
    return
  end
  return if is_valid_mime_type(attr_value)

  yield({:type => "ParseError",
         :data => "invalid-mime-type",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name,
                       "attributeValue" => attr_value}})
end
|
623
|
+
|
624
|
+
# Placeholder for media-query validation. Not implemented: every value is
# accepted and nothing is yielded.
def check_media_query(token, tag_name, attr_name, attr_value)
  # XXX not implemented yet
end
|
627
|
+
|
628
|
+
# Validates a rel attribute. First reports duplicate entries via
# check_token_list, then yields one "invalid-rel" ParseError for every
# entry that is not in the allowed list: @@link_rel_values when the tag is
# 'link', @@a_rel_values otherwise.
def check_link_relation(token, tag_name, attr_name, attr_value)
  check_token_list(tag_name, attr_name, attr_value) { |t| yield t }

  allowed_values = tag_name == 'link' ? @@link_rel_values : @@a_rel_values
  parse_token_list(attr_value).each do |relation|
    next if allowed_values.include?(relation)

    yield({:type => "ParseError",
           :data => "invalid-rel",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
  end
end
|
643
|
+
|
644
|
+
# Placeholder for date/time attribute validation. Not implemented: every
# value is accepted, nothing is yielded and nil is returned.
#
# TODO: implement as a character-by-character state machine in the style of
# check_integer (skip leading space characters, then consume digits, ...).
def check_date_time(token, tag_name, attr_name, attr_value)
  # XXX not implemented yet
end
|
654
|
+
|
655
|
+
##########################################################################
|
656
|
+
# Attribute validation
|
657
|
+
##########################################################################
|
658
|
+
|
659
|
+
# Runs the value validator for every attribute of +token+. For each
# attribute the tag-specific method validate_attribute_value_<tag>_<attr>
# is preferred; if it does not exist, validate_attribute_value_<attr> is
# used; if neither exists the attribute is skipped. Attribute names are
# lower-cased before lookup. Every ParseError the validator yields is
# forwarded to the caller's block.
def check_attribute_values(token)
  tag_name = token.fetch(:name, "")
  token.fetch(:data, []).each do |attr_name, attr_value|
    attr_name = attr_name.downcase
    attr_part = attr_name.to_s.underscore
    validator = ["validate_attribute_value_#{tag_name.to_s.underscore}_#{attr_part}",
                 "validate_attribute_value_#{attr_part}"].detect { |candidate| respond_to?(candidate) }
    next unless validator

    send(validator, token, tag_name, attr_name, attr_value) { |t| yield t }
  end
end
|
678
|
+
|
679
|
+
# class is a space-separated token list. Every problem reported by
# check_token_list is forwarded and followed by a generic
# "invalid-attribute-value" ParseError.
def validate_attribute_value_class(token, tag_name, attr_name, attr_value)
  check_token_list(tag_name, attr_name, attr_value) do |problem|
    yield problem
    yield({:type => "ParseError",
           :data => "invalid-attribute-value",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
  end
end
|
688
|
+
|
689
|
+
# contenteditable is checked as an enumerated attribute with the values
# 'true', 'false' and ''.
def validate_attribute_value_contenteditable(token, tag_name, attr_name, attr_value)
  accepted = ['true', 'false', '']
  check_enumerated_value(token, tag_name, attr_name, attr_value, accepted) { |t| yield t }
end
|
694
|
+
|
695
|
+
# dir is checked as an enumerated attribute with the values 'ltr' and 'rtl'.
def validate_attribute_value_dir(token, tag_name, attr_name, attr_value)
  accepted = ['ltr', 'rtl']
  check_enumerated_value(token, tag_name, attr_name, attr_value, accepted) { |t| yield t }
end
|
700
|
+
|
701
|
+
# draggable is checked as an enumerated attribute with the values 'true'
# and 'false'.
def validate_attribute_value_draggable(token, tag_name, attr_name, attr_value)
  accepted = ['true', 'false']
  check_enumerated_value(token, tag_name, attr_name, attr_value, accepted) { |t| yield t }
end
|
706
|
+
|
707
|
+
# Attributes validated on any tag by a shared helper.
alias validate_attribute_value_irrelevant check_boolean
alias validate_attribute_value_lang       check_lang_code
|
709
|
+
|
710
|
+
# contextmenu must hold a well-formed ID. The token is also remembered in
# @things_that_point_to_an_id so that #eof can check the reference once
# the whole document has been seen.
def validate_attribute_value_contextmenu(token, tag_name, attr_name, attr_value)
  check_id(token, tag_name, attr_name, attr_value) { |t| yield t }
  @things_that_point_to_an_id.push(token)
end
|
716
|
+
|
717
|
+
# Validates an id attribute and records it for later cross-checks.
#
# This method has side effects: the value is appended to
# @ids_we_have_known_and_loved and the token to @things_that_define_an_id,
# so that we can detect duplicated IDs here and, in #eof, verify that
# everything pointing at an ID (such as <span contextmenu>) points at one
# that actually exists. A repeated value yields a "duplicate-id"
# ParseError. A nil value is checked by check_id but not recorded.
def validate_attribute_value_id(token, tag_name, attr_name, attr_value)
  check_id(token, tag_name, attr_name, attr_value) { |t| yield t }
  return unless attr_value

  if @ids_we_have_known_and_loved.include?(attr_value)
    yield({:type => "ParseError",
           :data => "duplicate-id",
           :datavars => {"tagName" => tag_name}})
  end
  @ids_we_have_known_and_loved.push(attr_value)
  @things_that_define_an_id.push(token)
end
|
735
|
+
|
736
|
+
# tabindex on any tag is validated as an integer.
alias validate_attribute_value_tabindex check_integer
|
737
|
+
|
738
|
+
# Placeholder for validating the ref attribute. Not implemented: every
# value is accepted and nothing is yielded.
def validate_attribute_value_ref(token, tag_name, attr_name, attr_value)
  # XXX not implemented yet
end
|
741
|
+
|
742
|
+
# Placeholder for validating the template attribute. Not implemented:
# every value is accepted and nothing is yielded.
def validate_attribute_value_template(token, tag_name, attr_name, attr_value)
  # XXX not implemented yet
end
|
745
|
+
|
746
|
+
# The xmlns attribute on <html> must be exactly the XHTML namespace URI;
# any other value yields an "invalid-root-namespace" ParseError.
def validate_attribute_value_html_xmlns(token, tag_name, attr_name, attr_value)
  return if attr_value == "http://www.w3.org/1999/xhtml"

  yield({:type => "ParseError",
         :data => "invalid-root-namespace",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
end
|
754
|
+
|
755
|
+
# Tag-specific attribute validators that map straight onto a shared helper.
# <base>
alias validate_attribute_value_base_href       check_iri
alias validate_attribute_value_base_target     check_browsing_context
# <link>
alias validate_attribute_value_link_href       check_iri
alias validate_attribute_value_link_rel        check_link_relation
alias validate_attribute_value_link_media      check_media_query
alias validate_attribute_value_link_hreflang   check_lang_code
alias validate_attribute_value_link_type       check_mime_type
# XXX <meta> attributes
# <style>
alias validate_attribute_value_style_media     check_media_query
alias validate_attribute_value_style_type      check_mime_type
alias validate_attribute_value_style_scoped    check_boolean
# <blockquote>, <ol>, <li>
alias validate_attribute_value_blockquote_cite check_iri
alias validate_attribute_value_ol_start        check_integer
alias validate_attribute_value_li_value        check_integer
# XXX need tests from here on
# <a>
alias validate_attribute_value_a_href          check_iri
alias validate_attribute_value_a_target        check_browsing_context
|
772
|
+
|
773
|
+
# ping on <a> is a space-separated list of IRIs. Each entry is validated
# on its own with check_iri, and every ParseError it yields is forwarded.
def validate_attribute_value_a_ping(token, tag_name, attr_name, attr_value)
  parse_token_list(attr_value).each do |current_value|
    # Validate the individual entry, not the whole attribute value.
    check_iri(token, tag_name, attr_name, current_value) { |t| yield t }
  end
end
|
781
|
+
|
782
|
+
# <a> (continued)
alias validate_attribute_value_a_rel          check_link_relation
alias validate_attribute_value_a_media        check_media_query
alias validate_attribute_value_a_hreflang     check_lang_code
alias validate_attribute_value_a_type         check_mime_type
# <q>, <time>
alias validate_attribute_value_q_cite         check_iri
alias validate_attribute_value_time_datetime  check_date_time
# <meter>
alias validate_attribute_value_meter_value    check_floating_point_number
alias validate_attribute_value_meter_min      check_floating_point_number
alias validate_attribute_value_meter_low      check_floating_point_number
alias validate_attribute_value_meter_high     check_floating_point_number
alias validate_attribute_value_meter_max      check_floating_point_number
alias validate_attribute_value_meter_optimum  check_floating_point_number
# <progress>
alias validate_attribute_value_progress_value check_floating_point_number
alias validate_attribute_value_progress_max   check_floating_point_number
# <ins>, <del>
alias validate_attribute_value_ins_cite       check_iri
alias validate_attribute_value_ins_datetime   check_date_time
alias validate_attribute_value_del_cite       check_iri
alias validate_attribute_value_del_datetime   check_date_time
|
800
|
+
|
801
|
+
##########################################################################
|
802
|
+
# Whole document validation (IDs, etc.)
|
803
|
+
##########################################################################
|
804
|
+
|
805
|
+
# Whole-document checks, run once the token stream is exhausted.
#
# For every token recorded in @things_that_point_to_an_id the contextmenu
# value is resolved against the IDs collected during validation:
# * a blank value is skipped (check_id has already reported it),
# * an unknown ID yields "id-does-not-exist",
# * a known ID whose first defining token is not a <menu> yields
#   "contextmenu-must-point-to-menu".
def eof
  # Attribute data may already be a Hash (the original code relied on
  # html5parser normalizing it as a side effect) or still a list of
  # [name, value] pairs; accept both. For pairs, the first occurrence of a
  # name wins.
  as_hash = lambda do |data|
    if data.respond_to?(:has_key?)
      data
    else
      attrs = {}
      (data || []).each do |name, value|
        key = name.to_s.downcase
        attrs[key] = value unless attrs.has_key?(key)
      end
      attrs
    end
  end

  @things_that_point_to_an_id.each do |token|
    tag_name = token.fetch(:name, "").downcase
    attr_value = as_hash.call(token[:data]).fetch("contextmenu", "")
    # "" is truthy in Ruby, so blank values need an explicit test.
    next if !attr_value || attr_value.length == 0

    unless @ids_we_have_known_and_loved.include?(attr_value)
      yield({:type => "ParseError",
             :data => "id-does-not-exist",
             :datavars => {"tagName" => tag_name,
                           "attributeName" => "contextmenu",
                           "attributeValue" => attr_value}})
      next
    end

    # Only the first token defining this ID is considered.
    target = @things_that_define_an_id.detect do |ref_token|
      as_hash.call(ref_token.fetch(:data, {})).fetch("id", "") == attr_value
    end
    if target && target.fetch(:name, "").downcase != "menu"
      yield({:type => "ParseError",
             :data => "contextmenu-must-point-to-menu"})
    end
  end
end
|
834
|
+
end
|