spk-html5 0.10.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (74) hide show
  1. data/History.txt +10 -0
  2. data/Manifest.txt +73 -0
  3. data/README +45 -0
  4. data/Rakefile.rb +33 -0
  5. data/bin/html5 +7 -0
  6. data/lib/html5.rb +13 -0
  7. data/lib/html5/cli.rb +248 -0
  8. data/lib/html5/constants.rb +1061 -0
  9. data/lib/html5/filters/base.rb +10 -0
  10. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  11. data/lib/html5/filters/iso639codes.rb +755 -0
  12. data/lib/html5/filters/optionaltags.rb +198 -0
  13. data/lib/html5/filters/rfc2046.rb +31 -0
  14. data/lib/html5/filters/rfc3987.rb +91 -0
  15. data/lib/html5/filters/sanitizer.rb +15 -0
  16. data/lib/html5/filters/validator.rb +834 -0
  17. data/lib/html5/filters/whitespace.rb +36 -0
  18. data/lib/html5/html5parser.rb +247 -0
  19. data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
  20. data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
  21. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  22. data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  23. data/lib/html5/html5parser/after_head_phase.rb +55 -0
  24. data/lib/html5/html5parser/before_head_phase.rb +44 -0
  25. data/lib/html5/html5parser/before_html_phase.rb +41 -0
  26. data/lib/html5/html5parser/in_body_phase.rb +636 -0
  27. data/lib/html5/html5parser/in_caption_phase.rb +69 -0
  28. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  29. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  30. data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
  31. data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
  32. data/lib/html5/html5parser/in_head_phase.rb +143 -0
  33. data/lib/html5/html5parser/in_row_phase.rb +96 -0
  34. data/lib/html5/html5parser/in_select_phase.rb +90 -0
  35. data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
  36. data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
  37. data/lib/html5/html5parser/in_table_phase.rb +177 -0
  38. data/lib/html5/html5parser/initial_phase.rb +133 -0
  39. data/lib/html5/html5parser/phase.rb +171 -0
  40. data/lib/html5/inputstream.rb +735 -0
  41. data/lib/html5/liberalxmlparser.rb +158 -0
  42. data/lib/html5/sanitizer.rb +209 -0
  43. data/lib/html5/serializer.rb +2 -0
  44. data/lib/html5/serializer/htmlserializer.rb +179 -0
  45. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  46. data/lib/html5/sniffer.rb +45 -0
  47. data/lib/html5/tokenizer.rb +1059 -0
  48. data/lib/html5/treebuilders.rb +24 -0
  49. data/lib/html5/treebuilders/base.rb +339 -0
  50. data/lib/html5/treebuilders/hpricot.rb +231 -0
  51. data/lib/html5/treebuilders/rexml.rb +215 -0
  52. data/lib/html5/treebuilders/simpletree.rb +191 -0
  53. data/lib/html5/treewalkers.rb +26 -0
  54. data/lib/html5/treewalkers/base.rb +162 -0
  55. data/lib/html5/treewalkers/hpricot.rb +48 -0
  56. data/lib/html5/treewalkers/rexml.rb +48 -0
  57. data/lib/html5/treewalkers/simpletree.rb +48 -0
  58. data/lib/html5/version.rb +3 -0
  59. data/test/preamble.rb +69 -0
  60. data/test/test_cli.rb +16 -0
  61. data/test/test_encoding.rb +35 -0
  62. data/test/test_input_stream.rb +26 -0
  63. data/test/test_lxp.rb +283 -0
  64. data/test/test_parser.rb +63 -0
  65. data/test/test_sanitizer.rb +173 -0
  66. data/test/test_serializer.rb +67 -0
  67. data/test/test_sniffer.rb +27 -0
  68. data/test/test_stream.rb +71 -0
  69. data/test/test_tokenizer.rb +95 -0
  70. data/test/test_treewalkers.rb +135 -0
  71. data/test/test_validator.rb +31 -0
  72. data/test/tokenizer_test_parser.rb +67 -0
  73. data/test19.rb +38 -0
  74. metadata +198 -0
@@ -0,0 +1,198 @@
1
+ require 'html5/constants'
2
+ require 'html5/filters/base'
3
+
4
module HTML5
  module Filters

    # Token-stream filter that drops the start and end tags which the HTML5
    # syntax allows an author to omit. The wrapped stream is obtained through
    # __getobj__ and must yield token hashes carrying :type, :name and (for
    # start tags) :data.
    class OptionalTagFilter < Base
      # Walks the wrapped stream with a three-token window.
      #
      # Yields (previous, current, next) once per token; `previous` is nil for
      # the first token and `next` is nil for the last one. Nothing is yielded
      # when the stream is empty (an unguarded final yield would hand
      # (nil, nil, nil) to the caller, which `each` cannot process).
      def slider
        previous1 = previous2 = nil
        __getobj__.each do |token|
          yield previous2, previous1, token unless previous1.nil?
          previous2 = previous1
          previous1 = token
        end
        # Flush the last token, which has no successor.
        yield previous2, previous1, nil unless previous1.nil?
      end

      # Yields every token of the wrapped stream except the start/end tags
      # that may be omitted. A start tag is only dropped when it carries no
      # attributes (token[:data] is empty).
      def each
        slider do |previous, token, nexttok|
          type = token[:type]
          if type == :StartTag
            yield token unless token[:data].empty? && is_optional_start(token[:name], previous, nexttok)
          elsif type == :EndTag
            yield token unless is_optional_end(token[:name], nexttok)
          else
            yield token
          end
        end
      end

      # Returns true when the start tag named +tagname+ may be omitted.
      #
      # tagname  - element name of the start tag being considered.
      # previous - token immediately before the start tag (or nil).
      # nexttok  - token immediately after the start tag (or nil).
      def is_optional_start(tagname, previous, nexttok)
        type = nexttok ? nexttok[:type] : nil
        case tagname
        when 'html'
          # An html element's start tag may be omitted if the first thing
          # inside the html element is not a space character or a comment.
          !space_or_comment?(type)
        when 'head'
          # A head element's start tag may be omitted if the first thing
          # inside the head element is an element.
          type == :StartTag
        when 'body'
          # A body element's start tag may be omitted if the first thing
          # inside the body element is not a space character or a comment,
          # except if the first thing inside the body element is a script
          # or style element and the node immediately preceding the body
          # element is a head element whose end tag has been omitted.
          if space_or_comment?(type)
            false
          elsif type == :StartTag
            # XXX: we do not look at the preceding event, so we never omit
            # the body element's start tag if it's followed by a script or
            # a style element.
            !%w[script style].include?(nexttok[:name])
          else
            true
          end
        when 'colgroup'
          # A colgroup element's start tag may be omitted if the first thing
          # inside the colgroup element is a col element, and if the element
          # is not immediately preceded by another colgroup element whose
          # end tag has been omitted.
          # XXX: we do not look at the preceding event, so instead we never
          # omit the colgroup element's end tag when it is immediately
          # followed by another colgroup element. See is_optional_end.
          type == :StartTag && nexttok[:name] == "col"
        when 'tbody'
          # A tbody element's start tag may be omitted if the first thing
          # inside the tbody element is a tr element, and if the element is
          # not immediately preceded by a tbody, thead, or tfoot element
          # whose end tag has been omitted.
          if type == :StartTag
            # The preceding token is inspected: when it is the end tag of a
            # tbody, thead or tfoot (which is_optional_end may drop), the
            # start tag is always kept.
            if previous && previous[:type] == :EndTag && %w[tbody thead tfoot].include?(previous[:name])
              false
            else
              nexttok[:name] == 'tr'
            end
          else
            false
          end
        else
          false
        end
      end

      # Returns true when the end tag named +tagname+ may be omitted.
      #
      # tagname - element name of the end tag being considered.
      # nexttok - token immediately after the end tag (nil at end of stream).
      def is_optional_end(tagname, nexttok)
        type = nexttok ? nexttok[:type] : nil
        case tagname
        when 'html', 'head', 'body'
          # An html element's end tag may be omitted if the html element
          # is not immediately followed by a space character or a comment.
          # (The same test is applied to head and body.)
          !space_or_comment?(type)
        when 'li', 'optgroup', 'option', 'tr'
          # A li element's end tag may be omitted if the li element is
          # immediately followed by another li element or if there is
          # no more content in the parent element.
          # An optgroup element's end tag may be omitted if the optgroup
          # element is immediately followed by another optgroup element,
          # or if there is no more content in the parent element.
          # An option element's end tag may be omitted if the option
          # element is immediately followed by another option element,
          # or if there is no more content in the parent element.
          # A tr element's end tag may be omitted if the tr element is
          # immediately followed by another tr element, or if there is
          # no more content in the parent element.
          if type == :StartTag
            nexttok[:name] == tagname
          else
            no_more_content?(type)
          end
        when 'dt', 'dd'
          # A dt element's end tag may be omitted if the dt element is
          # immediately followed by another dt element or a dd element.
          # A dd element's end tag may be omitted if the dd element is
          # immediately followed by another dd element or a dt element,
          # or if there is no more content in the parent element.
          if type == :StartTag
            %w[dt dd].include?(nexttok[:name])
          elsif tagname == 'dd'
            no_more_content?(type)
          else
            false
          end
        when 'p'
          # A p element's end tag may be omitted if the p element is
          # immediately followed by an address, blockquote, dl, fieldset,
          # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
          # or ul element, or if there is no more content in the parent
          # element.
          if type == :StartTag
            %w[address blockquote dl fieldset form h1 h2 h3 h4 h5
               h6 hr menu ol p pre table ul].include?(nexttok[:name])
          else
            no_more_content?(type)
          end
        when 'colgroup'
          # A colgroup element's end tag may be omitted if the colgroup
          # element is not immediately followed by a space character or
          # a comment.
          if space_or_comment?(type)
            false
          elsif type == :StartTag
            # XXX: we also look for an immediately following colgroup
            # element. See is_optional_start.
            nexttok[:name] != 'colgroup'
          else
            true
          end
        when 'thead', 'tbody'
          # A thead element's end tag may be omitted if the thead element
          # is immediately followed by a tbody or tfoot element.
          # A tbody element's end tag may be omitted if the tbody element
          # is immediately followed by a tbody or tfoot element, or if
          # there is no more content in the parent element.
          # When the following element is a tbody, is_optional_start keeps
          # that tbody's start tag because it sees this end tag before it.
          if type == :StartTag
            %w[tbody tfoot].include?(nexttok[:name])
          elsif tagname == 'tbody'
            no_more_content?(type)
          else
            false
          end
        when 'tfoot'
          # A tfoot element's end tag may be omitted if the tfoot element
          # is immediately followed by a tbody element, or if there is no
          # more content in the parent element.
          # When the following element is a tbody, is_optional_start keeps
          # that tbody's start tag because it sees this end tag before it.
          if type == :StartTag
            nexttok[:name] == 'tbody'
          else
            no_more_content?(type)
          end
        when 'td', 'th'
          # A td element's end tag may be omitted if the td element is
          # immediately followed by a td or th element, or if there is
          # no more content in the parent element.
          # A th element's end tag may be omitted if the th element is
          # immediately followed by a td or th element, or if there is
          # no more content in the parent element.
          if type == :StartTag
            %w[td th].include?(nexttok[:name])
          else
            no_more_content?(type)
          end
        else
          false
        end
      end

      private

      # True when the neighbouring token is a comment or whitespace.
      def space_or_comment?(type)
        [:Comment, :SpaceCharacters].include?(type)
      end

      # True when the next token closes the parent element or the stream ended.
      def no_more_content?(type)
        type == :EndTag || type.nil?
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
+ # adapted from feedvalidator, original copyright license is
2
+ #
3
+ # Copyright (c) 2002-2006, Sam Ruby, Mark Pilgrim, Joseph Walton, and Phil Ringnalda
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+
24
# MIME media-type validation helpers.
module RFC2046
  # A single token: one or more characters that are neither whitespace nor
  # one of ( ) < > , ; : \ " / [ ] ? =  (the character set used by the
  # pattern that was previously left commented out here).
  MIME_TOKEN_PATTERN = /[^\s()<>,;:\\"\/\[\]?=]+/

  # type "/" subtype, followed by zero or more "; name=value" parameters
  # whose value is either a token or a double-quoted string.
  MIME_TYPE_PATTERN = /\A#{MIME_TOKEN_PATTERN}\/#{MIME_TOKEN_PATTERN}(?:\s*;\s*#{MIME_TOKEN_PATTERN}=(?:"(?:\\"|[^"])*"|#{MIME_TOKEN_PATTERN}))*\z/

  # Returns true when +value+ is a syntactically valid MIME type such as
  # "text/html" or "text/html; charset=utf-8", false otherwise.
  #
  # The previous implementation returned true for every input because the
  # original pattern did not compile as a Ruby regexp.
  def is_valid_mime_type(value)
    !!(MIME_TYPE_PATTERN =~ value.to_s)
  end
end
31
+
@@ -0,0 +1,91 @@
1
+ # adapted from feedvalidator, original copyright license is
2
+ #
3
+ # Copyright (c) 2002-2006, Sam Ruby, Mark Pilgrim, Joseph Walton, and Phil Ringnalda
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
# URI / IRI syntax checks. Each public method returns a two-element array
# [valid, error_code]; error_code is "" when valid is true.
module RFC3987
  iana_schemes = [ # http://www.iana.org/assignments/uri-schemes.html
    "ftp", "http", "gopher", "mailto", "news", "nntp", "telnet", "wais",
    "file", "prospero", "z39.50s", "z39.50r", "cid", "mid", "vemmi",
    "service", "imap", "nfs", "acap", "rtsp", "tip", "pop", "data", "dav",
    "opaquelocktoken", "sip", "sips", "tel", "fax", "modem", "ldap",
    "https", "soap.beep", "soap.beeps", "xmlrpc.beep", "xmlrpc.beeps",
    "urn", "go", "h323", "ipp", "tftp", "mupdate", "pres", "im", "mtqp",
    "iris.beep", "dict", "snmp", "crid", "tag", "dns", "info"
  ]
  ALLOWED_SCHEMES = iana_schemes + ['javascript']

  RFC2396 = Regexp.new("^([a-zA-Z][0-9a-zA-Z+\\-\\.]*:)?/{0,2}[0-9a-zA-Z;/?:@&=+$\\.\\-_!~*'()%,#]*$", Regexp::MULTILINE)
  # Must be a constant: a module-body local variable is not visible inside
  # `def`, so the former `rfc2396_full` local raised NameError when used.
  RFC2396_FULL = Regexp.new("[a-zA-Z][0-9a-zA-Z+\\-\\.]*:(//)?[0-9a-zA-Z;/?:@&=+$\\.\\-_!~*'()%,#]+$")
  # URN and TAG are regexp literals anchored with \A/\z. Built from a
  # double-quoted string, "\d" collapsed to "d" and ".\-_" turned into a
  # character range, so no tag URI could ever match.
  URN = /\A[Uu][Rr][Nn]:[a-zA-Z0-9][a-zA-Z0-9-]{1,31}:([a-zA-Z0-9()+,.:=@;$_!*'\-]|%[0-9A-Fa-f]{2})+\z/
  TAG = /\Atag:([a-z0-9\-._]+?@)?[a-z0-9.\-]+?,\d{4}(-\d{2}(-\d{2})?)?:[0-9a-zA-Z;\/?:@&=+$\.\-_!~*'()%,]*(#[0-9a-zA-Z;\/?:@&=+$\.\-_!~*'()%,]*)?\z/
  # A single character that may appear in a URI.
  URI_CHAR = /\A[0-9a-zA-Z;\/?:@&=+$\.\-_!~*'()%,#]\z/

  # Validates +value+ against +uri_pattern+ plus scheme-specific rules.
  #
  # Returns [true, ""] or [false, code] where code is one of
  # "invalid-tag-uri", "invalid-urn", "invalid-uri-char", "uri-not-iri",
  # "invalid-uri", "invalid-http-or-ftp-uri", "invalid-scheme".
  def is_valid_uri(value, uri_pattern = RFC2396)
    scheme = value.split(':').first
    scheme = scheme.downcase if scheme
    if scheme == 'tag'
      return [false, "invalid-tag-uri"] unless TAG.match(value)
    elsif scheme == "urn"
      return [false, "invalid-urn"] unless URN.match(value)
    elsif uri_pattern_mismatch?(uri_pattern, value)
      # Report an illegal ASCII character first, if there is one.
      value.each_byte do |b|
        return [false, "invalid-uri-char"] if b < 128 && !URI_CHAR.match(b.chr)
      end
      # Every ASCII character is legal, so the value is still not a URI:
      # either it only fails because of non-ASCII characters (it is an IRI)
      # or its overall shape is wrong. Previously this case fell through
      # and was reported as valid.
      non_ascii = value.each_byte.any? { |b| b >= 128 }
      if non_ascii && !uri_pattern_mismatch?(uri_pattern, iri_to_uri(value))
        return [false, "uri-not-iri"]
      end
      return [false, "invalid-uri"]
    elsif %w[http ftp].include?(scheme)
      return [false, "invalid-http-or-ftp-uri"] unless value.match(%r{^\w+://[^/].*})
    elsif value.index(':') && scheme.match(/^[a-z]+$/) && !ALLOWED_SCHEMES.include?(scheme)
      return [false, "invalid-scheme"]
    end
    [true, ""]
  end

  # Validates an IRI by mapping it to a URI (non-ASCII bytes are
  # percent-encoded) and validating the result with is_valid_uri.
  # Ruby has no 'idna' string encoding, so the former
  # value.encode('idna') call always raised and was silently ignored.
  def is_valid_iri(value)
    is_valid_uri(iri_to_uri(value))
  end

  # Like is_valid_uri, but the value must carry a scheme (RFC2396_FULL).
  def is_valid_fully_qualified_uri(value)
    is_valid_uri(value, RFC2396_FULL)
  end

  private

  # True when +pattern+ does not match, matches only the empty string, or
  # matches just a part of +value+.
  def uri_pattern_mismatch?(pattern, value)
    found = pattern.match(value)
    found.nil? || found[0].empty? || found[0] != value
  end

  # Percent-encodes every byte >= 0x80 of +value+; ASCII bytes are kept.
  def iri_to_uri(value)
    value.each_byte.map { |b| b < 128 ? b.chr : format('%%%02X', b) }.join
  end
end
@@ -0,0 +1,15 @@
1
+ require 'html5/filters/base'
2
+ require 'html5/sanitizer'
3
+
4
module HTML5
  module Filters
    # Filter that hands every token of the wrapped stream (__getobj__) to
    # sanitize_token and yields whatever that call returns.
    # sanitize_token is presumably supplied by HTMLSanitizeModule.
    class HTMLSanitizeFilter < Base
      include HTMLSanitizeModule

      # Yields the sanitized form of each token, in stream order.
      def each
        __getobj__.each { |token| yield(sanitize_token(token)) }
      end
    end
  end
end
@@ -0,0 +1,834 @@
1
+ # HTML 5 conformance checker
2
+ #
3
+ # Warning: this module is experimental, incomplete, and subject to removal at any time.
4
+ #
5
+ # Usage (NOTE: the example below is written in Python html5lib syntax, not Ruby):
6
+ # >>> from html5lib.html5parser import HTMLParser
7
+ # >>> from html5lib.filters.validator import HTMLConformanceChecker
8
+ # >>> p = HTMLParser(tokenizer=HTMLConformanceChecker)
9
+ # >>> p.parse('<!doctype html>\n<html foo=bar></html>')
10
+ # <<class 'html5lib.treebuilders.simpletree.Document'> nil>
11
+ # >>> p.errors
12
+ # [((2, 14), 'unknown-attribute', {'attributeName' => u'foo', 'tagName' => u'html'})]
13
+
14
+ require 'html5/constants'
15
+ require 'html5/filters/base'
16
+ require 'html5/filters/iso639codes'
17
+ require 'html5/filters/rfc3987'
18
+ require 'html5/filters/rfc2046'
19
+
20
# Gettext-style marker for translatable messages; returns +str+ unchanged.
def _(str)
  str
end
21
+
22
class String
  # Converts a CamelCase / namespaced / dashed name to snake_case
  # (lifted from Rails): "::" becomes "/", word boundaries get "_",
  # "-" becomes "_" and the result is lower-cased.
  def underscore
    with_paths    = gsub(/::/, '/')
    split_acronym = with_paths.gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2')
    split_words   = split_acronym.gsub(/([a-z\d])([A-Z])/, '\1_\2')
    split_words.tr("-", "_").downcase
  end
end
32
+
33
# Register the conformance checker's error messages in the shared HTML5::E
# table. Placeholders use the %(name) form; the variables available for a
# message are the "datavars" of the ParseError token that carries its key.
HTML5::E.update({
  "unknown-start-tag" =>
    _("Unknown start tag <%(tagName)>."),
  "unknown-attribute" =>
    _("Unknown '%(attributeName)' attribute on <%(tagName)>."),
  "missing-required-attribute" =>
    _("The '%(attributeName)' attribute is required on <%(tagName)>."),
  "unknown-input-type" =>
    _("Illegal value for attribute on <input type='%(inputType)'>."),
  "attribute-not-allowed-on-this-input-type" =>
    _("The '%(attributeName)' attribute is not allowed on <input type=%(inputType)>."),
  "deprecated-attribute" =>
    _("This attribute is deprecated: '%(attributeName)' attribute on <%(tagName)>."),
  "duplicate-value-in-token-list" =>
    _("Duplicate value in token list: '%(attributeValue)' in '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-attribute-value" =>
    _("Invalid attribute value: '%(attributeName)' attribute on <%(tagName)>."),
  "space-in-id" =>
    _("Whitespace is not allowed here: '%(attributeName)' attribute on <%(tagName)>."),
  "duplicate-id" =>
    _("This ID was already defined earlier: 'id' attribute on <%(tagName)>."),
  "attribute-value-can-not-be-blank" =>
    _("This value can not be blank: '%(attributeName)' attribute on <%(tagName)>."),
  "id-does-not-exist" =>
    _("This value refers to a non-existent ID: '%(attributeName)' attribute on <%(tagName)>."),
  # The next two messages (and invalid-integer-value) used the malformed
  # placeholder "<%tagName)>"; it is now "<%(tagName)>" like everywhere else.
  "invalid-enumerated-value" =>
    _("Value must be one of %(enumeratedValues): '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-boolean-value" =>
    _("Value must be one of %(enumeratedValues): '%(attributeName)' attribute on <%(tagName)>."),
  "contextmenu-must-point-to-menu" =>
    _("The contextmenu attribute must point to an ID defined on a <menu> element."),
  "invalid-lang-code" =>
    _("Invalid language code: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-integer-value" =>
    _("Value must be an integer: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-root-namespace" =>
    _("Root namespace must be 'http://www.w3.org/1999/xhtml', or omitted."),
  "invalid-browsing-context" =>
    _("Value must be one of ('_self', '_parent', '_top'), or a name that does not start with '_': '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-tag-uri" =>
    _("Invalid URI: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-urn" =>
    _("Invalid URN: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-uri-char" =>
    _("Illegal character in URI: '%(attributeName)' attribute on <%(tagName)>."),
  "uri-not-iri" =>
    _("Expected a URI but found an IRI: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-uri" =>
    _("Invalid URI: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-http-or-ftp-uri" =>
    _("Invalid URI: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-scheme" =>
    _("Unregistered URI scheme: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-rel" =>
    _("Invalid link relation: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-mime-type" =>
    _("Invalid MIME type: '%(attributeName)' attribute on <%(tagName)>.")
})
91
+
92
+
93
+ class HTMLConformanceChecker < HTML5::Filters::Base
94
+
95
+ include RFC3987
96
+ include ISO639Codes
97
+ include RFC2046
98
+
99
+ @@global_attributes = %w[class contenteditable contextmenu dir
100
+ draggable id irrelevant lang ref tabindex template
101
+ title onabort onbeforeunload onblur onchange onclick
102
+ oncontextmenu ondblclick ondrag ondragend ondragenter
103
+ ondragleave ondragover ondragstart ondrop onerror
104
+ onfocus onkeydown onkeypress onkeyup onload onmessage
105
+ onmousedown onmousemove onmouseout onmouseover onmouseup
106
+ onmousewheel onresize onscroll onselect onsubmit onunload]
107
+ # XXX lang in HTML only, xml:lang in XHTML only
108
+ # XXX validate ref, template
109
+
110
+ @@allowed_attribute_map = {
111
+ 'html' => %w[xmlns],
112
+ 'head' => [],
113
+ 'title' => [],
114
+ 'base' => %w[href target],
115
+ 'link' => %w[href rel media hreflang type],
116
+ 'meta' => %w[name http-equiv content charset], # XXX charset in HTML only
117
+ 'style' => %w[media type scoped],
118
+ 'body' => [],
119
+ 'section' => [],
120
+ 'nav' => [],
121
+ 'article' => [],
122
+ 'blockquote' => %w[cite],
123
+ 'aside' => [],
124
+ 'h1' => [],
125
+ 'h2' => [],
126
+ 'h3' => [],
127
+ 'h4' => [],
128
+ 'h5' => [],
129
+ 'h6' => [],
130
+ 'header' => [],
131
+ 'footer' => [],
132
+ 'address' => [],
133
+ 'p' => [],
134
+ 'hr' => [],
135
+ 'br' => [],
136
+ 'dialog' => [],
137
+ 'pre' => [],
138
+ 'ol' => %w[start],
139
+ 'ul' => [],
140
+ 'li' => %w[value], # XXX depends on parent
141
+ 'dl' => [],
142
+ 'dt' => [],
143
+ 'dd' => [],
144
+ 'a' => %w[href target ping rel media hreflang type],
145
+ 'q' => %w[cite],
146
+ 'cite' => [],
147
+ 'em' => [],
148
+ 'strong' => [],
149
+ 'small' => [],
150
+ 'm' => [],
151
+ 'dfn' => [],
152
+ 'abbr' => [],
153
+ 'time' => %w[datetime],
154
+ 'meter' => %w[value min low high max optimum],
155
+ 'progress' => %w[value max],
156
+ 'code' => [],
157
+ 'var' => [],
158
+ 'samp' => [],
159
+ 'kbd' => [],
160
+ 'sup' => [],
161
+ 'sub' => [],
162
+ 'span' => [],
163
+ 'i' => [],
164
+ 'b' => [],
165
+ 'bdo' => [],
166
+ 'ins' => %w[cite datetime],
167
+ 'del' => %w[cite datetime],
168
+ 'figure' => [],
169
+ 'img' => %w[alt src usemap ismap height width], # XXX ismap depends on parent
170
+ 'iframe' => %w[src],
171
+ # <embed> handled separately
172
+ 'object' => %w[data type usemap height width],
173
+ 'param' => %w[name value],
174
+ 'video' => %w[src autoplay start loopstart loopend end loopcount controls],
175
+ 'audio' => %w[src autoplay start loopstart loopend end loopcount controls],
176
+ 'source' => %w[src type media],
177
+ 'canvas' => %w[height width],
178
+ 'map' => [],
179
+ 'area' => %w[alt coords shape href target ping rel media hreflang type],
180
+ 'table' => [],
181
+ 'caption' => [],
182
+ 'colgroup' => %w[span], # XXX only if element contains no <col> elements
183
+ 'col' => %w[span],
184
+ 'tbody' => [],
185
+ 'thead' => [],
186
+ 'tfoot' => [],
187
+ 'tr' => [],
188
+ 'td' => %w[colspan rowspan],
189
+ 'th' => %w[colspan rowspan scope],
190
+ # all possible <input> attributes are listed here but <input> is really handled separately
191
+ 'input' => %w[accept accesskey action alt autocomplete autofocus checked
192
+ disabled enctype form inputmode list maxlength method min
193
+ max name pattern step readonly replace required size src
194
+ tabindex target template value
195
+ ],
196
+ 'form' => %w[action method enctype accept name onsubmit onreset accept-charset
197
+ data replace
198
+ ],
199
+ 'button' => %w[action enctype method replace template name value type disabled form autofocus], # XXX may need matrix of acceptable attributes based on value of type attribute (like input)
200
+ 'select' => %w[name size multiple disabled data accesskey form autofocus],
201
+ 'optgroup' => %w[disabled label],
202
+ 'option' => %w[selected disabled label value],
203
+ 'textarea' => %w[maxlength name rows cols disabled readonly required form autofocus wrap accept],
204
+ 'label' => %w[for accesskey form],
205
+ 'fieldset' => %w[disabled form],
206
+ 'output' => %w[form name for onforminput onformchange],
207
+ 'datalist' => %w[data],
208
+ # XXX repetition model for repeating form controls
209
+ 'script' => %w[src defer async type],
210
+ 'noscript' => [],
211
+ 'noembed' => [],
212
+ 'event-source' => %w[src],
213
+ 'details' => %w[open],
214
+ 'datagrid' => %w[multiple disabled],
215
+ 'command' => %w[type label icon hidden disabled checked radiogroup default],
216
+ 'menu' => %w[type label autosubmit],
217
+ 'datatemplate' => [],
218
+ 'rule' => [],
219
+ 'nest' => [],
220
+ 'legend' => [],
221
+ 'div' => [],
222
+ 'font' => %w[style]
223
+ }
224
+
225
+ @@required_attribute_map = {
226
+ 'link' => %w[href rel],
227
+ 'bdo' => %w[dir],
228
+ 'img' => %w[src],
229
+ 'embed' => %w[src],
230
+ 'object' => [], # XXX one of 'data' or 'type' is required
231
+ 'param' => %w[name value],
232
+ 'source' => %w[src],
233
+ 'map' => %w[id]
234
+ }
235
+
236
+ @@input_type_allowed_attribute_map = {
237
+ 'text' => %w[accesskey autocomplete autofocus disabled form inputmode list maxlength name pattern readonly required size tabindex value],
238
+ 'password' => %w[accesskey autocomplete autofocus disabled form inputmode maxlength name pattern readonly required size tabindex value],
239
+ 'checkbox' => %w[accesskey autofocus checked disabled form name required tabindex value],
240
+ 'radio' => %w[accesskey autofocus checked disabled form name required tabindex value],
241
+ 'button' => %w[accesskey autofocus disabled form name tabindex value],
242
+ 'submit' => %w[accesskey action autofocus disabled enctype form method name replace tabindex target value],
243
+ 'reset' => %w[accesskey autofocus disabled form name tabindex value],
244
+ 'add' => %w[accesskey autofocus disabled form name tabindex template value],
245
+ 'remove' => %w[accesskey autofocus disabled form name tabindex value],
246
+ 'move-up' => %w[accesskey autofocus disabled form name tabindex value],
247
+ 'move-down' => %w[accesskey autofocus disabled form name tabindex value],
248
+ 'file' => %w[accept accesskey autofocus disabled form min max name required tabindex],
249
+ 'hidden' => %w[disabled form name value],
250
+ 'image' => %w[accesskey action alt autofocus disabled enctype form method name replace src tabindex target],
251
+ 'datetime' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
252
+ 'datetime-local' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
253
+ 'date' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
254
+ 'month' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
255
+ 'week' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
256
+ 'time' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
257
+ 'number' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
258
+ 'range' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
259
+ 'email' => %w[accesskey autocomplete autofocus disabled form inputmode list maxlength name pattern readonly required tabindex value],
260
+ 'url' => %w[accesskey autocomplete autofocus disabled form inputmode list maxlength name pattern readonly required tabindex value],
261
+ }
262
+
263
+ @@input_type_deprecated_attribute_map = {
264
+ 'text' => ['size'],
265
+ 'password' => ['size']
266
+ }
267
+
268
+ @@link_rel_values = %w[alternate archive archives author contact feed first begin start help icon index top contents toc last end license copyright next pingback prefetch prev previous search stylesheet sidebar tag up]
269
+ @@a_rel_values = %w[alternate archive archives author contact feed first begin start help index top contents toc last end license copyright next prev previous search sidebar tag up bookmark external nofollow]
270
+
271
# Wraps an HTML5::HTMLTokenizer built from +stream+ and +args+, and resets
# the three id-tracking lists to empty arrays.
def initialize(stream, *args)
  tokenizer = HTML5::HTMLTokenizer.new(stream, *args)
  super(tokenizer)

  @things_that_define_an_id    = []
  @things_that_point_to_an_id  = []
  @ids_we_have_known_and_loved = []
end
277
+
278
# Yields every token of the wrapped stream, preceded by whatever the
# matching validator yields for it, then yields whatever eof yields.
#
# For a token the validator is looked up by name: first
# validate_<type>_<name>, then validate_<type> (both snake_cased via
# String#underscore; a missing :type or :name contributes '-'). Only the
# first one this object responds to is called.
def each
  __getobj__.each do |token|
    type_part = token.fetch(:type, '-').to_s.underscore
    name_part = token.fetch(:name, '-').to_s.underscore

    candidates = ["validate_#{type_part}_#{name_part}", "validate_#{type_part}"]
    handler = candidates.find { |candidate| respond_to?(candidate) }
    send(handler, token) { |reported| yield reported } if handler

    yield token
  end
  eof { |reported| yield reported }
end
297
+
298
+ ##########################################################################
299
+ # Start tag validation
300
+ ##########################################################################
301
+
302
# Generic start-tag validation: runs the unknown-tag, required-attribute,
# unknown-attribute and attribute-value checks in that order, passing on
# everything they yield to the caller's block.
def validate_start_tag(token)
  forward = proc { |reported| yield reported }

  check_unknown_start_tag(token, &forward)
  check_start_tag_required_attributes(token, &forward)
  check_start_tag_unknown_attributes(token, &forward)
  check_attribute_values(token, &forward)
end
314
+
315
# Start-tag validation for <embed>: required attributes and attribute
# values only. The spec says "any attributes w/o namespace", so
# check_start_tag_unknown_attributes is deliberately not called.
def validate_start_tag_embed(token)
  forward = proc { |reported| yield reported }

  check_start_tag_required_attributes(token, &forward)
  check_attribute_values(token, &forward)
end
325
+
326
# Start-tag validation for <input>. After the attribute-value check it
# yields ParseError hashes for:
#   * a type (default "text") missing from @@input_type_allowed_attribute_map,
#   * attributes unknown on <input> altogether,
#   * attributes known on <input> but not allowed for this type,
#   * attributes listed as deprecated for this type.
# NOTE(review): the unknown-input-type datavars use the symbol key
# :attrValue while every other datavars hash here uses string keys, and
# the message text refers to %(inputType) -- confirm against consumers.
def validate_start_tag_input(token)
  check_attribute_values(token) { |reported| yield reported }

  lowered_pairs = token[:data].collect { |(name, value)| [name.downcase, value] }
  attr_dict = Hash[*lowered_pairs.flatten]
  input_type = attr_dict.fetch('type', "text")

  unless @@input_type_allowed_attribute_map.key?(input_type)
    yield({:type => "ParseError",
           :data => "unknown-input-type",
           :datavars => {:attrValue => input_type}})
  end

  allowed_attributes = @@input_type_allowed_attribute_map.fetch(input_type, [])
  deprecated_attributes = @@input_type_deprecated_attribute_map.fetch(input_type, [])

  attr_dict.each_key do |attr_name|
    if !@@allowed_attribute_map['input'].include?(attr_name)
      yield({:type => "ParseError",
             :data => "unknown-attribute",
             :datavars => {"tagName" => "input",
                           "attributeName" => attr_name}})
    elsif !allowed_attributes.include?(attr_name)
      yield({:type => "ParseError",
             :data => "attribute-not-allowed-on-this-input-type",
             :datavars => {"attributeName" => attr_name,
                           "inputType" => input_type}})
    end

    next unless deprecated_attributes.include?(attr_name)

    yield({:type => "ParseError",
           :data => "deprecated-attribute",
           :datavars => {"attributeName" => attr_name,
                         "inputType" => input_type}})
  end
end
358
+
359
+ ##########################################################################
360
+ # Start tag validation helpers
361
+ ##########################################################################
362
+
363
# Yields an "unknown-start-tag" error when the token's (lower-cased) tag name
# has no entry in the allowed-attribute map. A missing name is treated as "".
def check_unknown_start_tag(token)
  tag = (token[:name] || "").downcase
  return if @@allowed_attribute_map.key?(tag)
  yield({:type => "ParseError",
         :data => "unknown-start-tag",
         :datavars => {"tagName" => tag}})
end
372
+
373
# Yields one "missing-required-attribute" error for every attribute that the
# required-attribute map lists for this tag but that the token does not carry.
# Tags without an entry in the map are ignored.
def check_start_tag_required_attributes(token)
  tag = (token[:name] || "").downcase
  return unless @@required_attribute_map.key?(tag)

  present = (token[:data] || []).map { |pair| pair[0] }
  @@required_attribute_map[tag].each do |required|
    next if present.include?(required)
    yield({:type => "ParseError",
           :data => "missing-required-attribute",
           :datavars => {"tagName" => tag,
                         "attributeName" => required}})
  end
end
388
+
389
# Yields an "unknown-attribute" error for every attribute on the token that is
# neither a global attribute nor listed for this tag in the allowed-attribute
# map. Attribute names are compared lower-cased; the error reports the name as
# written.
#
# A token without a :name or with a nil :data is tolerated (treated as "" and
# no attributes), matching the other start-tag helpers, instead of raising
# NoMethodError.
def check_start_tag_unknown_attributes(token)
  name = (token[:name] || "").downcase
  allowed_attributes = @@global_attributes | @@allowed_attribute_map.fetch(name, [])
  (token[:data] || []).each do |attr_name, _attr_value|
    next if allowed_attributes.include?(attr_name.downcase)
    yield({:type => "ParseError",
           :data => "unknown-attribute",
           :datavars => {"tagName" => name,
                         "attributeName" => attr_name}})
  end
end
402
+
403
+ ##########################################################################
404
+ # Attribute validation helpers
405
+ ##########################################################################
406
+
407
+ # def checkURI(token, tag_name, attr_name, attr_value)
408
+ # is_valid, error_code = rfc3987.is_valid_uri(attr_value)
409
+ # if not is_valid
410
+ # yield {:type => "ParseError",
411
+ # :data => error_code,
412
+ # :datavars => {"tagName" => tag_name,
413
+ # "attributeName" => attr_name}}
414
+ # yield {:type => "ParseError",
415
+ # :data => "invalid-attribute-value",
416
+ # :datavars => {"tagName" => tag_name,
417
+ # "attributeName" => attr_name}}
418
+
419
# Validates +attr_value+ as an IRI via is_valid_iri, which returns a
# [valid, error_code] pair. On failure two errors are yielded: one carrying
# the specific error code, then a generic "invalid-attribute-value".
def check_iri(token, tag_name, attr_name, attr_value)
  valid, reason = is_valid_iri(attr_value)
  return if valid

  report = lambda do |code|
    yield({:type => "ParseError",
           :data => code,
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
  end
  report.call(reason)
  report.call("invalid-attribute-value")
end
432
+
433
# Validates an ID-valued attribute.
#
# A nil or empty value yields "attribute-value-can-not-be-blank" and nothing
# else (previously a nil value went on to raise NoMethodError). A value that
# contains any of HTML5::SPACE_CHARACTERS yields "space-in-id" followed by
# "invalid-attribute-value"; only the first offending character is reported.
def check_id(token, tag_name, attr_name, attr_value)
  if !attr_value || attr_value.length == 0
    yield({:type => "ParseError",
           :data => "attribute-value-can-not-be-blank",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
    return
  end

  # Space characters are single bytes, so a byte-wise scan is sufficient and
  # cannot fail on malformed multi-byte input.
  has_space = false
  attr_value.each_byte do |byte|
    if HTML5::SPACE_CHARACTERS.include?(byte.chr)
      has_space = true
      break
    end
  end
  return unless has_space

  yield({:type => "ParseError",
         :data => "space-in-id",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
  yield({:type => "ParseError",
         :data => "invalid-attribute-value",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
end
455
+
456
# Splits an attribute value into its space-separated tokens.
#
# Runs of HTML5::SPACE_CHARACTERS act as separators; leading and trailing
# separators produce no empty tokens. The value is walked character by
# character (not byte by byte) so that tokens containing non-ASCII characters
# keep the encoding of the input and still compare equal to ordinary strings.
# A nil value is treated as an empty list.
#
# Returns an Array of token strings in order of appearance.
def parse_token_list(value)
  tokens = []
  pending = []
  value.to_s.each_char do |char|
    if HTML5::SPACE_CHARACTERS.include?(char)
      tokens << pending.join unless pending.empty?
      pending = []
    else
      pending << char
    end
  end
  tokens << pending.join unless pending.empty?
  tokens
end
475
+
476
# Checks a space-separated attribute value for repeated entries.
#
# "Token" here means an entry of the attribute value (see
# http://www.whatwg.org/specs/web-apps/current-work/#set-of), not a token
# produced by HTMLTokenizer. Only the first repeated entry is reported, as a
# single "duplicate-value-in-token-list" error.
def check_token_list(tag_name, attr_name, attr_value)
  seen = {}
  parse_token_list(attr_value).each do |entry|
    if seen.key?(entry)
      yield({:type => "ParseError",
             :data => "duplicate-value-in-token-list",
             :datavars => {"tagName" => tag_name,
                           "attributeName" => attr_name,
                           "attributeValue" => entry}})
      break
    end
    seen[entry] = 1
  end
end
495
+
496
# Validates an attribute whose value must be one of +enumerated_values+
# (compared case-insensitively; the list is expected to be lower-case).
#
# A nil or empty value yields "attribute-value-can-not-be-blank" only. Any
# other value not in the list yields "invalid-enumerated-value" followed by
# "invalid-attribute-value".
#
# The comparison works on a lower-cased copy: the attribute value belongs to
# the token being passed down the stream and must not be modified by the
# validator (it may also be frozen).
def check_enumerated_value(token, tag_name, attr_name, attr_value, enumerated_values)
  if !attr_value || attr_value.length == 0
    yield({:type => "ParseError",
           :data => "attribute-value-can-not-be-blank",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
    return
  end
  return if enumerated_values.include?(attr_value.downcase)

  # "attributeName" is the key every other error in this file uses.
  yield({:type => "ParseError",
         :data => "invalid-enumerated-value",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name,
                       "enumeratedValues" => enumerated_values}})
  yield({:type => "ParseError",
         :data => "invalid-attribute-value",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
end
517
+
518
# Validates a boolean attribute: the only accepted values are the attribute's
# own name and the empty string. Anything else (including nil) yields
# "invalid-boolean-value" followed by "invalid-attribute-value".
def check_boolean(token, tag_name, attr_name, attr_value)
  accepted = [attr_name, '']
  return if accepted.include?(attr_value)

  yield({:type => "ParseError",
         :data => "invalid-boolean-value",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name,
                       "enumeratedValues" => accepted}})
  yield({:type => "ParseError",
         :data => "invalid-attribute-value",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
end
532
+
533
# Validates an integer-valued attribute with a small state machine:
#
#   :begin       leading space characters are skipped; "-" or a digit starts
#                the number; anything else is an error
#   :after_sign  a digit must follow the "-"
#   :in_number   digits are consumed; the first non-digit ends the number and
#                everything after it is ignored (trailing junk is tolerated)
#
# Yields "invalid-integer-value" for a malformed start, or
# "attribute-value-can-not-be-blank" when no digit was found at all (empty,
# whitespace-only, a lone "-", or nil).
#
# Every character is examined, including newlines, so that e.g. "-\n5" is
# rejected rather than silently read as "-5".
def check_integer(token, tag_name, attr_name, attr_value)
  error = {:type => "ParseError",
           :data => "invalid-integer-value",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name,
                         "attributeValue" => attr_value}}
  digit_seen = false
  state = :begin
  attr_value.to_s.each_char do |c|
    case state
    when :begin
      next if HTML5::SPACE_CHARACTERS.include?(c)
      if c == '-'
        state = :after_sign
      elsif HTML5::DIGITS.include?(c)
        digit_seen = true
        state = :in_number
      else
        yield error
        return
      end
    when :after_sign
      unless HTML5::DIGITS.include?(c)
        yield error
        return
      end
      digit_seen = true
      state = :in_number
    when :in_number
      # First non-digit starts the ignored trailing junk.
      break unless HTML5::DIGITS.include?(c)
    end
  end
  return if digit_seen

  yield({:type => "ParseError",
         :data => "attribute-value-can-not-be-blank",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
end
580
+
581
# Placeholder for floating point validation. Not implemented yet: every value
# is accepted and nothing is yielded.
def check_floating_point_number(token, tag_name, attr_name, attr_value)
  nil # XXX validation still to be written
end
584
+
585
# Validates a browsing-context name (e.g. a "target" attribute).
#
# Names that do not start with "_" are always accepted. Names that do must be
# one of the reserved keywords _self, _parent, _top or _blank (compared
# case-insensitively); otherwise "invalid-browsing-context" is yielded.
#
# The keyword comparison uses a lower-cased copy so the attribute value held
# by the token is left exactly as the author wrote it.
def check_browsing_context(token, tag_name, attr_name, attr_value)
  return unless attr_value
  return unless attr_value.start_with?('_')
  return if ['_self', '_parent', '_top', '_blank'].include?(attr_value.downcase)

  yield({:type => "ParseError",
         :data => "invalid-browsing-context",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
end
595
+
596
# Validates a language code via is_valid_lang_code. A nil or empty value is
# acceptable; any other value that fails the check yields
# "invalid-lang-code".
def check_lang_code(token, tag_name, attr_name, attr_value)
  return if !attr_value || attr_value == '' # blank is OK
  return if is_valid_lang_code(attr_value)

  yield({:type => "ParseError",
         :data => "invalid-lang-code",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name,
                       "attributeValue" => attr_value}})
end
606
+
607
# Validates a MIME type attribute via is_valid_mime_type.
#
# A nil value yields "attribute-value-can-not-be-blank" and stops there;
# previously the nil was still handed to is_valid_mime_type, producing a
# second, misleading error or an exception. Any other value that fails the
# check yields "invalid-mime-type".
#
# XXX needs tests
def check_mime_type(token, tag_name, attr_name, attr_value)
  unless attr_value
    yield({:type => "ParseError",
           :data => "attribute-value-can-not-be-blank",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
    return
  end
  return if is_valid_mime_type(attr_value)

  yield({:type => "ParseError",
         :data => "invalid-mime-type",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name,
                       "attributeValue" => attr_value}})
end
623
+
624
# Placeholder for media query validation. Not implemented yet: every value is
# accepted and nothing is yielded.
def check_media_query(token, tag_name, attr_name, attr_value)
  nil # XXX validation still to be written
end
627
+
628
# Validates a "rel" attribute. First reports a duplicated entry (via
# check_token_list), then yields one "invalid-rel" error for every entry that
# is not in the list of relations permitted for the element: the <link> list
# when +tag_name+ is 'link', the <a> list otherwise.
def check_link_relation(token, tag_name, attr_name, attr_value)
  check_token_list(tag_name, attr_name, attr_value) { |error| yield error }

  permitted = (tag_name == 'link') ? @@link_rel_values : @@a_rel_values
  parse_token_list(attr_value).each do |relation|
    next if permitted.include?(relation)
    yield({:type => "ParseError",
           :data => "invalid-rel",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
  end
end
643
+
644
# Placeholder for date/time validation. Not implemented yet: every value is
# accepted, nothing is yielded and nil is returned (the previous version
# leaked the string 'begin' from an unused state variable).
#
# XXX sketch of the intended state machine:
#   state = 'begin' # ('begin', '...
#   for each character c of attr_value
#     if state == 'begin'
#       if SPACE_CHARACTERS.include?(c)
#         next
#       elsif digits.include?(c)
#         state = ...
def check_date_time(token, tag_name, attr_name, attr_value)
  nil
end
654
+
655
+ ##########################################################################
656
+ # Attribute validation
657
+ ##########################################################################
658
+
659
# Runs the value validator, if any, for every attribute on the token.
#
# For each attribute (name lower-cased) the most specific validator is used:
# "validate_attribute_value_<tag>_<attr>" first, then
# "validate_attribute_value_<attr>". Attributes with no validator are
# skipped. Errors yielded by a validator are forwarded to the caller.
def check_attribute_values(token)
  tag_name = token.fetch(:name, "")
  token.fetch(:data, []).each do |raw_name, attr_value|
    attr_name = raw_name.downcase
    tag_part = tag_name.to_s.underscore
    attr_part = attr_name.to_s.underscore
    candidates = ["validate_attribute_value_#{tag_part}_#{attr_part}",
                  "validate_attribute_value_#{attr_part}"]
    validator = candidates.find { |candidate| respond_to?(candidate) }
    next unless validator
    send(validator, token, tag_name, attr_name, attr_value) { |error| yield error }
  end
end
678
+
679
# Validates the "class" attribute as a token list. Every error reported by
# check_token_list is forwarded and immediately followed by a generic
# "invalid-attribute-value" error.
def validate_attribute_value_class(token, tag_name, attr_name, attr_value)
  check_token_list(tag_name, attr_name, attr_value) do |list_error|
    yield list_error
    generic = {:type => "ParseError",
               :data => "invalid-attribute-value",
               :datavars => {"tagName" => tag_name,
                             "attributeName" => attr_name}}
    yield generic
  end
end
688
+
689
# "contenteditable" is an enumerated attribute: 'true', 'false' or ''.
def validate_attribute_value_contenteditable(token, tag_name, attr_name, attr_value)
  permitted = ['true', 'false', '']
  check_enumerated_value(token, tag_name, attr_name, attr_value, permitted) { |error| yield error }
end
694
+
695
# "dir" is an enumerated attribute: 'ltr' or 'rtl'.
def validate_attribute_value_dir(token, tag_name, attr_name, attr_value)
  permitted = ['ltr', 'rtl']
  check_enumerated_value(token, tag_name, attr_name, attr_value, permitted) { |error| yield error }
end
700
+
701
# "draggable" is an enumerated attribute: 'true' or 'false'.
def validate_attribute_value_draggable(token, tag_name, attr_name, attr_value)
  permitted = ['true', 'false']
  check_enumerated_value(token, tag_name, attr_name, attr_value, permitted) { |error| yield error }
end
706
+
707
# Global attributes validated directly by a generic checker.
alias_method :validate_attribute_value_irrelevant, :check_boolean
alias_method :validate_attribute_value_lang, :check_lang_code
709
+
710
# Validates "contextmenu" as an ID value and remembers the token as something
# that points to an ID, so that #eof can later verify the target exists.
def validate_attribute_value_contextmenu(token, tag_name, attr_name, attr_value)
  check_id(token, tag_name, attr_name, attr_value) { |error| yield error }
  @things_that_point_to_an_id.push(token)
end
716
+
717
# Validates an "id" attribute and records it for whole-document checks.
#
# Side effects: unless the value is nil, it is appended to the list of IDs
# seen so far and the token is appended to the list of tokens that define an
# ID. Those lists let us detect duplicated IDs here and, in #eof, references
# (such as <span contextmenu>) to IDs that do not exist.
#
# Yields whatever check_id reports, plus "duplicate-id" when the value has
# been seen before.
def validate_attribute_value_id(token, tag_name, attr_name, attr_value)
  check_id(token, tag_name, attr_name, attr_value) { |error| yield error }
  return unless attr_value

  if @ids_we_have_known_and_loved.include?(attr_value)
    yield({:type => "ParseError",
           :data => "duplicate-id",
           :datavars => {"tagName" => tag_name}})
  end
  @ids_we_have_known_and_loved.push(attr_value)
  @things_that_define_an_id.push(token)
end
735
+
736
# "tabindex" must be an integer.
alias_method :validate_attribute_value_tabindex, :check_integer
737
+
738
# Placeholder for validating the "ref" attribute. Not implemented yet: every
# value is accepted and nothing is yielded.
def validate_attribute_value_ref(token, tag_name, attr_name, attr_value)
  nil # XXX validation still to be written
end
741
+
742
# Placeholder for validating the "template" attribute. Not implemented yet:
# every value is accepted and nothing is yielded.
def validate_attribute_value_template(token, tag_name, attr_name, attr_value)
  nil # XXX validation still to be written
end
745
+
746
# The only namespace accepted for <html xmlns> is the XHTML namespace; any
# other value yields "invalid-root-namespace".
def validate_attribute_value_html_xmlns(token, tag_name, attr_name, attr_value)
  return if attr_value == "http://www.w3.org/1999/xhtml"

  yield({:type => "ParseError",
         :data => "invalid-root-namespace",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
end
754
+
755
# Element-specific attributes validated directly by a generic checker.
alias_method :validate_attribute_value_base_href, :check_iri
alias_method :validate_attribute_value_base_target, :check_browsing_context
alias_method :validate_attribute_value_link_href, :check_iri
alias_method :validate_attribute_value_link_rel, :check_link_relation
alias_method :validate_attribute_value_link_media, :check_media_query
alias_method :validate_attribute_value_link_hreflang, :check_lang_code
alias_method :validate_attribute_value_link_type, :check_mime_type
# XXX <meta> attributes
alias_method :validate_attribute_value_style_media, :check_media_query
alias_method :validate_attribute_value_style_type, :check_mime_type
alias_method :validate_attribute_value_style_scoped, :check_boolean
alias_method :validate_attribute_value_blockquote_cite, :check_iri
alias_method :validate_attribute_value_ol_start, :check_integer
alias_method :validate_attribute_value_li_value, :check_integer
# XXX need tests from here on
alias_method :validate_attribute_value_a_href, :check_iri
alias_method :validate_attribute_value_a_target, :check_browsing_context
772
+
773
# Validates <a ping>: a space-separated list in which every entry must be a
# valid IRI. Each entry is checked on its own with check_iri and any errors
# are forwarded to the caller.
#
# (The previous version called the non-existent method checkIRI and passed
# the whole attribute value instead of the individual entry.)
def validate_attribute_value_a_ping(token, tag_name, attr_name, attr_value)
  parse_token_list(attr_value).each do |iri|
    check_iri(token, tag_name, attr_name, iri) { |error| yield error }
  end
end
781
+
782
# More element-specific attributes validated directly by a generic checker.
alias_method :validate_attribute_value_a_rel, :check_link_relation
alias_method :validate_attribute_value_a_media, :check_media_query
alias_method :validate_attribute_value_a_hreflang, :check_lang_code
alias_method :validate_attribute_value_a_type, :check_mime_type
alias_method :validate_attribute_value_q_cite, :check_iri
alias_method :validate_attribute_value_time_datetime, :check_date_time
alias_method :validate_attribute_value_meter_value, :check_floating_point_number
alias_method :validate_attribute_value_meter_min, :check_floating_point_number
alias_method :validate_attribute_value_meter_low, :check_floating_point_number
alias_method :validate_attribute_value_meter_high, :check_floating_point_number
alias_method :validate_attribute_value_meter_max, :check_floating_point_number
alias_method :validate_attribute_value_meter_optimum, :check_floating_point_number
alias_method :validate_attribute_value_progress_value, :check_floating_point_number
alias_method :validate_attribute_value_progress_max, :check_floating_point_number
alias_method :validate_attribute_value_ins_cite, :check_iri
alias_method :validate_attribute_value_ins_datetime, :check_date_time
alias_method :validate_attribute_value_del_cite, :check_iri
alias_method :validate_attribute_value_del_datetime, :check_date_time
800
+
801
+ ##########################################################################
802
+ # Whole document validation (IDs, etc.)
803
+ ##########################################################################
804
+
805
# Whole-document checks, run once the token stream is exhausted.
#
# For every token recorded as pointing to an ID through "contextmenu":
#  * if the referenced ID was never defined, yields "id-does-not-exist";
#  * otherwise, if the first element defining that ID is not a <menu>,
#    yields "contextmenu-must-point-to-menu".
#
# Attribute data is normally a Hash by this point (html5parser normalizes the
# attribute list into a dict as a side effect), but an Array of
# [name, value] pairs is accepted too so the validator also works when no
# parser has touched the tokens.
def eof
  as_hash = lambda do |data|
    data.is_a?(Array) ? Hash[data] : (data || {})
  end

  @things_that_point_to_an_id.each do |token|
    tag_name = token.fetch(:name, "").downcase
    attr_value = as_hash.call(token[:data]).fetch("contextmenu", "")

    if attr_value && !@ids_we_have_known_and_loved.include?(attr_value)
      yield({:type => "ParseError",
             :data => "id-does-not-exist",
             :datavars => {"tagName" => tag_name,
                           "attributeName" => "contextmenu",
                           "attributeValue" => attr_value}})
      next
    end

    # Only the first element carrying the ID is considered.
    target = @things_that_define_an_id.find do |ref_token|
      as_hash.call(ref_token[:data]).fetch("id", "") == attr_value
    end
    if target && target.fetch(:name, "").downcase != "menu"
      yield({:type => "ParseError",
             :data => "contextmenu-must-point-to-menu"})
    end
  end
end
834
+ end