spk-html5 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. data/History.txt +10 -0
  2. data/Manifest.txt +73 -0
  3. data/README +45 -0
  4. data/Rakefile.rb +33 -0
  5. data/bin/html5 +7 -0
  6. data/lib/html5.rb +13 -0
  7. data/lib/html5/cli.rb +248 -0
  8. data/lib/html5/constants.rb +1061 -0
  9. data/lib/html5/filters/base.rb +10 -0
  10. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  11. data/lib/html5/filters/iso639codes.rb +755 -0
  12. data/lib/html5/filters/optionaltags.rb +198 -0
  13. data/lib/html5/filters/rfc2046.rb +31 -0
  14. data/lib/html5/filters/rfc3987.rb +91 -0
  15. data/lib/html5/filters/sanitizer.rb +15 -0
  16. data/lib/html5/filters/validator.rb +834 -0
  17. data/lib/html5/filters/whitespace.rb +36 -0
  18. data/lib/html5/html5parser.rb +247 -0
  19. data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
  20. data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
  21. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  22. data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  23. data/lib/html5/html5parser/after_head_phase.rb +55 -0
  24. data/lib/html5/html5parser/before_head_phase.rb +44 -0
  25. data/lib/html5/html5parser/before_html_phase.rb +41 -0
  26. data/lib/html5/html5parser/in_body_phase.rb +636 -0
  27. data/lib/html5/html5parser/in_caption_phase.rb +69 -0
  28. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  29. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  30. data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
  31. data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
  32. data/lib/html5/html5parser/in_head_phase.rb +143 -0
  33. data/lib/html5/html5parser/in_row_phase.rb +96 -0
  34. data/lib/html5/html5parser/in_select_phase.rb +90 -0
  35. data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
  36. data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
  37. data/lib/html5/html5parser/in_table_phase.rb +177 -0
  38. data/lib/html5/html5parser/initial_phase.rb +133 -0
  39. data/lib/html5/html5parser/phase.rb +171 -0
  40. data/lib/html5/inputstream.rb +735 -0
  41. data/lib/html5/liberalxmlparser.rb +158 -0
  42. data/lib/html5/sanitizer.rb +209 -0
  43. data/lib/html5/serializer.rb +2 -0
  44. data/lib/html5/serializer/htmlserializer.rb +179 -0
  45. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  46. data/lib/html5/sniffer.rb +45 -0
  47. data/lib/html5/tokenizer.rb +1059 -0
  48. data/lib/html5/treebuilders.rb +24 -0
  49. data/lib/html5/treebuilders/base.rb +339 -0
  50. data/lib/html5/treebuilders/hpricot.rb +231 -0
  51. data/lib/html5/treebuilders/rexml.rb +215 -0
  52. data/lib/html5/treebuilders/simpletree.rb +191 -0
  53. data/lib/html5/treewalkers.rb +26 -0
  54. data/lib/html5/treewalkers/base.rb +162 -0
  55. data/lib/html5/treewalkers/hpricot.rb +48 -0
  56. data/lib/html5/treewalkers/rexml.rb +48 -0
  57. data/lib/html5/treewalkers/simpletree.rb +48 -0
  58. data/lib/html5/version.rb +3 -0
  59. data/test/preamble.rb +69 -0
  60. data/test/test_cli.rb +16 -0
  61. data/test/test_encoding.rb +35 -0
  62. data/test/test_input_stream.rb +26 -0
  63. data/test/test_lxp.rb +283 -0
  64. data/test/test_parser.rb +63 -0
  65. data/test/test_sanitizer.rb +173 -0
  66. data/test/test_serializer.rb +67 -0
  67. data/test/test_sniffer.rb +27 -0
  68. data/test/test_stream.rb +71 -0
  69. data/test/test_tokenizer.rb +95 -0
  70. data/test/test_treewalkers.rb +135 -0
  71. data/test/test_validator.rb +31 -0
  72. data/test/tokenizer_test_parser.rb +67 -0
  73. data/test19.rb +38 -0
  74. metadata +198 -0
@@ -0,0 +1,198 @@
1
+ require 'html5/constants'
2
+ require 'html5/filters/base'
3
+
4
+ module HTML5
5
+ module Filters
6
+
7
# Token-stream filter that drops the start and end tags HTML5 allows a
# serializer to omit.
#
# Every decision is made from the tokens directly before and after the tag
# in the wrapped stream (reached through +__getobj__+); tokens are hashes
# with at least a :type entry and, for tags, :name and :data entries.
class OptionalTagFilter < Base
  # Walks the wrapped stream with a three-token window.
  #
  # Yields (previous, current, following) once for every token; +previous+
  # is nil for the first token and +following+ is nil for the last one.
  # Nothing is yielded when the stream is empty.
  def slider
    previous1 = previous2 = nil
    __getobj__.each do |token|
      yield previous2, previous1, token unless previous1.nil?
      previous2 = previous1
      previous1 = token
    end
    # Flush the last token. Skipped for an empty stream, which would
    # otherwise hand a nil "current" token to the caller.
    yield previous2, previous1, nil unless previous1.nil?
  end

  # Yields every token of the wrapped stream except omissible tags.
  #
  # A start tag is only ever dropped when it carries no attributes.
  def each
    slider do |previous, token, nexttok|
      case token[:type]
      when :StartTag
        omit = token[:data].empty? && is_optional_start(token[:name], previous, nexttok)
        yield token unless omit
      when :EndTag
        yield token unless is_optional_end(token[:name], nexttok)
      else
        yield token
      end
    end
  end

  # Returns true when the start tag +tagname+ may be omitted.
  #
  # previous:: the token before the start tag (nil at the beginning)
  # nexttok::  the token after the start tag (nil at the end)
  def is_optional_start(tagname, previous, nexttok)
    type = nexttok ? nexttok[:type] : nil
    case tagname
    when 'html'
      # Omissible unless the first thing inside the element is a space
      # character or a comment.
      ![:Comment, :SpaceCharacters].include?(type)
    when 'head'
      # Omissible if the first thing inside the element is an element.
      type == :StartTag
    when 'body'
      # Omissible unless the first thing inside is a space character or a
      # comment. The preceding token is not inspected here, so the tag is
      # conservatively kept whenever a script or style element follows.
      if [:Comment, :SpaceCharacters].include?(type)
        false
      elsif type == :StartTag
        !%w[script style].include?(nexttok[:name])
      else
        true
      end
    when 'colgroup'
      # Omissible if the first thing inside is a col element. The
      # "preceded by a colgroup whose end tag was omitted" case cannot
      # arise because is_optional_end keeps a colgroup end tag that is
      # followed by another colgroup.
      type == :StartTag && nexttok[:name] == 'col'
    when 'tbody'
      # Omissible if the first thing inside is a tr element and the
      # element is not immediately preceded by a tbody, thead or tfoot
      # end tag (which is_optional_end may itself have dropped).
      if type != :StartTag
        false
      elsif previous && previous[:type] == :EndTag &&
            %w[tbody thead tfoot].include?(previous[:name])
        false
      else
        nexttok[:name] == 'tr'
      end
    else
      false
    end
  end

  # Returns true when the end tag +tagname+ may be omitted.
  #
  # nexttok:: the token after the end tag (nil at the end of the stream)
  def is_optional_end(tagname, nexttok)
    type = nexttok ? nexttok[:type] : nil
    # "No more content in the parent element": the parent's end tag, or
    # the end of the stream, comes next.
    parent_exhausted = (type == :EndTag || type.nil?)

    case tagname
    when 'html', 'head', 'body'
      # Omissible unless immediately followed by a space character or a
      # comment.
      ![:Comment, :SpaceCharacters].include?(type)
    when 'li', 'optgroup', 'option', 'tr'
      # Omissible if followed by another element of the same name, or if
      # there is no more content in the parent element.
      if type == :StartTag
        nexttok[:name] == tagname
      else
        parent_exhausted
      end
    when 'dt', 'dd'
      # Omissible if followed by a dt or dd element; a dd end tag may
      # also be dropped when the parent has no more content.
      if type == :StartTag
        %w[dt dd].include?(nexttok[:name])
      else
        tagname == 'dd' && parent_exhausted
      end
    when 'p'
      # Omissible if followed by one of the listed block-level elements,
      # or if there is no more content in the parent element.
      if type == :StartTag
        %w[address blockquote dl fieldset form h1 h2 h3 h4 h5
           h6 hr menu ol p pre table ul].include?(nexttok[:name])
      else
        parent_exhausted
      end
    when 'colgroup'
      # Omissible unless followed by a space character or a comment.
      # Also kept before another colgroup so that is_optional_start can
      # decide without looking at the preceding token.
      if [:Comment, :SpaceCharacters].include?(type)
        false
      elsif type == :StartTag
        nexttok[:name] != 'colgroup'
      else
        true
      end
    when 'thead', 'tbody'
      # Omissible if followed by a tbody or tfoot element; a tbody end
      # tag may also be dropped when the parent has no more content.
      # is_optional_start keeps the start tag of a tbody that follows.
      if type == :StartTag
        %w[tbody tfoot].include?(nexttok[:name])
      else
        tagname == 'tbody' && parent_exhausted
      end
    when 'tfoot'
      # Omissible if followed by a tbody element, or if there is no more
      # content in the parent element.
      if type == :StartTag
        nexttok[:name] == 'tbody'
      else
        parent_exhausted
      end
    when 'td', 'th'
      # Omissible if followed by a td or th element, or if there is no
      # more content in the parent element.
      if type == :StartTag
        %w[td th].include?(nexttok[:name])
      else
        parent_exhausted
      end
    else
      false
    end
  end
end
197
+ end
198
+ end
@@ -0,0 +1,31 @@
1
+ # adapted from feedvalidator, original copyright license is
2
+ #
3
+ # Copyright (c) 2002-2006, Sam Ruby, Mark Pilgrim, Joseph Walton, and Phil Ringnalda
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+
24
# MIME type (RFC 2046) validation helpers.
module RFC2046
  # Placeholder check: every +value+ is currently reported as a valid MIME
  # type. The pattern-based check below is disabled and kept for reference.
  def is_valid_mime_type(value)
    # mime_re = Regexp.new('[^\s()<>,;:\\"/[\]?=]+/[^\s()<>,;:\\"/[\]?=]+(\s*;\s*[^\s()<>,;:\\"/[\]?=]+=("(\\"|[^"])*"|[^\s()<>,;:\\"/[\]?=]+))*$')
    # !!mime_re.match(value)
    true
  end
end
31
+
@@ -0,0 +1,91 @@
1
+ # adapted from feedvalidator, original copyright license is
2
+ #
3
+ # Copyright (c) 2002-2006, Sam Ruby, Mark Pilgrim, Joseph Walton, and Phil Ringnalda
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
# URI / IRI validation helpers (adapted from feedvalidator).
#
# Every validator returns a two-element array: [valid, error_code], where
# error_code is "" on success.
module RFC3987
  iana_schemes = [ # http://www.iana.org/assignments/uri-schemes.html
    "ftp", "http", "gopher", "mailto", "news", "nntp", "telnet", "wais",
    "file", "prospero", "z39.50s", "z39.50r", "cid", "mid", "vemmi",
    "service", "imap", "nfs", "acap", "rtsp", "tip", "pop", "data", "dav",
    "opaquelocktoken", "sip", "sips", "tel", "fax", "modem", "ldap",
    "https", "soap.beep", "soap.beeps", "xmlrpc.beep", "xmlrpc.beeps",
    "urn", "go", "h323", "ipp", "tftp", "mupdate", "pres", "im", "mtqp",
    "iris.beep", "dict", "snmp", "crid", "tag", "dns", "info"
  ]
  ALLOWED_SCHEMES = iana_schemes + ['javascript']

  RFC2396 = Regexp.new("^([a-zA-Z][0-9a-zA-Z+\\-\\.]*:)?/{0,2}[0-9a-zA-Z;/?:@&=+$\\.\\-_!~*'()%,#]*$", Regexp::MULTILINE)

  # Pattern for fully qualified URIs. This must be a constant: a local
  # variable assigned in the module body is not visible inside methods.
  RFC2396_FULL = Regexp.new("[a-zA-Z][0-9a-zA-Z+\\-\\.]*:(//)?[0-9a-zA-Z;/?:@&=+$\\.\\-_!~*'()%,#]+$")

  # Regexp literals are used for URN and TAG so that escapes such as \d and
  # \- reach the regexp engine intact (a double-quoted string turns "\d"
  # into "d" and "\.\-_" into the range ".-_"). \A and \z anchor the whole
  # value rather than a single line.
  URN = %r{\A[Uu][Rr][Nn]:[a-zA-Z0-9][a-zA-Z0-9-]{1,31}:([a-zA-Z0-9()+,.:=@;$_!*'\-]|%[0-9A-Fa-f]{2})+\z}
  TAG = %r{\Atag:([a-z0-9\-._]+?@)?[a-z0-9.\-]+?,\d{4}(-\d{2}(-\d{2})?)?:[0-9a-zA-Z;/?:@&=+$.\-_!~*'()%,]*(\#[0-9a-zA-Z;/?:@&=+$.\-_!~*'()%,]*)?\z}

  # Matches a single character that is legal in a URI.
  URI_CHAR = Regexp.new("^[0-9a-zA-Z;/?:@&=+$\\.\\-_!~*'()%,#]$", Regexp::MULTILINE)

  # Validates +value+ as a URI.
  #
  # value::       the candidate string
  # uri_pattern:: Regexp the whole value must match (defaults to RFC2396)
  #
  # Returns [true, ""] when valid, otherwise [false, code] where code is one
  # of "invalid-tag-uri", "invalid-urn", "invalid-uri-char", "uri-not-iri",
  # "invalid-uri", "invalid-http-or-ftp-uri" or "invalid-scheme".
  def is_valid_uri(value, uri_pattern = RFC2396)
    # nil when the value has no text before a colon, e.g. "" or ":".
    scheme = value.split(':').first
    scheme = scheme.downcase if scheme
    matched = uri_pattern.match(value)

    if scheme == 'tag'
      return false, "invalid-tag-uri" unless TAG.match(value)
    elsif scheme == "urn"
      return false, "invalid-urn" unless URN.match(value)
    elsif matched.to_a.reject { |part| part == '' }.compact.empty? || matched[0] != value
      if value.length > 0
        value.each_byte do |b|
          return false, "invalid-uri-char" if b < 128 && !URI_CHAR.match(b.chr)
        end
        # NOTE(review): a non-empty value that fails the pattern but holds
        # only legal or non-ASCII bytes falls through and is accepted.
      else
        begin
          return false, "uri-not-iri" if uri_pattern.match(value.encode('idna'))
        rescue
          # Best effort: stock Ruby is not expected to provide an 'idna'
          # converter, in which case encode raises and we fall through.
        end
        return false, "invalid-uri"
      end
    elsif ['http', 'ftp'].include?(scheme)
      return false, "invalid-http-or-ftp-uri" unless value.match(%r{^\w+://[^/].*})
    elsif value.index(':') && scheme && scheme.match(/^[a-z]+$/) && !ALLOWED_SCHEMES.include?(scheme)
      return false, "invalid-scheme"
    end
    return true, ""
  end

  # Validates +value+ as an IRI: tries to IDNA-encode it (keeping the
  # original value if that fails) and then validates the result as a URI.
  def is_valid_iri(value)
    begin
      value = value.encode('idna') if value.length > 0
    rescue
      # Keep the unencoded value when no conversion is possible.
    end
    is_valid_uri(value)
  end

  # Validates +value+ against the fully-qualified URI pattern.
  def is_valid_fully_qualified_uri(value)
    is_valid_uri(value, RFC2396_FULL)
  end
end
@@ -0,0 +1,15 @@
1
+ require 'html5/filters/base'
2
+ require 'html5/sanitizer'
3
+
4
+ module HTML5
5
+ module Filters
6
# Stream filter that passes every token of the wrapped stream through
# +sanitize_token+ (mixed in from HTMLSanitizeModule) before yielding it.
class HTMLSanitizeFilter < Base
  include HTMLSanitizeModule

  # Yields the sanitized form of each token, in stream order.
  def each
    __getobj__.each { |token| yield(sanitize_token(token)) }
  end
end
14
+ end
15
+ end
@@ -0,0 +1,834 @@
1
+ # HTML 5 conformance checker
2
+ #
3
+ # Warning: this module is experimental, incomplete, and subject to removal at any time.
4
+ #
5
+ # Usage:
6
+ # >>> from html5lib.html5parser import HTMLParser
7
+ # >>> from html5lib.filters.validator import HTMLConformanceChecker
8
+ # >>> p = HTMLParser(tokenizer=HTMLConformanceChecker)
9
+ # >>> p.parse('<!doctype html>\n<html foo=bar></html>')
10
+ # <<class 'html5lib.treebuilders.simpletree.Document'> nil>
11
+ # >>> p.errors
12
+ # [((2, 14), 'unknown-attribute', {'attributeName' => u'foo', 'tagName' => u'html'})]
13
+
14
+ require 'html5/constants'
15
+ require 'html5/filters/base'
16
+ require 'html5/filters/iso639codes'
17
+ require 'html5/filters/rfc3987'
18
+ require 'html5/filters/rfc2046'
19
+
20
# Returns +str+ unchanged; wraps the message strings registered below
# (presumably as a gettext-style translation marker -- confirm).
def _(str)
  str
end
21
+
22
class String
  # Converts a CamelCase / dashed name to snake_case (lifted from Rails):
  # "::" becomes "/", word boundaries get an underscore, "-" becomes "_",
  # and the result is lower-cased. Returns a new string.
  def underscore()
    with_paths = gsub(/::/, '/')
    split_acronyms = with_paths.gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2')
    split_words = split_acronyms.gsub(/([a-z\d])([A-Z])/, '\1_\2')
    split_words.tr("-", "_").downcase
  end
end
32
+
33
# Registers the conformance checker's error codes and their message
# templates. Placeholders have the form %(name) and are filled from the
# :datavars hash attached to each ParseError token.
HTML5::E.update({
  "unknown-start-tag" =>
    _("Unknown start tag <%(tagName)>."),
  "unknown-attribute" =>
    _("Unknown '%(attributeName)' attribute on <%(tagName)>."),
  "missing-required-attribute" =>
    _("The '%(attributeName)' attribute is required on <%(tagName)>."),
  "unknown-input-type" =>
    _("Illegal value for attribute on <input type='%(inputType)'>."),
  "attribute-not-allowed-on-this-input-type" =>
    _("The '%(attributeName)' attribute is not allowed on <input type=%(inputType)>."),
  "deprecated-attribute" =>
    _("This attribute is deprecated: '%(attributeName)' attribute on <%(tagName)>."),
  "duplicate-value-in-token-list" =>
    _("Duplicate value in token list: '%(attributeValue)' in '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-attribute-value" =>
    _("Invalid attribute value: '%(attributeName)' attribute on <%(tagName)>."),
  "space-in-id" =>
    _("Whitespace is not allowed here: '%(attributeName)' attribute on <%(tagName)>."),
  "duplicate-id" =>
    _("This ID was already defined earlier: 'id' attribute on <%(tagName)>."),
  "attribute-value-can-not-be-blank" =>
    _("This value can not be blank: '%(attributeName)' attribute on <%(tagName)>."),
  "id-does-not-exist" =>
    _("This value refers to a non-existent ID: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-enumerated-value" =>
    _("Value must be one of %(enumeratedValues): '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-boolean-value" =>
    _("Value must be one of %(enumeratedValues): '%(attributeName)' attribute on <%(tagName)>."),
  "contextmenu-must-point-to-menu" =>
    _("The contextmenu attribute must point to an ID defined on a <menu> element."),
  "invalid-lang-code" =>
    _("Invalid language code: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-integer-value" =>
    _("Value must be an integer: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-root-namespace" =>
    _("Root namespace must be 'http://www.w3.org/1999/xhtml', or omitted."),
  "invalid-browsing-context" =>
    _("Value must be one of ('_self', '_parent', '_top'), or a name that does not start with '_': '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-tag-uri" =>
    _("Invalid URI: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-urn" =>
    _("Invalid URN: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-uri-char" =>
    _("Illegal character in URI: '%(attributeName)' attribute on <%(tagName)>."),
  "uri-not-iri" =>
    _("Expected a URI but found an IRI: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-uri" =>
    _("Invalid URI: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-http-or-ftp-uri" =>
    _("Invalid URI: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-scheme" =>
    _("Unregistered URI scheme: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-rel" =>
    _("Invalid link relation: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-mime-type" =>
    _("Invalid MIME type: '%(attributeName)' attribute on <%(tagName)>."),
})
91
+
92
+
93
+ class HTMLConformanceChecker < HTML5::Filters::Base
94
+
95
+ include RFC3987
96
+ include ISO639Codes
97
+ include RFC2046
98
+
99
+ @@global_attributes = %w[class contenteditable contextmenu dir
100
+ draggable id irrelevant lang ref tabindex template
101
+ title onabort onbeforeunload onblur onchange onclick
102
+ oncontextmenu ondblclick ondrag ondragend ondragenter
103
+ ondragleave ondragover ondragstart ondrop onerror
104
+ onfocus onkeydown onkeypress onkeyup onload onmessage
105
+ onmousedown onmousemove onmouseout onmouseover onmouseup
106
+ onmousewheel onresize onscroll onselect onsubmit onunload]
107
+ # XXX lang in HTML only, xml:lang in XHTML only
108
+ # XXX validate ref, template
109
+
110
+ @@allowed_attribute_map = {
111
+ 'html' => %w[xmlns],
112
+ 'head' => [],
113
+ 'title' => [],
114
+ 'base' => %w[href target],
115
+ 'link' => %w[href rel media hreflang type],
116
+ 'meta' => %w[name http-equiv content charset], # XXX charset in HTML only
117
+ 'style' => %w[media type scoped],
118
+ 'body' => [],
119
+ 'section' => [],
120
+ 'nav' => [],
121
+ 'article' => [],
122
+ 'blockquote' => %w[cite],
123
+ 'aside' => [],
124
+ 'h1' => [],
125
+ 'h2' => [],
126
+ 'h3' => [],
127
+ 'h4' => [],
128
+ 'h5' => [],
129
+ 'h6' => [],
130
+ 'header' => [],
131
+ 'footer' => [],
132
+ 'address' => [],
133
+ 'p' => [],
134
+ 'hr' => [],
135
+ 'br' => [],
136
+ 'dialog' => [],
137
+ 'pre' => [],
138
+ 'ol' => %w[start],
139
+ 'ul' => [],
140
+ 'li' => %w[value], # XXX depends on parent
141
+ 'dl' => [],
142
+ 'dt' => [],
143
+ 'dd' => [],
144
+ 'a' => %w[href target ping rel media hreflang type],
145
+ 'q' => %w[cite],
146
+ 'cite' => [],
147
+ 'em' => [],
148
+ 'strong' => [],
149
+ 'small' => [],
150
+ 'm' => [],
151
+ 'dfn' => [],
152
+ 'abbr' => [],
153
+ 'time' => %w[datetime],
154
+ 'meter' => %w[value min low high max optimum],
155
+ 'progress' => %w[value max],
156
+ 'code' => [],
157
+ 'var' => [],
158
+ 'samp' => [],
159
+ 'kbd' => [],
160
+ 'sup' => [],
161
+ 'sub' => [],
162
+ 'span' => [],
163
+ 'i' => [],
164
+ 'b' => [],
165
+ 'bdo' => [],
166
+ 'ins' => %w[cite datetime],
167
+ 'del' => %w[cite datetime],
168
+ 'figure' => [],
169
+ 'img' => %w[alt src usemap ismap height width], # XXX ismap depends on parent
170
+ 'iframe' => %w[src],
171
+ # <embed> handled separately
172
+ 'object' => %w[data type usemap height width],
173
+ 'param' => %w[name value],
174
+ 'video' => %w[src autoplay start loopstart loopend end loopcount controls],
175
+ 'audio' => %w[src autoplay start loopstart loopend end loopcount controls],
176
+ 'source' => %w[src type media],
177
+ 'canvas' => %w[height width],
178
+ 'map' => [],
179
+ 'area' => %w[alt coords shape href target ping rel media hreflang type],
180
+ 'table' => [],
181
+ 'caption' => [],
182
+ 'colgroup' => %w[span], # XXX only if element contains no <col> elements
183
+ 'col' => %w[span],
184
+ 'tbody' => [],
185
+ 'thead' => [],
186
+ 'tfoot' => [],
187
+ 'tr' => [],
188
+ 'td' => %w[colspan rowspan],
189
+ 'th' => %w[colspan rowspan scope],
190
+ # all possible <input> attributes are listed here but <input> is really handled separately
191
+ 'input' => %w[accept accesskey action alt autocomplete autofocus checked
192
+ disabled enctype form inputmode list maxlength method min
193
+ max name pattern step readonly replace required size src
194
+ tabindex target template value
195
+ ],
196
+ 'form' => %w[action method enctype accept name onsubmit onreset accept-charset
197
+ data replace
198
+ ],
199
+ 'button' => %w[action enctype method replace template name value type disabled form autofocus], # XXX may need matrix of acceptable attributes based on value of type attribute (like input)
200
+ 'select' => %w[name size multiple disabled data accesskey form autofocus],
201
+ 'optgroup' => %w[disabled label],
202
+ 'option' => %w[selected disabled label value],
203
+ 'textarea' => %w[maxlength name rows cols disabled readonly required form autofocus wrap accept],
204
+ 'label' => %w[for accesskey form],
205
+ 'fieldset' => %w[disabled form],
206
+ 'output' => %w[form name for onforminput onformchange],
207
+ 'datalist' => %w[data],
208
+ # XXX repetition model for repeating form controls
209
+ 'script' => %w[src defer async type],
210
+ 'noscript' => [],
211
+ 'noembed' => [],
212
+ 'event-source' => %w[src],
213
+ 'details' => %w[open],
214
+ 'datagrid' => %w[multiple disabled],
215
+ 'command' => %w[type label icon hidden disabled checked radiogroup default],
216
+ 'menu' => %w[type label autosubmit],
217
+ 'datatemplate' => [],
218
+ 'rule' => [],
219
+ 'nest' => [],
220
+ 'legend' => [],
221
+ 'div' => [],
222
+ 'font' => %w[style]
223
+ }
224
+
225
+ @@required_attribute_map = {
226
+ 'link' => %w[href rel],
227
+ 'bdo' => %w[dir],
228
+ 'img' => %w[src],
229
+ 'embed' => %w[src],
230
+ 'object' => [], # XXX one of 'data' or 'type' is required
231
+ 'param' => %w[name value],
232
+ 'source' => %w[src],
233
+ 'map' => %w[id]
234
+ }
235
+
236
+ @@input_type_allowed_attribute_map = {
237
+ 'text' => %w[accesskey autocomplete autofocus disabled form inputmode list maxlength name pattern readonly required size tabindex value],
238
+ 'password' => %w[accesskey autocomplete autofocus disabled form inputmode maxlength name pattern readonly required size tabindex value],
239
+ 'checkbox' => %w[accesskey autofocus checked disabled form name required tabindex value],
240
+ 'radio' => %w[accesskey autofocus checked disabled form name required tabindex value],
241
+ 'button' => %w[accesskey autofocus disabled form name tabindex value],
242
+ 'submit' => %w[accesskey action autofocus disabled enctype form method name replace tabindex target value],
243
+ 'reset' => %w[accesskey autofocus disabled form name tabindex value],
244
+ 'add' => %w[accesskey autofocus disabled form name tabindex template value],
245
+ 'remove' => %w[accesskey autofocus disabled form name tabindex value],
246
+ 'move-up' => %w[accesskey autofocus disabled form name tabindex value],
247
+ 'move-down' => %w[accesskey autofocus disabled form name tabindex value],
248
+ 'file' => %w[accept accesskey autofocus disabled form min max name required tabindex],
249
+ 'hidden' => %w[disabled form name value],
250
+ 'image' => %w[accesskey action alt autofocus disabled enctype form method name replace src tabindex target],
251
+ 'datetime' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
252
+ 'datetime-local' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
253
+ 'date' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
254
+ 'month' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
255
+ 'week' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
256
+ 'time' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
257
+ 'number' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
258
+ 'range' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
259
+ 'email' => %w[accesskey autocomplete autofocus disabled form inputmode list maxlength name pattern readonly required tabindex value],
260
+ 'url' => %w[accesskey autocomplete autofocus disabled form inputmode list maxlength name pattern readonly required tabindex value],
261
+ }
262
+
263
+ @@input_type_deprecated_attribute_map = {
264
+ 'text' => ['size'],
265
+ 'password' => ['size']
266
+ }
267
+
268
+ @@link_rel_values = %w[alternate archive archives author contact feed first begin start help icon index top contents toc last end license copyright next pingback prefetch prev previous search stylesheet sidebar tag up]
269
+ @@a_rel_values = %w[alternate archive archives author contact feed first begin start help index top contents toc last end license copyright next prev previous search sidebar tag up bookmark external nofollow]
270
+
271
# Wraps +stream+ in an HTML5::HTMLTokenizer (forwarding any extra
# arguments to it) and starts with empty ID bookkeeping lists.
def initialize(stream, *args)
  tokenizer = HTML5::HTMLTokenizer.new(stream, *args)
  super(tokenizer)
  @things_that_define_an_id, @things_that_point_to_an_id, @ids_we_have_known_and_loved = [], [], []
end
277
+
278
# Iterates over the wrapped token stream.
#
# For each token the most specific validator this object responds to is
# run -- "validate_<type>_<name>" first, then "validate_<type>" -- and
# whatever it reports is yielded ahead of the token itself. After the last
# token, everything reported by +eof+ is yielded as well.
def each
  __getobj__.each do |token|
    type_part = token.fetch(:type, '-').to_s.underscore
    name_part = token.fetch(:name, '-').to_s.underscore
    candidates = ["validate_#{type_part}_#{name_part}", "validate_#{type_part}"]
    validator = candidates.find { |candidate| respond_to?(candidate) }
    send(validator, token) { |reported| yield reported } if validator
    yield token
  end
  eof { |reported| yield reported }
end
297
+
298
+ ##########################################################################
299
+ # Start tag validation
300
+ ##########################################################################
301
+
302
# Runs the generic start-tag checks on +token+, in order: unknown tag,
# required attributes, unknown attributes, attribute values. Everything a
# check reports is yielded to the caller's block.
def validate_start_tag(token)
  [:check_unknown_start_tag,
   :check_start_tag_required_attributes,
   :check_start_tag_unknown_attributes,
   :check_attribute_values].each do |check|
    send(check, token) { |reported| yield reported }
  end
end
314
+
315
# Start-tag checks for <embed>: required attributes and attribute values.
# The spec allows "any attributes w/o namespace" on embed, so
# check_start_tag_unknown_attributes is deliberately not called.
def validate_start_tag_embed(token)
  [:check_start_tag_required_attributes, :check_attribute_values].each do |check|
    send(check, token) { |reported| yield reported }
  end
end
325
+
326
# Start-tag checks for <input>.
#
# Validates attribute values, then checks every attribute against the set
# allowed for the element's +type+ (default "text"). Yields one ParseError
# token hash per problem:
#   "unknown-input-type"                       - type is not a known input type
#   "unknown-attribute"                        - attribute is not valid on any input
#   "attribute-not-allowed-on-this-input-type" - valid on input, but not for this type
#   "deprecated-attribute"                     - attribute is deprecated for this type
def validate_start_tag_input(token)
  check_attribute_values(token) { |t| yield t }

  # Attribute names are compared case-insensitively; a repeated attribute
  # keeps its last value. A missing attribute list is treated as empty.
  attr_dict = {}
  (token[:data] || []).each { |(name, value)| attr_dict[name.downcase] = value }
  input_type = attr_dict.fetch('type', "text")

  unless @@input_type_allowed_attribute_map.key?(input_type)
    # :attrValue is kept for existing consumers; "inputType" is the key the
    # registered message template refers to.
    yield({:type => "ParseError",
           :data => "unknown-input-type",
           :datavars => {:attrValue => input_type,
                         "inputType" => input_type}})
  end

  allowed_attributes = @@input_type_allowed_attribute_map.fetch(input_type, [])
  deprecated_attributes = @@input_type_deprecated_attribute_map.fetch(input_type, [])

  attr_dict.each_key do |attr_name|
    if !@@allowed_attribute_map['input'].include?(attr_name)
      yield({:type => "ParseError",
             :data => "unknown-attribute",
             :datavars => {"tagName" => "input",
                           "attributeName" => attr_name}})
    elsif !allowed_attributes.include?(attr_name)
      yield({:type => "ParseError",
             :data => "attribute-not-allowed-on-this-input-type",
             :datavars => {"attributeName" => attr_name,
                           "inputType" => input_type}})
    end
    if deprecated_attributes.include?(attr_name)
      # "tagName" is supplied because the deprecated-attribute message
      # template references it.
      yield({:type => "ParseError",
             :data => "deprecated-attribute",
             :datavars => {"attributeName" => attr_name,
                           "inputType" => input_type,
                           "tagName" => "input"}})
    end
  end
end
358
+
359
+ ##########################################################################
360
+ # Start tag validation helpers
361
+ ##########################################################################
362
+
363
# Yields an "unknown-start-tag" error when the lower-cased tag name is not a
# key of the allowed-attribute table. A missing name is treated as "".
def check_unknown_start_tag(token)
  name = (token[:name] || "").downcase
  return if @@allowed_attribute_map.key?(name)
  yield({:type => "ParseError",
         :data => "unknown-start-tag",
         :datavars => {"tagName" => name}})
end
372
+
373
# Yields one "missing-required-attribute" error for every attribute that the
# required-attribute table lists for this tag but the token does not carry.
# Tags without an entry in the table are not checked.
def check_start_tag_required_attributes(token)
  name = (token[:name] || "").downcase
  return unless @@required_attribute_map.key?(name)

  present = (token[:data] || []).map { |pair| pair[0] }
  @@required_attribute_map[name].each do |attr_name|
    next if present.include?(attr_name)
    yield({:type => "ParseError",
           :data => "missing-required-attribute",
           :datavars => {"tagName" => name,
                         "attributeName" => attr_name}})
  end
end
388
+
389
# Yields an "unknown-attribute" error for each attribute whose lower-cased
# name is neither a global attribute nor listed for this tag. The error
# reports the attribute name as written in the token.
def check_start_tag_unknown_attributes(token)
  name = token[:name].downcase
  recognized = @@global_attributes | @@allowed_attribute_map.fetch(name, [])
  token.fetch(:data, []).each do |attr_name, _attr_value|
    next if recognized.include?(attr_name.downcase)
    yield({:type => "ParseError",
           :data => "unknown-attribute",
           :datavars => {"tagName" => name,
                         "attributeName" => attr_name}})
  end
end
402
+
403
+ ##########################################################################
404
+ # Attribute validation helpers
405
+ ##########################################################################
406
+
407
+ # def checkURI(token, tag_name, attr_name, attr_value)
408
+ # is_valid, error_code = rfc3987.is_valid_uri(attr_value)
409
+ # if not is_valid
410
+ # yield {:type => "ParseError",
411
+ # :data => error_code,
412
+ # :datavars => {"tagName" => tag_name,
413
+ # "attributeName" => attr_name}}
414
+ # yield {:type => "ParseError",
415
+ # :data => "invalid-attribute-value",
416
+ # :datavars => {"tagName" => tag_name,
417
+ # "attributeName" => attr_name}}
418
+
419
# Validates an attribute value as an IRI via is_valid_iri, which returns a
# [valid, error_code] pair. On failure yields the specific error code and
# then the generic "invalid-attribute-value" error.
def check_iri(token, tag_name, attr_name, attr_value)
  is_valid, error_code = is_valid_iri(attr_value)
  return if is_valid

  yield({:type => "ParseError",
         :data => error_code,
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
  yield({:type => "ParseError",
         :data => "invalid-attribute-value",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
end
432
+
433
# Validates an ID-valued attribute.
#
# Yields "attribute-value-can-not-be-blank" when the value is nil or empty.
# Otherwise yields "space-in-id" followed by "invalid-attribute-value" once,
# if the value contains any character listed in HTML5::SPACE_CHARACTERS.
#
# token is accepted for signature compatibility with the other attribute
# checkers and is not used here.
def check_id(token, tag_name, attr_name, attr_value)
  if attr_value.nil? || attr_value.empty?
    yield({:type => "ParseError",
           :data => "attribute-value-can-not-be-blank",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
    # Nothing left to scan; going on would call a String method on nil.
    return
  end

  return unless attr_value.each_char.any? { |char| HTML5::SPACE_CHARACTERS.include?(char) }

  yield({:type => "ParseError",
         :data => "space-in-id",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
  yield({:type => "ParseError",
         :data => "invalid-attribute-value",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
end
455
+
456
# Splits a space-separated attribute value into its tokens.
#
# value - String to split; separators are the characters listed in
#         HTML5::SPACE_CHARACTERS.
#
# Returns an Array of the non-empty tokens in document order. The value is
# walked character by character rather than byte by byte, so tokens keep the
# encoding of the input and non-ASCII tokens still compare equal to it.
def parse_token_list(value)
  tokens = []
  current = ''
  value.each_char do |char|
    if HTML5::SPACE_CHARACTERS.include?(char)
      tokens << current unless current.empty?
      current = ''
    else
      current += char
    end
  end
  tokens << current unless current.empty?
  tokens
end
475
+
476
# Checks a space-separated token list for repeated entries.
#
# "Token" here means an entry of the attribute value (see
# http://www.whatwg.org/specs/web-apps/current-work/#set-of), not a token
# produced by the HTML tokenizer.
#
# Yields a single "duplicate-value-in-token-list" error for the first entry
# that repeats an earlier one, then stops scanning.
def check_token_list(tag_name, attr_name, attr_value)
  seen = {}
  parse_token_list(attr_value).each do |entry|
    if seen.key?(entry)
      yield({:type => "ParseError",
             :data => "duplicate-value-in-token-list",
             :datavars => {"tagName" => tag_name,
                           "attributeName" => attr_name,
                           "attributeValue" => entry}})
      break
    end
    seen[entry] = 1
  end
end
495
+
496
# Validates an enumerated attribute, comparing case-insensitively.
#
# Yields "attribute-value-can-not-be-blank" for a nil or empty value.
# Otherwise, when the lower-cased value is not in +enumerated_values+, yields
# "invalid-enumerated-value" followed by "invalid-attribute-value".
#
# The comparison works on a lower-cased copy, so the caller's string (which
# is the token's own attribute value) is never modified and may be frozen.
# token is accepted for signature compatibility and is not used here.
def check_enumerated_value(token, tag_name, attr_name, attr_value, enumerated_values)
  if !attr_value || attr_value.length == 0
    yield({:type => "ParseError",
           :data => "attribute-value-can-not-be-blank",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
    return
  end

  return if enumerated_values.include?(attr_value.downcase)

  # "attributeName" matches the key used by every other error in this file.
  yield({:type => "ParseError",
         :data => "invalid-enumerated-value",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name,
                       "enumeratedValues" => enumerated_values}})
  yield({:type => "ParseError",
         :data => "invalid-attribute-value",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
end
517
+
518
# Validates a boolean attribute: the only accepted values are the attribute's
# own name and the empty string. Any other value yields
# "invalid-boolean-value" followed by "invalid-attribute-value".
def check_boolean(token, tag_name, attr_name, attr_value)
  enumerated_values = [attr_name, '']
  return if enumerated_values.include?(attr_value)

  yield({:type => "ParseError",
         :data => "invalid-boolean-value",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name,
                       "enumeratedValues" => enumerated_values}})
  yield({:type => "ParseError",
         :data => "invalid-attribute-value",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
end
532
+
533
# Validates an integer-valued attribute with a small state machine.
#
# Leading space characters are skipped, an optional "-" may precede the
# digits, and anything after the first run of digits is ignored. A bad first
# character (or a non-digit right after "-") yields "invalid-integer-value";
# a value with no digits at all yields "attribute-value-can-not-be-blank".
def check_integer(token, tag_name, attr_name, attr_value)
  digits_seen = ''
  phase = :begin # then :initial_number, :in_number, :trailing_junk
  error = {:type => "ParseError",
           :data => "invalid-integer-value",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name,
                         "attributeValue" => attr_value}}

  # scan(/./) visits every character except newlines, which "." never matches.
  attr_value.scan(/./) do |char|
    case phase
    when :begin
      next if HTML5::SPACE_CHARACTERS.include?(char)
      if char == '-'
        phase = :initial_number
      elsif HTML5::DIGITS.include?(char)
        digits_seen += char
        phase = :in_number
      else
        yield error
        return
      end
    when :initial_number
      unless HTML5::DIGITS.include?(char)
        yield error
        return
      end
      digits_seen += char
      phase = :in_number
    when :in_number
      if HTML5::DIGITS.include?(char)
        digits_seen += char
      else
        phase = :trailing_junk
      end
    end
  end

  return unless digits_seen.length == 0
  yield({:type => "ParseError",
         :data => "attribute-value-can-not-be-blank",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
end
580
+
581
# Placeholder for floating-point attribute validation.
# XXX not implemented: accepts every value and yields no errors.
def check_floating_point_number(token, tag_name, attr_name, attr_value)
end
584
+
585
# Validates a browsing-context name (e.g. a target attribute).
#
# Only names starting with "_" are restricted: they must be one of _self,
# _parent, _top or _blank, compared case-insensitively. Anything else
# starting with "_" yields "invalid-browsing-context". Nil values and names
# not starting with "_" are accepted without comment.
#
# The comparison uses a lower-cased copy so the token's attribute value is
# left untouched and may be a frozen string.
def check_browsing_context(token, tag_name, attr_name, attr_value)
  return unless attr_value
  return unless attr_value.start_with?('_')
  return if ['_self', '_parent', '_top', '_blank'].include?(attr_value.downcase)

  yield({:type => "ParseError",
         :data => "invalid-browsing-context",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
end
595
+
596
# Validates a language-code attribute. A nil or empty value is acceptable;
# any other value rejected by is_valid_lang_code yields "invalid-lang-code".
def check_lang_code(token, tag_name, attr_name, attr_value)
  return if !attr_value || attr_value == '' # blank is OK
  return if is_valid_lang_code(attr_value)

  yield({:type => "ParseError",
         :data => "invalid-lang-code",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name,
                       "attributeValue" => attr_value}})
end
606
+
607
# Validates a MIME-type attribute.
# XXX needs tests
#
# A nil or empty value yields "attribute-value-can-not-be-blank" and is not
# passed on to is_valid_mime_type, matching how check_id and
# check_enumerated_value treat blank values. Any other value rejected by
# is_valid_mime_type yields "invalid-mime-type".
def check_mime_type(token, tag_name, attr_name, attr_value)
  if attr_value.nil? || attr_value.empty?
    yield({:type => "ParseError",
           :data => "attribute-value-can-not-be-blank",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
    return
  end

  return if is_valid_mime_type(attr_value)

  yield({:type => "ParseError",
         :data => "invalid-mime-type",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name,
                       "attributeValue" => attr_value}})
end
623
+
624
# Placeholder for media-query attribute validation.
# XXX not implemented: accepts every value and yields no errors.
def check_media_query(token, tag_name, attr_name, attr_value)
end
627
+
628
# Validates a rel attribute: first as a token list (duplicates), then each
# entry against the link-type table for the element. <link> uses its own
# table; every other tag name uses the <a> table. Each entry not in the
# table yields "invalid-rel".
def check_link_relation(token, tag_name, attr_name, attr_value)
  check_token_list(tag_name, attr_name, attr_value) { |error| yield error }

  rel_values = parse_token_list(attr_value)
  allowed_values = tag_name == 'link' ? @@link_rel_values : @@a_rel_values
  rel_values.each do |rel|
    next if allowed_values.include?(rel)
    yield({:type => "ParseError",
           :data => "invalid-rel",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
  end
end
643
+
644
# Placeholder for date/time attribute validation.
# XXX not implemented: yields no errors. The intended shape is a character
# state machine like check_integer's:
#   for c in attr_value
#     if state == 'begin'
#       if SPACE_CHARACTERS.include?(c)
#         continue
#       elsif digits.include?(c)
#         state = ...
def check_date_time(token, tag_name, attr_name, attr_value)
  state = 'begin' # ('begin', '...
end
654
+
655
+ ##########################################################################
656
+ # Attribute validation
657
+ ##########################################################################
658
+
659
# Dispatches every attribute of a token to its value validator.
#
# For each attribute the tag-specific handler
# validate_attribute_value_<tag>_<attr> is preferred; if the validator does
# not respond to it, the tag-independent validate_attribute_value_<attr> is
# tried. Attributes with no handler are skipped. Handlers receive the
# lower-cased attribute name, and their errors are yielded.
def check_attribute_values(token)
  tag_name = token.fetch(:name, "")
  token.fetch(:data, []).each do |attr_name, attr_value|
    attr_name = attr_name.downcase
    attr_part = attr_name.to_s.underscore
    candidates = ["validate_attribute_value_#{tag_name.to_s.underscore}_#{attr_part}",
                  "validate_attribute_value_#{attr_part}"]
    handler = candidates.find { |candidate| respond_to?(candidate) }
    next unless handler
    send(handler, token, tag_name, attr_name, attr_value) { |error| yield error }
  end
end
678
+
679
# Validates a class attribute as a token list. Every error reported by
# check_token_list is yielded and followed by a generic
# "invalid-attribute-value" error.
def validate_attribute_value_class(token, tag_name, attr_name, attr_value)
  check_token_list(tag_name, attr_name, attr_value) do |error|
    yield error
    yield({:type => "ParseError",
           :data => "invalid-attribute-value",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
  end
end
688
+
689
# contenteditable is an enumerated attribute; the list passed to
# check_enumerated_value is 'true', 'false' and the empty string.
def validate_attribute_value_contenteditable(token, tag_name, attr_name, attr_value)
  accepted = ['true', 'false', '']
  check_enumerated_value(token, tag_name, attr_name, attr_value, accepted) { |error| yield error }
end
694
+
695
# dir is an enumerated attribute; the list passed to check_enumerated_value
# is 'ltr' and 'rtl'.
def validate_attribute_value_dir(token, tag_name, attr_name, attr_value)
  accepted = ['ltr', 'rtl']
  check_enumerated_value(token, tag_name, attr_name, attr_value, accepted) { |error| yield error }
end
700
+
701
# draggable is an enumerated attribute; the list passed to
# check_enumerated_value is 'true' and 'false'.
def validate_attribute_value_draggable(token, tag_name, attr_name, attr_value)
  accepted = ['true', 'false']
  check_enumerated_value(token, tag_name, attr_name, attr_value, accepted) { |error| yield error }
end
706
+
707
# Attributes validated on any element by a shared checker.
alias validate_attribute_value_irrelevant check_boolean   # boolean attribute
alias validate_attribute_value_lang check_lang_code       # language code
709
+
710
# Validates a contextmenu attribute as an ID value and remembers the token so
# that #eof can later check the referenced ID.
def validate_attribute_value_contextmenu(token, tag_name, attr_name, attr_value)
  check_id(token, tag_name, attr_name, attr_value) { |error| yield error }
  @things_that_point_to_an_id << token
end
716
+
717
# Validates an id attribute and records it for whole-document checks.
#
# Side effects: the value is appended to @ids_we_have_known_and_loved and the
# token to @things_that_define_an_id, so that duplicates can be detected here
# and ID references (such as <span contextmenu>) can be resolved in #eof.
# A value already seen yields "duplicate-id" (and is still recorded again).
# Nothing is recorded for a nil value.
def validate_attribute_value_id(token, tag_name, attr_name, attr_value)
  check_id(token, tag_name, attr_name, attr_value) { |error| yield error }
  return unless attr_value

  already_seen = @ids_we_have_known_and_loved.include?(attr_value)
  if already_seen
    yield({:type => "ParseError",
           :data => "duplicate-id",
           :datavars => {"tagName" => tag_name}})
  end
  @ids_we_have_known_and_loved << attr_value
  @things_that_define_an_id << token
end
735
+
736
# tabindex on any element is validated as an integer.
alias validate_attribute_value_tabindex check_integer
737
+
738
# Placeholder for validating the ref attribute.
# XXX not implemented: accepts every value and yields no errors.
def validate_attribute_value_ref(token, tag_name, attr_name, attr_value)
end
741
+
742
# Placeholder for validating the template attribute.
# XXX not implemented: accepts every value and yields no errors.
def validate_attribute_value_template(token, tag_name, attr_name, attr_value)
end
745
+
746
# The xmlns attribute on <html> must be exactly the XHTML namespace; any
# other value yields "invalid-root-namespace".
def validate_attribute_value_html_xmlns(token, tag_name, attr_name, attr_value)
  return if attr_value == "http://www.w3.org/1999/xhtml"

  yield({:type => "ParseError",
         :data => "invalid-root-namespace",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
end
754
+
755
# Tag-specific attribute validators, each mapped onto a shared checker.
# <base>
alias validate_attribute_value_base_href check_iri
alias validate_attribute_value_base_target check_browsing_context
# <link>
alias validate_attribute_value_link_href check_iri
alias validate_attribute_value_link_rel check_link_relation
alias validate_attribute_value_link_media check_media_query
alias validate_attribute_value_link_hreflang check_lang_code
alias validate_attribute_value_link_type check_mime_type
# XXX <meta> attributes
# <style>
alias validate_attribute_value_style_media check_media_query
alias validate_attribute_value_style_type check_mime_type
alias validate_attribute_value_style_scoped check_boolean
# <blockquote>, <ol>, <li>
alias validate_attribute_value_blockquote_cite check_iri
alias validate_attribute_value_ol_start check_integer
alias validate_attribute_value_li_value check_integer
# XXX need tests from here on
# <a>
alias validate_attribute_value_a_href check_iri
alias validate_attribute_value_a_target check_browsing_context
772
+
773
# Validates the ping attribute of <a>: a space-separated list in which each
# entry is checked as an IRI. Errors from check_iri are yielded once per
# offending entry.
def validate_attribute_value_a_ping(token, tag_name, attr_name, attr_value)
  parse_token_list(attr_value).each do |url|
    # Check the individual entry, not the whole list.
    check_iri(token, tag_name, attr_name, url) { |error| yield error }
  end
end
781
+
782
# More tag-specific attribute validators mapped onto shared checkers.
# <a>
alias validate_attribute_value_a_rel check_link_relation
alias validate_attribute_value_a_media check_media_query
alias validate_attribute_value_a_hreflang check_lang_code
alias validate_attribute_value_a_type check_mime_type
# <q>, <time>
alias validate_attribute_value_q_cite check_iri
alias validate_attribute_value_time_datetime check_date_time
# <meter>
alias validate_attribute_value_meter_value check_floating_point_number
alias validate_attribute_value_meter_min check_floating_point_number
alias validate_attribute_value_meter_low check_floating_point_number
alias validate_attribute_value_meter_high check_floating_point_number
alias validate_attribute_value_meter_max check_floating_point_number
alias validate_attribute_value_meter_optimum check_floating_point_number
# <progress>
alias validate_attribute_value_progress_value check_floating_point_number
alias validate_attribute_value_progress_max check_floating_point_number
# <ins>, <del>
alias validate_attribute_value_ins_cite check_iri
alias validate_attribute_value_ins_datetime check_date_time
alias validate_attribute_value_del_cite check_iri
alias validate_attribute_value_del_datetime check_date_time
800
+
801
+ ##########################################################################
802
+ # Whole document validation (IDs, etc.)
803
+ ##########################################################################
804
+
805
# Whole-document validation, run after the last token.
#
# For every token recorded as carrying a contextmenu attribute:
# * a non-blank value that matches no recorded ID yields "id-does-not-exist";
# * a value matching a recorded ID whose defining element is not <menu>
#   yields "contextmenu-must-point-to-menu" (the first token defining that ID
#   decides).
# Blank values are skipped here; check_id reports them when the attribute is
# first seen.
def eof
  # The original code relied on the parser having turned each token's
  # attribute list into a Hash by now. Pair lists and nil are converted here
  # so the check also works if that has not happened (later duplicates win).
  attrs_of = lambda do |token|
    attrs = token[:data]
    attrs.is_a?(Hash) ? attrs : Hash[attrs || []]
  end

  @things_that_point_to_an_id.each do |token|
    tag_name = token.fetch(:name, "").downcase
    attr_value = attrs_of.call(token).fetch("contextmenu", "")
    next if attr_value.nil? || attr_value.empty?

    unless @ids_we_have_known_and_loved.include?(attr_value)
      yield({:type => "ParseError",
             :data => "id-does-not-exist",
             :datavars => {"tagName" => tag_name,
                           "attributeName" => "contextmenu",
                           "attributeValue" => attr_value}})
      next
    end

    target = @things_that_define_an_id.find do |ref_token|
      attrs_of.call(ref_token)["id"] == attr_value
    end
    if target && target.fetch(:name, "").downcase != "menu"
      yield({:type => "ParseError",
             :data => "contextmenu-must-point-to-menu"})
    end
  end
end
834
+ end