feedtools 0.2.26 → 0.2.27

Sign up to get free protection for your applications and to get access to all the features.
Files changed (166) hide show
  1. data/CHANGELOG +232 -216
  2. data/db/migration.rb +2 -0
  3. data/db/schema.mysql.sql +2 -0
  4. data/db/schema.postgresql.sql +3 -1
  5. data/db/schema.sqlite.sql +3 -1
  6. data/lib/feed_tools.rb +37 -14
  7. data/lib/feed_tools/database_feed_cache.rb +13 -2
  8. data/lib/feed_tools/feed.rb +430 -104
  9. data/lib/feed_tools/feed_item.rb +533 -268
  10. data/lib/feed_tools/helpers/generic_helper.rb +1 -1
  11. data/lib/feed_tools/helpers/html_helper.rb +78 -116
  12. data/lib/feed_tools/helpers/retrieval_helper.rb +33 -3
  13. data/lib/feed_tools/helpers/uri_helper.rb +46 -54
  14. data/lib/feed_tools/monkey_patch.rb +27 -1
  15. data/lib/feed_tools/vendor/html5/History.txt +10 -0
  16. data/lib/feed_tools/vendor/html5/Manifest.txt +117 -0
  17. data/lib/feed_tools/vendor/html5/README +45 -0
  18. data/lib/feed_tools/vendor/html5/Rakefile.rb +33 -0
  19. data/lib/feed_tools/vendor/html5/bin/html5 +217 -0
  20. data/lib/feed_tools/vendor/html5/lib/core_ext/string.rb +17 -0
  21. data/lib/feed_tools/vendor/html5/lib/html5.rb +13 -0
  22. data/lib/feed_tools/vendor/html5/lib/html5/constants.rb +1046 -0
  23. data/lib/feed_tools/vendor/html5/lib/html5/filters/base.rb +10 -0
  24. data/lib/feed_tools/vendor/html5/lib/html5/filters/inject_meta_charset.rb +82 -0
  25. data/lib/feed_tools/vendor/html5/lib/html5/filters/iso639codes.rb +752 -0
  26. data/lib/feed_tools/vendor/html5/lib/html5/filters/optionaltags.rb +198 -0
  27. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc2046.rb +30 -0
  28. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc3987.rb +89 -0
  29. data/lib/feed_tools/vendor/html5/lib/html5/filters/sanitizer.rb +15 -0
  30. data/lib/feed_tools/vendor/html5/lib/html5/filters/validator.rb +830 -0
  31. data/lib/feed_tools/vendor/html5/lib/html5/filters/whitespace.rb +36 -0
  32. data/lib/feed_tools/vendor/html5/lib/html5/html5parser.rb +248 -0
  33. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_body_phase.rb +46 -0
  34. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  35. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_head_phase.rb +50 -0
  36. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/before_head_phase.rb +41 -0
  37. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_body_phase.rb +613 -0
  38. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_caption_phase.rb +69 -0
  39. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_cell_phase.rb +78 -0
  40. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  41. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  42. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_head_phase.rb +138 -0
  43. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_row_phase.rb +89 -0
  44. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_select_phase.rb +85 -0
  45. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_body_phase.rb +86 -0
  46. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_phase.rb +115 -0
  47. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/initial_phase.rb +133 -0
  48. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/phase.rb +154 -0
  49. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/root_element_phase.rb +41 -0
  50. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  51. data/lib/feed_tools/vendor/html5/lib/html5/inputstream.rb +648 -0
  52. data/lib/feed_tools/vendor/html5/lib/html5/liberalxmlparser.rb +158 -0
  53. data/lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb +188 -0
  54. data/lib/feed_tools/vendor/html5/lib/html5/serializer.rb +2 -0
  55. data/lib/feed_tools/vendor/html5/lib/html5/serializer/htmlserializer.rb +179 -0
  56. data/lib/feed_tools/vendor/html5/lib/html5/serializer/xhtmlserializer.rb +20 -0
  57. data/lib/feed_tools/vendor/html5/lib/html5/sniffer.rb +45 -0
  58. data/lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb +966 -0
  59. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders.rb +24 -0
  60. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/base.rb +334 -0
  61. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/hpricot.rb +231 -0
  62. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/rexml.rb +209 -0
  63. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/simpletree.rb +185 -0
  64. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers.rb +26 -0
  65. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/base.rb +162 -0
  66. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/hpricot.rb +48 -0
  67. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/rexml.rb +48 -0
  68. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/simpletree.rb +48 -0
  69. data/lib/feed_tools/vendor/html5/lib/html5/version.rb +3 -0
  70. data/lib/feed_tools/vendor/html5/testdata/encoding/chardet/test_big5.txt +51 -0
  71. data/lib/feed_tools/vendor/html5/testdata/encoding/test-yahoo-jp.dat +10 -0
  72. data/lib/feed_tools/vendor/html5/testdata/encoding/tests1.dat +394 -0
  73. data/lib/feed_tools/vendor/html5/testdata/encoding/tests2.dat +81 -0
  74. data/lib/feed_tools/vendor/html5/testdata/sanitizer/tests1.dat +416 -0
  75. data/lib/feed_tools/vendor/html5/testdata/serializer/core.test +104 -0
  76. data/lib/feed_tools/vendor/html5/testdata/serializer/injectmeta.test +65 -0
  77. data/lib/feed_tools/vendor/html5/testdata/serializer/optionaltags.test +900 -0
  78. data/lib/feed_tools/vendor/html5/testdata/serializer/options.test +60 -0
  79. data/lib/feed_tools/vendor/html5/testdata/serializer/whitespace.test +51 -0
  80. data/lib/feed_tools/vendor/html5/testdata/sites/google-results.htm +1 -0
  81. data/lib/feed_tools/vendor/html5/testdata/sites/python-ref-import.htm +1 -0
  82. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps-old.htm +1 -0
  83. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps.htm +34275 -0
  84. data/lib/feed_tools/vendor/html5/testdata/sniffer/htmlOrFeed.json +43 -0
  85. data/lib/feed_tools/vendor/html5/testdata/tokenizer/contentModelFlags.test +48 -0
  86. data/lib/feed_tools/vendor/html5/testdata/tokenizer/entities.test +2339 -0
  87. data/lib/feed_tools/vendor/html5/testdata/tokenizer/escapeFlag.test +21 -0
  88. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test1.test +172 -0
  89. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test2.test +129 -0
  90. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test3.test +367 -0
  91. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test4.test +198 -0
  92. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests1.dat +1950 -0
  93. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests2.dat +773 -0
  94. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests3.dat +270 -0
  95. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests4.dat +60 -0
  96. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests5.dat +175 -0
  97. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests6.dat +196 -0
  98. data/lib/feed_tools/vendor/html5/testdata/validator/attributes.test +1035 -0
  99. data/lib/feed_tools/vendor/html5/testdata/validator/base-href-attribute.test +787 -0
  100. data/lib/feed_tools/vendor/html5/testdata/validator/base-target-attribute.test +35 -0
  101. data/lib/feed_tools/vendor/html5/testdata/validator/blockquote-cite-attribute.test +7 -0
  102. data/lib/feed_tools/vendor/html5/testdata/validator/classattribute.test +152 -0
  103. data/lib/feed_tools/vendor/html5/testdata/validator/contenteditableattribute.test +59 -0
  104. data/lib/feed_tools/vendor/html5/testdata/validator/contextmenuattribute.test +115 -0
  105. data/lib/feed_tools/vendor/html5/testdata/validator/dirattribute.test +59 -0
  106. data/lib/feed_tools/vendor/html5/testdata/validator/draggableattribute.test +63 -0
  107. data/lib/feed_tools/vendor/html5/testdata/validator/html-xmlns-attribute.test +23 -0
  108. data/lib/feed_tools/vendor/html5/testdata/validator/idattribute.test +115 -0
  109. data/lib/feed_tools/vendor/html5/testdata/validator/inputattributes.test +2795 -0
  110. data/lib/feed_tools/vendor/html5/testdata/validator/irrelevantattribute.test +63 -0
  111. data/lib/feed_tools/vendor/html5/testdata/validator/langattribute.test +5579 -0
  112. data/lib/feed_tools/vendor/html5/testdata/validator/li-value-attribute.test +7 -0
  113. data/lib/feed_tools/vendor/html5/testdata/validator/link-href-attribute.test +7 -0
  114. data/lib/feed_tools/vendor/html5/testdata/validator/link-hreflang-attribute.test +7 -0
  115. data/lib/feed_tools/vendor/html5/testdata/validator/link-rel-attribute.test +271 -0
  116. data/lib/feed_tools/vendor/html5/testdata/validator/ol-start-attribute.test +7 -0
  117. data/lib/feed_tools/vendor/html5/testdata/validator/starttags.test +375 -0
  118. data/lib/feed_tools/vendor/html5/testdata/validator/style-scoped-attribute.test +7 -0
  119. data/lib/feed_tools/vendor/html5/testdata/validator/tabindexattribute.test +79 -0
  120. data/lib/feed_tools/vendor/html5/tests/preamble.rb +72 -0
  121. data/lib/feed_tools/vendor/html5/tests/test_encoding.rb +35 -0
  122. data/lib/feed_tools/vendor/html5/tests/test_lxp.rb +279 -0
  123. data/lib/feed_tools/vendor/html5/tests/test_parser.rb +68 -0
  124. data/lib/feed_tools/vendor/html5/tests/test_sanitizer.rb +142 -0
  125. data/lib/feed_tools/vendor/html5/tests/test_serializer.rb +68 -0
  126. data/lib/feed_tools/vendor/html5/tests/test_sniffer.rb +27 -0
  127. data/lib/feed_tools/vendor/html5/tests/test_stream.rb +62 -0
  128. data/lib/feed_tools/vendor/html5/tests/test_tokenizer.rb +94 -0
  129. data/lib/feed_tools/vendor/html5/tests/test_treewalkers.rb +135 -0
  130. data/lib/feed_tools/vendor/html5/tests/test_validator.rb +31 -0
  131. data/lib/feed_tools/vendor/html5/tests/tokenizer_test_parser.rb +63 -0
  132. data/lib/feed_tools/vendor/uri.rb +781 -0
  133. data/lib/feed_tools/version.rb +1 -1
  134. data/rakefile +27 -6
  135. data/test/unit/atom_test.rb +298 -210
  136. data/test/unit/helper_test.rb +7 -12
  137. data/test/unit/rdf_test.rb +51 -1
  138. data/test/unit/rss_test.rb +13 -3
  139. metadata +239 -116
  140. data/lib/feed_tools/vendor/htree.rb +0 -97
  141. data/lib/feed_tools/vendor/htree/container.rb +0 -10
  142. data/lib/feed_tools/vendor/htree/context.rb +0 -67
  143. data/lib/feed_tools/vendor/htree/display.rb +0 -27
  144. data/lib/feed_tools/vendor/htree/doc.rb +0 -149
  145. data/lib/feed_tools/vendor/htree/elem.rb +0 -262
  146. data/lib/feed_tools/vendor/htree/encoder.rb +0 -163
  147. data/lib/feed_tools/vendor/htree/equality.rb +0 -218
  148. data/lib/feed_tools/vendor/htree/extract_text.rb +0 -37
  149. data/lib/feed_tools/vendor/htree/fstr.rb +0 -33
  150. data/lib/feed_tools/vendor/htree/gencode.rb +0 -97
  151. data/lib/feed_tools/vendor/htree/htmlinfo.rb +0 -672
  152. data/lib/feed_tools/vendor/htree/inspect.rb +0 -108
  153. data/lib/feed_tools/vendor/htree/leaf.rb +0 -94
  154. data/lib/feed_tools/vendor/htree/loc.rb +0 -367
  155. data/lib/feed_tools/vendor/htree/modules.rb +0 -48
  156. data/lib/feed_tools/vendor/htree/name.rb +0 -124
  157. data/lib/feed_tools/vendor/htree/output.rb +0 -207
  158. data/lib/feed_tools/vendor/htree/parse.rb +0 -409
  159. data/lib/feed_tools/vendor/htree/raw_string.rb +0 -124
  160. data/lib/feed_tools/vendor/htree/regexp-util.rb +0 -15
  161. data/lib/feed_tools/vendor/htree/rexml.rb +0 -130
  162. data/lib/feed_tools/vendor/htree/scan.rb +0 -166
  163. data/lib/feed_tools/vendor/htree/tag.rb +0 -111
  164. data/lib/feed_tools/vendor/htree/template.rb +0 -909
  165. data/lib/feed_tools/vendor/htree/text.rb +0 -115
  166. data/lib/feed_tools/vendor/htree/traverse.rb +0 -465
@@ -0,0 +1,198 @@
1
+ require 'html5/constants'
2
+ require 'html5/filters/base'
3
+
4
+ module HTML5
5
+ module Filters
6
+
7
+ class OptionalTagFilter < Base
8
# Walks the wrapped token stream (whatever __getobj__ returns) with a
# three-token sliding window.
#
# Yields +previous+, +current+, +following+ once per token, in stream
# order. +previous+ is nil for the first token and +following+ is nil for
# the last one. An empty stream yields nothing at all, so the block never
# receives a nil +current+ token.
def slider
  before = current = nil
  __getobj__.each do |upcoming|
    # The window only becomes meaningful once a "current" token exists.
    yield before, current, upcoming unless current.nil?
    before = current
    current = upcoming
  end
  # Flush the final token. Skipped for an empty stream: yielding a nil
  # token here made #each fail on token[:type].
  yield before, current, nil unless current.nil?
end
17
+
18
# Re-emits the token stream, dropping start and end tags that may be
# omitted.
#
# A start tag is dropped only when it carries no attributes
# (token[:data] is empty) and is_optional_start agrees; an end tag is
# dropped when is_optional_end agrees. Every other token is passed
# through to the caller's block unchanged.
def each
  slider do |previous, token, nexttok|
    omit =
      case token[:type]
      when :StartTag
        token[:data].empty? && is_optional_start(token[:name], previous, nexttok)
      when :EndTag
        is_optional_end(token[:name], nexttok)
      else
        false
      end
    yield token unless omit
  end
end
30
+
31
# Decides whether the start tag named +tagname+ may be left out.
#
# +previous+ is the token before the start tag (or nil) and +nexttok+ the
# token after it (or nil at the end of the stream). Returns true or false.
# Only html, head, body, colgroup and tbody start tags are ever optional.
def is_optional_start(tagname, previous, nexttok)
  type = nexttok ? nexttok[:type] : nil
  ignorable = [:Comment, :SpaceCharacters].include?(type)

  case tagname
  when 'html'
    # Omissible unless the first thing inside the element is a space
    # character or a comment.
    !ignorable
  when 'head'
    # Omissible when the first thing inside the element is an element.
    type == :StartTag
  when 'body'
    # Omissible unless the first thing inside is a space character or a
    # comment. The preceding token is not inspected here, so the tag is
    # conservatively kept whenever a script or style element follows.
    if ignorable
      false
    elsif type == :StartTag
      !%w[script style].include?(nexttok[:name])
    else
      true
    end
  when 'colgroup'
    # Omissible when the first child is a col element. The "not preceded
    # by a colgroup whose end tag was omitted" rule is enforced from the
    # other side: is_optional_end keeps a colgroup end tag that is
    # directly followed by another colgroup.
    type == :StartTag && nexttok[:name] == "col"
  when 'tbody'
    # Omissible when the first child is a tr element and the element is
    # not immediately preceded by the end tag of a tbody, thead or tfoot.
    return false unless type == :StartTag
    after_section = previous && previous[:type] == :EndTag &&
                    %w(tbody thead tfoot).include?(previous[:name])
    return false if after_section

    nexttok[:name] == 'tr'
  else
    false
  end
end
89
+
90
# Decides whether the end tag named +tagname+ may be left out, given the
# token that follows it (+nexttok+, nil at the end of the stream).
# Returns true or false.
def is_optional_end(tagname, nexttok)
  type = nexttok ? nexttok[:type] : nil
  starts_element = (type == :StartTag)
  # "No more content in the parent element": the parent closes next, or
  # the stream is exhausted.
  parent_done = (type == :EndTag || type == nil)

  case tagname
  when 'html', 'head', 'body'
    # Omissible unless immediately followed by a space character or a
    # comment.
    ![:Comment, :SpaceCharacters].include?(type)
  when 'li', 'optgroup', 'option', 'tr'
    # Omissible when followed by another element of the same name, or
    # when the parent has no more content.
    starts_element ? nexttok[:name] == tagname : parent_done
  when 'dt', 'dd'
    # dt: omissible when followed by a dt or dd element.
    # dd: additionally omissible when the parent has no more content.
    if starts_element
      %w(dt dd).include?(nexttok[:name])
    else
      tagname == 'dd' && parent_done
    end
  when 'p'
    # Omissible when followed by one of the listed block-level elements,
    # or when the parent has no more content.
    if starts_element
      %w(address blockquote dl fieldset form h1 h2 h3 h4 h5
         h6 hr menu ol p pre table ul).include?(nexttok[:name])
    else
      parent_done
    end
  when 'colgroup'
    # Omissible unless followed by a space character or a comment. The end
    # tag is also kept when another colgroup follows directly, which is
    # what lets is_optional_start ignore the preceding token.
    if [:Comment, :SpaceCharacters].include?(type)
      false
    elsif starts_element
      nexttok[:name] != 'colgroup'
    else
      true
    end
  when 'thead', 'tbody'
    # Both: omissible when followed by a tbody or tfoot element.
    # tbody only: also omissible when the parent has no more content.
    if starts_element
      %w(tbody tfoot).include?(nexttok[:name])
    else
      tagname == 'tbody' && parent_done
    end
  when 'tfoot'
    # Omissible when followed by a tbody element, or when the parent has
    # no more content.
    starts_element ? nexttok[:name] == 'tbody' : parent_done
  when 'td', 'th'
    # Omissible when followed by a td or th element, or when the parent
    # has no more content.
    starts_element ? %w(td th).include?(nexttok[:name]) : parent_done
  else
    false
  end
end
196
+ end
197
+ end
198
+ end
@@ -0,0 +1,30 @@
1
+ # adapted from feedvalidator, original copyright license is
2
+ #
3
+ # Copyright (c) 2002-2006, Sam Ruby, Mark Pilgrim, Joseph Walton, and Phil Ringnalda
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+
24
+ # mime_re = Regexp.new('[^\s()<>,;:\\"/[\]?=]+/[^\s()<>,;:\\"/[\]?=]+(\s*;\s*[^\s()<>,;:\\"/[\]?=]+=("(\\"|[^"])*"|[^\s()<>,;:\\"/[\]?=]+))*$')
25
+
26
# MIME type check used by the validator.
#
# Currently a stub: every +value+ is reported as valid. The real check
# (matching against a mime_re pattern) is disabled, so the argument is
# not inspected at all.
def is_valid_mime_type(value)
  # Disabled check, kept for reference:
  #   !!mime_re.match(value)
  return true
end
30
+
@@ -0,0 +1,89 @@
1
+ # adapted from feedvalidator, original copyright license is
2
+ #
3
+ # Copyright (c) 2002-2006, Sam Ruby, Mark Pilgrim, Joseph Walton, and Phil Ringnalda
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
# URI schemes registered with IANA
# (http://www.iana.org/assignments/uri-schemes.html).
iana_schemes = %w[
  ftp http gopher mailto news nntp telnet wais
  file prospero z39.50s z39.50r cid mid vemmi
  service imap nfs acap rtsp tip pop data dav
  opaquelocktoken sip sips tel fax modem ldap
  https soap.beep soap.beeps xmlrpc.beep xmlrpc.beeps
  urn go h323 ipp tftp mupdate pres im mtqp
  iris.beep dict snmp crid tag dns info
]
ALLOWED_SCHEMES = iana_schemes + ['javascript']

# Lenient URI reference: optional scheme, up to two slashes, URI characters.
RFC2396 = Regexp.new("^([a-zA-Z][0-9a-zA-Z+\\-\\.]*:)?/{0,2}[0-9a-zA-Z;/?:@&=+$\\.\\-_!~*'()%,#]*$", Regexp::MULTILINE)

# Fully qualified URI: the scheme is mandatory. This must be a constant;
# a top-level local variable is not visible inside method bodies.
RFC2396_FULL = Regexp.new("\\A[a-zA-Z][0-9a-zA-Z+\\-\\.]*:(//)?[0-9a-zA-Z;/?:@&=+$\\.\\-_!~*'()%,#]+\\z")

# URN and tag: URI syntax. These are regexp literals on purpose: inside a
# double-quoted string "\d" collapses to "d" and "\-" to "-", which
# silently corrupts the pattern. \A and \z anchor the whole value, not
# just one of its lines.
URN = %r{\A[Uu][Rr][Nn]:[a-zA-Z0-9][a-zA-Z0-9\-]{1,31}:([a-zA-Z0-9()+,.:=@;$_!*'\-]|%[0-9A-Fa-f]{2})+\z}
TAG = %r{\Atag:([a-z0-9\-._]+?@)?[a-z0-9.\-]+?,\d{4}(-\d{2}(-\d{2})?)?:[0-9a-zA-Z;/?:@&=+$.\-_!~*'()%,]*(\#[0-9a-zA-Z;/?:@&=+$.\-_!~*'()%,]*)?\z}

# Validates +value+ as a URI.
#
# value       - String to check.
# uri_pattern - pattern the whole value must match (default RFC2396).
#
# Returns a two-element array [valid, error_code]; error_code is "" when
# valid, otherwise one of "invalid-tag-uri", "invalid-urn",
# "invalid-uri-char", "invalid-uri", "invalid-http-or-ftp-uri" or
# "invalid-scheme".
def is_valid_uri(value, uri_pattern = RFC2396)
  # The scheme is nil when the value is empty or consists only of colons.
  scheme = value.split(':').first
  scheme = scheme.downcase if scheme

  if scheme == 'tag'
    return false, "invalid-tag-uri" unless TAG.match(value)
  elsif scheme == 'urn'
    return false, "invalid-urn" unless URN.match(value)
  else
    found = uri_pattern.match(value)
    if found.nil? || value.empty? || found[0] != value
      # The pattern did not cover the whole value (an empty value always
      # counts as a mismatch). Work out why.
      return false, "invalid-uri" if value.empty?

      uri_char = /\A[0-9a-zA-Z;\/?:@&=+$.\-_!~*'()%,\#]\z/
      has_non_ascii = false
      value.each_byte do |b|
        if b >= 128
          has_non_ascii = true
        elsif !uri_char.match(b.chr)
          return false, "invalid-uri-char"
        end
      end

      # Every character is legal yet the pattern still failed, e.g. a
      # missing scheme when a fully qualified URI is required.
      return false, "invalid-uri" unless has_non_ascii

      # NOTE(review): values containing non-ASCII bytes are accepted from
      # here on, as before. No IDNA transcoding is available to tell
      # "uri-not-iri" apart from "invalid-uri", and is_valid_iri relies on
      # this leniency.
    elsif ['http', 'ftp'].include?(scheme)
      return false, "invalid-http-or-ftp-uri" unless value.match(%r{^\w+://[^/].*})
    elsif value.index(':') && scheme && scheme.match(/^[a-z]+$/) && !ALLOWED_SCHEMES.include?(scheme)
      return false, "invalid-scheme"
    end
  end

  return true, ""
end

# Validates +value+ as an IRI by delegating to is_valid_uri.
#
# Returns the same [valid, error_code] pair as is_valid_uri.
def is_valid_iri(value)
  begin
    value = value.encode('idna') if value.length > 0
  rescue
    # Deliberate best effort: without an 'idna' transcoder the encode call
    # raises, and the value is validated as given.
  end
  is_valid_uri(value)
end

# Validates +value+ as a URI that must carry an explicit scheme.
#
# Returns the same [valid, error_code] pair as is_valid_uri.
def is_valid_fully_qualified_uri(value)
  is_valid_uri(value, RFC2396_FULL)
end
@@ -0,0 +1,15 @@
1
+ require 'html5/filters/base'
2
+ require 'html5/sanitizer'
3
+
4
+ module HTML5
5
+ module Filters
6
+ class HTMLSanitizeFilter < Base
7
+ include HTMLSanitizeModule
8
+ def each
9
+ __getobj__.each do |token|
10
+ yield(sanitize_token(token))
11
+ end
12
+ end
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,830 @@
1
+ # HTML 5 conformance checker
2
+ #
3
+ # Warning: this module is experimental, incomplete, and subject to removal at any time.
4
+ #
5
+ # Usage:
6
+ # >>> from html5lib.html5parser import HTMLParser
7
+ # >>> from html5lib.filters.validator import HTMLConformanceChecker
8
+ # >>> p = HTMLParser(tokenizer=HTMLConformanceChecker)
9
+ # >>> p.parse('<!doctype html>\n<html foo=bar></html>')
10
+ # <<class 'html5lib.treebuilders.simpletree.Document'> None>
11
+ # >>> p.errors
12
+ # [((2, 14), 'unknown-attribute', {'attributeName': u'foo', 'tagName': u'html'})]
13
+
14
+ require 'html5/constants'
15
+ require 'html5/filters/base'
16
+ require 'html5/filters/iso639codes'
17
+ require 'html5/filters/rfc3987'
18
+ require 'html5/filters/rfc2046'
19
+
20
# Identity wrapper placed around message literals: returns +str+ itself.
# Presumably a hook for gettext-style translation; nothing is translated
# here.
def _(str)
  str
end
21
+
22
class String
  # Converts a CamelCase (optionally namespaced) name to snake_case.
  # Lifted from Rails.
  #
  # "::" becomes "/", an underscore is inserted at each lower-to-upper
  # (or acronym-to-word) boundary, dashes become underscores, and the
  # result is downcased. Returns a new String; the receiver is untouched.
  def underscore
    pathish  = gsub(/::/, '/')
    acronyms = pathish.gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2')
    words    = acronyms.gsub(/([a-z\d])([A-Z])/, '\1_\2')
    words.tr("-", "_").downcase
  end
end
32
+
33
# Registers the conformance checker's error messages in the shared HTML5::E
# table, keyed by error code. %(name) placeholders are filled in from the
# data attached to each reported error.
HTML5::E.update({
  "unknown-start-tag" =>
    _("Unknown start tag <%(tagName)>."),
  "unknown-attribute" =>
    _("Unknown '%(attributeName)' attribute on <%(tagName)>."),
  "missing-required-attribute" =>
    _("The '%(attributeName)' attribute is required on <%(tagName)>."),
  "unknown-input-type" =>
    _("Illegal value for attribute on <input type='%(inputType)'>."),
  "attribute-not-allowed-on-this-input-type" =>
    _("The '%(attributeName)' attribute is not allowed on <input type=%(inputType)>."),
  "deprecated-attribute" =>
    _("This attribute is deprecated: '%(attributeName)' attribute on <%(tagName)>."),
  "duplicate-value-in-token-list" =>
    _("Duplicate value in token list: '%(attributeValue)' in '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-attribute-value" =>
    _("Invalid attribute value: '%(attributeName)' attribute on <%(tagName)>."),
  "space-in-id" =>
    _("Whitespace is not allowed here: '%(attributeName)' attribute on <%(tagName)>."),
  "duplicate-id" =>
    _("This ID was already defined earlier: 'id' attribute on <%(tagName)>."),
  "attribute-value-can-not-be-blank" =>
    _("This value can not be blank: '%(attributeName)' attribute on <%(tagName)>."),
  "id-does-not-exist" =>
    _("This value refers to a non-existent ID: '%(attributeName)' attribute on <%(tagName)>."),
  # The three "<%(tagName)>" placeholders below used to read "<%tagName)>",
  # which left the tag name unsubstituted.
  "invalid-enumerated-value" =>
    _("Value must be one of %(enumeratedValues): '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-boolean-value" =>
    _("Value must be one of %(enumeratedValues): '%(attributeName)' attribute on <%(tagName)>."),
  "contextmenu-must-point-to-menu" =>
    _("The contextmenu attribute must point to an ID defined on a <menu> element."),
  "invalid-lang-code" =>
    _("Invalid language code: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-integer-value" =>
    _("Value must be an integer: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-root-namespace" =>
    _("Root namespace must be 'http://www.w3.org/1999/xhtml', or omitted."),
  "invalid-browsing-context" =>
    _("Value must be one of ('_self', '_parent', '_top'), or a name that does not start with '_': '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-tag-uri" =>
    _("Invalid URI: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-urn" =>
    _("Invalid URN: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-uri-char" =>
    _("Illegal character in URI: '%(attributeName)' attribute on <%(tagName)>."),
  "uri-not-iri" =>
    _("Expected a URI but found an IRI: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-uri" =>
    _("Invalid URI: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-http-or-ftp-uri" =>
    _("Invalid URI: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-scheme" =>
    _("Unregistered URI scheme: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-rel" =>
    _("Invalid link relation: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-mime-type" =>
    _("Invalid MIME type: '%(attributeName)' attribute on <%(tagName)>."),
})
91
+
92
+
93
+ class HTMLConformanceChecker < HTML5::Filters::Base
94
+
95
+ @@global_attributes = %w[class contenteditable contextmenu dir
96
+ draggable id irrelevant lang ref tabindex template
97
+ title onabort onbeforeunload onblur onchange onclick
98
+ oncontextmenu ondblclick ondrag ondragend ondragenter
99
+ ondragleave ondragover ondragstart ondrop onerror
100
+ onfocus onkeydown onkeypress onkeyup onload onmessage
101
+ onmousedown onmousemove onmouseout onmouseover onmouseup
102
+ onmousewheel onresize onscroll onselect onsubmit onunload]
103
+ # XXX lang in HTML only, xml:lang in XHTML only
104
+ # XXX validate ref, template
105
+
106
+ @@allowed_attribute_map = {
107
+ 'html' => %w[xmlns],
108
+ 'head' => [],
109
+ 'title' => [],
110
+ 'base' => %w[href target],
111
+ 'link' => %w[href rel media hreflang type],
112
+ 'meta' => %w[name http-equiv content charset], # XXX charset in HTML only
113
+ 'style' => %w[media type scoped],
114
+ 'body' => [],
115
+ 'section' => [],
116
+ 'nav' => [],
117
+ 'article' => [],
118
+ 'blockquote' => %w[cite],
119
+ 'aside' => [],
120
+ 'h1' => [],
121
+ 'h2' => [],
122
+ 'h3' => [],
123
+ 'h4' => [],
124
+ 'h5' => [],
125
+ 'h6' => [],
126
+ 'header' => [],
127
+ 'footer' => [],
128
+ 'address' => [],
129
+ 'p' => [],
130
+ 'hr' => [],
131
+ 'br' => [],
132
+ 'dialog' => [],
133
+ 'pre' => [],
134
+ 'ol' => %w[start],
135
+ 'ul' => [],
136
+ 'li' => %w[value], # XXX depends on parent
137
+ 'dl' => [],
138
+ 'dt' => [],
139
+ 'dd' => [],
140
+ 'a' => %w[href target ping rel media hreflang type],
141
+ 'q' => %w[cite],
142
+ 'cite' => [],
143
+ 'em' => [],
144
+ 'strong' => [],
145
+ 'small' => [],
146
+ 'm' => [],
147
+ 'dfn' => [],
148
+ 'abbr' => [],
149
+ 'time' => %w[datetime],
150
+ 'meter' => %w[value min low high max optimum],
151
+ 'progress' => %w[value max],
152
+ 'code' => [],
153
+ 'var' => [],
154
+ 'samp' => [],
155
+ 'kbd' => [],
156
+ 'sup' => [],
157
+ 'sub' => [],
158
+ 'span' => [],
159
+ 'i' => [],
160
+ 'b' => [],
161
+ 'bdo' => [],
162
+ 'ins' => %w[cite datetime],
163
+ 'del' => %w[cite datetime],
164
+ 'figure' => [],
165
+ 'img' => %w[alt src usemap ismap height width], # XXX ismap depends on parent
166
+ 'iframe' => %w[src],
167
+ # <embed> handled separately
168
+ 'object' => %w[data type usemap height width],
169
+ 'param' => %w[name value],
170
+ 'video' => %w[src autoplay start loopstart loopend end loopcount controls],
171
+ 'audio' => %w[src autoplay start loopstart loopend end loopcount controls],
172
+ 'source' => %w[src type media],
173
+ 'canvas' => %w[height width],
174
+ 'map' => [],
175
+ 'area' => %w[alt coords shape href target ping rel media hreflang type],
176
+ 'table' => [],
177
+ 'caption' => [],
178
+ 'colgroup' => %w[span], # XXX only if element contains no <col> elements
179
+ 'col' => %w[span],
180
+ 'tbody' => [],
181
+ 'thead' => [],
182
+ 'tfoot' => [],
183
+ 'tr' => [],
184
+ 'td' => %w[colspan rowspan],
185
+ 'th' => %w[colspan rowspan scope],
186
+ # all possible <input> attributes are listed here but <input> is really handled separately
187
+ 'input' => %w[accept accesskey action alt autocomplete autofocus checked
188
+ disabled enctype form inputmode list maxlength method min
189
+ max name pattern step readonly replace required size src
190
+ tabindex target template value
191
+ ],
192
+ 'form' => %w[action method enctype accept name onsubmit onreset accept-charset
193
+ data replace
194
+ ],
195
+ 'button' => %w[action enctype method replace template name value type disabled form autofocus], # XXX may need matrix of acceptable attributes based on value of type attribute (like input)
196
+ 'select' => %w[name size multiple disabled data accesskey form autofocus],
197
+ 'optgroup' => %w[disabled label],
198
+ 'option' => %w[selected disabled label value],
199
+ 'textarea' => %w[maxlength name rows cols disabled readonly required form autofocus wrap accept],
200
+ 'label' => %w[for accesskey form],
201
+ 'fieldset' => %w[disabled form],
202
+ 'output' => %w[form name for onforminput onformchange],
203
+ 'datalist' => %w[data],
204
+ # XXX repetition model for repeating form controls
205
+ 'script' => %w[src defer async type],
206
+ 'noscript' => [],
207
+ 'noembed' => [],
208
+ 'event-source' => %w[src],
209
+ 'details' => %w[open],
210
+ 'datagrid' => %w[multiple disabled],
211
+ 'command' => %w[type label icon hidden disabled checked radiogroup default],
212
+ 'menu' => %w[type label autosubmit],
213
+ 'datatemplate' => [],
214
+ 'rule' => [],
215
+ 'nest' => [],
216
+ 'legend' => [],
217
+ 'div' => [],
218
+ 'font' => %w[style]
219
+ }
220
+
221
+ @@required_attribute_map = {
222
+ 'link' => %w[href rel],
223
+ 'bdo' => %w[dir],
224
+ 'img' => %w[src],
225
+ 'embed' => %w[src],
226
+ 'object' => [], # XXX one of 'data' or 'type' is required
227
+ 'param' => %w[name value],
228
+ 'source' => %w[src],
229
+ 'map' => %w[id]
230
+ }
231
+
232
+ @@input_type_allowed_attribute_map = {
233
+ 'text' => %w[accesskey autocomplete autofocus disabled form inputmode list maxlength name pattern readonly required size tabindex value],
234
+ 'password' => %w[accesskey autocomplete autofocus disabled form inputmode maxlength name pattern readonly required size tabindex value],
235
+ 'checkbox' => %w[accesskey autofocus checked disabled form name required tabindex value],
236
+ 'radio' => %w[accesskey autofocus checked disabled form name required tabindex value],
237
+ 'button' => %w[accesskey autofocus disabled form name tabindex value],
238
+ 'submit' => %w[accesskey action autofocus disabled enctype form method name replace tabindex target value],
239
+ 'reset' => %w[accesskey autofocus disabled form name tabindex value],
240
+ 'add' => %w[accesskey autofocus disabled form name tabindex template value],
241
+ 'remove' => %w[accesskey autofocus disabled form name tabindex value],
242
+ 'move-up' => %w[accesskey autofocus disabled form name tabindex value],
243
+ 'move-down' => %w[accesskey autofocus disabled form name tabindex value],
244
+ 'file' => %w[accept accesskey autofocus disabled form min max name required tabindex],
245
+ 'hidden' => %w[disabled form name value],
246
+ 'image' => %w[accesskey action alt autofocus disabled enctype form method name replace src tabindex target],
247
+ 'datetime' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
248
+ 'datetime-local' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
249
+ 'date' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
250
+ 'month' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
251
+ 'week' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
252
+ 'time' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
253
+ 'number' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
254
+ 'range' => %w[accesskey autocomplete autofocus disabled form list min max name step readonly required tabindex value],
255
+ 'email' => %w[accesskey autocomplete autofocus disabled form inputmode list maxlength name pattern readonly required tabindex value],
256
+ 'url' => %w[accesskey autocomplete autofocus disabled form inputmode list maxlength name pattern readonly required tabindex value],
257
+ }
258
+
259
+ @@input_type_deprecated_attribute_map = {
260
+ 'text' => ['size'],
261
+ 'password' => ['size']
262
+ }
263
+
264
+ @@link_rel_values = %w[alternate archive archives author contact feed first begin start help icon index top contents toc last end license copyright next pingback prefetch prev previous search stylesheet sidebar tag up]
265
+ @@a_rel_values = %w[alternate archive archives author contact feed first begin start help index top contents toc last end license copyright next prev previous search sidebar tag up bookmark external nofollow]
266
+
267
# Builds a checker over +stream+: the stream (plus any extra +args+) is
# handed to HTML5::HTMLTokenizer, the tokenizer is passed to the superclass
# constructor, and the three ID-tracking lists start out empty.
def initialize(stream, *args)
  tokenizer = HTML5::HTMLTokenizer.new(stream, *args)
  super(tokenizer)
  @things_that_define_an_id = []
  @things_that_point_to_an_id = []
  @ids_we_have_known_and_loved = []
end
273
+
274
# Streams every token of the wrapped object to the caller's block.
#
# Before a token is passed on, the most specific validator this object
# responds to is run: "validate_<type>_<name>" if it exists, otherwise
# "validate_<type>". Anything a validator yields is forwarded to the
# caller ahead of the token itself. Once the stream is exhausted, whatever
# #eof yields is forwarded as well.
def each
  __getobj__.each do |token|
    type_part = token.fetch(:type, '-').to_s.underscore
    name_part = token.fetch(:name, '-').to_s.underscore
    candidates = ["validate_#{type_part}_#{name_part}", "validate_#{type_part}"]
    handler = candidates.find { |candidate| respond_to?(candidate) }
    send(handler, token) { |problem| yield problem } if handler
    yield token
  end
  eof { |problem| yield problem }
end
293
+
294
+ ##########################################################################
295
+ # Start tag validation
296
+ ##########################################################################
297
+
298
# Generic start-tag validation: runs, in order, the unknown-tag check, the
# required-attribute check, the unknown-attribute check and the attribute
# value checks, forwarding every reported problem to the caller's block.
def validate_start_tag(token)
  relay = proc { |problem| yield problem }
  check_unknown_start_tag(token, &relay)
  check_start_tag_required_attributes(token, &relay)
  check_start_tag_unknown_attributes(token, &relay)
  check_attribute_values(token, &relay)
end
310
+
311
# Start-tag validation for <embed>: required attributes and attribute
# values are checked, but the unknown-attribute check is deliberately not
# run because the spec allows "any attributes w/o namespace" on <embed>.
def validate_start_tag_embed(token)
  relay = proc { |problem| yield problem }
  check_start_tag_required_attributes(token, &relay)
  check_attribute_values(token, &relay)
end
321
+
322
# Start-tag validation for <input>, whose allowed attributes depend on the
# value of its "type" attribute ("text" when absent).
#
# Yields, in this order:
# * whatever check_attribute_values reports;
# * "unknown-input-type" when the type has no entry in
#   @@input_type_allowed_attribute_map;
# * per attribute: "unknown-attribute" when it is not an <input> attribute
#   at all, or "attribute-not-allowed-on-this-input-type" when it is not
#   allowed for this type, followed by "deprecated-attribute" when the
#   attribute is listed as deprecated for this type.
def validate_start_tag_input(token)
  check_attribute_values(token) { |problem| yield problem }

  # Attribute names are compared case-insensitively; a later duplicate
  # overwrites an earlier one.
  attr_dict = {}
  token[:data].each { |(name, value)| attr_dict[name.downcase] = value }

  input_type = attr_dict.fetch('type', "text")
  unless @@input_type_allowed_attribute_map.key?(input_type)
    yield({:type => "ParseError",
           :data => "unknown-input-type",
           :datavars => {:attrValue => input_type}})
  end

  permitted = @@input_type_allowed_attribute_map.fetch(input_type, [])
  deprecated = @@input_type_deprecated_attribute_map.fetch(input_type, [])
  attr_dict.each do |attr_name, _attr_value|
    known = @@allowed_attribute_map['input'].include?(attr_name)
    if !known
      yield({:type => "ParseError",
             :data => "unknown-attribute",
             :datavars => {"tagName" => "input",
                           "attributeName" => attr_name}})
    elsif !permitted.include?(attr_name)
      yield({:type => "ParseError",
             :data => "attribute-not-allowed-on-this-input-type",
             :datavars => {"attributeName" => attr_name,
                           "inputType" => input_type}})
    end
    next unless deprecated.include?(attr_name)
    yield({:type => "ParseError",
           :data => "deprecated-attribute",
           :datavars => {"attributeName" => attr_name,
                         "inputType" => input_type}})
  end
end
354
+
355
+ ##########################################################################
356
+ # Start tag validation helpers
357
+ ##########################################################################
358
+
359
# Yields an "unknown-start-tag" ParseError when the lower-cased tag name
# has no entry in @@allowed_attribute_map. A missing name is treated as "".
def check_unknown_start_tag(token)
  tag = (token[:name] || "").downcase
  return if @@allowed_attribute_map.key?(tag)
  yield({:type => "ParseError",
         :data => "unknown-start-tag",
         :datavars => {"tagName" => tag}})
end
368
+
369
# Yields one "missing-required-attribute" ParseError for every attribute
# that @@required_attribute_map lists for this tag but that is absent from
# the token. Tags without an entry in the map are not checked.
def check_start_tag_required_attributes(token)
  tag = (token[:name] || "").downcase
  return unless @@required_attribute_map.key?(tag)
  present = (token[:data] || []).map { |pair| pair[0] }
  @@required_attribute_map[tag].each do |required|
    next if present.include?(required)
    yield({:type => "ParseError",
           :data => "missing-required-attribute",
           :datavars => {"tagName" => tag,
                         "attributeName" => required}})
  end
end
384
+
385
# Yields an "unknown-attribute" ParseError for every attribute whose
# lower-cased name is neither a global attribute nor listed for this tag
# in @@allowed_attribute_map. The error carries the name as written.
def check_start_tag_unknown_attributes(token)
  tag = token[:name].downcase
  recognised = @@global_attributes | @@allowed_attribute_map.fetch(tag, [])
  token.fetch(:data, []).each do |attr_name, _attr_value|
    next if recognised.include?(attr_name.downcase)
    yield({:type => "ParseError",
           :data => "unknown-attribute",
           :datavars => {"tagName" => tag,
                         "attributeName" => attr_name}})
  end
end
398
+
399
+ ##########################################################################
400
+ # Attribute validation helpers
401
+ ##########################################################################
402
+
403
+ # def checkURI(token, tag_name, attr_name, attr_value)
404
+ # is_valid, error_code = rfc3987.is_valid_uri(attr_value)
405
+ # if not is_valid
406
+ # yield {:type => "ParseError",
407
+ # :data => error_code,
408
+ # :datavars => {"tagName" => tag_name,
409
+ # "attributeName" => attr_name}}
410
+ # yield {:type => "ParseError",
411
+ # :data => "invalid-attribute-value",
412
+ # :datavars => {"tagName" => tag_name,
413
+ # "attributeName" => attr_name}}
414
+
415
# Validates +attr_value+ with is_valid_iri. On failure two ParseErrors are
# yielded: first the specific error code returned by is_valid_iri, then the
# generic "invalid-attribute-value". Nothing is yielded for a valid value.
def check_iri(token, tag_name, attr_name, attr_value)
  valid, reason = is_valid_iri(attr_value)
  return if valid
  yield({:type => "ParseError",
         :data => reason,
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
  yield({:type => "ParseError",
         :data => "invalid-attribute-value",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
end
428
+
429
# Validates an ID-valued attribute.
#
# Yields "attribute-value-can-not-be-blank" when the value is nil or empty,
# and stops there: a blank value has nothing further to inspect, and
# continuing would call each_byte on nil. Otherwise yields "space-in-id"
# followed by "invalid-attribute-value" (once, however many space
# characters there are) when any byte of the value is one of
# HTML5::SPACE_CHARACTERS.
def check_id(token, tag_name, attr_name, attr_value)
  if !attr_value || attr_value.length == 0
    yield({:type => "ParseError",
           :data => "attribute-value-can-not-be-blank",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
    return
  end
  has_space = attr_value.each_byte.any? do |b|
    HTML5::SPACE_CHARACTERS.include?([b].pack('c*'))
  end
  return unless has_space
  yield({:type => "ParseError",
         :data => "space-in-id",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
  yield({:type => "ParseError",
         :data => "invalid-attribute-value",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
end
451
+
452
# Splits +value+ into the tokens separated by runs of
# HTML5::SPACE_CHARACTERS and returns them as an array. The value is
# scanned byte by byte; a trailing space is appended first so the final
# token is flushed inside the loop.
def parse_token_list(value)
  tokens = []
  pending = ''
  (value + ' ').each_byte do |byte|
    char = [byte].pack('c*')
    unless HTML5::SPACE_CHARACTERS.include?(char)
      pending += char
      next
    end
    next unless pending.length > 0
    tokens << pending
    pending = ''
  end
  tokens << pending if pending.length > 0
  tokens
end
471
+
472
# Checks a space-separated attribute value (a "set of tokens", see
# http://www.whatwg.org/specs/web-apps/current-work/#set-of) for repeated
# entries. "Token" here means a token inside the attribute value, not a
# tokenizer token.
#
# Yields a single "duplicate-value-in-token-list" ParseError naming the
# first repeated value; later repetitions are not reported.
def check_token_list(tag_name, attr_name, attr_value)
  seen = {}
  repeated = parse_token_list(attr_value).find do |item|
    already_seen = seen.key?(item)
    seen[item] = 1
    already_seen
  end
  return unless repeated
  yield({:type => "ParseError",
         :data => "duplicate-value-in-token-list",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name,
                       "attributeValue" => repeated}})
end
491
+
492
# Validates an attribute whose value must be one of +enumerated_values+
# (compared case-insensitively against the lower-cased value).
#
# Yields "attribute-value-can-not-be-blank" for a nil or empty value.
# Otherwise, when the value is not in the list, yields
# "invalid-enumerated-value" followed by "invalid-attribute-value".
#
# The comparison uses a lower-cased copy: the caller's string (which
# belongs to the token passing through this filter) is left untouched, so
# frozen values are accepted too.
def check_enumerated_value(token, tag_name, attr_name, attr_value, enumerated_values)
  if !attr_value || attr_value.length == 0
    yield({:type => "ParseError",
           :data => "attribute-value-can-not-be-blank",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
    return
  end
  return if enumerated_values.include?(attr_value.downcase)
  # The key is "attributeName", as in every other error of this class.
  yield({:type => "ParseError",
         :data => "invalid-enumerated-value",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name,
                       "enumeratedValues" => enumerated_values}})
  yield({:type => "ParseError",
         :data => "invalid-attribute-value",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
end
513
+
514
# Validates a boolean attribute: the only accepted values are the
# attribute's own name and the empty string. Any other value yields
# "invalid-boolean-value" followed by "invalid-attribute-value".
def check_boolean(token, tag_name, attr_name, attr_value)
  accepted = [attr_name, '']
  return if accepted.include?(attr_value)
  yield({:type => "ParseError",
         :data => "invalid-boolean-value",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name,
                       "enumeratedValues" => accepted}})
  yield({:type => "ParseError",
         :data => "invalid-attribute-value",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
end
528
+
529
# Validates an integer-valued attribute with a small state machine:
# leading HTML5::SPACE_CHARACTERS are skipped, an optional '-' may follow,
# then at least one of HTML5::DIGITS is required. Anything after the
# digits is trailing junk and is not reported.
#
# Yields "invalid-integer-value" when a non-digit appears where a digit
# (or sign) is required, and "attribute-value-can-not-be-blank" when the
# value is nil or contains no digits at all.
def check_integer(token, tag_name, attr_name, attr_value)
  blank = {:type => "ParseError",
           :data => "attribute-value-can-not-be-blank",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}}
  # A missing value has no characters to scan; report it instead of
  # raising NoMethodError.
  if attr_value.nil?
    yield blank
    return
  end
  error = {:type => "ParseError",
           :data => "invalid-integer-value",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name,
                         "attributeValue" => attr_value}}
  digits_seen = false
  state = 'begin' # ('begin', 'initial-number', 'in-number', 'trailing-junk')
  # /./m so that newlines are visited too; without the m flag they are
  # skipped and a value such as "-\n5" would be accepted.
  attr_value.scan(/./m) do |c|
    case state
    when 'begin'
      next if HTML5::SPACE_CHARACTERS.include?(c)
      if c == '-'
        state = 'initial-number'
      elsif HTML5::DIGITS.include?(c)
        digits_seen = true
        state = 'in-number'
      else
        yield error
        return
      end
    when 'initial-number'
      unless HTML5::DIGITS.include?(c)
        yield error
        return
      end
      digits_seen = true
      state = 'in-number'
    when 'in-number'
      state = 'trailing-junk' unless HTML5::DIGITS.include?(c)
    end
  end
  yield blank unless digits_seen
end
576
+
577
# Placeholder for floating-point attribute validation (marked XXX in the
# original). Currently does nothing: no value is inspected and nothing is
# yielded.
def check_floating_point_number(token, tag_name, attr_name, attr_value)
  # XXX not implemented
  nil
end
580
+
581
# Validates a browsing-context name (e.g. a "target" attribute).
#
# Names that do not start with an underscore are always accepted, as are
# nil and empty values. Names starting with an underscore must be one of
# _self, _parent, _top or _blank (case-insensitive); anything else yields
# "invalid-browsing-context".
#
# The comparison is made on a lower-cased copy so the caller's string is
# never modified and frozen values do not raise.
def check_browsing_context(token, tag_name, attr_name, attr_value)
  return if not attr_value
  # `str[0]` and `?_` have the same type on every Ruby version (Integer on
  # 1.8, String on 1.9+), so this comparison works on both.
  return if attr_value[0] != ?_
  return if ['_self', '_parent', '_top', '_blank'].include?(attr_value.downcase)
  yield({:type => "ParseError",
         :data => "invalid-browsing-context",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
end
591
+
592
# Validates a language-code attribute with is_valid_lang_code. A nil or
# empty value is accepted without being checked; an invalid code yields
# "invalid-lang-code".
def check_lang_code(token, tag_name, attr_name, attr_value)
  return unless attr_value && attr_value != '' # blank is OK
  return if is_valid_lang_code(attr_value)
  yield({:type => "ParseError",
         :data => "invalid-lang-code",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name,
                       "attributeValue" => attr_value}})
end
602
+
603
# Validates a MIME-type attribute with is_valid_mime_type.
#
# A nil or empty value yields "attribute-value-can-not-be-blank" and
# nothing else, matching how check_id and check_enumerated_value treat
# blank values; the value is then not passed to is_valid_mime_type.
# A non-blank value that fails validation yields "invalid-mime-type".
#
# XXX needs tests
def check_mime_type(token, tag_name, attr_name, attr_value)
  if !attr_value || attr_value.length == 0
    yield({:type => "ParseError",
           :data => "attribute-value-can-not-be-blank",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
    return
  end
  return if is_valid_mime_type(attr_value)
  yield({:type => "ParseError",
         :data => "invalid-mime-type",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name,
                       "attributeValue" => attr_value}})
end
619
+
620
# Placeholder for media-query validation (marked XXX in the original).
# Currently does nothing: no value is inspected and nothing is yielded.
def check_media_query(token, tag_name, attr_name, attr_value)
  # XXX not implemented
  nil
end
623
+
624
# Validates a "rel" attribute. Duplicate detection is delegated to
# check_token_list; afterwards every token that is not in the list of
# allowed relations yields "invalid-rel". <link> is checked against
# @@link_rel_values, every other tag against @@a_rel_values.
def check_link_relation(token, tag_name, attr_name, attr_value)
  check_token_list(tag_name, attr_name, attr_value) { |problem| yield problem }
  permitted = (tag_name == 'link') ? @@link_rel_values : @@a_rel_values
  parse_token_list(attr_value).each do |relation|
    next if permitted.include?(relation)
    yield({:type => "ParseError",
           :data => "invalid-rel",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
  end
end
639
+
640
# Placeholder for date/time attribute validation (marked XXX in the
# original). Currently does nothing: no value is inspected and nothing is
# yielded.
#
# The original carried an unfinished sketch of the intended approach: a
# character-by-character state machine starting in a 'begin' state that
# skips leading space characters and moves on when it sees a digit. The
# dead `state` local from that sketch has been removed.
def check_date_time(token, tag_name, attr_name, attr_value)
  # XXX not implemented
  nil
end
650
+
651
+ ##########################################################################
652
+ # Attribute validation
653
+ ##########################################################################
654
+
655
# Dispatches each attribute of +token+ to its value validator.
#
# For an attribute the tag-specific method
# "validate_attribute_value_<tag>_<attr>" is preferred; if this object does
# not respond to it, the generic "validate_attribute_value_<attr>" is
# tried. Attributes with neither are skipped. Attribute names are
# lower-cased before dispatch, and everything a validator yields is
# forwarded to the caller's block.
def check_attribute_values(token)
  tag_name = token.fetch(:name, "")
  token.fetch(:data, []).each do |raw_name, attr_value|
    attr_name = raw_name.downcase
    attr_part = attr_name.to_s.underscore
    specific = "validate_attribute_value_#{tag_name.to_s.underscore}_#{attr_part}"
    generic = "validate_attribute_value_#{attr_part}"
    handler = [specific, generic].find { |candidate| respond_to?(candidate) }
    next unless handler
    send(handler, token, tag_name, attr_name, attr_value) { |problem| yield problem }
  end
end
674
+
675
# Validates the "class" attribute as a token list. Every problem reported
# by check_token_list is forwarded and immediately followed by a generic
# "invalid-attribute-value" ParseError.
def validate_attribute_value_class(token, tag_name, attr_name, attr_value)
  check_token_list(tag_name, attr_name, attr_value) do |problem|
    yield problem
    generic = {:type => "ParseError",
               :data => "invalid-attribute-value",
               :datavars => {"tagName" => tag_name,
                             "attributeName" => attr_name}}
    yield(generic)
  end
end
684
+
685
# Validates "contenteditable" as an enumerated attribute whose allowed
# values are 'true', 'false' and the empty string.
def validate_attribute_value_contenteditable(token, tag_name, attr_name, attr_value)
  allowed = %w[true false] << ''
  check_enumerated_value(token, tag_name, attr_name, attr_value, allowed) { |problem| yield problem }
end
690
+
691
# Validates "dir" as an enumerated attribute: 'ltr' or 'rtl'.
def validate_attribute_value_dir(token, tag_name, attr_name, attr_value)
  directions = %w[ltr rtl]
  check_enumerated_value(token, tag_name, attr_name, attr_value, directions) { |problem| yield problem }
end
696
+
697
# Validates "draggable" as an enumerated attribute: 'true' or 'false'.
def validate_attribute_value_draggable(token, tag_name, attr_name, attr_value)
  allowed = %w[true false]
  check_enumerated_value(token, tag_name, attr_name, attr_value, allowed) { |problem| yield problem }
end
702
+
703
+ alias validate_attribute_value_irrelevant check_boolean
704
+ alias validate_attribute_value_lang check_lang_code
705
+
706
# Validates "contextmenu" as an ID value and records the token in
# @things_that_point_to_an_id so #eof can later verify that the referenced
# ID exists.
def validate_attribute_value_contextmenu(token, tag_name, attr_name, attr_value)
  check_id(token, tag_name, attr_name, attr_value) { |problem| yield problem }
  @things_that_point_to_an_id.push(token)
end
712
+
713
# Validates an "id" attribute and records it for whole-document checks.
#
# Side effects: unless the value is nil, the value is appended to
# @ids_we_have_known_and_loved and the token to @things_that_define_an_id,
# so that duplicated IDs can be detected and #eof can check that things
# pointing at an ID (such as <span contextmenu>) point at one that exists.
#
# Yields whatever check_id reports, plus "duplicate-id" when the value has
# been seen before.
def validate_attribute_value_id(token, tag_name, attr_name, attr_value)
  check_id(token, tag_name, attr_name, attr_value) { |problem| yield problem }
  return unless attr_value
  already_known = @ids_we_have_known_and_loved.include?(attr_value)
  if already_known
    yield({:type => "ParseError",
           :data => "duplicate-id",
           :datavars => {"tagName" => tag_name}})
  end
  @ids_we_have_known_and_loved.push(attr_value)
  @things_that_define_an_id.push(token)
end
731
+
732
+ alias validate_attribute_value_tabindex check_integer
733
+
734
# Placeholder for validating the "ref" attribute (marked XXX in the
# original). Currently does nothing and yields nothing.
def validate_attribute_value_ref(token, tag_name, attr_name, attr_value)
  # XXX not implemented
  nil
end
737
+
738
# Placeholder for validating the "template" attribute (marked XXX in the
# original). Currently does nothing and yields nothing.
def validate_attribute_value_template(token, tag_name, attr_name, attr_value)
  # XXX not implemented
  nil
end
741
+
742
# Validates <html xmlns>: the only accepted value is the XHTML namespace
# "http://www.w3.org/1999/xhtml". Anything else yields
# "invalid-root-namespace".
def validate_attribute_value_html_xmlns(token, tag_name, attr_name, attr_value)
  return if attr_value == "http://www.w3.org/1999/xhtml"
  yield({:type => "ParseError",
         :data => "invalid-root-namespace",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
end
750
+
751
+ alias validate_attribute_value_base_href check_iri
752
+ alias validate_attribute_value_base_target check_browsing_context
753
+ alias validate_attribute_value_link_href check_iri
754
+ alias validate_attribute_value_link_rel check_link_relation
755
+ alias validate_attribute_value_link_media check_media_query
756
+ alias validate_attribute_value_link_hreflang check_lang_code
757
+ alias validate_attribute_value_link_type check_mime_type
758
+ # XXX <meta> attributes
759
+ alias validate_attribute_value_style_media check_media_query
760
+ alias validate_attribute_value_style_type check_mime_type
761
+ alias validate_attribute_value_style_scoped check_boolean
762
+ alias validate_attribute_value_blockquote_cite check_iri
763
+ alias validate_attribute_value_ol_start check_integer
764
+ alias validate_attribute_value_li_value check_integer
765
+ # XXX need tests from here on
766
+ alias validate_attribute_value_a_href check_iri
767
+ alias validate_attribute_value_a_target check_browsing_context
768
+
769
# Validates <a ping>: the value is a space-separated list, and each entry
# is validated on its own with check_iri. Every problem check_iri reports
# is forwarded to the caller's block.
def validate_attribute_value_a_ping(token, tag_name, attr_name, attr_value)
  parse_token_list(attr_value).each do |current_value|
    # Check the individual entry, not the whole attribute value: the list
    # as a whole is not an IRI.
    check_iri(token, tag_name, attr_name, current_value) do |t|
      yield t
    end
  end
end
777
+
778
+ alias validate_attribute_value_a_rel check_link_relation
779
+ alias validate_attribute_value_a_media check_media_query
780
+ alias validate_attribute_value_a_hreflang check_lang_code
781
+ alias validate_attribute_value_a_type check_mime_type
782
+ alias validate_attribute_value_q_cite check_iri
783
+ alias validate_attribute_value_time_datetime check_date_time
784
+ alias validate_attribute_value_meter_value check_floating_point_number
785
+ alias validate_attribute_value_meter_min check_floating_point_number
786
+ alias validate_attribute_value_meter_low check_floating_point_number
787
+ alias validate_attribute_value_meter_high check_floating_point_number
788
+ alias validate_attribute_value_meter_max check_floating_point_number
789
+ alias validate_attribute_value_meter_optimum check_floating_point_number
790
+ alias validate_attribute_value_progress_value check_floating_point_number
791
+ alias validate_attribute_value_progress_max check_floating_point_number
792
+ alias validate_attribute_value_ins_cite check_iri
793
+ alias validate_attribute_value_ins_datetime check_date_time
794
+ alias validate_attribute_value_del_cite check_iri
795
+ alias validate_attribute_value_del_datetime check_date_time
796
+
797
+ ##########################################################################
798
+ # Whole document validation (IDs, etc.)
799
+ ##########################################################################
800
+
801
# Whole-document check run after the last token: every recorded
# "contextmenu" reference must point at an ID that exists, and the element
# defining that ID must be a <menu>.
#
# Yields "id-does-not-exist" for a reference to an unknown ID, and
# "contextmenu-must-point-to-menu" when the first element defining the ID
# is not a <menu>.
#
# Relies on token[:data] being a Hash at this point: by now html5parser has
# "normalized" the attrs list into a dict (an obscure side effect of the
# parser mutating the tokens it was given).
def eof
  @things_that_point_to_an_id.each do |token|
    tag_name = token.fetch(:name, "").downcase
    attr_value = token[:data].fetch("contextmenu", "")
    if attr_value && !@ids_we_have_known_and_loved.include?(attr_value)
      yield({:type => "ParseError",
             :data => "id-does-not-exist",
             :datavars => {"tagName" => tag_name,
                           "attributeName" => "contextmenu",
                           "attributeValue" => attr_value}})
      next
    end
    # First element that defines the referenced ID; entries without a
    # usable id are skipped.
    target = @things_that_define_an_id.find do |ref_token|
      id = ref_token.fetch(:data, {}).fetch("id", "")
      id && id == attr_value
    end
    if target && target.fetch(:name, "").downcase != "menu"
      yield({:type => "ParseError",
             :data => "contextmenu-must-point-to-menu"})
    end
  end
end
830
+ end