feedtools 0.2.26 → 0.2.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. data/CHANGELOG +232 -216
  2. data/db/migration.rb +2 -0
  3. data/db/schema.mysql.sql +2 -0
  4. data/db/schema.postgresql.sql +3 -1
  5. data/db/schema.sqlite.sql +3 -1
  6. data/lib/feed_tools.rb +37 -14
  7. data/lib/feed_tools/database_feed_cache.rb +13 -2
  8. data/lib/feed_tools/feed.rb +430 -104
  9. data/lib/feed_tools/feed_item.rb +533 -268
  10. data/lib/feed_tools/helpers/generic_helper.rb +1 -1
  11. data/lib/feed_tools/helpers/html_helper.rb +78 -116
  12. data/lib/feed_tools/helpers/retrieval_helper.rb +33 -3
  13. data/lib/feed_tools/helpers/uri_helper.rb +46 -54
  14. data/lib/feed_tools/monkey_patch.rb +27 -1
  15. data/lib/feed_tools/vendor/html5/History.txt +10 -0
  16. data/lib/feed_tools/vendor/html5/Manifest.txt +117 -0
  17. data/lib/feed_tools/vendor/html5/README +45 -0
  18. data/lib/feed_tools/vendor/html5/Rakefile.rb +33 -0
  19. data/lib/feed_tools/vendor/html5/bin/html5 +217 -0
  20. data/lib/feed_tools/vendor/html5/lib/core_ext/string.rb +17 -0
  21. data/lib/feed_tools/vendor/html5/lib/html5.rb +13 -0
  22. data/lib/feed_tools/vendor/html5/lib/html5/constants.rb +1046 -0
  23. data/lib/feed_tools/vendor/html5/lib/html5/filters/base.rb +10 -0
  24. data/lib/feed_tools/vendor/html5/lib/html5/filters/inject_meta_charset.rb +82 -0
  25. data/lib/feed_tools/vendor/html5/lib/html5/filters/iso639codes.rb +752 -0
  26. data/lib/feed_tools/vendor/html5/lib/html5/filters/optionaltags.rb +198 -0
  27. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc2046.rb +30 -0
  28. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc3987.rb +89 -0
  29. data/lib/feed_tools/vendor/html5/lib/html5/filters/sanitizer.rb +15 -0
  30. data/lib/feed_tools/vendor/html5/lib/html5/filters/validator.rb +830 -0
  31. data/lib/feed_tools/vendor/html5/lib/html5/filters/whitespace.rb +36 -0
  32. data/lib/feed_tools/vendor/html5/lib/html5/html5parser.rb +248 -0
  33. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_body_phase.rb +46 -0
  34. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  35. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_head_phase.rb +50 -0
  36. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/before_head_phase.rb +41 -0
  37. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_body_phase.rb +613 -0
  38. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_caption_phase.rb +69 -0
  39. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_cell_phase.rb +78 -0
  40. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  41. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  42. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_head_phase.rb +138 -0
  43. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_row_phase.rb +89 -0
  44. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_select_phase.rb +85 -0
  45. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_body_phase.rb +86 -0
  46. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_phase.rb +115 -0
  47. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/initial_phase.rb +133 -0
  48. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/phase.rb +154 -0
  49. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/root_element_phase.rb +41 -0
  50. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  51. data/lib/feed_tools/vendor/html5/lib/html5/inputstream.rb +648 -0
  52. data/lib/feed_tools/vendor/html5/lib/html5/liberalxmlparser.rb +158 -0
  53. data/lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb +188 -0
  54. data/lib/feed_tools/vendor/html5/lib/html5/serializer.rb +2 -0
  55. data/lib/feed_tools/vendor/html5/lib/html5/serializer/htmlserializer.rb +179 -0
  56. data/lib/feed_tools/vendor/html5/lib/html5/serializer/xhtmlserializer.rb +20 -0
  57. data/lib/feed_tools/vendor/html5/lib/html5/sniffer.rb +45 -0
  58. data/lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb +966 -0
  59. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders.rb +24 -0
  60. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/base.rb +334 -0
  61. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/hpricot.rb +231 -0
  62. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/rexml.rb +209 -0
  63. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/simpletree.rb +185 -0
  64. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers.rb +26 -0
  65. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/base.rb +162 -0
  66. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/hpricot.rb +48 -0
  67. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/rexml.rb +48 -0
  68. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/simpletree.rb +48 -0
  69. data/lib/feed_tools/vendor/html5/lib/html5/version.rb +3 -0
  70. data/lib/feed_tools/vendor/html5/testdata/encoding/chardet/test_big5.txt +51 -0
  71. data/lib/feed_tools/vendor/html5/testdata/encoding/test-yahoo-jp.dat +10 -0
  72. data/lib/feed_tools/vendor/html5/testdata/encoding/tests1.dat +394 -0
  73. data/lib/feed_tools/vendor/html5/testdata/encoding/tests2.dat +81 -0
  74. data/lib/feed_tools/vendor/html5/testdata/sanitizer/tests1.dat +416 -0
  75. data/lib/feed_tools/vendor/html5/testdata/serializer/core.test +104 -0
  76. data/lib/feed_tools/vendor/html5/testdata/serializer/injectmeta.test +65 -0
  77. data/lib/feed_tools/vendor/html5/testdata/serializer/optionaltags.test +900 -0
  78. data/lib/feed_tools/vendor/html5/testdata/serializer/options.test +60 -0
  79. data/lib/feed_tools/vendor/html5/testdata/serializer/whitespace.test +51 -0
  80. data/lib/feed_tools/vendor/html5/testdata/sites/google-results.htm +1 -0
  81. data/lib/feed_tools/vendor/html5/testdata/sites/python-ref-import.htm +1 -0
  82. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps-old.htm +1 -0
  83. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps.htm +34275 -0
  84. data/lib/feed_tools/vendor/html5/testdata/sniffer/htmlOrFeed.json +43 -0
  85. data/lib/feed_tools/vendor/html5/testdata/tokenizer/contentModelFlags.test +48 -0
  86. data/lib/feed_tools/vendor/html5/testdata/tokenizer/entities.test +2339 -0
  87. data/lib/feed_tools/vendor/html5/testdata/tokenizer/escapeFlag.test +21 -0
  88. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test1.test +172 -0
  89. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test2.test +129 -0
  90. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test3.test +367 -0
  91. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test4.test +198 -0
  92. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests1.dat +1950 -0
  93. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests2.dat +773 -0
  94. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests3.dat +270 -0
  95. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests4.dat +60 -0
  96. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests5.dat +175 -0
  97. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests6.dat +196 -0
  98. data/lib/feed_tools/vendor/html5/testdata/validator/attributes.test +1035 -0
  99. data/lib/feed_tools/vendor/html5/testdata/validator/base-href-attribute.test +787 -0
  100. data/lib/feed_tools/vendor/html5/testdata/validator/base-target-attribute.test +35 -0
  101. data/lib/feed_tools/vendor/html5/testdata/validator/blockquote-cite-attribute.test +7 -0
  102. data/lib/feed_tools/vendor/html5/testdata/validator/classattribute.test +152 -0
  103. data/lib/feed_tools/vendor/html5/testdata/validator/contenteditableattribute.test +59 -0
  104. data/lib/feed_tools/vendor/html5/testdata/validator/contextmenuattribute.test +115 -0
  105. data/lib/feed_tools/vendor/html5/testdata/validator/dirattribute.test +59 -0
  106. data/lib/feed_tools/vendor/html5/testdata/validator/draggableattribute.test +63 -0
  107. data/lib/feed_tools/vendor/html5/testdata/validator/html-xmlns-attribute.test +23 -0
  108. data/lib/feed_tools/vendor/html5/testdata/validator/idattribute.test +115 -0
  109. data/lib/feed_tools/vendor/html5/testdata/validator/inputattributes.test +2795 -0
  110. data/lib/feed_tools/vendor/html5/testdata/validator/irrelevantattribute.test +63 -0
  111. data/lib/feed_tools/vendor/html5/testdata/validator/langattribute.test +5579 -0
  112. data/lib/feed_tools/vendor/html5/testdata/validator/li-value-attribute.test +7 -0
  113. data/lib/feed_tools/vendor/html5/testdata/validator/link-href-attribute.test +7 -0
  114. data/lib/feed_tools/vendor/html5/testdata/validator/link-hreflang-attribute.test +7 -0
  115. data/lib/feed_tools/vendor/html5/testdata/validator/link-rel-attribute.test +271 -0
  116. data/lib/feed_tools/vendor/html5/testdata/validator/ol-start-attribute.test +7 -0
  117. data/lib/feed_tools/vendor/html5/testdata/validator/starttags.test +375 -0
  118. data/lib/feed_tools/vendor/html5/testdata/validator/style-scoped-attribute.test +7 -0
  119. data/lib/feed_tools/vendor/html5/testdata/validator/tabindexattribute.test +79 -0
  120. data/lib/feed_tools/vendor/html5/tests/preamble.rb +72 -0
  121. data/lib/feed_tools/vendor/html5/tests/test_encoding.rb +35 -0
  122. data/lib/feed_tools/vendor/html5/tests/test_lxp.rb +279 -0
  123. data/lib/feed_tools/vendor/html5/tests/test_parser.rb +68 -0
  124. data/lib/feed_tools/vendor/html5/tests/test_sanitizer.rb +142 -0
  125. data/lib/feed_tools/vendor/html5/tests/test_serializer.rb +68 -0
  126. data/lib/feed_tools/vendor/html5/tests/test_sniffer.rb +27 -0
  127. data/lib/feed_tools/vendor/html5/tests/test_stream.rb +62 -0
  128. data/lib/feed_tools/vendor/html5/tests/test_tokenizer.rb +94 -0
  129. data/lib/feed_tools/vendor/html5/tests/test_treewalkers.rb +135 -0
  130. data/lib/feed_tools/vendor/html5/tests/test_validator.rb +31 -0
  131. data/lib/feed_tools/vendor/html5/tests/tokenizer_test_parser.rb +63 -0
  132. data/lib/feed_tools/vendor/uri.rb +781 -0
  133. data/lib/feed_tools/version.rb +1 -1
  134. data/rakefile +27 -6
  135. data/test/unit/atom_test.rb +298 -210
  136. data/test/unit/helper_test.rb +7 -12
  137. data/test/unit/rdf_test.rb +51 -1
  138. data/test/unit/rss_test.rb +13 -3
  139. metadata +239 -116
  140. data/lib/feed_tools/vendor/htree.rb +0 -97
  141. data/lib/feed_tools/vendor/htree/container.rb +0 -10
  142. data/lib/feed_tools/vendor/htree/context.rb +0 -67
  143. data/lib/feed_tools/vendor/htree/display.rb +0 -27
  144. data/lib/feed_tools/vendor/htree/doc.rb +0 -149
  145. data/lib/feed_tools/vendor/htree/elem.rb +0 -262
  146. data/lib/feed_tools/vendor/htree/encoder.rb +0 -163
  147. data/lib/feed_tools/vendor/htree/equality.rb +0 -218
  148. data/lib/feed_tools/vendor/htree/extract_text.rb +0 -37
  149. data/lib/feed_tools/vendor/htree/fstr.rb +0 -33
  150. data/lib/feed_tools/vendor/htree/gencode.rb +0 -97
  151. data/lib/feed_tools/vendor/htree/htmlinfo.rb +0 -672
  152. data/lib/feed_tools/vendor/htree/inspect.rb +0 -108
  153. data/lib/feed_tools/vendor/htree/leaf.rb +0 -94
  154. data/lib/feed_tools/vendor/htree/loc.rb +0 -367
  155. data/lib/feed_tools/vendor/htree/modules.rb +0 -48
  156. data/lib/feed_tools/vendor/htree/name.rb +0 -124
  157. data/lib/feed_tools/vendor/htree/output.rb +0 -207
  158. data/lib/feed_tools/vendor/htree/parse.rb +0 -409
  159. data/lib/feed_tools/vendor/htree/raw_string.rb +0 -124
  160. data/lib/feed_tools/vendor/htree/regexp-util.rb +0 -15
  161. data/lib/feed_tools/vendor/htree/rexml.rb +0 -130
  162. data/lib/feed_tools/vendor/htree/scan.rb +0 -166
  163. data/lib/feed_tools/vendor/htree/tag.rb +0 -111
  164. data/lib/feed_tools/vendor/htree/template.rb +0 -909
  165. data/lib/feed_tools/vendor/htree/text.rb +0 -115
  166. data/lib/feed_tools/vendor/htree/traverse.rb +0 -465
@@ -0,0 +1,198 @@
1
+ require 'html5/constants'
2
+ require 'html5/filters/base'
3
+
4
module HTML5
  module Filters

    # Token-stream filter that drops the start and end tags which the HTML5
    # serialization rules allow to be omitted. Every other token of the
    # wrapped stream (reached through __getobj__) is passed on unchanged.
    class OptionalTagFilter < Base
      # Iterates over the wrapped stream with a three-token window, yielding
      # (previous, current, following) once per token. +previous+ is nil for
      # the first token and +following+ is nil for the last one.
      #
      # An empty stream yields nothing. The unconditional final yield used to
      # hand (nil, nil, nil) to the block, which made #each fail on
      # token[:type].
      def slider
        previous1 = previous2 = nil
        __getobj__.each do |token|
          yield previous2, previous1, token unless previous1.nil?
          previous2 = previous1
          previous1 = token
        end
        # Flush the last token, which has no successor.
        yield previous2, previous1, nil unless previous1.nil?
      end

      # Yields every token except omissible start tags (only when they carry
      # no attributes, i.e. token[:data] is empty) and omissible end tags.
      def each
        slider do |previous, token, nexttok|
          case token[:type]
          when :StartTag
            omissible = token[:data].empty? &&
                        is_optional_start(token[:name], previous, nexttok)
            yield token unless omissible
          when :EndTag
            yield token unless is_optional_end(token[:name], nexttok)
          else
            yield token
          end
        end
      end

      # Returns true when the start tag +tagname+ may be left out, judging by
      # the token before it (+previous+) and the token after it (+nexttok+).
      # Either neighbour may be nil at the edges of the stream.
      def is_optional_start(tagname, previous, nexttok)
        type = nexttok ? nexttok[:type] : nil
        case tagname
        when 'html'
          # An html element's start tag may be omitted if the first thing
          # inside the html element is not a space character or a comment.
          ![:Comment, :SpaceCharacters].include?(type)
        when 'head'
          # A head element's start tag may be omitted if the first thing
          # inside the head element is an element.
          type == :StartTag
        when 'body'
          # A body element's start tag may be omitted if the first thing
          # inside the body element is not a space character or a comment,
          # except if the first thing inside the body element is a script
          # or style element and the node immediately preceding the body
          # element is a head element whose end tag has been omitted.
          if [:Comment, :SpaceCharacters].include?(type)
            false
          elsif type == :StartTag
            # XXX: we do not look at the preceding event, so we never omit
            # the body element's start tag if it's followed by a script or
            # a style element.
            !%w[script style].include?(nexttok[:name])
          else
            true
          end
        when 'colgroup'
          # A colgroup element's start tag may be omitted if the first thing
          # inside the colgroup element is a col element, and if the element
          # is not immediately preceded by another colgroup element whose
          # end tag has been omitted.
          # XXX: we do not look at the preceding event, so instead we never
          # omit the colgroup element's end tag when it is immediately
          # followed by another colgroup element. See is_optional_end.
          type == :StartTag && nexttok[:name] == 'col'
        when 'tbody'
          # A tbody element's start tag may be omitted if the first thing
          # inside the tbody element is a tr element, and if the element is
          # not immediately preceded by a tbody, thead, or tfoot element
          # whose end tag has been omitted.
          return false unless type == :StartTag

          # An explicit end tag of a sibling table section right before this
          # tag keeps the start tag in place.
          if previous && previous[:type] == :EndTag &&
             %w[tbody thead tfoot].include?(previous[:name])
            return false
          end

          nexttok[:name] == 'tr'
        else
          false
        end
      end

      # Returns true when the end tag +tagname+ may be left out, judging by
      # the token that follows it (+nexttok+, nil at the end of the stream).
      def is_optional_end(tagname, nexttok)
        type = nexttok ? nexttok[:type] : nil
        # "No more content in the parent element": the parent's end tag
        # follows, or the stream is finished.
        parent_exhausted = type == :EndTag || type.nil?

        case tagname
        when 'html', 'head', 'body'
          # An html element's end tag may be omitted if the html element
          # is not immediately followed by a space character or a comment.
          # (The same test is applied to head and body.)
          ![:Comment, :SpaceCharacters].include?(type)
        when 'li', 'optgroup', 'option', 'tr'
          # The end tag of a li, optgroup, option or tr element may be
          # omitted if the element is immediately followed by another
          # element of the same kind, or if there is no more content in the
          # parent element.
          if type == :StartTag
            nexttok[:name] == tagname
          else
            parent_exhausted
          end
        when 'dt', 'dd'
          # A dt element's end tag may be omitted if the dt element is
          # immediately followed by another dt element or a dd element.
          # A dd element's end tag may be omitted if the dd element is
          # immediately followed by another dd element or a dt element,
          # or if there is no more content in the parent element.
          if type == :StartTag
            %w[dt dd].include?(nexttok[:name])
          else
            tagname == 'dd' && parent_exhausted
          end
        when 'p'
          # A p element's end tag may be omitted if the p element is
          # immediately followed by an address, blockquote, dl, fieldset,
          # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
          # or ul element, or if there is no more content in the parent
          # element.
          if type == :StartTag
            %w[address blockquote dl fieldset form h1 h2 h3 h4 h5
               h6 hr menu ol p pre table ul].include?(nexttok[:name])
          else
            parent_exhausted
          end
        when 'colgroup'
          # A colgroup element's end tag may be omitted if the colgroup
          # element is not immediately followed by a space character or
          # a comment.
          if [:Comment, :SpaceCharacters].include?(type)
            false
          elsif type == :StartTag
            # XXX: we also look for an immediately following colgroup
            # element. See is_optional_start.
            nexttok[:name] != 'colgroup'
          else
            true
          end
        when 'thead', 'tbody'
          # A thead element's end tag may be omitted if the thead element
          # is immediately followed by a tbody or tfoot element.
          # A tbody element's end tag may be omitted if the tbody element
          # is immediately followed by a tbody or tfoot element, or if
          # there is no more content in the parent element.
          if type == :StartTag
            %w[tbody tfoot].include?(nexttok[:name])
          else
            tagname == 'tbody' && parent_exhausted
          end
        when 'tfoot'
          # A tfoot element's end tag may be omitted if the tfoot element
          # is immediately followed by a tbody element, or if there is no
          # more content in the parent element.
          if type == :StartTag
            nexttok[:name] == 'tbody'
          else
            parent_exhausted
          end
        when 'td', 'th'
          # A td or th element's end tag may be omitted if the element is
          # immediately followed by a td or th element, or if there is
          # no more content in the parent element.
          if type == :StartTag
            %w[td th].include?(nexttok[:name])
          else
            parent_exhausted
          end
        else
          false
        end
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
+ # adapted from feedvalidator, original copyright license is
2
+ #
3
+ # Copyright (c) 2002-2006, Sam Ruby, Mark Pilgrim, Joseph Walton, and Phil Ringnalda
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+
24
+ # mime_re = Regexp.new('[^\s()<>,;:\\"/[\]?=]+/[^\s()<>,;:\\"/[\]?=]+(\s*;\s*[^\s()<>,;:\\"/[\]?=]+=("(\\"|[^"])*"|[^\s()<>,;:\\"/[\]?=]+))*$')
25
+
26
# A MIME "token": one or more characters that are neither whitespace nor one
# of the separators ( ) < > , ; : \ " / [ ] ? =
MIME_TOKEN = /[^\s()<>,;:\\"\/\[\]?=]+/

# type "/" subtype, followed by any number of ";name=value" parameters whose
# value is either a token or a double-quoted string (\" allowed inside).
MIME_TYPE_RE = /\A#{MIME_TOKEN}\/#{MIME_TOKEN}(?:\s*;\s*#{MIME_TOKEN}=(?:"(?:\\"|[^"])*"|#{MIME_TOKEN}))*\z/

# Returns true when +value+ is a syntactically valid MIME type such as
# "text/html" or "text/html; charset=utf-8", false otherwise.
#
# The check used to be stubbed out (the method always returned true) because
# the pattern carried over from the Python original was never made to work
# in Ruby. +value+ is converted with to_s, so nil is reported as invalid
# instead of raising.
def is_valid_mime_type(value)
  !MIME_TYPE_RE.match(value.to_s).nil?
end
30
+
@@ -0,0 +1,89 @@
1
+ # adapted from feedvalidator, original copyright license is
2
+ #
3
+ # Copyright (c) 2002-2006, Sam Ruby, Mark Pilgrim, Joseph Walton, and Phil Ringnalda
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
# URI schemes registered with IANA
# (http://www.iana.org/assignments/uri-schemes.html).
iana_schemes = [
  "ftp", "http", "gopher", "mailto", "news", "nntp", "telnet", "wais",
  "file", "prospero", "z39.50s", "z39.50r", "cid", "mid", "vemmi",
  "service", "imap", "nfs", "acap", "rtsp", "tip", "pop", "data", "dav",
  "opaquelocktoken", "sip", "sips", "tel", "fax", "modem", "ldap",
  "https", "soap.beep", "soap.beeps", "xmlrpc.beep", "xmlrpc.beeps",
  "urn", "go", "h323", "ipp", "tftp", "mupdate", "pres", "im", "mtqp",
  "iris.beep", "dict", "snmp", "crid", "tag", "dns", "info"
]

# Schemes accepted by is_valid_uri: the IANA list plus "javascript".
ALLOWED_SCHEMES = iana_schemes + ['javascript']

# Generic URI reference: optional scheme, up to two slashes, then any run of
# characters that are legal in a URI.
RFC2396 = Regexp.new("^([a-zA-Z][0-9a-zA-Z+\\-\\.]*:)?/{0,2}[0-9a-zA-Z;/?:@&=+$\\.\\-_!~*'()%,#]*$", Regexp::MULTILINE)

# Same as above but with a mandatory scheme. NOTE: this is a top-level local
# variable and is therefore not visible inside method bodies.
rfc2396_full = Regexp.new("[a-zA-Z][0-9a-zA-Z+\\-\\.]*:(//)?[0-9a-zA-Z;/?:@&=+$\\.\\-_!~*'()%,#]+$")

# urn:<nid>:<nss>. Written as a regexp literal and anchored with \A / \z so
# that only the complete string can match (^ and $ are per-line in Ruby).
URN = %r{\A[Uu][Rr][Nn]:[a-zA-Z0-9][a-zA-Z0-9-]{1,31}:([a-zA-Z0-9()+,\.:=@;$_!*'\-]|%[0-9A-Fa-f]{2})+\z}

# tag:<authority>,<date>:<specific>[#fragment].
# This has to be a regexp literal: inside a double-quoted string "\d" turns
# into a plain "d" and "\.\-_" into the character range ".-_", so the old
# string-built pattern rejected every real date and let characters such as
# "<" and ">" through.
TAG = %r{\Atag:([a-z0-9\-\._]+?@)?[a-z0-9\.\-]+?,\d{4}(-\d{2}(-\d{2})?)?:[0-9a-zA-Z;/\?:@&=+$\.\-_!~*'\(\)%,]*(\#[0-9a-zA-Z;/\?:@&=+$\.\-_!~*'\(\)%,]*)?\z}
38
+
39
# Checks +value+ against +uri_pattern+ and some scheme-specific rules.
#
# Returns a two-element array [valid, error_code]: [true, ""] on success,
# otherwise false together with one of "invalid-tag-uri", "invalid-urn",
# "invalid-uri-char", "uri-not-iri", "invalid-uri",
# "invalid-http-or-ftp-uri" or "invalid-scheme".
#
# value       - the candidate URI (String).
# uri_pattern - Regexp that has to match the whole of +value+.
def is_valid_uri(value, uri_pattern = RFC2396)
  # nil when value is empty or consists only of colons.
  scheme = value.split(':').first
  scheme = scheme.downcase if scheme

  # tag: and urn: have dedicated grammars and bypass uri_pattern entirely.
  if scheme == 'tag'
    return false, "invalid-tag-uri" unless TAG.match(value)
    return true, ""
  elsif scheme == 'urn'
    return false, "invalid-urn" unless URN.match(value)
    return true, ""
  end

  match = uri_pattern.match(value)
  # No match, a match whose every group is blank, or a match that does not
  # span the whole value all count as "pattern not satisfied".
  if match.to_a.all? { |part| part.nil? || part == '' } || match[0] != value
    if value.length > 0
      urichars = Regexp.new("^[0-9a-zA-Z;/?:@&=+$\\.\\-_!~*'()%,#]$", Regexp::MULTILINE)
      ascii_only = true
      value.each_byte do |b|
        if b >= 128
          ascii_only = false
        elsif !urichars.match([b].pack('c*'))
          return false, "invalid-uri-char"
        end
      end
      # Every character is legal ASCII and the pattern still failed: the
      # value is malformed. Values with non-ASCII bytes keep being accepted,
      # because the IDNA conversion needed to judge them is not available.
      return false, "invalid-uri" if ascii_only
    else
      begin
        # 'idna' is not a converter Ruby knows about, so this raises and the
        # rescue below swallows it; kept as a best-effort check.
        return false, "uri-not-iri" if uri_pattern.match(value.encode('idna'))
      rescue
      end
      return false, "invalid-uri"
    end
  elsif %w[http ftp].include?(scheme)
    # http and ftp URIs need "//" followed by a non-empty authority.
    return false, "invalid-http-or-ftp-uri" unless value.match(%r{^\w+://[^/].*})
  elsif value.index(':') && scheme && scheme.match(/^[a-z]+$/) && !ALLOWED_SCHEMES.include?(scheme)
    # scheme is nil for ":" and "::"; without the guard this line raised
    # NoMethodError.
    return false, "invalid-scheme"
  end

  return true, ""
end
76
+
77
# Validates an IRI by handing it to is_valid_uri, after a best-effort
# attempt to convert a non-empty +value+ with String#encode('idna').
# Any error raised during that attempt is swallowed and the untouched value
# is validated instead. Returns whatever is_valid_uri returns.
def is_valid_iri(value)
  candidate = value
  begin
    candidate = value.encode('idna') unless value.length <= 0
  rescue
    # Conversion failed; fall back to the original value.
  end
  is_valid_uri(candidate)
end
86
+
87
# Pattern for a URI that carries an explicit scheme. It has to be a constant:
# a top-level local variable is not visible inside a method body, so the
# previous reference to the local rfc2396_full raised NameError on every
# call.
RFC2396_FULL = Regexp.new("[a-zA-Z][0-9a-zA-Z+\\-\\.]*:(//)?[0-9a-zA-Z;/?:@&=+$\\.\\-_!~*'()%,#]+$")

# Validates +value+ with is_valid_uri, using the pattern that requires a
# scheme. Returns is_valid_uri's [valid, error_code] pair.
def is_valid_fully_qualified_uri(value)
  is_valid_uri(value, RFC2396_FULL)
end
@@ -0,0 +1,15 @@
1
+ require 'html5/filters/base'
2
+ require 'html5/sanitizer'
3
+
4
module HTML5
  module Filters
    # Filter that runs every token of the wrapped stream through
    # sanitize_token (presumably supplied by HTMLSanitizeModule) and yields
    # the result in place of the original token.
    class HTMLSanitizeFilter < Base
      include HTMLSanitizeModule

      # Yields the sanitized form of each token, in stream order.
      def each
        __getobj__.each { |token| yield(sanitize_token(token)) }
      end
    end
  end
end
@@ -0,0 +1,830 @@
1
+ # HTML 5 conformance checker
2
+ #
3
+ # Warning: this module is experimental, incomplete, and subject to removal at any time.
4
+ #
5
+ # Usage:
6
+ # >>> from html5lib.html5parser import HTMLParser
7
+ # >>> from html5lib.filters.validator import HTMLConformanceChecker
8
+ # >>> p = HTMLParser(tokenizer=HTMLConformanceChecker)
9
+ # >>> p.parse('<!doctype html>\n<html foo=bar></html>')
10
+ # <<class 'html5lib.treebuilders.simpletree.Document'> nil>
11
+ # >>> p.errors
12
+ # [((2, 14), 'unknown-attribute', {'attributeName' => u'foo', 'tagName' => u'html'})]
13
+
14
+ require 'html5/constants'
15
+ require 'html5/filters/base'
16
+ require 'html5/filters/iso639codes'
17
+ require 'html5/filters/rfc3987'
18
+ require 'html5/filters/rfc2046'
19
+
20
# Marker wrapped around the message strings below; returns +str+ unchanged.
# (Presumably a placeholder for a gettext-style translation hook.)
def _(str)
  str
end
21
+
22
class String
  # Returns a snake_case copy of the string (lifted from Rails):
  # "::" becomes "/", word boundaries inside CamelCase get an underscore,
  # dashes become underscores and everything is lowercased.
  # The receiver is not modified.
  def underscore()
    with_paths = gsub(/::/, '/')
    # Split an acronym from a following capitalised word: "HTMLParser".
    acronyms_split = with_paths.gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2')
    # Split a lowercase letter or digit from a following capital: "fooBar".
    words_split = acronyms_split.gsub(/([a-z\d])([A-Z])/, '\1_\2')
    words_split.tr("-", "_").downcase
  end
end
32
+
33
# Adds the conformance checker's messages to HTML5::E, keyed by error code.
# The %(name) markers name the values a message refers to; how they are
# substituted is not visible here.
#
# Fixed relative to the previous revision:
# * "<%tagName)>" lacked its opening parenthesis in three messages,
# * "attibute" was misspelled in "invalid-lang-code",
# * "invalid-browsing-context" contained a stray "=>" where a colon belongs.
HTML5::E.update({
  "unknown-start-tag" =>
    _("Unknown start tag <%(tagName)>."),
  "unknown-attribute" =>
    _("Unknown '%(attributeName)' attribute on <%(tagName)>."),
  "missing-required-attribute" =>
    _("The '%(attributeName)' attribute is required on <%(tagName)>."),
  "unknown-input-type" =>
    _("Illegal value for attribute on <input type='%(inputType)'>."),
  "attribute-not-allowed-on-this-input-type" =>
    _("The '%(attributeName)' attribute is not allowed on <input type=%(inputType)>."),
  "deprecated-attribute" =>
    _("This attribute is deprecated: '%(attributeName)' attribute on <%(tagName)>."),
  "duplicate-value-in-token-list" =>
    _("Duplicate value in token list: '%(attributeValue)' in '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-attribute-value" =>
    _("Invalid attribute value: '%(attributeName)' attribute on <%(tagName)>."),
  "space-in-id" =>
    _("Whitespace is not allowed here: '%(attributeName)' attribute on <%(tagName)>."),
  "duplicate-id" =>
    _("This ID was already defined earlier: 'id' attribute on <%(tagName)>."),
  "attribute-value-can-not-be-blank" =>
    _("This value can not be blank: '%(attributeName)' attribute on <%(tagName)>."),
  "id-does-not-exist" =>
    _("This value refers to a non-existent ID: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-enumerated-value" =>
    _("Value must be one of %(enumeratedValues): '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-boolean-value" =>
    _("Value must be one of %(enumeratedValues): '%(attributeName)' attribute on <%(tagName)>."),
  "contextmenu-must-point-to-menu" =>
    _("The contextmenu attribute must point to an ID defined on a <menu> element."),
  "invalid-lang-code" =>
    _("Invalid language code: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-integer-value" =>
    _("Value must be an integer: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-root-namespace" =>
    _("Root namespace must be 'http://www.w3.org/1999/xhtml', or omitted."),
  "invalid-browsing-context" =>
    _("Value must be one of ('_self', '_parent', '_top'), or a name that does not start with '_': '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-tag-uri" =>
    _("Invalid URI: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-urn" =>
    _("Invalid URN: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-uri-char" =>
    _("Illegal character in URI: '%(attributeName)' attribute on <%(tagName)>."),
  "uri-not-iri" =>
    _("Expected a URI but found an IRI: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-uri" =>
    _("Invalid URI: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-http-or-ftp-uri" =>
    _("Invalid URI: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-scheme" =>
    _("Unregistered URI scheme: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-rel" =>
    _("Invalid link relation: '%(attributeName)' attribute on <%(tagName)>."),
  "invalid-mime-type" =>
    _("Invalid MIME type: '%(attributeName)' attribute on <%(tagName)>."),
})
91
+
92
+
93
+ class HTMLConformanceChecker < HTML5::Filters::Base
94
+
95
+ @@global_attributes = %w[class contenteditable contextmenu dir
96
+ draggable id irrelevant lang ref tabindex template
97
+ title onabort onbeforeunload onblur onchange onclick
98
+ oncontextmenu ondblclick ondrag ondragend ondragenter
99
+ ondragleave ondragover ondragstart ondrop onerror
100
+ onfocus onkeydown onkeypress onkeyup onload onmessage
101
+ onmousedown onmousemove onmouseout onmouseover onmouseup
102
+ onmousewheel onresize onscroll onselect onsubmit onunload]
103
+ # XXX lang in HTML only, xml:lang in XHTML only
104
+ # XXX validate ref, template
105
+
106
# Element-specific attributes keyed by lowercase tag name. An empty list
# means no element-specific attributes are listed for that element. The
# keys of this map also serve as the list of recognized tag names.
@@allowed_attribute_map = {
  # --- document root and metadata ---
  'html'         => %w[xmlns],
  'head'         => [],
  'title'        => [],
  'base'         => %w[href target],
  'link'         => %w[href rel media hreflang type],
  'meta'         => %w[name http-equiv content charset], # XXX charset in HTML only
  'style'        => %w[media type scoped],
  # --- sections and headings ---
  'body'         => [],
  'section'      => [],
  'nav'          => [],
  'article'      => [],
  'blockquote'   => %w[cite],
  'aside'        => [],
  'h1'           => [],
  'h2'           => [],
  'h3'           => [],
  'h4'           => [],
  'h5'           => [],
  'h6'           => [],
  'header'       => [],
  'footer'       => [],
  'address'      => [],
  # --- grouping content and lists ---
  'p'            => [],
  'hr'           => [],
  'br'           => [],
  'dialog'       => [],
  'pre'          => [],
  'ol'           => %w[start],
  'ul'           => [],
  'li'           => %w[value], # XXX depends on parent
  'dl'           => [],
  'dt'           => [],
  'dd'           => [],
  # --- text-level markup ---
  'a'            => %w[href target ping rel media hreflang type],
  'q'            => %w[cite],
  'cite'         => [],
  'em'           => [],
  'strong'       => [],
  'small'        => [],
  'm'            => [],
  'dfn'          => [],
  'abbr'         => [],
  'time'         => %w[datetime],
  'meter'        => %w[value min low high max optimum],
  'progress'     => %w[value max],
  'code'         => [],
  'var'          => [],
  'samp'         => [],
  'kbd'          => [],
  'sup'          => [],
  'sub'          => [],
  'span'         => [],
  'i'            => [],
  'b'            => [],
  'bdo'          => [],
  'ins'          => %w[cite datetime],
  'del'          => %w[cite datetime],
  # --- embedded content ---
  'figure'       => [],
  'img'          => %w[alt src usemap ismap height width], # XXX ismap depends on parent
  'iframe'       => %w[src],
  # <embed> handled separately
  'object'       => %w[data type usemap height width],
  'param'        => %w[name value],
  'video'        => %w[src autoplay start loopstart loopend end loopcount controls],
  'audio'        => %w[src autoplay start loopstart loopend end loopcount controls],
  'source'       => %w[src type media],
  'canvas'       => %w[height width],
  'map'          => [],
  'area'         => %w[alt coords shape href target ping rel media hreflang type],
  # --- tables ---
  'table'        => [],
  'caption'      => [],
  'colgroup'     => %w[span], # XXX only if element contains no <col> elements
  'col'          => %w[span],
  'tbody'        => [],
  'thead'        => [],
  'tfoot'        => [],
  'tr'           => [],
  'td'           => %w[colspan rowspan],
  'th'           => %w[colspan rowspan scope],
  # --- forms ---
  # all possible <input> attributes are listed here but <input> is really handled separately
  'input'        => %w[accept accesskey action alt autocomplete autofocus checked
                       disabled enctype form inputmode list maxlength method min
                       max name pattern step readonly replace required size src
                       tabindex target template value],
  'form'         => %w[action method enctype accept name onsubmit onreset accept-charset
                       data replace],
  # XXX may need matrix of acceptable attributes based on value of type attribute (like input)
  'button'       => %w[action enctype method replace template name value type disabled form autofocus],
  'select'       => %w[name size multiple disabled data accesskey form autofocus],
  'optgroup'     => %w[disabled label],
  'option'       => %w[selected disabled label value],
  'textarea'     => %w[maxlength name rows cols disabled readonly required form autofocus wrap accept],
  'label'        => %w[for accesskey form],
  'fieldset'     => %w[disabled form],
  'output'       => %w[form name for onforminput onformchange],
  'datalist'     => %w[data],
  # XXX repetition model for repeating form controls
  # --- scripting and interactive elements ---
  'script'       => %w[src defer async type],
  'noscript'     => [],
  'noembed'      => [],
  'event-source' => %w[src],
  'details'      => %w[open],
  'datagrid'     => %w[multiple disabled],
  'command'      => %w[type label icon hidden disabled checked radiogroup default],
  'menu'         => %w[type label autosubmit],
  'datatemplate' => [],
  'rule'         => [],
  'nest'         => [],
  'legend'       => [],
  'div'          => [],
  'font'         => %w[style]
}
220
+
221
# Attributes that must be present on a start tag, keyed by lowercase tag
# name. Tags without an entry have no required attributes.
@@required_attribute_map = {
  'link'   => %w[href rel],
  'bdo'    => %w[dir],
  'img'    => %w[src],
  'embed'  => %w[src],
  'object' => [], # XXX one of 'data' or 'type' is required
  'param'  => %w[name value],
  'source' => %w[src],
  'map'    => %w[id]
}
231
+
232
# Attributes permitted on <input>, keyed by the value of its type attribute.
# Types that accept the same attributes are built from one shared list; each
# key still receives its own copy (dup), so no two entries share an array.
plain_button_attrs = %w[accesskey autofocus disabled form name tabindex value]
toggle_attrs       = %w[accesskey autofocus checked disabled form name required tabindex value]
ranged_value_attrs = %w[accesskey autocomplete autofocus disabled form list min max
                        name step readonly required tabindex value]
address_attrs      = %w[accesskey autocomplete autofocus disabled form inputmode list
                        maxlength name pattern readonly required tabindex value]

@@input_type_allowed_attribute_map = {
  'text'           => %w[accesskey autocomplete autofocus disabled form inputmode list
                         maxlength name pattern readonly required size tabindex value],
  'password'       => %w[accesskey autocomplete autofocus disabled form inputmode
                         maxlength name pattern readonly required size tabindex value],
  'checkbox'       => toggle_attrs.dup,
  'radio'          => toggle_attrs.dup,
  'button'         => plain_button_attrs.dup,
  'submit'         => %w[accesskey action autofocus disabled enctype form method name
                         replace tabindex target value],
  'reset'          => plain_button_attrs.dup,
  'add'            => %w[accesskey autofocus disabled form name tabindex template value],
  'remove'         => plain_button_attrs.dup,
  'move-up'        => plain_button_attrs.dup,
  'move-down'      => plain_button_attrs.dup,
  'file'           => %w[accept accesskey autofocus disabled form min max name required tabindex],
  'hidden'         => %w[disabled form name value],
  'image'          => %w[accesskey action alt autofocus disabled enctype form method name
                         replace src tabindex target],
  'datetime'       => ranged_value_attrs.dup,
  'datetime-local' => ranged_value_attrs.dup,
  'date'           => ranged_value_attrs.dup,
  'month'          => ranged_value_attrs.dup,
  'week'           => ranged_value_attrs.dup,
  'time'           => ranged_value_attrs.dup,
  'number'         => ranged_value_attrs.dup,
  'range'          => ranged_value_attrs.dup,
  'email'          => address_attrs.dup,
  'url'            => address_attrs.dup
}
258
+
259
# Attributes that are reported as deprecated on <input>, keyed by the value
# of the type attribute.
@@input_type_deprecated_attribute_map = {
  'text'     => %w[size],
  'password' => %w[size]
}
263
+
264
# Link types accepted in a rel attribute on <link>.
@@link_rel_values = %w[
  alternate archive archives author contact feed first begin start help icon
  index top contents toc last end license copyright next pingback prefetch
  prev previous search stylesheet sidebar tag up
]
# Link types accepted in a rel attribute on any other element (e.g. <a>).
@@a_rel_values = %w[
  alternate archive archives author contact feed first begin start help
  index top contents toc last end license copyright next prev previous
  search sidebar tag up bookmark external nofollow
]
266
+
267
# Wraps +stream+ in an HTML5::HTMLTokenizer (any extra +args+ are handed to
# the tokenizer unchanged) and resets the ID bookkeeping consulted by #eof.
def initialize(stream, *args)
  tokenizer = HTML5::HTMLTokenizer.new(stream, *args)
  super(tokenizer)
  # Tokens that carry an id attribute.
  @things_that_define_an_id = []
  # Tokens that refer to another element by ID (e.g. via contextmenu).
  @things_that_point_to_an_id = []
  # Every ID value encountered so far.
  @ids_we_have_known_and_loved = []
end
273
+
274
# Streams every token of the wrapped object to the caller's block.
#
# Before a token is passed through, the most specific validator available is
# run for it: validate_<type>_<name> if this object responds to it, otherwise
# validate_<type>. Whatever the validator yields is passed to the block ahead
# of the token itself. After the last token, the results of #eof are yielded.
def each
  __getobj__.each do |token|
    type_part = token.fetch(:type, '-').to_s.underscore
    name_part = token.fetch(:name, '-').to_s.underscore
    candidates = ["validate_#{type_part}_#{name_part}", "validate_#{type_part}"]
    validator = candidates.find { |candidate| respond_to?(candidate) }
    send(validator, token) { |error| yield error } if validator
    yield token
  end
  eof { |error| yield error }
end
293
+
294
+ ##########################################################################
295
+ # Start tag validation
296
+ ##########################################################################
297
+
298
# Generic start-tag validation: runs the tag-name check, the required and
# unknown attribute checks, and finally the per-attribute value checks, in
# that order. Every error produced is handed to the caller's block.
def validate_start_tag(token, &report)
  check_unknown_start_tag(token, &report)
  check_start_tag_required_attributes(token, &report)
  check_start_tag_unknown_attributes(token, &report)
  check_attribute_values(token, &report)
end
310
+
311
# Start-tag validation for <embed>: only the required-attribute and
# attribute-value checks are run.
def validate_start_tag_embed(token, &report)
  check_start_tag_required_attributes(token, &report)
  check_attribute_values(token, &report)
  # The spec allows "any attributes w/o namespace" on <embed>, which is why
  # check_start_tag_unknown_attributes is deliberately not called here.
end
321
+
322
# Start-tag validation for <input>.
#
# Runs the attribute-value checks, then validates the attribute names against
# the type of input: the type attribute (defaulting to "text") selects the
# list of attributes allowed for that type. Yields
#   * "unknown-input-type" when the type has no entry in the per-type map,
#   * "unknown-attribute" for attributes missing from the generic <input> list,
#   * "attribute-not-allowed-on-this-input-type" for attributes that are valid
#     on some <input> but not on this type,
#   * "deprecated-attribute" for attributes deprecated on this type.
#
# NOTE(review): the generic <input> list consulted here contains neither
# `type` nor the global attributes, so those appear to be reported as
# "unknown-attribute" -- confirm whether that is intended.
def validate_start_tag_input(token, &report)
  check_attribute_values(token, &report)

  # Attribute names are compared case-insensitively.
  lowered_pairs = token[:data].collect { |(name, value)| [name.downcase, value] }
  attr_dict = Hash[*lowered_pairs.flatten]
  input_type = attr_dict.fetch('type', "text")
  unless @@input_type_allowed_attribute_map.has_key?(input_type)
    yield({:type => "ParseError",
           :data => "unknown-input-type",
           :datavars => {:attrValue => input_type}})
  end

  allowed_for_type = @@input_type_allowed_attribute_map.fetch(input_type, [])
  deprecated_for_type = @@input_type_deprecated_attribute_map.fetch(input_type, [])
  attr_dict.each_key do |attr_name|
    if !@@allowed_attribute_map['input'].include?(attr_name)
      yield({:type => "ParseError",
             :data => "unknown-attribute",
             :datavars => {"tagName" => "input",
                           "attributeName" => attr_name}})
    elsif !allowed_for_type.include?(attr_name)
      yield({:type => "ParseError",
             :data => "attribute-not-allowed-on-this-input-type",
             :datavars => {"attributeName" => attr_name,
                           "inputType" => input_type}})
    end
    next unless deprecated_for_type.include?(attr_name)
    yield({:type => "ParseError",
           :data => "deprecated-attribute",
           :datavars => {"attributeName" => attr_name,
                         "inputType" => input_type}})
  end
end
354
+
355
+ ##########################################################################
356
+ # Start tag validation helpers
357
+ ##########################################################################
358
+
359
# Yields an "unknown-start-tag" error when the (lowercased) tag name has no
# entry in the allowed-attribute map. A missing name is treated as "".
def check_unknown_start_tag(token)
  name = (token[:name] || "").downcase
  return if @@allowed_attribute_map.has_key?(name)
  yield({:type => "ParseError",
         :data => "unknown-start-tag",
         :datavars => {"tagName" => name}})
end
368
+
369
# Yields one "missing-required-attribute" error for each attribute that the
# required-attribute map lists for this tag but the token does not carry.
# Tags without an entry in the map are skipped.
def check_start_tag_required_attributes(token)
  name = (token[:name] || "").downcase
  required = @@required_attribute_map[name]
  return unless required

  present = (token[:data] || []).collect { |pair| pair[0] }
  required.each do |attr_name|
    next if present.include?(attr_name)
    yield({:type => "ParseError",
           :data => "missing-required-attribute",
           :datavars => {"tagName" => name,
                         "attributeName" => attr_name}})
  end
end
384
+
385
# Yields an "unknown-attribute" error for every attribute of the token that
# is neither a global attribute nor listed for this tag in the
# allowed-attribute map. Attribute names are matched case-insensitively; the
# error reports the name as it appears on the token.
def check_start_tag_unknown_attributes(token)
  name = token[:name].downcase
  recognized = @@global_attributes | @@allowed_attribute_map.fetch(name, [])
  token.fetch(:data, []).each do |attr_name, _attr_value|
    next if recognized.include?(attr_name.downcase)
    yield({:type => "ParseError",
           :data => "unknown-attribute",
           :datavars => {"tagName" => name,
                         "attributeName" => attr_name}})
  end
end
398
+
399
+ ##########################################################################
400
+ # Attribute validation helpers
401
+ ##########################################################################
402
+
403
+ # def checkURI(token, tag_name, attr_name, attr_value)
404
+ # is_valid, error_code = rfc3987.is_valid_uri(attr_value)
405
+ # if not is_valid
406
+ # yield {:type => "ParseError",
407
+ # :data => error_code,
408
+ # :datavars => {"tagName" => tag_name,
409
+ # "attributeName" => attr_name}}
410
+ # yield {:type => "ParseError",
411
+ # :data => "invalid-attribute-value",
412
+ # :datavars => {"tagName" => tag_name,
413
+ # "attributeName" => attr_name}}
414
+
415
# Validates +attr_value+ with is_valid_iri, which returns a validity flag and
# an error code. For an invalid value two errors are yielded: first the
# specific error code, then the generic "invalid-attribute-value".
def check_iri(token, tag_name, attr_name, attr_value)
  is_valid, error_code = is_valid_iri(attr_value)
  return if is_valid

  [error_code, "invalid-attribute-value"].each do |code|
    yield({:type => "ParseError",
           :data => code,
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
  end
end
428
+
429
# Validates an attribute value that must be a non-empty ID without spaces.
#
# Yields "attribute-value-can-not-be-blank" for a nil or empty value and then
# stops: there is nothing left to scan, and scanning nil would raise.
# Otherwise yields "space-in-id" followed by "invalid-attribute-value" once,
# at the first space character found.
def check_id(token, tag_name, attr_name, attr_value)
  if !attr_value || attr_value.length == 0
    yield({:type => "ParseError",
           :data => "attribute-value-can-not-be-blank",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
    return
  end
  # Byte-wise scan: the space characters are single bytes, so this works
  # regardless of the string's encoding.
  attr_value.each_byte do |b|
    c = [b].pack('c*')
    next unless HTML5::SPACE_CHARACTERS.include?(c)
    yield({:type => "ParseError",
           :data => "space-in-id",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
    yield({:type => "ParseError",
           :data => "invalid-attribute-value",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
    break
  end
end
451
+
452
# Splits a space-separated attribute value into its tokens.
#
# Returns an array of the non-empty runs of characters between
# HTML5::SPACE_CHARACTERS; nil and blank values give an empty array.
# The value is scanned byte by byte (so malformed input never raises), and
# the tokens are then tagged with the encoding of the original value so
# that non-ASCII tokens compare equal to ordinary strings.
def parse_token_list(value)
  tokens = []
  return tokens if value.nil?

  current = ''
  value.each_byte do |b|
    c = [b].pack('c*')
    if HTML5::SPACE_CHARACTERS.include?(c)
      tokens << current if current.length > 0
      current = ''
    else
      current += c
    end
  end
  tokens << current if current.length > 0

  # Strings carry an encoding only on Ruby 1.9+; older versions skip this.
  if value.respond_to?(:encoding)
    tokens.each { |token| token.force_encoding(value.encoding) }
  end
  tokens
end
471
+
472
# Checks a space-separated token list for repeated entries.
#
# "Token" here means an entry of the attribute value
# (http://www.whatwg.org/specs/web-apps/current-work/#set-of), not a token
# produced by HTMLTokenizer; unlike the other check_* helpers this method
# takes no tokenizer token. Yields "duplicate-value-in-token-list" for the
# first repeated entry only and then stops.
def check_token_list(tag_name, attr_name, attr_value)
  seen = {}
  parse_token_list(attr_value).each do |entry|
    if seen.has_key?(entry)
      yield({:type => "ParseError",
             :data => "duplicate-value-in-token-list",
             :datavars => {"tagName" => tag_name,
                           "attributeName" => attr_name,
                           "attributeValue" => entry}})
      break
    end
    seen[entry] = 1
  end
end
491
+
492
# Validates an attribute whose value must be one of +enumerated_values+
# (compared case-insensitively; the list is expected to be lowercase).
#
# Yields "attribute-value-can-not-be-blank" for a nil or empty value.
# For any other value not in the list, yields "invalid-enumerated-value"
# followed by "invalid-attribute-value".
def check_enumerated_value(token, tag_name, attr_name, attr_value, enumerated_values)
  if !attr_value || attr_value.length == 0
    yield({:type => "ParseError",
           :data => "attribute-value-can-not-be-blank",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
    return
  end
  # Compare against a lowercased copy: the caller's string (normally part of
  # the token) must not be modified, and may be frozen.
  normalized = attr_value.downcase
  return if enumerated_values.include?(normalized)

  yield({:type => "ParseError",
         :data => "invalid-enumerated-value",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name,
                       "enumeratedValues" => enumerated_values}})
  yield({:type => "ParseError",
         :data => "invalid-attribute-value",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
end
513
+
514
# Validates a boolean attribute: the value must be either the empty string or
# exactly the attribute's own name. Anything else yields
# "invalid-boolean-value" followed by "invalid-attribute-value".
def check_boolean(token, tag_name, attr_name, attr_value)
  accepted = [attr_name, '']
  return if accepted.include?(attr_value)

  yield({:type => "ParseError",
         :data => "invalid-boolean-value",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name,
                       "enumeratedValues" => accepted}})
  yield({:type => "ParseError",
         :data => "invalid-attribute-value",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
end
528
+
529
# Validates an attribute value that must start with an integer: optional
# leading whitespace, an optional '-', then at least one digit. Anything
# after the digits is ignored.
#
# Yields "invalid-integer-value" when an unexpected character appears before
# the first digit, and "attribute-value-can-not-be-blank" when the value
# (nil included) contains no digits at all.
def check_integer(token, tag_name, attr_name, attr_value)
  error = {:type => "ParseError",
           :data => "invalid-integer-value",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name,
                         "attributeValue" => attr_value}}
  digits = ''
  state = 'begin' # ('begin', 'initial-number', 'in-number', 'trailing-junk')
  # /./m so that newlines are seen as characters too; without it a newline
  # between '-' and the digits would silently be skipped.
  attr_value.to_s.scan(/./m) do |c|
    case state
    when 'begin'
      if HTML5::SPACE_CHARACTERS.include?(c)
        next
      elsif c == '-'
        state = 'initial-number'
      elsif HTML5::DIGITS.include?(c)
        digits += c
        state = 'in-number'
      else
        yield error
        return
      end
    when 'initial-number'
      # A sign must be followed directly by a digit.
      unless HTML5::DIGITS.include?(c)
        yield error
        return
      end
      digits += c
      state = 'in-number'
    when 'in-number'
      if HTML5::DIGITS.include?(c)
        digits += c
      else
        state = 'trailing-junk'
      end
    end
    # 'trailing-junk': everything after the number is ignored.
  end
  if digits.length == 0
    yield({:type => "ParseError",
           :data => "attribute-value-can-not-be-blank",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
  end
end
576
+
577
# Placeholder for floating point validation (XXX not implemented): every
# value is accepted and nothing is yielded.
def check_floating_point_number(token, tag_name, attr_name, attr_value)
  # XXX
end
580
+
581
# Validates a browsing-context name such as the value of a target attribute.
#
# Only names starting with an underscore are checked; those must be one of
# _self, _parent, _top or _blank (case-insensitive), otherwise
# "invalid-browsing-context" is yielded. nil, empty, and all other names are
# accepted.
def check_browsing_context(token, tag_name, attr_name, attr_value)
  return if not attr_value
  return if attr_value[0, 1] != '_'
  # Compare a lowercased copy so the caller's (possibly frozen) string is
  # left untouched.
  return if ['_self', '_parent', '_top', '_blank'].include?(attr_value.downcase)
  yield({:type => "ParseError",
         :data => "invalid-browsing-context",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
end
591
+
592
# Validates a language code with is_valid_lang_code. A nil or empty value is
# accepted; any other value that fails validation yields "invalid-lang-code".
def check_lang_code(token, tag_name, attr_name, attr_value)
  return unless attr_value
  return if attr_value == '' # blank is OK
  return if is_valid_lang_code(attr_value)

  yield({:type => "ParseError",
         :data => "invalid-lang-code",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name,
                       "attributeValue" => attr_value}})
end
602
+
603
# Validates a MIME type attribute value with is_valid_mime_type.
#
# Yields "attribute-value-can-not-be-blank" for a nil or empty value, and
# "invalid-mime-type" for any non-nil value the validator rejects. A nil
# value is never handed to the validator.
def check_mime_type(token, tag_name, attr_name, attr_value)
  # XXX needs tests
  if !attr_value || attr_value.length == 0
    yield({:type => "ParseError",
           :data => "attribute-value-can-not-be-blank",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
  end
  return if attr_value.nil?

  if not is_valid_mime_type(attr_value)
    yield({:type => "ParseError",
           :data => "invalid-mime-type",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name,
                         "attributeValue" => attr_value}})
  end
end
619
+
620
# Placeholder for media query validation (XXX not implemented): every value
# is accepted and nothing is yielded.
def check_media_query(token, tag_name, attr_name, attr_value)
  # XXX
end
623
+
624
# Validates a rel attribute value.
#
# First reports duplicate entries via check_token_list, then yields one
# "invalid-rel" error for each entry that is not an allowed link type. The
# <link> list is used when +tag_name+ is 'link', the <a> list otherwise.
def check_link_relation(token, tag_name, attr_name, attr_value, &report)
  check_token_list(tag_name, attr_name, attr_value, &report)

  allowed_values = if tag_name == 'link'
                     @@link_rel_values
                   else
                     @@a_rel_values
                   end
  parse_token_list(attr_value).each do |relation|
    next if allowed_values.include?(relation)
    yield({:type => "ParseError",
           :data => "invalid-rel",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
  end
end
639
+
640
# Placeholder for date/time validation (XXX not implemented): every value is
# accepted, nothing is yielded, and nil is returned like the other stubs.
def check_date_time(token, tag_name, attr_name, attr_value)
  # XXX The sketched implementation is a character-level state machine
  # starting in a 'begin' state: skip leading space characters, then move
  # on at the first digit. Nothing beyond that sketch exists yet.
end
650
+
651
+ ##########################################################################
652
+ # Attribute validation
653
+ ##########################################################################
654
+
655
# Dispatches every attribute of the token to a value validator.
#
# For each attribute (name lowercased) the most specific validator this
# object responds to is used: validate_attribute_value_<tag>_<attr> first,
# validate_attribute_value_<attr> otherwise. Attributes without a validator
# are skipped. Errors from the validator are passed to the caller's block.
def check_attribute_values(token)
  tag_name = token.fetch(:name, "")
  token.fetch(:data, []).each do |attr_name, attr_value|
    attr_name = attr_name.downcase
    tag_part = tag_name.to_s.underscore
    attr_part = attr_name.to_s.underscore
    candidates = ["validate_attribute_value_#{tag_part}_#{attr_part}",
                  "validate_attribute_value_#{attr_part}"]
    validator = candidates.find { |candidate| respond_to?(candidate) }
    next unless validator
    send(validator, token, tag_name, attr_name, attr_value) { |error| yield error }
  end
end
674
+
675
# Validates a class attribute as a token list. Each problem reported by
# check_token_list is passed on, immediately followed by a generic
# "invalid-attribute-value" error.
def validate_attribute_value_class(token, tag_name, attr_name, attr_value)
  check_token_list(tag_name, attr_name, attr_value) do |list_error|
    yield list_error
    yield({:type => "ParseError",
           :data => "invalid-attribute-value",
           :datavars => {"tagName" => tag_name,
                         "attributeName" => attr_name}})
  end
end
684
+
685
# contenteditable is checked as an enumerated attribute with the values
# 'true', 'false' and ''.
def validate_attribute_value_contenteditable(token, tag_name, attr_name, attr_value, &report)
  check_enumerated_value(token, tag_name, attr_name, attr_value, ['true', 'false', ''], &report)
end
690
+
691
# dir is checked as an enumerated attribute with the values 'ltr' and 'rtl'.
def validate_attribute_value_dir(token, tag_name, attr_name, attr_value, &report)
  check_enumerated_value(token, tag_name, attr_name, attr_value, ['ltr', 'rtl'], &report)
end
696
+
697
# draggable is checked as an enumerated attribute with the values 'true' and
# 'false'.
def validate_attribute_value_draggable(token, tag_name, attr_name, attr_value, &report)
  check_enumerated_value(token, tag_name, attr_name, attr_value, ['true', 'false'], &report)
end
702
+
703
# Global attributes validated directly by a generic check.
alias validate_attribute_value_irrelevant check_boolean
alias validate_attribute_value_lang       check_lang_code
705
+
706
# Validates the contextmenu value as an ID and records the token as something
# that points to an ID, so that #eof can later verify the target.
def validate_attribute_value_contextmenu(token, tag_name, attr_name, attr_value, &report)
  check_id(token, tag_name, attr_name, attr_value, &report)
  @things_that_point_to_an_id.push(token)
end
712
+
713
# Validates an id attribute and records it.
#
# This method has side effects: besides running check_id it remembers the ID
# value and the defining token, so that duplicated IDs can be reported here
# ("duplicate-id") and references by ID (such as contextmenu) can be
# resolved once the whole document has been seen. A nil value is checked but
# not recorded.
def validate_attribute_value_id(token, tag_name, attr_name, attr_value, &report)
  check_id(token, tag_name, attr_name, attr_value, &report)
  return unless attr_value

  already_seen = @ids_we_have_known_and_loved.include?(attr_value)
  if already_seen
    yield({:type => "ParseError",
           :data => "duplicate-id",
           :datavars => {"tagName" => tag_name}})
  end
  @ids_we_have_known_and_loved.push(attr_value)
  @things_that_define_an_id.push(token)
end
731
+
732
# tabindex values are validated as integers.
alias validate_attribute_value_tabindex check_integer
733
+
734
# Placeholder for validation of the ref attribute (XXX not implemented):
# every value is accepted and nothing is yielded.
def validate_attribute_value_ref(token, tag_name, attr_name, attr_value)
  # XXX
end
737
+
738
# Placeholder for validation of the template attribute (XXX not
# implemented): every value is accepted and nothing is yielded.
def validate_attribute_value_template(token, tag_name, attr_name, attr_value)
  # XXX
end
741
+
742
# The xmlns attribute on <html> must be exactly the XHTML namespace; any
# other value yields "invalid-root-namespace".
def validate_attribute_value_html_xmlns(token, tag_name, attr_name, attr_value)
  return if attr_value == "http://www.w3.org/1999/xhtml"

  yield({:type => "ParseError",
         :data => "invalid-root-namespace",
         :datavars => {"tagName" => tag_name,
                       "attributeName" => attr_name}})
end
750
+
751
# Element-specific attribute validators that map straight onto a generic
# check. The method name encodes <tag>_<attribute>.

# <base>
alias validate_attribute_value_base_href       check_iri
alias validate_attribute_value_base_target     check_browsing_context
# <link>
alias validate_attribute_value_link_href       check_iri
alias validate_attribute_value_link_rel        check_link_relation
alias validate_attribute_value_link_media      check_media_query
alias validate_attribute_value_link_hreflang   check_lang_code
alias validate_attribute_value_link_type       check_mime_type
# XXX <meta> attributes
# <style>
alias validate_attribute_value_style_media     check_media_query
alias validate_attribute_value_style_type      check_mime_type
alias validate_attribute_value_style_scoped    check_boolean
# <blockquote>, <ol>, <li>
alias validate_attribute_value_blockquote_cite check_iri
alias validate_attribute_value_ol_start        check_integer
alias validate_attribute_value_li_value        check_integer
# XXX need tests from here on
# <a>
alias validate_attribute_value_a_href          check_iri
alias validate_attribute_value_a_target        check_browsing_context
768
+
769
# Validates the ping attribute of <a>: a space-separated list of URLs, each
# of which is checked individually with check_iri. Errors from check_iri are
# passed to the caller's block.
def validate_attribute_value_a_ping(token, tag_name, attr_name, attr_value)
  parse_token_list(attr_value).each do |ping_url|
    # Check the single URL, not the whole list, which as one string would
    # not be a valid IRI as soon as it holds more than one entry.
    check_iri(token, tag_name, attr_name, ping_url) do |error|
      yield error
    end
  end
end
777
+
778
# More element-specific attribute validators that map straight onto a
# generic check. The method name encodes <tag>_<attribute>.

# <a>
alias validate_attribute_value_a_rel          check_link_relation
alias validate_attribute_value_a_media        check_media_query
alias validate_attribute_value_a_hreflang     check_lang_code
alias validate_attribute_value_a_type         check_mime_type
# <q>, <time>
alias validate_attribute_value_q_cite         check_iri
alias validate_attribute_value_time_datetime  check_date_time
# <meter>
alias validate_attribute_value_meter_value    check_floating_point_number
alias validate_attribute_value_meter_min      check_floating_point_number
alias validate_attribute_value_meter_low      check_floating_point_number
alias validate_attribute_value_meter_high     check_floating_point_number
alias validate_attribute_value_meter_max      check_floating_point_number
alias validate_attribute_value_meter_optimum  check_floating_point_number
# <progress>
alias validate_attribute_value_progress_value check_floating_point_number
alias validate_attribute_value_progress_max   check_floating_point_number
# <ins>, <del>
alias validate_attribute_value_ins_cite       check_iri
alias validate_attribute_value_ins_datetime   check_date_time
alias validate_attribute_value_del_cite       check_iri
alias validate_attribute_value_del_datetime   check_date_time
796
+
797
+ ##########################################################################
798
+ # Whole document validation (IDs, etc.)
799
+ ##########################################################################
800
+
801
# Whole-document checks, run once after the last token.
#
# For every token recorded as pointing to an ID via contextmenu:
#   * yields "id-does-not-exist" when no element defined that ID;
#   * otherwise yields "contextmenu-must-point-to-menu" when the first
#     element defining the ID is not a <menu>.
def eof
  # Attribute data may be a list of [name, value] pairs or a hash of
  # name => value; look the attribute up in either form.
  # A missing attribute reads as "".
  lookup = lambda do |data, wanted|
    pair = (data || []).find { |key, _value| key.to_s.downcase == wanted }
    pair ? pair[1] : ""
  end

  @things_that_point_to_an_id.each do |token|
    tag_name = token.fetch(:name, "").downcase
    attr_value = lookup.call(token[:data], "contextmenu")
    if attr_value and (!@ids_we_have_known_and_loved.include?(attr_value))
      yield({:type => "ParseError",
             :data => "id-does-not-exist",
             :datavars => {"tagName" => tag_name,
                           "attributeName" => "contextmenu",
                           "attributeValue" => attr_value}})
    else
      # First element that defines the referenced ID; elements whose id has
      # no value are skipped.
      definer = @things_that_define_an_id.find do |ref_token|
        id = lookup.call(ref_token[:data], "id")
        id && id == attr_value
      end
      if definer && definer.fetch(:name, "").downcase != "menu"
        yield({:type => "ParseError",
               :data => "contextmenu-must-point-to-menu"})
      end
    end
  end
end
830
+ end