feedtools 0.2.26 → 0.2.27

Sign up to get free protection for your applications and to get access to all the features.
Files changed (166) hide show
  1. data/CHANGELOG +232 -216
  2. data/db/migration.rb +2 -0
  3. data/db/schema.mysql.sql +2 -0
  4. data/db/schema.postgresql.sql +3 -1
  5. data/db/schema.sqlite.sql +3 -1
  6. data/lib/feed_tools.rb +37 -14
  7. data/lib/feed_tools/database_feed_cache.rb +13 -2
  8. data/lib/feed_tools/feed.rb +430 -104
  9. data/lib/feed_tools/feed_item.rb +533 -268
  10. data/lib/feed_tools/helpers/generic_helper.rb +1 -1
  11. data/lib/feed_tools/helpers/html_helper.rb +78 -116
  12. data/lib/feed_tools/helpers/retrieval_helper.rb +33 -3
  13. data/lib/feed_tools/helpers/uri_helper.rb +46 -54
  14. data/lib/feed_tools/monkey_patch.rb +27 -1
  15. data/lib/feed_tools/vendor/html5/History.txt +10 -0
  16. data/lib/feed_tools/vendor/html5/Manifest.txt +117 -0
  17. data/lib/feed_tools/vendor/html5/README +45 -0
  18. data/lib/feed_tools/vendor/html5/Rakefile.rb +33 -0
  19. data/lib/feed_tools/vendor/html5/bin/html5 +217 -0
  20. data/lib/feed_tools/vendor/html5/lib/core_ext/string.rb +17 -0
  21. data/lib/feed_tools/vendor/html5/lib/html5.rb +13 -0
  22. data/lib/feed_tools/vendor/html5/lib/html5/constants.rb +1046 -0
  23. data/lib/feed_tools/vendor/html5/lib/html5/filters/base.rb +10 -0
  24. data/lib/feed_tools/vendor/html5/lib/html5/filters/inject_meta_charset.rb +82 -0
  25. data/lib/feed_tools/vendor/html5/lib/html5/filters/iso639codes.rb +752 -0
  26. data/lib/feed_tools/vendor/html5/lib/html5/filters/optionaltags.rb +198 -0
  27. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc2046.rb +30 -0
  28. data/lib/feed_tools/vendor/html5/lib/html5/filters/rfc3987.rb +89 -0
  29. data/lib/feed_tools/vendor/html5/lib/html5/filters/sanitizer.rb +15 -0
  30. data/lib/feed_tools/vendor/html5/lib/html5/filters/validator.rb +830 -0
  31. data/lib/feed_tools/vendor/html5/lib/html5/filters/whitespace.rb +36 -0
  32. data/lib/feed_tools/vendor/html5/lib/html5/html5parser.rb +248 -0
  33. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_body_phase.rb +46 -0
  34. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  35. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/after_head_phase.rb +50 -0
  36. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/before_head_phase.rb +41 -0
  37. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_body_phase.rb +613 -0
  38. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_caption_phase.rb +69 -0
  39. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_cell_phase.rb +78 -0
  40. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  41. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  42. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_head_phase.rb +138 -0
  43. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_row_phase.rb +89 -0
  44. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_select_phase.rb +85 -0
  45. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_body_phase.rb +86 -0
  46. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/in_table_phase.rb +115 -0
  47. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/initial_phase.rb +133 -0
  48. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/phase.rb +154 -0
  49. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/root_element_phase.rb +41 -0
  50. data/lib/feed_tools/vendor/html5/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  51. data/lib/feed_tools/vendor/html5/lib/html5/inputstream.rb +648 -0
  52. data/lib/feed_tools/vendor/html5/lib/html5/liberalxmlparser.rb +158 -0
  53. data/lib/feed_tools/vendor/html5/lib/html5/sanitizer.rb +188 -0
  54. data/lib/feed_tools/vendor/html5/lib/html5/serializer.rb +2 -0
  55. data/lib/feed_tools/vendor/html5/lib/html5/serializer/htmlserializer.rb +179 -0
  56. data/lib/feed_tools/vendor/html5/lib/html5/serializer/xhtmlserializer.rb +20 -0
  57. data/lib/feed_tools/vendor/html5/lib/html5/sniffer.rb +45 -0
  58. data/lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb +966 -0
  59. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders.rb +24 -0
  60. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/base.rb +334 -0
  61. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/hpricot.rb +231 -0
  62. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/rexml.rb +209 -0
  63. data/lib/feed_tools/vendor/html5/lib/html5/treebuilders/simpletree.rb +185 -0
  64. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers.rb +26 -0
  65. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/base.rb +162 -0
  66. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/hpricot.rb +48 -0
  67. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/rexml.rb +48 -0
  68. data/lib/feed_tools/vendor/html5/lib/html5/treewalkers/simpletree.rb +48 -0
  69. data/lib/feed_tools/vendor/html5/lib/html5/version.rb +3 -0
  70. data/lib/feed_tools/vendor/html5/testdata/encoding/chardet/test_big5.txt +51 -0
  71. data/lib/feed_tools/vendor/html5/testdata/encoding/test-yahoo-jp.dat +10 -0
  72. data/lib/feed_tools/vendor/html5/testdata/encoding/tests1.dat +394 -0
  73. data/lib/feed_tools/vendor/html5/testdata/encoding/tests2.dat +81 -0
  74. data/lib/feed_tools/vendor/html5/testdata/sanitizer/tests1.dat +416 -0
  75. data/lib/feed_tools/vendor/html5/testdata/serializer/core.test +104 -0
  76. data/lib/feed_tools/vendor/html5/testdata/serializer/injectmeta.test +65 -0
  77. data/lib/feed_tools/vendor/html5/testdata/serializer/optionaltags.test +900 -0
  78. data/lib/feed_tools/vendor/html5/testdata/serializer/options.test +60 -0
  79. data/lib/feed_tools/vendor/html5/testdata/serializer/whitespace.test +51 -0
  80. data/lib/feed_tools/vendor/html5/testdata/sites/google-results.htm +1 -0
  81. data/lib/feed_tools/vendor/html5/testdata/sites/python-ref-import.htm +1 -0
  82. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps-old.htm +1 -0
  83. data/lib/feed_tools/vendor/html5/testdata/sites/web-apps.htm +34275 -0
  84. data/lib/feed_tools/vendor/html5/testdata/sniffer/htmlOrFeed.json +43 -0
  85. data/lib/feed_tools/vendor/html5/testdata/tokenizer/contentModelFlags.test +48 -0
  86. data/lib/feed_tools/vendor/html5/testdata/tokenizer/entities.test +2339 -0
  87. data/lib/feed_tools/vendor/html5/testdata/tokenizer/escapeFlag.test +21 -0
  88. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test1.test +172 -0
  89. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test2.test +129 -0
  90. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test3.test +367 -0
  91. data/lib/feed_tools/vendor/html5/testdata/tokenizer/test4.test +198 -0
  92. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests1.dat +1950 -0
  93. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests2.dat +773 -0
  94. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests3.dat +270 -0
  95. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests4.dat +60 -0
  96. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests5.dat +175 -0
  97. data/lib/feed_tools/vendor/html5/testdata/tree-construction/tests6.dat +196 -0
  98. data/lib/feed_tools/vendor/html5/testdata/validator/attributes.test +1035 -0
  99. data/lib/feed_tools/vendor/html5/testdata/validator/base-href-attribute.test +787 -0
  100. data/lib/feed_tools/vendor/html5/testdata/validator/base-target-attribute.test +35 -0
  101. data/lib/feed_tools/vendor/html5/testdata/validator/blockquote-cite-attribute.test +7 -0
  102. data/lib/feed_tools/vendor/html5/testdata/validator/classattribute.test +152 -0
  103. data/lib/feed_tools/vendor/html5/testdata/validator/contenteditableattribute.test +59 -0
  104. data/lib/feed_tools/vendor/html5/testdata/validator/contextmenuattribute.test +115 -0
  105. data/lib/feed_tools/vendor/html5/testdata/validator/dirattribute.test +59 -0
  106. data/lib/feed_tools/vendor/html5/testdata/validator/draggableattribute.test +63 -0
  107. data/lib/feed_tools/vendor/html5/testdata/validator/html-xmlns-attribute.test +23 -0
  108. data/lib/feed_tools/vendor/html5/testdata/validator/idattribute.test +115 -0
  109. data/lib/feed_tools/vendor/html5/testdata/validator/inputattributes.test +2795 -0
  110. data/lib/feed_tools/vendor/html5/testdata/validator/irrelevantattribute.test +63 -0
  111. data/lib/feed_tools/vendor/html5/testdata/validator/langattribute.test +5579 -0
  112. data/lib/feed_tools/vendor/html5/testdata/validator/li-value-attribute.test +7 -0
  113. data/lib/feed_tools/vendor/html5/testdata/validator/link-href-attribute.test +7 -0
  114. data/lib/feed_tools/vendor/html5/testdata/validator/link-hreflang-attribute.test +7 -0
  115. data/lib/feed_tools/vendor/html5/testdata/validator/link-rel-attribute.test +271 -0
  116. data/lib/feed_tools/vendor/html5/testdata/validator/ol-start-attribute.test +7 -0
  117. data/lib/feed_tools/vendor/html5/testdata/validator/starttags.test +375 -0
  118. data/lib/feed_tools/vendor/html5/testdata/validator/style-scoped-attribute.test +7 -0
  119. data/lib/feed_tools/vendor/html5/testdata/validator/tabindexattribute.test +79 -0
  120. data/lib/feed_tools/vendor/html5/tests/preamble.rb +72 -0
  121. data/lib/feed_tools/vendor/html5/tests/test_encoding.rb +35 -0
  122. data/lib/feed_tools/vendor/html5/tests/test_lxp.rb +279 -0
  123. data/lib/feed_tools/vendor/html5/tests/test_parser.rb +68 -0
  124. data/lib/feed_tools/vendor/html5/tests/test_sanitizer.rb +142 -0
  125. data/lib/feed_tools/vendor/html5/tests/test_serializer.rb +68 -0
  126. data/lib/feed_tools/vendor/html5/tests/test_sniffer.rb +27 -0
  127. data/lib/feed_tools/vendor/html5/tests/test_stream.rb +62 -0
  128. data/lib/feed_tools/vendor/html5/tests/test_tokenizer.rb +94 -0
  129. data/lib/feed_tools/vendor/html5/tests/test_treewalkers.rb +135 -0
  130. data/lib/feed_tools/vendor/html5/tests/test_validator.rb +31 -0
  131. data/lib/feed_tools/vendor/html5/tests/tokenizer_test_parser.rb +63 -0
  132. data/lib/feed_tools/vendor/uri.rb +781 -0
  133. data/lib/feed_tools/version.rb +1 -1
  134. data/rakefile +27 -6
  135. data/test/unit/atom_test.rb +298 -210
  136. data/test/unit/helper_test.rb +7 -12
  137. data/test/unit/rdf_test.rb +51 -1
  138. data/test/unit/rss_test.rb +13 -3
  139. metadata +239 -116
  140. data/lib/feed_tools/vendor/htree.rb +0 -97
  141. data/lib/feed_tools/vendor/htree/container.rb +0 -10
  142. data/lib/feed_tools/vendor/htree/context.rb +0 -67
  143. data/lib/feed_tools/vendor/htree/display.rb +0 -27
  144. data/lib/feed_tools/vendor/htree/doc.rb +0 -149
  145. data/lib/feed_tools/vendor/htree/elem.rb +0 -262
  146. data/lib/feed_tools/vendor/htree/encoder.rb +0 -163
  147. data/lib/feed_tools/vendor/htree/equality.rb +0 -218
  148. data/lib/feed_tools/vendor/htree/extract_text.rb +0 -37
  149. data/lib/feed_tools/vendor/htree/fstr.rb +0 -33
  150. data/lib/feed_tools/vendor/htree/gencode.rb +0 -97
  151. data/lib/feed_tools/vendor/htree/htmlinfo.rb +0 -672
  152. data/lib/feed_tools/vendor/htree/inspect.rb +0 -108
  153. data/lib/feed_tools/vendor/htree/leaf.rb +0 -94
  154. data/lib/feed_tools/vendor/htree/loc.rb +0 -367
  155. data/lib/feed_tools/vendor/htree/modules.rb +0 -48
  156. data/lib/feed_tools/vendor/htree/name.rb +0 -124
  157. data/lib/feed_tools/vendor/htree/output.rb +0 -207
  158. data/lib/feed_tools/vendor/htree/parse.rb +0 -409
  159. data/lib/feed_tools/vendor/htree/raw_string.rb +0 -124
  160. data/lib/feed_tools/vendor/htree/regexp-util.rb +0 -15
  161. data/lib/feed_tools/vendor/htree/rexml.rb +0 -130
  162. data/lib/feed_tools/vendor/htree/scan.rb +0 -166
  163. data/lib/feed_tools/vendor/htree/tag.rb +0 -111
  164. data/lib/feed_tools/vendor/htree/template.rb +0 -909
  165. data/lib/feed_tools/vendor/htree/text.rb +0 -115
  166. data/lib/feed_tools/vendor/htree/traverse.rb +0 -465
@@ -0,0 +1,158 @@
1
+ # Warning: this module is experimental and subject to change and even removal
2
+ # at any time.
3
+ #
4
+ # For background/rationale, see:
5
+ # * http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
6
+ # * http://tinyurl.com/ylfj8k (and follow-ups)
7
+ #
8
+ # References:
9
+ # * http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
10
+ # * http://wiki.whatwg.org/wiki/HtmlVsXhtml
11
+ #
12
+ # @@TODO:
13
+ # * Selectively lowercase only XHTML, but not foreign markup
14
+ require 'html5/html5parser'
15
+ require 'html5/constants'
16
+
17
+ module HTML5
18
+
19
+ # liberal XML parser
20
# Liberal XML parser built on HTMLParser. It swaps in XmlRootPhase as the
# initial phase and normalizes every token before it is processed.
class XMLParser < HTMLParser

  # Builds the parser through HTMLParser#initialize, then replaces the
  # :initial phase with an XmlRootPhase bound to this parser and its tree.
  def initialize(options = {})
    super options
    @phases[:initial] = XmlRootPhase.new(self, @tree)
  end

  # Normalizes +token+ in place and returns it.
  #
  # * :StartTag / :EmptyTag - the attribute list becomes a Hash; an
  #   :EmptyTag is processed as a start tag right away and then turned
  #   into the matching :EndTag.
  # * :Characters - entity escapes are undone while the tokenizer's
  #   content model flag is :CDATA.
  # * :EndTag - a parse error is reported when the tag carries attributes.
  # * :Comment - a comment of the form "[CDATA[...]]" becomes :Characters.
  def normalize_token(token)
    case token[:type]
    when :StartTag, :EmptyTag
      # We need to remove the duplicate attributes and convert attributes
      # to a Hash so that [["x", "y"], ["x", "z"]] becomes {"x" => "y"}.
      # The list is reversed first so that the first occurrence wins.
      token[:data] = Hash[*token[:data].reverse.flatten]

      # For EmptyTags, process both a Start and an End tag
      if token[:type] == :EmptyTag
        # Processing the start tag may change the content model flag;
        # restore it afterwards.
        save = @tokenizer.content_model_flag
        @phase.processStartTag(token[:name], token[:data])
        @tokenizer.content_model_flag = save
        token[:data] = {}
        token[:type] = :EndTag
      end

    when :Characters
      # un-escape RCDATA_ELEMENTS (e.g. style, script)
      if @tokenizer.content_model_flag == :CDATA
        token[:data] = token[:data].
          gsub('&lt;','<').gsub('&gt;','>').gsub('&amp;','&')
      end

    when :EndTag
      # An empty Array or Hash is truthy in Ruby, so test for actual
      # content instead of mere presence; otherwise every end tag would
      # be reported as carrying attributes.
      attributes = token[:data]
      if attributes && !attributes.empty?
        parse_error("attributes-in-end-tag")
      end

    when :Comment
      # Rescue CDATA from the comments
      if token[:data][0..6] == "[CDATA[" and token[:data][-2..-1] == "]]"
        token[:type] = :Characters
        token[:data] = token[:data][7 ... -2]
      end
    end

    return token
  end
end
67
+
68
+ # liberal XHTML parser
69
# XMLParser variant for XHTML: it restores the regular InitialPhase and
# uses XhmlRootPhase for the root element.
class XHTMLParser < XMLParser

  # Builds the parser through XMLParser#initialize, then overrides the
  # :initial and :rootElement phases.
  def initialize(options = {})
    super options
    @phases[:initial] = InitialPhase.new(self, @tree)
    @phases[:rootElement] = XhmlRootPhase.new(self, @tree)
  end

  # Applies XMLParser#normalize_token and then adjusts :EndTag tokens.
  # Returns the (mutated) token.
  def normalize_token(token)
    super(token)

    # ensure that non-void XHTML elements have content so that separate
    # open and close tags are emitted
    if token[:type] == :EndTag
      current = @tree.open_elements[-1]

      if VOID_ELEMENTS.include? token[:name]
        # Tokens are keyed by Symbol (:name, :data), so the comparison and
        # the default must use Symbol keys as well. An empty stack of open
        # elements is treated like a non-matching current element.
        if current.nil? || current.name != token[:name]
          token[:type] = :EmptyTag
          token[:data] ||= {}
        end
      elsif current && token[:name] == current.name && !current.hasContent
        # Insert an empty text node unless some open element declares a
        # default namespace other than XHTML.
        foreign = @tree.open_elements.any? {|e|
          e.attributes.keys.include? 'xmlns' and
            e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml'
        }
        @tree.insertText('') unless foreign
      end
    end

    return token
  end
end
103
+
104
# Root element phase used by XHTMLParser.
class XhmlRootPhase < RootElementPhase

  # Creates an <html> element carrying the XHTML namespace declaration,
  # pushes it onto the open elements, appends it to the document and
  # switches the parser to its :beforeHead phase.
  def insert_html_element
    xhtml_namespace = {'xmlns' => 'http://www.w3.org/1999/xhtml'}
    html = @tree.createElement("html", xhtml_namespace)

    @tree.open_elements.push(html)
    @tree.document.appendChild(html)

    @parser.phase = @parser.phases[:beforeHead]
  end
end
112
+
113
# Primes the XML parser: handles the first start tag of the document.
class XmlRootPhase < Phase

  # Handler tables whose default entry names the generic handler.
  # NOTE(review): presumably consulted by Phase when dispatching tags -
  # confirm against Phase.
  @start_tag_handlers = Hash.new(:startTagOther)
  @end_tag_handlers = Hash.new(:endTagOther)

  # Pushes the document onto the open elements, creates the root element
  # as a child of the top open element, pushes it as well, and hands
  # control to a fresh XmlElementPhase.
  def startTagOther(name, attributes)
    @tree.open_elements.push(@tree.document)

    root = @tree.createElement(name, attributes)
    @tree.open_elements[-1].appendChild(root)
    @tree.open_elements.push(root)

    @parser.phase = XmlElementPhase.new(@parser, @tree)
  end

  # Runs the inherited handler, then pops the top open element.
  def endTagOther(name)
    super
    @tree.open_elements.pop
  end
end
129
+
130
# Generic handling for all XML elements.
class XmlElementPhase < Phase

  # Handler tables whose default entry names the generic handler.
  @start_tag_handlers = Hash.new(:startTagOther)
  @end_tag_handlers = Hash.new(:endTagOther)

  # Creates the element, appends it to the top open element and makes it
  # the new top open element.
  def startTagOther(name, attributes)
    child = @tree.createElement(name, attributes)
    @tree.open_elements[-1].appendChild(child)
    @tree.open_elements.push(child)
  end

  # Walks the open elements from the top down. Every element whose name
  # differs from +name+ is reported through parse_error; at the first
  # element that matches, open elements are popped up to and including
  # it and the walk stops.
  def endTagOther(name)
    @tree.open_elements.reverse.each do |candidate|
      unless candidate.name == name
        parse_error
        next
      end

      loop { break if @tree.open_elements.pop == candidate }
      break
    end
  end

  # Inserts character data as text into the tree.
  def processCharacters(data)
    @tree.insertText(data)
  end
end
157
+
158
+ end
@@ -0,0 +1,188 @@
1
+ require 'cgi'
2
+ require 'html5/tokenizer'
3
+
4
+ module HTML5
5
+
6
+ # This module provides sanitization of XHTML+MathML+SVG
7
+ # and of inline style attributes.
8
+ #
9
+ # It can be applied either at the Tokenizer stage:
10
+ #
11
+ # HTMLParser.parse(html, :tokenizer => HTMLSanitizer)
12
+ #
13
+ # or, if you already have a parse tree (in this example, a REXML tree),
14
+ # at the Serializer stage:
15
+ #
16
+ # tokens = TreeWalkers.get_tree_walker('rexml').new(tree)
17
+ # HTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
18
+ # :sanitize => true})
19
+
20
# Whitelist-based sanitizer for tokens and inline CSS. Classes that include
# this module may define their own ALLOWED_* constants; the methods look
# them up through self.class.
module HTMLSanitizeModule

  ACCEPTABLE_ELEMENTS = %w[a abbr acronym address area b big blockquote br
    button caption center cite code col colgroup dd del dfn dir div dl dt
    em fieldset font form h1 h2 h3 h4 h5 h6 hr i img input ins kbd label
    legend li map menu ol optgroup option p pre q s samp select small span
    strike strong sub sup table tbody td textarea tfoot th thead tr tt u
    ul var]

  MATHML_ELEMENTS = %w[maction math merror mfrac mi mmultiscripts mn mo
    mover mpadded mphantom mprescripts mroot mrow mspace msqrt mstyle msub
    msubsup msup mtable mtd mtext mtr munder munderover none]

  SVG_ELEMENTS = %w[a animate animateColor animateMotion animateTransform
    circle defs desc ellipse font-face font-face-name font-face-src g
    glyph hkern image linearGradient line marker metadata missing-glyph
    mpath path polygon polyline radialGradient rect set stop svg switch
    text title tspan use]

  ACCEPTABLE_ATTRIBUTES = %w[abbr accept accept-charset accesskey action
    align alt axis border cellpadding cellspacing char charoff charset
    checked cite class clear cols colspan color compact coords datetime
    dir disabled enctype for frame headers height href hreflang hspace id
    ismap label lang longdesc maxlength media method multiple name nohref
    noshade nowrap prompt readonly rel rev rows rowspan rules scope
    selected shape size span src start style summary tabindex target title
    type usemap valign value vspace width xml:lang]

  MATHML_ATTRIBUTES = %w[actiontype align columnalign columnalign
    columnalign columnlines columnspacing columnspan depth display
    displaystyle equalcolumns equalrows fence fontstyle fontweight frame
    height linethickness lspace mathbackground mathcolor mathvariant
    mathvariant maxsize minsize other rowalign rowalign rowalign rowlines
    rowspacing rowspan rspace scriptlevel selection separator stretchy
    width width xlink:href xlink:show xlink:type xmlns xmlns:xlink]

  SVG_ATTRIBUTES = %w[accent-height accumulate additive alphabetic
    arabic-form ascent attributeName attributeType baseProfile bbox begin
    by calcMode cap-height class color color-rendering content cx cy d dx
    dy descent display dur end fill fill-rule font-family font-size
    font-stretch font-style font-variant font-weight from fx fy g1 g2
    glyph-name gradientUnits hanging height horiz-adv-x horiz-origin-x id
    ideographic k keyPoints keySplines keyTimes lang marker-end
    marker-mid marker-start markerHeight markerUnits markerWidth
    mathematical max min name offset opacity orient origin
    overline-position overline-thickness panose-1 path pathLength points
    preserveAspectRatio r refX refY repeatCount repeatDur
    requiredExtensions requiredFeatures restart rotate rx ry slope stemh
    stemv stop-color stop-opacity strikethrough-position
    strikethrough-thickness stroke stroke-dasharray stroke-dashoffset
    stroke-linecap stroke-linejoin stroke-miterlimit stroke-opacity
    stroke-width systemLanguage target text-anchor to transform type u1
    u2 underline-position underline-thickness unicode unicode-range
    units-per-em values version viewBox visibility width widths x
    x-height x1 x2 xlink:actuate xlink:arcrole xlink:href xlink:role
    xlink:show xlink:title xlink:type xml:base xml:lang xml:space xmlns
    xmlns:xlink y y1 y2 zoomAndPan]

  ATTR_VAL_IS_URI = %w[href src cite action longdesc xlink:href xml:base]

  ACCEPTABLE_CSS_PROPERTIES = %w[azimuth background-color
    border-bottom-color border-collapse border-color border-left-color
    border-right-color border-top-color clear color cursor direction
    display elevation float font font-family font-size font-style
    font-variant font-weight height letter-spacing line-height overflow
    pause pause-after pause-before pitch pitch-range richness speak
    speak-header speak-numeral speak-punctuation speech-rate stress
    text-align text-decoration text-indent unicode-bidi vertical-align
    voice-family volume white-space width]

  ACCEPTABLE_CSS_KEYWORDS = %w[auto aqua black block blue bold both bottom
    brown center collapse dashed dotted fuchsia gray green !important
    italic left lime maroon medium none navy normal nowrap olive pointer
    purple red right solid silver teal top transparent underline white
    yellow]

  ACCEPTABLE_SVG_PROPERTIES = %w[fill fill-opacity fill-rule stroke
    stroke-width stroke-linecap stroke-linejoin stroke-opacity]

  ACCEPTABLE_PROTOCOLS = %w[ed2k ftp http https irc mailto news gopher nntp
    telnet webcal xmpp callto feed urn aim rsync tag ssh sftp rtsp afs]

  # subclasses may define their own versions of these constants
  ALLOWED_ELEMENTS = ACCEPTABLE_ELEMENTS + MATHML_ELEMENTS + SVG_ELEMENTS
  ALLOWED_ATTRIBUTES = ACCEPTABLE_ATTRIBUTES + MATHML_ATTRIBUTES + SVG_ATTRIBUTES
  ALLOWED_CSS_PROPERTIES = ACCEPTABLE_CSS_PROPERTIES
  ALLOWED_CSS_KEYWORDS = ACCEPTABLE_CSS_KEYWORDS
  ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
  ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS

  # Sanitizes one token (a Hash with :type and, depending on the type,
  # :name and :data), mutating and returning it.
  #
  # * Tags whose name is in ALLOWED_ELEMENTS keep only attributes listed in
  #   ALLOWED_ATTRIBUTES; URI-valued attributes with a scheme outside
  #   ALLOWED_PROTOCOLS are dropped and a style attribute is passed through
  #   sanitize_css.
  # * Any other tag is turned into a :Characters token holding the tag's
  #   markup as text.
  # * Comments are emptied; every other token type is returned untouched.
  def sanitize_token(token)
    case token[:type]
    when :StartTag, :EndTag, :EmptyTag
      if self.class.const_get("ALLOWED_ELEMENTS").include?(token[:name])
        if token.has_key? :data
          allowed_attributes = self.class.const_get("ALLOWED_ATTRIBUTES")
          allowed_protocols = self.class.const_get("ALLOWED_PROTOCOLS")

          attrs = Hash[*token[:data].flatten]
          attrs.delete_if { |attr, v| !allowed_attributes.include?(attr) }

          ATTR_VAL_IS_URI.each do |attr|
            # Strip backticks, control characters, whitespace and the
            # characters U+0080..U+00A0 before looking for a scheme, so
            # that e.g. "java\tscript:" cannot slip through. The range is
            # written with \u escapes because the byte-escape spelling
            # (\302[\200-\240]) is an invalid multibyte escape in a UTF-8
            # regexp literal on Ruby 1.9 and later.
            val_unescaped = CGI.unescapeHTML(attrs[attr].to_s).
              gsub(/`|[\000-\040\177\s]+|[\u0080-\u00a0]/, '').downcase
            if val_unescaped =~ /\A[a-z0-9][-+.a-z0-9]*:/ and
                !allowed_protocols.include?(val_unescaped.split(':')[0])
              attrs.delete attr
            end
          end

          if attrs['style']
            attrs['style'] = sanitize_css(attrs['style'])
          end
          token[:data] = attrs.map {|k,v| [k,v]}
        end
        return token
      else
        # Disallowed element: emit its markup as plain character data.
        if token[:type] == :EndTag
          token[:data] = "</#{token[:name]}>"
        elsif token[:data]
          attrs = token[:data].map {|k,v| " #{k}=\"#{CGI.escapeHTML(v)}\""}.join('')
          token[:data] = "<#{token[:name]}#{attrs}>"
        else
          token[:data] = "<#{token[:name]}>"
        end
        # Turn "<name ...>" into "<name .../>" for empty tags.
        token[:data].insert(-2,'/') if token[:type] == :EmptyTag
        token[:type] = :Characters
        token.delete(:name)
        return token
      end
    when :Comment
      token[:data] = ""
      return token
    else
      return token
    end
  end

  # Returns a sanitized copy of the inline CSS in +style+, or '' when the
  # declaration block as a whole looks suspicious.
  #
  # url(...) values are removed, then the complete string must pass two
  # structural checks. Surviving declarations are kept when the property is
  # in ALLOWED_CSS_PROPERTIES or ALLOWED_SVG_PROPERTIES, or when it is a
  # background/border/margin/padding property whose values are all allowed
  # keywords, colours or simple lengths.
  def sanitize_css(style)
    # disallow urls
    style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

    # gauntlet
    # The checks are anchored with \A and \z so that they cover the whole
    # string. With ^ and $ a single harmless line would be enough to pass,
    # letting later lines such as "width: expression(...)" through.
    return '' unless style =~ /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/
    return '' unless style =~ /\A(\s*[-\w]+\s*:\s*[^:;]*(;|\z))*\z/

    allowed_properties = self.class.const_get("ALLOWED_CSS_PROPERTIES")
    allowed_keywords = self.class.const_get("ALLOWED_CSS_KEYWORDS")
    allowed_svg = self.class.const_get("ALLOWED_SVG_PROPERTIES")

    clean = []
    style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
      next if val.empty?
      prop = prop.downcase
      if allowed_properties.include?(prop)
        clean << "#{prop}: #{val};"
      elsif %w[background border margin padding].include?(prop.split('-')[0])
        # Shorthand properties: every value must be a known keyword, a hex
        # or rgb() colour, or a short number with an optional unit.
        rejected = val.split.any? do |keyword|
          !allowed_keywords.include?(keyword) &&
            keyword !~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
        end
        clean << "#{prop}: #{val};" unless rejected
      elsif allowed_svg.include?(prop)
        clean << "#{prop}: #{val};"
      end
    end

    style = clean.join(' ')
  end
end
178
+
179
# Tokenizer that passes every token through HTMLSanitizeModule#sanitize_token
# before handing it to the caller's block.
class HTMLSanitizer < HTMLTokenizer
  include HTMLSanitizeModule

  # Iterates over the tokens produced by HTMLTokenizer#each, yielding the
  # sanitized form of each one.
  def each
    super { |raw_token| yield sanitize_token(raw_token) }
  end
end
187
+
188
+ end
@@ -0,0 +1,2 @@
1
+ require 'html5/serializer/htmlserializer'
2
+ require 'html5/serializer/xhtmlserializer'
@@ -0,0 +1,179 @@
1
+ require 'html5/constants'
2
+
3
+ module HTML5
4
+
5
# Serializes a stream of tree-walker tokens (Hashes with :type, :name and
# :data entries) to an HTML string.
class HTMLSerializer

  # Convenience wrapper: builds a serializer from +options+ and serializes
  # +stream+, using options[:encoding] as the output encoding.
  def self.serialize(stream, options = {})
    new(options).serialize(stream, options[:encoding])
  end

  # Escapes &, < and > in +string+ for use as character data.
  def escape(string)
    string.gsub("&", "&amp;").gsub("<", "&lt;").gsub(">", "&gt;")
  end

  # Sets the defaults below, then applies every entry of +options+ whose
  # key names one of these settings. Unknown keys (such as :encoding) are
  # ignored. Passing :quote_char switches off use_best_quote_char.
  def initialize(options={})
    @quote_attr_values = false
    @quote_char = '"'
    @use_best_quote_char = true
    @minimize_boolean_attributes = true

    @use_trailing_solidus = false
    @space_before_trailing_solidus = true
    @escape_lt_in_attrs = false
    @escape_rcdata = false

    @omit_optional_tags = true
    @sanitize = false

    @strip_whitespace = false

    @inject_meta_charset = true

    # instance_variables returns Strings on Ruby 1.8 but Symbols on 1.9 and
    # later; comparing string forms keeps the option lookup working on both
    # (a direct include? of a String silently ignores every option on 1.9+).
    known_settings = instance_variables.map { |ivar| ivar.to_s }
    options.each do |name, value|
      next unless known_settings.include?("@#{name}")
      @use_best_quote_char = false if name.to_s == 'quote_char'
      instance_variable_set("@#{name}", value)
    end

    @errors = []
  end

  # Serializes the tokens yielded by +treewalker+ and returns the markup.
  #
  # Depending on the settings the token stream is first wrapped in the
  # meta-charset, whitespace, sanitizer and optional-tag filters, which are
  # required on demand. When +encoding+ is given and is not 'utf-8', the
  # result is converted from UTF-8 to that encoding.
  #
  # Problems found while serializing are reported via serialize_error.
  def serialize(treewalker, encoding=nil)
    in_cdata = false
    @errors = []

    if encoding && @inject_meta_charset
      require 'html5/filters/inject_meta_charset'
      treewalker = Filters::InjectMetaCharset.new(treewalker, encoding)
    end

    if @strip_whitespace
      require 'html5/filters/whitespace'
      treewalker = Filters::WhitespaceFilter.new(treewalker)
    end

    if @sanitize
      require 'html5/filters/sanitizer'
      treewalker = Filters::HTMLSanitizeFilter.new(treewalker)
    end

    if @omit_optional_tags
      require 'html5/filters/optionaltags'
      treewalker = Filters::OptionalTagFilter.new(treewalker)
    end

    result = []
    treewalker.each do |token|
      case token[:type]
      when :Doctype
        result << ("<!DOCTYPE %s>" % token[:name])

      when :Characters, :SpaceCharacters
        if token[:type] == :SpaceCharacters || in_cdata
          # Whitespace and the content of RCDATA elements go out verbatim.
          if in_cdata && token[:data].include?("</")
            serialize_error("Unexpected </ in CDATA")
          end
          result << token[:data]
        else
          result << escape(token[:data])
        end

      when :StartTag, :EmptyTag
        name = token[:name]
        if RCDATA_ELEMENTS.include?(name) && !@escape_rcdata
          in_cdata = true
        elsif in_cdata
          # The message is passed as a plain String: no "_" translation
          # helper is defined for this class, so calling one would raise
          # NoMethodError instead of recording the error.
          serialize_error("Unexpected child element of a CDATA element")
        end

        attributes = []
        token[:data].to_a.sort.each do |k, v|
          attributes << ' '
          attributes << k

          # A boolean attribute is written as the bare name when
          # minimization is enabled.
          next if @minimize_boolean_attributes &&
            ((BOOLEAN_ATTRIBUTES[name] || []).include?(k) ||
             BOOLEAN_ATTRIBUTES[:global].include?(k))

          attributes << "="
          quote_attr = @quote_attr_values || v.empty? ||
            (SPACE_CHARACTERS + %w(< > " ')).any? { |c| v.include?(c) }

          v = v.gsub("&", "&amp;")
          v = v.gsub("<", "&lt;") if @escape_lt_in_attrs

          if quote_attr
            quote_char = @quote_char
            if @use_best_quote_char
              # Prefer the quote character that does not occur in the value.
              if v.index("'") && !v.index('"')
                quote_char = '"'
              elsif v.index('"') && !v.index("'")
                quote_char = "'"
              end
            end
            if quote_char == "'"
              v = v.gsub("'", "&#39;")
            else
              v = v.gsub('"', "&quot;")
            end
            attributes << quote_char << v << quote_char
          else
            attributes << v
          end
        end

        if VOID_ELEMENTS.include?(name) && @use_trailing_solidus
          attributes << (@space_before_trailing_solidus ? " /" : "/")
        end
        result << ("<%s%s>" % [name, attributes.join('')])

      when :EndTag
        name = token[:name]
        if RCDATA_ELEMENTS.include?(name)
          in_cdata = false
        elsif in_cdata
          serialize_error("Unexpected child element of a CDATA element")
        end
        result << "</#{name}>"

      when :Comment
        data = token[:data]
        serialize_error("Comment contains --") if data.index("--")
        result << ("<!--%s-->" % token[:data])

      else
        serialize_error(token[:data])
      end
    end

    output = result.join('')
    return output unless encoding && encoding != 'utf-8'

    begin
      require 'iconv'
      Iconv.iconv(encoding, 'utf-8', output).first
    rescue LoadError
      # iconv is not available (it left the standard library in Ruby 2.0);
      # fall back to String#encode.
      output.encode(encoding, 'utf-8')
    end
  end

  alias :render :serialize

  # Records +data+ in @errors and raises SerializeError when @strict is set.
  def serialize_error(data="XXX ERROR MESSAGE NEEDED")
    # XXX The idea is to make data mandatory.
    @errors.push(data)
    if @strict
      raise SerializeError
    end
  end

end
175
+
176
+ # Error in serialized tree
177
# Raised by HTMLSerializer#serialize_error when strict mode is on.
# It derives from StandardError (rather than Exception) so that an ordinary
# `rescue` clause catches it.
class SerializeError < StandardError
end
179
+ end