sanitize 2.1.1 → 6.0.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of sanitize might be problematic. Click here for more details.

@@ -1,29 +1,9 @@
1
- #--
2
- # Copyright (c) 2013 Ryan Grove <ryan@wonko.com>
3
- #
4
- # Permission is hereby granted, free of charge, to any person obtaining a copy
5
- # of this software and associated documentation files (the 'Software'), to deal
6
- # in the Software without restriction, including without limitation the rights
7
- # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
- # copies of the Software, and to permit persons to whom the Software is
9
- # furnished to do so, subject to the following conditions:
10
- #
11
- # The above copyright notice and this permission notice shall be included in all
12
- # copies or substantial portions of the Software.
13
- #
14
- # THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
- # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
- # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20
- # SOFTWARE.
21
- #++
1
+ # encoding: utf-8
22
2
 
23
3
  class Sanitize
24
4
  module Config
25
- RESTRICTED = {
5
+ RESTRICTED = freeze_config(
26
6
  :elements => %w[b em i strong u]
27
- }
7
+ )
28
8
  end
29
9
  end
@@ -1,86 +1,60 @@
1
- #--
2
- # Copyright (c) 2013 Ryan Grove <ryan@wonko.com>
3
- #
4
- # Permission is hereby granted, free of charge, to any person obtaining a copy
5
- # of this software and associated documentation files (the 'Software'), to deal
6
- # in the Software without restriction, including without limitation the rights
7
- # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
- # copies of the Software, and to permit persons to whom the Software is
9
- # furnished to do so, subject to the following conditions:
10
- #
11
- # The above copyright notice and this permission notice shall be included in all
12
- # copies or substantial portions of the Software.
13
- #
14
- # THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
- # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
- # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20
- # SOFTWARE.
21
- #++
1
+ # encoding: utf-8
2
+
3
+ require 'set'
22
4
 
23
5
  class Sanitize
24
6
  module Config
25
- DEFAULT = {
26
-
27
- # Whether or not to allow HTML comments. Allowing comments is strongly
28
- # discouraged, since IE allows script execution within conditional
29
- # comments.
30
- :allow_comments => false,
31
-
32
- # HTML attributes to add to specific elements. By default, no attributes
33
- # are added.
34
- :add_attributes => {},
35
-
36
- # HTML attributes to allow in specific elements. By default, no attributes
37
- # are allowed. Use the symbol :data to indicate that arbitrary HTML5
38
- # data-* attributes should be allowed.
39
- :attributes => {},
40
-
41
- # HTML elements to allow. By default, no elements are allowed (which means
42
- # that all HTML will be stripped).
43
- :elements => [],
44
-
45
- # Output format. Supported formats are :html and :xhtml. Default is :html.
46
- :output => :html,
47
-
48
- # Character encoding to use for HTML output. Default is 'utf-8'.
49
- :output_encoding => 'utf-8',
50
-
51
- # URL handling protocols to allow in specific attributes. By default, no
52
- # protocols are allowed. Use :relative in place of a protocol if you want
53
- # to allow relative URLs sans protocol.
54
- :protocols => {},
55
-
56
- # If this is true, Sanitize will remove the contents of any filtered
57
- # elements in addition to the elements themselves. By default, Sanitize
58
- # leaves the safe parts of an element's contents behind when the element
59
- # is removed.
60
- #
61
- # If this is an Array of element names, then only the contents of the
62
- # specified elements (when filtered) will be removed, and the contents of
63
- # all other filtered elements will be left behind.
64
- :remove_contents => false,
65
-
66
- # Transformers allow you to filter or alter nodes using custom logic. See
67
- # README.rdoc for details and examples.
68
- :transformers => [],
69
-
70
- # By default, transformers perform depth-first traversal (deepest node
71
- # upward). This setting allows you to specify transformers that should
72
- # perform breadth-first traversal (top node downward).
73
- :transformers_breadth => [],
74
7
 
75
- # Elements which, when removed, should have their contents surrounded by
76
- # space characters to preserve readability. For example,
77
- # `foo<div>bar</div>baz` will become 'foo bar baz' when the <div> is
78
- # removed.
79
- :whitespace_elements => %w[
80
- address article aside blockquote br dd div dl dt footer h1 h2 h3 h4 h5
81
- h6 header hgroup hr li nav ol p pre section ul
82
- ]
8
+ # Deeply freezes and returns the given configuration Hash.
9
+ def self.freeze_config(config)
10
+ if Hash === config
11
+ config.each_value {|c| freeze_config(c) }
12
+ elsif Array === config || Set === config
13
+ config.each {|c| freeze_config(c) }
14
+ end
15
+
16
+ config.freeze
17
+ end
18
+
19
+ # Returns a new Hash containing the result of deeply merging *other_config*
20
+ # into *config*. Does not modify *config* or *other_config*.
21
+ #
22
+ # This is the safest way to use a built-in Sanitize config as the basis for
23
+ # your own custom config.
24
+ def self.merge(config, other_config = {})
25
+ raise ArgumentError, 'config must be a Hash' unless Hash === config
26
+ raise ArgumentError, 'other_config must be a Hash' unless Hash === other_config
27
+
28
+ merged = {}
29
+ keys = Set.new(config.keys + other_config.keys)
30
+
31
+ keys.each do |key|
32
+ oldval = config[key]
33
+
34
+ if other_config.has_key?(key)
35
+ newval = other_config[key]
36
+
37
+ if Hash === oldval && Hash === newval
38
+ merged[key] = oldval.empty? ? newval.dup : merge(oldval, newval)
39
+ elsif Array === newval && key != :transformers
40
+ merged[key] = Set.new(newval)
41
+ else
42
+ merged[key] = can_dupe?(newval) ? newval.dup : newval
43
+ end
44
+ else
45
+ merged[key] = can_dupe?(oldval) ? oldval.dup : oldval
46
+ end
47
+ end
48
+
49
+ merged
50
+ end
51
+
52
+ # Returns `true` if `dup` may be safely called on _value_, `false`
53
+ # otherwise.
54
+ def self.can_dupe?(value)
55
+ !(true == value || false == value || value.nil? || Method === value || Numeric === value || Symbol === value)
56
+ end
57
+ private_class_method :can_dupe?
83
58
 
84
- }
85
59
  end
86
60
  end
@@ -0,0 +1,348 @@
1
+ # encoding: utf-8
2
+
3
+ require 'crass'
4
+ require 'set'
5
+
6
+ class Sanitize; class CSS
7
+ attr_reader :config
8
+
9
+ # -- Class Methods -----------------------------------------------------------
10
+
11
+ # Sanitizes inline CSS style properties.
12
+ #
13
+ # This is most useful for sanitizing non-stylesheet fragments of CSS like you
14
+ # would find in the `style` attribute of an HTML element. To sanitize a full
15
+ # CSS stylesheet, use {.stylesheet}.
16
+ #
17
+ # @example
18
+ # Sanitize::CSS.properties("background: url(foo.png); color: #fff;")
19
+ #
20
+ # @return [String] Sanitized CSS properties.
21
+ def self.properties(css, config = {})
22
+ self.new(config).properties(css)
23
+ end
24
+
25
+ # Sanitizes a full CSS stylesheet.
26
+ #
27
+ # A stylesheet may include selectors, at-rules, and comments. To sanitize only
28
+ # inline style properties such as the contents of an HTML `style` attribute,
29
+ # use {.properties}.
30
+ #
31
+ # @example
32
+ # css = %[
33
+ # .foo {
34
+ # background: url(foo.png);
35
+ # color: #fff;
36
+ # }
37
+ #
38
+ # #bar {
39
+ # font: 42pt 'Comic Sans MS';
40
+ # }
41
+ # ]
42
+ #
43
+ # Sanitize::CSS.stylesheet(css, Sanitize::Config::RELAXED)
44
+ #
45
+ # @return [String] Sanitized CSS stylesheet.
46
+ def self.stylesheet(css, config = {})
47
+ self.new(config).stylesheet(css)
48
+ end
49
+
50
+ # Sanitizes the given Crass CSS parse tree and all its children, modifying it
51
+ # in place.
52
+ #
53
+ # @example
54
+ # css = %[
55
+ # .foo {
56
+ # background: url(foo.png);
57
+ # color: #fff;
58
+ # }
59
+ #
60
+ # #bar {
61
+ # font: 42pt 'Comic Sans MS';
62
+ # }
63
+ # ]
64
+ #
65
+ # tree = Crass.parse(css)
66
+ # Sanitize::CSS.tree!(tree, Sanitize::Config::RELAXED)
67
+ #
68
+ # @return [Array] Sanitized Crass CSS parse tree.
69
+ def self.tree!(tree, config = {})
70
+ self.new(config).tree!(tree)
71
+ end
72
+
73
+ # -- Instance Methods --------------------------------------------------------
74
+
75
+ # Returns a new Sanitize::CSS object initialized with the settings in
76
+ # _config_.
77
+ def initialize(config = {})
78
+ @config = Config.merge(Config::DEFAULT[:css], config[:css] || config)
79
+
80
+ @at_rules = Set.new(@config[:at_rules])
81
+ @at_rules_with_properties = Set.new(@config[:at_rules_with_properties])
82
+ @at_rules_with_styles = Set.new(@config[:at_rules_with_styles])
83
+ @import_url_validator = @config[:import_url_validator]
84
+ end
85
+
86
+ # Sanitizes inline CSS style properties.
87
+ #
88
+ # This is most useful for sanitizing non-stylesheet fragments of CSS like you
89
+ # would find in the `style` attribute of an HTML element. To sanitize a full
90
+ # CSS stylesheet, use {#stylesheet}.
91
+ #
92
+ # @example
93
+ # scss = Sanitize::CSS.new(Sanitize::Config::RELAXED)
94
+ # scss.properties("background: url(foo.png); color: #fff;")
95
+ #
96
+ # @return [String] Sanitized CSS properties.
97
+ def properties(css)
98
+ tree = Crass.parse_properties(css,
99
+ :preserve_comments => @config[:allow_comments],
100
+ :preserve_hacks => @config[:allow_hacks])
101
+
102
+ tree!(tree)
103
+ Crass::Parser.stringify(tree)
104
+ end
105
+
106
+ # Sanitizes a full CSS stylesheet.
107
+ #
108
+ # A stylesheet may include selectors, at-rules, and comments. To sanitize only
109
+ # inline style properties such as the contents of an HTML `style` attribute,
110
+ # use {#properties}.
111
+ #
112
+ # @example
113
+ # css = %[
114
+ # .foo {
115
+ # background: url(foo.png);
116
+ # color: #fff;
117
+ # }
118
+ #
119
+ # #bar {
120
+ # font: 42pt 'Comic Sans MS';
121
+ # }
122
+ # ]
123
+ #
124
+ # scss = Sanitize::CSS.new(Sanitize::Config::RELAXED)
125
+ # scss.stylesheet(css)
126
+ #
127
+ # @return [String] Sanitized CSS stylesheet.
128
+ def stylesheet(css)
129
+ tree = Crass.parse(css,
130
+ :preserve_comments => @config[:allow_comments],
131
+ :preserve_hacks => @config[:allow_hacks])
132
+
133
+ tree!(tree)
134
+ Crass::Parser.stringify(tree)
135
+ end
136
+
137
+ # Sanitizes the given Crass CSS parse tree and all its children, modifying it
138
+ # in place.
139
+ #
140
+ # @example
141
+ # css = %[
142
+ # .foo {
143
+ # background: url(foo.png);
144
+ # color: #fff;
145
+ # }
146
+ #
147
+ # #bar {
148
+ # font: 42pt 'Comic Sans MS';
149
+ # }
150
+ # ]
151
+ #
152
+ # scss = Sanitize::CSS.new(Sanitize::Config::RELAXED)
153
+ # tree = Crass.parse(css)
154
+ #
155
+ # scss.tree!(tree)
156
+ #
157
+ # @return [Array] Sanitized Crass CSS parse tree.
158
+ def tree!(tree)
159
+ preceded_by_property = false
160
+
161
+ tree.map! do |node|
162
+ next nil if node.nil?
163
+
164
+ case node[:node]
165
+ when :at_rule
166
+ preceded_by_property = false
167
+ next at_rule!(node)
168
+
169
+ when :comment
170
+ next node if @config[:allow_comments]
171
+
172
+ when :property
173
+ prop = property!(node)
174
+ preceded_by_property = !prop.nil?
175
+ next prop
176
+
177
+ when :semicolon
178
+ # Only preserve the semicolon if it was preceded by an allowlisted
179
+ # property. Otherwise, omit it in order to prevent redundant semicolons.
180
+ if preceded_by_property
181
+ preceded_by_property = false
182
+ next node
183
+ end
184
+
185
+ when :style_rule
186
+ preceded_by_property = false
187
+ tree!(node[:children])
188
+ next node
189
+
190
+ when :whitespace
191
+ next node
192
+ end
193
+
194
+ nil
195
+ end
196
+
197
+ tree
198
+ end
199
+
200
+ # -- Protected Instance Methods ----------------------------------------------
201
+ protected
202
+
203
+ # Sanitizes a CSS at-rule node. Returns the sanitized node, or `nil` if the
204
+ # current config doesn't allow this at-rule.
205
+ def at_rule!(rule)
206
+ name = rule[:name].downcase
207
+
208
+ if @at_rules_with_styles.include?(name)
209
+ styles = Crass::Parser.parse_rules(rule[:block],
210
+ :preserve_comments => @config[:allow_comments],
211
+ :preserve_hacks => @config[:allow_hacks])
212
+
213
+ rule[:block] = tree!(styles)
214
+
215
+ elsif @at_rules_with_properties.include?(name)
216
+ props = Crass::Parser.parse_properties(rule[:block],
217
+ :preserve_comments => @config[:allow_comments],
218
+ :preserve_hacks => @config[:allow_hacks])
219
+
220
+ rule[:block] = tree!(props)
221
+
222
+ elsif @at_rules.include?(name)
223
+ return nil if name == "import" && !import_url_allowed?(rule)
224
+ return nil if rule.has_key?(:block)
225
+ else
226
+ return nil
227
+ end
228
+
229
+ rule
230
+ end
231
+
232
+ # Passes the URL value of an @import rule to the configured
233
+ # `:import_url_validator` callable to ensure it's an allowed URL.
234
+ def import_url_allowed?(rule)
235
+ return true unless @import_url_validator
236
+
237
+ url_token = rule[:tokens].detect { |t| t[:node] == :url || t[:node] == :string }
238
+
239
+ # don't allow @imports with no URL value
240
+ return false unless url_token && (import_url = url_token[:value])
241
+
242
+ @import_url_validator.call(import_url)
243
+ end
244
+
245
+ # Sanitizes a CSS property node. Returns the sanitized node, or `nil` if the
246
+ # current config doesn't allow this property.
247
+ def property!(prop)
248
+ name = prop[:name].downcase
249
+
250
+ # Preserve IE * and _ hacks if desired.
251
+ if @config[:allow_hacks]
252
+ name.slice!(0) if name =~ /\A[*_]/
253
+ end
254
+
255
+ return nil unless @config[:properties].include?(name)
256
+
257
+ nodes = prop[:children].dup
258
+ combined_value = String.new
259
+
260
+ nodes.each do |child|
261
+ value = child[:value]
262
+
263
+ case child[:node]
264
+ when :ident
265
+ combined_value << value.downcase if String === value
266
+
267
+ when :function
268
+ if child.key?(:name)
269
+ name = child[:name].downcase
270
+
271
+ if name == 'url'
272
+ return nil unless valid_url?(child)
273
+ end
274
+
275
+ combined_value << name
276
+ return nil if name == 'expression' || combined_value == 'expression'
277
+ end
278
+
279
+ if Array === value
280
+ nodes.concat(value)
281
+ elsif String === value
282
+ lowercase_value = value.downcase
283
+ combined_value << lowercase_value
284
+ return nil if lowercase_value == 'expression' || combined_value == 'expression'
285
+ end
286
+
287
+ when :url
288
+ return nil unless valid_url?(child)
289
+
290
+ when :bad_url
291
+ return nil
292
+ end
293
+ end
294
+
295
+ prop
296
+ end
297
+
298
+ # Returns `true` if the given node (which may be of type `:url` or
299
+ # `:function`, since the CSS syntax can produce both) uses an allowlisted
300
+ # protocol.
301
+ def valid_url?(node)
302
+ type = node[:node]
303
+
304
+ if type == :function
305
+ return false unless node.key?(:name) && node[:name].downcase == 'url'
306
+ return false unless Array === node[:value]
307
+
308
+ # A URL function's `:value` should be an array containing no more than one
309
+ # `:string` node and any number of `:whitespace` nodes.
310
+ #
311
+ # If it contains more than one `:string` node, or if it contains any other
312
+ # nodes except `:whitespace` nodes, it's not valid.
313
+ url_string_node = nil
314
+
315
+ node[:value].each do |token|
316
+ return false unless Hash === token
317
+
318
+ case token[:node]
319
+ when :string
320
+ return false unless url_string_node.nil?
321
+ url_string_node = token
322
+
323
+ when :whitespace
324
+ next
325
+
326
+ else
327
+ return false
328
+ end
329
+ end
330
+
331
+ return false if url_string_node.nil?
332
+ url = url_string_node[:value]
333
+ elsif type == :url
334
+ url = node[:value]
335
+ else
336
+ return false
337
+ end
338
+
339
+ if url =~ Sanitize::REGEX_PROTOCOL
340
+ return @config[:protocols].include?($1.downcase)
341
+ else
342
+ return @config[:protocols].include?(:relative)
343
+ end
344
+
345
+ false
346
+ end
347
+
348
+ end; end
@@ -1,11 +1,11 @@
1
+ # encoding: utf-8
2
+
1
3
  class Sanitize; module Transformers
2
4
 
3
5
  CleanCDATA = lambda do |env|
4
- return if env[:is_whitelisted]
5
-
6
6
  node = env[:node]
7
7
 
8
- if node.cdata?
8
+ if node.type == Nokogiri::XML::Node::CDATA_SECTION_NODE
9
9
  node.replace(Nokogiri::XML::Text.new(node.text, node.document))
10
10
  end
11
11
  end
@@ -1,10 +1,13 @@
1
+ # encoding: utf-8
2
+
1
3
  class Sanitize; module Transformers
2
4
 
3
5
  CleanComment = lambda do |env|
4
- return if env[:is_whitelisted]
5
-
6
6
  node = env[:node]
7
- node.unlink if node.comment? && !env[:config][:allow_comments]
7
+
8
+ if node.type == Nokogiri::XML::Node::COMMENT_NODE
9
+ node.unlink unless env[:is_allowlisted]
10
+ end
8
11
  end
9
12
 
10
13
  end; end
@@ -0,0 +1,57 @@
1
+ class Sanitize; module Transformers; module CSS
2
+
3
+ # Enforces a CSS allowlist on the contents of `style` attributes.
4
+ class CleanAttribute
5
+ def initialize(sanitizer_or_config)
6
+ if Sanitize::CSS === sanitizer_or_config
7
+ @scss = sanitizer_or_config
8
+ else
9
+ @scss = Sanitize::CSS.new(sanitizer_or_config)
10
+ end
11
+ end
12
+
13
+ def call(env)
14
+ node = env[:node]
15
+
16
+ return unless node.type == Nokogiri::XML::Node::ELEMENT_NODE &&
17
+ node.key?('style') && !env[:is_allowlisted]
18
+
19
+ attr = node.attribute('style')
20
+ css = @scss.properties(attr.value)
21
+
22
+ if css.strip.empty?
23
+ attr.unlink
24
+ else
25
+ attr.value = css
26
+ end
27
+ end
28
+ end
29
+
30
+ # Enforces a CSS allowlist on the contents of `<style>` elements.
31
+ class CleanElement
32
+ def initialize(sanitizer_or_config)
33
+ if Sanitize::CSS === sanitizer_or_config
34
+ @scss = sanitizer_or_config
35
+ else
36
+ @scss = Sanitize::CSS.new(sanitizer_or_config)
37
+ end
38
+ end
39
+
40
+ def call(env)
41
+ node = env[:node]
42
+
43
+ return unless node.type == Nokogiri::XML::Node::ELEMENT_NODE &&
44
+ env[:node_name] == 'style'
45
+
46
+ css = @scss.stylesheet(node.content)
47
+
48
+ if css.strip.empty?
49
+ node.unlink
50
+ else
51
+ node.children.unlink
52
+ node << Nokogiri::XML::Text.new(css, node.document)
53
+ end
54
+ end
55
+ end
56
+
57
+ end; end; end
@@ -0,0 +1,19 @@
1
+ # encoding: utf-8
2
+
3
+ class Sanitize; module Transformers
4
+
5
+ CleanDoctype = lambda do |env|
6
+ return if env[:is_allowlisted]
7
+
8
+ node = env[:node]
9
+
10
+ if node.type == Nokogiri::XML::Node::DTD_NODE
11
+ if env[:config][:allow_doctype]
12
+ node.name = 'html'
13
+ else
14
+ node.unlink
15
+ end
16
+ end
17
+ end
18
+
19
+ end; end