sanitize 2.1.1 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of sanitize might be problematic. Click here for more details.

@@ -1,29 +1,9 @@
1
- #--
2
- # Copyright (c) 2013 Ryan Grove <ryan@wonko.com>
3
- #
4
- # Permission is hereby granted, free of charge, to any person obtaining a copy
5
- # of this software and associated documentation files (the 'Software'), to deal
6
- # in the Software without restriction, including without limitation the rights
7
- # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
- # copies of the Software, and to permit persons to whom the Software is
9
- # furnished to do so, subject to the following conditions:
10
- #
11
- # The above copyright notice and this permission notice shall be included in all
12
- # copies or substantial portions of the Software.
13
- #
14
- # THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
- # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
- # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20
- # SOFTWARE.
21
- #++
1
+ # encoding: utf-8
22
2
 
23
3
  class Sanitize
24
4
  module Config
25
- RESTRICTED = {
5
+ RESTRICTED = freeze_config(
26
6
  :elements => %w[b em i strong u]
27
- }
7
+ )
28
8
  end
29
9
  end
@@ -0,0 +1,218 @@
1
+ # encoding: utf-8
2
+
3
+ require 'crass'
4
+ require 'set'
5
+
6
+ class Sanitize; class CSS
7
+ attr_reader :config
8
+
9
+ # Names of CSS at-rules whose blocks may contain properties.
10
+ AT_RULES_WITH_PROPERTIES = Set.new(%w[font-face page])
11
+
12
+ # Names of CSS at-rules whose blocks may contain style rules.
13
+ AT_RULES_WITH_STYLES = Set.new(%w[document media supports])
14
+
15
+ # -- Class Methods -----------------------------------------------------------
16
+
17
+ # Sanitizes inline CSS style properties.
18
+ #
19
+ # This is most useful for sanitizing non-stylesheet fragments of CSS like you
20
+ # would find in the `style` attribute of an HTML element. To sanitize a full
21
+ # CSS stylesheet, use {.stylesheet}.
22
+ #
23
+ # @example
24
+ # Sanitize::CSS.properties("background: url(foo.png); color: #fff;")
25
+ #
26
+ # @return [String] Sanitized CSS properties.
27
+ def self.properties(css, config = {})
28
+ self.new(config).properties(css)
29
+ end
30
+
31
+ def self.stylesheet(css, config = {})
32
+ self.new(config).stylesheet(css)
33
+ end
34
+
35
+ def self.tree!(tree, config = {})
36
+ self.new(config).tree!(tree)
37
+ end
38
+
39
+ # -- Instance Methods --------------------------------------------------------
40
+
41
+ # Returns a new Sanitize::CSS object initialized with the settings in
42
+ # _config_.
43
+ def initialize(config = {})
44
+ @config = Config.merge(Config::DEFAULT[:css], config[:css] || config)
45
+ end
46
+
47
+ # Sanitizes inline CSS style properties.
48
+ #
49
+ # This is most useful for sanitizing non-stylesheet fragments of CSS like you
50
+ # would find in the `style` attribute of an HTML element. To sanitize a full
51
+ # CSS stylesheet, use {#stylesheet}.
52
+ #
53
+ # @example
54
+ # scss = Sanitize::CSS.new(Sanitize::Config::RELAXED)
55
+ # scss.properties("background: url(foo.png); color: #fff;")
56
+ #
57
+ # @return [String] Sanitized CSS properties.
58
+ def properties(css)
59
+ tree = Crass.parse_properties(css,
60
+ :preserve_comments => @config[:allow_comments],
61
+ :preserve_hacks => @config[:allow_hacks])
62
+
63
+ tree!(tree)
64
+ Crass::Parser.stringify(tree)
65
+ end
66
+
67
+ # Sanitizes a full CSS stylesheet.
68
+ #
69
+ # A stylesheet may include selectors, at-rules, and comments. To sanitize only
70
+ # inline style properties such as the contents of an HTML `style` attribute,
71
+ # use {#properties}.
72
+ #
73
+ # @example
74
+ # css = %[
75
+ # .foo {
76
+ # background: url(foo.png);
77
+ # color: #fff;
78
+ # }
79
+ #
80
+ # #bar {
81
+ # font: 42pt 'Comic Sans MS';
82
+ # }
83
+ # ]
84
+ #
85
+ # scss = Sanitize::CSS.new(Sanitize::Config::RELAXED)
86
+ # scss.stylesheet(css)
87
+ #
88
+ # @return [String] Sanitized CSS stylesheet.
89
+ def stylesheet(css)
90
+ tree = Crass.parse(css,
91
+ :preserve_comments => @config[:allow_comments],
92
+ :preserve_hacks => @config[:allow_hacks])
93
+
94
+ tree!(tree)
95
+ Crass::Parser.stringify(tree)
96
+ end
97
+
98
+ # Sanitizes the given Crass CSS parse tree and all its children, modifying it
99
+ # in place.
100
+ #
101
+ # @example
102
+ # scss = Sanitize::CSS.new(Sanitize::Config::RELAXED)
103
+ # tree = Crass.parse(css)
104
+ #
105
+ # scss.tree!(tree)
106
+ #
107
+ # @return [Array] Sanitized Crass CSS parse tree.
108
+ def tree!(tree)
109
+ tree.map! do |node|
110
+ next nil if node.nil?
111
+
112
+ case node[:node]
113
+ when :at_rule
114
+ next at_rule!(node)
115
+
116
+ when :comment
117
+ next node if @config[:allow_comments]
118
+
119
+ when :property
120
+ next property!(node)
121
+
122
+ when :style_rule
123
+ tree!(node[:children])
124
+ next node
125
+
126
+ when :whitespace
127
+ next node
128
+ end
129
+
130
+ nil
131
+ end
132
+
133
+ tree
134
+ end
135
+
136
+ # -- Protected Instance Methods ----------------------------------------------
137
+ protected
138
+
139
+ # Sanitizes a CSS at-rule node. Returns the sanitized node, or `nil` if the
140
+ # current config doesn't allow this at-rule.
141
+ def at_rule!(rule)
142
+ name = rule[:name].downcase
143
+ return nil unless @config[:at_rules].include?(name)
144
+
145
+ if AT_RULES_WITH_STYLES.include?(name)
146
+ styles = Crass::Parser.parse_rules(rule[:block][:value],
147
+ :preserve_comments => @config[:allow_comments],
148
+ :preserve_hacks => @config[:allow_hacks])
149
+
150
+ rule[:block][:value] = tree!(styles)
151
+
152
+ elsif AT_RULES_WITH_PROPERTIES.include?(name)
153
+ props = Crass::Parser.parse_properties(rule[:block][:value],
154
+ :preserve_comments => @config[:allow_comments],
155
+ :preserve_hacks => @config[:allow_hacks])
156
+
157
+ rule[:block][:value] = tree!(props)
158
+
159
+ else
160
+ rule.delete(:block)
161
+ end
162
+
163
+ rule
164
+ end
165
+
166
+ # Sanitizes a CSS property node. Returns the sanitized node, or `nil` if the
167
+ # current config doesn't allow this property.
168
+ def property!(prop)
169
+ name = prop[:name].downcase
170
+
171
+ # Preserve IE * and _ hacks if desired.
172
+ if @config[:allow_hacks]
173
+ name.slice!(0) if name =~ /\A[*_]/
174
+ end
175
+
176
+ return nil unless @config[:properties].include?(name)
177
+
178
+ nodes = prop[:children].dup
179
+ combined_value = ''
180
+
181
+ nodes.each do |child|
182
+ value = child[:value]
183
+
184
+ case child[:node]
185
+ when :ident
186
+ combined_value << value if String === value
187
+
188
+ when :function
189
+ if child.key?(:name)
190
+ return nil if child[:name].downcase == 'expression'
191
+ end
192
+
193
+ if Array === value
194
+ nodes.concat(value)
195
+ elsif String === value
196
+ combined_value << value
197
+
198
+ if value.downcase == 'expression' || combined_value.downcase == 'expression'
199
+ return nil
200
+ end
201
+ end
202
+
203
+ when :url
204
+ if value =~ Sanitize::REGEX_PROTOCOL
205
+ return nil unless @config[:protocols].include?($1.downcase)
206
+ else
207
+ return nil unless @config[:protocols].include?(:relative)
208
+ end
209
+
210
+ when :bad_url
211
+ return nil
212
+ end
213
+ end
214
+
215
+ prop
216
+ end
217
+
218
+ end; end
@@ -1,11 +1,11 @@
1
+ # encoding: utf-8
2
+
1
3
  class Sanitize; module Transformers
2
4
 
3
5
  CleanCDATA = lambda do |env|
4
- return if env[:is_whitelisted]
5
-
6
6
  node = env[:node]
7
7
 
8
- if node.cdata?
8
+ if node.type == Nokogiri::XML::Node::CDATA_SECTION_NODE
9
9
  node.replace(Nokogiri::XML::Text.new(node.text, node.document))
10
10
  end
11
11
  end
@@ -1,10 +1,13 @@
1
+ # encoding: utf-8
2
+
1
3
  class Sanitize; module Transformers
2
4
 
3
5
  CleanComment = lambda do |env|
4
- return if env[:is_whitelisted]
5
-
6
6
  node = env[:node]
7
- node.unlink if node.comment? && !env[:config][:allow_comments]
7
+
8
+ if node.type == Nokogiri::XML::Node::COMMENT_NODE
9
+ node.unlink unless env[:is_whitelisted]
10
+ end
8
11
  end
9
12
 
10
13
  end; end
@@ -0,0 +1,57 @@
1
+ class Sanitize; module Transformers; module CSS
2
+
3
+ # Enforces a CSS whitelist on the contents of `style` attributes.
4
+ class CleanAttribute
5
+ def initialize(sanitizer_or_config)
6
+ if Sanitize::CSS === sanitizer_or_config
7
+ @scss = sanitizer_or_config
8
+ else
9
+ @scss = Sanitize::CSS.new(sanitizer_or_config)
10
+ end
11
+ end
12
+
13
+ def call(env)
14
+ node = env[:node]
15
+
16
+ return unless node.type == Nokogiri::XML::Node::ELEMENT_NODE &&
17
+ node.key?('style') && !env[:is_whitelisted]
18
+
19
+ attr = node.attribute('style')
20
+ css = @scss.properties(attr.value)
21
+
22
+ if css.strip.empty?
23
+ attr.unlink
24
+ else
25
+ attr.value = css
26
+ end
27
+ end
28
+ end
29
+
30
+ # Enforces a CSS whitelist on the contents of `<style>` elements.
31
+ class CleanElement
32
+ def initialize(sanitizer_or_config)
33
+ if Sanitize::CSS === sanitizer_or_config
34
+ @scss = sanitizer_or_config
35
+ else
36
+ @scss = Sanitize::CSS.new(sanitizer_or_config)
37
+ end
38
+ end
39
+
40
+ def call(env)
41
+ node = env[:node]
42
+
43
+ return unless node.type == Nokogiri::XML::Node::ELEMENT_NODE &&
44
+ env[:node_name] == 'style'
45
+
46
+ css = @scss.stylesheet(node.content)
47
+
48
+ if css.strip.empty?
49
+ node.unlink
50
+ else
51
+ node.children.unlink
52
+ node << Nokogiri::XML::Text.new(css, node.document)
53
+ end
54
+ end
55
+ end
56
+
57
+ end; end; end
@@ -0,0 +1,13 @@
1
+ # encoding: utf-8
2
+
3
+ class Sanitize; module Transformers
4
+
5
+ CleanDoctype = lambda do |env|
6
+ node = env[:node]
7
+
8
+ if node.type == Nokogiri::XML::Node::DTD_NODE
9
+ node.unlink unless env[:is_whitelisted]
10
+ end
11
+ end
12
+
13
+ end; end
@@ -1,155 +1,125 @@
1
- class Sanitize; module Transformers
2
-
3
- class CleanElement
4
-
5
- # Attributes that need additional escaping on `<a>` elements due to unsafe
6
- # libxml2 behavior.
7
- UNSAFE_LIBXML_ATTRS_A = Set.new(%w[
8
- name
9
- ])
10
-
11
- # Attributes that need additional escaping on all elements due to unsafe
12
- # libxml2 behavior.
13
- UNSAFE_LIBXML_ATTRS_GLOBAL = Set.new(%w[
14
- action
15
- href
16
- src
17
- ])
18
-
19
- # Mapping of original characters to escape sequences for characters that
20
- # should be escaped in attributes affected by unsafe libxml2 behavior.
21
- UNSAFE_LIBXML_ESCAPE_CHARS = {
22
- ' ' => '%20',
23
- '"' => '%22'
24
- }
25
-
26
- # Regex that matches any single character that needs to be escaped in
27
- # attributes affected by unsafe libxml2 behavior.
28
- UNSAFE_LIBXML_ESCAPE_REGEX = /[ "]/
29
-
30
- def initialize(config)
31
- @config = config
32
-
33
- # For faster lookups.
34
- @add_attributes = config[:add_attributes]
35
- @allowed_elements = Set.new(config[:elements])
36
- @attributes = config[:attributes]
37
- @protocols = config[:protocols]
38
- @remove_all_contents = false
39
- @remove_element_contents = Set.new
40
- @whitespace_elements = Set.new(config[:whitespace_elements])
41
-
42
- if config[:remove_contents].is_a?(Array)
43
- @remove_element_contents.merge(config[:remove_contents].map(&:to_s))
44
- else
45
- @remove_all_contents = !!config[:remove_contents]
1
+ # encoding: utf-8
2
+
3
+ require 'set'
4
+
5
+ class Sanitize; module Transformers; class CleanElement
6
+
7
+ # Matches a valid HTML5 data attribute name. The Unicode ranges included here
8
+ # are a conservative subset of the full range of characters that are
9
+ # technically allowed, with the intent of matching the most common characters
10
+ # used in data attribute names while excluding uncommon or potentially
11
+ # misleading characters, or characters with the potential to be normalized
12
+ # into unsafe or confusing forms.
13
+ #
14
+ # If you need data attribute names with characters that aren't included here (such
15
+ # as combining marks, full-width characters, or CJK), please consider creating
16
+ # a custom transformer to validate attributes according to your needs.
17
+ #
18
+ # http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#embedding-custom-non-visible-data-with-the-data-*-attributes
19
+ REGEX_DATA_ATTR = /\Adata-(?!xml)[a-z_][\w.\u00E0-\u00F6\u00F8-\u017F\u01DD-\u02AF-]*\z/u
20
+
21
+ def initialize(config)
22
+ @add_attributes = config[:add_attributes]
23
+ @attributes = config[:attributes].dup
24
+ @elements = config[:elements]
25
+ @protocols = config[:protocols]
26
+ @remove_all_contents = false
27
+ @remove_element_contents = Set.new
28
+ @whitespace_elements = {}
29
+
30
+ @attributes.each do |element_name, attrs|
31
+ unless element_name == :all
32
+ @attributes[element_name] = Set.new(attrs).merge(@attributes[:all] || [])
46
33
  end
47
34
  end
48
35
 
49
- def call(env)
50
- name = env[:node_name]
51
- node = env[:node]
36
+ # Backcompat: if :whitespace_elements is a Set, convert it to a hash.
37
+ if config[:whitespace_elements].is_a?(Set)
38
+ config[:whitespace_elements].each do |element|
39
+ @whitespace_elements[element] = {:before => ' ', :after => ' '}
40
+ end
41
+ else
42
+ @whitespace_elements = config[:whitespace_elements]
43
+ end
52
44
 
53
- return if env[:is_whitelisted] || !node.element?
45
+ if config[:remove_contents].is_a?(Set)
46
+ @remove_element_contents.merge(config[:remove_contents].map(&:to_s))
47
+ else
48
+ @remove_all_contents = !!config[:remove_contents]
49
+ end
50
+ end
54
51
 
55
- # Delete any element that isn't in the config whitelist.
56
- unless @allowed_elements.include?(name)
57
- # Elements like br, div, p, etc. need to be replaced with whitespace in
58
- # order to preserve readability.
59
- if @whitespace_elements.include?(name)
60
- node.add_previous_sibling(Nokogiri::XML::Text.new(' ', node.document))
52
+ def call(env)
53
+ node = env[:node]
54
+ return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_whitelisted]
61
55
 
62
- unless node.children.empty?
63
- node.add_next_sibling(Nokogiri::XML::Text.new(' ', node.document))
64
- end
65
- end
56
+ name = env[:node_name]
57
+
58
+ # Delete any element that isn't in the config whitelist.
59
+ unless @elements.include?(name)
60
+ # Elements like br, div, p, etc. need to be replaced with whitespace in
61
+ # order to preserve readability.
62
+ if @whitespace_elements.include?(name)
63
+ node.add_previous_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:before].to_s, node.document))
66
64
 
67
- unless @remove_all_contents || @remove_element_contents.include?(name)
68
- node.children.each {|n| node.add_previous_sibling(n) }
65
+ unless node.children.empty?
66
+ node.add_next_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:after].to_s, node.document))
69
67
  end
68
+ end
70
69
 
71
- node.unlink
72
- return
70
+ unless @remove_all_contents || @remove_element_contents.include?(name)
71
+ node.children.each {|n| node.add_previous_sibling(n) }
73
72
  end
74
73
 
75
- attr_whitelist = Set.new((@attributes[name] || []) +
76
- (@attributes[:all] || []))
74
+ node.unlink
75
+ return
76
+ end
77
77
 
78
- allow_data_attributes = attr_whitelist.include?(:data)
78
+ attr_whitelist = @attributes[name] || @attributes[:all]
79
79
 
80
- if attr_whitelist.empty?
81
- # Delete all attributes from elements with no whitelisted attributes.
82
- node.attribute_nodes.each {|attr| attr.unlink }
83
- else
84
- # Delete any attribute that isn't allowed on this element.
85
- node.attribute_nodes.each do |attr|
86
- attr_name = attr.name.downcase
87
-
88
- unless attr_whitelist.include?(attr_name)
89
- # The attribute isn't explicitly whitelisted.
90
-
91
- if allow_data_attributes && attr_name.start_with?('data-')
92
- # Arbitrary data attributes are allowed. Verify that the attribute
93
- # is a valid data attribute.
94
- attr.unlink unless attr_name =~ REGEX_DATA_ATTR
95
- else
96
- # Either the attribute isn't a data attribute, or arbitrary data
97
- # attributes aren't allowed. Remove the attribute.
98
- attr.unlink
99
- end
100
- end
101
- end
80
+ if attr_whitelist.nil?
81
+ # Delete all attributes from elements with no whitelisted attributes.
82
+ node.attribute_nodes.each {|attr| attr.unlink }
83
+ else
84
+ allow_data_attributes = attr_whitelist.include?(:data)
102
85
 
103
- # Delete remaining attributes that use unacceptable protocols.
104
- if @protocols.has_key?(name)
105
- protocol = @protocols[name]
86
+ # Delete any attribute that isn't allowed on this element.
87
+ node.attribute_nodes.each do |attr|
88
+ attr_name = attr.name.downcase
106
89
 
107
- node.attribute_nodes.each do |attr|
108
- attr_name = attr.name.downcase
109
- next false unless protocol.has_key?(attr_name)
90
+ if attr_whitelist.include?(attr_name)
91
+ # The attribute is whitelisted.
110
92
 
111
- del = if attr.value.to_s.downcase =~ REGEX_PROTOCOL
112
- !protocol[attr_name].include?($1.downcase)
113
- else
114
- !protocol[attr_name].include?(:relative)
115
- end
93
+ # Remove any attributes that use unacceptable protocols.
94
+ if @protocols.include?(name) && @protocols[name].include?(attr_name)
95
+ attr_protocols = @protocols[name][attr_name]
116
96
 
117
- if del
118
- attr.unlink
97
+ if attr.value.to_s.downcase =~ REGEX_PROTOCOL
98
+ attr.unlink unless attr_protocols.include?($1.downcase)
119
99
  else
120
- # Leading and trailing whitespace around URLs is ignored at parse
121
- # time. Stripping it here prevents it from being escaped by the
122
- # libxml2 workaround below.
123
- attr.value = attr.value.strip
100
+ attr.unlink unless attr_protocols.include?(:relative)
124
101
  end
125
102
  end
103
+ else
104
+ # The attribute isn't whitelisted.
105
+
106
+ if allow_data_attributes && attr_name.start_with?('data-')
107
+ # Arbitrary data attributes are allowed. Verify that the attribute
108
+ # is a valid data attribute.
109
+ attr.unlink unless attr_name =~ REGEX_DATA_ATTR
110
+ else
111
+ # Either the attribute isn't a data attribute, or arbitrary data
112
+ # attributes aren't allowed. Remove the attribute.
113
+ attr.unlink
114
+ end
126
115
  end
127
116
  end
117
+ end
128
118
 
129
- # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
130
- # attempt to preserve server-side includes. This can result in XSS since
131
- # an unescaped double quote can allow an attacker to inject a
132
- # non-whitelisted attribute.
133
- #
134
- # Sanitize works around this by implementing its own escaping for
135
- # affected attributes, some of which can exist on any element and some
136
- # of which can only exist on `<a>` elements.
137
- #
138
- # The relevant libxml2 code is here:
139
- # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
140
- node.attribute_nodes.each do |attr|
141
- attr_name = attr.name.downcase
142
- if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
143
- (name == 'a' && UNSAFE_LIBXML_ATTRS_A.include?(attr_name))
144
- attr.value = attr.value.gsub(UNSAFE_LIBXML_ESCAPE_REGEX, UNSAFE_LIBXML_ESCAPE_CHARS)
145
- end
146
- end
147
-
148
- # Add required attributes.
149
- if @add_attributes.has_key?(name)
150
- @add_attributes[name].each {|key, val| node[key] = val }
151
- end
119
+ # Add required attributes.
120
+ if @add_attributes.include?(name)
121
+ @add_attributes[name].each {|key, val| node[key] = val }
152
122
  end
153
123
  end
154
124
 
155
- end; end
125
+ end; end; end