sanitize 2.1.1 → 6.0.0

Sign up to get free protection for your applications and access to all features.

Potentially problematic release.


This version of sanitize might be problematic. Click here for more details.

@@ -1,155 +1,223 @@
1
- class Sanitize; module Transformers
2
-
3
- class CleanElement
4
-
5
- # Attributes that need additional escaping on `<a>` elements due to unsafe
6
- # libxml2 behavior.
7
- UNSAFE_LIBXML_ATTRS_A = Set.new(%w[
8
- name
9
- ])
10
-
11
- # Attributes that need additional escaping on all elements due to unsafe
12
- # libxml2 behavior.
13
- UNSAFE_LIBXML_ATTRS_GLOBAL = Set.new(%w[
14
- action
15
- href
16
- src
17
- ])
18
-
19
- # Mapping of original characters to escape sequences for characters that
20
- # should be escaped in attributes affected by unsafe libxml2 behavior.
21
- UNSAFE_LIBXML_ESCAPE_CHARS = {
22
- ' ' => '%20',
23
- '"' => '%22'
24
- }
25
-
26
- # Regex that matches any single character that needs to be escaped in
27
- # attributes affected by unsafe libxml2 behavior.
28
- UNSAFE_LIBXML_ESCAPE_REGEX = /[ "]/
29
-
30
- def initialize(config)
31
- @config = config
32
-
33
- # For faster lookups.
34
- @add_attributes = config[:add_attributes]
35
- @allowed_elements = Set.new(config[:elements])
36
- @attributes = config[:attributes]
37
- @protocols = config[:protocols]
38
- @remove_all_contents = false
39
- @remove_element_contents = Set.new
40
- @whitespace_elements = Set.new(config[:whitespace_elements])
41
-
42
- if config[:remove_contents].is_a?(Array)
43
- @remove_element_contents.merge(config[:remove_contents].map(&:to_s))
44
- else
45
- @remove_all_contents = !!config[:remove_contents]
1
+ # encoding: utf-8
2
+
3
+ require 'set'
4
+
5
+ class Sanitize; module Transformers; class CleanElement
6
+
7
+ # Matches a valid HTML5 data attribute name. The unicode ranges included here
8
+ # are a conservative subset of the full range of characters that are
9
+ # technically allowed, with the intent of matching the most common characters
10
+ # used in data attribute names while excluding uncommon or potentially
11
+ # misleading characters, or characters with the potential to be normalized
12
+ # into unsafe or confusing forms.
13
+ #
14
+ # If you need data attr names with characters that aren't included here (such
15
+ # as combining marks, full-width characters, or CJK), please consider creating
16
+ # a custom transformer to validate attributes according to your needs.
17
+ #
18
+ # http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#embedding-custom-non-visible-data-with-the-data-*-attributes
19
+ REGEX_DATA_ATTR = /\Adata-(?!xml)[a-z_][\w.\u00E0-\u00F6\u00F8-\u017F\u01DD-\u02AF-]*\z/u
20
+
21
+ # Attributes that need additional escaping on `<a>` elements due to unsafe
22
+ # libxml2 behavior.
23
+ UNSAFE_LIBXML_ATTRS_A = Set.new(%w[
24
+ name
25
+ ])
26
+
27
+ # Attributes that need additional escaping on all elements due to unsafe
28
+ # libxml2 behavior.
29
+ UNSAFE_LIBXML_ATTRS_GLOBAL = Set.new(%w[
30
+ action
31
+ href
32
+ src
33
+ ])
34
+
35
+ # Mapping of original characters to escape sequences for characters that
36
+ # should be escaped in attributes affected by unsafe libxml2 behavior.
37
+ UNSAFE_LIBXML_ESCAPE_CHARS = {
38
+ ' ' => '%20',
39
+ '"' => '%22'
40
+ }
41
+
42
+ # Regex that matches any single character that needs to be escaped in
43
+ # attributes affected by unsafe libxml2 behavior.
44
+ UNSAFE_LIBXML_ESCAPE_REGEX = /[ "]/
45
+
46
+ def initialize(config)
47
+ @add_attributes = config[:add_attributes]
48
+ @attributes = config[:attributes].dup
49
+ @elements = config[:elements]
50
+ @protocols = config[:protocols]
51
+ @remove_all_contents = false
52
+ @remove_element_contents = Set.new
53
+ @whitespace_elements = {}
54
+
55
+ @attributes.each do |element_name, attrs|
56
+ unless element_name == :all
57
+ @attributes[element_name] = Set.new(attrs).merge(@attributes[:all] || [])
46
58
  end
47
59
  end
48
60
 
49
- def call(env)
50
- name = env[:node_name]
51
- node = env[:node]
52
-
53
- return if env[:is_whitelisted] || !node.element?
61
+ # Backcompat: if :whitespace_elements is a Set, convert it to a hash.
62
+ if config[:whitespace_elements].is_a?(Set)
63
+ config[:whitespace_elements].each do |element|
64
+ @whitespace_elements[element] = {:before => ' ', :after => ' '}
65
+ end
66
+ else
67
+ @whitespace_elements = config[:whitespace_elements]
68
+ end
54
69
 
55
- # Delete any element that isn't in the config whitelist.
56
- unless @allowed_elements.include?(name)
57
- # Elements like br, div, p, etc. need to be replaced with whitespace in
58
- # order to preserve readability.
59
- if @whitespace_elements.include?(name)
60
- node.add_previous_sibling(Nokogiri::XML::Text.new(' ', node.document))
70
+ if config[:remove_contents].is_a?(Enumerable)
71
+ @remove_element_contents.merge(config[:remove_contents].map(&:to_s))
72
+ else
73
+ @remove_all_contents = !!config[:remove_contents]
74
+ end
75
+ end
61
76
 
62
- unless node.children.empty?
63
- node.add_next_sibling(Nokogiri::XML::Text.new(' ', node.document))
64
- end
77
+ def call(env)
78
+ node = env[:node]
79
+ return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_allowlisted]
80
+
81
+ name = env[:node_name]
82
+
83
+ # Delete any element that isn't in the config allowlist, unless the node has
84
+ # already been deleted from the document.
85
+ #
86
+ # It's important that we not try to reparent the children of a node that has
87
+ # already been deleted, since that seems to trigger a memory leak in
88
+ # Nokogiri.
89
+ unless @elements.include?(name) || node.parent.nil?
90
+ # Elements like br, div, p, etc. need to be replaced with whitespace in
91
+ # order to preserve readability.
92
+ if @whitespace_elements.include?(name)
93
+ node.add_previous_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:before].to_s, node.document))
94
+
95
+ unless node.children.empty?
96
+ node.add_next_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:after].to_s, node.document))
65
97
  end
98
+ end
66
99
 
100
+ unless node.children.empty?
67
101
  unless @remove_all_contents || @remove_element_contents.include?(name)
68
- node.children.each {|n| node.add_previous_sibling(n) }
102
+ node.add_previous_sibling(node.children)
69
103
  end
70
-
71
- node.unlink
72
- return
73
104
  end
74
105
 
75
- attr_whitelist = Set.new((@attributes[name] || []) +
76
- (@attributes[:all] || []))
77
-
78
- allow_data_attributes = attr_whitelist.include?(:data)
79
-
80
- if attr_whitelist.empty?
81
- # Delete all attributes from elements with no whitelisted attributes.
82
- node.attribute_nodes.each {|attr| attr.unlink }
83
- else
84
- # Delete any attribute that isn't allowed on this element.
85
- node.attribute_nodes.each do |attr|
86
- attr_name = attr.name.downcase
87
-
88
- unless attr_whitelist.include?(attr_name)
89
- # The attribute isn't explicitly whitelisted.
90
-
91
- if allow_data_attributes && attr_name.start_with?('data-')
92
- # Arbitrary data attributes are allowed. Verify that the attribute
93
- # is a valid data attribute.
94
- attr.unlink unless attr_name =~ REGEX_DATA_ATTR
95
- else
96
- # Either the attribute isn't a data attribute, or arbitrary data
97
- # attributes aren't allowed. Remove the attribute.
98
- attr.unlink
99
- end
106
+ node.unlink
107
+ return
108
+ end
109
+
110
+ attr_allowlist = @attributes[name] || @attributes[:all]
111
+
112
+ if attr_allowlist.nil?
113
+ # Delete all attributes from elements with no allowlisted attributes.
114
+ node.attribute_nodes.each {|attr| attr.unlink }
115
+ else
116
+ allow_data_attributes = attr_allowlist.include?(:data)
117
+
118
+ # Delete any attribute that isn't allowed on this element.
119
+ node.attribute_nodes.each do |attr|
120
+ attr_name = attr.name.downcase
121
+
122
+ unless attr_allowlist.include?(attr_name)
123
+ # The attribute isn't in the allowlist, but may still be allowed if
124
+ # it's a data attribute.
125
+
126
+ unless allow_data_attributes && attr_name.start_with?('data-') && attr_name =~ REGEX_DATA_ATTR
127
+ # Either the attribute isn't a data attribute or arbitrary data
128
+ # attributes aren't allowed. Remove the attribute.
129
+ attr.unlink
130
+ next
100
131
  end
101
132
  end
102
133
 
103
- # Delete remaining attributes that use unacceptable protocols.
104
- if @protocols.has_key?(name)
105
- protocol = @protocols[name]
134
+ # The attribute is allowed.
106
135
 
107
- node.attribute_nodes.each do |attr|
108
- attr_name = attr.name.downcase
109
- next false unless protocol.has_key?(attr_name)
136
+ # Remove any attributes that use unacceptable protocols.
137
+ if @protocols.include?(name) && @protocols[name].include?(attr_name)
138
+ attr_protocols = @protocols[name][attr_name]
110
139
 
111
- del = if attr.value.to_s.downcase =~ REGEX_PROTOCOL
112
- !protocol[attr_name].include?($1.downcase)
113
- else
114
- !protocol[attr_name].include?(:relative)
140
+ if attr.value =~ REGEX_PROTOCOL
141
+ unless attr_protocols.include?($1.downcase)
142
+ attr.unlink
143
+ next
115
144
  end
116
145
 
117
- if del
146
+ else
147
+ unless attr_protocols.include?(:relative)
118
148
  attr.unlink
119
- else
120
- # Leading and trailing whitespace around URLs is ignored at parse
121
- # time. Stripping it here prevents it from being escaped by the
122
- # libxml2 workaround below.
123
- attr.value = attr.value.strip
149
+ next
124
150
  end
125
151
  end
152
+
153
+ # Leading and trailing whitespace around URLs is ignored at parse
154
+ # time. Stripping it here prevents it from being escaped by the
155
+ # libxml2 workaround below.
156
+ attr.value = attr.value.strip
126
157
  end
127
- end
128
158
 
129
- # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
130
- # attempt to preserve server-side includes. This can result in XSS since
131
- # an unescaped double quote can allow an attacker to inject a
132
- # non-whitelisted attribute.
133
- #
134
- # Sanitize works around this by implementing its own escaping for
135
- # affected attributes, some of which can exist on any element and some
136
- # of which can only exist on `<a>` elements.
137
- #
138
- # The relevant libxml2 code is here:
139
- # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
140
- node.attribute_nodes.each do |attr|
141
- attr_name = attr.name.downcase
159
+ # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
160
+ # attempt to preserve server-side includes. This can result in XSS since
161
+ # an unescaped double quote can allow an attacker to inject a
162
+ # non-allowlisted attribute.
163
+ #
164
+ # Sanitize works around this by implementing its own escaping for
165
+ # affected attributes, some of which can exist on any element and some
166
+ # of which can only exist on `<a>` elements.
167
+ #
168
+ # This fix is technically no longer necessary with Nokogumbo >= 2.0
169
+ # since it no longer uses libxml2's serializer, but it's retained to
170
+ # avoid breaking use cases where people might be sanitizing individual
171
+ # Nokogiri nodes and then serializing them manually without Nokogumbo.
172
+ #
173
+ # The relevant libxml2 code is here:
174
+ # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
142
175
  if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
143
- (name == 'a' && UNSAFE_LIBXML_ATTRS_A.include?(attr_name))
144
- attr.value = attr.value.gsub(UNSAFE_LIBXML_ESCAPE_REGEX, UNSAFE_LIBXML_ESCAPE_CHARS)
176
+ (name == 'a' && UNSAFE_LIBXML_ATTRS_A.include?(attr_name))
177
+
178
+ attr.value = attr.value.gsub(UNSAFE_LIBXML_ESCAPE_REGEX, UNSAFE_LIBXML_ESCAPE_CHARS)
145
179
  end
146
180
  end
181
+ end
182
+
183
+ # Add required attributes.
184
+ if @add_attributes.include?(name)
185
+ @add_attributes[name].each {|key, val| node[key] = val }
186
+ end
187
+
188
+ # Element-specific special cases.
189
+ case name
190
+
191
+ # If this is an allowlisted iframe that has children, remove all its
192
+ # children. The HTML standard says iframes shouldn't have content, but when
193
+ # they do, this content is parsed as text and is serialized verbatim without
194
+ # being escaped, which is unsafe because legacy browsers may still render it
195
+ # and execute `<script>` content. So the safe and correct thing to do is to
196
+ # always remove iframe content.
197
+ when 'iframe'
198
+ if !node.children.empty?
199
+ node.children.each do |child|
200
+ child.unlink
201
+ end
202
+ end
203
+
204
+ # Prevent the use of `<meta>` elements that set a charset other than UTF-8,
205
+ # since Sanitize's output is always UTF-8.
206
+ when 'meta'
207
+ if node.has_attribute?('charset') &&
208
+ node['charset'].downcase != 'utf-8'
209
+
210
+ node['charset'] = 'utf-8'
211
+ end
212
+
213
+ if node.has_attribute?('http-equiv') &&
214
+ node.has_attribute?('content') &&
215
+ node['http-equiv'].downcase == 'content-type' &&
216
+ node['content'].downcase =~ /;\s*charset\s*=\s*(?!utf-8)/
147
217
 
148
- # Add required attributes.
149
- if @add_attributes.has_key?(name)
150
- @add_attributes[name].each {|key, val| node[key] = val }
218
+ node['content'] = node['content'].gsub(/;\s*charset\s*=.+\z/, ';charset=utf-8')
151
219
  end
152
220
  end
153
221
  end
154
222
 
155
- end; end
223
+ end; end; end
@@ -1,3 +1,5 @@
1
+ # encoding: utf-8
2
+
1
3
  class Sanitize
2
- VERSION = '2.1.1'
4
+ VERSION = '6.0.0'
3
5
  end