sanitize 5.1.0 → 6.0.1

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of sanitize might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  class Sanitize; module Transformers; module CSS
2
2
 
3
- # Enforces a CSS whitelist on the contents of `style` attributes.
3
+ # Enforces a CSS allowlist on the contents of `style` attributes.
4
4
  class CleanAttribute
5
5
  def initialize(sanitizer_or_config)
6
6
  if Sanitize::CSS === sanitizer_or_config
@@ -14,7 +14,7 @@ class CleanAttribute
14
14
  node = env[:node]
15
15
 
16
16
  return unless node.type == Nokogiri::XML::Node::ELEMENT_NODE &&
17
- node.key?('style') && !env[:is_whitelisted]
17
+ node.key?('style') && !env[:is_allowlisted]
18
18
 
19
19
  attr = node.attribute('style')
20
20
  css = @scss.properties(attr.value)
@@ -27,7 +27,7 @@ class CleanAttribute
27
27
  end
28
28
  end
29
29
 
30
- # Enforces a CSS whitelist on the contents of `<style>` elements.
30
+ # Enforces a CSS allowlist on the contents of `<style>` elements.
31
31
  class CleanElement
32
32
  def initialize(sanitizer_or_config)
33
33
  if Sanitize::CSS === sanitizer_or_config
@@ -3,7 +3,7 @@
3
3
  class Sanitize; module Transformers
4
4
 
5
5
  CleanDoctype = lambda do |env|
6
- return if env[:is_whitelisted]
6
+ return if env[:is_allowlisted]
7
7
 
8
8
  node = env[:node]
9
9
 
@@ -1,5 +1,6 @@
1
1
  # encoding: utf-8
2
2
 
3
+ require 'cgi'
3
4
  require 'set'
4
5
 
5
6
  class Sanitize; module Transformers; class CleanElement
@@ -18,6 +19,18 @@ class Sanitize; module Transformers; class CleanElement
18
19
  # http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#embedding-custom-non-visible-data-with-the-data-*-attributes
19
20
  REGEX_DATA_ATTR = /\Adata-(?!xml)[a-z_][\w.\u00E0-\u00F6\u00F8-\u017F\u01DD-\u02AF-]*\z/u
20
21
 
22
+ # Elements whose content is treated as unescaped text by HTML parsers.
23
+ UNESCAPED_TEXT_ELEMENTS = Set.new(%w[
24
+ iframe
25
+ noembed
26
+ noframes
27
+ noscript
28
+ plaintext
29
+ script
30
+ style
31
+ xmp
32
+ ])
33
+
21
34
  # Attributes that need additional escaping on `<a>` elements due to unsafe
22
35
  # libxml2 behavior.
23
36
  UNSAFE_LIBXML_ATTRS_A = Set.new(%w[
@@ -76,11 +89,11 @@ class Sanitize; module Transformers; class CleanElement
76
89
 
77
90
  def call(env)
78
91
  node = env[:node]
79
- return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_whitelisted]
92
+ return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_allowlisted]
80
93
 
81
94
  name = env[:node_name]
82
95
 
83
- # Delete any element that isn't in the config whitelist, unless the node has
96
+ # Delete any element that isn't in the config allowlist, unless the node has
84
97
  # already been deleted from the document.
85
98
  #
86
99
  # It's important that we not try to reparent the children of a node that has
@@ -107,34 +120,31 @@ class Sanitize; module Transformers; class CleanElement
107
120
  return
108
121
  end
109
122
 
110
- attr_whitelist = @attributes[name] || @attributes[:all]
123
+ attr_allowlist = @attributes[name] || @attributes[:all]
111
124
 
112
- if attr_whitelist.nil?
113
- # Delete all attributes from elements with no whitelisted attributes.
125
+ if attr_allowlist.nil?
126
+ # Delete all attributes from elements with no allowlisted attributes.
114
127
  node.attribute_nodes.each {|attr| attr.unlink }
115
128
  else
116
- allow_data_attributes = attr_whitelist.include?(:data)
129
+ allow_data_attributes = attr_allowlist.include?(:data)
117
130
 
118
131
  # Delete any attribute that isn't allowed on this element.
119
132
  node.attribute_nodes.each do |attr|
120
133
  attr_name = attr.name.downcase
121
134
 
122
- unless attr_whitelist.include?(attr_name)
123
- # The attribute isn't whitelisted.
135
+ unless attr_allowlist.include?(attr_name)
136
+ # The attribute isn't in the allowlist, but may still be allowed if
137
+ # it's a data attribute.
124
138
 
125
- if allow_data_attributes && attr_name.start_with?('data-')
126
- # Arbitrary data attributes are allowed. If this is a data
127
- # attribute, continue.
128
- next if attr_name =~ REGEX_DATA_ATTR
139
+ unless allow_data_attributes && attr_name.start_with?('data-') && attr_name =~ REGEX_DATA_ATTR
140
+ # Either the attribute isn't a data attribute or arbitrary data
141
+ # attributes aren't allowed. Remove the attribute.
142
+ attr.unlink
143
+ next
129
144
  end
130
-
131
- # Either the attribute isn't a data attribute or arbitrary data
132
- # attributes aren't allowed. Remove the attribute.
133
- attr.unlink
134
- next
135
145
  end
136
146
 
137
- # The attribute is whitelisted.
147
+ # The attribute is allowed.
138
148
 
139
149
  # Remove any attributes that use unacceptable protocols.
140
150
  if @protocols.include?(name) && @protocols[name].include?(attr_name)
@@ -162,7 +172,7 @@ class Sanitize; module Transformers; class CleanElement
162
172
  # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
163
173
  # attempt to preserve server-side includes. This can result in XSS since
164
174
  # an unescaped double quote can allow an attacker to inject a
165
- # non-whitelisted attribute.
175
+ # non-allowlisted attribute.
166
176
  #
167
177
  # Sanitize works around this by implementing its own escaping for
168
178
  # affected attributes, some of which can exist on any element and some
@@ -188,10 +198,32 @@ class Sanitize; module Transformers; class CleanElement
188
198
  @add_attributes[name].each {|key, val| node[key] = val }
189
199
  end
190
200
 
201
+ # Make a best effort to ensure that text nodes in invalid "unescaped text"
202
+ # elements that are inside a math or svg namespace are properly escaped so
203
+ # that they don't get parsed as HTML.
204
+ #
205
+ # Sanitize is explicitly documented as not supporting MathML or SVG, but
206
+ # people sometimes allow `<math>` and `<svg>` elements in their custom
207
+ # configs without realizing that it's not safe. This workaround makes it
208
+ # slightly less unsafe, but you still shouldn't allow `<math>` or `<svg>`
209
+ # because Nokogiri doesn't parse them the same way browsers do and Sanitize
210
+ # can't guarantee that their contents are safe.
211
+ unless node.namespace.nil?
212
+ prefix = node.namespace.prefix
213
+
214
+ if (prefix == 'math' || prefix == 'svg') && UNESCAPED_TEXT_ELEMENTS.include?(name)
215
+ node.children.each do |child|
216
+ if child.type == Nokogiri::XML::Node::TEXT_NODE
217
+ child.content = CGI.escapeHTML(child.content)
218
+ end
219
+ end
220
+ end
221
+ end
222
+
191
223
  # Element-specific special cases.
192
224
  case name
193
225
 
194
- # If this is a whitelisted iframe that has children, remove all its
226
+ # If this is an allowlisted iframe that has children, remove all its
195
227
  # children. The HTML standard says iframes shouldn't have content, but when
196
228
  # they do, this content is parsed as text and is serialized verbatim without
197
229
  # being escaped, which is unsafe because legacy browsers may still render it
@@ -220,6 +252,16 @@ class Sanitize; module Transformers; class CleanElement
220
252
 
221
253
  node['content'] = node['content'].gsub(/;\s*charset\s*=.+\z/, ';charset=utf-8')
222
254
  end
255
+
256
+ # A `<noscript>` element's content is parsed differently in browsers
257
+ # depending on whether or not scripting is enabled. Since Nokogiri doesn't
258
+ # support scripting, it always parses `<noscript>` elements as if scripting
259
+ # is disabled. This results in edge cases where it's not possible to
260
+ # reliably sanitize the contents of a `<noscript>` element because Nokogiri
261
+ # can't fully replicate the parsing behavior of a scripting-enabled browser.
262
+ # The safest thing to do is to simply remove all `<noscript>` elements.
263
+ when 'noscript'
264
+ node.unlink
223
265
  end
224
266
  end
225
267
 
@@ -1,5 +1,5 @@
1
1
  # encoding: utf-8
2
2
 
3
3
  class Sanitize
4
- VERSION = '5.1.0'
4
+ VERSION = '6.0.1'
5
5
  end
data/lib/sanitize.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  # encoding: utf-8
2
2
 
3
- require 'nokogumbo'
3
+ require 'nokogiri'
4
4
  require 'set'
5
5
 
6
6
  require_relative 'sanitize/version'
@@ -54,7 +54,7 @@ class Sanitize
54
54
  # Returns a sanitized copy of the given full _html_ document, using the
55
55
  # settings in _config_ if specified.
56
56
  #
57
- # When sanitizing a document, the `<html>` element must be whitelisted or an
57
+ # When sanitizing a document, the `<html>` element must be allowlisted or an
58
58
  # error will be raised. If this is undesirable, you should probably use
59
59
  # {#fragment} instead.
60
60
  def self.document(html, config = {})
@@ -117,7 +117,7 @@ class Sanitize
117
117
 
118
118
  # Returns a sanitized copy of the given _html_ document.
119
119
  #
120
- # When sanitizing a document, the `<html>` element must be whitelisted or an
120
+ # When sanitizing a document, the `<html>` element must be allowlisted or an
121
121
  # error will be raised. If this is undesirable, you should probably use
122
122
  # {#fragment} instead.
123
123
  def document(html)
@@ -147,20 +147,20 @@ class Sanitize
147
147
  # in place.
148
148
  #
149
149
  # If _node_ is a `Nokogiri::XML::Document`, the `<html>` element must be
150
- # whitelisted or an error will be raised.
150
+ # allowlisted or an error will be raised.
151
151
  def node!(node)
152
152
  raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
153
153
 
154
154
  if node.is_a?(Nokogiri::XML::Document)
155
155
  unless @config[:elements].include?('html')
156
- raise Error, 'When sanitizing a document, "<html>" must be whitelisted.'
156
+ raise Error, 'When sanitizing a document, "<html>" must be allowlisted.'
157
157
  end
158
158
  end
159
159
 
160
- node_whitelist = Set.new
160
+ node_allowlist = Set.new
161
161
 
162
162
  traverse(node) do |n|
163
- transform_node!(n, node_whitelist)
163
+ transform_node!(n, node_allowlist)
164
164
  end
165
165
 
166
166
  node
@@ -189,7 +189,7 @@ class Sanitize
189
189
  node.to_html(preserve_newline: true)
190
190
  end
191
191
 
192
- def transform_node!(node, node_whitelist)
192
+ def transform_node!(node, node_allowlist)
193
193
  @transformers.each do |transformer|
194
194
  # Since transform_node! may be called in a tight loop to process thousands
195
195
  # of items, we can optimize both memory and CPU performance by:
@@ -199,15 +199,19 @@ class Sanitize
199
199
  # does merge! create a new hash, it is also 2.6x slower:
200
200
  # https://github.com/JuanitoFatas/fast-ruby#hashmerge-vs-hashmerge-code
201
201
  config = @transformer_config
202
- config[:is_whitelisted] = node_whitelist.include?(node)
202
+ config[:is_allowlisted] = config[:is_whitelisted] = node_allowlist.include?(node)
203
203
  config[:node] = node
204
204
  config[:node_name] = node.name.downcase
205
- config[:node_whitelist] = node_whitelist
205
+ config[:node_allowlist] = config[:node_whitelist] = node_allowlist
206
206
 
207
- result = transformer.call(config)
207
+ result = transformer.call(**config)
208
208
 
209
- if result.is_a?(Hash) && result[:node_whitelist].respond_to?(:each)
210
- node_whitelist.merge(result[:node_whitelist])
209
+ if result.is_a?(Hash)
210
+ result_allowlist = result[:node_allowlist] || result[:node_whitelist]
211
+
212
+ if result_allowlist.respond_to?(:each)
213
+ node_allowlist.merge(result_allowlist)
214
+ end
211
215
  end
212
216
  end
213
217
 
@@ -11,18 +11,18 @@ describe 'Sanitize::Transformers::CleanComment' do
11
11
  end
12
12
 
13
13
  it 'should remove comments' do
14
- @s.fragment('foo <!-- comment --> bar').must_equal 'foo bar'
15
- @s.fragment('foo <!-- ').must_equal 'foo '
16
- @s.fragment('foo <!-- - -> bar').must_equal 'foo '
17
- @s.fragment("foo <!--\n\n\n\n-->bar").must_equal 'foo bar'
18
- @s.fragment("foo <!-- <!-- <!-- --> --> -->bar").must_equal 'foo --&gt; --&gt;bar'
19
- @s.fragment("foo <div <!-- comment -->>bar</div>").must_equal 'foo <div>&gt;bar</div>'
14
+ _(@s.fragment('foo <!-- comment --> bar')).must_equal 'foo bar'
15
+ _(@s.fragment('foo <!-- ')).must_equal 'foo '
16
+ _(@s.fragment('foo <!-- - -> bar')).must_equal 'foo '
17
+ _(@s.fragment("foo <!--\n\n\n\n-->bar")).must_equal 'foo bar'
18
+ _(@s.fragment("foo <!-- <!-- <!-- --> --> -->bar")).must_equal 'foo --&gt; --&gt;bar'
19
+ _(@s.fragment("foo <div <!-- comment -->>bar</div>")).must_equal 'foo <div>&gt;bar</div>'
20
20
 
21
21
  # Special case: the comment markup is inside a <script>, which makes it
22
22
  # text content and not an actual HTML comment.
23
- @s.fragment("<script><!-- comment --></script>").must_equal ''
23
+ _(@s.fragment("<script><!-- comment --></script>")).must_equal ''
24
24
 
25
- Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => false, :elements => ['script'])
25
+ _(Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => false, :elements => ['script']))
26
26
  .must_equal '<script><!-- comment --></script>'
27
27
  end
28
28
  end
@@ -33,14 +33,14 @@ describe 'Sanitize::Transformers::CleanComment' do
33
33
  end
34
34
 
35
35
  it 'should allow comments' do
36
- @s.fragment('foo <!-- comment --> bar').must_equal 'foo <!-- comment --> bar'
37
- @s.fragment('foo <!-- ').must_equal 'foo <!-- -->'
38
- @s.fragment('foo <!-- - -> bar').must_equal 'foo <!-- - -> bar-->'
39
- @s.fragment("foo <!--\n\n\n\n-->bar").must_equal "foo <!--\n\n\n\n-->bar"
40
- @s.fragment("foo <!-- <!-- <!-- --> --> -->bar").must_equal 'foo <!-- <!-- <!-- --> --&gt; --&gt;bar'
41
- @s.fragment("foo <div <!-- comment -->>bar</div>").must_equal 'foo <div>&gt;bar</div>'
42
-
43
- Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => true, :elements => ['script'])
36
+ _(@s.fragment('foo <!-- comment --> bar')).must_equal 'foo <!-- comment --> bar'
37
+ _(@s.fragment('foo <!-- ')).must_equal 'foo <!-- -->'
38
+ _(@s.fragment('foo <!-- - -> bar')).must_equal 'foo <!-- - -> bar-->'
39
+ _(@s.fragment("foo <!--\n\n\n\n-->bar")).must_equal "foo <!--\n\n\n\n-->bar"
40
+ _(@s.fragment("foo <!-- <!-- <!-- --> --> -->bar")).must_equal 'foo <!-- <!-- <!-- --> --&gt; --&gt;bar'
41
+ _(@s.fragment("foo <div <!-- comment -->>bar</div>")).must_equal 'foo <div>&gt;bar</div>'
42
+
43
+ _(Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => true, :elements => ['script']))
44
44
  .must_equal '<script><!-- comment --></script>'
45
45
  end
46
46
  end
@@ -10,15 +10,15 @@ describe 'Sanitize::Transformers::CSS::CleanAttribute' do
10
10
  end
11
11
 
12
12
  it 'should sanitize CSS properties in style attributes' do
13
- @s.fragment(%[
13
+ _(@s.fragment(%[
14
14
  <div style="color: #fff; width: expression(alert(1)); /* <-- evil! */"></div>
15
- ].strip).must_equal %[
15
+ ].strip)).must_equal %[
16
16
  <div style="color: #fff; /* <-- evil! */"></div>
17
17
  ].strip
18
18
  end
19
19
 
20
20
  it 'should remove the style attribute if the sanitized CSS is empty' do
21
- @s.fragment('<div style="width: expression(alert(1))"></div>').
21
+ _(@s.fragment('<div style="width: expression(alert(1))"></div>')).
22
22
  must_equal '<div></div>'
23
23
  end
24
24
  end
@@ -46,7 +46,7 @@ describe 'Sanitize::Transformers::CSS::CleanElement' do
46
46
  </style>
47
47
  ].strip
48
48
 
49
- @s.fragment(html).must_equal %[
49
+ _(@s.fragment(html)).must_equal %[
50
50
  <style>
51
51
  /* Yay CSS! */
52
52
  .foo { color: #fff; }
@@ -62,6 +62,6 @@ describe 'Sanitize::Transformers::CSS::CleanElement' do
62
62
  end
63
63
 
64
64
  it 'should remove the <style> element if the sanitized CSS is empty' do
65
- @s.fragment('<style></style>').must_equal ''
65
+ _(@s.fragment('<style></style>')).must_equal ''
66
66
  end
67
67
  end
@@ -11,18 +11,18 @@ describe 'Sanitize::Transformers::CleanDoctype' do
11
11
  end
12
12
 
13
13
  it 'should remove doctype declarations' do
14
- @s.document('<!DOCTYPE html><html>foo</html>').must_equal "<html>foo</html>"
15
- @s.fragment('<!DOCTYPE html>foo').must_equal 'foo'
14
+ _(@s.document('<!DOCTYPE html><html>foo</html>')).must_equal "<html>foo</html>"
15
+ _(@s.fragment('<!DOCTYPE html>foo')).must_equal 'foo'
16
16
  end
17
17
 
18
18
  it 'should not allow doctype definitions in fragments' do
19
- @s.fragment('<!DOCTYPE html><html>foo</html>')
19
+ _(@s.fragment('<!DOCTYPE html><html>foo</html>'))
20
20
  .must_equal "foo"
21
21
 
22
- @s.fragment('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
22
+ _(@s.fragment('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>'))
23
23
  .must_equal "foo"
24
24
 
25
- @s.fragment("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>")
25
+ _(@s.fragment("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>"))
26
26
  .must_equal "foo"
27
27
  end
28
28
  end
@@ -33,38 +33,38 @@ describe 'Sanitize::Transformers::CleanDoctype' do
33
33
  end
34
34
 
35
35
  it 'should allow doctype declarations in documents' do
36
- @s.document('<!DOCTYPE html><html>foo</html>')
36
+ _(@s.document('<!DOCTYPE html><html>foo</html>'))
37
37
  .must_equal "<!DOCTYPE html><html>foo</html>"
38
38
 
39
- @s.document('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
39
+ _(@s.document('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>'))
40
40
  .must_equal "<!DOCTYPE html><html>foo</html>"
41
41
 
42
- @s.document("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>")
42
+ _(@s.document("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>"))
43
43
  .must_equal "<!DOCTYPE html><html>foo</html>"
44
44
  end
45
45
 
46
46
  it 'should not allow obviously invalid doctype declarations in documents' do
47
- @s.document('<!DOCTYPE blah blah blah><html>foo</html>')
47
+ _(@s.document('<!DOCTYPE blah blah blah><html>foo</html>'))
48
48
  .must_equal "<!DOCTYPE html><html>foo</html>"
49
49
 
50
- @s.document('<!DOCTYPE blah><html>foo</html>')
50
+ _(@s.document('<!DOCTYPE blah><html>foo</html>'))
51
51
  .must_equal "<!DOCTYPE html><html>foo</html>"
52
52
 
53
- @s.document('<!DOCTYPE html BLAH "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
53
+ _(@s.document('<!DOCTYPE html BLAH "-//W3C//DTD HTML 4.01//EN"><html>foo</html>'))
54
54
  .must_equal "<!DOCTYPE html><html>foo</html>"
55
55
 
56
- @s.document('<!whatever><html>foo</html>')
56
+ _(@s.document('<!whatever><html>foo</html>'))
57
57
  .must_equal "<html>foo</html>"
58
58
  end
59
59
 
60
60
  it 'should not allow doctype definitions in fragments' do
61
- @s.fragment('<!DOCTYPE html><html>foo</html>')
61
+ _(@s.fragment('<!DOCTYPE html><html>foo</html>'))
62
62
  .must_equal "foo"
63
63
 
64
- @s.fragment('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
64
+ _(@s.fragment('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>'))
65
65
  .must_equal "foo"
66
66
 
67
- @s.fragment("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>")
67
+ _(@s.fragment("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>"))
68
68
  .must_equal "foo"
69
69
  end
70
70
  end