sanitize 5.0.0 → 5.2.3

Sign up to get free protection for your applications and access to all features.

Potentially problematic release.


This version of sanitize might be problematic. Click here for more details.

@@ -19,6 +19,20 @@ require_relative 'sanitize/transformers/clean_element'
19
19
  class Sanitize
20
20
  attr_reader :config
21
21
 
22
+ # Matches one or more control characters that should be removed from HTML
23
+ # before parsing, as defined by the HTML living standard.
24
+ #
25
+ # - https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
26
+ # - https://infra.spec.whatwg.org/#control
27
+ REGEX_HTML_CONTROL_CHARACTERS = /[\u0001-\u0008\u000b\u000e-\u001f\u007f-\u009f]+/u
28
+
29
+ # Matches one or more non-characters that should be removed from HTML before
30
+ # parsing, as defined by the HTML living standard.
31
+ #
32
+ # - https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
33
+ # - https://infra.spec.whatwg.org/#noncharacter
34
+ REGEX_HTML_NON_CHARACTERS = /[\ufdd0-\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}]+/u
35
+
22
36
  # Matches an attribute value that could be treated by a browser as a URL
23
37
  # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
24
38
  # or more characters followed by a colon is considered a match, even if the
@@ -26,11 +40,12 @@ class Sanitize
26
40
  # IE6 and Opera will still parse).
27
41
  REGEX_PROTOCOL = /\A\s*([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i
28
42
 
29
- # Matches Unicode characters that should be stripped from HTML before passing
30
- # it to the parser.
43
+ # Matches one or more characters that should be stripped from HTML before
44
+ # parsing. This is a combination of `REGEX_HTML_CONTROL_CHARACTERS` and
45
+ # `REGEX_HTML_NON_CHARACTERS`.
31
46
  #
32
- # http://www.w3.org/TR/unicode-xml/#Charlist
33
- REGEX_UNSUITABLE_CHARS = /[\u0000\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u
47
+ # https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
48
+ REGEX_UNSUITABLE_CHARS = /(?:#{REGEX_HTML_CONTROL_CHARACTERS}|#{REGEX_HTML_NON_CHARACTERS})/u
34
49
 
35
50
  #--
36
51
  # Class Methods
@@ -39,7 +54,7 @@ class Sanitize
39
54
  # Returns a sanitized copy of the given full _html_ document, using the
40
55
  # settings in _config_ if specified.
41
56
  #
42
- # When sanitizing a document, the `<html>` element must be whitelisted or an
57
+ # When sanitizing a document, the `<html>` element must be allowlisted or an
43
58
  # error will be raised. If this is undesirable, you should probably use
44
59
  # {#fragment} instead.
45
60
  def self.document(html, config = {})
@@ -102,13 +117,13 @@ class Sanitize
102
117
 
103
118
  # Returns a sanitized copy of the given _html_ document.
104
119
  #
105
- # When sanitizing a document, the `<html>` element must be whitelisted or an
120
+ # When sanitizing a document, the `<html>` element must be allowlisted or an
106
121
  # error will be raised. If this is undesirable, you should probably use
107
122
  # {#fragment} instead.
108
123
  def document(html)
109
124
  return '' unless html
110
125
 
111
- doc = Nokogiri::HTML5.parse(preprocess(html))
126
+ doc = Nokogiri::HTML5.parse(preprocess(html), **@config[:parser_options])
112
127
  node!(doc)
113
128
  to_html(doc)
114
129
  end
@@ -120,8 +135,7 @@ class Sanitize
120
135
  def fragment(html)
121
136
  return '' unless html
122
137
 
123
- html = preprocess(html)
124
- frag = Nokogiri::HTML5.fragment(html)
138
+ frag = Nokogiri::HTML5.fragment(preprocess(html), **@config[:parser_options])
125
139
  node!(frag)
126
140
  to_html(frag)
127
141
  end
@@ -133,20 +147,20 @@ class Sanitize
133
147
  # in place.
134
148
  #
135
149
  # If _node_ is a `Nokogiri::XML::Document`, the `<html>` element must be
136
- # whitelisted or an error will be raised.
150
+ # allowlisted or an error will be raised.
137
151
  def node!(node)
138
152
  raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
139
153
 
140
154
  if node.is_a?(Nokogiri::XML::Document)
141
155
  unless @config[:elements].include?('html')
142
- raise Error, 'When sanitizing a document, "<html>" must be whitelisted.'
156
+ raise Error, 'When sanitizing a document, "<html>" must be allowlisted.'
143
157
  end
144
158
  end
145
159
 
146
- node_whitelist = Set.new
160
+ node_allowlist = Set.new
147
161
 
148
162
  traverse(node) do |n|
149
- transform_node!(n, node_whitelist)
163
+ transform_node!(n, node_allowlist)
150
164
  end
151
165
 
152
166
  node
@@ -175,7 +189,7 @@ class Sanitize
175
189
  node.to_html(preserve_newline: true)
176
190
  end
177
191
 
178
- def transform_node!(node, node_whitelist)
192
+ def transform_node!(node, node_allowlist)
179
193
  @transformers.each do |transformer|
180
194
  # Since transform_node! may be called in a tight loop to process thousands
181
195
  # of items, we can optimize both memory and CPU performance by:
@@ -185,15 +199,19 @@ class Sanitize
185
199
  # does merge! create a new hash, it is also 2.6x slower:
186
200
  # https://github.com/JuanitoFatas/fast-ruby#hashmerge-vs-hashmerge-code
187
201
  config = @transformer_config
188
- config[:is_whitelisted] = node_whitelist.include?(node)
202
+ config[:is_allowlisted] = config[:is_whitelisted] = node_allowlist.include?(node)
189
203
  config[:node] = node
190
204
  config[:node_name] = node.name.downcase
191
- config[:node_whitelist] = node_whitelist
205
+ config[:node_allowlist] = config[:node_whitelist] = node_allowlist
206
+
207
+ result = transformer.call(**config)
192
208
 
193
- result = transformer.call(config)
209
+ if result.is_a?(Hash)
210
+ result_allowlist = result[:node_allowlist] || result[:node_whitelist]
194
211
 
195
- if result.is_a?(Hash) && result[:node_whitelist].respond_to?(:each)
196
- node_whitelist.merge(result[:node_whitelist])
212
+ if result_allowlist.respond_to?(:each)
213
+ node_allowlist.merge(result_allowlist)
214
+ end
197
215
  end
198
216
  end
199
217
 
@@ -56,6 +56,10 @@ class Sanitize
56
56
  # that all HTML will be stripped).
57
57
  :elements => [],
58
58
 
59
+ # HTML parsing options to pass to Nokogumbo.
60
+ # https://github.com/rubys/nokogumbo/tree/v2.0.1#parsing-options
61
+ :parser_options => {},
62
+
59
63
  # URL handling protocols to allow in specific attributes. By default, no
60
64
  # protocols are allowed. Use :relative in place of a protocol if you want
61
65
  # to allow relative URLs sans protocol.
@@ -70,7 +74,7 @@ class Sanitize
70
74
  # the specified elements (when filtered) will be removed, and the contents
71
75
  # of all other filtered elements will be left behind.
72
76
  :remove_contents => %w[
73
- iframe noembed noframes noscript script style
77
+ iframe math noembed noframes noscript plaintext script style svg xmp
74
78
  ],
75
79
 
76
80
  # Transformers allow you to filter or alter nodes using custom logic. See
@@ -6,7 +6,7 @@ class Sanitize
6
6
  :elements => BASIC[:elements] + %w[
7
7
  address article aside bdi bdo body caption col colgroup data del div
8
8
  figcaption figure footer h1 h2 h3 h4 h5 h6 head header hgroup hr html
9
- img ins main nav rp rt ruby section span style summary sup table tbody
9
+ img ins main nav rp rt ruby section span style summary table tbody
10
10
  td tfoot th thead title tr wbr
11
11
  ],
12
12
 
@@ -175,7 +175,7 @@ class Sanitize; class CSS
175
175
  next prop
176
176
 
177
177
  when :semicolon
178
- # Only preserve the semicolon if it was preceded by a whitelisted
178
+ # Only preserve the semicolon if it was preceded by an allowlisted
179
179
  # property. Otherwise, omit it in order to prevent redundant semicolons.
180
180
  if preceded_by_property
181
181
  preceded_by_property = false
@@ -296,7 +296,7 @@ class Sanitize; class CSS
296
296
  end
297
297
 
298
298
  # Returns `true` if the given node (which may be of type `:url` or
299
- # `:function`, since the CSS syntax can produce both) uses a whitelisted
299
+ # `:function`, since the CSS syntax can produce both) uses an allowlisted
300
300
  # protocol.
301
301
  def valid_url?(node)
302
302
  type = node[:node]
@@ -6,7 +6,7 @@ class Sanitize; module Transformers
6
6
  node = env[:node]
7
7
 
8
8
  if node.type == Nokogiri::XML::Node::COMMENT_NODE
9
- node.unlink unless env[:is_whitelisted]
9
+ node.unlink unless env[:is_allowlisted]
10
10
  end
11
11
  end
12
12
 
@@ -1,6 +1,6 @@
1
1
  class Sanitize; module Transformers; module CSS
2
2
 
3
- # Enforces a CSS whitelist on the contents of `style` attributes.
3
+ # Enforces a CSS allowlist on the contents of `style` attributes.
4
4
  class CleanAttribute
5
5
  def initialize(sanitizer_or_config)
6
6
  if Sanitize::CSS === sanitizer_or_config
@@ -14,7 +14,7 @@ class CleanAttribute
14
14
  node = env[:node]
15
15
 
16
16
  return unless node.type == Nokogiri::XML::Node::ELEMENT_NODE &&
17
- node.key?('style') && !env[:is_whitelisted]
17
+ node.key?('style') && !env[:is_allowlisted]
18
18
 
19
19
  attr = node.attribute('style')
20
20
  css = @scss.properties(attr.value)
@@ -27,7 +27,7 @@ class CleanAttribute
27
27
  end
28
28
  end
29
29
 
30
- # Enforces a CSS whitelist on the contents of `<style>` elements.
30
+ # Enforces a CSS allowlist on the contents of `<style>` elements.
31
31
  class CleanElement
32
32
  def initialize(sanitizer_or_config)
33
33
  if Sanitize::CSS === sanitizer_or_config
@@ -3,7 +3,7 @@
3
3
  class Sanitize; module Transformers
4
4
 
5
5
  CleanDoctype = lambda do |env|
6
- return if env[:is_whitelisted]
6
+ return if env[:is_allowlisted]
7
7
 
8
8
  node = env[:node]
9
9
 
@@ -76,11 +76,11 @@ class Sanitize; module Transformers; class CleanElement
76
76
 
77
77
  def call(env)
78
78
  node = env[:node]
79
- return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_whitelisted]
79
+ return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_allowlisted]
80
80
 
81
81
  name = env[:node_name]
82
82
 
83
- # Delete any element that isn't in the config whitelist, unless the node has
83
+ # Delete any element that isn't in the config allowlist, unless the node has
84
84
  # already been deleted from the document.
85
85
  #
86
86
  # It's important that we not try to reparent the children of a node that has
@@ -107,34 +107,31 @@ class Sanitize; module Transformers; class CleanElement
107
107
  return
108
108
  end
109
109
 
110
- attr_whitelist = @attributes[name] || @attributes[:all]
110
+ attr_allowlist = @attributes[name] || @attributes[:all]
111
111
 
112
- if attr_whitelist.nil?
113
- # Delete all attributes from elements with no whitelisted attributes.
112
+ if attr_allowlist.nil?
113
+ # Delete all attributes from elements with no allowlisted attributes.
114
114
  node.attribute_nodes.each {|attr| attr.unlink }
115
115
  else
116
- allow_data_attributes = attr_whitelist.include?(:data)
116
+ allow_data_attributes = attr_allowlist.include?(:data)
117
117
 
118
118
  # Delete any attribute that isn't allowed on this element.
119
119
  node.attribute_nodes.each do |attr|
120
120
  attr_name = attr.name.downcase
121
121
 
122
- unless attr_whitelist.include?(attr_name)
123
- # The attribute isn't whitelisted.
122
+ unless attr_allowlist.include?(attr_name)
123
+ # The attribute isn't in the allowlist, but may still be allowed if
124
+ # it's a data attribute.
124
125
 
125
- if allow_data_attributes && attr_name.start_with?('data-')
126
- # Arbitrary data attributes are allowed. If this is a data
127
- # attribute, continue.
128
- next if attr_name =~ REGEX_DATA_ATTR
126
+ unless allow_data_attributes && attr_name.start_with?('data-') && attr_name =~ REGEX_DATA_ATTR
127
+ # Either the attribute isn't a data attribute or arbitrary data
128
+ # attributes aren't allowed. Remove the attribute.
129
+ attr.unlink
130
+ next
129
131
  end
130
-
131
- # Either the attribute isn't a data attribute or arbitrary data
132
- # attributes aren't allowed. Remove the attribute.
133
- attr.unlink
134
- next
135
132
  end
136
133
 
137
- # The attribute is whitelisted.
134
+ # The attribute is allowed.
138
135
 
139
136
  # Remove any attributes that use unacceptable protocols.
140
137
  if @protocols.include?(name) && @protocols[name].include?(attr_name)
@@ -162,7 +159,7 @@ class Sanitize; module Transformers; class CleanElement
162
159
  # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
163
160
  # attempt to preserve server-side includes. This can result in XSS since
164
161
  # an unescaped double quote can allow an attacker to inject a
165
- # non-whitelisted attribute.
162
+ # non-allowlisted attribute.
166
163
  #
167
164
  # Sanitize works around this by implementing its own escaping for
168
165
  # affected attributes, some of which can exist on any element and some
@@ -191,7 +188,7 @@ class Sanitize; module Transformers; class CleanElement
191
188
  # Element-specific special cases.
192
189
  case name
193
190
 
194
- # If this is a whitelisted iframe that has children, remove all its
191
+ # If this is an allowlisted iframe that has children, remove all its
195
192
  # children. The HTML standard says iframes shouldn't have content, but when
196
193
  # they do, this content is parsed as text and is serialized verbatim without
197
194
  # being escaped, which is unsafe because legacy browsers may still render it
@@ -1,5 +1,5 @@
1
1
  # encoding: utf-8
2
2
 
3
3
  class Sanitize
4
- VERSION = '5.0.0'
4
+ VERSION = '5.2.3'
5
5
  end
@@ -1,34 +1,3 @@
1
1
  # encoding: utf-8
2
- gem 'minitest'
3
2
  require 'minitest/autorun'
4
-
5
3
  require_relative '../lib/sanitize'
6
-
7
- # Helper to stub an instance method. Shamelessly stolen from
8
- # https://github.com/codeodor/minitest-stub_any_instance/
9
- class Object
10
- def self.stub_instance(name, value, &block)
11
- old_method = "__stubbed_method_#{name}__"
12
-
13
- class_eval do
14
- alias_method old_method, name
15
-
16
- define_method(name) do |*args|
17
- if value.respond_to?(:call) then
18
- value.call(*args)
19
- else
20
- value
21
- end
22
- end
23
- end
24
-
25
- yield
26
-
27
- ensure
28
- class_eval do
29
- undef_method name
30
- alias_method name, old_method
31
- undef_method old_method
32
- end
33
- end
34
- end
@@ -162,7 +162,7 @@ describe 'Sanitize::Transformers::CleanElement' do
162
162
  }
163
163
 
164
164
  describe 'Default config' do
165
- it 'should remove non-whitelisted elements, leaving safe contents behind' do
165
+ it 'should remove non-allowlisted elements, leaving safe contents behind' do
166
166
  Sanitize.fragment('foo <b>bar</b> <strong><a href="#a">baz</a></strong> quux')
167
167
  .must_equal 'foo bar baz quux'
168
168
 
@@ -192,21 +192,16 @@ describe 'Sanitize::Transformers::CleanElement' do
192
192
  .must_equal ''
193
193
  end
194
194
 
195
- it 'should escape the content of removed `plaintext` elements' do
196
- Sanitize.fragment('<plaintext>hello! <script>alert(0)</script>')
197
- .must_equal 'hello! &lt;script&gt;alert(0)&lt;/script&gt;'
198
- end
199
-
200
- it 'should escape the content of removed `xmp` elements' do
201
- Sanitize.fragment('<xmp>hello! <script>alert(0)</script></xmp>')
202
- .must_equal 'hello! &lt;script&gt;alert(0)&lt;/script&gt;'
203
- end
204
-
205
195
  it 'should not preserve the content of removed `iframe` elements' do
206
196
  Sanitize.fragment('<iframe>hello! <script>alert(0)</script></iframe>')
207
197
  .must_equal ''
208
198
  end
209
199
 
200
+ it 'should not preserve the content of removed `math` elements' do
201
+ Sanitize.fragment('<math>hello! <script>alert(0)</script></math>')
202
+ .must_equal ''
203
+ end
204
+
210
205
  it 'should not preserve the content of removed `noembed` elements' do
211
206
  Sanitize.fragment('<noembed>hello! <script>alert(0)</script></noembed>')
212
207
  .must_equal ''
@@ -222,6 +217,11 @@ describe 'Sanitize::Transformers::CleanElement' do
222
217
  .must_equal ''
223
218
  end
224
219
 
220
+ it 'should not preserve the content of removed `plaintext` elements' do
221
+ Sanitize.fragment('<plaintext>hello! <script>alert(0)</script>')
222
+ .must_equal ''
223
+ end
224
+
225
225
  it 'should not preserve the content of removed `script` elements' do
226
226
  Sanitize.fragment('<script>hello! <script>alert(0)</script></script>')
227
227
  .must_equal ''
@@ -232,6 +232,16 @@ describe 'Sanitize::Transformers::CleanElement' do
232
232
  .must_equal ''
233
233
  end
234
234
 
235
+ it 'should not preserve the content of removed `svg` elements' do
236
+ Sanitize.fragment('<svg>hello! <script>alert(0)</script></svg>')
237
+ .must_equal ''
238
+ end
239
+
240
+ it 'should not preserve the content of removed `xmp` elements' do
241
+ Sanitize.fragment('<xmp>hello! <script>alert(0)</script></xmp>')
242
+ .must_equal ''
243
+ end
244
+
235
245
  strings.each do |name, data|
236
246
  it "should clean #{name} HTML" do
237
247
  Sanitize.fragment(data[:html]).must_equal(data[:default])
@@ -315,7 +325,7 @@ describe 'Sanitize::Transformers::CleanElement' do
315
325
  end
316
326
 
317
327
  describe 'Custom configs' do
318
- it 'should allow attributes on all elements if whitelisted under :all' do
328
+ it 'should allow attributes on all elements if allowlisted under :all' do
319
329
  input = '<p class="foo">bar</p>'
320
330
 
321
331
  Sanitize.fragment(input).must_equal ' bar '
@@ -336,7 +346,7 @@ describe 'Sanitize::Transformers::CleanElement' do
336
346
  }).must_equal input
337
347
  end
338
348
 
339
- it "should not allow relative URLs when relative URLs aren't whitelisted" do
349
+ it "should not allow relative URLs when relative URLs aren't allowlisted" do
340
350
  input = '<a href="/foo/bar">Link</a>'
341
351
 
342
352
  Sanitize.fragment(input,
@@ -400,7 +410,7 @@ describe 'Sanitize::Transformers::CleanElement' do
400
410
  ).must_equal 'foo bar baz hi '
401
411
  end
402
412
 
403
- it 'should remove the contents of whitelisted iframes' do
413
+ it 'should remove the contents of allowlisted iframes' do
404
414
  Sanitize.fragment('<iframe>hi <script>hello</script></iframe>',
405
415
  :elements => ['iframe']
406
416
  ).must_equal '<iframe></iframe>'
@@ -481,6 +491,22 @@ describe 'Sanitize::Transformers::CleanElement' do
481
491
  }).must_equal "<a>Text</a>"
482
492
  end
483
493
 
494
+ it 'should sanitize protocols in data attributes even if data attributes are generically allowed' do
495
+ input = '<a data-url="mailto:someone@example.com">Text</a>'
496
+
497
+ Sanitize.fragment(input, {
498
+ :elements => ['a'],
499
+ :attributes => {'a' => [:data]},
500
+ :protocols => {'a' => {'data-url' => ['https']}}
501
+ }).must_equal "<a>Text</a>"
502
+
503
+ Sanitize.fragment(input, {
504
+ :elements => ['a'],
505
+ :attributes => {'a' => [:data]},
506
+ :protocols => {'a' => {'data-url' => ['mailto']}}
507
+ }).must_equal input
508
+ end
509
+
484
510
  it 'should prevent `<meta>` tags from being used to set a non-UTF-8 charset' do
485
511
  Sanitize.document('<html><head><meta charset="utf-8"></head><body>Howdy!</body></html>',
486
512
  :elements => %w[html head meta body],