sanitize 4.6.6 → 6.0.0

Sign up to get free protection for your applications and access to all features.

Potentially problematic release.


This version of sanitize might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  class Sanitize; module Transformers; module CSS
2
2
 
3
- # Enforces a CSS whitelist on the contents of `style` attributes.
3
+ # Enforces a CSS allowlist on the contents of `style` attributes.
4
4
  class CleanAttribute
5
5
  def initialize(sanitizer_or_config)
6
6
  if Sanitize::CSS === sanitizer_or_config
@@ -14,7 +14,7 @@ class CleanAttribute
14
14
  node = env[:node]
15
15
 
16
16
  return unless node.type == Nokogiri::XML::Node::ELEMENT_NODE &&
17
- node.key?('style') && !env[:is_whitelisted]
17
+ node.key?('style') && !env[:is_allowlisted]
18
18
 
19
19
  attr = node.attribute('style')
20
20
  css = @scss.properties(attr.value)
@@ -27,7 +27,7 @@ class CleanAttribute
27
27
  end
28
28
  end
29
29
 
30
- # Enforces a CSS whitelist on the contents of `<style>` elements.
30
+ # Enforces a CSS allowlist on the contents of `<style>` elements.
31
31
  class CleanElement
32
32
  def initialize(sanitizer_or_config)
33
33
  if Sanitize::CSS === sanitizer_or_config
@@ -3,7 +3,7 @@
3
3
  class Sanitize; module Transformers
4
4
 
5
5
  CleanDoctype = lambda do |env|
6
- return if env[:is_whitelisted]
6
+ return if env[:is_allowlisted]
7
7
 
8
8
  node = env[:node]
9
9
 
@@ -67,7 +67,7 @@ class Sanitize; module Transformers; class CleanElement
67
67
  @whitespace_elements = config[:whitespace_elements]
68
68
  end
69
69
 
70
- if config[:remove_contents].is_a?(Set)
70
+ if config[:remove_contents].is_a?(Enumerable)
71
71
  @remove_element_contents.merge(config[:remove_contents].map(&:to_s))
72
72
  else
73
73
  @remove_all_contents = !!config[:remove_contents]
@@ -76,11 +76,11 @@ class Sanitize; module Transformers; class CleanElement
76
76
 
77
77
  def call(env)
78
78
  node = env[:node]
79
- return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_whitelisted]
79
+ return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_allowlisted]
80
80
 
81
81
  name = env[:node_name]
82
82
 
83
- # Delete any element that isn't in the config whitelist, unless the node has
83
+ # Delete any element that isn't in the config allowlist, unless the node has
84
84
  # already been deleted from the document.
85
85
  #
86
86
  # It's important that we not try to reparent the children of a node that has
@@ -97,42 +97,41 @@ class Sanitize; module Transformers; class CleanElement
97
97
  end
98
98
  end
99
99
 
100
- unless @remove_all_contents || @remove_element_contents.include?(name)
101
- node.add_previous_sibling(node.children)
100
+ unless node.children.empty?
101
+ unless @remove_all_contents || @remove_element_contents.include?(name)
102
+ node.add_previous_sibling(node.children)
103
+ end
102
104
  end
103
105
 
104
106
  node.unlink
105
107
  return
106
108
  end
107
109
 
108
- attr_whitelist = @attributes[name] || @attributes[:all]
110
+ attr_allowlist = @attributes[name] || @attributes[:all]
109
111
 
110
- if attr_whitelist.nil?
111
- # Delete all attributes from elements with no whitelisted attributes.
112
+ if attr_allowlist.nil?
113
+ # Delete all attributes from elements with no allowlisted attributes.
112
114
  node.attribute_nodes.each {|attr| attr.unlink }
113
115
  else
114
- allow_data_attributes = attr_whitelist.include?(:data)
116
+ allow_data_attributes = attr_allowlist.include?(:data)
115
117
 
116
118
  # Delete any attribute that isn't allowed on this element.
117
119
  node.attribute_nodes.each do |attr|
118
120
  attr_name = attr.name.downcase
119
121
 
120
- unless attr_whitelist.include?(attr_name)
121
- # The attribute isn't whitelisted.
122
+ unless attr_allowlist.include?(attr_name)
123
+ # The attribute isn't in the allowlist, but may still be allowed if
124
+ # it's a data attribute.
122
125
 
123
- if allow_data_attributes && attr_name.start_with?('data-')
124
- # Arbitrary data attributes are allowed. If this is a data
125
- # attribute, continue.
126
- next if attr_name =~ REGEX_DATA_ATTR
126
+ unless allow_data_attributes && attr_name.start_with?('data-') && attr_name =~ REGEX_DATA_ATTR
127
+ # Either the attribute isn't a data attribute or arbitrary data
128
+ # attributes aren't allowed. Remove the attribute.
129
+ attr.unlink
130
+ next
127
131
  end
128
-
129
- # Either the attribute isn't a data attribute or arbitrary data
130
- # attributes aren't allowed. Remove the attribute.
131
- attr.unlink
132
- next
133
132
  end
134
133
 
135
- # The attribute is whitelisted.
134
+ # The attribute is allowed.
136
135
 
137
136
  # Remove any attributes that use unacceptable protocols.
138
137
  if @protocols.include?(name) && @protocols[name].include?(attr_name)
@@ -160,12 +159,17 @@ class Sanitize; module Transformers; class CleanElement
160
159
  # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
161
160
  # attempt to preserve server-side includes. This can result in XSS since
162
161
  # an unescaped double quote can allow an attacker to inject a
163
- # non-whitelisted attribute.
162
+ # non-allowlisted attribute.
164
163
  #
165
164
  # Sanitize works around this by implementing its own escaping for
166
165
  # affected attributes, some of which can exist on any element and some
167
166
  # of which can only exist on `<a>` elements.
168
167
  #
168
+ # This fix is technically no longer necessary with Nokogumbo >= 2.0
169
+ # since it no longer uses libxml2's serializer, but it's retained to
170
+ # avoid breaking use cases where people might be sanitizing individual
171
+ # Nokogiri nodes and then serializing them manually without Nokogumbo.
172
+ #
169
173
  # The relevant libxml2 code is here:
170
174
  # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
171
175
  if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
@@ -180,6 +184,40 @@ class Sanitize; module Transformers; class CleanElement
180
184
  if @add_attributes.include?(name)
181
185
  @add_attributes[name].each {|key, val| node[key] = val }
182
186
  end
187
+
188
+ # Element-specific special cases.
189
+ case name
190
+
191
+ # If this is an allowlisted iframe that has children, remove all its
192
+ # children. The HTML standard says iframes shouldn't have content, but when
193
+ # they do, this content is parsed as text and is serialized verbatim without
194
+ # being escaped, which is unsafe because legacy browsers may still render it
195
+ # and execute `<script>` content. So the safe and correct thing to do is to
196
+ # always remove iframe content.
197
+ when 'iframe'
198
+ if !node.children.empty?
199
+ node.children.each do |child|
200
+ child.unlink
201
+ end
202
+ end
203
+
204
+ # Prevent the use of `<meta>` elements that set a charset other than UTF-8,
205
+ # since Sanitize's output is always UTF-8.
206
+ when 'meta'
207
+ if node.has_attribute?('charset') &&
208
+ node['charset'].downcase != 'utf-8'
209
+
210
+ node['charset'] = 'utf-8'
211
+ end
212
+
213
+ if node.has_attribute?('http-equiv') &&
214
+ node.has_attribute?('content') &&
215
+ node['http-equiv'].downcase == 'content-type' &&
216
+ node['content'].downcase =~ /;\s*charset\s*=\s*(?!utf-8)/
217
+
218
+ node['content'] = node['content'].gsub(/;\s*charset\s*=.+\z/, ';charset=utf-8')
219
+ end
220
+ end
183
221
  end
184
222
 
185
223
  end; end; end
@@ -1,5 +1,5 @@
1
1
  # encoding: utf-8
2
2
 
3
3
  class Sanitize
4
- VERSION = '4.6.6'
4
+ VERSION = '6.0.0'
5
5
  end
data/lib/sanitize.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  # encoding: utf-8
2
2
 
3
- require 'nokogumbo'
3
+ require 'nokogiri'
4
4
  require 'set'
5
5
 
6
6
  require_relative 'sanitize/version'
@@ -19,6 +19,20 @@ require_relative 'sanitize/transformers/clean_element'
19
19
  class Sanitize
20
20
  attr_reader :config
21
21
 
22
+ # Matches one or more control characters that should be removed from HTML
23
+ # before parsing, as defined by the HTML living standard.
24
+ #
25
+ # - https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
26
+ # - https://infra.spec.whatwg.org/#control
27
+ REGEX_HTML_CONTROL_CHARACTERS = /[\u0001-\u0008\u000b\u000e-\u001f\u007f-\u009f]+/u
28
+
29
+ # Matches one or more non-characters that should be removed from HTML before
30
+ # parsing, as defined by the HTML living standard.
31
+ #
32
+ # - https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
33
+ # - https://infra.spec.whatwg.org/#noncharacter
34
+ REGEX_HTML_NON_CHARACTERS = /[\ufdd0-\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}]+/u
35
+
22
36
  # Matches an attribute value that could be treated by a browser as a URL
23
37
  # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
24
38
  # or more characters followed by a colon is considered a match, even if the
@@ -26,11 +40,12 @@ class Sanitize
26
40
  # IE6 and Opera will still parse).
27
41
  REGEX_PROTOCOL = /\A\s*([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i
28
42
 
29
- # Matches Unicode characters that should be stripped from HTML before passing
30
- # it to the parser.
43
+ # Matches one or more characters that should be stripped from HTML before
44
+ # parsing. This is a combination of `REGEX_HTML_CONTROL_CHARACTERS` and
45
+ # `REGEX_HTML_NON_CHARACTERS`.
31
46
  #
32
- # http://www.w3.org/TR/unicode-xml/#Charlist
33
- REGEX_UNSUITABLE_CHARS = /[\u0000\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u
47
+ # https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
48
+ REGEX_UNSUITABLE_CHARS = /(?:#{REGEX_HTML_CONTROL_CHARACTERS}|#{REGEX_HTML_NON_CHARACTERS})/u
34
49
 
35
50
  #--
36
51
  # Class Methods
@@ -39,7 +54,7 @@ class Sanitize
39
54
  # Returns a sanitized copy of the given full _html_ document, using the
40
55
  # settings in _config_ if specified.
41
56
  #
42
- # When sanitizing a document, the `<html>` element must be whitelisted or an
57
+ # When sanitizing a document, the `<html>` element must be allowlisted or an
43
58
  # error will be raised. If this is undesirable, you should probably use
44
59
  # {#fragment} instead.
45
60
  def self.document(html, config = {})
@@ -102,13 +117,13 @@ class Sanitize
102
117
 
103
118
  # Returns a sanitized copy of the given _html_ document.
104
119
  #
105
- # When sanitizing a document, the `<html>` element must be whitelisted or an
120
+ # When sanitizing a document, the `<html>` element must be allowlisted or an
106
121
  # error will be raised. If this is undesirable, you should probably use
107
122
  # {#fragment} instead.
108
123
  def document(html)
109
124
  return '' unless html
110
125
 
111
- doc = Nokogiri::HTML5.parse(preprocess(html))
126
+ doc = Nokogiri::HTML5.parse(preprocess(html), **@config[:parser_options])
112
127
  node!(doc)
113
128
  to_html(doc)
114
129
  end
@@ -120,20 +135,7 @@ class Sanitize
120
135
  def fragment(html)
121
136
  return '' unless html
122
137
 
123
- html = preprocess(html)
124
- doc = Nokogiri::HTML5.parse("<html><body>#{html}")
125
-
126
- # Hack to allow fragments containing <body>. Borrowed from
127
- # Nokogiri::HTML::DocumentFragment.
128
- if html =~ /\A<body(?:\s|>)/i
129
- path = '/html/body'
130
- else
131
- path = '/html/body/node()'
132
- end
133
-
134
- frag = doc.fragment
135
- frag << doc.xpath(path)
136
-
138
+ frag = Nokogiri::HTML5.fragment(preprocess(html), **@config[:parser_options])
137
139
  node!(frag)
138
140
  to_html(frag)
139
141
  end
@@ -145,20 +147,20 @@ class Sanitize
145
147
  # in place.
146
148
  #
147
149
  # If _node_ is a `Nokogiri::XML::Document`, the `<html>` element must be
148
- # whitelisted or an error will be raised.
150
+ # allowlisted or an error will be raised.
149
151
  def node!(node)
150
152
  raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
151
153
 
152
154
  if node.is_a?(Nokogiri::XML::Document)
153
155
  unless @config[:elements].include?('html')
154
- raise Error, 'When sanitizing a document, "<html>" must be whitelisted.'
156
+ raise Error, 'When sanitizing a document, "<html>" must be allowlisted.'
155
157
  end
156
158
  end
157
159
 
158
- node_whitelist = Set.new
160
+ node_allowlist = Set.new
159
161
 
160
162
  traverse(node) do |n|
161
- transform_node!(n, node_whitelist)
163
+ transform_node!(n, node_allowlist)
162
164
  end
163
165
 
164
166
  node
@@ -184,40 +186,10 @@ class Sanitize
184
186
  end
185
187
 
186
188
  def to_html(node)
187
- replace_meta = false
188
-
189
- # Hacky workaround for a libxml2 bug that adds an undesired Content-Type
190
- # meta tag to all serialized HTML documents.
191
- #
192
- # https://github.com/sparklemotion/nokogiri/issues/1008
193
- if node.type == Nokogiri::XML::Node::DOCUMENT_NODE ||
194
- node.type == Nokogiri::XML::Node::HTML_DOCUMENT_NODE
195
-
196
- regex_meta = %r|(<html[^>]*>\s*<head[^>]*>\s*)<meta http-equiv="Content-Type" content="text/html; charset=utf-8">|i
197
-
198
- # Only replace the content-type meta tag if <meta> isn't whitelisted or
199
- # the original document didn't actually include a content-type meta tag.
200
- replace_meta = !@config[:elements].include?('meta') ||
201
- node.xpath('/html/head/meta[@http-equiv]').none? do |meta|
202
- meta['http-equiv'].casecmp('content-type').zero?
203
- end
204
- end
205
-
206
- so = Nokogiri::XML::Node::SaveOptions
207
-
208
- # Serialize to HTML without any formatting to prevent Nokogiri from adding
209
- # newlines after certain tags.
210
- html = node.to_html(
211
- :encoding => 'utf-8',
212
- :indent => 0,
213
- :save_with => so::NO_DECLARATION | so::NO_EMPTY_TAGS | so::AS_HTML
214
- )
215
-
216
- html.gsub!(regex_meta, '\1') if replace_meta
217
- html
189
+ node.to_html(preserve_newline: true)
218
190
  end
219
191
 
220
- def transform_node!(node, node_whitelist)
192
+ def transform_node!(node, node_allowlist)
221
193
  @transformers.each do |transformer|
222
194
  # Since transform_node! may be called in a tight loop to process thousands
223
195
  # of items, we can optimize both memory and CPU performance by:
@@ -227,15 +199,19 @@ class Sanitize
227
199
  # does merge! create a new hash, it is also 2.6x slower:
228
200
  # https://github.com/JuanitoFatas/fast-ruby#hashmerge-vs-hashmerge-code
229
201
  config = @transformer_config
230
- config[:is_whitelisted] = node_whitelist.include?(node)
202
+ config[:is_allowlisted] = config[:is_whitelisted] = node_allowlist.include?(node)
231
203
  config[:node] = node
232
204
  config[:node_name] = node.name.downcase
233
- config[:node_whitelist] = node_whitelist
205
+ config[:node_allowlist] = config[:node_whitelist] = node_allowlist
234
206
 
235
- result = transformer.call(config)
207
+ result = transformer.call(**config)
236
208
 
237
- if result.is_a?(Hash) && result[:node_whitelist].respond_to?(:each)
238
- node_whitelist.merge(result[:node_whitelist])
209
+ if result.is_a?(Hash)
210
+ result_allowlist = result[:node_allowlist] || result[:node_whitelist]
211
+
212
+ if result_allowlist.respond_to?(:each)
213
+ node_allowlist.merge(result_allowlist)
214
+ end
239
215
  end
240
216
  end
241
217
 
data/test/common.rb CHANGED
@@ -1,34 +1,3 @@
1
1
  # encoding: utf-8
2
- gem 'minitest'
3
2
  require 'minitest/autorun'
4
-
5
3
  require_relative '../lib/sanitize'
6
-
7
- # Helper to stub an instance method. Shamelessly stolen from
8
- # https://github.com/codeodor/minitest-stub_any_instance/
9
- class Object
10
- def self.stub_instance(name, value, &block)
11
- old_method = "__stubbed_method_#{name}__"
12
-
13
- class_eval do
14
- alias_method old_method, name
15
-
16
- define_method(name) do |*args|
17
- if value.respond_to?(:call) then
18
- value.call(*args)
19
- else
20
- value
21
- end
22
- end
23
- end
24
-
25
- yield
26
-
27
- ensure
28
- class_eval do
29
- undef_method name
30
- alias_method name, old_method
31
- undef_method old_method
32
- end
33
- end
34
- end
@@ -20,7 +20,7 @@ describe 'Sanitize::Transformers::CleanComment' do
20
20
 
21
21
  # Special case: the comment markup is inside a <script>, which makes it
22
22
  # text content and not an actual HTML comment.
23
- @s.fragment("<script><!-- comment --></script>").must_equal '&lt;!-- comment --&gt;'
23
+ @s.fragment("<script><!-- comment --></script>").must_equal ''
24
24
 
25
25
  Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => false, :elements => ['script'])
26
26
  .must_equal '<script><!-- comment --></script>'
@@ -40,10 +40,6 @@ describe 'Sanitize::Transformers::CleanComment' do
40
40
  @s.fragment("foo <!-- <!-- <!-- --> --> -->bar").must_equal 'foo <!-- <!-- <!-- --> --&gt; --&gt;bar'
41
41
  @s.fragment("foo <div <!-- comment -->>bar</div>").must_equal 'foo <div>&gt;bar</div>'
42
42
 
43
- # Special case: the comment markup is inside a <script>, which makes it
44
- # text content and not an actual HTML comment.
45
- @s.fragment("<script><!-- comment --></script>").must_equal '&lt;!-- comment --&gt;'
46
-
47
43
  Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => true, :elements => ['script'])
48
44
  .must_equal '<script><!-- comment --></script>'
49
45
  end
@@ -13,7 +13,7 @@ describe 'Sanitize::Transformers::CSS::CleanAttribute' do
13
13
  @s.fragment(%[
14
14
  <div style="color: #fff; width: expression(alert(1)); /* <-- evil! */"></div>
15
15
  ].strip).must_equal %[
16
- <div style="color: #fff; /* &lt;-- evil! */"></div>
16
+ <div style="color: #fff; /* <-- evil! */"></div>
17
17
  ].strip
18
18
  end
19
19
 
@@ -11,7 +11,7 @@ describe 'Sanitize::Transformers::CleanDoctype' do
11
11
  end
12
12
 
13
13
  it 'should remove doctype declarations' do
14
- @s.document('<!DOCTYPE html><html>foo</html>').must_equal "<html>foo</html>\n"
14
+ @s.document('<!DOCTYPE html><html>foo</html>').must_equal "<html>foo</html>"
15
15
  @s.fragment('<!DOCTYPE html>foo').must_equal 'foo'
16
16
  end
17
17
 
@@ -34,27 +34,27 @@ describe 'Sanitize::Transformers::CleanDoctype' do
34
34
 
35
35
  it 'should allow doctype declarations in documents' do
36
36
  @s.document('<!DOCTYPE html><html>foo</html>')
37
- .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
37
+ .must_equal "<!DOCTYPE html><html>foo</html>"
38
38
 
39
39
  @s.document('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
40
- .must_equal "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01//EN\">\n<html>foo</html>\n"
40
+ .must_equal "<!DOCTYPE html><html>foo</html>"
41
41
 
42
42
  @s.document("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>")
43
- .must_equal "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html>foo</html>\n"
43
+ .must_equal "<!DOCTYPE html><html>foo</html>"
44
44
  end
45
45
 
46
46
  it 'should not allow obviously invalid doctype declarations in documents' do
47
47
  @s.document('<!DOCTYPE blah blah blah><html>foo</html>')
48
- .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
48
+ .must_equal "<!DOCTYPE html><html>foo</html>"
49
49
 
50
50
  @s.document('<!DOCTYPE blah><html>foo</html>')
51
- .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
51
+ .must_equal "<!DOCTYPE html><html>foo</html>"
52
52
 
53
53
  @s.document('<!DOCTYPE html BLAH "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
54
- .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
54
+ .must_equal "<!DOCTYPE html><html>foo</html>"
55
55
 
56
56
  @s.document('<!whatever><html>foo</html>')
57
- .must_equal "<html>foo</html>\n"
57
+ .must_equal "<html>foo</html>"
58
58
  end
59
59
 
60
60
  it 'should not allow doctype definitions in fragments' do