sanitize 4.6.6 → 5.2.2

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of sanitize might be problematic. See the release details for more information.

@@ -19,6 +19,20 @@ require_relative 'sanitize/transformers/clean_element'
19
19
  class Sanitize
20
20
  attr_reader :config
21
21
 
22
+ # Matches one or more control characters that should be removed from HTML
23
+ # before parsing, as defined by the HTML living standard.
24
+ #
25
+ # - https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
26
+ # - https://infra.spec.whatwg.org/#control
27
+ REGEX_HTML_CONTROL_CHARACTERS = /[\u0001-\u0008\u000b\u000e-\u001f\u007f-\u009f]+/u
28
+
29
+ # Matches one or more non-characters that should be removed from HTML before
30
+ # parsing, as defined by the HTML living standard.
31
+ #
32
+ # - https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
33
+ # - https://infra.spec.whatwg.org/#noncharacter
34
+ REGEX_HTML_NON_CHARACTERS = /[\ufdd0-\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}]+/u
35
+
22
36
  # Matches an attribute value that could be treated by a browser as a URL
23
37
  # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
24
38
  # or more characters followed by a colon is considered a match, even if the
@@ -26,11 +40,12 @@ class Sanitize
26
40
  # IE6 and Opera will still parse).
27
41
  REGEX_PROTOCOL = /\A\s*([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i
28
42
 
29
- # Matches Unicode characters that should be stripped from HTML before passing
30
- # it to the parser.
43
+ # Matches one or more characters that should be stripped from HTML before
44
+ # parsing. This is a combination of `REGEX_HTML_CONTROL_CHARACTERS` and
45
+ # `REGEX_HTML_NON_CHARACTERS`.
31
46
  #
32
- # http://www.w3.org/TR/unicode-xml/#Charlist
33
- REGEX_UNSUITABLE_CHARS = /[\u0000\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u
47
+ # https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
48
+ REGEX_UNSUITABLE_CHARS = /(?:#{REGEX_HTML_CONTROL_CHARACTERS}|#{REGEX_HTML_NON_CHARACTERS})/u
34
49
 
35
50
  #--
36
51
  # Class Methods
@@ -39,7 +54,7 @@ class Sanitize
39
54
  # Returns a sanitized copy of the given full _html_ document, using the
40
55
  # settings in _config_ if specified.
41
56
  #
42
- # When sanitizing a document, the `<html>` element must be whitelisted or an
57
+ # When sanitizing a document, the `<html>` element must be allowlisted or an
43
58
  # error will be raised. If this is undesirable, you should probably use
44
59
  # {#fragment} instead.
45
60
  def self.document(html, config = {})
@@ -102,13 +117,13 @@ class Sanitize
102
117
 
103
118
  # Returns a sanitized copy of the given _html_ document.
104
119
  #
105
- # When sanitizing a document, the `<html>` element must be whitelisted or an
120
+ # When sanitizing a document, the `<html>` element must be allowlisted or an
106
121
  # error will be raised. If this is undesirable, you should probably use
107
122
  # {#fragment} instead.
108
123
  def document(html)
109
124
  return '' unless html
110
125
 
111
- doc = Nokogiri::HTML5.parse(preprocess(html))
126
+ doc = Nokogiri::HTML5.parse(preprocess(html), **@config[:parser_options])
112
127
  node!(doc)
113
128
  to_html(doc)
114
129
  end
@@ -120,20 +135,7 @@ class Sanitize
120
135
  def fragment(html)
121
136
  return '' unless html
122
137
 
123
- html = preprocess(html)
124
- doc = Nokogiri::HTML5.parse("<html><body>#{html}")
125
-
126
- # Hack to allow fragments containing <body>. Borrowed from
127
- # Nokogiri::HTML::DocumentFragment.
128
- if html =~ /\A<body(?:\s|>)/i
129
- path = '/html/body'
130
- else
131
- path = '/html/body/node()'
132
- end
133
-
134
- frag = doc.fragment
135
- frag << doc.xpath(path)
136
-
138
+ frag = Nokogiri::HTML5.fragment(preprocess(html), **@config[:parser_options])
137
139
  node!(frag)
138
140
  to_html(frag)
139
141
  end
@@ -145,20 +147,20 @@ class Sanitize
145
147
  # in place.
146
148
  #
147
149
  # If _node_ is a `Nokogiri::XML::Document`, the `<html>` element must be
148
- # whitelisted or an error will be raised.
150
+ # allowlisted or an error will be raised.
149
151
  def node!(node)
150
152
  raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
151
153
 
152
154
  if node.is_a?(Nokogiri::XML::Document)
153
155
  unless @config[:elements].include?('html')
154
- raise Error, 'When sanitizing a document, "<html>" must be whitelisted.'
156
+ raise Error, 'When sanitizing a document, "<html>" must be allowlisted.'
155
157
  end
156
158
  end
157
159
 
158
- node_whitelist = Set.new
160
+ node_allowlist = Set.new
159
161
 
160
162
  traverse(node) do |n|
161
- transform_node!(n, node_whitelist)
163
+ transform_node!(n, node_allowlist)
162
164
  end
163
165
 
164
166
  node
@@ -184,40 +186,10 @@ class Sanitize
184
186
  end
185
187
 
186
188
  def to_html(node)
187
- replace_meta = false
188
-
189
- # Hacky workaround for a libxml2 bug that adds an undesired Content-Type
190
- # meta tag to all serialized HTML documents.
191
- #
192
- # https://github.com/sparklemotion/nokogiri/issues/1008
193
- if node.type == Nokogiri::XML::Node::DOCUMENT_NODE ||
194
- node.type == Nokogiri::XML::Node::HTML_DOCUMENT_NODE
195
-
196
- regex_meta = %r|(<html[^>]*>\s*<head[^>]*>\s*)<meta http-equiv="Content-Type" content="text/html; charset=utf-8">|i
197
-
198
- # Only replace the content-type meta tag if <meta> isn't whitelisted or
199
- # the original document didn't actually include a content-type meta tag.
200
- replace_meta = !@config[:elements].include?('meta') ||
201
- node.xpath('/html/head/meta[@http-equiv]').none? do |meta|
202
- meta['http-equiv'].casecmp('content-type').zero?
203
- end
204
- end
205
-
206
- so = Nokogiri::XML::Node::SaveOptions
207
-
208
- # Serialize to HTML without any formatting to prevent Nokogiri from adding
209
- # newlines after certain tags.
210
- html = node.to_html(
211
- :encoding => 'utf-8',
212
- :indent => 0,
213
- :save_with => so::NO_DECLARATION | so::NO_EMPTY_TAGS | so::AS_HTML
214
- )
215
-
216
- html.gsub!(regex_meta, '\1') if replace_meta
217
- html
189
+ node.to_html(preserve_newline: true)
218
190
  end
219
191
 
220
- def transform_node!(node, node_whitelist)
192
+ def transform_node!(node, node_allowlist)
221
193
  @transformers.each do |transformer|
222
194
  # Since transform_node! may be called in a tight loop to process thousands
223
195
  # of items, we can optimize both memory and CPU performance by:
@@ -227,15 +199,19 @@ class Sanitize
227
199
  # does merge! create a new hash, it is also 2.6x slower:
228
200
  # https://github.com/JuanitoFatas/fast-ruby#hashmerge-vs-hashmerge-code
229
201
  config = @transformer_config
230
- config[:is_whitelisted] = node_whitelist.include?(node)
202
+ config[:is_allowlisted] = config[:is_whitelisted] = node_allowlist.include?(node)
231
203
  config[:node] = node
232
204
  config[:node_name] = node.name.downcase
233
- config[:node_whitelist] = node_whitelist
205
+ config[:node_allowlist] = config[:node_whitelist] = node_allowlist
234
206
 
235
- result = transformer.call(config)
207
+ result = transformer.call(**config)
236
208
 
237
- if result.is_a?(Hash) && result[:node_whitelist].respond_to?(:each)
238
- node_whitelist.merge(result[:node_whitelist])
209
+ if result.is_a?(Hash)
210
+ result_allowlist = result[:node_allowlist] || result[:node_whitelist]
211
+
212
+ if result_allowlist.respond_to?(:each)
213
+ node_allowlist.merge(result_allowlist)
214
+ end
239
215
  end
240
216
  end
241
217
 
@@ -56,6 +56,10 @@ class Sanitize
56
56
  # that all HTML will be stripped).
57
57
  :elements => [],
58
58
 
59
+ # HTML parsing options to pass to Nokogumbo.
60
+ # https://github.com/rubys/nokogumbo/tree/v2.0.1#parsing-options
61
+ :parser_options => {},
62
+
59
63
  # URL handling protocols to allow in specific attributes. By default, no
60
64
  # protocols are allowed. Use :relative in place of a protocol if you want
61
65
  # to allow relative URLs sans protocol.
@@ -66,10 +70,12 @@ class Sanitize
66
70
  # leaves the safe parts of an element's contents behind when the element
67
71
  # is removed.
68
72
  #
69
- # If this is an Array of element names, then only the contents of the
70
- # specified elements (when filtered) will be removed, and the contents of
71
- # all other filtered elements will be left behind.
72
- :remove_contents => false,
73
+ # If this is an Array or Set of element names, then only the contents of
74
+ # the specified elements (when filtered) will be removed, and the contents
75
+ # of all other filtered elements will be left behind.
76
+ :remove_contents => %w[
77
+ iframe math noembed noframes noscript plaintext script style svg xmp
78
+ ],
73
79
 
74
80
  # Transformers allow you to filter or alter nodes using custom logic. See
75
81
  # README.md for details and examples.
@@ -6,7 +6,7 @@ class Sanitize
6
6
  :elements => BASIC[:elements] + %w[
7
7
  address article aside bdi bdo body caption col colgroup data del div
8
8
  figcaption figure footer h1 h2 h3 h4 h5 h6 head header hgroup hr html
9
- img ins main nav rp rt ruby section span style summary sup table tbody
9
+ img ins main nav rp rt ruby section span style summary table tbody
10
10
  td tfoot th thead title tr wbr
11
11
  ],
12
12
 
@@ -175,7 +175,7 @@ class Sanitize; class CSS
175
175
  next prop
176
176
 
177
177
  when :semicolon
178
- # Only preserve the semicolon if it was preceded by a whitelisted
178
+ # Only preserve the semicolon if it was preceded by an allowlisted
179
179
  # property. Otherwise, omit it in order to prevent redundant semicolons.
180
180
  if preceded_by_property
181
181
  preceded_by_property = false
@@ -296,7 +296,7 @@ class Sanitize; class CSS
296
296
  end
297
297
 
298
298
  # Returns `true` if the given node (which may be of type `:url` or
299
- # `:function`, since the CSS syntax can produce both) uses a whitelisted
299
+ # `:function`, since the CSS syntax can produce both) uses an allowlisted
300
300
  # protocol.
301
301
  def valid_url?(node)
302
302
  type = node[:node]
@@ -6,7 +6,7 @@ class Sanitize; module Transformers
6
6
  node = env[:node]
7
7
 
8
8
  if node.type == Nokogiri::XML::Node::COMMENT_NODE
9
- node.unlink unless env[:is_whitelisted]
9
+ node.unlink unless env[:is_allowlisted]
10
10
  end
11
11
  end
12
12
 
@@ -1,6 +1,6 @@
1
1
  class Sanitize; module Transformers; module CSS
2
2
 
3
- # Enforces a CSS whitelist on the contents of `style` attributes.
3
+ # Enforces a CSS allowlist on the contents of `style` attributes.
4
4
  class CleanAttribute
5
5
  def initialize(sanitizer_or_config)
6
6
  if Sanitize::CSS === sanitizer_or_config
@@ -14,7 +14,7 @@ class CleanAttribute
14
14
  node = env[:node]
15
15
 
16
16
  return unless node.type == Nokogiri::XML::Node::ELEMENT_NODE &&
17
- node.key?('style') && !env[:is_whitelisted]
17
+ node.key?('style') && !env[:is_allowlisted]
18
18
 
19
19
  attr = node.attribute('style')
20
20
  css = @scss.properties(attr.value)
@@ -27,7 +27,7 @@ class CleanAttribute
27
27
  end
28
28
  end
29
29
 
30
- # Enforces a CSS whitelist on the contents of `<style>` elements.
30
+ # Enforces a CSS allowlist on the contents of `<style>` elements.
31
31
  class CleanElement
32
32
  def initialize(sanitizer_or_config)
33
33
  if Sanitize::CSS === sanitizer_or_config
@@ -3,7 +3,7 @@
3
3
  class Sanitize; module Transformers
4
4
 
5
5
  CleanDoctype = lambda do |env|
6
- return if env[:is_whitelisted]
6
+ return if env[:is_allowlisted]
7
7
 
8
8
  node = env[:node]
9
9
 
@@ -67,7 +67,7 @@ class Sanitize; module Transformers; class CleanElement
67
67
  @whitespace_elements = config[:whitespace_elements]
68
68
  end
69
69
 
70
- if config[:remove_contents].is_a?(Set)
70
+ if config[:remove_contents].is_a?(Enumerable)
71
71
  @remove_element_contents.merge(config[:remove_contents].map(&:to_s))
72
72
  else
73
73
  @remove_all_contents = !!config[:remove_contents]
@@ -76,11 +76,11 @@ class Sanitize; module Transformers; class CleanElement
76
76
 
77
77
  def call(env)
78
78
  node = env[:node]
79
- return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_whitelisted]
79
+ return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_allowlisted]
80
80
 
81
81
  name = env[:node_name]
82
82
 
83
- # Delete any element that isn't in the config whitelist, unless the node has
83
+ # Delete any element that isn't in the config allowlist, unless the node has
84
84
  # already been deleted from the document.
85
85
  #
86
86
  # It's important that we not try to reparent the children of a node that has
@@ -97,28 +97,30 @@ class Sanitize; module Transformers; class CleanElement
97
97
  end
98
98
  end
99
99
 
100
- unless @remove_all_contents || @remove_element_contents.include?(name)
101
- node.add_previous_sibling(node.children)
100
+ unless node.children.empty?
101
+ unless @remove_all_contents || @remove_element_contents.include?(name)
102
+ node.add_previous_sibling(node.children)
103
+ end
102
104
  end
103
105
 
104
106
  node.unlink
105
107
  return
106
108
  end
107
109
 
108
- attr_whitelist = @attributes[name] || @attributes[:all]
110
+ attr_allowlist = @attributes[name] || @attributes[:all]
109
111
 
110
- if attr_whitelist.nil?
111
- # Delete all attributes from elements with no whitelisted attributes.
112
+ if attr_allowlist.nil?
113
+ # Delete all attributes from elements with no allowlisted attributes.
112
114
  node.attribute_nodes.each {|attr| attr.unlink }
113
115
  else
114
- allow_data_attributes = attr_whitelist.include?(:data)
116
+ allow_data_attributes = attr_allowlist.include?(:data)
115
117
 
116
118
  # Delete any attribute that isn't allowed on this element.
117
119
  node.attribute_nodes.each do |attr|
118
120
  attr_name = attr.name.downcase
119
121
 
120
- unless attr_whitelist.include?(attr_name)
121
- # The attribute isn't whitelisted.
122
+ unless attr_allowlist.include?(attr_name)
123
+ # The attribute isn't allowed.
122
124
 
123
125
  if allow_data_attributes && attr_name.start_with?('data-')
124
126
  # Arbitrary data attributes are allowed. If this is a data
@@ -132,7 +134,7 @@ class Sanitize; module Transformers; class CleanElement
132
134
  next
133
135
  end
134
136
 
135
- # The attribute is whitelisted.
137
+ # The attribute is allowed.
136
138
 
137
139
  # Remove any attributes that use unacceptable protocols.
138
140
  if @protocols.include?(name) && @protocols[name].include?(attr_name)
@@ -160,12 +162,17 @@ class Sanitize; module Transformers; class CleanElement
160
162
  # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
161
163
  # attempt to preserve server-side includes. This can result in XSS since
162
164
  # an unescaped double quote can allow an attacker to inject a
163
- # non-whitelisted attribute.
165
+ # non-allowlisted attribute.
164
166
  #
165
167
  # Sanitize works around this by implementing its own escaping for
166
168
  # affected attributes, some of which can exist on any element and some
167
169
  # of which can only exist on `<a>` elements.
168
170
  #
171
+ # This fix is technically no longer necessary with Nokogumbo >= 2.0
172
+ # since it no longer uses libxml2's serializer, but it's retained to
173
+ # avoid breaking use cases where people might be sanitizing individual
174
+ # Nokogiri nodes and then serializing them manually without Nokogumbo.
175
+ #
169
176
  # The relevant libxml2 code is here:
170
177
  # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
171
178
  if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
@@ -180,6 +187,40 @@ class Sanitize; module Transformers; class CleanElement
180
187
  if @add_attributes.include?(name)
181
188
  @add_attributes[name].each {|key, val| node[key] = val }
182
189
  end
190
+
191
+ # Element-specific special cases.
192
+ case name
193
+
194
+ # If this is an allowlisted iframe that has children, remove all its
195
+ # children. The HTML standard says iframes shouldn't have content, but when
196
+ # they do, this content is parsed as text and is serialized verbatim without
197
+ # being escaped, which is unsafe because legacy browsers may still render it
198
+ # and execute `<script>` content. So the safe and correct thing to do is to
199
+ # always remove iframe content.
200
+ when 'iframe'
201
+ if !node.children.empty?
202
+ node.children.each do |child|
203
+ child.unlink
204
+ end
205
+ end
206
+
207
+ # Prevent the use of `<meta>` elements that set a charset other than UTF-8,
208
+ # since Sanitize's output is always UTF-8.
209
+ when 'meta'
210
+ if node.has_attribute?('charset') &&
211
+ node['charset'].downcase != 'utf-8'
212
+
213
+ node['charset'] = 'utf-8'
214
+ end
215
+
216
+ if node.has_attribute?('http-equiv') &&
217
+ node.has_attribute?('content') &&
218
+ node['http-equiv'].downcase == 'content-type' &&
219
+ node['content'].downcase =~ /;\s*charset\s*=\s*(?!utf-8)/
220
+
221
+ node['content'] = node['content'].gsub(/;\s*charset\s*=.+\z/, ';charset=utf-8')
222
+ end
223
+ end
183
224
  end
184
225
 
185
226
  end; end; end
@@ -1,5 +1,5 @@
1
1
  # encoding: utf-8
2
2
 
3
3
  class Sanitize
4
- VERSION = '4.6.6'
4
+ VERSION = '5.2.2'
5
5
  end
@@ -1,34 +1,3 @@
1
1
  # encoding: utf-8
2
- gem 'minitest'
3
2
  require 'minitest/autorun'
4
-
5
3
  require_relative '../lib/sanitize'
6
-
7
- # Helper to stub an instance method. Shamelessly stolen from
8
- # https://github.com/codeodor/minitest-stub_any_instance/
9
- class Object
10
- def self.stub_instance(name, value, &block)
11
- old_method = "__stubbed_method_#{name}__"
12
-
13
- class_eval do
14
- alias_method old_method, name
15
-
16
- define_method(name) do |*args|
17
- if value.respond_to?(:call) then
18
- value.call(*args)
19
- else
20
- value
21
- end
22
- end
23
- end
24
-
25
- yield
26
-
27
- ensure
28
- class_eval do
29
- undef_method name
30
- alias_method name, old_method
31
- undef_method old_method
32
- end
33
- end
34
- end
@@ -20,7 +20,7 @@ describe 'Sanitize::Transformers::CleanComment' do
20
20
 
21
21
  # Special case: the comment markup is inside a <script>, which makes it
22
22
  # text content and not an actual HTML comment.
23
- @s.fragment("<script><!-- comment --></script>").must_equal '&lt;!-- comment --&gt;'
23
+ @s.fragment("<script><!-- comment --></script>").must_equal ''
24
24
 
25
25
  Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => false, :elements => ['script'])
26
26
  .must_equal '<script><!-- comment --></script>'
@@ -40,10 +40,6 @@ describe 'Sanitize::Transformers::CleanComment' do
40
40
  @s.fragment("foo <!-- <!-- <!-- --> --> -->bar").must_equal 'foo <!-- <!-- <!-- --> --&gt; --&gt;bar'
41
41
  @s.fragment("foo <div <!-- comment -->>bar</div>").must_equal 'foo <div>&gt;bar</div>'
42
42
 
43
- # Special case: the comment markup is inside a <script>, which makes it
44
- # text content and not an actual HTML comment.
45
- @s.fragment("<script><!-- comment --></script>").must_equal '&lt;!-- comment --&gt;'
46
-
47
43
  Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => true, :elements => ['script'])
48
44
  .must_equal '<script><!-- comment --></script>'
49
45
  end