sanitize 4.6.5 → 6.0.1

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions exactly as they appear in their respective public registries.

Potentially problematic release.


This version of sanitize might be problematic. Click here for more details.

data/lib/sanitize.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  # encoding: utf-8
2
2
 
3
- require 'nokogumbo'
3
+ require 'nokogiri'
4
4
  require 'set'
5
5
 
6
6
  require_relative 'sanitize/version'
@@ -19,6 +19,20 @@ require_relative 'sanitize/transformers/clean_element'
19
19
  class Sanitize
20
20
  attr_reader :config
21
21
 
22
+ # Matches one or more control characters that should be removed from HTML
23
+ # before parsing, as defined by the HTML living standard.
24
+ #
25
+ # - https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
26
+ # - https://infra.spec.whatwg.org/#control
27
+ REGEX_HTML_CONTROL_CHARACTERS = /[\u0001-\u0008\u000b\u000e-\u001f\u007f-\u009f]+/u
28
+
29
+ # Matches one or more non-characters that should be removed from HTML before
30
+ # parsing, as defined by the HTML living standard.
31
+ #
32
+ # - https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
33
+ # - https://infra.spec.whatwg.org/#noncharacter
34
+ REGEX_HTML_NON_CHARACTERS = /[\ufdd0-\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}]+/u
35
+
22
36
  # Matches an attribute value that could be treated by a browser as a URL
23
37
  # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
24
38
  # or more characters followed by a colon is considered a match, even if the
@@ -26,11 +40,12 @@ class Sanitize
26
40
  # IE6 and Opera will still parse).
27
41
  REGEX_PROTOCOL = /\A\s*([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i
28
42
 
29
- # Matches Unicode characters that should be stripped from HTML before passing
30
- # it to the parser.
43
+ # Matches one or more characters that should be stripped from HTML before
44
+ # parsing. This is a combination of `REGEX_HTML_CONTROL_CHARACTERS` and
45
+ # `REGEX_HTML_NON_CHARACTERS`.
31
46
  #
32
- # http://www.w3.org/TR/unicode-xml/#Charlist
33
- REGEX_UNSUITABLE_CHARS = /[\u0000\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u
47
+ # https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
48
+ REGEX_UNSUITABLE_CHARS = /(?:#{REGEX_HTML_CONTROL_CHARACTERS}|#{REGEX_HTML_NON_CHARACTERS})/u
34
49
 
35
50
  #--
36
51
  # Class Methods
@@ -39,7 +54,7 @@ class Sanitize
39
54
  # Returns a sanitized copy of the given full _html_ document, using the
40
55
  # settings in _config_ if specified.
41
56
  #
42
- # When sanitizing a document, the `<html>` element must be whitelisted or an
57
+ # When sanitizing a document, the `<html>` element must be allowlisted or an
43
58
  # error will be raised. If this is undesirable, you should probably use
44
59
  # {#fragment} instead.
45
60
  def self.document(html, config = {})
@@ -96,17 +111,19 @@ class Sanitize
96
111
 
97
112
  @transformers << Transformers::CleanDoctype
98
113
  @transformers << Transformers::CleanCDATA
114
+
115
+ @transformer_config = { config: @config }
99
116
  end
100
117
 
101
118
  # Returns a sanitized copy of the given _html_ document.
102
119
  #
103
- # When sanitizing a document, the `<html>` element must be whitelisted or an
120
+ # When sanitizing a document, the `<html>` element must be allowlisted or an
104
121
  # error will be raised. If this is undesirable, you should probably use
105
122
  # {#fragment} instead.
106
123
  def document(html)
107
124
  return '' unless html
108
125
 
109
- doc = Nokogiri::HTML5.parse(preprocess(html))
126
+ doc = Nokogiri::HTML5.parse(preprocess(html), **@config[:parser_options])
110
127
  node!(doc)
111
128
  to_html(doc)
112
129
  end
@@ -118,20 +135,7 @@ class Sanitize
118
135
  def fragment(html)
119
136
  return '' unless html
120
137
 
121
- html = preprocess(html)
122
- doc = Nokogiri::HTML5.parse("<html><body>#{html}")
123
-
124
- # Hack to allow fragments containing <body>. Borrowed from
125
- # Nokogiri::HTML::DocumentFragment.
126
- if html =~ /\A<body(?:\s|>)/i
127
- path = '/html/body'
128
- else
129
- path = '/html/body/node()'
130
- end
131
-
132
- frag = doc.fragment
133
- frag << doc.xpath(path)
134
-
138
+ frag = Nokogiri::HTML5.fragment(preprocess(html), **@config[:parser_options])
135
139
  node!(frag)
136
140
  to_html(frag)
137
141
  end
@@ -143,20 +147,20 @@ class Sanitize
143
147
  # in place.
144
148
  #
145
149
  # If _node_ is a `Nokogiri::XML::Document`, the `<html>` element must be
146
- # whitelisted or an error will be raised.
150
+ # allowlisted or an error will be raised.
147
151
  def node!(node)
148
152
  raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
149
153
 
150
154
  if node.is_a?(Nokogiri::XML::Document)
151
155
  unless @config[:elements].include?('html')
152
- raise Error, 'When sanitizing a document, "<html>" must be whitelisted.'
156
+ raise Error, 'When sanitizing a document, "<html>" must be allowlisted.'
153
157
  end
154
158
  end
155
159
 
156
- node_whitelist = Set.new
160
+ node_allowlist = Set.new
157
161
 
158
162
  traverse(node) do |n|
159
- transform_node!(n, node_whitelist)
163
+ transform_node!(n, node_allowlist)
160
164
  end
161
165
 
162
166
  node
@@ -182,51 +186,32 @@ class Sanitize
182
186
  end
183
187
 
184
188
  def to_html(node)
185
- replace_meta = false
186
-
187
- # Hacky workaround for a libxml2 bug that adds an undesired Content-Type
188
- # meta tag to all serialized HTML documents.
189
- #
190
- # https://github.com/sparklemotion/nokogiri/issues/1008
191
- if node.type == Nokogiri::XML::Node::DOCUMENT_NODE ||
192
- node.type == Nokogiri::XML::Node::HTML_DOCUMENT_NODE
193
-
194
- regex_meta = %r|(<html[^>]*>\s*<head[^>]*>\s*)<meta http-equiv="Content-Type" content="text/html; charset=utf-8">|i
195
-
196
- # Only replace the content-type meta tag if <meta> isn't whitelisted or
197
- # the original document didn't actually include a content-type meta tag.
198
- replace_meta = !@config[:elements].include?('meta') ||
199
- node.xpath('/html/head/meta[@http-equiv]').none? do |meta|
200
- meta['http-equiv'].casecmp('content-type').zero?
201
- end
202
- end
203
-
204
- so = Nokogiri::XML::Node::SaveOptions
205
-
206
- # Serialize to HTML without any formatting to prevent Nokogiri from adding
207
- # newlines after certain tags.
208
- html = node.to_html(
209
- :encoding => 'utf-8',
210
- :indent => 0,
211
- :save_with => so::NO_DECLARATION | so::NO_EMPTY_TAGS | so::AS_HTML
212
- )
213
-
214
- html.gsub!(regex_meta, '\1') if replace_meta
215
- html
189
+ node.to_html(preserve_newline: true)
216
190
  end
217
191
 
218
- def transform_node!(node, node_whitelist)
192
+ def transform_node!(node, node_allowlist)
219
193
  @transformers.each do |transformer|
220
- result = transformer.call(
221
- :config => @config,
222
- :is_whitelisted => node_whitelist.include?(node),
223
- :node => node,
224
- :node_name => node.name.downcase,
225
- :node_whitelist => node_whitelist
226
- )
227
-
228
- if result.is_a?(Hash) && result[:node_whitelist].respond_to?(:each)
229
- node_whitelist.merge(result[:node_whitelist])
194
+ # Since transform_node! may be called in a tight loop to process thousands
195
+ # of items, we can optimize both memory and CPU performance by:
196
+ #
197
+ # 1. Reusing the same config hash for each transformer
198
+ # 2. Directly assigning values to hash instead of using merge!. Not only
199
+ # does merge! create a new hash, it is also 2.6x slower:
200
+ # https://github.com/JuanitoFatas/fast-ruby#hashmerge-vs-hashmerge-code
201
+ config = @transformer_config
202
+ config[:is_allowlisted] = config[:is_whitelisted] = node_allowlist.include?(node)
203
+ config[:node] = node
204
+ config[:node_name] = node.name.downcase
205
+ config[:node_allowlist] = config[:node_whitelist] = node_allowlist
206
+
207
+ result = transformer.call(**config)
208
+
209
+ if result.is_a?(Hash)
210
+ result_allowlist = result[:node_allowlist] || result[:node_whitelist]
211
+
212
+ if result_allowlist.respond_to?(:each)
213
+ node_allowlist.merge(result_allowlist)
214
+ end
230
215
  end
231
216
  end
232
217
 
data/test/common.rb CHANGED
@@ -1,34 +1,3 @@
1
1
  # encoding: utf-8
2
- gem 'minitest'
3
2
  require 'minitest/autorun'
4
-
5
3
  require_relative '../lib/sanitize'
6
-
7
- # Helper to stub an instance method. Shamelessly stolen from
8
- # https://github.com/codeodor/minitest-stub_any_instance/
9
- class Object
10
- def self.stub_instance(name, value, &block)
11
- old_method = "__stubbed_method_#{name}__"
12
-
13
- class_eval do
14
- alias_method old_method, name
15
-
16
- define_method(name) do |*args|
17
- if value.respond_to?(:call) then
18
- value.call(*args)
19
- else
20
- value
21
- end
22
- end
23
- end
24
-
25
- yield
26
-
27
- ensure
28
- class_eval do
29
- undef_method name
30
- alias_method name, old_method
31
- undef_method old_method
32
- end
33
- end
34
- end
@@ -11,18 +11,18 @@ describe 'Sanitize::Transformers::CleanComment' do
11
11
  end
12
12
 
13
13
  it 'should remove comments' do
14
- @s.fragment('foo <!-- comment --> bar').must_equal 'foo bar'
15
- @s.fragment('foo <!-- ').must_equal 'foo '
16
- @s.fragment('foo <!-- - -> bar').must_equal 'foo '
17
- @s.fragment("foo <!--\n\n\n\n-->bar").must_equal 'foo bar'
18
- @s.fragment("foo <!-- <!-- <!-- --> --> -->bar").must_equal 'foo --&gt; --&gt;bar'
19
- @s.fragment("foo <div <!-- comment -->>bar</div>").must_equal 'foo <div>&gt;bar</div>'
14
+ _(@s.fragment('foo <!-- comment --> bar')).must_equal 'foo bar'
15
+ _(@s.fragment('foo <!-- ')).must_equal 'foo '
16
+ _(@s.fragment('foo <!-- - -> bar')).must_equal 'foo '
17
+ _(@s.fragment("foo <!--\n\n\n\n-->bar")).must_equal 'foo bar'
18
+ _(@s.fragment("foo <!-- <!-- <!-- --> --> -->bar")).must_equal 'foo --&gt; --&gt;bar'
19
+ _(@s.fragment("foo <div <!-- comment -->>bar</div>")).must_equal 'foo <div>&gt;bar</div>'
20
20
 
21
21
  # Special case: the comment markup is inside a <script>, which makes it
22
22
  # text content and not an actual HTML comment.
23
- @s.fragment("<script><!-- comment --></script>").must_equal '&lt;!-- comment --&gt;'
23
+ _(@s.fragment("<script><!-- comment --></script>")).must_equal ''
24
24
 
25
- Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => false, :elements => ['script'])
25
+ _(Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => false, :elements => ['script']))
26
26
  .must_equal '<script><!-- comment --></script>'
27
27
  end
28
28
  end
@@ -33,18 +33,14 @@ describe 'Sanitize::Transformers::CleanComment' do
33
33
  end
34
34
 
35
35
  it 'should allow comments' do
36
- @s.fragment('foo <!-- comment --> bar').must_equal 'foo <!-- comment --> bar'
37
- @s.fragment('foo <!-- ').must_equal 'foo <!-- -->'
38
- @s.fragment('foo <!-- - -> bar').must_equal 'foo <!-- - -> bar-->'
39
- @s.fragment("foo <!--\n\n\n\n-->bar").must_equal "foo <!--\n\n\n\n-->bar"
40
- @s.fragment("foo <!-- <!-- <!-- --> --> -->bar").must_equal 'foo <!-- <!-- <!-- --> --&gt; --&gt;bar'
41
- @s.fragment("foo <div <!-- comment -->>bar</div>").must_equal 'foo <div>&gt;bar</div>'
42
-
43
- # Special case: the comment markup is inside a <script>, which makes it
44
- # text content and not an actual HTML comment.
45
- @s.fragment("<script><!-- comment --></script>").must_equal '&lt;!-- comment --&gt;'
46
-
47
- Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => true, :elements => ['script'])
36
+ _(@s.fragment('foo <!-- comment --> bar')).must_equal 'foo <!-- comment --> bar'
37
+ _(@s.fragment('foo <!-- ')).must_equal 'foo <!-- -->'
38
+ _(@s.fragment('foo <!-- - -> bar')).must_equal 'foo <!-- - -> bar-->'
39
+ _(@s.fragment("foo <!--\n\n\n\n-->bar")).must_equal "foo <!--\n\n\n\n-->bar"
40
+ _(@s.fragment("foo <!-- <!-- <!-- --> --> -->bar")).must_equal 'foo <!-- <!-- <!-- --> --&gt; --&gt;bar'
41
+ _(@s.fragment("foo <div <!-- comment -->>bar</div>")).must_equal 'foo <div>&gt;bar</div>'
42
+
43
+ _(Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => true, :elements => ['script']))
48
44
  .must_equal '<script><!-- comment --></script>'
49
45
  end
50
46
  end
@@ -10,15 +10,15 @@ describe 'Sanitize::Transformers::CSS::CleanAttribute' do
10
10
  end
11
11
 
12
12
  it 'should sanitize CSS properties in style attributes' do
13
- @s.fragment(%[
13
+ _(@s.fragment(%[
14
14
  <div style="color: #fff; width: expression(alert(1)); /* <-- evil! */"></div>
15
- ].strip).must_equal %[
16
- <div style="color: #fff; /* &lt;-- evil! */"></div>
15
+ ].strip)).must_equal %[
16
+ <div style="color: #fff; /* <-- evil! */"></div>
17
17
  ].strip
18
18
  end
19
19
 
20
20
  it 'should remove the style attribute if the sanitized CSS is empty' do
21
- @s.fragment('<div style="width: expression(alert(1))"></div>').
21
+ _(@s.fragment('<div style="width: expression(alert(1))"></div>')).
22
22
  must_equal '<div></div>'
23
23
  end
24
24
  end
@@ -46,7 +46,7 @@ describe 'Sanitize::Transformers::CSS::CleanElement' do
46
46
  </style>
47
47
  ].strip
48
48
 
49
- @s.fragment(html).must_equal %[
49
+ _(@s.fragment(html)).must_equal %[
50
50
  <style>
51
51
  /* Yay CSS! */
52
52
  .foo { color: #fff; }
@@ -62,6 +62,6 @@ describe 'Sanitize::Transformers::CSS::CleanElement' do
62
62
  end
63
63
 
64
64
  it 'should remove the <style> element if the sanitized CSS is empty' do
65
- @s.fragment('<style></style>').must_equal ''
65
+ _(@s.fragment('<style></style>')).must_equal ''
66
66
  end
67
67
  end
@@ -11,18 +11,18 @@ describe 'Sanitize::Transformers::CleanDoctype' do
11
11
  end
12
12
 
13
13
  it 'should remove doctype declarations' do
14
- @s.document('<!DOCTYPE html><html>foo</html>').must_equal "<html>foo</html>\n"
15
- @s.fragment('<!DOCTYPE html>foo').must_equal 'foo'
14
+ _(@s.document('<!DOCTYPE html><html>foo</html>')).must_equal "<html>foo</html>"
15
+ _(@s.fragment('<!DOCTYPE html>foo')).must_equal 'foo'
16
16
  end
17
17
 
18
18
  it 'should not allow doctype definitions in fragments' do
19
- @s.fragment('<!DOCTYPE html><html>foo</html>')
19
+ _(@s.fragment('<!DOCTYPE html><html>foo</html>'))
20
20
  .must_equal "foo"
21
21
 
22
- @s.fragment('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
22
+ _(@s.fragment('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>'))
23
23
  .must_equal "foo"
24
24
 
25
- @s.fragment("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>")
25
+ _(@s.fragment("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>"))
26
26
  .must_equal "foo"
27
27
  end
28
28
  end
@@ -33,38 +33,38 @@ describe 'Sanitize::Transformers::CleanDoctype' do
33
33
  end
34
34
 
35
35
  it 'should allow doctype declarations in documents' do
36
- @s.document('<!DOCTYPE html><html>foo</html>')
37
- .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
36
+ _(@s.document('<!DOCTYPE html><html>foo</html>'))
37
+ .must_equal "<!DOCTYPE html><html>foo</html>"
38
38
 
39
- @s.document('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
40
- .must_equal "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01//EN\">\n<html>foo</html>\n"
39
+ _(@s.document('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>'))
40
+ .must_equal "<!DOCTYPE html><html>foo</html>"
41
41
 
42
- @s.document("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>")
43
- .must_equal "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html>foo</html>\n"
42
+ _(@s.document("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>"))
43
+ .must_equal "<!DOCTYPE html><html>foo</html>"
44
44
  end
45
45
 
46
46
  it 'should not allow obviously invalid doctype declarations in documents' do
47
- @s.document('<!DOCTYPE blah blah blah><html>foo</html>')
48
- .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
47
+ _(@s.document('<!DOCTYPE blah blah blah><html>foo</html>'))
48
+ .must_equal "<!DOCTYPE html><html>foo</html>"
49
49
 
50
- @s.document('<!DOCTYPE blah><html>foo</html>')
51
- .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
50
+ _(@s.document('<!DOCTYPE blah><html>foo</html>'))
51
+ .must_equal "<!DOCTYPE html><html>foo</html>"
52
52
 
53
- @s.document('<!DOCTYPE html BLAH "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
54
- .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
53
+ _(@s.document('<!DOCTYPE html BLAH "-//W3C//DTD HTML 4.01//EN"><html>foo</html>'))
54
+ .must_equal "<!DOCTYPE html><html>foo</html>"
55
55
 
56
- @s.document('<!whatever><html>foo</html>')
57
- .must_equal "<html>foo</html>\n"
56
+ _(@s.document('<!whatever><html>foo</html>'))
57
+ .must_equal "<html>foo</html>"
58
58
  end
59
59
 
60
60
  it 'should not allow doctype definitions in fragments' do
61
- @s.fragment('<!DOCTYPE html><html>foo</html>')
61
+ _(@s.fragment('<!DOCTYPE html><html>foo</html>'))
62
62
  .must_equal "foo"
63
63
 
64
- @s.fragment('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
64
+ _(@s.fragment('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>'))
65
65
  .must_equal "foo"
66
66
 
67
- @s.fragment("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>")
67
+ _(@s.fragment("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>"))
68
68
  .must_equal "foo"
69
69
  end
70
70
  end