sanitize 4.6.4 → 6.0.2

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
data/lib/sanitize.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  # encoding: utf-8
2
2
 
3
- require 'nokogumbo'
3
+ require 'nokogiri'
4
4
  require 'set'
5
5
 
6
6
  require_relative 'sanitize/version'
@@ -19,6 +19,20 @@ require_relative 'sanitize/transformers/clean_element'
19
19
  class Sanitize
20
20
  attr_reader :config
21
21
 
22
+ # Matches one or more control characters that should be removed from HTML
23
+ # before parsing, as defined by the HTML living standard.
24
+ #
25
+ # - https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
26
+ # - https://infra.spec.whatwg.org/#control
27
+ REGEX_HTML_CONTROL_CHARACTERS = /[\u0001-\u0008\u000b\u000e-\u001f\u007f-\u009f]+/u
28
+
29
+ # Matches one or more non-characters that should be removed from HTML before
30
+ # parsing, as defined by the HTML living standard.
31
+ #
32
+ # - https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
33
+ # - https://infra.spec.whatwg.org/#noncharacter
34
+ REGEX_HTML_NON_CHARACTERS = /[\ufdd0-\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}]+/u
35
+
22
36
  # Matches an attribute value that could be treated by a browser as a URL
23
37
  # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
24
38
  # or more characters followed by a colon is considered a match, even if the
@@ -26,11 +40,12 @@ class Sanitize
26
40
  # IE6 and Opera will still parse).
27
41
  REGEX_PROTOCOL = /\A\s*([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i
28
42
 
29
- # Matches Unicode characters that should be stripped from HTML before passing
30
- # it to the parser.
43
+ # Matches one or more characters that should be stripped from HTML before
44
+ # parsing. This is a combination of `REGEX_HTML_CONTROL_CHARACTERS` and
45
+ # `REGEX_HTML_NON_CHARACTERS`.
31
46
  #
32
- # http://www.w3.org/TR/unicode-xml/#Charlist
33
- REGEX_UNSUITABLE_CHARS = /[\u0000\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u
47
+ # https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
48
+ REGEX_UNSUITABLE_CHARS = /(?:#{REGEX_HTML_CONTROL_CHARACTERS}|#{REGEX_HTML_NON_CHARACTERS})/u
34
49
 
35
50
  #--
36
51
  # Class Methods
@@ -39,7 +54,7 @@ class Sanitize
39
54
  # Returns a sanitized copy of the given full _html_ document, using the
40
55
  # settings in _config_ if specified.
41
56
  #
42
- # When sanitizing a document, the `<html>` element must be whitelisted or an
57
+ # When sanitizing a document, the `<html>` element must be allowlisted or an
43
58
  # error will be raised. If this is undesirable, you should probably use
44
59
  # {#fragment} instead.
45
60
  def self.document(html, config = {})
@@ -81,6 +96,7 @@ class Sanitize
81
96
 
82
97
  # Default transformers always run at the end of the chain, after any custom
83
98
  # transformers.
99
+ @transformers << Transformers::CleanElement.new(@config)
84
100
  @transformers << Transformers::CleanComment unless @config[:allow_comments]
85
101
 
86
102
  if @config[:elements].include?('style')
@@ -93,21 +109,21 @@ class Sanitize
93
109
  @transformers << Transformers::CSS::CleanAttribute.new(scss)
94
110
  end
95
111
 
96
- @transformers <<
97
- Transformers::CleanDoctype <<
98
- Transformers::CleanCDATA <<
99
- Transformers::CleanElement.new(@config)
112
+ @transformers << Transformers::CleanDoctype
113
+ @transformers << Transformers::CleanCDATA
114
+
115
+ @transformer_config = { config: @config }
100
116
  end
101
117
 
102
118
  # Returns a sanitized copy of the given _html_ document.
103
119
  #
104
- # When sanitizing a document, the `<html>` element must be whitelisted or an
120
+ # When sanitizing a document, the `<html>` element must be allowlisted or an
105
121
  # error will be raised. If this is undesirable, you should probably use
106
122
  # {#fragment} instead.
107
123
  def document(html)
108
124
  return '' unless html
109
125
 
110
- doc = Nokogiri::HTML5.parse(preprocess(html))
126
+ doc = Nokogiri::HTML5.parse(preprocess(html), **@config[:parser_options])
111
127
  node!(doc)
112
128
  to_html(doc)
113
129
  end
@@ -119,20 +135,7 @@ class Sanitize
119
135
  def fragment(html)
120
136
  return '' unless html
121
137
 
122
- html = preprocess(html)
123
- doc = Nokogiri::HTML5.parse("<html><body>#{html}")
124
-
125
- # Hack to allow fragments containing <body>. Borrowed from
126
- # Nokogiri::HTML::DocumentFragment.
127
- if html =~ /\A<body(?:\s|>)/i
128
- path = '/html/body'
129
- else
130
- path = '/html/body/node()'
131
- end
132
-
133
- frag = doc.fragment
134
- frag << doc.xpath(path)
135
-
138
+ frag = Nokogiri::HTML5.fragment(preprocess(html), **@config[:parser_options])
136
139
  node!(frag)
137
140
  to_html(frag)
138
141
  end
@@ -144,20 +147,20 @@ class Sanitize
144
147
  # in place.
145
148
  #
146
149
  # If _node_ is a `Nokogiri::XML::Document`, the `<html>` element must be
147
- # whitelisted or an error will be raised.
150
+ # allowlisted or an error will be raised.
148
151
  def node!(node)
149
152
  raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
150
153
 
151
154
  if node.is_a?(Nokogiri::XML::Document)
152
155
  unless @config[:elements].include?('html')
153
- raise Error, 'When sanitizing a document, "<html>" must be whitelisted.'
156
+ raise Error, 'When sanitizing a document, "<html>" must be allowlisted.'
154
157
  end
155
158
  end
156
159
 
157
- node_whitelist = Set.new
160
+ node_allowlist = Set.new
158
161
 
159
162
  traverse(node) do |n|
160
- transform_node!(n, node_whitelist)
163
+ transform_node!(n, node_allowlist)
161
164
  end
162
165
 
163
166
  node
@@ -183,51 +186,32 @@ class Sanitize
183
186
  end
184
187
 
185
188
  def to_html(node)
186
- replace_meta = false
187
-
188
- # Hacky workaround for a libxml2 bug that adds an undesired Content-Type
189
- # meta tag to all serialized HTML documents.
190
- #
191
- # https://github.com/sparklemotion/nokogiri/issues/1008
192
- if node.type == Nokogiri::XML::Node::DOCUMENT_NODE ||
193
- node.type == Nokogiri::XML::Node::HTML_DOCUMENT_NODE
194
-
195
- regex_meta = %r|(<html[^>]*>\s*<head[^>]*>\s*)<meta http-equiv="Content-Type" content="text/html; charset=utf-8">|i
196
-
197
- # Only replace the content-type meta tag if <meta> isn't whitelisted or
198
- # the original document didn't actually include a content-type meta tag.
199
- replace_meta = !@config[:elements].include?('meta') ||
200
- node.xpath('/html/head/meta[@http-equiv]').none? do |meta|
201
- meta['http-equiv'].casecmp('content-type').zero?
202
- end
203
- end
204
-
205
- so = Nokogiri::XML::Node::SaveOptions
206
-
207
- # Serialize to HTML without any formatting to prevent Nokogiri from adding
208
- # newlines after certain tags.
209
- html = node.to_html(
210
- :encoding => 'utf-8',
211
- :indent => 0,
212
- :save_with => so::NO_DECLARATION | so::NO_EMPTY_TAGS | so::AS_HTML
213
- )
214
-
215
- html.gsub!(regex_meta, '\1') if replace_meta
216
- html
189
+ node.to_html(preserve_newline: true)
217
190
  end
218
191
 
219
- def transform_node!(node, node_whitelist)
192
+ def transform_node!(node, node_allowlist)
220
193
  @transformers.each do |transformer|
221
- result = transformer.call(
222
- :config => @config,
223
- :is_whitelisted => node_whitelist.include?(node),
224
- :node => node,
225
- :node_name => node.name.downcase,
226
- :node_whitelist => node_whitelist
227
- )
228
-
229
- if result.is_a?(Hash) && result[:node_whitelist].respond_to?(:each)
230
- node_whitelist.merge(result[:node_whitelist])
194
+ # Since transform_node! may be called in a tight loop to process thousands
195
+ # of items, we can optimize both memory and CPU performance by:
196
+ #
197
+ # 1. Reusing the same config hash for each transformer
198
+ # 2. Directly assigning values to hash instead of using merge!. Not only
199
+ # does merge! create a new hash, it is also 2.6x slower:
200
+ # https://github.com/JuanitoFatas/fast-ruby#hashmerge-vs-hashmerge-code
201
+ config = @transformer_config
202
+ config[:is_allowlisted] = config[:is_whitelisted] = node_allowlist.include?(node)
203
+ config[:node] = node
204
+ config[:node_name] = node.name.downcase
205
+ config[:node_allowlist] = config[:node_whitelist] = node_allowlist
206
+
207
+ result = transformer.call(**config)
208
+
209
+ if result.is_a?(Hash)
210
+ result_allowlist = result[:node_allowlist] || result[:node_whitelist]
211
+
212
+ if result_allowlist.respond_to?(:each)
213
+ node_allowlist.merge(result_allowlist)
214
+ end
231
215
  end
232
216
  end
233
217
 
data/test/common.rb CHANGED
@@ -1,34 +1,3 @@
1
1
  # encoding: utf-8
2
- gem 'minitest'
3
2
  require 'minitest/autorun'
4
-
5
3
  require_relative '../lib/sanitize'
6
-
7
- # Helper to stub an instance method. Shamelessly stolen from
8
- # https://github.com/codeodor/minitest-stub_any_instance/
9
- class Object
10
- def self.stub_instance(name, value, &block)
11
- old_method = "__stubbed_method_#{name}__"
12
-
13
- class_eval do
14
- alias_method old_method, name
15
-
16
- define_method(name) do |*args|
17
- if value.respond_to?(:call) then
18
- value.call(*args)
19
- else
20
- value
21
- end
22
- end
23
- end
24
-
25
- yield
26
-
27
- ensure
28
- class_eval do
29
- undef_method name
30
- alias_method name, old_method
31
- undef_method old_method
32
- end
33
- end
34
- end
@@ -11,18 +11,18 @@ describe 'Sanitize::Transformers::CleanComment' do
11
11
  end
12
12
 
13
13
  it 'should remove comments' do
14
- @s.fragment('foo <!-- comment --> bar').must_equal 'foo bar'
15
- @s.fragment('foo <!-- ').must_equal 'foo '
16
- @s.fragment('foo <!-- - -> bar').must_equal 'foo '
17
- @s.fragment("foo <!--\n\n\n\n-->bar").must_equal 'foo bar'
18
- @s.fragment("foo <!-- <!-- <!-- --> --> -->bar").must_equal 'foo --&gt; --&gt;bar'
19
- @s.fragment("foo <div <!-- comment -->>bar</div>").must_equal 'foo <div>&gt;bar</div>'
14
+ _(@s.fragment('foo <!-- comment --> bar')).must_equal 'foo bar'
15
+ _(@s.fragment('foo <!-- ')).must_equal 'foo '
16
+ _(@s.fragment('foo <!-- - -> bar')).must_equal 'foo '
17
+ _(@s.fragment("foo <!--\n\n\n\n-->bar")).must_equal 'foo bar'
18
+ _(@s.fragment("foo <!-- <!-- <!-- --> --> -->bar")).must_equal 'foo --&gt; --&gt;bar'
19
+ _(@s.fragment("foo <div <!-- comment -->>bar</div>")).must_equal 'foo <div>&gt;bar</div>'
20
20
 
21
21
  # Special case: the comment markup is inside a <script>, which makes it
22
22
  # text content and not an actual HTML comment.
23
- @s.fragment("<script><!-- comment --></script>").must_equal '&lt;!-- comment --&gt;'
23
+ _(@s.fragment("<script><!-- comment --></script>")).must_equal ''
24
24
 
25
- Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => false, :elements => ['script'])
25
+ _(Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => false, :elements => ['script']))
26
26
  .must_equal '<script><!-- comment --></script>'
27
27
  end
28
28
  end
@@ -33,18 +33,14 @@ describe 'Sanitize::Transformers::CleanComment' do
33
33
  end
34
34
 
35
35
  it 'should allow comments' do
36
- @s.fragment('foo <!-- comment --> bar').must_equal 'foo <!-- comment --> bar'
37
- @s.fragment('foo <!-- ').must_equal 'foo <!-- -->'
38
- @s.fragment('foo <!-- - -> bar').must_equal 'foo <!-- - -> bar-->'
39
- @s.fragment("foo <!--\n\n\n\n-->bar").must_equal "foo <!--\n\n\n\n-->bar"
40
- @s.fragment("foo <!-- <!-- <!-- --> --> -->bar").must_equal 'foo <!-- <!-- <!-- --> --&gt; --&gt;bar'
41
- @s.fragment("foo <div <!-- comment -->>bar</div>").must_equal 'foo <div>&gt;bar</div>'
42
-
43
- # Special case: the comment markup is inside a <script>, which makes it
44
- # text content and not an actual HTML comment.
45
- @s.fragment("<script><!-- comment --></script>").must_equal '&lt;!-- comment --&gt;'
46
-
47
- Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => true, :elements => ['script'])
36
+ _(@s.fragment('foo <!-- comment --> bar')).must_equal 'foo <!-- comment --> bar'
37
+ _(@s.fragment('foo <!-- ')).must_equal 'foo <!-- -->'
38
+ _(@s.fragment('foo <!-- - -> bar')).must_equal 'foo <!-- - -> bar-->'
39
+ _(@s.fragment("foo <!--\n\n\n\n-->bar")).must_equal "foo <!--\n\n\n\n-->bar"
40
+ _(@s.fragment("foo <!-- <!-- <!-- --> --> -->bar")).must_equal 'foo <!-- <!-- <!-- --> --&gt; --&gt;bar'
41
+ _(@s.fragment("foo <div <!-- comment -->>bar</div>")).must_equal 'foo <div>&gt;bar</div>'
42
+
43
+ _(Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => true, :elements => ['script']))
48
44
  .must_equal '<script><!-- comment --></script>'
49
45
  end
50
46
  end
@@ -10,15 +10,15 @@ describe 'Sanitize::Transformers::CSS::CleanAttribute' do
10
10
  end
11
11
 
12
12
  it 'should sanitize CSS properties in style attributes' do
13
- @s.fragment(%[
13
+ _(@s.fragment(%[
14
14
  <div style="color: #fff; width: expression(alert(1)); /* <-- evil! */"></div>
15
- ].strip).must_equal %[
16
- <div style="color: #fff; /* &lt;-- evil! */"></div>
15
+ ].strip)).must_equal %[
16
+ <div style="color: #fff; /* <-- evil! */"></div>
17
17
  ].strip
18
18
  end
19
19
 
20
20
  it 'should remove the style attribute if the sanitized CSS is empty' do
21
- @s.fragment('<div style="width: expression(alert(1))"></div>').
21
+ _(@s.fragment('<div style="width: expression(alert(1))"></div>')).
22
22
  must_equal '<div></div>'
23
23
  end
24
24
  end
@@ -46,7 +46,7 @@ describe 'Sanitize::Transformers::CSS::CleanElement' do
46
46
  </style>
47
47
  ].strip
48
48
 
49
- @s.fragment(html).must_equal %[
49
+ _(@s.fragment(html)).must_equal %[
50
50
  <style>
51
51
  /* Yay CSS! */
52
52
  .foo { color: #fff; }
@@ -62,6 +62,6 @@ describe 'Sanitize::Transformers::CSS::CleanElement' do
62
62
  end
63
63
 
64
64
  it 'should remove the <style> element if the sanitized CSS is empty' do
65
- @s.fragment('<style></style>').must_equal ''
65
+ _(@s.fragment('<style></style>')).must_equal ''
66
66
  end
67
67
  end
@@ -11,18 +11,18 @@ describe 'Sanitize::Transformers::CleanDoctype' do
11
11
  end
12
12
 
13
13
  it 'should remove doctype declarations' do
14
- @s.document('<!DOCTYPE html><html>foo</html>').must_equal "<html>foo</html>\n"
15
- @s.fragment('<!DOCTYPE html>foo').must_equal 'foo'
14
+ _(@s.document('<!DOCTYPE html><html>foo</html>')).must_equal "<html>foo</html>"
15
+ _(@s.fragment('<!DOCTYPE html>foo')).must_equal 'foo'
16
16
  end
17
17
 
18
18
  it 'should not allow doctype definitions in fragments' do
19
- @s.fragment('<!DOCTYPE html><html>foo</html>')
19
+ _(@s.fragment('<!DOCTYPE html><html>foo</html>'))
20
20
  .must_equal "foo"
21
21
 
22
- @s.fragment('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
22
+ _(@s.fragment('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>'))
23
23
  .must_equal "foo"
24
24
 
25
- @s.fragment("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>")
25
+ _(@s.fragment("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>"))
26
26
  .must_equal "foo"
27
27
  end
28
28
  end
@@ -33,38 +33,38 @@ describe 'Sanitize::Transformers::CleanDoctype' do
33
33
  end
34
34
 
35
35
  it 'should allow doctype declarations in documents' do
36
- @s.document('<!DOCTYPE html><html>foo</html>')
37
- .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
36
+ _(@s.document('<!DOCTYPE html><html>foo</html>'))
37
+ .must_equal "<!DOCTYPE html><html>foo</html>"
38
38
 
39
- @s.document('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
40
- .must_equal "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01//EN\">\n<html>foo</html>\n"
39
+ _(@s.document('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>'))
40
+ .must_equal "<!DOCTYPE html><html>foo</html>"
41
41
 
42
- @s.document("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>")
43
- .must_equal "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html>foo</html>\n"
42
+ _(@s.document("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>"))
43
+ .must_equal "<!DOCTYPE html><html>foo</html>"
44
44
  end
45
45
 
46
46
  it 'should not allow obviously invalid doctype declarations in documents' do
47
- @s.document('<!DOCTYPE blah blah blah><html>foo</html>')
48
- .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
47
+ _(@s.document('<!DOCTYPE blah blah blah><html>foo</html>'))
48
+ .must_equal "<!DOCTYPE html><html>foo</html>"
49
49
 
50
- @s.document('<!DOCTYPE blah><html>foo</html>')
51
- .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
50
+ _(@s.document('<!DOCTYPE blah><html>foo</html>'))
51
+ .must_equal "<!DOCTYPE html><html>foo</html>"
52
52
 
53
- @s.document('<!DOCTYPE html BLAH "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
54
- .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
53
+ _(@s.document('<!DOCTYPE html BLAH "-//W3C//DTD HTML 4.01//EN"><html>foo</html>'))
54
+ .must_equal "<!DOCTYPE html><html>foo</html>"
55
55
 
56
- @s.document('<!whatever><html>foo</html>')
57
- .must_equal "<html>foo</html>\n"
56
+ _(@s.document('<!whatever><html>foo</html>'))
57
+ .must_equal "<html>foo</html>"
58
58
  end
59
59
 
60
60
  it 'should not allow doctype definitions in fragments' do
61
- @s.fragment('<!DOCTYPE html><html>foo</html>')
61
+ _(@s.fragment('<!DOCTYPE html><html>foo</html>'))
62
62
  .must_equal "foo"
63
63
 
64
- @s.fragment('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
64
+ _(@s.fragment('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>'))
65
65
  .must_equal "foo"
66
66
 
67
- @s.fragment("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>")
67
+ _(@s.fragment("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>"))
68
68
  .must_equal "foo"
69
69
  end
70
70
  end