sanitize 4.6.6 → 6.0.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of sanitize might be problematic. Click here for more details.

@@ -8,25 +8,22 @@ describe 'Sanitize::Transformers::CleanElement' do
8
8
  strings = {
9
9
  :basic => {
10
10
  :html => '<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo" style="text-decoration: underline;">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <style>.foo { color: #fff; }</style> <script>alert("hello world");</script>',
11
-
12
- :default => 'Lorem ipsum dolor sit amet .foo { color: #fff; } alert("hello world");',
13
- :restricted => '<b>Lorem</b> ipsum <strong>dolor</strong> sit amet .foo { color: #fff; } alert("hello world");',
14
- :basic => '<b>Lorem</b> <a href="pants" rel="nofollow">ipsum</a> <a href="http://foo.com/" rel="nofollow"><strong>dolor</strong></a> sit<br>amet .foo { color: #fff; } alert("hello world");',
15
- :relaxed => '<b>Lorem</b> <a href="pants" title="foo" style="text-decoration: underline;">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br>amet <style>.foo { color: #fff; }</style> alert("hello world");'
11
+ :default => 'Lorem ipsum dolor sit amet ',
12
+ :restricted => '<b>Lorem</b> ipsum <strong>dolor</strong> sit amet ',
13
+ :basic => '<b>Lorem</b> <a href="pants" rel="nofollow">ipsum</a> <a href="http://foo.com/" rel="nofollow"><strong>dolor</strong></a> sit<br>amet ',
14
+ :relaxed => '<b>Lorem</b> <a href="pants" title="foo" style="text-decoration: underline;">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br>amet <style>.foo { color: #fff; }</style> '
16
15
  },
17
16
 
18
17
  :malformed => {
19
18
  :html => 'Lo<!-- comment -->rem</b> <a href=pants title="foo>ipsum <a href="http://foo.com/"><strong>dolor</a></strong> sit<br/>amet <script>alert("hello world");',
20
-
21
- :default => 'Lorem dolor sit amet alert("hello world");',
22
- :restricted => 'Lorem <strong>dolor</strong> sit amet alert("hello world");',
23
- :basic => 'Lorem <a href="pants" rel="nofollow"><strong>dolor</strong></a> sit<br>amet alert("hello world");',
24
- :relaxed => 'Lorem <a href="pants" title="foo&gt;ipsum &lt;a href="><strong>dolor</strong></a> sit<br>amet alert("hello world");',
19
+ :default => 'Lorem dolor sit amet ',
20
+ :restricted => 'Lorem <strong>dolor</strong> sit amet ',
21
+ :basic => 'Lorem <a href="pants" rel="nofollow"><strong>dolor</strong></a> sit<br>amet ',
22
+ :relaxed => 'Lorem <a href="pants" title="foo>ipsum <a href="><strong>dolor</strong></a> sit<br>amet ',
25
23
  },
26
24
 
27
25
  :unclosed => {
28
26
  :html => '<p>a</p><blockquote>b',
29
-
30
27
  :default => ' a b ',
31
28
  :restricted => ' a b ',
32
29
  :basic => '<p>a</p><blockquote>b</blockquote>',
@@ -35,7 +32,6 @@ describe 'Sanitize::Transformers::CleanElement' do
35
32
 
36
33
  :malicious => {
37
34
  :html => '<b>Lo<!-- comment -->rem</b> <a href="javascript:pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <<foo>script>alert("hello world");</script>',
38
-
39
35
  :default => 'Lorem ipsum dolor sit amet &lt;script&gt;alert("hello world");',
40
36
  :restricted => '<b>Lorem</b> ipsum <strong>dolor</strong> sit amet &lt;script&gt;alert("hello world");',
41
37
  :basic => '<b>Lorem</b> <a rel="nofollow">ipsum</a> <a href="http://foo.com/" rel="nofollow"><strong>dolor</strong></a> sit<br>amet &lt;script&gt;alert("hello world");',
@@ -166,15 +162,15 @@ describe 'Sanitize::Transformers::CleanElement' do
166
162
  }
167
163
 
168
164
  describe 'Default config' do
169
- it 'should remove non-whitelisted elements, leaving safe contents behind' do
165
+ it 'should remove non-allowlisted elements, leaving safe contents behind' do
170
166
  Sanitize.fragment('foo <b>bar</b> <strong><a href="#a">baz</a></strong> quux')
171
167
  .must_equal 'foo bar baz quux'
172
168
 
173
169
  Sanitize.fragment('<script>alert("<xss>");</script>')
174
- .must_equal 'alert("&lt;xss&gt;");'
170
+ .must_equal ''
175
171
 
176
172
  Sanitize.fragment('<<script>script>alert("<xss>");</<script>>')
177
- .must_equal '&lt;script&gt;alert("&lt;xss&gt;");&lt;/&lt;script&gt;&gt;'
173
+ .must_equal '&lt;'
178
174
 
179
175
  Sanitize.fragment('< script <>> alert("<xss>");</script>')
180
176
  .must_equal '&lt; script &lt;&gt;&gt; alert("");'
@@ -196,6 +192,56 @@ describe 'Sanitize::Transformers::CleanElement' do
196
192
  .must_equal ''
197
193
  end
198
194
 
195
+ it 'should not preserve the content of removed `iframe` elements' do
196
+ Sanitize.fragment('<iframe>hello! <script>alert(0)</script></iframe>')
197
+ .must_equal ''
198
+ end
199
+
200
+ it 'should not preserve the content of removed `math` elements' do
201
+ Sanitize.fragment('<math>hello! <script>alert(0)</script></math>')
202
+ .must_equal ''
203
+ end
204
+
205
+ it 'should not preserve the content of removed `noembed` elements' do
206
+ Sanitize.fragment('<noembed>hello! <script>alert(0)</script></noembed>')
207
+ .must_equal ''
208
+ end
209
+
210
+ it 'should not preserve the content of removed `noframes` elements' do
211
+ Sanitize.fragment('<noframes>hello! <script>alert(0)</script></noframes>')
212
+ .must_equal ''
213
+ end
214
+
215
+ it 'should not preserve the content of removed `noscript` elements' do
216
+ Sanitize.fragment('<noscript>hello! <script>alert(0)</script></noscript>')
217
+ .must_equal ''
218
+ end
219
+
220
+ it 'should not preserve the content of removed `plaintext` elements' do
221
+ Sanitize.fragment('<plaintext>hello! <script>alert(0)</script>')
222
+ .must_equal ''
223
+ end
224
+
225
+ it 'should not preserve the content of removed `script` elements' do
226
+ Sanitize.fragment('<script>hello! <script>alert(0)</script></script>')
227
+ .must_equal ''
228
+ end
229
+
230
+ it 'should not preserve the content of removed `style` elements' do
231
+ Sanitize.fragment('<style>hello! <script>alert(0)</script></style>')
232
+ .must_equal ''
233
+ end
234
+
235
+ it 'should not preserve the content of removed `svg` elements' do
236
+ Sanitize.fragment('<svg>hello! <script>alert(0)</script></svg>')
237
+ .must_equal ''
238
+ end
239
+
240
+ it 'should not preserve the content of removed `xmp` elements' do
241
+ Sanitize.fragment('<xmp>hello! <script>alert(0)</script></xmp>')
242
+ .must_equal ''
243
+ end
244
+
199
245
  strings.each do |name, data|
200
246
  it "should clean #{name} HTML" do
201
247
  Sanitize.fragment(data[:html]).must_equal(data[:default])
@@ -234,7 +280,7 @@ describe 'Sanitize::Transformers::CleanElement' do
234
280
 
235
281
  it 'should not choke on valueless attributes' do
236
282
  @s.fragment('foo <a href>foo</a> bar')
237
- .must_equal 'foo <a href rel="nofollow">foo</a> bar'
283
+ .must_equal 'foo <a href="" rel="nofollow">foo</a> bar'
238
284
  end
239
285
 
240
286
  it 'should downcase attribute names' do
@@ -262,7 +308,7 @@ describe 'Sanitize::Transformers::CleanElement' do
262
308
 
263
309
  it 'should encode special chars in attribute values' do
264
310
  @s.fragment('<a href="http://example.com" title="<b>&eacute;xamples</b> & things">foo</a>')
265
- .must_equal '<a href="http://example.com" title="&lt;b&gt;éxamples&lt;/b&gt; &amp; things">foo</a>'
311
+ .must_equal '<a href="http://example.com" title="<b>éxamples</b> &amp; things">foo</a>'
266
312
  end
267
313
 
268
314
  strings.each do |name, data|
@@ -279,7 +325,7 @@ describe 'Sanitize::Transformers::CleanElement' do
279
325
  end
280
326
 
281
327
  describe 'Custom configs' do
282
- it 'should allow attributes on all elements if whitelisted under :all' do
328
+ it 'should allow attributes on all elements if allowlisted under :all' do
283
329
  input = '<p class="foo">bar</p>'
284
330
 
285
331
  Sanitize.fragment(input).must_equal ' bar '
@@ -300,7 +346,7 @@ describe 'Sanitize::Transformers::CleanElement' do
300
346
  }).must_equal input
301
347
  end
302
348
 
303
- it "should not allow relative URLs when relative URLs aren't whitelisted" do
349
+ it "should not allow relative URLs when relative URLs aren't allowlisted" do
304
350
  input = '<a href="/foo/bar">Link</a>'
305
351
 
306
352
  Sanitize.fragment(input,
@@ -344,16 +390,30 @@ describe 'Sanitize::Transformers::CleanElement' do
344
390
  ).must_equal 'foo bar '
345
391
  end
346
392
 
347
- it 'should remove the contents of specified nodes when :remove_contents is an Array of element names as strings' do
348
- Sanitize.fragment('foo bar <div>baz<span>quux</span><script>alert("hello!");</script></div>',
393
+ it 'should remove the contents of specified nodes when :remove_contents is an Array or Set of element names as strings' do
394
+ Sanitize.fragment('foo bar <div>baz<span>quux</span> <b>hi</b><script>alert("hello!");</script></div>',
349
395
  :remove_contents => ['script', 'span']
350
- ).must_equal 'foo bar baz '
396
+ ).must_equal 'foo bar baz hi '
397
+
398
+ Sanitize.fragment('foo bar <div>baz<span>quux</span> <b>hi</b><script>alert("hello!");</script></div>',
399
+ :remove_contents => Set.new(['script', 'span'])
400
+ ).must_equal 'foo bar baz hi '
351
401
  end
352
402
 
353
- it 'should remove the contents of specified nodes when :remove_contents is an Array of element names as symbols' do
354
- Sanitize.fragment('foo bar <div>baz<span>quux</span><script>alert("hello!");</script></div>',
403
+ it 'should remove the contents of specified nodes when :remove_contents is an Array or Set of element names as symbols' do
404
+ Sanitize.fragment('foo bar <div>baz<span>quux</span> <b>hi</b><script>alert("hello!");</script></div>',
355
405
  :remove_contents => [:script, :span]
356
- ).must_equal 'foo bar baz '
406
+ ).must_equal 'foo bar baz hi '
407
+
408
+ Sanitize.fragment('foo bar <div>baz<span>quux</span> <b>hi</b><script>alert("hello!");</script></div>',
409
+ :remove_contents => Set.new([:script, :span])
410
+ ).must_equal 'foo bar baz hi '
411
+ end
412
+
413
+ it 'should remove the contents of allowlisted iframes' do
414
+ Sanitize.fragment('<iframe>hi <script>hello</script></iframe>',
415
+ :elements => ['iframe']
416
+ ).must_equal '<iframe></iframe>'
357
417
  end
358
418
 
359
419
  it 'should not allow arbitrary HTML5 data attributes by default' do
@@ -413,7 +473,7 @@ describe 'Sanitize::Transformers::CleanElement' do
413
473
  s.fragment('foo<br>bar<br>baz').must_equal "foo\nbar\nbaz"
414
474
  end
415
475
 
416
- it 'handles protocols correctly regardless of case' do
476
+ it 'should handle protocols correctly regardless of case' do
417
477
  input = '<a href="hTTpS://foo.com/">Text</a>'
418
478
 
419
479
  Sanitize.fragment(input, {
@@ -430,5 +490,56 @@ describe 'Sanitize::Transformers::CleanElement' do
430
490
  :protocols => {'a' => {'href' => ['https']}}
431
491
  }).must_equal "<a>Text</a>"
432
492
  end
493
+
494
+ it 'should sanitize protocols in data attributes even if data attributes are generically allowed' do
495
+ input = '<a data-url="mailto:someone@example.com">Text</a>'
496
+
497
+ Sanitize.fragment(input, {
498
+ :elements => ['a'],
499
+ :attributes => {'a' => [:data]},
500
+ :protocols => {'a' => {'data-url' => ['https']}}
501
+ }).must_equal "<a>Text</a>"
502
+
503
+ Sanitize.fragment(input, {
504
+ :elements => ['a'],
505
+ :attributes => {'a' => [:data]},
506
+ :protocols => {'a' => {'data-url' => ['mailto']}}
507
+ }).must_equal input
508
+ end
509
+
510
+ it 'should prevent `<meta>` tags from being used to set a non-UTF-8 charset' do
511
+ Sanitize.document('<html><head><meta charset="utf-8"></head><body>Howdy!</body></html>',
512
+ :elements => %w[html head meta body],
513
+ :attributes => {'meta' => ['charset']}
514
+ ).must_equal "<html><head><meta charset=\"utf-8\"></head><body>Howdy!</body></html>"
515
+
516
+ Sanitize.document('<html><meta charset="utf-8">Howdy!</html>',
517
+ :elements => %w[html meta],
518
+ :attributes => {'meta' => ['charset']}
519
+ ).must_equal "<html><meta charset=\"utf-8\">Howdy!</html>"
520
+
521
+ Sanitize.document('<html><meta charset="us-ascii">Howdy!</html>',
522
+ :elements => %w[html meta],
523
+ :attributes => {'meta' => ['charset']}
524
+ ).must_equal "<html><meta charset=\"utf-8\">Howdy!</html>"
525
+
526
+ Sanitize.document('<html><meta http-equiv="content-type" content=" text/html; charset=us-ascii">Howdy!</html>',
527
+ :elements => %w[html meta],
528
+ :attributes => {'meta' => %w[content http-equiv]}
529
+ ).must_equal "<html><meta http-equiv=\"content-type\" content=\" text/html;charset=utf-8\">Howdy!</html>"
530
+
531
+ Sanitize.document('<html><meta http-equiv="Content-Type" content="text/plain;charset = us-ascii">Howdy!</html>',
532
+ :elements => %w[html meta],
533
+ :attributes => {'meta' => %w[content http-equiv]}
534
+ ).must_equal "<html><meta http-equiv=\"Content-Type\" content=\"text/plain;charset=utf-8\">Howdy!</html>"
535
+ end
536
+
537
+ it 'should not modify `<meta>` tags that already set a UTF-8 charset' do
538
+ Sanitize.document('<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8"></head><body>Howdy!</body></html>',
539
+ :elements => %w[html head meta body],
540
+ :attributes => {'meta' => %w[content http-equiv]}
541
+ ).must_equal "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"></head><body>Howdy!</body></html>"
542
+ end
543
+
433
544
  end
434
545
  end
@@ -43,7 +43,7 @@ describe 'Malicious HTML' do
43
43
  describe '<body>' do
44
44
  it 'should not be possible to inject JS via a malformed event attribute' do
45
45
  @s.document('<html><head></head><body onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert("XSS")></body></html>').
46
- must_equal "<html><head></head><body></body></html>\n"
46
+ must_equal "<html><head></head><body></body></html>"
47
47
  end
48
48
  end
49
49
 
@@ -65,7 +65,7 @@ describe 'Malicious HTML' do
65
65
 
66
66
  it 'should not be possible to inject <script> via a malformed <img> tag' do
67
67
  @s.fragment('<img """><script>alert("XSS")</script>">').
68
- must_equal '<img>alert("XSS")"&gt;'
68
+ must_equal '<img>"&gt;'
69
69
  end
70
70
 
71
71
  it 'should not be possible to inject protocol-based JS' do
@@ -117,24 +117,26 @@ describe 'Malicious HTML' do
117
117
  describe '<script>' do
118
118
  it 'should not be possible to inject <script> using a malformed non-alphanumeric tag name' do
119
119
  @s.fragment(%[<script/xss src="http://ha.ckers.org/xss.js">alert(1)</script>]).
120
- must_equal 'alert(1)'
120
+ must_equal ''
121
121
  end
122
122
 
123
123
  it 'should not be possible to inject <script> via extraneous open brackets' do
124
124
  @s.fragment(%[<<script>alert("XSS");//<</script>]).
125
- must_equal '&lt;alert("XSS");//&lt;'
125
+ must_equal '&lt;'
126
126
  end
127
127
  end
128
128
 
129
129
  # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
130
130
  # attempt to preserve server-side includes. This can result in XSS since an
131
- # unescaped double quote can allow an attacker to inject a non-whitelisted
131
+ # unescaped double quote can allow an attacker to inject a non-allowlisted
132
132
  # attribute. Sanitize works around this by implementing its own escaping for
133
133
  # affected attributes.
134
134
  #
135
135
  # The relevant libxml2 code is here:
136
136
  # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
137
137
  describe 'unsafe libxml2 server-side includes in attributes' do
138
+ using_unpatched_libxml2 = Nokogiri::VersionInfo.instance.libxml2_using_system?
139
+
138
140
  tag_configs = [
139
141
  {
140
142
  tag_name: 'a',
@@ -166,7 +168,21 @@ describe 'Malicious HTML' do
166
168
  input = %[<#{tag_name} #{attr_name}='examp<!--" onmouseover=alert(1)>-->le.com'>foo</#{tag_name}>]
167
169
 
168
170
  it 'should escape unsafe characters in attributes' do
169
- @s.fragment(input).must_equal(%[<#{tag_name} #{attr_name}="examp<!--%22%20onmouseover=alert(1)>-->le.com">foo</#{tag_name}>])
171
+ skip "behavior should only exist in nokogiri's patched libxml" if using_unpatched_libxml2
172
+
173
+ # This uses Nokogumbo's HTML-compliant serializer rather than
174
+ # libxml2's.
175
+ @s.fragment(input).
176
+ must_equal(%[<#{tag_name} #{attr_name}="examp<!--%22%20onmouseover=alert(1)>-->le.com">foo</#{tag_name}>])
177
+
178
+ # This uses the not-quite-standards-compliant libxml2 serializer via
179
+ # Nokogiri, so the output may be a little different as of Nokogiri
180
+ # 1.10.2 when using Nokogiri's vendored libxml2 due to this patch:
181
+ # https://github.com/sparklemotion/nokogiri/commit/4852e43cb6039e26d8c51af78621e539cbf46c5d
182
+ fragment = Nokogiri::HTML.fragment(input)
183
+ @s.node!(fragment)
184
+ fragment.to_html.
185
+ must_equal(%[<#{tag_name} #{attr_name}="examp&lt;!--%22%20onmouseover=alert(1)&gt;--&gt;le.com">foo</#{tag_name}>])
170
186
  end
171
187
 
172
188
  it 'should round-trip to the same output' do
@@ -179,7 +195,21 @@ describe 'Malicious HTML' do
179
195
  input = %[<#{tag_name} #{attr_name}='examp<!--" onmouseover=alert(1)>-->le.com'>foo</#{tag_name}>]
180
196
 
181
197
  it 'should not escape characters unnecessarily' do
182
- @s.fragment(input).must_equal(input)
198
+ skip "behavior should only exist in nokogiri's patched libxml" if using_unpatched_libxml2
199
+
200
+ # This uses Nokogumbo's HTML-compliant serializer rather than
201
+ # libxml2's.
202
+ @s.fragment(input).
203
+ must_equal(%[<#{tag_name} #{attr_name}="examp<!--&quot; onmouseover=alert(1)>-->le.com">foo</#{tag_name}>])
204
+
205
+ # This uses the not-quite-standards-compliant libxml2 serializer via
206
+ # Nokogiri, so the output may be a little different as of Nokogiri
207
+ # 1.10.2 when using Nokogiri's vendored libxml2 due to this patch:
208
+ # https://github.com/sparklemotion/nokogiri/commit/4852e43cb6039e26d8c51af78621e539cbf46c5d
209
+ fragment = Nokogiri::HTML.fragment(input)
210
+ @s.node!(fragment)
211
+ fragment.to_html.
212
+ must_equal(%[<#{tag_name} #{attr_name}='examp&lt;!--" onmouseover=alert(1)&gt;--&gt;le.com'>foo</#{tag_name}>])
183
213
  end
184
214
 
185
215
  it 'should round-trip to the same output' do
@@ -189,4 +219,17 @@ describe 'Malicious HTML' do
189
219
  end
190
220
  end
191
221
  end
222
+
223
+ # https://github.com/rgrove/sanitize/security/advisories/GHSA-p4x4-rw2p-8j8m
224
+ describe 'foreign content bypass in relaxed config' do
225
+ it 'prevents a sanitization bypass via carefully crafted foreign content' do
226
+ %w[iframe noembed noframes noscript plaintext script style xmp].each do |tag_name|
227
+ @s.fragment(%[<math><#{tag_name}>/*&lt;/#{tag_name}&gt;&lt;img src onerror=alert(1)>*/]).
228
+ must_equal ''
229
+
230
+ @s.fragment(%[<svg><#{tag_name}>/*&lt;/#{tag_name}&gt;&lt;img src onerror=alert(1)>*/]).
231
+ must_equal ''
232
+ end
233
+ end
234
+ end
192
235
  end
data/test/test_parser.rb CHANGED
@@ -19,8 +19,8 @@ describe 'Parser' do
19
19
  end
20
20
 
21
21
  it 'should not have the Nokogiri 1.4.2+ unterminated script/style element bug' do
22
- Sanitize.fragment('foo <script>bar').must_equal 'foo bar'
23
- Sanitize.fragment('foo <style>bar').must_equal 'foo bar'
22
+ Sanitize.fragment('foo <script>bar').must_equal 'foo '
23
+ Sanitize.fragment('foo <style>bar').must_equal 'foo '
24
24
  end
25
25
 
26
26
  it 'ambiguous non-tag brackets like "1 > 2 and 2 < 1" should be parsed correctly' do
@@ -28,35 +28,6 @@ describe 'Parser' do
28
28
  Sanitize.fragment('OMG HAPPY BIRTHDAY! *<:-D').must_equal 'OMG HAPPY BIRTHDAY! *&lt;:-D'
29
29
  end
30
30
 
31
- # https://github.com/sparklemotion/nokogiri/issues/1008
32
- it 'should work around the libxml2 content-type meta tag bug' do
33
- Sanitize.document('<html><head></head><body>Howdy!</body></html>',
34
- :elements => %w[html head body]
35
- ).must_equal "<html><head></head><body>Howdy!</body></html>\n"
36
-
37
- Sanitize.document('<html><head></head><body>Howdy!</body></html>',
38
- :elements => %w[html head meta body]
39
- ).must_equal "<html><head></head><body>Howdy!</body></html>\n"
40
-
41
- Sanitize.document('<html><head><meta charset="utf-8"></head><body>Howdy!</body></html>',
42
- :elements => %w[html head meta body],
43
- :attributes => {'meta' => ['charset']}
44
- ).must_equal "<html><head><meta charset=\"utf-8\"></head><body>Howdy!</body></html>\n"
45
-
46
- Sanitize.document('<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8"></head><body>Howdy!</body></html>',
47
- :elements => %w[html head meta body],
48
- :attributes => {'meta' => %w[charset content http-equiv]}
49
- ).must_equal "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"></head><body>Howdy!</body></html>\n"
50
-
51
- # Edge case: an existing content-type meta tag with a non-UTF-8 content type
52
- # will be converted to UTF-8, since that's the only output encoding we
53
- # support.
54
- Sanitize.document('<html><head><meta http-equiv="content-type" content="text/html;charset=us-ascii"></head><body>Howdy!</body></html>',
55
- :elements => %w[html head meta body],
56
- :attributes => {'meta' => %w[charset content http-equiv]}
57
- ).must_equal "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\"></head><body>Howdy!</body></html>\n"
58
- end
59
-
60
31
  describe 'when siblings are added after a node during traversal' do
61
32
  it 'the added siblings should be traversed' do
62
33
  html = %[
@@ -84,7 +55,7 @@ describe 'Parser' do
84
55
  siblings << env[:node][:id]
85
56
  end
86
57
 
87
- return {:node_whitelist => [env[:node]]}
58
+ return {:node_allowlist => [env[:node]]}
88
59
  })
89
60
 
90
61
  # All siblings should be traversed, and in the order added.
@@ -25,7 +25,7 @@ describe 'Sanitize' do
25
25
 
26
26
  it 'should sanitize an HTML document' do
27
27
  @s.document('<!doctype html><html><b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script></html>')
28
- .must_equal "<html>Lorem ipsum dolor sit amet alert(\"hello world\");</html>\n"
28
+ .must_equal "<html>Lorem ipsum dolor sit amet </html>"
29
29
  end
30
30
 
31
31
  it 'should not modify the input string' do
@@ -35,14 +35,52 @@ describe 'Sanitize' do
35
35
  end
36
36
 
37
37
  it 'should not choke on frozen documents' do
38
- @s.document('<!doctype html><html><b>foo</b>'.freeze).must_equal "<html>foo</html>\n"
38
+ @s.document('<!doctype html><html><b>foo</b>'.freeze).must_equal "<html>foo</html>"
39
+ end
40
+
41
+ it 'should normalize newlines' do
42
+ @s.document("a\r\n\n\r\r\r\nz").must_equal "<html>a\n\n\n\n\nz</html>"
43
+ end
44
+
45
+ it 'should strip control characters (except ASCII whitespace)' do
46
+ sample_control_chars = "\u0001\u0008\u000b\u000e\u001f\u007f\u009f"
47
+ whitespace = "\t\n\f\u0020"
48
+ @s.document("a#{sample_control_chars}#{whitespace}z").must_equal "<html>a#{whitespace}z</html>"
49
+ end
50
+
51
+ it 'should strip non-characters' do
52
+ sample_non_chars = "\ufdd0\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}"
53
+ @s.document("a#{sample_non_chars}z").must_equal "<html>az</html>"
54
+ end
55
+
56
+ describe 'when html body exceeds Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH' do
57
+ let(:content) do
58
+ content = nest_html_content('<b>foo</b>', Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH)
59
+ "<html>#{content}</html>"
60
+ end
61
+
62
+ it 'raises an ArgumentError exception' do
63
+ assert_raises ArgumentError do
64
+ @s.document(content)
65
+ end
66
+ end
67
+
68
+ describe 'and :max_tree_depth of -1 is supplied in :parser_options' do
69
+ before do
70
+ @s = Sanitize.new(elements: ['html'], parser_options: { max_tree_depth: -1 })
71
+ end
72
+
73
+ it 'does not raise an ArgumentError exception' do
74
+ @s.document(content).must_equal '<html>foo</html>'
75
+ end
76
+ end
39
77
  end
40
78
  end
41
79
 
42
80
  describe '#fragment' do
43
81
  it 'should sanitize an HTML fragment' do
44
82
  @s.fragment('<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>')
45
- .must_equal 'Lorem ipsum dolor sit amet alert("hello world");'
83
+ .must_equal 'Lorem ipsum dolor sit amet '
46
84
  end
47
85
 
48
86
  it 'should not modify the input string' do
@@ -61,6 +99,44 @@ describe 'Sanitize' do
61
99
  it 'should not choke on frozen fragments' do
62
100
  @s.fragment('<b>foo</b>'.freeze).must_equal 'foo'
63
101
  end
102
+
103
+ it 'should normalize newlines' do
104
+ @s.fragment("a\r\n\n\r\r\r\nz").must_equal "a\n\n\n\n\nz"
105
+ end
106
+
107
+ it 'should strip control characters (except ASCII whitespace)' do
108
+ sample_control_chars = "\u0001\u0008\u000b\u000e\u001f\u007f\u009f"
109
+ whitespace = "\t\n\f\u0020"
110
+ @s.fragment("a#{sample_control_chars}#{whitespace}z").must_equal "a#{whitespace}z"
111
+ end
112
+
113
+ it 'should strip non-characters' do
114
+ sample_non_chars = "\ufdd0\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}"
115
+ @s.fragment("a#{sample_non_chars}z").must_equal "az"
116
+ end
117
+
118
+ describe 'when html body exceeds Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH' do
119
+ let(:content) do
120
+ content = nest_html_content('<b>foo</b>', Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH)
121
+ "<body>#{content}</body>"
122
+ end
123
+
124
+ it 'raises an ArgumentError exception' do
125
+ assert_raises ArgumentError do
126
+ @s.fragment(content)
127
+ end
128
+ end
129
+
130
+ describe 'and :max_tree_depth of -1 is supplied in :parser_options' do
131
+ before do
132
+ @s = Sanitize.new(parser_options: { max_tree_depth: -1 })
133
+ end
134
+
135
+ it 'does not raise an ArgumentError exception' do
136
+ @s.fragment(content).must_equal 'foo'
137
+ end
138
+ end
139
+ end
64
140
  end
65
141
 
66
142
  describe '#node!' do
@@ -71,10 +147,10 @@ describe 'Sanitize' do
71
147
  doc.xpath('/html/body/node()').each {|node| frag << node }
72
148
 
73
149
  @s.node!(frag)
74
- frag.to_html.must_equal 'Lorem ipsum dolor sit amet alert("hello world");'
150
+ frag.to_html.must_equal 'Lorem ipsum dolor sit amet '
75
151
  end
76
152
 
77
- describe "when the given node is a document and <html> isn't whitelisted" do
153
+ describe "when the given node is a document and <html> isn't allowlisted" do
78
154
  it 'should raise a Sanitize::Error' do
79
155
  doc = Nokogiri::HTML5.parse('foo')
80
156
  proc { @s.node!(doc) }.must_raise Sanitize::Error
@@ -85,28 +161,37 @@ describe 'Sanitize' do
85
161
 
86
162
  describe 'class methods' do
87
163
  describe '.document' do
88
- it 'should call #document' do
89
- Sanitize.stub_instance(:document, proc {|html| html + ' called' }) do
90
- Sanitize.document('<html>foo</html>')
91
- .must_equal '<html>foo</html> called'
92
- end
164
+ it 'should sanitize an HTML document with the given config' do
165
+ html = '<!doctype html><html><b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script></html>'
166
+ Sanitize.document(html, :elements => ['html'])
167
+ .must_equal "<html>Lorem ipsum dolor sit amet </html>"
93
168
  end
94
169
  end
95
170
 
96
171
  describe '.fragment' do
97
- it 'should call #fragment' do
98
- Sanitize.stub_instance(:fragment, proc {|html| html + ' called' }) do
99
- Sanitize.fragment('<b>foo</b>').must_equal '<b>foo</b> called'
100
- end
172
+ it 'should sanitize an HTML fragment with the given config' do
173
+ html = '<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>'
174
+ Sanitize.fragment(html, :elements => ['strong'])
175
+ .must_equal 'Lorem ipsum <strong>dolor</strong> sit amet '
101
176
  end
102
177
  end
103
178
 
104
179
  describe '.node!' do
105
- it 'should call #node!' do
106
- Sanitize.stub_instance(:node!, proc {|input| input + ' called' }) do
107
- Sanitize.node!('not really a node').must_equal 'not really a node called'
108
- end
180
+ it 'should sanitize a Nokogiri::XML::Node with the given config' do
181
+ doc = Nokogiri::HTML5.parse('<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>')
182
+ frag = doc.fragment
183
+
184
+ doc.xpath('/html/body/node()').each {|node| frag << node }
185
+
186
+ Sanitize.node!(frag, :elements => ['strong'])
187
+ frag.to_html.must_equal 'Lorem ipsum <strong>dolor</strong> sit amet '
109
188
  end
110
189
  end
111
190
  end
191
+
192
+ private
193
+
194
+ def nest_html_content(html_content, depth)
195
+ "#{'<span>' * depth}#{html_content}#{'</span>' * depth}"
196
+ end
112
197
  end