sanitize 4.6.3 → 5.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sanitize might be problematic. Click here for more details.

data/test/common.rb CHANGED
@@ -1,34 +1,3 @@
1
1
  # encoding: utf-8
2
- gem 'minitest'
3
2
  require 'minitest/autorun'
4
-
5
3
  require_relative '../lib/sanitize'
6
-
7
- # Helper to stub an instance method. Shamelessly stolen from
8
- # https://github.com/codeodor/minitest-stub_any_instance/
9
- class Object
10
- def self.stub_instance(name, value, &block)
11
- old_method = "__stubbed_method_#{name}__"
12
-
13
- class_eval do
14
- alias_method old_method, name
15
-
16
- define_method(name) do |*args|
17
- if value.respond_to?(:call) then
18
- value.call(*args)
19
- else
20
- value
21
- end
22
- end
23
- end
24
-
25
- yield
26
-
27
- ensure
28
- class_eval do
29
- undef_method name
30
- alias_method name, old_method
31
- undef_method old_method
32
- end
33
- end
34
- end
@@ -20,7 +20,7 @@ describe 'Sanitize::Transformers::CleanComment' do
20
20
 
21
21
  # Special case: the comment markup is inside a <script>, which makes it
22
22
  # text content and not an actual HTML comment.
23
- @s.fragment("<script><!-- comment --></script>").must_equal '&lt;!-- comment --&gt;'
23
+ @s.fragment("<script><!-- comment --></script>").must_equal ''
24
24
 
25
25
  Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => false, :elements => ['script'])
26
26
  .must_equal '<script><!-- comment --></script>'
@@ -40,10 +40,6 @@ describe 'Sanitize::Transformers::CleanComment' do
40
40
  @s.fragment("foo <!-- <!-- <!-- --> --> -->bar").must_equal 'foo <!-- <!-- <!-- --> --&gt; --&gt;bar'
41
41
  @s.fragment("foo <div <!-- comment -->>bar</div>").must_equal 'foo <div>&gt;bar</div>'
42
42
 
43
- # Special case: the comment markup is inside a <script>, which makes it
44
- # text content and not an actual HTML comment.
45
- @s.fragment("<script><!-- comment --></script>").must_equal '&lt;!-- comment --&gt;'
46
-
47
43
  Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => true, :elements => ['script'])
48
44
  .must_equal '<script><!-- comment --></script>'
49
45
  end
@@ -13,7 +13,7 @@ describe 'Sanitize::Transformers::CSS::CleanAttribute' do
13
13
  @s.fragment(%[
14
14
  <div style="color: #fff; width: expression(alert(1)); /* <-- evil! */"></div>
15
15
  ].strip).must_equal %[
16
- <div style="color: #fff; /* &lt;-- evil! */"></div>
16
+ <div style="color: #fff; /* <-- evil! */"></div>
17
17
  ].strip
18
18
  end
19
19
 
@@ -11,7 +11,7 @@ describe 'Sanitize::Transformers::CleanDoctype' do
11
11
  end
12
12
 
13
13
  it 'should remove doctype declarations' do
14
- @s.document('<!DOCTYPE html><html>foo</html>').must_equal "<html>foo</html>\n"
14
+ @s.document('<!DOCTYPE html><html>foo</html>').must_equal "<html>foo</html>"
15
15
  @s.fragment('<!DOCTYPE html>foo').must_equal 'foo'
16
16
  end
17
17
 
@@ -34,27 +34,27 @@ describe 'Sanitize::Transformers::CleanDoctype' do
34
34
 
35
35
  it 'should allow doctype declarations in documents' do
36
36
  @s.document('<!DOCTYPE html><html>foo</html>')
37
- .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
37
+ .must_equal "<!DOCTYPE html><html>foo</html>"
38
38
 
39
39
  @s.document('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
40
- .must_equal "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01//EN\">\n<html>foo</html>\n"
40
+ .must_equal "<!DOCTYPE html><html>foo</html>"
41
41
 
42
42
  @s.document("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>")
43
- .must_equal "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html>foo</html>\n"
43
+ .must_equal "<!DOCTYPE html><html>foo</html>"
44
44
  end
45
45
 
46
46
  it 'should not allow obviously invalid doctype declarations in documents' do
47
47
  @s.document('<!DOCTYPE blah blah blah><html>foo</html>')
48
- .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
48
+ .must_equal "<!DOCTYPE html><html>foo</html>"
49
49
 
50
50
  @s.document('<!DOCTYPE blah><html>foo</html>')
51
- .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
51
+ .must_equal "<!DOCTYPE html><html>foo</html>"
52
52
 
53
53
  @s.document('<!DOCTYPE html BLAH "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
54
- .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
54
+ .must_equal "<!DOCTYPE html><html>foo</html>"
55
55
 
56
56
  @s.document('<!whatever><html>foo</html>')
57
- .must_equal "<html>foo</html>\n"
57
+ .must_equal "<html>foo</html>"
58
58
  end
59
59
 
60
60
  it 'should not allow doctype definitions in fragments' do
@@ -8,25 +8,22 @@ describe 'Sanitize::Transformers::CleanElement' do
8
8
  strings = {
9
9
  :basic => {
10
10
  :html => '<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo" style="text-decoration: underline;">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <style>.foo { color: #fff; }</style> <script>alert("hello world");</script>',
11
-
12
- :default => 'Lorem ipsum dolor sit amet .foo { color: #fff; } alert("hello world");',
13
- :restricted => '<b>Lorem</b> ipsum <strong>dolor</strong> sit amet .foo { color: #fff; } alert("hello world");',
14
- :basic => '<b>Lorem</b> <a href="pants" rel="nofollow">ipsum</a> <a href="http://foo.com/" rel="nofollow"><strong>dolor</strong></a> sit<br>amet .foo { color: #fff; } alert("hello world");',
15
- :relaxed => '<b>Lorem</b> <a href="pants" title="foo" style="text-decoration: underline;">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br>amet <style>.foo { color: #fff; }</style> alert("hello world");'
11
+ :default => 'Lorem ipsum dolor sit amet ',
12
+ :restricted => '<b>Lorem</b> ipsum <strong>dolor</strong> sit amet ',
13
+ :basic => '<b>Lorem</b> <a href="pants" rel="nofollow">ipsum</a> <a href="http://foo.com/" rel="nofollow"><strong>dolor</strong></a> sit<br>amet ',
14
+ :relaxed => '<b>Lorem</b> <a href="pants" title="foo" style="text-decoration: underline;">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br>amet <style>.foo { color: #fff; }</style> '
16
15
  },
17
16
 
18
17
  :malformed => {
19
18
  :html => 'Lo<!-- comment -->rem</b> <a href=pants title="foo>ipsum <a href="http://foo.com/"><strong>dolor</a></strong> sit<br/>amet <script>alert("hello world");',
20
-
21
- :default => 'Lorem dolor sit amet alert("hello world");',
22
- :restricted => 'Lorem <strong>dolor</strong> sit amet alert("hello world");',
23
- :basic => 'Lorem <a href="pants" rel="nofollow"><strong>dolor</strong></a> sit<br>amet alert("hello world");',
24
- :relaxed => 'Lorem <a href="pants" title="foo&gt;ipsum &lt;a href="><strong>dolor</strong></a> sit<br>amet alert("hello world");',
19
+ :default => 'Lorem dolor sit amet ',
20
+ :restricted => 'Lorem <strong>dolor</strong> sit amet ',
21
+ :basic => 'Lorem <a href="pants" rel="nofollow"><strong>dolor</strong></a> sit<br>amet ',
22
+ :relaxed => 'Lorem <a href="pants" title="foo>ipsum <a href="><strong>dolor</strong></a> sit<br>amet ',
25
23
  },
26
24
 
27
25
  :unclosed => {
28
26
  :html => '<p>a</p><blockquote>b',
29
-
30
27
  :default => ' a b ',
31
28
  :restricted => ' a b ',
32
29
  :basic => '<p>a</p><blockquote>b</blockquote>',
@@ -35,7 +32,6 @@ describe 'Sanitize::Transformers::CleanElement' do
35
32
 
36
33
  :malicious => {
37
34
  :html => '<b>Lo<!-- comment -->rem</b> <a href="javascript:pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <<foo>script>alert("hello world");</script>',
38
-
39
35
  :default => 'Lorem ipsum dolor sit amet &lt;script&gt;alert("hello world");',
40
36
  :restricted => '<b>Lorem</b> ipsum <strong>dolor</strong> sit amet &lt;script&gt;alert("hello world");',
41
37
  :basic => '<b>Lorem</b> <a rel="nofollow">ipsum</a> <a href="http://foo.com/" rel="nofollow"><strong>dolor</strong></a> sit<br>amet &lt;script&gt;alert("hello world");',
@@ -171,10 +167,10 @@ describe 'Sanitize::Transformers::CleanElement' do
171
167
  .must_equal 'foo bar baz quux'
172
168
 
173
169
  Sanitize.fragment('<script>alert("<xss>");</script>')
174
- .must_equal 'alert("&lt;xss&gt;");'
170
+ .must_equal ''
175
171
 
176
172
  Sanitize.fragment('<<script>script>alert("<xss>");</<script>>')
177
- .must_equal '&lt;script&gt;alert("&lt;xss&gt;");&lt;/&lt;script&gt;&gt;'
173
+ .must_equal '&lt;'
178
174
 
179
175
  Sanitize.fragment('< script <>> alert("<xss>");</script>')
180
176
  .must_equal '&lt; script &lt;&gt;&gt; alert("");'
@@ -196,6 +192,46 @@ describe 'Sanitize::Transformers::CleanElement' do
196
192
  .must_equal ''
197
193
  end
198
194
 
195
+ it 'should escape the content of removed `plaintext` elements' do
196
+ Sanitize.fragment('<plaintext>hello! <script>alert(0)</script>')
197
+ .must_equal 'hello! &lt;script&gt;alert(0)&lt;/script&gt;'
198
+ end
199
+
200
+ it 'should escape the content of removed `xmp` elements' do
201
+ Sanitize.fragment('<xmp>hello! <script>alert(0)</script></xmp>')
202
+ .must_equal 'hello! &lt;script&gt;alert(0)&lt;/script&gt;'
203
+ end
204
+
205
+ it 'should not preserve the content of removed `iframe` elements' do
206
+ Sanitize.fragment('<iframe>hello! <script>alert(0)</script></iframe>')
207
+ .must_equal ''
208
+ end
209
+
210
+ it 'should not preserve the content of removed `noembed` elements' do
211
+ Sanitize.fragment('<noembed>hello! <script>alert(0)</script></noembed>')
212
+ .must_equal ''
213
+ end
214
+
215
+ it 'should not preserve the content of removed `noframes` elements' do
216
+ Sanitize.fragment('<noframes>hello! <script>alert(0)</script></noframes>')
217
+ .must_equal ''
218
+ end
219
+
220
+ it 'should not preserve the content of removed `noscript` elements' do
221
+ Sanitize.fragment('<noscript>hello! <script>alert(0)</script></noscript>')
222
+ .must_equal ''
223
+ end
224
+
225
+ it 'should not preserve the content of removed `script` elements' do
226
+ Sanitize.fragment('<script>hello! <script>alert(0)</script></script>')
227
+ .must_equal ''
228
+ end
229
+
230
+ it 'should not preserve the content of removed `style` elements' do
231
+ Sanitize.fragment('<style>hello! <script>alert(0)</script></style>')
232
+ .must_equal ''
233
+ end
234
+
199
235
  strings.each do |name, data|
200
236
  it "should clean #{name} HTML" do
201
237
  Sanitize.fragment(data[:html]).must_equal(data[:default])
@@ -234,7 +270,7 @@ describe 'Sanitize::Transformers::CleanElement' do
234
270
 
235
271
  it 'should not choke on valueless attributes' do
236
272
  @s.fragment('foo <a href>foo</a> bar')
237
- .must_equal 'foo <a href rel="nofollow">foo</a> bar'
273
+ .must_equal 'foo <a href="" rel="nofollow">foo</a> bar'
238
274
  end
239
275
 
240
276
  it 'should downcase attribute names' do
@@ -262,7 +298,7 @@ describe 'Sanitize::Transformers::CleanElement' do
262
298
 
263
299
  it 'should encode special chars in attribute values' do
264
300
  @s.fragment('<a href="http://example.com" title="<b>&eacute;xamples</b> & things">foo</a>')
265
- .must_equal '<a href="http://example.com" title="&lt;b&gt;éxamples&lt;/b&gt; &amp; things">foo</a>'
301
+ .must_equal '<a href="http://example.com" title="<b>éxamples</b> &amp; things">foo</a>'
266
302
  end
267
303
 
268
304
  strings.each do |name, data|
@@ -344,16 +380,30 @@ describe 'Sanitize::Transformers::CleanElement' do
344
380
  ).must_equal 'foo bar '
345
381
  end
346
382
 
347
- it 'should remove the contents of specified nodes when :remove_contents is an Array of element names as strings' do
348
- Sanitize.fragment('foo bar <div>baz<span>quux</span><script>alert("hello!");</script></div>',
383
+ it 'should remove the contents of specified nodes when :remove_contents is an Array or Set of element names as strings' do
384
+ Sanitize.fragment('foo bar <div>baz<span>quux</span> <b>hi</b><script>alert("hello!");</script></div>',
349
385
  :remove_contents => ['script', 'span']
350
- ).must_equal 'foo bar baz '
386
+ ).must_equal 'foo bar baz hi '
387
+
388
+ Sanitize.fragment('foo bar <div>baz<span>quux</span> <b>hi</b><script>alert("hello!");</script></div>',
389
+ :remove_contents => Set.new(['script', 'span'])
390
+ ).must_equal 'foo bar baz hi '
351
391
  end
352
392
 
353
- it 'should remove the contents of specified nodes when :remove_contents is an Array of element names as symbols' do
354
- Sanitize.fragment('foo bar <div>baz<span>quux</span><script>alert("hello!");</script></div>',
393
+ it 'should remove the contents of specified nodes when :remove_contents is an Array or Set of element names as symbols' do
394
+ Sanitize.fragment('foo bar <div>baz<span>quux</span> <b>hi</b><script>alert("hello!");</script></div>',
355
395
  :remove_contents => [:script, :span]
356
- ).must_equal 'foo bar baz '
396
+ ).must_equal 'foo bar baz hi '
397
+
398
+ Sanitize.fragment('foo bar <div>baz<span>quux</span> <b>hi</b><script>alert("hello!");</script></div>',
399
+ :remove_contents => Set.new([:script, :span])
400
+ ).must_equal 'foo bar baz hi '
401
+ end
402
+
403
+ it 'should remove the contents of whitelisted iframes' do
404
+ Sanitize.fragment('<iframe>hi <script>hello</script></iframe>',
405
+ :elements => ['iframe']
406
+ ).must_equal '<iframe></iframe>'
357
407
  end
358
408
 
359
409
  it 'should not allow arbitrary HTML5 data attributes by default' do
@@ -413,7 +463,7 @@ describe 'Sanitize::Transformers::CleanElement' do
413
463
  s.fragment('foo<br>bar<br>baz').must_equal "foo\nbar\nbaz"
414
464
  end
415
465
 
416
- it 'handles protocols correctly regardless of case' do
466
+ it 'should handle protocols correctly regardless of case' do
417
467
  input = '<a href="hTTpS://foo.com/">Text</a>'
418
468
 
419
469
  Sanitize.fragment(input, {
@@ -430,5 +480,40 @@ describe 'Sanitize::Transformers::CleanElement' do
430
480
  :protocols => {'a' => {'href' => ['https']}}
431
481
  }).must_equal "<a>Text</a>"
432
482
  end
483
+
484
+ it 'should prevent `<meta>` tags from being used to set a non-UTF-8 charset' do
485
+ Sanitize.document('<html><head><meta charset="utf-8"></head><body>Howdy!</body></html>',
486
+ :elements => %w[html head meta body],
487
+ :attributes => {'meta' => ['charset']}
488
+ ).must_equal "<html><head><meta charset=\"utf-8\"></head><body>Howdy!</body></html>"
489
+
490
+ Sanitize.document('<html><meta charset="utf-8">Howdy!</html>',
491
+ :elements => %w[html meta],
492
+ :attributes => {'meta' => ['charset']}
493
+ ).must_equal "<html><meta charset=\"utf-8\">Howdy!</html>"
494
+
495
+ Sanitize.document('<html><meta charset="us-ascii">Howdy!</html>',
496
+ :elements => %w[html meta],
497
+ :attributes => {'meta' => ['charset']}
498
+ ).must_equal "<html><meta charset=\"utf-8\">Howdy!</html>"
499
+
500
+ Sanitize.document('<html><meta http-equiv="content-type" content=" text/html; charset=us-ascii">Howdy!</html>',
501
+ :elements => %w[html meta],
502
+ :attributes => {'meta' => %w[content http-equiv]}
503
+ ).must_equal "<html><meta http-equiv=\"content-type\" content=\" text/html;charset=utf-8\">Howdy!</html>"
504
+
505
+ Sanitize.document('<html><meta http-equiv="Content-Type" content="text/plain;charset = us-ascii">Howdy!</html>',
506
+ :elements => %w[html meta],
507
+ :attributes => {'meta' => %w[content http-equiv]}
508
+ ).must_equal "<html><meta http-equiv=\"Content-Type\" content=\"text/plain;charset=utf-8\">Howdy!</html>"
509
+ end
510
+
511
+ it 'should not modify `<meta>` tags that already set a UTF-8 charset' do
512
+ Sanitize.document('<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8"></head><body>Howdy!</body></html>',
513
+ :elements => %w[html head meta body],
514
+ :attributes => {'meta' => %w[content http-equiv]}
515
+ ).must_equal "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"></head><body>Howdy!</body></html>"
516
+ end
517
+
433
518
  end
434
519
  end
@@ -43,7 +43,7 @@ describe 'Malicious HTML' do
43
43
  describe '<body>' do
44
44
  it 'should not be possible to inject JS via a malformed event attribute' do
45
45
  @s.document('<html><head></head><body onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert("XSS")></body></html>').
46
- must_equal "<html><head></head><body></body></html>\n"
46
+ must_equal "<html><head></head><body></body></html>"
47
47
  end
48
48
  end
49
49
 
@@ -65,7 +65,7 @@ describe 'Malicious HTML' do
65
65
 
66
66
  it 'should not be possible to inject <script> via a malformed <img> tag' do
67
67
  @s.fragment('<img """><script>alert("XSS")</script>">').
68
- must_equal '<img>alert("XSS")"&gt;'
68
+ must_equal '<img>"&gt;'
69
69
  end
70
70
 
71
71
  it 'should not be possible to inject protocol-based JS' do
@@ -117,12 +117,12 @@ describe 'Malicious HTML' do
117
117
  describe '<script>' do
118
118
  it 'should not be possible to inject <script> using a malformed non-alphanumeric tag name' do
119
119
  @s.fragment(%[<script/xss src="http://ha.ckers.org/xss.js">alert(1)</script>]).
120
- must_equal 'alert(1)'
120
+ must_equal ''
121
121
  end
122
122
 
123
123
  it 'should not be possible to inject <script> via extraneous open brackets' do
124
124
  @s.fragment(%[<<script>alert("XSS");//<</script>]).
125
- must_equal '&lt;alert("XSS");//&lt;'
125
+ must_equal '&lt;'
126
126
  end
127
127
  end
128
128
 
@@ -166,7 +166,19 @@ describe 'Malicious HTML' do
166
166
  input = %[<#{tag_name} #{attr_name}='examp<!--" onmouseover=alert(1)>-->le.com'>foo</#{tag_name}>]
167
167
 
168
168
  it 'should escape unsafe characters in attributes' do
169
- @s.fragment(input).must_equal(%[<#{tag_name} #{attr_name}="examp<!--%22%20onmouseover=alert(1)>-->le.com">foo</#{tag_name}>])
169
+ # This uses Nokogumbo's HTML-compliant serializer rather than
170
+ # libxml2's.
171
+ @s.fragment(input).
172
+ must_equal(%[<#{tag_name} #{attr_name}="examp<!--%22%20onmouseover=alert(1)>-->le.com">foo</#{tag_name}>])
173
+
174
+ # This uses the not-quite-standards-compliant libxml2 serializer via
175
+ # Nokogiri, so the output may be a little different as of Nokogiri
176
+ # 1.10.2 when using Nokogiri's vendored libxml2 due to this patch:
177
+ # https://github.com/sparklemotion/nokogiri/commit/4852e43cb6039e26d8c51af78621e539cbf46c5d
178
+ fragment = Nokogiri::HTML.fragment(input)
179
+ @s.node!(fragment)
180
+ fragment.to_html.
181
+ must_equal(%[<#{tag_name} #{attr_name}="examp&lt;!--%22%20onmouseover=alert(1)&gt;--&gt;le.com">foo</#{tag_name}>])
170
182
  end
171
183
 
172
184
  it 'should round-trip to the same output' do
@@ -179,7 +191,19 @@ describe 'Malicious HTML' do
179
191
  input = %[<#{tag_name} #{attr_name}='examp<!--" onmouseover=alert(1)>-->le.com'>foo</#{tag_name}>]
180
192
 
181
193
  it 'should not escape characters unnecessarily' do
182
- @s.fragment(input).must_equal(input)
194
+ # This uses Nokogumbo's HTML-compliant serializer rather than
195
+ # libxml2's.
196
+ @s.fragment(input).
197
+ must_equal(%[<#{tag_name} #{attr_name}="examp<!--&quot; onmouseover=alert(1)>-->le.com">foo</#{tag_name}>])
198
+
199
+ # This uses the not-quite-standards-compliant libxml2 serializer via
200
+ # Nokogiri, so the output may be a little different as of Nokogiri
201
+ # 1.10.2 when using Nokogiri's vendored libxml2 due to this patch:
202
+ # https://github.com/sparklemotion/nokogiri/commit/4852e43cb6039e26d8c51af78621e539cbf46c5d
203
+ fragment = Nokogiri::HTML.fragment(input)
204
+ @s.node!(fragment)
205
+ fragment.to_html.
206
+ must_equal(%[<#{tag_name} #{attr_name}='examp&lt;!--" onmouseover=alert(1)&gt;--&gt;le.com'>foo</#{tag_name}>])
183
207
  end
184
208
 
185
209
  it 'should round-trip to the same output' do
data/test/test_parser.rb CHANGED
@@ -19,8 +19,8 @@ describe 'Parser' do
19
19
  end
20
20
 
21
21
  it 'should not have the Nokogiri 1.4.2+ unterminated script/style element bug' do
22
- Sanitize.fragment('foo <script>bar').must_equal 'foo bar'
23
- Sanitize.fragment('foo <style>bar').must_equal 'foo bar'
22
+ Sanitize.fragment('foo <script>bar').must_equal 'foo '
23
+ Sanitize.fragment('foo <style>bar').must_equal 'foo '
24
24
  end
25
25
 
26
26
  it 'ambiguous non-tag brackets like "1 > 2 and 2 < 1" should be parsed correctly' do
@@ -28,35 +28,6 @@ describe 'Parser' do
28
28
  Sanitize.fragment('OMG HAPPY BIRTHDAY! *<:-D').must_equal 'OMG HAPPY BIRTHDAY! *&lt;:-D'
29
29
  end
30
30
 
31
- # https://github.com/sparklemotion/nokogiri/issues/1008
32
- it 'should work around the libxml2 content-type meta tag bug' do
33
- Sanitize.document('<html><head></head><body>Howdy!</body></html>',
34
- :elements => %w[html head body]
35
- ).must_equal "<html><head></head><body>Howdy!</body></html>\n"
36
-
37
- Sanitize.document('<html><head></head><body>Howdy!</body></html>',
38
- :elements => %w[html head meta body]
39
- ).must_equal "<html><head></head><body>Howdy!</body></html>\n"
40
-
41
- Sanitize.document('<html><head><meta charset="utf-8"></head><body>Howdy!</body></html>',
42
- :elements => %w[html head meta body],
43
- :attributes => {'meta' => ['charset']}
44
- ).must_equal "<html><head><meta charset=\"utf-8\"></head><body>Howdy!</body></html>\n"
45
-
46
- Sanitize.document('<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8"></head><body>Howdy!</body></html>',
47
- :elements => %w[html head meta body],
48
- :attributes => {'meta' => %w[charset content http-equiv]}
49
- ).must_equal "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"></head><body>Howdy!</body></html>\n"
50
-
51
- # Edge case: an existing content-type meta tag with a non-UTF-8 content type
52
- # will be converted to UTF-8, since that's the only output encoding we
53
- # support.
54
- Sanitize.document('<html><head><meta http-equiv="content-type" content="text/html;charset=us-ascii"></head><body>Howdy!</body></html>',
55
- :elements => %w[html head meta body],
56
- :attributes => {'meta' => %w[charset content http-equiv]}
57
- ).must_equal "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\"></head><body>Howdy!</body></html>\n"
58
- end
59
-
60
31
  describe 'when siblings are added after a node during traversal' do
61
32
  it 'the added siblings should be traversed' do
62
33
  html = %[
@@ -25,7 +25,7 @@ describe 'Sanitize' do
25
25
 
26
26
  it 'should sanitize an HTML document' do
27
27
  @s.document('<!doctype html><html><b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script></html>')
28
- .must_equal "<html>Lorem ipsum dolor sit amet alert(\"hello world\");</html>\n"
28
+ .must_equal "<html>Lorem ipsum dolor sit amet </html>"
29
29
  end
30
30
 
31
31
  it 'should not modify the input string' do
@@ -35,14 +35,52 @@ describe 'Sanitize' do
35
35
  end
36
36
 
37
37
  it 'should not choke on frozen documents' do
38
- @s.document('<!doctype html><html><b>foo</b>'.freeze).must_equal "<html>foo</html>\n"
38
+ @s.document('<!doctype html><html><b>foo</b>'.freeze).must_equal "<html>foo</html>"
39
+ end
40
+
41
+ it 'should normalize newlines' do
42
+ @s.document("a\r\n\n\r\r\r\nz").must_equal "<html>a\n\n\n\n\nz</html>"
43
+ end
44
+
45
+ it 'should strip control characters (except ASCII whitespace)' do
46
+ sample_control_chars = "\u0001\u0008\u000b\u000e\u001f\u007f\u009f"
47
+ whitespace = "\t\n\f\u0020"
48
+ @s.document("a#{sample_control_chars}#{whitespace}z").must_equal "<html>a#{whitespace}z</html>"
49
+ end
50
+
51
+ it 'should strip non-characters' do
52
+ sample_non_chars = "\ufdd0\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}"
53
+ @s.document("a#{sample_non_chars}z").must_equal "<html>az</html>"
54
+ end
55
+
56
+ describe 'when html body exceeds Nokogumbo::DEFAULT_MAX_TREE_DEPTH' do
57
+ let(:content) do
58
+ content = nest_html_content('<b>foo</b>', Nokogumbo::DEFAULT_MAX_TREE_DEPTH)
59
+ "<html>#{content}</html>"
60
+ end
61
+
62
+ it 'raises an ArgumentError exception' do
63
+ assert_raises ArgumentError do
64
+ @s.document(content)
65
+ end
66
+ end
67
+
68
+ describe 'and :max_tree_depth of -1 is supplied in :parser_options' do
69
+ before do
70
+ @s = Sanitize.new(elements: ['html'], parser_options: { max_tree_depth: -1 })
71
+ end
72
+
73
+ it 'does not raise an ArgumentError exception' do
74
+ @s.document(content).must_equal '<html>foo</html>'
75
+ end
76
+ end
39
77
  end
40
78
  end
41
79
 
42
80
  describe '#fragment' do
43
81
  it 'should sanitize an HTML fragment' do
44
82
  @s.fragment('<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>')
45
- .must_equal 'Lorem ipsum dolor sit amet alert("hello world");'
83
+ .must_equal 'Lorem ipsum dolor sit amet '
46
84
  end
47
85
 
48
86
  it 'should not modify the input string' do
@@ -61,6 +99,44 @@ describe 'Sanitize' do
61
99
  it 'should not choke on frozen fragments' do
62
100
  @s.fragment('<b>foo</b>'.freeze).must_equal 'foo'
63
101
  end
102
+
103
+ it 'should normalize newlines' do
104
+ @s.fragment("a\r\n\n\r\r\r\nz").must_equal "a\n\n\n\n\nz"
105
+ end
106
+
107
+ it 'should strip control characters (except ASCII whitespace)' do
108
+ sample_control_chars = "\u0001\u0008\u000b\u000e\u001f\u007f\u009f"
109
+ whitespace = "\t\n\f\u0020"
110
+ @s.fragment("a#{sample_control_chars}#{whitespace}z").must_equal "a#{whitespace}z"
111
+ end
112
+
113
+ it 'should strip non-characters' do
114
+ sample_non_chars = "\ufdd0\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}"
115
+ @s.fragment("a#{sample_non_chars}z").must_equal "az"
116
+ end
117
+
118
+ describe 'when html body exceeds Nokogumbo::DEFAULT_MAX_TREE_DEPTH' do
119
+ let(:content) do
120
+ content = nest_html_content('<b>foo</b>', Nokogumbo::DEFAULT_MAX_TREE_DEPTH)
121
+ "<body>#{content}</body>"
122
+ end
123
+
124
+ it 'raises an ArgumentError exception' do
125
+ assert_raises ArgumentError do
126
+ @s.fragment(content)
127
+ end
128
+ end
129
+
130
+ describe 'and :max_tree_depth of -1 is supplied in :parser_options' do
131
+ before do
132
+ @s = Sanitize.new(parser_options: { max_tree_depth: -1 })
133
+ end
134
+
135
+ it 'does not raise an ArgumentError exception' do
136
+ @s.fragment(content).must_equal 'foo'
137
+ end
138
+ end
139
+ end
64
140
  end
65
141
 
66
142
  describe '#node!' do
@@ -71,7 +147,7 @@ describe 'Sanitize' do
71
147
  doc.xpath('/html/body/node()').each {|node| frag << node }
72
148
 
73
149
  @s.node!(frag)
74
- frag.to_html.must_equal 'Lorem ipsum dolor sit amet alert("hello world");'
150
+ frag.to_html.must_equal 'Lorem ipsum dolor sit amet '
75
151
  end
76
152
 
77
153
  describe "when the given node is a document and <html> isn't whitelisted" do
@@ -85,28 +161,37 @@ describe 'Sanitize' do
85
161
 
86
162
  describe 'class methods' do
87
163
  describe '.document' do
88
- it 'should call #document' do
89
- Sanitize.stub_instance(:document, proc {|html| html + ' called' }) do
90
- Sanitize.document('<html>foo</html>')
91
- .must_equal '<html>foo</html> called'
92
- end
164
+ it 'should sanitize an HTML document with the given config' do
165
+ html = '<!doctype html><html><b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script></html>'
166
+ Sanitize.document(html, :elements => ['html'])
167
+ .must_equal "<html>Lorem ipsum dolor sit amet </html>"
93
168
  end
94
169
  end
95
170
 
96
171
  describe '.fragment' do
97
- it 'should call #fragment' do
98
- Sanitize.stub_instance(:fragment, proc {|html| html + ' called' }) do
99
- Sanitize.fragment('<b>foo</b>').must_equal '<b>foo</b> called'
100
- end
172
+ it 'should sanitize an HTML fragment with the given config' do
173
+ html = '<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>'
174
+ Sanitize.fragment(html, :elements => ['strong'])
175
+ .must_equal 'Lorem ipsum <strong>dolor</strong> sit amet '
101
176
  end
102
177
  end
103
178
 
104
179
  describe '.node!' do
105
- it 'should call #node!' do
106
- Sanitize.stub_instance(:node!, proc {|input| input + ' called' }) do
107
- Sanitize.node!('not really a node').must_equal 'not really a node called'
108
- end
180
+ it 'should sanitize a Nokogiri::XML::Node with the given config' do
181
+ doc = Nokogiri::HTML5.parse('<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>')
182
+ frag = doc.fragment
183
+
184
+ doc.xpath('/html/body/node()').each {|node| frag << node }
185
+
186
+ Sanitize.node!(frag, :elements => ['strong'])
187
+ frag.to_html.must_equal 'Lorem ipsum <strong>dolor</strong> sit amet '
109
188
  end
110
189
  end
111
190
  end
191
+
192
+ private
193
+
194
+ def nest_html_content(html_content, depth)
195
+ "#{'<span>' * depth}#{html_content}#{'</span>' * depth}"
196
+ end
112
197
  end