sanitize 4.6.3 → 5.1.0
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sanitize might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/HISTORY.md +101 -1
- data/README.md +24 -4
- data/lib/sanitize.rb +41 -63
- data/lib/sanitize/config/default.rb +10 -4
- data/lib/sanitize/transformers/clean_element.rb +44 -3
- data/lib/sanitize/version.rb +1 -1
- data/test/common.rb +0 -31
- data/test/test_clean_comment.rb +1 -5
- data/test/test_clean_css.rb +1 -1
- data/test/test_clean_doctype.rb +8 -8
- data/test/test_clean_element.rb +108 -23
- data/test/test_malicious_html.rb +30 -6
- data/test/test_parser.rb +2 -31
- data/test/test_sanitize.rb +102 -17
- data/test/test_sanitize_css.rb +39 -12
- data/test/test_transformers.rb +22 -4
- metadata +12 -14
- data/test/test_unicode.rb +0 -95
data/test/common.rb
CHANGED
@@ -1,34 +1,3 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
-
gem 'minitest'
|
3
2
|
require 'minitest/autorun'
|
4
|
-
|
5
3
|
require_relative '../lib/sanitize'
|
6
|
-
|
7
|
-
# Helper to stub an instance method. Shamelessly stolen from
|
8
|
-
# https://github.com/codeodor/minitest-stub_any_instance/
|
9
|
-
class Object
|
10
|
-
def self.stub_instance(name, value, &block)
|
11
|
-
old_method = "__stubbed_method_#{name}__"
|
12
|
-
|
13
|
-
class_eval do
|
14
|
-
alias_method old_method, name
|
15
|
-
|
16
|
-
define_method(name) do |*args|
|
17
|
-
if value.respond_to?(:call) then
|
18
|
-
value.call(*args)
|
19
|
-
else
|
20
|
-
value
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
yield
|
26
|
-
|
27
|
-
ensure
|
28
|
-
class_eval do
|
29
|
-
undef_method name
|
30
|
-
alias_method name, old_method
|
31
|
-
undef_method old_method
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end
|
data/test/test_clean_comment.rb
CHANGED
@@ -20,7 +20,7 @@ describe 'Sanitize::Transformers::CleanComment' do
|
|
20
20
|
|
21
21
|
# Special case: the comment markup is inside a <script>, which makes it
|
22
22
|
# text content and not an actual HTML comment.
|
23
|
-
@s.fragment("<script><!-- comment --></script>").must_equal '
|
23
|
+
@s.fragment("<script><!-- comment --></script>").must_equal ''
|
24
24
|
|
25
25
|
Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => false, :elements => ['script'])
|
26
26
|
.must_equal '<script><!-- comment --></script>'
|
@@ -40,10 +40,6 @@ describe 'Sanitize::Transformers::CleanComment' do
|
|
40
40
|
@s.fragment("foo <!-- <!-- <!-- --> --> -->bar").must_equal 'foo <!-- <!-- <!-- --> --> -->bar'
|
41
41
|
@s.fragment("foo <div <!-- comment -->>bar</div>").must_equal 'foo <div>>bar</div>'
|
42
42
|
|
43
|
-
# Special case: the comment markup is inside a <script>, which makes it
|
44
|
-
# text content and not an actual HTML comment.
|
45
|
-
@s.fragment("<script><!-- comment --></script>").must_equal '<!-- comment -->'
|
46
|
-
|
47
43
|
Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => true, :elements => ['script'])
|
48
44
|
.must_equal '<script><!-- comment --></script>'
|
49
45
|
end
|
data/test/test_clean_css.rb
CHANGED
@@ -13,7 +13,7 @@ describe 'Sanitize::Transformers::CSS::CleanAttribute' do
|
|
13
13
|
@s.fragment(%[
|
14
14
|
<div style="color: #fff; width: expression(alert(1)); /* <-- evil! */"></div>
|
15
15
|
].strip).must_equal %[
|
16
|
-
<div style="color: #fff; /*
|
16
|
+
<div style="color: #fff; /* <-- evil! */"></div>
|
17
17
|
].strip
|
18
18
|
end
|
19
19
|
|
data/test/test_clean_doctype.rb
CHANGED
@@ -11,7 +11,7 @@ describe 'Sanitize::Transformers::CleanDoctype' do
|
|
11
11
|
end
|
12
12
|
|
13
13
|
it 'should remove doctype declarations' do
|
14
|
-
@s.document('<!DOCTYPE html><html>foo</html>').must_equal "<html>foo</html
|
14
|
+
@s.document('<!DOCTYPE html><html>foo</html>').must_equal "<html>foo</html>"
|
15
15
|
@s.fragment('<!DOCTYPE html>foo').must_equal 'foo'
|
16
16
|
end
|
17
17
|
|
@@ -34,27 +34,27 @@ describe 'Sanitize::Transformers::CleanDoctype' do
|
|
34
34
|
|
35
35
|
it 'should allow doctype declarations in documents' do
|
36
36
|
@s.document('<!DOCTYPE html><html>foo</html>')
|
37
|
-
.must_equal "<!DOCTYPE html
|
37
|
+
.must_equal "<!DOCTYPE html><html>foo</html>"
|
38
38
|
|
39
39
|
@s.document('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
|
40
|
-
.must_equal "<!DOCTYPE html
|
40
|
+
.must_equal "<!DOCTYPE html><html>foo</html>"
|
41
41
|
|
42
42
|
@s.document("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>")
|
43
|
-
.must_equal "<!DOCTYPE html
|
43
|
+
.must_equal "<!DOCTYPE html><html>foo</html>"
|
44
44
|
end
|
45
45
|
|
46
46
|
it 'should not allow obviously invalid doctype declarations in documents' do
|
47
47
|
@s.document('<!DOCTYPE blah blah blah><html>foo</html>')
|
48
|
-
.must_equal "<!DOCTYPE html
|
48
|
+
.must_equal "<!DOCTYPE html><html>foo</html>"
|
49
49
|
|
50
50
|
@s.document('<!DOCTYPE blah><html>foo</html>')
|
51
|
-
.must_equal "<!DOCTYPE html
|
51
|
+
.must_equal "<!DOCTYPE html><html>foo</html>"
|
52
52
|
|
53
53
|
@s.document('<!DOCTYPE html BLAH "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
|
54
|
-
.must_equal "<!DOCTYPE html
|
54
|
+
.must_equal "<!DOCTYPE html><html>foo</html>"
|
55
55
|
|
56
56
|
@s.document('<!whatever><html>foo</html>')
|
57
|
-
.must_equal "<html>foo</html
|
57
|
+
.must_equal "<html>foo</html>"
|
58
58
|
end
|
59
59
|
|
60
60
|
it 'should not allow doctype definitions in fragments' do
|
data/test/test_clean_element.rb
CHANGED
@@ -8,25 +8,22 @@ describe 'Sanitize::Transformers::CleanElement' do
|
|
8
8
|
strings = {
|
9
9
|
:basic => {
|
10
10
|
:html => '<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo" style="text-decoration: underline;">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <style>.foo { color: #fff; }</style> <script>alert("hello world");</script>',
|
11
|
-
|
12
|
-
:
|
13
|
-
:
|
14
|
-
:
|
15
|
-
:relaxed => '<b>Lorem</b> <a href="pants" title="foo" style="text-decoration: underline;">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br>amet <style>.foo { color: #fff; }</style> alert("hello world");'
|
11
|
+
:default => 'Lorem ipsum dolor sit amet ',
|
12
|
+
:restricted => '<b>Lorem</b> ipsum <strong>dolor</strong> sit amet ',
|
13
|
+
:basic => '<b>Lorem</b> <a href="pants" rel="nofollow">ipsum</a> <a href="http://foo.com/" rel="nofollow"><strong>dolor</strong></a> sit<br>amet ',
|
14
|
+
:relaxed => '<b>Lorem</b> <a href="pants" title="foo" style="text-decoration: underline;">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br>amet <style>.foo { color: #fff; }</style> '
|
16
15
|
},
|
17
16
|
|
18
17
|
:malformed => {
|
19
18
|
:html => 'Lo<!-- comment -->rem</b> <a href=pants title="foo>ipsum <a href="http://foo.com/"><strong>dolor</a></strong> sit<br/>amet <script>alert("hello world");',
|
20
|
-
|
21
|
-
:
|
22
|
-
:
|
23
|
-
:
|
24
|
-
:relaxed => 'Lorem <a href="pants" title="foo>ipsum <a href="><strong>dolor</strong></a> sit<br>amet alert("hello world");',
|
19
|
+
:default => 'Lorem dolor sit amet ',
|
20
|
+
:restricted => 'Lorem <strong>dolor</strong> sit amet ',
|
21
|
+
:basic => 'Lorem <a href="pants" rel="nofollow"><strong>dolor</strong></a> sit<br>amet ',
|
22
|
+
:relaxed => 'Lorem <a href="pants" title="foo>ipsum <a href="><strong>dolor</strong></a> sit<br>amet ',
|
25
23
|
},
|
26
24
|
|
27
25
|
:unclosed => {
|
28
26
|
:html => '<p>a</p><blockquote>b',
|
29
|
-
|
30
27
|
:default => ' a b ',
|
31
28
|
:restricted => ' a b ',
|
32
29
|
:basic => '<p>a</p><blockquote>b</blockquote>',
|
@@ -35,7 +32,6 @@ describe 'Sanitize::Transformers::CleanElement' do
|
|
35
32
|
|
36
33
|
:malicious => {
|
37
34
|
:html => '<b>Lo<!-- comment -->rem</b> <a href="javascript:pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <<foo>script>alert("hello world");</script>',
|
38
|
-
|
39
35
|
:default => 'Lorem ipsum dolor sit amet <script>alert("hello world");',
|
40
36
|
:restricted => '<b>Lorem</b> ipsum <strong>dolor</strong> sit amet <script>alert("hello world");',
|
41
37
|
:basic => '<b>Lorem</b> <a rel="nofollow">ipsum</a> <a href="http://foo.com/" rel="nofollow"><strong>dolor</strong></a> sit<br>amet <script>alert("hello world");',
|
@@ -171,10 +167,10 @@ describe 'Sanitize::Transformers::CleanElement' do
|
|
171
167
|
.must_equal 'foo bar baz quux'
|
172
168
|
|
173
169
|
Sanitize.fragment('<script>alert("<xss>");</script>')
|
174
|
-
.must_equal '
|
170
|
+
.must_equal ''
|
175
171
|
|
176
172
|
Sanitize.fragment('<<script>script>alert("<xss>");</<script>>')
|
177
|
-
.must_equal '<
|
173
|
+
.must_equal '<'
|
178
174
|
|
179
175
|
Sanitize.fragment('< script <>> alert("<xss>");</script>')
|
180
176
|
.must_equal '< script <>> alert("");'
|
@@ -196,6 +192,46 @@ describe 'Sanitize::Transformers::CleanElement' do
|
|
196
192
|
.must_equal ''
|
197
193
|
end
|
198
194
|
|
195
|
+
it 'should escape the content of removed `plaintext` elements' do
|
196
|
+
Sanitize.fragment('<plaintext>hello! <script>alert(0)</script>')
|
197
|
+
.must_equal 'hello! <script>alert(0)</script>'
|
198
|
+
end
|
199
|
+
|
200
|
+
it 'should escape the content of removed `xmp` elements' do
|
201
|
+
Sanitize.fragment('<xmp>hello! <script>alert(0)</script></xmp>')
|
202
|
+
.must_equal 'hello! <script>alert(0)</script>'
|
203
|
+
end
|
204
|
+
|
205
|
+
it 'should not preserve the content of removed `iframe` elements' do
|
206
|
+
Sanitize.fragment('<iframe>hello! <script>alert(0)</script></iframe>')
|
207
|
+
.must_equal ''
|
208
|
+
end
|
209
|
+
|
210
|
+
it 'should not preserve the content of removed `noembed` elements' do
|
211
|
+
Sanitize.fragment('<noembed>hello! <script>alert(0)</script></noembed>')
|
212
|
+
.must_equal ''
|
213
|
+
end
|
214
|
+
|
215
|
+
it 'should not preserve the content of removed `noframes` elements' do
|
216
|
+
Sanitize.fragment('<noframes>hello! <script>alert(0)</script></noframes>')
|
217
|
+
.must_equal ''
|
218
|
+
end
|
219
|
+
|
220
|
+
it 'should not preserve the content of removed `noscript` elements' do
|
221
|
+
Sanitize.fragment('<noscript>hello! <script>alert(0)</script></noscript>')
|
222
|
+
.must_equal ''
|
223
|
+
end
|
224
|
+
|
225
|
+
it 'should not preserve the content of removed `script` elements' do
|
226
|
+
Sanitize.fragment('<script>hello! <script>alert(0)</script></script>')
|
227
|
+
.must_equal ''
|
228
|
+
end
|
229
|
+
|
230
|
+
it 'should not preserve the content of removed `style` elements' do
|
231
|
+
Sanitize.fragment('<style>hello! <script>alert(0)</script></style>')
|
232
|
+
.must_equal ''
|
233
|
+
end
|
234
|
+
|
199
235
|
strings.each do |name, data|
|
200
236
|
it "should clean #{name} HTML" do
|
201
237
|
Sanitize.fragment(data[:html]).must_equal(data[:default])
|
@@ -234,7 +270,7 @@ describe 'Sanitize::Transformers::CleanElement' do
|
|
234
270
|
|
235
271
|
it 'should not choke on valueless attributes' do
|
236
272
|
@s.fragment('foo <a href>foo</a> bar')
|
237
|
-
.must_equal 'foo <a href rel="nofollow">foo</a> bar'
|
273
|
+
.must_equal 'foo <a href="" rel="nofollow">foo</a> bar'
|
238
274
|
end
|
239
275
|
|
240
276
|
it 'should downcase attribute names' do
|
@@ -262,7 +298,7 @@ describe 'Sanitize::Transformers::CleanElement' do
|
|
262
298
|
|
263
299
|
it 'should encode special chars in attribute values' do
|
264
300
|
@s.fragment('<a href="http://example.com" title="<b>éxamples</b> & things">foo</a>')
|
265
|
-
.must_equal '<a href="http://example.com" title="
|
301
|
+
.must_equal '<a href="http://example.com" title="<b>éxamples</b> & things">foo</a>'
|
266
302
|
end
|
267
303
|
|
268
304
|
strings.each do |name, data|
|
@@ -344,16 +380,30 @@ describe 'Sanitize::Transformers::CleanElement' do
|
|
344
380
|
).must_equal 'foo bar '
|
345
381
|
end
|
346
382
|
|
347
|
-
it 'should remove the contents of specified nodes when :remove_contents is an Array of element names as strings' do
|
348
|
-
Sanitize.fragment('foo bar <div>baz<span>quux</span><script>alert("hello!");</script></div>',
|
383
|
+
it 'should remove the contents of specified nodes when :remove_contents is an Array or Set of element names as strings' do
|
384
|
+
Sanitize.fragment('foo bar <div>baz<span>quux</span> <b>hi</b><script>alert("hello!");</script></div>',
|
349
385
|
:remove_contents => ['script', 'span']
|
350
|
-
).must_equal 'foo bar baz '
|
386
|
+
).must_equal 'foo bar baz hi '
|
387
|
+
|
388
|
+
Sanitize.fragment('foo bar <div>baz<span>quux</span> <b>hi</b><script>alert("hello!");</script></div>',
|
389
|
+
:remove_contents => Set.new(['script', 'span'])
|
390
|
+
).must_equal 'foo bar baz hi '
|
351
391
|
end
|
352
392
|
|
353
|
-
it 'should remove the contents of specified nodes when :remove_contents is an Array of element names as symbols' do
|
354
|
-
Sanitize.fragment('foo bar <div>baz<span>quux</span><script>alert("hello!");</script></div>',
|
393
|
+
it 'should remove the contents of specified nodes when :remove_contents is an Array or Set of element names as symbols' do
|
394
|
+
Sanitize.fragment('foo bar <div>baz<span>quux</span> <b>hi</b><script>alert("hello!");</script></div>',
|
355
395
|
:remove_contents => [:script, :span]
|
356
|
-
).must_equal 'foo bar baz '
|
396
|
+
).must_equal 'foo bar baz hi '
|
397
|
+
|
398
|
+
Sanitize.fragment('foo bar <div>baz<span>quux</span> <b>hi</b><script>alert("hello!");</script></div>',
|
399
|
+
:remove_contents => Set.new([:script, :span])
|
400
|
+
).must_equal 'foo bar baz hi '
|
401
|
+
end
|
402
|
+
|
403
|
+
it 'should remove the contents of whitelisted iframes' do
|
404
|
+
Sanitize.fragment('<iframe>hi <script>hello</script></iframe>',
|
405
|
+
:elements => ['iframe']
|
406
|
+
).must_equal '<iframe></iframe>'
|
357
407
|
end
|
358
408
|
|
359
409
|
it 'should not allow arbitrary HTML5 data attributes by default' do
|
@@ -413,7 +463,7 @@ describe 'Sanitize::Transformers::CleanElement' do
|
|
413
463
|
s.fragment('foo<br>bar<br>baz').must_equal "foo\nbar\nbaz"
|
414
464
|
end
|
415
465
|
|
416
|
-
it '
|
466
|
+
it 'should handle protocols correctly regardless of case' do
|
417
467
|
input = '<a href="hTTpS://foo.com/">Text</a>'
|
418
468
|
|
419
469
|
Sanitize.fragment(input, {
|
@@ -430,5 +480,40 @@ describe 'Sanitize::Transformers::CleanElement' do
|
|
430
480
|
:protocols => {'a' => {'href' => ['https']}}
|
431
481
|
}).must_equal "<a>Text</a>"
|
432
482
|
end
|
483
|
+
|
484
|
+
it 'should prevent `<meta>` tags from being used to set a non-UTF-8 charset' do
|
485
|
+
Sanitize.document('<html><head><meta charset="utf-8"></head><body>Howdy!</body></html>',
|
486
|
+
:elements => %w[html head meta body],
|
487
|
+
:attributes => {'meta' => ['charset']}
|
488
|
+
).must_equal "<html><head><meta charset=\"utf-8\"></head><body>Howdy!</body></html>"
|
489
|
+
|
490
|
+
Sanitize.document('<html><meta charset="utf-8">Howdy!</html>',
|
491
|
+
:elements => %w[html meta],
|
492
|
+
:attributes => {'meta' => ['charset']}
|
493
|
+
).must_equal "<html><meta charset=\"utf-8\">Howdy!</html>"
|
494
|
+
|
495
|
+
Sanitize.document('<html><meta charset="us-ascii">Howdy!</html>',
|
496
|
+
:elements => %w[html meta],
|
497
|
+
:attributes => {'meta' => ['charset']}
|
498
|
+
).must_equal "<html><meta charset=\"utf-8\">Howdy!</html>"
|
499
|
+
|
500
|
+
Sanitize.document('<html><meta http-equiv="content-type" content=" text/html; charset=us-ascii">Howdy!</html>',
|
501
|
+
:elements => %w[html meta],
|
502
|
+
:attributes => {'meta' => %w[content http-equiv]}
|
503
|
+
).must_equal "<html><meta http-equiv=\"content-type\" content=\" text/html;charset=utf-8\">Howdy!</html>"
|
504
|
+
|
505
|
+
Sanitize.document('<html><meta http-equiv="Content-Type" content="text/plain;charset = us-ascii">Howdy!</html>',
|
506
|
+
:elements => %w[html meta],
|
507
|
+
:attributes => {'meta' => %w[content http-equiv]}
|
508
|
+
).must_equal "<html><meta http-equiv=\"Content-Type\" content=\"text/plain;charset=utf-8\">Howdy!</html>"
|
509
|
+
end
|
510
|
+
|
511
|
+
it 'should not modify `<meta>` tags that already set a UTF-8 charset' do
|
512
|
+
Sanitize.document('<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8"></head><body>Howdy!</body></html>',
|
513
|
+
:elements => %w[html head meta body],
|
514
|
+
:attributes => {'meta' => %w[content http-equiv]}
|
515
|
+
).must_equal "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"></head><body>Howdy!</body></html>"
|
516
|
+
end
|
517
|
+
|
433
518
|
end
|
434
519
|
end
|
data/test/test_malicious_html.rb
CHANGED
@@ -43,7 +43,7 @@ describe 'Malicious HTML' do
|
|
43
43
|
describe '<body>' do
|
44
44
|
it 'should not be possible to inject JS via a malformed event attribute' do
|
45
45
|
@s.document('<html><head></head><body onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert("XSS")></body></html>').
|
46
|
-
must_equal "<html><head></head><body></body></html
|
46
|
+
must_equal "<html><head></head><body></body></html>"
|
47
47
|
end
|
48
48
|
end
|
49
49
|
|
@@ -65,7 +65,7 @@ describe 'Malicious HTML' do
|
|
65
65
|
|
66
66
|
it 'should not be possible to inject <script> via a malformed <img> tag' do
|
67
67
|
@s.fragment('<img """><script>alert("XSS")</script>">').
|
68
|
-
must_equal '<img>
|
68
|
+
must_equal '<img>">'
|
69
69
|
end
|
70
70
|
|
71
71
|
it 'should not be possible to inject protocol-based JS' do
|
@@ -117,12 +117,12 @@ describe 'Malicious HTML' do
|
|
117
117
|
describe '<script>' do
|
118
118
|
it 'should not be possible to inject <script> using a malformed non-alphanumeric tag name' do
|
119
119
|
@s.fragment(%[<script/xss src="http://ha.ckers.org/xss.js">alert(1)</script>]).
|
120
|
-
must_equal '
|
120
|
+
must_equal ''
|
121
121
|
end
|
122
122
|
|
123
123
|
it 'should not be possible to inject <script> via extraneous open brackets' do
|
124
124
|
@s.fragment(%[<<script>alert("XSS");//<</script>]).
|
125
|
-
must_equal '<
|
125
|
+
must_equal '<'
|
126
126
|
end
|
127
127
|
end
|
128
128
|
|
@@ -166,7 +166,19 @@ describe 'Malicious HTML' do
|
|
166
166
|
input = %[<#{tag_name} #{attr_name}='examp<!--" onmouseover=alert(1)>-->le.com'>foo</#{tag_name}>]
|
167
167
|
|
168
168
|
it 'should escape unsafe characters in attributes' do
|
169
|
-
|
169
|
+
# This uses Nokogumbo's HTML-compliant serializer rather than
|
170
|
+
# libxml2's.
|
171
|
+
@s.fragment(input).
|
172
|
+
must_equal(%[<#{tag_name} #{attr_name}="examp<!--%22%20onmouseover=alert(1)>-->le.com">foo</#{tag_name}>])
|
173
|
+
|
174
|
+
# This uses the not-quite-standards-compliant libxml2 serializer via
|
175
|
+
# Nokogiri, so the output may be a little different as of Nokogiri
|
176
|
+
# 1.10.2 when using Nokogiri's vendored libxml2 due to this patch:
|
177
|
+
# https://github.com/sparklemotion/nokogiri/commit/4852e43cb6039e26d8c51af78621e539cbf46c5d
|
178
|
+
fragment = Nokogiri::HTML.fragment(input)
|
179
|
+
@s.node!(fragment)
|
180
|
+
fragment.to_html.
|
181
|
+
must_equal(%[<#{tag_name} #{attr_name}="examp<!--%22%20onmouseover=alert(1)>-->le.com">foo</#{tag_name}>])
|
170
182
|
end
|
171
183
|
|
172
184
|
it 'should round-trip to the same output' do
|
@@ -179,7 +191,19 @@ describe 'Malicious HTML' do
|
|
179
191
|
input = %[<#{tag_name} #{attr_name}='examp<!--" onmouseover=alert(1)>-->le.com'>foo</#{tag_name}>]
|
180
192
|
|
181
193
|
it 'should not escape characters unnecessarily' do
|
182
|
-
|
194
|
+
# This uses Nokogumbo's HTML-compliant serializer rather than
|
195
|
+
# libxml2's.
|
196
|
+
@s.fragment(input).
|
197
|
+
must_equal(%[<#{tag_name} #{attr_name}="examp<!--" onmouseover=alert(1)>-->le.com">foo</#{tag_name}>])
|
198
|
+
|
199
|
+
# This uses the not-quite-standards-compliant libxml2 serializer via
|
200
|
+
# Nokogiri, so the output may be a little different as of Nokogiri
|
201
|
+
# 1.10.2 when using Nokogiri's vendored libxml2 due to this patch:
|
202
|
+
# https://github.com/sparklemotion/nokogiri/commit/4852e43cb6039e26d8c51af78621e539cbf46c5d
|
203
|
+
fragment = Nokogiri::HTML.fragment(input)
|
204
|
+
@s.node!(fragment)
|
205
|
+
fragment.to_html.
|
206
|
+
must_equal(%[<#{tag_name} #{attr_name}='examp<!--" onmouseover=alert(1)>-->le.com'>foo</#{tag_name}>])
|
183
207
|
end
|
184
208
|
|
185
209
|
it 'should round-trip to the same output' do
|
data/test/test_parser.rb
CHANGED
@@ -19,8 +19,8 @@ describe 'Parser' do
|
|
19
19
|
end
|
20
20
|
|
21
21
|
it 'should not have the Nokogiri 1.4.2+ unterminated script/style element bug' do
|
22
|
-
Sanitize.fragment('foo <script>bar').must_equal 'foo
|
23
|
-
Sanitize.fragment('foo <style>bar').must_equal 'foo
|
22
|
+
Sanitize.fragment('foo <script>bar').must_equal 'foo '
|
23
|
+
Sanitize.fragment('foo <style>bar').must_equal 'foo '
|
24
24
|
end
|
25
25
|
|
26
26
|
it 'ambiguous non-tag brackets like "1 > 2 and 2 < 1" should be parsed correctly' do
|
@@ -28,35 +28,6 @@ describe 'Parser' do
|
|
28
28
|
Sanitize.fragment('OMG HAPPY BIRTHDAY! *<:-D').must_equal 'OMG HAPPY BIRTHDAY! *<:-D'
|
29
29
|
end
|
30
30
|
|
31
|
-
# https://github.com/sparklemotion/nokogiri/issues/1008
|
32
|
-
it 'should work around the libxml2 content-type meta tag bug' do
|
33
|
-
Sanitize.document('<html><head></head><body>Howdy!</body></html>',
|
34
|
-
:elements => %w[html head body]
|
35
|
-
).must_equal "<html><head></head><body>Howdy!</body></html>\n"
|
36
|
-
|
37
|
-
Sanitize.document('<html><head></head><body>Howdy!</body></html>',
|
38
|
-
:elements => %w[html head meta body]
|
39
|
-
).must_equal "<html><head></head><body>Howdy!</body></html>\n"
|
40
|
-
|
41
|
-
Sanitize.document('<html><head><meta charset="utf-8"></head><body>Howdy!</body></html>',
|
42
|
-
:elements => %w[html head meta body],
|
43
|
-
:attributes => {'meta' => ['charset']}
|
44
|
-
).must_equal "<html><head><meta charset=\"utf-8\"></head><body>Howdy!</body></html>\n"
|
45
|
-
|
46
|
-
Sanitize.document('<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8"></head><body>Howdy!</body></html>',
|
47
|
-
:elements => %w[html head meta body],
|
48
|
-
:attributes => {'meta' => %w[charset content http-equiv]}
|
49
|
-
).must_equal "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"></head><body>Howdy!</body></html>\n"
|
50
|
-
|
51
|
-
# Edge case: an existing content-type meta tag with a non-UTF-8 content type
|
52
|
-
# will be converted to UTF-8, since that's the only output encoding we
|
53
|
-
# support.
|
54
|
-
Sanitize.document('<html><head><meta http-equiv="content-type" content="text/html;charset=us-ascii"></head><body>Howdy!</body></html>',
|
55
|
-
:elements => %w[html head meta body],
|
56
|
-
:attributes => {'meta' => %w[charset content http-equiv]}
|
57
|
-
).must_equal "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\"></head><body>Howdy!</body></html>\n"
|
58
|
-
end
|
59
|
-
|
60
31
|
describe 'when siblings are added after a node during traversal' do
|
61
32
|
it 'the added siblings should be traversed' do
|
62
33
|
html = %[
|
data/test/test_sanitize.rb
CHANGED
@@ -25,7 +25,7 @@ describe 'Sanitize' do
|
|
25
25
|
|
26
26
|
it 'should sanitize an HTML document' do
|
27
27
|
@s.document('<!doctype html><html><b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script></html>')
|
28
|
-
.must_equal "<html>Lorem ipsum dolor sit amet
|
28
|
+
.must_equal "<html>Lorem ipsum dolor sit amet </html>"
|
29
29
|
end
|
30
30
|
|
31
31
|
it 'should not modify the input string' do
|
@@ -35,14 +35,52 @@ describe 'Sanitize' do
|
|
35
35
|
end
|
36
36
|
|
37
37
|
it 'should not choke on frozen documents' do
|
38
|
-
@s.document('<!doctype html><html><b>foo</b>'.freeze).must_equal "<html>foo</html
|
38
|
+
@s.document('<!doctype html><html><b>foo</b>'.freeze).must_equal "<html>foo</html>"
|
39
|
+
end
|
40
|
+
|
41
|
+
it 'should normalize newlines' do
|
42
|
+
@s.document("a\r\n\n\r\r\r\nz").must_equal "<html>a\n\n\n\n\nz</html>"
|
43
|
+
end
|
44
|
+
|
45
|
+
it 'should strip control characters (except ASCII whitespace)' do
|
46
|
+
sample_control_chars = "\u0001\u0008\u000b\u000e\u001f\u007f\u009f"
|
47
|
+
whitespace = "\t\n\f\u0020"
|
48
|
+
@s.document("a#{sample_control_chars}#{whitespace}z").must_equal "<html>a#{whitespace}z</html>"
|
49
|
+
end
|
50
|
+
|
51
|
+
it 'should strip non-characters' do
|
52
|
+
sample_non_chars = "\ufdd0\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}"
|
53
|
+
@s.document("a#{sample_non_chars}z").must_equal "<html>az</html>"
|
54
|
+
end
|
55
|
+
|
56
|
+
describe 'when html body exceeds Nokogumbo::DEFAULT_MAX_TREE_DEPTH' do
|
57
|
+
let(:content) do
|
58
|
+
content = nest_html_content('<b>foo</b>', Nokogumbo::DEFAULT_MAX_TREE_DEPTH)
|
59
|
+
"<html>#{content}</html>"
|
60
|
+
end
|
61
|
+
|
62
|
+
it 'raises an ArgumentError exception' do
|
63
|
+
assert_raises ArgumentError do
|
64
|
+
@s.document(content)
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
describe 'and :max_tree_depth of -1 is supplied in :parser_options' do
|
69
|
+
before do
|
70
|
+
@s = Sanitize.new(elements: ['html'], parser_options: { max_tree_depth: -1 })
|
71
|
+
end
|
72
|
+
|
73
|
+
it 'does not raise an ArgumentError exception' do
|
74
|
+
@s.document(content).must_equal '<html>foo</html>'
|
75
|
+
end
|
76
|
+
end
|
39
77
|
end
|
40
78
|
end
|
41
79
|
|
42
80
|
describe '#fragment' do
|
43
81
|
it 'should sanitize an HTML fragment' do
|
44
82
|
@s.fragment('<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>')
|
45
|
-
.must_equal 'Lorem ipsum dolor sit amet
|
83
|
+
.must_equal 'Lorem ipsum dolor sit amet '
|
46
84
|
end
|
47
85
|
|
48
86
|
it 'should not modify the input string' do
|
@@ -61,6 +99,44 @@ describe 'Sanitize' do
|
|
61
99
|
it 'should not choke on frozen fragments' do
|
62
100
|
@s.fragment('<b>foo</b>'.freeze).must_equal 'foo'
|
63
101
|
end
|
102
|
+
|
103
|
+
it 'should normalize newlines' do
|
104
|
+
@s.fragment("a\r\n\n\r\r\r\nz").must_equal "a\n\n\n\n\nz"
|
105
|
+
end
|
106
|
+
|
107
|
+
it 'should strip control characters (except ASCII whitespace)' do
|
108
|
+
sample_control_chars = "\u0001\u0008\u000b\u000e\u001f\u007f\u009f"
|
109
|
+
whitespace = "\t\n\f\u0020"
|
110
|
+
@s.fragment("a#{sample_control_chars}#{whitespace}z").must_equal "a#{whitespace}z"
|
111
|
+
end
|
112
|
+
|
113
|
+
it 'should strip non-characters' do
|
114
|
+
sample_non_chars = "\ufdd0\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}"
|
115
|
+
@s.fragment("a#{sample_non_chars}z").must_equal "az"
|
116
|
+
end
|
117
|
+
|
118
|
+
describe 'when html body exceeds Nokogumbo::DEFAULT_MAX_TREE_DEPTH' do
|
119
|
+
let(:content) do
|
120
|
+
content = nest_html_content('<b>foo</b>', Nokogumbo::DEFAULT_MAX_TREE_DEPTH)
|
121
|
+
"<body>#{content}</body>"
|
122
|
+
end
|
123
|
+
|
124
|
+
it 'raises an ArgumentError exception' do
|
125
|
+
assert_raises ArgumentError do
|
126
|
+
@s.fragment(content)
|
127
|
+
end
|
128
|
+
end
|
129
|
+
|
130
|
+
describe 'and :max_tree_depth of -1 is supplied in :parser_options' do
|
131
|
+
before do
|
132
|
+
@s = Sanitize.new(parser_options: { max_tree_depth: -1 })
|
133
|
+
end
|
134
|
+
|
135
|
+
it 'does not raise an ArgumentError exception' do
|
136
|
+
@s.fragment(content).must_equal 'foo'
|
137
|
+
end
|
138
|
+
end
|
139
|
+
end
|
64
140
|
end
|
65
141
|
|
66
142
|
describe '#node!' do
|
@@ -71,7 +147,7 @@ describe 'Sanitize' do
|
|
71
147
|
doc.xpath('/html/body/node()').each {|node| frag << node }
|
72
148
|
|
73
149
|
@s.node!(frag)
|
74
|
-
frag.to_html.must_equal 'Lorem ipsum dolor sit amet
|
150
|
+
frag.to_html.must_equal 'Lorem ipsum dolor sit amet '
|
75
151
|
end
|
76
152
|
|
77
153
|
describe "when the given node is a document and <html> isn't whitelisted" do
|
@@ -85,28 +161,37 @@ describe 'Sanitize' do
|
|
85
161
|
|
86
162
|
describe 'class methods' do
|
87
163
|
describe '.document' do
|
88
|
-
it 'should
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
end
|
164
|
+
it 'should sanitize an HTML document with the given config' do
|
165
|
+
html = '<!doctype html><html><b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script></html>'
|
166
|
+
Sanitize.document(html, :elements => ['html'])
|
167
|
+
.must_equal "<html>Lorem ipsum dolor sit amet </html>"
|
93
168
|
end
|
94
169
|
end
|
95
170
|
|
96
171
|
describe '.fragment' do
|
97
|
-
it 'should
|
98
|
-
|
99
|
-
|
100
|
-
|
172
|
+
it 'should sanitize an HTML fragment with the given config' do
|
173
|
+
html = '<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>'
|
174
|
+
Sanitize.fragment(html, :elements => ['strong'])
|
175
|
+
.must_equal 'Lorem ipsum <strong>dolor</strong> sit amet '
|
101
176
|
end
|
102
177
|
end
|
103
178
|
|
104
179
|
describe '.node!' do
|
105
|
-
it 'should
|
106
|
-
|
107
|
-
|
108
|
-
|
180
|
+
it 'should sanitize a Nokogiri::XML::Node with the given config' do
|
181
|
+
doc = Nokogiri::HTML5.parse('<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>')
|
182
|
+
frag = doc.fragment
|
183
|
+
|
184
|
+
doc.xpath('/html/body/node()').each {|node| frag << node }
|
185
|
+
|
186
|
+
Sanitize.node!(frag, :elements => ['strong'])
|
187
|
+
frag.to_html.must_equal 'Lorem ipsum <strong>dolor</strong> sit amet '
|
109
188
|
end
|
110
189
|
end
|
111
190
|
end
|
191
|
+
|
192
|
+
private
|
193
|
+
|
194
|
+
def nest_html_content(html_content, depth)
|
195
|
+
"#{'<span>' * depth}#{html_content}#{'</span>' * depth}"
|
196
|
+
end
|
112
197
|
end
|