sanitize 2.1.1 → 6.0.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of sanitize might be problematic. Click here for more details.

@@ -1,721 +1,197 @@
1
1
  # encoding: utf-8
2
- #--
3
- # Copyright (c) 2013 Ryan Grove <ryan@wonko.com>
4
- #
5
- # Permission is hereby granted, free of charge, to any person obtaining a copy
6
- # of this software and associated documentation files (the 'Software'), to deal
7
- # in the Software without restriction, including without limitation the rights
8
- # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- # copies of the Software, and to permit persons to whom the Software is
10
- # furnished to do so, subject to the following conditions:
11
- #
12
- # The above copyright notice and this permission notice shall be included in all
13
- # copies or substantial portions of the Software.
14
- #
15
- # THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- # SOFTWARE.
22
- #++
23
-
24
- require 'rubygems'
25
- gem 'minitest'
26
-
27
- require 'minitest/autorun'
28
- require 'sanitize'
29
-
30
- strings = {
31
- :basic => {
32
- :html => '<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>',
33
- :default => 'Lorem ipsum dolor sit amet alert("hello world");',
34
- :restricted => '<b>Lorem</b> ipsum <strong>dolor</strong> sit amet alert("hello world");',
35
- :basic => '<b>Lorem</b> <a href="pants" rel="nofollow">ipsum</a> <a href="http://foo.com/" rel="nofollow"><strong>dolor</strong></a> sit<br>amet alert("hello world");',
36
- :relaxed => '<b>Lorem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br>amet alert("hello world");'
37
- },
38
-
39
- :malformed => {
40
- :html => 'Lo<!-- comment -->rem</b> <a href=pants title="foo>ipsum <a href="http://foo.com/"><strong>dolor</a></strong> sit<br/>amet <script>alert("hello world");',
41
- :default => 'Lorem dolor sit amet alert("hello world");',
42
- :restricted => 'Lorem <strong>dolor</strong> sit amet alert("hello world");',
43
- :basic => 'Lorem <a href="pants" rel="nofollow"><strong>dolor</strong></a> sit<br>amet alert("hello world");',
44
- :relaxed => 'Lorem <a href="pants" title="foo&gt;ipsum &lt;a href="><strong>dolor</strong></a> sit<br>amet alert("hello world");',
45
- :document => ' Lorem dolor sit amet alert("hello world"); '
46
- },
47
-
48
- :unclosed => {
49
- :html => '<p>a</p><blockquote>b',
50
- :default => ' a b ',
51
- :restricted => ' a b ',
52
- :basic => '<p>a</p><blockquote>b</blockquote>',
53
- :relaxed => '<p>a</p><blockquote>b</blockquote>'
54
- },
55
-
56
- :malicious => {
57
- :html => '<b>Lo<!-- comment -->rem</b> <a href="javascript:pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <<foo>script>alert("hello world");</script>',
58
- :default => 'Lorem ipsum dolor sit amet &lt;script&gt;alert("hello world");',
59
- :restricted => '<b>Lorem</b> ipsum <strong>dolor</strong> sit amet &lt;script&gt;alert("hello world");',
60
- :basic => '<b>Lorem</b> <a rel="nofollow">ipsum</a> <a href="http://foo.com/" rel="nofollow"><strong>dolor</strong></a> sit<br>amet &lt;script&gt;alert("hello world");',
61
- :relaxed => '<b>Lorem</b> <a title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br>amet &lt;script&gt;alert("hello world");'
62
- },
63
-
64
- :raw_comment => {
65
- :html => '<!-- comment -->Hello',
66
- :default => 'Hello',
67
- :restricted => 'Hello',
68
- :basic => 'Hello',
69
- :relaxed => 'Hello',
70
- :document => ' Hello ',
71
- }
72
- }
73
-
74
- tricky = {
75
- 'protocol-based JS injection: simple, no spaces' => {
76
- :html => '<a href="javascript:alert(\'XSS\');">foo</a>',
77
- :default => 'foo',
78
- :restricted => 'foo',
79
- :basic => '<a rel="nofollow">foo</a>',
80
- :relaxed => '<a>foo</a>'
81
- },
82
-
83
- 'protocol-based JS injection: simple, spaces before' => {
84
- :html => '<a href="javascript :alert(\'XSS\');">foo</a>',
85
- :default => 'foo',
86
- :restricted => 'foo',
87
- :basic => '<a rel="nofollow">foo</a>',
88
- :relaxed => '<a>foo</a>'
89
- },
90
-
91
- 'protocol-based JS injection: simple, spaces after' => {
92
- :html => '<a href="javascript: alert(\'XSS\');">foo</a>',
93
- :default => 'foo',
94
- :restricted => 'foo',
95
- :basic => '<a rel="nofollow">foo</a>',
96
- :relaxed => '<a>foo</a>'
97
- },
98
-
99
- 'protocol-based JS injection: simple, spaces before and after' => {
100
- :html => '<a href="javascript : alert(\'XSS\');">foo</a>',
101
- :default => 'foo',
102
- :restricted => 'foo',
103
- :basic => '<a rel="nofollow">foo</a>',
104
- :relaxed => '<a>foo</a>'
105
- },
106
-
107
- 'protocol-based JS injection: preceding colon' => {
108
- :html => '<a href=":javascript:alert(\'XSS\');">foo</a>',
109
- :default => 'foo',
110
- :restricted => 'foo',
111
- :basic => '<a rel="nofollow">foo</a>',
112
- :relaxed => '<a>foo</a>'
113
- },
114
-
115
- 'protocol-based JS injection: UTF-8 encoding' => {
116
- :html => '<a href="javascript&#58;">foo</a>',
117
- :default => 'foo',
118
- :restricted => 'foo',
119
- :basic => '<a rel="nofollow">foo</a>',
120
- :relaxed => '<a>foo</a>'
121
- },
122
-
123
- 'protocol-based JS injection: long UTF-8 encoding' => {
124
- :html => '<a href="javascript&#0058;">foo</a>',
125
- :default => 'foo',
126
- :restricted => 'foo',
127
- :basic => '<a rel="nofollow">foo</a>',
128
- :relaxed => '<a>foo</a>'
129
- },
130
-
131
- 'protocol-based JS injection: long UTF-8 encoding without semicolons' => {
132
- :html => '<a href=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>foo</a>',
133
- :default => 'foo',
134
- :restricted => 'foo',
135
- :basic => '<a rel="nofollow">foo</a>',
136
- :relaxed => '<a>foo</a>'
137
- },
138
-
139
- 'protocol-based JS injection: hex encoding' => {
140
- :html => '<a href="javascript&#x3A;">foo</a>',
141
- :default => 'foo',
142
- :restricted => 'foo',
143
- :basic => '<a rel="nofollow">foo</a>',
144
- :relaxed => '<a>foo</a>'
145
- },
146
-
147
- 'protocol-based JS injection: long hex encoding' => {
148
- :html => '<a href="javascript&#x003A;">foo</a>',
149
- :default => 'foo',
150
- :restricted => 'foo',
151
- :basic => '<a rel="nofollow">foo</a>',
152
- :relaxed => '<a>foo</a>'
153
- },
154
-
155
- 'protocol-based JS injection: hex encoding without semicolons' => {
156
- :html => '<a href=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>foo</a>',
157
- :default => 'foo',
158
- :restricted => 'foo',
159
- :basic => '<a rel="nofollow">foo</a>',
160
- :relaxed => '<a>foo</a>'
161
- },
162
-
163
- 'protocol-based JS injection: null char' => {
164
- :html => "<img src=java\0script:alert(\"XSS\")>",
165
- :default => '',
166
- :restricted => '',
167
- :basic => '',
168
- :relaxed => '<img src="java">' # everything following the null char gets stripped, and URL is considered relative
169
- },
170
-
171
- 'protocol-based JS injection: invalid URL char' => {
172
- :html => '<img src=java\script:alert("XSS")>',
173
- :default => '',
174
- :restricted => '',
175
- :basic => '',
176
- :relaxed => '<img>'
177
- },
178
-
179
- 'protocol-based JS injection: spaces and entities' => {
180
- :html => '<img src=" &#14; javascript:alert(\'XSS\');">',
181
- :default => '',
182
- :restricted => '',
183
- :basic => '',
184
- :relaxed => '<img src>'
185
- }
186
- }
187
-
188
- describe 'Config::DEFAULT' do
189
- it 'should translate valid HTML entities' do
190
- Sanitize.clean("Don&apos;t tas&eacute; me &amp; bro!").must_equal("Don't tasé me &amp; bro!")
191
- end
192
-
193
- it 'should translate valid HTML entities while encoding unencoded ampersands' do
194
- Sanitize.clean("cookies&sup2; & &frac14; cr&eacute;me").must_equal("cookies² &amp; ¼ créme")
195
- end
196
-
197
- it 'should never output &apos;' do
198
- Sanitize.clean("<a href='&apos;' class=\"' &#39;\">IE6 isn't a real browser</a>").wont_match(/&apos;/)
199
- end
200
-
201
- it 'should not choke on several instances of the same element in a row' do
202
- Sanitize.clean('<img src="http://www.google.com/intl/en_ALL/images/logo.gif"><img src="http://www.google.com/intl/en_ALL/images/logo.gif"><img src="http://www.google.com/intl/en_ALL/images/logo.gif"><img src="http://www.google.com/intl/en_ALL/images/logo.gif">').must_equal('')
203
- end
204
-
205
- it 'should surround the contents of :whitespace_elements with space characters when removing the element' do
206
- Sanitize.clean('foo<div>bar</div>baz').must_equal('foo bar baz')
207
- Sanitize.clean('foo<br>bar<br>baz').must_equal('foo bar baz')
208
- Sanitize.clean('foo<hr>bar<hr>baz').must_equal('foo bar baz')
209
- end
210
-
211
- strings.each do |name, data|
212
- it "should clean #{name} HTML" do
213
- Sanitize.clean(data[:html]).must_equal(data[:default])
214
- end
215
- end
2
+ require_relative 'common'
216
3
 
217
- tricky.each do |name, data|
218
- it "should not allow #{name}" do
219
- Sanitize.clean(data[:html]).must_equal(data[:default])
220
- end
221
- end
222
- end
223
-
224
- describe 'Config::RESTRICTED' do
225
- before { @s = Sanitize.new(Sanitize::Config::RESTRICTED) }
226
-
227
- strings.each do |name, data|
228
- it "should clean #{name} HTML" do
229
- @s.clean(data[:html]).must_equal(data[:restricted])
230
- end
231
- end
4
+ describe 'Sanitize' do
5
+ describe 'initializer' do
6
+ it 'should not modify a transformers array in the given config' do
7
+ transformers = [
8
+ lambda {}
9
+ ]
232
10
 
233
- tricky.each do |name, data|
234
- it "should not allow #{name}" do
235
- @s.clean(data[:html]).must_equal(data[:restricted])
11
+ Sanitize.new({ :transformers => transformers })
12
+ transformers.length.must_equal(1)
236
13
  end
237
14
  end
238
- end
239
-
240
- describe 'Config::BASIC' do
241
- before { @s = Sanitize.new(Sanitize::Config::BASIC) }
242
-
243
- it 'should not choke on valueless attributes' do
244
- @s.clean('foo <a href>foo</a> bar').must_equal('foo <a href rel="nofollow">foo</a> bar')
245
- end
246
15
 
247
- it 'should downcase attribute names' do
248
- @s.clean('<a HREF="javascript:alert(\'foo\')">bar</a>').must_equal('<a rel="nofollow">bar</a>')
249
- end
250
-
251
- strings.each do |name, data|
252
- it "should clean #{name} HTML" do
253
- @s.clean(data[:html]).must_equal(data[:basic])
254
- end
255
- end
256
-
257
- tricky.each do |name, data|
258
- it "should not allow #{name}" do
259
- @s.clean(data[:html]).must_equal(data[:basic])
16
+ describe 'instance methods' do
17
+ before do
18
+ @s = Sanitize.new
260
19
  end
261
- end
262
- end
263
20
 
264
- describe 'Config::RELAXED' do
265
- before { @s = Sanitize.new(Sanitize::Config::RELAXED) }
21
+ describe '#document' do
22
+ before do
23
+ @s = Sanitize.new(:elements => ['html'])
24
+ end
266
25
 
267
- it 'should encode special chars in attribute values' do
268
- input = '<a href="http://example.com" title="<b>&eacute;xamples</b> & things">foo</a>'
269
- output = Nokogiri::HTML.fragment('<a href="http://example.com" title="&lt;b&gt;éxamples&lt;/b&gt; &amp; things">foo</a>').to_xhtml(:encoding => 'utf-8', :indent => 0, :save_with => Nokogiri::XML::Node::SaveOptions::AS_XHTML)
270
- @s.clean(input).must_equal(output)
271
- end
26
+ it 'should sanitize an HTML document' do
27
+ @s.document('<!doctype html><html><b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script></html>')
28
+ .must_equal "<html>Lorem ipsum dolor sit amet </html>"
29
+ end
272
30
 
273
- strings.each do |name, data|
274
- it "should clean #{name} HTML" do
275
- @s.clean(data[:html]).must_equal(data[:relaxed])
276
- end
277
- end
31
+ it 'should not modify the input string' do
32
+ input = '<!DOCTYPE html><b>foo</b>'
33
+ @s.document(input)
34
+ input.must_equal('<!DOCTYPE html><b>foo</b>')
35
+ end
278
36
 
279
- tricky.each do |name, data|
280
- it "should not allow #{name}" do
281
- @s.clean(data[:html]).must_equal(data[:relaxed])
282
- end
283
- end
284
- end
37
+ it 'should not choke on frozen documents' do
38
+ @s.document('<!doctype html><html><b>foo</b>'.freeze).must_equal "<html>foo</html>"
39
+ end
285
40
 
286
- describe 'Full Document parser (using clean_document)' do
287
- before {
288
- @s = Sanitize.new({:elements => %w[!DOCTYPE html]})
289
- @default_doctype = "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">"
290
- }
41
+ it 'should normalize newlines' do
42
+ @s.document("a\r\n\n\r\r\r\nz").must_equal "<html>a\n\n\n\n\nz</html>"
43
+ end
291
44
 
292
- it 'should require HTML element is whitelisted to prevent parser errors' do
293
- assert_raises(RuntimeError, 'You must have the HTML element whitelisted') {
294
- Sanitize.clean_document!('', {:elements => [], :remove_contents => false})
295
- }
296
- end
45
+ it 'should strip control characters (except ASCII whitespace)' do
46
+ sample_control_chars = "\u0001\u0008\u000b\u000e\u001f\u007f\u009f"
47
+ whitespace = "\t\n\f\u0020"
48
+ @s.document("a#{sample_control_chars}#{whitespace}z").must_equal "<html>a#{whitespace}z</html>"
49
+ end
297
50
 
298
- it 'should NOT require HTML element to be whitelisted if remove_contents is true' do
299
- output = '<!DOCTYPE html><html>foo</html>'
300
- Sanitize.clean_document!(output, {:remove_contents => true}).must_equal "<!DOCTYPE html>\n\n"
301
- end
51
+ it 'should strip non-characters' do
52
+ sample_non_chars = "\ufdd0\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}"
53
+ @s.document("a#{sample_non_chars}z").must_equal "<html>az</html>"
54
+ end
302
55
 
303
- it 'adds a doctype tag if not included' do
304
- @s.clean_document('').must_equal("#{@default_doctype}\n\n")
305
- end
56
+ describe 'when html body exceeds Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH' do
57
+ let(:content) do
58
+ content = nest_html_content('<b>foo</b>', Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH)
59
+ "<html>#{content}</html>"
60
+ end
306
61
 
307
- it 'should apply whitelist filtering to HTML element' do
308
- output = "<!DOCTYPE html>\n<html anything='false'></html>\n\n"
309
- @s.clean_document(output).must_equal("<!DOCTYPE html>\n<html></html>\n")
310
- end
62
+ it 'raises an ArgumentError exception' do
63
+ assert_raises ArgumentError do
64
+ @s.document(content)
65
+ end
66
+ end
311
67
 
312
- strings.each do |name, data|
313
- it "should wrap #{name} with DOCTYPE and HTML tag" do
314
- output = data[:document] || data[:default]
315
- @s.clean_document(data[:html]).must_equal("#{@default_doctype}\n<html>#{output}</html>\n")
316
- end
317
- end
68
+ describe 'and :max_tree_depth of -1 is supplied in :parser_options' do
69
+ before do
70
+ @s = Sanitize.new(elements: ['html'], parser_options: { max_tree_depth: -1 })
71
+ end
318
72
 
319
- tricky.each do |name, data|
320
- it "should wrap #{name} with DOCTYPE and HTML tag" do
321
- @s.clean_document(data[:html]).must_equal("#{@default_doctype}\n<html>#{data[:default]}</html>\n")
73
+ it 'does not raise an ArgumentError exception' do
74
+ @s.document(content).must_equal '<html>foo</html>'
75
+ end
76
+ end
77
+ end
322
78
  end
323
- end
324
- end
325
-
326
- describe 'Custom configs' do
327
- it 'should allow attributes on all elements if whitelisted under :all' do
328
- input = '<p class="foo">bar</p>'
329
-
330
- Sanitize.clean(input).must_equal(' bar ')
331
- Sanitize.clean(input, {:elements => ['p'], :attributes => {:all => ['class']}}).must_equal(input)
332
- Sanitize.clean(input, {:elements => ['p'], :attributes => {'div' => ['class']}}).must_equal('<p>bar</p>')
333
- Sanitize.clean(input, {:elements => ['p'], :attributes => {'p' => ['title'], :all => ['class']}}).must_equal(input)
334
- end
335
-
336
- it 'should allow comments when :allow_comments == true' do
337
- input = 'foo <!-- bar --> baz'
338
- Sanitize.clean(input).must_equal('foo baz')
339
- Sanitize.clean(input, :allow_comments => true).must_equal(input)
340
- end
341
-
342
- it 'should allow relative URLs containing colons where the colon is not in the first path segment' do
343
- input = '<a href="/wiki/Special:Random">Random Page</a>'
344
- Sanitize.clean(input, { :elements => ['a'], :attributes => {'a' => ['href']}, :protocols => { 'a' => { 'href' => [:relative] }} }).must_equal(input)
345
- end
346
-
347
- it 'should allow relative URLs containing colons where the colon is part of an anchor' do
348
- input = '<a href="#fn:1">Footnote 1</a>'
349
- Sanitize.clean(input, { :elements => ['a'], :attributes => {'a' => ['href']}, :protocols => { 'a' => { 'href' => [:relative] }} }).must_equal(input)
350
- end
351
-
352
- it 'should allow relative URLs containing colons where the colon is part of an anchor' do
353
- input = '<a href="somepage#fn:1">Footnote 1</a>'
354
- Sanitize.clean(input, { :elements => ['a'], :attributes => {'a' => ['href']}, :protocols => { 'a' => { 'href' => [:relative] }} }).must_equal(input)
355
- end
356
-
357
- it 'should output HTML when :output == :html' do
358
- input = 'foo<br/>bar<br>baz'
359
- Sanitize.clean(input, :elements => ['br'], :output => :html).must_equal('foo<br>bar<br>baz')
360
- end
361
-
362
- it 'should remove the contents of filtered nodes when :remove_contents == true' do
363
- Sanitize.clean('foo bar <div>baz<span>quux</span></div>', :remove_contents => true).must_equal('foo bar ')
364
- end
365
79
 
366
- it 'should remove the contents of specified nodes when :remove_contents is an Array of element names as strings' do
367
- Sanitize.clean('foo bar <div>baz<span>quux</span><script>alert("hello!");</script></div>', :remove_contents => ['script', 'span']).must_equal('foo bar baz ')
368
- end
369
-
370
- it 'should remove the contents of specified nodes when :remove_contents is an Array of element names as symbols' do
371
- Sanitize.clean('foo bar <div>baz<span>quux</span><script>alert("hello!");</script></div>', :remove_contents => [:script, :span]).must_equal('foo bar baz ')
372
- end
373
-
374
- it 'should support encodings other than utf-8' do
375
- html = 'foo&nbsp;bar'
376
- Sanitize.clean(html).must_equal("foo\302\240bar")
377
- Sanitize.clean(html, :output_encoding => 'ASCII').must_equal("foo&#160;bar")
378
- end
379
-
380
- it 'should not allow arbitrary HTML5 data attributes by default' do
381
- config = {
382
- :elements => ['b']
383
- }
384
-
385
- Sanitize.clean('<b data-foo="bar"></b>', config)
386
- .must_equal('<b></b>')
387
-
388
- config[:attributes] = {'b' => ['class']}
389
-
390
- Sanitize.clean('<b class="foo" data-foo="bar"></b>', config)
391
- .must_equal('<b class="foo"></b>')
392
- end
393
-
394
- it 'should allow arbitrary HTML5 data attributes when the :attributes config includes :data' do
395
- config = {
396
- :attributes => {'b' => [:data]},
397
- :elements => ['b']
398
- }
399
-
400
- Sanitize.clean('<b data-foo="valid" data-bar="valid"></b>', config)
401
- .must_equal('<b data-foo="valid" data-bar="valid"></b>')
402
-
403
- Sanitize.clean('<b data-="invalid"></b>', config)
404
- .must_equal('<b></b>')
405
-
406
- Sanitize.clean('<b data-="invalid"></b>', config)
407
- .must_equal('<b></b>')
408
-
409
- Sanitize.clean('<b data-xml="invalid"></b>', config)
410
- .must_equal('<b></b>')
411
-
412
- Sanitize.clean('<b data-xmlfoo="invalid"></b>', config)
413
- .must_equal('<b></b>')
414
-
415
- Sanitize.clean('<b data-f:oo="valid"></b>', config)
416
- .must_equal('<b></b>')
417
-
418
- Sanitize.clean('<b data-f/oo="partial"></b>', config)
419
- .must_equal('<b data-f></b>') # Nokogiri quirk; not ideal, but harmless
420
-
421
- Sanitize.clean('<b data-éfoo="valid"></b>', config)
422
- .must_equal('<b></b>') # Another annoying Nokogiri quirk.
423
- end
424
- end
425
-
426
- describe 'Sanitize.clean' do
427
- it 'should not modify the input string' do
428
- input = '<b>foo</b>'
429
- Sanitize.clean(input)
430
- input.must_equal('<b>foo</b>')
431
- end
432
-
433
- it 'should return a new string' do
434
- input = '<b>foo</b>'
435
- Sanitize.clean(input).must_equal('foo')
436
- end
437
- end
438
-
439
- describe 'Sanitize.clean!' do
440
- it 'should modify the input string' do
441
- input = '<b>foo</b>'
442
- Sanitize.clean!(input)
443
- input.must_equal('foo')
444
- end
445
-
446
- it 'should return the string if it was modified' do
447
- input = '<b>foo</b>'
448
- Sanitize.clean!(input).must_equal('foo')
449
- end
450
-
451
- it 'should return nil if the string was not modified' do
452
- input = 'foo'
453
- Sanitize.clean!(input).must_equal(nil)
454
- end
455
- end
456
-
457
- describe 'Sanitize.clean_document' do
458
- before { @config = { :elements => ['html', 'p'] } }
459
-
460
- it 'should be idempotent' do
461
- input = '<!DOCTYPE html><html><p>foo</p></html>'
462
- first = Sanitize.clean_document(input, @config)
463
- second = Sanitize.clean_document(first, @config)
464
- second.must_equal first
465
- second.wont_be_nil
466
- end
467
-
468
- it 'should handle nil without raising' do
469
- Sanitize.clean_document(nil).must_equal nil
470
- end
471
-
472
- it 'should not modify the input string' do
473
- input = '<!DOCTYPE html><b>foo</b>'
474
- Sanitize.clean_document(input, @config)
475
- input.must_equal('<!DOCTYPE html><b>foo</b>')
476
- end
477
-
478
- it 'should return a new string' do
479
- input = '<!DOCTYPE html><b>foo</b>'
480
- Sanitize.clean_document(input, @config).must_equal("<!DOCTYPE html>\n<html>foo</html>\n")
481
- end
482
- end
483
-
484
- describe 'Sanitize.clean_document!' do
485
- before { @config = { :elements => ['html'] } }
486
-
487
- it 'should modify the input string' do
488
- input = '<!DOCTYPE html><html><body><b>foo</b></body></html>'
489
- Sanitize.clean_document!(input, @config)
490
- input.must_equal("<!DOCTYPE html>\n<html>foo</html>\n")
491
- end
492
-
493
- it 'should return the string if it was modified' do
494
- input = '<!DOCTYPE html><html><body><b>foo</b></body></html>'
495
- Sanitize.clean_document!(input, @config).must_equal("<!DOCTYPE html>\n<html>foo</html>\n")
496
- end
497
-
498
- it 'should return nil if the string was not modified' do
499
- input = "<!DOCTYPE html>\n<html></html>\n"
500
- Sanitize.clean_document!(input, @config).must_equal(nil)
501
- end
502
- end
503
-
504
- describe 'transformers' do
505
- # YouTube embed transformer.
506
- youtube = lambda do |env|
507
- node = env[:node]
508
- node_name = env[:node_name]
509
-
510
- # Don't continue if this node is already whitelisted or is not an element.
511
- return if env[:is_whitelisted] || !node.element?
512
-
513
- # Don't continue unless the node is an iframe.
514
- return unless node_name == 'iframe'
515
-
516
- # Verify that the video URL is actually a valid YouTube video URL.
517
- return unless node['src'] =~ /\Ahttps?:\/\/(?:www\.)?youtube(?:-nocookie)?\.com\//
518
-
519
- # We're now certain that this is a YouTube embed, but we still need to run
520
- # it through a special Sanitize step to ensure that no unwanted elements or
521
- # attributes that don't belong in a YouTube embed can sneak in.
522
- Sanitize.clean_node!(node, {
523
- :elements => %w[iframe],
524
-
525
- :attributes => {
526
- 'iframe' => %w[allowfullscreen frameborder height src width]
527
- }
528
- })
529
-
530
- # Now that we're sure that this is a valid YouTube embed and that there are
531
- # no unwanted elements or attributes hidden inside it, we can tell Sanitize
532
- # to whitelist the current node.
533
- {:node_whitelist => [node]}
534
- end
535
-
536
- it 'should receive a complete env Hash as input' do
537
- Sanitize.clean!('<SPAN>foo</SPAN>', :foo => :bar, :transformers => lambda {|env|
538
- return unless env[:node].element?
539
-
540
- env[:config][:foo].must_equal(:bar)
541
- env[:is_whitelisted].must_equal(false)
542
- env[:node].must_be_kind_of(Nokogiri::XML::Node)
543
- env[:node_name].must_equal('span')
544
- env[:node_whitelist].must_be_kind_of(Set)
545
- env[:node_whitelist].must_be_empty
546
- })
547
- end
548
-
549
- it 'should traverse all node types, including the fragment itself' do
550
- nodes = []
551
-
552
- Sanitize.clean!('<div>foo</div><!--bar--><script>cdata!</script>', :transformers => proc {|env|
553
- nodes << env[:node_name]
554
- })
555
-
556
- nodes.must_equal(%w[
557
- text div comment #cdata-section script #document-fragment
558
- ])
559
- end
560
-
561
- it 'should traverse in depth-first mode by default' do
562
- nodes = []
563
-
564
- Sanitize.clean!('<div><span>foo</span></div><p>bar</p>', :transformers => proc {|env|
565
- env[:traversal_mode].must_equal(:depth)
566
- nodes << env[:node_name] if env[:node].element?
567
- })
568
-
569
- nodes.must_equal(['span', 'div', 'p'])
570
- end
571
-
572
- it 'should traverse in breadth-first mode when using :transformers_breadth' do
573
- nodes = []
574
-
575
- Sanitize.clean!('<div><span>foo</span></div><p>bar</p>', :transformers_breadth => proc {|env|
576
- env[:traversal_mode].must_equal(:breadth)
577
- nodes << env[:node_name] if env[:node].element?
578
- })
579
-
580
- nodes.must_equal(['div', 'span', 'p'])
581
- end
582
-
583
- it 'should whitelist nodes in the node whitelist' do
584
- Sanitize.clean!('<div class="foo">foo</div><span>bar</span>', :transformers => [
585
- proc {|env|
586
- {:node_whitelist => [env[:node]]} if env[:node_name] == 'div'
587
- },
588
-
589
- proc {|env|
590
- env[:is_whitelisted].must_equal(false) unless env[:node_name] == 'div'
591
- env[:is_whitelisted].must_equal(true) if env[:node_name] == 'div'
592
- env[:node_whitelist].must_include(env[:node]) if env[:node_name] == 'div'
593
- }
594
- ]).must_equal('<div class="foo">foo</div>bar')
595
- end
596
-
597
- it 'should clear the node whitelist after each fragment' do
598
- called = false
80
+ describe '#fragment' do
81
+ it 'should sanitize an HTML fragment' do
82
+ @s.fragment('<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>')
83
+ .must_equal 'Lorem ipsum dolor sit amet '
84
+ end
599
85
 
600
- Sanitize.clean!('<div>foo</div>', :transformers => proc {|env|
601
- {:node_whitelist => [env[:node]]}
602
- })
86
+ it 'should not modify the input string' do
87
+ input = '<b>foo</b>'
88
+ @s.fragment(input)
89
+ input.must_equal '<b>foo</b>'
90
+ end
603
91
 
604
- Sanitize.clean!('<div>foo</div>', :transformers => proc {|env|
605
- called = true
606
- env[:is_whitelisted].must_equal(false)
607
- env[:node_whitelist].must_be_empty
608
- })
92
+ it 'should not choke on fragments containing <html> or <body>' do
93
+ @s.fragment('<html><b>foo</b></html>').must_equal 'foo'
94
+ @s.fragment('<body><b>foo</b></body>').must_equal 'foo'
95
+ @s.fragment('<html><body><b>foo</b></body></html>').must_equal 'foo'
96
+ @s.fragment('<!DOCTYPE html><html><body><b>foo</b></body></html>').must_equal 'foo'
97
+ end
609
98
 
610
- called.must_equal(true)
611
- end
99
+ it 'should not choke on frozen fragments' do
100
+ @s.fragment('<b>foo</b>'.freeze).must_equal 'foo'
101
+ end
612
102
 
613
- it 'should allow youtube video embeds via the youtube transformer' do
614
- input = '<iframe width="420" height="315" src="http://www.youtube.com/embed/QH2-TGUlwu4" frameborder="0" allowfullscreen bogus="bogus"><script>alert()</script></iframe>'
615
- output = Nokogiri::HTML::DocumentFragment.parse('<iframe width="420" height="315" src="http://www.youtube.com/embed/QH2-TGUlwu4" frameborder="0" allowfullscreen>alert()</iframe>').to_html(:encoding => 'utf-8', :indent => 0)
103
+ it 'should normalize newlines' do
104
+ @s.fragment("a\r\n\n\r\r\r\nz").must_equal "a\n\n\n\n\nz"
105
+ end
616
106
 
617
- Sanitize.clean!(input, :transformers => youtube).must_equal(output)
618
- end
107
+ it 'should strip control characters (except ASCII whitespace)' do
108
+ sample_control_chars = "\u0001\u0008\u000b\u000e\u001f\u007f\u009f"
109
+ whitespace = "\t\n\f\u0020"
110
+ @s.fragment("a#{sample_control_chars}#{whitespace}z").must_equal "a#{whitespace}z"
111
+ end
619
112
 
620
- it 'should allow https youtube video embeds via the youtube transformer' do
621
- input = '<iframe width="420" height="315" src="https://www.youtube.com/embed/QH2-TGUlwu4" frameborder="0" allowfullscreen bogus="bogus"><script>alert()</script></iframe>'
622
- output = Nokogiri::HTML::DocumentFragment.parse('<iframe width="420" height="315" src="https://www.youtube.com/embed/QH2-TGUlwu4" frameborder="0" allowfullscreen>alert()</iframe>').to_html(:encoding => 'utf-8', :indent => 0)
113
+ it 'should strip non-characters' do
114
+ sample_non_chars = "\ufdd0\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}"
115
+ @s.fragment("a#{sample_non_chars}z").must_equal "az"
116
+ end
623
117
 
624
- Sanitize.clean!(input, :transformers => youtube).must_equal(output)
625
- end
118
+ describe 'when html body exceeds Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH' do
119
+ let(:content) do
120
+ content = nest_html_content('<b>foo</b>', Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH)
121
+ "<body>#{content}</body>"
122
+ end
626
123
 
627
- it 'should allow privacy-enhanced youtube video embeds via the youtube transformer' do
628
- input = '<iframe width="420" height="315" src="http://www.youtube-nocookie.com/embed/QH2-TGUlwu4" frameborder="0" allowfullscreen bogus="bogus"><script>alert()</script></iframe>'
629
- output = Nokogiri::HTML::DocumentFragment.parse('<iframe width="420" height="315" src="http://www.youtube-nocookie.com/embed/QH2-TGUlwu4" frameborder="0" allowfullscreen>alert()</iframe>').to_html(:encoding => 'utf-8', :indent => 0)
124
+ it 'raises an ArgumentError exception' do
125
+ assert_raises ArgumentError do
126
+ @s.fragment(content)
127
+ end
128
+ end
630
129
 
631
- Sanitize.clean!(input, :transformers => youtube).must_equal(output)
632
- end
130
+ describe 'and :max_tree_depth of -1 is supplied in :parser_options' do
131
+ before do
132
+ @s = Sanitize.new(parser_options: { max_tree_depth: -1 })
133
+ end
633
134
 
634
- it 'should not allow non-youtube video embeds via the youtube transformer' do
635
- input = '<iframe width="420" height="315" src="http://www.fake-youtube.com/embed/QH2-TGUlwu4" frameborder="0" allowfullscreen></iframe>'
636
- output = ''
135
+ it 'does not raise an ArgumentError exception' do
136
+ @s.fragment(content).must_equal 'foo'
137
+ end
138
+ end
139
+ end
140
+ end
637
141
 
638
- Sanitize.clean!(input, :transformers => youtube).must_equal(output)
639
- end
640
- end
142
+ describe '#node!' do
143
+ it 'should sanitize a Nokogiri::XML::Node' do
144
+ doc = Nokogiri::HTML5.parse('<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>')
145
+ frag = doc.fragment
641
146
 
642
- describe 'bugs' do
643
- it 'should not have Nokogiri 1.4.2+ unterminated script/style element bug' do
644
- Sanitize.clean!('foo <script>bar').must_equal('foo bar')
645
- Sanitize.clean!('foo <style>bar').must_equal('foo bar')
646
- end
647
- end
147
+ doc.xpath('/html/body/node()').each {|node| frag << node }
648
148
 
649
- describe 'Malicious HTML' do
650
- make_my_diffs_pretty!
651
- parallelize_me!
149
+ @s.node!(frag)
150
+ frag.to_html.must_equal 'Lorem ipsum dolor sit amet '
151
+ end
652
152
 
653
- before do
654
- @s = Sanitize.new(Sanitize::Config::RELAXED)
153
+ describe "when the given node is a document and <html> isn't allowlisted" do
154
+ it 'should raise a Sanitize::Error' do
155
+ doc = Nokogiri::HTML5.parse('foo')
156
+ proc { @s.node!(doc) }.must_raise Sanitize::Error
157
+ end
158
+ end
159
+ end
655
160
  end
656
161
 
657
- # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
658
- # attempt to preserve server-side includes. This can result in XSS since an
659
- # unescaped double quote can allow an attacker to inject a non-whitelisted
660
- # attribute. Sanitize works around this by implementing its own escaping for
661
- # affected attributes.
662
- #
663
- # The relevant libxml2 code is here:
664
- # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
665
- describe 'unsafe libxml2 server-side includes in attributes' do
666
- tag_configs = [
667
- {
668
- tag_name: 'a',
669
- escaped_attrs: %w[ action href src name ],
670
- unescaped_attrs: []
671
- },
672
-
673
- {
674
- tag_name: 'div',
675
- escaped_attrs: %w[ action href src ],
676
- unescaped_attrs: %w[ name ]
677
- }
678
- ]
679
-
680
- before do
681
- @s = Sanitize.new({
682
- elements: %w[ a div ],
683
-
684
- attributes: {
685
- all: %w[ action href src name ]
686
- }
687
- })
162
+ describe 'class methods' do
163
+ describe '.document' do
164
+ it 'should sanitize an HTML document with the given config' do
165
+ html = '<!doctype html><html><b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script></html>'
166
+ Sanitize.document(html, :elements => ['html'])
167
+ .must_equal "<html>Lorem ipsum dolor sit amet </html>"
168
+ end
688
169
  end
689
170
 
690
- tag_configs.each do |tag_config|
691
- tag_name = tag_config[:tag_name]
692
-
693
- tag_config[:escaped_attrs].each do |attr_name|
694
- input = %[<#{tag_name} #{attr_name}='examp<!--" onmouseover=alert(1)>-->le.com'>foo</#{tag_name}>]
695
-
696
- it 'should escape unsafe characters in attributes' do
697
- @s.clean(input).must_equal(%[<#{tag_name} #{attr_name}="examp<!--%22%20onmouseover=alert(1)>-->le.com">foo</#{tag_name}>])
698
- end
699
-
700
- it 'should round-trip to the same output' do
701
- output = @s.clean(input)
702
- @s.clean(output).must_equal(output)
703
- end
171
+ describe '.fragment' do
172
+ it 'should sanitize an HTML fragment with the given config' do
173
+ html = '<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>'
174
+ Sanitize.fragment(html, :elements => ['strong'])
175
+ .must_equal 'Lorem ipsum <strong>dolor</strong> sit amet '
704
176
  end
177
+ end
705
178
 
706
- tag_config[:unescaped_attrs].each do |attr_name|
707
- input = %[<#{tag_name} #{attr_name}='examp<!--" onmouseover=alert(1)>-->le.com'>foo</#{tag_name}>]
179
+ describe '.node!' do
180
+ it 'should sanitize a Nokogiri::XML::Node with the given config' do
181
+ doc = Nokogiri::HTML5.parse('<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>')
182
+ frag = doc.fragment
708
183
 
709
- it 'should not escape characters unnecessarily' do
710
- @s.clean(input).must_equal(input)
711
- end
184
+ doc.xpath('/html/body/node()').each {|node| frag << node }
712
185
 
713
- it 'should round-trip to the same output' do
714
- output = @s.clean(input)
715
- @s.clean(output).must_equal(output)
716
- end
186
+ Sanitize.node!(frag, :elements => ['strong'])
187
+ frag.to_html.must_equal 'Lorem ipsum <strong>dolor</strong> sit amet '
717
188
  end
718
189
  end
719
190
  end
720
- end
721
191
 
192
+ private
193
+
194
+ def nest_html_content(html_content, depth)
195
+ "#{'<span>' * depth}#{html_content}#{'</span>' * depth}"
196
+ end
197
+ end