sanitize 4.6.4 → 6.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -17,124 +17,126 @@ describe 'Malicious HTML' do
17
17
 
18
18
  describe 'comments' do
19
19
  it 'should not allow script injection via conditional comments' do
20
- @s.fragment(%[<!--[if gte IE 4]>\n<script>alert('XSS');</script>\n<![endif]-->]).
20
+ _(@s.fragment(%[<!--[if gte IE 4]>\n<script>alert('XSS');</script>\n<![endif]-->])).
21
21
  must_equal ''
22
22
  end
23
23
  end
24
24
 
25
25
  describe 'interpolation (ERB, PHP, etc.)' do
26
26
  it 'should escape ERB-style tags' do
27
- @s.fragment('<% naughty_ruby_code %>').
27
+ _(@s.fragment('<% naughty_ruby_code %>')).
28
28
  must_equal '&lt;% naughty_ruby_code %&gt;'
29
29
 
30
- @s.fragment('<%= naughty_ruby_code %>').
30
+ _(@s.fragment('<%= naughty_ruby_code %>')).
31
31
  must_equal '&lt;%= naughty_ruby_code %&gt;'
32
32
  end
33
33
 
34
34
  it 'should remove PHP-style tags' do
35
- @s.fragment('<? naughtyPHPCode(); ?>').
35
+ _(@s.fragment('<? naughtyPHPCode(); ?>')).
36
36
  must_equal ''
37
37
 
38
- @s.fragment('<?= naughtyPHPCode(); ?>').
38
+ _(@s.fragment('<?= naughtyPHPCode(); ?>')).
39
39
  must_equal ''
40
40
  end
41
41
  end
42
42
 
43
43
  describe '<body>' do
44
44
  it 'should not be possible to inject JS via a malformed event attribute' do
45
- @s.document('<html><head></head><body onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert("XSS")></body></html>').
46
- must_equal "<html><head></head><body></body></html>\n"
45
+ _(@s.document('<html><head></head><body onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert("XSS")></body></html>')).
46
+ must_equal "<html><head></head><body></body></html>"
47
47
  end
48
48
  end
49
49
 
50
50
  describe '<iframe>' do
51
51
  it 'should not be possible to inject an iframe using an improperly closed tag' do
52
- @s.fragment(%[<iframe src=http://ha.ckers.org/scriptlet.html <]).
52
+ _(@s.fragment(%[<iframe src=http://ha.ckers.org/scriptlet.html <])).
53
53
  must_equal ''
54
54
  end
55
55
  end
56
56
 
57
57
  describe '<img>' do
58
58
  it 'should not be possible to inject JS via an unquoted <img> src attribute' do
59
- @s.fragment("<img src=javascript:alert('XSS')>").must_equal '<img>'
59
+ _(@s.fragment("<img src=javascript:alert('XSS')>")).must_equal '<img>'
60
60
  end
61
61
 
62
62
  it 'should not be possible to inject JS using grave accents as <img> src delimiters' do
63
- @s.fragment("<img src=`javascript:alert('XSS')`>").must_equal '<img>'
63
+ _(@s.fragment("<img src=`javascript:alert('XSS')`>")).must_equal '<img>'
64
64
  end
65
65
 
66
66
  it 'should not be possible to inject <script> via a malformed <img> tag' do
67
- @s.fragment('<img """><script>alert("XSS")</script>">').
68
- must_equal '<img>alert("XSS")"&gt;'
67
+ _(@s.fragment('<img """><script>alert("XSS")</script>">')).
68
+ must_equal '<img>"&gt;'
69
69
  end
70
70
 
71
71
  it 'should not be possible to inject protocol-based JS' do
72
- @s.fragment('<img src=&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;>').
72
+ _(@s.fragment('<img src=&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;>')).
73
73
  must_equal '<img>'
74
74
 
75
- @s.fragment('<img src=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>').
75
+ _(@s.fragment('<img src=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>')).
76
76
  must_equal '<img>'
77
77
 
78
- @s.fragment('<img src=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>').
78
+ _(@s.fragment('<img src=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>')).
79
79
  must_equal '<img>'
80
80
 
81
81
  # Encoded tab character.
82
- @s.fragment(%[<img src="jav&#x09;ascript:alert('XSS');">]).
82
+ _(@s.fragment(%[<img src="jav&#x09;ascript:alert('XSS');">])).
83
83
  must_equal '<img>'
84
84
 
85
85
  # Encoded newline.
86
- @s.fragment(%[<img src="jav&#x0A;ascript:alert('XSS');">]).
86
+ _(@s.fragment(%[<img src="jav&#x0A;ascript:alert('XSS');">])).
87
87
  must_equal '<img>'
88
88
 
89
89
  # Encoded carriage return.
90
- @s.fragment(%[<img src="jav&#x0D;ascript:alert('XSS');">]).
90
+ _(@s.fragment(%[<img src="jav&#x0D;ascript:alert('XSS');">])).
91
91
  must_equal '<img>'
92
92
 
93
93
  # Null byte.
94
- @s.fragment(%[<img src=java\0script:alert("XSS")>]).
94
+ _(@s.fragment(%[<img src=java\0script:alert("XSS")>])).
95
95
  must_equal '<img>'
96
96
 
97
97
  # Spaces plus meta char.
98
- @s.fragment(%[<img src=" &#14; javascript:alert('XSS');">]).
98
+ _(@s.fragment(%[<img src=" &#14; javascript:alert('XSS');">])).
99
99
  must_equal '<img>'
100
100
 
101
101
  # Mixed spaces and tabs.
102
- @s.fragment(%[<img src="j\na v\tascript://alert('XSS');">]).
102
+ _(@s.fragment(%[<img src="j\na v\tascript://alert('XSS');">])).
103
103
  must_equal '<img>'
104
104
  end
105
105
 
106
106
  it 'should not be possible to inject protocol-based JS via whitespace' do
107
- @s.fragment(%[<img src="jav\tascript:alert('XSS');">]).
107
+ _(@s.fragment(%[<img src="jav\tascript:alert('XSS');">])).
108
108
  must_equal '<img>'
109
109
  end
110
110
 
111
111
  it 'should not be possible to inject JS using a half-open <img> tag' do
112
- @s.fragment(%[<img src="javascript:alert('XSS')"]).
112
+ _(@s.fragment(%[<img src="javascript:alert('XSS')"])).
113
113
  must_equal ''
114
114
  end
115
115
  end
116
116
 
117
117
  describe '<script>' do
118
118
  it 'should not be possible to inject <script> using a malformed non-alphanumeric tag name' do
119
- @s.fragment(%[<script/xss src="http://ha.ckers.org/xss.js">alert(1)</script>]).
120
- must_equal 'alert(1)'
119
+ _(@s.fragment(%[<script/xss src="http://ha.ckers.org/xss.js">alert(1)</script>])).
120
+ must_equal ''
121
121
  end
122
122
 
123
123
  it 'should not be possible to inject <script> via extraneous open brackets' do
124
- @s.fragment(%[<<script>alert("XSS");//<</script>]).
125
- must_equal '&lt;alert("XSS");//&lt;'
124
+ _(@s.fragment(%[<<script>alert("XSS");//<</script>])).
125
+ must_equal '&lt;'
126
126
  end
127
127
  end
128
128
 
129
129
  # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
130
130
  # attempt to preserve server-side includes. This can result in XSS since an
131
- # unescaped double quote can allow an attacker to inject a non-whitelisted
131
+ # unescaped double quote can allow an attacker to inject a non-allowlisted
132
132
  # attribute. Sanitize works around this by implementing its own escaping for
133
133
  # affected attributes.
134
134
  #
135
135
  # The relevant libxml2 code is here:
136
136
  # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
137
137
  describe 'unsafe libxml2 server-side includes in attributes' do
138
+ using_unpatched_libxml2 = Nokogiri::VersionInfo.instance.libxml2_using_system?
139
+
138
140
  tag_configs = [
139
141
  {
140
142
  tag_name: 'a',
@@ -166,12 +168,26 @@ describe 'Malicious HTML' do
166
168
  input = %[<#{tag_name} #{attr_name}='examp<!--" onmouseover=alert(1)>-->le.com'>foo</#{tag_name}>]
167
169
 
168
170
  it 'should escape unsafe characters in attributes' do
169
- @s.fragment(input).must_equal(%[<#{tag_name} #{attr_name}="examp<!--%22%20onmouseover=alert(1)>-->le.com">foo</#{tag_name}>])
171
+ skip "behavior should only exist in nokogiri's patched libxml" if using_unpatched_libxml2
172
+
173
+ # This uses Nokogumbo's HTML-compliant serializer rather than
174
+ # libxml2's.
175
+ _(@s.fragment(input)).
176
+ must_equal(%[<#{tag_name} #{attr_name}="examp<!--%22%20onmouseover=alert(1)>-->le.com">foo</#{tag_name}>])
177
+
178
+ # This uses the not-quite-standards-compliant libxml2 serializer via
179
+ # Nokogiri, so the output may be a little different as of Nokogiri
180
+ # 1.10.2 when using Nokogiri's vendored libxml2 due to this patch:
181
+ # https://github.com/sparklemotion/nokogiri/commit/4852e43cb6039e26d8c51af78621e539cbf46c5d
182
+ fragment = Nokogiri::HTML.fragment(input)
183
+ @s.node!(fragment)
184
+ _(fragment.to_html).
185
+ must_equal(%[<#{tag_name} #{attr_name}="examp&lt;!--%22%20onmouseover=alert(1)&gt;--&gt;le.com">foo</#{tag_name}>])
170
186
  end
171
187
 
172
188
  it 'should round-trip to the same output' do
173
189
  output = @s.fragment(input)
174
- @s.fragment(output).must_equal(output)
190
+ _(@s.fragment(output)).must_equal(output)
175
191
  end
176
192
  end
177
193
 
@@ -179,14 +195,145 @@ describe 'Malicious HTML' do
179
195
  input = %[<#{tag_name} #{attr_name}='examp<!--" onmouseover=alert(1)>-->le.com'>foo</#{tag_name}>]
180
196
 
181
197
  it 'should not escape characters unnecessarily' do
182
- @s.fragment(input).must_equal(input)
198
+ skip "behavior should only exist in nokogiri's patched libxml" if using_unpatched_libxml2
199
+
200
+ # This uses Nokogumbo's HTML-compliant serializer rather than
201
+ # libxml2's.
202
+ _(@s.fragment(input)).
203
+ must_equal(%[<#{tag_name} #{attr_name}="examp<!--&quot; onmouseover=alert(1)>-->le.com">foo</#{tag_name}>])
204
+
205
+ # This uses the not-quite-standards-compliant libxml2 serializer via
206
+ # Nokogiri, so the output may be a little different as of Nokogiri
207
+ # 1.10.2 when using Nokogiri's vendored libxml2 due to this patch:
208
+ # https://github.com/sparklemotion/nokogiri/commit/4852e43cb6039e26d8c51af78621e539cbf46c5d
209
+ fragment = Nokogiri::HTML.fragment(input)
210
+ @s.node!(fragment)
211
+ _(fragment.to_html).
212
+ must_equal(%[<#{tag_name} #{attr_name}='examp&lt;!--" onmouseover=alert(1)&gt;--&gt;le.com'>foo</#{tag_name}>])
183
213
  end
184
214
 
185
215
  it 'should round-trip to the same output' do
186
216
  output = @s.fragment(input)
187
- @s.fragment(output).must_equal(output)
217
+ _(@s.fragment(output)).must_equal(output)
188
218
  end
189
219
  end
190
220
  end
191
221
  end
222
+
223
+ # https://github.com/rgrove/sanitize/security/advisories/GHSA-p4x4-rw2p-8j8m
224
+ describe 'foreign content bypass in relaxed config' do
225
+ it 'prevents a sanitization bypass via carefully crafted foreign content' do
226
+ %w[iframe noembed noframes noscript plaintext script style xmp].each do |tag_name|
227
+ _(@s.fragment(%[<math><#{tag_name}>/*&lt;/#{tag_name}&gt;&lt;img src onerror=alert(1)>*/])).
228
+ must_equal ''
229
+
230
+ _(@s.fragment(%[<svg><#{tag_name}>/*&lt;/#{tag_name}&gt;&lt;img src onerror=alert(1)>*/])).
231
+ must_equal ''
232
+ end
233
+ end
234
+ end
235
+
236
+ # These tests cover an unsupported and unsafe custom config that allows MathML
237
+ # and SVG elements, which Sanitize's docs specifically say multiple times in
238
+ # big prominent warnings that you SHOULD NOT DO because Sanitize doesn't
239
+ # support MathML or SVG.
240
+ #
241
+ # Do not use the custom configs you see in these tests! If you do, you may be
242
+ # creating XSS vulnerabilities in your application.
243
+ describe 'foreign content bypass in unsafe custom config that allows MathML or SVG' do
244
+ unescaped_content_elements = %w[
245
+ noembed
246
+ noframes
247
+ plaintext
248
+ script
249
+ xmp
250
+ ]
251
+
252
+ removed_content_elements = %w[
253
+ iframe
254
+ ]
255
+
256
+ removed_elements = %w[
257
+ noscript
258
+ style
259
+ ]
260
+
261
+ before do
262
+ @s = Sanitize.new(
263
+ Sanitize::Config.merge(
264
+ Sanitize::Config::RELAXED,
265
+ elements: Sanitize::Config::RELAXED[:elements] +
266
+ unescaped_content_elements +
267
+ removed_content_elements +
268
+ %w[math svg]
269
+ )
270
+ )
271
+ end
272
+
273
+ unescaped_content_elements.each do |name|
274
+ it "forcibly escapes text content inside `<#{name}>` in a MathML namespace" do
275
+ assert_equal(
276
+ "<math><#{name}>&lt;img src=x onerror=alert(1)&gt;</#{name}></math>",
277
+ @s.fragment("<math><#{name}>&lt;img src=x onerror=alert(1)&gt;</#{name}>")
278
+ )
279
+ end
280
+
281
+ it "forcibly escapes text content inside `<#{name}>` in an SVG namespace" do
282
+ assert_equal(
283
+ "<svg><#{name}>&lt;img src=x onerror=alert(1)&gt;</#{name}></svg>",
284
+ @s.fragment("<svg><#{name}>&lt;img src=x onerror=alert(1)&gt;</#{name}>")
285
+ )
286
+ end
287
+ end
288
+
289
+ removed_content_elements.each do |name|
290
+ it "removes text content inside `<#{name}>` in a MathML namespace" do
291
+ assert_equal(
292
+ "<math><#{name}></#{name}></math>",
293
+ @s.fragment("<math><#{name}>&lt;img src=x onerror=alert(1)&gt;</#{name}>")
294
+ )
295
+ end
296
+
297
+ it "removes text content inside `<#{name}>` in an SVG namespace" do
298
+ assert_equal(
299
+ "<svg><#{name}></#{name}></svg>",
300
+ @s.fragment("<svg><#{name}>&lt;img src=x onerror=alert(1)&gt;</#{name}>")
301
+ )
302
+ end
303
+ end
304
+
305
+ removed_elements.each do |name|
306
+ it "removes `<#{name}>` elements in a MathML namespace" do
307
+ assert_equal(
308
+ '<math></math>',
309
+ @s.fragment("<math><#{name}>&lt;img src=x onerror=alert(1)&gt;</#{name}>")
310
+ )
311
+ end
312
+
313
+ it "removes `<#{name}>` elements in an SVG namespace" do
314
+ assert_equal(
315
+ '<svg></svg>',
316
+ @s.fragment("<svg><#{name}>&lt;img src=x onerror=alert(1)&gt;</#{name}>")
317
+ )
318
+ end
319
+ end
320
+ end
321
+
322
+ describe 'sanitization bypass by exploiting scripting-disabled <noscript> behavior' do
323
+ before do
324
+ @s = Sanitize.new(
325
+ Sanitize::Config.merge(
326
+ Sanitize::Config::RELAXED,
327
+ elements: Sanitize::Config::RELAXED[:elements] + ['noscript']
328
+ )
329
+ )
330
+ end
331
+
332
+ it 'is prevented by removing `<noscript>` elements regardless of the allowlist' do
333
+ assert_equal(
334
+ '',
335
+ @s.fragment(%[<noscript><div id='</noscript>&lt;img src=x onerror=alert(1)&gt; '>])
336
+ )
337
+ end
338
+ end
192
339
  end
data/test/test_parser.rb CHANGED
@@ -6,55 +6,26 @@ describe 'Parser' do
6
6
  parallelize_me!
7
7
 
8
8
  it 'should translate valid entities into characters' do
9
- Sanitize.fragment("&apos;&eacute;&amp;").must_equal("'é&amp;")
9
+ _(Sanitize.fragment("&apos;&eacute;&amp;")).must_equal("'é&amp;")
10
10
  end
11
11
 
12
12
  it 'should translate orphaned ampersands into entities' do
13
- Sanitize.fragment('at&t').must_equal('at&amp;t')
13
+ _(Sanitize.fragment('at&t')).must_equal('at&amp;t')
14
14
  end
15
15
 
16
16
  it 'should not add newlines after tags when serializing a fragment' do
17
- Sanitize.fragment("<div>foo\n\n<p>bar</p><div>\nbaz</div></div><div>quux</div>", :elements => ['div', 'p'])
17
+ _(Sanitize.fragment("<div>foo\n\n<p>bar</p><div>\nbaz</div></div><div>quux</div>", :elements => ['div', 'p']))
18
18
  .must_equal "<div>foo\n\n<p>bar</p><div>\nbaz</div></div><div>quux</div>"
19
19
  end
20
20
 
21
21
  it 'should not have the Nokogiri 1.4.2+ unterminated script/style element bug' do
22
- Sanitize.fragment('foo <script>bar').must_equal 'foo bar'
23
- Sanitize.fragment('foo <style>bar').must_equal 'foo bar'
22
+ _(Sanitize.fragment('foo <script>bar')).must_equal 'foo '
23
+ _(Sanitize.fragment('foo <style>bar')).must_equal 'foo '
24
24
  end
25
25
 
26
26
  it 'ambiguous non-tag brackets like "1 > 2 and 2 < 1" should be parsed correctly' do
27
- Sanitize.fragment('1 > 2 and 2 < 1').must_equal '1 &gt; 2 and 2 &lt; 1'
28
- Sanitize.fragment('OMG HAPPY BIRTHDAY! *<:-D').must_equal 'OMG HAPPY BIRTHDAY! *&lt;:-D'
29
- end
30
-
31
- # https://github.com/sparklemotion/nokogiri/issues/1008
32
- it 'should work around the libxml2 content-type meta tag bug' do
33
- Sanitize.document('<html><head></head><body>Howdy!</body></html>',
34
- :elements => %w[html head body]
35
- ).must_equal "<html><head></head><body>Howdy!</body></html>\n"
36
-
37
- Sanitize.document('<html><head></head><body>Howdy!</body></html>',
38
- :elements => %w[html head meta body]
39
- ).must_equal "<html><head></head><body>Howdy!</body></html>\n"
40
-
41
- Sanitize.document('<html><head><meta charset="utf-8"></head><body>Howdy!</body></html>',
42
- :elements => %w[html head meta body],
43
- :attributes => {'meta' => ['charset']}
44
- ).must_equal "<html><head><meta charset=\"utf-8\"></head><body>Howdy!</body></html>\n"
45
-
46
- Sanitize.document('<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8"></head><body>Howdy!</body></html>',
47
- :elements => %w[html head meta body],
48
- :attributes => {'meta' => %w[charset content http-equiv]}
49
- ).must_equal "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"></head><body>Howdy!</body></html>\n"
50
-
51
- # Edge case: an existing content-type meta tag with a non-UTF-8 content type
52
- # will be converted to UTF-8, since that's the only output encoding we
53
- # support.
54
- Sanitize.document('<html><head><meta http-equiv="content-type" content="text/html;charset=us-ascii"></head><body>Howdy!</body></html>',
55
- :elements => %w[html head meta body],
56
- :attributes => {'meta' => %w[charset content http-equiv]}
57
- ).must_equal "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\"></head><body>Howdy!</body></html>\n"
27
+ _(Sanitize.fragment('1 > 2 and 2 < 1')).must_equal '1 &gt; 2 and 2 &lt; 1'
28
+ _(Sanitize.fragment('OMG HAPPY BIRTHDAY! *<:-D')).must_equal 'OMG HAPPY BIRTHDAY! *&lt;:-D'
58
29
  end
59
30
 
60
31
  describe 'when siblings are added after a node during traversal' do
@@ -84,11 +55,11 @@ describe 'Parser' do
84
55
  siblings << env[:node][:id]
85
56
  end
86
57
 
87
- return {:node_whitelist => [env[:node]]}
58
+ return {:node_allowlist => [env[:node]]}
88
59
  })
89
60
 
90
61
  # All siblings should be traversed, and in the order added.
91
- siblings.must_equal [
62
+ _(siblings).must_equal [
92
63
  "added_one_one_one",
93
64
  "added_one_one",
94
65
  "added_one_two",
@@ -9,7 +9,7 @@ describe 'Sanitize' do
9
9
  ]
10
10
 
11
11
  Sanitize.new({ :transformers => transformers })
12
- transformers.length.must_equal(1)
12
+ _(transformers.length).must_equal(1)
13
13
  end
14
14
  end
15
15
 
@@ -24,42 +24,118 @@ describe 'Sanitize' do
24
24
  end
25
25
 
26
26
  it 'should sanitize an HTML document' do
27
- @s.document('<!doctype html><html><b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script></html>')
28
- .must_equal "<html>Lorem ipsum dolor sit amet alert(\"hello world\");</html>\n"
27
+ _(@s.document('<!doctype html><html><b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script></html>'))
28
+ .must_equal "<html>Lorem ipsum dolor sit amet </html>"
29
29
  end
30
30
 
31
31
  it 'should not modify the input string' do
32
32
  input = '<!DOCTYPE html><b>foo</b>'
33
33
  @s.document(input)
34
- input.must_equal('<!DOCTYPE html><b>foo</b>')
34
+ _(input).must_equal('<!DOCTYPE html><b>foo</b>')
35
35
  end
36
36
 
37
37
  it 'should not choke on frozen documents' do
38
- @s.document('<!doctype html><html><b>foo</b>'.freeze).must_equal "<html>foo</html>\n"
38
+ _(@s.document('<!doctype html><html><b>foo</b>'.freeze)).must_equal "<html>foo</html>"
39
+ end
40
+
41
+ it 'should normalize newlines' do
42
+ _(@s.document("a\r\n\n\r\r\r\nz")).must_equal "<html>a\n\n\n\n\nz</html>"
43
+ end
44
+
45
+ it 'should strip control characters (except ASCII whitespace)' do
46
+ sample_control_chars = "\u0001\u0008\u000b\u000e\u001f\u007f\u009f"
47
+ whitespace = "\t\n\f\u0020"
48
+ _(@s.document("a#{sample_control_chars}#{whitespace}z")).must_equal "<html>a#{whitespace}z</html>"
49
+ end
50
+
51
+ it 'should strip non-characters' do
52
+ sample_non_chars = "\ufdd0\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}"
53
+ _(@s.document("a#{sample_non_chars}z")).must_equal "<html>az</html>"
54
+ end
55
+
56
+ describe 'when html body exceeds Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH' do
57
+ let(:content) do
58
+ content = nest_html_content('<b>foo</b>', Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH)
59
+ "<html>#{content}</html>"
60
+ end
61
+
62
+ it 'raises an ArgumentError exception' do
63
+ assert_raises ArgumentError do
64
+ @s.document(content)
65
+ end
66
+ end
67
+
68
+ describe 'and :max_tree_depth of -1 is supplied in :parser_options' do
69
+ before do
70
+ @s = Sanitize.new(elements: ['html'], parser_options: { max_tree_depth: -1 })
71
+ end
72
+
73
+ it 'does not raise an ArgumentError exception' do
74
+ _(@s.document(content)).must_equal '<html>foo</html>'
75
+ end
76
+ end
39
77
  end
40
78
  end
41
79
 
42
80
  describe '#fragment' do
43
81
  it 'should sanitize an HTML fragment' do
44
- @s.fragment('<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>')
45
- .must_equal 'Lorem ipsum dolor sit amet alert("hello world");'
82
+ _(@s.fragment('<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>'))
83
+ .must_equal 'Lorem ipsum dolor sit amet '
46
84
  end
47
85
 
48
86
  it 'should not modify the input string' do
49
87
  input = '<b>foo</b>'
50
88
  @s.fragment(input)
51
- input.must_equal '<b>foo</b>'
89
+ _(input).must_equal '<b>foo</b>'
52
90
  end
53
91
 
54
92
  it 'should not choke on fragments containing <html> or <body>' do
55
- @s.fragment('<html><b>foo</b></html>').must_equal 'foo'
56
- @s.fragment('<body><b>foo</b></body>').must_equal 'foo'
57
- @s.fragment('<html><body><b>foo</b></body></html>').must_equal 'foo'
58
- @s.fragment('<!DOCTYPE html><html><body><b>foo</b></body></html>').must_equal 'foo'
93
+ _(@s.fragment('<html><b>foo</b></html>')).must_equal 'foo'
94
+ _(@s.fragment('<body><b>foo</b></body>')).must_equal 'foo'
95
+ _(@s.fragment('<html><body><b>foo</b></body></html>')).must_equal 'foo'
96
+ _(@s.fragment('<!DOCTYPE html><html><body><b>foo</b></body></html>')).must_equal 'foo'
59
97
  end
60
98
 
61
99
  it 'should not choke on frozen fragments' do
62
- @s.fragment('<b>foo</b>'.freeze).must_equal 'foo'
100
+ _(@s.fragment('<b>foo</b>'.freeze)).must_equal 'foo'
101
+ end
102
+
103
+ it 'should normalize newlines' do
104
+ _(@s.fragment("a\r\n\n\r\r\r\nz")).must_equal "a\n\n\n\n\nz"
105
+ end
106
+
107
+ it 'should strip control characters (except ASCII whitespace)' do
108
+ sample_control_chars = "\u0001\u0008\u000b\u000e\u001f\u007f\u009f"
109
+ whitespace = "\t\n\f\u0020"
110
+ _(@s.fragment("a#{sample_control_chars}#{whitespace}z")).must_equal "a#{whitespace}z"
111
+ end
112
+
113
+ it 'should strip non-characters' do
114
+ sample_non_chars = "\ufdd0\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}"
115
+ _(@s.fragment("a#{sample_non_chars}z")).must_equal "az"
116
+ end
117
+
118
+ describe 'when html body exceeds Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH' do
119
+ let(:content) do
120
+ content = nest_html_content('<b>foo</b>', Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH)
121
+ "<body>#{content}</body>"
122
+ end
123
+
124
+ it 'raises an ArgumentError exception' do
125
+ assert_raises ArgumentError do
126
+ @s.fragment(content)
127
+ end
128
+ end
129
+
130
+ describe 'and :max_tree_depth of -1 is supplied in :parser_options' do
131
+ before do
132
+ @s = Sanitize.new(parser_options: { max_tree_depth: -1 })
133
+ end
134
+
135
+ it 'does not raise an ArgumentError exception' do
136
+ _(@s.fragment(content)).must_equal 'foo'
137
+ end
138
+ end
63
139
  end
64
140
  end
65
141
 
@@ -71,13 +147,13 @@ describe 'Sanitize' do
71
147
  doc.xpath('/html/body/node()').each {|node| frag << node }
72
148
 
73
149
  @s.node!(frag)
74
- frag.to_html.must_equal 'Lorem ipsum dolor sit amet alert("hello world");'
150
+ _(frag.to_html).must_equal 'Lorem ipsum dolor sit amet '
75
151
  end
76
152
 
77
- describe "when the given node is a document and <html> isn't whitelisted" do
153
+ describe "when the given node is a document and <html> isn't allowlisted" do
78
154
  it 'should raise a Sanitize::Error' do
79
155
  doc = Nokogiri::HTML5.parse('foo')
80
- proc { @s.node!(doc) }.must_raise Sanitize::Error
156
+ _(proc { @s.node!(doc) }).must_raise Sanitize::Error
81
157
  end
82
158
  end
83
159
  end
@@ -85,28 +161,37 @@ describe 'Sanitize' do
85
161
 
86
162
  describe 'class methods' do
87
163
  describe '.document' do
88
- it 'should call #document' do
89
- Sanitize.stub_instance(:document, proc {|html| html + ' called' }) do
90
- Sanitize.document('<html>foo</html>')
91
- .must_equal '<html>foo</html> called'
92
- end
164
+ it 'should sanitize an HTML document with the given config' do
165
+ html = '<!doctype html><html><b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script></html>'
166
+ _(Sanitize.document(html, :elements => ['html']))
167
+ .must_equal "<html>Lorem ipsum dolor sit amet </html>"
93
168
  end
94
169
  end
95
170
 
96
171
  describe '.fragment' do
97
- it 'should call #fragment' do
98
- Sanitize.stub_instance(:fragment, proc {|html| html + ' called' }) do
99
- Sanitize.fragment('<b>foo</b>').must_equal '<b>foo</b> called'
100
- end
172
+ it 'should sanitize an HTML fragment with the given config' do
173
+ html = '<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>'
174
+ _(Sanitize.fragment(html, :elements => ['strong']))
175
+ .must_equal 'Lorem ipsum <strong>dolor</strong> sit amet '
101
176
  end
102
177
  end
103
178
 
104
179
  describe '.node!' do
105
- it 'should call #node!' do
106
- Sanitize.stub_instance(:node!, proc {|input| input + ' called' }) do
107
- Sanitize.node!('not really a node').must_equal 'not really a node called'
108
- end
180
+ it 'should sanitize a Nokogiri::XML::Node with the given config' do
181
+ doc = Nokogiri::HTML5.parse('<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>')
182
+ frag = doc.fragment
183
+
184
+ doc.xpath('/html/body/node()').each {|node| frag << node }
185
+
186
+ Sanitize.node!(frag, :elements => ['strong'])
187
+ _(frag.to_html).must_equal 'Lorem ipsum <strong>dolor</strong> sit amet '
109
188
  end
110
189
  end
111
190
  end
191
+
192
+ private
193
+
194
+ def nest_html_content(html_content, depth)
195
+ "#{'<span>' * depth}#{html_content}#{'</span>' * depth}"
196
+ end
112
197
  end