sanitize 4.6.4 → 6.0.2

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
@@ -17,124 +17,126 @@ describe 'Malicious HTML' do
17
17
 
18
18
  describe 'comments' do
19
19
  it 'should not allow script injection via conditional comments' do
20
- @s.fragment(%[<!--[if gte IE 4]>\n<script>alert('XSS');</script>\n<![endif]-->]).
20
+ _(@s.fragment(%[<!--[if gte IE 4]>\n<script>alert('XSS');</script>\n<![endif]-->])).
21
21
  must_equal ''
22
22
  end
23
23
  end
24
24
 
25
25
  describe 'interpolation (ERB, PHP, etc.)' do
26
26
  it 'should escape ERB-style tags' do
27
- @s.fragment('<% naughty_ruby_code %>').
27
+ _(@s.fragment('<% naughty_ruby_code %>')).
28
28
  must_equal '&lt;% naughty_ruby_code %&gt;'
29
29
 
30
- @s.fragment('<%= naughty_ruby_code %>').
30
+ _(@s.fragment('<%= naughty_ruby_code %>')).
31
31
  must_equal '&lt;%= naughty_ruby_code %&gt;'
32
32
  end
33
33
 
34
34
  it 'should remove PHP-style tags' do
35
- @s.fragment('<? naughtyPHPCode(); ?>').
35
+ _(@s.fragment('<? naughtyPHPCode(); ?>')).
36
36
  must_equal ''
37
37
 
38
- @s.fragment('<?= naughtyPHPCode(); ?>').
38
+ _(@s.fragment('<?= naughtyPHPCode(); ?>')).
39
39
  must_equal ''
40
40
  end
41
41
  end
42
42
 
43
43
  describe '<body>' do
44
44
  it 'should not be possible to inject JS via a malformed event attribute' do
45
- @s.document('<html><head></head><body onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert("XSS")></body></html>').
46
- must_equal "<html><head></head><body></body></html>\n"
45
+ _(@s.document('<html><head></head><body onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert("XSS")></body></html>')).
46
+ must_equal "<html><head></head><body></body></html>"
47
47
  end
48
48
  end
49
49
 
50
50
  describe '<iframe>' do
51
51
  it 'should not be possible to inject an iframe using an improperly closed tag' do
52
- @s.fragment(%[<iframe src=http://ha.ckers.org/scriptlet.html <]).
52
+ _(@s.fragment(%[<iframe src=http://ha.ckers.org/scriptlet.html <])).
53
53
  must_equal ''
54
54
  end
55
55
  end
56
56
 
57
57
  describe '<img>' do
58
58
  it 'should not be possible to inject JS via an unquoted <img> src attribute' do
59
- @s.fragment("<img src=javascript:alert('XSS')>").must_equal '<img>'
59
+ _(@s.fragment("<img src=javascript:alert('XSS')>")).must_equal '<img>'
60
60
  end
61
61
 
62
62
  it 'should not be possible to inject JS using grave accents as <img> src delimiters' do
63
- @s.fragment("<img src=`javascript:alert('XSS')`>").must_equal '<img>'
63
+ _(@s.fragment("<img src=`javascript:alert('XSS')`>")).must_equal '<img>'
64
64
  end
65
65
 
66
66
  it 'should not be possible to inject <script> via a malformed <img> tag' do
67
- @s.fragment('<img """><script>alert("XSS")</script>">').
68
- must_equal '<img>alert("XSS")"&gt;'
67
+ _(@s.fragment('<img """><script>alert("XSS")</script>">')).
68
+ must_equal '<img>"&gt;'
69
69
  end
70
70
 
71
71
  it 'should not be possible to inject protocol-based JS' do
72
- @s.fragment('<img src=&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;>').
72
+ _(@s.fragment('<img src=&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;>')).
73
73
  must_equal '<img>'
74
74
 
75
- @s.fragment('<img src=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>').
75
+ _(@s.fragment('<img src=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>')).
76
76
  must_equal '<img>'
77
77
 
78
- @s.fragment('<img src=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>').
78
+ _(@s.fragment('<img src=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>')).
79
79
  must_equal '<img>'
80
80
 
81
81
  # Encoded tab character.
82
- @s.fragment(%[<img src="jav&#x09;ascript:alert('XSS');">]).
82
+ _(@s.fragment(%[<img src="jav&#x09;ascript:alert('XSS');">])).
83
83
  must_equal '<img>'
84
84
 
85
85
  # Encoded newline.
86
- @s.fragment(%[<img src="jav&#x0A;ascript:alert('XSS');">]).
86
+ _(@s.fragment(%[<img src="jav&#x0A;ascript:alert('XSS');">])).
87
87
  must_equal '<img>'
88
88
 
89
89
  # Encoded carriage return.
90
- @s.fragment(%[<img src="jav&#x0D;ascript:alert('XSS');">]).
90
+ _(@s.fragment(%[<img src="jav&#x0D;ascript:alert('XSS');">])).
91
91
  must_equal '<img>'
92
92
 
93
93
  # Null byte.
94
- @s.fragment(%[<img src=java\0script:alert("XSS")>]).
94
+ _(@s.fragment(%[<img src=java\0script:alert("XSS")>])).
95
95
  must_equal '<img>'
96
96
 
97
97
  # Spaces plus meta char.
98
- @s.fragment(%[<img src=" &#14; javascript:alert('XSS');">]).
98
+ _(@s.fragment(%[<img src=" &#14; javascript:alert('XSS');">])).
99
99
  must_equal '<img>'
100
100
 
101
101
  # Mixed spaces and tabs.
102
- @s.fragment(%[<img src="j\na v\tascript://alert('XSS');">]).
102
+ _(@s.fragment(%[<img src="j\na v\tascript://alert('XSS');">])).
103
103
  must_equal '<img>'
104
104
  end
105
105
 
106
106
  it 'should not be possible to inject protocol-based JS via whitespace' do
107
- @s.fragment(%[<img src="jav\tascript:alert('XSS');">]).
107
+ _(@s.fragment(%[<img src="jav\tascript:alert('XSS');">])).
108
108
  must_equal '<img>'
109
109
  end
110
110
 
111
111
  it 'should not be possible to inject JS using a half-open <img> tag' do
112
- @s.fragment(%[<img src="javascript:alert('XSS')"]).
112
+ _(@s.fragment(%[<img src="javascript:alert('XSS')"])).
113
113
  must_equal ''
114
114
  end
115
115
  end
116
116
 
117
117
  describe '<script>' do
118
118
  it 'should not be possible to inject <script> using a malformed non-alphanumeric tag name' do
119
- @s.fragment(%[<script/xss src="http://ha.ckers.org/xss.js">alert(1)</script>]).
120
- must_equal 'alert(1)'
119
+ _(@s.fragment(%[<script/xss src="http://ha.ckers.org/xss.js">alert(1)</script>])).
120
+ must_equal ''
121
121
  end
122
122
 
123
123
  it 'should not be possible to inject <script> via extraneous open brackets' do
124
- @s.fragment(%[<<script>alert("XSS");//<</script>]).
125
- must_equal '&lt;alert("XSS");//&lt;'
124
+ _(@s.fragment(%[<<script>alert("XSS");//<</script>])).
125
+ must_equal '&lt;'
126
126
  end
127
127
  end
128
128
 
129
129
  # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
130
130
  # attempt to preserve server-side includes. This can result in XSS since an
131
- # unescaped double quote can allow an attacker to inject a non-whitelisted
131
+ # unescaped double quote can allow an attacker to inject a non-allowlisted
132
132
  # attribute. Sanitize works around this by implementing its own escaping for
133
133
  # affected attributes.
134
134
  #
135
135
  # The relevant libxml2 code is here:
136
136
  # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
137
137
  describe 'unsafe libxml2 server-side includes in attributes' do
138
+ using_unpatched_libxml2 = Nokogiri::VersionInfo.instance.libxml2_using_system?
139
+
138
140
  tag_configs = [
139
141
  {
140
142
  tag_name: 'a',
@@ -166,12 +168,26 @@ describe 'Malicious HTML' do
166
168
  input = %[<#{tag_name} #{attr_name}='examp<!--" onmouseover=alert(1)>-->le.com'>foo</#{tag_name}>]
167
169
 
168
170
  it 'should escape unsafe characters in attributes' do
169
- @s.fragment(input).must_equal(%[<#{tag_name} #{attr_name}="examp<!--%22%20onmouseover=alert(1)>-->le.com">foo</#{tag_name}>])
171
+ skip "behavior should only exist in nokogiri's patched libxml" if using_unpatched_libxml2
172
+
173
+ # This uses Nokogumbo's HTML-compliant serializer rather than
174
+ # libxml2's.
175
+ _(@s.fragment(input)).
176
+ must_equal(%[<#{tag_name} #{attr_name}="examp<!--%22%20onmouseover=alert(1)>-->le.com">foo</#{tag_name}>])
177
+
178
+ # This uses the not-quite-standards-compliant libxml2 serializer via
179
+ # Nokogiri, so the output may be a little different as of Nokogiri
180
+ # 1.10.2 when using Nokogiri's vendored libxml2 due to this patch:
181
+ # https://github.com/sparklemotion/nokogiri/commit/4852e43cb6039e26d8c51af78621e539cbf46c5d
182
+ fragment = Nokogiri::HTML.fragment(input)
183
+ @s.node!(fragment)
184
+ _(fragment.to_html).
185
+ must_equal(%[<#{tag_name} #{attr_name}="examp&lt;!--%22%20onmouseover=alert(1)&gt;--&gt;le.com">foo</#{tag_name}>])
170
186
  end
171
187
 
172
188
  it 'should round-trip to the same output' do
173
189
  output = @s.fragment(input)
174
- @s.fragment(output).must_equal(output)
190
+ _(@s.fragment(output)).must_equal(output)
175
191
  end
176
192
  end
177
193
 
@@ -179,14 +195,145 @@ describe 'Malicious HTML' do
179
195
  input = %[<#{tag_name} #{attr_name}='examp<!--" onmouseover=alert(1)>-->le.com'>foo</#{tag_name}>]
180
196
 
181
197
  it 'should not escape characters unnecessarily' do
182
- @s.fragment(input).must_equal(input)
198
+ skip "behavior should only exist in nokogiri's patched libxml" if using_unpatched_libxml2
199
+
200
+ # This uses Nokogumbo's HTML-compliant serializer rather than
201
+ # libxml2's.
202
+ _(@s.fragment(input)).
203
+ must_equal(%[<#{tag_name} #{attr_name}="examp<!--&quot; onmouseover=alert(1)>-->le.com">foo</#{tag_name}>])
204
+
205
+ # This uses the not-quite-standards-compliant libxml2 serializer via
206
+ # Nokogiri, so the output may be a little different as of Nokogiri
207
+ # 1.10.2 when using Nokogiri's vendored libxml2 due to this patch:
208
+ # https://github.com/sparklemotion/nokogiri/commit/4852e43cb6039e26d8c51af78621e539cbf46c5d
209
+ fragment = Nokogiri::HTML.fragment(input)
210
+ @s.node!(fragment)
211
+ _(fragment.to_html).
212
+ must_equal(%[<#{tag_name} #{attr_name}='examp&lt;!--" onmouseover=alert(1)&gt;--&gt;le.com'>foo</#{tag_name}>])
183
213
  end
184
214
 
185
215
  it 'should round-trip to the same output' do
186
216
  output = @s.fragment(input)
187
- @s.fragment(output).must_equal(output)
217
+ _(@s.fragment(output)).must_equal(output)
188
218
  end
189
219
  end
190
220
  end
191
221
  end
222
+
223
+ # https://github.com/rgrove/sanitize/security/advisories/GHSA-p4x4-rw2p-8j8m
224
+ describe 'foreign content bypass in relaxed config' do
225
+ it 'prevents a sanitization bypass via carefully crafted foreign content' do
226
+ %w[iframe noembed noframes noscript plaintext script style xmp].each do |tag_name|
227
+ _(@s.fragment(%[<math><#{tag_name}>/*&lt;/#{tag_name}&gt;&lt;img src onerror=alert(1)>*/])).
228
+ must_equal ''
229
+
230
+ _(@s.fragment(%[<svg><#{tag_name}>/*&lt;/#{tag_name}&gt;&lt;img src onerror=alert(1)>*/])).
231
+ must_equal ''
232
+ end
233
+ end
234
+ end
235
+
236
+ # These tests cover an unsupported and unsafe custom config that allows MathML
237
+ # and SVG elements, which Sanitize's docs specifically say multiple times in
238
+ # big prominent warnings that you SHOULD NOT DO because Sanitize doesn't
239
+ # support MathML or SVG.
240
+ #
241
+ # Do not use the custom configs you see in these tests! If you do, you may be
242
+ # creating XSS vulnerabilities in your application.
243
+ describe 'foreign content bypass in unsafe custom config that allows MathML or SVG' do
244
+ unescaped_content_elements = %w[
245
+ noembed
246
+ noframes
247
+ plaintext
248
+ script
249
+ xmp
250
+ ]
251
+
252
+ removed_content_elements = %w[
253
+ iframe
254
+ ]
255
+
256
+ removed_elements = %w[
257
+ noscript
258
+ style
259
+ ]
260
+
261
+ before do
262
+ @s = Sanitize.new(
263
+ Sanitize::Config.merge(
264
+ Sanitize::Config::RELAXED,
265
+ elements: Sanitize::Config::RELAXED[:elements] +
266
+ unescaped_content_elements +
267
+ removed_content_elements +
268
+ %w[math svg]
269
+ )
270
+ )
271
+ end
272
+
273
+ unescaped_content_elements.each do |name|
274
+ it "forcibly escapes text content inside `<#{name}>` in a MathML namespace" do
275
+ assert_equal(
276
+ "<math><#{name}>&lt;img src=x onerror=alert(1)&gt;</#{name}></math>",
277
+ @s.fragment("<math><#{name}>&lt;img src=x onerror=alert(1)&gt;</#{name}>")
278
+ )
279
+ end
280
+
281
+ it "forcibly escapes text content inside `<#{name}>` in an SVG namespace" do
282
+ assert_equal(
283
+ "<svg><#{name}>&lt;img src=x onerror=alert(1)&gt;</#{name}></svg>",
284
+ @s.fragment("<svg><#{name}>&lt;img src=x onerror=alert(1)&gt;</#{name}>")
285
+ )
286
+ end
287
+ end
288
+
289
+ removed_content_elements.each do |name|
290
+ it "removes text content inside `<#{name}>` in a MathML namespace" do
291
+ assert_equal(
292
+ "<math><#{name}></#{name}></math>",
293
+ @s.fragment("<math><#{name}>&lt;img src=x onerror=alert(1)&gt;</#{name}>")
294
+ )
295
+ end
296
+
297
+ it "removes text content inside `<#{name}>` in an SVG namespace" do
298
+ assert_equal(
299
+ "<svg><#{name}></#{name}></svg>",
300
+ @s.fragment("<svg><#{name}>&lt;img src=x onerror=alert(1)&gt;</#{name}>")
301
+ )
302
+ end
303
+ end
304
+
305
+ removed_elements.each do |name|
306
+ it "removes `<#{name}>` elements in a MathML namespace" do
307
+ assert_equal(
308
+ '<math></math>',
309
+ @s.fragment("<math><#{name}>&lt;img src=x onerror=alert(1)&gt;</#{name}>")
310
+ )
311
+ end
312
+
313
+ it "removes `<#{name}>` elements in an SVG namespace" do
314
+ assert_equal(
315
+ '<svg></svg>',
316
+ @s.fragment("<svg><#{name}>&lt;img src=x onerror=alert(1)&gt;</#{name}>")
317
+ )
318
+ end
319
+ end
320
+ end
321
+
322
+ describe 'sanitization bypass by exploiting scripting-disabled <noscript> behavior' do
323
+ before do
324
+ @s = Sanitize.new(
325
+ Sanitize::Config.merge(
326
+ Sanitize::Config::RELAXED,
327
+ elements: Sanitize::Config::RELAXED[:elements] + ['noscript']
328
+ )
329
+ )
330
+ end
331
+
332
+ it 'is prevented by removing `<noscript>` elements regardless of the allowlist' do
333
+ assert_equal(
334
+ '',
335
+ @s.fragment(%[<noscript><div id='</noscript>&lt;img src=x onerror=alert(1)&gt; '>])
336
+ )
337
+ end
338
+ end
192
339
  end
data/test/test_parser.rb CHANGED
@@ -6,55 +6,26 @@ describe 'Parser' do
6
6
  parallelize_me!
7
7
 
8
8
  it 'should translate valid entities into characters' do
9
- Sanitize.fragment("&apos;&eacute;&amp;").must_equal("'é&amp;")
9
+ _(Sanitize.fragment("&apos;&eacute;&amp;")).must_equal("'é&amp;")
10
10
  end
11
11
 
12
12
  it 'should translate orphaned ampersands into entities' do
13
- Sanitize.fragment('at&t').must_equal('at&amp;t')
13
+ _(Sanitize.fragment('at&t')).must_equal('at&amp;t')
14
14
  end
15
15
 
16
16
  it 'should not add newlines after tags when serializing a fragment' do
17
- Sanitize.fragment("<div>foo\n\n<p>bar</p><div>\nbaz</div></div><div>quux</div>", :elements => ['div', 'p'])
17
+ _(Sanitize.fragment("<div>foo\n\n<p>bar</p><div>\nbaz</div></div><div>quux</div>", :elements => ['div', 'p']))
18
18
  .must_equal "<div>foo\n\n<p>bar</p><div>\nbaz</div></div><div>quux</div>"
19
19
  end
20
20
 
21
21
  it 'should not have the Nokogiri 1.4.2+ unterminated script/style element bug' do
22
- Sanitize.fragment('foo <script>bar').must_equal 'foo bar'
23
- Sanitize.fragment('foo <style>bar').must_equal 'foo bar'
22
+ _(Sanitize.fragment('foo <script>bar')).must_equal 'foo '
23
+ _(Sanitize.fragment('foo <style>bar')).must_equal 'foo '
24
24
  end
25
25
 
26
26
  it 'ambiguous non-tag brackets like "1 > 2 and 2 < 1" should be parsed correctly' do
27
- Sanitize.fragment('1 > 2 and 2 < 1').must_equal '1 &gt; 2 and 2 &lt; 1'
28
- Sanitize.fragment('OMG HAPPY BIRTHDAY! *<:-D').must_equal 'OMG HAPPY BIRTHDAY! *&lt;:-D'
29
- end
30
-
31
- # https://github.com/sparklemotion/nokogiri/issues/1008
32
- it 'should work around the libxml2 content-type meta tag bug' do
33
- Sanitize.document('<html><head></head><body>Howdy!</body></html>',
34
- :elements => %w[html head body]
35
- ).must_equal "<html><head></head><body>Howdy!</body></html>\n"
36
-
37
- Sanitize.document('<html><head></head><body>Howdy!</body></html>',
38
- :elements => %w[html head meta body]
39
- ).must_equal "<html><head></head><body>Howdy!</body></html>\n"
40
-
41
- Sanitize.document('<html><head><meta charset="utf-8"></head><body>Howdy!</body></html>',
42
- :elements => %w[html head meta body],
43
- :attributes => {'meta' => ['charset']}
44
- ).must_equal "<html><head><meta charset=\"utf-8\"></head><body>Howdy!</body></html>\n"
45
-
46
- Sanitize.document('<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8"></head><body>Howdy!</body></html>',
47
- :elements => %w[html head meta body],
48
- :attributes => {'meta' => %w[charset content http-equiv]}
49
- ).must_equal "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"></head><body>Howdy!</body></html>\n"
50
-
51
- # Edge case: an existing content-type meta tag with a non-UTF-8 content type
52
- # will be converted to UTF-8, since that's the only output encoding we
53
- # support.
54
- Sanitize.document('<html><head><meta http-equiv="content-type" content="text/html;charset=us-ascii"></head><body>Howdy!</body></html>',
55
- :elements => %w[html head meta body],
56
- :attributes => {'meta' => %w[charset content http-equiv]}
57
- ).must_equal "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\"></head><body>Howdy!</body></html>\n"
27
+ _(Sanitize.fragment('1 > 2 and 2 < 1')).must_equal '1 &gt; 2 and 2 &lt; 1'
28
+ _(Sanitize.fragment('OMG HAPPY BIRTHDAY! *<:-D')).must_equal 'OMG HAPPY BIRTHDAY! *&lt;:-D'
58
29
  end
59
30
 
60
31
  describe 'when siblings are added after a node during traversal' do
@@ -84,11 +55,11 @@ describe 'Parser' do
84
55
  siblings << env[:node][:id]
85
56
  end
86
57
 
87
- return {:node_whitelist => [env[:node]]}
58
+ return {:node_allowlist => [env[:node]]}
88
59
  })
89
60
 
90
61
  # All siblings should be traversed, and in the order added.
91
- siblings.must_equal [
62
+ _(siblings).must_equal [
92
63
  "added_one_one_one",
93
64
  "added_one_one",
94
65
  "added_one_two",
@@ -9,7 +9,7 @@ describe 'Sanitize' do
9
9
  ]
10
10
 
11
11
  Sanitize.new({ :transformers => transformers })
12
- transformers.length.must_equal(1)
12
+ _(transformers.length).must_equal(1)
13
13
  end
14
14
  end
15
15
 
@@ -24,42 +24,118 @@ describe 'Sanitize' do
24
24
  end
25
25
 
26
26
  it 'should sanitize an HTML document' do
27
- @s.document('<!doctype html><html><b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script></html>')
28
- .must_equal "<html>Lorem ipsum dolor sit amet alert(\"hello world\");</html>\n"
27
+ _(@s.document('<!doctype html><html><b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script></html>'))
28
+ .must_equal "<html>Lorem ipsum dolor sit amet </html>"
29
29
  end
30
30
 
31
31
  it 'should not modify the input string' do
32
32
  input = '<!DOCTYPE html><b>foo</b>'
33
33
  @s.document(input)
34
- input.must_equal('<!DOCTYPE html><b>foo</b>')
34
+ _(input).must_equal('<!DOCTYPE html><b>foo</b>')
35
35
  end
36
36
 
37
37
  it 'should not choke on frozen documents' do
38
- @s.document('<!doctype html><html><b>foo</b>'.freeze).must_equal "<html>foo</html>\n"
38
+ _(@s.document('<!doctype html><html><b>foo</b>'.freeze)).must_equal "<html>foo</html>"
39
+ end
40
+
41
+ it 'should normalize newlines' do
42
+ _(@s.document("a\r\n\n\r\r\r\nz")).must_equal "<html>a\n\n\n\n\nz</html>"
43
+ end
44
+
45
+ it 'should strip control characters (except ASCII whitespace)' do
46
+ sample_control_chars = "\u0001\u0008\u000b\u000e\u001f\u007f\u009f"
47
+ whitespace = "\t\n\f\u0020"
48
+ _(@s.document("a#{sample_control_chars}#{whitespace}z")).must_equal "<html>a#{whitespace}z</html>"
49
+ end
50
+
51
+ it 'should strip non-characters' do
52
+ sample_non_chars = "\ufdd0\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}"
53
+ _(@s.document("a#{sample_non_chars}z")).must_equal "<html>az</html>"
54
+ end
55
+
56
+ describe 'when html body exceeds Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH' do
57
+ let(:content) do
58
+ content = nest_html_content('<b>foo</b>', Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH)
59
+ "<html>#{content}</html>"
60
+ end
61
+
62
+ it 'raises an ArgumentError exception' do
63
+ assert_raises ArgumentError do
64
+ @s.document(content)
65
+ end
66
+ end
67
+
68
+ describe 'and :max_tree_depth of -1 is supplied in :parser_options' do
69
+ before do
70
+ @s = Sanitize.new(elements: ['html'], parser_options: { max_tree_depth: -1 })
71
+ end
72
+
73
+ it 'does not raise an ArgumentError exception' do
74
+ _(@s.document(content)).must_equal '<html>foo</html>'
75
+ end
76
+ end
39
77
  end
40
78
  end
41
79
 
42
80
  describe '#fragment' do
43
81
  it 'should sanitize an HTML fragment' do
44
- @s.fragment('<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>')
45
- .must_equal 'Lorem ipsum dolor sit amet alert("hello world");'
82
+ _(@s.fragment('<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>'))
83
+ .must_equal 'Lorem ipsum dolor sit amet '
46
84
  end
47
85
 
48
86
  it 'should not modify the input string' do
49
87
  input = '<b>foo</b>'
50
88
  @s.fragment(input)
51
- input.must_equal '<b>foo</b>'
89
+ _(input).must_equal '<b>foo</b>'
52
90
  end
53
91
 
54
92
  it 'should not choke on fragments containing <html> or <body>' do
55
- @s.fragment('<html><b>foo</b></html>').must_equal 'foo'
56
- @s.fragment('<body><b>foo</b></body>').must_equal 'foo'
57
- @s.fragment('<html><body><b>foo</b></body></html>').must_equal 'foo'
58
- @s.fragment('<!DOCTYPE html><html><body><b>foo</b></body></html>').must_equal 'foo'
93
+ _(@s.fragment('<html><b>foo</b></html>')).must_equal 'foo'
94
+ _(@s.fragment('<body><b>foo</b></body>')).must_equal 'foo'
95
+ _(@s.fragment('<html><body><b>foo</b></body></html>')).must_equal 'foo'
96
+ _(@s.fragment('<!DOCTYPE html><html><body><b>foo</b></body></html>')).must_equal 'foo'
59
97
  end
60
98
 
61
99
  it 'should not choke on frozen fragments' do
62
- @s.fragment('<b>foo</b>'.freeze).must_equal 'foo'
100
+ _(@s.fragment('<b>foo</b>'.freeze)).must_equal 'foo'
101
+ end
102
+
103
+ it 'should normalize newlines' do
104
+ _(@s.fragment("a\r\n\n\r\r\r\nz")).must_equal "a\n\n\n\n\nz"
105
+ end
106
+
107
+ it 'should strip control characters (except ASCII whitespace)' do
108
+ sample_control_chars = "\u0001\u0008\u000b\u000e\u001f\u007f\u009f"
109
+ whitespace = "\t\n\f\u0020"
110
+ _(@s.fragment("a#{sample_control_chars}#{whitespace}z")).must_equal "a#{whitespace}z"
111
+ end
112
+
113
+ it 'should strip non-characters' do
114
+ sample_non_chars = "\ufdd0\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}"
115
+ _(@s.fragment("a#{sample_non_chars}z")).must_equal "az"
116
+ end
117
+
118
+ describe 'when html body exceeds Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH' do
119
+ let(:content) do
120
+ content = nest_html_content('<b>foo</b>', Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH)
121
+ "<body>#{content}</body>"
122
+ end
123
+
124
+ it 'raises an ArgumentError exception' do
125
+ assert_raises ArgumentError do
126
+ @s.fragment(content)
127
+ end
128
+ end
129
+
130
+ describe 'and :max_tree_depth of -1 is supplied in :parser_options' do
131
+ before do
132
+ @s = Sanitize.new(parser_options: { max_tree_depth: -1 })
133
+ end
134
+
135
+ it 'does not raise an ArgumentError exception' do
136
+ _(@s.fragment(content)).must_equal 'foo'
137
+ end
138
+ end
63
139
  end
64
140
  end
65
141
 
@@ -71,13 +147,13 @@ describe 'Sanitize' do
71
147
  doc.xpath('/html/body/node()').each {|node| frag << node }
72
148
 
73
149
  @s.node!(frag)
74
- frag.to_html.must_equal 'Lorem ipsum dolor sit amet alert("hello world");'
150
+ _(frag.to_html).must_equal 'Lorem ipsum dolor sit amet '
75
151
  end
76
152
 
77
- describe "when the given node is a document and <html> isn't whitelisted" do
153
+ describe "when the given node is a document and <html> isn't allowlisted" do
78
154
  it 'should raise a Sanitize::Error' do
79
155
  doc = Nokogiri::HTML5.parse('foo')
80
- proc { @s.node!(doc) }.must_raise Sanitize::Error
156
+ _(proc { @s.node!(doc) }).must_raise Sanitize::Error
81
157
  end
82
158
  end
83
159
  end
@@ -85,28 +161,37 @@ describe 'Sanitize' do
85
161
 
86
162
  describe 'class methods' do
87
163
  describe '.document' do
88
- it 'should call #document' do
89
- Sanitize.stub_instance(:document, proc {|html| html + ' called' }) do
90
- Sanitize.document('<html>foo</html>')
91
- .must_equal '<html>foo</html> called'
92
- end
164
+ it 'should sanitize an HTML document with the given config' do
165
+ html = '<!doctype html><html><b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script></html>'
166
+ _(Sanitize.document(html, :elements => ['html']))
167
+ .must_equal "<html>Lorem ipsum dolor sit amet </html>"
93
168
  end
94
169
  end
95
170
 
96
171
  describe '.fragment' do
97
- it 'should call #fragment' do
98
- Sanitize.stub_instance(:fragment, proc {|html| html + ' called' }) do
99
- Sanitize.fragment('<b>foo</b>').must_equal '<b>foo</b> called'
100
- end
172
+ it 'should sanitize an HTML fragment with the given config' do
173
+ html = '<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>'
174
+ _(Sanitize.fragment(html, :elements => ['strong']))
175
+ .must_equal 'Lorem ipsum <strong>dolor</strong> sit amet '
101
176
  end
102
177
  end
103
178
 
104
179
  describe '.node!' do
105
- it 'should call #node!' do
106
- Sanitize.stub_instance(:node!, proc {|input| input + ' called' }) do
107
- Sanitize.node!('not really a node').must_equal 'not really a node called'
108
- end
180
+ it 'should sanitize a Nokogiri::XML::Node with the given config' do
181
+ doc = Nokogiri::HTML5.parse('<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>')
182
+ frag = doc.fragment
183
+
184
+ doc.xpath('/html/body/node()').each {|node| frag << node }
185
+
186
+ Sanitize.node!(frag, :elements => ['strong'])
187
+ _(frag.to_html).must_equal 'Lorem ipsum <strong>dolor</strong> sit amet '
109
188
  end
110
189
  end
111
190
  end
191
+
192
+ private
193
+
194
+ def nest_html_content(html_content, depth)
195
+ "#{'<span>' * depth}#{html_content}#{'</span>' * depth}"
196
+ end
112
197
  end