loofah 2.2.3 → 2.21.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +269 -31
  3. data/README.md +109 -124
  4. data/lib/loofah/concerns.rb +207 -0
  5. data/lib/loofah/elements.rb +85 -79
  6. data/lib/loofah/helpers.rb +37 -20
  7. data/lib/loofah/{html → html4}/document.rb +6 -7
  8. data/lib/loofah/html4/document_fragment.rb +15 -0
  9. data/lib/loofah/html5/document.rb +17 -0
  10. data/lib/loofah/html5/document_fragment.rb +15 -0
  11. data/lib/loofah/html5/libxml2_workarounds.rb +10 -8
  12. data/lib/loofah/html5/safelist.rb +1055 -0
  13. data/lib/loofah/html5/scrub.rb +153 -58
  14. data/lib/loofah/metahelpers.rb +11 -6
  15. data/lib/loofah/scrubber.rb +22 -15
  16. data/lib/loofah/scrubbers.rb +66 -55
  17. data/lib/loofah/version.rb +6 -0
  18. data/lib/loofah/xml/document.rb +2 -0
  19. data/lib/loofah/xml/document_fragment.rb +4 -7
  20. data/lib/loofah.rb +131 -38
  21. metadata +28 -216
  22. data/.gemtest +0 -0
  23. data/Gemfile +0 -22
  24. data/Manifest.txt +0 -40
  25. data/Rakefile +0 -79
  26. data/benchmark/benchmark.rb +0 -149
  27. data/benchmark/fragment.html +0 -96
  28. data/benchmark/helper.rb +0 -73
  29. data/benchmark/www.slashdot.com.html +0 -2560
  30. data/lib/loofah/html/document_fragment.rb +0 -40
  31. data/lib/loofah/html5/whitelist.rb +0 -186
  32. data/lib/loofah/instance_methods.rb +0 -127
  33. data/test/assets/msword.html +0 -63
  34. data/test/assets/testdata_sanitizer_tests1.dat +0 -502
  35. data/test/helper.rb +0 -18
  36. data/test/html5/test_sanitizer.rb +0 -382
  37. data/test/integration/test_ad_hoc.rb +0 -204
  38. data/test/integration/test_helpers.rb +0 -43
  39. data/test/integration/test_html.rb +0 -72
  40. data/test/integration/test_scrubbers.rb +0 -400
  41. data/test/integration/test_xml.rb +0 -55
  42. data/test/unit/test_api.rb +0 -142
  43. data/test/unit/test_encoding.rb +0 -20
  44. data/test/unit/test_helpers.rb +0 -62
  45. data/test/unit/test_scrubber.rb +0 -229
  46. data/test/unit/test_scrubbers.rb +0 -14
@@ -1,382 +0,0 @@
1
- #
2
- # these tests taken from the HTML5 sanitization project and modified for use with Loofah
3
- # see the original here: http://code.google.com/p/html5lib/source/browse/ruby/test/test_sanitizer.rb
4
- #
5
- # license text at the bottom of this file
6
- #
7
- require "helper"
8
-
9
- class Html5TestSanitizer < Loofah::TestCase
10
- include Loofah
11
-
12
- def sanitize_xhtml stream
13
- Loofah.fragment(stream).scrub!(:escape).to_xhtml
14
- end
15
-
16
- def sanitize_html stream
17
- Loofah.fragment(stream).scrub!(:escape).to_html
18
- end
19
-
20
- def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
21
- ## libxml uses double-quotes, so let's swappo-boppo our quotes before comparing.
22
- sane = sanitize_html(input).gsub('"',"'")
23
- htmloutput = htmloutput.gsub('"',"'")
24
- xhtmloutput = xhtmloutput.gsub('"',"'")
25
- rexmloutput = rexmloutput.gsub('"',"'")
26
-
27
- ## HTML5's parsers are shit. there's so much inconsistency with what has closing tags, etc, that
28
- ## it would require a lot of manual hacking to make the tests match libxml's output.
29
- ## instead, I'm taking the shotgun approach, and trying to match any of the described outputs.
30
- assert((htmloutput == sane) || (rexmloutput == sane) || (xhtmloutput == sane),
31
- %Q{given: "#{input}"\nexpected: "#{htmloutput}"\ngot: "#{sane}"})
32
- end
33
-
34
- def assert_completes_in_reasonable_time &block
35
- t0 = Time.now
36
- block.call
37
- assert_in_delta t0, Time.now, 0.1 # arbitrary seconds
38
- end
39
-
40
- (HTML5::WhiteList::ALLOWED_ELEMENTS).each do |tag_name|
41
- define_method "test_should_allow_#{tag_name}_tag" do
42
- input = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
43
- htmloutput = "<#{tag_name.downcase} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name.downcase}>"
44
- xhtmloutput = "<#{tag_name} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name}>"
45
- rexmloutput = xhtmloutput
46
-
47
- if %w[caption colgroup optgroup option tbody td tfoot th thead tr].include?(tag_name)
48
- htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
49
- xhtmloutput = htmloutput
50
- elsif tag_name == 'col'
51
- htmloutput = "<col title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
52
- xhtmloutput = htmloutput
53
- rexmloutput = "<col title='1' />"
54
- elsif tag_name == 'table'
55
- htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt;baz<table title='1'> </table>"
56
- xhtmloutput = htmloutput
57
- elsif tag_name == 'image'
58
- htmloutput = "<img title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
59
- xhtmloutput = htmloutput
60
- rexmloutput = "<image title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</image>"
61
- elsif HTML5::WhiteList::VOID_ELEMENTS.include?(tag_name)
62
- htmloutput = "<#{tag_name} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
63
- xhtmloutput = htmloutput
64
- htmloutput += '<br/>' if tag_name == 'br'
65
- rexmloutput = "<#{tag_name} title='1' />"
66
- end
67
- check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
68
- end
69
- end
70
-
71
- ##
72
- ## libxml2 downcases elements, so this is moot.
73
- ##
74
- # HTML5::WhiteList::ALLOWED_ELEMENTS.each do |tag_name|
75
- # define_method "test_should_forbid_#{tag_name.upcase}_tag" do
76
- # input = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
77
- # output = "&lt;#{tag_name.upcase} title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/#{tag_name.upcase}&gt;"
78
- # check_sanitization(input, output, output, output)
79
- # end
80
- # end
81
-
82
- HTML5::WhiteList::ALLOWED_ATTRIBUTES.each do |attribute_name|
83
- next if attribute_name == 'style'
84
- define_method "test_should_allow_#{attribute_name}_attribute" do
85
- input = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
86
- if %w[checked compact disabled ismap multiple nohref noshade nowrap readonly selected].include?(attribute_name)
87
- output = "<p #{attribute_name}>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
88
- htmloutput = "<p #{attribute_name.downcase}>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
89
- else
90
- output = "<p #{attribute_name}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
91
- htmloutput = "<p #{attribute_name.downcase}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
92
- end
93
- check_sanitization(input, htmloutput, output, output)
94
- end
95
- end
96
-
97
- def test_should_allow_data_attributes
98
- input = "<p data-foo='foo'>foo <bad>bar</bad> baz</p>"
99
-
100
- output = "<p data-foo='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
101
- htmloutput = "<p data-foo='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
102
-
103
- check_sanitization(input, htmloutput, output, output)
104
- end
105
-
106
- def test_should_allow_multi_word_data_attributes
107
- input = "<p data-foo-bar-id='11'>foo <bad>bar</bad> baz</p>"
108
- output = htmloutput = "<p data-foo-bar-id='11'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
109
-
110
- check_sanitization(input, htmloutput, output, output)
111
- end
112
-
113
- ##
114
- ## libxml2 downcases attributes, so this is moot.
115
- ##
116
- # HTML5::WhiteList::ALLOWED_ATTRIBUTES.each do |attribute_name|
117
- # define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
118
- # input = "<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>"
119
- # output = "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
120
- # check_sanitization(input, output, output, output)
121
- # end
122
- # end
123
-
124
- HTML5::WhiteList::ALLOWED_PROTOCOLS.each do |protocol|
125
- define_method "test_should_allow_#{protocol}_uris" do
126
- input = %(<a href="#{protocol}">foo</a>)
127
- output = "<a href='#{protocol}'>foo</a>"
128
- check_sanitization(input, output, output, output)
129
- end
130
- end
131
-
132
- HTML5::WhiteList::ALLOWED_PROTOCOLS.each do |protocol|
133
- define_method "test_should_allow_uppercase_#{protocol}_uris" do
134
- input = %(<a href="#{protocol.upcase}">foo</a>)
135
- output = "<a href='#{protocol.upcase}'>foo</a>"
136
- check_sanitization(input, output, output, output)
137
- end
138
- end
139
-
140
- HTML5::WhiteList::ALLOWED_URI_DATA_MEDIATYPES.each do |data_uri_type|
141
- define_method "test_should_allow_data_#{data_uri_type}_uris" do
142
- input = %(<a href="data:#{data_uri_type}">foo</a>)
143
- output = "<a href='data:#{data_uri_type}'>foo</a>"
144
- check_sanitization(input, output, output, output)
145
-
146
- input = %(<a href="data:#{data_uri_type};base64,R0lGODlhAQABA">foo</a>)
147
- output = "<a href='data:#{data_uri_type};base64,R0lGODlhAQABA'>foo</a>"
148
- check_sanitization(input, output, output, output)
149
- end
150
- end
151
-
152
- HTML5::WhiteList::ALLOWED_URI_DATA_MEDIATYPES.each do |data_uri_type|
153
- define_method "test_should_allow_uppercase_data_#{data_uri_type}_uris" do
154
- input = %(<a href="DATA:#{data_uri_type.upcase}">foo</a>)
155
- output = "<a href='DATA:#{data_uri_type.upcase}'>foo</a>"
156
- check_sanitization(input, output, output, output)
157
- end
158
- end
159
-
160
- def test_should_disallow_other_uri_mediatypes
161
- input = %(<a href="data:foo">foo</a>)
162
- output = "<a>foo</a>"
163
- check_sanitization(input, output, output, output)
164
-
165
- input = %(<a href="data:image/xxx">foo</a>)
166
- output = "<a>foo</a>"
167
- check_sanitization(input, output, output, output)
168
-
169
- input = %(<a href="data:image/xxx;base64,R0lGODlhAQABA">foo</a>)
170
- output = "<a>foo</a>"
171
- check_sanitization(input, output, output, output)
172
- end
173
-
174
-
175
- HTML5::WhiteList::SVG_ALLOW_LOCAL_HREF.each do |tag_name|
176
- next unless HTML5::WhiteList::ALLOWED_ELEMENTS.include?(tag_name)
177
- define_method "test_#{tag_name}_should_allow_local_href" do
178
- input = %(<#{tag_name} xlink:href="#foo"/>)
179
- output = "<#{tag_name.downcase} xlink:href='#foo'></#{tag_name.downcase}>"
180
- xhtmloutput = "<#{tag_name} xlink:href='#foo'></#{tag_name}>"
181
- check_sanitization(input, output, xhtmloutput, xhtmloutput)
182
- end
183
-
184
- define_method "test_#{tag_name}_should_allow_local_href_with_newline" do
185
- input = %(<#{tag_name} xlink:href="\n#foo"/>)
186
- output = "<#{tag_name.downcase} xlink:href='\n#foo'></#{tag_name.downcase}>"
187
- xhtmloutput = "<#{tag_name} xlink:href='\n#foo'></#{tag_name}>"
188
- check_sanitization(input, output, xhtmloutput, xhtmloutput)
189
- end
190
-
191
- define_method "test_#{tag_name}_should_forbid_nonlocal_href" do
192
- input = %(<#{tag_name} xlink:href="http://bad.com/foo"/>)
193
- output = "<#{tag_name.downcase}></#{tag_name.downcase}>"
194
- xhtmloutput = "<#{tag_name}></#{tag_name}>"
195
- check_sanitization(input, output, xhtmloutput, xhtmloutput)
196
- end
197
-
198
- define_method "test_#{tag_name}_should_forbid_nonlocal_href_with_newline" do
199
- input = %(<#{tag_name} xlink:href="\nhttp://bad.com/foo"/>)
200
- output = "<#{tag_name.downcase}></#{tag_name.downcase}>"
201
- xhtmloutput = "<#{tag_name}></#{tag_name}>"
202
- check_sanitization(input, output, xhtmloutput, xhtmloutput)
203
- end
204
- end
205
-
206
- def test_figure_element_is_valid
207
- fragment = Loofah.scrub_fragment("<span>hello</span> <figure>asd</figure>", :prune)
208
- assert fragment.at_css("figure"), "<figure> tag was scrubbed"
209
- end
210
-
211
- ##
212
- ## as tenderlove says, "care < 0"
213
- ##
214
- # def test_should_handle_astral_plane_characters
215
- # input = "<p>&#x1d4b5; &#x1d538;</p>"
216
- # output = "<p>\360\235\222\265 \360\235\224\270</p>"
217
- # check_sanitization(input, output, output, output)
218
-
219
- # input = "<p><tspan>\360\235\224\270</tspan> a</p>"
220
- # output = "<p><tspan>\360\235\224\270</tspan> a</p>"
221
- # check_sanitization(input, output, output, output)
222
- # end
223
-
224
- # This affects only NS4. Is it worth fixing?
225
- # def test_javascript_includes
226
- # input = %(<div size="&{alert('XSS')}">foo</div>)
227
- # output = "<div>foo</div>"
228
- # check_sanitization(input, output, output, output)
229
- # end
230
-
231
- ##
232
- ## these tests primarily test the parser logic, not the sanitizer
233
- ## logic. i call bullshit. we're not writing a test suite for
234
- ## libxml2 here, so let's rely on the unit tests above to take care
235
- ## of our valid elements and attributes.
236
- ##
237
- require 'json'
238
- Dir[File.join(File.dirname(__FILE__), '..', 'assets', 'testdata_sanitizer_tests1.dat')].each do |filename|
239
- JSON::parse(open(filename).read).each do |test|
240
- it "testdata sanitizer #{test['name']}" do
241
- check_sanitization(
242
- test['input'],
243
- test['output'],
244
- test['xhtml'] || test['output'],
245
- test['rexml'] || test['output']
246
- )
247
- end
248
- end
249
- end
250
-
251
- ## added because we don't have any coverage above on SVG_ATTR_VAL_ALLOWS_REF
252
- HTML5::WhiteList::SVG_ATTR_VAL_ALLOWS_REF.each do |attr_name|
253
- define_method "test_should_allow_uri_refs_in_svg_attribute_#{attr_name}" do
254
- input = "<rect fill='url(#foo)' />"
255
- output = "<rect fill='url(#foo)'></rect>"
256
- check_sanitization(input, output, output, output)
257
- end
258
-
259
- define_method "test_absolute_uri_refs_in_svg_attribute_#{attr_name}" do
260
- input = "<rect fill='url(http://bad.com/) #fff' />"
261
- output = "<rect fill=' #fff'></rect>"
262
- check_sanitization(input, output, output, output)
263
- end
264
- end
265
-
266
- def test_css_negative_value_sanitization
267
- html = "<span style=\"letter-spacing:-0.03em;\">"
268
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
269
- assert_match %r/-0.03em/, sane.inner_html
270
- end
271
-
272
- def test_css_negative_value_sanitization_shorthand_css_properties
273
- html = "<span style=\"margin-left:-0.05em;\">"
274
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
275
- assert_match %r/-0.05em/, sane.inner_html
276
- end
277
-
278
- def test_css_function_sanitization_leaves_whitelisted_functions_calc
279
- html = "<span style=\"width:calc(5%)\">"
280
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
281
- assert_match %r/calc\(5%\)/, sane.inner_html
282
-
283
- html = "<span style=\"width: calc(5%)\">"
284
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
285
- assert_match %r/calc\(5%\)/, sane.inner_html
286
- end
287
-
288
- def test_css_function_sanitization_leaves_whitelisted_functions_rgb
289
- html = '<span style="color: rgb(255, 0, 0)">'
290
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
291
- assert_match %r/rgb\(255, 0, 0\)/, sane.inner_html
292
- end
293
-
294
- def test_css_function_sanitization_leaves_whitelisted_list_style_type
295
- html = "<ol style='list-style-type:lower-greek;'></ol>"
296
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
297
- assert_match %r/list-style-type:lower-greek/, sane.inner_html
298
- end
299
-
300
- def test_css_function_sanitization_strips_style_attributes_with_unsafe_functions
301
- html = "<span style=\"width:attr(data-evil-attr)\">"
302
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
303
- assert_match %r/<span><\/span>/, sane.inner_html
304
-
305
- html = "<span style=\"width: attr(data-evil-attr)\">"
306
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
307
- assert_match %r/<span><\/span>/, sane.inner_html
308
- end
309
-
310
- def test_issue_90_slow_regex
311
- skip("timing tests are hard to make pass and have little regression-testing value")
312
-
313
- html = %q{<span style="background: url('data:image/svg&#43;xml;charset=utf-8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2232%22%20height%3D%2232%22%20viewBox%3D%220%200%2032%2032%22%3E%3Cpath%20fill%3D%22%23D4C8AE%22%20d%3D%22M0%200h32v32h-32z%22%2F%3E%3Cpath%20fill%3D%22%2383604B%22%20d%3D%22M0%200h31.99v11.75h-31.99z%22%2F%3E%3Cpath%20fill%3D%22%233D2319%22%20d%3D%22M0%2011.5h32v.5h-32z%22%2F%3E%3Cpath%20fill%3D%22%23F83651%22%20d%3D%22M5%200h1v10.5h-1z%22%2F%3E%3Cpath%20fill%3D%22%23FCD050%22%20d%3D%22M6%200h1v10.5h-1z%22%2F%3E%3Cpath%20fill%3D%22%2371C797%22%20d%3D%22M7%200h1v10.5h-1z%22%2F%3E%3Cpath%20fill%3D%22%23509CF9%22%20d%3D%22M8%200h1v10.5h-1z%22%2F%3E%3ClinearGradient%20id%3D%22a%22%20gradientUnits%3D%22userSpaceOnUse%22%20x1%3D%2224.996%22%20y1%3D%2210.5%22%20x2%3D%2224.996%22%20y2%3D%224.5%22%3E%3Cstop%20offset%3D%220%22%20stop-color%3D%22%23796055%22%2F%3E%3Cstop%20offset%3D%22.434%22%20stop-color%3D%22%23614C43%22%2F%3E%3Cstop%20offset%3D%221%22%20stop-color%3D%22%233D2D28%22%2F%3E%3C%2FlinearGradient%3E%3Cpath%20fill%3D%22url(%23a)%22%20d%3D%22M28%208.5c0%201.1-.9%202-2%202h-2c-1.1%200-2-.9-2-2v-2c0-1.1.9-2%202-2h2c1.1%200%202%20.9%202%202v2z%22%2F%3E%3Cpath%20fill%3D%22%235F402E%22%20d%3D%22M28%208c0%201.1-.9%202-2%202h-2c-1.1%200-2-.9-2-2v-2c0-1.1.9-2%202-2h2c1.1%200%202%20.9%202%202v2z%22%2F%3E%3C');"></span>}
314
-
315
- assert_completes_in_reasonable_time {
316
- Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
317
- }
318
- end
319
-
320
- def test_upper_case_css_property
321
- html = "<div style=\"COLOR: BLUE; NOTAPROPERTY: RED;\">asdf</div>"
322
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_xml)
323
- assert_match(/COLOR:\s*BLUE/i, sane.at_css("div")["style"])
324
- refute_match(/NOTAPROPERTY/i, sane.at_css("div")["style"])
325
- end
326
-
327
- def test_many_properties_some_allowed
328
- html = "<div style=\"background: bold notaproperty center alsonotaproperty 10px;\">asdf</div>"
329
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_xml)
330
- assert_match(/bold\s+center\s+10px/, sane.at_css("div")["style"])
331
- end
332
-
333
- def test_many_properties_non_allowed
334
- html = "<div style=\"background: notaproperty alsonotaproperty;\">asdf</div>"
335
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_xml)
336
- assert_nil sane.at_css("div")["style"]
337
- end
338
-
339
- def test_svg_properties
340
- html = "<line style='stroke-width: 10px;'></line>"
341
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_xml)
342
- assert_match(/stroke-width:\s*10px/, sane.at_css("line")["style"])
343
- end
344
- end
345
-
346
- # <html5_license>
347
- #
348
- # Copyright (c) 2006-2008 The Authors
349
- #
350
- # Contributors:
351
- # James Graham - jg307@cam.ac.uk
352
- # Anne van Kesteren - annevankesteren@gmail.com
353
- # Lachlan Hunt - lachlan.hunt@lachy.id.au
354
- # Matt McDonald - kanashii@kanashii.ca
355
- # Sam Ruby - rubys@intertwingly.net
356
- # Ian Hickson (Google) - ian@hixie.ch
357
- # Thomas Broyer - t.broyer@ltgt.net
358
- # Jacques Distler - distler@golem.ph.utexas.edu
359
- # Henri Sivonen - hsivonen@iki.fi
360
- # The Mozilla Foundation (contributions from Henri Sivonen since 2008)
361
- #
362
- # Permission is hereby granted, free of charge, to any person
363
- # obtaining a copy of this software and associated documentation files
364
- # (the "Software"), to deal in the Software without restriction,
365
- # including without limitation the rights to use, copy, modify, merge,
366
- # publish, distribute, sublicense, and/or sell copies of the Software,
367
- # and to permit persons to whom the Software is furnished to do so,
368
- # subject to the following conditions:
369
- #
370
- # The above copyright notice and this permission notice shall be
371
- # included in all copies or substantial portions of the Software.
372
- #
373
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
374
- # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
375
- # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
376
- # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
377
- # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
378
- # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
379
- # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
380
- # SOFTWARE.
381
- #
382
- # </html5_license>
@@ -1,204 +0,0 @@
1
- require "helper"
2
-
3
- class IntegrationTestAdHoc < Loofah::TestCase
4
-
5
- context "blank input string" do
6
- context "fragment" do
7
- it "return a blank string" do
8
- assert_equal "", Loofah.scrub_fragment("", :prune).to_s
9
- end
10
- end
11
-
12
- context "document" do
13
- it "return a blank string" do
14
- assert_equal "", Loofah.scrub_document("", :prune).root.to_s
15
- end
16
- end
17
- end
18
-
19
- context "tests" do
20
- MSWORD_HTML = File.read(File.join(File.dirname(__FILE__), "..", "assets", "msword.html")).freeze
21
-
22
- def test_removal_of_illegal_tag
23
- html = <<-HTML
24
- following this there should be no jim tag
25
- <jim>jim</jim>
26
- was there?
27
- HTML
28
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
29
- assert sane.xpath("//jim").empty?
30
- end
31
-
32
- def test_removal_of_illegal_attribute
33
- html = "<p class=bar foo=bar abbr=bar />"
34
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
35
- node = sane.xpath("//p").first
36
- assert node.attributes['class']
37
- assert node.attributes['abbr']
38
- assert_nil node.attributes['foo']
39
- end
40
-
41
- def test_removal_of_illegal_url_in_href
42
- html = <<-HTML
43
- <a href='jimbo://jim.jim/'>this link should have its href removed because of illegal url</a>
44
- <a href='http://jim.jim/'>this link should be fine</a>
45
- HTML
46
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
47
- nodes = sane.xpath("//a")
48
- assert_nil nodes.first.attributes['href']
49
- assert nodes.last.attributes['href']
50
- end
51
-
52
- def test_css_sanitization
53
- html = "<p style='background-color: url(\"http://foo.com/\") ; background-color: #000 ;' />"
54
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
55
- assert_match %r/#000/, sane.inner_html
56
- refute_match %r/foo\.com/, sane.inner_html
57
- end
58
-
59
- def test_fragment_with_no_tags
60
- assert_equal "This fragment has no tags.", Loofah.scrub_fragment("This fragment has no tags.", :escape).to_xml
61
- end
62
-
63
- def test_fragment_in_p_tag
64
- assert_equal "<p>This fragment is in a p.</p>", Loofah.scrub_fragment("<p>This fragment is in a p.</p>", :escape).to_xml
65
- end
66
-
67
- def test_fragment_in_p_tag_plus_stuff
68
- assert_equal "<p>This fragment is in a p.</p>foo<strong>bar</strong>", Loofah.scrub_fragment("<p>This fragment is in a p.</p>foo<strong>bar</strong>", :escape).to_xml
69
- end
70
-
71
- def test_fragment_with_text_nodes_leading_and_trailing
72
- assert_equal "text<p>fragment</p>text", Loofah.scrub_fragment("text<p>fragment</p>text", :escape).to_xml
73
- end
74
-
75
- def test_whitewash_on_fragment
76
- html = "safe<frameset rows=\"*\"><frame src=\"http://example.com\"></frameset> <b>description</b>"
77
- whitewashed = Loofah.scrub_document(html, :whitewash).xpath("/html/body/*").to_s
78
- assert_equal "<p>safe</p><b>description</b>", whitewashed.gsub("\n","")
79
- end
80
-
81
- def test_fragment_whitewash_on_microsofty_markup
82
- whitewashed = Loofah.fragment(MSWORD_HTML).scrub!(:whitewash)
83
- assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed.to_s.strip
84
- end
85
-
86
- def test_document_whitewash_on_microsofty_markup
87
- whitewashed = Loofah.document(MSWORD_HTML).scrub!(:whitewash)
88
- assert_match %r(<p>Foo <b>BOLD</b></p>), whitewashed.to_s
89
- assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed.xpath("/html/body/*").to_s
90
- end
91
-
92
- def test_return_empty_string_when_nothing_left
93
- assert_equal "", Loofah.scrub_document('<script>test</script>', :prune).text
94
- end
95
-
96
- def test_nested_script_cdata_tags_should_be_scrubbed
97
- html = "<script><script src='malicious.js'></script>"
98
- stripped = Loofah.fragment(html).scrub!(:strip)
99
- assert_empty stripped.xpath("//script")
100
- refute_match("<script", stripped.to_html)
101
- end
102
-
103
- def test_nested_script_cdata_tags_should_be_scrubbed_2
104
- html = "<script><script>alert('a');</script></script>"
105
- stripped = Loofah.fragment(html).scrub!(:strip)
106
- assert_empty stripped.xpath("//script")
107
- refute_match("<script", stripped.to_html)
108
- end
109
-
110
- def test_removal_of_all_tags
111
- html = <<-HTML
112
- What's up <strong>doc</strong>?
113
- HTML
114
- stripped = Loofah.scrub_document(html, :prune).text
115
- assert_equal %Q(What\'s up doc?).strip, stripped.strip
116
- end
117
-
118
- def test_dont_remove_whitespace
119
- html = "Foo\nBar"
120
- assert_equal html, Loofah.scrub_document(html, :prune).text
121
- end
122
-
123
- def test_dont_remove_whitespace_between_tags
124
- html = "<p>Foo</p>\n<p>Bar</p>"
125
- assert_equal "Foo\nBar", Loofah.scrub_document(html, :prune).text
126
- end
127
-
128
- #
129
- # tests for CVE-2018-8048 (see https://github.com/flavorjones/loofah/issues/144)
130
- #
131
- # libxml2 >= 2.9.2 fails to escape comments within some attributes. It
132
- # wants to ensure these comments can be treated as "server-side includes",
133
- # but as a result fails to ensure that serialization is well-formed,
134
- # resulting in an opportunity for XSS injection of code into a final
135
- # re-parsed document (presumably in a browser).
136
- #
137
- # we'll test this by parsing the HTML, serializing it, then
138
- # re-parsing it to ensure there isn't any ambiguity in the output
139
- # that might allow code injection into a browser consuming
140
- # "sanitized" output.
141
- #
142
- [
143
- #
144
- # these tags and attributes are determined by the code at:
145
- #
146
- # https://git.gnome.org/browse/libxml2/tree/HTMLtree.c?h=v2.9.2#n714
147
- #
148
- {tag: "a", attr: "href"},
149
- {tag: "div", attr: "href"},
150
- {tag: "a", attr: "action"},
151
- {tag: "div", attr: "action"},
152
- {tag: "a", attr: "src"},
153
- {tag: "div", attr: "src"},
154
- {tag: "a", attr: "name"},
155
- #
156
- # note that div+name is _not_ affected by the libxml2 issue.
157
- # but we test it anyway to ensure our logic isn't modifying
158
- # attributes that don't need modifying.
159
- #
160
- {tag: "div", attr: "name", unescaped: true},
161
- ].each do |config|
162
-
163
- define_method "test_uri_escaping_of_#{config[:attr]}_attr_in_#{config[:tag]}_tag" do
164
- html = %{<#{config[:tag]} #{config[:attr]}='examp<!--" unsafeattr=foo()>-->le.com'>test</#{config[:tag]}>}
165
-
166
- reparsed = Loofah.fragment(Loofah.fragment(html).scrub!(:prune).to_html)
167
- attributes = reparsed.at_css(config[:tag]).attribute_nodes
168
-
169
- assert_equal [config[:attr]], attributes.collect(&:name)
170
- if Nokogiri::VersionInfo.instance.libxml2?
171
- if config[:unescaped]
172
- #
173
- # this attribute was emitted wrapped in single-quotes, so a double quote is A-OK.
174
- # assert that this attribute's serialization is unaffected.
175
- #
176
- assert_equal %{examp<!--" unsafeattr=foo()>-->le.com}, attributes.first.value
177
- else
178
- #
179
- # let's match the behavior in libxml < 2.9.2.
180
- # test that this attribute's serialization is well-formed and sanitized.
181
- #
182
- assert_equal %{examp<!--%22%20unsafeattr=foo()>-->le.com}, attributes.first.value
183
- end
184
- else
185
- #
186
- # yay for consistency in javaland. move along, nothing to see here.
187
- #
188
- assert_equal %{examp<!--%22 unsafeattr=foo()>-->le.com}, attributes.first.value
189
- end
190
- end
191
- end
192
-
193
- # see:
194
- # - https://github.com/flavorjones/loofah/issues/154
195
- # - https://hackerone.com/reports/429267
196
- context "xss protection from svg xmlns:xlink animate attribute" do
197
- it "sanitizes appropriate attributes" do
198
- html = %Q{<svg><a xmlns:xlink=http://www.w3.org/1999/xlink xlink:href=?><circle r=400 /><animate attributeName=xlink:href begin=0 from=javascript:alert(1) to=%26>}
199
- sanitized = Loofah.scrub_fragment(html, :escape)
200
- assert_nil sanitized.at_css("animate")["from"]
201
- end
202
- end
203
- end
204
- end
@@ -1,43 +0,0 @@
1
- require "helper"
2
-
3
- class IntegrationTestHelpers < Loofah::TestCase
4
- context ".strip_tags" do
5
- context "on safe markup" do
6
- it "strip out tags" do
7
- assert_equal "omgwtfbbq!!1!", Loofah::Helpers.strip_tags("<div>omgwtfbbq</div><span>!!1!</span>")
8
- end
9
- end
10
-
11
- context "on hack attack" do
12
- it "strip escape html entities" do
13
- bad_shit = "&lt;script&gt;alert('evil')&lt;/script&gt;"
14
- assert_equal bad_shit, Loofah::Helpers.strip_tags(bad_shit)
15
- end
16
- end
17
- end
18
-
19
- context ".sanitize" do
20
- context "on safe markup" do
21
- it "render the safe html" do
22
- html = "<div>omgwtfbbq</div><span>!!1!</span>"
23
- assert_equal html, Loofah::Helpers.sanitize(html)
24
- end
25
- end
26
-
27
- context "on hack attack" do
28
- it "strip the unsafe tags" do
29
- assert_equal "alert('evil')<span>w00t</span>", Loofah::Helpers.sanitize("<script>alert('evil')</script><span>w00t</span>")
30
- end
31
-
32
- it "strips form tags" do
33
- assert_equal "alert('evil')<span>w00t</span>", Loofah::Helpers.sanitize("<script>alert('evil')</script><form action=\"/foo/bar\" method=\"post\"><input></form><span>w00t</span>")
34
- end
35
- end
36
- end
37
-
38
- context ".sanitize_css" do
39
- it "removes unsafe css properties" do
40
- assert_match(/display:\s*block;\s*background-color:\s*blue;/, Loofah::Helpers.sanitize_css("display:block;background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg);background-color:blue"))
41
- end
42
- end
43
- end