loofah 2.2.3 → 2.9.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of loofah might be problematic. Click here for more details.

Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +124 -31
  3. data/README.md +12 -16
  4. data/lib/loofah.rb +35 -18
  5. data/lib/loofah/elements.rb +74 -73
  6. data/lib/loofah/helpers.rb +18 -7
  7. data/lib/loofah/html/document.rb +1 -0
  8. data/lib/loofah/html/document_fragment.rb +4 -2
  9. data/lib/loofah/html5/libxml2_workarounds.rb +8 -7
  10. data/lib/loofah/html5/safelist.rb +819 -0
  11. data/lib/loofah/html5/scrub.rb +63 -46
  12. data/lib/loofah/instance_methods.rb +5 -3
  13. data/lib/loofah/metahelpers.rb +2 -1
  14. data/lib/loofah/scrubber.rb +8 -7
  15. data/lib/loofah/scrubbers.rb +12 -11
  16. data/lib/loofah/version.rb +5 -0
  17. data/lib/loofah/xml/document.rb +1 -0
  18. data/lib/loofah/xml/document_fragment.rb +2 -1
  19. metadata +40 -112
  20. data/.gemtest +0 -0
  21. data/Gemfile +0 -22
  22. data/Manifest.txt +0 -40
  23. data/Rakefile +0 -79
  24. data/benchmark/benchmark.rb +0 -149
  25. data/benchmark/fragment.html +0 -96
  26. data/benchmark/helper.rb +0 -73
  27. data/benchmark/www.slashdot.com.html +0 -2560
  28. data/lib/loofah/html5/whitelist.rb +0 -186
  29. data/test/assets/msword.html +0 -63
  30. data/test/assets/testdata_sanitizer_tests1.dat +0 -502
  31. data/test/helper.rb +0 -18
  32. data/test/html5/test_sanitizer.rb +0 -382
  33. data/test/integration/test_ad_hoc.rb +0 -204
  34. data/test/integration/test_helpers.rb +0 -43
  35. data/test/integration/test_html.rb +0 -72
  36. data/test/integration/test_scrubbers.rb +0 -400
  37. data/test/integration/test_xml.rb +0 -55
  38. data/test/unit/test_api.rb +0 -142
  39. data/test/unit/test_encoding.rb +0 -20
  40. data/test/unit/test_helpers.rb +0 -62
  41. data/test/unit/test_scrubber.rb +0 -229
  42. data/test/unit/test_scrubbers.rb +0 -14
data/test/helper.rb DELETED
@@ -1,18 +0,0 @@
1
- require 'rubygems'
2
- require 'minitest/unit'
3
- require 'minitest/spec'
4
- require 'minitest/autorun'
5
- require 'rr'
6
-
7
- require File.expand_path(File.join(File.dirname(__FILE__), "..", "lib", "loofah"))
8
-
9
- # require the ActionView helpers here, since they are no longer required automatically
10
- require File.expand_path(File.join(File.dirname(__FILE__), "..", "lib", "loofah", "helpers"))
11
-
12
- puts "=> testing with Nokogiri #{Nokogiri::VERSION_INFO.inspect}"
13
-
14
- class Loofah::TestCase < MiniTest::Spec
15
- class << self
16
- alias_method :context, :describe
17
- end
18
- end
data/test/html5/test_sanitizer.rb DELETED
@@ -1,382 +0,0 @@
1
- #
2
- # these tests taken from the HTML5 sanitization project and modified for use with Loofah
3
- # see the original here: http://code.google.com/p/html5lib/source/browse/ruby/test/test_sanitizer.rb
4
- #
5
- # license text at the bottom of this file
6
- #
7
- require "helper"
8
-
9
- class Html5TestSanitizer < Loofah::TestCase
10
- include Loofah
11
-
12
- def sanitize_xhtml stream
13
- Loofah.fragment(stream).scrub!(:escape).to_xhtml
14
- end
15
-
16
- def sanitize_html stream
17
- Loofah.fragment(stream).scrub!(:escape).to_html
18
- end
19
-
20
- def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
21
- ## libxml uses double-quotes, so let's swappo-boppo our quotes before comparing.
22
- sane = sanitize_html(input).gsub('"',"'")
23
- htmloutput = htmloutput.gsub('"',"'")
24
- xhtmloutput = xhtmloutput.gsub('"',"'")
25
- rexmloutput = rexmloutput.gsub('"',"'")
26
-
27
- ## HTML5's parsers are shit. there's so much inconsistency with what has closing tags, etc, that
28
- ## it would require a lot of manual hacking to make the tests match libxml's output.
29
- ## instead, I'm taking the shotgun approach, and trying to match any of the described outputs.
30
- assert((htmloutput == sane) || (rexmloutput == sane) || (xhtmloutput == sane),
31
- %Q{given: "#{input}"\nexpected: "#{htmloutput}"\ngot: "#{sane}"})
32
- end
33
-
34
- def assert_completes_in_reasonable_time &block
35
- t0 = Time.now
36
- block.call
37
- assert_in_delta t0, Time.now, 0.1 # arbitrary seconds
38
- end
39
-
40
- (HTML5::WhiteList::ALLOWED_ELEMENTS).each do |tag_name|
41
- define_method "test_should_allow_#{tag_name}_tag" do
42
- input = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
43
- htmloutput = "<#{tag_name.downcase} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name.downcase}>"
44
- xhtmloutput = "<#{tag_name} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name}>"
45
- rexmloutput = xhtmloutput
46
-
47
- if %w[caption colgroup optgroup option tbody td tfoot th thead tr].include?(tag_name)
48
- htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
49
- xhtmloutput = htmloutput
50
- elsif tag_name == 'col'
51
- htmloutput = "<col title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
52
- xhtmloutput = htmloutput
53
- rexmloutput = "<col title='1' />"
54
- elsif tag_name == 'table'
55
- htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt;baz<table title='1'> </table>"
56
- xhtmloutput = htmloutput
57
- elsif tag_name == 'image'
58
- htmloutput = "<img title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
59
- xhtmloutput = htmloutput
60
- rexmloutput = "<image title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</image>"
61
- elsif HTML5::WhiteList::VOID_ELEMENTS.include?(tag_name)
62
- htmloutput = "<#{tag_name} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
63
- xhtmloutput = htmloutput
64
- htmloutput += '<br/>' if tag_name == 'br'
65
- rexmloutput = "<#{tag_name} title='1' />"
66
- end
67
- check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
68
- end
69
- end
70
-
71
- ##
72
- ## libxml2 downcases elements, so this is moot.
73
- ##
74
- # HTML5::WhiteList::ALLOWED_ELEMENTS.each do |tag_name|
75
- # define_method "test_should_forbid_#{tag_name.upcase}_tag" do
76
- # input = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
77
- # output = "&lt;#{tag_name.upcase} title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/#{tag_name.upcase}&gt;"
78
- # check_sanitization(input, output, output, output)
79
- # end
80
- # end
81
-
82
- HTML5::WhiteList::ALLOWED_ATTRIBUTES.each do |attribute_name|
83
- next if attribute_name == 'style'
84
- define_method "test_should_allow_#{attribute_name}_attribute" do
85
- input = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
86
- if %w[checked compact disabled ismap multiple nohref noshade nowrap readonly selected].include?(attribute_name)
87
- output = "<p #{attribute_name}>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
88
- htmloutput = "<p #{attribute_name.downcase}>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
89
- else
90
- output = "<p #{attribute_name}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
91
- htmloutput = "<p #{attribute_name.downcase}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
92
- end
93
- check_sanitization(input, htmloutput, output, output)
94
- end
95
- end
96
-
97
- def test_should_allow_data_attributes
98
- input = "<p data-foo='foo'>foo <bad>bar</bad> baz</p>"
99
-
100
- output = "<p data-foo='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
101
- htmloutput = "<p data-foo='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
102
-
103
- check_sanitization(input, htmloutput, output, output)
104
- end
105
-
106
- def test_should_allow_multi_word_data_attributes
107
- input = "<p data-foo-bar-id='11'>foo <bad>bar</bad> baz</p>"
108
- output = htmloutput = "<p data-foo-bar-id='11'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
109
-
110
- check_sanitization(input, htmloutput, output, output)
111
- end
112
-
113
- ##
114
- ## libxml2 downcases attributes, so this is moot.
115
- ##
116
- # HTML5::WhiteList::ALLOWED_ATTRIBUTES.each do |attribute_name|
117
- # define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
118
- # input = "<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>"
119
- # output = "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
120
- # check_sanitization(input, output, output, output)
121
- # end
122
- # end
123
-
124
- HTML5::WhiteList::ALLOWED_PROTOCOLS.each do |protocol|
125
- define_method "test_should_allow_#{protocol}_uris" do
126
- input = %(<a href="#{protocol}">foo</a>)
127
- output = "<a href='#{protocol}'>foo</a>"
128
- check_sanitization(input, output, output, output)
129
- end
130
- end
131
-
132
- HTML5::WhiteList::ALLOWED_PROTOCOLS.each do |protocol|
133
- define_method "test_should_allow_uppercase_#{protocol}_uris" do
134
- input = %(<a href="#{protocol.upcase}">foo</a>)
135
- output = "<a href='#{protocol.upcase}'>foo</a>"
136
- check_sanitization(input, output, output, output)
137
- end
138
- end
139
-
140
- HTML5::WhiteList::ALLOWED_URI_DATA_MEDIATYPES.each do |data_uri_type|
141
- define_method "test_should_allow_data_#{data_uri_type}_uris" do
142
- input = %(<a href="data:#{data_uri_type}">foo</a>)
143
- output = "<a href='data:#{data_uri_type}'>foo</a>"
144
- check_sanitization(input, output, output, output)
145
-
146
- input = %(<a href="data:#{data_uri_type};base64,R0lGODlhAQABA">foo</a>)
147
- output = "<a href='data:#{data_uri_type};base64,R0lGODlhAQABA'>foo</a>"
148
- check_sanitization(input, output, output, output)
149
- end
150
- end
151
-
152
- HTML5::WhiteList::ALLOWED_URI_DATA_MEDIATYPES.each do |data_uri_type|
153
- define_method "test_should_allow_uppercase_data_#{data_uri_type}_uris" do
154
- input = %(<a href="DATA:#{data_uri_type.upcase}">foo</a>)
155
- output = "<a href='DATA:#{data_uri_type.upcase}'>foo</a>"
156
- check_sanitization(input, output, output, output)
157
- end
158
- end
159
-
160
- def test_should_disallow_other_uri_mediatypes
161
- input = %(<a href="data:foo">foo</a>)
162
- output = "<a>foo</a>"
163
- check_sanitization(input, output, output, output)
164
-
165
- input = %(<a href="data:image/xxx">foo</a>)
166
- output = "<a>foo</a>"
167
- check_sanitization(input, output, output, output)
168
-
169
- input = %(<a href="data:image/xxx;base64,R0lGODlhAQABA">foo</a>)
170
- output = "<a>foo</a>"
171
- check_sanitization(input, output, output, output)
172
- end
173
-
174
-
175
- HTML5::WhiteList::SVG_ALLOW_LOCAL_HREF.each do |tag_name|
176
- next unless HTML5::WhiteList::ALLOWED_ELEMENTS.include?(tag_name)
177
- define_method "test_#{tag_name}_should_allow_local_href" do
178
- input = %(<#{tag_name} xlink:href="#foo"/>)
179
- output = "<#{tag_name.downcase} xlink:href='#foo'></#{tag_name.downcase}>"
180
- xhtmloutput = "<#{tag_name} xlink:href='#foo'></#{tag_name}>"
181
- check_sanitization(input, output, xhtmloutput, xhtmloutput)
182
- end
183
-
184
- define_method "test_#{tag_name}_should_allow_local_href_with_newline" do
185
- input = %(<#{tag_name} xlink:href="\n#foo"/>)
186
- output = "<#{tag_name.downcase} xlink:href='\n#foo'></#{tag_name.downcase}>"
187
- xhtmloutput = "<#{tag_name} xlink:href='\n#foo'></#{tag_name}>"
188
- check_sanitization(input, output, xhtmloutput, xhtmloutput)
189
- end
190
-
191
- define_method "test_#{tag_name}_should_forbid_nonlocal_href" do
192
- input = %(<#{tag_name} xlink:href="http://bad.com/foo"/>)
193
- output = "<#{tag_name.downcase}></#{tag_name.downcase}>"
194
- xhtmloutput = "<#{tag_name}></#{tag_name}>"
195
- check_sanitization(input, output, xhtmloutput, xhtmloutput)
196
- end
197
-
198
- define_method "test_#{tag_name}_should_forbid_nonlocal_href_with_newline" do
199
- input = %(<#{tag_name} xlink:href="\nhttp://bad.com/foo"/>)
200
- output = "<#{tag_name.downcase}></#{tag_name.downcase}>"
201
- xhtmloutput = "<#{tag_name}></#{tag_name}>"
202
- check_sanitization(input, output, xhtmloutput, xhtmloutput)
203
- end
204
- end
205
-
206
- def test_figure_element_is_valid
207
- fragment = Loofah.scrub_fragment("<span>hello</span> <figure>asd</figure>", :prune)
208
- assert fragment.at_css("figure"), "<figure> tag was scrubbed"
209
- end
210
-
211
- ##
212
- ## as tenderlove says, "care < 0"
213
- ##
214
- # def test_should_handle_astral_plane_characters
215
- # input = "<p>&#x1d4b5; &#x1d538;</p>"
216
- # output = "<p>\360\235\222\265 \360\235\224\270</p>"
217
- # check_sanitization(input, output, output, output)
218
-
219
- # input = "<p><tspan>\360\235\224\270</tspan> a</p>"
220
- # output = "<p><tspan>\360\235\224\270</tspan> a</p>"
221
- # check_sanitization(input, output, output, output)
222
- # end
223
-
224
- # This affects only NS4. Is it worth fixing?
225
- # def test_javascript_includes
226
- # input = %(<div size="&{alert('XSS')}">foo</div>)
227
- # output = "<div>foo</div>"
228
- # check_sanitization(input, output, output, output)
229
- # end
230
-
231
- ##
232
- ## these tests primarily test the parser logic, not the sanitizer
233
- ## logic. i call bullshit. we're not writing a test suite for
234
- ## libxml2 here, so let's rely on the unit tests above to take care
235
- ## of our valid elements and attributes.
236
- ##
237
- require 'json'
238
- Dir[File.join(File.dirname(__FILE__), '..', 'assets', 'testdata_sanitizer_tests1.dat')].each do |filename|
239
- JSON::parse(open(filename).read).each do |test|
240
- it "testdata sanitizer #{test['name']}" do
241
- check_sanitization(
242
- test['input'],
243
- test['output'],
244
- test['xhtml'] || test['output'],
245
- test['rexml'] || test['output']
246
- )
247
- end
248
- end
249
- end
250
-
251
- ## added because we don't have any coverage above on SVG_ATTR_VAL_ALLOWS_REF
252
- HTML5::WhiteList::SVG_ATTR_VAL_ALLOWS_REF.each do |attr_name|
253
- define_method "test_should_allow_uri_refs_in_svg_attribute_#{attr_name}" do
254
- input = "<rect fill='url(#foo)' />"
255
- output = "<rect fill='url(#foo)'></rect>"
256
- check_sanitization(input, output, output, output)
257
- end
258
-
259
- define_method "test_absolute_uri_refs_in_svg_attribute_#{attr_name}" do
260
- input = "<rect fill='url(http://bad.com/) #fff' />"
261
- output = "<rect fill=' #fff'></rect>"
262
- check_sanitization(input, output, output, output)
263
- end
264
- end
265
-
266
- def test_css_negative_value_sanitization
267
- html = "<span style=\"letter-spacing:-0.03em;\">"
268
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
269
- assert_match %r/-0.03em/, sane.inner_html
270
- end
271
-
272
- def test_css_negative_value_sanitization_shorthand_css_properties
273
- html = "<span style=\"margin-left:-0.05em;\">"
274
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
275
- assert_match %r/-0.05em/, sane.inner_html
276
- end
277
-
278
- def test_css_function_sanitization_leaves_whitelisted_functions_calc
279
- html = "<span style=\"width:calc(5%)\">"
280
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
281
- assert_match %r/calc\(5%\)/, sane.inner_html
282
-
283
- html = "<span style=\"width: calc(5%)\">"
284
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
285
- assert_match %r/calc\(5%\)/, sane.inner_html
286
- end
287
-
288
- def test_css_function_sanitization_leaves_whitelisted_functions_rgb
289
- html = '<span style="color: rgb(255, 0, 0)">'
290
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
291
- assert_match %r/rgb\(255, 0, 0\)/, sane.inner_html
292
- end
293
-
294
- def test_css_function_sanitization_leaves_whitelisted_list_style_type
295
- html = "<ol style='list-style-type:lower-greek;'></ol>"
296
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
297
- assert_match %r/list-style-type:lower-greek/, sane.inner_html
298
- end
299
-
300
- def test_css_function_sanitization_strips_style_attributes_with_unsafe_functions
301
- html = "<span style=\"width:attr(data-evil-attr)\">"
302
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
303
- assert_match %r/<span><\/span>/, sane.inner_html
304
-
305
- html = "<span style=\"width: attr(data-evil-attr)\">"
306
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
307
- assert_match %r/<span><\/span>/, sane.inner_html
308
- end
309
-
310
- def test_issue_90_slow_regex
311
- skip("timing tests are hard to make pass and have little regression-testing value")
312
-
313
- html = %q{<span style="background: url('data:image/svg&#43;xml;charset=utf-8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2232%22%20height%3D%2232%22%20viewBox%3D%220%200%2032%2032%22%3E%3Cpath%20fill%3D%22%23D4C8AE%22%20d%3D%22M0%200h32v32h-32z%22%2F%3E%3Cpath%20fill%3D%22%2383604B%22%20d%3D%22M0%200h31.99v11.75h-31.99z%22%2F%3E%3Cpath%20fill%3D%22%233D2319%22%20d%3D%22M0%2011.5h32v.5h-32z%22%2F%3E%3Cpath%20fill%3D%22%23F83651%22%20d%3D%22M5%200h1v10.5h-1z%22%2F%3E%3Cpath%20fill%3D%22%23FCD050%22%20d%3D%22M6%200h1v10.5h-1z%22%2F%3E%3Cpath%20fill%3D%22%2371C797%22%20d%3D%22M7%200h1v10.5h-1z%22%2F%3E%3Cpath%20fill%3D%22%23509CF9%22%20d%3D%22M8%200h1v10.5h-1z%22%2F%3E%3ClinearGradient%20id%3D%22a%22%20gradientUnits%3D%22userSpaceOnUse%22%20x1%3D%2224.996%22%20y1%3D%2210.5%22%20x2%3D%2224.996%22%20y2%3D%224.5%22%3E%3Cstop%20offset%3D%220%22%20stop-color%3D%22%23796055%22%2F%3E%3Cstop%20offset%3D%22.434%22%20stop-color%3D%22%23614C43%22%2F%3E%3Cstop%20offset%3D%221%22%20stop-color%3D%22%233D2D28%22%2F%3E%3C%2FlinearGradient%3E%3Cpath%20fill%3D%22url(%23a)%22%20d%3D%22M28%208.5c0%201.1-.9%202-2%202h-2c-1.1%200-2-.9-2-2v-2c0-1.1.9-2%202-2h2c1.1%200%202%20.9%202%202v2z%22%2F%3E%3Cpath%20fill%3D%22%235F402E%22%20d%3D%22M28%208c0%201.1-.9%202-2%202h-2c-1.1%200-2-.9-2-2v-2c0-1.1.9-2%202-2h2c1.1%200%202%20.9%202%202v2z%22%2F%3E%3C');"></span>}
314
-
315
- assert_completes_in_reasonable_time {
316
- Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
317
- }
318
- end
319
-
320
- def test_upper_case_css_property
321
- html = "<div style=\"COLOR: BLUE; NOTAPROPERTY: RED;\">asdf</div>"
322
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_xml)
323
- assert_match(/COLOR:\s*BLUE/i, sane.at_css("div")["style"])
324
- refute_match(/NOTAPROPERTY/i, sane.at_css("div")["style"])
325
- end
326
-
327
- def test_many_properties_some_allowed
328
- html = "<div style=\"background: bold notaproperty center alsonotaproperty 10px;\">asdf</div>"
329
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_xml)
330
- assert_match(/bold\s+center\s+10px/, sane.at_css("div")["style"])
331
- end
332
-
333
- def test_many_properties_non_allowed
334
- html = "<div style=\"background: notaproperty alsonotaproperty;\">asdf</div>"
335
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_xml)
336
- assert_nil sane.at_css("div")["style"]
337
- end
338
-
339
- def test_svg_properties
340
- html = "<line style='stroke-width: 10px;'></line>"
341
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_xml)
342
- assert_match(/stroke-width:\s*10px/, sane.at_css("line")["style"])
343
- end
344
- end
345
-
346
- # <html5_license>
347
- #
348
- # Copyright (c) 2006-2008 The Authors
349
- #
350
- # Contributors:
351
- # James Graham - jg307@cam.ac.uk
352
- # Anne van Kesteren - annevankesteren@gmail.com
353
- # Lachlan Hunt - lachlan.hunt@lachy.id.au
354
- # Matt McDonald - kanashii@kanashii.ca
355
- # Sam Ruby - rubys@intertwingly.net
356
- # Ian Hickson (Google) - ian@hixie.ch
357
- # Thomas Broyer - t.broyer@ltgt.net
358
- # Jacques Distler - distler@golem.ph.utexas.edu
359
- # Henri Sivonen - hsivonen@iki.fi
360
- # The Mozilla Foundation (contributions from Henri Sivonen since 2008)
361
- #
362
- # Permission is hereby granted, free of charge, to any person
363
- # obtaining a copy of this software and associated documentation files
364
- # (the "Software"), to deal in the Software without restriction,
365
- # including without limitation the rights to use, copy, modify, merge,
366
- # publish, distribute, sublicense, and/or sell copies of the Software,
367
- # and to permit persons to whom the Software is furnished to do so,
368
- # subject to the following conditions:
369
- #
370
- # The above copyright notice and this permission notice shall be
371
- # included in all copies or substantial portions of the Software.
372
- #
373
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
374
- # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
375
- # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
376
- # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
377
- # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
378
- # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
379
- # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
380
- # SOFTWARE.
381
- #
382
- # </html5_license>
data/test/integration/test_ad_hoc.rb DELETED
@@ -1,204 +0,0 @@
1
- require "helper"
2
-
3
- class IntegrationTestAdHoc < Loofah::TestCase
4
-
5
- context "blank input string" do
6
- context "fragment" do
7
- it "return a blank string" do
8
- assert_equal "", Loofah.scrub_fragment("", :prune).to_s
9
- end
10
- end
11
-
12
- context "document" do
13
- it "return a blank string" do
14
- assert_equal "", Loofah.scrub_document("", :prune).root.to_s
15
- end
16
- end
17
- end
18
-
19
- context "tests" do
20
- MSWORD_HTML = File.read(File.join(File.dirname(__FILE__), "..", "assets", "msword.html")).freeze
21
-
22
- def test_removal_of_illegal_tag
23
- html = <<-HTML
24
- following this there should be no jim tag
25
- <jim>jim</jim>
26
- was there?
27
- HTML
28
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
29
- assert sane.xpath("//jim").empty?
30
- end
31
-
32
- def test_removal_of_illegal_attribute
33
- html = "<p class=bar foo=bar abbr=bar />"
34
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
35
- node = sane.xpath("//p").first
36
- assert node.attributes['class']
37
- assert node.attributes['abbr']
38
- assert_nil node.attributes['foo']
39
- end
40
-
41
- def test_removal_of_illegal_url_in_href
42
- html = <<-HTML
43
- <a href='jimbo://jim.jim/'>this link should have its href removed because of illegal url</a>
44
- <a href='http://jim.jim/'>this link should be fine</a>
45
- HTML
46
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
47
- nodes = sane.xpath("//a")
48
- assert_nil nodes.first.attributes['href']
49
- assert nodes.last.attributes['href']
50
- end
51
-
52
- def test_css_sanitization
53
- html = "<p style='background-color: url(\"http://foo.com/\") ; background-color: #000 ;' />"
54
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
55
- assert_match %r/#000/, sane.inner_html
56
- refute_match %r/foo\.com/, sane.inner_html
57
- end
58
-
59
- def test_fragment_with_no_tags
60
- assert_equal "This fragment has no tags.", Loofah.scrub_fragment("This fragment has no tags.", :escape).to_xml
61
- end
62
-
63
- def test_fragment_in_p_tag
64
- assert_equal "<p>This fragment is in a p.</p>", Loofah.scrub_fragment("<p>This fragment is in a p.</p>", :escape).to_xml
65
- end
66
-
67
- def test_fragment_in_p_tag_plus_stuff
68
- assert_equal "<p>This fragment is in a p.</p>foo<strong>bar</strong>", Loofah.scrub_fragment("<p>This fragment is in a p.</p>foo<strong>bar</strong>", :escape).to_xml
69
- end
70
-
71
- def test_fragment_with_text_nodes_leading_and_trailing
72
- assert_equal "text<p>fragment</p>text", Loofah.scrub_fragment("text<p>fragment</p>text", :escape).to_xml
73
- end
74
-
75
- def test_whitewash_on_fragment
76
- html = "safe<frameset rows=\"*\"><frame src=\"http://example.com\"></frameset> <b>description</b>"
77
- whitewashed = Loofah.scrub_document(html, :whitewash).xpath("/html/body/*").to_s
78
- assert_equal "<p>safe</p><b>description</b>", whitewashed.gsub("\n","")
79
- end
80
-
81
- def test_fragment_whitewash_on_microsofty_markup
82
- whitewashed = Loofah.fragment(MSWORD_HTML).scrub!(:whitewash)
83
- assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed.to_s.strip
84
- end
85
-
86
- def test_document_whitewash_on_microsofty_markup
87
- whitewashed = Loofah.document(MSWORD_HTML).scrub!(:whitewash)
88
- assert_match %r(<p>Foo <b>BOLD</b></p>), whitewashed.to_s
89
- assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed.xpath("/html/body/*").to_s
90
- end
91
-
92
- def test_return_empty_string_when_nothing_left
93
- assert_equal "", Loofah.scrub_document('<script>test</script>', :prune).text
94
- end
95
-
96
- def test_nested_script_cdata_tags_should_be_scrubbed
97
- html = "<script><script src='malicious.js'></script>"
98
- stripped = Loofah.fragment(html).scrub!(:strip)
99
- assert_empty stripped.xpath("//script")
100
- refute_match("<script", stripped.to_html)
101
- end
102
-
103
- def test_nested_script_cdata_tags_should_be_scrubbed_2
104
- html = "<script><script>alert('a');</script></script>"
105
- stripped = Loofah.fragment(html).scrub!(:strip)
106
- assert_empty stripped.xpath("//script")
107
- refute_match("<script", stripped.to_html)
108
- end
109
-
110
- def test_removal_of_all_tags
111
- html = <<-HTML
112
- What's up <strong>doc</strong>?
113
- HTML
114
- stripped = Loofah.scrub_document(html, :prune).text
115
- assert_equal %Q(What\'s up doc?).strip, stripped.strip
116
- end
117
-
118
- def test_dont_remove_whitespace
119
- html = "Foo\nBar"
120
- assert_equal html, Loofah.scrub_document(html, :prune).text
121
- end
122
-
123
- def test_dont_remove_whitespace_between_tags
124
- html = "<p>Foo</p>\n<p>Bar</p>"
125
- assert_equal "Foo\nBar", Loofah.scrub_document(html, :prune).text
126
- end
127
-
128
- #
129
- # tests for CVE-2018-8048 (see https://github.com/flavorjones/loofah/issues/144)
130
- #
131
- # libxml2 >= 2.9.2 fails to escape comments within some attributes. It
132
- # wants to ensure these comments can be treated as "server-side includes",
133
- # but as a result fails to ensure that serialization is well-formed,
134
- # resulting in an opportunity for XSS injection of code into a final
135
- # re-parsed document (presumably in a browser).
136
- #
137
- # we'll test this by parsing the HTML, serializing it, then
138
- # re-parsing it to ensure there isn't any ambiguity in the output
139
- # that might allow code injection into a browser consuming
140
- # "sanitized" output.
141
- #
142
- [
143
- #
144
- # these tags and attributes are determined by the code at:
145
- #
146
- # https://git.gnome.org/browse/libxml2/tree/HTMLtree.c?h=v2.9.2#n714
147
- #
148
- {tag: "a", attr: "href"},
149
- {tag: "div", attr: "href"},
150
- {tag: "a", attr: "action"},
151
- {tag: "div", attr: "action"},
152
- {tag: "a", attr: "src"},
153
- {tag: "div", attr: "src"},
154
- {tag: "a", attr: "name"},
155
- #
156
- # note that div+name is _not_ affected by the libxml2 issue.
157
- # but we test it anyway to ensure our logic isn't modifying
158
- # attributes that don't need modifying.
159
- #
160
- {tag: "div", attr: "name", unescaped: true},
161
- ].each do |config|
162
-
163
- define_method "test_uri_escaping_of_#{config[:attr]}_attr_in_#{config[:tag]}_tag" do
164
- html = %{<#{config[:tag]} #{config[:attr]}='examp<!--" unsafeattr=foo()>-->le.com'>test</#{config[:tag]}>}
165
-
166
- reparsed = Loofah.fragment(Loofah.fragment(html).scrub!(:prune).to_html)
167
- attributes = reparsed.at_css(config[:tag]).attribute_nodes
168
-
169
- assert_equal [config[:attr]], attributes.collect(&:name)
170
- if Nokogiri::VersionInfo.instance.libxml2?
171
- if config[:unescaped]
172
- #
173
- # this attribute was emitted wrapped in single-quotes, so a double quote is A-OK.
174
- # assert that this attribute's serialization is unaffected.
175
- #
176
- assert_equal %{examp<!--" unsafeattr=foo()>-->le.com}, attributes.first.value
177
- else
178
- #
179
- # let's match the behavior in libxml < 2.9.2.
180
- # test that this attribute's serialization is well-formed and sanitized.
181
- #
182
- assert_equal %{examp<!--%22%20unsafeattr=foo()>-->le.com}, attributes.first.value
183
- end
184
- else
185
- #
186
- # yay for consistency in javaland. move along, nothing to see here.
187
- #
188
- assert_equal %{examp<!--%22 unsafeattr=foo()>-->le.com}, attributes.first.value
189
- end
190
- end
191
- end
192
-
193
- # see:
194
- # - https://github.com/flavorjones/loofah/issues/154
195
- # - https://hackerone.com/reports/429267
196
- context "xss protection from svg xmlns:xlink animate attribute" do
197
- it "sanitizes appropriate attributes" do
198
- html = %Q{<svg><a xmlns:xlink=http://www.w3.org/1999/xlink xlink:href=?><circle r=400 /><animate attributeName=xlink:href begin=0 from=javascript:alert(1) to=%26>}
199
- sanitized = Loofah.scrub_fragment(html, :escape)
200
- assert_nil sanitized.at_css("animate")["from"]
201
- end
202
- end
203
- end
204
- end