loofah 2.3.1

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of loofah might be problematic. Click here for more details.

Files changed (43) hide show
  1. checksums.yaml +7 -0
  2. data/.gemtest +0 -0
  3. data/CHANGELOG.md +336 -0
  4. data/Gemfile +22 -0
  5. data/MIT-LICENSE.txt +23 -0
  6. data/Manifest.txt +41 -0
  7. data/README.md +363 -0
  8. data/Rakefile +81 -0
  9. data/SECURITY.md +18 -0
  10. data/benchmark/benchmark.rb +149 -0
  11. data/benchmark/fragment.html +96 -0
  12. data/benchmark/helper.rb +73 -0
  13. data/benchmark/www.slashdot.com.html +2560 -0
  14. data/lib/loofah.rb +83 -0
  15. data/lib/loofah/elements.rb +92 -0
  16. data/lib/loofah/helpers.rb +103 -0
  17. data/lib/loofah/html/document.rb +18 -0
  18. data/lib/loofah/html/document_fragment.rb +40 -0
  19. data/lib/loofah/html5/libxml2_workarounds.rb +26 -0
  20. data/lib/loofah/html5/safelist.rb +796 -0
  21. data/lib/loofah/html5/scrub.rb +133 -0
  22. data/lib/loofah/instance_methods.rb +127 -0
  23. data/lib/loofah/metahelpers.rb +13 -0
  24. data/lib/loofah/scrubber.rb +133 -0
  25. data/lib/loofah/scrubbers.rb +297 -0
  26. data/lib/loofah/xml/document.rb +13 -0
  27. data/lib/loofah/xml/document_fragment.rb +23 -0
  28. data/test/assets/msword.html +63 -0
  29. data/test/assets/testdata_sanitizer_tests1.dat +502 -0
  30. data/test/helper.rb +18 -0
  31. data/test/html5/test_sanitizer.rb +401 -0
  32. data/test/html5/test_scrub.rb +10 -0
  33. data/test/integration/test_ad_hoc.rb +220 -0
  34. data/test/integration/test_helpers.rb +43 -0
  35. data/test/integration/test_html.rb +72 -0
  36. data/test/integration/test_scrubbers.rb +400 -0
  37. data/test/integration/test_xml.rb +55 -0
  38. data/test/unit/test_api.rb +142 -0
  39. data/test/unit/test_encoding.rb +20 -0
  40. data/test/unit/test_helpers.rb +62 -0
  41. data/test/unit/test_scrubber.rb +229 -0
  42. data/test/unit/test_scrubbers.rb +14 -0
  43. metadata +287 -0
@@ -0,0 +1,18 @@
1
+ require 'rubygems'
2
+ require 'minitest/unit'
3
+ require 'minitest/spec'
4
+ require 'minitest/autorun'
5
+ require 'rr'
6
+
7
+ require File.expand_path(File.join(File.dirname(__FILE__), "..", "lib", "loofah"))
8
+
9
+ # require the ActionView helpers here, since they are no longer required automatically
10
+ require File.expand_path(File.join(File.dirname(__FILE__), "..", "lib", "loofah", "helpers"))
11
+
12
+ puts "=> testing with Nokogiri #{Nokogiri::VERSION_INFO.inspect}"
13
+
14
+ class Loofah::TestCase < MiniTest::Spec # common parent of the test classes in this suite
15
+ class << self
16
+ alias_method :context, :describe # `context "..."` becomes a synonym for `describe "..."`
17
+ end
18
+ end
@@ -0,0 +1,401 @@
1
+ #
2
+ # these tests taken from the HTML5 sanitization project and modified for use with Loofah
3
+ # see the original here: http://code.google.com/p/html5lib/source/browse/ruby/test/test_sanitizer.rb
4
+ #
5
+ # license text at the bottom of this file
6
+ #
7
+ require "helper"
8
+
9
+ class Html5TestSanitizer < Loofah::TestCase
10
+ include Loofah
11
+
12
+ def sanitize_xhtml stream # returns the XHTML serialization of `stream` after scrubbing
13
+ Loofah.fragment(stream).scrub!(:escape).to_xhtml # parse as a fragment, run the :escape scrubber, serialize
14
+ end
15
+
16
+ def sanitize_html stream # returns the HTML serialization of `stream` after scrubbing; used by check_sanitization
17
+ Loofah.fragment(stream).scrub!(:escape).to_html # parse as a fragment, run the :escape scrubber, serialize
18
+ end
19
+
20
+ def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
21
+ ## libxml uses double-quotes, so let's swappo-boppo our quotes before comparing.
22
+ sane = sanitize_html(input).gsub('"',"'")
23
+ htmloutput = htmloutput.gsub('"',"'")
24
+ xhtmloutput = xhtmloutput.gsub('"',"'")
25
+ rexmloutput = rexmloutput.gsub('"',"'")
26
+
27
+ ## HTML5's parsers are shit. there's so much inconsistency with what has closing tags, etc, that
28
+ ## it would require a lot of manual hacking to make the tests match libxml's output.
29
+ ## instead, I'm taking the shotgun approach, and trying to match any of the described outputs.
30
+ assert((htmloutput == sane) || (rexmloutput == sane) || (xhtmloutput == sane),
31
+ %Q{given: "#{input}"\nexpected: "#{htmloutput}"\ngot: "#{sane}"})
32
+ end
33
+
34
+ def assert_completes_in_reasonable_time &block
35
+ t0 = Time.now
36
+ block.call
37
+ assert_in_delta t0, Time.now, 0.1 # arbitrary seconds
38
+ end
39
+
40
+ (HTML5::SafeList::ALLOWED_ELEMENTS).each do |tag_name|
41
+ define_method "test_should_allow_#{tag_name}_tag" do
42
+ input = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
43
+ htmloutput = "<#{tag_name.downcase} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name.downcase}>"
44
+ xhtmloutput = "<#{tag_name} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name}>"
45
+ rexmloutput = xhtmloutput
46
+
47
+ if %w[caption colgroup optgroup option tbody td tfoot th thead tr].include?(tag_name)
48
+ htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
49
+ xhtmloutput = htmloutput
50
+ elsif tag_name == 'col'
51
+ htmloutput = "<col title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
52
+ xhtmloutput = htmloutput
53
+ rexmloutput = "<col title='1' />"
54
+ elsif tag_name == 'table'
55
+ htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt;baz<table title='1'> </table>"
56
+ xhtmloutput = htmloutput
57
+ elsif tag_name == 'image'
58
+ htmloutput = "<img title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
59
+ xhtmloutput = htmloutput
60
+ rexmloutput = "<image title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</image>"
61
+ elsif HTML5::SafeList::VOID_ELEMENTS.include?(tag_name)
62
+ htmloutput = "<#{tag_name} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
63
+ xhtmloutput = htmloutput
64
+ htmloutput += '<br/>' if tag_name == 'br'
65
+ rexmloutput = "<#{tag_name} title='1' />"
66
+ end
67
+ check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
68
+ end
69
+ end
70
+
71
+ ##
72
+ ## libxml2 downcases elements, so this is moot.
73
+ ##
74
+ # HTML5::SafeList::ALLOWED_ELEMENTS.each do |tag_name|
75
+ # define_method "test_should_forbid_#{tag_name.upcase}_tag" do
76
+ # input = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
77
+ # output = "&lt;#{tag_name.upcase} title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/#{tag_name.upcase}&gt;"
78
+ # check_sanitization(input, output, output, output)
79
+ # end
80
+ # end
81
+
82
+ HTML5::SafeList::ALLOWED_ATTRIBUTES.each do |attribute_name|
83
+ next if attribute_name == 'style'
84
+ define_method "test_should_allow_#{attribute_name}_attribute" do
85
+ input = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
86
+ if %w[checked compact disabled ismap multiple nohref noshade nowrap readonly selected].include?(attribute_name)
87
+ output = "<p #{attribute_name}>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
88
+ htmloutput = "<p #{attribute_name.downcase}>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
89
+ else
90
+ output = "<p #{attribute_name}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
91
+ htmloutput = "<p #{attribute_name.downcase}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
92
+ end
93
+ check_sanitization(input, htmloutput, output, output)
94
+ end
95
+ end
96
+
97
+ def test_should_allow_data_attributes
98
+ input = "<p data-foo='foo'>foo <bad>bar</bad> baz</p>"
99
+
100
+ output = "<p data-foo='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
101
+ htmloutput = "<p data-foo='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
102
+
103
+ check_sanitization(input, htmloutput, output, output)
104
+ end
105
+
106
+ def test_should_allow_multi_word_data_attributes
107
+ input = "<p data-foo-bar-id='11'>foo <bad>bar</bad> baz</p>"
108
+ output = htmloutput = "<p data-foo-bar-id='11'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
109
+
110
+ check_sanitization(input, htmloutput, output, output)
111
+ end
112
+
113
+ def test_should_allow_contenteditable
114
+ input = '<p contenteditable="false">Hi!</p>'
115
+ output = '<p contenteditable="false">Hi!</p>'
116
+
117
+ check_sanitization(input, output, output, output)
118
+ end
119
+
120
+ ##
121
+ ## libxml2 downcases attributes, so this is moot.
122
+ ##
123
+ # HTML5::SafeList::ALLOWED_ATTRIBUTES.each do |attribute_name|
124
+ # define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
125
+ # input = "<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>"
126
+ # output = "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
127
+ # check_sanitization(input, output, output, output)
128
+ # end
129
+ # end
130
+
131
+ HTML5::SafeList::ALLOWED_PROTOCOLS.each do |protocol|
132
+ define_method "test_should_allow_#{protocol}_uris" do
133
+ input = %(<a href="#{protocol}">foo</a>)
134
+ output = "<a href='#{protocol}'>foo</a>"
135
+ check_sanitization(input, output, output, output)
136
+ end
137
+ end
138
+
139
+ HTML5::SafeList::ALLOWED_PROTOCOLS.each do |protocol|
140
+ define_method "test_should_allow_uppercase_#{protocol}_uris" do
141
+ input = %(<a href="#{protocol.upcase}">foo</a>)
142
+ output = "<a href='#{protocol.upcase}'>foo</a>"
143
+ check_sanitization(input, output, output, output)
144
+ end
145
+ end
146
+
147
+ HTML5::SafeList::ALLOWED_URI_DATA_MEDIATYPES.each do |data_uri_type|
148
+ define_method "test_should_allow_data_#{data_uri_type}_uris" do
149
+ input = %(<a href="data:#{data_uri_type}">foo</a>)
150
+ output = "<a href='data:#{data_uri_type}'>foo</a>"
151
+ check_sanitization(input, output, output, output)
152
+
153
+ input = %(<a href="data:#{data_uri_type};base64,R0lGODlhAQABA">foo</a>)
154
+ output = "<a href='data:#{data_uri_type};base64,R0lGODlhAQABA'>foo</a>"
155
+ check_sanitization(input, output, output, output)
156
+ end
157
+ end
158
+
159
+ HTML5::SafeList::ALLOWED_URI_DATA_MEDIATYPES.each do |data_uri_type|
160
+ define_method "test_should_allow_uppercase_data_#{data_uri_type}_uris" do
161
+ input = %(<a href="DATA:#{data_uri_type.upcase}">foo</a>)
162
+ output = "<a href='DATA:#{data_uri_type.upcase}'>foo</a>"
163
+ check_sanitization(input, output, output, output)
164
+ end
165
+ end
166
+
167
+ def test_should_disallow_other_uri_mediatypes
168
+ input = %(<a href="data:foo">foo</a>)
169
+ output = "<a>foo</a>"
170
+ check_sanitization(input, output, output, output)
171
+
172
+ input = %(<a href="">foo</a>)
177
+ output = "<a>foo</a>"
178
+ check_sanitization(input, output, output, output)
179
+ end
180
+
181
+
182
+ HTML5::SafeList::SVG_ALLOW_LOCAL_HREF.each do |tag_name|
183
+ next unless HTML5::SafeList::ALLOWED_ELEMENTS.include?(tag_name)
184
+ define_method "test_#{tag_name}_should_allow_local_href" do
185
+ input = %(<#{tag_name} xlink:href="#foo"/>)
186
+ output = "<#{tag_name.downcase} xlink:href='#foo'></#{tag_name.downcase}>"
187
+ xhtmloutput = "<#{tag_name} xlink:href='#foo'></#{tag_name}>"
188
+ check_sanitization(input, output, xhtmloutput, xhtmloutput)
189
+ end
190
+
191
+ define_method "test_#{tag_name}_should_allow_local_href_with_newline" do
192
+ input = %(<#{tag_name} xlink:href="\n#foo"/>)
193
+ output = "<#{tag_name.downcase} xlink:href='\n#foo'></#{tag_name.downcase}>"
194
+ xhtmloutput = "<#{tag_name} xlink:href='\n#foo'></#{tag_name}>"
195
+ check_sanitization(input, output, xhtmloutput, xhtmloutput)
196
+ end
197
+
198
+ define_method "test_#{tag_name}_should_forbid_nonlocal_href" do
199
+ input = %(<#{tag_name} xlink:href="http://bad.com/foo"/>)
200
+ output = "<#{tag_name.downcase}></#{tag_name.downcase}>"
201
+ xhtmloutput = "<#{tag_name}></#{tag_name}>"
202
+ check_sanitization(input, output, xhtmloutput, xhtmloutput)
203
+ end
204
+
205
+ define_method "test_#{tag_name}_should_forbid_nonlocal_href_with_newline" do
206
+ input = %(<#{tag_name} xlink:href="\nhttp://bad.com/foo"/>)
207
+ output = "<#{tag_name.downcase}></#{tag_name.downcase}>"
208
+ xhtmloutput = "<#{tag_name}></#{tag_name}>"
209
+ check_sanitization(input, output, xhtmloutput, xhtmloutput)
210
+ end
211
+ end
212
+
213
+ def test_figure_element_is_valid
214
+ fragment = Loofah.scrub_fragment("<span>hello</span> <figure>asd</figure>", :prune)
215
+ assert fragment.at_css("figure"), "<figure> tag was scrubbed"
216
+ end
217
+
218
+ ##
219
+ ## as tenderlove says, "care < 0"
220
+ ##
221
+ # def test_should_handle_astral_plane_characters
222
+ # input = "<p>&#x1d4b5; &#x1d538;</p>"
223
+ # output = "<p>\360\235\222\265 \360\235\224\270</p>"
224
+ # check_sanitization(input, output, output, output)
225
+
226
+ # input = "<p><tspan>\360\235\224\270</tspan> a</p>"
227
+ # output = "<p><tspan>\360\235\224\270</tspan> a</p>"
228
+ # check_sanitization(input, output, output, output)
229
+ # end
230
+
231
+ # This affects only NS4. Is it worth fixing?
232
+ # def test_javascript_includes
233
+ # input = %(<div size="&{alert('XSS')}">foo</div>)
234
+ # output = "<div>foo</div>"
235
+ # check_sanitization(input, output, output, output)
236
+ # end
237
+
238
+ ##
239
+ ## these tests primarily test the parser logic, not the sanitizer
240
+ ## logic. i call bullshit. we're not writing a test suite for
241
+ ## libxml2 here, so let's rely on the unit tests above to take care
242
+ ## of our valid elements and attributes.
243
+ ##
244
+ require 'json'
245
+ Dir[File.join(File.dirname(__FILE__), '..', 'assets', 'testdata_sanitizer_tests1.dat')].each do |filename|
246
+ JSON::parse(open(filename).read).each do |test|
247
+ it "testdata sanitizer #{test['name']}" do
248
+ check_sanitization(
249
+ test['input'],
250
+ test['output'],
251
+ test['xhtml'] || test['output'],
252
+ test['rexml'] || test['output']
253
+ )
254
+ end
255
+ end
256
+ end
257
+
258
+ ## added because we don't have any coverage above on SVG_ATTR_VAL_ALLOWS_REF
259
+ HTML5::SafeList::SVG_ATTR_VAL_ALLOWS_REF.each do |attr_name|
260
+ define_method "test_should_allow_uri_refs_in_svg_attribute_#{attr_name}" do
261
+ input = "<rect fill='url(#foo)' />"
262
+ output = "<rect fill='url(#foo)'></rect>"
263
+ check_sanitization(input, output, output, output)
264
+ end
265
+
266
+ define_method "test_absolute_uri_refs_in_svg_attribute_#{attr_name}" do
267
+ input = "<rect fill='url(http://bad.com/) #fff' />"
268
+ output = "<rect fill=' #fff'></rect>"
269
+ check_sanitization(input, output, output, output)
270
+ end
271
+ end
272
+
273
+ def test_css_list_style
274
+ html = '<ul style="list-style: none"></ul>'
275
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
276
+ assert_match %r/list-style/, sane.inner_html
277
+ end
278
+
279
+ def test_css_negative_value_sanitization
280
+ html = "<span style=\"letter-spacing:-0.03em;\">"
281
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
282
+ assert_match %r/-0.03em/, sane.inner_html
283
+ end
284
+
285
+ def test_css_negative_value_sanitization_shorthand_css_properties
286
+ html = "<span style=\"margin-left:-0.05em;\">"
287
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
288
+ assert_match %r/-0.05em/, sane.inner_html
289
+ end
290
+
291
+ def test_css_high_precision_value_shorthand_css_properties
292
+ html = "<span style=\"margin-left:0.3333333334em;\">"
293
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
294
+ assert_match %r/0.3333333334em/, sane.inner_html
295
+ end
296
+
297
+ def test_css_function_sanitization_leaves_safelisted_functions_calc
298
+ html = "<span style=\"width:calc(5%)\">"
299
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
300
+ assert_match %r/calc\(5%\)/, sane.inner_html
301
+
302
+ html = "<span style=\"width: calc(5%)\">"
303
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
304
+ assert_match %r/calc\(5%\)/, sane.inner_html
305
+ end
306
+
307
+ def test_css_function_sanitization_leaves_safelisted_functions_rgb
308
+ html = '<span style="color: rgb(255, 0, 0)">'
309
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
310
+ assert_match %r/rgb\(255, 0, 0\)/, sane.inner_html
311
+ end
312
+
313
+ def test_css_function_sanitization_leaves_safelisted_list_style_type
314
+ html = "<ol style='list-style-type:lower-greek;'></ol>"
315
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
316
+ assert_match %r/list-style-type:lower-greek/, sane.inner_html
317
+ end
318
+
319
+ def test_css_function_sanitization_strips_style_attributes_with_unsafe_functions
320
+ html = "<span style=\"width:url(data-evil-url)\">"
321
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
322
+ assert_match %r/<span><\/span>/, sane.inner_html
323
+
324
+ html = "<span style=\"width: url(data-evil-url)\">"
325
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
326
+ assert_match %r/<span><\/span>/, sane.inner_html
327
+ end
328
+
329
+ def test_issue_90_slow_regex
330
+ skip("timing tests are hard to make pass and have little regression-testing value")
331
+
332
+ html = %q{<span style="background: url('data:image/svg&#43;xml;charset=utf-8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2232%22%20height%3D%2232%22%20viewBox%3D%220%200%2032%2032%22%3E%3Cpath%20fill%3D%22%23D4C8AE%22%20d%3D%22M0%200h32v32h-32z%22%2F%3E%3Cpath%20fill%3D%22%2383604B%22%20d%3D%22M0%200h31.99v11.75h-31.99z%22%2F%3E%3Cpath%20fill%3D%22%233D2319%22%20d%3D%22M0%2011.5h32v.5h-32z%22%2F%3E%3Cpath%20fill%3D%22%23F83651%22%20d%3D%22M5%200h1v10.5h-1z%22%2F%3E%3Cpath%20fill%3D%22%23FCD050%22%20d%3D%22M6%200h1v10.5h-1z%22%2F%3E%3Cpath%20fill%3D%22%2371C797%22%20d%3D%22M7%200h1v10.5h-1z%22%2F%3E%3Cpath%20fill%3D%22%23509CF9%22%20d%3D%22M8%200h1v10.5h-1z%22%2F%3E%3ClinearGradient%20id%3D%22a%22%20gradientUnits%3D%22userSpaceOnUse%22%20x1%3D%2224.996%22%20y1%3D%2210.5%22%20x2%3D%2224.996%22%20y2%3D%224.5%22%3E%3Cstop%20offset%3D%220%22%20stop-color%3D%22%23796055%22%2F%3E%3Cstop%20offset%3D%22.434%22%20stop-color%3D%22%23614C43%22%2F%3E%3Cstop%20offset%3D%221%22%20stop-color%3D%22%233D2D28%22%2F%3E%3C%2FlinearGradient%3E%3Cpath%20fill%3D%22url(%23a)%22%20d%3D%22M28%208.5c0%201.1-.9%202-2%202h-2c-1.1%200-2-.9-2-2v-2c0-1.1.9-2%202-2h2c1.1%200%202%20.9%202%202v2z%22%2F%3E%3Cpath%20fill%3D%22%235F402E%22%20d%3D%22M28%208c0%201.1-.9%202-2%202h-2c-1.1%200-2-.9-2-2v-2c0-1.1.9-2%202-2h2c1.1%200%202%20.9%202%202v2z%22%2F%3E%3C');"></span>}
333
+
334
+ assert_completes_in_reasonable_time {
335
+ Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
336
+ }
337
+ end
338
+
339
+ def test_upper_case_css_property
340
+ html = "<div style=\"COLOR: BLUE; NOTAPROPERTY: RED;\">asdf</div>"
341
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_xml)
342
+ assert_match(/COLOR:\s*BLUE/i, sane.at_css("div")["style"])
343
+ refute_match(/NOTAPROPERTY/i, sane.at_css("div")["style"])
344
+ end
345
+
346
+ def test_many_properties_some_allowed
347
+ html = "<div style=\"background: bold notaproperty center alsonotaproperty 10px;\">asdf</div>"
348
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_xml)
349
+ assert_match(/bold\s+center\s+10px/, sane.at_css("div")["style"])
350
+ end
351
+
352
+ def test_many_properties_non_allowed
353
+ html = "<div style=\"background: notaproperty alsonotaproperty;\">asdf</div>"
354
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_xml)
355
+ assert_nil sane.at_css("div")["style"]
356
+ end
357
+
358
+ def test_svg_properties
359
+ html = "<line style='stroke-width: 10px;'></line>"
360
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_xml)
361
+ assert_match(/stroke-width:\s*10px/, sane.at_css("line")["style"])
362
+ end
363
+ end
364
+
365
+ # <html5_license>
366
+ #
367
+ # Copyright (c) 2006-2008 The Authors
368
+ #
369
+ # Contributors:
370
+ # James Graham - jg307@cam.ac.uk
371
+ # Anne van Kesteren - annevankesteren@gmail.com
372
+ # Lachlan Hunt - lachlan.hunt@lachy.id.au
373
+ # Matt McDonald - kanashii@kanashii.ca
374
+ # Sam Ruby - rubys@intertwingly.net
375
+ # Ian Hickson (Google) - ian@hixie.ch
376
+ # Thomas Broyer - t.broyer@ltgt.net
377
+ # Jacques Distler - distler@golem.ph.utexas.edu
378
+ # Henri Sivonen - hsivonen@iki.fi
379
+ # The Mozilla Foundation (contributions from Henri Sivonen since 2008)
380
+ #
381
+ # Permission is hereby granted, free of charge, to any person
382
+ # obtaining a copy of this software and associated documentation files
383
+ # (the "Software"), to deal in the Software without restriction,
384
+ # including without limitation the rights to use, copy, modify, merge,
385
+ # publish, distribute, sublicense, and/or sell copies of the Software,
386
+ # and to permit persons to whom the Software is furnished to do so,
387
+ # subject to the following conditions:
388
+ #
389
+ # The above copyright notice and this permission notice shall be
390
+ # included in all copies or substantial portions of the Software.
391
+ #
392
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
393
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
394
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
395
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
396
+ # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
397
+ # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
398
+ # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
399
+ # SOFTWARE.
400
+ #
401
+ # </html5_license>
@@ -0,0 +1,10 @@
1
+ require "helper"
2
+
3
+ class UnitHTML5Scrub < Loofah::TestCase
4
+ include Loofah
5
+
6
+ def test_scrub_css
7
+ assert_equal Loofah::HTML5::Scrub.scrub_css("background: #ABC012"), "background:#ABC012;"
8
+ assert_equal Loofah::HTML5::Scrub.scrub_css("background: #abc012"), "background:#abc012;"
9
+ end
10
+ end
@@ -0,0 +1,220 @@
1
+ require "helper"
2
+
3
+ class IntegrationTestAdHoc < Loofah::TestCase
4
+ context "blank input string" do
5
+ context "fragment" do
6
+ it "return a blank string" do
7
+ assert_equal "", Loofah.scrub_fragment("", :prune).to_s
8
+ end
9
+ end
10
+
11
+ context "document" do
12
+ it "return a blank string" do
13
+ assert_equal "", Loofah.scrub_document("", :prune).root.to_s
14
+ end
15
+ end
16
+ end
17
+
18
+ context "tests" do
19
+ MSWORD_HTML = File.read(File.join(File.dirname(__FILE__), "..", "assets", "msword.html")).freeze
20
+
21
+ def test_removal_of_illegal_tag
22
+ html = <<-HTML
23
+ following this there should be no jim tag
24
+ <jim>jim</jim>
25
+ was there?
26
+ HTML
27
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
28
+ assert sane.xpath("//jim").empty?
29
+ end
30
+
31
+ def test_removal_of_illegal_attribute
32
+ html = "<p class=bar foo=bar abbr=bar />"
33
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
34
+ node = sane.xpath("//p").first
35
+ assert node.attributes["class"]
36
+ assert node.attributes["abbr"]
37
+ assert_nil node.attributes["foo"]
38
+ end
39
+
40
+ def test_removal_of_illegal_url_in_href
41
+ html = <<-HTML
42
+ <a href='jimbo://jim.jim/'>this link should have its href removed because of illegal url</a>
43
+ <a href='http://jim.jim/'>this link should be fine</a>
44
+ HTML
45
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
46
+ nodes = sane.xpath("//a")
47
+ assert_nil nodes.first.attributes["href"]
48
+ assert nodes.last.attributes["href"]
49
+ end
50
+
51
+ def test_css_sanitization
52
+ html = "<p style='background-color: url(\"http://foo.com/\") ; background-color: #000 ;' />"
53
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
54
+ assert_match %r/#000/, sane.inner_html
55
+ refute_match %r/foo\.com/, sane.inner_html
56
+ end
57
+
58
+ def test_fragment_with_no_tags
59
+ assert_equal "This fragment has no tags.", Loofah.scrub_fragment("This fragment has no tags.", :escape).to_xml
60
+ end
61
+
62
+ def test_fragment_in_p_tag
63
+ assert_equal "<p>This fragment is in a p.</p>", Loofah.scrub_fragment("<p>This fragment is in a p.</p>", :escape).to_xml
64
+ end
65
+
66
+ def test_fragment_in_p_tag_plus_stuff
67
+ assert_equal "<p>This fragment is in a p.</p>foo<strong>bar</strong>", Loofah.scrub_fragment("<p>This fragment is in a p.</p>foo<strong>bar</strong>", :escape).to_xml
68
+ end
69
+
70
+ def test_fragment_with_text_nodes_leading_and_trailing
71
+ assert_equal "text<p>fragment</p>text", Loofah.scrub_fragment("text<p>fragment</p>text", :escape).to_xml
72
+ end
73
+
74
+ def test_whitewash_on_fragment
75
+ html = "safe<frameset rows=\"*\"><frame src=\"http://example.com\"></frameset> <b>description</b>"
76
+ whitewashed = Loofah.scrub_document(html, :whitewash).xpath("/html/body/*").to_s
77
+ assert_equal "<p>safe</p><b>description</b>", whitewashed.gsub("\n", "")
78
+ end
79
+
80
+ def test_fragment_whitewash_on_microsofty_markup
81
+ whitewashed = Loofah.fragment(MSWORD_HTML).scrub!(:whitewash)
82
+ assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed.to_s.strip
83
+ end
84
+
85
+ def test_document_whitewash_on_microsofty_markup
86
+ whitewashed = Loofah.document(MSWORD_HTML).scrub!(:whitewash)
87
+ assert_match %r(<p>Foo <b>BOLD</b></p>), whitewashed.to_s
88
+ assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed.xpath("/html/body/*").to_s
89
+ end
90
+
91
+ def test_return_empty_string_when_nothing_left
92
+ assert_equal "", Loofah.scrub_document("<script>test</script>", :prune).text
93
+ end
94
+
95
+ def test_nested_script_cdata_tags_should_be_scrubbed
96
+ html = "<script><script src='malicious.js'></script>"
97
+ stripped = Loofah.fragment(html).scrub!(:strip)
98
+ assert_empty stripped.xpath("//script")
99
+ refute_match("<script", stripped.to_html)
100
+ end
101
+
102
+ def test_nested_script_cdata_tags_should_be_scrubbed_2
103
+ html = "<script><script>alert('a');</script></script>"
104
+ stripped = Loofah.fragment(html).scrub!(:strip)
105
+ assert_empty stripped.xpath("//script")
106
+ refute_match("<script", stripped.to_html)
107
+ end
108
+
109
+ def test_removal_of_all_tags
110
+ html = <<-HTML
111
+ What's up <strong>doc</strong>?
112
+ HTML
113
+ stripped = Loofah.scrub_document(html, :prune).text
114
+ assert_equal %Q(What\'s up doc?).strip, stripped.strip
115
+ end
116
+
117
+ def test_dont_remove_whitespace
118
+ html = "Foo\nBar"
119
+ assert_equal html, Loofah.scrub_document(html, :prune).text
120
+ end
121
+
122
+ def test_dont_remove_whitespace_between_tags
123
+ html = "<p>Foo</p>\n<p>Bar</p>"
124
+ assert_equal "Foo\nBar", Loofah.scrub_document(html, :prune).text
125
+ end
126
+
127
+ #
128
+ # tests for CVE-2018-8048 (see https://github.com/flavorjones/loofah/issues/144)
129
+ #
130
+ # libxml2 >= 2.9.2 fails to escape comments within some attributes. It
131
+ # wants to ensure these comments can be treated as "server-side includes",
132
+ # but as a result fails to ensure that serialization is well-formed,
133
+ # resulting in an opportunity for XSS injection of code into a final
134
+ # re-parsed document (presumably in a browser).
135
+ #
136
+ # we'll test this by parsing the HTML, serializing it, then
137
+ # re-parsing it to ensure there isn't any ambiguity in the output
138
+ # that might allow code injection into a browser consuming
139
+ # "sanitized" output.
140
+ #
141
+ [
142
+ #
143
+ # these tags and attributes are determined by the code at:
144
+ #
145
+ # https://git.gnome.org/browse/libxml2/tree/HTMLtree.c?h=v2.9.2#n714
146
+ #
147
+ { tag: "a", attr: "href" },
148
+ { tag: "div", attr: "href" },
149
+ { tag: "a", attr: "action" },
150
+ { tag: "div", attr: "action" },
151
+ { tag: "a", attr: "src" },
152
+ { tag: "div", attr: "src" },
153
+ { tag: "a", attr: "name" },
154
+ #
155
+ # note that div+name is _not_ affected by the libxml2 issue.
156
+ # but we test it anyway to ensure our logic isn't modifying
157
+ # attributes that don't need modifying.
158
+ #
159
+ { tag: "div", attr: "name", unescaped: true },
160
+ ].each do |config|
161
+ define_method "test_uri_escaping_of_#{config[:attr]}_attr_in_#{config[:tag]}_tag" do
162
+ html = %{<#{config[:tag]} #{config[:attr]}='examp<!--" unsafeattr=foo()>-->le.com'>test</#{config[:tag]}>}
163
+
164
+ reparsed = Loofah.fragment(Loofah.fragment(html).scrub!(:prune).to_html)
165
+ attributes = reparsed.at_css(config[:tag]).attribute_nodes
166
+
167
+ assert_equal [config[:attr]], attributes.collect(&:name)
168
+ if Nokogiri::VersionInfo.instance.libxml2?
169
+ if config[:unescaped]
170
+ #
171
+ # this attribute was emitted wrapped in single-quotes, so a double quote is A-OK.
172
+ # assert that this attribute's serialization is unaffected.
173
+ #
174
+ assert_equal %{examp<!--" unsafeattr=foo()>-->le.com}, attributes.first.value
175
+ else
176
+ #
177
+ # let's match the behavior in libxml < 2.9.2.
178
+ # test that this attribute's serialization is well-formed and sanitized.
179
+ #
180
+ assert_equal %{examp<!--%22%20unsafeattr=foo()>-->le.com}, attributes.first.value
181
+ end
182
+ else
183
+ #
184
+ # yay for consistency in javaland. move along, nothing to see here.
185
+ #
186
+ assert_equal %{examp<!--%22 unsafeattr=foo()>-->le.com}, attributes.first.value
187
+ end
188
+ end
189
+ end
190
+
191
+ context "xss protection from svg animate attributes" do
192
+ # see recommendation from https://html5sec.org/#137
193
+ # to sanitize "to", "from", "values", and "by" attributes
194
+
195
+ it "sanitizes 'from', 'to', and 'by' attributes" do
196
+ # for CVE-2018-16468
197
+ # see:
198
+ # - https://github.com/flavorjones/loofah/issues/154
199
+ # - https://hackerone.com/reports/429267
200
+ html = %Q{<svg><a xmlns:xlink=http://www.w3.org/1999/xlink xlink:href=?><circle r=400 /><animate attributeName=xlink:href begin=0 from=javascript:alert(1) to=%26 by=5>}
201
+
202
+ sanitized = Loofah.scrub_fragment(html, :escape)
203
+ assert_nil sanitized.at_css("animate")["from"]
204
+ assert_nil sanitized.at_css("animate")["to"]
205
+ assert_nil sanitized.at_css("animate")["by"]
206
+ end
207
+
208
+ it "sanitizes 'values' attribute" do
209
+ # for CVE-2019-15587
210
+ # see:
211
+ # - https://github.com/flavorjones/loofah/issues/171
212
+ # - https://hackerone.com/reports/709009
213
+ html = %Q{<svg> <animate href="#foo" attributeName="href" values="javascript:alert('xss')"/> <a id="foo"> <circle r=400 /> </a> </svg>}
214
+
215
+ sanitized = Loofah.scrub_fragment(html, :escape)
216
+ assert_nil sanitized.at_css("animate")["values"]
217
+ end
218
+ end
219
+ end
220
+ end