RubyGems - loofah - Versions diffs - 2.3.1 - Mend

loofah 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43) hide show

checksums.yaml +7 -0
data/.gemtest +0 -0
data/CHANGELOG.md +336 -0
data/Gemfile +22 -0
data/MIT-LICENSE.txt +23 -0
data/Manifest.txt +41 -0
data/README.md +363 -0
data/Rakefile +81 -0
data/SECURITY.md +18 -0
data/benchmark/benchmark.rb +149 -0
data/benchmark/fragment.html +96 -0
data/benchmark/helper.rb +73 -0
data/benchmark/www.slashdot.com.html +2560 -0
data/lib/loofah.rb +83 -0
data/lib/loofah/elements.rb +92 -0
data/lib/loofah/helpers.rb +103 -0
data/lib/loofah/html/document.rb +18 -0
data/lib/loofah/html/document_fragment.rb +40 -0
data/lib/loofah/html5/libxml2_workarounds.rb +26 -0
data/lib/loofah/html5/safelist.rb +796 -0
data/lib/loofah/html5/scrub.rb +133 -0
data/lib/loofah/instance_methods.rb +127 -0
data/lib/loofah/metahelpers.rb +13 -0
data/lib/loofah/scrubber.rb +133 -0
data/lib/loofah/scrubbers.rb +297 -0
data/lib/loofah/xml/document.rb +13 -0
data/lib/loofah/xml/document_fragment.rb +23 -0
data/test/assets/msword.html +63 -0
data/test/assets/testdata_sanitizer_tests1.dat +502 -0
data/test/helper.rb +18 -0
data/test/html5/test_sanitizer.rb +401 -0
data/test/html5/test_scrub.rb +10 -0
data/test/integration/test_ad_hoc.rb +220 -0
data/test/integration/test_helpers.rb +43 -0
data/test/integration/test_html.rb +72 -0
data/test/integration/test_scrubbers.rb +400 -0
data/test/integration/test_xml.rb +55 -0
data/test/unit/test_api.rb +142 -0
data/test/unit/test_encoding.rb +20 -0
data/test/unit/test_helpers.rb +62 -0
data/test/unit/test_scrubber.rb +229 -0
data/test/unit/test_scrubbers.rb +14 -0
metadata +287 -0

data/test/helper.rb ADDED

@@ -0,0 +1,18 @@
+require 'rubygems'
+require 'minitest/unit'
+require 'minitest/spec'
+require 'minitest/autorun'
+require 'rr'
+require File.expand_path(File.join(File.dirname(__FILE__), "..", "lib", "loofah"))
+# require the ActionView helpers here, since they are no longer required automatically
+require File.expand_path(File.join(File.dirname(__FILE__), "..", "lib", "loofah", "helpers"))
+puts "=> testing with Nokogiri #{Nokogiri::VERSION_INFO.inspect}"
+class Loofah::TestCase < MiniTest::Spec
+  class << self
+    alias_method :context, :describe
+  end
+end

data/test/html5/test_sanitizer.rb ADDED

@@ -0,0 +1,401 @@
+#
+#  these tests taken from the HTML5 sanitization project and modified for use with Loofah
+#  see the original here: http://code.google.com/p/html5lib/source/browse/ruby/test/test_sanitizer.rb
+#
+#  license text at the bottom of this file
+#
+require "helper"
+class Html5TestSanitizer < Loofah::TestCase
+  include Loofah
+  def sanitize_xhtml stream
+    Loofah.fragment(stream).scrub!(:escape).to_xhtml
+  end
+  def sanitize_html stream
+    Loofah.fragment(stream).scrub!(:escape).to_html
+  end
+  def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
+    ##  libxml uses double-quotes, so let's swappo-boppo our quotes before comparing.
+    sane = sanitize_html(input).gsub('"',"'")
+    htmloutput = htmloutput.gsub('"',"'")
+    xhtmloutput = xhtmloutput.gsub('"',"'")
+    rexmloutput = rexmloutput.gsub('"',"'")
+    ##  HTML5's parsers are shit. there's so much inconsistency with what has closing tags, etc, that
+    ##  it would require a lot of manual hacking to make the tests match libxml's output.
+    ##  instead, I'm taking the shotgun approach, and trying to match any of the described outputs.
+    assert((htmloutput == sane) || (rexmloutput == sane) || (xhtmloutput == sane),
+      %Q{given:    "#{input}"\nexpected: "#{htmloutput}"\ngot:      "#{sane}"})
+  end
+  def assert_completes_in_reasonable_time &block
+    t0 = Time.now
+    block.call
+    assert_in_delta t0, Time.now, 0.1 # arbitrary seconds
+  end
+  # For every safelisted element, define a test asserting that the element
+  # (carrying a title attribute) survives :escape scrubbing while its unknown
+  # <bad> child is escaped. The branches below replace the default expected
+  # strings for elements whose expected serialization differs.
+  (HTML5::SafeList::ALLOWED_ELEMENTS).each do |tag_name|
+    define_method "test_should_allow_#{tag_name}_tag" do
+      input       = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
+      htmloutput  = "<#{tag_name.downcase} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name.downcase}>"
+      xhtmloutput = "<#{tag_name} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name}>"
+      rexmloutput = xhtmloutput
+      # table-structure and option elements: only the inner content is expected
+      if %w[caption colgroup optgroup option tbody td tfoot th thead tr].include?(tag_name)
+        htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
+        xhtmloutput = htmloutput
+      elsif tag_name == 'col'
+        htmloutput = "<col title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
+        xhtmloutput = htmloutput
+        rexmloutput = "<col title='1' />"
+      elsif tag_name == 'table'
+        htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt;baz<table title='1'> </table>"
+        xhtmloutput = htmloutput
+      elsif tag_name == 'image'
+        # <image> is expected to come back as <img> in the HTML output
+        htmloutput = "<img title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
+        xhtmloutput = htmloutput
+        rexmloutput = "<image title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</image>"
+      elsif HTML5::SafeList::VOID_ELEMENTS.include?(tag_name)
+        # void elements have no closing tag; the content follows the element
+        htmloutput = "<#{tag_name} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
+        xhtmloutput = htmloutput
+        htmloutput += '<br/>' if tag_name == 'br'
+        rexmloutput =  "<#{tag_name} title='1' />"
+      end
+      check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
+    end
+  end
+  ##
+  ##  libxml2 downcases elements, so this is moot.
+  ##
+  # HTML5::SafeList::ALLOWED_ELEMENTS.each do |tag_name|
+  #   define_method "test_should_forbid_#{tag_name.upcase}_tag" do
+  #     input = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
+  #     output = "&lt;#{tag_name.upcase} title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/#{tag_name.upcase}&gt;"
+  #     check_sanitization(input, output, output, output)
+  #   end
+  # end
+  # For every safelisted attribute except `style`, define a test asserting
+  # that the attribute is kept on a <p> while the unknown <bad> child is
+  # escaped. The attributes listed in the %w[] array below are expected to be
+  # serialized without a value.
+  HTML5::SafeList::ALLOWED_ATTRIBUTES.each do |attribute_name|
+    next if attribute_name == 'style'
+    define_method "test_should_allow_#{attribute_name}_attribute" do
+        input = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
+      if %w[checked compact disabled ismap multiple nohref noshade nowrap readonly selected].include?(attribute_name)
+        output = "<p #{attribute_name}>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
+        htmloutput = "<p #{attribute_name.downcase}>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
+      else
+        output = "<p #{attribute_name}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
+        htmloutput = "<p #{attribute_name.downcase}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
+      end
+      check_sanitization(input, htmloutput, output, output)
+    end
+  end
+  def test_should_allow_data_attributes
+    input = "<p data-foo='foo'>foo <bad>bar</bad> baz</p>"
+    output = "<p data-foo='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
+    htmloutput = "<p data-foo='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
+    check_sanitization(input, htmloutput, output, output)
+  end
+  def test_should_allow_multi_word_data_attributes
+    input = "<p data-foo-bar-id='11'>foo <bad>bar</bad> baz</p>"
+    output = htmloutput = "<p data-foo-bar-id='11'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
+    check_sanitization(input, htmloutput, output, output)
+  end
+  def test_should_allow_contenteditable
+    input = '<p contenteditable="false">Hi!</p>'
+    output = '<p contenteditable="false">Hi!</p>'
+    check_sanitization(input, output, output, output)
+  end
+  ##
+  ##  libxml2 downcases attributes, so this is moot.
+  ##
+  # HTML5::SafeList::ALLOWED_ATTRIBUTES.each do |attribute_name|
+  #   define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
+  #     input = "<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>"
+  #     output =  "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
+  #     check_sanitization(input, output, output, output)
+  #   end
+  # end
+  HTML5::SafeList::ALLOWED_PROTOCOLS.each do |protocol|
+    define_method "test_should_allow_#{protocol}_uris" do
+      input = %(<a href="#{protocol}">foo</a>)
+      output = "<a href='#{protocol}'>foo</a>"
+      check_sanitization(input, output, output, output)
+    end
+  end
+  HTML5::SafeList::ALLOWED_PROTOCOLS.each do |protocol|
+    define_method "test_should_allow_uppercase_#{protocol}_uris" do
+      input = %(<a href="#{protocol.upcase}">foo</a>)
+      output = "<a href='#{protocol.upcase}'>foo</a>"
+      check_sanitization(input, output, output, output)
+    end
+  end
+  HTML5::SafeList::ALLOWED_URI_DATA_MEDIATYPES.each do |data_uri_type|
+    define_method "test_should_allow_data_#{data_uri_type}_uris" do
+      input = %(<a href="data:#{data_uri_type}">foo</a>)
+      output = "<a href='data:#{data_uri_type}'>foo</a>"
+      check_sanitization(input, output, output, output)
+      input = %(<a href="data:#{data_uri_type};base64,R0lGODlhAQABA">foo</a>)
+      output = "<a href='data:#{data_uri_type};base64,R0lGODlhAQABA'>foo</a>"
+      check_sanitization(input, output, output, output)
+    end
+  end
+  HTML5::SafeList::ALLOWED_URI_DATA_MEDIATYPES.each do |data_uri_type|
+    define_method "test_should_allow_uppercase_data_#{data_uri_type}_uris" do
+      input = %(<a href="DATA:#{data_uri_type.upcase}">foo</a>)
+      output = "<a href='DATA:#{data_uri_type.upcase}'>foo</a>"
+      check_sanitization(input, output, output, output)
+    end
+  end
+  def test_should_disallow_other_uri_mediatypes
+    input = %(<a href="data:foo">foo</a>)
+    output = "<a>foo</a>"
+    check_sanitization(input, output, output, output)
+    input = %(<a href="data:image/xxx">foo</a>)
+    output = "<a>foo</a>"
+    check_sanitization(input, output, output, output)
+    input = %(<a href="data:image/xxx;base64,R0lGODlhAQABA">foo</a>)
+    output = "<a>foo</a>"
+    check_sanitization(input, output, output, output)
+  end
+  # For each element in SVG_ALLOW_LOCAL_HREF that is also a safelisted
+  # element, define four tests: a local (#fragment) xlink:href is kept, with
+  # or without a leading newline, and an http:// xlink:href is removed, with
+  # or without a leading newline.
+  HTML5::SafeList::SVG_ALLOW_LOCAL_HREF.each do |tag_name|
+    next unless HTML5::SafeList::ALLOWED_ELEMENTS.include?(tag_name)
+    define_method "test_#{tag_name}_should_allow_local_href" do
+      input = %(<#{tag_name} xlink:href="#foo"/>)
+      output = "<#{tag_name.downcase} xlink:href='#foo'></#{tag_name.downcase}>"
+      xhtmloutput = "<#{tag_name} xlink:href='#foo'></#{tag_name}>"
+      check_sanitization(input, output, xhtmloutput, xhtmloutput)
+    end
+    define_method "test_#{tag_name}_should_allow_local_href_with_newline" do
+      input = %(<#{tag_name} xlink:href="\n#foo"/>)
+      output = "<#{tag_name.downcase} xlink:href='\n#foo'></#{tag_name.downcase}>"
+      xhtmloutput = "<#{tag_name} xlink:href='\n#foo'></#{tag_name}>"
+      check_sanitization(input, output, xhtmloutput, xhtmloutput)
+    end
+    define_method "test_#{tag_name}_should_forbid_nonlocal_href" do
+      input = %(<#{tag_name} xlink:href="http://bad.com/foo"/>)
+      output = "<#{tag_name.downcase}></#{tag_name.downcase}>"
+      xhtmloutput = "<#{tag_name}></#{tag_name}>"
+      check_sanitization(input, output, xhtmloutput, xhtmloutput)
+    end
+    define_method "test_#{tag_name}_should_forbid_nonlocal_href_with_newline" do
+      input = %(<#{tag_name} xlink:href="\nhttp://bad.com/foo"/>)
+      output = "<#{tag_name.downcase}></#{tag_name.downcase}>"
+      xhtmloutput = "<#{tag_name}></#{tag_name}>"
+      check_sanitization(input, output, xhtmloutput, xhtmloutput)
+    end
+  end
+  def test_figure_element_is_valid
+    fragment = Loofah.scrub_fragment("<span>hello</span> <figure>asd</figure>", :prune)
+    assert fragment.at_css("figure"), "<figure> tag was scrubbed"
+  end
+  ##
+  ##  as tenderlove says, "care < 0"
+  ##
+  # def test_should_handle_astral_plane_characters
+  #   input = "<p>&#x1d4b5; &#x1d538;</p>"
+  #   output = "<p>\360\235\222\265 \360\235\224\270</p>"
+  #   check_sanitization(input, output, output, output)
+  #   input = "<p><tspan>\360\235\224\270</tspan> a</p>"
+  #   output = "<p><tspan>\360\235\224\270</tspan> a</p>"
+  #   check_sanitization(input, output, output, output)
+  # end
+# This affects only NS4. Is it worth fixing?
+#  def test_javascript_includes
+#    input = %(<div size="&{alert('XSS')}">foo</div>)
+#    output = "<div>foo</div>"
+#    check_sanitization(input, output, output, output)
+#  end
+  ##
+  ##  These tests primarily exercise the parser logic rather than the
+  ##  sanitizer logic. We are not writing a test suite for libxml2 here,
+  ##  so we rely on the unit tests above to take care of our valid
+  ##  elements and attributes.
+  ##
+  require 'json'
+  Dir[File.join(File.dirname(__FILE__), '..', 'assets', 'testdata_sanitizer_tests1.dat')].each do |filename|
+    JSON::parse(open(filename).read).each do |test|
+      it "testdata sanitizer #{test['name']}" do
+        check_sanitization(
+          test['input'],
+          test['output'],
+          test['xhtml'] || test['output'],
+          test['rexml'] || test['output']
+        )
+      end
+    end
+  end
+  ## added because we don't have any coverage above on SVG_ATTR_VAL_ALLOWS_REF
+  ##
+  ## Two tests are defined per attribute name: a local url(#...) reference is
+  ## kept, and an absolute url(http://...) reference is removed from the value.
+  ## NOTE(review): attr_name is only interpolated into the test names; the
+  ## markup in every generated test uses the `fill` attribute. Confirm whether
+  ## the markup was meant to use attr_name instead.
+  HTML5::SafeList::SVG_ATTR_VAL_ALLOWS_REF.each do |attr_name|
+    define_method "test_should_allow_uri_refs_in_svg_attribute_#{attr_name}" do
+      input = "<rect fill='url(#foo)' />"
+      output = "<rect fill='url(#foo)'></rect>"
+      check_sanitization(input, output, output, output)
+    end
+    define_method "test_absolute_uri_refs_in_svg_attribute_#{attr_name}" do
+      input = "<rect fill='url(http://bad.com/) #fff' />"
+      output = "<rect fill='  #fff'></rect>"
+      check_sanitization(input, output, output, output)
+    end
+  end
+  def test_css_list_style
+    html = '<ul style="list-style: none"></ul>'
+    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
+    assert_match %r/list-style/, sane.inner_html
+  end
+  def test_css_negative_value_sanitization
+    html = "<span style=\"letter-spacing:-0.03em;\">"
+    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
+    assert_match %r/-0.03em/, sane.inner_html
+  end
+  def test_css_negative_value_sanitization_shorthand_css_properties
+    html = "<span style=\"margin-left:-0.05em;\">"
+    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
+    assert_match %r/-0.05em/, sane.inner_html
+  end
+  def test_css_high_precision_value_shorthand_css_properties
+    html = "<span style=\"margin-left:0.3333333334em;\">"
+    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
+    assert_match %r/0.3333333334em/, sane.inner_html
+  end
+  def test_css_function_sanitization_leaves_safelisted_functions_calc
+    html = "<span style=\"width:calc(5%)\">"
+    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
+    assert_match %r/calc\(5%\)/, sane.inner_html
+    html = "<span style=\"width: calc(5%)\">"
+    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
+    assert_match %r/calc\(5%\)/, sane.inner_html
+  end
+  def test_css_function_sanitization_leaves_safelisted_functions_rgb
+    html = '<span style="color: rgb(255, 0, 0)">'
+    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
+    assert_match %r/rgb\(255, 0, 0\)/, sane.inner_html
+  end
+  def test_css_function_sanitization_leaves_safelisted_list_style_type
+    html = "<ol style='list-style-type:lower-greek;'></ol>"
+    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
+    assert_match %r/list-style-type:lower-greek/, sane.inner_html
+  end
+  def test_css_function_sanitization_strips_style_attributes_with_unsafe_functions
+    html = "<span style=\"width:url(data-evil-url)\">"
+    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
+    assert_match %r/<span><\/span>/, sane.inner_html
+    html = "<span style=\"width: url(data-evil-url)\">"
+    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
+    assert_match %r/<span><\/span>/, sane.inner_html
+  end
+  # Timing test for issue 90: scrubbing a style attribute that holds a long
+  # data: URI should complete within the time budget. The test is currently
+  # skipped, so nothing after the skip call runs.
+  def test_issue_90_slow_regex
+    skip("timing tests are hard to make pass and have little regression-testing value")
+    html = %q{<span style="background: url('data:image/svg&#43;xml;charset=utf-8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20width%3D%2232%22%20height%3D%2232%22%20viewBox%3D%220%200%2032%2032%22%3E%3Cpath%20fill%3D%22%23D4C8AE%22%20d%3D%22M0%200h32v32h-32z%22%2F%3E%3Cpath%20fill%3D%22%2383604B%22%20d%3D%22M0%200h31.99v11.75h-31.99z%22%2F%3E%3Cpath%20fill%3D%22%233D2319%22%20d%3D%22M0%2011.5h32v.5h-32z%22%2F%3E%3Cpath%20fill%3D%22%23F83651%22%20d%3D%22M5%200h1v10.5h-1z%22%2F%3E%3Cpath%20fill%3D%22%23FCD050%22%20d%3D%22M6%200h1v10.5h-1z%22%2F%3E%3Cpath%20fill%3D%22%2371C797%22%20d%3D%22M7%200h1v10.5h-1z%22%2F%3E%3Cpath%20fill%3D%22%23509CF9%22%20d%3D%22M8%200h1v10.5h-1z%22%2F%3E%3ClinearGradient%20id%3D%22a%22%20gradientUnits%3D%22userSpaceOnUse%22%20x1%3D%2224.996%22%20y1%3D%2210.5%22%20x2%3D%2224.996%22%20y2%3D%224.5%22%3E%3Cstop%20offset%3D%220%22%20stop-color%3D%22%23796055%22%2F%3E%3Cstop%20offset%3D%22.434%22%20stop-color%3D%22%23614C43%22%2F%3E%3Cstop%20offset%3D%221%22%20stop-color%3D%22%233D2D28%22%2F%3E%3C%2FlinearGradient%3E%3Cpath%20fill%3D%22url(%23a)%22%20d%3D%22M28%208.5c0%201.1-.9%202-2%202h-2c-1.1%200-2-.9-2-2v-2c0-1.1.9-2%202-2h2c1.1%200%202%20.9%202%202v2z%22%2F%3E%3Cpath%20fill%3D%22%235F402E%22%20d%3D%22M28%208c0%201.1-.9%202-2%202h-2c-1.1%200-2-.9-2-2v-2c0-1.1.9-2%202-2h2c1.1%200%202%20.9%202%202v2z%22%2F%3E%3C');"></span>}
+    assert_completes_in_reasonable_time {
+      Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
+    }
+  end
+  def test_upper_case_css_property
+    html = "<div style=\"COLOR: BLUE; NOTAPROPERTY: RED;\">asdf</div>"
+    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_xml)
+    assert_match(/COLOR:\s*BLUE/i, sane.at_css("div")["style"])
+    refute_match(/NOTAPROPERTY/i, sane.at_css("div")["style"])
+  end
+  def test_many_properties_some_allowed
+    html = "<div style=\"background: bold notaproperty center alsonotaproperty 10px;\">asdf</div>"
+    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_xml)
+    assert_match(/bold\s+center\s+10px/, sane.at_css("div")["style"])
+  end
+  def test_many_properties_non_allowed
+    html = "<div style=\"background: notaproperty alsonotaproperty;\">asdf</div>"
+    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_xml)
+    assert_nil sane.at_css("div")["style"]
+  end
+  def test_svg_properties
+    html = "<line style='stroke-width: 10px;'></line>"
+    sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_xml)
+    assert_match(/stroke-width:\s*10px/, sane.at_css("line")["style"])
+  end
+end
+# <html5_license>
+#
+# Copyright (c) 2006-2008 The Authors
+#
+# Contributors:
+# James Graham - jg307@cam.ac.uk
+# Anne van Kesteren - annevankesteren@gmail.com
+# Lachlan Hunt - lachlan.hunt@lachy.id.au
+# Matt McDonald - kanashii@kanashii.ca
+# Sam Ruby - rubys@intertwingly.net
+# Ian Hickson (Google) - ian@hixie.ch
+# Thomas Broyer - t.broyer@ltgt.net
+# Jacques Distler - distler@golem.ph.utexas.edu
+# Henri Sivonen - hsivonen@iki.fi
+# The Mozilla Foundation (contributions from Henri Sivonen since 2008)
+#
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation files
+# (the "Software"), to deal in the Software without restriction,
+# including without limitation the rights to use, copy, modify, merge,
+# publish, distribute, sublicense, and/or sell copies of the Software,
+# and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+# </html5_license>

data/test/html5/test_scrub.rb ADDED

@@ -0,0 +1,10 @@
+require "helper"
+class UnitHTML5Scrub < Loofah::TestCase
+  include Loofah
+  def test_scrub_css
+    assert_equal Loofah::HTML5::Scrub.scrub_css("background: #ABC012"), "background:#ABC012;"
+    assert_equal Loofah::HTML5::Scrub.scrub_css("background: #abc012"), "background:#abc012;"
+  end
+end

data/test/integration/test_ad_hoc.rb ADDED

@@ -0,0 +1,220 @@
+require "helper"
+class IntegrationTestAdHoc < Loofah::TestCase
+  context "blank input string" do
+    context "fragment" do
+      it "return a blank string" do
+        assert_equal "", Loofah.scrub_fragment("", :prune).to_s
+      end
+    end
+    context "document" do
+      it "return a blank string" do
+        assert_equal "", Loofah.scrub_document("", :prune).root.to_s
+      end
+    end
+  end
+  context "tests" do
+    MSWORD_HTML = File.read(File.join(File.dirname(__FILE__), "..", "assets", "msword.html")).freeze
+    def test_removal_of_illegal_tag
+      html = <<-HTML
+      following this there should be no jim tag
+      <jim>jim</jim>
+      was there?
+    HTML
+      sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
+      assert sane.xpath("//jim").empty?
+    end
+    def test_removal_of_illegal_attribute
+      html = "<p class=bar foo=bar abbr=bar />"
+      sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
+      node = sane.xpath("//p").first
+      assert node.attributes["class"]
+      assert node.attributes["abbr"]
+      assert_nil node.attributes["foo"]
+    end
+    def test_removal_of_illegal_url_in_href
+      html = <<-HTML
+      <a href='jimbo://jim.jim/'>this link should have its href removed because of illegal url</a>
+      <a href='http://jim.jim/'>this link should be fine</a>
+    HTML
+      sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
+      nodes = sane.xpath("//a")
+      assert_nil nodes.first.attributes["href"]
+      assert nodes.last.attributes["href"]
+    end
+    def test_css_sanitization
+      html = "<p style='background-color: url(\"http://foo.com/\") ; background-color: #000 ;' />"
+      sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
+      assert_match %r/#000/, sane.inner_html
+      refute_match %r/foo\.com/, sane.inner_html
+    end
+    def test_fragment_with_no_tags
+      assert_equal "This fragment has no tags.", Loofah.scrub_fragment("This fragment has no tags.", :escape).to_xml
+    end
+    def test_fragment_in_p_tag
+      assert_equal "<p>This fragment is in a p.</p>", Loofah.scrub_fragment("<p>This fragment is in a p.</p>", :escape).to_xml
+    end
+    def test_fragment_in_p_tag_plus_stuff
+      assert_equal "<p>This fragment is in a p.</p>foo<strong>bar</strong>", Loofah.scrub_fragment("<p>This fragment is in a p.</p>foo<strong>bar</strong>", :escape).to_xml
+    end
+    def test_fragment_with_text_nodes_leading_and_trailing
+      assert_equal "text<p>fragment</p>text", Loofah.scrub_fragment("text<p>fragment</p>text", :escape).to_xml
+    end
+    def test_whitewash_on_fragment
+      html = "safe<frameset rows=\"*\"><frame src=\"http://example.com\"></frameset> <b>description</b>"
+      whitewashed = Loofah.scrub_document(html, :whitewash).xpath("/html/body/*").to_s
+      assert_equal "<p>safe</p><b>description</b>", whitewashed.gsub("\n", "")
+    end
+    def test_fragment_whitewash_on_microsofty_markup
+      whitewashed = Loofah.fragment(MSWORD_HTML).scrub!(:whitewash)
+      assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed.to_s.strip
+    end
+    def test_document_whitewash_on_microsofty_markup
+      whitewashed = Loofah.document(MSWORD_HTML).scrub!(:whitewash)
+      assert_match %r(<p>Foo <b>BOLD</b></p>), whitewashed.to_s
+      assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed.xpath("/html/body/*").to_s
+    end
+    def test_return_empty_string_when_nothing_left
+      assert_equal "", Loofah.scrub_document("<script>test</script>", :prune).text
+    end
+    def test_nested_script_cdata_tags_should_be_scrubbed
+      html = "<script><script src='malicious.js'></script>"
+      stripped = Loofah.fragment(html).scrub!(:strip)
+      assert_empty stripped.xpath("//script")
+      refute_match("<script", stripped.to_html)
+    end
+    def test_nested_script_cdata_tags_should_be_scrubbed_2
+      html = "<script><script>alert('a');</script></script>"
+      stripped = Loofah.fragment(html).scrub!(:strip)
+      assert_empty stripped.xpath("//script")
+      refute_match("<script", stripped.to_html)
+    end
+    def test_removal_of_all_tags
+      html = <<-HTML
+      What's up <strong>doc</strong>?
+    HTML
+      stripped = Loofah.scrub_document(html, :prune).text
+      assert_equal %Q(What\'s up doc?).strip, stripped.strip
+    end
+    def test_dont_remove_whitespace
+      html = "Foo\nBar"
+      assert_equal html, Loofah.scrub_document(html, :prune).text
+    end
+    def test_dont_remove_whitespace_between_tags
+      html = "<p>Foo</p>\n<p>Bar</p>"
+      assert_equal "Foo\nBar", Loofah.scrub_document(html, :prune).text
+    end
+    #
+    #  tests for CVE-2018-8048 (see https://github.com/flavorjones/loofah/issues/144)
+    #
+    #  libxml2 >= 2.9.2 fails to escape comments within some attributes. It
+    #  wants to ensure these comments can be treated as "server-side includes",
+    #  but as a result fails to ensure that serialization is well-formed,
+    #  resulting in an opportunity for XSS injection of code into a final
+    #  re-parsed document (presumably in a browser).
+    #
+    #  we'll test this by parsing the HTML, serializing it, then
+    #  re-parsing it to ensure there isn't any ambiguity in the output
+    #  that might allow code injection into a browser consuming
+    #  "sanitized" output.
+    #
+    # One test is generated per tag/attribute pair listed below. An entry
+    # with unescaped: true expects the attribute value to come back unchanged.
+    [
+      #
+      #  these tags and attributes are determined by the code at:
+      #
+      #    https://git.gnome.org/browse/libxml2/tree/HTMLtree.c?h=v2.9.2#n714
+      #
+      { tag: "a", attr: "href" },
+      { tag: "div", attr: "href" },
+      { tag: "a", attr: "action" },
+      { tag: "div", attr: "action" },
+      { tag: "a", attr: "src" },
+      { tag: "div", attr: "src" },
+      { tag: "a", attr: "name" },
+      #
+      #  note that div+name is _not_ affected by the libxml2 issue.
+      #  but we test it anyway to ensure our logic isn't modifying
+      #  attributes that don't need modifying.
+      #
+      { tag: "div", attr: "name", unescaped: true },
+    ].each do |config|
+      define_method "test_uri_escaping_of_#{config[:attr]}_attr_in_#{config[:tag]}_tag" do
+        html = %{<#{config[:tag]} #{config[:attr]}='examp<!--" unsafeattr=foo()>-->le.com'>test</#{config[:tag]}>}
+        # scrub, serialize, then parse the serialized output again
+        reparsed = Loofah.fragment(Loofah.fragment(html).scrub!(:prune).to_html)
+        attributes = reparsed.at_css(config[:tag]).attribute_nodes
+        # only the original attribute may exist; "unsafeattr" must not appear
+        assert_equal [config[:attr]], attributes.collect(&:name)
+        if Nokogiri::VersionInfo.instance.libxml2?
+          if config[:unescaped]
+            #
+            #  this attribute was emitted wrapped in single-quotes, so a double quote is A-OK.
+            #  assert that this attribute's serialization is unaffected.
+            #
+            assert_equal %{examp<!--" unsafeattr=foo()>-->le.com}, attributes.first.value
+          else
+            #
+            #  let's match the behavior in libxml < 2.9.2.
+            #  test that this attribute's serialization is well-formed and sanitized.
+            #
+            assert_equal %{examp<!--%22%20unsafeattr=foo()>-->le.com}, attributes.first.value
+          end
+        else
+          #
+          #  yay for consistency in javaland. move along, nothing to see here.
+          #
+          assert_equal %{examp<!--%22 unsafeattr=foo()>-->le.com}, attributes.first.value
+        end
+      end
+    end
+    context "xss protection from svg animate attributes" do
+      # see recommendation from https://html5sec.org/#137
+      # to sanitize "to", "from", "values", and "by" attributes
+      it "sanitizes 'from', 'to', and 'by' attributes" do
+        # for CVE-2018-16468
+        # see:
+        # - https://github.com/flavorjones/loofah/issues/154
+        # - https://hackerone.com/reports/429267
+        html = %Q{<svg><a xmlns:xlink=http://www.w3.org/1999/xlink xlink:href=?><circle r=400 /><animate attributeName=xlink:href begin=0 from=javascript:alert(1) to=%26 by=5>}
+        sanitized = Loofah.scrub_fragment(html, :escape)
+        assert_nil sanitized.at_css("animate")["from"]
+        assert_nil sanitized.at_css("animate")["to"]
+        assert_nil sanitized.at_css("animate")["by"]
+      end
+      it "sanitizes 'values' attribute" do
+        # for CVE-2019-15587
+        # see:
+        # - https://github.com/flavorjones/loofah/issues/171
+        # - https://hackerone.com/reports/709009
+        html = %Q{<svg> <animate href="#foo" attributeName="href" values="javascript:alert('xss')"/> <a id="foo"> <circle r=400 /> </a> </svg>}
+        sanitized = Loofah.scrub_fragment(html, :escape)
+        assert_nil sanitized.at_css("animate")["values"]
+      end
+    end
+  end
+end