loofah 2.1.1 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of loofah might be problematic; review the changes below for details.

@@ -1,5 +1,3 @@
- #encoding: US-ASCII
-
  require 'cgi'
  require 'crass'
 
@@ -8,13 +6,13 @@ module Loofah
  module Scrub
 
  CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/
- CSS_KEYWORDISH = /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
+ CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
  CRASS_SEMICOLON = {:node => :semicolon, :raw => ";"}
 
  class << self
 
  def allowed_element? element_name
- ::Loofah::HTML5::WhiteList::ALLOWED_ELEMENTS_WITH_LIBXML2.include? element_name
+ ::Loofah::HTML5::SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2.include? element_name
  end
 
  # alternative implementation of the html5lib attribute scrubbing algorithm
@@ -30,31 +28,31 @@ module Loofah
  next
  end
 
- unless WhiteList::ALLOWED_ATTRIBUTES.include?(attr_name)
+ unless SafeList::ALLOWED_ATTRIBUTES.include?(attr_name)
  attr_node.remove
  next
  end
 
- if WhiteList::ATTR_VAL_IS_URI.include?(attr_name)
+ if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
  # this block lifted nearly verbatim from HTML5 sanitization
  val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS,'').downcase
- if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && ! WhiteList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[0])
+ if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && ! SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
  attr_node.remove
  next
- elsif val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[0] == 'data'
+ elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == 'data'
  # permit only allowed data mediatypes
- mediatype = val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[1]
+ mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
  mediatype, _ = mediatype.split(';')[0..1] if mediatype
- if mediatype && !WhiteList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
+ if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
  attr_node.remove
  next
  end
  end
  end
- if WhiteList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
+ if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
  attr_node.value = attr_node.value.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, ' ') if attr_node.value
  end
- if WhiteList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == 'xlink:href' && attr_node.value =~ /^\s*[^#\s].*/m
+ if SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == 'xlink:href' && attr_node.value =~ /^\s*[^#\s].*/m
  attr_node.remove
  next
  end
@@ -65,6 +63,8 @@ module Loofah
  node.attribute_nodes.each do |attr_node|
  node.remove_attribute(attr_node.name) if attr_node.value !~ /[^[:space:]]/
  end
+
+ force_correct_attribute_escaping! node
  end
 
  def scrub_css_attribute node
@@ -79,14 +79,14 @@ module Loofah
  style_tree.each do |node|
  next unless node[:node] == :property
  next if node[:children].any? do |child|
- [:url, :bad_url, :function].include? child[:node]
+ [:url, :bad_url].include?(child[:node]) || (child[:node] == :function && !SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase))
  end
  name = node[:name].downcase
- if WhiteList::ALLOWED_CSS_PROPERTIES.include?(name) || WhiteList::ALLOWED_SVG_PROPERTIES.include?(name)
+ if SafeList::ALLOWED_CSS_PROPERTIES.include?(name) || SafeList::ALLOWED_SVG_PROPERTIES.include?(name)
  sanitized_tree << node << CRASS_SEMICOLON
- elsif WhiteList::SHORTHAND_CSS_PROPERTIES.include?(name.split('-').first)
+ elsif SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split('-').first)
  value = node[:value].split.map do |keyword|
- if WhiteList::ALLOWED_CSS_KEYWORDS.include?(keyword) || keyword =~ CSS_KEYWORDISH
+ if SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) || keyword =~ CSS_KEYWORDISH
  keyword
  end
  end.compact
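
A minimal sketch (not part of the diff) of what the new ALLOWED_CSS_FUNCTIONS check in this hunk does in practice. Per the sanitizer tests added later in this diff, calc() and rgb() are on that safelist while url() is not, so a property using a non-safelisted function is dropped and the resulting empty style attribute is removed:

    require "loofah"

    Loofah.fragment(%q{<span style="width: calc(5%)"></span>}).scrub!(:strip).to_html
    # the style attribute survives; the output still contains "calc(5%)"

    Loofah.fragment(%q{<span style="width: url(data-evil-url)"></span>}).scrub!(:strip).to_html
    # the property is rejected and the now-empty style attribute is removed,
    # leaving a bare <span></span>
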
@@ -100,6 +100,33 @@ module Loofah
 
  Crass::Parser.stringify sanitized_tree
  end
+
+ #
+ # libxml2 >= 2.9.2 fails to escape comments within some attributes.
+ #
+ # see comments about CVE-2018-8048 within the tests for more information
+ #
+ def force_correct_attribute_escaping! node
+ return unless Nokogiri::VersionInfo.instance.libxml2?
+
+ node.attribute_nodes.each do |attr_node|
+ next unless LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES.include?(attr_node.name)
+
+ tag_name = LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES_QUALIFYING_TAG[attr_node.name]
+ next unless tag_name.nil? || tag_name == node.name
+
+ #
+ # this block is just like CGI.escape in Ruby 2.4, but
+ # only encodes space and double-quote, to mimic
+ # pre-2.9.2 behavior
+ #
+ encoding = attr_node.value.encoding
+ attr_node.value = attr_node.value.gsub(/[ "]/) do |m|
+ '%' + m.unpack('H2' * m.bytesize).join('%').upcase
+ end.force_encoding(encoding)
+ end
+ end
+
  end
  end
  end
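
A minimal sketch (not part of the diff) of the effect of the force_correct_attribute_escaping! workaround added above, assuming a libxml2-based Nokogiri; the expected values come from the CVE-2018-8048 tests later in this diff (JRuby encodes only the double quote). The point is that a double quote smuggled into an href can no longer break the attribute open when the scrubbed output is serialized and re-parsed:

    require "loofah"

    html     = %q{<a href='examp<!--" unsafeattr=foo()>-->le.com'>test</a>}
    scrubbed = Loofah.fragment(html).scrub!(:prune).to_html
    reparsed = Loofah.fragment(scrubbed)

    # space and double-quote are percent-encoded, so the re-parse sees a single,
    # well-formed href instead of an injected attribute
    reparsed.at_css("a")["href"]  # => "examp<!--%22%20unsafeattr=foo()>-->le.com"
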
@@ -1,7 +1,7 @@
  module Loofah
  #
  # Loofah provides some built-in scrubbers for sanitizing with
- # HTML5lib's whitelist and for accomplishing some common
+ # HTML5lib's safelist and for accomplishing some common
  # transformation tasks.
  #
  #
@@ -99,7 +99,12 @@ module Loofah
 
  def scrub(node)
  return CONTINUE if html5lib_sanitize(node) == CONTINUE
- node.before node.children
+ if node.children.length == 1 && node.children.first.cdata?
+ sanitized_text = Loofah.fragment(node.children.first.to_html).scrub!(:strip).to_html
+ node.before Nokogiri::XML::Text.new(sanitized_text, node.document)
+ else
+ node.before node.children
+ end
  node.remove
  end
  end
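
The new branch in the :strip scrubber above special-cases an element whose only child is a CDATA/text node: instead of hoisting the raw text into the document, it re-scrubs it first. A minimal sketch (not part of the diff) of the effect, mirroring the nested-script tests added later in this diff:

    require "loofah"

    stripped = Loofah.fragment("<script><script src='malicious.js'></script>").scrub!(:strip)

    stripped.xpath("//script").empty?     # => true  (no script element survives)
    stripped.to_html.include?("<script")  # => false (the nested tag is not re-emitted as live markup)
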
@@ -0,0 +1,63 @@
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8"><meta name="ProgId" content="Word.Document"><meta name="Generator" content="Microsoft Word 11"><meta name="Originator" content="Microsoft Word 11"><link rel="File-List" href="file:///C:%5CDOCUME%7E1%5CNICOLE%7E1%5CLOCALS%7E1%5CTemp%5Cmsohtml1%5C01%5Cclip_filelist.xml"><!--[if gte mso 9]><xml>
+ <w:WordDocument>
+ <w:View>Normal</w:View>
+ <w:Zoom>0</w:Zoom>
+ <w:PunctuationKerning/>
+ <w:ValidateAgainstSchemas/>
+ <w:SaveIfXMLInvalid>false</w:SaveIfXMLInvalid>
+ <w:IgnoreMixedContent>false</w:IgnoreMixedContent>
+ <w:AlwaysShowPlaceholderText>false</w:AlwaysShowPlaceholderText>
+ <w:Compatibility>
+ <w:BreakWrappedTables/>
+ <w:SnapToGridInCell/>
+ <w:WrapTextWithPunct/>
+ <w:UseAsianBreakRules/>
+ <w:DontGrowAutofit/>
+ </w:Compatibility>
+ <w:BrowserLevel>MicrosoftInternetExplorer4</w:BrowserLevel>
+ </w:WordDocument>
+ </xml><![endif]--><!--[if gte mso 9]><xml>
+ <w:LatentStyles DefLockedState="false" LatentStyleCount="156">
+ </w:LatentStyles>
+ </xml><![endif]--><style>
+ <!--
+ /* Style Definitions */
+ p.MsoNormal, li.MsoNormal, div.MsoNormal
+ {mso-style-parent:"";
+ margin:0in;
+ margin-bottom:.0001pt;
+ mso-pagination:widow-orphan;
+ font-size:12.0pt;
+ font-family:"Times New Roman";
+ mso-fareast-font-family:"Times New Roman";}
+ @page Section1
+ {size:8.5in 11.0in;
+ margin:1.0in 1.25in 1.0in 1.25in;
+ mso-header-margin:.5in;
+ mso-footer-margin:.5in;
+ mso-paper-source:0;}
+ div.Section1
+ {page:Section1;}
+ -->
+ </style><!--[if gte mso 10]>
+ <style>
+ /* Style Definitions */
+ table.MsoNormalTable
+ {mso-style-name:"Table Normal";
+ mso-tstyle-rowband-size:0;
+ mso-tstyle-colband-size:0;
+ mso-style-noshow:yes;
+ mso-style-parent:"";
+ mso-padding-alt:0in 5.4pt 0in 5.4pt;
+ mso-para-margin:0in;
+ mso-para-margin-bottom:.0001pt;
+ mso-pagination:widow-orphan;
+ font-size:10.0pt;
+ font-family:"Times New Roman";
+ mso-ansi-language:#0400;
+ mso-fareast-language:#0400;
+ mso-bidi-language:#0400;}
+ </style>
+ <![endif]-->
+
+ <p class="MsoNormal">Foo <b style="">BOLD<o:p></o:p></b></p>
@@ -20,9 +20,9 @@ class Html5TestSanitizer < Loofah::TestCase
  def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
  ## libxml uses double-quotes, so let's swappo-boppo our quotes before comparing.
  sane = sanitize_html(input).gsub('"',"'")
- htmloutput.gsub!('"',"'")
- xhtmloutput.gsub!('"',"'")
- rexmloutput.gsub!('"',"'")
+ htmloutput = htmloutput.gsub('"',"'")
+ xhtmloutput = xhtmloutput.gsub('"',"'")
+ rexmloutput = rexmloutput.gsub('"',"'")
 
  ## HTML5's parsers are shit. there's so much inconsistency with what has closing tags, etc, that
  ## it would require a lot of manual hacking to make the tests match libxml's output.
@@ -37,7 +37,7 @@ class Html5TestSanitizer < Loofah::TestCase
  assert_in_delta t0, Time.now, 0.1 # arbitrary seconds
  end
 
- (HTML5::WhiteList::ALLOWED_ELEMENTS).each do |tag_name|
+ (HTML5::SafeList::ALLOWED_ELEMENTS).each do |tag_name|
  define_method "test_should_allow_#{tag_name}_tag" do
  input = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
  htmloutput = "<#{tag_name.downcase} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name.downcase}>"
@@ -58,7 +58,7 @@ class Html5TestSanitizer < Loofah::TestCase
  htmloutput = "<img title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
  xhtmloutput = htmloutput
  rexmloutput = "<image title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</image>"
- elsif HTML5::WhiteList::VOID_ELEMENTS.include?(tag_name)
+ elsif HTML5::SafeList::VOID_ELEMENTS.include?(tag_name)
  htmloutput = "<#{tag_name} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
  xhtmloutput = htmloutput
  htmloutput += '<br/>' if tag_name == 'br'
@@ -71,7 +71,7 @@ class Html5TestSanitizer < Loofah::TestCase
  ##
  ## libxml2 downcases elements, so this is moot.
  ##
- # HTML5::WhiteList::ALLOWED_ELEMENTS.each do |tag_name|
+ # HTML5::SafeList::ALLOWED_ELEMENTS.each do |tag_name|
  # define_method "test_should_forbid_#{tag_name.upcase}_tag" do
  # input = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
  # output = "&lt;#{tag_name.upcase} title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/#{tag_name.upcase}&gt;"
@@ -79,7 +79,7 @@ class Html5TestSanitizer < Loofah::TestCase
  # end
  # end
 
- HTML5::WhiteList::ALLOWED_ATTRIBUTES.each do |attribute_name|
+ HTML5::SafeList::ALLOWED_ATTRIBUTES.each do |attribute_name|
  next if attribute_name == 'style'
  define_method "test_should_allow_#{attribute_name}_attribute" do
  input = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
@@ -110,10 +110,17 @@ class Html5TestSanitizer < Loofah::TestCase
  check_sanitization(input, htmloutput, output, output)
  end
 
+ def test_should_allow_contenteditable
+ input = '<p contenteditable="false">Hi!</p>'
+ output = '<p contenteditable="false">Hi!</p>'
+
+ check_sanitization(input, output, output, output)
+ end
+
  ##
  ## libxml2 downcases attributes, so this is moot.
  ##
- # HTML5::WhiteList::ALLOWED_ATTRIBUTES.each do |attribute_name|
+ # HTML5::SafeList::ALLOWED_ATTRIBUTES.each do |attribute_name|
  # define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
  # input = "<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>"
  # output = "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
@@ -121,7 +128,7 @@ class Html5TestSanitizer < Loofah::TestCase
  # end
  # end
 
- HTML5::WhiteList::ALLOWED_PROTOCOLS.each do |protocol|
+ HTML5::SafeList::ALLOWED_PROTOCOLS.each do |protocol|
  define_method "test_should_allow_#{protocol}_uris" do
  input = %(<a href="#{protocol}">foo</a>)
  output = "<a href='#{protocol}'>foo</a>"
@@ -129,15 +136,15 @@ class Html5TestSanitizer < Loofah::TestCase
  end
  end
 
- HTML5::WhiteList::ALLOWED_PROTOCOLS.each do |protocol|
+ HTML5::SafeList::ALLOWED_PROTOCOLS.each do |protocol|
  define_method "test_should_allow_uppercase_#{protocol}_uris" do
  input = %(<a href="#{protocol.upcase}">foo</a>)
  output = "<a href='#{protocol.upcase}'>foo</a>"
  check_sanitization(input, output, output, output)
  end
  end
-
- HTML5::WhiteList::ALLOWED_URI_DATA_MEDIATYPES.each do |data_uri_type|
+
+ HTML5::SafeList::ALLOWED_URI_DATA_MEDIATYPES.each do |data_uri_type|
  define_method "test_should_allow_data_#{data_uri_type}_uris" do
  input = %(<a href="data:#{data_uri_type}">foo</a>)
  output = "<a href='data:#{data_uri_type}'>foo</a>"
@@ -149,7 +156,7 @@ class Html5TestSanitizer < Loofah::TestCase
  end
  end
 
- HTML5::WhiteList::ALLOWED_URI_DATA_MEDIATYPES.each do |data_uri_type|
+ HTML5::SafeList::ALLOWED_URI_DATA_MEDIATYPES.each do |data_uri_type|
  define_method "test_should_allow_uppercase_data_#{data_uri_type}_uris" do
  input = %(<a href="DATA:#{data_uri_type.upcase}">foo</a>)
  output = "<a href='DATA:#{data_uri_type.upcase}'>foo</a>"
@@ -172,8 +179,8 @@ class Html5TestSanitizer < Loofah::TestCase
  end
 
 
- HTML5::WhiteList::SVG_ALLOW_LOCAL_HREF.each do |tag_name|
- next unless HTML5::WhiteList::ALLOWED_ELEMENTS.include?(tag_name)
+ HTML5::SafeList::SVG_ALLOW_LOCAL_HREF.each do |tag_name|
+ next unless HTML5::SafeList::ALLOWED_ELEMENTS.include?(tag_name)
  define_method "test_#{tag_name}_should_allow_local_href" do
  input = %(<#{tag_name} xlink:href="#foo"/>)
  output = "<#{tag_name.downcase} xlink:href='#foo'></#{tag_name.downcase}>"
@@ -249,7 +256,7 @@ class Html5TestSanitizer < Loofah::TestCase
  end
 
  ## added because we don't have any coverage above on SVG_ATTR_VAL_ALLOWS_REF
- HTML5::WhiteList::SVG_ATTR_VAL_ALLOWS_REF.each do |attr_name|
+ HTML5::SafeList::SVG_ATTR_VAL_ALLOWS_REF.each do |attr_name|
  define_method "test_should_allow_uri_refs_in_svg_attribute_#{attr_name}" do
  input = "<rect fill='url(#foo)' />"
  output = "<rect fill='url(#foo)'></rect>"
@@ -263,6 +270,12 @@ class Html5TestSanitizer < Loofah::TestCase
  end
  end
 
+ def test_css_list_style
+ html = '<ul style="list-style: none"></ul>'
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
+ assert_match %r/list-style/, sane.inner_html
+ end
+
  def test_css_negative_value_sanitization
  html = "<span style=\"letter-spacing:-0.03em;\">"
  sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
@@ -275,6 +288,44 @@ class Html5TestSanitizer < Loofah::TestCase
  assert_match %r/-0.05em/, sane.inner_html
  end
 
+ def test_css_high_precision_value_shorthand_css_properties
+ html = "<span style=\"margin-left:0.3333333334em;\">"
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
+ assert_match %r/0.3333333334em/, sane.inner_html
+ end
+
+ def test_css_function_sanitization_leaves_safelisted_functions_calc
+ html = "<span style=\"width:calc(5%)\">"
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
+ assert_match %r/calc\(5%\)/, sane.inner_html
+
+ html = "<span style=\"width: calc(5%)\">"
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
+ assert_match %r/calc\(5%\)/, sane.inner_html
+ end
+
+ def test_css_function_sanitization_leaves_safelisted_functions_rgb
+ html = '<span style="color: rgb(255, 0, 0)">'
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
+ assert_match %r/rgb\(255, 0, 0\)/, sane.inner_html
+ end
+
+ def test_css_function_sanitization_leaves_safelisted_list_style_type
+ html = "<ol style='list-style-type:lower-greek;'></ol>"
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
+ assert_match %r/list-style-type:lower-greek/, sane.inner_html
+ end
+
+ def test_css_function_sanitization_strips_style_attributes_with_unsafe_functions
+ html = "<span style=\"width:url(data-evil-url)\">"
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
+ assert_match %r/<span><\/span>/, sane.inner_html
+
+ html = "<span style=\"width: url(data-evil-url)\">"
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :strip).to_html)
+ assert_match %r/<span><\/span>/, sane.inner_html
+ end
+
  def test_issue_90_slow_regex
  skip("timing tests are hard to make pass and have little regression-testing value")
 
@@ -0,0 +1,10 @@
+ require "helper"
+
+ class UnitHTML5Scrub < Loofah::TestCase
+ include Loofah
+
+ def test_scrub_css
+ assert_equal Loofah::HTML5::Scrub.scrub_css("background: #ABC012"), "background:#ABC012;"
+ assert_equal Loofah::HTML5::Scrub.scrub_css("background: #abc012"), "background:#abc012;"
+ end
+ end
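
This new unit test exists because of the relaxed CSS_KEYWORDISH pattern earlier in the diff: uppercase hex colors now match (#[0-9a-fA-F]+), and numeric values may carry up to ten digits after the decimal point. A minimal sketch (not part of the diff), using values taken from the new tests:

    require "loofah"

    Loofah::HTML5::Scrub.scrub_css("background: #ABC012")
    # => "background:#ABC012;"   (2.1.1 dropped uppercase hex colors)

    Loofah::HTML5::Scrub.scrub_css("margin-left: 0.3333333334em")
    # the high-precision length is retained; the old \d{0,2}\.?\d{0,2}
    # pattern would have rejected it
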
@@ -16,161 +16,189 @@ class IntegrationTestAdHoc < Loofah::TestCase
  end
  end
 
- def test_removal_of_illegal_tag
- html = <<-HTML
+ context "tests" do
+ MSWORD_HTML = File.read(File.join(File.dirname(__FILE__), "..", "assets", "msword.html")).freeze
+
+ def test_removal_of_illegal_tag
+ html = <<-HTML
  following this there should be no jim tag
  <jim>jim</jim>
  was there?
  HTML
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
- assert sane.xpath("//jim").empty?
- end
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
+ assert sane.xpath("//jim").empty?
+ end
 
- def test_removal_of_illegal_attribute
- html = "<p class=bar foo=bar abbr=bar />"
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
- node = sane.xpath("//p").first
- assert node.attributes['class']
- assert node.attributes['abbr']
- assert_nil node.attributes['foo']
- end
+ def test_removal_of_illegal_attribute
+ html = "<p class=bar foo=bar abbr=bar />"
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
+ node = sane.xpath("//p").first
+ assert node.attributes['class']
+ assert node.attributes['abbr']
+ assert_nil node.attributes['foo']
+ end
 
- def test_removal_of_illegal_url_in_href
- html = <<-HTML
+ def test_removal_of_illegal_url_in_href
+ html = <<-HTML
  <a href='jimbo://jim.jim/'>this link should have its href removed because of illegal url</a>
  <a href='http://jim.jim/'>this link should be fine</a>
  HTML
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
- nodes = sane.xpath("//a")
- assert_nil nodes.first.attributes['href']
- assert nodes.last.attributes['href']
- end
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
+ nodes = sane.xpath("//a")
+ assert_nil nodes.first.attributes['href']
+ assert nodes.last.attributes['href']
+ end
 
- def test_css_sanitization
- html = "<p style='background-color: url(\"http://foo.com/\") ; background-color: #000 ;' />"
- sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
- assert_match %r/#000/, sane.inner_html
- refute_match %r/foo\.com/, sane.inner_html
- end
+ def test_css_sanitization
+ html = "<p style='background-color: url(\"http://foo.com/\") ; background-color: #000 ;' />"
+ sane = Nokogiri::HTML(Loofah.scrub_fragment(html, :escape).to_xml)
+ assert_match %r/#000/, sane.inner_html
+ refute_match %r/foo\.com/, sane.inner_html
+ end
 
- def test_fragment_with_no_tags
- assert_equal "This fragment has no tags.", Loofah.scrub_fragment("This fragment has no tags.", :escape).to_xml
- end
+ def test_fragment_with_no_tags
+ assert_equal "This fragment has no tags.", Loofah.scrub_fragment("This fragment has no tags.", :escape).to_xml
+ end
 
- def test_fragment_in_p_tag
- assert_equal "<p>This fragment is in a p.</p>", Loofah.scrub_fragment("<p>This fragment is in a p.</p>", :escape).to_xml
- end
+ def test_fragment_in_p_tag
+ assert_equal "<p>This fragment is in a p.</p>", Loofah.scrub_fragment("<p>This fragment is in a p.</p>", :escape).to_xml
+ end
 
- def test_fragment_in_p_tag_plus_stuff
- assert_equal "<p>This fragment is in a p.</p>foo<strong>bar</strong>", Loofah.scrub_fragment("<p>This fragment is in a p.</p>foo<strong>bar</strong>", :escape).to_xml
- end
+ def test_fragment_in_p_tag_plus_stuff
+ assert_equal "<p>This fragment is in a p.</p>foo<strong>bar</strong>", Loofah.scrub_fragment("<p>This fragment is in a p.</p>foo<strong>bar</strong>", :escape).to_xml
+ end
 
- def test_fragment_with_text_nodes_leading_and_trailing
- assert_equal "text<p>fragment</p>text", Loofah.scrub_fragment("text<p>fragment</p>text", :escape).to_xml
- end
+ def test_fragment_with_text_nodes_leading_and_trailing
+ assert_equal "text<p>fragment</p>text", Loofah.scrub_fragment("text<p>fragment</p>text", :escape).to_xml
+ end
 
- def test_whitewash_on_fragment
- html = "safe<frameset rows=\"*\"><frame src=\"http://example.com\"></frameset> <b>description</b>"
- whitewashed = Loofah.scrub_document(html, :whitewash).xpath("/html/body/*").to_s
- assert_equal "<p>safe</p><b>description</b>", whitewashed.gsub("\n","")
- end
+ def test_whitewash_on_fragment
+ html = "safe<frameset rows=\"*\"><frame src=\"http://example.com\"></frameset> <b>description</b>"
+ whitewashed = Loofah.scrub_document(html, :whitewash).xpath("/html/body/*").to_s
+ assert_equal "<p>safe</p><b>description</b>", whitewashed.gsub("\n","")
+ end
 
- MSWORD_HTML = <<-EOHTML
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8"><meta name="ProgId" content="Word.Document"><meta name="Generator" content="Microsoft Word 11"><meta name="Originator" content="Microsoft Word 11"><link rel="File-List" href="file:///C:%5CDOCUME%7E1%5CNICOLE%7E1%5CLOCALS%7E1%5CTemp%5Cmsohtml1%5C01%5Cclip_filelist.xml"><!--[if gte mso 9]><xml>
- <w:WordDocument>
- <w:View>Normal</w:View>
- <w:Zoom>0</w:Zoom>
- <w:PunctuationKerning/>
- <w:ValidateAgainstSchemas/>
- <w:SaveIfXMLInvalid>false</w:SaveIfXMLInvalid>
- <w:IgnoreMixedContent>false</w:IgnoreMixedContent>
- <w:AlwaysShowPlaceholderText>false</w:AlwaysShowPlaceholderText>
- <w:Compatibility>
- <w:BreakWrappedTables/>
- <w:SnapToGridInCell/>
- <w:WrapTextWithPunct/>
- <w:UseAsianBreakRules/>
- <w:DontGrowAutofit/>
- </w:Compatibility>
- <w:BrowserLevel>MicrosoftInternetExplorer4</w:BrowserLevel>
- </w:WordDocument>
- </xml><![endif]--><!--[if gte mso 9]><xml>
- <w:LatentStyles DefLockedState="false" LatentStyleCount="156">
- </w:LatentStyles>
- </xml><![endif]--><style>
- <!--
- /* Style Definitions */
- p.MsoNormal, li.MsoNormal, div.MsoNormal
- {mso-style-parent:"";
- margin:0in;
- margin-bottom:.0001pt;
- mso-pagination:widow-orphan;
- font-size:12.0pt;
- font-family:"Times New Roman";
- mso-fareast-font-family:"Times New Roman";}
- @page Section1
- {size:8.5in 11.0in;
- margin:1.0in 1.25in 1.0in 1.25in;
- mso-header-margin:.5in;
- mso-footer-margin:.5in;
- mso-paper-source:0;}
- div.Section1
- {page:Section1;}
- -->
- </style><!--[if gte mso 10]>
- <style>
- /* Style Definitions */
- table.MsoNormalTable
- {mso-style-name:"Table Normal";
- mso-tstyle-rowband-size:0;
- mso-tstyle-colband-size:0;
- mso-style-noshow:yes;
- mso-style-parent:"";
- mso-padding-alt:0in 5.4pt 0in 5.4pt;
- mso-para-margin:0in;
- mso-para-margin-bottom:.0001pt;
- mso-pagination:widow-orphan;
- font-size:10.0pt;
- font-family:"Times New Roman";
- mso-ansi-language:#0400;
- mso-fareast-language:#0400;
- mso-bidi-language:#0400;}
- </style>
- <![endif]-->
-
- <p class="MsoNormal">Foo <b style="">BOLD<o:p></o:p></b></p>
- EOHTML
-
- def test_fragment_whitewash_on_microsofty_markup
- whitewashed = Loofah.fragment(MSWORD_HTML).scrub!(:whitewash)
- assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed.to_s.strip
- end
+ def test_fragment_whitewash_on_microsofty_markup
+ whitewashed = Loofah.fragment(MSWORD_HTML).scrub!(:whitewash)
+ assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed.to_s.strip
+ end
 
- def test_document_whitewash_on_microsofty_markup
- whitewashed = Loofah.document(MSWORD_HTML).scrub!(:whitewash)
- assert_match %r(<p>Foo <b>BOLD</b></p>), whitewashed.to_s
- assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed.xpath("/html/body/*").to_s
- end
+ def test_document_whitewash_on_microsofty_markup
+ whitewashed = Loofah.document(MSWORD_HTML).scrub!(:whitewash)
+ assert_match %r(<p>Foo <b>BOLD</b></p>), whitewashed.to_s
+ assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed.xpath("/html/body/*").to_s
+ end
 
- def test_return_empty_string_when_nothing_left
- assert_equal "", Loofah.scrub_document('<script>test</script>', :prune).text
- end
+ def test_return_empty_string_when_nothing_left
+ assert_equal "", Loofah.scrub_document('<script>test</script>', :prune).text
+ end
 
- def test_removal_of_all_tags
- html = <<-HTML
+ def test_nested_script_cdata_tags_should_be_scrubbed
+ html = "<script><script src='malicious.js'></script>"
+ stripped = Loofah.fragment(html).scrub!(:strip)
+ assert_empty stripped.xpath("//script")
+ refute_match("<script", stripped.to_html)
+ end
+
+ def test_nested_script_cdata_tags_should_be_scrubbed_2
+ html = "<script><script>alert('a');</script></script>"
+ stripped = Loofah.fragment(html).scrub!(:strip)
+ assert_empty stripped.xpath("//script")
+ refute_match("<script", stripped.to_html)
+ end
+
+ def test_removal_of_all_tags
+ html = <<-HTML
  What's up <strong>doc</strong>?
  HTML
- stripped = Loofah.scrub_document(html, :prune).text
- assert_equal %Q(What\'s up doc?).strip, stripped.strip
- end
+ stripped = Loofah.scrub_document(html, :prune).text
+ assert_equal %Q(What\'s up doc?).strip, stripped.strip
+ end
 
- def test_dont_remove_whitespace
- html = "Foo\nBar"
- assert_equal html, Loofah.scrub_document(html, :prune).text
- end
+ def test_dont_remove_whitespace
+ html = "Foo\nBar"
+ assert_equal html, Loofah.scrub_document(html, :prune).text
+ end
+
+ def test_dont_remove_whitespace_between_tags
+ html = "<p>Foo</p>\n<p>Bar</p>"
+ assert_equal "Foo\nBar", Loofah.scrub_document(html, :prune).text
+ end
 
- def test_dont_remove_whitespace_between_tags
- html = "<p>Foo</p>\n<p>Bar</p>"
- assert_equal "Foo\nBar", Loofah.scrub_document(html, :prune).text
+ #
+ # tests for CVE-2018-8048 (see https://github.com/flavorjones/loofah/issues/144)
+ #
+ # libxml2 >= 2.9.2 fails to escape comments within some attributes. It
+ # wants to ensure these comments can be treated as "server-side includes",
+ # but as a result fails to ensure that serialization is well-formed,
+ # resulting in an opportunity for XSS injection of code into a final
+ # re-parsed document (presumably in a browser).
+ #
+ # we'll test this by parsing the HTML, serializing it, then
+ # re-parsing it to ensure there isn't any ambiguity in the output
+ # that might allow code injection into a browser consuming
+ # "sanitized" output.
+ #
+ [
+ #
+ # these tags and attributes are determined by the code at:
+ #
+ # https://git.gnome.org/browse/libxml2/tree/HTMLtree.c?h=v2.9.2#n714
+ #
+ {tag: "a", attr: "href"},
+ {tag: "div", attr: "href"},
+ {tag: "a", attr: "action"},
+ {tag: "div", attr: "action"},
+ {tag: "a", attr: "src"},
+ {tag: "div", attr: "src"},
+ {tag: "a", attr: "name"},
+ #
+ # note that div+name is _not_ affected by the libxml2 issue.
+ # but we test it anyway to ensure our logic isn't modifying
+ # attributes that don't need modifying.
+ #
+ {tag: "div", attr: "name", unescaped: true},
+ ].each do |config|
+
+ define_method "test_uri_escaping_of_#{config[:attr]}_attr_in_#{config[:tag]}_tag" do
+ html = %{<#{config[:tag]} #{config[:attr]}='examp<!--" unsafeattr=foo()>-->le.com'>test</#{config[:tag]}>}
+
+ reparsed = Loofah.fragment(Loofah.fragment(html).scrub!(:prune).to_html)
+ attributes = reparsed.at_css(config[:tag]).attribute_nodes
+
+ assert_equal [config[:attr]], attributes.collect(&:name)
+ if Nokogiri::VersionInfo.instance.libxml2?
+ if config[:unescaped]
+ #
+ # this attribute was emitted wrapped in single-quotes, so a double quote is A-OK.
+ # assert that this attribute's serialization is unaffected.
+ #
+ assert_equal %{examp<!--" unsafeattr=foo()>-->le.com}, attributes.first.value
+ else
+ #
+ # let's match the behavior in libxml < 2.9.2.
+ # test that this attribute's serialization is well-formed and sanitized.
+ #
+ assert_equal %{examp<!--%22%20unsafeattr=foo()>-->le.com}, attributes.first.value
+ end
+ else
+ #
+ # yay for consistency in javaland. move along, nothing to see here.
+ #
+ assert_equal %{examp<!--%22 unsafeattr=foo()>-->le.com}, attributes.first.value
+ end
+ end
+ end
+
+ # see:
+ # - https://github.com/flavorjones/loofah/issues/154
+ # - https://hackerone.com/reports/429267
+ context "xss protection from svg xmlns:xlink animate attribute" do
+ it "sanitizes appropriate attributes" do
+ html = %Q{<svg><a xmlns:xlink=http://www.w3.org/1999/xlink xlink:href=?><circle r=400 /><animate attributeName=xlink:href begin=0 from=javascript:alert(1) to=%26>}
+ sanitized = Loofah.scrub_fragment(html, :escape)
+ assert_nil sanitized.at_css("animate")["from"]
+ end
+ end
  end
  end