RubyGems - sanitize - Versions diffs - 4.6.4 → 6.0.2 - Mend

sanitize 4.6.4 → 6.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

checksums.yaml +4 -4
data/HISTORY.md +259 -16
data/LICENSE +1 -1
data/README.md +89 -76
data/lib/sanitize/config/default.rb +15 -4
data/lib/sanitize/config/relaxed.rb +1 -1
data/lib/sanitize/css.rb +2 -2
data/lib/sanitize/transformers/clean_comment.rb +1 -1
data/lib/sanitize/transformers/clean_css.rb +4 -3
data/lib/sanitize/transformers/clean_doctype.rb +1 -1
data/lib/sanitize/transformers/clean_element.rb +105 -22
data/lib/sanitize/version.rb +1 -3
data/lib/sanitize.rb +56 -72
data/test/common.rb +0 -31
data/test/test_clean_comment.rb +16 -20
data/test/test_clean_css.rb +6 -6
data/test/test_clean_doctype.rb +22 -22
data/test/test_clean_element.rb +200 -82
data/test/test_config.rb +9 -9
data/test/test_malicious_css.rb +20 -7
data/test/test_malicious_html.rb +179 -32
data/test/test_parser.rb +9 -38
data/test/test_sanitize.rb +114 -29
data/test/test_sanitize_css.rb +88 -61
data/test/test_transformers.rb +52 -46
metadata +17 -33
data/test/test_unicode.rb +0 -95

data/test/test_malicious_html.rb CHANGED

@@ -17,124 +17,126 @@ describe 'Malicious HTML' do
   describe 'comments' do
     it 'should not allow script injection via conditional comments' do
-      @s.fragment(%[<!--[if gte IE 4]>\n<script>alert('XSS');</script>\n<![endif]-->]).
+      _(@s.fragment(%[<!--[if gte IE 4]>\n<script>alert('XSS');</script>\n<![endif]-->])).
         must_equal ''
     end
   end
   describe 'interpolation (ERB, PHP, etc.)' do
     it 'should escape ERB-style tags' do
-      @s.fragment('<% naughty_ruby_code %>').
+      _(@s.fragment('<% naughty_ruby_code %>')).
         must_equal '&lt;% naughty_ruby_code %&gt;'
-      @s.fragment('<%= naughty_ruby_code %>').
+      _(@s.fragment('<%= naughty_ruby_code %>')).
         must_equal '&lt;%= naughty_ruby_code %&gt;'
     end
     it 'should remove PHP-style tags' do
-      @s.fragment('<? naughtyPHPCode(); ?>').
+      _(@s.fragment('<? naughtyPHPCode(); ?>')).
         must_equal ''
-      @s.fragment('<?= naughtyPHPCode(); ?>').
+      _(@s.fragment('<?= naughtyPHPCode(); ?>')).
         must_equal ''
     end
   end
   describe '<body>' do
     it 'should not be possible to inject JS via a malformed event attribute' do
-      @s.document('<html><head></head><body onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert("XSS")></body></html>').
-        must_equal "<html><head></head><body></body></html>\n"
+      _(@s.document('<html><head></head><body onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert("XSS")></body></html>')).
+        must_equal "<html><head></head><body></body></html>"
     end
   end
   describe '<iframe>' do
     it 'should not be possible to inject an iframe using an improperly closed tag' do
-      @s.fragment(%[<iframe src=http://ha.ckers.org/scriptlet.html <]).
+      _(@s.fragment(%[<iframe src=http://ha.ckers.org/scriptlet.html <])).
         must_equal ''
     end
   end
   describe '<img>' do
     it 'should not be possible to inject JS via an unquoted <img> src attribute' do
-      @s.fragment("<img src=javascript:alert('XSS')>").must_equal '<img>'
+      _(@s.fragment("<img src=javascript:alert('XSS')>")).must_equal '<img>'
     end
     it 'should not be possible to inject JS using grave accents as <img> src delimiters' do
-      @s.fragment("<img src=`javascript:alert('XSS')`>").must_equal '<img>'
+      _(@s.fragment("<img src=`javascript:alert('XSS')`>")).must_equal '<img>'
     end
     it 'should not be possible to inject <script> via a malformed <img> tag' do
-      @s.fragment('<img """><script>alert("XSS")</script>">').
-        must_equal '<img>alert("XSS")"&gt;'
+      _(@s.fragment('<img """><script>alert("XSS")</script>">')).
+        must_equal '<img>"&gt;'
     end
     it 'should not be possible to inject protocol-based JS' do
-      @s.fragment('<img src=&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;>').
+      _(@s.fragment('<img src=&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;>')).
         must_equal '<img>'
-      @s.fragment('<img src=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>').
+      _(@s.fragment('<img src=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>')).
         must_equal '<img>'
-      @s.fragment('<img src=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>').
+      _(@s.fragment('<img src=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>')).
         must_equal '<img>'
       # Encoded tab character.
-      @s.fragment(%[<img src="jav&#x09;ascript:alert('XSS');">]).
+      _(@s.fragment(%[<img src="jav&#x09;ascript:alert('XSS');">])).
         must_equal '<img>'
       # Encoded newline.
-      @s.fragment(%[<img src="jav&#x0A;ascript:alert('XSS');">]).
+      _(@s.fragment(%[<img src="jav&#x0A;ascript:alert('XSS');">])).
         must_equal '<img>'
       # Encoded carriage return.
-      @s.fragment(%[<img src="jav&#x0D;ascript:alert('XSS');">]).
+      _(@s.fragment(%[<img src="jav&#x0D;ascript:alert('XSS');">])).
         must_equal '<img>'
       # Null byte.
-      @s.fragment(%[<img src=java\0script:alert("XSS")>]).
+      _(@s.fragment(%[<img src=java\0script:alert("XSS")>])).
         must_equal '<img>'
       # Spaces plus meta char.
-      @s.fragment(%[<img src=" &#14;  javascript:alert('XSS');">]).
+      _(@s.fragment(%[<img src=" &#14;  javascript:alert('XSS');">])).
         must_equal '<img>'
       # Mixed spaces and tabs.
-      @s.fragment(%[<img src="j\na v\tascript://alert('XSS');">]).
+      _(@s.fragment(%[<img src="j\na v\tascript://alert('XSS');">])).
         must_equal '<img>'
     end
     it 'should not be possible to inject protocol-based JS via whitespace' do
-      @s.fragment(%[<img src="jav\tascript:alert('XSS');">]).
+      _(@s.fragment(%[<img src="jav\tascript:alert('XSS');">])).
         must_equal '<img>'
     end
     it 'should not be possible to inject JS using a half-open <img> tag' do
-      @s.fragment(%[<img src="javascript:alert('XSS')"]).
+      _(@s.fragment(%[<img src="javascript:alert('XSS')"])).
         must_equal ''
     end
   end
   describe '<script>' do
     it 'should not be possible to inject <script> using a malformed non-alphanumeric tag name' do
-      @s.fragment(%[<script/xss src="http://ha.ckers.org/xss.js">alert(1)</script>]).
-        must_equal 'alert(1)'
+      _(@s.fragment(%[<script/xss src="http://ha.ckers.org/xss.js">alert(1)</script>])).
+        must_equal ''
     end
     it 'should not be possible to inject <script> via extraneous open brackets' do
-      @s.fragment(%[<<script>alert("XSS");//<</script>]).
-        must_equal '&lt;alert("XSS");//&lt;'
+      _(@s.fragment(%[<<script>alert("XSS");//<</script>])).
+        must_equal '&lt;'
     end
   end
   # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
   # attempt to preserve server-side includes. This can result in XSS since an
-  # unescaped double quote can allow an attacker to inject a non-whitelisted
+  # unescaped double quote can allow an attacker to inject a non-allowlisted
   # attribute. Sanitize works around this by implementing its own escaping for
   # affected attributes.
   #
   # The relevant libxml2 code is here:
   # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
   describe 'unsafe libxml2 server-side includes in attributes' do
+    using_unpatched_libxml2 = Nokogiri::VersionInfo.instance.libxml2_using_system?
     tag_configs = [
       {
         tag_name: 'a',
@@ -166,12 +168,26 @@ describe 'Malicious HTML' do
         input = %[<#{tag_name} #{attr_name}='examp<!--" onmouseover=alert(1)>-->le.com'>foo</#{tag_name}>]
         it 'should escape unsafe characters in attributes' do
-          @s.fragment(input).must_equal(%[<#{tag_name} #{attr_name}="examp<!--%22%20onmouseover=alert(1)>-->le.com">foo</#{tag_name}>])
+          skip "behavior should only exist in nokogiri's patched libxml" if using_unpatched_libxml2
+          # This uses Nokogumbo's HTML-compliant serializer rather than
+          # libxml2's.
+          _(@s.fragment(input)).
+            must_equal(%[<#{tag_name} #{attr_name}="examp<!--%22%20onmouseover=alert(1)>-->le.com">foo</#{tag_name}>])
+          # This uses the not-quite-standards-compliant libxml2 serializer via
+          # Nokogiri, so the output may be a little different as of Nokogiri
+          # 1.10.2 when using Nokogiri's vendored libxml2 due to this patch:
+          # https://github.com/sparklemotion/nokogiri/commit/4852e43cb6039e26d8c51af78621e539cbf46c5d
+          fragment = Nokogiri::HTML.fragment(input)
+          @s.node!(fragment)
+          _(fragment.to_html).
+            must_equal(%[<#{tag_name} #{attr_name}="examp&lt;!--%22%20onmouseover=alert(1)&gt;--&gt;le.com">foo</#{tag_name}>])
         end
         it 'should round-trip to the same output' do
           output = @s.fragment(input)
-          @s.fragment(output).must_equal(output)
+          _(@s.fragment(output)).must_equal(output)
         end
       end
@@ -179,14 +195,145 @@ describe 'Malicious HTML' do
         input = %[<#{tag_name} #{attr_name}='examp<!--" onmouseover=alert(1)>-->le.com'>foo</#{tag_name}>]
         it 'should not escape characters unnecessarily' do
-          @s.fragment(input).must_equal(input)
+          skip "behavior should only exist in nokogiri's patched libxml" if using_unpatched_libxml2
+          # This uses Nokogumbo's HTML-compliant serializer rather than
+          # libxml2's.
+          _(@s.fragment(input)).
+            must_equal(%[<#{tag_name} #{attr_name}="examp<!--&quot; onmouseover=alert(1)>-->le.com">foo</#{tag_name}>])
+          # This uses the not-quite-standards-compliant libxml2 serializer via
+          # Nokogiri, so the output may be a little different as of Nokogiri
+          # 1.10.2 when using Nokogiri's vendored libxml2 due to this patch:
+          # https://github.com/sparklemotion/nokogiri/commit/4852e43cb6039e26d8c51af78621e539cbf46c5d
+          fragment = Nokogiri::HTML.fragment(input)
+          @s.node!(fragment)
+          _(fragment.to_html).
+            must_equal(%[<#{tag_name} #{attr_name}='examp&lt;!--" onmouseover=alert(1)&gt;--&gt;le.com'>foo</#{tag_name}>])
         end
         it 'should round-trip to the same output' do
           output = @s.fragment(input)
-          @s.fragment(output).must_equal(output)
+          _(@s.fragment(output)).must_equal(output)
         end
       end
     end
   end
+  # https://github.com/rgrove/sanitize/security/advisories/GHSA-p4x4-rw2p-8j8m
+  describe 'foreign content bypass in relaxed config' do
+    it 'prevents a sanitization bypass via carefully crafted foreign content' do
+      %w[iframe noembed noframes noscript plaintext script style xmp].each do |tag_name|
+        _(@s.fragment(%[<math><#{tag_name}>/*&lt;/#{tag_name}&gt;&lt;img src onerror=alert(1)>*/])).
+          must_equal ''
+        _(@s.fragment(%[<svg><#{tag_name}>/*&lt;/#{tag_name}&gt;&lt;img src onerror=alert(1)>*/])).
+          must_equal ''
+      end
+    end
+  end
+  # These tests cover an unsupported and unsafe custom config that allows MathML
+  # and SVG elements, which Sanitize's docs specifically say multiple times in
+  # big prominent warnings that you SHOULD NOT DO because Sanitize doesn't
+  # support MathML or SVG.
+  #
+  # Do not use the custom configs you see in these tests! If you do, you may be
+  # creating XSS vulnerabilities in your application.
+  describe 'foreign content bypass in unsafe custom config that allows MathML or SVG' do
+    unescaped_content_elements = %w[
+      noembed
+      noframes
+      plaintext
+      script
+      xmp
+    ]
+    removed_content_elements = %w[
+      iframe
+    ]
+    removed_elements = %w[
+      noscript
+      style
+    ]
+    before do
+      @s = Sanitize.new(
+        Sanitize::Config.merge(
+          Sanitize::Config::RELAXED,
+          elements: Sanitize::Config::RELAXED[:elements] +
+            unescaped_content_elements +
+            removed_content_elements +
+            %w[math svg]
+        )
+      )
+    end
+    unescaped_content_elements.each do |name|
+      it "forcibly escapes text content inside `<#{name}>` in a MathML namespace" do
+        assert_equal(
+          "<math><#{name}>&lt;img src=x onerror=alert(1)&gt;</#{name}></math>",
+          @s.fragment("<math><#{name}>&lt;img src=x onerror=alert(1)&gt;</#{name}>")
+        )
+      end
+      it "forcibly escapes text content inside `<#{name}>` in an SVG namespace" do
+        assert_equal(
+          "<svg><#{name}>&lt;img src=x onerror=alert(1)&gt;</#{name}></svg>",
+          @s.fragment("<svg><#{name}>&lt;img src=x onerror=alert(1)&gt;</#{name}>")
+        )
+      end
+    end
+    removed_content_elements.each do |name|
+      it "removes text content inside `<#{name}>` in a MathML namespace" do
+        assert_equal(
+          "<math><#{name}></#{name}></math>",
+          @s.fragment("<math><#{name}>&lt;img src=x onerror=alert(1)&gt;</#{name}>")
+        )
+      end
+      it "removes text content inside `<#{name}>` in an SVG namespace" do
+        assert_equal(
+          "<svg><#{name}></#{name}></svg>",
+          @s.fragment("<svg><#{name}>&lt;img src=x onerror=alert(1)&gt;</#{name}>")
+        )
+      end
+    end
+    removed_elements.each do |name|
+      it "removes `<#{name}>` elements in a MathML namespace" do
+        assert_equal(
+          '<math></math>',
+          @s.fragment("<math><#{name}>&lt;img src=x onerror=alert(1)&gt;</#{name}>")
+        )
+      end
+      it "removes `<#{name}>` elements in an SVG namespace" do
+        assert_equal(
+          '<svg></svg>',
+          @s.fragment("<svg><#{name}>&lt;img src=x onerror=alert(1)&gt;</#{name}>")
+        )
+      end
+    end
+  end
+  describe 'sanitization bypass by exploiting scripting-disabled <noscript> behavior' do
+    before do
+      @s = Sanitize.new(
+        Sanitize::Config.merge(
+          Sanitize::Config::RELAXED,
+          elements: Sanitize::Config::RELAXED[:elements] + ['noscript']
+        )
+      )
+    end
+    it 'is prevented by removing `<noscript>` elements regardless of the allowlist' do
+      assert_equal(
+        '',
+        @s.fragment(%[<noscript><div id='</noscript>&lt;img src=x onerror=alert(1)&gt; '>])
+      )
+    end
+  end
 end

data/test/test_parser.rb CHANGED

@@ -6,55 +6,26 @@ describe 'Parser' do
   parallelize_me!
   it 'should translate valid entities into characters' do
-    Sanitize.fragment("&apos;&eacute;&amp;").must_equal("'é&amp;")
+    _(Sanitize.fragment("&apos;&eacute;&amp;")).must_equal("'é&amp;")
   end
   it 'should translate orphaned ampersands into entities' do
-    Sanitize.fragment('at&t').must_equal('at&amp;t')
+    _(Sanitize.fragment('at&t')).must_equal('at&amp;t')
   end
   it 'should not add newlines after tags when serializing a fragment' do
-    Sanitize.fragment("<div>foo\n\n<p>bar</p><div>\nbaz</div></div><div>quux</div>", :elements => ['div', 'p'])
+    _(Sanitize.fragment("<div>foo\n\n<p>bar</p><div>\nbaz</div></div><div>quux</div>", :elements => ['div', 'p']))
       .must_equal "<div>foo\n\n<p>bar</p><div>\nbaz</div></div><div>quux</div>"
   end
   it 'should not have the Nokogiri 1.4.2+ unterminated script/style element bug' do
-    Sanitize.fragment('foo <script>bar').must_equal 'foo bar'
-    Sanitize.fragment('foo <style>bar').must_equal 'foo bar'
+    _(Sanitize.fragment('foo <script>bar')).must_equal 'foo '
+    _(Sanitize.fragment('foo <style>bar')).must_equal 'foo '
   end
   it 'ambiguous non-tag brackets like "1 > 2 and 2 < 1" should be parsed correctly' do
-    Sanitize.fragment('1 > 2 and 2 < 1').must_equal '1 &gt; 2 and 2 &lt; 1'
-    Sanitize.fragment('OMG HAPPY BIRTHDAY! *<:-D').must_equal 'OMG HAPPY BIRTHDAY! *&lt;:-D'
-  end
-  # https://github.com/sparklemotion/nokogiri/issues/1008
-  it 'should work around the libxml2 content-type meta tag bug' do
-    Sanitize.document('<html><head></head><body>Howdy!</body></html>',
-      :elements => %w[html head body]
-    ).must_equal "<html><head></head><body>Howdy!</body></html>\n"
-    Sanitize.document('<html><head></head><body>Howdy!</body></html>',
-      :elements => %w[html head meta body]
-    ).must_equal "<html><head></head><body>Howdy!</body></html>\n"
-    Sanitize.document('<html><head><meta charset="utf-8"></head><body>Howdy!</body></html>',
-      :elements   => %w[html head meta body],
-      :attributes => {'meta' => ['charset']}
-    ).must_equal "<html><head><meta charset=\"utf-8\"></head><body>Howdy!</body></html>\n"
-    Sanitize.document('<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8"></head><body>Howdy!</body></html>',
-      :elements   => %w[html head meta body],
-      :attributes => {'meta' => %w[charset content http-equiv]}
-    ).must_equal "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"></head><body>Howdy!</body></html>\n"
-    # Edge case: an existing content-type meta tag with a non-UTF-8 content type
-    # will be converted to UTF-8, since that's the only output encoding we
-    # support.
-    Sanitize.document('<html><head><meta http-equiv="content-type" content="text/html;charset=us-ascii"></head><body>Howdy!</body></html>',
-      :elements   => %w[html head meta body],
-      :attributes => {'meta' => %w[charset content http-equiv]}
-    ).must_equal "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\"></head><body>Howdy!</body></html>\n"
+    _(Sanitize.fragment('1 > 2 and 2 < 1')).must_equal '1 &gt; 2 and 2 &lt; 1'
+    _(Sanitize.fragment('OMG HAPPY BIRTHDAY! *<:-D')).must_equal 'OMG HAPPY BIRTHDAY! *&lt;:-D'
   end
   describe 'when siblings are added after a node during traversal' do
@@ -84,11 +55,11 @@ describe 'Parser' do
             siblings << env[:node][:id]
           end
-          return {:node_whitelist => [env[:node]]}
+          return {:node_allowlist => [env[:node]]}
       })
       # All siblings should be traversed, and in the order added.
-      siblings.must_equal [
+      _(siblings).must_equal [
         "added_one_one_one",
         "added_one_one",
         "added_one_two",

data/test/test_sanitize.rb CHANGED

@@ -9,7 +9,7 @@ describe 'Sanitize' do
       ]
       Sanitize.new({ :transformers => transformers })
-      transformers.length.must_equal(1)
+      _(transformers.length).must_equal(1)
     end
   end
@@ -24,42 +24,118 @@ describe 'Sanitize' do
       end
       it 'should sanitize an HTML document' do
-        @s.document('<!doctype html><html><b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script></html>')
-          .must_equal "<html>Lorem ipsum dolor sit amet alert(\"hello world\");</html>\n"
+        _(@s.document('<!doctype html><html><b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script></html>'))
+          .must_equal "<html>Lorem ipsum dolor sit amet </html>"
       end
       it 'should not modify the input string' do
         input = '<!DOCTYPE html><b>foo</b>'
         @s.document(input)
-        input.must_equal('<!DOCTYPE html><b>foo</b>')
+        _(input).must_equal('<!DOCTYPE html><b>foo</b>')
       end
       it 'should not choke on frozen documents' do
-        @s.document('<!doctype html><html><b>foo</b>'.freeze).must_equal "<html>foo</html>\n"
+        _(@s.document('<!doctype html><html><b>foo</b>'.freeze)).must_equal "<html>foo</html>"
+      end
+      it 'should normalize newlines' do
+        _(@s.document("a\r\n\n\r\r\r\nz")).must_equal "<html>a\n\n\n\n\nz</html>"
+      end
+      it 'should strip control characters (except ASCII whitespace)' do
+        sample_control_chars = "\u0001\u0008\u000b\u000e\u001f\u007f\u009f"
+        whitespace = "\t\n\f\u0020"
+        _(@s.document("a#{sample_control_chars}#{whitespace}z")).must_equal "<html>a#{whitespace}z</html>"
+      end
+      it 'should strip non-characters' do
+        sample_non_chars = "\ufdd0\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}"
+        _(@s.document("a#{sample_non_chars}z")).must_equal "<html>az</html>"
+      end
+      describe 'when html body exceeds Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH' do
+        let(:content) do
+          content = nest_html_content('<b>foo</b>', Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH)
+          "<html>#{content}</html>"
+        end
+        it 'raises an ArgumentError exception' do
+          assert_raises ArgumentError do
+            @s.document(content)
+          end
+        end
+        describe 'and :max_tree_depth of -1 is supplied in :parser_options' do
+          before do
+            @s = Sanitize.new(elements: ['html'], parser_options: { max_tree_depth: -1 })
+          end
+          it 'does not raise an ArgumentError exception' do
+            _(@s.document(content)).must_equal '<html>foo</html>'
+          end
+        end
       end
     end
     describe '#fragment' do
       it 'should sanitize an HTML fragment' do
-        @s.fragment('<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>')
-          .must_equal 'Lorem ipsum dolor sit amet alert("hello world");'
+        _(@s.fragment('<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>'))
+          .must_equal 'Lorem ipsum dolor sit amet '
       end
       it 'should not modify the input string' do
         input = '<b>foo</b>'
         @s.fragment(input)
-        input.must_equal '<b>foo</b>'
+        _(input).must_equal '<b>foo</b>'
       end
       it 'should not choke on fragments containing <html> or <body>' do
-        @s.fragment('<html><b>foo</b></html>').must_equal 'foo'
-        @s.fragment('<body><b>foo</b></body>').must_equal 'foo'
-        @s.fragment('<html><body><b>foo</b></body></html>').must_equal 'foo'
-        @s.fragment('<!DOCTYPE html><html><body><b>foo</b></body></html>').must_equal 'foo'
+        _(@s.fragment('<html><b>foo</b></html>')).must_equal 'foo'
+        _(@s.fragment('<body><b>foo</b></body>')).must_equal 'foo'
+        _(@s.fragment('<html><body><b>foo</b></body></html>')).must_equal 'foo'
+        _(@s.fragment('<!DOCTYPE html><html><body><b>foo</b></body></html>')).must_equal 'foo'
       end
       it 'should not choke on frozen fragments' do
-        @s.fragment('<b>foo</b>'.freeze).must_equal 'foo'
+        _(@s.fragment('<b>foo</b>'.freeze)).must_equal 'foo'
+      end
+      it 'should normalize newlines' do
+        _(@s.fragment("a\r\n\n\r\r\r\nz")).must_equal "a\n\n\n\n\nz"
+      end
+      it 'should strip control characters (except ASCII whitespace)' do
+        sample_control_chars = "\u0001\u0008\u000b\u000e\u001f\u007f\u009f"
+        whitespace = "\t\n\f\u0020"
+        _(@s.fragment("a#{sample_control_chars}#{whitespace}z")).must_equal "a#{whitespace}z"
+      end
+      it 'should strip non-characters' do
+        sample_non_chars = "\ufdd0\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}"
+        _(@s.fragment("a#{sample_non_chars}z")).must_equal "az"
+      end
+      describe 'when html body exceeds Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH' do
+        let(:content) do
+          content = nest_html_content('<b>foo</b>', Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH)
+          "<body>#{content}</body>"
+        end
+        it 'raises an ArgumentError exception' do
+          assert_raises ArgumentError do
+            @s.fragment(content)
+          end
+        end
+        describe 'and :max_tree_depth of -1 is supplied in :parser_options' do
+          before do
+            @s = Sanitize.new(parser_options: { max_tree_depth: -1 })
+          end
+          it 'does not raise an ArgumentError exception' do
+            _(@s.fragment(content)).must_equal 'foo'
+          end
+        end
       end
     end
@@ -71,13 +147,13 @@ describe 'Sanitize' do
         doc.xpath('/html/body/node()').each {|node| frag << node }
         @s.node!(frag)
-        frag.to_html.must_equal 'Lorem ipsum dolor sit amet alert("hello world");'
+        _(frag.to_html).must_equal 'Lorem ipsum dolor sit amet '
       end
-      describe "when the given node is a document and <html> isn't whitelisted" do
+      describe "when the given node is a document and <html> isn't allowlisted" do
         it 'should raise a Sanitize::Error' do
           doc = Nokogiri::HTML5.parse('foo')
-          proc { @s.node!(doc) }.must_raise Sanitize::Error
+          _(proc { @s.node!(doc) }).must_raise Sanitize::Error
         end
       end
     end
@@ -85,28 +161,37 @@ describe 'Sanitize' do
   describe 'class methods' do
     describe '.document' do
-      it 'should call #document' do
-        Sanitize.stub_instance(:document, proc {|html| html + ' called' }) do
-          Sanitize.document('<html>foo</html>')
-            .must_equal '<html>foo</html> called'
-        end
+      it 'should sanitize an HTML document with the given config' do
+        html = '<!doctype html><html><b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script></html>'
+        _(Sanitize.document(html, :elements => ['html']))
+          .must_equal "<html>Lorem ipsum dolor sit amet </html>"
       end
     end
     describe '.fragment' do
-      it 'should call #fragment' do
-        Sanitize.stub_instance(:fragment, proc {|html| html + ' called' }) do
-          Sanitize.fragment('<b>foo</b>').must_equal '<b>foo</b> called'
-        end
+      it 'should sanitize an HTML fragment with the given config' do
+        html = '<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>'
+        _(Sanitize.fragment(html, :elements => ['strong']))
+          .must_equal 'Lorem ipsum <strong>dolor</strong> sit amet '
       end
     end
     describe '.node!' do
-      it 'should call #node!' do
-        Sanitize.stub_instance(:node!, proc {|input| input + ' called' }) do
-          Sanitize.node!('not really a node').must_equal 'not really a node called'
-        end
+      it 'should sanitize a Nokogiri::XML::Node with the given config' do
+        doc = Nokogiri::HTML5.parse('<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>')
+        frag = doc.fragment
+        doc.xpath('/html/body/node()').each {|node| frag << node }
+        Sanitize.node!(frag, :elements => ['strong'])
+        _(frag.to_html).must_equal 'Lorem ipsum <strong>dolor</strong> sit amet '
       end
     end
   end
+  private
+  def nest_html_content(html_content, depth)
+    "#{'<span>' * depth}#{html_content}#{'</span>' * depth}"
+  end
 end