RubyGems - sanitize - Versions diffs - 4.6.4 → 6.0.2 - Mend

sanitize 4.6.4 → 6.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

checksums.yaml +4 -4
data/HISTORY.md +259 -16
data/LICENSE +1 -1
data/README.md +89 -76
data/lib/sanitize/config/default.rb +15 -4
data/lib/sanitize/config/relaxed.rb +1 -1
data/lib/sanitize/css.rb +2 -2
data/lib/sanitize/transformers/clean_comment.rb +1 -1
data/lib/sanitize/transformers/clean_css.rb +4 -3
data/lib/sanitize/transformers/clean_doctype.rb +1 -1
data/lib/sanitize/transformers/clean_element.rb +105 -22
data/lib/sanitize/version.rb +1 -3
data/lib/sanitize.rb +56 -72
data/test/common.rb +0 -31
data/test/test_clean_comment.rb +16 -20
data/test/test_clean_css.rb +6 -6
data/test/test_clean_doctype.rb +22 -22
data/test/test_clean_element.rb +200 -82
data/test/test_config.rb +9 -9
data/test/test_malicious_css.rb +20 -7
data/test/test_malicious_html.rb +179 -32
data/test/test_parser.rb +9 -38
data/test/test_sanitize.rb +114 -29
data/test/test_sanitize_css.rb +88 -61
data/test/test_transformers.rb +52 -46
metadata +17 -33
data/test/test_unicode.rb +0 -95

data/lib/sanitize.rb CHANGED

@@ -1,6 +1,6 @@
 # encoding: utf-8
-require 'nokogumbo'
+require 'nokogiri'
 require 'set'
 require_relative 'sanitize/version'
@@ -19,6 +19,20 @@ require_relative 'sanitize/transformers/clean_element'
 class Sanitize
   attr_reader :config
+  # Matches one or more control characters that should be removed from HTML
+  # before parsing, as defined by the HTML living standard.
+  #
+  # -   https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
+  # -   https://infra.spec.whatwg.org/#control
+  REGEX_HTML_CONTROL_CHARACTERS = /[\u0001-\u0008\u000b\u000e-\u001f\u007f-\u009f]+/u
+  # Matches one or more non-characters that should be removed from HTML before
+  # parsing, as defined by the HTML living standard.
+  #
+  # -   https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
+  # -   https://infra.spec.whatwg.org/#noncharacter
+  REGEX_HTML_NON_CHARACTERS = /[\ufdd0-\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}]+/u
   # Matches an attribute value that could be treated by a browser as a URL
   # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
   # or more characters followed by a colon is considered a match, even if the
@@ -26,11 +40,12 @@ class Sanitize
   # IE6 and Opera will still parse).
   REGEX_PROTOCOL = /\A\s*([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i
-  # Matches Unicode characters that should be stripped from HTML before passing
-  # it to the parser.
+  # Matches one or more characters that should be stripped from HTML before
+  # parsing. This is a combination of `REGEX_HTML_CONTROL_CHARACTERS` and
+  # `REGEX_HTML_NON_CHARACTERS`.
   #
-  # http://www.w3.org/TR/unicode-xml/#Charlist
-  REGEX_UNSUITABLE_CHARS = /[\u0000\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u
+  # https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
+  REGEX_UNSUITABLE_CHARS = /(?:#{REGEX_HTML_CONTROL_CHARACTERS}|#{REGEX_HTML_NON_CHARACTERS})/u
   #--
   # Class Methods
@@ -39,7 +54,7 @@ class Sanitize
   # Returns a sanitized copy of the given full _html_ document, using the
   # settings in _config_ if specified.
   #
-  # When sanitizing a document, the `<html>` element must be whitelisted or an
+  # When sanitizing a document, the `<html>` element must be allowlisted or an
   # error will be raised. If this is undesirable, you should probably use
   # {#fragment} instead.
   def self.document(html, config = {})
@@ -81,6 +96,7 @@ class Sanitize
     # Default transformers always run at the end of the chain, after any custom
     # transformers.
+    @transformers << Transformers::CleanElement.new(@config)
     @transformers << Transformers::CleanComment unless @config[:allow_comments]
     if @config[:elements].include?('style')
@@ -93,21 +109,21 @@ class Sanitize
       @transformers << Transformers::CSS::CleanAttribute.new(scss)
     end
-    @transformers <<
-        Transformers::CleanDoctype <<
-        Transformers::CleanCDATA <<
-        Transformers::CleanElement.new(@config)
+    @transformers << Transformers::CleanDoctype
+    @transformers << Transformers::CleanCDATA
+    @transformer_config = { config: @config }
   end
   # Returns a sanitized copy of the given _html_ document.
   #
-  # When sanitizing a document, the `<html>` element must be whitelisted or an
+  # When sanitizing a document, the `<html>` element must be allowlisted or an
   # error will be raised. If this is undesirable, you should probably use
   # {#fragment} instead.
   def document(html)
     return '' unless html
-    doc = Nokogiri::HTML5.parse(preprocess(html))
+    doc = Nokogiri::HTML5.parse(preprocess(html), **@config[:parser_options])
     node!(doc)
     to_html(doc)
   end
@@ -119,20 +135,7 @@ class Sanitize
   def fragment(html)
     return '' unless html
-    html = preprocess(html)
-    doc  = Nokogiri::HTML5.parse("<html><body>#{html}")
-    # Hack to allow fragments containing <body>. Borrowed from
-    # Nokogiri::HTML::DocumentFragment.
-    if html =~ /\A<body(?:\s|>)/i
-      path = '/html/body'
-    else
-      path = '/html/body/node()'
-    end
-    frag = doc.fragment
-    frag << doc.xpath(path)
+    frag = Nokogiri::HTML5.fragment(preprocess(html), **@config[:parser_options])
     node!(frag)
     to_html(frag)
   end
@@ -144,20 +147,20 @@ class Sanitize
   # in place.
   #
   # If _node_ is a `Nokogiri::XML::Document`, the `<html>` element must be
-  # whitelisted or an error will be raised.
+  # allowlisted or an error will be raised.
   def node!(node)
     raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
     if node.is_a?(Nokogiri::XML::Document)
       unless @config[:elements].include?('html')
-        raise Error, 'When sanitizing a document, "<html>" must be whitelisted.'
+        raise Error, 'When sanitizing a document, "<html>" must be allowlisted.'
       end
     end
-    node_whitelist = Set.new
+    node_allowlist = Set.new
     traverse(node) do |n|
-      transform_node!(n, node_whitelist)
+      transform_node!(n, node_allowlist)
     end
     node
@@ -183,51 +186,32 @@ class Sanitize
   end
   def to_html(node)
-    replace_meta = false
-    # Hacky workaround for a libxml2 bug that adds an undesired Content-Type
-    # meta tag to all serialized HTML documents.
-    #
-    # https://github.com/sparklemotion/nokogiri/issues/1008
-    if node.type == Nokogiri::XML::Node::DOCUMENT_NODE ||
-        node.type == Nokogiri::XML::Node::HTML_DOCUMENT_NODE
-      regex_meta   = %r|(<html[^>]*>\s*<head[^>]*>\s*)<meta http-equiv="Content-Type" content="text/html; charset=utf-8">|i
-      # Only replace the content-type meta tag if <meta> isn't whitelisted or
-      # the original document didn't actually include a content-type meta tag.
-      replace_meta = !@config[:elements].include?('meta') ||
-        node.xpath('/html/head/meta[@http-equiv]').none? do |meta|
-          meta['http-equiv'].casecmp('content-type').zero?
-        end
-    end
-    so = Nokogiri::XML::Node::SaveOptions
-    # Serialize to HTML without any formatting to prevent Nokogiri from adding
-    # newlines after certain tags.
-    html = node.to_html(
-      :encoding  => 'utf-8',
-      :indent    => 0,
-      :save_with => so::NO_DECLARATION | so::NO_EMPTY_TAGS | so::AS_HTML
-    )
-    html.gsub!(regex_meta, '\1') if replace_meta
-    html
+    node.to_html(preserve_newline: true)
   end
-  def transform_node!(node, node_whitelist)
+  def transform_node!(node, node_allowlist)
     @transformers.each do |transformer|
-      result = transformer.call(
-        :config         => @config,
-        :is_whitelisted => node_whitelist.include?(node),
-        :node           => node,
-        :node_name      => node.name.downcase,
-        :node_whitelist => node_whitelist
-      )
-      if result.is_a?(Hash) && result[:node_whitelist].respond_to?(:each)
-        node_whitelist.merge(result[:node_whitelist])
+      # Since transform_node! may be called in a tight loop to process thousands
+      # of items, we can optimize both memory and CPU performance by:
+      #
+      # 1. Reusing the same config hash for each transformer
+      # 2. Directly assigning values to hash instead of using merge!. Not only
+      # does merge! create a new hash, it is also 2.6x slower:
+      # https://github.com/JuanitoFatas/fast-ruby#hashmerge-vs-hashmerge-code
+      config = @transformer_config
+      config[:is_allowlisted] = config[:is_whitelisted] = node_allowlist.include?(node)
+      config[:node] = node
+      config[:node_name] = node.name.downcase
+      config[:node_allowlist] = config[:node_whitelist] = node_allowlist
+      result = transformer.call(**config)
+      if result.is_a?(Hash)
+        result_allowlist = result[:node_allowlist] || result[:node_whitelist]
+        if result_allowlist.respond_to?(:each)
+          node_allowlist.merge(result_allowlist)
+        end
       end
     end

data/test/common.rb CHANGED

@@ -1,34 +1,3 @@
 # encoding: utf-8
-gem 'minitest'
 require 'minitest/autorun'
 require_relative '../lib/sanitize'
-# Helper to stub an instance method. Shamelessly stolen from
-# https://github.com/codeodor/minitest-stub_any_instance/
-class Object
-  def self.stub_instance(name, value, &block)
-    old_method = "__stubbed_method_#{name}__"
-    class_eval do
-      alias_method old_method, name
-      define_method(name) do |*args|
-        if value.respond_to?(:call) then
-          value.call(*args)
-        else
-          value
-        end
-      end
-    end
-    yield
-  ensure
-    class_eval do
-      undef_method name
-      alias_method name, old_method
-      undef_method old_method
-    end
-  end
-end

data/test/test_clean_comment.rb CHANGED

@@ -11,18 +11,18 @@ describe 'Sanitize::Transformers::CleanComment' do
     end
     it 'should remove comments' do
-      @s.fragment('foo <!-- comment --> bar').must_equal 'foo  bar'
-      @s.fragment('foo <!-- ').must_equal 'foo '
-      @s.fragment('foo <!-- - -> bar').must_equal 'foo '
-      @s.fragment("foo <!--\n\n\n\n-->bar").must_equal 'foo bar'
-      @s.fragment("foo <!-- <!-- <!-- --> --> -->bar").must_equal 'foo  --&gt; --&gt;bar'
-      @s.fragment("foo <div <!-- comment -->>bar</div>").must_equal 'foo <div>&gt;bar</div>'
+      _(@s.fragment('foo <!-- comment --> bar')).must_equal 'foo  bar'
+      _(@s.fragment('foo <!-- ')).must_equal 'foo '
+      _(@s.fragment('foo <!-- - -> bar')).must_equal 'foo '
+      _(@s.fragment("foo <!--\n\n\n\n-->bar")).must_equal 'foo bar'
+      _(@s.fragment("foo <!-- <!-- <!-- --> --> -->bar")).must_equal 'foo  --&gt; --&gt;bar'
+      _(@s.fragment("foo <div <!-- comment -->>bar</div>")).must_equal 'foo <div>&gt;bar</div>'
       # Special case: the comment markup is inside a <script>, which makes it
       # text content and not an actual HTML comment.
-      @s.fragment("<script><!-- comment --></script>").must_equal '&lt;!-- comment --&gt;'
+      _(@s.fragment("<script><!-- comment --></script>")).must_equal ''
-      Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => false, :elements => ['script'])
+      _(Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => false, :elements => ['script']))
         .must_equal '<script><!-- comment --></script>'
     end
   end
@@ -33,18 +33,14 @@ describe 'Sanitize::Transformers::CleanComment' do
     end
     it 'should allow comments' do
-      @s.fragment('foo <!-- comment --> bar').must_equal 'foo <!-- comment --> bar'
-      @s.fragment('foo <!-- ').must_equal 'foo <!-- -->'
-      @s.fragment('foo <!-- - -> bar').must_equal 'foo <!-- - -> bar-->'
-      @s.fragment("foo <!--\n\n\n\n-->bar").must_equal "foo <!--\n\n\n\n-->bar"
-      @s.fragment("foo <!-- <!-- <!-- --> --> -->bar").must_equal 'foo <!-- <!-- <!-- --> --&gt; --&gt;bar'
-      @s.fragment("foo <div <!-- comment -->>bar</div>").must_equal 'foo <div>&gt;bar</div>'
-      # Special case: the comment markup is inside a <script>, which makes it
-      # text content and not an actual HTML comment.
-      @s.fragment("<script><!-- comment --></script>").must_equal '&lt;!-- comment --&gt;'
-      Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => true, :elements => ['script'])
+      _(@s.fragment('foo <!-- comment --> bar')).must_equal 'foo <!-- comment --> bar'
+      _(@s.fragment('foo <!-- ')).must_equal 'foo <!-- -->'
+      _(@s.fragment('foo <!-- - -> bar')).must_equal 'foo <!-- - -> bar-->'
+      _(@s.fragment("foo <!--\n\n\n\n-->bar")).must_equal "foo <!--\n\n\n\n-->bar"
+      _(@s.fragment("foo <!-- <!-- <!-- --> --> -->bar")).must_equal 'foo <!-- <!-- <!-- --> --&gt; --&gt;bar'
+      _(@s.fragment("foo <div <!-- comment -->>bar</div>")).must_equal 'foo <div>&gt;bar</div>'
+      _(Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => true, :elements => ['script']))
         .must_equal '<script><!-- comment --></script>'
     end
   end

data/test/test_clean_css.rb CHANGED

@@ -10,15 +10,15 @@ describe 'Sanitize::Transformers::CSS::CleanAttribute' do
   end
   it 'should sanitize CSS properties in style attributes' do
-    @s.fragment(%[
+    _(@s.fragment(%[
       <div style="color: #fff; width: expression(alert(1)); /* <-- evil! */"></div>
-    ].strip).must_equal %[
-      <div style="color: #fff;  /* &lt;-- evil! */"></div>
+    ].strip)).must_equal %[
+      <div style="color: #fff;  /* <-- evil! */"></div>
     ].strip
   end
   it 'should remove the style attribute if the sanitized CSS is empty' do
-    @s.fragment('<div style="width: expression(alert(1))"></div>').
+    _(@s.fragment('<div style="width: expression(alert(1))"></div>')).
       must_equal '<div></div>'
   end
 end
@@ -46,7 +46,7 @@ describe 'Sanitize::Transformers::CSS::CleanElement' do
       </style>
     ].strip
-    @s.fragment(html).must_equal %[
+    _(@s.fragment(html)).must_equal %[
       <style>
       /* Yay CSS! */
       .foo { color: #fff; }
@@ -62,6 +62,6 @@ describe 'Sanitize::Transformers::CSS::CleanElement' do
   end
   it 'should remove the <style> element if the sanitized CSS is empty' do
-    @s.fragment('<style></style>').must_equal ''
+    _(@s.fragment('<style></style>')).must_equal ''
   end
 end

data/test/test_clean_doctype.rb CHANGED

@@ -11,18 +11,18 @@ describe 'Sanitize::Transformers::CleanDoctype' do
     end
     it 'should remove doctype declarations' do
-      @s.document('<!DOCTYPE html><html>foo</html>').must_equal "<html>foo</html>\n"
-      @s.fragment('<!DOCTYPE html>foo').must_equal 'foo'
+      _(@s.document('<!DOCTYPE html><html>foo</html>')).must_equal "<html>foo</html>"
+      _(@s.fragment('<!DOCTYPE html>foo')).must_equal 'foo'
     end
     it 'should not allow doctype definitions in fragments' do
-      @s.fragment('<!DOCTYPE html><html>foo</html>')
+      _(@s.fragment('<!DOCTYPE html><html>foo</html>'))
         .must_equal "foo"
-      @s.fragment('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
+      _(@s.fragment('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>'))
         .must_equal "foo"
-      @s.fragment("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>")
+      _(@s.fragment("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>"))
         .must_equal "foo"
     end
   end
@@ -33,38 +33,38 @@ describe 'Sanitize::Transformers::CleanDoctype' do
     end
     it 'should allow doctype declarations in documents' do
-      @s.document('<!DOCTYPE html><html>foo</html>')
-        .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
+      _(@s.document('<!DOCTYPE html><html>foo</html>'))
+        .must_equal "<!DOCTYPE html><html>foo</html>"
-      @s.document('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
-        .must_equal "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01//EN\">\n<html>foo</html>\n"
+      _(@s.document('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>'))
+        .must_equal "<!DOCTYPE html><html>foo</html>"
-      @s.document("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>")
-        .must_equal "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html>foo</html>\n"
+      _(@s.document("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>"))
+        .must_equal "<!DOCTYPE html><html>foo</html>"
     end
     it 'should not allow obviously invalid doctype declarations in documents' do
-      @s.document('<!DOCTYPE blah blah blah><html>foo</html>')
-        .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
+      _(@s.document('<!DOCTYPE blah blah blah><html>foo</html>'))
+        .must_equal "<!DOCTYPE html><html>foo</html>"
-      @s.document('<!DOCTYPE blah><html>foo</html>')
-        .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
+      _(@s.document('<!DOCTYPE blah><html>foo</html>'))
+        .must_equal "<!DOCTYPE html><html>foo</html>"
-      @s.document('<!DOCTYPE html BLAH "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
-        .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
+      _(@s.document('<!DOCTYPE html BLAH "-//W3C//DTD HTML 4.01//EN"><html>foo</html>'))
+        .must_equal "<!DOCTYPE html><html>foo</html>"
-      @s.document('<!whatever><html>foo</html>')
-        .must_equal "<html>foo</html>\n"
+      _(@s.document('<!whatever><html>foo</html>'))
+        .must_equal "<html>foo</html>"
     end
     it 'should not allow doctype definitions in fragments' do
-      @s.fragment('<!DOCTYPE html><html>foo</html>')
+      _(@s.fragment('<!DOCTYPE html><html>foo</html>'))
         .must_equal "foo"
-      @s.fragment('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
+      _(@s.fragment('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>'))
         .must_equal "foo"
-      @s.fragment("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>")
+      _(@s.fragment("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>"))
         .must_equal "foo"
     end
   end