RubyGems - sanitize - Versions diffs - 5.1.0 → 6.0.1 - Mend

sanitize 5.1.0 → 6.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of sanitize might be problematic. Click here for more details.

Files changed (25)

checksums.yaml +4 -4
data/HISTORY.md +155 -18
data/LICENSE +1 -1
data/README.md +67 -74
data/lib/sanitize/config/default.rb +6 -1
data/lib/sanitize/config/relaxed.rb +1 -1
data/lib/sanitize/css.rb +2 -2
data/lib/sanitize/transformers/clean_comment.rb +1 -1
data/lib/sanitize/transformers/clean_css.rb +3 -3
data/lib/sanitize/transformers/clean_doctype.rb +1 -1
data/lib/sanitize/transformers/clean_element.rb +62 -20
data/lib/sanitize/version.rb +1 -1
data/lib/sanitize.rb +17 -13
data/test/test_clean_comment.rb +16 -16
data/test/test_clean_css.rb +5 -5
data/test/test_clean_doctype.rb +15 -15
data/test/test_clean_element.rb +130 -97
data/test/test_config.rb +9 -9
data/test/test_malicious_css.rb +7 -7
data/test/test_malicious_html.rb +153 -30
data/test/test_parser.rb +9 -9
data/test/test_sanitize.rb +29 -29
data/test/test_sanitize_css.rb +57 -57
data/test/test_transformers.rb +48 -42
metadata +17 -31

data/lib/sanitize/transformers/clean_css.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 class Sanitize; module Transformers; module CSS
-# Enforces a CSS whitelist on the contents of `style` attributes.
+# Enforces a CSS allowlist on the contents of `style` attributes.
 class CleanAttribute
   def initialize(sanitizer_or_config)
     if Sanitize::CSS === sanitizer_or_config
@@ -14,7 +14,7 @@ class CleanAttribute
     node = env[:node]
     return unless node.type == Nokogiri::XML::Node::ELEMENT_NODE &&
-        node.key?('style') && !env[:is_whitelisted]
+        node.key?('style') && !env[:is_allowlisted]
     attr = node.attribute('style')
     css  = @scss.properties(attr.value)
@@ -27,7 +27,7 @@ class CleanAttribute
   end
 end
-# Enforces a CSS whitelist on the contents of `<style>` elements.
+# Enforces a CSS allowlist on the contents of `<style>` elements.
 class CleanElement
   def initialize(sanitizer_or_config)
     if Sanitize::CSS === sanitizer_or_config

data/lib/sanitize/transformers/clean_doctype.rb CHANGED Viewed

@@ -3,7 +3,7 @@
 class Sanitize; module Transformers
   CleanDoctype = lambda do |env|
-    return if env[:is_whitelisted]
+    return if env[:is_allowlisted]
     node = env[:node]

data/lib/sanitize/transformers/clean_element.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 # encoding: utf-8
+require 'cgi'
 require 'set'
 class Sanitize; module Transformers; class CleanElement
@@ -18,6 +19,18 @@ class Sanitize; module Transformers; class CleanElement
   # http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#embedding-custom-non-visible-data-with-the-data-*-attributes
   REGEX_DATA_ATTR = /\Adata-(?!xml)[a-z_][\w.\u00E0-\u00F6\u00F8-\u017F\u01DD-\u02AF-]*\z/u
+  # Elements whose content is treated as unescaped text by HTML parsers.
+  UNESCAPED_TEXT_ELEMENTS = Set.new(%w[
+    iframe
+    noembed
+    noframes
+    noscript
+    plaintext
+    script
+    style
+    xmp
+  ])
   # Attributes that need additional escaping on `<a>` elements due to unsafe
   # libxml2 behavior.
   UNSAFE_LIBXML_ATTRS_A = Set.new(%w[
@@ -76,11 +89,11 @@ class Sanitize; module Transformers; class CleanElement
   def call(env)
     node = env[:node]
-    return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_whitelisted]
+    return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_allowlisted]
     name = env[:node_name]
-    # Delete any element that isn't in the config whitelist, unless the node has
+    # Delete any element that isn't in the config allowlist, unless the node has
     # already been deleted from the document.
     #
     # It's important that we not try to reparent the children of a node that has
@@ -107,34 +120,31 @@ class Sanitize; module Transformers; class CleanElement
       return
     end
-    attr_whitelist = @attributes[name] || @attributes[:all]
+    attr_allowlist = @attributes[name] || @attributes[:all]
-    if attr_whitelist.nil?
-      # Delete all attributes from elements with no whitelisted attributes.
+    if attr_allowlist.nil?
+      # Delete all attributes from elements with no allowlisted attributes.
       node.attribute_nodes.each {|attr| attr.unlink }
     else
-      allow_data_attributes = attr_whitelist.include?(:data)
+      allow_data_attributes = attr_allowlist.include?(:data)
       # Delete any attribute that isn't allowed on this element.
       node.attribute_nodes.each do |attr|
         attr_name = attr.name.downcase
-        unless attr_whitelist.include?(attr_name)
-          # The attribute isn't whitelisted.
+        unless attr_allowlist.include?(attr_name)
+          # The attribute isn't in the allowlist, but may still be allowed if
+          # it's a data attribute.
-          if allow_data_attributes && attr_name.start_with?('data-')
-            # Arbitrary data attributes are allowed. If this is a data
-            # attribute, continue.
-            next if attr_name =~ REGEX_DATA_ATTR
+          unless allow_data_attributes && attr_name.start_with?('data-') && attr_name =~ REGEX_DATA_ATTR
+            # Either the attribute isn't a data attribute or arbitrary data
+            # attributes aren't allowed. Remove the attribute.
+            attr.unlink
+            next
           end
-          # Either the attribute isn't a data attribute or arbitrary data
-          # attributes aren't allowed. Remove the attribute.
-          attr.unlink
-          next
         end
-        # The attribute is whitelisted.
+        # The attribute is allowed.
         # Remove any attributes that use unacceptable protocols.
         if @protocols.include?(name) && @protocols[name].include?(attr_name)
@@ -162,7 +172,7 @@ class Sanitize; module Transformers; class CleanElement
         # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
         # attempt to preserve server-side includes. This can result in XSS since
         # an unescaped double quote can allow an attacker to inject a
-        # non-whitelisted attribute.
+        # non-allowlisted attribute.
         #
         # Sanitize works around this by implementing its own escaping for
         # affected attributes, some of which can exist on any element and some
@@ -188,10 +198,32 @@ class Sanitize; module Transformers; class CleanElement
       @add_attributes[name].each {|key, val| node[key] = val }
     end
+    # Make a best effort to ensure that text nodes in invalid "unescaped text"
+    # elements that are inside a math or svg namespace are properly escaped so
+    # that they don't get parsed as HTML.
+    #
+    # Sanitize is explicitly documented as not supporting MathML or SVG, but
+    # people sometimes allow `<math>` and `<svg>` elements in their custom
+    # configs without realizing that it's not safe. This workaround makes it
+    # slightly less unsafe, but you still shouldn't allow `<math>` or `<svg>`
+    # because Nokogiri doesn't parse them the same way browsers do and Sanitize
+    # can't guarantee that their contents are safe.
+    unless node.namespace.nil?
+      prefix = node.namespace.prefix
+      if (prefix == 'math' || prefix == 'svg') && UNESCAPED_TEXT_ELEMENTS.include?(name)
+        node.children.each do |child|
+          if child.type == Nokogiri::XML::Node::TEXT_NODE
+            child.content = CGI.escapeHTML(child.content)
+          end
+        end
+      end
+    end
     # Element-specific special cases.
     case name
-    # If this is a whitelisted iframe that has children, remove all its
+    # If this is an allowlisted iframe that has children, remove all its
     # children. The HTML standard says iframes shouldn't have content, but when
     # they do, this content is parsed as text and is serialized verbatim without
     # being escaped, which is unsafe because legacy browsers may still render it
@@ -220,6 +252,16 @@ class Sanitize; module Transformers; class CleanElement
         node['content'] = node['content'].gsub(/;\s*charset\s*=.+\z/, ';charset=utf-8')
       end
+    # A `<noscript>` element's content is parsed differently in browsers
+    # depending on whether or not scripting is enabled. Since Nokogiri doesn't
+    # support scripting, it always parses `<noscript>` elements as if scripting
+    # is disabled. This results in edge cases where it's not possible to
+    # reliably sanitize the contents of a `<noscript>` element because Nokogiri
+    # can't fully replicate the parsing behavior of a scripting-enabled browser.
+    # The safest thing to do is to simply remove all `<noscript>` elements.
+    when 'noscript'
+      node.unlink
     end
   end

data/lib/sanitize/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # encoding: utf-8
 class Sanitize
-  VERSION = '5.1.0'
+  VERSION = '6.0.1'
 end

data/lib/sanitize.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 # encoding: utf-8
-require 'nokogumbo'
+require 'nokogiri'
 require 'set'
 require_relative 'sanitize/version'
@@ -54,7 +54,7 @@ class Sanitize
   # Returns a sanitized copy of the given full _html_ document, using the
   # settings in _config_ if specified.
   #
-  # When sanitizing a document, the `<html>` element must be whitelisted or an
+  # When sanitizing a document, the `<html>` element must be allowlisted or an
   # error will be raised. If this is undesirable, you should probably use
   # {#fragment} instead.
   def self.document(html, config = {})
@@ -117,7 +117,7 @@ class Sanitize
   # Returns a sanitized copy of the given _html_ document.
   #
-  # When sanitizing a document, the `<html>` element must be whitelisted or an
+  # When sanitizing a document, the `<html>` element must be allowlisted or an
   # error will be raised. If this is undesirable, you should probably use
   # {#fragment} instead.
   def document(html)
@@ -147,20 +147,20 @@ class Sanitize
   # in place.
   #
   # If _node_ is a `Nokogiri::XML::Document`, the `<html>` element must be
-  # whitelisted or an error will be raised.
+  # allowlisted or an error will be raised.
   def node!(node)
     raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
     if node.is_a?(Nokogiri::XML::Document)
       unless @config[:elements].include?('html')
-        raise Error, 'When sanitizing a document, "<html>" must be whitelisted.'
+        raise Error, 'When sanitizing a document, "<html>" must be allowlisted.'
       end
     end
-    node_whitelist = Set.new
+    node_allowlist = Set.new
     traverse(node) do |n|
-      transform_node!(n, node_whitelist)
+      transform_node!(n, node_allowlist)
     end
     node
@@ -189,7 +189,7 @@ class Sanitize
     node.to_html(preserve_newline: true)
   end
-  def transform_node!(node, node_whitelist)
+  def transform_node!(node, node_allowlist)
     @transformers.each do |transformer|
       # Since transform_node! may be called in a tight loop to process thousands
       # of items, we can optimize both memory and CPU performance by:
@@ -199,15 +199,19 @@ class Sanitize
       # does merge! create a new hash, it is also 2.6x slower:
       # https://github.com/JuanitoFatas/fast-ruby#hashmerge-vs-hashmerge-code
       config = @transformer_config
-      config[:is_whitelisted] = node_whitelist.include?(node)
+      config[:is_allowlisted] = config[:is_whitelisted] = node_allowlist.include?(node)
       config[:node] = node
       config[:node_name] = node.name.downcase
-      config[:node_whitelist] = node_whitelist
+      config[:node_allowlist] = config[:node_whitelist] = node_allowlist
-      result = transformer.call(config)
+      result = transformer.call(**config)
-      if result.is_a?(Hash) && result[:node_whitelist].respond_to?(:each)
-        node_whitelist.merge(result[:node_whitelist])
+      if result.is_a?(Hash)
+        result_allowlist = result[:node_allowlist] || result[:node_whitelist]
+        if result_allowlist.respond_to?(:each)
+          node_allowlist.merge(result_allowlist)
+        end
       end
     end

data/test/test_clean_comment.rb CHANGED Viewed

@@ -11,18 +11,18 @@ describe 'Sanitize::Transformers::CleanComment' do
     end
     it 'should remove comments' do
-      @s.fragment('foo <!-- comment --> bar').must_equal 'foo  bar'
-      @s.fragment('foo <!-- ').must_equal 'foo '
-      @s.fragment('foo <!-- - -> bar').must_equal 'foo '
-      @s.fragment("foo <!--\n\n\n\n-->bar").must_equal 'foo bar'
-      @s.fragment("foo <!-- <!-- <!-- --> --> -->bar").must_equal 'foo  --&gt; --&gt;bar'
-      @s.fragment("foo <div <!-- comment -->>bar</div>").must_equal 'foo <div>&gt;bar</div>'
+      _(@s.fragment('foo <!-- comment --> bar')).must_equal 'foo  bar'
+      _(@s.fragment('foo <!-- ')).must_equal 'foo '
+      _(@s.fragment('foo <!-- - -> bar')).must_equal 'foo '
+      _(@s.fragment("foo <!--\n\n\n\n-->bar")).must_equal 'foo bar'
+      _(@s.fragment("foo <!-- <!-- <!-- --> --> -->bar")).must_equal 'foo  --&gt; --&gt;bar'
+      _(@s.fragment("foo <div <!-- comment -->>bar</div>")).must_equal 'foo <div>&gt;bar</div>'
       # Special case: the comment markup is inside a <script>, which makes it
       # text content and not an actual HTML comment.
-      @s.fragment("<script><!-- comment --></script>").must_equal ''
+      _(@s.fragment("<script><!-- comment --></script>")).must_equal ''
-      Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => false, :elements => ['script'])
+      _(Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => false, :elements => ['script']))
         .must_equal '<script><!-- comment --></script>'
     end
   end
@@ -33,14 +33,14 @@ describe 'Sanitize::Transformers::CleanComment' do
     end
     it 'should allow comments' do
-      @s.fragment('foo <!-- comment --> bar').must_equal 'foo <!-- comment --> bar'
-      @s.fragment('foo <!-- ').must_equal 'foo <!-- -->'
-      @s.fragment('foo <!-- - -> bar').must_equal 'foo <!-- - -> bar-->'
-      @s.fragment("foo <!--\n\n\n\n-->bar").must_equal "foo <!--\n\n\n\n-->bar"
-      @s.fragment("foo <!-- <!-- <!-- --> --> -->bar").must_equal 'foo <!-- <!-- <!-- --> --&gt; --&gt;bar'
-      @s.fragment("foo <div <!-- comment -->>bar</div>").must_equal 'foo <div>&gt;bar</div>'
-      Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => true, :elements => ['script'])
+      _(@s.fragment('foo <!-- comment --> bar')).must_equal 'foo <!-- comment --> bar'
+      _(@s.fragment('foo <!-- ')).must_equal 'foo <!-- -->'
+      _(@s.fragment('foo <!-- - -> bar')).must_equal 'foo <!-- - -> bar-->'
+      _(@s.fragment("foo <!--\n\n\n\n-->bar")).must_equal "foo <!--\n\n\n\n-->bar"
+      _(@s.fragment("foo <!-- <!-- <!-- --> --> -->bar")).must_equal 'foo <!-- <!-- <!-- --> --&gt; --&gt;bar'
+      _(@s.fragment("foo <div <!-- comment -->>bar</div>")).must_equal 'foo <div>&gt;bar</div>'
+      _(Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => true, :elements => ['script']))
         .must_equal '<script><!-- comment --></script>'
     end
   end

data/test/test_clean_css.rb CHANGED Viewed

@@ -10,15 +10,15 @@ describe 'Sanitize::Transformers::CSS::CleanAttribute' do
   end
   it 'should sanitize CSS properties in style attributes' do
-    @s.fragment(%[
+    _(@s.fragment(%[
       <div style="color: #fff; width: expression(alert(1)); /* <-- evil! */"></div>
-    ].strip).must_equal %[
+    ].strip)).must_equal %[
       <div style="color: #fff;  /* <-- evil! */"></div>
     ].strip
   end
   it 'should remove the style attribute if the sanitized CSS is empty' do
-    @s.fragment('<div style="width: expression(alert(1))"></div>').
+    _(@s.fragment('<div style="width: expression(alert(1))"></div>')).
       must_equal '<div></div>'
   end
 end
@@ -46,7 +46,7 @@ describe 'Sanitize::Transformers::CSS::CleanElement' do
       </style>
     ].strip
-    @s.fragment(html).must_equal %[
+    _(@s.fragment(html)).must_equal %[
       <style>
       /* Yay CSS! */
       .foo { color: #fff; }
@@ -62,6 +62,6 @@ describe 'Sanitize::Transformers::CSS::CleanElement' do
   end
   it 'should remove the <style> element if the sanitized CSS is empty' do
-    @s.fragment('<style></style>').must_equal ''
+    _(@s.fragment('<style></style>')).must_equal ''
   end
 end

data/test/test_clean_doctype.rb CHANGED Viewed

@@ -11,18 +11,18 @@ describe 'Sanitize::Transformers::CleanDoctype' do
     end
     it 'should remove doctype declarations' do
-      @s.document('<!DOCTYPE html><html>foo</html>').must_equal "<html>foo</html>"
-      @s.fragment('<!DOCTYPE html>foo').must_equal 'foo'
+      _(@s.document('<!DOCTYPE html><html>foo</html>')).must_equal "<html>foo</html>"
+      _(@s.fragment('<!DOCTYPE html>foo')).must_equal 'foo'
     end
     it 'should not allow doctype definitions in fragments' do
-      @s.fragment('<!DOCTYPE html><html>foo</html>')
+      _(@s.fragment('<!DOCTYPE html><html>foo</html>'))
         .must_equal "foo"
-      @s.fragment('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
+      _(@s.fragment('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>'))
         .must_equal "foo"
-      @s.fragment("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>")
+      _(@s.fragment("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>"))
         .must_equal "foo"
     end
   end
@@ -33,38 +33,38 @@ describe 'Sanitize::Transformers::CleanDoctype' do
     end
     it 'should allow doctype declarations in documents' do
-      @s.document('<!DOCTYPE html><html>foo</html>')
+      _(@s.document('<!DOCTYPE html><html>foo</html>'))
         .must_equal "<!DOCTYPE html><html>foo</html>"
-      @s.document('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
+      _(@s.document('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>'))
         .must_equal "<!DOCTYPE html><html>foo</html>"
-      @s.document("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>")
+      _(@s.document("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>"))
         .must_equal "<!DOCTYPE html><html>foo</html>"
     end
     it 'should not allow obviously invalid doctype declarations in documents' do
-      @s.document('<!DOCTYPE blah blah blah><html>foo</html>')
+      _(@s.document('<!DOCTYPE blah blah blah><html>foo</html>'))
         .must_equal "<!DOCTYPE html><html>foo</html>"
-      @s.document('<!DOCTYPE blah><html>foo</html>')
+      _(@s.document('<!DOCTYPE blah><html>foo</html>'))
         .must_equal "<!DOCTYPE html><html>foo</html>"
-      @s.document('<!DOCTYPE html BLAH "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
+      _(@s.document('<!DOCTYPE html BLAH "-//W3C//DTD HTML 4.01//EN"><html>foo</html>'))
         .must_equal "<!DOCTYPE html><html>foo</html>"
-      @s.document('<!whatever><html>foo</html>')
+      _(@s.document('<!whatever><html>foo</html>'))
         .must_equal "<html>foo</html>"
     end
     it 'should not allow doctype definitions in fragments' do
-      @s.fragment('<!DOCTYPE html><html>foo</html>')
+      _(@s.fragment('<!DOCTYPE html><html>foo</html>'))
         .must_equal "foo"
-      @s.fragment('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
+      _(@s.fragment('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>'))
         .must_equal "foo"
-      @s.fragment("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>")
+      _(@s.fragment("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>"))
         .must_equal "foo"
     end
   end