RubyGems - sanitize - Versions diffs - 4.6.6 → 6.0.0 - Mend

sanitize 4.6.6 → 6.0.0

Potentially problematic release.

This version of sanitize might be problematic. Click here for more details.

Files changed (25)

checksums.yaml +4 -4
data/HISTORY.md +176 -16
data/LICENSE +1 -1
data/README.md +65 -67
data/lib/sanitize/config/default.rb +10 -4
data/lib/sanitize/config/relaxed.rb +1 -1
data/lib/sanitize/css.rb +2 -2
data/lib/sanitize/transformers/clean_comment.rb +1 -1
data/lib/sanitize/transformers/clean_css.rb +3 -3
data/lib/sanitize/transformers/clean_doctype.rb +1 -1
data/lib/sanitize/transformers/clean_element.rb +60 -22
data/lib/sanitize/version.rb +1 -1
data/lib/sanitize.rb +39 -63
data/test/common.rb +0 -31
data/test/test_clean_comment.rb +1 -5
data/test/test_clean_css.rb +1 -1
data/test/test_clean_doctype.rb +8 -8
data/test/test_clean_element.rb +137 -26
data/test/test_malicious_html.rb +50 -7
data/test/test_parser.rb +3 -32
data/test/test_sanitize.rb +103 -18
data/test/test_sanitize_css.rb +43 -16
data/test/test_transformers.rb +29 -23
metadata +17 -33
data/test/test_unicode.rb +0 -95

data/lib/sanitize/transformers/clean_css.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 class Sanitize; module Transformers; module CSS
-# Enforces a CSS whitelist on the contents of `style` attributes.
+# Enforces a CSS allowlist on the contents of `style` attributes.
 class CleanAttribute
   def initialize(sanitizer_or_config)
     if Sanitize::CSS === sanitizer_or_config
@@ -14,7 +14,7 @@ class CleanAttribute
     node = env[:node]
     return unless node.type == Nokogiri::XML::Node::ELEMENT_NODE &&
-        node.key?('style') && !env[:is_whitelisted]
+        node.key?('style') && !env[:is_allowlisted]
     attr = node.attribute('style')
     css  = @scss.properties(attr.value)
@@ -27,7 +27,7 @@ class CleanAttribute
   end
 end
-# Enforces a CSS whitelist on the contents of `<style>` elements.
+# Enforces a CSS allowlist on the contents of `<style>` elements.
 class CleanElement
   def initialize(sanitizer_or_config)
     if Sanitize::CSS === sanitizer_or_config

data/lib/sanitize/transformers/clean_doctype.rb CHANGED Viewed

@@ -3,7 +3,7 @@
 class Sanitize; module Transformers
   CleanDoctype = lambda do |env|
-    return if env[:is_whitelisted]
+    return if env[:is_allowlisted]
     node = env[:node]

data/lib/sanitize/transformers/clean_element.rb CHANGED Viewed

@@ -67,7 +67,7 @@ class Sanitize; module Transformers; class CleanElement
       @whitespace_elements = config[:whitespace_elements]
     end
-    if config[:remove_contents].is_a?(Set)
+    if config[:remove_contents].is_a?(Enumerable)
       @remove_element_contents.merge(config[:remove_contents].map(&:to_s))
     else
       @remove_all_contents = !!config[:remove_contents]
@@ -76,11 +76,11 @@ class Sanitize; module Transformers; class CleanElement
   def call(env)
     node = env[:node]
-    return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_whitelisted]
+    return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_allowlisted]
     name = env[:node_name]
-    # Delete any element that isn't in the config whitelist, unless the node has
+    # Delete any element that isn't in the config allowlist, unless the node has
     # already been deleted from the document.
     #
     # It's important that we not try to reparent the children of a node that has
@@ -97,42 +97,41 @@ class Sanitize; module Transformers; class CleanElement
         end
       end
-      unless @remove_all_contents || @remove_element_contents.include?(name)
-        node.add_previous_sibling(node.children)
+      unless node.children.empty?
+        unless @remove_all_contents || @remove_element_contents.include?(name)
+          node.add_previous_sibling(node.children)
+        end
       end
       node.unlink
       return
     end
-    attr_whitelist = @attributes[name] || @attributes[:all]
+    attr_allowlist = @attributes[name] || @attributes[:all]
-    if attr_whitelist.nil?
-      # Delete all attributes from elements with no whitelisted attributes.
+    if attr_allowlist.nil?
+      # Delete all attributes from elements with no allowlisted attributes.
       node.attribute_nodes.each {|attr| attr.unlink }
     else
-      allow_data_attributes = attr_whitelist.include?(:data)
+      allow_data_attributes = attr_allowlist.include?(:data)
       # Delete any attribute that isn't allowed on this element.
       node.attribute_nodes.each do |attr|
         attr_name = attr.name.downcase
-        unless attr_whitelist.include?(attr_name)
-          # The attribute isn't whitelisted.
+        unless attr_allowlist.include?(attr_name)
+          # The attribute isn't in the allowlist, but may still be allowed if
+          # it's a data attribute.
-          if allow_data_attributes && attr_name.start_with?('data-')
-            # Arbitrary data attributes are allowed. If this is a data
-            # attribute, continue.
-            next if attr_name =~ REGEX_DATA_ATTR
+          unless allow_data_attributes && attr_name.start_with?('data-') && attr_name =~ REGEX_DATA_ATTR
+            # Either the attribute isn't a data attribute or arbitrary data
+            # attributes aren't allowed. Remove the attribute.
+            attr.unlink
+            next
           end
-          # Either the attribute isn't a data attribute or arbitrary data
-          # attributes aren't allowed. Remove the attribute.
-          attr.unlink
-          next
         end
-        # The attribute is whitelisted.
+        # The attribute is allowed.
         # Remove any attributes that use unacceptable protocols.
         if @protocols.include?(name) && @protocols[name].include?(attr_name)
@@ -160,12 +159,17 @@ class Sanitize; module Transformers; class CleanElement
         # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
         # attempt to preserve server-side includes. This can result in XSS since
         # an unescaped double quote can allow an attacker to inject a
-        # non-whitelisted attribute.
+        # non-allowlisted attribute.
         #
         # Sanitize works around this by implementing its own escaping for
         # affected attributes, some of which can exist on any element and some
         # of which can only exist on `<a>` elements.
         #
+        # This fix is technically no longer necessary with Nokogumbo >= 2.0
+        # since it no longer uses libxml2's serializer, but it's retained to
+        # avoid breaking use cases where people might be sanitizing individual
+        # Nokogiri nodes and then serializing them manually without Nokogumbo.
+        #
         # The relevant libxml2 code is here:
         # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
         if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
@@ -180,6 +184,40 @@ class Sanitize; module Transformers; class CleanElement
     if @add_attributes.include?(name)
       @add_attributes[name].each {|key, val| node[key] = val }
     end
+    # Element-specific special cases.
+    case name
+    # If this is an allowlisted iframe that has children, remove all its
+    # children. The HTML standard says iframes shouldn't have content, but when
+    # they do, this content is parsed as text and is serialized verbatim without
+    # being escaped, which is unsafe because legacy browsers may still render it
+    # and execute `<script>` content. So the safe and correct thing to do is to
+    # always remove iframe content.
+    when 'iframe'
+      if !node.children.empty?
+        node.children.each do |child|
+          child.unlink
+        end
+      end
+    # Prevent the use of `<meta>` elements that set a charset other than UTF-8,
+    # since Sanitize's output is always UTF-8.
+    when 'meta'
+      if node.has_attribute?('charset') &&
+          node['charset'].downcase != 'utf-8'
+        node['charset'] = 'utf-8'
+      end
+      if node.has_attribute?('http-equiv') &&
+          node.has_attribute?('content') &&
+          node['http-equiv'].downcase == 'content-type' &&
+          node['content'].downcase =~ /;\s*charset\s*=\s*(?!utf-8)/
+        node['content'] = node['content'].gsub(/;\s*charset\s*=.+\z/, ';charset=utf-8')
+      end
+    end
   end
 end; end; end

data/lib/sanitize/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # encoding: utf-8
 class Sanitize
-  VERSION = '4.6.6'
+  VERSION = '6.0.0'
 end

data/lib/sanitize.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 # encoding: utf-8
-require 'nokogumbo'
+require 'nokogiri'
 require 'set'
 require_relative 'sanitize/version'
@@ -19,6 +19,20 @@ require_relative 'sanitize/transformers/clean_element'
 class Sanitize
   attr_reader :config
+  # Matches one or more control characters that should be removed from HTML
+  # before parsing, as defined by the HTML living standard.
+  #
+  # -   https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
+  # -   https://infra.spec.whatwg.org/#control
+  REGEX_HTML_CONTROL_CHARACTERS = /[\u0001-\u0008\u000b\u000e-\u001f\u007f-\u009f]+/u
+  # Matches one or more non-characters that should be removed from HTML before
+  # parsing, as defined by the HTML living standard.
+  #
+  # -   https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
+  # -   https://infra.spec.whatwg.org/#noncharacter
+  REGEX_HTML_NON_CHARACTERS = /[\ufdd0-\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}]+/u
   # Matches an attribute value that could be treated by a browser as a URL
   # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
   # or more characters followed by a colon is considered a match, even if the
@@ -26,11 +40,12 @@ class Sanitize
   # IE6 and Opera will still parse).
   REGEX_PROTOCOL = /\A\s*([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i
-  # Matches Unicode characters that should be stripped from HTML before passing
-  # it to the parser.
+  # Matches one or more characters that should be stripped from HTML before
+  # parsing. This is a combination of `REGEX_HTML_CONTROL_CHARACTERS` and
+  # `REGEX_HTML_NON_CHARACTERS`.
   #
-  # http://www.w3.org/TR/unicode-xml/#Charlist
-  REGEX_UNSUITABLE_CHARS = /[\u0000\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u
+  # https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
+  REGEX_UNSUITABLE_CHARS = /(?:#{REGEX_HTML_CONTROL_CHARACTERS}|#{REGEX_HTML_NON_CHARACTERS})/u
   #--
   # Class Methods
@@ -39,7 +54,7 @@ class Sanitize
   # Returns a sanitized copy of the given full _html_ document, using the
   # settings in _config_ if specified.
   #
-  # When sanitizing a document, the `<html>` element must be whitelisted or an
+  # When sanitizing a document, the `<html>` element must be allowlisted or an
   # error will be raised. If this is undesirable, you should probably use
   # {#fragment} instead.
   def self.document(html, config = {})
@@ -102,13 +117,13 @@ class Sanitize
   # Returns a sanitized copy of the given _html_ document.
   #
-  # When sanitizing a document, the `<html>` element must be whitelisted or an
+  # When sanitizing a document, the `<html>` element must be allowlisted or an
   # error will be raised. If this is undesirable, you should probably use
   # {#fragment} instead.
   def document(html)
     return '' unless html
-    doc = Nokogiri::HTML5.parse(preprocess(html))
+    doc = Nokogiri::HTML5.parse(preprocess(html), **@config[:parser_options])
     node!(doc)
     to_html(doc)
   end
@@ -120,20 +135,7 @@ class Sanitize
   def fragment(html)
     return '' unless html
-    html = preprocess(html)
-    doc  = Nokogiri::HTML5.parse("<html><body>#{html}")
-    # Hack to allow fragments containing <body>. Borrowed from
-    # Nokogiri::HTML::DocumentFragment.
-    if html =~ /\A<body(?:\s|>)/i
-      path = '/html/body'
-    else
-      path = '/html/body/node()'
-    end
-    frag = doc.fragment
-    frag << doc.xpath(path)
+    frag = Nokogiri::HTML5.fragment(preprocess(html), **@config[:parser_options])
     node!(frag)
     to_html(frag)
   end
@@ -145,20 +147,20 @@ class Sanitize
   # in place.
   #
   # If _node_ is a `Nokogiri::XML::Document`, the `<html>` element must be
-  # whitelisted or an error will be raised.
+  # allowlisted or an error will be raised.
   def node!(node)
     raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
     if node.is_a?(Nokogiri::XML::Document)
       unless @config[:elements].include?('html')
-        raise Error, 'When sanitizing a document, "<html>" must be whitelisted.'
+        raise Error, 'When sanitizing a document, "<html>" must be allowlisted.'
       end
     end
-    node_whitelist = Set.new
+    node_allowlist = Set.new
     traverse(node) do |n|
-      transform_node!(n, node_whitelist)
+      transform_node!(n, node_allowlist)
     end
     node
@@ -184,40 +186,10 @@ class Sanitize
   end
   def to_html(node)
-    replace_meta = false
-    # Hacky workaround for a libxml2 bug that adds an undesired Content-Type
-    # meta tag to all serialized HTML documents.
-    #
-    # https://github.com/sparklemotion/nokogiri/issues/1008
-    if node.type == Nokogiri::XML::Node::DOCUMENT_NODE ||
-        node.type == Nokogiri::XML::Node::HTML_DOCUMENT_NODE
-      regex_meta   = %r|(<html[^>]*>\s*<head[^>]*>\s*)<meta http-equiv="Content-Type" content="text/html; charset=utf-8">|i
-      # Only replace the content-type meta tag if <meta> isn't whitelisted or
-      # the original document didn't actually include a content-type meta tag.
-      replace_meta = !@config[:elements].include?('meta') ||
-        node.xpath('/html/head/meta[@http-equiv]').none? do |meta|
-          meta['http-equiv'].casecmp('content-type').zero?
-        end
-    end
-    so = Nokogiri::XML::Node::SaveOptions
-    # Serialize to HTML without any formatting to prevent Nokogiri from adding
-    # newlines after certain tags.
-    html = node.to_html(
-      :encoding  => 'utf-8',
-      :indent    => 0,
-      :save_with => so::NO_DECLARATION | so::NO_EMPTY_TAGS | so::AS_HTML
-    )
-    html.gsub!(regex_meta, '\1') if replace_meta
-    html
+    node.to_html(preserve_newline: true)
   end
-  def transform_node!(node, node_whitelist)
+  def transform_node!(node, node_allowlist)
     @transformers.each do |transformer|
       # Since transform_node! may be called in a tight loop to process thousands
       # of items, we can optimize both memory and CPU performance by:
@@ -227,15 +199,19 @@ class Sanitize
       # does merge! create a new hash, it is also 2.6x slower:
       # https://github.com/JuanitoFatas/fast-ruby#hashmerge-vs-hashmerge-code
       config = @transformer_config
-      config[:is_whitelisted] = node_whitelist.include?(node)
+      config[:is_allowlisted] = config[:is_whitelisted] = node_allowlist.include?(node)
       config[:node] = node
       config[:node_name] = node.name.downcase
-      config[:node_whitelist] = node_whitelist
+      config[:node_allowlist] = config[:node_whitelist] = node_allowlist
-      result = transformer.call(config)
+      result = transformer.call(**config)
-      if result.is_a?(Hash) && result[:node_whitelist].respond_to?(:each)
-        node_whitelist.merge(result[:node_whitelist])
+      if result.is_a?(Hash)
+        result_allowlist = result[:node_allowlist] || result[:node_whitelist]
+        if result_allowlist.respond_to?(:each)
+          node_allowlist.merge(result_allowlist)
+        end
       end
     end

data/test/common.rb CHANGED Viewed

@@ -1,34 +1,3 @@
 # encoding: utf-8
-gem 'minitest'
 require 'minitest/autorun'
 require_relative '../lib/sanitize'
-# Helper to stub an instance method. Shamelessly stolen from
-# https://github.com/codeodor/minitest-stub_any_instance/
-class Object
-  def self.stub_instance(name, value, &block)
-    old_method = "__stubbed_method_#{name}__"
-    class_eval do
-      alias_method old_method, name
-      define_method(name) do |*args|
-        if value.respond_to?(:call) then
-          value.call(*args)
-        else
-          value
-        end
-      end
-    end
-    yield
-  ensure
-    class_eval do
-      undef_method name
-      alias_method name, old_method
-      undef_method old_method
-    end
-  end
-end

data/test/test_clean_comment.rb CHANGED Viewed

@@ -20,7 +20,7 @@ describe 'Sanitize::Transformers::CleanComment' do
       # Special case: the comment markup is inside a <script>, which makes it
       # text content and not an actual HTML comment.
-      @s.fragment("<script><!-- comment --></script>").must_equal '&lt;!-- comment --&gt;'
+      @s.fragment("<script><!-- comment --></script>").must_equal ''
       Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => false, :elements => ['script'])
         .must_equal '<script><!-- comment --></script>'
@@ -40,10 +40,6 @@ describe 'Sanitize::Transformers::CleanComment' do
       @s.fragment("foo <!-- <!-- <!-- --> --> -->bar").must_equal 'foo <!-- <!-- <!-- --> --&gt; --&gt;bar'
       @s.fragment("foo <div <!-- comment -->>bar</div>").must_equal 'foo <div>&gt;bar</div>'
-      # Special case: the comment markup is inside a <script>, which makes it
-      # text content and not an actual HTML comment.
-      @s.fragment("<script><!-- comment --></script>").must_equal '&lt;!-- comment --&gt;'
       Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => true, :elements => ['script'])
         .must_equal '<script><!-- comment --></script>'
     end

data/test/test_clean_css.rb CHANGED Viewed

@@ -13,7 +13,7 @@ describe 'Sanitize::Transformers::CSS::CleanAttribute' do
     @s.fragment(%[
       <div style="color: #fff; width: expression(alert(1)); /* <-- evil! */"></div>
     ].strip).must_equal %[
-      <div style="color: #fff;  /* &lt;-- evil! */"></div>
+      <div style="color: #fff;  /* <-- evil! */"></div>
     ].strip
   end

data/test/test_clean_doctype.rb CHANGED Viewed

@@ -11,7 +11,7 @@ describe 'Sanitize::Transformers::CleanDoctype' do
     end
     it 'should remove doctype declarations' do
-      @s.document('<!DOCTYPE html><html>foo</html>').must_equal "<html>foo</html>\n"
+      @s.document('<!DOCTYPE html><html>foo</html>').must_equal "<html>foo</html>"
       @s.fragment('<!DOCTYPE html>foo').must_equal 'foo'
     end
@@ -34,27 +34,27 @@ describe 'Sanitize::Transformers::CleanDoctype' do
     it 'should allow doctype declarations in documents' do
       @s.document('<!DOCTYPE html><html>foo</html>')
-        .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
+        .must_equal "<!DOCTYPE html><html>foo</html>"
       @s.document('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
-        .must_equal "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01//EN\">\n<html>foo</html>\n"
+        .must_equal "<!DOCTYPE html><html>foo</html>"
       @s.document("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html>foo</html>")
-        .must_equal "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html>foo</html>\n"
+        .must_equal "<!DOCTYPE html><html>foo</html>"
     end
     it 'should not allow obviously invalid doctype declarations in documents' do
       @s.document('<!DOCTYPE blah blah blah><html>foo</html>')
-        .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
+        .must_equal "<!DOCTYPE html><html>foo</html>"
       @s.document('<!DOCTYPE blah><html>foo</html>')
-        .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
+        .must_equal "<!DOCTYPE html><html>foo</html>"
       @s.document('<!DOCTYPE html BLAH "-//W3C//DTD HTML 4.01//EN"><html>foo</html>')
-        .must_equal "<!DOCTYPE html>\n<html>foo</html>\n"
+        .must_equal "<!DOCTYPE html><html>foo</html>"
       @s.document('<!whatever><html>foo</html>')
-        .must_equal "<html>foo</html>\n"
+        .must_equal "<html>foo</html>"
     end
     it 'should not allow doctype definitions in fragments' do