RubyGems - sanitize - Versions diffs - 4.6.6 → 5.2.2 - Mend

sanitize 4.6.6 → 5.2.2

Potentially problematic release.

This version of sanitize might be problematic. Click here for more details.

Files changed (24)

checksums.yaml +4 -4
data/HISTORY.md +156 -16
data/README.md +62 -42
data/lib/sanitize.rb +38 -62
data/lib/sanitize/config/default.rb +10 -4
data/lib/sanitize/config/relaxed.rb +1 -1
data/lib/sanitize/css.rb +2 -2
data/lib/sanitize/transformers/clean_comment.rb +1 -1
data/lib/sanitize/transformers/clean_css.rb +3 -3
data/lib/sanitize/transformers/clean_doctype.rb +1 -1
data/lib/sanitize/transformers/clean_element.rb +54 -13
data/lib/sanitize/version.rb +1 -1
data/test/common.rb +0 -31
data/test/test_clean_comment.rb +1 -5
data/test/test_clean_css.rb +1 -1
data/test/test_clean_doctype.rb +8 -8
data/test/test_clean_element.rb +121 -26
data/test/test_malicious_html.rb +50 -7
data/test/test_parser.rb +3 -32
data/test/test_sanitize.rb +103 -18
data/test/test_sanitize_css.rb +43 -16
data/test/test_transformers.rb +29 -23
metadata +19 -21
data/test/test_unicode.rb +0 -95

data/lib/sanitize.rb CHANGED

@@ -19,6 +19,20 @@ require_relative 'sanitize/transformers/clean_element'
 class Sanitize
   attr_reader :config
+  # Matches one or more control characters that should be removed from HTML
+  # before parsing, as defined by the HTML living standard.
+  #
+  # -   https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
+  # -   https://infra.spec.whatwg.org/#control
+  REGEX_HTML_CONTROL_CHARACTERS = /[\u0001-\u0008\u000b\u000e-\u001f\u007f-\u009f]+/u
+  # Matches one or more non-characters that should be removed from HTML before
+  # parsing, as defined by the HTML living standard.
+  #
+  # -   https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
+  # -   https://infra.spec.whatwg.org/#noncharacter
+  REGEX_HTML_NON_CHARACTERS = /[\ufdd0-\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}]+/u
   # Matches an attribute value that could be treated by a browser as a URL
   # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
   # or more characters followed by a colon is considered a match, even if the
@@ -26,11 +40,12 @@ class Sanitize
   # IE6 and Opera will still parse).
   REGEX_PROTOCOL = /\A\s*([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i
-  # Matches Unicode characters that should be stripped from HTML before passing
-  # it to the parser.
+  # Matches one or more characters that should be stripped from HTML before
+  # parsing. This is a combination of `REGEX_HTML_CONTROL_CHARACTERS` and
+  # `REGEX_HTML_NON_CHARACTERS`.
   #
-  # http://www.w3.org/TR/unicode-xml/#Charlist
-  REGEX_UNSUITABLE_CHARS = /[\u0000\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u
+  # https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
+  REGEX_UNSUITABLE_CHARS = /(?:#{REGEX_HTML_CONTROL_CHARACTERS}|#{REGEX_HTML_NON_CHARACTERS})/u
   #--
   # Class Methods
@@ -39,7 +54,7 @@ class Sanitize
   # Returns a sanitized copy of the given full _html_ document, using the
   # settings in _config_ if specified.
   #
-  # When sanitizing a document, the `<html>` element must be whitelisted or an
+  # When sanitizing a document, the `<html>` element must be allowlisted or an
   # error will be raised. If this is undesirable, you should probably use
   # {#fragment} instead.
   def self.document(html, config = {})
@@ -102,13 +117,13 @@ class Sanitize
   # Returns a sanitized copy of the given _html_ document.
   #
-  # When sanitizing a document, the `<html>` element must be whitelisted or an
+  # When sanitizing a document, the `<html>` element must be allowlisted or an
   # error will be raised. If this is undesirable, you should probably use
   # {#fragment} instead.
   def document(html)
     return '' unless html
-    doc = Nokogiri::HTML5.parse(preprocess(html))
+    doc = Nokogiri::HTML5.parse(preprocess(html), **@config[:parser_options])
     node!(doc)
     to_html(doc)
   end
@@ -120,20 +135,7 @@ class Sanitize
   def fragment(html)
     return '' unless html
-    html = preprocess(html)
-    doc  = Nokogiri::HTML5.parse("<html><body>#{html}")
-    # Hack to allow fragments containing <body>. Borrowed from
-    # Nokogiri::HTML::DocumentFragment.
-    if html =~ /\A<body(?:\s|>)/i
-      path = '/html/body'
-    else
-      path = '/html/body/node()'
-    end
-    frag = doc.fragment
-    frag << doc.xpath(path)
+    frag = Nokogiri::HTML5.fragment(preprocess(html), **@config[:parser_options])
     node!(frag)
     to_html(frag)
   end
@@ -145,20 +147,20 @@ class Sanitize
   # in place.
   #
   # If _node_ is a `Nokogiri::XML::Document`, the `<html>` element must be
-  # whitelisted or an error will be raised.
+  # allowlisted or an error will be raised.
   def node!(node)
     raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
     if node.is_a?(Nokogiri::XML::Document)
       unless @config[:elements].include?('html')
-        raise Error, 'When sanitizing a document, "<html>" must be whitelisted.'
+        raise Error, 'When sanitizing a document, "<html>" must be allowlisted.'
       end
     end
-    node_whitelist = Set.new
+    node_allowlist = Set.new
     traverse(node) do |n|
-      transform_node!(n, node_whitelist)
+      transform_node!(n, node_allowlist)
     end
     node
@@ -184,40 +186,10 @@ class Sanitize
   end
   def to_html(node)
-    replace_meta = false
-    # Hacky workaround for a libxml2 bug that adds an undesired Content-Type
-    # meta tag to all serialized HTML documents.
-    #
-    # https://github.com/sparklemotion/nokogiri/issues/1008
-    if node.type == Nokogiri::XML::Node::DOCUMENT_NODE ||
-        node.type == Nokogiri::XML::Node::HTML_DOCUMENT_NODE
-      regex_meta   = %r|(<html[^>]*>\s*<head[^>]*>\s*)<meta http-equiv="Content-Type" content="text/html; charset=utf-8">|i
-      # Only replace the content-type meta tag if <meta> isn't whitelisted or
-      # the original document didn't actually include a content-type meta tag.
-      replace_meta = !@config[:elements].include?('meta') ||
-        node.xpath('/html/head/meta[@http-equiv]').none? do |meta|
-          meta['http-equiv'].casecmp('content-type').zero?
-        end
-    end
-    so = Nokogiri::XML::Node::SaveOptions
-    # Serialize to HTML without any formatting to prevent Nokogiri from adding
-    # newlines after certain tags.
-    html = node.to_html(
-      :encoding  => 'utf-8',
-      :indent    => 0,
-      :save_with => so::NO_DECLARATION | so::NO_EMPTY_TAGS | so::AS_HTML
-    )
-    html.gsub!(regex_meta, '\1') if replace_meta
-    html
+    node.to_html(preserve_newline: true)
   end
-  def transform_node!(node, node_whitelist)
+  def transform_node!(node, node_allowlist)
     @transformers.each do |transformer|
       # Since transform_node! may be called in a tight loop to process thousands
       # of items, we can optimize both memory and CPU performance by:
@@ -227,15 +199,19 @@ class Sanitize
       # does merge! create a new hash, it is also 2.6x slower:
       # https://github.com/JuanitoFatas/fast-ruby#hashmerge-vs-hashmerge-code
       config = @transformer_config
-      config[:is_whitelisted] = node_whitelist.include?(node)
+      config[:is_allowlisted] = config[:is_whitelisted] = node_allowlist.include?(node)
       config[:node] = node
       config[:node_name] = node.name.downcase
-      config[:node_whitelist] = node_whitelist
+      config[:node_allowlist] = config[:node_whitelist] = node_allowlist
-      result = transformer.call(config)
+      result = transformer.call(**config)
-      if result.is_a?(Hash) && result[:node_whitelist].respond_to?(:each)
-        node_whitelist.merge(result[:node_whitelist])
+      if result.is_a?(Hash)
+        result_allowlist = result[:node_allowlist] || result[:node_whitelist]
+        if result_allowlist.respond_to?(:each)
+          node_allowlist.merge(result_allowlist)
+        end
       end
     end

data/lib/sanitize/config/default.rb CHANGED

@@ -56,6 +56,10 @@ class Sanitize
       # that all HTML will be stripped).
       :elements => [],
+      # HTML parsing options to pass to Nokogumbo.
+      # https://github.com/rubys/nokogumbo/tree/v2.0.1#parsing-options
+      :parser_options => {},
       # URL handling protocols to allow in specific attributes. By default, no
       # protocols are allowed. Use :relative in place of a protocol if you want
       # to allow relative URLs sans protocol.
@@ -66,10 +70,12 @@ class Sanitize
       # leaves the safe parts of an element's contents behind when the element
       # is removed.
       #
-      # If this is an Array of element names, then only the contents of the
-      # specified elements (when filtered) will be removed, and the contents of
-      # all other filtered elements will be left behind.
-      :remove_contents => false,
+      # If this is an Array or Set of element names, then only the contents of
+      # the specified elements (when filtered) will be removed, and the contents
+      # of all other filtered elements will be left behind.
+      :remove_contents => %w[
+        iframe math noembed noframes noscript plaintext script style svg xmp
+      ],
       # Transformers allow you to filter or alter nodes using custom logic. See
       # README.md for details and examples.

data/lib/sanitize/config/relaxed.rb CHANGED

@@ -6,7 +6,7 @@ class Sanitize
       :elements => BASIC[:elements] + %w[
         address article aside bdi bdo body caption col colgroup data del div
         figcaption figure footer h1 h2 h3 h4 h5 h6 head header hgroup hr html
-        img ins main nav rp rt ruby section span style summary sup table tbody
+        img ins main nav rp rt ruby section span style summary table tbody
         td tfoot th thead title tr wbr
       ],

data/lib/sanitize/css.rb CHANGED

@@ -175,7 +175,7 @@ class Sanitize; class CSS
         next prop
       when :semicolon
-        # Only preserve the semicolon if it was preceded by a whitelisted
+        # Only preserve the semicolon if it was preceded by an allowlisted
         # property. Otherwise, omit it in order to prevent redundant semicolons.
         if preceded_by_property
           preceded_by_property = false
@@ -296,7 +296,7 @@ class Sanitize; class CSS
   end
   # Returns `true` if the given node (which may be of type `:url` or
-  # `:function`, since the CSS syntax can produce both) uses a whitelisted
+  # `:function`, since the CSS syntax can produce both) uses an allowlisted
   # protocol.
   def valid_url?(node)
     type = node[:node]

data/lib/sanitize/transformers/clean_comment.rb CHANGED

@@ -6,7 +6,7 @@ class Sanitize; module Transformers
     node = env[:node]
     if node.type == Nokogiri::XML::Node::COMMENT_NODE
-      node.unlink unless env[:is_whitelisted]
+      node.unlink unless env[:is_allowlisted]
     end
   end

data/lib/sanitize/transformers/clean_css.rb CHANGED

@@ -1,6 +1,6 @@
 class Sanitize; module Transformers; module CSS
-# Enforces a CSS whitelist on the contents of `style` attributes.
+# Enforces a CSS allowlist on the contents of `style` attributes.
 class CleanAttribute
   def initialize(sanitizer_or_config)
     if Sanitize::CSS === sanitizer_or_config
@@ -14,7 +14,7 @@ class CleanAttribute
     node = env[:node]
     return unless node.type == Nokogiri::XML::Node::ELEMENT_NODE &&
-        node.key?('style') && !env[:is_whitelisted]
+        node.key?('style') && !env[:is_allowlisted]
     attr = node.attribute('style')
     css  = @scss.properties(attr.value)
@@ -27,7 +27,7 @@ class CleanAttribute
   end
 end
-# Enforces a CSS whitelist on the contents of `<style>` elements.
+# Enforces a CSS allowlist on the contents of `<style>` elements.
 class CleanElement
   def initialize(sanitizer_or_config)
     if Sanitize::CSS === sanitizer_or_config

data/lib/sanitize/transformers/clean_doctype.rb CHANGED

@@ -3,7 +3,7 @@
 class Sanitize; module Transformers
   CleanDoctype = lambda do |env|
-    return if env[:is_whitelisted]
+    return if env[:is_allowlisted]
     node = env[:node]

data/lib/sanitize/transformers/clean_element.rb CHANGED

@@ -67,7 +67,7 @@ class Sanitize; module Transformers; class CleanElement
       @whitespace_elements = config[:whitespace_elements]
     end
-    if config[:remove_contents].is_a?(Set)
+    if config[:remove_contents].is_a?(Enumerable)
       @remove_element_contents.merge(config[:remove_contents].map(&:to_s))
     else
       @remove_all_contents = !!config[:remove_contents]
@@ -76,11 +76,11 @@ class Sanitize; module Transformers; class CleanElement
   def call(env)
     node = env[:node]
-    return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_whitelisted]
+    return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_allowlisted]
     name = env[:node_name]
-    # Delete any element that isn't in the config whitelist, unless the node has
+    # Delete any element that isn't in the config allowlist, unless the node has
     # already been deleted from the document.
     #
     # It's important that we not try to reparent the children of a node that has
@@ -97,28 +97,30 @@ class Sanitize; module Transformers; class CleanElement
         end
       end
-      unless @remove_all_contents || @remove_element_contents.include?(name)
-        node.add_previous_sibling(node.children)
+      unless node.children.empty?
+        unless @remove_all_contents || @remove_element_contents.include?(name)
+          node.add_previous_sibling(node.children)
+        end
       end
       node.unlink
       return
     end
-    attr_whitelist = @attributes[name] || @attributes[:all]
+    attr_allowlist = @attributes[name] || @attributes[:all]
-    if attr_whitelist.nil?
-      # Delete all attributes from elements with no whitelisted attributes.
+    if attr_allowlist.nil?
+      # Delete all attributes from elements with no allowlisted attributes.
       node.attribute_nodes.each {|attr| attr.unlink }
     else
-      allow_data_attributes = attr_whitelist.include?(:data)
+      allow_data_attributes = attr_allowlist.include?(:data)
       # Delete any attribute that isn't allowed on this element.
       node.attribute_nodes.each do |attr|
         attr_name = attr.name.downcase
-        unless attr_whitelist.include?(attr_name)
-          # The attribute isn't whitelisted.
+        unless attr_allowlist.include?(attr_name)
+          # The attribute isn't allowed.
           if allow_data_attributes && attr_name.start_with?('data-')
             # Arbitrary data attributes are allowed. If this is a data
@@ -132,7 +134,7 @@ class Sanitize; module Transformers; class CleanElement
           next
         end
-        # The attribute is whitelisted.
+        # The attribute is allowed.
         # Remove any attributes that use unacceptable protocols.
         if @protocols.include?(name) && @protocols[name].include?(attr_name)
@@ -160,12 +162,17 @@ class Sanitize; module Transformers; class CleanElement
         # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
         # attempt to preserve server-side includes. This can result in XSS since
         # an unescaped double quote can allow an attacker to inject a
-        # non-whitelisted attribute.
+        # non-allowlisted attribute.
         #
         # Sanitize works around this by implementing its own escaping for
         # affected attributes, some of which can exist on any element and some
         # of which can only exist on `<a>` elements.
         #
+        # This fix is technically no longer necessary with Nokogumbo >= 2.0
+        # since it no longer uses libxml2's serializer, but it's retained to
+        # avoid breaking use cases where people might be sanitizing individual
+        # Nokogiri nodes and then serializing them manually without Nokogumbo.
+        #
         # The relevant libxml2 code is here:
         # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
         if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
@@ -180,6 +187,40 @@ class Sanitize; module Transformers; class CleanElement
     if @add_attributes.include?(name)
       @add_attributes[name].each {|key, val| node[key] = val }
     end
+    # Element-specific special cases.
+    case name
+    # If this is an allowlisted iframe that has children, remove all its
+    # children. The HTML standard says iframes shouldn't have content, but when
+    # they do, this content is parsed as text and is serialized verbatim without
+    # being escaped, which is unsafe because legacy browsers may still render it
+    # and execute `<script>` content. So the safe and correct thing to do is to
+    # always remove iframe content.
+    when 'iframe'
+      if !node.children.empty?
+        node.children.each do |child|
+          child.unlink
+        end
+      end
+    # Prevent the use of `<meta>` elements that set a charset other than UTF-8,
+    # since Sanitize's output is always UTF-8.
+    when 'meta'
+      if node.has_attribute?('charset') &&
+          node['charset'].downcase != 'utf-8'
+        node['charset'] = 'utf-8'
+      end
+      if node.has_attribute?('http-equiv') &&
+          node.has_attribute?('content') &&
+          node['http-equiv'].downcase == 'content-type' &&
+          node['content'].downcase =~ /;\s*charset\s*=\s*(?!utf-8)/
+        node['content'] = node['content'].gsub(/;\s*charset\s*=.+\z/, ';charset=utf-8')
+      end
+    end
   end
 end; end; end

data/lib/sanitize/version.rb CHANGED

@@ -1,5 +1,5 @@
 # encoding: utf-8
 class Sanitize
-  VERSION = '4.6.6'
+  VERSION = '5.2.2'
 end

data/test/common.rb CHANGED

@@ -1,34 +1,3 @@
 # encoding: utf-8
-gem 'minitest'
 require 'minitest/autorun'
 require_relative '../lib/sanitize'
-# Helper to stub an instance method. Shamelessly stolen from
-# https://github.com/codeodor/minitest-stub_any_instance/
-class Object
-  def self.stub_instance(name, value, &block)
-    old_method = "__stubbed_method_#{name}__"
-    class_eval do
-      alias_method old_method, name
-      define_method(name) do |*args|
-        if value.respond_to?(:call) then
-          value.call(*args)
-        else
-          value
-        end
-      end
-    end
-    yield
-  ensure
-    class_eval do
-      undef_method name
-      alias_method name, old_method
-      undef_method old_method
-    end
-  end
-end

data/test/test_clean_comment.rb CHANGED

@@ -20,7 +20,7 @@ describe 'Sanitize::Transformers::CleanComment' do
       # Special case: the comment markup is inside a <script>, which makes it
       # text content and not an actual HTML comment.
-      @s.fragment("<script><!-- comment --></script>").must_equal '&lt;!-- comment --&gt;'
+      @s.fragment("<script><!-- comment --></script>").must_equal ''
       Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => false, :elements => ['script'])
         .must_equal '<script><!-- comment --></script>'
@@ -40,10 +40,6 @@ describe 'Sanitize::Transformers::CleanComment' do
       @s.fragment("foo <!-- <!-- <!-- --> --> -->bar").must_equal 'foo <!-- <!-- <!-- --> --&gt; --&gt;bar'
       @s.fragment("foo <div <!-- comment -->>bar</div>").must_equal 'foo <div>&gt;bar</div>'
-      # Special case: the comment markup is inside a <script>, which makes it
-      # text content and not an actual HTML comment.
-      @s.fragment("<script><!-- comment --></script>").must_equal '&lt;!-- comment --&gt;'
       Sanitize.fragment("<script><!-- comment --></script>", :allow_comments => true, :elements => ['script'])
         .must_equal '<script><!-- comment --></script>'
     end