RubyGems - sanitize - Versions diffs - 2.1.1 → 6.0.0 - Mend

sanitize 2.1.1 → 6.0.0

Potentially problematic release.

This version of sanitize might be problematic. Click here for more details.

Files changed (30) hide show

checksums.yaml +4 -4
data/HISTORY.md +520 -55
data/LICENSE +1 -1
data/README.md +438 -168
data/lib/sanitize/config/basic.rb +12 -32
data/lib/sanitize/config/default.rb +118 -0
data/lib/sanitize/config/relaxed.rb +716 -53
data/lib/sanitize/config/restricted.rb +3 -23
data/lib/sanitize/config.rb +53 -79
data/lib/sanitize/css.rb +348 -0
data/lib/sanitize/transformers/clean_cdata.rb +3 -3
data/lib/sanitize/transformers/clean_comment.rb +6 -3
data/lib/sanitize/transformers/clean_css.rb +57 -0
data/lib/sanitize/transformers/clean_doctype.rb +19 -0
data/lib/sanitize/transformers/clean_element.rb +192 -124
data/lib/sanitize/version.rb +3 -1
data/lib/sanitize.rb +172 -143
data/test/common.rb +3 -0
data/test/test_clean_comment.rb +47 -0
data/test/test_clean_css.rb +67 -0
data/test/test_clean_doctype.rb +71 -0
data/test/test_clean_element.rb +545 -0
data/test/test_config.rb +65 -0
data/test/test_malicious_css.rb +42 -0
data/test/test_malicious_html.rb +235 -0
data/test/test_parser.rb +75 -0
data/test/test_sanitize.rb +151 -675
data/test/test_sanitize_css.rb +424 -0
data/test/test_transformers.rb +230 -0
metadata +44 -41

data/lib/sanitize/transformers/clean_element.rb CHANGED Viewed

@@ -1,155 +1,223 @@
-class Sanitize; module Transformers
-  class CleanElement
-    # Attributes that need additional escaping on `<a>` elements due to unsafe
-    # libxml2 behavior.
-    UNSAFE_LIBXML_ATTRS_A = Set.new(%w[
-      name
-    ])
-    # Attributes that need additional escaping on all elements due to unsafe
-    # libxml2 behavior.
-    UNSAFE_LIBXML_ATTRS_GLOBAL = Set.new(%w[
-      action
-      href
-      src
-    ])
-    # Mapping of original characters to escape sequences for characters that
-    # should be escaped in attributes affected by unsafe libxml2 behavior.
-    UNSAFE_LIBXML_ESCAPE_CHARS = {
-      ' ' => '%20',
-      '"' => '%22'
-    }
-    # Regex that matches any single character that needs to be escaped in
-    # attributes affected by unsafe libxml2 behavior.
-    UNSAFE_LIBXML_ESCAPE_REGEX = /[ "]/
-    def initialize(config)
-      @config = config
-      # For faster lookups.
-      @add_attributes          = config[:add_attributes]
-      @allowed_elements        = Set.new(config[:elements])
-      @attributes              = config[:attributes]
-      @protocols               = config[:protocols]
-      @remove_all_contents     = false
-      @remove_element_contents = Set.new
-      @whitespace_elements     = Set.new(config[:whitespace_elements])
-      if config[:remove_contents].is_a?(Array)
-        @remove_element_contents.merge(config[:remove_contents].map(&:to_s))
-      else
-        @remove_all_contents = !!config[:remove_contents]
+# encoding: utf-8
+require 'set'
+class Sanitize; module Transformers; class CleanElement
+  # Matches a valid HTML5 data attribute name. The unicode ranges included here
+  # are a conservative subset of the full range of characters that are
+  # technically allowed, with the intent of matching the most common characters
+  # used in data attribute names while excluding uncommon or potentially
+  # misleading characters, or characters with the potential to be normalized
+  # into unsafe or confusing forms.
+  #
+  # If you need data attr names with characters that aren't included here (such
+  # as combining marks, full-width characters, or CJK), please consider creating
+  # a custom transformer to validate attributes according to your needs.
+  #
+  # http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#embedding-custom-non-visible-data-with-the-data-*-attributes
+  REGEX_DATA_ATTR = /\Adata-(?!xml)[a-z_][\w.\u00E0-\u00F6\u00F8-\u017F\u01DD-\u02AF-]*\z/u
+  # Attributes that need additional escaping on `<a>` elements due to unsafe
+  # libxml2 behavior.
+  UNSAFE_LIBXML_ATTRS_A = Set.new(%w[
+    name
+  ])
+  # Attributes that need additional escaping on all elements due to unsafe
+  # libxml2 behavior.
+  UNSAFE_LIBXML_ATTRS_GLOBAL = Set.new(%w[
+    action
+    href
+    src
+  ])
+  # Mapping of original characters to escape sequences for characters that
+  # should be escaped in attributes affected by unsafe libxml2 behavior.
+  UNSAFE_LIBXML_ESCAPE_CHARS = {
+    ' ' => '%20',
+    '"' => '%22'
+  }
+  # Regex that matches any single character that needs to be escaped in
+  # attributes affected by unsafe libxml2 behavior.
+  UNSAFE_LIBXML_ESCAPE_REGEX = /[ "]/
+  def initialize(config)
+    @add_attributes          = config[:add_attributes]
+    @attributes              = config[:attributes].dup
+    @elements                = config[:elements]
+    @protocols               = config[:protocols]
+    @remove_all_contents     = false
+    @remove_element_contents = Set.new
+    @whitespace_elements     = {}
+    @attributes.each do |element_name, attrs|
+      unless element_name == :all
+        @attributes[element_name] = Set.new(attrs).merge(@attributes[:all] || [])
       end
     end
-    def call(env)
-      name = env[:node_name]
-      node = env[:node]
-      return if env[:is_whitelisted] || !node.element?
+    # Backcompat: if :whitespace_elements is a Set, convert it to a hash.
+    if config[:whitespace_elements].is_a?(Set)
+      config[:whitespace_elements].each do |element|
+        @whitespace_elements[element] = {:before => ' ', :after => ' '}
+      end
+    else
+      @whitespace_elements = config[:whitespace_elements]
+    end
-      # Delete any element that isn't in the config whitelist.
-      unless @allowed_elements.include?(name)
-        # Elements like br, div, p, etc. need to be replaced with whitespace in
-        # order to preserve readability.
-        if @whitespace_elements.include?(name)
-          node.add_previous_sibling(Nokogiri::XML::Text.new(' ', node.document))
+    if config[:remove_contents].is_a?(Enumerable)
+      @remove_element_contents.merge(config[:remove_contents].map(&:to_s))
+    else
+      @remove_all_contents = !!config[:remove_contents]
+    end
+  end
-          unless node.children.empty?
-            node.add_next_sibling(Nokogiri::XML::Text.new(' ', node.document))
-          end
+  def call(env)
+    node = env[:node]
+    return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_allowlisted]
+    name = env[:node_name]
+    # Delete any element that isn't in the config allowlist, unless the node has
+    # already been deleted from the document.
+    #
+    # It's important that we not try to reparent the children of a node that has
+    # already been deleted, since that seems to trigger a memory leak in
+    # Nokogiri.
+    unless @elements.include?(name) || node.parent.nil?
+      # Elements like br, div, p, etc. need to be replaced with whitespace in
+      # order to preserve readability.
+      if @whitespace_elements.include?(name)
+        node.add_previous_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:before].to_s, node.document))
+        unless node.children.empty?
+          node.add_next_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:after].to_s, node.document))
         end
+      end
+      unless node.children.empty?
         unless @remove_all_contents || @remove_element_contents.include?(name)
-          node.children.each {|n| node.add_previous_sibling(n) }
+          node.add_previous_sibling(node.children)
         end
-        node.unlink
-        return
       end
-      attr_whitelist = Set.new((@attributes[name] || []) +
-          (@attributes[:all] || []))
-      allow_data_attributes = attr_whitelist.include?(:data)
-      if attr_whitelist.empty?
-        # Delete all attributes from elements with no whitelisted attributes.
-        node.attribute_nodes.each {|attr| attr.unlink }
-      else
-        # Delete any attribute that isn't allowed on this element.
-        node.attribute_nodes.each do |attr|
-          attr_name = attr.name.downcase
-          unless attr_whitelist.include?(attr_name)
-            # The attribute isn't explicitly whitelisted.
-            if allow_data_attributes && attr_name.start_with?('data-')
-              # Arbitrary data attributes are allowed. Verify that the attribute
-              # is a valid data attribute.
-              attr.unlink unless attr_name =~ REGEX_DATA_ATTR
-            else
-              # Either the attribute isn't a data attribute, or arbitrary data
-              # attributes aren't allowed. Remove the attribute.
-              attr.unlink
-            end
+      node.unlink
+      return
+    end
+    attr_allowlist = @attributes[name] || @attributes[:all]
+    if attr_allowlist.nil?
+      # Delete all attributes from elements with no allowlisted attributes.
+      node.attribute_nodes.each {|attr| attr.unlink }
+    else
+      allow_data_attributes = attr_allowlist.include?(:data)
+      # Delete any attribute that isn't allowed on this element.
+      node.attribute_nodes.each do |attr|
+        attr_name = attr.name.downcase
+        unless attr_allowlist.include?(attr_name)
+          # The attribute isn't in the allowlist, but may still be allowed if
+          # it's a data attribute.
+          unless allow_data_attributes && attr_name.start_with?('data-') && attr_name =~ REGEX_DATA_ATTR
+            # Either the attribute isn't a data attribute or arbitrary data
+            # attributes aren't allowed. Remove the attribute.
+            attr.unlink
+            next
           end
         end
-        # Delete remaining attributes that use unacceptable protocols.
-        if @protocols.has_key?(name)
-          protocol = @protocols[name]
+        # The attribute is allowed.
-          node.attribute_nodes.each do |attr|
-            attr_name = attr.name.downcase
-            next false unless protocol.has_key?(attr_name)
+        # Remove any attributes that use unacceptable protocols.
+        if @protocols.include?(name) && @protocols[name].include?(attr_name)
+          attr_protocols = @protocols[name][attr_name]
-            del = if attr.value.to_s.downcase =~ REGEX_PROTOCOL
-              !protocol[attr_name].include?($1.downcase)
-            else
-              !protocol[attr_name].include?(:relative)
+          if attr.value =~ REGEX_PROTOCOL
+            unless attr_protocols.include?($1.downcase)
+              attr.unlink
+              next
             end
-            if del
+          else
+            unless attr_protocols.include?(:relative)
               attr.unlink
-            else
-              # Leading and trailing whitespace around URLs is ignored at parse
-              # time. Stripping it here prevents it from being escaped by the
-              # libxml2 workaround below.
-              attr.value = attr.value.strip
+              next
             end
           end
+          # Leading and trailing whitespace around URLs is ignored at parse
+          # time. Stripping it here prevents it from being escaped by the
+          # libxml2 workaround below.
+          attr.value = attr.value.strip
         end
-      end
-      # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
-      # attempt to preserve server-side includes. This can result in XSS since
-      # an unescaped double quote can allow an attacker to inject a
-      # non-whitelisted attribute.
-      #
-      # Sanitize works around this by implementing its own escaping for
-      # affected attributes, some of which can exist on any element and some
-      # of which can only exist on `<a>` elements.
-      #
-      # The relevant libxml2 code is here:
-      # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
-      node.attribute_nodes.each do |attr|
-        attr_name = attr.name.downcase
+        # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
+        # attempt to preserve server-side includes. This can result in XSS since
+        # an unescaped double quote can allow an attacker to inject a
+        # non-allowlisted attribute.
+        #
+        # Sanitize works around this by implementing its own escaping for
+        # affected attributes, some of which can exist on any element and some
+        # of which can only exist on `<a>` elements.
+        #
+        # This fix is technically no longer necessary with Nokogumbo >= 2.0
+        # since it no longer uses libxml2's serializer, but it's retained to
+        # avoid breaking use cases where people might be sanitizing individual
+        # Nokogiri nodes and then serializing them manually without Nokogumbo.
+        #
+        # The relevant libxml2 code is here:
+        # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
         if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
-          (name == 'a' && UNSAFE_LIBXML_ATTRS_A.include?(attr_name))
-            attr.value = attr.value.gsub(UNSAFE_LIBXML_ESCAPE_REGEX, UNSAFE_LIBXML_ESCAPE_CHARS)
+            (name == 'a' && UNSAFE_LIBXML_ATTRS_A.include?(attr_name))
+          attr.value = attr.value.gsub(UNSAFE_LIBXML_ESCAPE_REGEX, UNSAFE_LIBXML_ESCAPE_CHARS)
         end
       end
+    end
+    # Add required attributes.
+    if @add_attributes.include?(name)
+      @add_attributes[name].each {|key, val| node[key] = val }
+    end
+    # Element-specific special cases.
+    case name
+    # If this is an allowlisted iframe that has children, remove all its
+    # children. The HTML standard says iframes shouldn't have content, but when
+    # they do, this content is parsed as text and is serialized verbatim without
+    # being escaped, which is unsafe because legacy browsers may still render it
+    # and execute `<script>` content. So the safe and correct thing to do is to
+    # always remove iframe content.
+    when 'iframe'
+      if !node.children.empty?
+        node.children.each do |child|
+          child.unlink
+        end
+      end
+    # Prevent the use of `<meta>` elements that set a charset other than UTF-8,
+    # since Sanitize's output is always UTF-8.
+    when 'meta'
+      if node.has_attribute?('charset') &&
+          node['charset'].downcase != 'utf-8'
+        node['charset'] = 'utf-8'
+      end
+      if node.has_attribute?('http-equiv') &&
+          node.has_attribute?('content') &&
+          node['http-equiv'].downcase == 'content-type' &&
+          node['content'].downcase =~ /;\s*charset\s*=\s*(?!utf-8)/
-      # Add required attributes.
-      if @add_attributes.has_key?(name)
-        @add_attributes[name].each {|key, val| node[key] = val }
+        node['content'] = node['content'].gsub(/;\s*charset\s*=.+\z/, ';charset=utf-8')
       end
     end
   end
-end; end
+end; end; end

data/lib/sanitize/version.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# encoding: utf-8
 class Sanitize
-  VERSION = '2.1.1'
+  VERSION = '6.0.0'
 end