RubyGems - sanitize - Versions diffs - 2.1.1 → 3.0.0 - Mend

sanitize 2.1.1 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of sanitize might be problematic. Click here for more details.

Files changed (30)

checksums.yaml +5 -5
data/HISTORY.md +93 -14
data/README.md +346 -134
data/lib/sanitize.rb +177 -132
data/lib/sanitize/config.rb +53 -79
data/lib/sanitize/config/basic.rb +12 -32
data/lib/sanitize/config/default.rb +103 -0
data/lib/sanitize/config/relaxed.rb +517 -52
data/lib/sanitize/config/restricted.rb +3 -23
data/lib/sanitize/css.rb +218 -0
data/lib/sanitize/transformers/clean_cdata.rb +3 -3
data/lib/sanitize/transformers/clean_comment.rb +6 -3
data/lib/sanitize/transformers/clean_css.rb +57 -0
data/lib/sanitize/transformers/clean_doctype.rb +13 -0
data/lib/sanitize/transformers/clean_element.rb +99 -129
data/lib/sanitize/version.rb +3 -1
data/test/common.rb +34 -0
data/test/test_clean_comment.rb +51 -0
data/test/test_clean_css.rb +66 -0
data/test/test_clean_doctype.rb +71 -0
data/test/test_clean_element.rb +399 -0
data/test/test_config.rb +65 -0
data/test/test_malicious_css.rb +42 -0
data/test/test_malicious_html.rb +128 -0
data/test/test_parser.rb +104 -0
data/test/test_sanitize.rb +65 -693
data/test/test_sanitize_css.rb +222 -0
data/test/test_transformers.rb +144 -0
data/test/test_unicode.rb +84 -0
metadata +56 -8

data/lib/sanitize.rb CHANGED

@@ -1,55 +1,24 @@
 # encoding: utf-8
-#--
-# Copyright (c) 2013 Ryan Grove <ryan@wonko.com>
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the 'Software'), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#++
+require 'nokogumbo'
 require 'set'
-require 'nokogiri'
-require 'sanitize/version'
-require 'sanitize/config'
-require 'sanitize/config/restricted'
-require 'sanitize/config/basic'
-require 'sanitize/config/relaxed'
-require 'sanitize/transformers/clean_cdata'
-require 'sanitize/transformers/clean_comment'
-require 'sanitize/transformers/clean_element'
+require_relative 'sanitize/version'
+require_relative 'sanitize/config'
+require_relative 'sanitize/config/default'
+require_relative 'sanitize/config/restricted'
+require_relative 'sanitize/config/basic'
+require_relative 'sanitize/config/relaxed'
+require_relative 'sanitize/css'
+require_relative 'sanitize/transformers/clean_cdata'
+require_relative 'sanitize/transformers/clean_comment'
+require_relative 'sanitize/transformers/clean_css'
+require_relative 'sanitize/transformers/clean_doctype'
+require_relative 'sanitize/transformers/clean_element'
 class Sanitize
   attr_reader :config
-  # Matches a valid HTML5 data attribute name. The unicode ranges included here
-  # are a conservative subset of the full range of characters that are
-  # technically allowed, with the intent of matching the most common characters
-  # used in data attribute names while excluding uncommon or potentially
-  # misleading characters, or characters with the potential to be normalized
-  # into unsafe or confusing forms.
-  #
-  # If you need data attr names with characters that aren't included here (such
-  # as combining marks, full-width characters, or CJK), please consider creating
-  # a custom transformer to validate attributes according to your needs.
-  #
-  # http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#embedding-custom-non-visible-data-with-the-data-*-attributes
-  REGEX_DATA_ATTR = /\Adata-(?!xml)[a-z_][\w.\u00E0-\u00F6\u00F8-\u017F\u01DD-\u02AF-]*\z/u
   # Matches an attribute value that could be treated by a browser as a URL
   # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
   # or more characters followed by a colon is considered a match, even if the
@@ -57,38 +26,47 @@ class Sanitize
   # IE6 and Opera will still parse).
   REGEX_PROTOCOL = /\A([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i
+  # Matches Unicode characters that should be stripped from HTML before passing
+  # it to the parser.
+  #
+  # http://www.w3.org/TR/unicode-xml/#Charlist
+  REGEX_UNSUITABLE_CHARS = /[\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u
   #--
   # Class Methods
   #++
-  # Returns a sanitized copy of _html_, using the settings in _config_ if
-  # specified.
-  def self.clean(html, config = {})
-    Sanitize.new(config).clean(html)
+  # Returns a sanitized copy of the given full _html_ document, using the
+  # settings in _config_ if specified.
+  #
+  # When sanitizing a document, the `<html>` element must be whitelisted or an
+  # error will be raised. If this is undesirable, you should probably use
+  # {#fragment} instead.
+  def self.document(html, config = {})
+    Sanitize.new(config).document(html)
   end
-  # Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes
-  # were made.
-  def self.clean!(html, config = {})
-    Sanitize.new(config).clean!(html)
+  # Returns a sanitized copy of the given _html_ fragment, using the settings in
+  # _config_ if specified.
+  def self.fragment(html, config = {})
+    Sanitize.new(config).fragment(html)
   end
-  # Performs a Sanitize#clean using a full-document HTML parser instead of
-  # the default fragment parser. This will add a DOCTYPE and html tag
-  # unless they are already present
-  def self.clean_document(html, config = {})
-    Sanitize.new(config).clean_document(html)
+  # Sanitizes the given `Nokogiri::XML::Node` instance and all its children.
+  def self.node!(node, config = {})
+    Sanitize.new(config).node!(node)
   end
-  # Performs Sanitize#clean_document in place, returning _html_, or +nil+ if no
-  # changes were made.
-  def self.clean_document!(html, config = {})
-    Sanitize.new(config).clean_document!(html)
-  end
+  # Aliases for pre-3.0.0 backcompat.
+  class << Sanitize
+    # @deprecated Use {.document} instead.
+    alias_method :clean_document, :document
+    # @deprecated Use {.fragment} instead.
+    alias_method :clean, :fragment
-  # Sanitizes the specified Nokogiri::XML::Node and all its children.
-  def self.clean_node!(node, config = {})
-    Sanitize.new(config).clean_node!(node)
+    # @deprecated Use {.node!} instead.
+    alias_method :clean_node!, :node!
   end
   #--
@@ -97,97 +75,156 @@ class Sanitize
   # Returns a new Sanitize object initialized with the settings in _config_.
   def initialize(config = {})
-    @config = Config::DEFAULT.merge(config)
+    @config = Config.merge(Config::DEFAULT, config)
+    @transformers = Array(@config[:transformers].dup)
-    @transformers = {
-      :breadth => Array(@config[:transformers_breadth].dup),
-      :depth   => Array(@config[:transformers]) + Array(@config[:transformers_depth])
-    }
+    # Default transformers always run at the end of the chain, after any custom
+    # transformers.
+    @transformers << Transformers::CleanComment unless @config[:allow_comments]
+    @transformers << Transformers::CleanDoctype unless @config[:allow_doctype]
-    # Default depth transformers. These always run at the end of the chain,
-    # after any custom transformers.
-    @transformers[:depth] << Transformers::CleanComment unless @config[:allow_comments]
+    if @config[:elements].include?('style')
+      scss = Sanitize::CSS.new(config)
+      @transformers << Transformers::CSS::CleanElement.new(scss)
+    end
+    if @config[:attributes].values.any? {|attr| attr.include?('style') }
+      scss ||= Sanitize::CSS.new(config)
+      @transformers << Transformers::CSS::CleanAttribute.new(scss)
+    end
-    @transformers[:depth] <<
+    @transformers <<
         Transformers::CleanCDATA <<
         Transformers::CleanElement.new(@config)
   end
-  # Returns a sanitized copy of the given _html_ fragment.
-  def clean(html)
-    if html
-      dupe = html.dup
-      clean!(dupe) || dupe
-    end
+  # Returns a sanitized copy of the given _html_ document.
+  #
+  # When sanitizing a document, the `<html>` element must be whitelisted or an
+  # error will be raised. If this is undesirable, you should probably use
+  # {#fragment} instead.
+  def document(html)
+    return '' unless html
+    doc = Nokogiri::HTML5.parse(preprocess(html))
+    node!(doc)
+    to_html(doc)
   end
-  # Performs clean in place, returning _html_, or +nil+ if no changes were
-  # made.
-  def clean!(html, parser = Nokogiri::HTML::DocumentFragment)
-    fragment = parser.parse(html)
-    clean_node!(fragment)
+  # @deprecated Use {#document} instead.
+  alias_method :clean_document, :document
-    output_method_params = {:encoding => @config[:output_encoding], :indent => 0}
+  # Returns a sanitized copy of the given _html_ fragment.
+  def fragment(html)
+    return '' unless html
+    html = preprocess(html)
+    doc  = Nokogiri::HTML5.parse("<html><body>#{html}")
-    if @config[:output] == :xhtml
-      output_method = fragment.method(:to_xhtml)
-      output_method_params[:save_with] = Nokogiri::XML::Node::SaveOptions::AS_XHTML
-    elsif @config[:output] == :html
-      output_method = fragment.method(:to_html)
+    # Hack to allow fragments containing <body>. Borrowed from
+    # Nokogiri::HTML::DocumentFragment.
+    if html =~ /\A<body(?:\s|>)/i
+      path = '/html/body'
     else
-      raise Error, "unsupported output format: #{@config[:output]}"
+      path = '/html/body/node()'
     end
-    result = output_method.call(output_method_params)
+    frag = doc.fragment
+    doc.xpath(path).each {|node| frag << node }
-    return result == html ? nil : html[0, html.length] = result
+    node!(frag)
+    to_html(frag)
   end
-  # Returns a sanitized copy of the given full _html_ document.
-  def clean_document(html)
-    unless html.nil?
-      clean_document!(html.dup) || html
+  # @deprecated Use {#fragment} instead.
+  alias_method :clean, :fragment
+  # Sanitizes the given `Nokogiri::XML::Node` and all its children, modifying it
+  # in place.
+  #
+  # If _node_ is a `Nokogiri::XML::Document`, the `<html>` element must be
+  # whitelisted or an error will be raised.
+  def node!(node)
+    raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
+    if node.is_a?(Nokogiri::XML::Document)
+      unless @config[:elements].include?('html')
+        raise Error, 'When sanitizing a document, "<html>" must be whitelisted.'
+      end
     end
-  end
-  # Performs clean_document in place, returning _html_, or +nil+ if no changes
-  # were made.
-  def clean_document!(html)
-    if !@config[:elements].include?('html') && !@config[:remove_contents]
-      raise 'You must have the HTML element whitelisted to call #clean_document unless remove_contents is set to true'
-      # otherwise Nokogiri will raise for having multiple root nodes when
-      # it moves its children to the root document context
+    node_whitelist = Set.new
+    traverse(node) do |n|
+      transform_node!(n, node_whitelist)
     end
-    clean!(html, Nokogiri::HTML::Document)
+    node
   end
-  # Sanitizes the specified Nokogiri::XML::Node and all its children.
-  def clean_node!(node)
-    raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
+  # @deprecated Use {#node!} instead.
+  alias_method :clean_node!, :node!
-    node_whitelist = Set.new
+  private
-    unless @transformers[:breadth].empty?
-      traverse_breadth(node) {|n| transform_node!(n, node_whitelist, :breadth) }
+  # Preprocesses HTML before parsing to remove undesirable Unicode chars.
+  def preprocess(html)
+    html.to_s.dup
+    unless html.encoding.name == 'UTF-8'
+      html.encode!('UTF-8',
+        :invalid => :replace,
+        :undef   => :replace)
     end
-    traverse_depth(node) {|n| transform_node!(n, node_whitelist, :depth) }
-    node
+    html.gsub!(REGEX_UNSUITABLE_CHARS, '')
+    html
   end
-  private
+  def to_html(node)
+    replace_meta = false
-  def transform_node!(node, node_whitelist, mode)
-    @transformers[mode].each do |transformer|
-      result = transformer.call({
+    # Hacky workaround for a libxml2 bug that adds an undesired Content-Type
+    # meta tag to all serialized HTML documents.
+    #
+    # https://github.com/sparklemotion/nokogiri/issues/1008
+    if node.type == Nokogiri::XML::Node::DOCUMENT_NODE ||
+        node.type == Nokogiri::XML::Node::HTML_DOCUMENT_NODE
+      regex_meta   = %r|(<html[^>]*>\s*<head[^>]*>\s*)<meta http-equiv="Content-Type" content="text/html; charset=utf-8">|i
+      # Only replace the content-type meta tag if <meta> isn't whitelisted or
+      # the original document didn't actually include a content-type meta tag.
+      replace_meta = !@config[:elements].include?('meta') ||
+        node.xpath('/html/head/meta[@http-equiv]').none? do |meta|
+          meta['http-equiv'].downcase == 'content-type'
+        end
+    end
+    so = Nokogiri::XML::Node::SaveOptions
+    # Serialize to HTML without any formatting to prevent Nokogiri from adding
+    # newlines after certain tags.
+    html = node.to_html(
+      :encoding  => 'utf-8',
+      :indent    => 0,
+      :save_with => so::NO_DECLARATION | so::NO_EMPTY_TAGS | so::AS_HTML
+    )
+    html.gsub!(regex_meta, '\1') if replace_meta
+    html
+  end
+  def transform_node!(node, node_whitelist)
+    @transformers.each do |transformer|
+      result = transformer.call(
         :config         => @config,
         :is_whitelisted => node_whitelist.include?(node),
         :node           => node,
         :node_name      => node.name.downcase,
-        :node_whitelist => node_whitelist,
-        :traversal_mode => mode
-      })
+        :node_whitelist => node_whitelist
+      )
       if result.is_a?(Hash) && result[:node_whitelist].respond_to?(:each)
         node_whitelist.merge(result[:node_whitelist])
@@ -197,18 +234,26 @@ class Sanitize
     node
   end
-  # Performs breadth-first traversal, operating first on the root node, then
-  # traversing downwards.
-  def traverse_breadth(node, &block)
+  # Performs top-down traversal of the given node, operating first on the node
+  # itself, then traversing each child (if any) in order.
+  def traverse(node, &block)
     block.call(node)
-    node.children.each {|child| traverse_breadth(child, &block) }
-  end
-  # Performs depth-first traversal, operating first on the deepest nodes in the
-  # document, then traversing upwards to the root.
-  def traverse_depth(node, &block)
-    node.children.each {|child| traverse_depth(child, &block) }
-    block.call(node)
+    child = node.child
+    while child do
+      prev = child.previous_sibling
+      traverse(child, &block)
+      if child.parent != node
+        # The child was unlinked or reparented, so traverse the previous node's
+        # next sibling, or the parent's first child if there is no previous
+        # node.
+        child = prev ? prev.next_sibling : node.child
+      else
+        child = child.next_sibling
+      end
+    end
   end
   class Error < StandardError; end

data/lib/sanitize/config.rb CHANGED

@@ -1,86 +1,60 @@
-#--
-# Copyright (c) 2013 Ryan Grove <ryan@wonko.com>
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the 'Software'), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#++
+# encoding: utf-8
+require 'set'
 class Sanitize
   module Config
-    DEFAULT = {
-      # Whether or not to allow HTML comments. Allowing comments is strongly
-      # discouraged, since IE allows script execution within conditional
-      # comments.
-      :allow_comments => false,
-      # HTML attributes to add to specific elements. By default, no attributes
-      # are added.
-      :add_attributes => {},
-      # HTML attributes to allow in specific elements. By default, no attributes
-      # are allowed. Use the symbol :data to indicate that arbitrary HTML5
-      # data-* attributes should be allowed.
-      :attributes => {},
-      # HTML elements to allow. By default, no elements are allowed (which means
-      # that all HTML will be stripped).
-      :elements => [],
-      # Output format. Supported formats are :html and :xhtml. Default is :html.
-      :output => :html,
-      # Character encoding to use for HTML output. Default is 'utf-8'.
-      :output_encoding => 'utf-8',
-      # URL handling protocols to allow in specific attributes. By default, no
-      # protocols are allowed. Use :relative in place of a protocol if you want
-      # to allow relative URLs sans protocol.
-      :protocols => {},
-      # If this is true, Sanitize will remove the contents of any filtered
-      # elements in addition to the elements themselves. By default, Sanitize
-      # leaves the safe parts of an element's contents behind when the element
-      # is removed.
-      #
-      # If this is an Array of element names, then only the contents of the
-      # specified elements (when filtered) will be removed, and the contents of
-      # all other filtered elements will be left behind.
-      :remove_contents => false,
-      # Transformers allow you to filter or alter nodes using custom logic. See
-      # README.rdoc for details and examples.
-      :transformers => [],
-      # By default, transformers perform depth-first traversal (deepest node
-      # upward). This setting allows you to specify transformers that should
-      # perform breadth-first traversal (top node downward).
-      :transformers_breadth => [],
-      # Elements which, when removed, should have their contents surrounded by
-      # space characters to preserve readability. For example,
-      # `foo<div>bar</div>baz` will become 'foo bar baz' when the <div> is
-      # removed.
-      :whitespace_elements => %w[
-        address article aside blockquote br dd div dl dt footer h1 h2 h3 h4 h5
-        h6 header hgroup hr li nav ol p pre section ul
-      ]
+    # Deeply freezes and returns the given configuration Hash.
+    def self.freeze_config(config)
+      if Hash === config
+        config.each_value {|c| freeze_config(c) }
+      elsif Array === config || Set === config
+        config.each {|c| freeze_config(c) }
+      end
+      config.freeze
+    end
+    # Returns a new Hash containing the result of deeply merging *other_config*
+    # into *config*. Does not modify *config* or *other_config*.
+    #
+    # This is the safest way to use a built-in Sanitize config as the basis for
+    # your own custom config.
+    def self.merge(config, other_config = {})
+      raise ArgumentError, 'config must be a Hash' unless Hash === config
+      raise ArgumentError, 'other_config must be a Hash' unless Hash === other_config
+      merged = {}
+      keys   = Set.new(config.keys + other_config.keys)
+      keys.each do |key|
+        oldval = config[key]
+        if other_config.has_key?(key)
+          newval = other_config[key]
+          if Hash === oldval && Hash === newval
+            merged[key] = oldval.empty? ? newval.dup : merge(oldval, newval)
+          elsif Array === newval && key != :transformers
+            merged[key] = Set.new(newval)
+          else
+            merged[key] = can_dupe?(newval) ? newval.dup : newval
+          end
+        else
+          merged[key] = can_dupe?(oldval) ? oldval.dup : oldval
+        end
+      end
+      merged
+    end
+    # Returns `true` if `dup` may be safely called on _value_, `false`
+    # otherwise.
+    def self.can_dupe?(value)
+      !(true == value || false == value || value.nil? || Numeric === value || Symbol === value)
+    end
+    private_class_method :can_dupe?
-    }
   end
 end