RubyGems - sanitize - Versions diffs - 5.0.0 → 5.2.3 - Mend

sanitize 5.0.0 → 5.2.3

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.

This version of sanitize might be problematic. Click here for more details.

Files changed (21)

checksums.yaml +4 -4
data/HISTORY.md +111 -18
data/README.md +60 -40
data/lib/sanitize.rb +37 -19
data/lib/sanitize/config/default.rb +5 -1
data/lib/sanitize/config/relaxed.rb +1 -1
data/lib/sanitize/css.rb +2 -2
data/lib/sanitize/transformers/clean_comment.rb +1 -1
data/lib/sanitize/transformers/clean_css.rb +3 -3
data/lib/sanitize/transformers/clean_doctype.rb +1 -1
data/lib/sanitize/transformers/clean_element.rb +17 -20
data/lib/sanitize/version.rb +1 -1
data/test/common.rb +0 -31
data/test/test_clean_element.rb +40 -14
data/test/test_malicious_html.rb +40 -6
data/test/test_parser.rb +1 -1
data/test/test_sanitize.rb +99 -14
data/test/test_sanitize_css.rb +43 -16
data/test/test_transformers.rb +25 -19
metadata +10 -12
data/test/test_unicode.rb +0 -95

data/lib/sanitize.rb CHANGED

@@ -19,6 +19,20 @@ require_relative 'sanitize/transformers/clean_element'
 class Sanitize
   attr_reader :config
+  # Matches one or more control characters that should be removed from HTML
+  # before parsing, as defined by the HTML living standard.
+  #
+  # -   https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
+  # -   https://infra.spec.whatwg.org/#control
+  REGEX_HTML_CONTROL_CHARACTERS = /[\u0001-\u0008\u000b\u000e-\u001f\u007f-\u009f]+/u
+  # Matches one or more non-characters that should be removed from HTML before
+  # parsing, as defined by the HTML living standard.
+  #
+  # -   https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
+  # -   https://infra.spec.whatwg.org/#noncharacter
+  REGEX_HTML_NON_CHARACTERS = /[\ufdd0-\ufdef\ufffe\uffff\u{1fffe}\u{1ffff}\u{2fffe}\u{2ffff}\u{3fffe}\u{3ffff}\u{4fffe}\u{4ffff}\u{5fffe}\u{5ffff}\u{6fffe}\u{6ffff}\u{7fffe}\u{7ffff}\u{8fffe}\u{8ffff}\u{9fffe}\u{9ffff}\u{afffe}\u{affff}\u{bfffe}\u{bffff}\u{cfffe}\u{cffff}\u{dfffe}\u{dffff}\u{efffe}\u{effff}\u{ffffe}\u{fffff}\u{10fffe}\u{10ffff}]+/u
   # Matches an attribute value that could be treated by a browser as a URL
   # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
   # or more characters followed by a colon is considered a match, even if the
@@ -26,11 +40,12 @@ class Sanitize
   # IE6 and Opera will still parse).
   REGEX_PROTOCOL = /\A\s*([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i
-  # Matches Unicode characters that should be stripped from HTML before passing
-  # it to the parser.
+  # Matches one or more characters that should be stripped from HTML before
+  # parsing. This is a combination of `REGEX_HTML_CONTROL_CHARACTERS` and
+  # `REGEX_HTML_NON_CHARACTERS`.
   #
-  # http://www.w3.org/TR/unicode-xml/#Charlist
-  REGEX_UNSUITABLE_CHARS = /[\u0000\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u
+  # https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
+  REGEX_UNSUITABLE_CHARS = /(?:#{REGEX_HTML_CONTROL_CHARACTERS}|#{REGEX_HTML_NON_CHARACTERS})/u
   #--
   # Class Methods
@@ -39,7 +54,7 @@ class Sanitize
   # Returns a sanitized copy of the given full _html_ document, using the
   # settings in _config_ if specified.
   #
-  # When sanitizing a document, the `<html>` element must be whitelisted or an
+  # When sanitizing a document, the `<html>` element must be allowlisted or an
   # error will be raised. If this is undesirable, you should probably use
   # {#fragment} instead.
   def self.document(html, config = {})
@@ -102,13 +117,13 @@ class Sanitize
   # Returns a sanitized copy of the given _html_ document.
   #
-  # When sanitizing a document, the `<html>` element must be whitelisted or an
+  # When sanitizing a document, the `<html>` element must be allowlisted or an
   # error will be raised. If this is undesirable, you should probably use
   # {#fragment} instead.
   def document(html)
     return '' unless html
-    doc = Nokogiri::HTML5.parse(preprocess(html))
+    doc = Nokogiri::HTML5.parse(preprocess(html), **@config[:parser_options])
     node!(doc)
     to_html(doc)
   end
@@ -120,8 +135,7 @@ class Sanitize
   def fragment(html)
     return '' unless html
-    html = preprocess(html)
-    frag  = Nokogiri::HTML5.fragment(html)
+    frag = Nokogiri::HTML5.fragment(preprocess(html), **@config[:parser_options])
     node!(frag)
     to_html(frag)
   end
@@ -133,20 +147,20 @@ class Sanitize
   # in place.
   #
   # If _node_ is a `Nokogiri::XML::Document`, the `<html>` element must be
-  # whitelisted or an error will be raised.
+  # allowlisted or an error will be raised.
   def node!(node)
     raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)
     if node.is_a?(Nokogiri::XML::Document)
       unless @config[:elements].include?('html')
-        raise Error, 'When sanitizing a document, "<html>" must be whitelisted.'
+        raise Error, 'When sanitizing a document, "<html>" must be allowlisted.'
       end
     end
-    node_whitelist = Set.new
+    node_allowlist = Set.new
     traverse(node) do |n|
-      transform_node!(n, node_whitelist)
+      transform_node!(n, node_allowlist)
     end
     node
@@ -175,7 +189,7 @@ class Sanitize
     node.to_html(preserve_newline: true)
   end
-  def transform_node!(node, node_whitelist)
+  def transform_node!(node, node_allowlist)
     @transformers.each do |transformer|
       # Since transform_node! may be called in a tight loop to process thousands
       # of items, we can optimize both memory and CPU performance by:
@@ -185,15 +199,19 @@ class Sanitize
       # does merge! create a new hash, it is also 2.6x slower:
       # https://github.com/JuanitoFatas/fast-ruby#hashmerge-vs-hashmerge-code
       config = @transformer_config
-      config[:is_whitelisted] = node_whitelist.include?(node)
+      config[:is_allowlisted] = config[:is_whitelisted] = node_allowlist.include?(node)
       config[:node] = node
       config[:node_name] = node.name.downcase
-      config[:node_whitelist] = node_whitelist
+      config[:node_allowlist] = config[:node_whitelist] = node_allowlist
+      result = transformer.call(**config)
-      result = transformer.call(config)
+      if result.is_a?(Hash)
+        result_allowlist = result[:node_allowlist] || result[:node_whitelist]
-      if result.is_a?(Hash) && result[:node_whitelist].respond_to?(:each)
-        node_whitelist.merge(result[:node_whitelist])
+        if result_allowlist.respond_to?(:each)
+          node_allowlist.merge(result_allowlist)
+        end
       end
     end

data/lib/sanitize/config/default.rb CHANGED

@@ -56,6 +56,10 @@ class Sanitize
       # that all HTML will be stripped).
       :elements => [],
+      # HTML parsing options to pass to Nokogumbo.
+      # https://github.com/rubys/nokogumbo/tree/v2.0.1#parsing-options
+      :parser_options => {},
       # URL handling protocols to allow in specific attributes. By default, no
       # protocols are allowed. Use :relative in place of a protocol if you want
       # to allow relative URLs sans protocol.
@@ -70,7 +74,7 @@ class Sanitize
       # the specified elements (when filtered) will be removed, and the contents
       # of all other filtered elements will be left behind.
       :remove_contents => %w[
-        iframe noembed noframes noscript script style
+        iframe math noembed noframes noscript plaintext script style svg xmp
       ],
       # Transformers allow you to filter or alter nodes using custom logic. See

data/lib/sanitize/config/relaxed.rb CHANGED

@@ -6,7 +6,7 @@ class Sanitize
       :elements => BASIC[:elements] + %w[
         address article aside bdi bdo body caption col colgroup data del div
         figcaption figure footer h1 h2 h3 h4 h5 h6 head header hgroup hr html
-        img ins main nav rp rt ruby section span style summary sup table tbody
+        img ins main nav rp rt ruby section span style summary table tbody
         td tfoot th thead title tr wbr
       ],

data/lib/sanitize/css.rb CHANGED

@@ -175,7 +175,7 @@ class Sanitize; class CSS
         next prop
       when :semicolon
-        # Only preserve the semicolon if it was preceded by a whitelisted
+        # Only preserve the semicolon if it was preceded by an allowlisted
         # property. Otherwise, omit it in order to prevent redundant semicolons.
         if preceded_by_property
           preceded_by_property = false
@@ -296,7 +296,7 @@ class Sanitize; class CSS
   end
   # Returns `true` if the given node (which may be of type `:url` or
-  # `:function`, since the CSS syntax can produce both) uses a whitelisted
+  # `:function`, since the CSS syntax can produce both) uses an allowlisted
   # protocol.
   def valid_url?(node)
     type = node[:node]

data/lib/sanitize/transformers/clean_comment.rb CHANGED

@@ -6,7 +6,7 @@ class Sanitize; module Transformers
     node = env[:node]
     if node.type == Nokogiri::XML::Node::COMMENT_NODE
-      node.unlink unless env[:is_whitelisted]
+      node.unlink unless env[:is_allowlisted]
     end
   end

data/lib/sanitize/transformers/clean_css.rb CHANGED

@@ -1,6 +1,6 @@
 class Sanitize; module Transformers; module CSS
-# Enforces a CSS whitelist on the contents of `style` attributes.
+# Enforces a CSS allowlist on the contents of `style` attributes.
 class CleanAttribute
   def initialize(sanitizer_or_config)
     if Sanitize::CSS === sanitizer_or_config
@@ -14,7 +14,7 @@ class CleanAttribute
     node = env[:node]
     return unless node.type == Nokogiri::XML::Node::ELEMENT_NODE &&
-        node.key?('style') && !env[:is_whitelisted]
+        node.key?('style') && !env[:is_allowlisted]
     attr = node.attribute('style')
     css  = @scss.properties(attr.value)
@@ -27,7 +27,7 @@ class CleanAttribute
   end
 end
-# Enforces a CSS whitelist on the contents of `<style>` elements.
+# Enforces a CSS allowlist on the contents of `<style>` elements.
 class CleanElement
   def initialize(sanitizer_or_config)
     if Sanitize::CSS === sanitizer_or_config

data/lib/sanitize/transformers/clean_doctype.rb CHANGED

@@ -3,7 +3,7 @@
 class Sanitize; module Transformers
   CleanDoctype = lambda do |env|
-    return if env[:is_whitelisted]
+    return if env[:is_allowlisted]
     node = env[:node]

data/lib/sanitize/transformers/clean_element.rb CHANGED

@@ -76,11 +76,11 @@ class Sanitize; module Transformers; class CleanElement
   def call(env)
     node = env[:node]
-    return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_whitelisted]
+    return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_allowlisted]
     name = env[:node_name]
-    # Delete any element that isn't in the config whitelist, unless the node has
+    # Delete any element that isn't in the config allowlist, unless the node has
     # already been deleted from the document.
     #
     # It's important that we not try to reparent the children of a node that has
@@ -107,34 +107,31 @@ class Sanitize; module Transformers; class CleanElement
       return
     end
-    attr_whitelist = @attributes[name] || @attributes[:all]
+    attr_allowlist = @attributes[name] || @attributes[:all]
-    if attr_whitelist.nil?
-      # Delete all attributes from elements with no whitelisted attributes.
+    if attr_allowlist.nil?
+      # Delete all attributes from elements with no allowlisted attributes.
       node.attribute_nodes.each {|attr| attr.unlink }
     else
-      allow_data_attributes = attr_whitelist.include?(:data)
+      allow_data_attributes = attr_allowlist.include?(:data)
       # Delete any attribute that isn't allowed on this element.
       node.attribute_nodes.each do |attr|
         attr_name = attr.name.downcase
-        unless attr_whitelist.include?(attr_name)
-          # The attribute isn't whitelisted.
+        unless attr_allowlist.include?(attr_name)
+          # The attribute isn't in the allowlist, but may still be allowed if
+          # it's a data attribute.
-          if allow_data_attributes && attr_name.start_with?('data-')
-            # Arbitrary data attributes are allowed. If this is a data
-            # attribute, continue.
-            next if attr_name =~ REGEX_DATA_ATTR
+          unless allow_data_attributes && attr_name.start_with?('data-') && attr_name =~ REGEX_DATA_ATTR
+            # Either the attribute isn't a data attribute or arbitrary data
+            # attributes aren't allowed. Remove the attribute.
+            attr.unlink
+            next
           end
-          # Either the attribute isn't a data attribute or arbitrary data
-          # attributes aren't allowed. Remove the attribute.
-          attr.unlink
-          next
         end
-        # The attribute is whitelisted.
+        # The attribute is allowed.
         # Remove any attributes that use unacceptable protocols.
         if @protocols.include?(name) && @protocols[name].include?(attr_name)
@@ -162,7 +159,7 @@ class Sanitize; module Transformers; class CleanElement
         # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an
         # attempt to preserve server-side includes. This can result in XSS since
         # an unescaped double quote can allow an attacker to inject a
-        # non-whitelisted attribute.
+        # non-allowlisted attribute.
         #
         # Sanitize works around this by implementing its own escaping for
         # affected attributes, some of which can exist on any element and some
@@ -191,7 +188,7 @@ class Sanitize; module Transformers; class CleanElement
     # Element-specific special cases.
     case name
-    # If this is a whitelisted iframe that has children, remove all its
+    # If this is an allowlisted iframe that has children, remove all its
     # children. The HTML standard says iframes shouldn't have content, but when
     # they do, this content is parsed as text and is serialized verbatim without
     # being escaped, which is unsafe because legacy browsers may still render it

data/lib/sanitize/version.rb CHANGED

@@ -1,5 +1,5 @@
 # encoding: utf-8
 class Sanitize
-  VERSION = '5.0.0'
+  VERSION = '5.2.3'
 end

data/test/common.rb CHANGED

@@ -1,34 +1,3 @@
 # encoding: utf-8
-gem 'minitest'
 require 'minitest/autorun'
 require_relative '../lib/sanitize'
-# Helper to stub an instance method. Shamelessly stolen from
-# https://github.com/codeodor/minitest-stub_any_instance/
-class Object
-  def self.stub_instance(name, value, &block)
-    old_method = "__stubbed_method_#{name}__"
-    class_eval do
-      alias_method old_method, name
-      define_method(name) do |*args|
-        if value.respond_to?(:call) then
-          value.call(*args)
-        else
-          value
-        end
-      end
-    end
-    yield
-  ensure
-    class_eval do
-      undef_method name
-      alias_method name, old_method
-      undef_method old_method
-    end
-  end
-end

data/test/test_clean_element.rb CHANGED

@@ -162,7 +162,7 @@ describe 'Sanitize::Transformers::CleanElement' do
   }
   describe 'Default config' do
-    it 'should remove non-whitelisted elements, leaving safe contents behind' do
+    it 'should remove non-allowlisted elements, leaving safe contents behind' do
       Sanitize.fragment('foo <b>bar</b> <strong><a href="#a">baz</a></strong> quux')
         .must_equal 'foo bar baz quux'
@@ -192,21 +192,16 @@ describe 'Sanitize::Transformers::CleanElement' do
         .must_equal ''
     end
-    it 'should escape the content of removed `plaintext` elements' do
-      Sanitize.fragment('<plaintext>hello! <script>alert(0)</script>')
-        .must_equal 'hello! &lt;script&gt;alert(0)&lt;/script&gt;'
-    end
-    it 'should escape the content of removed `xmp` elements' do
-      Sanitize.fragment('<xmp>hello! <script>alert(0)</script></xmp>')
-        .must_equal 'hello! &lt;script&gt;alert(0)&lt;/script&gt;'
-    end
     it 'should not preserve the content of removed `iframe` elements' do
       Sanitize.fragment('<iframe>hello! <script>alert(0)</script></iframe>')
         .must_equal ''
     end
+    it 'should not preserve the content of removed `math` elements' do
+      Sanitize.fragment('<math>hello! <script>alert(0)</script></math>')
+        .must_equal ''
+    end
     it 'should not preserve the content of removed `noembed` elements' do
       Sanitize.fragment('<noembed>hello! <script>alert(0)</script></noembed>')
         .must_equal ''
@@ -222,6 +217,11 @@ describe 'Sanitize::Transformers::CleanElement' do
         .must_equal ''
     end
+    it 'should not preserve the content of removed `plaintext` elements' do
+      Sanitize.fragment('<plaintext>hello! <script>alert(0)</script>')
+        .must_equal ''
+    end
     it 'should not preserve the content of removed `script` elements' do
       Sanitize.fragment('<script>hello! <script>alert(0)</script></script>')
         .must_equal ''
@@ -232,6 +232,16 @@ describe 'Sanitize::Transformers::CleanElement' do
         .must_equal ''
     end
+    it 'should not preserve the content of removed `svg` elements' do
+      Sanitize.fragment('<svg>hello! <script>alert(0)</script></svg>')
+        .must_equal ''
+    end
+    it 'should not preserve the content of removed `xmp` elements' do
+      Sanitize.fragment('<xmp>hello! <script>alert(0)</script></xmp>')
+        .must_equal ''
+    end
     strings.each do |name, data|
       it "should clean #{name} HTML" do
         Sanitize.fragment(data[:html]).must_equal(data[:default])
@@ -315,7 +325,7 @@ describe 'Sanitize::Transformers::CleanElement' do
   end
   describe 'Custom configs' do
-    it 'should allow attributes on all elements if whitelisted under :all' do
+    it 'should allow attributes on all elements if allowlisted under :all' do
       input = '<p class="foo">bar</p>'
       Sanitize.fragment(input).must_equal ' bar '
@@ -336,7 +346,7 @@ describe 'Sanitize::Transformers::CleanElement' do
       }).must_equal input
     end
-    it "should not allow relative URLs when relative URLs aren't whitelisted" do
+    it "should not allow relative URLs when relative URLs aren't allowlisted" do
       input = '<a href="/foo/bar">Link</a>'
       Sanitize.fragment(input,
@@ -400,7 +410,7 @@ describe 'Sanitize::Transformers::CleanElement' do
       ).must_equal 'foo bar  baz hi '
     end
-    it 'should remove the contents of whitelisted iframes' do
+    it 'should remove the contents of allowlisted iframes' do
       Sanitize.fragment('<iframe>hi <script>hello</script></iframe>',
         :elements => ['iframe']
       ).must_equal '<iframe></iframe>'
@@ -481,6 +491,22 @@ describe 'Sanitize::Transformers::CleanElement' do
       }).must_equal "<a>Text</a>"
     end
+    it 'should sanitize protocols in data attributes even if data attributes are generically allowed' do
+      input = '<a data-url="mailto:someone@example.com">Text</a>'
+      Sanitize.fragment(input, {
+        :elements => ['a'],
+        :attributes => {'a' => [:data]},
+        :protocols => {'a' => {'data-url' => ['https']}}
+      }).must_equal "<a>Text</a>"
+      Sanitize.fragment(input, {
+        :elements => ['a'],
+        :attributes => {'a' => [:data]},
+        :protocols => {'a' => {'data-url' => ['mailto']}}
+      }).must_equal input
+    end
     it 'should prevent `<meta>` tags from being used to set a non-UTF-8 charset' do
       Sanitize.document('<html><head><meta charset="utf-8"></head><body>Howdy!</body></html>',
         :elements   => %w[html head meta body],