RubyGems - loofah - Versions diffs - 2.2.3 → 2.21.1 - Mend

loofah 2.2.3 → 2.21.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (46)

checksums.yaml +4 -4
data/CHANGELOG.md +269 -31
data/README.md +109 -124
data/lib/loofah/concerns.rb +207 -0
data/lib/loofah/elements.rb +85 -79
data/lib/loofah/helpers.rb +37 -20
data/lib/loofah/{html → html4}/document.rb +6 -7
data/lib/loofah/html4/document_fragment.rb +15 -0
data/lib/loofah/html5/document.rb +17 -0
data/lib/loofah/html5/document_fragment.rb +15 -0
data/lib/loofah/html5/libxml2_workarounds.rb +10 -8
data/lib/loofah/html5/safelist.rb +1055 -0
data/lib/loofah/html5/scrub.rb +153 -58
data/lib/loofah/metahelpers.rb +11 -6
data/lib/loofah/scrubber.rb +22 -15
data/lib/loofah/scrubbers.rb +66 -55
data/lib/loofah/version.rb +6 -0
data/lib/loofah/xml/document.rb +2 -0
data/lib/loofah/xml/document_fragment.rb +4 -7
data/lib/loofah.rb +131 -38
metadata +28 -216
data/.gemtest +0 -0
data/Gemfile +0 -22
data/Manifest.txt +0 -40
data/Rakefile +0 -79
data/benchmark/benchmark.rb +0 -149
data/benchmark/fragment.html +0 -96
data/benchmark/helper.rb +0 -73
data/benchmark/www.slashdot.com.html +0 -2560
data/lib/loofah/html/document_fragment.rb +0 -40
data/lib/loofah/html5/whitelist.rb +0 -186
data/lib/loofah/instance_methods.rb +0 -127
data/test/assets/msword.html +0 -63
data/test/assets/testdata_sanitizer_tests1.dat +0 -502
data/test/helper.rb +0 -18
data/test/html5/test_sanitizer.rb +0 -382
data/test/integration/test_ad_hoc.rb +0 -204
data/test/integration/test_helpers.rb +0 -43
data/test/integration/test_html.rb +0 -72
data/test/integration/test_scrubbers.rb +0 -400
data/test/integration/test_xml.rb +0 -55
data/test/unit/test_api.rb +0 -142
data/test/unit/test_encoding.rb +0 -20
data/test/unit/test_helpers.rb +0 -62
data/test/unit/test_scrubber.rb +0 -229
data/test/unit/test_scrubbers.rb +0 -14

data/lib/loofah/html5/scrub.rb CHANGED

@@ -1,104 +1,160 @@
-require 'cgi'
-require 'crass'
+# frozen_string_literal: true
+require "cgi"
+require "crass"
 module Loofah
   module HTML5 # :nodoc:
     module Scrub
       CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/
-      CSS_KEYWORDISH = /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
-      CRASS_SEMICOLON = {:node => :semicolon, :raw => ";"}
+      CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(ch|cm|r?em|ex|in|lh|mm|pc|pt|px|Q|vmax|vmin|vw|vh|%|,|\))?)\z/ # rubocop:disable Layout/LineLength
+      CRASS_SEMICOLON = { node: :semicolon, raw: ";" }
+      CSS_IMPORTANT = "!important"
+      CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES = /\A(["'])?[^"']+\1\z/
+      DATA_ATTRIBUTE_NAME = /\Adata-[\w-]+\z/
       class << self
-        def allowed_element? element_name
-          ::Loofah::HTML5::WhiteList::ALLOWED_ELEMENTS_WITH_LIBXML2.include? element_name
+        def allowed_element?(element_name)
+          ::Loofah::HTML5::SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2.include?(element_name)
         end
         #  alternative implementation of the html5lib attribute scrubbing algorithm
-        def scrub_attributes node
+        def scrub_attributes(node)
           node.attribute_nodes.each do |attr_node|
             attr_name = if attr_node.namespace
-                          "#{attr_node.namespace.prefix}:#{attr_node.node_name}"
-                        else
-                          attr_node.node_name
-                        end
+              "#{attr_node.namespace.prefix}:#{attr_node.node_name}"
+            else
+              attr_node.node_name
+            end
-            if attr_name =~ /\Adata-[\w-]+\z/
+            if DATA_ATTRIBUTE_NAME.match?(attr_name)
               next
             end
-            unless WhiteList::ALLOWED_ATTRIBUTES.include?(attr_name)
+            unless SafeList::ALLOWED_ATTRIBUTES.include?(attr_name)
               attr_node.remove
               next
             end
-            if WhiteList::ATTR_VAL_IS_URI.include?(attr_name)
-              # this block lifted nearly verbatim from HTML5 sanitization
-              val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS,'').downcase
-              if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && ! WhiteList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[0])
-                attr_node.remove
-                next
-              elsif val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[0] == 'data'
-                # permit only allowed data mediatypes
-                mediatype = val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[1]
-                mediatype, _ = mediatype.split(';')[0..1] if mediatype
-                if mediatype && !WhiteList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
-                  attr_node.remove
-                  next
-                end
-              end
-            end
-            if WhiteList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
-              attr_node.value = attr_node.value.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, ' ') if attr_node.value
+            if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
+              next if scrub_uri_attribute(attr_node)
             end
-            if WhiteList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == 'xlink:href' && attr_node.value =~ /^\s*[^#\s].*/m
-              attr_node.remove
-              next
+            if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
+              scrub_attribute_that_allows_local_ref(attr_node)
             end
+            next unless SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) &&
+              attr_name == "xlink:href" &&
+              attr_node.value =~ /^\s*[^#\s].*/m
+            attr_node.remove
+            next
           end
-          scrub_css_attribute node
+          scrub_css_attribute(node)
           node.attribute_nodes.each do |attr_node|
-            node.remove_attribute(attr_node.name) if attr_node.value !~ /[^[:space:]]/
+            if attr_node.value !~ /[^[:space:]]/ && attr_node.name !~ DATA_ATTRIBUTE_NAME
+              node.remove_attribute(attr_node.name)
+            end
           end
-          force_correct_attribute_escaping! node
+          force_correct_attribute_escaping!(node)
         end
-        def scrub_css_attribute node
-          style = node.attributes['style']
+        def scrub_css_attribute(node)
+          style = node.attributes["style"]
           style.value = scrub_css(style.value) if style
         end
-        def scrub_css style
-          style_tree = Crass.parse_properties style
+        def scrub_css(style)
+          url_flags = [:url, :bad_url]
+          style_tree = Crass.parse_properties(style)
           sanitized_tree = []
           style_tree.each do |node|
             next unless node[:node] == :property
             next if node[:children].any? do |child|
-              [:url, :bad_url].include?(child[:node]) || (child[:node] == :function && !WhiteList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase))
+              url_flags.include?(child[:node])
             end
             name = node[:name].downcase
-            if WhiteList::ALLOWED_CSS_PROPERTIES.include?(name) || WhiteList::ALLOWED_SVG_PROPERTIES.include?(name)
-              sanitized_tree << node << CRASS_SEMICOLON
-            elsif WhiteList::SHORTHAND_CSS_PROPERTIES.include?(name.split('-').first)
-              value = node[:value].split.map do |keyword|
-                if WhiteList::ALLOWED_CSS_KEYWORDS.include?(keyword) || keyword =~ CSS_KEYWORDISH
+            next unless SafeList::ALLOWED_CSS_PROPERTIES.include?(name) ||
+              SafeList::ALLOWED_SVG_PROPERTIES.include?(name) ||
+              SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first)
+            value = node[:children].map do |child|
+              case child[:node]
+              when :whitespace
+                nil
+              when :string
+                if CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES.match?(child[:raw])
+                  Crass::Parser.stringify(child)
+                end
+              when :function
+                if SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase)
+                  Crass::Parser.stringify(child)
+                end
+              when :ident
+                keyword = child[:value]
+                if !SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first) ||
+                    SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) ||
+                    (keyword =~ CSS_KEYWORDISH)
                   keyword
                 end
-              end.compact
-              unless value.empty?
-                propstring = sprintf "%s:%s", name, value.join(" ")
-                sanitized_node = Crass.parse_properties(propstring).first
-                sanitized_tree << sanitized_node << CRASS_SEMICOLON
+              else
+                child[:raw]
               end
-            end
+            end.compact
+            next if value.empty?
+            value << CSS_IMPORTANT if node[:important]
+            propstring = format("%s:%s", name, value.join(" "))
+            sanitized_node = Crass.parse_properties(propstring).first
+            sanitized_tree << sanitized_node << CRASS_SEMICOLON
           end
-          Crass::Parser.stringify sanitized_tree
+          Crass::Parser.stringify(sanitized_tree)
+        end
+        def scrub_attribute_that_allows_local_ref(attr_node)
+          return unless attr_node.value
+          nodes = Crass::Parser.new(attr_node.value).parse_component_values
+          values = nodes.map do |node|
+            case node[:node]
+            when :url
+              if node[:value].start_with?("#")
+                node[:raw]
+              end
+            when :hash, :ident, :string
+              node[:raw]
+            end
+          end.compact
+          attr_node.value = values.join(" ")
+        end
+        def scrub_uri_attribute(attr_node)
+          # this block lifted nearly verbatim from HTML5 sanitization
+          val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
+          if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ &&
+              !SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
+            attr_node.remove
+            return true
+          elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
+            # permit only allowed data mediatypes
+            mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
+            mediatype, _ = mediatype.split(";")[0..1] if mediatype
+            if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
+              attr_node.remove
+              return true
+            end
+          end
+          false
         end
         #
@@ -106,7 +162,7 @@ module Loofah
         #
         #  see comments about CVE-2018-8048 within the tests for more information
         #
-        def force_correct_attribute_escaping! node
+        def force_correct_attribute_escaping!(node)
           return unless Nokogiri::VersionInfo.instance.libxml2?
           node.attribute_nodes.each do |attr_node|
@@ -122,11 +178,50 @@ module Loofah
             #
             encoding = attr_node.value.encoding
             attr_node.value = attr_node.value.gsub(/[ "]/) do |m|
-              '%' + m.unpack('H2' * m.bytesize).join('%').upcase
+              "%" + m.unpack("H2" * m.bytesize).join("%").upcase
             end.force_encoding(encoding)
           end
         end
+        def cdata_needs_escaping?(node)
+          # Nokogiri's HTML4 parser on JRuby doesn't flag the child of a `style` tag as cdata, but it acts that way
+          node.cdata? || (Nokogiri.jruby? && node.text? && node.parent.name == "style")
+        end
+        def cdata_escape(node)
+          escaped_text = escape_tags(node.text)
+          if Nokogiri.jruby?
+            node.document.create_text_node(escaped_text)
+          else
+            node.document.create_cdata(escaped_text)
+          end
+        end
+        TABLE_FOR_ESCAPE_HTML__ = {
+          "<" => "&lt;",
+          ">" => "&gt;",
+          "&" => "&amp;",
+        }
+        def escape_tags(string)
+          # modified version of CGI.escapeHTML from ruby 3.1
+          enc = string.encoding
+          if enc.ascii_compatible?
+            string = string.b
+            string.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__)
+            string.force_encoding(enc)
+          else
+            if enc.dummy?
+              origenc = enc
+              enc = Encoding::Converter.asciicompat_encoding(enc)
+              string = enc ? string.encode(enc) : string.b
+            end
+            table = Hash[TABLE_FOR_ESCAPE_HTML__.map { |pair| pair.map { |s| s.encode(enc) } }]
+            string = string.gsub(/#{"[<>&]".encode(enc)}/, table)
+            string.encode!(origenc) if origenc
+            string
+          end
+        end
       end
     end
   end

data/lib/loofah/metahelpers.rb CHANGED

@@ -1,11 +1,16 @@
+# frozen_string_literal: true
 module Loofah
   module MetaHelpers # :nodoc:
-    def self.add_downcased_set_members_to_all_set_constants mojule
-      mojule.constants.each do |constant_sym|
-        constant = mojule.const_get constant_sym
-        next unless Set === constant
-        constant.dup.each do |member|
-          constant.add member.downcase
+    class << self
+      def add_downcased_set_members_to_all_set_constants(mojule)
+        mojule.constants.each do |constant_sym|
+          constant = mojule.const_get(constant_sym)
+          next unless Set === constant
+          constant.dup.each do |member|
+            constant.add(member.downcase)
+          end
         end
       end
     end

data/lib/loofah/scrubber.rb CHANGED

@@ -1,8 +1,10 @@
+# frozen_string_literal: true
 module Loofah
   #
   #  A RuntimeError raised when Loofah could not find an appropriate scrubber.
   #
-  class ScrubberNotFound < RuntimeError ; end
+  class ScrubberNotFound < RuntimeError; end
   #
   #  A Scrubber wraps up a block (or method) that is run on an HTML node (element):
@@ -23,7 +25,7 @@ module Loofah
   #
   #  This can then be run on a document:
   #
-  #    Loofah.fragment("<span>foo</span><p>bar</p>").scrub!(span2div).to_s
+  #    Loofah.html5_fragment("<span>foo</span><p>bar</p>").scrub!(span2div).to_s
   #    # => "<div>foo</div><p>bar</p>"
   #
   #  Scrubbers can be run on a document in either a top-down traversal (the
@@ -31,12 +33,11 @@ module Loofah
   #  Scrubber::STOP to terminate the traversal of a subtree.
   #
   class Scrubber
     # Top-down Scrubbers may return CONTINUE to indicate that the subtree should be traversed.
     CONTINUE = Object.new.freeze
     # Top-down Scrubbers may return STOP to indicate that the subtree should not be traversed.
-    STOP     = Object.new.freeze
+    STOP = Object.new.freeze
     # When a scrubber is initialized, the :direction may be specified
     # as :top_down (the default) or :bottom_up.
@@ -64,9 +65,11 @@ module Loofah
     def initialize(options = {}, &block)
       direction = options[:direction] || :top_down
       unless [:top_down, :bottom_up].include?(direction)
-        raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
+        raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
       end
-      @direction, @block = direction, block
+      @direction = direction
+      @block = block
     end
     #
@@ -83,7 +86,7 @@ module Loofah
     #  +scrub+, which will be called for each document node.
     #
     def scrub(node)
-      raise ScrubberNotFound, "No scrub method has been defined on #{self.class.to_s}"
+      raise ScrubberNotFound, "No scrub method has been defined on #{self.class}"
     end
     #
@@ -91,10 +94,10 @@ module Loofah
     # If the attribute is set, don't overwrite the existing value
     #
     def append_attribute(node, attribute, value)
-      current_value = node.get_attribute(attribute) || ''
+      current_value = node.get_attribute(attribute) || ""
       current_values = current_value.split(/\s+/)
       updated_value = current_values | [value]
-      node.set_attribute(attribute, updated_value.join(' '))
+      node.set_attribute(attribute, updated_value.join(" "))
     end
     private
@@ -102,11 +105,15 @@ module Loofah
     def html5lib_sanitize(node)
       case node.type
       when Nokogiri::XML::Node::ELEMENT_NODE
-        if HTML5::Scrub.allowed_element? node.name
-          HTML5::Scrub.scrub_attributes node
+        if HTML5::Scrub.allowed_element?(node.name)
+          HTML5::Scrub.scrub_attributes(node)
           return Scrubber::CONTINUE
         end
       when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
+        if HTML5::Scrub.cdata_needs_escaping?(node)
+          node.before(HTML5::Scrub.cdata_escape(node))
+          return Scrubber::STOP
+        end
         return Scrubber::CONTINUE
       end
       Scrubber::STOP
@@ -115,14 +122,14 @@ module Loofah
     def traverse_conditionally_top_down(node)
       if block
         return if block.call(node) == STOP
-      else
-        return if scrub(node) == STOP
+      elsif scrub(node) == STOP
+        return
       end
-      node.children.each {|j| traverse_conditionally_top_down(j)}
+      node.children.each { |j| traverse_conditionally_top_down(j) }
     end
     def traverse_conditionally_bottom_up(node)
-      node.children.each {|j| traverse_conditionally_bottom_up(j)}
+      node.children.each { |j| traverse_conditionally_bottom_up(j) }
       if block
         block.call(node)
       else