loofah 1.0.0 → 2.19.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +489 -0
  3. data/MIT-LICENSE.txt +3 -1
  4. data/README.md +364 -0
  5. data/SECURITY.md +18 -0
  6. data/lib/loofah/elements.rb +88 -11
  7. data/lib/loofah/helpers.rb +76 -2
  8. data/lib/loofah/html/document.rb +1 -0
  9. data/lib/loofah/html/document_fragment.rb +9 -2
  10. data/lib/loofah/html5/libxml2_workarounds.rb +27 -0
  11. data/lib/loofah/html5/safelist.rb +1042 -0
  12. data/lib/loofah/html5/scrub.rb +198 -40
  13. data/lib/loofah/instance_methods.rb +16 -10
  14. data/lib/loofah/metahelpers.rb +9 -10
  15. data/lib/loofah/scrubber.rb +22 -6
  16. data/lib/loofah/scrubbers.rb +96 -16
  17. data/lib/loofah/version.rb +5 -0
  18. data/lib/loofah/xml/document.rb +1 -0
  19. data/lib/loofah/xml/document_fragment.rb +5 -2
  20. data/lib/loofah.rb +38 -25
  21. metadata +159 -172
  22. data/CHANGELOG.rdoc +0 -134
  23. data/Gemfile +0 -1
  24. data/Manifest.txt +0 -34
  25. data/README.rdoc +0 -312
  26. data/Rakefile +0 -53
  27. data/benchmark/benchmark.rb +0 -149
  28. data/benchmark/fragment.html +0 -96
  29. data/benchmark/helper.rb +0 -73
  30. data/benchmark/www.slashdot.com.html +0 -2560
  31. data/lib/loofah/html5/whitelist.rb +0 -168
  32. data/test/helper.rb +0 -7
  33. data/test/html5/test_sanitizer.rb +0 -248
  34. data/test/integration/test_ad_hoc.rb +0 -176
  35. data/test/integration/test_helpers.rb +0 -33
  36. data/test/integration/test_html.rb +0 -51
  37. data/test/integration/test_scrubbers.rb +0 -331
  38. data/test/integration/test_xml.rb +0 -55
  39. data/test/unit/test_api.rb +0 -138
  40. data/test/unit/test_helpers.rb +0 -27
  41. data/test/unit/test_scrubber.rb +0 -229
  42. data/test/unit/test_scrubbers.rb +0 -14
data/lib/loofah/html5/scrub.rb
@@ -1,70 +1,228 @@
- require 'cgi'
+ # frozen_string_literal: true
+ require "cgi"
+ require "crass"

  module Loofah
    module HTML5 # :nodoc:
      module Scrub
+       CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/
+       CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(ch|cm|r?em|ex|in|lh|mm|pc|pt|px|Q|vmax|vmin|vw|vh|%|,|\))?)\z/
+       CRASS_SEMICOLON = { node: :semicolon, raw: ";" }
+       CSS_IMPORTANT = '!important'
+       CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES = /\A(["'])?[^"']+\1\z/
+       DATA_ATTRIBUTE_NAME = /\Adata-[\w-]+\z/

        class << self
+         def allowed_element?(element_name)
+           ::Loofah::HTML5::SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2.include?(element_name)
+         end

          # alternative implementation of the html5lib attribute scrubbing algorithm
          def scrub_attributes(node)
            node.attribute_nodes.each do |attr_node|
              attr_name = if attr_node.namespace
-                           "#{attr_node.namespace.prefix}:#{attr_node.node_name}"
-                         else
-                           attr_node.node_name
-                         end
-             attr_node.remove unless HashedWhiteList::ALLOWED_ATTRIBUTES[attr_name]
-             if HashedWhiteList::ATTR_VAL_IS_URI[attr_name]
-               # this block lifted nearly verbatim from HTML5 sanitization
-               val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/,'').downcase
-               if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ and HashedWhiteList::ALLOWED_PROTOCOLS[val_unescaped.split(':')[0]].nil?
-                 attr_node.remove
-               end
+               "#{attr_node.namespace.prefix}:#{attr_node.node_name}"
+             else
+               attr_node.node_name
              end
-             if HashedWhiteList::SVG_ATTR_VAL_ALLOWS_REF[attr_name]
-               attr_node.value = attr_node.value.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, ' ') if attr_node.value
+
+             if attr_name =~ DATA_ATTRIBUTE_NAME
+               next
              end
-             if HashedWhiteList::SVG_ALLOW_LOCAL_HREF[node.name] && attr_name == 'xlink:href' && attr_node.value =~ /^\s*[^#\s].*/m
+
+             unless SafeList::ALLOWED_ATTRIBUTES.include?(attr_name)
                attr_node.remove
+               next
+             end
+
+             if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
+               next if scrub_uri_attribute(attr_node)
+             end
+
+             if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
+               scrub_attribute_that_allows_local_ref(attr_node)
+             end
+
+             if SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == "xlink:href" && attr_node.value =~ /^\s*[^#\s].*/m
+               attr_node.remove
+               next
              end
            end
-           if node.attributes['style']
-             node['style'] = scrub_css(node.attributes['style'])
+
+           scrub_css_attribute(node)
+
+           node.attribute_nodes.each do |attr_node|
+             if attr_node.value !~ /[^[:space:]]/ && attr_node.name !~ DATA_ATTRIBUTE_NAME
+               node.remove_attribute(attr_node.name)
+             end
            end
+
+           force_correct_attribute_escaping!(node)
+         end
+
+         def scrub_css_attribute(node)
+           style = node.attributes["style"]
+           style.value = scrub_css(style.value) if style
          end

-         # lifted nearly verbatim from html5lib
          def scrub_css(style)
-           # disallow urls
-           style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')
-
-           # gauntlet
-           return '' unless style =~ /^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/
-           return '' unless style =~ /^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$/
-
-           clean = []
-           style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
-             next if val.empty?
-             prop.downcase!
-             if HashedWhiteList::ALLOWED_CSS_PROPERTIES[prop]
-               clean << "#{prop}: #{val};"
-             elsif %w[background border margin padding].include?(prop.split('-')[0])
-               clean << "#{prop}: #{val};" unless val.split().any? do |keyword|
-                 HashedWhiteList::ALLOWED_CSS_KEYWORDS[keyword].nil? and
-                 keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
+           style_tree = Crass.parse_properties(style)
+           sanitized_tree = []
+
+           style_tree.each do |node|
+             next unless node[:node] == :property
+             next if node[:children].any? do |child|
+               [:url, :bad_url].include?(child[:node])
+             end
+
+             name = node[:name].downcase
+             next unless SafeList::ALLOWED_CSS_PROPERTIES.include?(name) ||
+               SafeList::ALLOWED_SVG_PROPERTIES.include?(name) ||
+               SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first)
+
+             value = node[:children].map do |child|
+               case child[:node]
+               when :whitespace
+                 nil
+               when :string
+                 if child[:raw] =~ CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES
+                   Crass::Parser.stringify(child)
+                 else
+                   nil
+                 end
+               when :function
+                 if SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase)
+                   Crass::Parser.stringify(child)
+                 end
+               when :ident
+                 keyword = child[:value]
+                 if !SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first) ||
+                   SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) ||
+                   (keyword =~ CSS_KEYWORDISH)
+                   keyword
+                 end
+               else
+                 child[:raw]
+               end
+             end.compact
+
+             next if value.empty?
+             value << CSS_IMPORTANT if node[:important]
+             propstring = format("%s:%s", name, value.join(" "))
+             sanitized_node = Crass.parse_properties(propstring).first
+             sanitized_tree << sanitized_node << CRASS_SEMICOLON
+           end
+
+           Crass::Parser.stringify(sanitized_tree)
+         end
+
+         def scrub_attribute_that_allows_local_ref(attr_node)
+           return unless attr_node.value
+
+           nodes = Crass::Parser.new(attr_node.value).parse_component_values
+
+           values = nodes.map do |node|
+             case node[:node]
+             when :url
+               if node[:value].start_with?("#")
+                 node[:raw]
+               else
+                 nil
                end
-             elsif HashedWhiteList::ALLOWED_SVG_PROPERTIES[prop]
-               clean << "#{prop}: #{val};"
+             when :hash, :ident, :string
+               node[:raw]
+             else
+               nil
             end
+           end.compact
+
+           attr_node.value = values.join(" ")
+         end
+
+         def scrub_uri_attribute(attr_node)
+           # this block lifted nearly verbatim from HTML5 sanitization
+           val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
+           if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
+             attr_node.remove
+             return true
+           elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
+             # permit only allowed data mediatypes
+             mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
+             mediatype, _ = mediatype.split(";")[0..1] if mediatype
+             if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
+               attr_node.remove
+               return true
+             end
+           end
+           false
+         end
+
+         #
+         # libxml2 >= 2.9.2 fails to escape comments within some attributes.
+         #
+         # see comments about CVE-2018-8048 within the tests for more information
+         #
+         def force_correct_attribute_escaping!(node)
+           return unless Nokogiri::VersionInfo.instance.libxml2?
+
+           node.attribute_nodes.each do |attr_node|
+             next unless LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES.include?(attr_node.name)
+
+             tag_name = LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES_QUALIFYING_TAG[attr_node.name]
+             next unless tag_name.nil? || tag_name == node.name
+
+             #
+             # this block is just like CGI.escape in Ruby 2.4, but
+             # only encodes space and double-quote, to mimic
+             # pre-2.9.2 behavior
+             #
+             encoding = attr_node.value.encoding
+             attr_node.value = attr_node.value.gsub(/[ "]/) do |m|
+               "%" + m.unpack("H2" * m.bytesize).join("%").upcase
+             end.force_encoding(encoding)
            end
+         end

-           style = clean.join(' ')
+         def cdata_needs_escaping?(node)
+           # Nokogiri's HTML4 parser on JRuby doesn't flag the child of a `style` or `script` tag as cdata, but it acts that way
+           node.cdata? || (Nokogiri.jruby? && node.text? && (node.parent.name == "style" || node.parent.name == "script"))
          end

-       end
+         def cdata_escape(node)
+           escaped_text = escape_tags(node.text)
+           if Nokogiri.jruby?
+             node.document.create_text_node(escaped_text)
+           else
+             node.document.create_cdata(escaped_text)
+           end
+         end

+         TABLE_FOR_ESCAPE_HTML__ = {
+           '<' => '&lt;',
+           '>' => '&gt;',
+           '&' => '&amp;',
+         }
+
+         def escape_tags(string)
+           # modified version of CGI.escapeHTML from ruby 3.1
+           enc = string.encoding
+           unless enc.ascii_compatible?
+             if enc.dummy?
+               origenc = enc
+               enc = Encoding::Converter.asciicompat_encoding(enc)
+               string = enc ? string.encode(enc) : string.b
+             end
+             table = Hash[TABLE_FOR_ESCAPE_HTML__.map {|pair|pair.map {|s|s.encode(enc)}}]
+             string = string.gsub(/#{"[<>&]".encode(enc)}/, table)
+             string.encode!(origenc) if origenc
+             string
+           else
+             string = string.b
+             string.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__)
+             string.force_encoding(enc)
+           end
+         end
+       end
      end
    end
  end
-
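A minimal sketch of how the Crass-based scrub_css above behaves when it is reached through the :strip scrubber (the output shown is approximate and illustrative, not taken from the gem's tests): a property whose value contains a url() token is dropped because :url nodes are rejected, while a safelisted property survives and is re-serialized.

    require "loofah"

    html = %q{<div style="background-color: #f00; background-image: url(http://example.com/x.png)">hi</div>}
    puts Loofah.fragment(html).scrub!(:strip).to_s
    # => roughly: <div style="background-color:#f00;">hi</div>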
data/lib/loofah/instance_methods.rb
@@ -1,3 +1,4 @@
+ # frozen_string_literal: true
  module Loofah
    #
    # Mixes +scrub!+ into Document, DocumentFragment, Node and NodeSet.
@@ -41,7 +42,7 @@ module Loofah
        when Nokogiri::XML::Document
          scrubber.traverse(root) if root
        when Nokogiri::XML::DocumentFragment
-         children.each { |node| node.scrub!(scrubber) } # TODO: children.scrub! once Nokogiri 1.4.2 is out
+         children.scrub! scrubber
        else
          scrubber.traverse(self)
        end
@@ -91,29 +92,34 @@ module Loofah
    #    # decidedly not ok for browser:
    #    frag.text(:encode_special_chars => false) # => "<script>alert('EVIL');</script>"
    #
-   def text(options={})
-     result = serialize_root.children.inner_text rescue ""
+   def text(options = {})
+     result = if serialize_root
+       serialize_root.children.reject(&:comment?).map(&:inner_text).join("")
+     else
+       ""
+     end
      if options[:encode_special_chars] == false
        result # possibly dangerous if rendered in a browser
      else
        encode_special_chars result
      end
    end
+
    alias :inner_text :text
-   alias :to_str :text
+   alias :to_str :text

    #
    # Returns a plain-text version of the markup contained by the
    # fragment, with HTML entities encoded.
    #
-   # This method is slower than #to_text, but is clever about
-   # whitespace around block elements.
+   # This method is slower than #text, but is clever about
+   # whitespace around block elements and line break elements.
    #
-   #   Loofah.document("<h1>Title</h1><div>Content</div>").to_text
-   #   # => "\nTitle\n\nContent\n"
+   #   Loofah.document("<h1>Title</h1><div>Content<br>Next line</div>").to_text
+   #   # => "\nTitle\n\nContent\nNext line\n"
    #
-   def to_text(options={})
-     Loofah::Helpers.remove_extraneous_whitespace self.dup.scrub!(:newline_block_elements).text(options)
+   def to_text(options = {})
+     Loofah.remove_extraneous_whitespace self.dup.scrub!(:newline_block_elements).text(options)
    end
  end

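An illustrative (approximate) usage sketch for the reworked #text and #to_text above: #text now ignores top-level comment nodes, and #to_text emits a newline for <br> through the :newline_block_elements scrubber.

    require "loofah"

    frag = Loofah.fragment("<div>Hello<br>world</div><!-- hidden -->")
    frag.text     # => "Helloworld"  (the comment text is no longer included)
    frag.to_text  # => roughly "\nHello\nworld\n" after extraneous whitespace is removed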
data/lib/loofah/metahelpers.rb
@@ -1,15 +1,14 @@
+ # frozen_string_literal: true
  module Loofah
-   module MetaHelpers
-     def self.HashifiedConstants(orig_module)
-       hashed_module = Module.new
-       orig_module.constants.each do |constant|
-         next unless orig_module.module_eval("#{constant}").is_a?(Array)
-         hashed_module.module_eval <<-CODE
-           #{constant} = {}
-           #{orig_module.name}::#{constant}.each { |c| #{constant}[c] = true ; #{constant}[c.downcase] = true }
-         CODE
+   module MetaHelpers # :nodoc:
+     def self.add_downcased_set_members_to_all_set_constants(mojule)
+       mojule.constants.each do |constant_sym|
+         constant = mojule.const_get constant_sym
+         next unless Set === constant
+         constant.dup.each do |member|
+           constant.add member.downcase
+         end
        end
-       hashed_module
      end
    end
  end
data/lib/loofah/scrubber.rb
@@ -1,8 +1,9 @@
+ # frozen_string_literal: true
  module Loofah
    #
    # A RuntimeError raised when Loofah could not find an appropriate scrubber.
    #
-   class ScrubberNotFound < RuntimeError ; end
+   class ScrubberNotFound < RuntimeError; end

    #
    # A Scrubber wraps up a block (or method) that is run on an HTML node (element):
@@ -36,7 +37,7 @@ module Loofah
      CONTINUE = Object.new.freeze

      # Top-down Scrubbers may return STOP to indicate that the subtree should not be traversed.
-     STOP = Object.new.freeze
+     STOP = Object.new.freeze

      # When a scrubber is initialized, the :direction may be specified
      # as :top_down (the default) or :bottom_up.
@@ -64,7 +65,7 @@ module Loofah
      def initialize(options = {}, &block)
        direction = options[:direction] || :top_down
        unless [:top_down, :bottom_up].include?(direction)
-         raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
+         raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
        end
        @direction, @block = direction, block
      end
@@ -86,16 +87,31 @@ module Loofah
        raise ScrubberNotFound, "No scrub method has been defined on #{self.class.to_s}"
      end

+     #
+     #  If the attribute is not set, add it
+     #  If the attribute is set, don't overwrite the existing value
+     #
+     def append_attribute(node, attribute, value)
+       current_value = node.get_attribute(attribute) || ""
+       current_values = current_value.split(/\s+/)
+       updated_value = current_values | [value]
+       node.set_attribute(attribute, updated_value.join(" "))
+     end
+
      private

      def html5lib_sanitize(node)
        case node.type
        when Nokogiri::XML::Node::ELEMENT_NODE
-         if HTML5::HashedWhiteList::ALLOWED_ELEMENTS_WITH_LIBXML2[node.name]
+         if HTML5::Scrub.allowed_element? node.name
            HTML5::Scrub.scrub_attributes node
            return Scrubber::CONTINUE
          end
        when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
+         if HTML5::Scrub.cdata_needs_escaping?(node)
+           node.before(HTML5::Scrub.cdata_escape(node))
+           return Scrubber::STOP
+         end
          return Scrubber::CONTINUE
        end
        Scrubber::STOP
@@ -107,11 +123,11 @@ module Loofah
        else
          return if scrub(node) == STOP
        end
-       node.children.each {|j| traverse_conditionally_top_down(j)}
+       node.children.each { |j| traverse_conditionally_top_down(j) }
      end

      def traverse_conditionally_bottom_up(node)
-       node.children.each {|j| traverse_conditionally_bottom_up(j)}
+       node.children.each { |j| traverse_conditionally_bottom_up(j) }
        if block
          block.call(node)
        else
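The new Scrubber#append_attribute above unions a value into any existing attribute value instead of overwriting it. A hedged sketch of using it from a custom scrubber follows; the NoReferrer class and its output are hypothetical, not part of the gem.

    require "loofah"

    # Hypothetical custom scrubber built on the append_attribute helper shown above.
    class NoReferrer < Loofah::Scrubber
      def scrub(node)
        return CONTINUE unless node.type == Nokogiri::XML::Node::ELEMENT_NODE && node.name == "a"
        append_attribute(node, "rel", "noreferrer")
        STOP
      end
    end

    html = %q{<a href="https://example.com/" rel="nofollow">link</a>}
    puts Loofah.fragment(html).scrub!(NoReferrer.new).to_s
    # the existing "nofollow" is preserved; rel becomes roughly "nofollow noreferrer"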
data/lib/loofah/scrubbers.rb
@@ -1,7 +1,8 @@
+ # frozen_string_literal: true
  module Loofah
    #
    # Loofah provides some built-in scrubbers for sanitizing with
-   # HTML5lib's whitelist and for accomplishing some common
+   # HTML5lib's safelist and for accomplishing some common
    # transformation tasks.
    #
    #
@@ -58,6 +59,30 @@ module Loofah
    #    Loofah.fragment(link_farmers_markup).scrub!(:nofollow)
    #    => "ohai! <a href='http://www.myswarmysite.com/' rel="nofollow">I like your blog post</a>"
    #
+   #
+   #  === Loofah::Scrubbers::NoOpener / scrub!(:noopener)
+   #
+   #  +:noopener+ adds a rel="noopener" attribute to all links
+   #
+   #    link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
+   #    Loofah.fragment(link_farmers_markup).scrub!(:noopener)
+   #    => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
+   #
+   #
+   #  === Loofah::Scrubbers::Unprintable / scrub!(:unprintable)
+   #
+   #  +:unprintable+ removes unprintable Unicode characters.
+   #
+   #    markup = "<p>Some text with an unprintable character at the end\u2028</p>"
+   #    Loofah.fragment(markup).scrub!(:unprintable)
+   #    => "<p>Some text with an unprintable character at the end</p>"
+   #
+   #  You may not be able to see the unprintable character in the above example, but there is a
+   #  U+2028 character right before the closing </p> tag. These characters can cause issues if
+   #  the content is ever parsed by JavaScript - more information here:
+   #
+   #    http://timelessrepo.com/json-isnt-a-javascript-subset
+   #
    module Scrubbers
      #
      #  === scrub!(:strip)
@@ -75,8 +100,9 @@

      def scrub(node)
        return CONTINUE if html5lib_sanitize(node) == CONTINUE
-       node.before node.inner_html
+       node.before(node.children)
        node.remove
+       return STOP
      end
    end

@@ -117,8 +143,7 @@

      def scrub(node)
        return CONTINUE if html5lib_sanitize(node) == CONTINUE
-       replacement_killer = Nokogiri::XML::Text.new(node.to_s, node.document)
-       node.add_next_sibling replacement_killer
+       node.add_next_sibling Nokogiri::XML::Text.new(node.to_s, node.document)
        node.remove
        return STOP
      end
@@ -150,7 +175,7 @@
      def scrub(node)
        case node.type
        when Nokogiri::XML::Node::ELEMENT_NODE
-         if HTML5::HashedWhiteList::ALLOWED_ELEMENTS_WITH_LIBXML2[node.name]
+         if HTML5::Scrub.allowed_element? node.name
            node.attributes.each { |attr| node.remove_attribute(attr.first) }
            return CONTINUE if node.namespaces.empty?
          end
@@ -177,9 +202,30 @@
      end

      def scrub(node)
-       return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == 'a')
-       node.set_attribute('rel', 'nofollow')
-       return STOP
+       return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
+       append_attribute(node, "rel", "nofollow")
+       return STOP
+     end
+   end
+
+   #
+   #  === scrub!(:noopener)
+   #
+   #  +:noopener+ adds a rel="noopener" attribute to all links
+   #
+   #    link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
+   #    Loofah.fragment(link_farmers_markup).scrub!(:noopener)
+   #    => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
+   #
+   class NoOpener < Scrubber
+     def initialize
+       @direction = :top_down
+     end
+
+     def scrub(node)
+       return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
+       append_attribute(node, "rel", "noopener")
+       return STOP
      end
    end

@@ -190,23 +236,57 @@
      end

      def scrub(node)
-       return CONTINUE unless Loofah::HashedElements::BLOCK_LEVEL[node.name]
-       replacement_killer = Nokogiri::XML::Text.new("\n#{node.content}\n", node.document)
-       node.add_next_sibling replacement_killer
+       return CONTINUE unless Loofah::Elements::LINEBREAKERS.include?(node.name)
+       replacement = if Loofah::Elements::INLINE_LINE_BREAK.include?(node.name)
+         "\n"
+       else
+         "\n#{node.content}\n"
+       end
+       node.add_next_sibling Nokogiri::XML::Text.new(replacement, node.document)
        node.remove
      end
    end

+   #
+   #  === scrub!(:unprintable)
+   #
+   #  +:unprintable+ removes unprintable Unicode characters.
+   #
+   #    markup = "<p>Some text with an unprintable character at the end\u2028</p>"
+   #    Loofah.fragment(markup).scrub!(:unprintable)
+   #    => "<p>Some text with an unprintable character at the end</p>"
+   #
+   #  You may not be able to see the unprintable character in the above example, but there is a
+   #  U+2028 character right before the closing </p> tag. These characters can cause issues if
+   #  the content is ever parsed by JavaScript - more information here:
+   #
+   #    http://timelessrepo.com/json-isnt-a-javascript-subset
+   #
+   class Unprintable < Scrubber
+     def initialize
+       @direction = :top_down
+     end
+
+     def scrub(node)
+       if node.type == Nokogiri::XML::Node::TEXT_NODE || node.type == Nokogiri::XML::Node::CDATA_SECTION_NODE
+         node.content = node.content.gsub(/\u2028|\u2029/, "")
+       end
+       CONTINUE
+     end
+   end
+
    #
    # A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune).
    #
    MAP = {
-     :escape => Escape,
-     :prune => Prune,
+     :escape => Escape,
+     :prune => Prune,
      :whitewash => Whitewash,
-     :strip => Strip,
-     :nofollow => NoFollow,
-     :newline_block_elements => NewlineBlockElements
+     :strip => Strip,
+     :nofollow => NoFollow,
+     :noopener => NoOpener,
+     :newline_block_elements => NewlineBlockElements,
+     :unprintable => Unprintable,
    }

data/lib/loofah/version.rb
@@ -0,0 +1,5 @@
+ # frozen_string_literal: true
+ module Loofah
+   # The version of Loofah you are using
+   VERSION = "2.19.1"
+ end
data/lib/loofah/xml/document.rb
@@ -1,3 +1,4 @@
+ # frozen_string_literal: true
  module Loofah
    module XML # :nodoc:
      #
data/lib/loofah/xml/document_fragment.rb
@@ -1,3 +1,4 @@
+ # frozen_string_literal: true
  module Loofah
    module XML # :nodoc:
      #
@@ -12,8 +13,10 @@ module Loofah
          # constructor. Applications should use Loofah.fragment to
          # parse a fragment.
          #
-         def parse tags
-           self.new(Loofah::XML::Document.new, tags)
+         def parse(tags)
+           doc = Loofah::XML::Document.new
+           doc.encoding = tags.encoding.name if tags.respond_to?(:encoding)
+           self.new(doc, tags)
          end
        end
      end
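A small sketch (illustrative only) of the encoding-preserving parse above: when the input string responds to #encoding, the backing XML document is now seeded with that encoding before parsing.

    require "loofah"

    xml = "<name>café</name>".encode(Encoding::UTF_8)
    frag = Loofah.xml_fragment(xml)
    puts frag.to_xml # should round-trip as UTF-8 rather than a parser default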