RubyGems - loofah - Versions diffs - 2.2.3 → 2.19.1 - Mend

loofah 2.2.3 → 2.19.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42)

checksums.yaml +4 -4
data/CHANGELOG.md +221 -31
data/README.md +18 -24
data/lib/loofah/elements.rb +79 -75
data/lib/loofah/helpers.rb +18 -7
data/lib/loofah/html/document.rb +1 -0
data/lib/loofah/html/document_fragment.rb +4 -2
data/lib/loofah/html5/libxml2_workarounds.rb +8 -7
data/lib/loofah/html5/safelist.rb +1042 -0
data/lib/loofah/html5/scrub.rb +150 -55
data/lib/loofah/instance_methods.rb +14 -8
data/lib/loofah/metahelpers.rb +2 -1
data/lib/loofah/scrubber.rb +12 -7
data/lib/loofah/scrubbers.rb +21 -19
data/lib/loofah/version.rb +5 -0
data/lib/loofah/xml/document.rb +1 -0
data/lib/loofah/xml/document_fragment.rb +2 -1
data/lib/loofah.rb +35 -18
metadata +52 -138
data/.gemtest +0 -0
data/Gemfile +0 -22
data/Manifest.txt +0 -40
data/Rakefile +0 -79
data/benchmark/benchmark.rb +0 -149
data/benchmark/fragment.html +0 -96
data/benchmark/helper.rb +0 -73
data/benchmark/www.slashdot.com.html +0 -2560
data/lib/loofah/html5/whitelist.rb +0 -186
data/test/assets/msword.html +0 -63
data/test/assets/testdata_sanitizer_tests1.dat +0 -502
data/test/helper.rb +0 -18
data/test/html5/test_sanitizer.rb +0 -382
data/test/integration/test_ad_hoc.rb +0 -204
data/test/integration/test_helpers.rb +0 -43
data/test/integration/test_html.rb +0 -72
data/test/integration/test_scrubbers.rb +0 -400
data/test/integration/test_xml.rb +0 -55
data/test/unit/test_api.rb +0 -142
data/test/unit/test_encoding.rb +0 -20
data/test/unit/test_helpers.rb +0 -62
data/test/unit/test_scrubber.rb +0 -229
data/test/unit/test_scrubbers.rb +0 -14

data/lib/loofah/html5/scrub.rb CHANGED Viewed

@@ -1,104 +1,160 @@
-require 'cgi'
-require 'crass'
+# frozen_string_literal: true
+require "cgi"
+require "crass"
 module Loofah
   module HTML5 # :nodoc:
     module Scrub
       CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/
-      CSS_KEYWORDISH = /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
-      CRASS_SEMICOLON = {:node => :semicolon, :raw => ";"}
+      CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(ch|cm|r?em|ex|in|lh|mm|pc|pt|px|Q|vmax|vmin|vw|vh|%|,|\))?)\z/
+      CRASS_SEMICOLON = { node: :semicolon, raw: ";" }
+      CSS_IMPORTANT = '!important'
+      CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES = /\A(["'])?[^"']+\1\z/
+      DATA_ATTRIBUTE_NAME = /\Adata-[\w-]+\z/
       class << self
-        def allowed_element? element_name
-          ::Loofah::HTML5::WhiteList::ALLOWED_ELEMENTS_WITH_LIBXML2.include? element_name
+        def allowed_element?(element_name)
+          ::Loofah::HTML5::SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2.include?(element_name)
         end
         #  alternative implementation of the html5lib attribute scrubbing algorithm
-        def scrub_attributes node
+        def scrub_attributes(node)
           node.attribute_nodes.each do |attr_node|
             attr_name = if attr_node.namespace
-                          "#{attr_node.namespace.prefix}:#{attr_node.node_name}"
-                        else
-                          attr_node.node_name
-                        end
+              "#{attr_node.namespace.prefix}:#{attr_node.node_name}"
+            else
+              attr_node.node_name
+            end
-            if attr_name =~ /\Adata-[\w-]+\z/
+            if attr_name =~ DATA_ATTRIBUTE_NAME
               next
             end
-            unless WhiteList::ALLOWED_ATTRIBUTES.include?(attr_name)
+            unless SafeList::ALLOWED_ATTRIBUTES.include?(attr_name)
               attr_node.remove
               next
             end
-            if WhiteList::ATTR_VAL_IS_URI.include?(attr_name)
-              # this block lifted nearly verbatim from HTML5 sanitization
-              val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS,'').downcase
-              if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && ! WhiteList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[0])
-                attr_node.remove
-                next
-              elsif val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[0] == 'data'
-                # permit only allowed data mediatypes
-                mediatype = val_unescaped.split(WhiteList::PROTOCOL_SEPARATOR)[1]
-                mediatype, _ = mediatype.split(';')[0..1] if mediatype
-                if mediatype && !WhiteList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
-                  attr_node.remove
-                  next
-                end
-              end
+            if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
+              next if scrub_uri_attribute(attr_node)
             end
-            if WhiteList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
-              attr_node.value = attr_node.value.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, ' ') if attr_node.value
+            if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
+              scrub_attribute_that_allows_local_ref(attr_node)
             end
-            if WhiteList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == 'xlink:href' && attr_node.value =~ /^\s*[^#\s].*/m
+            if SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == "xlink:href" && attr_node.value =~ /^\s*[^#\s].*/m
               attr_node.remove
               next
             end
           end
-          scrub_css_attribute node
+          scrub_css_attribute(node)
           node.attribute_nodes.each do |attr_node|
-            node.remove_attribute(attr_node.name) if attr_node.value !~ /[^[:space:]]/
+            if attr_node.value !~ /[^[:space:]]/ && attr_node.name !~ DATA_ATTRIBUTE_NAME
+              node.remove_attribute(attr_node.name)
+            end
           end
-          force_correct_attribute_escaping! node
+          force_correct_attribute_escaping!(node)
         end
-        def scrub_css_attribute node
-          style = node.attributes['style']
+        def scrub_css_attribute(node)
+          style = node.attributes["style"]
           style.value = scrub_css(style.value) if style
         end
-        def scrub_css style
-          style_tree = Crass.parse_properties style
+        def scrub_css(style)
+          style_tree = Crass.parse_properties(style)
           sanitized_tree = []
           style_tree.each do |node|
             next unless node[:node] == :property
             next if node[:children].any? do |child|
-              [:url, :bad_url].include?(child[:node]) || (child[:node] == :function && !WhiteList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase))
+              [:url, :bad_url].include?(child[:node])
             end
             name = node[:name].downcase
-            if WhiteList::ALLOWED_CSS_PROPERTIES.include?(name) || WhiteList::ALLOWED_SVG_PROPERTIES.include?(name)
-              sanitized_tree << node << CRASS_SEMICOLON
-            elsif WhiteList::SHORTHAND_CSS_PROPERTIES.include?(name.split('-').first)
-              value = node[:value].split.map do |keyword|
-                if WhiteList::ALLOWED_CSS_KEYWORDS.include?(keyword) || keyword =~ CSS_KEYWORDISH
+            next unless SafeList::ALLOWED_CSS_PROPERTIES.include?(name) ||
+                SafeList::ALLOWED_SVG_PROPERTIES.include?(name) ||
+                SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first)
+            value = node[:children].map do |child|
+              case child[:node]
+              when :whitespace
+                nil
+              when :string
+                if child[:raw] =~ CSS_PROPERTY_STRING_WITHOUT_EMBEDDED_QUOTES
+                  Crass::Parser.stringify(child)
+                else
+                  nil
+                end
+              when :function
+                if SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase)
+                  Crass::Parser.stringify(child)
+                end
+              when :ident
+                keyword = child[:value]
+                if !SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split("-").first) ||
+                   SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) ||
+                   (keyword =~ CSS_KEYWORDISH)
                   keyword
                 end
-              end.compact
-              unless value.empty?
-                propstring = sprintf "%s:%s", name, value.join(" ")
-                sanitized_node = Crass.parse_properties(propstring).first
-                sanitized_tree << sanitized_node << CRASS_SEMICOLON
+              else
+                child[:raw]
               end
-            end
+            end.compact
+            next if value.empty?
+            value << CSS_IMPORTANT if node[:important]
+            propstring = format("%s:%s", name, value.join(" "))
+            sanitized_node = Crass.parse_properties(propstring).first
+            sanitized_tree << sanitized_node << CRASS_SEMICOLON
           end
-          Crass::Parser.stringify sanitized_tree
+          Crass::Parser.stringify(sanitized_tree)
+        end
+        def scrub_attribute_that_allows_local_ref(attr_node)
+          return unless attr_node.value
+          nodes = Crass::Parser.new(attr_node.value).parse_component_values
+          values = nodes.map do |node|
+            case node[:node]
+            when :url
+              if node[:value].start_with?("#")
+                node[:raw]
+              else
+                nil
+              end
+            when :hash, :ident, :string
+              node[:raw]
+            else
+              nil
+            end
+          end.compact
+          attr_node.value = values.join(" ")
+        end
+        def scrub_uri_attribute(attr_node)
+          # this block lifted nearly verbatim from HTML5 sanitization
+          val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS, "").downcase
+          if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
+            attr_node.remove
+            return true
+          elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == "data"
+            # permit only allowed data mediatypes
+            mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
+            mediatype, _ = mediatype.split(";")[0..1] if mediatype
+            if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
+              attr_node.remove
+              return true
+            end
+          end
+          false
         end
         #
@@ -106,7 +162,7 @@ module Loofah
         #
         #  see comments about CVE-2018-8048 within the tests for more information
         #
-        def force_correct_attribute_escaping! node
+        def force_correct_attribute_escaping!(node)
           return unless Nokogiri::VersionInfo.instance.libxml2?
           node.attribute_nodes.each do |attr_node|
@@ -122,11 +178,50 @@ module Loofah
             #
             encoding = attr_node.value.encoding
             attr_node.value = attr_node.value.gsub(/[ "]/) do |m|
-              '%' + m.unpack('H2' * m.bytesize).join('%').upcase
+              "%" + m.unpack("H2" * m.bytesize).join("%").upcase
             end.force_encoding(encoding)
           end
         end
+        def cdata_needs_escaping?(node)
+          # Nokogiri's HTML4 parser on JRuby doesn't flag the child of a `style` or `script` tag as cdata, but it acts that way
+          node.cdata? || (Nokogiri.jruby? && node.text? && (node.parent.name == "style" || node.parent.name == "script"))
+        end
+        def cdata_escape(node)
+          escaped_text = escape_tags(node.text)
+          if Nokogiri.jruby?
+            node.document.create_text_node(escaped_text)
+          else
+            node.document.create_cdata(escaped_text)
+          end
+        end
+        TABLE_FOR_ESCAPE_HTML__ = {
+          '<' => '&lt;',
+          '>' => '&gt;',
+          '&' => '&amp;',
+        }
+        def escape_tags(string)
+          # modified version of CGI.escapeHTML from ruby 3.1
+          enc = string.encoding
+          unless enc.ascii_compatible?
+            if enc.dummy?
+              origenc = enc
+              enc = Encoding::Converter.asciicompat_encoding(enc)
+              string = enc ? string.encode(enc) : string.b
+            end
+            table = Hash[TABLE_FOR_ESCAPE_HTML__.map {|pair|pair.map {|s|s.encode(enc)}}]
+            string = string.gsub(/#{"[<>&]".encode(enc)}/, table)
+            string.encode!(origenc) if origenc
+            string
+          else
+            string = string.b
+            string.gsub!(/[<>&]/, TABLE_FOR_ESCAPE_HTML__)
+            string.force_encoding(enc)
+          end
+        end
       end
     end
   end

data/lib/loofah/instance_methods.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 module Loofah
   #
   #  Mixes +scrub!+ into Document, DocumentFragment, Node and NodeSet.
@@ -91,28 +92,33 @@ module Loofah
     #    # decidedly not ok for browser:
     #    frag.text(:encode_special_chars => false) # => "<script>alert('EVIL');</script>"
     #
-    def text(options={})
-      result = serialize_root.children.inner_text rescue ""
+    def text(options = {})
+      result = if serialize_root
+        serialize_root.children.reject(&:comment?).map(&:inner_text).join("")
+      else
+        ""
+      end
       if options[:encode_special_chars] == false
         result # possibly dangerous if rendered in a browser
       else
         encode_special_chars result
       end
     end
     alias :inner_text :text
-    alias :to_str     :text
+    alias :to_str :text
     #
     #  Returns a plain-text version of the markup contained by the
     #  fragment, with HTML entities encoded.
     #
-    #  This method is slower than #to_text, but is clever about
-    #  whitespace around block elements.
+    #  This method is slower than #text, but is clever about
+    #  whitespace around block elements and line break elements.
     #
-    #    Loofah.document("<h1>Title</h1><div>Content</div>").to_text
-    #    # => "\nTitle\n\nContent\n"
+    #    Loofah.document("<h1>Title</h1><div>Content<br>Next line</div>").to_text
+    #    # => "\nTitle\n\nContent\nNext line\n"
     #
-    def to_text(options={})
+    def to_text(options = {})
       Loofah.remove_extraneous_whitespace self.dup.scrub!(:newline_block_elements).text(options)
     end
   end

data/lib/loofah/metahelpers.rb CHANGED Viewed

@@ -1,6 +1,7 @@
+# frozen_string_literal: true
 module Loofah
   module MetaHelpers # :nodoc:
-    def self.add_downcased_set_members_to_all_set_constants mojule
+    def self.add_downcased_set_members_to_all_set_constants(mojule)
       mojule.constants.each do |constant_sym|
         constant = mojule.const_get constant_sym
         next unless Set === constant

data/lib/loofah/scrubber.rb CHANGED Viewed

@@ -1,8 +1,9 @@
+# frozen_string_literal: true
 module Loofah
   #
   #  A RuntimeError raised when Loofah could not find an appropriate scrubber.
   #
-  class ScrubberNotFound < RuntimeError ; end
+  class ScrubberNotFound < RuntimeError; end
   #
   #  A Scrubber wraps up a block (or method) that is run on an HTML node (element):
@@ -36,7 +37,7 @@ module Loofah
     CONTINUE = Object.new.freeze
     # Top-down Scrubbers may return STOP to indicate that the subtree should not be traversed.
-    STOP     = Object.new.freeze
+    STOP = Object.new.freeze
     # When a scrubber is initialized, the :direction may be specified
     # as :top_down (the default) or :bottom_up.
@@ -64,7 +65,7 @@ module Loofah
     def initialize(options = {}, &block)
       direction = options[:direction] || :top_down
       unless [:top_down, :bottom_up].include?(direction)
-        raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
+        raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
       end
       @direction, @block = direction, block
     end
@@ -91,10 +92,10 @@ module Loofah
     # If the attribute is set, don't overwrite the existing value
     #
     def append_attribute(node, attribute, value)
-      current_value = node.get_attribute(attribute) || ''
+      current_value = node.get_attribute(attribute) || ""
       current_values = current_value.split(/\s+/)
       updated_value = current_values | [value]
-      node.set_attribute(attribute, updated_value.join(' '))
+      node.set_attribute(attribute, updated_value.join(" "))
     end
     private
@@ -107,6 +108,10 @@ module Loofah
           return Scrubber::CONTINUE
         end
       when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
+        if HTML5::Scrub.cdata_needs_escaping?(node)
+          node.before(HTML5::Scrub.cdata_escape(node))
+          return Scrubber::STOP
+        end
         return Scrubber::CONTINUE
       end
       Scrubber::STOP
@@ -118,11 +123,11 @@ module Loofah
       else
         return if scrub(node) == STOP
       end
-      node.children.each {|j| traverse_conditionally_top_down(j)}
+      node.children.each { |j| traverse_conditionally_top_down(j) }
     end
     def traverse_conditionally_bottom_up(node)
-      node.children.each {|j| traverse_conditionally_bottom_up(j)}
+      node.children.each { |j| traverse_conditionally_bottom_up(j) }
       if block
         block.call(node)
       else

data/lib/loofah/scrubbers.rb CHANGED Viewed

@@ -1,7 +1,8 @@
+# frozen_string_literal: true
 module Loofah
   #
   #  Loofah provides some built-in scrubbers for sanitizing with
-  #  HTML5lib's whitelist and for accomplishing some common
+  #  HTML5lib's safelist and for accomplishing some common
   #  transformation tasks.
   #
   #
@@ -99,13 +100,9 @@ module Loofah
       def scrub(node)
         return CONTINUE if html5lib_sanitize(node) == CONTINUE
-        if node.children.length == 1 && node.children.first.cdata?
-          sanitized_text = Loofah.fragment(node.children.first.to_html).scrub!(:strip).to_html
-          node.before Nokogiri::XML::Text.new(sanitized_text, node.document)
-        else
-          node.before node.children
-        end
+        node.before(node.children)
         node.remove
+        return STOP
       end
     end
@@ -205,8 +202,8 @@ module Loofah
       end
       def scrub(node)
-        return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == 'a')
-        append_attribute(node, 'rel', 'nofollow')
+        return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
+        append_attribute(node, "rel", "nofollow")
         return STOP
       end
     end
@@ -226,8 +223,8 @@ module Loofah
       end
       def scrub(node)
-        return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == 'a')
-        append_attribute(node, 'rel', 'noopener')
+        return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
+        append_attribute(node, "rel", "noopener")
         return STOP
       end
     end
@@ -239,8 +236,13 @@ module Loofah
       end
       def scrub(node)
-        return CONTINUE unless Loofah::Elements::BLOCK_LEVEL.include?(node.name)
-        node.add_next_sibling Nokogiri::XML::Text.new("\n#{node.content}\n", node.document)
+        return CONTINUE unless Loofah::Elements::LINEBREAKERS.include?(node.name)
+        replacement = if Loofah::Elements::INLINE_LINE_BREAK.include?(node.name)
+          "\n"
+        else
+          "\n#{node.content}\n"
+        end
+        node.add_next_sibling Nokogiri::XML::Text.new(replacement, node.document)
         node.remove
       end
     end
@@ -267,7 +269,7 @@ module Loofah
       def scrub(node)
         if node.type == Nokogiri::XML::Node::TEXT_NODE || node.type == Nokogiri::XML::Node::CDATA_SECTION_NODE
-          node.content = node.content.gsub(/\u2028|\u2029/, '')
+          node.content = node.content.gsub(/\u2028|\u2029/, "")
         end
         CONTINUE
       end
@@ -277,14 +279,14 @@ module Loofah
     #  A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune).
     #
     MAP = {
-      :escape    => Escape,
-      :prune     => Prune,
+      :escape => Escape,
+      :prune => Prune,
       :whitewash => Whitewash,
-      :strip     => Strip,
-      :nofollow  => NoFollow,
+      :strip => Strip,
+      :nofollow => NoFollow,
       :noopener => NoOpener,
       :newline_block_elements => NewlineBlockElements,
-      :unprintable => Unprintable
+      :unprintable => Unprintable,
     }
     #

data/lib/loofah/version.rb ADDED Viewed

@@ -0,0 +1,5 @@
+# frozen_string_literal: true
+module Loofah
+  # The version of Loofah you are using
+  VERSION = "2.19.1"
+end

data/lib/loofah/xml/document.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 module Loofah
   module XML # :nodoc:
     #

data/lib/loofah/xml/document_fragment.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 module Loofah
   module XML # :nodoc:
     #
@@ -12,7 +13,7 @@ module Loofah
         #  constructor. Applications should use Loofah.fragment to
         #  parse a fragment.
         #
-        def parse tags
+        def parse(tags)
           doc = Loofah::XML::Document.new
           doc.encoding = tags.encoding.name if tags.respond_to?(:encoding)
           self.new(doc, tags)

data/lib/loofah.rb CHANGED Viewed

@@ -1,22 +1,24 @@
+# frozen_string_literal: true
 $LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__))) unless $LOAD_PATH.include?(File.expand_path(File.dirname(__FILE__)))
-require 'nokogiri'
+require "nokogiri"
-require 'loofah/metahelpers'
-require 'loofah/elements'
+require_relative "loofah/version"
+require_relative "loofah/metahelpers"
+require_relative "loofah/elements"
-require 'loofah/html5/whitelist'
-require 'loofah/html5/libxml2_workarounds'
-require 'loofah/html5/scrub'
+require_relative "loofah/html5/safelist"
+require_relative "loofah/html5/libxml2_workarounds"
+require_relative "loofah/html5/scrub"
-require 'loofah/scrubber'
-require 'loofah/scrubbers'
+require_relative "loofah/scrubber"
+require_relative "loofah/scrubbers"
-require 'loofah/instance_methods'
-require 'loofah/xml/document'
-require 'loofah/xml/document_fragment'
-require 'loofah/html/document'
-require 'loofah/html/document_fragment'
+require_relative "loofah/instance_methods"
+require_relative "loofah/xml/document"
+require_relative "loofah/xml/document_fragment"
+require_relative "loofah/html/document"
+require_relative "loofah/html/document_fragment"
 # == Strings and IO Objects as Input
 #
@@ -27,14 +29,11 @@ require 'loofah/html/document_fragment'
 # quantities of docs.
 #
 module Loofah
-  # The version of Loofah you are using
-  VERSION = '2.2.3'
   class << self
     # Shortcut for Loofah::HTML::Document.parse
     # This method accepts the same parameters as Nokogiri::HTML::Document.parse
     def document(*args, &block)
-      Loofah::HTML::Document.parse(*args, &block)
+      remove_comments_before_html_element Loofah::HTML::Document.parse(*args, &block)
     end
     # Shortcut for Loofah::HTML::DocumentFragment.parse
@@ -77,7 +76,25 @@ module Loofah
     # A helper to remove extraneous whitespace from text-ified HTML
     def remove_extraneous_whitespace(string)
-      string.gsub(/\n\s*\n\s*\n/,"\n\n")
+      string.gsub(/\n\s*\n\s*\n/, "\n\n")
+    end
+    private
+    # remove comments that exist outside of the HTML element.
+    #
+    # these comments are allowed by the HTML spec:
+    #
+    #    https://www.w3.org/TR/html401/struct/global.html#h-7.1
+    #
+    # but are not scrubbed by Loofah because these nodes don't meet
+    # the contract that scrubbers expect of a node (e.g., it can be
+    # replaced, sibling and children nodes can be created).
+    def remove_comments_before_html_element(doc)
+      doc.children.each do |child|
+        child.unlink if child.comment?
+      end
+      doc
     end
   end
 end