RubyGems - loofah - Versions diffs - 0.4.2 → 2.25.0 - Mend

loofah 0.4.2 → 2.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +604 -0
data/MIT-LICENSE.txt +3 -1
data/README.md +410 -0
data/SECURITY.md +18 -0
data/lib/loofah/concerns.rb +207 -0
data/lib/loofah/elements.rb +98 -0
data/lib/loofah/helpers.rb +91 -4
data/lib/loofah/html4/document.rb +17 -0
data/lib/loofah/html4/document_fragment.rb +15 -0
data/lib/loofah/html5/document.rb +17 -0
data/lib/loofah/html5/document_fragment.rb +15 -0
data/lib/loofah/html5/libxml2_workarounds.rb +28 -0
data/lib/loofah/html5/safelist.rb +1058 -0
data/lib/loofah/html5/scrub.rb +211 -40
data/lib/loofah/metahelpers.rb +18 -0
data/lib/loofah/scrubber.rb +31 -13
data/lib/loofah/scrubbers.rb +262 -31
data/lib/loofah/version.rb +6 -0
data/lib/loofah/xml/document.rb +2 -0
data/lib/loofah/xml/document_fragment.rb +6 -9
data/lib/loofah.rb +131 -52
metadata +79 -158
data/CHANGELOG.rdoc +0 -92
data/DEPRECATED.rdoc +0 -12
data/Manifest.txt +0 -34
data/README.rdoc +0 -330
data/Rakefile +0 -61
data/TODO.rdoc +0 -4
data/benchmark/benchmark.rb +0 -149
data/benchmark/fragment.html +0 -96
data/benchmark/helper.rb +0 -73
data/benchmark/www.slashdot.com.html +0 -2560
data/init.rb +0 -1
data/lib/loofah/active_record.rb +0 -62
data/lib/loofah/html/document.rb +0 -22
data/lib/loofah/html/document_fragment.rb +0 -46
data/lib/loofah/html5/whitelist.rb +0 -174
data/lib/loofah/instance_methods.rb +0 -77
data/lib/loofah/xss_foliate.rb +0 -212
data/test/helper.rb +0 -8
data/test/html5/test_sanitizer.rb +0 -248
data/test/test_active_record.rb +0 -146
data/test/test_ad_hoc.rb +0 -272
data/test/test_api.rb +0 -128
data/test/test_helpers.rb +0 -28
data/test/test_scrubber.rb +0 -227
data/test/test_scrubbers.rb +0 -144
data/test/test_xss_foliate.rb +0 -171
data.tar.gz.sig +0 -0
metadata.gz.sig +0 -2

data/lib/loofah/scrubbers.rb CHANGED Viewed

@@ -1,7 +1,9 @@
+# frozen_string_literal: true
 module Loofah
   #
   #  Loofah provides some built-in scrubbers for sanitizing with
-  #  HTML5lib's whitelist and for accomplishing some common
+  #  HTML5lib's safelist and for accomplishing some common
   #  transformation tasks.
   #
   #
@@ -10,7 +12,7 @@ module Loofah
   #  +:strip+ removes unknown/unsafe tags, but leaves behind the pristine contents:
   #
   #     unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
-  #     Loofah.fragment(unsafe_html).scrub!(:strip)
+  #     Loofah.html5_fragment(unsafe_html).scrub!(:strip)
   #     => "ohai! <div>div is safe</div> but foo is <b>not</b>"
   #
   #
@@ -19,7 +21,7 @@ module Loofah
   #  +:prune+ removes unknown/unsafe tags and their contents (including their subtrees):
   #
   #     unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
-  #     Loofah.fragment(unsafe_html).scrub!(:prune)
+  #     Loofah.html5_fragment(unsafe_html).scrub!(:prune)
   #     => "ohai! <div>div is safe</div> "
   #
   #
@@ -28,7 +30,7 @@ module Loofah
   #  +:escape+ performs HTML entity escaping on the unknown/unsafe tags:
   #
   #     unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
-  #     Loofah.fragment(unsafe_html).scrub!(:escape)
+  #     Loofah.html5_fragment(unsafe_html).scrub!(:escape)
   #     => "ohai! <div>div is safe</div> &lt;foo&gt;but foo is &lt;b&gt;not&lt;/b&gt;&lt;/foo&gt;"
   #
   #
@@ -40,7 +42,7 @@ module Loofah
   #  layer of paint on top of the HTML input to make it look nice.
   #
   #     messy_markup = "ohai! <div id='foo' class='bar' style='margin: 10px'>div with attributes</div>"
-  #     Loofah.fragment(messy_markup).scrub!(:whitewash)
+  #     Loofah.html5_fragment(messy_markup).scrub!(:whitewash)
   #     => "ohai! <div>div with attributes</div>"
   #
   #  One use case for this scrubber is to clean up HTML that was
@@ -55,30 +57,71 @@ module Loofah
   #  +:nofollow+ adds a rel="nofollow" attribute to all links
   #
   #     link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
-  #     Loofah.fragment(link_farmers_markup).scrub!(:nofollow)
+  #     Loofah.html5_fragment(link_farmers_markup).scrub!(:nofollow)
   #     => "ohai! <a href='http://www.myswarmysite.com/' rel="nofollow">I like your blog post</a>"
   #
   #
+  #  === Loofah::Scrubbers::TargetBlank / scrub!(:targetblank)
+  #
+  #  +:targetblank+ adds a target="_blank" attribute to all links
+  #
+  #     link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
+  #     Loofah.html5_fragment(link_farmers_markup).scrub!(:targetblank)
+  #     => "ohai! <a href='http://www.myswarmysite.com/' target="_blank">I like your blog post</a>"
+  #
+  #
+  #  === Loofah::Scrubbers::NoOpener / scrub!(:noopener)
+  #
+  #  +:noopener+ adds a rel="noopener" attribute to all links
+  #
+  #     link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
+  #     Loofah.html5_fragment(link_farmers_markup).scrub!(:noopener)
+  #     => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
+  #
+  #  === Loofah::Scrubbers::NoReferrer / scrub!(:noreferrer)
+  #
+  #  +:noreferrer+ adds a rel="noreferrer" attribute to all links
+  #
+  #     link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
+  #     Loofah.html5_fragment(link_farmers_markup).scrub!(:noreferrer)
+  #     => "ohai! <a href='http://www.myswarmysite.com/' rel="noreferrer">I like your blog post</a>"
+  #
+  #
+  #  === Loofah::Scrubbers::Unprintable / scrub!(:unprintable)
+  #
+  #  +:unprintable+ removes unprintable Unicode characters.
+  #
+  #     markup = "<p>Some text with an unprintable character at the end\u2028</p>"
+  #     Loofah.html5_fragment(markup).scrub!(:unprintable)
+  #     => "<p>Some text with an unprintable character at the end</p>"
+  #
+  #  You may not be able to see the unprintable character in the above example, but there is a
+  #  U+2028 character right before the closing </p> tag. These characters can cause issues if
+  #  the content is ever parsed by JavaScript - more information here:
+  #
+  #     http://timelessrepo.com/json-isnt-a-javascript-subset
+  #
   module Scrubbers
     #
     #  === scrub!(:strip)
     #
     #  +:strip+ removes unknown/unsafe tags, but leaves behind the pristine contents:
     #
     #     unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
-    #     Loofah.fragment(unsafe_html).scrub!(:strip)
+    #     Loofah.html5_fragment(unsafe_html).scrub!(:strip)
     #     => "ohai! <div>div is safe</div> but foo is <b>not</b>"
     #
     class Strip < Scrubber
-      def initialize
+      def initialize # rubocop:disable Lint/MissingSuper
         @direction = :bottom_up
       end
       def scrub(node)
         return CONTINUE if html5lib_sanitize(node) == CONTINUE
-        node.before node.inner_html
+        node.before(node.children)
         node.remove
+        STOP
       end
     end
@@ -88,18 +131,19 @@ module Loofah
     #  +:prune+ removes unknown/unsafe tags and their contents (including their subtrees):
     #
     #     unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
-    #     Loofah.fragment(unsafe_html).scrub!(:prune)
+    #     Loofah.html5_fragment(unsafe_html).scrub!(:prune)
     #     => "ohai! <div>div is safe</div> "
     #
     class Prune < Scrubber
-      def initialize
+      def initialize # rubocop:disable Lint/MissingSuper
         @direction = :top_down
       end
       def scrub(node)
         return CONTINUE if html5lib_sanitize(node) == CONTINUE
         node.remove
-        return STOP
+        STOP
       end
     end
@@ -109,20 +153,20 @@ module Loofah
     #  +:escape+ performs HTML entity escaping on the unknown/unsafe tags:
     #
     #     unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
-    #     Loofah.fragment(unsafe_html).scrub!(:escape)
+    #     Loofah.html5_fragment(unsafe_html).scrub!(:escape)
     #     => "ohai! <div>div is safe</div> &lt;foo&gt;but foo is &lt;b&gt;not&lt;/b&gt;&lt;/foo&gt;"
     #
     class Escape < Scrubber
-      def initialize
+      def initialize # rubocop:disable Lint/MissingSuper
         @direction = :top_down
       end
       def scrub(node)
         return CONTINUE if html5lib_sanitize(node) == CONTINUE
-        replacement_killer = Nokogiri::XML::Text.new(node.to_s, node.document)
-        node.add_next_sibling replacement_killer
+        node.add_next_sibling(Nokogiri::XML::Text.new(node.to_s, node.document))
         node.remove
-        return STOP
+        STOP
       end
     end
@@ -135,7 +179,7 @@ module Loofah
     #  layer of paint on top of the HTML input to make it look nice.
     #
     #     messy_markup = "ohai! <div id='foo' class='bar' style='margin: 10px'>div with attributes</div>"
-    #     Loofah.fragment(messy_markup).scrub!(:whitewash)
+    #     Loofah.html5_fragment(messy_markup).scrub!(:whitewash)
     #     => "ohai! <div>div with attributes</div>"
     #
     #  One use case for this scrubber is to clean up HTML that was
@@ -145,14 +189,14 @@ module Loofah
     #  Certainly not me.
     #
     class Whitewash < Scrubber
-      def initialize
+      def initialize # rubocop:disable Lint/MissingSuper
         @direction = :top_down
       end
       def scrub(node)
         case node.type
         when Nokogiri::XML::Node::ELEMENT_NODE
-          if HTML5::HashedWhiteList::ALLOWED_ELEMENTS_WITH_LIBXML2[node.name]
+          if HTML5::Scrub.allowed_element?(node.name)
             node.attributes.each { |attr| node.remove_attribute(attr.first) }
             return CONTINUE if node.namespaces.empty?
           end
@@ -170,30 +214,217 @@ module Loofah
     #  +:nofollow+ adds a rel="nofollow" attribute to all links
     #
     #     link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
-    #     Loofah.fragment(link_farmers_markup).scrub!(:nofollow)
+    #     Loofah.html5_fragment(link_farmers_markup).scrub!(:nofollow)
     #     => "ohai! <a href='http://www.myswarmysite.com/' rel="nofollow">I like your blog post</a>"
     #
     class NoFollow < Scrubber
-      def initialize
+      def initialize # rubocop:disable Lint/MissingSuper
+        @direction = :top_down
+      end
+      def scrub(node)
+        return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
+        append_attribute(node, "rel", "nofollow")
+        STOP
+      end
+    end
+    #
+    #  === scrub!(:targetblank)
+    #
+    #  +:targetblank+ adds a target="_blank" attribute to all links.
+    #  If there is a target already set, replaces it with target="_blank".
+    #
+    #     link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
+    #     Loofah.html5_fragment(link_farmers_markup).scrub!(:targetblank)
+    #     => "ohai! <a href='http://www.myswarmysite.com/' target="_blank">I like your blog post</a>"
+    #
+    #  On modern browsers, setting target="_blank" on anchor elements implicitly provides the same
+    #  behavior as setting rel="noopener".
+    #
+    class TargetBlank < Scrubber
+      def initialize # rubocop:disable Lint/MissingSuper
+        @direction = :top_down
+      end
+      def scrub(node)
+        return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
+        href = node["href"]
+        node.set_attribute("target", "_blank") if href && href[0] != "#"
+        STOP
+      end
+    end
+    #
+    #  === scrub!(:noopener)
+    #
+    #  +:noopener+ adds a rel="noopener" attribute to all links
+    #
+    #     link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
+    #     Loofah.html5_fragment(link_farmers_markup).scrub!(:noopener)
+    #     => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
+    #
+    class NoOpener < Scrubber
+      def initialize # rubocop:disable Lint/MissingSuper
+        @direction = :top_down
+      end
+      def scrub(node)
+        return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
+        append_attribute(node, "rel", "noopener")
+        STOP
+      end
+    end
+    #
+    #  === scrub!(:noreferrer)
+    #
+    #  +:noreferrer+ adds a rel="noreferrer" attribute to all links
+    #
+    #     link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
+    #     Loofah.html5_fragment(link_farmers_markup).scrub!(:noreferrer)
+    #     => "ohai! <a href='http://www.myswarmysite.com/' rel="noreferrer">I like your blog post</a>"
+    #
+    class NoReferrer < Scrubber
+      def initialize # rubocop:disable Lint/MissingSuper
+        @direction = :top_down
+      end
+      def scrub(node)
+        return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
+        append_attribute(node, "rel", "noreferrer")
+        STOP
+      end
+    end
+    # This class probably isn't useful publicly, but is used for #to_text's current implementation
+    class NewlineBlockElements < Scrubber # :nodoc:
+      def initialize # rubocop:disable Lint/MissingSuper
+        @direction = :bottom_up
+      end
+      def scrub(node)
+        return CONTINUE unless Loofah::Elements::LINEBREAKERS.include?(node.name)
+        replacement = if Loofah::Elements::INLINE_LINE_BREAK.include?(node.name)
+          "\n"
+        else
+          "\n#{node.content}\n"
+        end
+        node.add_next_sibling(Nokogiri::XML::Text.new(replacement, node.document))
+        node.remove
+      end
+    end
+    #
+    #  === scrub!(:unprintable)
+    #
+    #  +:unprintable+ removes unprintable Unicode characters.
+    #
+    #     markup = "<p>Some text with an unprintable character at the end\u2028</p>"
+    #     Loofah.html5_fragment(markup).scrub!(:unprintable)
+    #     => "<p>Some text with an unprintable character at the end</p>"
+    #
+    #  You may not be able to see the unprintable character in the above example, but there is a
+    #  U+2028 character right before the closing </p> tag. These characters can cause issues if
+    #  the content is ever parsed by JavaScript - more information here:
+    #
+    #     http://timelessrepo.com/json-isnt-a-javascript-subset
+    #
+    class Unprintable < Scrubber
+      def initialize # rubocop:disable Lint/MissingSuper
         @direction = :top_down
       end
       def scrub(node)
-        return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == 'a')
-        node.set_attribute('rel', 'nofollow')
-        return STOP
+        if node.type == Nokogiri::XML::Node::TEXT_NODE || node.type == Nokogiri::XML::Node::CDATA_SECTION_NODE
+          node.content = node.content.gsub(/\u2028|\u2029/, "")
+        end
+        CONTINUE
       end
     end
+    #
+    #  === scrub!(:double_breakpoint)
+    #
+    #  +:double_breakpoint+ replaces double-break tags with closing/opening paragraph tags.
+    #
+    #     markup = "<p>Some text here in a logical paragraph.<br><br>Some more text, apparently a second paragraph.</p>"
+    #     Loofah.html5_fragment(markup).scrub!(:double_breakpoint)
+    #     => "<p>Some text here in a logical paragraph.</p><p>Some more text, apparently a second paragraph.</p>"
+    #
+    class DoubleBreakpoint < Scrubber
+      def initialize # rubocop:disable Lint/MissingSuper
+        @direction = :top_down
+      end
+      def scrub(node)
+        return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "p")
+        paragraph_with_break_point_nodes = node.xpath("//p[br[following-sibling::br]]")
+        paragraph_with_break_point_nodes.each do |paragraph_node|
+          new_paragraph = paragraph_node.add_previous_sibling("<p>").first
+          paragraph_node.children.each do |child|
+            remove_blank_text_nodes(child)
+          end
+          paragraph_node.children.each do |child|
+            # already unlinked
+            next if child.parent.nil?
+            if child.name == "br" && child.next_sibling.name == "br"
+              new_paragraph = paragraph_node.add_previous_sibling("<p>").first
+              child.next_sibling.unlink
+              child.unlink
+            else
+              child.parent = new_paragraph
+            end
+          end
+          paragraph_node.unlink
+        end
+        CONTINUE
+      end
+      private
+      def remove_blank_text_nodes(node)
+        node.unlink if node.text? && node.blank?
+      end
+    end
     #
     #  A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune).
     #
     MAP = {
-      :escape => Escape,
-      :prune => Prune,
-      :whitewash => Whitewash,
-      :strip => Strip,
-      :nofollow => NoFollow
+      escape: Escape,
+      prune: Prune,
+      whitewash: Whitewash,
+      strip: Strip,
+      nofollow: NoFollow,
+      noopener: NoOpener,
+      noreferrer: NoReferrer,
+      targetblank: TargetBlank,
+      newline_block_elements: NewlineBlockElements,
+      unprintable: Unprintable,
+      double_breakpoint: DoubleBreakpoint,
     }
+    class << self
+      #
+      #  Returns an array of symbols representing the built-in scrubbers
+      #
+      def scrubber_symbols
+        MAP.keys
+      end
+    end
   end
 end

data/lib/loofah/version.rb ADDED Viewed

@@ -0,0 +1,6 @@
+# frozen_string_literal: true
+module Loofah
+  # The version of Loofah you are using
+  VERSION = "2.25.0"
+end

data/lib/loofah/xml/document.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 module Loofah
   module XML # :nodoc:
     #

data/lib/loofah/xml/document_fragment.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 module Loofah
   module XML # :nodoc:
     #
@@ -6,16 +8,11 @@ module Loofah
     #  See Loofah::ScrubBehavior for additional methods.
     #
     class DocumentFragment < Nokogiri::XML::DocumentFragment
-      include Loofah::ScrubBehavior::Node
       class << self
-        #
-        #  Overridden Nokogiri::XML::DocumentFragment
-        #  constructor. Applications should use Loofah.fragment to
-        #  parse a fragment.
-        #
-        def parse tags
-          self.new(Loofah::XML::Document.new, tags)
+        def parse(tags)
+          doc = Loofah::XML::Document.new
+          doc.encoding = tags.encoding.name if tags.respond_to?(:encoding)
+          new(doc, tags)
         end
       end
     end

data/lib/loofah.rb CHANGED Viewed

@@ -1,66 +1,155 @@
-$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__))) unless $LOAD_PATH.include?(File.expand_path(File.dirname(__FILE__)))
+# frozen_string_literal: true
-require 'nokogiri'
+require "nokogiri"
-require 'loofah/html5/whitelist'
-require 'loofah/html5/scrub'
+module Loofah
+  class << self
+    def html5_support?
+      # Note that Loofah can only support HTML5 in Nokogiri >= 1.14.0 because it requires the
+      # subclassing fix from https://github.com/sparklemotion/nokogiri/pull/2534
+      return @html5_support if defined? @html5_support
+      @html5_support =
+        Gem::Version.new(Nokogiri::VERSION) > Gem::Version.new("1.14.0") &&
+        Nokogiri.uses_gumbo?
+    end
+  end
+end
+require_relative "loofah/version"
+require_relative "loofah/metahelpers"
+require_relative "loofah/elements"
-require 'loofah/scrubber'
-require 'loofah/scrubbers'
+require_relative "loofah/html5/safelist"
+require_relative "loofah/html5/libxml2_workarounds"
+require_relative "loofah/html5/scrub"
-require 'loofah/instance_methods'
-require 'loofah/xml/document'
-require 'loofah/xml/document_fragment'
-require 'loofah/html/document'
-require 'loofah/html/document_fragment'
+require_relative "loofah/scrubber"
+require_relative "loofah/scrubbers"
-require 'loofah/helpers'
+require_relative "loofah/concerns"
+require_relative "loofah/xml/document"
+require_relative "loofah/xml/document_fragment"
+require_relative "loofah/html4/document"
+require_relative "loofah/html4/document_fragment"
+if Loofah.html5_support?
+  require_relative "loofah/html5/document"
+  require_relative "loofah/html5/document_fragment"
+end
 # == Strings and IO Objects as Input
 #
-# Loofah.document and Loofah.fragment accept any IO object in addition
-# to accepting a string. That IO object could be a file, or a socket,
-# or a StringIO, or anything that responds to +read+ and
-# +close+. Which makes it particularly easy to sanitize mass
-# quantities of docs.
+# The following methods accept any IO object in addition to accepting a string:
+#
+# - Loofah.html4_document
+# - Loofah.html4_fragment
+# - Loofah.scrub_html4_document
+# - Loofah.scrub_html4_fragment
+#
+# - Loofah.html5_document
+# - Loofah.html5_fragment
+# - Loofah.scrub_html5_document
+# - Loofah.scrub_html5_fragment
+#
+# - Loofah.xml_document
+# - Loofah.xml_fragment
+# - Loofah.scrub_xml_document
+# - Loofah.scrub_xml_fragment
+#
+# - Loofah.document
+# - Loofah.fragment
+# - Loofah.scrub_document
+# - Loofah.scrub_fragment
+#
+# That IO object could be a file, or a socket, or a StringIO, or anything that responds to +read+
+# and +close+.
 #
 module Loofah
-  # The version of Loofah you are using
-  VERSION = '0.4.2'
-  # The minimum required version of Nokogiri
-  REQUIRED_NOKOGIRI_VERSION = '1.3.3'
+  # Alias for Loofah::HTML4
+  HTML = HTML4
   class << self
-    # Shortcut for Loofah::HTML::Document.parse
-    # This method accepts the same parameters as Nokogiri::HTML::Document.parse
-    def document(*args, &block)
-      Loofah::HTML::Document.parse(*args, &block)
+    # Shortcut for Loofah::HTML4::Document.parse(*args, &block)
+    #
+    # This method accepts the same parameters as Nokogiri::HTML4::Document.parse
+    def html4_document(*args, &block)
+      Loofah::HTML4::Document.parse(*args, &block)
+    end
+    # Shortcut for Loofah::HTML4::DocumentFragment.parse(*args, &block)
+    #
+    # This method accepts the same parameters as Nokogiri::HTML4::DocumentFragment.parse
+    def html4_fragment(*args, &block)
+      Loofah::HTML4::DocumentFragment.parse(*args, &block)
     end
-    # Shortcut for Loofah::HTML::DocumentFragment.parse
-    # This method accepts the same parameters as Nokogiri::HTML::DocumentFragment.parse
-    def fragment(*args, &block)
-      Loofah::HTML::DocumentFragment.parse(*args, &block)
+    # Shortcut for Loofah::HTML4::Document.parse(string_or_io).scrub!(method)
+    def scrub_html4_document(string_or_io, method)
+      Loofah::HTML4::Document.parse(string_or_io).scrub!(method)
     end
-    # Shortcut for Loofah.fragment(string_or_io).scrub!(method)
-    def scrub_fragment(string_or_io, method)
-      Loofah.fragment(string_or_io).scrub!(method)
+    # Shortcut for Loofah::HTML4::DocumentFragment.parse(string_or_io).scrub!(method)
+    def scrub_html4_fragment(string_or_io, method)
+      Loofah::HTML4::DocumentFragment.parse(string_or_io).scrub!(method)
     end
-    # Shortcut for Loofah.document(string_or_io).scrub!(method)
-    def scrub_document(string_or_io, method)
-      Loofah.document(string_or_io).scrub!(method)
+    if Loofah.html5_support?
+      # Shortcut for Loofah::HTML5::Document.parse(*args, &block)
+      #
+      # This method accepts the same parameters as Nokogiri::HTML5::Document.parse
+      def html5_document(*args, &block)
+        Loofah::HTML5::Document.parse(*args, &block)
+      end
+      # Shortcut for Loofah::HTML5::DocumentFragment.parse(*args, &block)
+      #
+      # This method accepts the same parameters as Nokogiri::HTML5::DocumentFragment.parse
+      def html5_fragment(*args, &block)
+        Loofah::HTML5::DocumentFragment.parse(*args, &block)
+      end
+      # Shortcut for Loofah::HTML5::Document.parse(string_or_io).scrub!(method)
+      def scrub_html5_document(string_or_io, method)
+        Loofah::HTML5::Document.parse(string_or_io).scrub!(method)
+      end
+      # Shortcut for Loofah::HTML5::DocumentFragment.parse(string_or_io).scrub!(method)
+      def scrub_html5_fragment(string_or_io, method)
+        Loofah::HTML5::DocumentFragment.parse(string_or_io).scrub!(method)
+      end
+    else
+      def html5_document(*args, &block)
+        raise NotImplementedError, "Loofah::HTML5 is not supported by your version of Nokogiri"
+      end
+      def html5_fragment(*args, &block)
+        raise NotImplementedError, "Loofah::HTML5 is not supported by your version of Nokogiri"
+      end
+      def scrub_html5_document(string_or_io, method)
+        raise NotImplementedError, "Loofah::HTML5 is not supported by your version of Nokogiri"
+      end
+      def scrub_html5_fragment(string_or_io, method)
+        raise NotImplementedError, "Loofah::HTML5 is not supported by your version of Nokogiri"
+      end
     end
-    # Shortcut for Loofah::XML::Document.parse
+    alias_method :document, :html4_document
+    alias_method :fragment, :html4_fragment
+    alias_method :scrub_document, :scrub_html4_document
+    alias_method :scrub_fragment, :scrub_html4_fragment
+    # Shortcut for Loofah::XML::Document.parse(*args, &block)
+    #
     # This method accepts the same parameters as Nokogiri::XML::Document.parse
     def xml_document(*args, &block)
       Loofah::XML::Document.parse(*args, &block)
     end
-    # Shortcut for Loofah::XML::DocumentFragment.parse
+    # Shortcut for Loofah::XML::DocumentFragment.parse(*args, &block)
+    #
     # This method accepts the same parameters as Nokogiri::XML::DocumentFragment.parse
     def xml_fragment(*args, &block)
       Loofah::XML::DocumentFragment.parse(*args, &block)
@@ -76,19 +165,9 @@ module Loofah
       Loofah.xml_document(string_or_io).scrub!(method)
     end
+    # A helper to remove extraneous whitespace from text-ified HTML
+    def remove_extraneous_whitespace(string)
+      string.gsub(/\n\s*\n\s*\n/, "\n\n")
+    end
   end
 end
-if Nokogiri::VERSION < Loofah::REQUIRED_NOKOGIRI_VERSION
-  raise RuntimeError, "Loofah requires Nokogiri #{Loofah::REQUIRED_NOKOGIRI_VERSION} or later (currently #{Nokogiri::VERSION})"
-end
-if defined? Rails.configuration and Rails.configuration.frameworks.include?([:active_record]) # rails 2.1 and later
-  Rails.configuration.after_initialize do
-    require 'loofah/active_record'
-    require 'loofah/xss_foliate'
-  end
-elsif defined? ActiveRecord::Base # rails 2.0
-  require 'loofah/active_record'
-  require 'loofah/xss_foliate'
-end