loofah 2.3.1

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of loofah might be problematic. Click here for more details.

Files changed (43) hide show
  1. checksums.yaml +7 -0
  2. data/.gemtest +0 -0
  3. data/CHANGELOG.md +336 -0
  4. data/Gemfile +22 -0
  5. data/MIT-LICENSE.txt +23 -0
  6. data/Manifest.txt +41 -0
  7. data/README.md +363 -0
  8. data/Rakefile +81 -0
  9. data/SECURITY.md +18 -0
  10. data/benchmark/benchmark.rb +149 -0
  11. data/benchmark/fragment.html +96 -0
  12. data/benchmark/helper.rb +73 -0
  13. data/benchmark/www.slashdot.com.html +2560 -0
  14. data/lib/loofah.rb +83 -0
  15. data/lib/loofah/elements.rb +92 -0
  16. data/lib/loofah/helpers.rb +103 -0
  17. data/lib/loofah/html/document.rb +18 -0
  18. data/lib/loofah/html/document_fragment.rb +40 -0
  19. data/lib/loofah/html5/libxml2_workarounds.rb +26 -0
  20. data/lib/loofah/html5/safelist.rb +796 -0
  21. data/lib/loofah/html5/scrub.rb +133 -0
  22. data/lib/loofah/instance_methods.rb +127 -0
  23. data/lib/loofah/metahelpers.rb +13 -0
  24. data/lib/loofah/scrubber.rb +133 -0
  25. data/lib/loofah/scrubbers.rb +297 -0
  26. data/lib/loofah/xml/document.rb +13 -0
  27. data/lib/loofah/xml/document_fragment.rb +23 -0
  28. data/test/assets/msword.html +63 -0
  29. data/test/assets/testdata_sanitizer_tests1.dat +502 -0
  30. data/test/helper.rb +18 -0
  31. data/test/html5/test_sanitizer.rb +401 -0
  32. data/test/html5/test_scrub.rb +10 -0
  33. data/test/integration/test_ad_hoc.rb +220 -0
  34. data/test/integration/test_helpers.rb +43 -0
  35. data/test/integration/test_html.rb +72 -0
  36. data/test/integration/test_scrubbers.rb +400 -0
  37. data/test/integration/test_xml.rb +55 -0
  38. data/test/unit/test_api.rb +142 -0
  39. data/test/unit/test_encoding.rb +20 -0
  40. data/test/unit/test_helpers.rb +62 -0
  41. data/test/unit/test_scrubber.rb +229 -0
  42. data/test/unit/test_scrubbers.rb +14 -0
  43. metadata +287 -0
require 'cgi'
require 'crass'

module Loofah
  module HTML5 # :nodoc:
    #
    # Safelist-based scrubbing helpers. Checks elements, attributes and
    # CSS against the sets defined in Loofah::HTML5::SafeList.
    #
    module Scrub

      # Matches backtick, C0 control characters and space, DEL, and
      # \u0080-\u0101. These are stripped from URI attribute values
      # before the protocol check so they cannot be used to disguise a
      # disallowed scheme.
      CONTROL_CHARACTERS = /[`\u0000-\u0020\u007f\u0080-\u0101]/
      # Matches values that look like harmless CSS terms: hex colors,
      # rgb() colors, and short numbers with an optional unit suffix.
      CSS_KEYWORDISH = /\A(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|-?\d{0,3}\.?\d{0,10}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
      # A pre-built Crass AST node for a ";" declaration terminator,
      # appended after each property kept by #scrub_css.
      CRASS_SEMICOLON = {:node => :semicolon, :raw => ";"}

      class << self

        # Returns true if +element_name+ is on the safelist of elements
        # allowed to remain in the document.
        def allowed_element? element_name
          ::Loofah::HTML5::SafeList::ALLOWED_ELEMENTS_WITH_LIBXML2.include? element_name
        end

        # alternative implementation of the html5lib attribute scrubbing algorithm
        #
        # Mutates +node+ in place: removes attributes that are not on
        # the safelist, sanitizes URI-valued and SVG-reference-valued
        # attributes, scrubs the style attribute, drops whitespace-only
        # attribute values, and applies the libxml2 escaping workaround.
        def scrub_attributes node
          node.attribute_nodes.each do |attr_node|
            # reconstruct the qualified name ("prefix:name") for
            # namespaced attributes such as xlink:href
            attr_name = if attr_node.namespace
                          "#{attr_node.namespace.prefix}:#{attr_node.node_name}"
                        else
                          attr_node.node_name
                        end

            # data-* attributes are always allowed through
            if attr_name =~ /\Adata-[\w-]+\z/
              next
            end

            unless SafeList::ALLOWED_ATTRIBUTES.include?(attr_name)
              attr_node.remove
              next
            end

            if SafeList::ATTR_VAL_IS_URI.include?(attr_name)
              # this block lifted nearly verbatim from HTML5 sanitization
              val_unescaped = CGI.unescapeHTML(attr_node.value).gsub(CONTROL_CHARACTERS,'').downcase
              if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && ! SafeList::ALLOWED_PROTOCOLS.include?(val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0])
                # the value has a scheme, and the scheme is not allowed
                attr_node.remove
                next
              elsif val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[0] == 'data'
                # permit only allowed data mediatypes
                mediatype = val_unescaped.split(SafeList::PROTOCOL_SEPARATOR)[1]
                mediatype, _ = mediatype.split(';')[0..1] if mediatype
                if mediatype && !SafeList::ALLOWED_URI_DATA_MEDIATYPES.include?(mediatype)
                  attr_node.remove
                  next
                end
              end
            end
            if SafeList::SVG_ATTR_VAL_ALLOWS_REF.include?(attr_name)
              # blank out url(...) references except local fragment
              # references (those starting with "#")
              attr_node.value = attr_node.value.gsub(/url\s*\(\s*[^#\s][^)]+?\)/m, ' ') if attr_node.value
            end
            if SafeList::SVG_ALLOW_LOCAL_HREF.include?(node.name) && attr_name == 'xlink:href' && attr_node.value =~ /^\s*[^#\s].*/m
              # only local (fragment) xlink:href values are allowed
              attr_node.remove
              next
            end
          end

          scrub_css_attribute node

          # drop attributes whose (possibly just-scrubbed) value is
          # empty or whitespace-only
          node.attribute_nodes.each do |attr_node|
            node.remove_attribute(attr_node.name) if attr_node.value !~ /[^[:space:]]/
          end

          force_correct_attribute_escaping! node
        end

        # Replaces the style attribute's value, if present, with a
        # sanitized version (see #scrub_css).
        def scrub_css_attribute node
          style = node.attributes['style']
          style.value = scrub_css(style.value) if style
        end

        # Parses +style+ as CSS declarations with Crass and returns a
        # string containing only safelisted properties whose values are
        # free of url()/disallowed-function tokens.
        def scrub_css style
          style_tree = Crass.parse_properties style
          sanitized_tree = []

          style_tree.each do |node|
            next unless node[:node] == :property
            # reject any declaration containing a url()/bad-url token or
            # a function call that is not explicitly allowed
            next if node[:children].any? do |child|
              [:url, :bad_url].include?(child[:node]) || (child[:node] == :function && !SafeList::ALLOWED_CSS_FUNCTIONS.include?(child[:name].downcase))
            end
            name = node[:name].downcase
            if SafeList::ALLOWED_CSS_PROPERTIES.include?(name) || SafeList::ALLOWED_SVG_PROPERTIES.include?(name)
              sanitized_tree << node << CRASS_SEMICOLON
            elsif SafeList::SHORTHAND_CSS_PROPERTIES.include?(name.split('-').first)
              # shorthand property: keep only keywords that are allowed
              # or that look like innocuous values (colors/lengths)
              value = node[:value].split.map do |keyword|
                if SafeList::ALLOWED_CSS_KEYWORDS.include?(keyword) || keyword =~ CSS_KEYWORDISH
                  keyword
                end
              end.compact
              unless value.empty?
                # re-parse the filtered declaration to get a clean node
                propstring = sprintf "%s:%s", name, value.join(" ")
                sanitized_node = Crass.parse_properties(propstring).first
                sanitized_tree << sanitized_node << CRASS_SEMICOLON
              end
            end
          end

          Crass::Parser.stringify sanitized_tree
        end

        #
        # libxml2 >= 2.9.2 fails to escape comments within some attributes.
        #
        # see comments about CVE-2018-8048 within the tests for more information
        #
        def force_correct_attribute_escaping! node
          return unless Nokogiri::VersionInfo.instance.libxml2?

          node.attribute_nodes.each do |attr_node|
            next unless LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES.include?(attr_node.name)

            # some affected attributes only need the workaround on a
            # specific tag; nil means "applies to any tag"
            tag_name = LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES_QUALIFYING_TAG[attr_node.name]
            next unless tag_name.nil? || tag_name == node.name

            #
            # this block is just like CGI.escape in Ruby 2.4, but
            # only encodes space and double-quote, to mimic
            # pre-2.9.2 behavior
            #
            encoding = attr_node.value.encoding
            attr_node.value = attr_node.value.gsub(/[ "]/) do |m|
              '%' + m.unpack('H2' * m.bytesize).join('%').upcase
            end.force_encoding(encoding)
          end
        end

      end
    end
  end
end
module Loofah
  #
  # Mixes +scrub!+ into Document, DocumentFragment, Node and NodeSet.
  #
  # The +scrubber+ argument must be either a symbol naming one of the
  # built-in scrubbers (see Scrubbers) or a Scrubber instance:
  #
  #   span2div = Loofah::Scrubber.new do |node|
  #     node.name = "div" if node.name == "span"
  #   end
  #   Loofah.fragment("<span>foo</span><p>bar</p>").scrub!(span2div).to_s
  #   # => "<div>foo</div><p>bar</p>"
  #
  # or
  #
  #   unsafe_html = "ohai! <div>div is safe</div> <script>but script is not</script>"
  #   Loofah.fragment(unsafe_html).scrub!(:strip).to_s
  #   # => "ohai! <div>div is safe</div> "
  #
  # Note that this method is called implicitly by Loofah.scrub_fragment
  # and Loofah.scrub_document. See Scrubber for details on traversal.
  #
  module ScrubBehavior
    module Node # :nodoc:
      # Scrub this node (or document, or fragment) in place and return self.
      #
      # A single method handles all three receiver types because
      # nokogiri decorates (or not) based on whether the module has
      # already been included, and documents get decorated just like
      # their constituent nodes — so all the logic lives in one module.
      def scrub!(scrubber)
        scrubber = ScrubBehavior.resolve_scrubber(scrubber)
        if is_a?(Nokogiri::XML::Document)
          # a document's traversal starts at its root element, if any
          scrubber.traverse(root) if root
        elsif is_a?(Nokogiri::XML::DocumentFragment)
          # a fragment has no single root; scrub each top-level child
          children.scrub!(scrubber)
        else
          scrubber.traverse(self)
        end
        self
      end
    end

    module NodeSet # :nodoc:
      # Scrub every node in the set; returns the set itself.
      def scrub!(scrubber)
        each { |member| member.scrub!(scrubber) }
        self
      end
    end

    # Map a symbol to an instance of its built-in scrubber class, or
    # pass a Scrubber instance through unchanged; anything else raises
    # Loofah::ScrubberNotFound.
    def ScrubBehavior.resolve_scrubber(scrubber) # :nodoc:
      builtin = Scrubbers::MAP[scrubber]
      scrubber = builtin.new if builtin
      return scrubber if scrubber.is_a?(Loofah::Scrubber)
      raise Loofah::ScrubberNotFound, "not a Scrubber or a scrubber name: #{scrubber.inspect}"
    end
  end

  #
  # Overrides +text+ in HTML::Document and HTML::DocumentFragment,
  # and mixes in +to_text+.
  #
  module TextBehavior
    #
    # Returns a plain-text version of the markup contained by the
    # document, with HTML entities encoded.
    #
    # Significantly faster than #to_text, but not clever about
    # whitespace around block elements:
    #
    #   Loofah.document("<h1>Title</h1><div>Content</div>").text
    #   # => "TitleContent"
    #
    # HTML entities are escaped by default. Pass
    # +:encode_special_chars => false+ to get unescaped entities — with
    # the understanding that the result is unsafe to render in a browser:
    #
    #   frag = Loofah.fragment("&lt;script&gt;alert('EVIL');&lt;/script&gt;")
    #   # ok for browser:
    #   frag.text # => "&lt;script&gt;alert('EVIL');&lt;/script&gt;"
    #   # decidedly not ok for browser:
    #   frag.text(:encode_special_chars => false) # => "<script>alert('EVIL');</script>"
    #
    def text(options={})
      plain = begin
        serialize_root.children.inner_text
      rescue
        # no serializable root — treat as empty text
        ""
      end
      return plain if options[:encode_special_chars] == false # possibly dangerous if rendered in a browser
      encode_special_chars plain
    end
    alias :inner_text :text
    alias :to_str :text

    #
    # Returns a plain-text version of the markup contained by the
    # fragment, with HTML entities encoded.
    #
    # Slower than #text, but clever about whitespace around block
    # elements:
    #
    #   Loofah.document("<h1>Title</h1><div>Content</div>").to_text
    #   # => "\nTitle\n\nContent\n"
    #
    def to_text(options={})
      newline_scrubbed = dup.scrub!(:newline_block_elements)
      Loofah.remove_extraneous_whitespace newline_scrubbed.text(options)
    end
  end

  module DocumentDecorator # :nodoc:
    # Register the scrub decorators so every node and node set created
    # by this document responds to +scrub!+.
    def initialize(*args, &block)
      super
      decorators(Nokogiri::XML::Node) << ScrubBehavior::Node
      decorators(Nokogiri::XML::NodeSet) << ScrubBehavior::NodeSet
    end
  end
end
module Loofah
  # Internal helpers used while building safelist constants.
  module MetaHelpers # :nodoc:
    # For every Set constant defined on +mod+, add a lowercased copy of
    # each member to that same set (members must respond to +downcase+).
    # Mutates the sets in place; non-Set constants are left untouched.
    def self.add_downcased_set_members_to_all_set_constants mod
      mod.constants.each do |const_name|
        value = mod.const_get(const_name)
        next unless value.is_a?(Set)
        # union the set with its own downcased members
        value.merge(value.map(&:downcase))
      end
    end
  end
end
module Loofah
  #
  # A RuntimeError raised when Loofah could not find an appropriate scrubber.
  #
  class ScrubberNotFound < RuntimeError ; end

  #
  # A Scrubber wraps up a callback that is run against each HTML node
  # (element) in a document. The callback may be supplied as a block:
  #
  #   # change all <span> tags to <div> tags
  #   span2div = Loofah::Scrubber.new do |node|
  #     node.name = "div" if node.name == "span"
  #   end
  #
  # or by subclassing and implementing +scrub+:
  #
  #   class Span2Div < Loofah::Scrubber
  #     def scrub(node)
  #       node.name = "div" if node.name == "span"
  #     end
  #   end
  #   span2div = Span2Div.new
  #
  # Either form can then be run on a document:
  #
  #   Loofah.fragment("<span>foo</span><p>bar</p>").scrub!(span2div).to_s
  #   # => "<div>foo</div><p>bar</p>"
  #
  # Traversal is top-down by default, or bottom-up on request. A
  # top-down scrubber may return Scrubber::STOP to terminate traversal
  # of the current node's subtree.
  #
  class Scrubber

    # Top-down Scrubbers may return CONTINUE to indicate that the subtree should be traversed.
    CONTINUE = Object.new.freeze

    # Top-down Scrubbers may return STOP to indicate that the subtree should not be traversed.
    STOP = Object.new.freeze

    # The traversal direction chosen at initialization time,
    # :top_down (the default) or :bottom_up.
    attr_reader :direction

    # The optional block saved at initialization time. When no block is
    # given, the +scrub+ method is assumed to have been implemented.
    attr_reader :block

    #
    # Options may include
    #   :direction => :top_down (the default)
    # or
    #   :direction => :bottom_up
    #
    # For top_down traversals, if the callback returns
    # Loofah::Scrubber::STOP, traversal of the current node's subtree
    # is terminated.
    #
    # Alternatively, subclass Loofah::Scrubber and implement +scrub+,
    # which is slightly faster than using a block.
    #
    def initialize(options = {}, &block)
      chosen = options[:direction] || :top_down
      unless [:top_down, :bottom_up].include?(chosen)
        raise ArgumentError, "direction #{chosen} must be one of :top_down or :bottom_up"
      end
      @direction = chosen
      @block = block
    end

    #
    # Traverse +node+ and its subtree, invoking either the block passed
    # to the initializer or the +scrub+ method on each node, in the
    # direction specified at +new+ time.
    #
    def traverse(node)
      if direction == :bottom_up
        traverse_conditionally_bottom_up(node)
      else
        traverse_conditionally_top_down(node)
      end
    end

    #
    # Fallback callback, invoked per node when +new+ was not passed a
    # block. Subclasses are expected to override this.
    #
    def scrub(node)
      raise ScrubberNotFound, "No scrub method has been defined on #{self.class}"
    end

    #
    # Add +value+ to the space-separated token list stored in
    # +attribute+ on +node+. Existing tokens are preserved, and +value+
    # is not duplicated if already present.
    #
    def append_attribute(node, attribute, value)
      existing = node.get_attribute(attribute) || ''
      tokens = existing.split(/\s+/) | [value] # array union keeps each token once
      node.set_attribute(attribute, tokens.join(' '))
    end

    private

    # Shared safelist check: text and CDATA always pass; safelisted
    # elements get their attributes scrubbed and pass; anything else
    # signals STOP so the caller can remove/escape it.
    def html5lib_sanitize(node)
      case node.type
      when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
        return Scrubber::CONTINUE
      when Nokogiri::XML::Node::ELEMENT_NODE
        if HTML5::Scrub.allowed_element? node.name
          HTML5::Scrub.scrub_attributes node
          return Scrubber::CONTINUE
        end
      end
      Scrubber::STOP
    end

    # Visit the node first; recurse into its children unless the
    # callback returned STOP.
    def traverse_conditionally_top_down(node)
      outcome = block ? block.call(node) : scrub(node)
      return if outcome == STOP
      node.children.each { |child| traverse_conditionally_top_down(child) }
    end

    # Recurse into the children first, then visit the node; the
    # callback's return value is ignored (STOP has no meaning here).
    def traverse_conditionally_bottom_up(node)
      node.children.each { |child| traverse_conditionally_bottom_up(child) }
      block ? block.call(node) : scrub(node)
    end
  end
end
module Loofah
  #
  # Loofah provides some built-in scrubbers for sanitizing with
  # HTML5lib's safelist and for accomplishing some common
  # transformation tasks.
  #
  #
  # === Loofah::Scrubbers::Strip / scrub!(:strip)
  #
  # +:strip+ removes unknown/unsafe tags, but leaves behind the pristine contents:
  #
  #    unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
  #    Loofah.fragment(unsafe_html).scrub!(:strip)
  #    => "ohai! <div>div is safe</div> but foo is <b>not</b>"
  #
  #
  # === Loofah::Scrubbers::Prune / scrub!(:prune)
  #
  # +:prune+ removes unknown/unsafe tags and their contents (including their subtrees):
  #
  #    unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
  #    Loofah.fragment(unsafe_html).scrub!(:prune)
  #    => "ohai! <div>div is safe</div> "
  #
  #
  # === Loofah::Scrubbers::Escape / scrub!(:escape)
  #
  # +:escape+ performs HTML entity escaping on the unknown/unsafe tags:
  #
  #    unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
  #    Loofah.fragment(unsafe_html).scrub!(:escape)
  #    => "ohai! <div>div is safe</div> &lt;foo&gt;but foo is &lt;b&gt;not&lt;/b&gt;&lt;/foo&gt;"
  #
  #
  # === Loofah::Scrubbers::Whitewash / scrub!(:whitewash)
  #
  # +:whitewash+ removes all comments, styling and attributes in
  # addition to doing markup-fixer-uppery and pruning unsafe tags. I
  # like to call this "whitewashing", since it's like putting a new
  # layer of paint on top of the HTML input to make it look nice.
  #
  #    messy_markup = "ohai! <div id='foo' class='bar' style='margin: 10px'>div with attributes</div>"
  #    Loofah.fragment(messy_markup).scrub!(:whitewash)
  #    => "ohai! <div>div with attributes</div>"
  #
  # One use case for this scrubber is to clean up HTML that was
  # cut-and-pasted from Microsoft Word into a WYSIWYG editor or a
  # rich text editor. Microsoft's software is famous for injecting
  # all kinds of cruft into its HTML output. Who needs that crap?
  # Certainly not me.
  #
  #
  # === Loofah::Scrubbers::NoFollow / scrub!(:nofollow)
  #
  # +:nofollow+ adds a rel="nofollow" attribute to all links
  #
  #    link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
  #    Loofah.fragment(link_farmers_markup).scrub!(:nofollow)
  #    => "ohai! <a href='http://www.myswarmysite.com/' rel="nofollow">I like your blog post</a>"
  #
  #
  # === Loofah::Scrubbers::NoOpener / scrub!(:noopener)
  #
  # +:noopener+ adds a rel="noopener" attribute to all links
  #
  #    link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
  #    Loofah.fragment(link_farmers_markup).scrub!(:noopener)
  #    => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
  #
  #
  # === Loofah::Scrubbers::Unprintable / scrub!(:unprintable)
  #
  # +:unprintable+ removes unprintable Unicode characters.
  #
  #    markup = "<p>Some text with an unprintable character at the end\u2028</p>"
  #    Loofah.fragment(markup).scrub!(:unprintable)
  #    => "<p>Some text with an unprintable character at the end</p>"
  #
  # You may not be able to see the unprintable character in the above example, but there is a
  # U+2028 character right before the closing </p> tag. These characters can cause issues if
  # the content is ever parsed by JavaScript - more information here:
  #
  #    http://timelessrepo.com/json-isnt-a-javascript-subset
  #
  module Scrubbers
    #
    # === scrub!(:strip)
    #
    # +:strip+ removes unknown/unsafe tags, but leaves behind the pristine contents:
    #
    #    unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
    #    Loofah.fragment(unsafe_html).scrub!(:strip)
    #    => "ohai! <div>div is safe</div> but foo is <b>not</b>"
    #
    class Strip < Scrubber
      def initialize
        # note: sets @direction directly rather than calling super, so
        # no block is stored and no direction validation occurs
        @direction = :bottom_up
      end

      def scrub(node)
        return CONTINUE if html5lib_sanitize(node) == CONTINUE
        # unsafe node: hoist its contents into its place, then drop it.
        # A lone CDATA child is re-parsed and re-scrubbed as markup so
        # its raw text cannot reintroduce unsanitized tags.
        if node.children.length == 1 && node.children.first.cdata?
          sanitized_text = Loofah.fragment(node.children.first.to_html).scrub!(:strip).to_html
          node.before Nokogiri::XML::Text.new(sanitized_text, node.document)
        else
          node.before node.children
        end
        node.remove
      end
    end

    #
    # === scrub!(:prune)
    #
    # +:prune+ removes unknown/unsafe tags and their contents (including their subtrees):
    #
    #    unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
    #    Loofah.fragment(unsafe_html).scrub!(:prune)
    #    => "ohai! <div>div is safe</div> "
    #
    class Prune < Scrubber
      def initialize
        # note: sets @direction directly rather than calling super
        @direction = :top_down
      end

      def scrub(node)
        return CONTINUE if html5lib_sanitize(node) == CONTINUE
        # unsafe node: remove it along with its entire subtree
        node.remove
        return STOP
      end
    end

    #
    # === scrub!(:escape)
    #
    # +:escape+ performs HTML entity escaping on the unknown/unsafe tags:
    #
    #    unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
    #    Loofah.fragment(unsafe_html).scrub!(:escape)
    #    => "ohai! <div>div is safe</div> &lt;foo&gt;but foo is &lt;b&gt;not&lt;/b&gt;&lt;/foo&gt;"
    #
    class Escape < Scrubber
      def initialize
        # note: sets @direction directly rather than calling super
        @direction = :top_down
      end

      def scrub(node)
        return CONTINUE if html5lib_sanitize(node) == CONTINUE
        # unsafe node: replace it with a text node of its serialized
        # markup (serialization of a text node entity-escapes it)
        node.add_next_sibling Nokogiri::XML::Text.new(node.to_s, node.document)
        node.remove
        return STOP
      end
    end

    #
    # === scrub!(:whitewash)
    #
    # +:whitewash+ removes all comments, styling and attributes in
    # addition to doing markup-fixer-uppery and pruning unsafe tags. I
    # like to call this "whitewashing", since it's like putting a new
    # layer of paint on top of the HTML input to make it look nice.
    #
    #    messy_markup = "ohai! <div id='foo' class='bar' style='margin: 10px'>div with attributes</div>"
    #    Loofah.fragment(messy_markup).scrub!(:whitewash)
    #    => "ohai! <div>div with attributes</div>"
    #
    # One use case for this scrubber is to clean up HTML that was
    # cut-and-pasted from Microsoft Word into a WYSIWYG editor or a
    # rich text editor. Microsoft's software is famous for injecting
    # all kinds of cruft into its HTML output. Who needs that crap?
    # Certainly not me.
    #
    class Whitewash < Scrubber
      def initialize
        # note: sets @direction directly rather than calling super
        @direction = :top_down
      end

      def scrub(node)
        case node.type
        when Nokogiri::XML::Node::ELEMENT_NODE
          if HTML5::Scrub.allowed_element? node.name
            # strip every attribute; keep the element only when it
            # carries no namespace declarations
            node.attributes.each { |attr| node.remove_attribute(attr.first) }
            return CONTINUE if node.namespaces.empty?
          end
        when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
          return CONTINUE
        end
        # everything else (comments, unsafe/namespaced elements, etc.)
        # is removed along with its subtree
        node.remove
        STOP
      end
    end

    #
    # === scrub!(:nofollow)
    #
    # +:nofollow+ adds a rel="nofollow" attribute to all links
    #
    #    link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
    #    Loofah.fragment(link_farmers_markup).scrub!(:nofollow)
    #    => "ohai! <a href='http://www.myswarmysite.com/' rel="nofollow">I like your blog post</a>"
    #
    class NoFollow < Scrubber
      def initialize
        # note: sets @direction directly rather than calling super
        @direction = :top_down
      end

      def scrub(node)
        return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == 'a')
        append_attribute(node, 'rel', 'nofollow')
        # returning STOP means the anchor's subtree is not descended into
        return STOP
      end
    end

    #
    # === scrub!(:noopener)
    #
    # +:noopener+ adds a rel="noopener" attribute to all links
    #
    #    link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
    #    Loofah.fragment(link_farmers_markup).scrub!(:noopener)
    #    => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
    #
    class NoOpener < Scrubber
      def initialize
        # note: sets @direction directly rather than calling super
        @direction = :top_down
      end

      def scrub(node)
        return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == 'a')
        append_attribute(node, 'rel', 'noopener')
        # returning STOP means the anchor's subtree is not descended into
        return STOP
      end
    end

    # This class probably isn't useful publicly, but is used for #to_text's current implemention
    class NewlineBlockElements < Scrubber # :nodoc:
      def initialize
        # note: sets @direction directly rather than calling super
        @direction = :bottom_up
      end

      def scrub(node)
        return CONTINUE unless Loofah::Elements::BLOCK_LEVEL.include?(node.name)
        # replace each block-level element with its text content
        # wrapped in newlines, so #text preserves block boundaries
        node.add_next_sibling Nokogiri::XML::Text.new("\n#{node.content}\n", node.document)
        node.remove
      end
    end

    #
    # === scrub!(:unprintable)
    #
    # +:unprintable+ removes unprintable Unicode characters.
    #
    #    markup = "<p>Some text with an unprintable character at the end\u2028</p>"
    #    Loofah.fragment(markup).scrub!(:unprintable)
    #    => "<p>Some text with an unprintable character at the end</p>"
    #
    # You may not be able to see the unprintable character in the above example, but there is a
    # U+2028 character right before the closing </p> tag. These characters can cause issues if
    # the content is ever parsed by JavaScript - more information here:
    #
    #    http://timelessrepo.com/json-isnt-a-javascript-subset
    #
    class Unprintable < Scrubber
      def initialize
        # note: sets @direction directly rather than calling super
        @direction = :top_down
      end

      def scrub(node)
        # strip U+2028 (line separator) and U+2029 (paragraph
        # separator) from text and CDATA content
        if node.type == Nokogiri::XML::Node::TEXT_NODE || node.type == Nokogiri::XML::Node::CDATA_SECTION_NODE
          node.content = node.content.gsub(/\u2028|\u2029/, '')
        end
        CONTINUE
      end
    end

    #
    # A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune).
    #
    MAP = {
      :escape => Escape,
      :prune => Prune,
      :whitewash => Whitewash,
      :strip => Strip,
      :nofollow => NoFollow,
      :noopener => NoOpener,
      :newline_block_elements => NewlineBlockElements,
      :unprintable => Unprintable
    }

    #
    # Returns an array of symbols representing the built-in scrubbers
    #
    def self.scrubber_symbols
      MAP.keys
    end
  end
end