RubyGems - loofah - Versions diffs - 0.3.1 → 0.4.0 - Mend

loofah 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of loofah might be problematic. Click here for more details.

Files changed (25) hide show

data.tar.gz.sig +0 -0
data/CHANGELOG.rdoc +9 -0
data/Manifest.txt +3 -1
data/README.rdoc +223 -92
data/Rakefile +11 -3
data/TODO.rdoc +0 -5
data/lib/loofah.rb +27 -138
data/lib/loofah/active_record.rb +10 -18
data/lib/loofah/html/document.rb +4 -4
data/lib/loofah/html/document_fragment.rb +5 -5
data/lib/loofah/html5/scrub.rb +1 -1
data/lib/loofah/html5/whitelist.rb +1 -1
data/lib/loofah/instance_methods.rb +47 -0
data/lib/loofah/scrubber.rb +98 -76
data/lib/loofah/scrubbers.rb +199 -0
data/lib/loofah/xss_foliate.rb +71 -69
data/test/html5/test_sanitizer.rb +12 -9
data/test/test_active_record.rb +22 -0
data/test/test_ad_hoc.rb +42 -0
data/test/test_api.rb +47 -1
data/test/test_scrubber.rb +204 -102
data/test/test_scrubbers.rb +144 -0
metadata +44 -12
metadata.gz.sig +0 -0
data/test/html5/testdata/tests1.dat +0 -501

data/TODO.rdoc CHANGED

@@ -1,9 +1,4 @@
 = TODO
-* Allow developers to implement their own sanitizations.
-  * Implement a proper visitor pattern.
-  * Make internal loofah methods available.
 * Allow a <tt>text</tt> option to insert nice newlines after headers and block elements.
 * <tt>to_markdown</tt>

data/lib/loofah.rb CHANGED

@@ -6,94 +6,16 @@ require 'loofah/html5/whitelist'
 require 'loofah/html5/scrub'
 require 'loofah/scrubber'
+require 'loofah/scrubbers'
+require 'loofah/instance_methods'
+require 'loofah/xml/document'
+require 'loofah/xml/document_fragment'
 require 'loofah/html/document'
 require 'loofah/html/document_fragment'
 require 'loofah/helpers'
-#
-# Loofah is an HTML sanitizer wrapped around Nokogiri[http://nokogiri.org], an excellent
-# HTML/XML parser. If you don't know how Nokogiri[http://nokogiri.org]
-# works, you might want to pause for a moment and go check it out. I'll
-# wait.
-#
-# A Loofah::HTML::Document is a subclass of Nokogiri::HTML::Document,
-# so a parsed document gives you all the markup fixer-uppery and API
-# goodness of Nokogiri.
-#
-#   Loofah.document(unsafe_html).is_a?(Nokogiri::HTML::Document)         # => true
-#   Loofah.fragment(unsafe_html).is_a?(Nokogiri::HTML::DocumentFragment) # => true
-#
-# Loofah adds a +scrub!+ method, which can clean up your HTML in a few
-# different ways by modifying the document in-place:
-#
-#   doc.scrub!(:strip)       # replaces unknown/unsafe tags with their inner text
-#   doc.scrub!(:prune)       # removes  unknown/unsafe tags and their children
-#   doc.scrub!(:whitewash)   # removes  unknown/unsafe/namespaced tags and their children,
-#                            #          and strips all node attributes
-#   doc.scrub!(:escape)      # escapes  unknown/unsafe tags, like this: &lt;script&gt;
-#
-# Loofah overrides +to_s+ to return html:
-#
-#   unsafe_html = "ohai! <div>div is safe</div> <script>but script is not</script>"
-#
-#   doc = Loofah.fragment(unsafe_html).scrub!(:strip)
-#   doc.to_s    # => "ohai! <div>div is safe</div> "
-#
-# and +text+ to return plain text:
-#
-#   doc.text    # => "ohai! div is safe "
-#
-# Or, if you prefer, you can use the shorthand methods +scrub_fragment+ and +scrub_document+:
-#
-#   Loofah.scrub_fragment(unsafe_html, :prune).to_s
-#   Loofah.scrub_document(unsafe_html, :strip).text
-#
-# == Usage
-#
-# Let's say you have a Web 2.0 application, and you allow people to
-# send HTML snippets to each other.
-#
-# Let's also say some script-kiddie from Norland sends this to your
-# users, in an effort to swipe some credit cards:
-#
-#     <script src=http://ha.ckers.org/xss.js></script>
-#
-# Oooh, that could be bad. Here's how to fix it:
-#
-#     Loofah.scrub_fragment(dangerous_html, :escape).to_s
-#
-#     # => "&lt;script src=\"http://ha.ckers.org/xss.js\"&gt;&lt;/script&gt;"
-#
-# Loofah also makes available the sanitized markup in both HTML and
-# plain-text formats without incurring the overhead of multiple
-# parsings:
-#
-#     safe_fragment = Loofah.scrub_fragment(dangerous_html, :strip)
-#     safe_fragment.to_s    # => HTML output
-#     safe_fragment.text    # => plain text output
-#
-# And you can modify the HTML using Nokogiri's API, if you like:
-#
-#     stylized_fragment = Loofah.fragment(dangerous_html)
-#     stylized_fragment.xpath("//a/text()").wrap("<span></span>")
-#     stylized_fragment.scrub!(:strip)
-#
-# == Fragments vs Documents
-#
-# Generally speaking, unless you expect to have \&lt;html\&gt; and
-# \&lt;body\&gt; tags in your HTML, you don't have a *document*, you
-# have a *fragment*.
-#
-# For parsing fragments, you should use Loofah.fragment. Nokogiri
-# won't wrap the result in +html+ and +body+ tags, and will ignore
-# +head+ elements.
-#
-# Full HTML documents should be parsed with Loofah.document, which
-# will add the DOCTYPE declaration, and properly handle +head+ and
-# +body+ elements.
-#
 # == Strings and IO Objects as Input
 #
 # Loofah.document and Loofah.fragment accept any IO object in addition
@@ -102,64 +24,9 @@ require 'loofah/helpers'
 # +close+. Which makes it particularly easy to sanitize mass
 # quantities of docs.
 #
-# == Scrubbing Methods
-#
-# Given:
-#     unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
-#
-# === scrub!(:strip)
-#
-# +:strip+ removes unknown/unsafe tags, but leaves behind the pristine contents:
-#
-#     Loofah.fragment(unsafe_html).scrub!(:strip)
-#     # or
-#     Loofah.scrub_fragment(unsafe_html, :strip)
-#
-#     => "ohai! <div>div is safe</div> but foo is <b>not</b>"
-#
-# === scrub!(:prune)
-#
-# +:prune+ removes unknown/unsafe tags and their contents (including their subtrees):
-#
-#     Loofah.fragment(unsafe_html).scrub!(:prune)
-#     # or
-#     Loofah.scrub_fragment(unsafe_html, :prune)
-#
-#     => "ohai! <div>div is safe</div> "
-#
-# === scrub!(:escape)
-#
-# +:escape+ performs HTML entity escaping on the unknown/unsafe tags:
-#
-#     Loofah.fragment(unsafe_html).scrub!(:escape)
-#     # or
-#     Loofah.scrub_fragment(unsafe_html, :escape)
-#
-#     => "ohai! <div>div is safe</div> &lt;foo&gt;but foo is &lt;b&gt;not&lt;/b&gt;&lt;/foo&gt;"
-#
-# === scrub!(:whitewash)
-#
-# +:whitewash+ removes all comments, styling and attributes in
-# addition to doing markup-fixer-uppery and pruning unsafe tags. I
-# like to call this "whitewashing", since it's like putting a new
-# layer of paint on top of the HTML input to make it look nice.
-#
-#     messy_markup = "ohai! <div id='foo' class='bar' style='margin: 10px'>div with attributes</div>"
-#
-#     Loofah.fragment(messy_markup).scrub!(:whitewash)
-#     # or
-#     Loofah.scrub_fragment(messy_markup, :whitewash)
-#
-#     => "ohai! <div>div with attributes</div>"
-#
-# One use case for this feature is to clean up HTML that was
-# cut-and-pasted from Microsoft Word into a WYSIWYG editor or a rich
-# text editor. Microsoft's software is famous for injecting all kinds
-# of cruft into its HTML output. Who needs that? Certainly not me.
-#
 module Loofah
   # The version of Loofah you are using
-  VERSION = '0.3.1'
+  VERSION = '0.4.0'
   # The minimum required version of Nokogiri
   REQUIRED_NOKOGIRI_VERSION = '1.3.3'
@@ -187,6 +54,28 @@ module Loofah
       Loofah.document(string_or_io).scrub!(method)
     end
+    # Shortcut for Loofah::XML::Document.parse
+    # This method accepts the same parameters as Nokogiri::XML::Document.parse
+    def xml_document(*args, &block)
+      Loofah::XML::Document.parse(*args, &block)
+    end
+    # Shortcut for Loofah::XML::DocumentFragment.parse
+    # This method accepts the same parameters as Nokogiri::XML::DocumentFragment.parse
+    def xml_fragment(*args, &block)
+      Loofah::XML::DocumentFragment.parse(*args, &block)
+    end
+    # Shortcut for Loofah.xml_fragment(string_or_io).scrub!(method)
+    def scrub_xml_fragment(string_or_io, method)
+      Loofah.xml_fragment(string_or_io).scrub!(method)
+    end
+    # Shortcut for Loofah.xml_document(string_or_io).scrub!(method)
+    def scrub_xml_document(string_or_io, method)
+      Loofah.xml_document(string_or_io).scrub!(method)
+    end
   end
 end

data/lib/loofah/active_record.rb CHANGED

@@ -21,19 +21,15 @@ module Loofah
   module ActiveRecordExtension
     #
     #  :call-seq:
-    #    html_fragment(attribute, :scrub => sanitization_method)
+    #    html_fragment(attribute, :scrub => scrubber_specification)
     #
     #  Scrub an ActiveRecord attribute +attribute+ as an HTML *fragment*
-    #  using the method specified by +sanitization_method+.
+    #  using the method specified by +scrubber_specification+.
     #
-    #  +sanitization_method+ must be one of:
+    #  +scrubber_specification+ must be an argument acceptable to Loofah::InstanceMethods.scrub!, namely:
     #
-    #  * :string
-    #  * :prune
-    #  * :escape
-    #  * :whitewash
-    #
-    #  See Loofah for an explanation of each sanitization method.
+    #  * a symbol for one of the built-in scrubbers (see Loofah::Scrubbers for a full list)
+    #  * or a Scrubber instance. (see Loofah::Scrubber for help on implementing a custom scrubber)
     #
     def html_fragment(attr, options={})
       raise ArgumentError, "html_fragment requires :scrub option" unless method = options[:scrub]
@@ -44,19 +40,15 @@ module Loofah
     #
     #  :call-seq:
-    #    model.html_document(attribute, :scrub => sanitization_method)
+    #    model.html_document(attribute, :scrub => scrubber_specification)
     #
     #  Scrub an ActiveRecord attribute +attribute+ as an HTML *document*
-    #  using the method specified by +sanitization_method+.
-    #
-    #  +sanitization_method+ must be one of:
+    #  using the method specified by +scrubber_specification+.
     #
-    #  * :string
-    #  * :prune
-    #  * :escape
-    #  * :whitewash
+    #  +scrubber_specification+ must be an argument acceptable to Loofah::InstanceMethods.scrub!, namely:
     #
-    #  See Loofah for an explanation of each sanitization method.
+    #  * a symbol for one of the built-in scrubbers (see Loofah::Scrubbers for a full list)
+    #  * or a Scrubber instance.
     #
     def html_document(attr, options={})
       raise ArgumentError, "html_document requires :scrub option" unless method = options[:scrub]

data/lib/loofah/html/document.rb CHANGED

@@ -1,16 +1,16 @@
 module Loofah
-  module HTML
+  module HTML # :nodoc:
     #
     #  Subclass of Nokogiri::HTML::Document.
     #
-    #  See Loofah::ScrubberInstanceMethods for additional methods.
+    #  See Loofah::InstanceMethods for additional methods.
     #
     class Document < Nokogiri::HTML::Document
-      include Loofah::ScrubberInstanceMethods
+      include Loofah::InstanceMethods
       private
-      def __sanitize_roots # :nodoc:
+      def sanitize_roots # :nodoc:
         xpath("/html/head","/html/body")
       end

data/lib/loofah/html/document_fragment.rb CHANGED

@@ -1,12 +1,12 @@
 module Loofah
-  module HTML
+  module HTML # :nodoc:
     #
     #  Subclass of Nokogiri::HTML::DocumentFragment. Also includes Loofah::InstanceMethods.
     #
-    #  See Loofah::ScrubberInstanceMethods for additional methods.
+    #  See Loofah::InstanceMethods for additional methods.
     #
     class DocumentFragment < Nokogiri::HTML::DocumentFragment
-      include Loofah::ScrubberInstanceMethods
+      include Loofah::InstanceMethods
       class << self
         #
@@ -23,13 +23,13 @@ module Loofah
       #  Returns the HTML markup contained by the fragment or document
       #
       def to_s
-        __sanitize_roots.children.to_s
+        sanitize_roots.children.to_s
       end
       alias :serialize :to_s
       private
-      def __sanitize_roots # :nodoc:
+      def sanitize_roots # :nodoc:
         xpath("./body").first || self
       end

data/lib/loofah/html5/scrub.rb CHANGED

@@ -1,7 +1,7 @@
 require 'cgi'
 module Loofah
-  module HTML5
+  module HTML5 # :nodoc:
     module Scrub
       class << self

data/lib/loofah/html5/whitelist.rb CHANGED

@@ -1,5 +1,5 @@
 module Loofah
-  module HTML5
+  module HTML5 # :nodoc:
     #
     #  HTML whitelist lifted from HTML5lib sanitizer code:
     #

data/lib/loofah/instance_methods.rb ADDED

@@ -0,0 +1,47 @@
+module Loofah
+  #
+  #  Methods that are mixed into Loofah::HTML::Document and Loofah::HTML::DocumentFragment.
+  #
+  module InstanceMethods
+    #
+    #  Traverse the document or fragment, invoking the +scrubber+ on
+    #  each node.
+    #
+    #  +scrubber+ must either be one of the symbols representing the
+    #  built-in scrubbers (see Scrubbers), or a Scrubber instance.
+    #
+    #    span2div = Loofah::Scrubber.new do |node|
+    #      node.name = "div" if node.name == "span"
+    #    end
+    #    Loofah.fragment("<span>foo</span><p>bar</p>").scrub!(span2div).to_s
+    #    # => "<div>foo</div><p>bar</p>"
+    #
+    #  or
+    #
+    #    unsafe_html = "ohai! <div>div is safe</div> <script>but script is not</script>"
+    #    Loofah.fragment(unsafe_html).scrub!(:strip).to_s
+    #    # => "ohai! <div>div is safe</div> "
+    #
+    #  Note that this method is called implicitly from
+    #  Loofah.scrub_fragment and Loofah.scrub_document.
+    #
+    #  Please see Scrubber for more information on implementation and traversal, and
+    #  README.rdoc for more example usage.
+    #
+    def scrub!(scrubber)
+      scrubber = Scrubbers::MAP[scrubber].new if Scrubbers::MAP[scrubber]
+      raise Loofah::ScrubberNotFound, "not a Scrubber or a scrubber name: #{scrubber.inspect}" unless scrubber.is_a?(Loofah::Scrubber)
+      sanitize_roots.children.each { |node| scrubber.traverse(node) }
+      self
+    end
+    #
+    #  Returns a plain-text version of the markup contained by the fragment or document
+    #
+    def text
+      sanitize_roots.children.inner_text
+    end
+    alias :inner_text :text
+    alias :to_str     :text
+  end
+end

data/lib/loofah/scrubber.rb CHANGED

@@ -1,100 +1,122 @@
 module Loofah
   #
-  #  Methods that are mixed into Loofah::HTML::Document and Loofah::HTML::DocumentFragment.
+  #  A RuntimeError raised when Loofah could not find an appropriate scrubber.
   #
-  module ScrubberInstanceMethods
+  class ScrubberNotFound < RuntimeError ; end
+  #
+  #  A Scrubber wraps up a block (or method) that is run on an HTML node (element):
+  #
+  #    # change all <span> tags to <div> tags
+  #    span2div = Loofah::Scrubber.new do |node|
+  #      node.name = "div" if node.name == "span"
+  #    end
+  #
+  #  Alternatively, this scrubber could have been implemented as:
+  #
+  #    class Span2Div < Loofah::Scrubber
+  #      def scrub(node)
+  #        node.name = "div" if node.name == "span"
+  #      end
+  #    end
+  #    span2div = Span2Div.new
+  #
+  #  This can then be run on a document:
+  #
+  #    Loofah.fragment("<span>foo</span><p>bar</p>").scrub!(span2div).to_s
+  #    # => "<div>foo</div><p>bar</p>"
+  #
+  #  Scrubbers can be run on a document in either a top-down traversal (the
+  #  default) or bottom-up. Top-down scrubbers can optionally return
+  #  Scrubber::STOP to terminate the traversal of a subtree.
+  #
+  class Scrubber
+    # Top-down Scrubbers may return CONTINUE to indicate that the subtree should be traversed.
+    CONTINUE = Object.new.freeze
+    # Top-down Scrubbers may return STOP to indicate that the subtree should not be traversed.
+    STOP     = Object.new.freeze
+    # When a scrubber is initialized, the :direction may be specified
+    # as :top_down (the default) or :bottom_up.
+    attr_reader :direction
+    # When a scrubber is initialized, the optional block is saved as
+    # :block. Note that, if no block is passed, then the +scrub+
+    # method is assumed to have been implemented.
+    attr_reader :block
     #
-    #  Clean up the HTML. See Loofah for full usage.
+    #  Options may include
+    #    :direction => :top_down (the default)
+    #  or
+    #    :direction => :bottom_up
     #
-    def scrub!(method)
-      case method
-      when :escape, :prune, :whitewash
-        __sanitize_roots.children.each do |node|
-          Scrubber.traverse_conditionally_top_down(node, method.to_sym)
-        end
-      when :strip
-        __sanitize_roots.children.each do |node|
-          Scrubber.traverse_conditionally_bottom_up(node, method.to_sym)
-        end
-      else
-        raise ArgumentError, "unknown sanitize filter '#{method}'"
+    #  For top_down traversals, if the block returns
+    #  Loofah::Scrubber::STOP, then the traversal will be terminated
+    #  for the current node's subtree.
+    #
+    #  Alternatively, a Scrubber may inherit from Loofah::Scrubber,
+    #  and implement +scrub+, which is slightly faster than using a
+    #  block.
+    #
+    def initialize(options = {}, &block)
+      direction = options[:direction] || :top_down
+      unless [:top_down, :bottom_up].include?(direction)
+        raise ArgumentError, "direction #{direction} must be one of :top_down or :bottom_up"
       end
-      self
+      @direction, @block = direction, block
     end
     #
-    #  Returns a plain-text version of the markup contained by the fragment or document
+    #  Calling +traverse+ will cause the document to be traversed by
+    #  either the lambda passed to the initializer or the +scrub+
+    #  method, in the direction specified at +new+ time.
     #
-    def text
-      __sanitize_roots.children.inner_text
+    def traverse(node)
+      direction == :bottom_up ? traverse_conditionally_bottom_up(node) : traverse_conditionally_top_down(node)
     end
-    alias :inner_text :text
-    alias :to_str     :text
-  end
-  module Scrubber
-    class << self
-      def sanitize(node)
-        case node.type
-        when Nokogiri::XML::Node::ELEMENT_NODE
-          if HTML5::HashedWhiteList::ALLOWED_ELEMENTS[node.name]
-            HTML5::Scrub.scrub_attributes node
-            return false
-          end
-        when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
-          return false
-        end
-        true
-      end
-      def escape(node)
-        return false unless sanitize(node)
-        replacement_killer = Nokogiri::XML::Text.new(node.to_s, node.document)
-        node.add_next_sibling replacement_killer
-        node.remove
-        return true
-      end
-      def prune(node)
-        return false unless sanitize(node)
-        node.remove
-        return true
-      end
+    #
+    #  When +new+ is not passed a block, the class may implement
+    #  +scrub+, which will be called for each document node.
+    #
+    def scrub(node)
+      raise ScrubberNotFound, "No scrub method has been defined on #{self.class.to_s}"
+    end
-      def strip(node)
-        return false unless sanitize(node)
-        replacement_killer = node.before node.inner_html
-        node.remove
-        return true
-      end
+    private
-      def whitewash(node)
-        case node.type
-        when Nokogiri::XML::Node::ELEMENT_NODE
-          if HTML5::HashedWhiteList::ALLOWED_ELEMENTS[node.name]
-            node.attributes.each { |attr| node.remove_attribute(attr.first) }
-            return false if node.namespaces.empty?
-          end
-        when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
-          return false
+    def html5lib_sanitize(node)
+      case node.type
+      when Nokogiri::XML::Node::ELEMENT_NODE
+        if HTML5::HashedWhiteList::ALLOWED_ELEMENTS[node.name]
+          HTML5::Scrub.scrub_attributes node
+          return Scrubber::CONTINUE
         end
-        node.remove
-        return true
+      when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
+        return Scrubber::CONTINUE
       end
+      Scrubber::STOP
+    end
-      def traverse_conditionally_top_down(node, method_name)
-        return if send(method_name, node)
-        node.children.each {|j| traverse_conditionally_top_down(j, method_name)}
+    def traverse_conditionally_top_down(node)
+      if block
+        return if block.call(node) == STOP
+      else
+        return if scrub(node) == STOP
       end
+      node.children.each {|j| traverse_conditionally_top_down(j)}
+    end
-      def traverse_conditionally_bottom_up(node, method_name)
-        node.children.each {|j| traverse_conditionally_bottom_up(j, method_name)}
-        return if send(method_name, node)
+    def traverse_conditionally_bottom_up(node)
+      node.children.each {|j| traverse_conditionally_bottom_up(j)}
+      if block
+        block.call(node)
+      else
+        scrub(node)
       end
     end
   end
 end