RubyGems - sanitize - Versions diffs - 1.0.5 → 1.0.6 - Mend

sanitize 1.0.5 → 1.0.6

Files changed (5)

data/HISTORY +10 -0
data/README.rdoc +21 -3
data/lib/sanitize.rb +66 -37
metadata +2 -13
data/lib/sanitize/monkeypatch/hpricot.rb +0 -33

data/HISTORY CHANGED Viewed

@@ -1,6 +1,16 @@
 Sanitize History
 ================================================================================
+Version 1.0.6 (2009-02-23)
+  * Removed htmlentities gem dependency.
+  * Existing well-formed character entity references in the input string are now
+    preserved rather than being decoded and re-encoded.
+  * The ' character is now encoded as &#39; instead of &apos; to prevent
+    problems in IE6.
+  * You can now specify the symbol :all in place of an element name in the
+    attributes config hash to allow certain attributes on all elements. [Thanks
+    to Mutwin Kraus]
 Version 1.0.5 (2009-02-05)
   * Fixed a bug introduced in version 1.0.3 that prevented non-whitelisted
     protocols from being cleaned when relative URLs were allowed. [Reported by

data/README.rdoc CHANGED Viewed

@@ -15,7 +15,7 @@ or maliciously-formed HTML. When in doubt, Sanitize always errs on the side of
 caution.
 *Author*::    Ryan Grove (mailto:ryan@wonko.com)
-*Version*::   1.0.5 (2009-02-05)
+*Version*::   1.0.6 (2009-02-23)
 *Copyright*:: Copyright (c) 2009 Ryan Grove. All rights reserved.
 *License*::   MIT License (http://opensource.org/licenses/mit-license.php)
 *Website*::   http://github.com/rgrove/sanitize
@@ -24,7 +24,6 @@ caution.
 * RubyGems
 * Hpricot 0.6+
-* HTMLEntities 4.0.0+
 == Usage
@@ -100,6 +99,14 @@ attributes in lowercase.
     'img'        => ['alt', 'src', 'title']
   }
+If you'd like to allow certain attributes on all elements, use the symbol
+<code>:all</code> instead of an element name.
+  :attributes => {
+    :all => ['class'],
+    'a'  => ['href', 'title']
+  }
 ==== :add_attributes
 Attributes to add to specific elements. If the attribute already exists, it will
@@ -122,12 +129,23 @@ protocol at all), it will be removed.
   }
 If you'd like to allow the use of relative URLs which don't have a protocol,
-include the special value <code>:relative</code> in the protocol array:
+include the symbol <code>:relative</code> in the protocol array:
   :protocols => {
     'a' => {'href' => ['http', 'https', :relative]}
   }
+== Contributors
+The following lovely people have contributed to Sanitize in the form of patches
+or ideas that later became code:
+* Ryan Grove <ryan@wonko.com>
+* Adam Hooper <adam@adamhooper.com>
+* Mutwin Kraus <mutle@blogage.de>
+* Dev Purkayastha <dev.purkayastha@gmail.com>
 == License
 Copyright (c) 2009 Ryan Grove <ryan@wonko.com>

data/lib/sanitize.rb CHANGED Viewed

@@ -26,19 +26,28 @@ $:.uniq!
 require 'rubygems'
-gem 'hpricot',      '~> 0.6'
-gem 'htmlentities', '~> 4.0.0'
+gem 'hpricot', '~> 0.6'
 require 'hpricot'
-require 'htmlentities'
 require 'sanitize/config'
 require 'sanitize/config/restricted'
 require 'sanitize/config/basic'
 require 'sanitize/config/relaxed'
-require 'sanitize/monkeypatch/hpricot'
 class Sanitize
+  # Characters that should be replaced with entities in text nodes.
+  ENTITY_MAP = {
+    '<' => '&lt;',
+    '>' => '&gt;',
+    '"' => '&quot;',
+    "'" => '&#39;'
+  }
+  # Matches an unencoded ampersand that is not part of a valid character entity
+  # reference.
+  REGEX_AMPERSAND = /&(?!(?:[a-z]+|#[0-9]+|#x[0-9a-f]+);)/i
   # Matches an attribute value that could be treated by a browser as a URL
   # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
   # or more characters followed by a colon is considered a match, even if the
@@ -46,24 +55,6 @@ class Sanitize
   # IE6 and Opera will still parse).
   REGEX_PROTOCOL = /^([^:]*)(?:\:|&#0*58|&#x0*3a)/i
-  #--
-  # Class Methods
-  #++
-  # Returns a sanitized copy of _html_, using the settings in _config_ if
-  # specified.
-  def self.clean(html, config = {})
-    sanitize = Sanitize.new(config)
-    sanitize.clean(html)
-  end
-  # Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes
-  # were made.
-  def self.clean!(html, config = {})
-    sanitize = Sanitize.new(config)
-    sanitize.clean!(html)
-  end
   #--
   # Instance Methods
   #++
@@ -101,10 +92,19 @@ class Sanitize
           next
         end
-        if @config[:attributes].has_key?(name)
+        node.raw_attributes ||= {}
+        attr_whitelist = ((@config[:attributes][name] || []) +
+            (@config[:attributes][:all] || [])).uniq
+        if attr_whitelist.empty?
+          # Delete all attributes from elements with no whitelisted
+          # attributes.
+          node.raw_attributes = {}
+        else
           # Delete any attribute that isn't in the whitelist for this element.
           node.raw_attributes.delete_if do |key, value|
-            !@config[:attributes][name].include?(key.to_s.downcase)
+            !attr_whitelist.include?(key.to_s.downcase)
           end
           # Delete remaining attributes that use unacceptable protocols.
@@ -122,32 +122,61 @@ class Sanitize
               end
             end
           end
-        else
-          # Delete all attributes from elements with no whitelisted
-          # attributes.
-          node.raw_attributes = {}
         end
         # Add required attributes.
         if @config[:add_attributes].has_key?(name)
           node.raw_attributes.merge!(@config[:add_attributes][name])
         end
+        # Escape special chars in attribute values.
+        node.raw_attributes.each do |key, value|
+          node.raw_attributes[key] = Sanitize.encode_html(value)
+        end
       end
     end
     # Make one last pass through the fragment and encode all special HTML chars
-    # and non-ASCII chars as entities. This eliminates certain types of
-    # maliciously-malformed nested tags and also compensates for Hpricot's
-    # burning desire to decode all entities.
-    coder = HTMLEntities.new
-    fragment.traverse_element do |node|
-      if node.text?
-        node.swap(coder.encode(node.inner_text, :named))
-      end
+    # as entities. This eliminates certain types of maliciously-malformed nested
+    # tags.
+    fragment.search('*') do |node|
+      node.swap(Sanitize.encode_html(node.to_original_html)) if node.text?
     end
     result = fragment.to_s
     return result == html ? nil : html[0, html.length] = result
   end
+  #--
+  # Class Methods
+  #++
+  class << self
+    # Returns a sanitized copy of _html_, using the settings in _config_ if
+    # specified.
+    def clean(html, config = {})
+      sanitize = Sanitize.new(config)
+      sanitize.clean(html)
+    end
+    # Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes
+    # were made.
+    def clean!(html, config = {})
+      sanitize = Sanitize.new(config)
+      sanitize.clean!(html)
+    end
+    # Encodes <, >, ", ' and any & not already part of a well-formed entity
+    # reference in _html_ as entity references; returns the encoded string.
+    def encode_html(html)
+      str = html.dup
+      # Encode special chars.
+      ENTITY_MAP.each {|char, entity| str.gsub!(char, entity) }
+      # Convert unencoded ampersands to entity references.
+      str.gsub(REGEX_AMPERSAND, '&amp;')
+    end
+  end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: sanitize
 version: !ruby/object:Gem::Version
-  version: 1.0.5
+  version: 1.0.6
 platform: ruby
 authors:
 - Ryan Grove
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2009-02-05 00:00:00 -08:00
+date: 2009-02-23 00:00:00 -08:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -22,16 +22,6 @@ dependencies:
       - !ruby/object:Gem::Version
         version: "0.6"
     version:
-- !ruby/object:Gem::Dependency
-  name: htmlentities
-  type: :runtime
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ~>
-      - !ruby/object:Gem::Version
-        version: 4.0.0
-    version:
 description:
 email: ryan@wonko.com
 executables: []
@@ -49,7 +39,6 @@ files:
 - lib/sanitize/config/basic.rb
 - lib/sanitize/config/relaxed.rb
 - lib/sanitize/config/restricted.rb
-- lib/sanitize/monkeypatch/hpricot.rb
 has_rdoc: false
 homepage: http://github.com/rgrove/sanitize/
 post_install_message:

data/lib/sanitize/monkeypatch/hpricot.rb DELETED Viewed

@@ -1,33 +0,0 @@
-#--
-# Copyright (c) 2009 Ryan Grove <ryan@wonko.com>
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the 'Software'), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#++
-module Hpricot
-  # Monkeypatch to fix an Hpricot bug that causes HTML entities to be decoded
-  # incorrectly.
-  def self.uxs(str)
-    str.to_s.
-      gsub(/&(\w+);/) { [Hpricot::NamedCharacters[$1] || ??].pack("U*") }.
-      gsub(/\&\#(\d+);/) { [$1.to_i].pack("U*") }
-  end
-end