RubyGems - htmlfilter - Versions diffs - 1.0.0 - Mend

htmlfilter 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

data/History.rdoc ADDED Viewed

@@ -0,0 +1,6 @@
+=== 1.0.0 / 2009-06-25
+* 1 major enhancement
+  * Birthday!

data/Manifest.txt ADDED Viewed

@@ -0,0 +1,19 @@
+#!mast bin lib meta test [A-Z]*
+lib
+lib/cssfilter.rb
+lib/htmlfilter
+lib/htmlfilter/multiton.rb
+lib/htmlfilter.rb
+meta
+meta/package
+meta/project
+meta/title
+meta/version
+test
+test/test_cssfilter.rb
+test/test_htmlfilter.rb
+Rakefile
+Manifest.txt
+TODO
+README.rdoc
+History.rdoc

data/README.rdoc ADDED Viewed

@@ -0,0 +1,53 @@
+= HtmlFilter
+* http://rubyworks.github.com/htmlfilter
+== DESCRIPTION:
+HTML Filter library can be used to sanitize and sterilize
+HTML. A good idea if you let users submit HTML in comments,
+for instance.
+This library also includes CssFilter. The CssFilter class will
+clean-up a cascading style sheet. It can be used to remove
+whitespace and most importantly remove urls.
+== FEATURES:
+* Sanitize HTML
+* Compress CSS
+== SYNOPSIS:
+Via the class.
+  html = "<<b>hello</b>"
+  HtmlFilter.new(options).filter(html)
+Or using the String extension.
+  html.html_filter  #=> "<b>hello</b>"
+See RDocs for more information.
+== REQUIREMENTS:
+* Uses a copy of multiton.rb (included)
+== INSTALL:
+* sudo gem install htmlfilter
+== LICENSE:
+(Creative Commons Attribution-ShareAlike License)
+Copyright (c) 2009 Thomas Sawyer
+See http://creativecommons.org/licenses/by-sa/3.0/deed.en
+HtmlFilter is a port of lib_filter.php, v1.15 by Cal Henderson <cal@iamcal.com>.
+This code is licensed under a Creative Commons Attribution-ShareAlike 2.5 License.
+See http://creativecommons.org/licenses/by-sa/2.5/.

data/Rakefile ADDED Viewed

@@ -0,0 +1,15 @@
+# -*- ruby -*-
+#$: << './lib'
+#require 'rubygems'
+#require 'hoe'
+#require 'htmlfilter'
+#Hoe.new('htmlfilter', HtmlFilter::VERSION) do |p|
+#  p.rubyforge_name = 'death' # if different than lowercase project name
+#  p.developer('Thomas Sawyer', 'transfire@gmail.com')
+#end
+# vim: syntax=Ruby

data/TODO ADDED Viewed

@@ -0,0 +1,7 @@
+= TODO List
+* Maybe write executable(s) to use library via commandline.
+* Elaborate on Features list in README.txt.
+* Rename class to HTMLFilter (instead of HtmlFilter)

data/lib/cssfilter.rb ADDED Viewed

@@ -0,0 +1,226 @@
+# = CSS Filter
+#
+# The CssFilter class will clean up a cascading style sheet.
+# It can be used to remove whitespace and most importantly
+# remove urls.
+#
+# == Authors
+#
+# * Trans
+#
+# == Todo
+#
+# * Allow urls to be specified per attribute type.
+#
+# == Copying
+#
+#   Copyright (c) 2007 7rans
+#require 'htmlfilter/uri'
+require 'uri'
+# = CSS Filter
+#
+# The CssFilter class will clean up a cascading style sheet.
+# It can be used to remove whitespace and most importantly
+# remove urls.
+#
+class CssFilter
+  VERSION="1.0.0"
+  # should we remove comments? (true, false)
+  attr_accessor :strip_comments
+  # should we remove urls? (true, false)
+  attr_accessor :strip_urls
+  # url schemes which will be allowed (http, ftp, mailto)
+  attr_accessor :allowed_scheme
+  # alias for allowed_scheme
+  alias_method :allowed_protocols, :allowed_scheme
+  alias_method :allowed_protocols=, :allowed_scheme=
+  # url hosts which will be allowed.
+  attr_accessor :allowed_hosts
+  # urls which will be allowed. (NOT YET USED)
+  attr_accessor :allowed_urls
+  # substitute urls  (NOT YET USED)
+  attr_accessor :substitute_urls
+  # strip leading and trailing whitespace from every line.
+  attr_accessor :strip_whitespace
+  # remove blank lines.
+  attr_accessor :strip_blanklines
+  # Complete parse and rewrite of the CSS document.
+  # This does a complete "cleaning", but note that it
+  # is not yet a perfect parser.
+  attr_accessor :rewrite
+  # CssFilter option defaults. Keys are Strings naming the accessors
+  # above; initialize applies each one through the matching writer.
+  DEFAULT = {
+    'strip_comments' => true,
+    'strip_urls' => true,
+    'allowed_urls' => [],
+    'allowed_hosts' => [],
+    'allowed_scheme' => [],
+    'strip_whitespace' => false,
+    'strip_blanklines' => true,
+    'rewrite' => false,
+    'substitute_urls' => {}
+  }
+  #
+  def initialize(options=nil)
+    if options
+      h = DEFAULT.dup
+      options.each do |k,v|
+        h[k.to_s] = v
+      end
+      options = h
+    else
+      options = DEFAULT.dup
+    end
+    options.each{ |k,v| send("#{k}=",v) }
+  end
+  #
+  def accept_host(host)
+    @hosts << host
+  end
+  #
+  def filter(css)
+    css = remove_comments(css)    if strip_comments
+    css = remove_urls(css)        if strip_urls
+    css = remove_nullvalues(css)
+    css = remove_whitespace(css)  if strip_whitespace
+    css = remove_blanklines(css)  if strip_blanklines
+    css = parse(css).to_css       if rewrite
+    css
+  end
+  #
+  def remove_comments(data)
+    data.gsub(/\/\*(.8?)\*\//,'')
+  end
+  # TODO: allowed_urls
+  def remove_urls(data)
+    urls = data.scan(/url\((.*?)\)/).flatten
+    uris = urls.collect{ |u| URI.extract(u) }.flatten
+    uris.each do |u|
+      uri = URI.parse(u)
+      unless allowed_hosts.include?(uri.host) or
+             allowed_scheme.include?(uri.scheme)
+        data.sub!(u.to_s, '')
+      end
+    end
+    data.gsub(/url\(\s*\)/, '')
+  end
+  #
+  def remove_whitespace(data)
+    data = data.gsub(/^\s*/,'')
+    data = data.gsub(/\s*$/,'')
+  end
+  #
+  def remove_blanklines(data)
+    data = data.gsub(/^\s*\n/,'')
+  end
+  #
+  def remove_nullvalues(data);
+    data = data.gsub(/\w+[:](\s+)[;]/,'')
+  end
+  # Breaks a css document up into a hash. This can be used
+  # completely rewritting the css.
+  #
+  # TODO: Not complete, does not work with "@xxx foo;" for example.
+  def parse(css)
+    tree = CssTree.new
+    entries = css.scan(/^(.*?)\{(.*?)\}/m)
+    entries.each do |ref, props|
+      tree[ref.strip] ||= {}
+      props = clean_properties(props)
+      props = props.scan(/(.*?)[:](.*?)([;]|\s*\Z)/)
+      props.each do |(key,val)|
+        tree[ref.strip][key.strip] = clean_value(val)
+      end
+    end
+    return tree
+  end
+  # Takes a css entry and ensures it is valid (as best it can).
+  # It will fix trival mistakes, and raise an error when it is
+  # beyond repair.
+  #
+  # TODO: So far this does absolutely nothing!
+  def clean_properties(atts)
+    atts
+  end
+  #
+  def clean_value(val)
+    val = val.strip
+    if urls
+      uris = URI.extract(val)
+      uris.each do |u|
+        val.sub!(u.to_s, urls)
+      end
+    end
+    return val
+  end
+end
+# CSS parse tree. This is for a "deep filtering".
+class CssTree < Hash
+  def initialize(options=nil)
+    @options = options || {}
+    super()
+  end
+  # Re-output the CSS, all tidy ;)
+  def to_css
+    css = ""
+    each do |selector, entries|
+      css << "#{selector}{"
+      entries.each do |key, value|
+        css << "#{key}:#{value};"
+      end
+      css << "}\n"
+    end
+    return css
+  end
+end

data/lib/htmlfilter/multiton.rb ADDED Viewed

@@ -0,0 +1,386 @@
+# = Multiton
+#
+# == Synopsis
+#
+# Multiton design pattern ensures only one object is allocated for a given state.
+#
+# The 'multiton' pattern is similar to a singleton, but instead of only one
+# instance, there are several similar instances.  It is useful when you want to
+# avoid constructing objects many times because of some huge expense (connecting
+# to a database for example), require a set of similar but not identical
+# objects, and cannot easily control how many times a constructor may be called.
+#
+#   class SomeMultitonClass
+#     include Multiton
+#     attr :arg
+#     def initialize(arg)
+#       @arg = arg
+#     end
+#   end
+#
+#   a = SomeMultitonClass.new(4)
+#   b = SomeMultitonClass.new(4)   # a and b are same object
+#   c = SomeMultitonClass.new(2)   # c is a different object
+#
+# == Previous Behavior
+#
+# In previous versions of Multiton the #new method was made
+# private and #instance had to be used in its stead --just like Singleton.
+# But this is less desirable for Multiton since Multitons can
+# have multiple instances, not just one.
+#
+# So instead Multiton now defines #create as a private alias of
+# the original #new method (just in case it is needed) and then
+# defines #new to handle the multiton; #instance is provided
+# as an alias for it.
+#
+#--
+# So if you must have the old behavior, all you need do is re-alias
+# #new to #create and privatize it.
+#
+#   class SomeMultitonClass
+#     include Multiton
+#     alias_method :new, :create
+#     private :new
+#     ...
+#   end
+#
+# Then only #instance will be available for creating the Multiton.
+#++
+#
+# == How It Works
+#
+# A pool of objects is searched for a previously cached object,
+# if one is not found we construct one and cache it in the pool
+# based on class and the args given to the constructor.
+#
+# A limitation of this approach is that it is impossible to
+# detect if different blocks were given to a constructor (if it takes a
+# block).  So it is the constructor arguments _only_ which determine
+# the uniqueness of an object. To workaround this, define the _class_
+# method ::multiton_id.
+#
+#   def Klass.multiton_id(*args, &block)
+#     # ...
+#   end
+#
+# Which should return a hash key used to identify the object being
+# constructed as (not) unique.
+#
+# == Authors
+#
+# * Christoph Rippel
+# * Thomas Sawyer
+#
+# = Copying
+#
+# Copyright (c) 2007 Christoph Rippel, Thomas Sawyer
+#
+# Ruby License
+#
+# This module is free software. You may use, modify, and/or redistribute this
+# software under the same terms as Ruby.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE.
+require 'thread'
+# = Multiton
+#
+# Multiton design pattern ensures only one object is allocated for a given state.
+#
+# The 'multiton' pattern is similar to a singleton, but instead of only one
+# instance, there are several similar instances.  It is useful when you want to
+# avoid constructing objects many times because of some huge expense (connecting
+# to a database for example), require a set of similar but not identical
+# objects, and cannot easily control how many times a constructor may be called.
+#
+#   class SomeMultitonClass
+#     include Multiton
+#     attr :arg
+#     def initialize(arg)
+#       @arg = arg
+#     end
+#   end
+#
+#   a = SomeMultitonClass.new(4)
+#   b = SomeMultitonClass.new(4)   # a and b are same object
+#   c = SomeMultitonClass.new(2)   # c is a different object
+#
+# == How It Works
+#
+# A pool of objects is searched for a previously cached object,
+# if one is not found we construct one and cache it in the pool
+# based on class and the args given to the constructor.
+#
+# A limitation of this approach is that it is impossible to
+# detect if different blocks were given to a constructor (if it takes a
+# block).  So it is the constructor arguments _only_ which determine
+# the uniqueness of an object. To workaround this, define the _class_
+# method ::multiton_id.
+#
+#   def Klass.multiton_id(*args, &block)
+#     # ...
+#   end
+#
+# Which should return a hash key used to identify the object being
+# constructed as (not) unique.
+module Multiton
+  #  disable build-in copying methods
+  def clone
+    raise TypeError, "can't clone Multiton #{self}"
+    #self
+  end
+  def dup
+    raise TypeError, "can't dup Multiton #{self}"
+    #self
+  end
+  # default marshalling strategy
+  protected
+  def _dump(depth=-1)
+    Marshal.dump(@multiton_initializer)
+  end
+  # Mutex to safely store multiton instances.
+  class InstanceMutex < Hash  #:nodoc:
+    def initialize
+      @global = Mutex.new
+    end
+    def initialized(arg)
+      store(arg, DummyMutex)
+    end
+    def (DummyMutex = Object.new).synchronize
+      yield
+    end
+    def default(arg)
+      @global.synchronize{ fetch(arg){ store(arg, Mutex.new) } }
+    end
+  end
+  # Multiton can be included in another module, in which case that module effectively becomes
+  # a multiton behavior distributor too. This is why we propogate #included to the base module.
+  # by putting it in another module.
+  #
+  #--
+  #    def append_features(mod)
+  #      #  help out people counting on transitive mixins
+  #      unless mod.instance_of?(Class)
+  #        raise TypeError, "Inclusion of Multiton in module #{mod}"
+  #      end
+  #      super
+  #    end
+  #++
+  module Inclusive
+    private
+    def included(base)
+      class << base
+        #alias_method(:new!, :new) unless method_defined?(:new!)
+        # gracefully handle multiple inclusions of Multiton
+        unless include?(Multiton::MetaMethods)
+          alias_method :new!, :new
+          private :allocate #, :new
+          include Multiton::MetaMethods
+          if method_defined?(:marshal_dump)
+            undef_method :marshal_dump
+            warn "warning: marshal_dump was undefined since it is incompatible with the Multiton pattern"
+          end
+        end
+      end
+    end
+  end
+  extend Inclusive
+  #
+  module MetaMethods
+    include Inclusive
+    def instance(*e, &b)
+      arg = multiton_id(*e, &b)
+      multiton_instance.fetch(arg) do
+        multiton_mutex[arg].synchronize do
+          multiton_instance.fetch(arg) do
+            val = multiton_instance[arg] = new!(*e, &b) #new(*e, &b)
+            val.instance_variable_set(:@multiton_initializer, e, &b)
+            multiton_mutex.initialized(arg)
+            val
+          end
+        end
+      end
+    end
+    alias_method :new, :instance
+    def initialized?(*e, &b)
+      multiton_instance.key?(multiton_id(*e, &b))
+    end
+    protected
+    def multiton_instance
+      @multiton_instance ||= Hash.new
+    end
+    def multiton_mutex
+      @multiton_mutex ||= InstanceMutex.new
+    end
+    def reinitialize
+      multiton_instance.clear
+      multiton_mutex.clear
+    end
+    def _load(str)
+      instance(*Marshal.load(str))
+    end
+    private
+    # Default method to to create a key to cache already constructed
+    # instances. In the use case MultitonClass.new(e), MultiClass.new(f)
+    # must be semantically equal if multiton_id(e).eql?(multiton_id(f))
+    # evaluates to true.
+    def multiton_id(*e, &b)
+      e
+    end
+    def singleton_method_added(sym)
+      super
+      if (sym == :marshal_dump) & singleton_methods.include?('marshal_dump')
+        raise TypeError, "Don't use marshal_dump - rely on _dump and _load instead"
+      end
+    end
+  end
+end
+=begin
+# TODO Convert this into a real test and/or benchmark.
+if $0 == __FILE__
+  ### Simple marshalling test #######
+  class A
+    def initialize(a,*e)
+      @e = a
+    end
+    include Multiton
+    begin
+      def self.marshal_dump(depth = -1)
+      end
+    rescue => mes
+      p mes
+      class << self; undef marshal_dump end
+    end
+  end
+  C = Class.new(A.clone)
+  s = C.instance('a','b')
+  raise unless Marshal.load(Marshal.dump(s)) == s
+  ### Interdependent initialization example and threading benchmark ###
+  class Regular_SymPlane
+    def self.multiton_id(*e)
+        a,b = e
+        (a+b - 1)*(a+b )/2  + (a > b ? a : b)
+    end
+    def initialize(a,b)
+      klass = self.class
+      if a < b
+        @l =  b > 0 ?  klass.instance(a,b-1) : nil
+        @r =  a > 0 ?  klass.instance(a-1,b) : nil
+      else
+        @l =  a > 0 ?  klass.instance(a-1,b) : nil
+        @r =  b > 0 ?  klass.instance(a,b-1) : nil
+      end
+    end
+    include Multiton
+  end
+  def nap
+  # Thread.pass
+  sleep(rand(0.01))
+  end
+  class SymPlane < Regular_SymPlane
+    @m = Mutex.new
+    @count = 0
+  end
+  class << SymPlane
+    attr_reader :count
+    def reinitialize
+      super
+      @m = Mutex.new
+      @count = 0
+    end
+    def inherited(sub_class)
+      super
+      sub_class.instance_eval { @m = Mutex.new; @count = 0 }
+    end
+    def multiton_id(*e)
+      nap()
+      super
+    end
+    def new!(*e)  # NOTICE!!!
+      super
+    ensure
+      nap()
+      @m.synchronize { p @count if (@count += 1) % 15 == 0 }
+    end
+    def run(k)
+      threads = 0
+      max = k * (k+1) / 2
+      puts ""
+      while count() < max
+        Thread.new { threads+= 1; instance(rand(30),rand(30)) }
+      end
+      puts "\nThe simulation created #{threads} threads"
+    end
+  end
+  require 'benchmark'
+  include Benchmark
+  bmbm do |x|
+    x.report('Initialize 465 SymPlane instances') { SymPlane.run(30) }
+    x.report('Reinitialize ') do
+      sleep 3
+      SymPlane.reinitialize
+    end
+  end
+end
+=end

data/lib/htmlfilter.rb ADDED Viewed

@@ -0,0 +1,516 @@
+# = HTML Filter
+#
+# HTML Filter library can be used to sanitize and sterilize
+# HTML. A good idea if you let users submit HTML in comments,
+# for instance.
+#
+# HtmlFilter is a port of lib_filter.php, v1.15 by Cal Henderson <cal@iamcal.com>
+#
+# This code is licensed under a Creative Commons Attribution-ShareAlike 2.5 License
+# http://creativecommons.org/licenses/by-sa/2.5/
+#
+# Thanks to Jang Kim for adding support for single quoted attributes.
+#
+# == Reference
+#
+# * http://iamcal.com/publish/articles/php/processing_html/
+# * http://iamcal.com/publish/articles/php/processing_html_part_2/
+#
+# == Author(s)
+#
+# * Trans
+# * George Moschovitis
+# * James Britt
+# * Cal Henderson
+# * Jang Kim
+#
+# == Copying
+#
+# Copyright (c) 2007 Trans
+require 'htmlfilter/multiton.rb'
+# = HtmlFilter
+#
+# HTML Filter library can be used to sanitize and sterilize
+# HTML. A good idea if you let users submit HTML in comments,
+# for instance.
+#
+# lib_filter.php, v1.15 by Cal Henderson <cal@iamcal.com>
+#
+# This code is licensed under a Creative Commons Attribution-ShareAlike 2.5 License
+# http://creativecommons.org/licenses/by-sa/2.5/
+#
+# Thanks to Jang Kim for adding support for single quoted attributes.
+#
+# == Reference
+#
+# * http://iamcal.com/publish/articles/php/processing_html/
+# * http://iamcal.com/publish/articles/php/processing_html_part_2/
+class HtmlFilter
+  VERSION = "1.0.0"
+  include Multiton
+  # tags and attributes that are allowed
+  #
+  # Eg.
+  #
+  #   {
+  #     'a' => ['href', 'target'],
+  #     'b' => [],
+  #     'img' => ['src', 'width', 'height', 'alt']
+  #   }
+  attr_accessor :allowed
+  # tags which should always be self-closing (e.g. "<img />")
+  attr_accessor :no_close
+  # tags which must always have separate opening and closing
+  # tags (e.g. "<b></b>")
+  attr_accessor :always_close
+  # attributes which should be checked for valid protocols
+  # (src,href)
+  attr_accessor :protocol_attributes
+  # protocols which are allowed (http, ftp, mailto)
+  attr_accessor :allowed_protocols
+  # tags which should be removed if they contain no content
+  # (e.g. "<b></b>" or "<b />")
+  attr_accessor :remove_blanks
+  # should we remove comments? (true, false)
+  attr_accessor :strip_comments
+  # should we try and make a b tag out of "b>" (true, false)
+  attr_accessor :always_make_tags
+  # entity control option (true, false)
+  attr_accessor :allow_numbered_entities
+  # entity control option (amp, gt, lt, quot, etc.)
+  attr_accessor :allowed_entities
+  # default settings; keys are Strings naming the accessors above, and
+  # initialize applies each one through the matching writer.
+  DEFAULT = {
+    'allowed' => {
+      'a'   => ['href', 'target'],
+      'b'   => [],
+      'i'   => [],
+      'img' => ['src', 'width', 'height', 'alt']
+    },
+    'no_close' => ['img', 'br', 'hr'],
+    'always_close' => ['a', 'b'],
+    'protocol_attributes' => ['src', 'href'],
+    'allowed_protocols' => ['http', 'ftp', 'mailto'],
+    'remove_blanks' => ['a', 'b'],
+    'strip_comments' => true,
+    'always_make_tags' => true,
+    'allow_numbered_entities' => true,
+    'allowed_entities' => ['amp', 'gt', 'lt', 'quot']
+  }
+  # New html filter.
+  def initialize( options=nil )
+    if options
+      h = DEFAULT.dup
+      options.each do |k,v|
+        h[k.to_s] = v
+      end
+      options = h
+    else
+      options = DEFAULT.dup
+    end
+    options.each{ |k,v| send("#{k}=",v) }
+  end
+  # Filter html string.
+  def filter(data)
+    @tag_counts = {}
+    data = escape_comments(data)
+    data = balance_html(data)
+    data = check_tags(data)
+    data = process_remove_blanks(data)
+    data = validate_entities(data)
+    return data
+  end
+  private
+  #
+  # internal tag counter
+  #
+  def tag_counts ; @tag_counts; end
+  #
+  #
+  #
+  def escape_comments(data)
+    data = data.gsub(/<!--(.*?)-->/s) do
+      '<!--' + escape_special_chars(strip_single($1)) + '-->'
+    end
+    return data
+  end
+  #
+  #
+  #
+  def balance_html(data)
+    data = data.dup
+    if always_make_tags
+      # try and form html
+      data.gsub!(/>>+/, '>')
+      data.gsub!(/<<+/, '<')
+      data.gsub!(/^>/, '')
+      data.gsub!(/<([^>]*?)(?=<|$)/, '<\1>')
+      data.gsub!(/(^|>)([^<]*?)(?=>)/, '\1<\2')
+    else
+      # escape stray brackets
+      data.gsub!(/<([^>]*?)(?=<|$)/, '&lt;\1')
+      data.gsub!(/(^|>)([^<]*?)(?=>)/, '\1\2&gt;<')
+      # the last regexp causes '<>' entities to appear
+      # (we need to do a lookahead assertion so that the last bracket
+      # can be used in the next pass of the regexp)
+      data.gsub!('<>', '')
+    end
+    return data
+  end
+  #
+  #
+  #
+  def check_tags(data)
+    data = data.dup
+    data.gsub!(/<(.*?)>/s){
+      process_tag(strip_single($1))
+    }
+    tag_counts.each do |tag, cnt|
+        cnt.times{ data << "</#{tag}>" }
+    end
+    return data
+  end
+  #
+  #
+  #
+  def process_tag(data)
+    # ending tags
+    re = /^\/([a-z0-9]+)/si
+    if matches = re.match(data)
+        name = matches[1].downcase
+        if allowed.key?(name)
+            unless no_close.include?(name)
+                if tag_counts[name]
+                    tag_counts[name] -= 1
+                    return "</#{name}>"
+                end
+            end
+        else
+            return ''
+        end
+    end
+    # starting tags
+    re = /^([a-z0-9]+)(.*?)(\/?)$/si
+    if matches = re.match(data)
+        name   = matches[1].downcase
+        body   = matches[2]
+        ending = matches[3]
+        if allowed.key?(name)
+            params = ""
+            matches_2 = body.scan(/([a-z0-9]+)=(["'])(.*?)\2/si)         # <foo a="b" />
+            matches_1 = body.scan(/([a-z0-9]+)(=)([^"\s']+)/si)          # <foo a=b />
+            matches_3 = body.scan(/([a-z0-9]+)=(["'])([^"']*?)\s*$/si)   # <foo a="b />
+            matches = matches_1 + matches_2 + matches_3
+            matches.each do |match|
+                pname = match[0].downcase
+                if allowed[name].include?(pname)
+                    value = match[2]
+                    if protocol_attributes.include?(pname)
+                        value = process_param_protocol(value)
+                    end
+                    params += %{ #{pname}="#{value}"}
+                end
+            end
+            if no_close.include?(name)
+                ending = ' /'
+            end
+            if always_close.include?(name)
+                ending = ''
+            end
+            if ending.empty?
+                if tag_counts.key?(name)
+                    tag_counts[name] += 1
+                else
+                    tag_counts[name] = 1
+                end
+            end
+            unless ending.empty?
+                ending = ' /'
+            end
+            return '<' + name + params + ending + '>'
+        else
+            return ''
+        end
+    end
+    # comments
+    if /^!--(.*)--$/si =~ data
+        if strip_comments
+            return ''
+        else
+            return '<' + data + '>'
+        end
+    end
+    # garbage, ignore it
+    return ''
+  end
+  #
+  #
+  #
+  def process_param_protocol(data)
+    data = decode_entities(data)
+    re = /^([^:]+)\:/si
+    if matches = re.match(data)
+        unless allowed_protocols.include?(matches[1])
+            #data = '#'.substr(data, strlen(matches[1])+1)
+            data = '#' + data[0..matches[1].size+1]
+        end
+    end
+    return data
+  end
+  #
+  #
+  #
+  def process_remove_blanks(data)
+    data = data.dup
+    remove_blanks.each do |tag|
+        data.gsub!(/<#{tag}(\s[^>]*)?><\/#{tag}>/, '')
+        data.gsub!(/<#{tag}(\s[^>]*)?\/>/, '')
+    end
+    return data
+  end
+  #
+  #
+  #
+  def fix_case(data)
+    data_notags = strip_tags(data)
+    data_notags = data_notags.gsub(/[^a-zA-Z]/, '')
+    if data_notags.size < 5
+        return data
+    end
+    if /[a-z]/ =~ data_notags
+        return data
+    end
+    data = data.gsub(/(>|^)([^<]+?)(<|$)/s){
+        strip_single($1) +
+        fix_case_inner(strip_single($2)) +
+        strip_single($3)
+    }
+    return data
+  end
+  #
+  #
+  #
+  def fix_case_inner(data)
+    data = data.dup
+    data.downcase!
+    data.gsub!(/(^|[^\w\s\';,\\-])(\s*)([a-z])/){
+        strip_single("#{$1}#{$2}") + strip_single($3).upcase
+    }
+    return data
+  end
+  #
+  #
+  #
+  def validate_entities(data)
+    data = data.dup
+    # validate entities throughout the string
+    data.gsub!(%r!&([^&;]*)(?=(;|&|$))!){
+        check_entity(strip_single($1), strip_single($2))
+    }
+    # validate quotes outside of tags
+    data.gsub!(/(>|^)([^<]+?)(<|$)/s){
+        m1, m2, m3 = $1, $2, $3
+        strip_single(m1) +
+        strip_single(m2).gsub('\"', '&quot;') +
+        strip_single(m3)
+    }
+    return data
+  end
+  #
+  #
+  #
+  def check_entity(preamble, term)
+    if term != ';'
+        return '&amp;' + preamble
+    end
+    if is_valid_entity(preamble)
+        return '&' + preamble
+    end
+    return '&amp;' + preamble
+  end
+  #
+  #
+  #
+  def is_valid_entity(entity)
+    re = /^#([0-9]+)$/i
+    if md = re.match(entity)
+        if (md[1].to_i > 127)
+            return true
+        end
+        return allow_numbered_entities
+    end
+    if allowed_entities.include?(entity)
+        return true
+    end
+    return nil
+  end
+  # within attributes, we want to convert all hex/dec/url
+  # escape sequences into their raw characters so that we can
+  # check we don't get stray quotes/brackets inside strings.
+  def decode_entities(data)
+    data = data.dup
+    data.gsub!(/(&)#(\d+);?/){ decode_dec_entity($1, $2) }
+    data.gsub!(/(&)#x([0-9a-f]+);?/i){ decode_hex_entity($1, $2) }
+    data.gsub!(/(%)([0-9a-f]{2});?/i){ decode_hex_entity($1, $2) }
+    data = validate_entities(data)
+    return data
+  end
+  #
+  #
+  #
+  def decode_hex_entity(*m)
+    return decode_num_entity(m[1], m[2].to_i.to_s(16))
+  end
+  #
+  #
+  #
+  def decode_dec_entity(*m)
+    return decode_num_entity(m[1], m[2])
+  end
+  #
+  #
+  #
+  def decode_num_entity(orig_type, d)
+    d = d.to_i
+    d = 32 if d < 0   # space
+    # don't mess with high chars
+    if d > 127
+        return '%' + d.to_s(16) if orig_type == '%'
+        return "&#{d};" if orig_type == '&'
+    end
+    return escape_special_chars(d.chr)
+  end
+  #
+  #
+  #
+  def strip_single(data)
+    return data.gsub('\"', '"').gsub('\0', 0.chr)
+  end
+  # Certain characters have special significance in HTML, and
+  # should be represented by HTML entities if they are to
+  # preserve their meanings. This function returns a string
+  # with some of these conversions made; the translations made
+  # are those most useful for everyday web programming.
+  def escape_special_chars(data)
+    data = data.dup
+    data.gsub!( /&/n  , '&amp;' )
+    data.gsub!( /\"/n , '&quot;' )
+    data.gsub!( />/n  , '&gt;' )
+    data.gsub!( /</n  , '&lt;' )
+    data.gsub!( /'/   , '&#039;' )
+    return data
+  end
+end
+# Overload the standard String class for extra convienience.
+class String
+  def html_filter(*opts)
+    HtmlFilter.new(*opts).filter(self)
+  end
+end

data/meta/package ADDED Viewed

	@@ -0,0 +1 @@
1	+ htmlfilter

data/meta/project ADDED Viewed

	@@ -0,0 +1 @@
1	+ rubyworks

data/meta/title ADDED Viewed

	@@ -0,0 +1 @@
1	+ HTMLFilter

data/meta/version ADDED Viewed

	@@ -0,0 +1 @@
1	+ 1.0.0

data/test/test_cssfilter.rb ADDED Viewed

@@ -0,0 +1,35 @@
+require "test/unit"
+require "cssfilter"
+#require 'yaml'
+class TestCssFilter < Test::Unit::TestCase
+  def setup
+    @css = <<-END
+      * {
+        margin: 0;
+        height: 0;
+      }
+      body {
+        margin: 0;
+        height: 0;
+        background: url(http://xzy.org);
+      }
+      h1 {
+        trythis: url(http://here.org/fun.js);
+        font-size: 12pt;
+      }
+    END
+    @result = "* {\nmargin: 0;\nheight: 0;\n}\nbody {\nmargin: 0;\nheight: 0;\n}\nh1 {\ntrythis: url(http://here.org/fun.js);\nfont-size: 12pt;\n}"
+  end
+  def test_filter
+    cssfilter = CssFilter.new(:allowed_hosts=>["here.org"], :strip_whitespace => true)
+    csstree   = cssfilter.filter(@css)
+    assert_equal(@result, csstree.to_s)
+  end
+end

data/test/test_htmlfilter.rb ADDED Viewed

@@ -0,0 +1,70 @@
+require "test/unit"
+require "htmlfilter"
+class TestHtmlFilter < Test::Unit::TestCase
+  # core tests
+  def test_multiton_without_options
+    h1 = HtmlFilter.new
+    h2 = HtmlFilter.new
+    h3 = HtmlFilter.new( :strip_comments => false )
+    assert_equal( h1.object_id, h2.object_id )
+    assert_not_equal( h1.object_id, h3.object_id )
+  end
+  def test_multiton_with_options
+    h1 = HtmlFilter.new( :strip_comments => false )
+    h2 = HtmlFilter.new( :strip_comments => false )
+    h3 = HtmlFilter.new
+    assert_equal( h1.object_id, h2.object_id )
+    assert_not_equal( h1.object_id, h3.object_id )
+  end
+  def test_strip_single
+    hf = HtmlFilter.new
+    assert_equal( '"', hf.send(:strip_single,'\"') )
+    assert_equal( "\000", hf.send(:strip_single,'\0') )
+  end
+  # functional tests
+  def assert_filter(filtered, original)
+    assert_equal(filtered, original.html_filter)
+  end
+  def test_fix_quotes
+    assert_filter '<img src="foo.jpg" />', "<img src=\"foo.jpg />"
+  end
+  def test_basics
+    assert_filter '', ''
+    assert_filter 'hello', 'hello'
+  end
+  def test_balancing_tags
+    assert_filter "<b>hello</b>", "<<b>hello</b>"
+    assert_filter "<b>hello</b>", "<b>>hello</b>"
+    assert_filter "<b>hello</b>", "<b>hello<</b>"
+    assert_filter "<b>hello</b>", "<b>hello</b>>"
+    assert_filter "", "<>"
+  end
+  def test_tag_completion
+    assert_filter "hello", "hello<b>"
+    assert_filter "<b>hello</b>", "<b>hello"
+    assert_filter "hello<b>world</b>", "hello<b>world"
+    assert_filter "hello", "hello</b>"
+    assert_filter "hello", "hello<b/>"
+    assert_filter "hello<b>world</b>", "hello<b/>world"
+    assert_filter "<b><b><b>hello</b></b></b>", "<b><b><b>hello"
+    assert_filter "", "</b><b>"
+  end
+  def test_end_slashes
+    assert_filter '<img />', '<img>'
+    assert_filter '<img />', '<img/>'
+    assert_filter '', '<b/></b>'
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,75 @@
+--- !ruby/object:Gem::Specification
+name: htmlfilter
+version: !ruby/object:Gem::Version
+  version: 1.0.0
+platform: ruby
+authors: []
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2009-09-22 00:00:00 -04:00
+default_executable:
+dependencies: []
+description: HTML Filter library can be used to sanitize and sterilize HTML. A good idea if you let users submit HTML in comments, for instance.   This library also include CssFilter. The CssFilter class will clean-up a cascading style sheet. It can be used to remove whitespace and most importantly remove urls.
+email:
+executables: []
+extensions: []
+extra_rdoc_files:
+- Rakefile
+- Manifest.txt
+- TODO
+- README.rdoc
+- History.rdoc
+files:
+- lib/cssfilter.rb
+- lib/htmlfilter/multiton.rb
+- lib/htmlfilter.rb
+- meta/package
+- meta/project
+- meta/title
+- meta/version
+- test/test_cssfilter.rb
+- test/test_htmlfilter.rb
+- Rakefile
+- Manifest.txt
+- TODO
+- README.rdoc
+- History.rdoc
+has_rdoc: true
+homepage:
+licenses: []
+post_install_message:
+rdoc_options:
+- --inline-source
+- --title
+- htmlfilter api
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project: htmlfilter
+rubygems_version: 1.3.5
+signing_key:
+specification_version: 3
+summary: HTML Filter library can be used to sanitize and sterilize HTML.
+test_files:
+- test/test_cssfilter.rb
+- test/test_htmlfilter.rb