RubyGems - sanitizer - Versions diffs - 0.1.1 → 0.1.5 - Mend

sanitizer 0.1.1 → 0.1.5

Files changed (17) hide show

data/lib/sanitizer/htmlentries.rb +14 -0
data/lib/sanitizer/sanitizer.rb +82 -0
data/lib/sanitizer/version.rb +3 -0
data/lib/{whitelist.rb → sanitizer/whitelist.rb} +0 -0
data/lib/sanitizer.rb +9 -100
data/spec/sanitizer_spec.rb +27 -6
metadata +30 -54
data/.document +0 -5
data/.rspec +0 -1
data/Gemfile +0 -13
data/LICENSE.txt +0 -20
data/README +0 -19
data/README.rdoc +0 -19
data/Rakefile +0 -49
data/VERSION +0 -1
data/spec/spec_helper.rb +0 -12
data/tags +0 -0

data/lib/sanitizer/htmlentries.rb ADDED Viewed

@@ -0,0 +1,14 @@
+class HTMLEntities
+  class Encoder #:nodoc:
+    def basic_entity_regexp
+      @basic_entity_regexp ||= (
+        case @flavor
+        when /^html/
+          /[<>"]|(\&(?!\w))/
+        else
+          /[<>'"]|(\&(?!\w))/
+        end
+      )
+    end
+  end
+end

data/lib/sanitizer/sanitizer.rb ADDED Viewed

@@ -0,0 +1,82 @@
+# encoding: utf-8
+module Sanitizer
+  # Shared HTMLEntities instance used by html_encode and html_decode
+  @@htmle = HTMLEntities.new
+  # Module-level methods (called as Sanitizer.method_name)
+  class << self
+    def sanitize(text)
+      text = strip_tags(text)
+      text = clean_spaces(text)
+      text = html_encode(text)
+      text
+    end
+    def clean_spaces(text)
+      output = text.dup
+      output.gsub!(/\s+/, " ")
+      output
+    end
+    def strip_comments(text)
+      output = text.dup
+      output.gsub!(/(\<\!\-\-\b*[^\-\-\>]*.*?\-\-\>)/ui, "")
+      output.gsub!(/(\&lt;\s?\!--.*\s?--\&gt;)/uim, "")
+      output
+    end
+    # Remove all <script>, <link> and <style> tags, including their &lt;...&gt; entity-escaped forms
+    def strip_disallowed_tags(text)
+      output = text
+      output.gsub!(/(<script\s*.*>.*<\/script>)/uim, "")
+      output.gsub!(/(<script\s*.*\/?>)/uim, "")
+      output.gsub!(/(<link\s*.*\/?>)/uim, "")
+      output.gsub!(/(<style\s*.*>.*<\/style>)/uim, "")
+      # Stripping html entities too
+      output.gsub!(/(\&lt;script\s*.*\&gt;.*\&lt;\/script\&gt;)/uim, "")
+      output.gsub!(/(\&lt;script\s*.*\/?\&gt;)/uim, "")
+      output.gsub!(/(\&lt;link\s*.*\/?\&gt;)/uim, "")
+      output.gsub!(/(\&lt;style\s*.*\&gt;.*\&lt;\/style\&gt;)/uim, "")
+      output
+    end
+    # Remove all tags from text, or only the given tags when any are passed
+    def strip_tags(text, *tags)
+      output = text.dup
+      if tags.empty? # clear all tags by default
+        output.gsub!(/<\/?[^>]*>/uim, "")
+        output.gsub!(/\&lt;\/?[^\&gt;]*\&gt;/uim, "")
+      else # clean only selected tags
+        strip = tags.map do |tag|
+          %Q{(#{tag})}
+        end.join('|')
+        output.gsub!(/<\/?(#{strip})[^>]*>/uim, "")
+        output.gsub!(/\&lt;\/?(#{strip})[^\&gt;]*\&gt;/uim, "")
+      end
+      output
+    end
+    # Convert invalid chars to HTML entities
+    def html_encode(text)
+      text = text.to_s
+      @@htmle.encode(text, :named)
+    end
+    # Convert HTML entities back to their characters
+    def html_decode(text)
+      text = text.to_s
+      @@htmle.decode(text, :named)
+    end
+    # Some feeds return "escaped" tags inside the content (e.g. &lt;br/&gt;)
+    # This method should be used after stripping and sanitizing, so that those tags are not displayed as content
+    def entities_to_chars(text)
+      output = text.dup
+      output.gsub!(/\&lt;/uim, "<")
+      output.gsub!(/\&gt;/uim, ">")
+      output
+    end
+  end # self
+end

data/lib/sanitizer/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module Sanitizer
+  VERSION = "0.1.5"
+end

data/lib/{whitelist.rb → sanitizer/whitelist.rb} RENAMED Viewed

File without changes

data/lib/sanitizer.rb CHANGED Viewed

@@ -1,100 +1,9 @@
-require 'whitelist'
-module Sanitizer
-  class << self
-    def sanitize(text)
-      new_text = text
-      sanitize!(new_text)
-    end
-    def sanitize!(text)
-      strip_tags(text)
-      clean_spaces(text)
-      clean_ampersand(text)
-      text
-    end
-    def clean_spaces(text)
-      text.gsub!(/\s+/, " ")
-      text
-    end
-    def clean_ampersand(text)
-      text.gsub!(/\&[^\w\;]+/, "&amp; ")
-      text
-    end
-    def strip_comments(text)
-      text.gsub!(/(\<\!\-\-\b*[^\-\-\>]*.*?\-\-\>)/ui, "")
-      text.gsub!(/(\&lt;\s?\!--.*\s?--\&gt;)/uim, "")
-      text
-    end
-    # Remove all <script> and <style> tags
-    def strip_disallowed_tags(text)
-      text.gsub!(/(<script\s*.*>.*<\/script>)/uim, "")
-      text.gsub!(/(<script\s*.*\/?>)/uim, "")
-      text.gsub!(/(<link\s*.*\/?>)/uim, "")
-      text.gsub!(/(<style\s*.*>.*<\/style>)/uim, "")
-      # Stripping html entities too
-      text.gsub!(/(\&lt;script\s*.*\&gt;.*\&lt;\/script\&gt;)/uim, "")
-      text.gsub!(/(\&lt;script\s*.*\/?\&gt;)/uim, "")
-      text.gsub!(/(\&lt;link\s*.*\/?\&gt;)/uim, "")
-      text.gsub!(/(\&lt;style\s*.*\&gt;.*\&lt;\/style\&gt;)/uim, "")
-      text
-    end
-    # Remove all tags from from text
-    def strip_tags(text, *tags)
-      if tags.empty? # clear all tags by default
-        text.gsub!(/<\/?[^>]*>/uim, "")
-        text.gsub!(/\&lt;\/?[^\&gt;]*\&gt;/uim, "")
-      else # clean only selected tags
-        strip = tags.map do |tag|
-          %Q{(#{tag})}
-        end.join('|')
-        text.gsub!(/<\/?(#{strip})[^>]*>/uim, "")
-        text.gsub!(/\&lt;\/?(#{strip})[^\&gt;]*\&gt;/uim, "")
-      end
-      text
-    end
-    # Alguns feeds retornam tags "escapadas" dentro do conteúdo (ex: &lt;br/&gt;)
-    # Este método deve ser utilizado após o stripping e sanitização, para não deixar que essas tags sejam exibidas como conteúdo
-    def entities_to_chars(text)
-      text.gsub!(/\&lt;/uim, "<")
-      text.gsub!(/\&gt;/uim, ">")
-      text
-    end
-    #  this liftend nearly verbatim from html5
-    def sanitize_css(style)
-      # disallow urls
-      style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/uim, ' ')
-      # gauntlet
-      return '' unless style =~ /^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/uim
-      return '' unless style =~ /^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$/uim
-      clean = []
-      style.scan(/([-\w]+)\s*:\s*([^:;]*)/uim) do |prop, val|
-        next if val.empty?
-        prop.downcase!
-        if HashedWhiteList::ALLOWED_CSS_PROPERTIES[prop]
-          clean << "#{prop}: #{val};"
-        elsif %w[background border margin padding].include?(prop.split('-')[0])
-          clean << "#{prop}: #{val};" unless val.split().any? do |keyword|
-            HashedWhiteList::ALLOWED_CSS_KEYWORDS[keyword].nil? and
-              keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/uim
-          end
-        elsif HashedWhiteList::ALLOWED_SVG_PROPERTIES[prop]
-          clean << "#{prop}: #{val};"
-        end
-      end
-      style = clean.join(' ')
-    end
-  end # self
-end
+# encoding: utf-8
+require 'rubygems'
+require 'htmlentities'
+# Local Libs
+$:.unshift(File.dirname(__FILE__) + '/../../lib')
+require 'sanitizer/whitelist'
+require 'sanitizer/htmlentries'
+require 'sanitizer/sanitizer'

data/spec/sanitizer_spec.rb CHANGED Viewed

@@ -2,14 +2,21 @@ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
 describe Sanitizer do
-  describe "sanitize" do
+  describe "sanitize" do
     it "should strip all tags" do
       html = "<div><p>Oi <b>como</b> <a href='/xxx/'>Vai</a></p><!-- s --></div>"
       output = Sanitizer.sanitize(html)
       output.should == 'Oi como Vai'
     end
+    it "should still clean even after multiple sanitizes" do
+      html = "<div>Eu & você <b>como</b> <a href='/xxx/'>Vai</a></p><!-- s --></div>"
+      output = Sanitizer.sanitize(html)
+      output = Sanitizer.sanitize(output)
+      output = Sanitizer.sanitize(output)
+      output.should == 'Eu &amp; voc&ecirc; como Vai'
+    end
     it "should clean spaces and tags" do
       html = "<p>Oi <b>como</b>
     Vai</p>"
@@ -20,20 +27,34 @@ describe Sanitizer do
     it "should clean '&' entries" do
       html = "Eu & você"
       output = Sanitizer.sanitize(html)
-      output.should == 'Eu &amp; você'
+      output.should == "Eu &amp; voc&ecirc;"
     end
     it "should not remove valid entries" do
       html = "Eu &amp; você"
       output = Sanitizer.sanitize(html)
-      output.should == 'Eu &amp; você'
+      output.should == "Eu &amp; voc&ecirc;"
     end
   end
-  describe "strip_tags" do
+  describe "html_encode" do
+    it "should convert invalid chars to html entries" do
+      text = "João foi caçar"
+      output = Sanitizer.html_encode(text)
+      output.should == "Jo&atilde;o foi ca&ccedil;ar"
+    end
+    it "should sanitize HTML tags" do
+      text = "<p>João <b>foi</b> caçar</p>"
+      output = Sanitizer.html_encode(text)
+      output.should == "&lt;p&gt;Jo&atilde;o &lt;b&gt;foi&lt;/b&gt; ca&ccedil;ar&lt;/p&gt;"
+    end
+  end
+  describe "strip_tags" do
     it "should remove only <b> tags" do
-       html = "<p>Oi <b>como</b> <a href='/xxx/'>Vai</a></p><!-- s -->"
+      html = "<p>Oi <b>como</b> <a href='/xxx/'>Vai</a></p><!-- s -->"
       output = Sanitizer.strip_tags(html, 'b')
       output.should == "<p>Oi como <a href='/xxx/'>Vai</a></p><!-- s -->"
     end

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: sanitizer
 version: !ruby/object:Gem::Version
-  hash: 25
+  hash: 17
   prerelease:
   segments:
   - 0
   - 1
-  - 1
-  version: 0.1.1
+  - 5
+  version: 0.1.5
 platform: ruby
 authors:
 - Marcelo Eden
@@ -15,12 +15,12 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-05-06 00:00:00 -03:00
+date: 2011-05-11 00:00:00 -03:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
+  name: rspec
   prerelease: false
-  type: :development
   requirement: &id001 !ruby/object:Gem::Requirement
     none: false
     requirements:
@@ -32,82 +32,58 @@ dependencies:
         - 3
         - 0
         version: 2.3.0
-  name: rspec
+  type: :development
   version_requirements: *id001
 - !ruby/object:Gem::Dependency
+  name: ruby-debug
   prerelease: false
-  type: :development
   requirement: &id002 !ruby/object:Gem::Requirement
     none: false
     requirements:
-    - - ~>
+    - - ">="
       - !ruby/object:Gem::Version
-        hash: 23
+        hash: 3
         segments:
-        - 1
-        - 0
         - 0
-        version: 1.0.0
-  name: bundler
+        version: "0"
+  type: :development
   version_requirements: *id002
 - !ruby/object:Gem::Dependency
+  name: htmlentities
   prerelease: false
-  type: :development
   requirement: &id003 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
-        hash: 15
+        hash: 51
         segments:
-        - 1
-        - 6
+        - 4
+        - 3
         - 0
-        version: 1.6.0
-  name: jeweler
+        version: 4.3.0
+  type: :runtime
   version_requirements: *id003
-- !ruby/object:Gem::Dependency
-  prerelease: false
-  type: :development
-  requirement: &id004 !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        hash: 3
-        segments:
-        - 0
-        version: "0"
-  name: rcov
-  version_requirements: *id004
 description: Sanitizer.clean(text)
-email: edendroid@gmail.com
+email:
+- edendroid@gmail.com
 executables: []
 extensions: []
-extra_rdoc_files:
-- LICENSE.txt
-- README
-- README.rdoc
+extra_rdoc_files: []
 files:
-- .document
-- .rspec
-- Gemfile
-- LICENSE.txt
-- README
-- README.rdoc
-- Rakefile
-- VERSION
+- lib/sanitizer/htmlentries.rb
+- lib/sanitizer/sanitizer.rb
+- lib/sanitizer/version.rb
+- lib/sanitizer/whitelist.rb
 - lib/sanitizer.rb
-- lib/whitelist.rb
 - spec/sanitizer_spec.rb
-- spec/spec_helper.rb
-- tags
 has_rdoc: true
 homepage: http://github.com/3den/sanitizer
-licenses:
-- MIT
+licenses: []
 post_install_message:
 rdoc_options: []
@@ -133,10 +109,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: "0"
 requirements: []
-rubyforge_project:
+rubyforge_project: sanitizer
 rubygems_version: 1.6.2
 signing_key:
 specification_version: 3
 summary: The simplest string cleaner ever made
-test_files: []
+test_files:
+- spec/sanitizer_spec.rb

data/.document DELETED Viewed

@@ -1,5 +0,0 @@
-lib/**/*.rb
-bin/*
--
-features/**/*.feature
-LICENSE.txt

data/.rspec DELETED Viewed

	@@ -1 +0,0 @@
1	- --color

data/Gemfile DELETED Viewed

@@ -1,13 +0,0 @@
-source "http://rubygems.org"
-# Add dependencies required to use your gem here.
-# Example:
-#   gem "activesupport", ">= 2.3.5"
-# Add dependencies to develop your gem here.
-# Include everything needed to run rake, tests, features, etc.
-group :development do
-  gem "rspec", "~> 2.3.0"
-  gem "bundler", "~> 1.0.0"
-  gem "jeweler", "~> 1.6.0"
-  gem "rcov", ">= 0"
-end

data/LICENSE.txt DELETED Viewed

@@ -1,20 +0,0 @@
-Copyright (c) 2011 Marcelo Eden
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README DELETED Viewed

@@ -1,19 +0,0 @@
-= sanitizer
-Sanitizer is a very simple and fast string cleaner for ruby, it uses only simple regular
-expressions.
-== Contributing to sanitizer
-* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
-* Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it
-* Fork the project
-* Start a feature/bugfix branch
-* Commit and push until you are happy with your contribution
-* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
-* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
-== Copyright
-Copyright (c) 2011 Marcelo Eden. See LICENSE.txt for
-further details.

data/README.rdoc DELETED Viewed

@@ -1,19 +0,0 @@
-= sanitizer
-Description goes here.
-== Contributing to sanitizer
-* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
-* Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it
-* Fork the project
-* Start a feature/bugfix branch
-* Commit and push until you are happy with your contribution
-* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
-* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
-== Copyright
-Copyright (c) 2011 Marcelo Eden. See LICENSE.txt for
-further details.

data/Rakefile DELETED Viewed

@@ -1,49 +0,0 @@
-# encoding: utf-8
-require 'rubygems'
-require 'bundler'
-begin
-  Bundler.setup(:default, :development)
-rescue Bundler::BundlerError => e
-  $stderr.puts e.message
-  $stderr.puts "Run `bundle install` to install missing gems"
-  exit e.status_code
-end
-require 'rake'
-require 'jeweler'
-Jeweler::Tasks.new do |gem|
-  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
-  gem.name = "sanitizer"
-  gem.homepage = "http://github.com/3den/sanitizer"
-  gem.license = "MIT"
-  gem.summary = %Q{The simplest string cleaner ever made}
-  gem.description = %Q{Sanitizer.clean(text)}
-  gem.email = "edendroid@gmail.com"
-  gem.authors = ["Marcelo Eden"]
-  # dependencies defined in Gemfile
-end
-Jeweler::RubygemsDotOrgTasks.new
-require 'rspec/core'
-require 'rspec/core/rake_task'
-RSpec::Core::RakeTask.new(:spec) do |spec|
-  spec.pattern = FileList['spec/**/*_spec.rb']
-end
-RSpec::Core::RakeTask.new(:rcov) do |spec|
-  spec.pattern = 'spec/**/*_spec.rb'
-  spec.rcov = true
-end
-task :default => :spec
-require 'rake/rdoctask'
-Rake::RDocTask.new do |rdoc|
-  version = File.exist?('VERSION') ? File.read('VERSION') : ""
-  rdoc.rdoc_dir = 'rdoc'
-  rdoc.title = "sanitizer #{version}"
-  rdoc.rdoc_files.include('README*')
-  rdoc.rdoc_files.include('lib/**/*.rb')
-end

data/VERSION DELETED Viewed

	@@ -1 +0,0 @@
1	- 0.1.1

data/spec/spec_helper.rb DELETED Viewed

@@ -1,12 +0,0 @@
-$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
-$LOAD_PATH.unshift(File.dirname(__FILE__))
-require 'rspec'
-require 'sanitizer'
-# Requires supporting files with custom matchers and macros, etc,
-# in ./support/ and its subdirectories.
-Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
-RSpec.configure do |config|
-end

data/tags DELETED Viewed

File without changes