sanitizer 0.1.1 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,14 @@
1
+ class HTMLEntities
2
+ class Encoder #:nodoc:
3
+ def basic_entity_regexp
4
+ @basic_entity_regexp ||= (
5
+ case @flavor
6
+ when /^html/
7
+ /[<>"]|(\&(?!\w))/
8
+ else
9
+ /[<>'"]|(\&(?!\w))/
10
+ end
11
+ )
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,82 @@
1
+ # encoding: utf-8
2
+ module Sanitizer
3
+ # Shared HTMLEntities instance used by html_encode and html_decode
4
+ @@htmle = HTMLEntities.new
5
+
6
+ # Module-level (singleton) methods
7
+ class << self
8
+ def sanitize(text)
9
+ text = strip_tags(text)
10
+ text = clean_spaces(text)
11
+ text = html_encode(text)
12
+ text
13
+ end
14
+
15
+ def clean_spaces(text)
16
+ output = text.dup
17
+ output.gsub!(/\s+/, " ")
18
+ output
19
+ end
20
+
21
+ def strip_comments(text)
22
+ output = text.dup
23
+ output.gsub!(/(\<\!\-\-\b*[^\-\-\>]*.*?\-\-\>)/ui, "")
24
+ output.gsub!(/(\&lt;\s?\!--.*\s?--\&gt;)/uim, "")
25
+ output
26
+ end
27
+
28
+ # Remove all <script>, <link> and <style> tags, including their entity-escaped (&lt;...&gt;) forms
29
+ def strip_disallowed_tags(text)
30
+ output = text
31
+ output.gsub!(/(<script\s*.*>.*<\/script>)/uim, "")
32
+ output.gsub!(/(<script\s*.*\/?>)/uim, "")
33
+ output.gsub!(/(<link\s*.*\/?>)/uim, "")
34
+ output.gsub!(/(<style\s*.*>.*<\/style>)/uim, "")
35
+
36
+ # Stripping html entities too
37
+ output.gsub!(/(\&lt;script\s*.*\&gt;.*\&lt;\/script\&gt;)/uim, "")
38
+ output.gsub!(/(\&lt;script\s*.*\/?\&gt;)/uim, "")
39
+ output.gsub!(/(\&lt;link\s*.*\/?\&gt;)/uim, "")
40
+ output.gsub!(/(\&lt;style\s*.*\&gt;.*\&lt;\/style\&gt;)/uim, "")
41
+ output
42
+ end
43
+
44
+ # Remove all tags from text, or only the given tags when any are passed
45
+ def strip_tags(text, *tags)
46
+ output = text.dup
47
+ if tags.empty? # clear all tags by default
48
+ output.gsub!(/<\/?[^>]*>/uim, "")
49
+ output.gsub!(/\&lt;\/?[^\&gt;]*\&gt;/uim, "")
50
+ else # clean only selected tags
51
+ strip = tags.map do |tag|
52
+ %Q{(#{tag})}
53
+ end.join('|')
54
+ output.gsub!(/<\/?(#{strip})[^>]*>/uim, "")
55
+ output.gsub!(/\&lt;\/?(#{strip})[^\&gt;]*\&gt;/uim, "")
56
+ end
57
+ output
58
+ end
59
+
60
+ # Convert special characters to named HTML entities
61
+ def html_encode(text)
62
+ text = text.to_s
63
+ @@htmle.encode(text, :named)
64
+ end
65
+
66
+ # Convert HTML entities back to their characters
67
+ def html_decode(text)
68
+ text = text.to_s
69
+ @@htmle.decode(text, :named)
70
+ end
71
+
72
+ # Some feeds return "escaped" tags inside the content (e.g. &lt;br/&gt;).
73
+ # This method should be used after stripping and sanitizing, so that those tags are not displayed as content.
74
+ def entities_to_chars(text)
75
+ output = text.dup
76
+ output.gsub!(/\&lt;/uim, "<")
77
+ output.gsub!(/\&gt;/uim, ">")
78
+ output
79
+ end
80
+
81
+ end # self
82
+ end
@@ -0,0 +1,3 @@
1
+ module Sanitizer
2
+ VERSION = "0.1.5"
3
+ end
File without changes
data/lib/sanitizer.rb CHANGED
@@ -1,100 +1,9 @@
1
- require 'whitelist'
2
-
3
- module Sanitizer
4
-
5
- class << self
6
- def sanitize(text)
7
- new_text = text
8
- sanitize!(new_text)
9
- end
10
-
11
- def sanitize!(text)
12
- strip_tags(text)
13
- clean_spaces(text)
14
- clean_ampersand(text)
15
- text
16
- end
17
-
18
- def clean_spaces(text)
19
- text.gsub!(/\s+/, " ")
20
- text
21
- end
22
-
23
- def clean_ampersand(text)
24
- text.gsub!(/\&[^\w\;]+/, "&amp; ")
25
- text
26
- end
27
-
28
- def strip_comments(text)
29
- text.gsub!(/(\<\!\-\-\b*[^\-\-\>]*.*?\-\-\>)/ui, "")
30
- text.gsub!(/(\&lt;\s?\!--.*\s?--\&gt;)/uim, "")
31
- text
32
- end
33
-
34
- # Remove all <script> and <style> tags
35
- def strip_disallowed_tags(text)
36
- text.gsub!(/(<script\s*.*>.*<\/script>)/uim, "")
37
- text.gsub!(/(<script\s*.*\/?>)/uim, "")
38
- text.gsub!(/(<link\s*.*\/?>)/uim, "")
39
- text.gsub!(/(<style\s*.*>.*<\/style>)/uim, "")
40
-
41
- # Stripping html entities too
42
- text.gsub!(/(\&lt;script\s*.*\&gt;.*\&lt;\/script\&gt;)/uim, "")
43
- text.gsub!(/(\&lt;script\s*.*\/?\&gt;)/uim, "")
44
- text.gsub!(/(\&lt;link\s*.*\/?\&gt;)/uim, "")
45
- text.gsub!(/(\&lt;style\s*.*\&gt;.*\&lt;\/style\&gt;)/uim, "")
46
- text
47
- end
48
-
49
- # Remove all tags from from text
50
- def strip_tags(text, *tags)
51
- if tags.empty? # clear all tags by default
52
- text.gsub!(/<\/?[^>]*>/uim, "")
53
- text.gsub!(/\&lt;\/?[^\&gt;]*\&gt;/uim, "")
54
- else # clean only selected tags
55
- strip = tags.map do |tag|
56
- %Q{(#{tag})}
57
- end.join('|')
58
- text.gsub!(/<\/?(#{strip})[^>]*>/uim, "")
59
- text.gsub!(/\&lt;\/?(#{strip})[^\&gt;]*\&gt;/uim, "")
60
- end
61
- text
62
- end
63
-
64
- # Alguns feeds retornam tags "escapadas" dentro do conteúdo (ex: &lt;br/&gt;)
65
- # Este método deve ser utilizado após o stripping e sanitização, para não deixar que essas tags sejam exibidas como conteúdo
66
- def entities_to_chars(text)
67
- text.gsub!(/\&lt;/uim, "<")
68
- text.gsub!(/\&gt;/uim, ">")
69
- text
70
- end
71
-
72
- # this liftend nearly verbatim from html5
73
- def sanitize_css(style)
74
- # disallow urls
75
- style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/uim, ' ')
76
-
77
- # gauntlet
78
- return '' unless style =~ /^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/uim
79
- return '' unless style =~ /^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$/uim
80
-
81
- clean = []
82
- style.scan(/([-\w]+)\s*:\s*([^:;]*)/uim) do |prop, val|
83
- next if val.empty?
84
- prop.downcase!
85
- if HashedWhiteList::ALLOWED_CSS_PROPERTIES[prop]
86
- clean << "#{prop}: #{val};"
87
- elsif %w[background border margin padding].include?(prop.split('-')[0])
88
- clean << "#{prop}: #{val};" unless val.split().any? do |keyword|
89
- HashedWhiteList::ALLOWED_CSS_KEYWORDS[keyword].nil? and
90
- keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/uim
91
- end
92
- elsif HashedWhiteList::ALLOWED_SVG_PROPERTIES[prop]
93
- clean << "#{prop}: #{val};"
94
- end
95
- end
96
-
97
- style = clean.join(' ')
98
- end
99
- end # self
100
- end
1
+ # encoding: utf-8
2
+ require 'rubygems'
3
+ require 'htmlentities'
4
+
5
+ # Local Libs
6
+ $:.unshift(File.dirname(__FILE__) + '/../../lib')
7
+ require 'sanitizer/whitelist'
8
+ require 'sanitizer/htmlentries'
9
+ require 'sanitizer/sanitizer'
@@ -2,14 +2,21 @@ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
2
 
3
3
  describe Sanitizer do
4
4
 
5
- describe "sanitize" do
6
-
5
+ describe "sanitize" do
7
6
  it "should strip all tags" do
8
7
  html = "<div><p>Oi <b>como</b> <a href='/xxx/'>Vai</a></p><!-- s --></div>"
9
8
  output = Sanitizer.sanitize(html)
10
9
  output.should == 'Oi como Vai'
11
10
  end
12
11
 
12
+ it "should still clean even after multiple sanitizes" do
13
+ html = "<div>Eu & você <b>como</b> <a href='/xxx/'>Vai</a></p><!-- s --></div>"
14
+ output = Sanitizer.sanitize(html)
15
+ output = Sanitizer.sanitize(output)
16
+ output = Sanitizer.sanitize(output)
17
+ output.should == 'Eu &amp; voc&ecirc; como Vai'
18
+ end
19
+
13
20
  it "should clean spaces and tags" do
14
21
  html = "<p>Oi <b>como</b>
15
22
  Vai</p>"
@@ -20,20 +27,34 @@ describe Sanitizer do
20
27
  it "should clean '&' entries" do
21
28
  html = "Eu & você"
22
29
  output = Sanitizer.sanitize(html)
23
- output.should == 'Eu &amp; você'
30
+ output.should == "Eu &amp; voc&ecirc;"
24
31
  end
25
32
 
26
33
  it "should not remove valid entries" do
27
34
  html = "Eu &amp; você"
28
35
  output = Sanitizer.sanitize(html)
29
- output.should == 'Eu &amp; você'
36
+ output.should == "Eu &amp; voc&ecirc;"
30
37
  end
31
38
  end
32
39
 
33
- describe "strip_tags" do
40
+ describe "html_encode" do
41
+
42
+ it "should convert invalid chars to html entries" do
43
+ text = "João foi caçar"
44
+ output = Sanitizer.html_encode(text)
45
+ output.should == "Jo&atilde;o foi ca&ccedil;ar"
46
+ end
34
47
 
48
+ it "should sanitize HTML tags" do
49
+ text = "<p>João <b>foi</b> caçar</p>"
50
+ output = Sanitizer.html_encode(text)
51
+ output.should == "&lt;p&gt;Jo&atilde;o &lt;b&gt;foi&lt;/b&gt; ca&ccedil;ar&lt;/p&gt;"
52
+ end
53
+ end
54
+
55
+ describe "strip_tags" do
35
56
  it "should remove only <b> tags" do
36
- html = "<p>Oi <b>como</b> <a href='/xxx/'>Vai</a></p><!-- s -->"
57
+ html = "<p>Oi <b>como</b> <a href='/xxx/'>Vai</a></p><!-- s -->"
37
58
  output = Sanitizer.strip_tags(html, 'b')
38
59
  output.should == "<p>Oi como <a href='/xxx/'>Vai</a></p><!-- s -->"
39
60
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sanitizer
3
3
  version: !ruby/object:Gem::Version
4
- hash: 25
4
+ hash: 17
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 1
9
- - 1
10
- version: 0.1.1
9
+ - 5
10
+ version: 0.1.5
11
11
  platform: ruby
12
12
  authors:
13
13
  - Marcelo Eden
@@ -15,12 +15,12 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-05-06 00:00:00 -03:00
18
+ date: 2011-05-11 00:00:00 -03:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
22
+ name: rspec
22
23
  prerelease: false
23
- type: :development
24
24
  requirement: &id001 !ruby/object:Gem::Requirement
25
25
  none: false
26
26
  requirements:
@@ -32,82 +32,58 @@ dependencies:
32
32
  - 3
33
33
  - 0
34
34
  version: 2.3.0
35
- name: rspec
35
+ type: :development
36
36
  version_requirements: *id001
37
37
  - !ruby/object:Gem::Dependency
38
+ name: ruby-debug
38
39
  prerelease: false
39
- type: :development
40
40
  requirement: &id002 !ruby/object:Gem::Requirement
41
41
  none: false
42
42
  requirements:
43
- - - ~>
43
+ - - ">="
44
44
  - !ruby/object:Gem::Version
45
- hash: 23
45
+ hash: 3
46
46
  segments:
47
- - 1
48
- - 0
49
47
  - 0
50
- version: 1.0.0
51
- name: bundler
48
+ version: "0"
49
+ type: :development
52
50
  version_requirements: *id002
53
51
  - !ruby/object:Gem::Dependency
52
+ name: htmlentities
54
53
  prerelease: false
55
- type: :development
56
54
  requirement: &id003 !ruby/object:Gem::Requirement
57
55
  none: false
58
56
  requirements:
59
57
  - - ~>
60
58
  - !ruby/object:Gem::Version
61
- hash: 15
59
+ hash: 51
62
60
  segments:
63
- - 1
64
- - 6
61
+ - 4
62
+ - 3
65
63
  - 0
66
- version: 1.6.0
67
- name: jeweler
64
+ version: 4.3.0
65
+ type: :runtime
68
66
  version_requirements: *id003
69
- - !ruby/object:Gem::Dependency
70
- prerelease: false
71
- type: :development
72
- requirement: &id004 !ruby/object:Gem::Requirement
73
- none: false
74
- requirements:
75
- - - ">="
76
- - !ruby/object:Gem::Version
77
- hash: 3
78
- segments:
79
- - 0
80
- version: "0"
81
- name: rcov
82
- version_requirements: *id004
83
67
  description: Sanitizer.clean(text)
84
- email: edendroid@gmail.com
68
+ email:
69
+ - edendroid@gmail.com
85
70
  executables: []
86
71
 
87
72
  extensions: []
88
73
 
89
- extra_rdoc_files:
90
- - LICENSE.txt
91
- - README
92
- - README.rdoc
74
+ extra_rdoc_files: []
75
+
93
76
  files:
94
- - .document
95
- - .rspec
96
- - Gemfile
97
- - LICENSE.txt
98
- - README
99
- - README.rdoc
100
- - Rakefile
101
- - VERSION
77
+ - lib/sanitizer/htmlentries.rb
78
+ - lib/sanitizer/sanitizer.rb
79
+ - lib/sanitizer/version.rb
80
+ - lib/sanitizer/whitelist.rb
102
81
  - lib/sanitizer.rb
103
- - lib/whitelist.rb
104
82
  - spec/sanitizer_spec.rb
105
- - spec/spec_helper.rb
106
- - tags
107
83
  has_rdoc: true
108
84
  homepage: http://github.com/3den/sanitizer
109
- licenses:
110
- - MIT
85
+ licenses: []
86
+
111
87
  post_install_message:
112
88
  rdoc_options: []
113
89
 
@@ -133,10 +109,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
133
109
  version: "0"
134
110
  requirements: []
135
111
 
136
- rubyforge_project:
112
+ rubyforge_project: sanitizer
137
113
  rubygems_version: 1.6.2
138
114
  signing_key:
139
115
  specification_version: 3
140
116
  summary: The simplest string cleaner ever made
141
- test_files: []
142
-
117
+ test_files:
118
+ - spec/sanitizer_spec.rb
data/.document DELETED
@@ -1,5 +0,0 @@
1
- lib/**/*.rb
2
- bin/*
3
- -
4
- features/**/*.feature
5
- LICENSE.txt
data/.rspec DELETED
@@ -1 +0,0 @@
1
- --color
data/Gemfile DELETED
@@ -1,13 +0,0 @@
1
- source "http://rubygems.org"
2
- # Add dependencies required to use your gem here.
3
- # Example:
4
- # gem "activesupport", ">= 2.3.5"
5
-
6
- # Add dependencies to develop your gem here.
7
- # Include everything needed to run rake, tests, features, etc.
8
- group :development do
9
- gem "rspec", "~> 2.3.0"
10
- gem "bundler", "~> 1.0.0"
11
- gem "jeweler", "~> 1.6.0"
12
- gem "rcov", ">= 0"
13
- end
data/LICENSE.txt DELETED
@@ -1,20 +0,0 @@
1
- Copyright (c) 2011 Marcelo Eden
2
-
3
- Permission is hereby granted, free of charge, to any person obtaining
4
- a copy of this software and associated documentation files (the
5
- "Software"), to deal in the Software without restriction, including
6
- without limitation the rights to use, copy, modify, merge, publish,
7
- distribute, sublicense, and/or sell copies of the Software, and to
8
- permit persons to whom the Software is furnished to do so, subject to
9
- the following conditions:
10
-
11
- The above copyright notice and this permission notice shall be
12
- included in all copies or substantial portions of the Software.
13
-
14
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README DELETED
@@ -1,19 +0,0 @@
1
- = sanitizer
2
-
3
- Sanitizer is a very simple and fast string cleaner for ruby, it uses only simple regular
4
- expressions.
5
-
6
- == Contributing to sanitizer
7
-
8
- * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
9
- * Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it
10
- * Fork the project
11
- * Start a feature/bugfix branch
12
- * Commit and push until you are happy with your contribution
13
- * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
14
- * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
15
-
16
- == Copyright
17
-
18
- Copyright (c) 2011 Marcelo Eden. See LICENSE.txt for
19
- further details.
data/README.rdoc DELETED
@@ -1,19 +0,0 @@
1
- = sanitizer
2
-
3
- Description goes here.
4
-
5
- == Contributing to sanitizer
6
-
7
- * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
8
- * Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it
9
- * Fork the project
10
- * Start a feature/bugfix branch
11
- * Commit and push until you are happy with your contribution
12
- * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
13
- * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
14
-
15
- == Copyright
16
-
17
- Copyright (c) 2011 Marcelo Eden. See LICENSE.txt for
18
- further details.
19
-
data/Rakefile DELETED
@@ -1,49 +0,0 @@
1
- # encoding: utf-8
2
-
3
- require 'rubygems'
4
- require 'bundler'
5
- begin
6
- Bundler.setup(:default, :development)
7
- rescue Bundler::BundlerError => e
8
- $stderr.puts e.message
9
- $stderr.puts "Run `bundle install` to install missing gems"
10
- exit e.status_code
11
- end
12
- require 'rake'
13
-
14
- require 'jeweler'
15
- Jeweler::Tasks.new do |gem|
16
- # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
17
- gem.name = "sanitizer"
18
- gem.homepage = "http://github.com/3den/sanitizer"
19
- gem.license = "MIT"
20
- gem.summary = %Q{The simplest string cleaner ever made}
21
- gem.description = %Q{Sanitizer.clean(text)}
22
- gem.email = "edendroid@gmail.com"
23
- gem.authors = ["Marcelo Eden"]
24
- # dependencies defined in Gemfile
25
- end
26
- Jeweler::RubygemsDotOrgTasks.new
27
-
28
- require 'rspec/core'
29
- require 'rspec/core/rake_task'
30
- RSpec::Core::RakeTask.new(:spec) do |spec|
31
- spec.pattern = FileList['spec/**/*_spec.rb']
32
- end
33
-
34
- RSpec::Core::RakeTask.new(:rcov) do |spec|
35
- spec.pattern = 'spec/**/*_spec.rb'
36
- spec.rcov = true
37
- end
38
-
39
- task :default => :spec
40
-
41
- require 'rake/rdoctask'
42
- Rake::RDocTask.new do |rdoc|
43
- version = File.exist?('VERSION') ? File.read('VERSION') : ""
44
-
45
- rdoc.rdoc_dir = 'rdoc'
46
- rdoc.title = "sanitizer #{version}"
47
- rdoc.rdoc_files.include('README*')
48
- rdoc.rdoc_files.include('lib/**/*.rb')
49
- end
data/VERSION DELETED
@@ -1 +0,0 @@
1
- 0.1.1
data/spec/spec_helper.rb DELETED
@@ -1,12 +0,0 @@
1
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
2
- $LOAD_PATH.unshift(File.dirname(__FILE__))
3
- require 'rspec'
4
- require 'sanitizer'
5
-
6
- # Requires supporting files with custom matchers and macros, etc,
7
- # in ./support/ and its subdirectories.
8
- Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
9
-
10
- RSpec.configure do |config|
11
-
12
- end
data/tags DELETED
File without changes