sanitizer 0.1.8 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -2,7 +2,7 @@
2
2
  class HTMLEntities
3
3
  class Encoder #:nodoc:
4
4
  def basic_entity_regexp
5
- @basic_entity_regexp ||= /[<>'"]|(\&(?!(\w+\;)))/
5
+ @basic_entity_regexp ||= /[<>'"]|(\&(?!(\#?\w+\;)))/
6
6
  end
7
7
  end
8
8
  end
@@ -5,57 +5,89 @@ module Sanitizer
5
5
 
6
6
  # All self.methods
7
7
  class << self
8
- def sanitize(text)
9
- text = strip_tags(text)
10
- text = clean_spaces(text)
11
- text = html_encode(text)
8
+
9
+ # Sanitize to clean text
10
+ def sanitize!(text)
11
+ strip_tags!(text)
12
+ clean_spaces!(text)
13
+ text.replace html_encode(text)
12
14
  text
13
15
  end
16
+
17
+ def sanitize(text)
18
+ sanitize! text.dup
19
+ end
14
20
 
21
+ # Clean redundant spaces
22
+ def clean_spaces!(text)
23
+ text.gsub!(/\s+/, " ")
24
+ text.strip!
25
+ text
26
+ end
27
+
15
28
  def clean_spaces(text)
16
- output = text.dup
17
- output.gsub!(/\s+/, " ")
18
- output.strip!
19
- output
29
+ clean_spaces! text.dup
20
30
  end
21
31
 
22
- def strip_comments(text)
23
- output = text.dup
24
- output.gsub!(/(\<\!\-\-\b*[^\-\-\>]*.*?\-\-\>)/ui, "")
25
- output.gsub!(/(\&lt;\s?\!--.*\s?--\&gt;)/uim, "")
26
- output
32
+ # remove comments
33
+ def strip_comments!(text)
34
+ text.gsub!(/(\<\!\-\-\b*[^\-\-\>]*.*?\-\-\>)/ui, "")
35
+ text.gsub!(/(\&lt;\s?\!--.*\s?--\&gt;)/uim, "")
36
+ text
27
37
  end
28
38
 
39
+ def strip_comments(text)
40
+ strip_comments! text.dup
41
+ end
42
+
29
43
  # Remove all <script>, <link> and <style> tags
30
- def strip_disallowed_tags(text)
31
- output = text
32
- output.gsub!(/(<script\s*.*>.*<\/script>)/uim, "")
33
- output.gsub!(/(<script\s*.*\/?>)/uim, "")
34
- output.gsub!(/(<link\s*.*\/?>)/uim, "")
35
- output.gsub!(/(<style\s*.*>.*<\/style>)/uim, "")
44
+ def strip_disallowed_tags!(text)
45
+ text.gsub!(/(<script\s*.*>.*<\/script>)/uim, "")
46
+ text.gsub!(/(<script\s*.*\/?>)/uim, "")
47
+ text.gsub!(/(<link\s*.*\/?>)/uim, "")
48
+ text.gsub!(/(<style\s*.*>.*<\/style>)/uim, "")
36
49
 
37
50
  # Stripping html entities too
38
- output.gsub!(/(\&lt;script\s*.*\&gt;.*\&lt;\/script\&gt;)/uim, "")
39
- output.gsub!(/(\&lt;script\s*.*\/?\&gt;)/uim, "")
40
- output.gsub!(/(\&lt;link\s*.*\/?\&gt;)/uim, "")
41
- output.gsub!(/(\&lt;style\s*.*\&gt;.*\&lt;\/style\&gt;)/uim, "")
42
- output
51
+ text.gsub!(/(\&lt;script\s*.*\&gt;.*\&lt;\/script\&gt;)/uim, "")
52
+ text.gsub!(/(\&lt;script\s*.*\/?\&gt;)/uim, "")
53
+ text.gsub!(/(\&lt;link\s*.*\/?\&gt;)/uim, "")
54
+ text.gsub!(/(\&lt;style\s*.*\&gt;.*\&lt;\/style\&gt;)/uim, "")
55
+ text
43
56
  end
57
+
58
+ def strip_disallowed_tags(text)
59
+ strip_disallowed_tags! text.dup
60
+ end
44
61
 
45
62
  # Remove all tags from text
46
- def strip_tags(text, *tags)
47
- output = text.dup
63
+ def strip_tags!(text, *tags)
48
64
  if tags.empty? # clear all tags by default
49
- output.gsub!(/<\/?[^>]*>/uim, "")
50
- output.gsub!(/\&lt;\/?[^\&gt;]*\&gt;/uim, "")
65
+ text.gsub!(/<\/?[^>]*>/uim, "")
66
+ text.gsub!(/\&lt;\/?[^\&gt;]*\&gt;/uim, "")
51
67
  else # clean only selected tags
52
68
  strip = tags.map do |tag|
53
69
  %Q{(#{tag})}
54
70
  end.join('|')
55
- output.gsub!(/<\/?(#{strip})[^>]*>/uim, "")
56
- output.gsub!(/\&lt;\/?(#{strip})[^\&gt;]*\&gt;/uim, "")
71
+ text.gsub!(/<\/?(#{strip})[^>]*>/uim, "")
72
+ text.gsub!(/\&lt;\/?(#{strip})[^\&gt;]*\&gt;/uim, "")
57
73
  end
58
- output
74
+ text
75
+ end
76
+
77
+ def strip_tags(text, *tags)
78
+ strip_tags! text.dup, *tags
79
+ end
80
+
81
+ # Alguns feeds retornam tags "escapadas" dentro do conteúdo (ex: &lt;br/&gt;)
82
+ # Este método deve ser utilizado após o stripping e sanitização, para não deixar que essas tags sejam exibidas como conteúdo
83
+ def entities_to_chars!(text)
84
+ text.gsub!(/\&lt;/uim, "<")
85
+ text.gsub!(/\&gt;/uim, ">")
86
+ text
87
+ end
88
+
89
+ def entities_to_chars(text)
90
+ entities_to_chars! text.dup
59
91
  end
60
92
 
61
93
  # Convert invalid chars to HTML entities
@@ -68,16 +100,6 @@ module Sanitizer
68
100
  def html_decode(text)
69
101
  text = text.to_s
70
102
  @@htmle.decode(text, :named)
71
- end
72
-
73
- # Alguns feeds retornam tags "escapadas" dentro do conteúdo (ex: &lt;br/&gt;)
74
- # Este método deve ser utilizado após o stripping e sanitização, para não deixar que essas tags sejam exibidas como conteúdo
75
- def entities_to_chars(text)
76
- output = text.dup
77
- output.gsub!(/\&lt;/uim, "<")
78
- output.gsub!(/\&gt;/uim, ">")
79
- output
80
- end
81
-
103
+ end
82
104
  end # self
83
105
  end
@@ -1,4 +1,4 @@
1
1
  # encoding: utf-8
2
2
  module Sanitizer
3
- VERSION = "0.1.8"
3
+ VERSION = "0.2.0"
4
4
  end
@@ -18,6 +18,12 @@ describe Sanitizer do
18
18
  output.should == 'Eu &amp; voc&ecirc; como Vai'
19
19
  end
20
20
 
21
+ it "should not break &#8220; entities" do
22
+ html = "&#8220; Testando"
23
+ output = Sanitizer.sanitize(html)
24
+ output.should == "&#8220; Testando"
25
+ end
26
+
21
27
  it "should clean spaces and tags" do
22
28
  html = "<p>Oi <b>como</b>
23
29
  Vai</p>"
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sanitizer
3
3
  version: !ruby/object:Gem::Version
4
- hash: 11
4
+ hash: 23
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
- - 1
9
- - 8
10
- version: 0.1.8
8
+ - 2
9
+ - 0
10
+ version: 0.2.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Marcelo Eden
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-05-20 00:00:00 -03:00
18
+ date: 2011-06-29 00:00:00 -03:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency