sanitizer 0.1.8 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,7 @@
2
2
  class HTMLEntities
3
3
  class Encoder #:nodoc:
4
4
  def basic_entity_regexp
5
- @basic_entity_regexp ||= /[<>'"]|(\&(?!(\w+\;)))/
5
+ @basic_entity_regexp ||= /[<>'"]|(\&(?!(\#?\w+\;)))/
6
6
  end
7
7
  end
8
8
  end
@@ -5,57 +5,89 @@ module Sanitizer
5
5
 
6
6
  # All self.methods
7
7
  class << self
8
- def sanitize(text)
9
- text = strip_tags(text)
10
- text = clean_spaces(text)
11
- text = html_encode(text)
8
+
9
+ # Sanitize to clean text
10
+ def sanitize!(text)
11
+ strip_tags!(text)
12
+ clean_spaces!(text)
13
+ text.replace html_encode(text)
12
14
  text
13
15
  end
16
+
17
+ def sanitize(text)
18
+ sanitize! text.dup
19
+ end
14
20
 
21
+ # Clean redundant spaces
22
+ def clean_spaces!(text)
23
+ text.gsub!(/\s+/, " ")
24
+ text.strip!
25
+ text
26
+ end
27
+
15
28
  def clean_spaces(text)
16
- output = text.dup
17
- output.gsub!(/\s+/, " ")
18
- output.strip!
19
- output
29
+ clean_spaces! text.dup
20
30
  end
21
31
 
22
- def strip_comments(text)
23
- output = text.dup
24
- output.gsub!(/(\<\!\-\-\b*[^\-\-\>]*.*?\-\-\>)/ui, "")
25
- output.gsub!(/(\&lt;\s?\!--.*\s?--\&gt;)/uim, "")
26
- output
32
+ # remove comments
33
+ def strip_comments!(text)
34
+ text.gsub!(/(\<\!\-\-\b*[^\-\-\>]*.*?\-\-\>)/ui, "")
35
+ text.gsub!(/(\&lt;\s?\!--.*\s?--\&gt;)/uim, "")
36
+ text
27
37
  end
28
38
 
39
+ def strip_comments(text)
40
+ strip_comments! text.dup
41
+ end
42
+
29
43
  # Remove all <script>, <link> and <style> tags
30
- def strip_disallowed_tags(text)
31
- output = text
32
- output.gsub!(/(<script\s*.*>.*<\/script>)/uim, "")
33
- output.gsub!(/(<script\s*.*\/?>)/uim, "")
34
- output.gsub!(/(<link\s*.*\/?>)/uim, "")
35
- output.gsub!(/(<style\s*.*>.*<\/style>)/uim, "")
44
+ def strip_disallowed_tags!(text)
45
+ text.gsub!(/(<script\s*.*>.*<\/script>)/uim, "")
46
+ text.gsub!(/(<script\s*.*\/?>)/uim, "")
47
+ text.gsub!(/(<link\s*.*\/?>)/uim, "")
48
+ text.gsub!(/(<style\s*.*>.*<\/style>)/uim, "")
36
49
 
37
50
  # Also strip the entity-escaped forms of the same tags (e.g. &lt;script&gt;)
38
- output.gsub!(/(\&lt;script\s*.*\&gt;.*\&lt;\/script\&gt;)/uim, "")
39
- output.gsub!(/(\&lt;script\s*.*\/?\&gt;)/uim, "")
40
- output.gsub!(/(\&lt;link\s*.*\/?\&gt;)/uim, "")
41
- output.gsub!(/(\&lt;style\s*.*\&gt;.*\&lt;\/style\&gt;)/uim, "")
42
- output
51
+ text.gsub!(/(\&lt;script\s*.*\&gt;.*\&lt;\/script\&gt;)/uim, "")
52
+ text.gsub!(/(\&lt;script\s*.*\/?\&gt;)/uim, "")
53
+ text.gsub!(/(\&lt;link\s*.*\/?\&gt;)/uim, "")
54
+ text.gsub!(/(\&lt;style\s*.*\&gt;.*\&lt;\/style\&gt;)/uim, "")
55
+ text
43
56
  end
57
+
58
+ def strip_disallowed_tags(text)
59
+ strip_disallowed_tags! text.dup
60
+ end
44
61
 
45
62
  # Remove all tags from text
46
- def strip_tags(text, *tags)
47
- output = text.dup
63
+ def strip_tags!(text, *tags)
48
64
  if tags.empty? # clear all tags by default
49
- output.gsub!(/<\/?[^>]*>/uim, "")
50
- output.gsub!(/\&lt;\/?[^\&gt;]*\&gt;/uim, "")
65
+ text.gsub!(/<\/?[^>]*>/uim, "")
66
+ text.gsub!(/\&lt;\/?[^\&gt;]*\&gt;/uim, "")
51
67
  else # clean only selected tags
52
68
  strip = tags.map do |tag|
53
69
  %Q{(#{tag})}
54
70
  end.join('|')
55
- output.gsub!(/<\/?(#{strip})[^>]*>/uim, "")
56
- output.gsub!(/\&lt;\/?(#{strip})[^\&gt;]*\&gt;/uim, "")
71
+ text.gsub!(/<\/?(#{strip})[^>]*>/uim, "")
72
+ text.gsub!(/\&lt;\/?(#{strip})[^\&gt;]*\&gt;/uim, "")
57
73
  end
58
- output
74
+ text
75
+ end
76
+
77
+ def strip_tags(text, *tags)
78
+ strip_tags! text.dup, *tags
79
+ end
80
+
81
+ # Alguns feeds retornam tags "escapadas" dentro do conteúdo (ex: &lt;br/&gt;)
82
+ # Este método deve ser utilizado após o stripping e sanitização, para não deixar que essas tags sejam exibidas como conteúdo
83
+ def entities_to_chars!(text)
84
+ text.gsub!(/\&lt;/uim, "<")
85
+ text.gsub!(/\&gt;/uim, ">")
86
+ text
87
+ end
88
+
89
+ def entities_to_chars(text)
90
+ entities_to_chars! text.dup
59
91
  end
60
92
 
61
93
  # Convert invalid chars to HTML entities
@@ -68,16 +100,6 @@ module Sanitizer
68
100
  def html_decode(text)
69
101
  text = text.to_s
70
102
  @@htmle.decode(text, :named)
71
- end
72
-
73
- # Alguns feeds retornam tags "escapadas" dentro do conteúdo (ex: &lt;br/&gt;)
74
- # Este método deve ser utilizado após o stripping e sanitização, para não deixar que essas tags sejam exibidas como conteúdo
75
- def entities_to_chars(text)
76
- output = text.dup
77
- output.gsub!(/\&lt;/uim, "<")
78
- output.gsub!(/\&gt;/uim, ">")
79
- output
80
- end
81
-
103
+ end
82
104
  end # self
83
105
  end
@@ -1,4 +1,4 @@
1
1
  # encoding: utf-8
2
2
  module Sanitizer
3
- VERSION = "0.1.8"
3
+ VERSION = "0.2.0"
4
4
  end
@@ -18,6 +18,12 @@ describe Sanitizer do
18
18
  output.should == 'Eu &amp; voc&ecirc; como Vai'
19
19
  end
20
20
 
21
+ it "should not break &#8220; entities" do
22
+ html = "&#8220; Testando"
23
+ output = Sanitizer.sanitize(html)
24
+ output.should == "&#8220; Testando"
25
+ end
26
+
21
27
  it "should clean spaces and tags" do
22
28
  html = "<p>Oi <b>como</b>
23
29
  Vai</p>"
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sanitizer
3
3
  version: !ruby/object:Gem::Version
4
- hash: 11
4
+ hash: 23
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
- - 1
9
- - 8
10
- version: 0.1.8
8
+ - 2
9
+ - 0
10
+ version: 0.2.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Marcelo Eden
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2011-05-20 00:00:00 -03:00
18
+ date: 2011-06-29 00:00:00 -03:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency