string_cleaner 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -1,6 +1,9 @@
1
- require 'rake/tasklib'
2
- require 'rake/rdoctask'
3
- require 'rubygems'
1
+ if RUBY_VERSION.to_f<1.9
2
+ require 'rake/tasklib'
3
+ require 'rake/rdoctask'
4
+ require 'rubygems'
5
+ end
6
+
4
7
  begin
5
8
  require 'bundler/setup'
6
9
  rescue LoadError
@@ -9,13 +9,13 @@ module String::Cleaner
9
9
 
10
10
  def fix_encoding
11
11
  utf8 = dup
12
- if utf8.respond_to?(:force_encoding)
12
+ if utf8.respond_to?(:force_encoding)
13
13
  utf8.force_encoding("UTF-8") # for Ruby 1.9+
14
14
  unless utf8.valid_encoding? # if invalid UTF-8
15
- utf8 = utf8.force_encoding("ISO8859-15")
15
+ utf8 = utf8.force_encoding("ISO8859-1")
16
16
  utf8.encode!("UTF-8", :invalid => :replace, :undef => :replace, :replace => "")
17
17
  end
18
- utf8.gsub!("\u0080", "€") # special case for euro sign from Windows-1252
18
+ utf8.gsub!(/\u0080|¤/, "€") # special case for euro sign from Windows-1252
19
19
  utf8
20
20
  else
21
21
  require "iconv"
@@ -34,31 +34,40 @@ module String::Cleaner
34
34
  end
35
35
 
36
36
  SPECIAL_SPACES = [
37
- 0x00A0, # White_Space # Zs NO-BREAK SPACE
38
- 0x1680, # White_Space # Zs OGHAM SPACE MARK
39
- 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
40
- (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
41
- 0x2028, # White_Space # Zl LINE SEPARATOR
42
- 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
43
- 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
44
- 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
45
- 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
46
- ].flatten.collect{|e| [e].pack 'U*'}
37
+ 0x00A0, # NO-BREAK SPACE
38
+ 0x1680, # OGHAM SPACE MARK
39
+ 0x180E, # MONGOLIAN VOWEL SEPARATOR
40
+ (0x2000..0x200A).to_a, # EN QUAD..HAIR SPACE
41
+ 0x2028, # LINE SEPARATOR
42
+ 0x2029, # PARAGRAPH SEPARATOR
43
+ 0x202F, # NARROW NO-BREAK SPACE
44
+ 0x205F, # MEDIUM MATHEMATICAL SPACE
45
+ 0x3000, # IDEOGRAPHIC SPACE
46
+ ].flatten.collect{|e| [e].pack 'U*'}
47
+
48
+ ZERO_WIDTH = [
49
+ 0x200B, # ZERO WIDTH SPACE
50
+ 0x200C, # ZERO WIDTH NON-JOINER
51
+ 0x200D, # ZERO WIDTH JOINER
52
+ 0x2060, # WORD JOINER
53
+ 0xFEFF, # ZERO WIDTH NO-BREAK SPACE
54
+ ].flatten.collect{|e| [e].pack 'U*'}
47
55
 
48
56
  def fix_invisible_chars
49
57
  utf8 = self.dup
50
- if utf8.respond_to?(:force_encoding)
58
+ utf8.gsub!(Regexp.new(ZERO_WIDTH.join("|")), "")
59
+ utf8 = if utf8.respond_to?(:force_encoding)
51
60
  utf8 = (utf8 << " ").split(/\n/u).each{|line|
52
61
  line.gsub!(/[\s\p{C}]/u, " ")
53
62
  }.join("\n").chop!
54
- utf8.gsub!(Regexp.new(SPECIAL_SPACES.join("|")), " ")
55
- utf8.force_encoding("UTF-8")
56
63
  else
57
64
  require "oniguruma"
58
65
  utf8.split(/\n/n).collect{|line|
59
- Oniguruma::ORegexp.new("[\\s\\p{C}]", {:encoding => Oniguruma::ENCODING_UTF8}).gsub(line, " ")
66
+ Oniguruma::ORegexp.new("[\\p{C}]", {:encoding => Oniguruma::ENCODING_UTF8}).gsub(line, " ")
60
67
  }.join("\n").chop!
61
68
  end
69
+ utf8.gsub!(Regexp.new(SPECIAL_SPACES.join("|") + "|\s"), " ")
70
+ utf8
62
71
  end
63
72
 
64
73
  def trim(chars = "")
@@ -2,146 +2,72 @@
2
2
  require File.dirname(__FILE__) + "/spec_helper"
3
3
 
4
4
  describe String::Cleaner do
5
- if "".respond_to?(:force_encoding)
6
- # specs for Ruby 1.9+
5
+ describe "#clean" do
7
6
  describe "with all 8-bit characters" do
8
7
  before :all do
9
- @input = "".force_encoding("ISO8859-15")
8
+ @input = ""
9
+ @input.force_encoding("ISO8859-15") if @input.respond_to?(:force_encoding)
10
10
  (0..255).each{|i| @input << i.chr}
11
- @input.force_encoding("UTF-8")
11
+ @input.force_encoding("UTF-8") if @input.respond_to?(:force_encoding)
12
12
  @output = @input.clean
13
13
  end
14
- it "should output a valid UTF-8 string" do
15
- @output.encoding.name.should == "UTF-8"
16
- @output.should be_valid_encoding
14
+ if RUBY_VERSION.to_f>1.9
15
+ it "should output a valid UTF-8 string" do
16
+ @output.encoding.name.should == "UTF-8"
17
+ @output.should be_valid_encoding
18
+ end
17
19
  end
18
20
  it "should wipe out the control characters" do
19
- @output.should == " \n \n !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ € ¡¢£€¥Š§š©ª«¬ ®¯°±²³Žµ¶·ž¹º»ŒœŸ¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
20
- end
21
- end
22
- it "should convert all type of spaces to normal spaces" do
23
- input = [
24
- (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D>
25
- 0x0020, # White_Space # Zs SPACE
26
- 0x0085, # White_Space # Cc <control-0085>
27
- 0x00A0, # White_Space # Zs NO-BREAK SPACE
28
- 0x1680, # White_Space # Zs OGHAM SPACE MARK
29
- 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
30
- (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
31
- 0x2028, # White_Space # Zl LINE SEPARATOR
32
- 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
33
- 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
34
- 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
35
- 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
36
- ].flatten.collect{ |e| [e].pack 'U*' }
37
- input.join.clean.should == " \n \n "
38
- end
39
- describe "with invalid UTF-8 sequence" do
40
- before :all do
41
- @input = "\210\004"
42
- @output = @input.clean
43
- end
44
- it "should output a valid UTF-8 string" do
45
- @output.encoding.name.should == "UTF-8"
46
- @output.should be_valid_encoding
47
- end
48
- it "should replace invisible chars by space" do
49
- @output.should == " "
50
- end
51
- end
52
- describe "with mixed valid and invalid characters" do
53
- before :all do
54
- @input = "a?^?\xddf"
55
- @output = @input.clean
56
- end
57
- it "should output a valid UTF-8 string" do
58
- @output.encoding.name.should == "UTF-8"
59
- @output.should be_valid_encoding
60
- end
61
- it "should keep the valid characters" do
62
- @output.should == "a?^?Ýf"
63
- end
64
- end
65
- describe "with already valid characters" do
66
- before :all do
67
- @input = "\n\t\r\r\n\v\n"
68
- @output = @input.clean
69
- end
70
- it "should output a valid UTF-8 string" do
71
- @output.encoding.name.should == "UTF-8"
72
- @output.should be_valid_encoding
73
- end
74
- it "should replace invisible chars by space" do
75
- @output.should == "\n \n\n \n"
76
- end
77
- end
78
- describe "with watermarked text" do
79
- before :all do
80
- @input = "Here is \357\273\277a block \357\273\277of text \357\273\277inside of which a number will be hidden!"
81
- @output = @input.clean
82
- end
83
- it "should output a valid UTF-8 string" do
84
- @output.encoding.name.should == "UTF-8"
85
- @output.should be_valid_encoding
86
- end
87
- it "should replace invisible chars by space" do
88
- @output.should == "Here is a block of text inside of which a number will be hidden!"
21
+ @output.should == " \n \n !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ € ¡¢£€¥¦§¨©ª«¬ ®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
89
22
  end
90
23
  end
91
- describe "with unusual valid spaces" do
92
- before :all do
93
- @input = []
94
- @input << "\u0020" # SPACE
95
- @input << "\u00A0" # NO-BREAK SPACE
96
- @input << "\u2000" # EN QUAD
97
- @input << "\u2001" # EM QUAD
98
- @input << "\u2002" # EN SPACE
99
- @input << "\u2003" # EM SPACE
100
- @input << "\u2004" # THREE-PER-EM SPACE
101
- @input << "\u2005" # FOUR-PER-EM SPACE
102
- @input << "\u2006" # SIX-PER-EM SPACE
103
- @input << "\u2007" # FIGURE SPACE
104
- @input << "\u2008" # PUNCTUATION SPACE
105
- @input << "\u2009" # THIN SPACE
106
- @input << "\u200A" # HAIR SPACE
107
- @input << "\u200B" # ZERO WIDTH SPACE
108
- @input << "\u202F" # NARROW NO-BREAK SPACE
109
- @input << "\u205F" # MEDIUM MATHEMATICAL SPACE
110
- @input << "\u3000" # IDEOGRAPHIC SPACE
111
- @input << "\uFEFF" # ZERO WIDTH NO-BREAK SPACE
24
+ describe "with various type of spaces" do
25
+ before do
26
+ @input = [
27
+ (0x0009..0x000D).to_a, # <control-0009>..<control-000D>
28
+ 0x0020, # SPACE
29
+ 0x0085, # <control-0085>
30
+ 0x00A0, # NO-BREAK SPACE
31
+ 0x1680, # OGHAM SPACE MARK
32
+ 0x180E, # MONGOLIAN VOWEL SEPARATOR
33
+ (0x2000..0x200A).to_a, # EN QUAD..HAIR SPACE
34
+ 0x2028, # LINE SEPARATOR
35
+ 0x2029, # PARAGRAPH SEPARATOR
36
+ 0x202F, # NARROW NO-BREAK SPACE
37
+ 0x205F, # MEDIUM MATHEMATICAL SPACE
38
+ 0x3000, # IDEOGRAPHIC SPACE
39
+ ].flatten.collect{ |e| [e].pack 'U*' }
112
40
  @output = @input.join.clean
113
41
  end
114
- it "should output a valid UTF-8 string" do
115
- @output.encoding.name.should == "UTF-8"
116
- @output.should be_valid_encoding
117
- end
118
- it "should replace invisible chars by space" do
119
- @output.should == " "*@input.size
120
- end
121
- end
122
- describe "with euro sign from both ISO 8859-15 or Windows-1252" do
123
- before :all do
124
- @input = "\x80\xA4"
125
- @output = @input.clean
126
- end
127
- it "should output a valid UTF-8 string" do
128
- @output.encoding.name.should == "UTF-8"
129
- @output.should be_valid_encoding
130
- end
131
- it "should replace invisible chars by space" do
132
- @output.should == "€€"
42
+ if RUBY_VERSION.to_f>1.9
43
+ it "should output a valid UTF-8 string" do
44
+ @output.encoding.name.should == "UTF-8"
45
+ @output.should be_valid_encoding
46
+ end
47
+ end
48
+ it "should replace all spaces to normal spaces" do
49
+ @output.clean.should == " \n \n "
50
+ end
51
+ end
52
+ describe "with various no-width characters" do
53
+ before do
54
+ @input = [
55
+ 0x200B, # ZERO WIDTH SPACE
56
+ 0x200C, # ZERO WIDTH NON-JOINER
57
+ 0x200D, # ZERO WIDTH JOINER
58
+ 0x2060, # WORD JOINER
59
+ 0xFEFF, # ZERO WIDTH NO-BREAK SPACE
60
+ ].flatten.collect{ |e| [e].pack 'U*' }
61
+ @output = @input.join.clean
133
62
  end
134
- end
135
- else
136
- # specs for Ruby 1.8.6
137
- describe "with all 8-bit characters" do
138
- before :all do
139
- @input = ""
140
- (0..255).each{|i| @input << i.chr}
141
- @output = @input.clean
63
+ if RUBY_VERSION.to_f>1.9
64
+ it "should output a valid UTF-8 string" do
65
+ @output.encoding.name.should == "UTF-8"
66
+ @output.should be_valid_encoding
67
+ end
142
68
  end
143
- it "should wipe out the control characters" do
144
- @output.should == " \n \n !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ € ¡¢£€¥¦§¨©ª«¬ ®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
69
+ it "should remove no-width characters" do
70
+ @output.should == ""
145
71
  end
146
72
  end
147
73
  describe "with invalid UTF-8 sequence" do
@@ -149,6 +75,12 @@ describe String::Cleaner do
149
75
  @input = "\210\004"
150
76
  @output = @input.clean
151
77
  end
78
+ if RUBY_VERSION.to_f>1.9
79
+ it "should output a valid UTF-8 string" do
80
+ @output.encoding.name.should == "UTF-8"
81
+ @output.should be_valid_encoding
82
+ end
83
+ end
152
84
  it "should replace invisible chars by space" do
153
85
  @output.should == " "
154
86
  end
@@ -158,6 +90,12 @@ describe String::Cleaner do
158
90
  @input = "a?^?\xddf"
159
91
  @output = @input.clean
160
92
  end
93
+ if RUBY_VERSION.to_f>1.9
94
+ it "should output a valid UTF-8 string" do
95
+ @output.encoding.name.should == "UTF-8"
96
+ @output.should be_valid_encoding
97
+ end
98
+ end
161
99
  it "should keep the valid characters" do
162
100
  @output.should == "a?^?Ýf"
163
101
  end
@@ -167,6 +105,12 @@ describe String::Cleaner do
167
105
  @input = "\n\t\r\r\n\v\n"
168
106
  @output = @input.clean
169
107
  end
108
+ if RUBY_VERSION.to_f>1.9
109
+ it "should output a valid UTF-8 string" do
110
+ @output.encoding.name.should == "UTF-8"
111
+ @output.should be_valid_encoding
112
+ end
113
+ end
170
114
  it "should replace invisible chars by space" do
171
115
  @output.should == "\n \n\n \n"
172
116
  end
@@ -176,36 +120,14 @@ describe String::Cleaner do
176
120
  @input = "Here is \357\273\277a block \357\273\277of text \357\273\277inside of which a number will be hidden!"
177
121
  @output = @input.clean
178
122
  end
179
- it "should replace invisible chars by space" do
180
- @output.should == "Here is a block of text inside of which a number will be hidden!"
181
- end
182
- end
183
- describe "with unusual valid spaces" do
184
- before :all do
185
- @input = []
186
- # "\uXXXX" doesn't exists yet on Ruby 1.8.6
187
- @input << " " # SPACE
188
- @input << "\xC2\xA0" # NO-BREAK SPACE
189
- @input << "\xE2\x80\x80" # EN QUAD
190
- @input << "\xE2\x80\x81" # EM QUAD
191
- @input << "\xE2\x80\x82" # EN SPACE
192
- @input << "\xE2\x80\x83" # EM SPACE
193
- @input << "\xE2\x80\x84" # THREE-PER-EM SPACE
194
- @input << "\xE2\x80\x85" # FOUR-PER-EM SPACE
195
- @input << "\xE2\x80\x86" # SIX-PER-EM SPACE
196
- @input << "\xE2\x80\x87" # FIGURE SPACE
197
- @input << "\xE2\x80\x88" # PUNCTUATION SPACE
198
- @input << "\xE2\x80\x89" # THIN SPACE
199
- @input << "\xE2\x80\x8A" # HAIR SPACE
200
- @input << "\xE2\x80\x8B" # ZERO WIDTH SPACE
201
- @input << "\xE2\x80\xAF" # NARROW NO-BREAK SPACE
202
- @input << "\xE2\x81\x9F" # MEDIUM MATHEMATICAL SPACE
203
- @input << "\xE3\x80\x80" # IDEOGRAPHIC SPACE
204
- @input << "\xEF\xBB\xBF" # ZERO WIDTH NO-BREAK SPACE
205
- @output = @input.join.clean
123
+ if RUBY_VERSION.to_f>1.9
124
+ it "should output a valid UTF-8 string" do
125
+ @output.encoding.name.should == "UTF-8"
126
+ @output.should be_valid_encoding
127
+ end
206
128
  end
207
129
  it "should replace invisible chars by space" do
208
- @output.should == " "*@input.size
130
+ @output.should == "Here is a block of text inside of which a number will be hidden!"
209
131
  end
210
132
  end
211
133
  describe "with euro sign from both ISO 8859-15 or Windows-1252" do
@@ -213,6 +135,12 @@ describe String::Cleaner do
213
135
  @input = "\x80\xA4"
214
136
  @output = @input.clean
215
137
  end
138
+ if RUBY_VERSION.to_f>1.9
139
+ it "should output a valid UTF-8 string" do
140
+ @output.encoding.name.should == "UTF-8"
141
+ @output.should be_valid_encoding
142
+ end
143
+ end
216
144
  it "should replace invisible chars by space" do
217
145
  @output.should == "€€"
218
146
  end
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{string_cleaner}
5
- s.version = "0.2.1"
5
+ s.version = "0.2.2"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Joseph Halter"]
metadata CHANGED
@@ -1,12 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: string_cleaner
3
3
  version: !ruby/object:Gem::Version
4
+ hash: 19
4
5
  prerelease: false
5
6
  segments:
6
7
  - 0
7
8
  - 2
8
- - 1
9
- version: 0.2.1
9
+ - 2
10
+ version: 0.2.2
10
11
  platform: ruby
11
12
  authors:
12
13
  - Joseph Halter
@@ -25,6 +26,7 @@ dependencies:
25
26
  requirements:
26
27
  - - ">="
27
28
  - !ruby/object:Gem::Version
29
+ hash: 3
28
30
  segments:
29
31
  - 0
30
32
  version: "0"
@@ -38,6 +40,7 @@ dependencies:
38
40
  requirements:
39
41
  - - ">="
40
42
  - !ruby/object:Gem::Version
43
+ hash: 3
41
44
  segments:
42
45
  - 0
43
46
  version: "0"
@@ -75,6 +78,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
75
78
  requirements:
76
79
  - - ">="
77
80
  - !ruby/object:Gem::Version
81
+ hash: 3
78
82
  segments:
79
83
  - 0
80
84
  version: "0"
@@ -83,6 +87,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
83
87
  requirements:
84
88
  - - ">="
85
89
  - !ruby/object:Gem::Version
90
+ hash: 3
86
91
  segments:
87
92
  - 0
88
93
  version: "0"