string_cleaner 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +6 -3
- data/lib/string_cleaner.rb +26 -17
- data/spec/string_cleaner_spec.rb +83 -155
- data/string_cleaner.gemspec +1 -1
- metadata +7 -2
data/Rakefile
CHANGED
data/lib/string_cleaner.rb
CHANGED
@@ -9,13 +9,13 @@ module String::Cleaner
|
|
9
9
|
|
10
10
|
def fix_encoding
|
11
11
|
utf8 = dup
|
12
|
-
if utf8.respond_to?(:force_encoding)
|
12
|
+
if utf8.respond_to?(:force_encoding)
|
13
13
|
utf8.force_encoding("UTF-8") # for Ruby 1.9+
|
14
14
|
unless utf8.valid_encoding? # if invalid UTF-8
|
15
|
-
utf8 = utf8.force_encoding("ISO8859-
|
15
|
+
utf8 = utf8.force_encoding("ISO8859-1")
|
16
16
|
utf8.encode!("UTF-8", :invalid => :replace, :undef => :replace, :replace => "")
|
17
17
|
end
|
18
|
-
utf8.gsub!(
|
18
|
+
utf8.gsub!(/\u0080|¤/, "€") # special case for euro sign from Windows-1252
|
19
19
|
utf8
|
20
20
|
else
|
21
21
|
require "iconv"
|
@@ -34,31 +34,40 @@ module String::Cleaner
|
|
34
34
|
end
|
35
35
|
|
36
36
|
SPECIAL_SPACES = [
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
37
|
+
0x00A0, # NO-BREAK SPACE
|
38
|
+
0x1680, # OGHAM SPACE MARK
|
39
|
+
0x180E, # MONGOLIAN VOWEL SEPARATOR
|
40
|
+
(0x2000..0x200A).to_a, # EN QUAD..HAIR SPACE
|
41
|
+
0x2028, # LINE SEPARATOR
|
42
|
+
0x2029, # PARAGRAPH SEPARATOR
|
43
|
+
0x202F, # NARROW NO-BREAK SPACE
|
44
|
+
0x205F, # MEDIUM MATHEMATICAL SPACE
|
45
|
+
0x3000, # IDEOGRAPHIC SPACE
|
46
|
+
].flatten.collect{|e| [e].pack 'U*'}
|
47
|
+
|
48
|
+
ZERO_WIDTH = [
|
49
|
+
0x200B, # ZERO WIDTH SPACE
|
50
|
+
0x200C, # ZERO WIDTH NON-JOINER
|
51
|
+
0x200D, # ZERO WIDTH JOINER
|
52
|
+
0x2060, # WORD JOINER
|
53
|
+
0xFEFF, # ZERO WIDTH NO-BREAK SPACE
|
54
|
+
].flatten.collect{|e| [e].pack 'U*'}
|
47
55
|
|
48
56
|
def fix_invisible_chars
|
49
57
|
utf8 = self.dup
|
50
|
-
|
58
|
+
utf8.gsub!(Regexp.new(ZERO_WIDTH.join("|")), "")
|
59
|
+
utf8 = if utf8.respond_to?(:force_encoding)
|
51
60
|
utf8 = (utf8 << " ").split(/\n/u).each{|line|
|
52
61
|
line.gsub!(/[\s\p{C}]/u, " ")
|
53
62
|
}.join("\n").chop!
|
54
|
-
utf8.gsub!(Regexp.new(SPECIAL_SPACES.join("|")), " ")
|
55
|
-
utf8.force_encoding("UTF-8")
|
56
63
|
else
|
57
64
|
require "oniguruma"
|
58
65
|
utf8.split(/\n/n).collect{|line|
|
59
|
-
Oniguruma::ORegexp.new("[\\
|
66
|
+
Oniguruma::ORegexp.new("[\\p{C}]", {:encoding => Oniguruma::ENCODING_UTF8}).gsub(line, " ")
|
60
67
|
}.join("\n").chop!
|
61
68
|
end
|
69
|
+
utf8.gsub!(Regexp.new(SPECIAL_SPACES.join("|") + "|\s"), " ")
|
70
|
+
utf8
|
62
71
|
end
|
63
72
|
|
64
73
|
def trim(chars = "")
|
data/spec/string_cleaner_spec.rb
CHANGED
@@ -2,146 +2,72 @@
|
|
2
2
|
require File.dirname(__FILE__) + "/spec_helper"
|
3
3
|
|
4
4
|
describe String::Cleaner do
|
5
|
-
|
6
|
-
# specs for Ruby 1.9+
|
5
|
+
describe "#clean" do
|
7
6
|
describe "with all 8-bit characters" do
|
8
7
|
before :all do
|
9
|
-
@input = ""
|
8
|
+
@input = ""
|
9
|
+
@input.force_encoding("ISO8859-15") if @input.respond_to?(:force_encoding)
|
10
10
|
(0..255).each{|i| @input << i.chr}
|
11
|
-
@input.force_encoding("UTF-8")
|
11
|
+
@input.force_encoding("UTF-8") if @input.respond_to?(:force_encoding)
|
12
12
|
@output = @input.clean
|
13
13
|
end
|
14
|
-
|
15
|
-
|
16
|
-
|
14
|
+
if RUBY_VERSION.to_f>1.9
|
15
|
+
it "should output a valid UTF-8 string" do
|
16
|
+
@output.encoding.name.should == "UTF-8"
|
17
|
+
@output.should be_valid_encoding
|
18
|
+
end
|
17
19
|
end
|
18
20
|
it "should wipe out the control characters" do
|
19
|
-
@output.should == " \n \n !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ €
|
20
|
-
end
|
21
|
-
end
|
22
|
-
it "should convert all type of spaces to normal spaces" do
|
23
|
-
input = [
|
24
|
-
(0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D>
|
25
|
-
0x0020, # White_Space # Zs SPACE
|
26
|
-
0x0085, # White_Space # Cc <control-0085>
|
27
|
-
0x00A0, # White_Space # Zs NO-BREAK SPACE
|
28
|
-
0x1680, # White_Space # Zs OGHAM SPACE MARK
|
29
|
-
0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
|
30
|
-
(0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
|
31
|
-
0x2028, # White_Space # Zl LINE SEPARATOR
|
32
|
-
0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
|
33
|
-
0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
|
34
|
-
0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
|
35
|
-
0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
|
36
|
-
].flatten.collect{ |e| [e].pack 'U*' }
|
37
|
-
input.join.clean.should == " \n \n "
|
38
|
-
end
|
39
|
-
describe "with invalid UTF-8 sequence" do
|
40
|
-
before :all do
|
41
|
-
@input = "\210\004"
|
42
|
-
@output = @input.clean
|
43
|
-
end
|
44
|
-
it "should output a valid UTF-8 string" do
|
45
|
-
@output.encoding.name.should == "UTF-8"
|
46
|
-
@output.should be_valid_encoding
|
47
|
-
end
|
48
|
-
it "should replace invisible chars by space" do
|
49
|
-
@output.should == " "
|
50
|
-
end
|
51
|
-
end
|
52
|
-
describe "with mixed valid and invalid characters" do
|
53
|
-
before :all do
|
54
|
-
@input = "a?^?\xddf"
|
55
|
-
@output = @input.clean
|
56
|
-
end
|
57
|
-
it "should output a valid UTF-8 string" do
|
58
|
-
@output.encoding.name.should == "UTF-8"
|
59
|
-
@output.should be_valid_encoding
|
60
|
-
end
|
61
|
-
it "should keep the valid characters" do
|
62
|
-
@output.should == "a?^?Ýf"
|
63
|
-
end
|
64
|
-
end
|
65
|
-
describe "with already valid characters" do
|
66
|
-
before :all do
|
67
|
-
@input = "\n\t\r\r\n\v\n"
|
68
|
-
@output = @input.clean
|
69
|
-
end
|
70
|
-
it "should output a valid UTF-8 string" do
|
71
|
-
@output.encoding.name.should == "UTF-8"
|
72
|
-
@output.should be_valid_encoding
|
73
|
-
end
|
74
|
-
it "should replace invisible chars by space" do
|
75
|
-
@output.should == "\n \n\n \n"
|
76
|
-
end
|
77
|
-
end
|
78
|
-
describe "with watermarked text" do
|
79
|
-
before :all do
|
80
|
-
@input = "Here is \357\273\277a block \357\273\277of text \357\273\277inside of which a number will be hidden!"
|
81
|
-
@output = @input.clean
|
82
|
-
end
|
83
|
-
it "should output a valid UTF-8 string" do
|
84
|
-
@output.encoding.name.should == "UTF-8"
|
85
|
-
@output.should be_valid_encoding
|
86
|
-
end
|
87
|
-
it "should replace invisible chars by space" do
|
88
|
-
@output.should == "Here is a block of text inside of which a number will be hidden!"
|
21
|
+
@output.should == " \n \n !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ € ¡¢£€¥¦§¨©ª«¬ ®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
|
89
22
|
end
|
90
23
|
end
|
91
|
-
describe "with
|
92
|
-
before
|
93
|
-
@input = [
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
@input << "\u200B" # ZERO WIDTH SPACE
|
108
|
-
@input << "\u202F" # NARROW NO-BREAK SPACE
|
109
|
-
@input << "\u205F" # MEDIUM MATHEMATICAL SPACE
|
110
|
-
@input << "\u3000" # IDEOGRAPHIC SPACE
|
111
|
-
@input << "\uFEFF" # ZERO WIDTH NO-BREAK SPACE
|
24
|
+
describe "with various type of spaces" do
|
25
|
+
before do
|
26
|
+
@input = [
|
27
|
+
(0x0009..0x000D).to_a, # <control-0009>..<control-000D>
|
28
|
+
0x0020, # SPACE
|
29
|
+
0x0085, # <control-0085>
|
30
|
+
0x00A0, # NO-BREAK SPACE
|
31
|
+
0x1680, # OGHAM SPACE MARK
|
32
|
+
0x180E, # MONGOLIAN VOWEL SEPARATOR
|
33
|
+
(0x2000..0x200A).to_a, # EN QUAD..HAIR SPACE
|
34
|
+
0x2028, # LINE SEPARATOR
|
35
|
+
0x2029, # PARAGRAPH SEPARATOR
|
36
|
+
0x202F, # NARROW NO-BREAK SPACE
|
37
|
+
0x205F, # MEDIUM MATHEMATICAL SPACE
|
38
|
+
0x3000, # IDEOGRAPHIC SPACE
|
39
|
+
].flatten.collect{ |e| [e].pack 'U*' }
|
112
40
|
@output = @input.join.clean
|
113
41
|
end
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
42
|
+
if RUBY_VERSION.to_f>1.9
|
43
|
+
it "should output a valid UTF-8 string" do
|
44
|
+
@output.encoding.name.should == "UTF-8"
|
45
|
+
@output.should be_valid_encoding
|
46
|
+
end
|
47
|
+
end
|
48
|
+
it "should replace all spaces to normal spaces" do
|
49
|
+
@output.clean.should == " \n \n "
|
50
|
+
end
|
51
|
+
end
|
52
|
+
describe "with various no-width characters" do
|
53
|
+
before do
|
54
|
+
@input = [
|
55
|
+
0x200B, # ZERO WIDTH SPACE
|
56
|
+
0x200C, # ZERO WIDTH NON-JOINER
|
57
|
+
0x200D, # ZERO WIDTH JOINER
|
58
|
+
0x2060, # WORD JOINER
|
59
|
+
0xFEFF, # ZERO WIDTH NO-BREAK SPACE
|
60
|
+
].flatten.collect{ |e| [e].pack 'U*' }
|
61
|
+
@output = @input.join.clean
|
133
62
|
end
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
@input = ""
|
140
|
-
(0..255).each{|i| @input << i.chr}
|
141
|
-
@output = @input.clean
|
63
|
+
if RUBY_VERSION.to_f>1.9
|
64
|
+
it "should output a valid UTF-8 string" do
|
65
|
+
@output.encoding.name.should == "UTF-8"
|
66
|
+
@output.should be_valid_encoding
|
67
|
+
end
|
142
68
|
end
|
143
|
-
it "should
|
144
|
-
@output.should == "
|
69
|
+
it "should remove no-width characters" do
|
70
|
+
@output.should == ""
|
145
71
|
end
|
146
72
|
end
|
147
73
|
describe "with invalid UTF-8 sequence" do
|
@@ -149,6 +75,12 @@ describe String::Cleaner do
|
|
149
75
|
@input = "\210\004"
|
150
76
|
@output = @input.clean
|
151
77
|
end
|
78
|
+
if RUBY_VERSION.to_f>1.9
|
79
|
+
it "should output a valid UTF-8 string" do
|
80
|
+
@output.encoding.name.should == "UTF-8"
|
81
|
+
@output.should be_valid_encoding
|
82
|
+
end
|
83
|
+
end
|
152
84
|
it "should replace invisible chars by space" do
|
153
85
|
@output.should == " "
|
154
86
|
end
|
@@ -158,6 +90,12 @@ describe String::Cleaner do
|
|
158
90
|
@input = "a?^?\xddf"
|
159
91
|
@output = @input.clean
|
160
92
|
end
|
93
|
+
if RUBY_VERSION.to_f>1.9
|
94
|
+
it "should output a valid UTF-8 string" do
|
95
|
+
@output.encoding.name.should == "UTF-8"
|
96
|
+
@output.should be_valid_encoding
|
97
|
+
end
|
98
|
+
end
|
161
99
|
it "should keep the valid characters" do
|
162
100
|
@output.should == "a?^?Ýf"
|
163
101
|
end
|
@@ -167,6 +105,12 @@ describe String::Cleaner do
|
|
167
105
|
@input = "\n\t\r\r\n\v\n"
|
168
106
|
@output = @input.clean
|
169
107
|
end
|
108
|
+
if RUBY_VERSION.to_f>1.9
|
109
|
+
it "should output a valid UTF-8 string" do
|
110
|
+
@output.encoding.name.should == "UTF-8"
|
111
|
+
@output.should be_valid_encoding
|
112
|
+
end
|
113
|
+
end
|
170
114
|
it "should replace invisible chars by space" do
|
171
115
|
@output.should == "\n \n\n \n"
|
172
116
|
end
|
@@ -176,36 +120,14 @@ describe String::Cleaner do
|
|
176
120
|
@input = "Here is \357\273\277a block \357\273\277of text \357\273\277inside of which a number will be hidden!"
|
177
121
|
@output = @input.clean
|
178
122
|
end
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
before :all do
|
185
|
-
@input = []
|
186
|
-
# "\uXXXX" doesn't exists yet on Ruby 1.8.6
|
187
|
-
@input << " " # SPACE
|
188
|
-
@input << "\xC2\xA0" # NO-BREAK SPACE
|
189
|
-
@input << "\xE2\x80\x80" # EN QUAD
|
190
|
-
@input << "\xE2\x80\x81" # EM QUAD
|
191
|
-
@input << "\xE2\x80\x82" # EN SPACE
|
192
|
-
@input << "\xE2\x80\x83" # EM SPACE
|
193
|
-
@input << "\xE2\x80\x84" # THREE-PER-EM SPACE
|
194
|
-
@input << "\xE2\x80\x85" # FOUR-PER-EM SPACE
|
195
|
-
@input << "\xE2\x80\x86" # SIX-PER-EM SPACE
|
196
|
-
@input << "\xE2\x80\x87" # FIGURE SPACE
|
197
|
-
@input << "\xE2\x80\x88" # PUNCTUATION SPACE
|
198
|
-
@input << "\xE2\x80\x89" # THIN SPACE
|
199
|
-
@input << "\xE2\x80\x8A" # HAIR SPACE
|
200
|
-
@input << "\xE2\x80\x8B" # ZERO WIDTH SPACE
|
201
|
-
@input << "\xE2\x80\xAF" # NARROW NO-BREAK SPACE
|
202
|
-
@input << "\xE2\x81\x9F" # MEDIUM MATHEMATICAL SPACE
|
203
|
-
@input << "\xE3\x80\x80" # IDEOGRAPHIC SPACE
|
204
|
-
@input << "\xEF\xBB\xBF" # ZERO WIDTH NO-BREAK SPACE
|
205
|
-
@output = @input.join.clean
|
123
|
+
if RUBY_VERSION.to_f>1.9
|
124
|
+
it "should output a valid UTF-8 string" do
|
125
|
+
@output.encoding.name.should == "UTF-8"
|
126
|
+
@output.should be_valid_encoding
|
127
|
+
end
|
206
128
|
end
|
207
129
|
it "should replace invisible chars by space" do
|
208
|
-
@output.should == " "
|
130
|
+
@output.should == "Here is a block of text inside of which a number will be hidden!"
|
209
131
|
end
|
210
132
|
end
|
211
133
|
describe "with euro sign from both ISO 8859-15 or Windows-1252" do
|
@@ -213,6 +135,12 @@ describe String::Cleaner do
|
|
213
135
|
@input = "\x80\xA4"
|
214
136
|
@output = @input.clean
|
215
137
|
end
|
138
|
+
if RUBY_VERSION.to_f>1.9
|
139
|
+
it "should output a valid UTF-8 string" do
|
140
|
+
@output.encoding.name.should == "UTF-8"
|
141
|
+
@output.should be_valid_encoding
|
142
|
+
end
|
143
|
+
end
|
216
144
|
it "should replace invisible chars by space" do
|
217
145
|
@output.should == "€€"
|
218
146
|
end
|
data/string_cleaner.gemspec
CHANGED
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: string_cleaner
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
+
hash: 19
|
4
5
|
prerelease: false
|
5
6
|
segments:
|
6
7
|
- 0
|
7
8
|
- 2
|
8
|
-
-
|
9
|
-
version: 0.2.
|
9
|
+
- 2
|
10
|
+
version: 0.2.2
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- Joseph Halter
|
@@ -25,6 +26,7 @@ dependencies:
|
|
25
26
|
requirements:
|
26
27
|
- - ">="
|
27
28
|
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
28
30
|
segments:
|
29
31
|
- 0
|
30
32
|
version: "0"
|
@@ -38,6 +40,7 @@ dependencies:
|
|
38
40
|
requirements:
|
39
41
|
- - ">="
|
40
42
|
- !ruby/object:Gem::Version
|
43
|
+
hash: 3
|
41
44
|
segments:
|
42
45
|
- 0
|
43
46
|
version: "0"
|
@@ -75,6 +78,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
75
78
|
requirements:
|
76
79
|
- - ">="
|
77
80
|
- !ruby/object:Gem::Version
|
81
|
+
hash: 3
|
78
82
|
segments:
|
79
83
|
- 0
|
80
84
|
version: "0"
|
@@ -83,6 +87,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
83
87
|
requirements:
|
84
88
|
- - ">="
|
85
89
|
- !ruby/object:Gem::Version
|
90
|
+
hash: 3
|
86
91
|
segments:
|
87
92
|
- 0
|
88
93
|
version: "0"
|