string_cleaner 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +6 -3
- data/lib/string_cleaner.rb +26 -17
- data/spec/string_cleaner_spec.rb +83 -155
- data/string_cleaner.gemspec +1 -1
- metadata +7 -2
data/Rakefile
CHANGED
data/lib/string_cleaner.rb
CHANGED
@@ -9,13 +9,13 @@ module String::Cleaner
|
|
9
9
|
|
10
10
|
def fix_encoding
|
11
11
|
utf8 = dup
|
12
|
-
if utf8.respond_to?(:force_encoding)
|
12
|
+
if utf8.respond_to?(:force_encoding)
|
13
13
|
utf8.force_encoding("UTF-8") # for Ruby 1.9+
|
14
14
|
unless utf8.valid_encoding? # if invalid UTF-8
|
15
|
-
utf8 = utf8.force_encoding("ISO8859-
|
15
|
+
utf8 = utf8.force_encoding("ISO8859-1")
|
16
16
|
utf8.encode!("UTF-8", :invalid => :replace, :undef => :replace, :replace => "")
|
17
17
|
end
|
18
|
-
utf8.gsub!(
|
18
|
+
utf8.gsub!(/\u0080|¤/, "€") # special case for euro sign from Windows-1252
|
19
19
|
utf8
|
20
20
|
else
|
21
21
|
require "iconv"
|
@@ -34,31 +34,40 @@ module String::Cleaner
|
|
34
34
|
end
|
35
35
|
|
36
36
|
SPECIAL_SPACES = [
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
37
|
+
0x00A0, # NO-BREAK SPACE
|
38
|
+
0x1680, # OGHAM SPACE MARK
|
39
|
+
0x180E, # MONGOLIAN VOWEL SEPARATOR
|
40
|
+
(0x2000..0x200A).to_a, # EN QUAD..HAIR SPACE
|
41
|
+
0x2028, # LINE SEPARATOR
|
42
|
+
0x2029, # PARAGRAPH SEPARATOR
|
43
|
+
0x202F, # NARROW NO-BREAK SPACE
|
44
|
+
0x205F, # MEDIUM MATHEMATICAL SPACE
|
45
|
+
0x3000, # IDEOGRAPHIC SPACE
|
46
|
+
].flatten.collect{|e| [e].pack 'U*'}
|
47
|
+
|
48
|
+
ZERO_WIDTH = [
|
49
|
+
0x200B, # ZERO WIDTH SPACE
|
50
|
+
0x200C, # ZERO WIDTH NON-JOINER
|
51
|
+
0x200D, # ZERO WIDTH JOINER
|
52
|
+
0x2060, # WORD JOINER
|
53
|
+
0xFEFF, # ZERO WIDTH NO-BREAK SPACE
|
54
|
+
].flatten.collect{|e| [e].pack 'U*'}
|
47
55
|
|
48
56
|
def fix_invisible_chars
|
49
57
|
utf8 = self.dup
|
50
|
-
|
58
|
+
utf8.gsub!(Regexp.new(ZERO_WIDTH.join("|")), "")
|
59
|
+
utf8 = if utf8.respond_to?(:force_encoding)
|
51
60
|
utf8 = (utf8 << " ").split(/\n/u).each{|line|
|
52
61
|
line.gsub!(/[\s\p{C}]/u, " ")
|
53
62
|
}.join("\n").chop!
|
54
|
-
utf8.gsub!(Regexp.new(SPECIAL_SPACES.join("|")), " ")
|
55
|
-
utf8.force_encoding("UTF-8")
|
56
63
|
else
|
57
64
|
require "oniguruma"
|
58
65
|
utf8.split(/\n/n).collect{|line|
|
59
|
-
Oniguruma::ORegexp.new("[\\
|
66
|
+
Oniguruma::ORegexp.new("[\\p{C}]", {:encoding => Oniguruma::ENCODING_UTF8}).gsub(line, " ")
|
60
67
|
}.join("\n").chop!
|
61
68
|
end
|
69
|
+
utf8.gsub!(Regexp.new(SPECIAL_SPACES.join("|") + "|\s"), " ")
|
70
|
+
utf8
|
62
71
|
end
|
63
72
|
|
64
73
|
def trim(chars = "")
|
data/spec/string_cleaner_spec.rb
CHANGED
@@ -2,146 +2,72 @@
|
|
2
2
|
require File.dirname(__FILE__) + "/spec_helper"
|
3
3
|
|
4
4
|
describe String::Cleaner do
|
5
|
-
|
6
|
-
# specs for Ruby 1.9+
|
5
|
+
describe "#clean" do
|
7
6
|
describe "with all 8-bit characters" do
|
8
7
|
before :all do
|
9
|
-
@input = ""
|
8
|
+
@input = ""
|
9
|
+
@input.force_encoding("ISO8859-15") if @input.respond_to?(:force_encoding)
|
10
10
|
(0..255).each{|i| @input << i.chr}
|
11
|
-
@input.force_encoding("UTF-8")
|
11
|
+
@input.force_encoding("UTF-8") if @input.respond_to?(:force_encoding)
|
12
12
|
@output = @input.clean
|
13
13
|
end
|
14
|
-
|
15
|
-
|
16
|
-
|
14
|
+
if RUBY_VERSION.to_f>1.9
|
15
|
+
it "should output a valid UTF-8 string" do
|
16
|
+
@output.encoding.name.should == "UTF-8"
|
17
|
+
@output.should be_valid_encoding
|
18
|
+
end
|
17
19
|
end
|
18
20
|
it "should wipe out the control characters" do
|
19
|
-
@output.should == " \n \n !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ €
|
20
|
-
end
|
21
|
-
end
|
22
|
-
it "should convert all type of spaces to normal spaces" do
|
23
|
-
input = [
|
24
|
-
(0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D>
|
25
|
-
0x0020, # White_Space # Zs SPACE
|
26
|
-
0x0085, # White_Space # Cc <control-0085>
|
27
|
-
0x00A0, # White_Space # Zs NO-BREAK SPACE
|
28
|
-
0x1680, # White_Space # Zs OGHAM SPACE MARK
|
29
|
-
0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
|
30
|
-
(0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
|
31
|
-
0x2028, # White_Space # Zl LINE SEPARATOR
|
32
|
-
0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
|
33
|
-
0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
|
34
|
-
0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
|
35
|
-
0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
|
36
|
-
].flatten.collect{ |e| [e].pack 'U*' }
|
37
|
-
input.join.clean.should == " \n \n "
|
38
|
-
end
|
39
|
-
describe "with invalid UTF-8 sequence" do
|
40
|
-
before :all do
|
41
|
-
@input = "\210\004"
|
42
|
-
@output = @input.clean
|
43
|
-
end
|
44
|
-
it "should output a valid UTF-8 string" do
|
45
|
-
@output.encoding.name.should == "UTF-8"
|
46
|
-
@output.should be_valid_encoding
|
47
|
-
end
|
48
|
-
it "should replace invisible chars by space" do
|
49
|
-
@output.should == " "
|
50
|
-
end
|
51
|
-
end
|
52
|
-
describe "with mixed valid and invalid characters" do
|
53
|
-
before :all do
|
54
|
-
@input = "a?^?\xddf"
|
55
|
-
@output = @input.clean
|
56
|
-
end
|
57
|
-
it "should output a valid UTF-8 string" do
|
58
|
-
@output.encoding.name.should == "UTF-8"
|
59
|
-
@output.should be_valid_encoding
|
60
|
-
end
|
61
|
-
it "should keep the valid characters" do
|
62
|
-
@output.should == "a?^?Ýf"
|
63
|
-
end
|
64
|
-
end
|
65
|
-
describe "with already valid characters" do
|
66
|
-
before :all do
|
67
|
-
@input = "\n\t\r\r\n\v\n"
|
68
|
-
@output = @input.clean
|
69
|
-
end
|
70
|
-
it "should output a valid UTF-8 string" do
|
71
|
-
@output.encoding.name.should == "UTF-8"
|
72
|
-
@output.should be_valid_encoding
|
73
|
-
end
|
74
|
-
it "should replace invisible chars by space" do
|
75
|
-
@output.should == "\n \n\n \n"
|
76
|
-
end
|
77
|
-
end
|
78
|
-
describe "with watermarked text" do
|
79
|
-
before :all do
|
80
|
-
@input = "Here is \357\273\277a block \357\273\277of text \357\273\277inside of which a number will be hidden!"
|
81
|
-
@output = @input.clean
|
82
|
-
end
|
83
|
-
it "should output a valid UTF-8 string" do
|
84
|
-
@output.encoding.name.should == "UTF-8"
|
85
|
-
@output.should be_valid_encoding
|
86
|
-
end
|
87
|
-
it "should replace invisible chars by space" do
|
88
|
-
@output.should == "Here is a block of text inside of which a number will be hidden!"
|
21
|
+
@output.should == " \n \n !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ € ¡¢£€¥¦§¨©ª«¬ ®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
|
89
22
|
end
|
90
23
|
end
|
91
|
-
describe "with
|
92
|
-
before
|
93
|
-
@input = [
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
@input << "\u200B" # ZERO WIDTH SPACE
|
108
|
-
@input << "\u202F" # NARROW NO-BREAK SPACE
|
109
|
-
@input << "\u205F" # MEDIUM MATHEMATICAL SPACE
|
110
|
-
@input << "\u3000" # IDEOGRAPHIC SPACE
|
111
|
-
@input << "\uFEFF" # ZERO WIDTH NO-BREAK SPACE
|
24
|
+
describe "with various type of spaces" do
|
25
|
+
before do
|
26
|
+
@input = [
|
27
|
+
(0x0009..0x000D).to_a, # <control-0009>..<control-000D>
|
28
|
+
0x0020, # SPACE
|
29
|
+
0x0085, # <control-0085>
|
30
|
+
0x00A0, # NO-BREAK SPACE
|
31
|
+
0x1680, # OGHAM SPACE MARK
|
32
|
+
0x180E, # MONGOLIAN VOWEL SEPARATOR
|
33
|
+
(0x2000..0x200A).to_a, # EN QUAD..HAIR SPACE
|
34
|
+
0x2028, # LINE SEPARATOR
|
35
|
+
0x2029, # PARAGRAPH SEPARATOR
|
36
|
+
0x202F, # NARROW NO-BREAK SPACE
|
37
|
+
0x205F, # MEDIUM MATHEMATICAL SPACE
|
38
|
+
0x3000, # IDEOGRAPHIC SPACE
|
39
|
+
].flatten.collect{ |e| [e].pack 'U*' }
|
112
40
|
@output = @input.join.clean
|
113
41
|
end
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
42
|
+
if RUBY_VERSION.to_f>1.9
|
43
|
+
it "should output a valid UTF-8 string" do
|
44
|
+
@output.encoding.name.should == "UTF-8"
|
45
|
+
@output.should be_valid_encoding
|
46
|
+
end
|
47
|
+
end
|
48
|
+
it "should replace all spaces to normal spaces" do
|
49
|
+
@output.clean.should == " \n \n "
|
50
|
+
end
|
51
|
+
end
|
52
|
+
describe "with various no-width characters" do
|
53
|
+
before do
|
54
|
+
@input = [
|
55
|
+
0x200B, # ZERO WIDTH SPACE
|
56
|
+
0x200C, # ZERO WIDTH NON-JOINER
|
57
|
+
0x200D, # ZERO WIDTH JOINER
|
58
|
+
0x2060, # WORD JOINER
|
59
|
+
0xFEFF, # ZERO WIDTH NO-BREAK SPACE
|
60
|
+
].flatten.collect{ |e| [e].pack 'U*' }
|
61
|
+
@output = @input.join.clean
|
133
62
|
end
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
@input = ""
|
140
|
-
(0..255).each{|i| @input << i.chr}
|
141
|
-
@output = @input.clean
|
63
|
+
if RUBY_VERSION.to_f>1.9
|
64
|
+
it "should output a valid UTF-8 string" do
|
65
|
+
@output.encoding.name.should == "UTF-8"
|
66
|
+
@output.should be_valid_encoding
|
67
|
+
end
|
142
68
|
end
|
143
|
-
it "should
|
144
|
-
@output.should == "
|
69
|
+
it "should remove no-width characters" do
|
70
|
+
@output.should == ""
|
145
71
|
end
|
146
72
|
end
|
147
73
|
describe "with invalid UTF-8 sequence" do
|
@@ -149,6 +75,12 @@ describe String::Cleaner do
|
|
149
75
|
@input = "\210\004"
|
150
76
|
@output = @input.clean
|
151
77
|
end
|
78
|
+
if RUBY_VERSION.to_f>1.9
|
79
|
+
it "should output a valid UTF-8 string" do
|
80
|
+
@output.encoding.name.should == "UTF-8"
|
81
|
+
@output.should be_valid_encoding
|
82
|
+
end
|
83
|
+
end
|
152
84
|
it "should replace invisible chars by space" do
|
153
85
|
@output.should == " "
|
154
86
|
end
|
@@ -158,6 +90,12 @@ describe String::Cleaner do
|
|
158
90
|
@input = "a?^?\xddf"
|
159
91
|
@output = @input.clean
|
160
92
|
end
|
93
|
+
if RUBY_VERSION.to_f>1.9
|
94
|
+
it "should output a valid UTF-8 string" do
|
95
|
+
@output.encoding.name.should == "UTF-8"
|
96
|
+
@output.should be_valid_encoding
|
97
|
+
end
|
98
|
+
end
|
161
99
|
it "should keep the valid characters" do
|
162
100
|
@output.should == "a?^?Ýf"
|
163
101
|
end
|
@@ -167,6 +105,12 @@ describe String::Cleaner do
|
|
167
105
|
@input = "\n\t\r\r\n\v\n"
|
168
106
|
@output = @input.clean
|
169
107
|
end
|
108
|
+
if RUBY_VERSION.to_f>1.9
|
109
|
+
it "should output a valid UTF-8 string" do
|
110
|
+
@output.encoding.name.should == "UTF-8"
|
111
|
+
@output.should be_valid_encoding
|
112
|
+
end
|
113
|
+
end
|
170
114
|
it "should replace invisible chars by space" do
|
171
115
|
@output.should == "\n \n\n \n"
|
172
116
|
end
|
@@ -176,36 +120,14 @@ describe String::Cleaner do
|
|
176
120
|
@input = "Here is \357\273\277a block \357\273\277of text \357\273\277inside of which a number will be hidden!"
|
177
121
|
@output = @input.clean
|
178
122
|
end
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
before :all do
|
185
|
-
@input = []
|
186
|
-
# "\uXXXX" doesn't exists yet on Ruby 1.8.6
|
187
|
-
@input << " " # SPACE
|
188
|
-
@input << "\xC2\xA0" # NO-BREAK SPACE
|
189
|
-
@input << "\xE2\x80\x80" # EN QUAD
|
190
|
-
@input << "\xE2\x80\x81" # EM QUAD
|
191
|
-
@input << "\xE2\x80\x82" # EN SPACE
|
192
|
-
@input << "\xE2\x80\x83" # EM SPACE
|
193
|
-
@input << "\xE2\x80\x84" # THREE-PER-EM SPACE
|
194
|
-
@input << "\xE2\x80\x85" # FOUR-PER-EM SPACE
|
195
|
-
@input << "\xE2\x80\x86" # SIX-PER-EM SPACE
|
196
|
-
@input << "\xE2\x80\x87" # FIGURE SPACE
|
197
|
-
@input << "\xE2\x80\x88" # PUNCTUATION SPACE
|
198
|
-
@input << "\xE2\x80\x89" # THIN SPACE
|
199
|
-
@input << "\xE2\x80\x8A" # HAIR SPACE
|
200
|
-
@input << "\xE2\x80\x8B" # ZERO WIDTH SPACE
|
201
|
-
@input << "\xE2\x80\xAF" # NARROW NO-BREAK SPACE
|
202
|
-
@input << "\xE2\x81\x9F" # MEDIUM MATHEMATICAL SPACE
|
203
|
-
@input << "\xE3\x80\x80" # IDEOGRAPHIC SPACE
|
204
|
-
@input << "\xEF\xBB\xBF" # ZERO WIDTH NO-BREAK SPACE
|
205
|
-
@output = @input.join.clean
|
123
|
+
if RUBY_VERSION.to_f>1.9
|
124
|
+
it "should output a valid UTF-8 string" do
|
125
|
+
@output.encoding.name.should == "UTF-8"
|
126
|
+
@output.should be_valid_encoding
|
127
|
+
end
|
206
128
|
end
|
207
129
|
it "should replace invisible chars by space" do
|
208
|
-
@output.should == " "
|
130
|
+
@output.should == "Here is a block of text inside of which a number will be hidden!"
|
209
131
|
end
|
210
132
|
end
|
211
133
|
describe "with euro sign from both ISO 8859-15 or Windows-1252" do
|
@@ -213,6 +135,12 @@ describe String::Cleaner do
|
|
213
135
|
@input = "\x80\xA4"
|
214
136
|
@output = @input.clean
|
215
137
|
end
|
138
|
+
if RUBY_VERSION.to_f>1.9
|
139
|
+
it "should output a valid UTF-8 string" do
|
140
|
+
@output.encoding.name.should == "UTF-8"
|
141
|
+
@output.should be_valid_encoding
|
142
|
+
end
|
143
|
+
end
|
216
144
|
it "should replace invisible chars by space" do
|
217
145
|
@output.should == "€€"
|
218
146
|
end
|
data/string_cleaner.gemspec
CHANGED
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: string_cleaner
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
+
hash: 19
|
4
5
|
prerelease: false
|
5
6
|
segments:
|
6
7
|
- 0
|
7
8
|
- 2
|
8
|
-
-
|
9
|
-
version: 0.2.
|
9
|
+
- 2
|
10
|
+
version: 0.2.2
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- Joseph Halter
|
@@ -25,6 +26,7 @@ dependencies:
|
|
25
26
|
requirements:
|
26
27
|
- - ">="
|
27
28
|
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
28
30
|
segments:
|
29
31
|
- 0
|
30
32
|
version: "0"
|
@@ -38,6 +40,7 @@ dependencies:
|
|
38
40
|
requirements:
|
39
41
|
- - ">="
|
40
42
|
- !ruby/object:Gem::Version
|
43
|
+
hash: 3
|
41
44
|
segments:
|
42
45
|
- 0
|
43
46
|
version: "0"
|
@@ -75,6 +78,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
75
78
|
requirements:
|
76
79
|
- - ">="
|
77
80
|
- !ruby/object:Gem::Version
|
81
|
+
hash: 3
|
78
82
|
segments:
|
79
83
|
- 0
|
80
84
|
version: "0"
|
@@ -83,6 +87,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
83
87
|
requirements:
|
84
88
|
- - ">="
|
85
89
|
- !ruby/object:Gem::Version
|
90
|
+
hash: 3
|
86
91
|
segments:
|
87
92
|
- 0
|
88
93
|
version: "0"
|