string_cleaner 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -1,6 +1,9 @@
1
- require 'rake/tasklib'
2
- require 'rake/rdoctask'
3
- require 'rubygems'
1
+ if RUBY_VERSION.to_f<1.9
2
+ require 'rake/tasklib'
3
+ require 'rake/rdoctask'
4
+ require 'rubygems'
5
+ end
6
+
4
7
  begin
5
8
  require 'bundler/setup'
6
9
  rescue LoadError
@@ -9,13 +9,13 @@ module String::Cleaner
9
9
 
10
10
  def fix_encoding
11
11
  utf8 = dup
12
- if utf8.respond_to?(:force_encoding)
12
+ if utf8.respond_to?(:force_encoding)
13
13
  utf8.force_encoding("UTF-8") # for Ruby 1.9+
14
14
  unless utf8.valid_encoding? # if invalid UTF-8
15
- utf8 = utf8.force_encoding("ISO8859-15")
15
+ utf8 = utf8.force_encoding("ISO8859-1")
16
16
  utf8.encode!("UTF-8", :invalid => :replace, :undef => :replace, :replace => "")
17
17
  end
18
- utf8.gsub!("\u0080", "€") # special case for euro sign from Windows-1252
18
+ utf8.gsub!(/\u0080|¤/, "€") # special case for euro sign from Windows-1252
19
19
  utf8
20
20
  else
21
21
  require "iconv"
@@ -34,31 +34,40 @@ module String::Cleaner
34
34
  end
35
35
 
36
36
  SPECIAL_SPACES = [
37
- 0x00A0, # White_Space # Zs NO-BREAK SPACE
38
- 0x1680, # White_Space # Zs OGHAM SPACE MARK
39
- 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
40
- (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
41
- 0x2028, # White_Space # Zl LINE SEPARATOR
42
- 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
43
- 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
44
- 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
45
- 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
46
- ].flatten.collect{|e| [e].pack 'U*'}
37
+ 0x00A0, # NO-BREAK SPACE
38
+ 0x1680, # OGHAM SPACE MARK
39
+ 0x180E, # MONGOLIAN VOWEL SEPARATOR
40
+ (0x2000..0x200A).to_a, # EN QUAD..HAIR SPACE
41
+ 0x2028, # LINE SEPARATOR
42
+ 0x2029, # PARAGRAPH SEPARATOR
43
+ 0x202F, # NARROW NO-BREAK SPACE
44
+ 0x205F, # MEDIUM MATHEMATICAL SPACE
45
+ 0x3000, # IDEOGRAPHIC SPACE
46
+ ].flatten.collect{|e| [e].pack 'U*'}
47
+
48
+ ZERO_WIDTH = [
49
+ 0x200B, # ZERO WIDTH SPACE
50
+ 0x200C, # ZERO WIDTH NON-JOINER
51
+ 0x200D, # ZERO WIDTH JOINER
52
+ 0x2060, # WORD JOINER
53
+ 0xFEFF, # ZERO WIDTH NO-BREAK SPACE
54
+ ].flatten.collect{|e| [e].pack 'U*'}
47
55
 
48
56
  def fix_invisible_chars
49
57
  utf8 = self.dup
50
- if utf8.respond_to?(:force_encoding)
58
+ utf8.gsub!(Regexp.new(ZERO_WIDTH.join("|")), "")
59
+ utf8 = if utf8.respond_to?(:force_encoding)
51
60
  utf8 = (utf8 << " ").split(/\n/u).each{|line|
52
61
  line.gsub!(/[\s\p{C}]/u, " ")
53
62
  }.join("\n").chop!
54
- utf8.gsub!(Regexp.new(SPECIAL_SPACES.join("|")), " ")
55
- utf8.force_encoding("UTF-8")
56
63
  else
57
64
  require "oniguruma"
58
65
  utf8.split(/\n/n).collect{|line|
59
- Oniguruma::ORegexp.new("[\\s\\p{C}]", {:encoding => Oniguruma::ENCODING_UTF8}).gsub(line, " ")
66
+ Oniguruma::ORegexp.new("[\\p{C}]", {:encoding => Oniguruma::ENCODING_UTF8}).gsub(line, " ")
60
67
  }.join("\n").chop!
61
68
  end
69
+ utf8.gsub!(Regexp.new(SPECIAL_SPACES.join("|") + "|\s"), " ")
70
+ utf8
62
71
  end
63
72
 
64
73
  def trim(chars = "")
@@ -2,146 +2,72 @@
2
2
  require File.dirname(__FILE__) + "/spec_helper"
3
3
 
4
4
  describe String::Cleaner do
5
- if "".respond_to?(:force_encoding)
6
- # specs for Ruby 1.9+
5
+ describe "#clean" do
7
6
  describe "with all 8-bit characters" do
8
7
  before :all do
9
- @input = "".force_encoding("ISO8859-15")
8
+ @input = ""
9
+ @input.force_encoding("ISO8859-15") if @input.respond_to?(:force_encoding)
10
10
  (0..255).each{|i| @input << i.chr}
11
- @input.force_encoding("UTF-8")
11
+ @input.force_encoding("UTF-8") if @input.respond_to?(:force_encoding)
12
12
  @output = @input.clean
13
13
  end
14
- it "should output a valid UTF-8 string" do
15
- @output.encoding.name.should == "UTF-8"
16
- @output.should be_valid_encoding
14
+ if RUBY_VERSION.to_f>1.9
15
+ it "should output a valid UTF-8 string" do
16
+ @output.encoding.name.should == "UTF-8"
17
+ @output.should be_valid_encoding
18
+ end
17
19
  end
18
20
  it "should wipe out the control characters" do
19
- @output.should == " \n \n !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ € ¡¢£€¥Š§š©ª«¬ ®¯°±²³Žµ¶·ž¹º»ŒœŸ¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
20
- end
21
- end
22
- it "should convert all type of spaces to normal spaces" do
23
- input = [
24
- (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D>
25
- 0x0020, # White_Space # Zs SPACE
26
- 0x0085, # White_Space # Cc <control-0085>
27
- 0x00A0, # White_Space # Zs NO-BREAK SPACE
28
- 0x1680, # White_Space # Zs OGHAM SPACE MARK
29
- 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
30
- (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
31
- 0x2028, # White_Space # Zl LINE SEPARATOR
32
- 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
33
- 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
34
- 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
35
- 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
36
- ].flatten.collect{ |e| [e].pack 'U*' }
37
- input.join.clean.should == " \n \n "
38
- end
39
- describe "with invalid UTF-8 sequence" do
40
- before :all do
41
- @input = "\210\004"
42
- @output = @input.clean
43
- end
44
- it "should output a valid UTF-8 string" do
45
- @output.encoding.name.should == "UTF-8"
46
- @output.should be_valid_encoding
47
- end
48
- it "should replace invisible chars by space" do
49
- @output.should == " "
50
- end
51
- end
52
- describe "with mixed valid and invalid characters" do
53
- before :all do
54
- @input = "a?^?\xddf"
55
- @output = @input.clean
56
- end
57
- it "should output a valid UTF-8 string" do
58
- @output.encoding.name.should == "UTF-8"
59
- @output.should be_valid_encoding
60
- end
61
- it "should keep the valid characters" do
62
- @output.should == "a?^?Ýf"
63
- end
64
- end
65
- describe "with already valid characters" do
66
- before :all do
67
- @input = "\n\t\r\r\n\v\n"
68
- @output = @input.clean
69
- end
70
- it "should output a valid UTF-8 string" do
71
- @output.encoding.name.should == "UTF-8"
72
- @output.should be_valid_encoding
73
- end
74
- it "should replace invisible chars by space" do
75
- @output.should == "\n \n\n \n"
76
- end
77
- end
78
- describe "with watermarked text" do
79
- before :all do
80
- @input = "Here is \357\273\277a block \357\273\277of text \357\273\277inside of which a number will be hidden!"
81
- @output = @input.clean
82
- end
83
- it "should output a valid UTF-8 string" do
84
- @output.encoding.name.should == "UTF-8"
85
- @output.should be_valid_encoding
86
- end
87
- it "should replace invisible chars by space" do
88
- @output.should == "Here is a block of text inside of which a number will be hidden!"
21
+ @output.should == " \n \n !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ € ¡¢£€¥¦§¨©ª«¬ ®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
89
22
  end
90
23
  end
91
- describe "with unusual valid spaces" do
92
- before :all do
93
- @input = []
94
- @input << "\u0020" # SPACE
95
- @input << "\u00A0" # NO-BREAK SPACE
96
- @input << "\u2000" # EN QUAD
97
- @input << "\u2001" # EM QUAD
98
- @input << "\u2002" # EN SPACE
99
- @input << "\u2003" # EM SPACE
100
- @input << "\u2004" # THREE-PER-EM SPACE
101
- @input << "\u2005" # FOUR-PER-EM SPACE
102
- @input << "\u2006" # SIX-PER-EM SPACE
103
- @input << "\u2007" # FIGURE SPACE
104
- @input << "\u2008" # PUNCTUATION SPACE
105
- @input << "\u2009" # THIN SPACE
106
- @input << "\u200A" # HAIR SPACE
107
- @input << "\u200B" # ZERO WIDTH SPACE
108
- @input << "\u202F" # NARROW NO-BREAK SPACE
109
- @input << "\u205F" # MEDIUM MATHEMATICAL SPACE
110
- @input << "\u3000" # IDEOGRAPHIC SPACE
111
- @input << "\uFEFF" # ZERO WIDTH NO-BREAK SPACE
24
+ describe "with various type of spaces" do
25
+ before do
26
+ @input = [
27
+ (0x0009..0x000D).to_a, # <control-0009>..<control-000D>
28
+ 0x0020, # SPACE
29
+ 0x0085, # <control-0085>
30
+ 0x00A0, # NO-BREAK SPACE
31
+ 0x1680, # OGHAM SPACE MARK
32
+ 0x180E, # MONGOLIAN VOWEL SEPARATOR
33
+ (0x2000..0x200A).to_a, # EN QUAD..HAIR SPACE
34
+ 0x2028, # LINE SEPARATOR
35
+ 0x2029, # PARAGRAPH SEPARATOR
36
+ 0x202F, # NARROW NO-BREAK SPACE
37
+ 0x205F, # MEDIUM MATHEMATICAL SPACE
38
+ 0x3000, # IDEOGRAPHIC SPACE
39
+ ].flatten.collect{ |e| [e].pack 'U*' }
112
40
  @output = @input.join.clean
113
41
  end
114
- it "should output a valid UTF-8 string" do
115
- @output.encoding.name.should == "UTF-8"
116
- @output.should be_valid_encoding
117
- end
118
- it "should replace invisible chars by space" do
119
- @output.should == " "*@input.size
120
- end
121
- end
122
- describe "with euro sign from both ISO 8859-15 or Windows-1252" do
123
- before :all do
124
- @input = "\x80\xA4"
125
- @output = @input.clean
126
- end
127
- it "should output a valid UTF-8 string" do
128
- @output.encoding.name.should == "UTF-8"
129
- @output.should be_valid_encoding
130
- end
131
- it "should replace invisible chars by space" do
132
- @output.should == "€€"
42
+ if RUBY_VERSION.to_f>1.9
43
+ it "should output a valid UTF-8 string" do
44
+ @output.encoding.name.should == "UTF-8"
45
+ @output.should be_valid_encoding
46
+ end
47
+ end
48
+ it "should replace all spaces to normal spaces" do
49
+ @output.clean.should == " \n \n "
50
+ end
51
+ end
52
+ describe "with various no-width characters" do
53
+ before do
54
+ @input = [
55
+ 0x200B, # ZERO WIDTH SPACE
56
+ 0x200C, # ZERO WIDTH NON-JOINER
57
+ 0x200D, # ZERO WIDTH JOINER
58
+ 0x2060, # WORD JOINER
59
+ 0xFEFF, # ZERO WIDTH NO-BREAK SPACE
60
+ ].flatten.collect{ |e| [e].pack 'U*' }
61
+ @output = @input.join.clean
133
62
  end
134
- end
135
- else
136
- # specs for Ruby 1.8.6
137
- describe "with all 8-bit characters" do
138
- before :all do
139
- @input = ""
140
- (0..255).each{|i| @input << i.chr}
141
- @output = @input.clean
63
+ if RUBY_VERSION.to_f>1.9
64
+ it "should output a valid UTF-8 string" do
65
+ @output.encoding.name.should == "UTF-8"
66
+ @output.should be_valid_encoding
67
+ end
142
68
  end
143
- it "should wipe out the control characters" do
144
- @output.should == " \n \n !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ € ¡¢£€¥¦§¨©ª«¬ ®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
69
+ it "should remove no-width characters" do
70
+ @output.should == ""
145
71
  end
146
72
  end
147
73
  describe "with invalid UTF-8 sequence" do
@@ -149,6 +75,12 @@ describe String::Cleaner do
149
75
  @input = "\210\004"
150
76
  @output = @input.clean
151
77
  end
78
+ if RUBY_VERSION.to_f>1.9
79
+ it "should output a valid UTF-8 string" do
80
+ @output.encoding.name.should == "UTF-8"
81
+ @output.should be_valid_encoding
82
+ end
83
+ end
152
84
  it "should replace invisible chars by space" do
153
85
  @output.should == " "
154
86
  end
@@ -158,6 +90,12 @@ describe String::Cleaner do
158
90
  @input = "a?^?\xddf"
159
91
  @output = @input.clean
160
92
  end
93
+ if RUBY_VERSION.to_f>1.9
94
+ it "should output a valid UTF-8 string" do
95
+ @output.encoding.name.should == "UTF-8"
96
+ @output.should be_valid_encoding
97
+ end
98
+ end
161
99
  it "should keep the valid characters" do
162
100
  @output.should == "a?^?Ýf"
163
101
  end
@@ -167,6 +105,12 @@ describe String::Cleaner do
167
105
  @input = "\n\t\r\r\n\v\n"
168
106
  @output = @input.clean
169
107
  end
108
+ if RUBY_VERSION.to_f>1.9
109
+ it "should output a valid UTF-8 string" do
110
+ @output.encoding.name.should == "UTF-8"
111
+ @output.should be_valid_encoding
112
+ end
113
+ end
170
114
  it "should replace invisible chars by space" do
171
115
  @output.should == "\n \n\n \n"
172
116
  end
@@ -176,36 +120,14 @@ describe String::Cleaner do
176
120
  @input = "Here is \357\273\277a block \357\273\277of text \357\273\277inside of which a number will be hidden!"
177
121
  @output = @input.clean
178
122
  end
179
- it "should replace invisible chars by space" do
180
- @output.should == "Here is a block of text inside of which a number will be hidden!"
181
- end
182
- end
183
- describe "with unusual valid spaces" do
184
- before :all do
185
- @input = []
186
- # "\uXXXX" doesn't exists yet on Ruby 1.8.6
187
- @input << " " # SPACE
188
- @input << "\xC2\xA0" # NO-BREAK SPACE
189
- @input << "\xE2\x80\x80" # EN QUAD
190
- @input << "\xE2\x80\x81" # EM QUAD
191
- @input << "\xE2\x80\x82" # EN SPACE
192
- @input << "\xE2\x80\x83" # EM SPACE
193
- @input << "\xE2\x80\x84" # THREE-PER-EM SPACE
194
- @input << "\xE2\x80\x85" # FOUR-PER-EM SPACE
195
- @input << "\xE2\x80\x86" # SIX-PER-EM SPACE
196
- @input << "\xE2\x80\x87" # FIGURE SPACE
197
- @input << "\xE2\x80\x88" # PUNCTUATION SPACE
198
- @input << "\xE2\x80\x89" # THIN SPACE
199
- @input << "\xE2\x80\x8A" # HAIR SPACE
200
- @input << "\xE2\x80\x8B" # ZERO WIDTH SPACE
201
- @input << "\xE2\x80\xAF" # NARROW NO-BREAK SPACE
202
- @input << "\xE2\x81\x9F" # MEDIUM MATHEMATICAL SPACE
203
- @input << "\xE3\x80\x80" # IDEOGRAPHIC SPACE
204
- @input << "\xEF\xBB\xBF" # ZERO WIDTH NO-BREAK SPACE
205
- @output = @input.join.clean
123
+ if RUBY_VERSION.to_f>1.9
124
+ it "should output a valid UTF-8 string" do
125
+ @output.encoding.name.should == "UTF-8"
126
+ @output.should be_valid_encoding
127
+ end
206
128
  end
207
129
  it "should replace invisible chars by space" do
208
- @output.should == " "*@input.size
130
+ @output.should == "Here is a block of text inside of which a number will be hidden!"
209
131
  end
210
132
  end
211
133
  describe "with euro sign from both ISO 8859-15 or Windows-1252" do
@@ -213,6 +135,12 @@ describe String::Cleaner do
213
135
  @input = "\x80\xA4"
214
136
  @output = @input.clean
215
137
  end
138
+ if RUBY_VERSION.to_f>1.9
139
+ it "should output a valid UTF-8 string" do
140
+ @output.encoding.name.should == "UTF-8"
141
+ @output.should be_valid_encoding
142
+ end
143
+ end
216
144
  it "should replace invisible chars by space" do
217
145
  @output.should == "€€"
218
146
  end
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{string_cleaner}
5
- s.version = "0.2.1"
5
+ s.version = "0.2.2"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Joseph Halter"]
metadata CHANGED
@@ -1,12 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: string_cleaner
3
3
  version: !ruby/object:Gem::Version
4
+ hash: 19
4
5
  prerelease: false
5
6
  segments:
6
7
  - 0
7
8
  - 2
8
- - 1
9
- version: 0.2.1
9
+ - 2
10
+ version: 0.2.2
10
11
  platform: ruby
11
12
  authors:
12
13
  - Joseph Halter
@@ -25,6 +26,7 @@ dependencies:
25
26
  requirements:
26
27
  - - ">="
27
28
  - !ruby/object:Gem::Version
29
+ hash: 3
28
30
  segments:
29
31
  - 0
30
32
  version: "0"
@@ -38,6 +40,7 @@ dependencies:
38
40
  requirements:
39
41
  - - ">="
40
42
  - !ruby/object:Gem::Version
43
+ hash: 3
41
44
  segments:
42
45
  - 0
43
46
  version: "0"
@@ -75,6 +78,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
75
78
  requirements:
76
79
  - - ">="
77
80
  - !ruby/object:Gem::Version
81
+ hash: 3
78
82
  segments:
79
83
  - 0
80
84
  version: "0"
@@ -83,6 +87,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
83
87
  requirements:
84
88
  - - ">="
85
89
  - !ruby/object:Gem::Version
90
+ hash: 3
86
91
  segments:
87
92
  - 0
88
93
  version: "0"