string_cleaner 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Joseph Halter
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,27 @@
1
+ = string_cleaner
2
+
3
+ Adds a .clean method to String which will:
4
+ * convert invalid UTF-8 strings from ISO-8859-15 to UTF-8 to fix them
5
+ * recognize euro char from both ISO 8859-15 and Windows-1252
6
+ * replace \r\n and \r with \n normalizing end of lines
7
+ * replace control characters and other invisible chars with spaces
8
+
9
+ == Install
10
+
11
+ sudo gem install JosephHalter-string_cleaner
12
+
13
+ == Ruby 1.9+
14
+
15
+ Ruby 1.9+ has native support for Unicode and the specs are 100% passing.
16
+
17
+ == Ruby 1.8.x
18
+
19
+ Because Ruby 1.8.x has no native support for Unicode, you must install oniguruma and the jasherai-oniguruma gem.
20
+
21
+ == Example usage
22
+
23
+ "\210\004".clean # => "  " (two spaces: each invalid/control byte becomes one space)
24
+
25
+ == Copyright
26
+
27
+ Copyright (c) 2009 Joseph Halter. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,46 @@
require "rubygems"
require "rake"

# Packaging tasks are provided by Jeweler; they are skipped (with a hint)
# when Jeweler is not installed so the spec and rdoc tasks remain usable.
begin
  require "jeweler"
  Jeweler::Tasks.new do |gem|
    gem.name = "string_cleaner"
    # Keep in sync with the summary published in string_cleaner.gemspec.
    gem.summary = %Q{Fix invalid UTF-8 and wipe invisible chars, fully compatible with Ruby 1.8 & 1.9 with extensive specs}
    gem.email = "joseph@openhood.com"
    gem.homepage = "http://github.com/JosephHalter/string_cleaner"
    gem.authors = ["Joseph Halter"]

    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
rescue LoadError
  puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
end

# `rake spec` runs every spec file under spec/.
require "spec/rake/spectask"
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << "lib" << "spec"
  spec.spec_files = FileList["spec/**/*_spec.rb"]
end

# `rake rcov` runs the same specs with rcov coverage enabled.
Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << "lib" << "spec"
  spec.pattern = "spec/**/*_spec.rb"
  spec.rcov = true
end

task :default => :spec

require "rake/rdoctask"
Rake::RDocTask.new do |rdoc|
  # The project ships a plain-text VERSION file; VERSION.yml is still
  # honoured as a fallback for checkouts using the older YAML layout.
  version =
    if File.exist?("VERSION")
      File.read("VERSION").strip
    elsif File.exist?("VERSION.yml")
      require "yaml"
      config = YAML.load(File.read("VERSION.yml"))
      "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
    else
      ""
    end

  rdoc.rdoc_dir = "rdoc"
  rdoc.title = "string_cleaner #{version}"
  rdoc.rdoc_files.include("README*")
  rdoc.rdoc_files.include("lib/**/*.rb")
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,59 @@
# encoding: UTF-8

# Mixin (included into String at the bottom of this file) that adds
# String#clean.
module String::Cleaner

  # Returns the text repaired and normalized:
  #
  # * invalid UTF-8 is reinterpreted as ISO-8859-15 and converted to UTF-8
  # * the euro sign is recognized from both ISO 8859-15 (0xA4) and
  #   Windows-1252 (0x80)
  # * "\r\n" and "\r" are replaced with "\n"
  # * control characters, Unicode separators and other invisible chars are
  #   replaced with spaces (newlines are kept)
  #
  # Side effect: the receiver itself is overwritten with the cleaned text.
  # A frozen receiver cannot be overwritten, so in that case it is left
  # untouched and only the cleaned copy is returned.
  def clean
    utf8 = dup
    if utf8.respond_to?(:force_encoding)

      # for Ruby 1.9+
      utf8.force_encoding("UTF-8")
      unless utf8.valid_encoding? # if invalid UTF-8
        utf8.force_encoding("ISO8859-15")
        utf8.encode!("UTF-8", :invalid => :replace, :undef => :replace, :replace => "")
        utf8.gsub!("\xC2\x80", "€") # special case for euro sign from Windows-1252
      end

      # normalize end of lines
      utf8.gsub!(/\r\n?/u, "\n")

      # normalize invisible chars line by line so that "\n" itself survives;
      # the -1 limit keeps trailing empty lines.
      # \s only matches ASCII whitespace on Ruby 1.9.2+, so \p{Z} is needed
      # for NO-BREAK SPACE, EN QUAD, IDEOGRAPHIC SPACE, etc.
      lines = utf8.split(/\n/u, -1).map { |line| line.gsub(/[\s\p{Z}\p{C}]/u, " ") }
      utf8 = lines.join("\n")
      utf8.force_encoding("UTF-8") # an empty join would otherwise be US-ASCII
    else

      # for Ruby 1.8.6, use iconv
      require "iconv"
      utf8 << " " # sentinel so split keeps trailing empty lines; chopped below
      utf8 = begin
        Iconv.new("UTF-8", "UTF-8").iconv(utf8)
      rescue
        # not valid UTF-8: map the Windows-1252 euro byte onto the
        # ISO-8859-15 one, then convert from ISO-8859-15
        utf8.gsub!(/\x80/n, "\xA4")
        Iconv.new("UTF-8//IGNORE", "ISO8859-15").iconv(utf8)
      end

      # normalize end of lines
      utf8.gsub!(/\r\n/n, "\n")
      utf8.gsub!(/\r/n, "\n")

      # normalize invisible chars using oniguruma (regexp built once, not per line)
      require "oniguruma"
      invisible = Oniguruma::ORegexp.new("[\\s\\p{C}]", {:encoding => Oniguruma::ENCODING_UTF8})
      utf8 = utf8.split(/\n/n).collect { |line| invisible.gsub(line, " ") }.join("\n").chop
    end

    return utf8 if frozen?
    replace(utf8)
  end

end

class String
  include String::Cleaner
end
@@ -0,0 +1,3 @@
# Bootstrap shared by the spec files. RubyGems is loaded first so that the
# "spec" gem can be located even when RubyGems is not preloaded (Ruby 1.8),
# then the library under test is required relative to this file.
require "rubygems"
require "spec"
require File.join(File.dirname(__FILE__), "..", "lib", "string_cleaner")
@@ -0,0 +1,204 @@
# encoding: UTF-8
require File.dirname(__FILE__) + "/spec_helper"

# Encoding expectations only apply where String is encoding-aware
# (Ruby 1.9+); on Ruby 1.8 this shared group contributes no examples.
# Every including group must set @output in a before block.
shared_examples_for "a cleaned string" do
  if "".respond_to?(:force_encoding)
    it "should output a valid UTF-8 string" do
      @output.encoding.name.should == "UTF-8"
      @output.should be_valid_encoding
    end
  end
end

# The same scenarios run on Ruby 1.8 and 1.9+. Inputs are written with byte
# escapes because "\uXXXX" does not exist on Ruby 1.8, and runs of spaces in
# expected values are spelled as " " * n so their width is unambiguous.
describe String::Cleaner do
  describe "with all 8-bit characters" do
    before :all do
      @input = (0..255).collect { |i| i.chr }.join
      @input.force_encoding("UTF-8") if @input.respond_to?(:force_encoding)
      @output = @input.clean
    end
    it_should_behave_like "a cleaned string"
    it "should wipe out the control characters" do
      # 0x00-0x09, "\n", 0x0B-0x0C, "\r" (normalized to "\n"), 0x0E-0x20
      expected = " " * 10 + "\n" + " " * 2 + "\n" + " " * 19
      expected << "!\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"
      # DEL, Windows-1252 euro (0x80), then 0x81-0x9F and NO-BREAK SPACE
      expected << " €" + " " * 32
      # 0xAD (SOFT HYPHEN) is invisible and becomes the space after "¬"
      expected << "¡¢£€¥Š§š©ª«¬ ®¯°±²³Žµ¶·ž¹º»ŒœŸ¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
      @output.should == expected
    end
  end

  describe "with invalid UTF-8 sequence" do
    before :all do
      @input = "\210\004"
      @output = @input.clean
    end
    it_should_behave_like "a cleaned string"
    it "should replace invisible chars by space" do
      @output.should == " " * 2
    end
  end

  describe "with mixed valid and invalid characters" do
    before :all do
      @input = "a?^?\xddf"
      @output = @input.clean
    end
    it_should_behave_like "a cleaned string"
    it "should keep the valid characters" do
      @output.should == "a?^?Ýf"
    end
  end

  describe "with already valid characters" do
    before :all do
      @input = "\n\t\r\r\n\v\n"
      @output = @input.clean
    end
    it_should_behave_like "a cleaned string"
    it "should replace invisible chars by space" do
      @output.should == "\n \n\n \n"
    end
  end

  describe "with watermarked text" do
    before :all do
      @input = "Here is \357\273\277a block \357\273\277of text \357\273\277inside of which a number will be hidden!"
      @output = @input.clean
    end
    it_should_behave_like "a cleaned string"
    it "should replace invisible chars by space" do
      # each hidden U+FEFF becomes one extra space after the existing one
      @output.should == ["Here is ", "a block ", "of text ", "inside of which a number will be hidden!"].join(" ")
    end
  end

  describe "with unusual valid spaces" do
    before :all do
      @input = []
      @input << " "            # SPACE
      @input << "\xC2\xA0"     # NO-BREAK SPACE
      @input << "\xE2\x80\x80" # EN QUAD
      @input << "\xE2\x80\x81" # EM QUAD
      @input << "\xE2\x80\x82" # EN SPACE
      @input << "\xE2\x80\x83" # EM SPACE
      @input << "\xE2\x80\x84" # THREE-PER-EM SPACE
      @input << "\xE2\x80\x85" # FOUR-PER-EM SPACE
      @input << "\xE2\x80\x86" # SIX-PER-EM SPACE
      @input << "\xE2\x80\x87" # FIGURE SPACE
      @input << "\xE2\x80\x88" # PUNCTUATION SPACE
      @input << "\xE2\x80\x89" # THIN SPACE
      @input << "\xE2\x80\x8A" # HAIR SPACE
      @input << "\xE2\x80\x8B" # ZERO WIDTH SPACE
      @input << "\xE2\x80\xAF" # NARROW NO-BREAK SPACE
      @input << "\xE2\x81\x9F" # MEDIUM MATHEMATICAL SPACE
      @input << "\xE3\x80\x80" # IDEOGRAPHIC SPACE
      @input << "\xEF\xBB\xBF" # ZERO WIDTH NO-BREAK SPACE
      @output = @input.join.clean
    end
    it_should_behave_like "a cleaned string"
    it "should replace invisible chars by space" do
      @output.should == " " * @input.size
    end
  end

  describe "with euro sign from both ISO 8859-15 or Windows-1252" do
    before :all do
      @input = "\x80\xA4"
      @output = @input.clean
    end
    it_should_behave_like "a cleaned string"
    it "should convert both bytes to the euro sign" do
      @output.should == "€€"
    end
  end
end
@@ -0,0 +1,46 @@
# -*- encoding: utf-8 -*-

# Gem specification for string_cleaner 0.1.0.
# Setters that not every RubyGems release provides are guarded with
# respond_to? so the file loads on old and new RubyGems alike.
Gem::Specification.new do |s|
  s.name = %q{string_cleaner}
  s.version = "0.1.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Joseph Halter"]
  s.date = %q{2009-08-26}
  s.email = %q{joseph@openhood.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".gitignore",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "lib/string_cleaner.rb",
    "spec/spec_helper.rb",
    "spec/string_cleaner_spec.rb",
    "string_cleaner.gemspec"
  ]
  # has_rdoc= is deprecated in later RubyGems releases; only set it where it exists.
  s.has_rdoc = true if s.respond_to? :has_rdoc=
  s.homepage = %q{http://github.com/JosephHalter/string_cleaner}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.1}
  s.summary = %q{Fix invalid UTF-8 and wipe invisible chars, fully compatible with Ruby 1.8 & 1.9 with extensive specs}
  s.test_files = [
    "spec/spec_helper.rb",
    "spec/string_cleaner_spec.rb"
  ]

  # The gem declares no dependencies, so the generated (and empty)
  # RubyGems-version conditional was dropped: it did nothing except reference
  # Gem::RubyGemsVersion, which current RubyGems no longer defines.
  s.specification_version = 2 if s.respond_to? :specification_version
end
metadata ADDED
@@ -0,0 +1,77 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: string_cleaner
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Joseph Halter
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2009-08-26 00:00:00 +02:00
19
+ default_executable:
20
+ dependencies: []
21
+
22
+ description:
23
+ email: joseph@openhood.com
24
+ executables: []
25
+
26
+ extensions: []
27
+
28
+ extra_rdoc_files:
29
+ - LICENSE
30
+ - README.rdoc
31
+ files:
32
+ - .gitignore
33
+ - LICENSE
34
+ - README.rdoc
35
+ - Rakefile
36
+ - VERSION
37
+ - lib/string_cleaner.rb
38
+ - spec/spec_helper.rb
39
+ - spec/string_cleaner_spec.rb
40
+ - string_cleaner.gemspec
41
+ has_rdoc: true
42
+ homepage: http://github.com/JosephHalter/string_cleaner
43
+ licenses: []
44
+
45
+ post_install_message:
46
+ rdoc_options:
47
+ - --charset=UTF-8
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ hash: 3
56
+ segments:
57
+ - 0
58
+ version: "0"
59
+ required_rubygems_version: !ruby/object:Gem::Requirement
60
+ none: false
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ hash: 3
65
+ segments:
66
+ - 0
67
+ version: "0"
68
+ requirements: []
69
+
70
+ rubyforge_project:
71
+ rubygems_version: 1.3.7
72
+ signing_key:
73
+ specification_version: 2
74
+ summary: Fix invalid UTF-8 and wipe invisible chars, fully compatible with Ruby 1.8 & 1.9 with extensive specs
75
+ test_files:
76
+ - spec/spec_helper.rb
77
+ - spec/string_cleaner_spec.rb