string_cleaner 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Joseph Halter
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,27 @@
1
+ = string_cleaner
2
+
3
+ Adds a .clean method to String that will:
4
+ * convert invalid UTF-8 strings from ISO-8859-15 to UTF-8 to fix them
5
+ * recognize euro char from both ISO 8859-15 and Windows-1252
6
+ * replace \r\n and \r with \n normalizing end of lines
7
+ * replace control characters and other invisible chars by spaces
8
+
9
+ == Install
10
+
11
+ sudo gem install JosephHalter-string_cleaner
12
+
13
+ == Ruby 1.9+
14
+
15
+ Ruby 1.9+ has native support for Unicode, and all specs pass.
16
+
17
+ == Ruby 1.8.x
18
+
19
+ Because Ruby 1.8.x has no native support for Unicode, you must install oniguruma and the jasherai-oniguruma gem.
20
+
21
+ == Example usage
22
+
23
+ "\210\004".clean # => "  " (two spaces: one per invisible char)
24
+
25
+ == Copyright
26
+
27
+ Copyright (c) 2009 Joseph Halter. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,46 @@
1
# Rake tasks for the string_cleaner gem: packaging (via Jeweler, optional),
# specs, rcov coverage and rdoc generation.
require "rubygems"
require "rake"
require "yaml" # the rdoc task below parses VERSION.yml with YAML

# Packaging tasks are optional: when Jeweler is missing we only print a hint
# so that the remaining tasks stay usable.
begin
  require "jeweler"
  Jeweler::Tasks.new do |gem|
    gem.name = "string_cleaner"
    gem.summary = %Q{Fix invalid UTF-8 and wipe invisible chars, fully compatible with Ruby 1.8 & 1.9 with extensive specs}
    gem.email = "joseph@openhood.com"
    gem.homepage = "http://github.com/JosephHalter/string_cleaner"
    gem.authors = ["Joseph Halter"]

    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
rescue LoadError
  puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
end

# `rake spec` runs every *_spec.rb file under spec/.
require "spec/rake/spectask"
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << "lib" << "spec"
  spec.spec_files = FileList["spec/**/*_spec.rb"]
end

# `rake rcov` runs the same specs with rcov enabled.
Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << "lib" << "spec"
  spec.pattern = "spec/**/*_spec.rb"
  spec.rcov = true
end

task :default => :spec

# `rake rdoc` builds the API documentation into rdoc/.
require "rake/rdoctask"
Rake::RDocTask.new do |rdoc|
  # The version is stored either in VERSION.yml (major/minor/patch keys) or
  # in a plain-text VERSION file; fall back to an empty string when neither
  # file is present.
  version =
    if File.exist?("VERSION.yml")
      config = YAML.load(File.read("VERSION.yml"))
      "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
    elsif File.exist?("VERSION")
      File.read("VERSION").strip
    else
      ""
    end

  rdoc.rdoc_dir = "rdoc"
  rdoc.title = "string_cleaner #{version}"
  rdoc.rdoc_files.include("README*")
  rdoc.rdoc_files.include("lib/**/*.rb")
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,59 @@
1
+ # encoding: UTF-8
2
module String::Cleaner

  # Returns the string cleaned up as valid UTF-8:
  #
  # * when the bytes are not valid UTF-8 they are re-interpreted as
  #   ISO-8859-15 and converted to UTF-8
  # * the euro sign is recognized from both ISO 8859-15 (0xA4) and
  #   Windows-1252 (0x80)
  # * \r\n and \r are replaced with \n, normalizing end of lines
  # * control characters, Unicode spaces and other invisible chars are
  #   replaced by a plain space (line feeds are kept)
  #
  # The receiver itself is also replaced with the cleaned content, as in
  # previous behaviour, unless it is frozen; a frozen receiver is left
  # untouched and only the cleaned copy is returned.
  def clean
    utf8 = self.dup
    if utf8.respond_to?(:force_encoding)

      # for Ruby 1.9+
      utf8.force_encoding("UTF-8")
      unless utf8.valid_encoding? # if invalid UTF-8
        utf8.force_encoding("ISO8859-15")
        utf8.encode!("UTF-8", :invalid => :replace, :undef => :replace, :replace => "")
        utf8.gsub!("\xC2\x80", "€") # special case for euro sign from Windows-1252
        utf8.force_encoding("UTF-8")
      end

      # normalize end of lines
      utf8.gsub!(/\r\n/u, "\n")
      utf8.gsub!(/\r/u, "\n")

      # normalize invisible chars
      # The temporary trailing space keeps split from dropping trailing empty
      # lines; chop removes it again afterwards.
      # [[:space:]] is used instead of \s because \s only matches ASCII
      # whitespace, which would leave NO-BREAK SPACE and friends untouched.
      utf8 = (utf8 << " ").split(/\n/u).map { |line|
        line.gsub(/[[:space:]\p{C}]/u, " ")
      }.join("\n").chop
      utf8.force_encoding("UTF-8")
    else

      # for Ruby 1.8.6, use iconv
      require "iconv"
      utf8 << " " # same trailing-space trick as above, removed by chop below
      utf8 = begin
        Iconv.new("UTF-8", "UTF-8").iconv(utf8)
      rescue
        # not valid UTF-8: map the Windows-1252 euro byte to the ISO-8859-15
        # one, then convert from ISO-8859-15
        utf8.gsub!(/\x80/n, "\xA4")
        Iconv.new("UTF-8//IGNORE", "ISO8859-15").iconv(utf8)
      end

      # normalize end of lines
      utf8.gsub!(/\r\n/n, "\n")
      utf8.gsub!(/\r/n, "\n")

      # normalize invisible chars using oniguruma
      require "oniguruma"
      # built once instead of once per line
      invisible = Oniguruma::ORegexp.new("[\\s\\p{C}]", {:encoding => Oniguruma::ENCODING_UTF8})
      utf8 = utf8.split(/\n/n).collect { |line|
        invisible.gsub(line, " ")
      }.join("\n").chop
    end

    # A frozen receiver cannot be replaced in place: return the cleaned copy.
    frozen? ? utf8 : replace(utf8)
  end

end
class String
  include String::Cleaner
end
@@ -0,0 +1,3 @@
1
+ require "spec"
2
+ require "rubygems"
3
+ require File.join(File.dirname(__FILE__), "..", "lib", "string_cleaner")
@@ -0,0 +1,204 @@
1
+ # encoding: UTF-8
2
+ require File.dirname(__FILE__) + "/spec_helper"
3
+
4
+ describe String::Cleaner do
5
+ if "".respond_to?(:force_encoding)
6
+ # specs for Ruby 1.9+
7
+ describe "with all 8-bit characters" do
8
+ before :all do
9
+ @input = "".force_encoding("ISO8859-15")
10
+ (0..255).each{|i| @input << i.chr}
11
+ @input.force_encoding("UTF-8")
12
+ @output = @input.clean
13
+ end
14
+ it "should output a valid UTF-8 string" do
15
+ @output.encoding.name.should == "UTF-8"
16
+ @output.should be_valid_encoding
17
+ end
18
+ it "should wipe out the control characters" do
19
+ @output.should == " \n \n !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ € ¡¢£€¥Š§š©ª«¬ ®¯°±²³Žµ¶·ž¹º»ŒœŸ¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
20
+ end
21
+ end
22
+ describe "with invalid UTF-8 sequence" do
23
+ before :all do
24
+ @input = "\210\004"
25
+ @output = @input.clean
26
+ end
27
+ it "should output a valid UTF-8 string" do
28
+ @output.encoding.name.should == "UTF-8"
29
+ @output.should be_valid_encoding
30
+ end
31
+ it "should replace invisible chars by space" do
32
+ @output.should == " "
33
+ end
34
+ end
35
+ describe "with mixed valid and invalid characters" do
36
+ before :all do
37
+ @input = "a?^?\xddf"
38
+ @output = @input.clean
39
+ end
40
+ it "should output a valid UTF-8 string" do
41
+ @output.encoding.name.should == "UTF-8"
42
+ @output.should be_valid_encoding
43
+ end
44
+ it "should keep the valid characters" do
45
+ @output.should == "a?^?Ýf"
46
+ end
47
+ end
48
+ describe "with already valid characters" do
49
+ before :all do
50
+ @input = "\n\t\r\r\n\v\n"
51
+ @output = @input.clean
52
+ end
53
+ it "should output a valid UTF-8 string" do
54
+ @output.encoding.name.should == "UTF-8"
55
+ @output.should be_valid_encoding
56
+ end
57
+ it "should replace invisible chars by space" do
58
+ @output.should == "\n \n\n \n"
59
+ end
60
+ end
61
+ describe "with watermarked text" do
62
+ before :all do
63
+ @input = "Here is \357\273\277a block \357\273\277of text \357\273\277inside of which a number will be hidden!"
64
+ @output = @input.clean
65
+ end
66
+ it "should output a valid UTF-8 string" do
67
+ @output.encoding.name.should == "UTF-8"
68
+ @output.should be_valid_encoding
69
+ end
70
+ it "should replace invisible chars by space" do
71
+ @output.should == "Here is a block of text inside of which a number will be hidden!"
72
+ end
73
+ end
74
+ describe "with unusual valid spaces" do
75
+ before :all do
76
+ @input = []
77
+ @input << "\u0020" # SPACE
78
+ @input << "\u00A0" # NO-BREAK SPACE
79
+ @input << "\u2000" # EN QUAD
80
+ @input << "\u2001" # EM QUAD
81
+ @input << "\u2002" # EN SPACE
82
+ @input << "\u2003" # EM SPACE
83
+ @input << "\u2004" # THREE-PER-EM SPACE
84
+ @input << "\u2005" # FOUR-PER-EM SPACE
85
+ @input << "\u2006" # SIX-PER-EM SPACE
86
+ @input << "\u2007" # FIGURE SPACE
87
+ @input << "\u2008" # PUNCTUATION SPACE
88
+ @input << "\u2009" # THIN SPACE
89
+ @input << "\u200A" # HAIR SPACE
90
+ @input << "\u200B" # ZERO WIDTH SPACE
91
+ @input << "\u202F" # NARROW NO-BREAK SPACE
92
+ @input << "\u205F" # MEDIUM MATHEMATICAL SPACE
93
+ @input << "\u3000" # IDEOGRAPHIC SPACE
94
+ @input << "\uFEFF" # ZERO WIDTH NO-BREAK SPACE
95
+ @output = @input.join.clean
96
+ end
97
+ it "should output a valid UTF-8 string" do
98
+ @output.encoding.name.should == "UTF-8"
99
+ @output.should be_valid_encoding
100
+ end
101
+ it "should replace invisible chars by space" do
102
+ @output.should == " "*@input.size
103
+ end
104
+ end
105
+ describe "with euro sign from both ISO 8859-15 or Windows-1252" do
106
+ before :all do
107
+ @input = "\x80\xA4"
108
+ @output = @input.clean
109
+ end
110
+ it "should output a valid UTF-8 string" do
111
+ @output.encoding.name.should == "UTF-8"
112
+ @output.should be_valid_encoding
113
+ end
114
+ it "should replace invisible chars by space" do
115
+ @output.should == "€€"
116
+ end
117
+ end
118
+ else
119
+ # specs for Ruby 1.8.6
120
+ describe "with all 8-bit characters" do
121
+ before :all do
122
+ @input = ""
123
+ (0..255).each{|i| @input << i.chr}
124
+ @output = @input.clean
125
+ end
126
+ it "should wipe out the control characters" do
127
+ @output.should == " \n \n !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ € ¡¢£€¥Š§š©ª«¬ ®¯°±²³Žµ¶·ž¹º»ŒœŸ¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
128
+ end
129
+ end
130
+ describe "with invalid UTF-8 sequence" do
131
+ before :all do
132
+ @input = "\210\004"
133
+ @output = @input.clean
134
+ end
135
+ it "should replace invisible chars by space" do
136
+ @output.should == " "
137
+ end
138
+ end
139
+ describe "with mixed valid and invalid characters" do
140
+ before :all do
141
+ @input = "a?^?\xddf"
142
+ @output = @input.clean
143
+ end
144
+ it "should keep the valid characters" do
145
+ @output.should == "a?^?Ýf"
146
+ end
147
+ end
148
+ describe "with already valid characters" do
149
+ before :all do
150
+ @input = "\n\t\r\r\n\v\n"
151
+ @output = @input.clean
152
+ end
153
+ it "should replace invisible chars by space" do
154
+ @output.should == "\n \n\n \n"
155
+ end
156
+ end
157
+ describe "with watermarked text" do
158
+ before :all do
159
+ @input = "Here is \357\273\277a block \357\273\277of text \357\273\277inside of which a number will be hidden!"
160
+ @output = @input.clean
161
+ end
162
+ it "should replace invisible chars by space" do
163
+ @output.should == "Here is a block of text inside of which a number will be hidden!"
164
+ end
165
+ end
166
+ describe "with unusual valid spaces" do
167
+ before :all do
168
+ @input = []
169
+ # "\uXXXX" doesn't exists yet on Ruby 1.8.6
170
+ @input << " " # SPACE
171
+ @input << "\xC2\xA0" # NO-BREAK SPACE
172
+ @input << "\xE2\x80\x80" # EN QUAD
173
+ @input << "\xE2\x80\x81" # EM QUAD
174
+ @input << "\xE2\x80\x82" # EN SPACE
175
+ @input << "\xE2\x80\x83" # EM SPACE
176
+ @input << "\xE2\x80\x84" # THREE-PER-EM SPACE
177
+ @input << "\xE2\x80\x85" # FOUR-PER-EM SPACE
178
+ @input << "\xE2\x80\x86" # SIX-PER-EM SPACE
179
+ @input << "\xE2\x80\x87" # FIGURE SPACE
180
+ @input << "\xE2\x80\x88" # PUNCTUATION SPACE
181
+ @input << "\xE2\x80\x89" # THIN SPACE
182
+ @input << "\xE2\x80\x8A" # HAIR SPACE
183
+ @input << "\xE2\x80\x8B" # ZERO WIDTH SPACE
184
+ @input << "\xE2\x80\xAF" # NARROW NO-BREAK SPACE
185
+ @input << "\xE2\x81\x9F" # MEDIUM MATHEMATICAL SPACE
186
+ @input << "\xE3\x80\x80" # IDEOGRAPHIC SPACE
187
+ @input << "\xEF\xBB\xBF" # ZERO WIDTH NO-BREAK SPACE
188
+ @output = @input.join.clean
189
+ end
190
+ it "should replace invisible chars by space" do
191
+ @output.should == " "*@input.size
192
+ end
193
+ end
194
+ describe "with euro sign from both ISO 8859-15 or Windows-1252" do
195
+ before :all do
196
+ @input = "\x80\xA4"
197
+ @output = @input.clean
198
+ end
199
+ it "should replace invisible chars by space" do
200
+ @output.should == "€€"
201
+ end
202
+ end
203
+ end
204
+ end
@@ -0,0 +1,46 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ Gem::Specification.new do |s|
4
+ s.name = %q{string_cleaner}
5
+ s.version = "0.1.0"
6
+
7
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
+ s.authors = ["Joseph Halter"]
9
+ s.date = %q{2009-08-26}
10
+ s.email = %q{joseph@openhood.com}
11
+ s.extra_rdoc_files = [
12
+ "LICENSE",
13
+ "README.rdoc"
14
+ ]
15
+ s.files = [
16
+ ".gitignore",
17
+ "LICENSE",
18
+ "README.rdoc",
19
+ "Rakefile",
20
+ "VERSION",
21
+ "lib/string_cleaner.rb",
22
+ "spec/spec_helper.rb",
23
+ "spec/string_cleaner_spec.rb",
24
+ "string_cleaner.gemspec"
25
+ ]
26
+ s.has_rdoc = true
27
+ s.homepage = %q{http://github.com/JosephHalter/string_cleaner}
28
+ s.rdoc_options = ["--charset=UTF-8"]
29
+ s.require_paths = ["lib"]
30
+ s.rubygems_version = %q{1.3.1}
31
+ s.summary = %q{Fix invalid UTF-8 and wipe invisible chars, fully compatible with Ruby 1.8 & 1.9 with extensive specs}
32
+ s.test_files = [
33
+ "spec/spec_helper.rb",
34
+ "spec/string_cleaner_spec.rb"
35
+ ]
36
+
37
+ if s.respond_to? :specification_version then
38
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
39
+ s.specification_version = 2
40
+
41
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
42
+ else
43
+ end
44
+ else
45
+ end
46
+ end
metadata ADDED
@@ -0,0 +1,77 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: string_cleaner
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Joseph Halter
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2009-08-26 00:00:00 +02:00
19
+ default_executable:
20
+ dependencies: []
21
+
22
+ description:
23
+ email: joseph@openhood.com
24
+ executables: []
25
+
26
+ extensions: []
27
+
28
+ extra_rdoc_files:
29
+ - LICENSE
30
+ - README.rdoc
31
+ files:
32
+ - .gitignore
33
+ - LICENSE
34
+ - README.rdoc
35
+ - Rakefile
36
+ - VERSION
37
+ - lib/string_cleaner.rb
38
+ - spec/spec_helper.rb
39
+ - spec/string_cleaner_spec.rb
40
+ - string_cleaner.gemspec
41
+ has_rdoc: true
42
+ homepage: http://github.com/JosephHalter/string_cleaner
43
+ licenses: []
44
+
45
+ post_install_message:
46
+ rdoc_options:
47
+ - --charset=UTF-8
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ hash: 3
56
+ segments:
57
+ - 0
58
+ version: "0"
59
+ required_rubygems_version: !ruby/object:Gem::Requirement
60
+ none: false
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ hash: 3
65
+ segments:
66
+ - 0
67
+ version: "0"
68
+ requirements: []
69
+
70
+ rubyforge_project:
71
+ rubygems_version: 1.3.7
72
+ signing_key:
73
+ specification_version: 2
74
+ summary: Fix invalid UTF-8 and wipe invisible chars, fully compatible with Ruby 1.8 & 1.9 with extensive specs
75
+ test_files:
76
+ - spec/spec_helper.rb
77
+ - spec/string_cleaner_spec.rb