RubyGems - string_cleaner - Versions diffs - 0.1.0 - Mend

string_cleaner 0.1.0

Files changed (10) hide show

data/.gitignore +5 -0
data/LICENSE +20 -0
data/README.rdoc +27 -0
data/Rakefile +46 -0
data/VERSION +1 -0
data/lib/string_cleaner.rb +59 -0
data/spec/spec_helper.rb +3 -0
data/spec/string_cleaner_spec.rb +204 -0
data/string_cleaner.gemspec +46 -0
metadata +77 -0

data/.gitignore ADDED Viewed

@@ -0,0 +1,5 @@
+*.sw?
+.DS_Store
+coverage
+rdoc
+pkg

data/LICENSE ADDED Viewed

@@ -0,0 +1,20 @@
+Copyright (c) 2009 Joseph Halter
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.rdoc ADDED Viewed

@@ -0,0 +1,27 @@
+= string_cleaner
+Adds a .clean method to String which:
+* converts invalid UTF-8 strings from ISO-8859-15 to UTF-8 to fix them
+* recognizes the euro char from both ISO 8859-15 and Windows-1252
+* replaces \r\n and \r with \n, normalizing end of lines
+* replaces control characters and other invisible chars with spaces
+== Install
+  sudo gem install JosephHalter-string_cleaner
+== Ruby 1.9+
+Ruby 1.9+ has native support for Unicode, and the specs pass 100%.
+== Ruby 1.8.x
+Because Ruby 1.8.x has no native support for Unicode, you must install oniguruma and the jasherai-oniguruma gem.
+== Example usage
+  "\210\004".clean # => "  "
+== Copyright
+Copyright (c) 2009 Joseph Halter. See LICENSE for details.

data/Rakefile ADDED Viewed

@@ -0,0 +1,46 @@
require "rubygems"
require "rake"

# Gem packaging tasks. Jeweler is optional: when it is not installed we only
# print a hint and carry on so the spec and rdoc tasks remain usable.
begin
  require "jeweler"
  Jeweler::Tasks.new do |gem|
    gem.name = "string_cleaner"
    # Same summary as string_cleaner.gemspec (it used to be the generator's "TODO" placeholder).
    gem.summary = %Q{Fix invalid UTF-8 and wipe invisible chars, fully compatible with Ruby 1.8 & 1.9 with extensive specs}
    gem.email = "joseph@openhood.com"
    gem.homepage = "http://github.com/JosephHalter/string_cleaner"
    gem.authors = ["Joseph Halter"]
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
rescue LoadError
  puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
end

# rake spec: run every *_spec.rb file under spec/.
require "spec/rake/spectask"
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << "lib" << "spec"
  spec.spec_files = FileList["spec/**/*_spec.rb"]
end

# rake rcov: same specs, run through rcov.
Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << "lib" << "spec"
  spec.pattern = "spec/**/*_spec.rb"
  spec.rcov = true
end

task :default => :spec

# rake rdoc: build the API documentation into rdoc/.
require "rake/rdoctask"
Rake::RDocTask.new do |rdoc|
  # The gem ships a plain-text VERSION file; VERSION.yml is still honoured
  # for checkouts that use that layout.
  version =
    if File.exist?("VERSION")
      File.read("VERSION").strip
    elsif File.exist?("VERSION.yml")
      require "yaml"
      config = YAML.load(File.read("VERSION.yml"))
      "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
    else
      ""
    end
  rdoc.rdoc_dir = "rdoc"
  rdoc.title = "string_cleaner #{version}"
  rdoc.rdoc_files.include("README*")
  rdoc.rdoc_files.include("lib/**/*.rb")
end

data/VERSION ADDED Viewed

@@ -0,0 +1 @@
+0.1.0

data/lib/string_cleaner.rb ADDED Viewed

@@ -0,0 +1,59 @@
# encoding: UTF-8

# Mixin providing String#clean; it is included into String at the bottom of
# this file.
module String::Cleaner
  # Repairs and normalizes the receiver. The receiver is modified in place
  # (so it must not be frozen) and is also the return value.
  #
  # * invalid UTF-8 is reinterpreted as ISO-8859-15 and converted to UTF-8
  # * the euro sign is recognized from both ISO 8859-15 (0xA4) and
  #   Windows-1252 (0x80)
  # * \r\n and \r are replaced with \n, normalizing end of lines
  # * control characters, Unicode spaces and other invisible chars are
  #   replaced by a plain space; \n is the only one left untouched
  #
  # Returns self.
  def clean
    utf8 = dup
    if utf8.respond_to?(:force_encoding)
      # for Ruby 1.9+
      utf8.force_encoding("UTF-8")
      unless utf8.valid_encoding? # if invalid UTF-8
        utf8.force_encoding("ISO8859-15")
        # encode! leaves the string tagged as UTF-8
        utf8.encode!("UTF-8", :invalid => :replace, :undef => :replace, :replace => "")
        # byte 0x80 was converted to U+0080 ("\xC2\x80" in UTF-8), but in
        # Windows-1252 that byte is the euro sign
        utf8.gsub!("\xC2\x80", "€")
      end
      # normalize end of lines
      utf8.gsub!(/\r\n?/u, "\n")
      # normalize invisible chars: [[:space:]] is used rather than \s because
      # \s only matches ASCII whitespace here, which would leave NO-BREAK
      # SPACE, EM SPACE, IDEOGRAPHIC SPACE... untouched; \p{C} covers control
      # and format characters; the lookahead preserves line breaks
      utf8.gsub!(/(?!\n)[[:space:]\p{C}]/u, " ")
    else
      # for Ruby 1.8.6, use iconv
      require "iconv"
      # sentinel: split drops trailing empty lines, so a final "\n" would be
      # lost without it; it is removed again by chop! below
      utf8 << " "
      utf8 = begin
        Iconv.new("UTF-8", "UTF-8").iconv(utf8)
      rescue
        # not valid UTF-8: map the Windows-1252 euro byte onto the
        # ISO 8859-15 one, then convert from ISO 8859-15
        utf8.gsub!(/\x80/n, "\xA4")
        Iconv.new("UTF-8//IGNORE", "ISO8859-15").iconv(utf8)
      end
      # normalize end of lines
      utf8.gsub!(/\r\n?/n, "\n")
      # normalize invisible chars using oniguruma (regexp built once, not per line)
      require "oniguruma"
      invisible = Oniguruma::ORegexp.new("[\\s\\p{C}]", {:encoding => Oniguruma::ENCODING_UTF8})
      utf8 = utf8.split(/\n/n).collect { |line| invisible.gsub(line, " ") }.join("\n").chop!
    end
    replace(utf8)
  end
end

class String
  include String::Cleaner
end

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,3 @@
+require "spec"
+require "rubygems"
+require File.join(File.dirname(__FILE__), "..", "lib", "string_cleaner")

data/spec/string_cleaner_spec.rb ADDED Viewed

@@ -0,0 +1,204 @@
# encoding: UTF-8
require File.dirname(__FILE__) + "/spec_helper"

# Specs for String#clean. The same scenarios are written twice: once for
# Ruby 1.9+ (where the encoding of the result is also checked) and once for
# Ruby 1.8.6, which has no String#force_encoding and no "\uXXXX" escapes.
describe String::Cleaner do
  if "".respond_to?(:force_encoding)
    # specs for Ruby 1.9+
    describe "with all 8-bit characters" do
      before :all do
        @input = "".force_encoding("ISO8859-15")
        (0..255).each{|i| @input << i.chr}
        @input.force_encoding("UTF-8")
        @output = @input.clean
      end
      it "should output a valid UTF-8 string" do
        @output.encoding.name.should == "UTF-8"
        @output.should be_valid_encoding
      end
      it "should wipe out the control characters" do
        @output.should == "          \n  \n                   !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ €                                ¡¢£€¥Š§š©ª«¬ ®¯°±²³Žµ¶·ž¹º»ŒœŸ¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
      end
    end
    describe "with invalid UTF-8 sequence" do
      before :all do
        @input = "\210\004"
        @output = @input.clean
      end
      it "should output a valid UTF-8 string" do
        @output.encoding.name.should == "UTF-8"
        @output.should be_valid_encoding
      end
      it "should replace invisible chars by space" do
        @output.should == "  "
      end
    end
    describe "with mixed valid and invalid characters" do
      before :all do
        @input = "a?^?\xddf"
        @output = @input.clean
      end
      it "should output a valid UTF-8 string" do
        @output.encoding.name.should == "UTF-8"
        @output.should be_valid_encoding
      end
      it "should keep the valid characters" do
        @output.should == "a?^?Ýf"
      end
    end
    describe "with already valid characters" do
      before :all do
        @input = "\n\t\r\r\n\v\n"
        @output = @input.clean
      end
      it "should output a valid UTF-8 string" do
        @output.encoding.name.should == "UTF-8"
        @output.should be_valid_encoding
      end
      it "should replace invisible chars by space" do
        @output.should == "\n \n\n \n"
      end
    end
    describe "with watermarked text" do
      before :all do
        @input = "Here is \357\273\277a block \357\273\277of text \357\273\277inside of which a number will be hidden!"
        @output = @input.clean
      end
      it "should output a valid UTF-8 string" do
        @output.encoding.name.should == "UTF-8"
        @output.should be_valid_encoding
      end
      it "should replace invisible chars by space" do
        @output.should == "Here is  a block  of text  inside of which a number will be hidden!"
      end
    end
    describe "with unusual valid spaces" do
      before :all do
        @input = []
        @input << "\u0020" # SPACE
        @input << "\u00A0" # NO-BREAK SPACE
        @input << "\u2000" # EN QUAD
        @input << "\u2001" # EM QUAD
        @input << "\u2002" # EN SPACE
        @input << "\u2003" # EM SPACE
        @input << "\u2004" # THREE-PER-EM SPACE
        @input << "\u2005" # FOUR-PER-EM SPACE
        @input << "\u2006" # SIX-PER-EM SPACE
        @input << "\u2007" # FIGURE SPACE
        @input << "\u2008" # PUNCTUATION SPACE
        @input << "\u2009" # THIN SPACE
        @input << "\u200A" # HAIR SPACE
        @input << "\u200B" # ZERO WIDTH SPACE
        @input << "\u202F" # NARROW NO-BREAK SPACE
        @input << "\u205F" # MEDIUM MATHEMATICAL SPACE
        @input << "\u3000" # IDEOGRAPHIC SPACE
        @input << "\uFEFF" # ZERO WIDTH NO-BREAK SPACE
        @output = @input.join.clean
      end
      it "should output a valid UTF-8 string" do
        @output.encoding.name.should == "UTF-8"
        @output.should be_valid_encoding
      end
      it "should replace invisible chars by space" do
        @output.should == " "*@input.size
      end
    end
    describe "with euro sign from both ISO 8859-15 or Windows-1252" do
      before :all do
        @input = "\x80\xA4"
        @output = @input.clean
      end
      it "should output a valid UTF-8 string" do
        @output.encoding.name.should == "UTF-8"
        @output.should be_valid_encoding
      end
      it "should convert both bytes to the euro sign" do
        @output.should == "€€"
      end
    end
  else
    # specs for Ruby 1.8.6
    describe "with all 8-bit characters" do
      before :all do
        @input = ""
        (0..255).each{|i| @input << i.chr}
        @output = @input.clean
      end
      it "should wipe out the control characters" do
        @output.should == "          \n  \n                   !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ €                                ¡¢£€¥Š§š©ª«¬ ®¯°±²³Žµ¶·ž¹º»ŒœŸ¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
      end
    end
    describe "with invalid UTF-8 sequence" do
      before :all do
        @input = "\210\004"
        @output = @input.clean
      end
      it "should replace invisible chars by space" do
        @output.should == "  "
      end
    end
    describe "with mixed valid and invalid characters" do
      before :all do
        @input = "a?^?\xddf"
        @output = @input.clean
      end
      it "should keep the valid characters" do
        @output.should == "a?^?Ýf"
      end
    end
    describe "with already valid characters" do
      before :all do
        @input = "\n\t\r\r\n\v\n"
        @output = @input.clean
      end
      it "should replace invisible chars by space" do
        @output.should == "\n \n\n \n"
      end
    end
    describe "with watermarked text" do
      before :all do
        @input = "Here is \357\273\277a block \357\273\277of text \357\273\277inside of which a number will be hidden!"
        @output = @input.clean
      end
      it "should replace invisible chars by space" do
        @output.should == "Here is  a block  of text  inside of which a number will be hidden!"
      end
    end
    describe "with unusual valid spaces" do
      before :all do
        @input = []
        # "\uXXXX" doesn't exist yet on Ruby 1.8.6, so the UTF-8 bytes are spelled out
        @input << " "            # SPACE
        @input << "\xC2\xA0"     # NO-BREAK SPACE
        @input << "\xE2\x80\x80" # EN QUAD
        @input << "\xE2\x80\x81" # EM QUAD
        @input << "\xE2\x80\x82" # EN SPACE
        @input << "\xE2\x80\x83" # EM SPACE
        @input << "\xE2\x80\x84" # THREE-PER-EM SPACE
        @input << "\xE2\x80\x85" # FOUR-PER-EM SPACE
        @input << "\xE2\x80\x86" # SIX-PER-EM SPACE
        @input << "\xE2\x80\x87" # FIGURE SPACE
        @input << "\xE2\x80\x88" # PUNCTUATION SPACE
        @input << "\xE2\x80\x89" # THIN SPACE
        @input << "\xE2\x80\x8A" # HAIR SPACE
        @input << "\xE2\x80\x8B" # ZERO WIDTH SPACE
        @input << "\xE2\x80\xAF" # NARROW NO-BREAK SPACE
        @input << "\xE2\x81\x9F" # MEDIUM MATHEMATICAL SPACE
        @input << "\xE3\x80\x80" # IDEOGRAPHIC SPACE
        @input << "\xEF\xBB\xBF" # ZERO WIDTH NO-BREAK SPACE
        @output = @input.join.clean
      end
      it "should replace invisible chars by space" do
        @output.should == " "*@input.size
      end
    end
    describe "with euro sign from both ISO 8859-15 or Windows-1252" do
      before :all do
        @input = "\x80\xA4"
        @output = @input.clean
      end
      it "should convert both bytes to the euro sign" do
        @output.should == "€€"
      end
    end
  end
end

data/string_cleaner.gemspec ADDED Viewed

@@ -0,0 +1,46 @@
# -*- encoding: utf-8 -*-

# Gem specification for string_cleaner 0.1.0. Optional setters are guarded
# with respond_to? so the file loads on RubyGems versions that lack them.
Gem::Specification.new do |s|
  s.name = %q{string_cleaner}
  s.version = "0.1.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Joseph Halter"]
  s.date = %q{2009-08-26}
  s.email = %q{joseph@openhood.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".gitignore",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "lib/string_cleaner.rb",
    "spec/spec_helper.rb",
    "spec/string_cleaner_spec.rb",
    "string_cleaner.gemspec"
  ]
  # has_rdoc= is deprecated in newer RubyGems; only call it where it exists
  s.has_rdoc = true if s.respond_to? :has_rdoc=
  s.homepage = %q{http://github.com/JosephHalter/string_cleaner}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.1}
  s.summary = %q{Fix invalid UTF-8 and wipe invisible chars, fully compatible with Ruby 1.8 & 1.9 with extensive specs}
  s.test_files = [
    "spec/spec_helper.rb",
    "spec/string_cleaner_spec.rb"
  ]

  # The gem declares no dependencies, so the generated RubyGems-version
  # branching (which referenced Gem::RubyGemsVersion and had only empty
  # branches) is not needed.
  s.specification_version = 2 if s.respond_to? :specification_version
end

metadata ADDED Viewed

@@ -0,0 +1,77 @@
+--- !ruby/object:Gem::Specification
+name: string_cleaner
+version: !ruby/object:Gem::Version
+  hash: 27
+  prerelease: false
+  segments:
+  - 0
+  - 1
+  - 0
+  version: 0.1.0
+platform: ruby
+authors:
+- Joseph Halter
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2009-08-26 00:00:00 +02:00
+default_executable:
+dependencies: []
+description:
+email: joseph@openhood.com
+executables: []
+extensions: []
+extra_rdoc_files:
+- LICENSE
+- README.rdoc
+files:
+- .gitignore
+- LICENSE
+- README.rdoc
+- Rakefile
+- VERSION
+- lib/string_cleaner.rb
+- spec/spec_helper.rb
+- spec/string_cleaner_spec.rb
+- string_cleaner.gemspec
+has_rdoc: true
+homepage: http://github.com/JosephHalter/string_cleaner
+licenses: []
+post_install_message:
+rdoc_options:
+- --charset=UTF-8
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+requirements: []
+rubyforge_project:
+rubygems_version: 1.3.7
+signing_key:
+specification_version: 2
+summary: Fix invalid UTF-8 and wipe invisible chars, fully compatible with Ruby 1.8 & 1.9 with extensive specs
+test_files:
+- spec/spec_helper.rb
+- spec/string_cleaner_spec.rb