RubyGems - string_cleaner - Versions diffs - 0.1.0 - Mend

string_cleaner 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

data/.gitignore +5 -0
data/LICENSE +20 -0
data/README.rdoc +27 -0
data/Rakefile +46 -0
data/VERSION +1 -0
data/lib/string_cleaner.rb +59 -0
data/spec/spec_helper.rb +3 -0
data/spec/string_cleaner_spec.rb +204 -0
data/string_cleaner.gemspec +46 -0
metadata +77 -0

data/.gitignore ADDED Viewed

@@ -0,0 +1,5 @@
+*.sw?
+.DS_Store
+coverage
+rdoc
+pkg

data/LICENSE ADDED Viewed

@@ -0,0 +1,20 @@
+Copyright (c) 2009 Joseph Halter
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.rdoc ADDED Viewed

@@ -0,0 +1,27 @@
+= string_cleaner
+Adds a .clean method to String which does the following:
+* treat strings that are not valid UTF-8 as ISO-8859-15 and convert them to UTF-8
+* recognize the euro sign from both ISO 8859-15 and Windows-1252
+* replace \r\n and \r with \n to normalize line endings
+* replace control characters and other invisible chars with spaces
+== Install
+  sudo gem install JosephHalter-string_cleaner
+== Ruby 1.9+
+Ruby 1.9+ has native support for Unicode and specs are 100% passing.
+== Ruby 1.8.x
+Because Ruby 1.8.x has no native support for Unicode, you must install oniguruma and the jasherai-oniguruma gem.
+== Example usage
+  "\210\004".clean # => "  "
+== Copyright
+Copyright (c) 2009 Joseph Halter. See LICENSE for details.

data/Rakefile ADDED Viewed

@@ -0,0 +1,46 @@
+require "rubygems"
+require "rake"
+begin
+  require "jeweler"
+  Jeweler::Tasks.new do |gem|
+    gem.name = "string_cleaner"
+    gem.summary = %Q{TODO}
+    gem.email = "joseph@openhood.com"
+    gem.homepage = "http://github.com/JosephHalter/string_cleaner"
+    gem.authors = ["Joseph Halter"]
+    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
+  end
+rescue LoadError
+  puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
+end
+require "spec/rake/spectask"
+Spec::Rake::SpecTask.new(:spec) do |spec|
+  spec.libs << "lib" << "spec"
+  spec.spec_files = FileList["spec/**/*_spec.rb"]
+end
+Spec::Rake::SpecTask.new(:rcov) do |spec|
+  spec.libs << "lib" << "spec"
+  spec.pattern = "spec/**/*_spec.rb"
+  spec.rcov = true
+end
+task :default => :spec
+require "rake/rdoctask"
+Rake::RDocTask.new do |rdoc|
+  if File.exist?("VERSION.yml")
+    config = YAML.load(File.read("VERSION.yml"))
+    version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
+  else
+    version = ""
+  end
+  rdoc.rdoc_dir = "rdoc"
+  rdoc.title = "test #{version}"
+  rdoc.rdoc_files.include("README*")
+  rdoc.rdoc_files.include("lib/**/*.rb")
+end

data/VERSION ADDED Viewed

@@ -0,0 +1 @@
+0.1.0

data/lib/string_cleaner.rb ADDED Viewed

@@ -0,0 +1,59 @@
+# encoding: UTF-8
+module String::Cleaner
+  # convert invalid UTF-8 strings from ISO-8859-15 to UTF-8 to fix them
+  # recognize euro char from both ISO 8859-15 and Windows-1252
+  # replace \r\n and \r with \n normalizing end of lines
+  # replace control characters and other invisible chars by spaces
+  def clean
+    utf8 = self.dup
+    if utf8.respond_to?(:force_encoding)
+      # for Ruby 1.9+
+      utf8.force_encoding("UTF-8")
+      unless utf8.valid_encoding? # if invalid UTF-8
+        utf8 = utf8.force_encoding("ISO8859-15")
+        utf8.encode!("UTF-8", :invalid => :replace, :undef => :replace, :replace => "")
+        utf8.gsub!("\xC2\x80", "€") # special case for euro sign from Windows-1252
+        utf8.force_encoding("UTF-8")
+      end
+      # normalize end of lines
+      utf8.gsub!(/\r\n/u, "\n")
+      utf8.gsub!(/\r/u, "\n")
+      # normalize invisible chars
+      utf8 = (utf8 << " ").split(/\n/u).each{|line|
+        line.gsub!(/[\s\p{C}]/u, " ")
+      }.join("\n").chop!
+      utf8.force_encoding("UTF-8")
+    else
+      # for Ruby 1.8.6, use iconv
+      require "iconv"
+      utf8 << " "
+      utf8 = begin
+        Iconv.new("UTF-8", "UTF-8").iconv(utf8)
+      rescue
+        utf8.gsub!(/\x80/n, "\xA4")
+        utf8 = Iconv.new("UTF-8//IGNORE", "ISO8859-15").iconv(utf8)
+      end
+      # normalize end of lines
+      utf8.gsub!(/\r\n/n, "\n")
+      utf8.gsub!(/\r/n, "\n")
+      # normalize invisible chars using oniguruma
+      require "oniguruma"
+      utf8 = utf8.split(/\n/n).collect{|line|
+        Oniguruma::ORegexp.new("[\\s\\p{C}]", {:encoding => Oniguruma::ENCODING_UTF8}).gsub(line, " ")
+      }.join("\n").chop!
+    end
+    utf8
+    replace(utf8)
+  end
+end
+class String
+  include String::Cleaner
+end

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,3 @@
+require "spec"
+require "rubygems"
+require File.join(File.dirname(__FILE__), "..", "lib", "string_cleaner")

data/spec/string_cleaner_spec.rb ADDED Viewed

@@ -0,0 +1,204 @@
+# encoding: UTF-8
+require File.dirname(__FILE__) + "/spec_helper"
+describe String::Cleaner do
+  if "".respond_to?(:force_encoding)
+    # specs for Ruby 1.9+
+    describe "with all 8-bit characters" do
+      before :all do
+        @input = "".force_encoding("ISO8859-15")
+        (0..255).each{|i| @input << i.chr}
+        @input.force_encoding("UTF-8")
+        @output = @input.clean
+      end
+      it "should output a valid UTF-8 string" do
+        @output.encoding.name.should == "UTF-8"
+        @output.should be_valid_encoding
+      end
+      it "should wipe out the control characters" do
+        @output.should == "          \n  \n                   !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ €                                ¡¢£€¥Š§š©ª«¬ ®¯°±²³Žµ¶·ž¹º»ŒœŸ¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
+      end
+    end
+    describe "with invalid UTF-8 sequence" do
+      before :all do
+        @input = "\210\004"
+        @output = @input.clean
+      end
+      it "should output a valid UTF-8 string" do
+        @output.encoding.name.should == "UTF-8"
+        @output.should be_valid_encoding
+      end
+      it "should replace invisible chars by space" do
+        @output.should == "  "
+      end
+    end
+    describe "with mixed valid and invalid characters" do
+      before :all do
+        @input = "a?^?\xddf"
+        @output = @input.clean
+      end
+      it "should output a valid UTF-8 string" do
+        @output.encoding.name.should == "UTF-8"
+        @output.should be_valid_encoding
+      end
+      it "should keep the valid characters" do
+        @output.should == "a?^?Ýf"
+      end
+    end
+    describe "with already valid characters" do
+      before :all do
+        @input = "\n\t\r\r\n\v\n"
+        @output = @input.clean
+      end
+      it "should output a valid UTF-8 string" do
+        @output.encoding.name.should == "UTF-8"
+        @output.should be_valid_encoding
+      end
+      it "should replace invisible chars by space" do
+        @output.should == "\n \n\n \n"
+      end
+    end
+    describe "with watermarked text" do
+      before :all do
+        @input = "Here is \357\273\277a block \357\273\277of text \357\273\277inside of which a number will be hidden!"
+        @output = @input.clean
+      end
+      it "should output a valid UTF-8 string" do
+        @output.encoding.name.should == "UTF-8"
+        @output.should be_valid_encoding
+      end
+      it "should replace invisible chars by space" do
+        @output.should == "Here is  a block  of text  inside of which a number will be hidden!"
+      end
+    end
+    describe "with unusual valid spaces" do
+      before :all do
+        @input = []
+        @input << "\u0020" # SPACE
+        @input << "\u00A0" # NO-BREAK SPACE
+        @input << "\u2000" # EN QUAD
+        @input << "\u2001" # EM QUAD
+        @input << "\u2002" # EN SPACE
+        @input << "\u2003" # EM SPACE
+        @input << "\u2004" # THREE-PER-EM SPACE
+        @input << "\u2005" # FOUR-PER-EM SPACE
+        @input << "\u2006" # SIX-PER-EM SPACE
+        @input << "\u2007" # FIGURE SPACE
+        @input << "\u2008" # PUNCTUATION SPACE
+        @input << "\u2009" # THIN SPACE
+        @input << "\u200A" # HAIR SPACE
+        @input << "\u200B" # ZERO WIDTH SPACE
+        @input << "\u202F" # NARROW NO-BREAK SPACE
+        @input << "\u205F" # MEDIUM MATHEMATICAL SPACE
+        @input << "\u3000" # IDEOGRAPHIC SPACE
+        @input << "\uFEFF" # ZERO WIDTH NO-BREAK SPACE
+        @output = @input.join.clean
+      end
+      it "should output a valid UTF-8 string" do
+        @output.encoding.name.should == "UTF-8"
+        @output.should be_valid_encoding
+      end
+      it "should replace invisible chars by space" do
+        @output.should == " "*@input.size
+      end
+    end
+    describe "with euro sign from both ISO 8859-15 or Windows-1252" do
+      before :all do
+        @input = "\x80\xA4"
+        @output = @input.clean
+      end
+      it "should output a valid UTF-8 string" do
+        @output.encoding.name.should == "UTF-8"
+        @output.should be_valid_encoding
+      end
+      it "should replace invisible chars by space" do
+        @output.should == "€€"
+      end
+    end
+  else
+    # specs for Ruby 1.8.6
+    describe "with all 8-bit characters" do
+      before :all do
+        @input = ""
+        (0..255).each{|i| @input << i.chr}
+        @output = @input.clean
+      end
+      it "should wipe out the control characters" do
+        @output.should == "          \n  \n                   !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ €                                ¡¢£€¥Š§š©ª«¬ ®¯°±²³Žµ¶·ž¹º»ŒœŸ¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
+      end
+    end
+    describe "with invalid UTF-8 sequence" do
+      before :all do
+        @input = "\210\004"
+        @output = @input.clean
+      end
+      it "should replace invisible chars by space" do
+        @output.should == "  "
+      end
+    end
+    describe "with mixed valid and invalid characters" do
+      before :all do
+        @input = "a?^?\xddf"
+        @output = @input.clean
+      end
+      it "should keep the valid characters" do
+        @output.should == "a?^?Ýf"
+      end
+    end
+    describe "with already valid characters" do
+      before :all do
+        @input = "\n\t\r\r\n\v\n"
+        @output = @input.clean
+      end
+      it "should replace invisible chars by space" do
+        @output.should == "\n \n\n \n"
+      end
+    end
+    describe "with watermarked text" do
+      before :all do
+        @input = "Here is \357\273\277a block \357\273\277of text \357\273\277inside of which a number will be hidden!"
+        @output = @input.clean
+      end
+      it "should replace invisible chars by space" do
+        @output.should == "Here is  a block  of text  inside of which a number will be hidden!"
+      end
+    end
+    describe "with unusual valid spaces" do
+      before :all do
+        @input = []
+        # "\uXXXX" doesn't exist yet on Ruby 1.8.6
+        @input << " "            # SPACE
+        @input << "\xC2\xA0"     # NO-BREAK SPACE
+        @input << "\xE2\x80\x80" # EN QUAD
+        @input << "\xE2\x80\x81" # EM QUAD
+        @input << "\xE2\x80\x82" # EN SPACE
+        @input << "\xE2\x80\x83" # EM SPACE
+        @input << "\xE2\x80\x84" # THREE-PER-EM SPACE
+        @input << "\xE2\x80\x85" # FOUR-PER-EM SPACE
+        @input << "\xE2\x80\x86" # SIX-PER-EM SPACE
+        @input << "\xE2\x80\x87" # FIGURE SPACE
+        @input << "\xE2\x80\x88" # PUNCTUATION SPACE
+        @input << "\xE2\x80\x89" # THIN SPACE
+        @input << "\xE2\x80\x8A" # HAIR SPACE
+        @input << "\xE2\x80\x8B" # ZERO WIDTH SPACE
+        @input << "\xE2\x80\xAF" # NARROW NO-BREAK SPACE
+        @input << "\xE2\x81\x9F" # MEDIUM MATHEMATICAL SPACE
+        @input << "\xE3\x80\x80" # IDEOGRAPHIC SPACE
+        @input << "\xEF\xBB\xBF" # ZERO WIDTH NO-BREAK SPACE
+        @output = @input.join.clean
+      end
+      it "should replace invisible chars by space" do
+        @output.should == " "*@input.size
+      end
+    end
+    describe "with euro sign from both ISO 8859-15 or Windows-1252" do
+      before :all do
+        @input = "\x80\xA4"
+        @output = @input.clean
+      end
+      it "should replace invisible chars by space" do
+        @output.should == "€€"
+      end
+    end
+  end
+end

data/string_cleaner.gemspec ADDED Viewed

@@ -0,0 +1,46 @@
+# -*- encoding: utf-8 -*-
+Gem::Specification.new do |s|
+  s.name = %q{string_cleaner}
+  s.version = "0.1.0"
+  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+  s.authors = ["Joseph Halter"]
+  s.date = %q{2009-08-26}
+  s.email = %q{joseph@openhood.com}
+  s.extra_rdoc_files = [
+    "LICENSE",
+     "README.rdoc"
+  ]
+  s.files = [
+    ".gitignore",
+     "LICENSE",
+     "README.rdoc",
+     "Rakefile",
+     "VERSION",
+     "lib/string_cleaner.rb",
+     "spec/spec_helper.rb",
+     "spec/string_cleaner_spec.rb",
+     "string_cleaner.gemspec"
+  ]
+  s.has_rdoc = true
+  s.homepage = %q{http://github.com/JosephHalter/string_cleaner}
+  s.rdoc_options = ["--charset=UTF-8"]
+  s.require_paths = ["lib"]
+  s.rubygems_version = %q{1.3.1}
+  s.summary = %q{Fix invalid UTF-8 and wipe invisible chars, fully compatible with Ruby 1.8 & 1.9 with extensive specs}
+  s.test_files = [
+    "spec/spec_helper.rb",
+     "spec/string_cleaner_spec.rb"
+  ]
+  if s.respond_to? :specification_version then
+    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+    s.specification_version = 2
+    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+    else
+    end
+  else
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,77 @@
+--- !ruby/object:Gem::Specification
+name: string_cleaner
+version: !ruby/object:Gem::Version
+  hash: 27
+  prerelease: false
+  segments:
+  - 0
+  - 1
+  - 0
+  version: 0.1.0
+platform: ruby
+authors:
+- Joseph Halter
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2009-08-26 00:00:00 +02:00
+default_executable:
+dependencies: []
+description:
+email: joseph@openhood.com
+executables: []
+extensions: []
+extra_rdoc_files:
+- LICENSE
+- README.rdoc
+files:
+- .gitignore
+- LICENSE
+- README.rdoc
+- Rakefile
+- VERSION
+- lib/string_cleaner.rb
+- spec/spec_helper.rb
+- spec/string_cleaner_spec.rb
+- string_cleaner.gemspec
+has_rdoc: true
+homepage: http://github.com/JosephHalter/string_cleaner
+licenses: []
+post_install_message:
+rdoc_options:
+- --charset=UTF-8
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+requirements: []
+rubyforge_project:
+rubygems_version: 1.3.7
+signing_key:
+specification_version: 2
+summary: Fix invalid UTF-8 and wipe invisible chars, fully compatible with Ruby 1.8 & 1.9 with extensive specs
+test_files:
+- spec/spec_helper.rb
+- spec/string_cleaner_spec.rb