RubyGems - string_cleaner - Versions diffs - 0.2.1 → 0.2.2 - Mend

string_cleaner 0.2.1 → 0.2.2

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (5) hide show

data/Rakefile +6 -3
data/lib/string_cleaner.rb +26 -17
data/spec/string_cleaner_spec.rb +83 -155
data/string_cleaner.gemspec +1 -1
metadata +7 -2

data/Rakefile CHANGED Viewed

@@ -1,6 +1,9 @@
-require 'rake/tasklib'
-require 'rake/rdoctask'
-require 'rubygems'
+if RUBY_VERSION.to_f<1.9
+  require 'rake/tasklib'
+  require 'rake/rdoctask'
+  require 'rubygems'
+end
 begin
   require 'bundler/setup'
 rescue LoadError

data/lib/string_cleaner.rb CHANGED Viewed

@@ -9,13 +9,13 @@ module String::Cleaner
   def fix_encoding
     utf8 = dup
-    if utf8.respond_to?(:force_encoding)
+    if utf8.respond_to?(:force_encoding)
       utf8.force_encoding("UTF-8") # for Ruby 1.9+
       unless utf8.valid_encoding? # if invalid UTF-8
-        utf8 = utf8.force_encoding("ISO8859-15")
+        utf8 = utf8.force_encoding("ISO8859-1")
         utf8.encode!("UTF-8", :invalid => :replace, :undef => :replace, :replace => "")
       end
-      utf8.gsub!("\u0080", "€") # special case for euro sign from Windows-1252
+      utf8.gsub!(/\u0080|¤/, "€") # special case for euro sign from Windows-1252
       utf8
     else
       require "iconv"
@@ -34,31 +34,40 @@ module String::Cleaner
   end
   SPECIAL_SPACES = [
-            0x00A0,                # White_Space # Zs       NO-BREAK SPACE
-            0x1680,                # White_Space # Zs       OGHAM SPACE MARK
-            0x180E,                # White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
-            (0x2000..0x200A).to_a, # White_Space # Zs  [11] EN QUAD..HAIR SPACE
-            0x2028,                # White_Space # Zl       LINE SEPARATOR
-            0x2029,                # White_Space # Zp       PARAGRAPH SEPARATOR
-            0x202F,                # White_Space # Zs       NARROW NO-BREAK SPACE
-            0x205F,                # White_Space # Zs       MEDIUM MATHEMATICAL SPACE
-            0x3000,                # White_Space # Zs       IDEOGRAPHIC SPACE
-          ].flatten.collect{|e| [e].pack 'U*'}
+    0x00A0,                # NO-BREAK SPACE
+    0x1680,                # OGHAM SPACE MARK
+    0x180E,                # MONGOLIAN VOWEL SEPARATOR
+    (0x2000..0x200A).to_a, # EN QUAD..HAIR SPACE
+    0x2028,                # LINE SEPARATOR
+    0x2029,                # PARAGRAPH SEPARATOR
+    0x202F,                # NARROW NO-BREAK SPACE
+    0x205F,                # MEDIUM MATHEMATICAL SPACE
+    0x3000,                # IDEOGRAPHIC SPACE
+  ].flatten.collect{|e| [e].pack 'U*'}
+  ZERO_WIDTH = [
+    0x200B,                # ZERO WIDTH SPACE
+    0x200C,                # ZERO WIDTH NON-JOINER
+    0x200D,                # ZERO WIDTH JOINER
+    0x2060,                # WORD JOINER
+    0xFEFF,                # ZERO WIDTH NO-BREAK SPACE
+  ].flatten.collect{|e| [e].pack 'U*'}
   def fix_invisible_chars
     utf8 = self.dup
-    if utf8.respond_to?(:force_encoding)
+    utf8.gsub!(Regexp.new(ZERO_WIDTH.join("|")), "")
+    utf8 = if utf8.respond_to?(:force_encoding)
       utf8 = (utf8 << " ").split(/\n/u).each{|line|
         line.gsub!(/[\s\p{C}]/u, " ")
       }.join("\n").chop!
-      utf8.gsub!(Regexp.new(SPECIAL_SPACES.join("|")), " ")
-      utf8.force_encoding("UTF-8")
     else
       require "oniguruma"
       utf8.split(/\n/n).collect{|line|
-        Oniguruma::ORegexp.new("[\\s\\p{C}]", {:encoding => Oniguruma::ENCODING_UTF8}).gsub(line, " ")
+        Oniguruma::ORegexp.new("[\\p{C}]", {:encoding => Oniguruma::ENCODING_UTF8}).gsub(line, " ")
       }.join("\n").chop!
     end
+    utf8.gsub!(Regexp.new(SPECIAL_SPACES.join("|") + "|\s"), " ")
+    utf8
   end
   def trim(chars = "")

data/spec/string_cleaner_spec.rb CHANGED Viewed

@@ -2,146 +2,72 @@
 require File.dirname(__FILE__) + "/spec_helper"
 describe String::Cleaner do
-  if "".respond_to?(:force_encoding)
-    # specs for Ruby 1.9+
+  describe "#clean" do
     describe "with all 8-bit characters" do
       before :all do
-        @input = "".force_encoding("ISO8859-15")
+        @input = ""
+        @input.force_encoding("ISO8859-15") if @input.respond_to?(:force_encoding)
         (0..255).each{|i| @input << i.chr}
-        @input.force_encoding("UTF-8")
+        @input.force_encoding("UTF-8") if @input.respond_to?(:force_encoding)
         @output = @input.clean
       end
-      it "should output a valid UTF-8 string" do
-        @output.encoding.name.should == "UTF-8"
-        @output.should be_valid_encoding
+      if RUBY_VERSION.to_f>1.9
+        it "should output a valid UTF-8 string" do
+          @output.encoding.name.should == "UTF-8"
+          @output.should be_valid_encoding
+        end
       end
       it "should wipe out the control characters" do
-        @output.should == "          \n  \n                   !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ €                                ¡¢£€¥Š§š©ª«¬ ®¯°±²³Žµ¶·ž¹º»ŒœŸ¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
-      end
-    end
-    it "should convert all type of spaces to normal spaces" do
-      input = [
-                (0x0009..0x000D).to_a, # White_Space # Cc   [5] <control-0009>..<control-000D>
-                0x0020,                # White_Space # Zs       SPACE
-                0x0085,                # White_Space # Cc       <control-0085>
-                0x00A0,                # White_Space # Zs       NO-BREAK SPACE
-                0x1680,                # White_Space # Zs       OGHAM SPACE MARK
-                0x180E,                # White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
-                (0x2000..0x200A).to_a, # White_Space # Zs  [11] EN QUAD..HAIR SPACE
-                0x2028,                # White_Space # Zl       LINE SEPARATOR
-                0x2029,                # White_Space # Zp       PARAGRAPH SEPARATOR
-                0x202F,                # White_Space # Zs       NARROW NO-BREAK SPACE
-                0x205F,                # White_Space # Zs       MEDIUM MATHEMATICAL SPACE
-                0x3000,                # White_Space # Zs       IDEOGRAPHIC SPACE
-              ].flatten.collect{ |e| [e].pack 'U*' }
-      input.join.clean.should == " \n  \n                     "
-    end
-    describe "with invalid UTF-8 sequence" do
-      before :all do
-        @input = "\210\004"
-        @output = @input.clean
-      end
-      it "should output a valid UTF-8 string" do
-        @output.encoding.name.should == "UTF-8"
-        @output.should be_valid_encoding
-      end
-      it "should replace invisible chars by space" do
-        @output.should == "  "
-      end
-    end
-    describe "with mixed valid and invalid characters" do
-      before :all do
-        @input = "a?^?\xddf"
-        @output = @input.clean
-      end
-      it "should output a valid UTF-8 string" do
-        @output.encoding.name.should == "UTF-8"
-        @output.should be_valid_encoding
-      end
-      it "should keep the valid characters" do
-        @output.should == "a?^?Ýf"
-      end
-    end
-    describe "with already valid characters" do
-      before :all do
-        @input = "\n\t\r\r\n\v\n"
-        @output = @input.clean
-      end
-      it "should output a valid UTF-8 string" do
-        @output.encoding.name.should == "UTF-8"
-        @output.should be_valid_encoding
-      end
-      it "should replace invisible chars by space" do
-        @output.should == "\n \n\n \n"
-      end
-    end
-    describe "with watermarked text" do
-      before :all do
-        @input = "Here is \357\273\277a block \357\273\277of text \357\273\277inside of which a number will be hidden!"
-        @output = @input.clean
-      end
-      it "should output a valid UTF-8 string" do
-        @output.encoding.name.should == "UTF-8"
-        @output.should be_valid_encoding
-      end
-      it "should replace invisible chars by space" do
-        @output.should == "Here is  a block  of text  inside of which a number will be hidden!"
+        @output.should == "          \n  \n                   !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ €                                ¡¢£€¥¦§¨©ª«¬ ®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
       end
     end
-    describe "with unusual valid spaces" do
-      before :all do
-        @input = []
-        @input << "\u0020" # SPACE
-        @input << "\u00A0" # NO-BREAK SPACE
-        @input << "\u2000" # EN QUAD
-        @input << "\u2001" # EM QUAD
-        @input << "\u2002" # EN SPACE
-        @input << "\u2003" # EM SPACE
-        @input << "\u2004" # THREE-PER-EM SPACE
-        @input << "\u2005" # FOUR-PER-EM SPACE
-        @input << "\u2006" # SIX-PER-EM SPACE
-        @input << "\u2007" # FIGURE SPACE
-        @input << "\u2008" # PUNCTUATION SPACE
-        @input << "\u2009" # THIN SPACE
-        @input << "\u200A" # HAIR SPACE
-        @input << "\u200B" # ZERO WIDTH SPACE
-        @input << "\u202F" # NARROW NO-BREAK SPACE
-        @input << "\u205F" # MEDIUM MATHEMATICAL SPACE
-        @input << "\u3000" # IDEOGRAPHIC SPACE
-        @input << "\uFEFF" # ZERO WIDTH NO-BREAK SPACE
+    describe "with various type of spaces" do
+      before do
+        @input = [
+          (0x0009..0x000D).to_a, # <control-0009>..<control-000D>
+          0x0020,                # SPACE
+          0x0085,                # <control-0085>
+          0x00A0,                # NO-BREAK SPACE
+          0x1680,                # OGHAM SPACE MARK
+          0x180E,                # MONGOLIAN VOWEL SEPARATOR
+          (0x2000..0x200A).to_a, # EN QUAD..HAIR SPACE
+          0x2028,                # LINE SEPARATOR
+          0x2029,                # PARAGRAPH SEPARATOR
+          0x202F,                # NARROW NO-BREAK SPACE
+          0x205F,                # MEDIUM MATHEMATICAL SPACE
+          0x3000,                # IDEOGRAPHIC SPACE
+        ].flatten.collect{ |e| [e].pack 'U*' }
         @output = @input.join.clean
       end
-      it "should output a valid UTF-8 string" do
-        @output.encoding.name.should == "UTF-8"
-        @output.should be_valid_encoding
-      end
-      it "should replace invisible chars by space" do
-        @output.should == " "*@input.size
-      end
-    end
-    describe "with euro sign from both ISO 8859-15 or Windows-1252" do
-      before :all do
-        @input = "\x80\xA4"
-        @output = @input.clean
-      end
-      it "should output a valid UTF-8 string" do
-        @output.encoding.name.should == "UTF-8"
-        @output.should be_valid_encoding
-      end
-      it "should replace invisible chars by space" do
-        @output.should == "€€"
+      if RUBY_VERSION.to_f>1.9
+        it "should output a valid UTF-8 string" do
+          @output.encoding.name.should == "UTF-8"
+          @output.should be_valid_encoding
+        end
+      end
+      it "should replace all spaces to normal spaces" do
+        @output.clean.should == " \n  \n                     "
+      end
+    end
+    describe "with various no-width characters" do
+      before do
+        @input = [
+          0x200B,                # ZERO WIDTH SPACE
+          0x200C,                # ZERO WIDTH NON-JOINER
+          0x200D,                # ZERO WIDTH JOINER
+          0x2060,                # WORD JOINER
+          0xFEFF,                # ZERO WIDTH NO-BREAK SPACE
+        ].flatten.collect{ |e| [e].pack 'U*' }
+        @output = @input.join.clean
       end
-    end
-  else
-    # specs for Ruby 1.8.6
-    describe "with all 8-bit characters" do
-      before :all do
-        @input = ""
-        (0..255).each{|i| @input << i.chr}
-        @output = @input.clean
+      if RUBY_VERSION.to_f>1.9
+        it "should output a valid UTF-8 string" do
+          @output.encoding.name.should == "UTF-8"
+          @output.should be_valid_encoding
+        end
       end
-      it "should wipe out the control characters" do
-        @output.should == "          \n  \n                   !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ €                                ¡¢£€¥¦§¨©ª«¬ ®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
+      it "should remove no-width characters" do
+        @output.should == ""
       end
     end
     describe "with invalid UTF-8 sequence" do
@@ -149,6 +75,12 @@ describe String::Cleaner do
         @input = "\210\004"
         @output = @input.clean
       end
+      if RUBY_VERSION.to_f>1.9
+        it "should output a valid UTF-8 string" do
+          @output.encoding.name.should == "UTF-8"
+          @output.should be_valid_encoding
+        end
+      end
       it "should replace invisible chars by space" do
         @output.should == "  "
       end
@@ -158,6 +90,12 @@ describe String::Cleaner do
         @input = "a?^?\xddf"
         @output = @input.clean
       end
+      if RUBY_VERSION.to_f>1.9
+        it "should output a valid UTF-8 string" do
+          @output.encoding.name.should == "UTF-8"
+          @output.should be_valid_encoding
+        end
+      end
       it "should keep the valid characters" do
         @output.should == "a?^?Ýf"
       end
@@ -167,6 +105,12 @@ describe String::Cleaner do
         @input = "\n\t\r\r\n\v\n"
         @output = @input.clean
       end
+      if RUBY_VERSION.to_f>1.9
+        it "should output a valid UTF-8 string" do
+          @output.encoding.name.should == "UTF-8"
+          @output.should be_valid_encoding
+        end
+      end
       it "should replace invisible chars by space" do
         @output.should == "\n \n\n \n"
       end
@@ -176,36 +120,14 @@ describe String::Cleaner do
         @input = "Here is \357\273\277a block \357\273\277of text \357\273\277inside of which a number will be hidden!"
         @output = @input.clean
       end
-      it "should replace invisible chars by space" do
-        @output.should == "Here is  a block  of text  inside of which a number will be hidden!"
-      end
-    end
-    describe "with unusual valid spaces" do
-      before :all do
-        @input = []
-        # "\uXXXX" doesn't exists yet on Ruby 1.8.6
-        @input << " "            # SPACE
-        @input << "\xC2\xA0"     # NO-BREAK SPACE
-        @input << "\xE2\x80\x80" # EN QUAD
-        @input << "\xE2\x80\x81" # EM QUAD
-        @input << "\xE2\x80\x82" # EN SPACE
-        @input << "\xE2\x80\x83" # EM SPACE
-        @input << "\xE2\x80\x84" # THREE-PER-EM SPACE
-        @input << "\xE2\x80\x85" # FOUR-PER-EM SPACE
-        @input << "\xE2\x80\x86" # SIX-PER-EM SPACE
-        @input << "\xE2\x80\x87" # FIGURE SPACE
-        @input << "\xE2\x80\x88" # PUNCTUATION SPACE
-        @input << "\xE2\x80\x89" # THIN SPACE
-        @input << "\xE2\x80\x8A" # HAIR SPACE
-        @input << "\xE2\x80\x8B" # ZERO WIDTH SPACE
-        @input << "\xE2\x80\xAF" # NARROW NO-BREAK SPACE
-        @input << "\xE2\x81\x9F" # MEDIUM MATHEMATICAL SPACE
-        @input << "\xE3\x80\x80" # IDEOGRAPHIC SPACE
-        @input << "\xEF\xBB\xBF" # ZERO WIDTH NO-BREAK SPACE
-        @output = @input.join.clean
+      if RUBY_VERSION.to_f>1.9
+        it "should output a valid UTF-8 string" do
+          @output.encoding.name.should == "UTF-8"
+          @output.should be_valid_encoding
+        end
       end
       it "should replace invisible chars by space" do
-        @output.should == " "*@input.size
+        @output.should == "Here is a block of text inside of which a number will be hidden!"
       end
     end
     describe "with euro sign from both ISO 8859-15 or Windows-1252" do
@@ -213,6 +135,12 @@ describe String::Cleaner do
         @input = "\x80\xA4"
         @output = @input.clean
       end
+      if RUBY_VERSION.to_f>1.9
+        it "should output a valid UTF-8 string" do
+          @output.encoding.name.should == "UTF-8"
+          @output.should be_valid_encoding
+        end
+      end
       it "should replace invisible chars by space" do
         @output.should == "€€"
       end

data/string_cleaner.gemspec CHANGED Viewed

@@ -2,7 +2,7 @@
 Gem::Specification.new do |s|
   s.name = %q{string_cleaner}
-  s.version = "0.2.1"
+  s.version = "0.2.2"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Joseph Halter"]

metadata CHANGED Viewed

@@ -1,12 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: string_cleaner
 version: !ruby/object:Gem::Version
+  hash: 19
   prerelease: false
   segments:
   - 0
   - 2
-  - 1
-  version: 0.2.1
+  - 2
+  version: 0.2.2
 platform: ruby
 authors:
 - Joseph Halter
@@ -25,6 +26,7 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
+        hash: 3
         segments:
         - 0
         version: "0"
@@ -38,6 +40,7 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
+        hash: 3
         segments:
         - 0
         version: "0"
@@ -75,6 +78,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      hash: 3
       segments:
       - 0
       version: "0"
@@ -83,6 +87,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      hash: 3
       segments:
       - 0
       version: "0"