RubyGems - string_cleaner - Versions diffs - 0.1.0 → 0.2.0 - Mend

string_cleaner 0.1.0 → 0.2.0

Files changed (8) hide show

data/.gitignore CHANGED Viewed

@@ -1,5 +1,6 @@
-*.sw?
+.bundle
 .DS_Store
 coverage
 rdoc
 pkg
+*.gem

data/Rakefile CHANGED Viewed

@@ -1,46 +1,26 @@
-require "rubygems"
-require "rake"
+require 'rubygems'
 begin
-  require "jeweler"
-  Jeweler::Tasks.new do |gem|
-    gem.name = "string_cleaner"
-    gem.summary = %Q{TODO}
-    gem.email = "joseph@openhood.com"
-    gem.homepage = "http://github.com/JosephHalter/string_cleaner"
-    gem.authors = ["Joseph Halter"]
-    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
-  end
+  require 'bundler/setup'
 rescue LoadError
-  puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
+  puts 'You must `gem install bundler` and `bundle install` to run rake tasks'
 end
-require "spec/rake/spectask"
-Spec::Rake::SpecTask.new(:spec) do |spec|
-  spec.libs << "lib" << "spec"
-  spec.spec_files = FileList["spec/**/*_spec.rb"]
+require 'rspec/core/rake_task'
+RSpec::Core::RakeTask.new(:spec) do |spec|
+  spec.pattern = 'spec/**/*_spec.rb'
 end
-Spec::Rake::SpecTask.new(:rcov) do |spec|
-  spec.libs << "lib" << "spec"
-  spec.pattern = "spec/**/*_spec.rb"
+RSpec::Core::RakeTask.new(:rcov) do |spec|
+  spec.pattern = 'spec/**/*_spec.rb'
   spec.rcov = true
 end
 task :default => :spec
-require "rake/rdoctask"
+require 'rake/rdoctask'
 Rake::RDocTask.new do |rdoc|
-  if File.exist?("VERSION.yml")
-    config = YAML.load(File.read("VERSION.yml"))
-    version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
-  else
-    version = ""
-  end
-  rdoc.rdoc_dir = "rdoc"
-  rdoc.title = "test #{version}"
-  rdoc.rdoc_files.include("README*")
-  rdoc.rdoc_files.include("lib/**/*.rb")
-end
+  rdoc.rdoc_dir = 'rdoc'
+  rdoc.title = "string_cleaner"
+  rdoc.rdoc_files.include('README*')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+end

data/lib/string_cleaner.rb CHANGED Viewed

@@ -1,56 +1,170 @@
 # encoding: UTF-8
+require "unidecoder"
 module String::Cleaner
-  # convert invalid UTF-8 strings from ISO-8859-15 to UTF-8 to fix them
-  # recognize euro char from both ISO 8859-15 and Windows-1252
-  # replace \r\n and \r with \n normalizing end of lines
-  # replace control characters and other invisible chars by spaces
   def clean
-    utf8 = self.dup
-    if utf8.respond_to?(:force_encoding)
+    fix_encoding.fix_endlines.fix_invisible_chars
+  end
-      # for Ruby 1.9+
-      utf8.force_encoding("UTF-8")
+  def fix_encoding
+    utf8 = dup
+    if utf8.respond_to?(:force_encoding)
+      utf8.force_encoding("UTF-8") # for Ruby 1.9+
       unless utf8.valid_encoding? # if invalid UTF-8
         utf8 = utf8.force_encoding("ISO8859-15")
         utf8.encode!("UTF-8", :invalid => :replace, :undef => :replace, :replace => "")
         utf8.gsub!("\xC2\x80", "€") # special case for euro sign from Windows-1252
         utf8.force_encoding("UTF-8")
       end
-      # normalize end of lines
-      utf8.gsub!(/\r\n/u, "\n")
-      utf8.gsub!(/\r/u, "\n")
-      # normalize invisible chars
-      utf8 = (utf8 << " ").split(/\n/u).each{|line|
-        line.gsub!(/[\s\p{C}]/u, " ")
-      }.join("\n").chop!
-      utf8.force_encoding("UTF-8")
+      utf8
     else
-      # for Ruby 1.8.6, use iconv
       require "iconv"
       utf8 << " "
-      utf8 = begin
+      begin
         Iconv.new("UTF-8", "UTF-8").iconv(utf8)
       rescue
         utf8.gsub!(/\x80/n, "\xA4")
-        utf8 = Iconv.new("UTF-8//IGNORE", "ISO8859-15").iconv(utf8)
+        Iconv.new("UTF-8//IGNORE", "ISO8859-15").iconv(utf8)
       end
+    end
+  end
-      # normalize end of lines
-      utf8.gsub!(/\r\n/n, "\n")
-      utf8.gsub!(/\r/n, "\n")
-      # normalize invisible chars using oniguruma
+  def fix_endlines
+    gsub(/(?:\r\n|\r)/u, "\n")
+  end
+  SPECIAL_SPACES = [
+            0x00A0,                # White_Space # Zs       NO-BREAK SPACE
+            0x1680,                # White_Space # Zs       OGHAM SPACE MARK
+            0x180E,                # White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
+            (0x2000..0x200A).to_a, # White_Space # Zs  [11] EN QUAD..HAIR SPACE
+            0x2028,                # White_Space # Zl       LINE SEPARATOR
+            0x2029,                # White_Space # Zp       PARAGRAPH SEPARATOR
+            0x202F,                # White_Space # Zs       NARROW NO-BREAK SPACE
+            0x205F,                # White_Space # Zs       MEDIUM MATHEMATICAL SPACE
+            0x3000,                # White_Space # Zs       IDEOGRAPHIC SPACE
+          ].flatten.collect{|e| [e].pack 'U*'}
+  def fix_invisible_chars
+    utf8 = self.dup
+    if utf8.respond_to?(:force_encoding)
+      utf8 = (utf8 << " ").split(/\n/u).each{|line|
+        line.gsub!(/[\s\p{C}]/u, " ")
+      }.join("\n").chop!
+      utf8.gsub!(Regexp.new(SPECIAL_SPACES.join("|")), " ")
+      utf8.force_encoding("UTF-8")
+    else
       require "oniguruma"
-      utf8 = utf8.split(/\n/n).collect{|line|
+      utf8.split(/\n/n).collect{|line|
         Oniguruma::ORegexp.new("[\\s\\p{C}]", {:encoding => Oniguruma::ENCODING_UTF8}).gsub(line, " ")
       }.join("\n").chop!
     end
-    utf8
-    replace(utf8)
+  end
+  def trim(chars = "")
+    chars.size>0 ? gsub(/\A[#{chars}]+|[#{chars}]+\z/, "") : strip
+  end
+  def to_permalink(separator="-")
+    fix_endlines.to_ascii(chartable).downcase.gsub(/[^a-z0-9]+/, separator).trim(separator)
+  end
+  def nl2br
+    gsub("\n", "<br/>\n")
+  end
+  def to_nicer_sym
+    to_permalink("_").to_sym
+  end
+  def chartable(options = {})
+    options = {
+      :clean_binary => true,
+      :translit_symbols => true,
+    }.merge(options)
+    char = "%c"
+    table = {
+      "`" => "'",  # dec = 96
+      "¦" => "|",  # dec = 166, broken vertical bar
+      "¨" => "",   # dec = 168, spacing diaeresis - umlaut
+      "ª" => "",   # dec = 170, feminine ordinal indicator
+      "«" => "\"", # dec = 171, left double angle quotes
+      "¬" => "!",  # dec = 172, not sign
+      "" => "-",  # dec = 173, soft hyphen
+      "¯" => "-",  # dec = 175, spacing macron - overline
+      "²" => "2",  # dec = 178, superscript two - squared
+      "³" => "3",  # dec = 179, superscript three - cubed
+      "´" => "'",  # dec = 180, acute accent - spacing acute
+      "·" => "",   # dec = 183, middle dot - Georgian comma
+      "¸" => "",   # dec = 184, spacing cedilla
+      "¹" => "1",  # dec = 185, superscript one
+      "º" => "0",  # dec = 186, masculine ordinal indicator
+      "»" => "\"", # dec = 187, right double angle quotes
+      "¿" => "",   # dec = 191, inverted question mark
+      "Ý" => "Y",  # dec = 221
+      "–" => "-",  # hex = 2013, en dash
+      "—" => "-",  # hex = 2014, em dash
+      "‚" => "'",  # hex = 201A, single low-9 quotation mark
+      "„" => "\"", # hex = 201E, double low-9 quotation mark
+    }
+    if options[:clean_binary]
+      table[char %   0] = ""  # null
+      table[char %   1] = ""  # start of heading
+      table[char %   2] = ""  # start of text
+      table[char %   3] = ""  # end of text
+      table[char %   4] = ""  # end of transmission
+      table[char %   5] = ""  # enquiry
+      table[char %   6] = ""  # acknowledge
+      table[char %   7] = ""  # bell
+      table[char %   8] = ""  # backspace
+      table[char %   9] = " " # tab
+      table[char %  11] = ""  # vertical tab
+      table[char %  12] = ""  # form feed
+      table[char %  14] = ""  # shift out
+      table[char %  15] = ""  # shift in
+      table[char %  16] = ""  # data link escape
+      table[char %  17] = ""  # device control 1
+      table[char %  18] = ""  # device control 2
+      table[char %  19] = ""  # device control 3
+      table[char %  20] = ""  # device control 4
+      table[char %  21] = ""  # negative acknowledgement
+      table[char %  22] = ""  # synchronous idle
+      table[char %  23] = ""  # end of transmission block
+      table[char %  24] = ""  # cancel
+      table[char %  25] = ""  # end of medium
+      table[char %  26] = ""  # substitute
+      table[char %  27] = ""  # escape
+      table[char %  28] = ""  # file separator
+      table[char %  29] = ""  # group separator
+      table[char %  30] = ""  # record separator
+      table[char %  31] = ""  # unit separator
+      table[char % 127] = ""  # delete
+    end
+    if options[:translit_symbols]
+      table["$"]        = " dollars "              # dec = 36, dollar sign
+      table["%"]        = " percent "              # dec = 37, percent sign
+      table["&"]        = " and "                  # dec = 38, ampersand
+      table["@"]        = " at "                   # dec = 64, at symbol
+      table[char % 128] = " euros "                # windows euro
+      table["¢"]        = " cents "                # dec = 162, cent sign
+      table["£"]        = " pounds "               # dec = 163, pound sign
+      table["¤"]        = " euros "                # dec = 164, currency sign
+      table["¥"]        = " yens "                 # dec = 165, yen sign
+      table["§"]        = " section "              # dec = 167, section sign
+      table["©"]        = " copyright "            # dec = 169, copyright sign
+      table["®"]        = " registered trademark " # dec = 174, registered trade mark sign
+      table["°"]        = " degrees "              # dec = 176, degree sign
+      table["±"]        = " approx "               # dec = 177, plus-or-minus sign
+      table["µ"]        = " micro "                # dec = 181, micro sign
+      table["¶"]        = " paragraph "            # dec = 182, pilcrow sign - paragraph sign
+      table["¼"]        = " 1/4 "                  # dec = 188, fraction one quarter
+      table["½"]        = " 1/2 "                  # dec = 189, fraction one half
+      table["¾"]        = " 3/4 "                  # dec = 190, fraction three quarters
+      table["€"]        = " euros "                # hex = 20AC, unicode euro
+      table["™"]        = " trademark "            # hex = 2122, trade mark
+    end
+    table
   end
 end

data/spec/spec_helper.rb CHANGED Viewed

@@ -1,3 +1,2 @@
-require "spec"
-require "rubygems"
-require File.join(File.dirname(__FILE__), "..", "lib", "string_cleaner")
+require 'bundler/setup'
+require 'string_cleaner'

data/spec/string_cleaner_spec.rb CHANGED Viewed

@@ -19,6 +19,23 @@ describe String::Cleaner do
         @output.should == "          \n  \n                   !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ €                                ¡¢£€¥Š§š©ª«¬ ®¯°±²³Žµ¶·ž¹º»ŒœŸ¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
       end
     end
+    it "should convert all type of spaces to normal spaces" do
+      input = [
+                (0x0009..0x000D).to_a, # White_Space # Cc   [5] <control-0009>..<control-000D>
+                0x0020,                # White_Space # Zs       SPACE
+                0x0085,                # White_Space # Cc       <control-0085>
+                0x00A0,                # White_Space # Zs       NO-BREAK SPACE
+                0x1680,                # White_Space # Zs       OGHAM SPACE MARK
+                0x180E,                # White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
+                (0x2000..0x200A).to_a, # White_Space # Zs  [11] EN QUAD..HAIR SPACE
+                0x2028,                # White_Space # Zl       LINE SEPARATOR
+                0x2029,                # White_Space # Zp       PARAGRAPH SEPARATOR
+                0x202F,                # White_Space # Zs       NARROW NO-BREAK SPACE
+                0x205F,                # White_Space # Zs       MEDIUM MATHEMATICAL SPACE
+                0x3000,                # White_Space # Zs       IDEOGRAPHIC SPACE
+              ].flatten.collect{ |e| [e].pack 'U*' }
+      input.join.clean.should == " \n  \n                     "
+    end
     describe "with invalid UTF-8 sequence" do
       before :all do
         @input = "\210\004"
@@ -201,4 +218,54 @@ describe String::Cleaner do
       end
     end
   end
+  describe "#trim(chars = \"\")" do
+    it "should use #strip when used without params" do
+      string, expected = "", mock
+      string.stub(:strip).and_return expected
+      string.trim.should be expected
+    end
+    it "should remove multiple characters at once from beginning and end" do
+      prefix, suffix = " rhuif dww f f", "dqz  qafdédsj iowe fcms. qpo asttt t dtt"
+      to_remove = "acdeéfhijmopqrstuwz "
+      "#{prefix}d#{suffix}".trim(to_remove).should eql "."
+      "#{prefix}D#{suffix}".trim(to_remove).should eql "Ddqz  qafdédsj iowe fcms."
+    end
+  end
+  describe "#fix_endlines" do
+    it "should convert windows endlines" do
+      "this is a\r\ntest\r\n".fix_endlines.should eql "this is a\ntest\n"
+    end
+    it "should convert old mac endlines" do
+      "this is a\rtest\r".fix_endlines.should eql "this is a\ntest\n"
+    end
+    it "should not modify proper linux endlines" do
+      "this is a\ntest\n".fix_endlines.should eql "this is a\ntest\n"
+    end
+    it "should convert mixed endlines" do
+      "this is a\n\rtest\r\n".fix_endlines.should eql "this is a\n\ntest\n"
+    end
+  end
+  describe "#to_permalink(separator=\"-\")" do
+    it "should create nice permalink for string with many accents" do
+      crazy = "  ÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöøùúûüý - Hello world, I'm a crazy string!! "
+      crazy.to_permalink.should == "aaaaaaceeeeiiiidnoooooxouuuuyaaaaaaceeeeiiiinoooooouuuuy-hello-world-i-m-a-crazy-string"
+    end
+    it "should create nice permalink even for evil string" do
+      evil = (128..255).inject(""){ |acc, b| acc += ("%c" % b) }
+      evil.to_permalink.should == "euros-cents-pounds-euros-yens-section-copyright-registered-trademark-degrees-approx-23-micro-paragraph-10-1-4-1-2-3-4-aaaaaaaeceeeeiiiidnoooooxouuuuythssaaaaaaaeceeeeiiiidnooooo-ouuuuythy"
+    end
+    it "should remove endlines too" do
+      "this\nis\ta\ntest".to_permalink("_").should eql "this_is_a_test"
+    end
+  end
+  describe "#nl2br" do
+    it "should convert \n to <br/>\n" do
+      "this\nis\ta\ntest\r".nl2br.should eql "this<br/>\nis\ta<br/>\ntest\r"
+    end
+  end
+  describe "#to_nicer_sym" do
+    it "should convert \"Select or Other\" to :select_or_other" do
+      "Select or Other".to_nicer_sym.should be :select_or_other
+    end
+  end
 end

data/string_cleaner.gemspec CHANGED Viewed

@@ -2,11 +2,11 @@
 Gem::Specification.new do |s|
   s.name = %q{string_cleaner}
-  s.version = "0.1.0"
+  s.version = "0.2.0"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Joseph Halter"]
-  s.date = %q{2009-08-26}
+  s.date = %q{2010-10-18}
   s.email = %q{joseph@openhood.com}
   s.extra_rdoc_files = [
     "LICENSE",
@@ -17,7 +17,6 @@ Gem::Specification.new do |s|
      "LICENSE",
      "README.rdoc",
      "Rakefile",
-     "VERSION",
      "lib/string_cleaner.rb",
      "spec/spec_helper.rb",
      "spec/string_cleaner_spec.rb",
@@ -33,6 +32,8 @@ Gem::Specification.new do |s|
     "spec/spec_helper.rb",
      "spec/string_cleaner_spec.rb"
   ]
+  s.add_runtime_dependency "unidecoder"
+  s.add_development_dependency "rspec"
   if s.respond_to? :specification_version then
     current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION

metadata CHANGED Viewed

@@ -1,13 +1,12 @@
 --- !ruby/object:Gem::Specification
 name: string_cleaner
 version: !ruby/object:Gem::Version
-  hash: 27
   prerelease: false
   segments:
   - 0
-  - 1
+  - 2
   - 0
-  version: 0.1.0
+  version: 0.2.0
 platform: ruby
 authors:
 - Joseph Halter
@@ -15,10 +14,35 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2009-08-26 00:00:00 +02:00
+date: 2010-10-18 00:00:00 +02:00
 default_executable:
-dependencies: []
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: unidecoder
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 0
+        version: "0"
+  type: :runtime
+  version_requirements: *id001
+- !ruby/object:Gem::Dependency
+  name: rspec
+  prerelease: false
+  requirement: &id002 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 0
+        version: "0"
+  type: :development
+  version_requirements: *id002
 description:
 email: joseph@openhood.com
 executables: []
@@ -33,7 +57,6 @@ files:
 - LICENSE
 - README.rdoc
 - Rakefile
-- VERSION
 - lib/string_cleaner.rb
 - spec/spec_helper.rb
 - spec/string_cleaner_spec.rb
@@ -52,7 +75,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      hash: 3
       segments:
       - 0
       version: "0"
@@ -61,7 +83,6 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      hash: 3
       segments:
       - 0
       version: "0"

data/VERSION DELETED Viewed

	@@ -1 +0,0 @@
1	- 0.1.0