RubyGems - string_cleaner - Versions diffs - 0.1.0 → 0.2.0 - Mend

string_cleaner 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

data/.gitignore CHANGED Viewed

@@ -1,5 +1,6 @@
-*.sw?
+.bundle
 .DS_Store
 coverage
 rdoc
 pkg
+*.gem

data/Rakefile CHANGED Viewed

@@ -1,46 +1,26 @@
-require "rubygems"
-require "rake"
+require 'rubygems'
 begin
-  require "jeweler"
-  Jeweler::Tasks.new do |gem|
-    gem.name = "string_cleaner"
-    gem.summary = %Q{TODO}
-    gem.email = "joseph@openhood.com"
-    gem.homepage = "http://github.com/JosephHalter/string_cleaner"
-    gem.authors = ["Joseph Halter"]
-    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
-  end
+  require 'bundler/setup'
 rescue LoadError
-  puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
+  puts 'You must `gem install bundler` and `bundle install` to run rake tasks'
 end
-require "spec/rake/spectask"
-Spec::Rake::SpecTask.new(:spec) do |spec|
-  spec.libs << "lib" << "spec"
-  spec.spec_files = FileList["spec/**/*_spec.rb"]
+require 'rspec/core/rake_task'
+RSpec::Core::RakeTask.new(:spec) do |spec|
+  spec.pattern = 'spec/**/*_spec.rb'
 end
-Spec::Rake::SpecTask.new(:rcov) do |spec|
-  spec.libs << "lib" << "spec"
-  spec.pattern = "spec/**/*_spec.rb"
+RSpec::Core::RakeTask.new(:rcov) do |spec|
+  spec.pattern = 'spec/**/*_spec.rb'
   spec.rcov = true
 end
 task :default => :spec
-require "rake/rdoctask"
+require 'rake/rdoctask'
 Rake::RDocTask.new do |rdoc|
-  if File.exist?("VERSION.yml")
-    config = YAML.load(File.read("VERSION.yml"))
-    version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
-  else
-    version = ""
-  end
-  rdoc.rdoc_dir = "rdoc"
-  rdoc.title = "test #{version}"
-  rdoc.rdoc_files.include("README*")
-  rdoc.rdoc_files.include("lib/**/*.rb")
-end
+  rdoc.rdoc_dir = 'rdoc'
+  rdoc.title = "string_cleaner"
+  rdoc.rdoc_files.include('README*')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+end

data/lib/string_cleaner.rb CHANGED Viewed

@@ -1,56 +1,170 @@
 # encoding: UTF-8
+require "unidecoder"
 module String::Cleaner
-  # convert invalid UTF-8 strings from ISO-8859-15 to UTF-8 to fix them
-  # recognize euro char from both ISO 8859-15 and Windows-1252
-  # replace \r\n and \r with \n normalizing end of lines
-  # replace control characters and other invisible chars by spaces
   def clean
-    utf8 = self.dup
-    if utf8.respond_to?(:force_encoding)
+    fix_encoding.fix_endlines.fix_invisible_chars
+  end
-      # for Ruby 1.9+
-      utf8.force_encoding("UTF-8")
+  def fix_encoding
+    utf8 = dup
+    if utf8.respond_to?(:force_encoding)
+      utf8.force_encoding("UTF-8") # for Ruby 1.9+
       unless utf8.valid_encoding? # if invalid UTF-8
         utf8 = utf8.force_encoding("ISO8859-15")
         utf8.encode!("UTF-8", :invalid => :replace, :undef => :replace, :replace => "")
         utf8.gsub!("\xC2\x80", "€") # special case for euro sign from Windows-1252
         utf8.force_encoding("UTF-8")
       end
-      # normalize end of lines
-      utf8.gsub!(/\r\n/u, "\n")
-      utf8.gsub!(/\r/u, "\n")
-      # normalize invisible chars
-      utf8 = (utf8 << " ").split(/\n/u).each{|line|
-        line.gsub!(/[\s\p{C}]/u, " ")
-      }.join("\n").chop!
-      utf8.force_encoding("UTF-8")
+      utf8
     else
-      # for Ruby 1.8.6, use iconv
       require "iconv"
       utf8 << " "
-      utf8 = begin
+      begin
         Iconv.new("UTF-8", "UTF-8").iconv(utf8)
       rescue
         utf8.gsub!(/\x80/n, "\xA4")
-        utf8 = Iconv.new("UTF-8//IGNORE", "ISO8859-15").iconv(utf8)
+        Iconv.new("UTF-8//IGNORE", "ISO8859-15").iconv(utf8)
       end
+    end
+  end
-      # normalize end of lines
-      utf8.gsub!(/\r\n/n, "\n")
-      utf8.gsub!(/\r/n, "\n")
-      # normalize invisible chars using oniguruma
+  def fix_endlines
+    gsub(/(?:\r\n|\r)/u, "\n")
+  end
+  SPECIAL_SPACES = [
+            0x00A0,                # White_Space # Zs       NO-BREAK SPACE
+            0x1680,                # White_Space # Zs       OGHAM SPACE MARK
+            0x180E,                # White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
+            (0x2000..0x200A).to_a, # White_Space # Zs  [11] EN QUAD..HAIR SPACE
+            0x2028,                # White_Space # Zl       LINE SEPARATOR
+            0x2029,                # White_Space # Zp       PARAGRAPH SEPARATOR
+            0x202F,                # White_Space # Zs       NARROW NO-BREAK SPACE
+            0x205F,                # White_Space # Zs       MEDIUM MATHEMATICAL SPACE
+            0x3000,                # White_Space # Zs       IDEOGRAPHIC SPACE
+          ].flatten.collect{|e| [e].pack 'U*'}
+  def fix_invisible_chars
+    utf8 = self.dup
+    if utf8.respond_to?(:force_encoding)
+      utf8 = (utf8 << " ").split(/\n/u).each{|line|
+        line.gsub!(/[\s\p{C}]/u, " ")
+      }.join("\n").chop!
+      utf8.gsub!(Regexp.new(SPECIAL_SPACES.join("|")), " ")
+      utf8.force_encoding("UTF-8")
+    else
       require "oniguruma"
-      utf8 = utf8.split(/\n/n).collect{|line|
+      utf8.split(/\n/n).collect{|line|
         Oniguruma::ORegexp.new("[\\s\\p{C}]", {:encoding => Oniguruma::ENCODING_UTF8}).gsub(line, " ")
       }.join("\n").chop!
     end
-    utf8
-    replace(utf8)
+  end
+  def trim(chars = "")
+    chars.size>0 ? gsub(/\A[#{chars}]+|[#{chars}]+\z/, "") : strip
+  end
+  def to_permalink(separator="-")
+    fix_endlines.to_ascii(chartable).downcase.gsub(/[^a-z0-9]+/, separator).trim(separator)
+  end
+  def nl2br
+    gsub("\n", "<br/>\n")
+  end
+  def to_nicer_sym
+    to_permalink("_").to_sym
+  end
+  def chartable(options = {})
+    options = {
+      :clean_binary => true,
+      :translit_symbols => true,
+    }.merge(options)
+    char = "%c"
+    table = {
+      "`" => "'",  # dec = 96
+      "¦" => "|",  # dec = 166, broken vertical bar
+      "¨" => "",   # dec = 168, spacing diaeresis - umlaut
+      "ª" => "",   # dec = 170, feminine ordinal indicator
+      "«" => "\"", # dec = 171, left double angle quotes
+      "¬" => "!",  # dec = 172, not sign
+      "" => "-",  # dec = 173, soft hyphen
+      "¯" => "-",  # dec = 175, spacing macron - overline
+      "²" => "2",  # dec = 178, superscript two - squared
+      "³" => "3",  # dec = 179, superscript three - cubed
+      "´" => "'",  # dec = 180, acute accent - spacing acute
+      "·" => "",   # dec = 183, middle dot - Georgian comma
+      "¸" => "",   # dec = 184, spacing cedilla
+      "¹" => "1",  # dec = 185, superscript one
+      "º" => "0",  # dec = 186, masculine ordinal indicator
+      "»" => "\"", # dec = 187, right double angle quotes
+      "¿" => "",   # dec = 191, inverted question mark
+      "Ý" => "Y",  # dec = 221
+      "–" => "-",  # hex = 2013, en dash
+      "—" => "-",  # hex = 2014, em dash
+      "‚" => "'",  # hex = 201A, single low-9 quotation mark
+      "„" => "\"", # hex = 201E, double low-9 quotation mark
+    }
+    if options[:clean_binary]
+      table[char %   0] = ""  # null
+      table[char %   1] = ""  # start of heading
+      table[char %   2] = ""  # start of text
+      table[char %   3] = ""  # end of text
+      table[char %   4] = ""  # end of transmission
+      table[char %   5] = ""  # enquiry
+      table[char %   6] = ""  # acknowledge
+      table[char %   7] = ""  # bell
+      table[char %   8] = ""  # backspace
+      table[char %   9] = " " # tab
+      table[char %  11] = ""  # vertical tab
+      table[char %  12] = ""  # form feed
+      table[char %  14] = ""  # shift out
+      table[char %  15] = ""  # shift in
+      table[char %  16] = ""  # data link escape
+      table[char %  17] = ""  # device control 1
+      table[char %  18] = ""  # device control 2
+      table[char %  19] = ""  # device control 3
+      table[char %  20] = ""  # device control 4
+      table[char %  21] = ""  # negative acknowledgement
+      table[char %  22] = ""  # synchronous idle
+      table[char %  23] = ""  # end of transmission block
+      table[char %  24] = ""  # cancel
+      table[char %  25] = ""  # end of medium
+      table[char %  26] = ""  # substitute
+      table[char %  27] = ""  # escape
+      table[char %  28] = ""  # file separator
+      table[char %  29] = ""  # group separator
+      table[char %  30] = ""  # record separator
+      table[char %  31] = ""  # unit separator
+      table[char % 127] = ""  # delete
+    end
+    if options[:translit_symbols]
+      table["$"]        = " dollars "              # dec = 36, dollar sign
+      table["%"]        = " percent "              # dec = 37, percent sign
+      table["&"]        = " and "                  # dec = 38, ampersand
+      table["@"]        = " at "                   # dec = 64, at symbol
+      table[char % 128] = " euros "                # windows euro
+      table["¢"]        = " cents "                # dec = 162, cent sign
+      table["£"]        = " pounds "               # dec = 163, pound sign
+      table["¤"]        = " euros "                # dec = 164, currency sign
+      table["¥"]        = " yens "                 # dec = 165, yen sign
+      table["§"]        = " section "              # dec = 167, section sign
+      table["©"]        = " copyright "            # dec = 169, copyright sign
+      table["®"]        = " registered trademark " # dec = 174, registered trade mark sign
+      table["°"]        = " degrees "              # dec = 176, degree sign
+      table["±"]        = " approx "               # dec = 177, plus-or-minus sign
+      table["µ"]        = " micro "                # dec = 181, micro sign
+      table["¶"]        = " paragraph "            # dec = 182, pilcrow sign - paragraph sign
+      table["¼"]        = " 1/4 "                  # dec = 188, fraction one quarter
+      table["½"]        = " 1/2 "                  # dec = 189, fraction one half
+      table["¾"]        = " 3/4 "                  # dec = 190, fraction three quarters
+      table["€"]        = " euros "                # hex = 20AC, unicode euro
+      table["™"]        = " trademark "            # hex = 2122, trade mark
+    end
+    table
   end
 end

data/spec/spec_helper.rb CHANGED Viewed

@@ -1,3 +1,2 @@
-require "spec"
-require "rubygems"
-require File.join(File.dirname(__FILE__), "..", "lib", "string_cleaner")
+require 'bundler/setup'
+require 'string_cleaner'

data/spec/string_cleaner_spec.rb CHANGED Viewed

@@ -19,6 +19,23 @@ describe String::Cleaner do
         @output.should == "          \n  \n                   !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ €                                ¡¢£€¥Š§š©ª«¬ ®¯°±²³Žµ¶·ž¹º»ŒœŸ¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
       end
     end
+    it "should convert all type of spaces to normal spaces" do
+      input = [
+                (0x0009..0x000D).to_a, # White_Space # Cc   [5] <control-0009>..<control-000D>
+                0x0020,                # White_Space # Zs       SPACE
+                0x0085,                # White_Space # Cc       <control-0085>
+                0x00A0,                # White_Space # Zs       NO-BREAK SPACE
+                0x1680,                # White_Space # Zs       OGHAM SPACE MARK
+                0x180E,                # White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
+                (0x2000..0x200A).to_a, # White_Space # Zs  [11] EN QUAD..HAIR SPACE
+                0x2028,                # White_Space # Zl       LINE SEPARATOR
+                0x2029,                # White_Space # Zp       PARAGRAPH SEPARATOR
+                0x202F,                # White_Space # Zs       NARROW NO-BREAK SPACE
+                0x205F,                # White_Space # Zs       MEDIUM MATHEMATICAL SPACE
+                0x3000,                # White_Space # Zs       IDEOGRAPHIC SPACE
+              ].flatten.collect{ |e| [e].pack 'U*' }
+      input.join.clean.should == " \n  \n                     "
+    end
     describe "with invalid UTF-8 sequence" do
       before :all do
         @input = "\210\004"
@@ -201,4 +218,54 @@ describe String::Cleaner do
       end
     end
   end
+  describe "#trim(chars = \"\")" do
+    it "should use #strip when used without params" do
+      string, expected = "", mock
+      string.stub(:strip).and_return expected
+      string.trim.should be expected
+    end
+    it "should remove multiple characters at once from beginning and end" do
+      prefix, suffix = " rhuif dww f f", "dqz  qafdédsj iowe fcms. qpo asttt t dtt"
+      to_remove = "acdeéfhijmopqrstuwz "
+      "#{prefix}d#{suffix}".trim(to_remove).should eql "."
+      "#{prefix}D#{suffix}".trim(to_remove).should eql "Ddqz  qafdédsj iowe fcms."
+    end
+  end
+  describe "#fix_endlines" do
+    it "should convert windows endlines" do
+      "this is a\r\ntest\r\n".fix_endlines.should eql "this is a\ntest\n"
+    end
+    it "should convert old mac endlines" do
+      "this is a\rtest\r".fix_endlines.should eql "this is a\ntest\n"
+    end
+    it "should not modify proper linux endlines" do
+      "this is a\ntest\n".fix_endlines.should eql "this is a\ntest\n"
+    end
+    it "should convert mixed endlines" do
+      "this is a\n\rtest\r\n".fix_endlines.should eql "this is a\n\ntest\n"
+    end
+  end
+  describe "#to_permalink(separator=\"-\")" do
+    it "should create nice permalink for string with many accents" do
+      crazy = "  ÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöøùúûüý - Hello world, I'm a crazy string!! "
+      crazy.to_permalink.should == "aaaaaaceeeeiiiidnoooooxouuuuyaaaaaaceeeeiiiinoooooouuuuy-hello-world-i-m-a-crazy-string"
+    end
+    it "should create nice permalink even for evil string" do
+      evil = (128..255).inject(""){ |acc, b| acc += ("%c" % b) }
+      evil.to_permalink.should == "euros-cents-pounds-euros-yens-section-copyright-registered-trademark-degrees-approx-23-micro-paragraph-10-1-4-1-2-3-4-aaaaaaaeceeeeiiiidnoooooxouuuuythssaaaaaaaeceeeeiiiidnooooo-ouuuuythy"
+    end
+    it "should remove endlines too" do
+      "this\nis\ta\ntest".to_permalink("_").should eql "this_is_a_test"
+    end
+  end
+  describe "#nl2br" do
+    it "should convert \n to <br/>\n" do
+      "this\nis\ta\ntest\r".nl2br.should eql "this<br/>\nis\ta<br/>\ntest\r"
+    end
+  end
+  describe "#to_nicer_sym" do
+    it "should convert \"Select or Other\" to :select_or_other" do
+      "Select or Other".to_nicer_sym.should be :select_or_other
+    end
+  end
 end

data/string_cleaner.gemspec CHANGED Viewed

@@ -2,11 +2,11 @@
 Gem::Specification.new do |s|
   s.name = %q{string_cleaner}
-  s.version = "0.1.0"
+  s.version = "0.2.0"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Joseph Halter"]
-  s.date = %q{2009-08-26}
+  s.date = %q{2010-10-18}
   s.email = %q{joseph@openhood.com}
   s.extra_rdoc_files = [
     "LICENSE",
@@ -17,7 +17,6 @@ Gem::Specification.new do |s|
      "LICENSE",
      "README.rdoc",
      "Rakefile",
-     "VERSION",
      "lib/string_cleaner.rb",
      "spec/spec_helper.rb",
      "spec/string_cleaner_spec.rb",
@@ -33,6 +32,8 @@ Gem::Specification.new do |s|
     "spec/spec_helper.rb",
      "spec/string_cleaner_spec.rb"
   ]
+  s.add_runtime_dependency "unidecoder"
+  s.add_development_dependency "rspec"
   if s.respond_to? :specification_version then
     current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION

metadata CHANGED Viewed

@@ -1,13 +1,12 @@
 --- !ruby/object:Gem::Specification
 name: string_cleaner
 version: !ruby/object:Gem::Version
-  hash: 27
   prerelease: false
   segments:
   - 0
-  - 1
+  - 2
   - 0
-  version: 0.1.0
+  version: 0.2.0
 platform: ruby
 authors:
 - Joseph Halter
@@ -15,10 +14,35 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2009-08-26 00:00:00 +02:00
+date: 2010-10-18 00:00:00 +02:00
 default_executable:
-dependencies: []
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: unidecoder
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 0
+        version: "0"
+  type: :runtime
+  version_requirements: *id001
+- !ruby/object:Gem::Dependency
+  name: rspec
+  prerelease: false
+  requirement: &id002 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 0
+        version: "0"
+  type: :development
+  version_requirements: *id002
 description:
 email: joseph@openhood.com
 executables: []
@@ -33,7 +57,6 @@ files:
 - LICENSE
 - README.rdoc
 - Rakefile
-- VERSION
 - lib/string_cleaner.rb
 - spec/spec_helper.rb
 - spec/string_cleaner_spec.rb
@@ -52,7 +75,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      hash: 3
       segments:
       - 0
       version: "0"
@@ -61,7 +83,6 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      hash: 3
       segments:
       - 0
       version: "0"

data/VERSION DELETED Viewed

@@ -1 +0,0 @@
-0.1.0