RubyGems - kyanite - Versions diffs - 0.8.0 → 0.8.1 - Mend

kyanite 0.8.0 → 0.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

data/History.rdoc +1 -1
data/lib/kyanite/string/chars.rb +51 -32
data/lib/kyanite/string/chars_const.rb +7 -7
data/test/string/test_chars.rb +52 -24
data/version.rb +1 -1
metadata +2 -2

data/History.rdoc CHANGED

@@ -1,5 +1,5 @@
 == 0.8.0 2012-11-17
-* added String#to_ascii with human-like handling of unicode special characters
+* added String#reduce with human-like handling of unicode special characters
 == 0.7.5 2012-11-14
 * added FSymbol class

data/lib/kyanite/string/chars.rb CHANGED

@@ -45,56 +45,75 @@ class String
     end
-    # Reduces the string to a ASCII encoding. Example:
+    # Reduces a rich unicode string to a very limited character set like humans do. Example:
+    #  "Céline hören".reduce
+    #  => "Celine hoeren"
+    #
+    # Handles all characters from ISO/IEC 8859-1 and CP1252
+    # like humans do, not just deleting the accents.
+    # So it's not a 1:1 translation, some unicode characters are translated to
+    # multiple characters. Example:
+    #  "ÄÖÜäöüß".reduce
+    #  => "AeOeUeaeoeuess"
+    #
+    # For many unicode characters, this behaviour is based on +UnicodeUtils.nfkd+. Example:
     #  ffi = "\uFB03"
     #  ix = "\u2168"
     #  high23="²³"
     #  high5 = "\u2075"
     #  all = ffi + ix + high23 + high5
-    #  all.to_ascii
+    #  all.reduce
     #  => "ffiIX235"
     #
-    # Based on +UnicodeUtils.nfkd+, but handles all characters from ISO/IEC 8859-1 and CP1252
-    # like humans do, not just deleting the accents. Example:
-    #  "ÄÖÜäöüß".to_ascii
-    #  => "AeOeUeaeoeuess"
+    # You can preserve some characters, e.g. all special characters of a specific language. Example:
+    #  "Céline hören 10€".reduce( :preserve => "ÄÖÜäöüß")
+    #  => "Celine hören 10EUR"
+    #
+    # Newlines are preserved by default, but all other nonprintable ascii characters below \\x20 are removed.
     #
-    # 1. Converts ÄÖÜäöüßàáâăäãāåạąæảấầắằ etc. to AeOeUeaeoeuessaaaaaaaaaaaaaaaa.
-    # 2. Then removes all non-Ascii-chars.
-    # 3. Then removes all non-printable Ascii-chars.
-    # 4. Caution: Also Newlines are removed.
-    # About 10 times slower than {#reduce94 reduce94}, but more accurate.
+    # There is also a fast mode. It's about 10 times faster, but it supports only 1:1 translation.
+    #  "Céline hören 10€".reduce( :preserve => "ÄÖÜäöüß€", :fast => true )
+    #  => "Celine hören 10€"
+    #
+    #  "ÄÖÜäöüß€".reduce( :fast => true )
+    #  => "AOUaous"
+    #
+    # Your result will only contain these characters:
+    # * printable letters and basic symbols of the 7bit ASCII charset (\\x20..\\x7e)
+    # * preserved characters as defined in the options (max 18)
+    # * newlines (\\x0a and \\x0d)
     #
-    def to_ascii
-      result = self.to_ascii_extra_chars
-      result.tr!(TR_FULL, TR_REDUCED)     # not necessary, only for performance
-      return UnicodeUtils.nfkd(result).delete('^ -~') # delete is faster than gsub
+    # Options:
+    # [:preserve] Special characters to preserve. You can only preserve up to 18 characters.
+    # [:fast]     Fast mode, if true. About 10 times faster, but it supports only 1:1 translation.
+    #
+    # @return [String]
+    def reduce( options ={} )
+      preserve = options[:preserve] || ''
+      raise ArgumentError, 'max preserve string length is 18 chars'   if preserve.length > 18
+      result = self.delete("\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0b\x0c\x0e-\x1f")
+      result.tr!(preserve, "\x0e-\x1f")                               if preserve.length > 0
+      result = result.to_ascii_extra_chars                            unless options[:fast]
+      result.tr!(TR_FULL, TR_REDUCED)
+      result = UnicodeUtils.nfkd(result)                              unless options[:fast]
+      result.delete!("^\x09-\x7e")
+      result.tr!("\x0e-\x1f", preserve)                               if preserve.length > 0
+      result
     end
-    # Reduces the string to a base94 encoding.
-    # About 10 times faster than with +UnicodeUtils+.
-    # 1. Converts àáâăäãāåạąæảấầắằÀÁÂĂÄÃĀÅẠĄÆẢẤẦẮẰ etc. to aaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAA.
-    # 2. Then removes all non-Ascii-chars.
-    # 3. Then removes all non-printable Ascii-chars.
-    # 4. Caution: Also Newlines are removed.
-    #
-    # See tests and examples {TestKyaniteStringChars#test_reduce94_a here}.
+    # @deprecated
     # @return [String]
     def reduce94( options={} )
-      dup.reduce94!(options)
+      reduce(  {:fast => true}.merge(options)  )
     end
-    # In-place-variant of {#reduce94 reduce94}.
-    # @return [String]
-    def reduce94!( options={} )
-      self.gsub!( 'ß', options[:german_sz] )        if options[:german_sz]
-      self.tr!(TR_FULL, TR_REDUCED)
-      self.delete!('^ -~')
-      self
-    end
     # Reduziert den String auf ein Base53-Encoding,
     # bestehend aus Großbuchstaben, Minuszeichen und zu Kleinbuchstaben umgeformten Sonderzeichen.

data/lib/kyanite/string/chars_const.rb CHANGED

@@ -16,12 +16,12 @@ unless defined?(TR_UPCASE_ALL_REGEXP)
   klammer_auf = "\u227a\u226a\u3008\u276c\u2329\u25c1\u25c0"
   klammer_zu =  "\u227b\u226b\u3009\u276d\u232a\u25b7\u25b6"
-  # Sowohl reduce94 als auch to_ascii werden diese Zeichen übersetzen.
+  # Sowohl reduce94 als auch reduce werden diese Zeichen übersetzen.
   # Zeichen, die TR_FULL ergänzen und die UnicodeUtils.nfkd nicht korrekt umsetzt.
   tr_full_b =    %q{£₤¢‹¥›•«×»÷‚‘ƒ’ˆ§´¡„¿“¦”†‡µ′″°¤∗·⋅} + leerzeichen + klammer_auf + klammer_zu
   tr_reduced_b = %q{LLc"Y"*"*"/''f'^P'!"?"|"~~u'"~~***} + (" "*leerzeichen.length) + ("<"*klammer_auf.length) + (">"*klammer_zu.length)
-  # Nur to_ascii wird diese Zeichen übersetzen.
+  # Nur reduce wird diese Zeichen übersetzen.
   # Zeichen, die in TR_FULL schon drin sind und die UnicodeUtils.nfkd nicht korrekt umsetzt
   tr_full_c =    %q{ØøðđÐĐħĦıĸłŁŧþŦÞаАбБцчЦЧдДеэЕЭфФгГхХийИЙюяЮЯкКлЛмМнНоОпПрРсшщСШЩтТуУвВжзЖЗ}
   tr_reduced_c = %q{OoddDDhHiklLttTTaAbBccCCdDeeEEfFgGhHiiIIjjJJkKlLmMnNoOpPrRsssSSStTuUvVzzZZ}
@@ -29,7 +29,7 @@ unless defined?(TR_UPCASE_ALL_REGEXP)
-  # Nur to_ascii wird diese Zeichen übersetzen.
+  # Nur reduce wird diese Zeichen übersetzen.
   TR_EXTRA_CHARS = [
   [/ß/, 'ss'],
   [/Ö/, 'Oe'],
@@ -267,7 +267,7 @@ if $0 == __FILE__ then
   see "Überprüfe TR_EXTRA_CHARS"
   see "========================"
   see
-  see "defined in", "Dup if <>0", "Trivial?", "Hex Code", "Character", "reduce94", "to_ascii", "Klassifizierung"
+  see "defined in", "Dup if <>0", "Trivial?", "Hex Code", "Character", "reduce94", "reduce", "Klassifizierung"
   startline = 14
   i = 0
   all = ""
@@ -280,7 +280,7 @@ if $0 == __FILE__ then
     c.to_array_of_hex,                    # sein Code in HEX
     c,                                    # das Zeichen
     c.reduce94,                           # was reduce94 daraus macht
-    c.to_ascii,                           # was to_ascii daraus macht
+    c.reduce,                           # was reduce daraus macht
     UnicodeUtils.char_type(c)
     i+=1
@@ -293,7 +293,7 @@ if $0 == __FILE__ then
   see "Überprüfe TR_FULL"
   see "================="
   see
-  see "Nr", "Dup if <>0", "Trivial?", "Hex Code", "Character", "reduce94", "to_ascii", "Klassifizierung"
+  see "Nr", "Dup if <>0", "Trivial?", "Hex Code", "Character", "reduce94", "reduce", "Klassifizierung"
   i = 0
   all = ""
   #TR_FULL_TO_ASCII.each_char do |c|
@@ -305,7 +305,7 @@ if $0 == __FILE__ then
     c.to_array_of_hex,                    # sein Code in HEX
     c,                                    # das Zeichen
     c.reduce94,                           # was reduce94 daraus macht
-    c.to_ascii,                           # was to_ascii daraus macht
+    c.reduce,                           # was reduce daraus macht
     UnicodeUtils.char_type(c)
     i+=1

data/test/string/test_chars.rb CHANGED

@@ -44,7 +44,7 @@ class TestKyaniteStringChars < UnitTest
       assert_equal 0, all.to_a.to_set.size-i-1, "TR_FULL: Dup in Zeichen Nr. #{i} Zeichen #{c} >> #{r}"
       assert c.to_array_of_codepoints[0] > 127, "TR_FULL: Trivialität in Zeichen Nr. #{i} Zeichen #{c} >> #{r}"
       assert r.to_array_of_codepoints[0] <= 127, "TR_FULL: Zeichen Nr. #{i} Zeichen #{c} >> #{r} wird nicht in ASCII umgesetzt"
-      assert_equal c.reduce94, c.to_ascii[0]
+      assert_equal c.reduce94, c.reduce[0]
       i+=1
     end
   end
@@ -87,7 +87,7 @@ class TestKyaniteStringChars < UnitTest
     full    = 'ªàáâăãāåạąảấầắằÀÁÂĂÃĀÅẠĄẢẤẦẮẰ'
     reduced = 'aaaaaaaaaaaaaaaAAAAAAAAAAAAAA'
     assert_equal reduced,       full.reduce94
-    assert_equal reduced,       full.to_ascii
+    assert_equal reduced,       full.reduce
   end
   def test_to_ascii_b
@@ -95,7 +95,7 @@ class TestKyaniteStringChars < UnitTest
     reduced1 =  'cccccCCCCCdDeeeeeeeeeeEEEEEEEEEE'
     reduced2 =  'ccccchCCCCChdDeeeeeeeeeeEEEEEEEEEE'
     assert_equal reduced1,       full.reduce94
-    assert_equal reduced2,       full.to_ascii
+    assert_equal reduced2,       full.reduce
   end
   def test_to_ascii_c
@@ -103,14 +103,14 @@ class TestKyaniteStringChars < UnitTest
     reduced1 =  'ggggGGGGhHiiiiiiiiIIIIIIIIIjJkKllllLLLL'
     reduced2 =  'ggghgGGGhGhHiiiiiiiiIIIIIIIIIjJkKllllLLLL'
     assert_equal reduced1,       full.reduce94
-    assert_equal reduced2,       full.to_ascii
+    assert_equal reduced2,       full.reduce
   end
   def test_to_ascii_e
     full    = 'ńňñņŉŃŇÑŅòóôŏõōőơÒÓÔŎÕŌŐƠ'
     reduced = 'nnnnnNNNNooooooooOOOOOOOO'
     assert_equal reduced,       full.reduce94
-    assert_equal reduced,       full.to_ascii
+    assert_equal reduced,       full.reduce
   end
   def test_to_ascii_f
@@ -118,24 +118,24 @@ class TestKyaniteStringChars < UnitTest
     reduced1 =  'rrrRRRssssSSSSttTTuuuuuuuuuuUUUUUUUUUUwWyyyYYYzzzZZZ'
     reduced2 =  'rrrRRRssshsSSShSttTTuuuuuuuuuuUUUUUUUUUUwWyyyYYYzzzhZZZh'
     assert_equal reduced1,       full.reduce94
-    assert_equal reduced2,       full.to_ascii
+    assert_equal reduced2,       full.reduce
   end
   def test_to_ascii_zusammengesetzt
     full    = 'ĳĲſ…'
     reduced = 'ijIJs...'
-    assert_equal reduced,       full.to_ascii
+    assert_equal reduced,       full.reduce
   end
   def test_to_ascii_same_same
     same_same    = '^!"$%&/()=?@*+~#<>|,;:.-_ {[]}\\'
-    assert_equal same_same,     same_same.to_ascii
+    assert_equal same_same,     same_same.reduce
     same_same    = "'0123456789"
-    assert_equal same_same,     same_same.to_ascii
+    assert_equal same_same,     same_same.reduce
     same_same    = 'abcdefghijklmnopqrstuvwxyz'
-    assert_equal same_same,     same_same.to_ascii
+    assert_equal same_same,     same_same.reduce
     same_same    = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
-    assert_equal same_same,     same_same.to_ascii
+    assert_equal same_same,     same_same.reduce
   end
@@ -143,7 +143,7 @@ class TestKyaniteStringChars < UnitTest
     full = '¯¨'
     reduced = ' ' * full.length
     assert_equal 2,             full.length
-    assert_equal reduced,       full.to_ascii
+    assert_equal reduced,       full.reduce
   end
   def test_to_ascii_s
@@ -155,29 +155,63 @@ class TestKyaniteStringChars < UnitTest
     reduced1 = "sOUAoua"
     reduced2 = "ffiIX235EURssOeUeAeoeueae"
     assert_equal reduced1,       full.reduce94
-    assert_equal reduced2,       full.to_ascii
+    assert_equal reduced2,       full.reduce
   end
   def test_LANG_SPECIAL_CHARS
     LANG_SPECIAL_CHARS .each do | lang, (full, reduced) |
-      #see lang, full, reduced, full.to_ascii, full.reduce94
-      assert_equal reduced,       full.to_ascii
+      #see lang, full, reduced, full.reduce, full.reduce94
+      assert_equal reduced,       full.reduce
     end
   end
   def test_spaces
     spaces =  "\u0020\u00a0\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u202f\u205f\u3000\u2420\u2423"
-    assert_equal spaces.to_ascii, " " * spaces.length
+    assert_equal spaces.reduce, " " * spaces.length
     assert_equal spaces.reduce94, " " * spaces.length
   end
   def test_minus_signs
     minus = "\u00ac\u2212\u2010\u2011\u2012\u2013\u2014\u2015\u2500"
-    assert_equal minus.to_ascii, "-" * minus.length
+    assert_equal minus.reduce, "-" * minus.length
     #assert_equal spaces.reduce94, " " * spaces.length
   end
+  def test_preserve
+          # 0123456789012345678901234567890123456789
+    test = "ßàáâăäãāāāåạąæảấầắằÀÁÂĂÄÃĀÅẠĄÆẢẤẦẮẰćĉčçċĆĈČÇĊďðđĎÐĐèéêěĕëēėęếÈÉÊĚĔËĒĖĘẾĝğġģĜĞĠĢĥħĤĦìíîĭïĩīıįĳÌÍÎĬÏĨĪİĮĲĵĴķĶĺľłļŀĹĽŁĻĿńňñņŉŋŃŇÑŅŊòóôŏöõōøőơœÒÓÔŎÖÕŌØŐƠŒŕřŗŔŘŖśŝšßşŚŜŠŞţťŧþŢŤŦÞùúûŭüũūůűųưÙÚÛŬÜŨŪŮŰŲƯŵŴýŷÿÝŶŸźżžŹŻŽ"
+    belassen = test[10..27]
+    exp = "ssaaaaaeaaaaåạąæảấầắằÀÁÂĂÄÃĀÅẠAAEAAAAAccccchCCCCChdddDDDeeeeeeeeeeEEEEEEEEEEggghgGGGhGhhHHiiiiiiiiiijIIIIIIIIIIJjJkKlllllLLLLLnnnnnnjNNNNNJoooooeooooooeOOOOOeOOOOOOErrrRRRssshsssSSShSttttTTTTuuuuueuuuuuuUUUUUeUUUUUUwWyyyYYYzzzhZZZh"
+    assert_equal exp, test.reduce(:preserve => belassen)
+    assert_raise ArgumentError do
+      belassen = test[10..28]
+      test.reduce(:preserve => belassen)
+    end
+    test = "Háâaäãaållo\nWelt"
+    assert_equal "Haaaäaaallo\nWelt", test.reduce( :preserve =>"äöüßÄÖÜ" )
+  end
+  def test_examples
+    assert_equal "Celine hoeren",       "Céline hören".reduce
+    assert_equal "AeOeUeaeoeuess",      "ÄÖÜäöüß".reduce
+    assert_equal "Celine hören 10EUR",  "Céline hören 10€".reduce( :preserve => "ÄÖÜäöüß")
+    assert_equal "Celine hören 10€",    "Céline hören 10€".reduce( :preserve => "ÄÖÜäöüß€", :fast => true)
+    assert_equal "AOUaous",             "ÄÖÜäöüß€".reduce( :fast => true )
+  end
+  def test_newlines_and_nonprintables
+    test = "Céli\x00ne\nhöre\x0c\x0e\x0fn"
+    assert_equal "Celine\nhören",      test.reduce( :preserve => "ÄÖÜäöüß")
+    assert_equal "Celine\nhoeren",     test.reduce
+    assert_equal "Celine\nhoren",      test.reduce(:fast => true )
+  end
@@ -247,13 +281,7 @@ ENDOFSTRING
     assert_equal 'SCHEIZE',   'Scheiße'.reduce53(:german_sz => 'z')
     assert_equal 'SCHEIZE',   'Scheiße'.reduce53(:german_sz => 'Z')
     assert_equal 'SCHEISSE',  'Scheiße'.reduce53(:german_sz => 'SS')
-    # geht vielleicht in Ruby 1.9
-    assert_equal 'Scheize',   'Scheiße'.reduce94(:german_sz => 'z')
-    assert_equal 'ScheiZe',   'Scheiße'.reduce94(:german_sz => 'Z')
-    assert_equal 'Scheisse',  'Scheiße'.reduce94(:german_sz => 'ss')
-    assert_equal 'Schei$e',   'Scheiße'.reduce94(:german_sz => '$')
-    assert_equal 'Schei$e',   'Schei$e'.reduce94
+    assert_equal 'Scheiß Arsche',   'Scheiß Ärsche'.reduce94(:preserve => 'ß')
   end

data/version.rb CHANGED

@@ -2,7 +2,7 @@
 module Kyanite
-  VERSION  = '0.8.0'
+  VERSION  = '0.8.1'
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: kyanite
 version: !ruby/object:Gem::Version
-  version: 0.8.0
+  version: 0.8.1
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-11-17 00:00:00.000000000 Z
+date: 2012-11-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: drumherum