RubyGems - alphabets - Versions diffs - 0.1.3 → 1.0.1 - Mend

alphabets 0.1.3 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

checksums.yaml +5 -5
data/CHANGELOG.md +5 -3
data/NOTES.md +193 -193
data/README.md +19 -26
data/Rakefile +28 -28
data/lib/alphabets/alphabets.rb +186 -184
data/lib/alphabets/variants.rb +72 -72
data/lib/alphabets/version.rb +23 -23
data/lib/alphabets.rb +46 -40
data/test/helper.rb +10 -10
data/test/test_downcase.rb +18 -18
data/test/test_reader.rb +1 -1
data/test/test_unaccent.rb +36 -36
data/test/test_variants.rb +36 -36
metadata +22 -15

data/lib/alphabets/alphabets.rb CHANGED Viewed

@@ -1,184 +1,186 @@
-class Alphabet
-##  "simple" unaccent (remove accents/diacritics and unfold ligatures) translation table / mapping
-UNACCENT = Reader.parse( <<TXT )
-    Ä A   ä a
-    Á A   á a
-    À A   à a
-    Ã A   ã a
-    Â A   â a
-    Å A   å a
-    Æ AE  æ ae   # ae ligature
-          ā a
-          ă a
-    Ą A   ą a    # ą - U+0105 (261) - LATIN SMALL LETTER A WITH OGONEK
-    Ç C   ç c    # ç - U+00E7 (231) - LATIN SMALL LETTER C WITH CEDILLA
-    Ć C   ć c
-    Č C   č c
-    Ď D   ď d
-    Ð D   ð d    # iceland - d
-    É E   é e
-    È E   è e
-    Ê E   ê e
-    Ë E   ë e
-          ė e
-    Ę E   ę e
-    Ě E   ě e
-          ğ g
-    İ I
-    Í I   í i
-    Ì I   ì i
-    Î I   î i
-          ī i
-          ı i    # ı - U+0131 (305) - LATIN SMALL LETTER DOTLESS I
-    Ï I   ï i
-    Ł L   ł l
-    Ñ N   ñ n
-    Ń N   ń n
-    Ň N   ň n
-    Ö O   ö o
-    Ő O   ő o    # hungarian - use OE/oe  - why? (it's not a ligature) why not?
-    Ó O   ó o
-    Ò O   ò o
-    Õ O   õ o
-    Ô O   ô o
-          ø o
-    Œ OE  œ oe   # oe ligature
-    Ř R   ř r
-    Ś S   ś s
-    Ş S   ş s   # ş - U+015F (351) - LATIN SMALL LETTER S WITH CEDILLA
-    Ș S   ș s   # ș - U+0219 (537) - LATIN SMALL LETTER S WITH COMMA BELOW
-    Š S   š s
-          ß ss  # ß - U+00DF (223) - LATIN SMALL LETTER SHARP S
-    Ţ T   ţ t   # ţ - U+0163 (355) - LATIN SMALL LETTER T WITH CEDILLA
-    Ț T   ț t   # ț - U+021B (539) - LATIN SMALL LETTER T WITH COMMA BELOW
-    Ť T   ť t
-    Þ P   þ p   # þ - U+00FE (254) - LATIN SMALL LETTER THORN
-                #### fix/check!!!! icelandic - use p is p or th - why? why not?
-    Ü U   ü u
-    Ú U   ú u
-    Ù U   ù u
-          ū u
-    Ů U   ů u
-    Û U   û u
-    Ý Y   ý y
-    Ÿ Y   ÿ y
-    Ź Z   ź z
-    Ż Z   ż z
-    Ž Z   ž z
-TXT
-##
-# Notes:
-#  Romanian did NOT initially get its Ș/ș and Ț/ț (with comma) letters,
-#  because these letters were initially unified with Ş/ş and Ţ/ţ (with cedilla)
-#  by the Unicode Consortium, considering the shapes with comma beneath
-#  to be glyph variants of the shapes with cedilla.
-#  However, the letters with explicit comma below were later added to the Unicode standard and are also in ISO 8859-16.
-##  de,at,ch translation for umlauts
-UNACCENT_DE = Reader.parse( <<TXT )
-    Ä AE  ä ae    ### note: Use upcase AE, OE, UE and NOT titlecase Ae, Oe, Ue - why? why not? e.g.VÖST => VOEST or Ö => OE
-    Ö OE  ö oe
-    Ü UE  ü ue
-          ß ss
-TXT
-## add UNACCENT_ES - why? why not?  is Espanyol catalan spelling or spanish (castillian)?
-# 'ñ'=>'ny',    ## e.g. Español => Espanyol
-DOWNCASE = %w[A B C D E F G H I J K L M N O P Q R S T U V W X Y Z].reduce({}) do |h,ch|
-    h[ch] = ch.downcase
-    h
-  end.merge( Reader.parse( <<TXT ) )
-    Ä ä
-    Á á
-    À à
-    Â â
-    Å å
-    Æ æ   # LATIN LETTER AE  - ae ligature
-    Ą ą
-    Ã ã
-    Ç ç   # LATIN LETTER C WITH CEDILLA
-    Č č
-    Ć ć
-    Ď ď
-    Ð ð    # iceland - d
-    É é
-    È è
-    Ë ë
-    Ê ê
-    Ę ę
-    Ě ě
-    İ i
-    Í í
-    Ì ì
-    Ï ï
-    Î î
-    Ł ł
-    Ń ń
-    Ň ň
-    Ñ ñ
-    Ö ö
-    Ő ő
-    Œ œ   # LATIN LIGATURE OE
-    Ó ó
-    Ò ò
-    Ô ô
-    Õ õ
-    Þ þ    # iceland - p
-    Ř ř
-    Ś ś
-    Ş ş   # LATIN LETTER S WITH CEDILLA
-    Ș ș   # LATIN LETTER S WITH COMMA BELOW
-    Š š
-    Ţ ţ   # LATIN LETTER T WITH CEDILLA
-    Ț ț   # LATIN LETTER T WITH COMMA BELOW
-    Ť ť
-    Ü ü
-    Ú ú
-    Ù ù
-    Ů ů
-    Û û
-    Ý ý
-    Ÿ ÿ
-    Ž ž
-    Ż ż
-    Ź ź
-TXT
-end  # class Alphabet
+class Alphabet
+##  "simple" unaccent (remove accents/diacritics and unfold ligatures) translation table / mapping
+UNACCENT = Reader.parse( <<TXT )
+    Ä A   ä a
+    Á A   á a
+    À A   à a
+    Ã A   ã a
+    Â A   â a
+    Å A   å a
+    Æ AE  æ ae   # ae ligature
+          ā a
+          ă a
+    Ą A   ą a    # ą - U+0105 (261) - LATIN SMALL LETTER A WITH OGONEK
+    Ç C   ç c    # ç - U+00E7 (231) - LATIN SMALL LETTER C WITH CEDILLA
+    Ć C   ć c
+    Č C   č c
+    Ď D   ď d
+    Ð D   ð d    # iceland - d
+    É E   é e
+    È E   è e
+    Ê E   ê e
+    Ë E   ë e
+          ė e
+    Ę E   ę e
+    Ě E   ě e
+          ğ g
+    İ I
+    Í I   í i
+    Ì I   ì i
+    Î I   î i
+          ī i
+          ı i    # ı - U+0131 (305) - LATIN SMALL LETTER DOTLESS I
+    Ï I   ï i
+    Ł L   ł l
+    Ñ N   ñ n
+    Ń N   ń n
+    Ň N   ň n
+    Ņ N   ņ n        # N/n with cedilla U+0145 / U+0146 (capital / small)
+    Ö O   ö o
+    Ő O   ő o    # hungarian - use OE/oe  - why? (it's not a ligature) why not?
+    Ó O   ó o
+    Ò O   ò o
+    Õ O   õ o
+    Ô O   ô o
+          ø o
+    Œ OE  œ oe   # oe ligature
+    Ř R   ř r
+    Ś S   ś s
+    Ş S   ş s   # ş - U+015F (351) - LATIN SMALL LETTER S WITH CEDILLA
+    Ș S   ș s   # ș - U+0219 (537) - LATIN SMALL LETTER S WITH COMMA BELOW
+    Š S   š s
+          ß ss  # ß - U+00DF (223) - LATIN SMALL LETTER SHARP S
+    Ţ T   ţ t   # ţ - U+0163 (355) - LATIN SMALL LETTER T WITH CEDILLA
+    Ț T   ț t   # ț - U+021B (539) - LATIN SMALL LETTER T WITH COMMA BELOW
+    Ť T   ť t
+    Þ P   þ p   # þ - U+00FE (254) - LATIN SMALL LETTER THORN
+                #### fix/check!!!! icelandic - use p is p or th - why? why not?
+    Ü U   ü u
+    Ú U   ú u
+    Ù U   ù u
+          ū u
+    Ů U   ů u
+    Û U   û u
+    Ý Y   ý y
+    Ÿ Y   ÿ y
+    Ź Z   ź z
+    Ż Z   ż z
+    Ž Z   ž z
+TXT
+##
+# Notes:
+#  Romanian did NOT initially get its Ș/ș and Ț/ț (with comma) letters,
+#  because these letters were initially unified with Ş/ş and Ţ/ţ (with cedilla)
+#  by the Unicode Consortium, considering the shapes with comma beneath
+#  to be glyph variants of the shapes with cedilla.
+#  However, the letters with explicit comma below were later added to the Unicode standard and are also in ISO 8859-16.
+##  de,at,ch translation for umlauts
+UNACCENT_DE = Reader.parse( <<TXT )
+    Ä AE  ä ae    ### note: Use upcase AE, OE, UE and NOT titlecase Ae, Oe, Ue - why? why not? e.g.VÖST => VOEST or Ö => OE
+    Ö OE  ö oe
+    Ü UE  ü ue
+          ß ss
+TXT
+## add UNACCENT_ES - why? why not?  is Espanyol the Catalan spelling or the Spanish (Castilian) one?
+# 'ñ'=>'ny',    ## e.g. Español => Espanyol
+DOWNCASE = %w[A B C D E F G H I J K L M N O P Q R S T U V W X Y Z].reduce({}) do |h,ch|
+    h[ch] = ch.downcase
+    h
+  end.merge( Reader.parse( <<TXT ) )
+    Ä ä
+    Á á
+    À à
+    Â â
+    Å å
+    Æ æ   # LATIN LETTER AE  - ae ligature
+    Ą ą
+    Ã ã
+    Ç ç   # LATIN LETTER C WITH CEDILLA
+    Č č
+    Ć ć
+    Ď ď
+    Ð ð    # iceland - d
+    É é
+    È è
+    Ë ë
+    Ê ê
+    Ę ę
+    Ě ě
+    İ i
+    Í í
+    Ì ì
+    Ï ï
+    Î î
+    Ł ł
+    Ń ń
+    Ň ň
+    Ñ ñ
+    Ņ ņ
+    Ö ö
+    Ő ő
+    Œ œ   # LATIN LIGATURE OE
+    Ó ó
+    Ò ò
+    Ô ô
+    Õ õ
+    Þ þ    # iceland - p
+    Ř ř
+    Ś ś
+    Ş ş   # LATIN LETTER S WITH CEDILLA
+    Ș ș   # LATIN LETTER S WITH COMMA BELOW
+    Š š
+    Ţ ţ   # LATIN LETTER T WITH CEDILLA
+    Ț ț   # LATIN LETTER T WITH COMMA BELOW
+    Ť ť
+    Ü ü
+    Ú ú
+    Ù ù
+    Ů ů
+    Û û
+    Ý ý
+    Ÿ ÿ
+    Ž ž
+    Ż ż
+    Ź ź
+TXT
+end  # class Alphabet

data/lib/alphabets/variants.rb CHANGED Viewed

@@ -1,72 +1,72 @@
-# encoding: utf-8
-class Variant    ## (spelling) variant finder / builder for names
-  EN_UNACCENTER = Alphabet.find_unaccenter( :en ) ## assume english (en) as default for know - change to universal/int'l/default or something - why? why not?
-  DE_UNACCENTER = Alphabet.find_unaccenter( :de )
-def self.find( name )
-  alt_names = []
-  freq = Alphabet.frequency_table( name )
-  en = EN_UNACCENTER
-  if en.count( freq ) > 0    # check if includes äöü (that is, character with accents or diacritics) etc.
-    alt_names <<  en.unaccent( name )
-  end
-  de = DE_UNACCENTER
-  if de.count( freq ) > 0
-    alt_names <<  de.unaccent( name )
-  end
-  ## todo - make uniq  e.g. Preußen is Preussen, Preussen 2x
-  alt_names = alt_names.uniq
-  alt_names
-end
-end  # class Variant
-######################################
-#  expiremental class - use (just) Name or NameQ or NameVariant or NameAnalyzer/Query or similar - why? why not?
-##   let's wait for now with usage - let's add more methods as we go along and find more - why? why not?
-class NameQuery
-  def initialize( name )
-    @name = name
-  end
-  def frequency_table
-    @freq ||= Alphabet.frequency_table( @name )
-  end
-  def variants
-    @variants ||= find_variants
-  end
-private
-  EN_UNACCENTER = Alphabet.find_unaccenter( :en ) ## assume english (en) as default for know - change to universal/int'l/default or something - why? why not?
-  DE_UNACCENTER = Alphabet.find_unaccenter( :de )
-  def find_variants
-    alt_names = []
-    freq = frequency_table
-    en = EN_UNACCENTER
-    if en.count( freq ) > 0    # check if includes äöü (that is, character with accents or diacritics) etc.
-      alt_names <<  en.unaccent( @name )
-    end
-    de = DE_UNACCENTER
-    if de.count( freq ) > 0
-      alt_names <<  de.unaccent( @name )
-    end
-    ## todo - make uniq  e.g. Preußen is Preussen, Preussen 2x
-    alt_names = alt_names.uniq
-    alt_names
-  end
-end  ## class VariantName
+# encoding: utf-8
+class Variant    ## (spelling) variant finder / builder for names
+  EN_UNACCENTER = Alphabet.find_unaccenter( :en ) ## assume english (en) as default for now - change to universal/int'l/default or something - why? why not?
+  DE_UNACCENTER = Alphabet.find_unaccenter( :de )
+def self.find( name )
+  alt_names = []
+  freq = Alphabet.frequency_table( name )
+  en = EN_UNACCENTER
+  if en.count( freq ) > 0    # check if includes äöü (that is, character with accents or diacritics) etc.
+    alt_names <<  en.unaccent( name )
+  end
+  de = DE_UNACCENTER
+  if de.count( freq ) > 0
+    alt_names <<  de.unaccent( name )
+  end
+  ## make uniq  e.g. Preußen is Preussen, Preussen 2x (handled by the uniq call below)
+  alt_names = alt_names.uniq
+  alt_names
+end
+end  # class Variant
+######################################
+#  experimental class - use (just) Name or NameQ or NameVariant or NameAnalyzer/Query or similar - why? why not?
+##   let's wait for now with usage - let's add more methods as we go along and find more - why? why not?
+class NameQuery
+  def initialize( name )
+    @name = name
+  end
+  def frequency_table
+    @freq ||= Alphabet.frequency_table( @name )
+  end
+  def variants
+    @variants ||= find_variants
+  end
+private
+  EN_UNACCENTER = Alphabet.find_unaccenter( :en ) ## assume english (en) as default for now - change to universal/int'l/default or something - why? why not?
+  DE_UNACCENTER = Alphabet.find_unaccenter( :de )
+  def find_variants
+    alt_names = []
+    freq = frequency_table
+    en = EN_UNACCENTER
+    if en.count( freq ) > 0    # check if includes äöü (that is, character with accents or diacritics) etc.
+      alt_names <<  en.unaccent( @name )
+    end
+    de = DE_UNACCENTER
+    if de.count( freq ) > 0
+      alt_names <<  de.unaccent( @name )
+    end
+    ## make uniq  e.g. Preußen is Preussen, Preussen 2x (handled by the uniq call below)
+    alt_names = alt_names.uniq
+    alt_names
+  end
+end  ## class NameQuery

data/lib/alphabets/version.rb CHANGED Viewed

@@ -1,23 +1,23 @@
-# encoding: utf-8
-## todo/check: use a module Alphabets with s to keep version and banner separate - why? why not?
-class Alphabet
-  MAJOR = 0    ## todo: namespace inside version or something - why? why not??
-  MINOR = 1
-  PATCH = 3
-  VERSION = [MAJOR,MINOR,PATCH].join('.')
-  def self.version
-    VERSION
-  end
-  def self.banner
-    "alphabets/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
-  end
-  def self.root
-    File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )
-  end
-end # class Alphabet
+# encoding: utf-8
+## todo/check: use a module Alphabets with s to keep version and banner separate - why? why not?
+class Alphabet
+  MAJOR = 1    ## todo: namespace inside version or something - why? why not??
+  MINOR = 0
+  PATCH = 1
+  VERSION = [MAJOR,MINOR,PATCH].join('.')
+  def self.version
+    VERSION
+  end
+  def self.banner
+    "alphabets/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}] in (#{root})"
+  end
+  def self.root
+    File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )
+  end
+end # class Alphabet

data/lib/alphabets.rb CHANGED Viewed

@@ -1,40 +1,46 @@
-# encoding: utf-8
-require 'pp'
-###
-# our own code
-require 'alphabets/version' # let version always go first
-require 'alphabets/reader'
-require 'alphabets/alphabets'
-require 'alphabets/utils'
-require 'alphabets/variants'
-## add "global" convenience helper
-def downcase_i18n( name )
-  Alphabet.downcase_i18n( name )
-end
-def unaccent( name )
-  Alphabet.unaccent( name )   ## using "default" language character mapping / table
-end
-def undiacritic( name ) unaccent( name ); end    ## alias for unaccent
-def variants( name )    ## todo/check: rename to unaccent_variants or unaccent_names - why? why not?
-  Variant.find( name )
-end
-## add convenience aliases - also add Alpha - why? why not?
-Abc       = Alphabet
-Alphabets = Alphabet
-Alpha     = Alphabet
-puts Alphabet.banner   # say hello
+## note - pp is available without require 'pp' since ruby 2.5
+###           todo/check: the require below is only needed for older rubies
+require 'pp'
+###
+# our own code
+require_relative 'alphabets/version' # let version always go first
+require_relative 'alphabets/reader'
+require_relative 'alphabets/alphabets'
+require_relative 'alphabets/utils'
+require_relative 'alphabets/variants'
+##
+##  add globals inside Kernel ??
+##    lets us use alias_method
+##    what else?   why? why not?
+## add "global" convenience helper
+def downcase_i18n( name )
+  Alphabet.downcase_i18n( name )
+end
+def unaccent( name )
+  Alphabet.unaccent( name )   ## using "default" language character mapping / table
+end
+def undiacritic( name ) unaccent( name ); end    ## alias for unaccent
+def variants( name )    ## todo/check: rename to unaccent_variants or unaccent_names - why? why not?
+  Variant.find( name )
+end
+## add convenience aliases - also add Alpha - why? why not?
+Abc       = Alphabet
+Alphabets = Alphabet
+Alpha     = Alphabet
+puts Alphabet.banner   # say hello

data/test/helper.rb CHANGED Viewed

@@ -1,10 +1,10 @@
-## $:.unshift(File.dirname(__FILE__))
-## minitest setup
-require 'minitest/autorun'
-## our own code
-require 'alphabets'
+## $:.unshift(File.dirname(__FILE__))
+## minitest setup
+require 'minitest/autorun'
+## our own code
+require 'alphabets'