RubyGems - alphabets - Versions diffs - 0.0.1 → 0.1.0 - Mend

alphabets 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

checksums.yaml +4 -4
data/Manifest.txt +3 -0
data/NOTES.md +40 -0
data/lib/alphabets.rb +6 -0
data/lib/alphabets/alphabets.rb +102 -170
data/lib/alphabets/reader.rb +62 -0
data/lib/alphabets/utils.rb +75 -0
data/lib/alphabets/version.rb +2 -2
data/test/test_reader.rb +37 -0
metadata +15 -6

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 2679f375c1118915625e06811cf773a4d59707a6
-  data.tar.gz: 5375336c3b4d0002923547f5a9d41498d629722f
+  metadata.gz: b5d826c435c38e5c8faf7963d7de6e6dcf0e2fb7
+  data.tar.gz: 5187392b8e6fbb12e249709526edf1bb2def2513
 SHA512:
-  metadata.gz: 3ff7470fe10524e8b0080cc36c9eb7a2534f70f75a4f47ca1cf46698bb2cf68a8845b45a364c01a65469bfa3d42f7865172aff8217103766043675b30924bf7a
-  data.tar.gz: '018796bf5c80bc970458716cf297f53616fb2ee1a031e91fa184d5351483564760a688e283368d760838832350b683d6951c4ce857eaa168177269ae451c10dd'
+  metadata.gz: 0e8aac5a5a65d137c710a9623d444d9f996171caee9869cb32af78cd09b26ac0404fa88a40499279527ba6d9276c029396a34241f507adf289551e9327a6ce05
+  data.tar.gz: 7cb3dda8f5804fc39f67c866c319b24af8ee6119609052a7eb7ff6c0927f1ede8b50cbcf89d98709d83ba72bc900d56efafb4ce038d4f80554c916f057b6b5ec

data/Manifest.txt CHANGED

@@ -5,9 +5,12 @@ README.md
 Rakefile
 lib/alphabets.rb
 lib/alphabets/alphabets.rb
+lib/alphabets/reader.rb
+lib/alphabets/utils.rb
 lib/alphabets/variants.rb
 lib/alphabets/version.rb
 test/helper.rb
 test/test_downcase.rb
+test/test_reader.rb
 test/test_unaccent.rb
 test/test_variants.rb

data/NOTES.md CHANGED

@@ -1,3 +1,43 @@
 # Notes
 ## Todos
+## Terminology
+Use Upcase, Downcase AND Titlecase (!)
+- Example: Ö  -> Upcase: OE, Downcase: oe, Titlecase: Oe (!)
+- Example: Æ  -> Upcase: AE, Downcase: ae, Titlecase: Ae (!)
+## Libraries
+- <https://github.com/SixArm/sixarm_ruby_unaccent> - Replace a string's accent characters with ASCII characters. Based on Perl Text::Unaccent from CPAN.
+## Links
+**Unicode w/ Ruby - Ruby ♡ Unicode**
+- <https://idiosyncratic-ruby.com/66-ruby-has-character>
+Ruby has Character - Ruby comes with good support for Unicode-related features. Read on if you want to learn more about important Unicode fundamentals and how to use them in Ruby...
+- <https://idiosyncratic-ruby.com/41-proper-unicoding>
+Proper Unicoding - Ruby's Regexp engine has a powerful feature built in: It can match for Unicode character properties. But what exactly are properties you can match for?
+- <https://idiosyncratic-ruby.com/30-regex-with-class>
+Regex with Class - Ruby's regex engine defines a lot of shortcut character classes. Besides the common meta characters (\w, etc.), there is also the POSIX style expressions and the unicode property syntax. This is an overview of all character classes
+**W3C**
+- <https://www.w3.org/TR/charmod-norm/>
+- <https://www.w3.org/International/wiki/Case_folding>
+In Western European languages, the letter 'i' (U+0069) upper cases to a dotless 'I' (U+0049). In Turkish, this letter upper cases to a dotted upper case letter 'İ' (U+0130). Similarly, 'I' (U+0049) lower cases to 'ı' (U+0131), which is a dotless lowercase letter i.

data/lib/alphabets.rb CHANGED

@@ -6,7 +6,9 @@ require 'pp'
 ###
 # our own code
 require 'alphabets/version' # let version always go first
+require 'alphabets/reader'
 require 'alphabets/alphabets'
+require 'alphabets/utils'
 require 'alphabets/variants'
@@ -20,6 +22,10 @@ def unaccent( name )
   Alphabet.unaccent( name )   ## using "default" language character mapping / table
 end
+def undiacritic( name ) unaccent( name ); end    ## alias for unaccent
 def variants( name )    ## todo/check: rename to unaccent_variants or unaccent_names - why? why not?
   Variant.find( name )
 end

data/lib/alphabets/alphabets.rb CHANGED

@@ -1,186 +1,118 @@
-# encoding: utf-8
-class Alphabet   ## todo/fix: add alias Abc  and Alpha too? why? why not?
-  def self.frequency_table( name )   ## todo/check: use/rename to char_frequency_table
-    ## calculate the frequency table of letters, digits, etc.
-    freq = Hash.new(0)
-    name.each_char do |ch|
-       freq[ch] += 1
-    end
-    freq
-  end
-  def self.count( freq, mapping_or_chars )
-    chars = if mapping_or_chars.is_a?( Hash )
-              mapping_or_chars.keys
-            else   ## todo/fix: check for is_a? Array and if is String split into Array (on char at a time?) - why? why not?
-              mapping_or_chars  ## assume it's an array/list of characters
-            end
-    chars.reduce(0) do |count,ch|
-      count += freq[ch]
-      count
-    end
-  end
-  def self.tr( name, mapping )
-    buf = String.new
-    name.each_char do |ch|
-      buf << if mapping[ch]
-                mapping[ch]
-              else
-                ch
-              end
-    end
-    buf
-  end
-  class Unaccenter #Worker    ## todo/change - find a better name - why? why not?
-    def initialize( mapping )
-      @mapping = mapping
-    end
-    def count( name )      Alphabet.count( name, @mapping ); end
-    def unaccent( name )   Alphabet.tr( name, @mapping );    end
-  end  # class Unaccent Worker
-  def self.find_unaccenter( key )
-    if key == :de
-      @de ||= Unaccenter.new( UNACCENT_DE )
-      @de
-    else
-      ## use uni(versal) or unicode or something - why? why not?
-      ##  use all or int'l (international) - why? why not?
-      ##  use en  (english) - why? why not?
-      @default ||= Unaccenter.new( UNACCENT )
-      @default
-    end
-  end
-  def self.unaccent( name )
-    @default ||= Unaccenter.new( UNACCENT )
-    @default.unaccent( name )
-  end
-  def self.downcase_i18n( name )    ## our very own downcase for int'l characters / letters
-    tr( name, DOWNCASE )
-  end
-  ## add downcase_uni  - univeral/unicode - why? why not?
-  ##  "simple" unaccent (remove accents/diacritics and unfold ligatures) translation table / mapping
-  UNACCENT = {
-    'Ä'=>'A',  'ä'=>'a',
-    'Á'=>'A',  'á'=>'a',
-               'à'=>'a',
-               'ã'=>'a',
-               'â'=>'a',
-    'Å'=>'A',  'å'=>'a',
-               'æ'=>'ae',
-               'ā'=>'a',
-               'ă'=>'a',
-               'ą'=>'a',
-    'Ç' =>'C', 'ç'=>'c',
-               'ć'=>'c',
-    'Č'=>'C',  'č'=>'c',
-    'É'=>'E',  'é'=>'e',
-               'è'=>'e',
-               'ê'=>'e',
-               'ë'=>'e',
-               'ė'=>'e',
-               'ę'=>'e',
-               'ğ'=>'g',
-    'İ'=>'I',
-    'Í'=>'I',  'í'=>'i',
-               'î'=>'i',
-               'ī'=>'i',
-               'ı'=>'i',
-    'Ł'=>'L', 'ł'=>'l',
-               'ñ'=>'n',
-               'ń'=>'n',
-               'ň'=>'n',
-    'Ö'=>'O',  'ö'=>'o',
-               'ó'=>'o',
-               'õ'=>'o',
-               'ô'=>'o',
-               'ø'=>'o',
-               'ő'=>'o',
-                'ř'=>'r',
-    'Ś'=>'S',
-    'Ş'=>'S',  'ş'=>'s',
-    'Š'=>'S',  'š'=>'s',
-               'ș'=>'s',  ## U+0219
-               'ß'=>'ss',
-               'ţ'=>'t',  ## U+0163
-               'ț'=>'t',  ## U+021B
-               'þ'=>'th',   #### fix!!!! use p - why? why not?
-    'Ü'=>'U',  'ü'=>'u',
-    'Ú'=>'U',  'ú'=>'u',
-               'ū'=>'u',
-               'ý'=>'y',
-               'ź'=>'z',
-               'ż'=>'z',
-    'Ž'=>'Z',  'ž'=>'z',
-  }
-  ##  de,at,ch translation for umlauts
-  UNACCENT_DE = {
-    'Ä'=>'Ae',  'ä'=>'ae',  ### Use AE, OE, UE and NOT Ae, Oe, Ue - why? why not? e.g.VÖST => VOEST or Ö => OE
-    'Ö'=>'Oe',  'ö'=>'oe',
-    'Ü'=>'Ue',  'ü'=>'ue',
-                'ß'=>'ss',
-  }
+class Alphabet
+##  "simple" unaccent (remove accents/diacritics and unfold ligatures) translation table / mapping
+UNACCENT = Reader.parse( <<TXT )
+    Ä A   ä a
+    Á A   á a
+          à a
+          ã a
+          â a
+    Å A   å a
+    Æ AE  æ ae   # ae ligature
+          ā a
+          ă a
+          ą a
+    Ç C   ç c
+          ć c
+    Č C   č c
+    É E   é e
+          è e
+          ê e
+          ë e
+          ė e
+          ę e
+          ğ g
+    İ I
+    Í I   í i
+          î i
+          ī i
+          ı i    # small dotless i
+    Ł L   ł l
+          ñ n
+          ń n
+          ň n
+    Ö O   ö o
+          ó o
+          õ o
+          ô o
+          ø o
+          ő o
+    Œ OE  œ oe   # oe ligature
+          ř r
+    Ś S   ś s
+    Ş S   ş s
+    Š S   š s
+          ș s   # U+0219
+          ß ss
+          ţ t   # U+0163
+          ț t   # U+021B
+          þ p    #### fix/check!!!! icelandic - use p is p or th - why? why not?
+    Ü U   ü u
+    Ú U   ú u
+          ū u
+          ý y
+          ź z
+          ż z
+    Ž Z   ž z
+TXT
+##  de,at,ch translation for umlauts
+UNACCENT_DE = Reader.parse( <<TXT )
+    Ä AE  ä ae    ### note: Use upcase AE, OE, UE and NOT titlecase Ae, Oe, Ue - why? why not? e.g.VÖST => VOEST or Ö => OE
+    Ö OE  ö oe
+    Ü UE  ü ue
+          ß ss
+TXT
   ## add UNACCENT_ES - why? why not?  is Espanyol catalan spelling or spanish (castillian)?
   # 'ñ'=>'ny',    ## e.g. Español => Espanyol
-  DOWNCASE = %w[A B C D E F G H I J K L M N O P Q R S T U V W X Y Z].reduce({}) do |h,ch|
+DOWNCASE = %w[A B C D E F G H I J K L M N O P Q R S T U V W X Y Z].reduce({}) do |h,ch|
     h[ch] = ch.downcase
     h
-  end.merge(
-    'Ä'=>'ä',
-    'Á'=>'á',
-    'Å'=>'å',
+  end.merge( Reader.parse( <<TXT ) )
+    Ä ä
+    Á á
+    Å å
+    Æ æ   # ae ligature
-    'Ç'=>'ç',
-    'Č'=>'č',
+    Ç ç
+    Č č
-    'É'=>'é',
+    É é
-    'İ'=>'?',   ## fix - add lowercase
-    'Í'=>'í',
+    İ i
+    Í í
-    'Ł'=>'ł',
+    Ł ł
-    'Ö'=>'ö',
+    Ö ö
+    Œ œ   # oe ligature
-    'Ś'=>'?',   ## fix - add lowercase
-    'Ş'=>'ş',
-    'Š'=>'š',
+    Ś ś
+    Ş ş
+    Š š
-    'Ü'=>'ü',
-    'Ú'=>'ú',
+    Ü ü
+    Ú ú
-    'Ž'=>'ž',
-  )
+    Ž ž
+TXT
 end  # class Alphabet

data/lib/alphabets/reader.rb ADDED

@@ -0,0 +1,62 @@
+class Alphabet
+class Reader   ## todo/check: rename to CharReader or something - why? why not?
+  def self.read( path )   ## use - rename to read_file or from_file etc. - why? why not?
+    txt = File.open( path, 'r:utf-8' ).read
+    parse( txt )
+  end
+  def self.parse( txt )
+    h = {}  ## char(acter) table mappings
+    txt.each_line do |line|
+      line = line.strip
+      next if line.empty?
+      next if line.start_with?( '#' )   ## skip comments too
+      ## strip inline (until end-of-line) comments too
+      ##  e.g  ţ  t  ## U+0163
+      ##   =>  ţ  t
+      line = line.sub( /#.*/, '' ).strip
+      ## pp line
+      values = line.split( /[ \t]+/ )
+      ## pp values
+      ## check - must be a even - a multiple of two
+      if values.size % 2 != 0
+        puts "** !!! ERROR !!! - missing mapping pair - mappings must be even (a multiple of two):"
+        pp values
+        exit 1
+      end
+      # add mappings in pairs
+      values.each_slice(2) do |slice|
+        ## pp slice
+        key   = slice[0]
+        value = slice[1]
+        ## check - key must be a single-character/letter in unicode
+        if key.size != 1
+          puts "** !!! ERROR !!! - mapping character must be a single-character, size is #{key.size}"
+          pp slice
+          exit 1
+        end
+        ## check - check for duplicates
+        if h[ key ]
+          puts "** !!! ERROR !!! - duplicate mapping character; key already present"
+          pp slice
+          exit 1
+        else
+          h[ key ] = value
+        end
+      end
+    end
+    h
+  end # method parse
+end # class Reader
+end # class Alphabet

data/lib/alphabets/utils.rb ADDED

@@ -0,0 +1,75 @@
+class Alphabet
+  def self.frequency_table( name )   ## todo/check: use/rename to char_frequency_table
+    ## calculate the frequency table of letters, digits, etc.
+    freq = Hash.new(0)
+    name.each_char do |ch|
+       freq[ch] += 1
+    end
+    freq
+  end
+  def self.count( freq, mapping_or_chars )
+    chars = if mapping_or_chars.is_a?( Hash )
+              mapping_or_chars.keys
+            else   ## todo/fix: check for is_a? Array and if is String split into Array (on char at a time?) - why? why not?
+              mapping_or_chars  ## assume it's an array/list of characters
+            end
+    chars.reduce(0) do |count,ch|
+      count += freq[ch]
+      count
+    end
+  end
+  def self.sub( name, mapping )   ## todo/check: use a different/better name - gsub/map/replace/fold/... - why? why not?
+    buf = String.new
+    name.each_char do |ch|
+      buf << if mapping[ch]
+                mapping[ch]
+              else
+                ch
+              end
+    end
+    buf
+  end
+  class Unaccenter #Worker    ## todo/change - find a better name - why? why not?
+    def initialize( mapping )
+      @mapping = mapping
+    end
+    def count( freq )      Alphabet.count( freq, @mapping ); end
+    def unaccent( name )   Alphabet.sub( name, @mapping );   end
+  end  # class Unaccent Worker
+  def self.find_unaccenter( key )
+    if key == :de
+      @de ||= Unaccenter.new( UNACCENT_DE )
+      @de
+    else
+      ## use uni(versal) or unicode or something - why? why not?
+      ##  use all or int'l (international) - why? why not?
+      ##  use en  (english) - why? why not?
+      @default ||= Unaccenter.new( UNACCENT )
+      @default
+    end
+  end
+  def self.unaccent( name )
+    @default ||= Unaccenter.new( UNACCENT )
+    @default.unaccent( name )
+  end
+  def self.downcase_i18n( name )    ## our very own downcase for int'l characters / letters
+    sub( name, DOWNCASE )
+  end
+  ## add downcase_uni  - univeral/unicode - why? why not?
+end  # class Alphabet

data/lib/alphabets/version.rb CHANGED

@@ -5,8 +5,8 @@
 class Alphabet
   MAJOR = 0    ## todo: namespace inside version or something - why? why not??
-  MINOR = 0
-  PATCH = 1
+  MINOR = 1
+  PATCH = 0
   VERSION = [MAJOR,MINOR,PATCH].join('.')
   def self.version

data/test/test_reader.rb ADDED

@@ -0,0 +1,37 @@
+###
+#  to run use
+#     ruby -I ./lib -I ./test test/test_reader.rb
+require 'helper'
+class TestReader < MiniTest::Test
+  def test_parse
+    h = Alphabet::Reader.parse( <<TXT )
+      ## hello
+      Ä  A   ä  a   ## hello
+      Á  A   á  a
+             à  a
+             ã  a
+             â  a   ### yada yada
+      Å  A   å  a
+             æ   ae
+    Ç C ç c
+        ć c
+         ß ss
+TXT
+    pp h
+    assert_equal 'A',   h['Ä']
+    assert_equal 'a',   h['ä']
+    assert_equal 'ae',  h['æ']
+    assert_equal 'ss',  h['ß']
+  end
+end # class TestReader

metadata CHANGED

@@ -1,43 +1,49 @@
 --- !ruby/object:Gem::Specification
 name: alphabets
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.1.0
 platform: ruby
 authors:
 - Gerald Bauer
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-08-13 00:00:00.000000000 Z
+date: 2019-08-14 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rdoc
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
         version: '4.0'
+    - - "<"
+      - !ruby/object:Gem::Version
+        version: '7'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
         version: '4.0'
+    - - "<"
+      - !ruby/object:Gem::Version
+        version: '7'
 - !ruby/object:Gem::Dependency
   name: hoe
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '3.16'
+        version: '3.18'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '3.16'
+        version: '3.18'
 description: 'alphabets - '
 email: opensport@googlegroups.com
 executables: []
@@ -55,10 +61,13 @@ files:
 - Rakefile
 - lib/alphabets.rb
 - lib/alphabets/alphabets.rb
+- lib/alphabets/reader.rb
+- lib/alphabets/utils.rb
 - lib/alphabets/variants.rb
 - lib/alphabets/version.rb
 - test/helper.rb
 - test/test_downcase.rb
+- test/test_reader.rb
 - test/test_unaccent.rb
 - test/test_variants.rb
 homepage: https://github.com/sportdb/sport.db