RubyGems - zhongwen_tools - Versions diffs - 0.17.5 → 0.18.0 - Mend

zhongwen_tools 0.17.5 → 0.18.0

Files changed (8) hide show

checksums.yaml +4 -4
data/lib/zhongwen_tools/regex.rb +7 -1
data/lib/zhongwen_tools/romanization.rb +13 -15
data/lib/zhongwen_tools/romanization/pinyin.rb +37 -31
data/lib/zhongwen_tools/romanization/pinyin_table.rb +39 -12
data/lib/zhongwen_tools/version.rb +1 -1
data/test/test_pinyin.rb +34 -8
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: e122be905ec8ae3b0b3a65a76ad5fa933cc3193a
-  data.tar.gz: 3b56c89628c85d369e51f12506632fdf6607930f
+  metadata.gz: 225b43aacf009731b4034a754af359efd7d067d1
+  data.tar.gz: d65490aa4067bbedf88e2805d7635bb694261bbe
 SHA512:
-  metadata.gz: 6fa55501be98d80df5618b1220a90190438806c98ce55984217f147d7844d178dddeb5c4c1456ae3be075e1f9cfd77b296ffc0496256d9efeaaf7878487f48ab
-  data.tar.gz: 83d9a37cbcb286de3eb71fa5ce96647b0b165f9dd86efd4547edae16659919ebcc0d67878b9146507180f179391c8afee4c1b95ccd7c3a7cb56e9a6e1ea340b6
+  metadata.gz: 14e73ad7b0b16325186b0643202416957605a0726596e44db660ba324c72685393b47c705fc40869cac9cf9815de533bff985e9daa36d903fe1dddc45587579a
+  data.tar.gz: 17ed1d4fbce22e1d13f8df22a67f716a85668efa9bd00ad6d6d89dada1f8d47320dce4b207119242cb43ddbc4ec3525a528b424a851bd70398bfa039c23ed737

data/lib/zhongwen_tools/regex.rb CHANGED Viewed

@@ -99,6 +99,11 @@ module ZhongwenTools
       }
     end
+    def self.py_syllabic_nasals
+      # NOTE: includes combining diacritical marks for n̄ňm̄m̌m̀
+      /((N̄|n̄|ň)g?|[ŇŃǸńǹ]g?|m̄|m̌|m̀|ḿ)/
+    end
     def self.py_tones
       {
         'a' => '[āáǎàa]',
@@ -111,7 +116,8 @@ module ZhongwenTools
     end
     def self.only_tones
-      /([āáǎàēéěèīíǐìōóǒòūúǔùǖǘǚǜ])/
+      # NOTE: includes combining diacritical marks for n̄ňm̄m̌m̀
+      /([āáǎàēéěèīíǐìōóǒòūúǔùǖǘǚǜńǹḿŃŇǸ]|N̄|n̄|ň|m̄|m̌|m̀)/
     end
   end
 end

data/lib/zhongwen_tools/romanization.rb CHANGED Viewed

@@ -8,10 +8,8 @@ require 'zhongwen_tools/romanization/yale'
 require 'zhongwen_tools/romanization/mps2'
 require 'zhongwen_tools/romanization/romanization_table'
-# NOTE: Creates several dynamic Modules and their associated methods.
-#       e.g. ZhongwenTools::Romanization::ZhuyinFuhao.to_bpmf
-#            ZhongwenTools::Romanization::WadeGiles.to_wg
 module ZhongwenTools
+  # Public: Romanization converts, detects and splits different romanizations.
   module Romanization
     def self.convert(str, to, from)
       # NOTE: don't convert if it already is converted.
@@ -105,14 +103,14 @@ module ZhongwenTools
     end
     def self.convert_romanization(str, from, to)
-        # NOTE: extract/refactor tokens cause tests to fail.
-        if from == :pyn
-          tokens = ZhongwenTools::Romanization::Pinyin.split_pyn(str).uniq
-        else
-          tokens = romanization_module(from).send(:split, str).uniq
-        end
+      # NOTE: extract/refactor tokens cause tests to fail.
+      if from == :pyn
+        tokens = ZhongwenTools::Romanization::Pinyin.split_pyn(str).uniq
+      else
+        tokens = romanization_module(from).send(:split, str).uniq
+      end
-     tokens.collect do |t|
+      tokens.collect do |t|
         search, replace = find_token_replacement(t, str, to, from)
         str =  str.gsub(search, replace)
       end
@@ -193,14 +191,15 @@ module ZhongwenTools
       # TODO: memoize
       @memoized_romanization_values = {}
       @memoized_romanization_values[type] = ZhongwenTools::Romanization::ROMANIZATIONS_TABLE.map do |r|
-        "[#{r[type][0]}#{r[type][0].upcase}]#{r[type][1..-1]}" || r[:pyn]
+        "[#{ r[type][0] }#{ r[type][0].upcase }]#{ r[type][1..-1] }" || r[:pyn]
       end.flatten
       @memoized_romanization_values[type]
     end
-    def self.romanization_module(type)
-      module_name = RomanizationTypes.find{ |k,v| v.include?(type.to_s) }.first
+    def self.romanization_module(type = :py)
+      module_name = ROMANIZATION_TYPES.find{ |_k, v| v.include?(type.to_s) }.first
       ZhongwenTools::Romanization.const_get(module_name)
     end
@@ -208,8 +207,7 @@ module ZhongwenTools
       !str[/\-/].nil?
     end
-    # Internal: Creates romanization modules and their methods.
-    RomanizationTypes = {
+    ROMANIZATION_TYPES = {
       ZhuyinFuhao: %w(bpmf zhuyin_fuhao zhuyinfuhao zyfh zhyfh bopomofo),
       WadeGiles: %w(wg wade_giles),
       Yale: ['yale'],

data/lib/zhongwen_tools/romanization/pinyin.rb CHANGED Viewed

@@ -4,41 +4,43 @@ require 'zhongwen_tools/caps'
 require 'zhongwen_tools/romanization'
 module ZhongwenTools
+  # Public: Romanization converts to pinyin and pyn.
   module Romanization
     def self.convert_to_py(str, from)
       str =  convert_romanization(str, from, :pyn) if from != :pyn
-      ZhongwenTools::Romanization::Pinyin.convert_pyn_to_pinyin(str)
+      Pinyin.convert_pyn_to_pinyin(str)
     end
     def self.convert_to_pyn(str, from)
       orig_str = str.dup
       if from == :py
-        str = ZhongwenTools::Romanization::Pinyin.convert_pinyin_to_pyn(str)
+        str = Romanization::Pinyin.convert_pinyin_to_pyn(str)
       else
         str = convert_romanization(str, from, :pyn)
       end
-      str = ZhongwenTools::Romanization::Pinyin.add_hyphens_to_pyn(str) if hyphenated?(orig_str)
+      str = Romanization::Pinyin.add_hyphens_to_pyn(str) if hyphenated?(orig_str)
       str
     end
+    # Public: methods to convert, detect and split pinyin or
+    #         pyn (pinyin with numbers, e.g. hao3).
     module Pinyin
       %w(pinyin py pyn).each do |romanization|
         define_singleton_method("to_#{romanization}") do |*args|
           str, from = args
-          from ||= ZhongwenTools::Romanization.romanization? str
+          from ||= Romanization.romanization? str
-          # _convert_romanization str, _set_type(type.to_sym), _set_type(from)
-          ZhongwenTools::Romanization.convert str, py_type(romanization), (py_type(from) || from)
+          Romanization.convert str, py_type(romanization), (py_type(from) || from)
         end
       end
       def self.split_pyn(str)
         # FIXME: ignore punctuation
-        regex = str[/[1-5]/].nil? ?  /(#{ZhongwenTools::Regex.pinyin_toneless})/ : /(#{ZhongwenTools::Regex.pyn}|#{ZhongwenTools::Regex.pinyin_toneless})/
-        # NOTE: p[/[^\-]*/].to_s is 25% faster thang gsub('-', '')
+        regex = str[/[1-5]/].nil? ?  /(#{ Regex.pinyin_toneless })/ : /(#{ Regex.pyn }|#{ Regex.pinyin_toneless })/
+        # NOTE: p[/[^\-]*/].to_s is 25% faster than gsub('-', '')
         str.scan(regex).map{ |arr| arr[0].strip[/[^\-]*/].to_s }.flatten
       end
@@ -50,7 +52,7 @@ module ZhongwenTools
           # NOTE: Special Case "fǎnguāng" should be "fǎn" + "guāng"
           #       Special Case "yìnián" should be "yì" + "nián"
           word = word.gsub('ngu', 'n-gu')
-            .gsub(/([#{ ZhongwenTools::Regex.only_tones }])(ni[#{ ZhongwenTools::Regex.py_tones['a'] }])/){ "#{ $1 }-#{ $2 }" }
+          word = word.gsub(/([#{ Regex.only_tones }])(ni[#{ Regex.py_tones['a'] }])/){ "#{ $1 }-#{ $2 }" }
           result = word.split(/['\-]/).flatten.map do |x|
             find_py(x)
           end
@@ -70,14 +72,15 @@ module ZhongwenTools
       #
       # Returns Boolean.
       def self.py?(str)
-        if str[ZhongwenTools::Regex.only_tones].nil? && str[/[1-5]/].nil?
+        if str[Regex.only_tones].nil? && str[/[1-5]/].nil?
           pyn?(str)
         else
-          # NOTE: py regex does not include capitals with tones.
+          # TODO: py regex does not include capitals with tones.
           # NOTE: Special Case "fǎnguāng" should be "fǎn" + "guāng"
-          regex = /(#{ ZhongwenTools::Regex.punc }|#{ ZhongwenTools::Regex.py }|[\s\-])/
+          regex = /(#{ Regex.punc }|#{ Regex.py }|#{ Regex.py_syllabic_nasals }|[\s\-])/
           str = str.gsub('ngu', 'n-gu')
-          ZhongwenTools::Caps.downcase(str).gsub(regex, '').strip == ''
+          Caps.downcase(str).gsub(regex, '').strip == ''
         end
       end
@@ -90,8 +93,9 @@ module ZhongwenTools
       # Returns Boolean.
       def self.pyn?(str)
         # FIXME: use strip_punctuation method
-        normalized_str = ZhongwenTools::Caps.downcase(str.gsub(ZhongwenTools::Regex.punc, '').gsub(/[\s\-]/, ''))
+        normalized_str = Caps.downcase(str.gsub(Regex.punc, '').gsub(/[\s\-]/, ''))
         pyn_arr = split_pyn(normalized_str).map{ |p| p }
+        pyn_arr << normalized_str if pyn_arr.size == 0 && PYN_SYLLABIC_NASALS.include?(normalized_str.gsub(/[1-5]/, ''))
         pyn_matches_properly?(pyn_arr, normalized_str) &&
           are_all_pyn_syllables_complete?(pyn_arr)
@@ -112,7 +116,7 @@ module ZhongwenTools
       end
       def self.are_all_pyn_syllables_complete?(pyn_arr)
-        pyns = ROMANIZATIONS_TABLE.map{ |r| r[:pyn] }
+        pyns = ROMANIZATIONS_TABLE.map{ |r| r[:pyn] } + PYN_SYLLABIC_NASALS
         pyn_syllables = pyn_arr.select do |p|
           pyns.include?(p.gsub(/[1-5]/, ''))
@@ -128,20 +132,21 @@ module ZhongwenTools
       end
       def self.normalize_pinyin(pinyin)
-        [ZhongwenTools::Caps.downcase(pinyin), capitalized?(pinyin)]
+        [Caps.downcase(pinyin), capitalized?(pinyin)]
       end
       def self.find_py(str)
-        str.scan(ZhongwenTools::Regex.py).map{ |x| x.compact[0] }
+        regex = /(#{ Regex.py }|#{ Regex.py_syllabic_nasals })/
+        str.scan(regex).map{ |x| x.compact[0] }
       end
       def self.recapitalize(obj, capitalized)
         return obj unless capitalized
-        if obj.class == String
-          ZhongwenTools::Caps.capitalize(obj)
-        elsif obj.class == Array
-          [ZhongwenTools::Caps.capitalize(obj[0]), obj[1..-1]].flatten
+        if obj.is_a? String
+          Caps.capitalize(obj)
+        elsif obj.is_a? Array
+          [Caps.capitalize(obj[0]), obj[1..-1]].flatten
         end
       end
@@ -161,9 +166,8 @@ module ZhongwenTools
           # NOTE: if a word is upcase, then it will be converted the same
           #       as a word that is only capitalized.
           word, is_capitalized = normalize_pinyin(word)
           pys = split_py(word)
-          #is_capitalized ? ZhongwenTools::Caps.capitalize(result) : result
           recapitalize(current_pyn(word, pys), is_capitalized)
         end
@@ -171,11 +175,12 @@ module ZhongwenTools
       end
       def self.capitalized?(str)
-        str[0] != ZhongwenTools::Caps.downcase(str[0])
+        str[0] != Caps.downcase(str[0])
       end
       def self.current_pyn(pyn, pinyin_arr)
         replacements = []
         pinyin_arr.each do |pinyin|
           replace =  pinyin_replacement(pinyin)
           match = pinyin
@@ -194,6 +199,7 @@ module ZhongwenTools
         matches = PYN_PY.values.select do |x|
           py.include? x
         end
         match = select_pinyin_match(matches)
         replace = PYN_PY.find{ |k, v| k if v == match }[0]
@@ -220,13 +226,13 @@ module ZhongwenTools
       #  Returns a string with actual pinyin
       def self.convert_pyn_to_pinyin(str)
         regex = Regex.pinyin_num
-        # Using gsub is ~8x faster than using scan and each.
-        # Explanation: if it's pinyin without vowels, e.g. m, ng, then convert,
-        #              otherwise, check if it needs an apostrophe (http://www.pinyin.info/romanization/hanyu/apostrophes.html).
-        #              If it does, add it and then convert. Otherwise, just convert.
-        #              Oh, and if it has double hyphens, replace with one hyphen.
-        #              And finally, correct those apostrophes at the very end.
-        #              It's like magic.
+        # NOTE: Using gsub is ~8x faster than using scan and each.
+        # NOTE: if it's pinyin without vowels, e.g. m, ng, then convert,
+        #       otherwise, check if it needs an apostrophe (http://www.pinyin.info/romanization/hanyu/apostrophes.html).
+        #       If it does, add it and then convert. Otherwise, just convert it.
+        #       Oh, and if it has double hyphens, replace with one hyphen.
+        #       And finally, correct those apostrophes at the very end.
+        #       It's like magic.
         str.gsub(regex) do
           ($3.nil? ? "#{ PYN_PY[$1] }" : ($2 == '' && %w(a e o).include?($3[0,1]))? "'#{ PYN_PY["#{ $3 }#{ $6 }"]}#{ $4 }#{ $5 }" : "#{ $2 }#{ PYN_PY["#{ $3 }#{ $6 }"] }#{ $4 }#{ $5 }") + (($7.to_s.length > 1) ? '-' : '')
         end.gsub("-'", '-').sub(/^'/, '')

data/lib/zhongwen_tools/romanization/pinyin_table.rb CHANGED Viewed

@@ -2,10 +2,12 @@
 # NOTE: This table works for pyn -> pinyin conversion, but it introduces
 #       mistakes when converting pinyin to pyn. In practice, pinyin can't
-#       be converted to pyn with complete accuracy unless it is properly
+#       be converted to pyn with complete accuracy unless it is properly
 #       formatted.
 module ZhongwenTools
   module Romanization
+    PYN_SYLLABIC_NASALS = %w(ng m n)
     PYN_PY = {
       'A1' => 'Ā',
       'A2' => 'Á',
@@ -152,17 +154,42 @@ module ZhongwenTools
       'm3' => 'm̌', # using combining diacritical marks
       'm4' => 'm̀', # using combining diacritical marks
       'm5' => 'm',
-      'n1' => 'ēn',
-      'n2' => 'én',
-      'n3' => 'ěn',
-      'n4' => 'èn',
-      'n5' => 'en',
-      'ng1' => 'ēng',
-      'ng2' => 'éng',
-      'ng3' => 'ěng',
-      'ng4' => 'èng',
-      'ng5' => 'eng',
-      'r5' => 'r'
+      'n1' => 'n̄',
+      'n2' => 'ń',
+      'n3' => 'ň',
+      'n4' => 'ǹ',
+      'n5' => 'n',
+      'Ng1' => 'N̄g', # using combining diacritical marks
+      'Ng2' => 'Ńg',
+      'Ng3' => 'Ňg', # using combining diacritical marks
+      'Ng4' => 'Ǹg',
+      'Ng5' => 'Ng',
+      'ng1' => 'n̄g', # using combining diacritical marks
+      'ng2' => 'ńg',
+      'ng3' => 'ňg', # using combining diacritical marks
+      'ng4' => 'ǹg',
+      'ng5' => 'ng',
+      'r5' => 'r',
+      'ang1' => 'āng',
+      'ang2' => 'áng',
+      'ang3' => 'ǎng',
+      'ang4' => 'àng',
+      'ang5' => 'ang',
+      'eng1' => 'ēng',
+      'eng2' => 'éng',
+      'eng3' => 'ěng',
+      'eng4' => 'èng',
+      'eng5' => 'eng',
+      'ing1' => 'īng',
+      'ing2' => 'íng',
+      'ing3' => 'ǐng',
+      'ing4' => 'ìng',
+      'ing5' => 'ing',
+      'ong1' => 'ōng',
+      'ong2' => 'óng',
+      'ong3' => 'ǒng',
+      'ong4' => 'òng',
+      'ong5' => 'ong',
     }
   end
 end

data/lib/zhongwen_tools/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module ZhongwenTools
-  VERSION = '0.17.5'
+  VERSION = '0.18.0'
 end

data/test/test_pinyin.rb CHANGED Viewed

@@ -1,11 +1,11 @@
 # encoding: utf-8
-$:.unshift File.join(File.dirname(__FILE__),'..','lib')
+$LOAD_PATH.unshift File.join(File.dirname(__FILE__), '..', 'lib')
 require './test/test_helper'
 require 'zhongwen_tools/romanization/pinyin'
 class TestPinyin < Minitest::Test
   def test_split_pyn
-     @split_words.each do |w|
+    @split_words.each do |w|
       assert_equal w[:split], ZhongwenTools::Romanization::Pinyin.split_pyn(w[:pyn])
     end
   end
@@ -24,12 +24,16 @@ class TestPinyin < Minitest::Test
       refute ZhongwenTools::Romanization::Pinyin.py?(w[:pyn]), w.inspect
     end
-    assert  ZhongwenTools::Romanization::Pinyin.py? 'fǎnguāngjìng'
+    @syllabic_nasals.each do |w|
+      assert ZhongwenTools::Romanization::Pinyin.py?(w[:py]), w.inspect
+    end
+    assert ZhongwenTools::Romanization::Pinyin.py? 'fǎnguāngjìng'
     english_words = %w(cyan moose cling touch)
     english_words.each do |w|
-      refute ZhongwenTools::Romanization::Pinyin.py? w
+      refute ZhongwenTools::Romanization::Pinyin.py?(w), w
     end
   end
@@ -39,8 +43,14 @@ class TestPinyin < Minitest::Test
       assert ZhongwenTools::Romanization::Pinyin.pyn?(w[:pyn]), w.inspect
     end
-     assert ZhongwenTools::Romanization::Pinyin.pyn?('ma2-fan')
-     assert ZhongwenTools::Romanization::Pinyin.pyn?('yo1')
+    assert ZhongwenTools::Romanization::Pinyin.pyn?('ma2-fan')
+    assert ZhongwenTools::Romanization::Pinyin.pyn?('yo1')
+  end
+  def test_syllabic_nasal_pyn?
+    assert ZhongwenTools::Romanization::Pinyin.pyn?('ng3')
+    assert ZhongwenTools::Romanization::Pinyin.pyn?('m3')
+    assert ZhongwenTools::Romanization::Pinyin.pyn?('n3')
   end
   def test_pyn_to_pinyin
@@ -48,27 +58,43 @@ class TestPinyin < Minitest::Test
       assert_equal word[:py], ZhongwenTools::Romanization::Pinyin.to_pinyin(word[:pyn])
       assert_equal word[:py], ZhongwenTools::Romanization::Pinyin.to_py(word[:pyn])
     end
+    @syllabic_nasals.each do |word|
+      assert_equal word[:py], ZhongwenTools::Romanization::Pinyin.to_pinyin(word[:pyn])
+      assert_equal word[:py], ZhongwenTools::Romanization::Pinyin.to_py(word[:pyn])
+    end
   end
   def test_pinyin_to_pyn
     @words.each do |word|
       assert_equal word[:pyn], ZhongwenTools::Romanization::Pinyin.to_pyn(word[:py])
     end
+    @syllabic_nasals.each do |word|
+      assert_equal word[:pyn], ZhongwenTools::Romanization::Pinyin.to_pyn(word[:py]), word
+    end
     assert_equal 'yi2ge4', ZhongwenTools::Romanization::Pinyin.to_pyn('yígè')
     assert_equal 'yi4nian2', ZhongwenTools::Romanization::Pinyin.to_pyn('yìnián', :py)
   end
   def setup
     @hyphenated_words = [
-      {:pyn => 'A1-la1-bo2', :py => 'Ālābó'},
+      { :pyn => 'A1-la1-bo2', :py => 'Ālābó' },
       { :pyn => 'Mao2 Ze2-dong1', :py => 'Máo Zédōng' }
     ]
     @split_words = [
-      {:pyn => 'A1-la1-bo2',  :py => 'Ālābó', :split => %w(A1 la1 bo2), split_py: %w(Ā lā bó) },
+      { :pyn => 'A1-la1-bo2',  :py => 'Ālābó', :split => %w(A1 la1 bo2), split_py: %w(Ā lā bó) },
       { :pyn => 'Mao2 Ze2-dong1',  :py => 'Máo Zédōng', :split => %w(Mao2 Ze2 dong1), :split_py =>  %w(Máo Zé dōng) }
     ]
+    @syllabic_nasals = [
+      { pyn: 'ng3', py: 'ňg'},
+      { pyn: 'm3', py: 'm̌'},
+      { pyn: 'n3', py: 'ň'},
+      { pyn: 'Ng3', py: 'Ňg'}
+    ]
     @words = [
       { pyn: 'A1la1bo2', py: 'Ālābó'},

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: zhongwen_tools
 version: !ruby/object:Gem::Version
-  version: 0.17.5
+  version: 0.18.0
 platform: ruby
 authors:
 - Steven Daniels
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-02-25 00:00:00.000000000 Z
+date: 2015-03-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake