RubyGems - zhongwen_tools - Versions diffs - 0.11.1 → 0.12.1 - Mend

zhongwen_tools 0.11.1 → 0.12.1

Files changed (10)

checksums.yaml +4 -4
data/lib/zhongwen_tools/romanization.rb +19 -18
data/lib/zhongwen_tools/romanization/conversion_table.rb +2 -0
data/lib/zhongwen_tools/romanization/detect.rb +1 -1
data/lib/zhongwen_tools/romanization/string.rb +15 -1
data/lib/zhongwen_tools/string.rb +2 -1
data/lib/zhongwen_tools/string/fullwidth.rb +5 -1
data/lib/zhongwen_tools/version.rb +1 -1
data/test/test_romanization.rb +20 -3
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 9649f50da4798d4a4606af88575190af02466a4f
-  data.tar.gz: edd58503dbd3310b5e18569c45cfcba15301b5a9
+  metadata.gz: 38e857f5b289cca5e024238a437b3e69ca74a443
+  data.tar.gz: cdd2214ad7fb466252e5416f485a261b447dcaf6
 SHA512:
-  metadata.gz: acccbe6b57274b2d706f7cf565e2b061580086be1ab52ae2aba99bafabc47f87d12bad1bbb44eed890564454c4bb9a37d6c6ae555e46a072ab30d883ca86f783
-  data.tar.gz: 0dd7a4044acbb69dac6f215752671426b36b646d0c6b81f452656a278bf3c68259843593367430ae951aea134acf8cff02dd357a1518b778f0b95fd164668b0c
+  metadata.gz: 1185558e187c41e55870236bae4261c3be2f4227acd9fe3b7c3d53cfeeec17ed7a7dae6d516ef44055872cb1fbc2f71c31973caa02d4ad71984790a8070ac2f3
+  data.tar.gz: f7cb7d3b2c486e9faebb56d751b4077bbce7f63d5629dcc00c94470432d61e2460a2f2b41535cf767abd64cb3a5a79dcedb491da36fb377b9f630bad4db4b427

data/lib/zhongwen_tools/romanization.rb CHANGED Viewed

@@ -75,14 +75,16 @@ module ZhongwenTools
       end.gsub("-'","-").sub(/^'/,'')
     end
-    # http://en.wikipedia.org/wiki/Pinyin
-    # http://talkbank.org/pinyin/Trad_chart_IPA.php
-    # for ipa
     def _to_romanization str, to, from
       convert_to = _set_type to
       convert_from = _set_type from
-      tokens = str.split(/[ \-]/).uniq
+      begin
+        tokens = self.send("split_#{from}").uniq
+      rescue
+        tokens = str.split(/[ \-]/).uniq
+      end
       tokens.collect do |t|
         search = t.gsub(/[1-5].*/,'')
@@ -121,20 +123,15 @@ module ZhongwenTools
       result =
         if to == :py
-          raise NotImplementedError, 'method not implemented' if from != :pyn
-          # convert to pyn first.
-          # TODO: test :zyfh -> py
-          # str = _to_romanization str, to, :pyn if from != :pyn
+          str = _to_romanization str, :pyn, from if from != :pyn
           _to_pinyin str
         elsif to == :pyn
           if from == :py
             _convert_pinyin_to_pyn(str)
           else
-             _to_romanization str, :pyn, from
+            _to_romanization str, :pyn, from
           end
         else
-           str = _to_romanization str, to, :pyn if from != :pyn
           _to_romanization str, to, from
         end
@@ -149,7 +146,6 @@ module ZhongwenTools
       words =  pinyin.split(' ')
       pyn = words.map do |word|
-        #binding.pry if word == "Wǒmen"
         pys = word.split(/['\-]/).flatten.map{|x| x.scan(Regex.py).map{|x| (x - [nil])[0]}}.flatten
         _current_pyn(word, pys)
       end
@@ -158,22 +154,27 @@ module ZhongwenTools
     end
     def _current_pyn(pyn, pinyin_arr)
+      replacements = []
       pinyin_arr.each do |pinyin|
-        pyn = pyn.sub(pinyin, pinyin_replacement(pinyin))
+        replace =  pinyin_replacement(pinyin)
+        match = pinyin
+        pyn = pyn.sub(/(#{replacements.join('.*')}.*)#{match}/){ $1 + replace}
+        replacements << replace
       end
       pyn.gsub("'",'')
     end
     def pinyin_replacement(py)
-      #take the longest pinyin match.
-      match = PYN_PY.values.select do |x|
+      matches = PYN_PY.values.select do |x|
         py.include? x
-      end.sort{|x,y| x.length <=> y.length}[-1]
+      end
+      # take the longest pinyin match. Use bytes because 'è' is prefered over 'n' or 'r' or 'm'
+      match = matches.sort{|x,y| x.bytes.to_a.length <=> y.bytes.to_a.length}[-1]
-      #binding.pry
       # Edge case.. en/eng pyn -> py conversion is one way only.
-      match = match[/(ē|é|ě|è|e)n?g?/].nil? ? match : match.chars[0]
+      match = match[/^(ē|é|ě|è|e)n?g?/].nil? ? match : match.chars[0]
       replace = PYN_PY.find{|k,v| k if v == match}[0]

data/lib/zhongwen_tools/romanization/conversion_table.rb CHANGED Viewed

@@ -2,6 +2,8 @@
 module ZhongwenTools
   module Romanization
     # TODO: remove excess values, i.e. keys whose value == :pyn
+    # TODO: http://en.wikipedia.org/wiki/Jyutping
+    # TODO: http://en.wikipedia.org/wiki/Simplified_Wade
 ROMANIZATIONS_TABLE = [{:zyfh =>  " ㄚ", :wg =>  "a", :mps2 =>  "a", :yale =>  "a", :typy =>  "a", :pyn =>  "a"},
 { :zyfh =>  "ㄞ", :wg =>  "ai", :mps2 =>  "ai", :yale =>  "ai", :typy =>  "ai", :pyn =>  "ai"},
 { :zyfh =>  "ㄢ", :wg =>  "an", :mps2 =>  "an", :yale =>  "an", :typy =>  "an", :pyn =>  "an"},

data/lib/zhongwen_tools/romanization/detect.rb CHANGED Viewed

@@ -126,7 +126,7 @@ module ZhongwenTools
     #
     # Returns a Regexp.
     def detect_regex(type)
-      /#{ROMANIZATIONS_TABLE.map{ |r| r[type] || r[:pyn] }.sort{|x,y| x.size <=> y.size}.reverse.join('|')}/
+      /#{ROMANIZATIONS_TABLE.map{ |r| "[#{r[type][0]}#{r[type][0].upcase}]#{r[type][1..-1]}" || r[:pyn] }.flatten.sort{|x,y| x.size <=> y.size}.reverse.join('|')}/
     end
   end
 end

data/lib/zhongwen_tools/romanization/string.rb CHANGED Viewed

@@ -16,8 +16,22 @@ module ZhongwenTools
     def split_pyn(str = nil)
       str ||= self
       puts "WARNING: string is not valid pinyin-num format. #{str}" unless str.pyn?
+      # FIXME: ignore punctuation
+      str.scan(/(#{Regex.pyn})/).map{ |arr| arr[0].strip.gsub('-','') }.flatten
+    end
+    def split_zyfh(str = nil)
+      str ||= self
+      str.scan(/([#{Regex.bopomofo}]*)/).map{ |arr| arr[0].strip.gsub('-','') }.flatten - ['']
+    end
-      str.scan(/(#{ZhongwenTools::Regex.pyn})/).map{ |arr| arr[0].strip.gsub('-','') }.flatten
+    %w(typy wg yale mps2).each do |type|
+      define_method("split_#{type}") do |str = nil|
+        str ||= self
+        # TODO: ignore tonal marks from other systems wade giles, tongyong etc.
+        str.scan(/(#{detect_regex(type.to_sym)}*)/).map{ |arr| arr[0].strip.gsub('-','') }.flatten - ['']
+      end
     end
   end
 end

data/lib/zhongwen_tools/string.rb CHANGED Viewed

@@ -24,7 +24,8 @@ class String
   def capitalize
     #sub only substitues the first occurence.
-    self.sub(self.chars[0], self.chars[0].upcase)
+    c = self.chars[0]
+    self.sub(c, c.upcase) unless c.nil?
   end
   def scan_utf8(regex)

data/lib/zhongwen_tools/string/fullwidth.rb CHANGED Viewed

@@ -76,6 +76,10 @@ module ZhongwenTools
     '＝' => '=',
     "；" => ";",
     "＜" => "<",
-    "＞" => ">"
+    "＞" => ">",
+    "？" => "?",
+    "。" => ".",
+    "！" => "!",
+    '，' => ','
   }
 end

data/lib/zhongwen_tools/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module ZhongwenTools
-  VERSION = '0.11.1'
+  VERSION = '0.12.1'
 end

data/test/test_romanization.rb CHANGED Viewed

@@ -22,6 +22,18 @@ class TestRomanization < Minitest::Test
     assert @alabo[:py].py?
     assert 'Ā-lā-bó'.py?
     assert 'Zhong1 wen2'.to_pinyin.py?
+    @romanizations.each do |rom|
+      rom.each do |type, entry|
+        if type == :bopomofo
+        assert_equal rom[:py].downcase, entry.to_pinyin(type).downcase, "to_pinyin(#{type}) should convert to pinyin."
+        assert_equal rom[:py].downcase, entry.to_pinyin.downcase, "to_pinyin(#{type}) should convert to pinyin, but it isn't detected properly"
+        else
+        assert_equal rom[:py], entry.to_pinyin(type), "to_pinyin(#{type}) should convert to pinyin."
+        assert_equal rom[:py], entry.to_pinyin, "to_pinyin(#{type}) should convert to pinyin, but it isn't detected properly" unless type == :typy
+        end
+      end
+    end
   end
   def test_pyn
@@ -42,6 +54,9 @@ class TestRomanization < Minitest::Test
     assert_equal 'Wo3men5', "Wǒmen".to_pyn(:py)
+    assert_equal 'hao3xue2', 'hǎoxué'.to_pyn(:py)
+    assert_equal 'tai4re4', 'tàirè'.to_pyn(:py)
+    assert_equal 'tai4tai5', "tàitai".to_pyn(:py)
     #assert_equal 'Wu1-lu2-ha1-nuo4-fu1', 'Wūlúhānuòfū'.to_pyn(:py)
     #"007：Dàpò Liàngzǐ Wēijī", "007: Da4po4 Liang4zi3 Wei1ji1"
   end
@@ -122,10 +137,12 @@ class TestRomanization < Minitest::Test
     @romanizations = [
       # FIXME: bopomofo, tongyong pinyin, wade-giles tones are all wrong.
       # TODO: test IPA
-      { :pyn => 'ni3 hao', :py => 'nǐ hǎo', :bopomofo => 'ㄋㄧ3 ㄏㄠ3', :yale => 'ni3 hau3', :typy => 'ni3 hao3', :wg => 'ni3 hao3'},#, :ipa => ''}
-      { :pyn => 'zhong1 guo2', :py => 'nǐ hǎo', :bopomofo => 'ㄋㄧ3 ㄏㄠ3', :yale => 'ni3 hau3', :typy => 'ni3 hao3', :wg => 'chung1 kuo2'},#, :ipa => ''}
-      { :pyn => 'chui1 niu3', :py => '', :bopomofo => '', :yale => 'chwei1 nyou3', :typy => 'chuei1 niou3', :wg => 'chung1 kuo2'},#, :ipa => ''}
+      { :pyn => 'ni3 hao3', :py => 'nǐ hǎo', :bopomofo => 'ㄋㄧ3 ㄏㄠ3', :yale => 'ni3 hau3', :typy => 'ni3 hao3', :wg => 'ni3 hao3'},#, :ipa => ''}
+      { :pyn => 'Zhong1guo2', :py => 'Zhōngguó', :bopomofo => 'ㄓㄨㄥ1ㄍㄨㄛ2', :yale => 'Jung1gwo2', :typy => 'Jhong1guo2', :wg => 'Chung1kuo2'},#, :ipa => ''}
+      { :pyn => 'chui1 niu3', :py => "chuī niǔ", :bopomofo =>  "ㄔㄨㄟ1 ㄋㄧㄡ3", :yale => "chwei1 nyou3", :typy => "chuei1 niou3", :wg => "ch`ui1 niu3"},#, :ipa => ''}
+      { :pyn => 'Mao2 Ze2-dong1', :py => 'Máo Zédōng', :bopomofo => 'ㄇㄠ2 ㄗㄜ2ㄉㄨㄥ1', :yale => 'Mau2 Dze2-dung1', :typy => 'Mao2 Ze2-dong1', :wg => 'Mao2 Tse2-tung1'},#, :ipa => ''}
     ]
     @str = 'ni3 hao3'
     @mzd = 'Mao2 Ze2 dong1'
     @mzd2 = 'Mao2 Ze2-dong1'

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: zhongwen_tools
 version: !ruby/object:Gem::Version
-  version: 0.11.1
+  version: 0.12.1
 platform: ruby
 authors:
 - Steven Daniels
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-06-02 00:00:00.000000000 Z
+date: 2014-06-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake