RubyGems - zhongwen_tools - Versions diffs - 0.18.1 → 0.18.2 - Mend

zhongwen_tools 0.18.1 → 0.18.2

Files changed (10)

checksums.yaml +4 -4
data/README.md +1 -1
data/lib/zhongwen_tools/caps.rb +66 -66
data/lib/zhongwen_tools/romanization/pinyin.rb +27 -20
data/lib/zhongwen_tools/romanization.rb +8 -8
data/lib/zhongwen_tools/version.rb +1 -1
data/test/test_caps.rb +1 -1
data/test/test_pinyin.rb +14 -8
data/zhongwen_tools.gemspec +1 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: abe0b5477d8f04c2cabfe1054feffa3d1994b9d2
-  data.tar.gz: a14a4bc66804d0cbe0e3892ec48639384ae0f9bf
+  metadata.gz: 47bccf56d8e66407103478019b7b7e7355a493c8
+  data.tar.gz: 355e9aa4f41356610290d76461991274d563a9ad
 SHA512:
-  metadata.gz: 34c8c883922b2e7cf314a6866bb54f6bc4462225492449699e70f65ccf1bd364be3cf9988a2d18edd08be1103703831e81de999dfb31a87c14dad5e1ccabaf8a
-  data.tar.gz: 23c65688e09fa36a15ef0c2add20bee70c0ce167072f73a043fc68bd34b00b5475cada95909956643855a3bb57c1f36f52e8dadb6aa7f40d85816f443f22f9df
+  metadata.gz: f018708bf11c96460191d42aeb1f2734708936063d05d1f55fa745d6ccf5a5e1085a5e47ff542c1387ff57b5d07700c4a723bb0f2aa05f810e6d51aced91e2d1
+  data.tar.gz: a2edcea2042ba9236a295800375618c4efc0b74d11713797bcca9e475d6a36dcd85157b31e15ca8499da8ae598aef8d8899d3198586bf8b4382ba0bd349e5404

data/README.md CHANGED Viewed

@@ -5,7 +5,7 @@ Methods for dealing with Chinese.
 Status](https://img.shields.io/travis/stevendaniels/zhongwen_tools.svg?style=flat-square)](https://travis-ci.org/stevendaniels/zhongwen_tools) [![Dependency Status](https://img.shields.io/gemnasium/stevendaniels/zhongwen_tools.svg?style=flat-square)](https://gemnasium.com/stevendaniels/zhongwen_tools) [![Code Climate](https://img.shields.io/codeclimate/github/stevendaniels/zhongwen_tools.svg?style=flat-square)](https://codeclimate.com/github/stevendaniels/zhongwen_tools) [![Coverage Status](https://img.shields.io/coveralls/stevendaniels/zhongwen_tools.svg?style=flat-square)](https://coveralls.io/r/stevendaniels/zhongwen_tools)
 [![Gem Version](https://img.shields.io/gem/v/zhongwen_tools.svg?style=flat-square)](http://badge.fury.io/rb/zhongwen_tools)
-## Installation
+##INSTALLATION
 Install as a gem

data/lib/zhongwen_tools/caps.rb CHANGED Viewed

@@ -1,74 +1,74 @@
 # encoding: utf-8
 module ZhongwenTools
+  # Public: Module for pinyin/fullwidth capitalization
   module Caps
+    def self.downcase(str)
+      regex = /(#{ZhongwenTools::Caps::CAPS.keys.join('|')})/
+      str.gsub(regex, ZhongwenTools::Caps::CAPS).downcase
+    end
-  def self.downcase(str)
-    regex = /(#{ZhongwenTools::Caps::CAPS.keys.join('|')})/
-    str.gsub(regex, ZhongwenTools::Caps::CAPS).downcase
-  end
+    def self.upcase(str)
+      str.gsub(/(#{ZhongwenTools::Caps::CAPS.values.join('|')})/) do
+        ZhongwenTools::Caps::CAPS.find { |_, v| v == Regexp.last_match[0] }[0]
+      end.upcase
+    end
-  def self.upcase(str)
-    str.gsub(/(#{ZhongwenTools::Caps::CAPS.values.join('|')})/){
-      ZhongwenTools::Caps::CAPS.find{ |k, v| v == $1 }[0]
-    }.upcase
-  end
-  def self.capitalize(str)
-    str.sub(str[0], ZhongwenTools::Caps.upcase(str[0]))
-  end
+    def self.capitalize(str)
+      first_letter = str[/#{Regex.py}|[ĀÁǍÀĒÉĚÈĪÍǏÌŌÓǑÒ]/][0]
+      str.sub(first_letter, ZhongwenTools::Caps.upcase(first_letter))
+    end
-  CAPS = {
-    'Ā' => 'ā',
-    'Á' => 'á',
-    'Ǎ' => 'ǎ',
-    'À' => 'à',
-    'Ē' => 'ē',
-    'É' => 'é',
-    'Ě' => 'ě',
-    'È' => 'è',
-    'Ī' => 'ī',
-    'Í' => 'í',
-    'Ǐ' => 'ǐ',
-    'Ì' => 'ì',
-    'Ō' => 'ō',
-    'Ó' => 'ó',
-    'Ǒ' => 'ǒ',
-    'Ò' => 'ò',
-    'Ǖ' => 'ǖ', # using combining diatrical marks
-    'Ǘ' => 'ǘ', # using combining diatrical marks
-    'Ǚ' => 'ǚ', # using combining diatrical marks
-    'Ǜ' => 'ǜ', # using combining diatrical marks
-    'Ū' => 'ū',
-    'Ú' => 'ú',
-    'Ǔ' => 'ǔ',
-    'Ù' => 'ù',
-    'Ａ' => 'ａ',
-    'Ｂ' => 'ｂ',
-    'Ｃ' => 'ｃ',
-    'Ｄ' => 'ｄ',
-    'Ｅ' => 'ｅ',
-    'Ｆ' => 'ｆ',
-    'Ｇ' => 'ｇ',
-    'Ｈ' => 'ｈ',
-    'Ｉ' => 'ｉ',
-    'Ｊ' => 'ｊ',
-    'Ｋ' => 'ｋ',
-    'Ｌ' => 'ｌ',
-    'Ｍ' => 'ｍ',
-    'Ｎ' => 'ｎ',
-    'Ｏ' => 'ｏ',
-    'Ｐ' => 'ｐ',
-    'Ｑ' => 'ｑ',
-    'Ｒ' => 'ｒ',
-    'Ｓ' => 'ｓ',
-    'Ｔ' => 'ｔ',
-    'Ｕ' => 'ｕ',
-    'Ｖ' => 'ｖ',
-    'Ｗ' => 'ｗ',
-    'Ｘ' => 'ｘ',
-    'Ｙ' => 'ｙ',
-    'Ｚ' => 'ｚ'
-  }
+    CAPS = {
+      'Ā' => 'ā',
+      'Á' => 'á',
+      'Ǎ' => 'ǎ',
+      'À' => 'à',
+      'Ē' => 'ē',
+      'É' => 'é',
+      'Ě' => 'ě',
+      'È' => 'è',
+      'Ī' => 'ī',
+      'Í' => 'í',
+      'Ǐ' => 'ǐ',
+      'Ì' => 'ì',
+      'Ō' => 'ō',
+      'Ó' => 'ó',
+      'Ǒ' => 'ǒ',
+      'Ò' => 'ò',
+      'Ǖ' => 'ǖ', # using combining diatrical marks
+      'Ǘ' => 'ǘ', # using combining diatrical marks
+      'Ǚ' => 'ǚ', # using combining diatrical marks
+      'Ǜ' => 'ǜ', # using combining diatrical marks
+      'Ū' => 'ū',
+      'Ú' => 'ú',
+      'Ǔ' => 'ǔ',
+      'Ù' => 'ù',
+      'Ａ' => 'ａ',
+      'Ｂ' => 'ｂ',
+      'Ｃ' => 'ｃ',
+      'Ｄ' => 'ｄ',
+      'Ｅ' => 'ｅ',
+      'Ｆ' => 'ｆ',
+      'Ｇ' => 'ｇ',
+      'Ｈ' => 'ｈ',
+      'Ｉ' => 'ｉ',
+      'Ｊ' => 'ｊ',
+      'Ｋ' => 'ｋ',
+      'Ｌ' => 'ｌ',
+      'Ｍ' => 'ｍ',
+      'Ｎ' => 'ｎ',
+      'Ｏ' => 'ｏ',
+      'Ｐ' => 'ｐ',
+      'Ｑ' => 'ｑ',
+      'Ｒ' => 'ｒ',
+      'Ｓ' => 'ｓ',
+      'Ｔ' => 'ｔ',
+      'Ｕ' => 'ｕ',
+      'Ｖ' => 'ｖ',
+      'Ｗ' => 'ｗ',
+      'Ｘ' => 'ｘ',
+      'Ｙ' => 'ｙ',
+      'Ｚ' => 'ｚ'
+    }
   end
 end

data/lib/zhongwen_tools/romanization/pinyin.rb CHANGED Viewed

@@ -39,9 +39,9 @@ module ZhongwenTools
       def self.split_pyn(str)
         # FIXME: ignore punctuation
-        regex = str[/[1-5]/].nil? ?  /(#{ Regex.pinyin_toneless })/ : /(#{ Regex.pyn }|#{ Regex.pinyin_toneless })/
+        regex = str[/[1-5]/].nil? ? /(#{ Regex.pinyin_toneless })/ : /(#{ Regex.pyn }|#{ Regex.pinyin_toneless })/
         # NOTE: p[/[^\-]*/].to_s is 25% faster than gsub('-', '')
-        str.scan(regex).map{ |arr| arr[0].strip[/[^\-]*/].to_s }.flatten
+        str.scan(regex).map { |arr| arr[0].strip[/[^\-]*/].to_s }.flatten
       end
       def self.split_py(str)
@@ -49,11 +49,8 @@ module ZhongwenTools
         results = words.map do |word|
           word, is_capitalized = normalize_pinyin(word)
-          # NOTE: Special Case split_py("fǎnguāng") # => ["fǎn" + "guāng"]
-          #       Special Case split_py("yìnián")   # => ["yì" + "nián"]
-          #                    split_py("Xīní")     # => ["Xī", "ní"]
-          word = word.gsub(/(n)(g(#{ Regex.py_tones['o'] }|u))/){ "#{ $1 }-#{ $2 }" }
-          word = word.gsub(/([#{ Regex.only_tones }])(n(#{ Regex.py_tones['v'] }|#{ Regex.py_tones['i'] }|[iu][#{ Regex.py_tones['a'] }]))/){ "#{ $1 }-#{ $2 }" }
+          word = normalize_n_g(word)
+          word = normalize_n(word)
           result = word.split(/['\-]/).flatten.map do |x|
             find_py(x)
           end
@@ -135,13 +132,29 @@ module ZhongwenTools
         { pyn: :pyn, py: :py, pinyin: :py }[romanization]
       end
+      # NOTE: Special Case split_py("fǎnguāng") # => ["fǎn" + "guāng"]
+      #       In pinyin, sāngēng == sān gēng and sāng'ēng = sāng ēng
+      def self.normalize_n_g(pinyin)
+        regex = /(?<n_part>n)(?<g_part>g(#{Regex.py_tones['o']}|#{Regex.py_tones['u']}|#{Regex.py_tones['a']}|#{Regex.py_tones['e']}))/
+        pinyin.gsub(regex) do
+          "#{Regexp.last_match[:n_part]}-#{Regexp.last_match[:g_part]}"
+        end
+      end
+      def self.normalize_n(pinyin)
+        #       Special Case split_py("yìnián")   # => ["yì" + "nián"]
+        #                    split_py("Xīní")     # => ["Xī", "ní"]
+        regex = /([#{ Regex.only_tones }])(n(#{Regex.py_tones['v']}|#{Regex.py_tones['i']}|[iu]|#{Regex.py_tones['e']}|[#{Regex.py_tones['a']}]))/
+        pinyin.gsub(regex) { "#{ $1 }-#{ $2 }" }
+      end
       def self.normalize_pinyin(pinyin)
         [Caps.downcase(pinyin), capitalized?(pinyin)]
       end
       def self.find_py(str)
         regex = /(#{ Regex.py }|#{ Regex.py_syllabic_nasals })/
-        str.scan(regex).map{ |x| x.compact[0] }
+        str.scan(regex).map { |x| x.compact[0] }
       end
       def self.recapitalize(obj, capitalized)
@@ -179,21 +192,15 @@ module ZhongwenTools
       end
       def self.capitalized?(str)
-        str[0] != Caps.downcase(str[0])
+        first_letter = str[/#{Regex.py}|[ĀÁǍÀĒÉĚÈĪÍǏÌŌÓǑÒ]|#{Regex.py_syllabic_nasals}/][0]
+        first_letter != Caps.downcase(first_letter)
       end
       def self.current_pyn(pyn, pinyin_arr)
-        replacements = []
         pinyin_arr.each do |pinyin|
           replace =  pinyin_replacement(pinyin)
-          match = pinyin
-          if replacements.size > 0
-            pyn = pyn.sub(/(#{ replacements.join('.*') }.*)#{ match }/){ $1 + replace }
-          else
-            pyn = pyn.sub(/#{match}/){ "#{ $1 }#{ replace }" }
-          end
-          replacements << replace
+          pyn.sub!(pinyin, replace)
         end
         pyn.gsub("'", '')
@@ -205,14 +212,14 @@ module ZhongwenTools
         end
         match = select_pinyin_match(matches)
-        replace = PYN_PY.find{ |k, v| k if v == match }[0]
+        replace = PYN_PY.find { |k, v| k if v == match }[0]
         py.gsub(match, replace).gsub(/([^\d ]*)(\d)([^\d ]*)/){ $1 + $3 + $2 }
       end
       def self.select_pinyin_match(matches)
         # take the longest pinyin match. Use bytes because 'è' is prefered over 'n' or 'r' or 'm'
-        match = matches.sort{ |x, y| x.bytes.to_a.length <=> y.bytes.to_a.length }[-1]
+        match = matches.sort { |x, y| x.bytes.to_a.length <=> y.bytes.to_a.length }[-1]
         # Edge case.. en/eng pyn -> py conversion is one way only.
         match[/^(ē|é|ě|è|e)n?g?/].nil? ? match : match.chars[0]

data/lib/zhongwen_tools/romanization.rb CHANGED Viewed

@@ -68,19 +68,19 @@ module ZhongwenTools
       type ||= romanization?(str)
       if type == :py
-         ZhongwenTools::Romanization::Pinyin.split_py(str)
+        ZhongwenTools::Romanization::Pinyin.split_py(str)
       elsif type == :pyn
-         ZhongwenTools::Romanization::Pinyin.split_pyn(str)
+        ZhongwenTools::Romanization::Pinyin.split_pyn(str)
       elsif type == :bpmf
-         ZhongwenTools::Romanization::ZhuyinFuhao.split(str)
+        ZhongwenTools::Romanization::ZhuyinFuhao.split(str)
       elsif type == :wg
-         ZhongwenTools::Romanization::WadeGiles.split(str)
+        ZhongwenTools::Romanization::WadeGiles.split(str)
       elsif type == :typy
-         ZhongwenTools::Romanization::TongyongPinyin.split(str)
+        ZhongwenTools::Romanization::TongyongPinyin.split(str)
       elsif type == :yale
-         ZhongwenTools::Romanization::Yale.split(str)
+        ZhongwenTools::Romanization::Yale.split(str)
       elsif type == :mps2
-         ZhongwenTools::Romanization::MPS2.split(str)
+        ZhongwenTools::Romanization::MPS2.split(str)
       end
     end
@@ -88,7 +88,7 @@ module ZhongwenTools
     def self.detect_romanization(str, regex)
       normalized_str = str.downcase.gsub(ZhongwenTools::Regex.punc, '').gsub(/[1-5\s\-']/, '')
-      #TODO: ignore tonal marks from other systems wade giles, tongyong etc.
+      # TODO: ignore tonal marks from other systems wade giles, tongyong etc.
       normalized_str.scan(regex).join == normalized_str
     end

data/lib/zhongwen_tools/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module ZhongwenTools
-  VERSION = '0.18.1'
+  VERSION = '0.18.2'
 end

data/test/test_caps.rb CHANGED Viewed

@@ -17,10 +17,10 @@ class TestCaps < Minitest::Test
   def test_capitalize
     assert_equal @caps[:c], ZhongwenTools::Caps.capitalize(@caps[:d])
+    assert_equal '"Zheng4qie1"', ZhongwenTools::Caps.capitalize('"Zheng4qie1"')
   end
   def setup
     @caps = { u: 'ĀLĀBÓ', d: 'ālābó', c: 'Ālābó' }
   end
 end

data/test/test_pinyin.rb CHANGED Viewed

@@ -15,15 +15,19 @@ class TestPinyin < Minitest::Test
       assert_equal w[:split_py], ZhongwenTools::Romanization::Pinyin.split_py(w[:py])
     end
-    assert_equal ['fǎn', 'guāng', 'jìng'], ZhongwenTools::Romanization::Pinyin.split_py('fǎnguāngjìng')
+    assert_equal %w(fǎn guāng jìng), ZhongwenTools::Romanization::Pinyin.split_py('fǎnguāngjìng')
     assert_equal ['Yīng', 'guó'], ZhongwenTools::Romanization::Pinyin.split_py('Yīngguó')
     assert_equal ['Xī', 'ní'], ZhongwenTools::Romanization::Pinyin.split_py('Xīní')
-    assert_equal ['bàn', 'gōng', 'lóu'], ZhongwenTools::Romanization::Pinyin.split_py('bàngōnglóu')
+    assert_equal %w(bàn gōng lóu), ZhongwenTools::Romanization::Pinyin.split_py('bàngōnglóu')
     assert_equal ['jì', 'nǚ'], ZhongwenTools::Romanization::Pinyin.split_py('jìnǚ')
     assert_equal ['sè', 'guǐ'], ZhongwenTools::Romanization::Pinyin.split_py('sèguǐ')
     assert_equal ['qǔ', 'nuǎn'], ZhongwenTools::Romanization::Pinyin.split_py('qǔnuǎn')
-    assert_equal ['wán', 'yì', 'r'], ZhongwenTools::Romanization::Pinyin.split_py('wányìr')
-    assert_equal ['yīng', "ér"], ZhongwenTools::Romanization::Pinyin.split_py("yīng'ér")
+    assert_equal %w(wán yì r), ZhongwenTools::Romanization::Pinyin.split_py('wányìr')
+    assert_equal ['yīng', 'ér'], ZhongwenTools::Romanization::Pinyin.split_py("yīng'ér")
+    assert_equal ['xiǎn', 'gù'], ZhongwenTools::Romanization::Pinyin.split_py('xiǎngù')
+    assert_equal ['nián', 'gāo'], ZhongwenTools::Romanization::Pinyin.split_py('niángāo')
+    assert_equal %w(fú shè néng), ZhongwenTools::Romanization::Pinyin.split_py('fúshènéng')
+    assert_equal ['sān', 'gēng'], ZhongwenTools::Romanization::Pinyin.split_py('sāngēng')
   end
   def test_py?
@@ -84,17 +88,19 @@ class TestPinyin < Minitest::Test
     assert_equal 'yi2ge4', ZhongwenTools::Romanization::Pinyin.to_pyn('yígè')
     assert_equal 'yi4nian2', ZhongwenTools::Romanization::Pinyin.to_pyn('yìnián', :py)
+    assert_equal 'hei1hu1hu1', ZhongwenTools::Romanization::Pinyin.to_pyn('hēihūhū', :py)
+    assert_equal '"Zheng4qie1"',  ZhongwenTools::Romanization::Pinyin.to_pyn('"Zhèngqiē"', :py)
   end
   def setup
     @hyphenated_words = [
-      { :pyn => 'A1-la1-bo2', :py => 'Ālābó' },
-      { :pyn => 'Mao2 Ze2-dong1', :py => 'Máo Zédōng' }
+      { pyn: 'A1-la1-bo2', py: 'Ālābó' },
+      { pyn: 'Mao2 Ze2-dong1', py: 'Máo Zédōng' }
     ]
     @split_words = [
-      { :pyn => 'A1-la1-bo2',  :py => 'Ālābó', :split => %w(A1 la1 bo2), split_py: %w(Ā lā bó) },
-      { :pyn => 'Mao2 Ze2-dong1',  :py => 'Máo Zédōng', :split => %w(Mao2 Ze2 dong1), :split_py =>  %w(Máo Zé dōng) }
+      { pyn: 'A1-la1-bo2',  py: 'Ālābó', split: %w(A1 la1 bo2), split_py: %w(Ā lā bó) },
+      { pyn: 'Mao2 Ze2-dong1',  py: 'Máo Zédōng', split: %w(Mao2 Ze2 dong1), split_py:  %w(Máo Zé dōng) }
     ]
     @syllabic_nasals = [

data/zhongwen_tools.gemspec CHANGED Viewed

@@ -1,5 +1,5 @@
 # -*- encoding: utf-8 -*-
-$:.push File.expand_path('../lib', __FILE__)
+$LOAD_PATH.push File.expand_path('../lib', __FILE__)
 require 'zhongwen_tools/version'
 Gem::Specification.new do |s|

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: zhongwen_tools
 version: !ruby/object:Gem::Version
-  version: 0.18.1
+  version: 0.18.2
 platform: ruby
 authors:
 - Steven Daniels
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-04-02 00:00:00.000000000 Z
+date: 2015-04-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake