zhongwen_tools 0.18.1 → 0.18.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: abe0b5477d8f04c2cabfe1054feffa3d1994b9d2
4
- data.tar.gz: a14a4bc66804d0cbe0e3892ec48639384ae0f9bf
3
+ metadata.gz: 47bccf56d8e66407103478019b7b7e7355a493c8
4
+ data.tar.gz: 355e9aa4f41356610290d76461991274d563a9ad
5
5
  SHA512:
6
- metadata.gz: 34c8c883922b2e7cf314a6866bb54f6bc4462225492449699e70f65ccf1bd364be3cf9988a2d18edd08be1103703831e81de999dfb31a87c14dad5e1ccabaf8a
7
- data.tar.gz: 23c65688e09fa36a15ef0c2add20bee70c0ce167072f73a043fc68bd34b00b5475cada95909956643855a3bb57c1f36f52e8dadb6aa7f40d85816f443f22f9df
6
+ metadata.gz: f018708bf11c96460191d42aeb1f2734708936063d05d1f55fa745d6ccf5a5e1085a5e47ff542c1387ff57b5d07700c4a723bb0f2aa05f810e6d51aced91e2d1
7
+ data.tar.gz: a2edcea2042ba9236a295800375618c4efc0b74d11713797bcca9e475d6a36dcd85157b31e15ca8499da8ae598aef8d8899d3198586bf8b4382ba0bd349e5404
data/README.md CHANGED
@@ -5,7 +5,7 @@ Methods for dealing with Chinese.
5
5
  Status](https://img.shields.io/travis/stevendaniels/zhongwen_tools.svg?style=flat-square)](https://travis-ci.org/stevendaniels/zhongwen_tools) [![Dependency Status](https://img.shields.io/gemnasium/stevendaniels/zhongwen_tools.svg?style=flat-square)](https://gemnasium.com/stevendaniels/zhongwen_tools) [![Code Climate](https://img.shields.io/codeclimate/github/stevendaniels/zhongwen_tools.svg?style=flat-square)](https://codeclimate.com/github/stevendaniels/zhongwen_tools) [![Coverage Status](https://img.shields.io/coveralls/stevendaniels/zhongwen_tools.svg?style=flat-square)](https://coveralls.io/r/stevendaniels/zhongwen_tools)
6
6
  [![Gem Version](https://img.shields.io/gem/v/zhongwen_tools.svg?style=flat-square)](http://badge.fury.io/rb/zhongwen_tools)
7
7
 
8
- ## Installation
8
+ ##INSTALLATION
9
9
 
10
10
  Install as a gem
11
11
 
@@ -1,74 +1,74 @@
1
1
  # encoding: utf-8
2
-
3
2
  module ZhongwenTools
3
+ # Public: Module for pinyin/fullwidth capitalization
4
4
  module Caps
5
+ def self.downcase(str)
6
+ regex = /(#{ZhongwenTools::Caps::CAPS.keys.join('|')})/
7
+ str.gsub(regex, ZhongwenTools::Caps::CAPS).downcase
8
+ end
5
9
 
6
- def self.downcase(str)
7
- regex = /(#{ZhongwenTools::Caps::CAPS.keys.join('|')})/
8
- str.gsub(regex, ZhongwenTools::Caps::CAPS).downcase
9
- end
10
+ def self.upcase(str)
11
+ str.gsub(/(#{ZhongwenTools::Caps::CAPS.values.join('|')})/) do
12
+ ZhongwenTools::Caps::CAPS.find { |_, v| v == Regexp.last_match[0] }[0]
13
+ end.upcase
14
+ end
10
15
 
11
- def self.upcase(str)
12
- str.gsub(/(#{ZhongwenTools::Caps::CAPS.values.join('|')})/){
13
- ZhongwenTools::Caps::CAPS.find{ |k, v| v == $1 }[0]
14
- }.upcase
15
- end
16
-
17
- def self.capitalize(str)
18
- str.sub(str[0], ZhongwenTools::Caps.upcase(str[0]))
19
- end
16
+ def self.capitalize(str)
17
+ first_letter = str[/#{Regex.py}|[ĀÁǍÀĒÉĚÈĪÍǏÌŌÓǑÒ]/][0]
18
+ str.sub(first_letter, ZhongwenTools::Caps.upcase(first_letter))
19
+ end
20
20
 
21
- CAPS = {
22
- 'Ā' => 'ā',
23
- 'Á' => 'á',
24
- 'Ǎ' => 'ǎ',
25
- 'À' => 'à',
26
- 'Ē' => 'ē',
27
- 'É' => 'é',
28
- 'Ě' => 'ě',
29
- 'È' => 'è',
30
- 'Ī' => 'ī',
31
- 'Í' => 'í',
32
- 'Ǐ' => 'ǐ',
33
- 'Ì' => 'ì',
34
- 'Ō' => 'ō',
35
- 'Ó' => 'ó',
36
- 'Ǒ' => 'ǒ',
37
- 'Ò' => 'ò',
38
- 'Ǖ' => 'ǖ', # using combining diatrical marks
39
- 'Ǘ' => 'ǘ', # using combining diatrical marks
40
- 'Ǚ' => 'ǚ', # using combining diatrical marks
41
- 'Ǜ' => 'ǜ', # using combining diatrical marks
42
- 'Ū' => 'ū',
43
- 'Ú' => 'ú',
44
- 'Ǔ' => 'ǔ',
45
- 'Ù' => 'ù',
46
- 'A' => 'a',
47
- 'B' => 'b',
48
- 'C' => 'c',
49
- 'D' => 'd',
50
- 'E' => 'e',
51
- 'F' => 'f',
52
- 'G' => 'g',
53
- 'H' => 'h',
54
- 'I' => 'i',
55
- 'J' => 'j',
56
- 'K' => 'k',
57
- 'L' => 'l',
58
- 'M' => 'm',
59
- 'N' => 'n',
60
- 'O' => 'o',
61
- 'P' => 'p',
62
- 'Q' => 'q',
63
- 'R' => 'r',
64
- 'S' => 's',
65
- 'T' => 't',
66
- 'U' => 'u',
67
- 'V' => 'v',
68
- 'W' => 'w',
69
- 'X' => 'x',
70
- 'Y' => 'y',
71
- 'Z' => 'z'
72
- }
21
+ CAPS = {
22
+ 'Ā' => 'ā',
23
+ 'Á' => 'á',
24
+ 'Ǎ' => 'ǎ',
25
+ 'À' => 'à',
26
+ 'Ē' => 'ē',
27
+ 'É' => 'é',
28
+ 'Ě' => 'ě',
29
+ 'È' => 'è',
30
+ 'Ī' => 'ī',
31
+ 'Í' => 'í',
32
+ 'Ǐ' => 'ǐ',
33
+ 'Ì' => 'ì',
34
+ 'Ō' => 'ō',
35
+ 'Ó' => 'ó',
36
+ 'Ǒ' => 'ǒ',
37
+ 'Ò' => 'ò',
38
+ 'Ǖ' => 'ǖ', # using combining diatrical marks
39
+ 'Ǘ' => 'ǘ', # using combining diatrical marks
40
+ 'Ǚ' => 'ǚ', # using combining diatrical marks
41
+ 'Ǜ' => 'ǜ', # using combining diatrical marks
42
+ 'Ū' => 'ū',
43
+ 'Ú' => 'ú',
44
+ 'Ǔ' => 'ǔ',
45
+ 'Ù' => 'ù',
46
+ 'A' => 'a',
47
+ 'B' => 'b',
48
+ 'C' => 'c',
49
+ 'D' => 'd',
50
+ 'E' => 'e',
51
+ 'F' => 'f',
52
+ 'G' => 'g',
53
+ 'H' => 'h',
54
+ 'I' => 'i',
55
+ 'J' => 'j',
56
+ 'K' => 'k',
57
+ 'L' => 'l',
58
+ 'M' => 'm',
59
+ 'N' => 'n',
60
+ 'O' => 'o',
61
+ 'P' => 'p',
62
+ 'Q' => 'q',
63
+ 'R' => 'r',
64
+ 'S' => 's',
65
+ 'T' => 't',
66
+ 'U' => 'u',
67
+ 'V' => 'v',
68
+ 'W' => 'w',
69
+ 'X' => 'x',
70
+ 'Y' => 'y',
71
+ 'Z' => 'z'
72
+ }
73
73
  end
74
74
  end
@@ -39,9 +39,9 @@ module ZhongwenTools
39
39
 
40
40
  def self.split_pyn(str)
41
41
  # FIXME: ignore punctuation
42
- regex = str[/[1-5]/].nil? ? /(#{ Regex.pinyin_toneless })/ : /(#{ Regex.pyn }|#{ Regex.pinyin_toneless })/
42
+ regex = str[/[1-5]/].nil? ? /(#{ Regex.pinyin_toneless })/ : /(#{ Regex.pyn }|#{ Regex.pinyin_toneless })/
43
43
  # NOTE: p[/[^\-]*/].to_s is 25% faster than gsub('-', '')
44
- str.scan(regex).map{ |arr| arr[0].strip[/[^\-]*/].to_s }.flatten
44
+ str.scan(regex).map { |arr| arr[0].strip[/[^\-]*/].to_s }.flatten
45
45
  end
46
46
 
47
47
  def self.split_py(str)
@@ -49,11 +49,8 @@ module ZhongwenTools
49
49
 
50
50
  results = words.map do |word|
51
51
  word, is_capitalized = normalize_pinyin(word)
52
- # NOTE: Special Case split_py("fǎnguāng") # => ["fǎn" + "guāng"]
53
- # Special Case split_py("yìnián") # => ["yì" + "nián"]
54
- # split_py("Xīní") # => ["Xī", "ní"]
55
- word = word.gsub(/(n)(g(#{ Regex.py_tones['o'] }|u))/){ "#{ $1 }-#{ $2 }" }
56
- word = word.gsub(/([#{ Regex.only_tones }])(n(#{ Regex.py_tones['v'] }|#{ Regex.py_tones['i'] }|[iu][#{ Regex.py_tones['a'] }]))/){ "#{ $1 }-#{ $2 }" }
52
+ word = normalize_n_g(word)
53
+ word = normalize_n(word)
57
54
  result = word.split(/['\-]/).flatten.map do |x|
58
55
  find_py(x)
59
56
  end
@@ -135,13 +132,29 @@ module ZhongwenTools
135
132
  { pyn: :pyn, py: :py, pinyin: :py }[romanization]
136
133
  end
137
134
 
135
+ # NOTE: Special Case split_py("fǎnguāng") # => ["fǎn" + "guāng"]
136
+ # In pinyin, sāngēng == sān gēng and sāng'ēng == sāng ēng
137
+ def self.normalize_n_g(pinyin)
138
+ regex = /(?<n_part>n)(?<g_part>g(#{Regex.py_tones['o']}|#{Regex.py_tones['u']}|#{Regex.py_tones['a']}|#{Regex.py_tones['e']}))/
139
+ pinyin.gsub(regex) do
140
+ "#{Regexp.last_match[:n_part]}-#{Regexp.last_match[:g_part]}"
141
+ end
142
+ end
143
+
144
+ def self.normalize_n(pinyin)
145
+ # Special Case split_py("yìnián") # => ["yì" + "nián"]
146
+ # split_py("Xīní") # => ["Xī", "ní"]
147
+ regex = /([#{ Regex.only_tones }])(n(#{Regex.py_tones['v']}|#{Regex.py_tones['i']}|[iu]|#{Regex.py_tones['e']}|[#{Regex.py_tones['a']}]))/
148
+ pinyin.gsub(regex) { "#{ $1 }-#{ $2 }" }
149
+ end
150
+
138
151
  def self.normalize_pinyin(pinyin)
139
152
  [Caps.downcase(pinyin), capitalized?(pinyin)]
140
153
  end
141
154
 
142
155
  def self.find_py(str)
143
156
  regex = /(#{ Regex.py }|#{ Regex.py_syllabic_nasals })/
144
- str.scan(regex).map{ |x| x.compact[0] }
157
+ str.scan(regex).map { |x| x.compact[0] }
145
158
  end
146
159
 
147
160
  def self.recapitalize(obj, capitalized)
@@ -179,21 +192,15 @@ module ZhongwenTools
179
192
  end
180
193
 
181
194
  def self.capitalized?(str)
182
- str[0] != Caps.downcase(str[0])
195
+ first_letter = str[/#{Regex.py}|[ĀÁǍÀĒÉĚÈĪÍǏÌŌÓǑÒ]|#{Regex.py_syllabic_nasals}/][0]
196
+
197
+ first_letter != Caps.downcase(first_letter)
183
198
  end
184
199
 
185
200
  def self.current_pyn(pyn, pinyin_arr)
186
- replacements = []
187
-
188
201
  pinyin_arr.each do |pinyin|
189
202
  replace = pinyin_replacement(pinyin)
190
- match = pinyin
191
- if replacements.size > 0
192
- pyn = pyn.sub(/(#{ replacements.join('.*') }.*)#{ match }/){ $1 + replace }
193
- else
194
- pyn = pyn.sub(/#{match}/){ "#{ $1 }#{ replace }" }
195
- end
196
- replacements << replace
203
+ pyn.sub!(pinyin, replace)
197
204
  end
198
205
 
199
206
  pyn.gsub("'", '')
@@ -205,14 +212,14 @@ module ZhongwenTools
205
212
  end
206
213
 
207
214
  match = select_pinyin_match(matches)
208
- replace = PYN_PY.find{ |k, v| k if v == match }[0]
215
+ replace = PYN_PY.find { |k, v| k if v == match }[0]
209
216
 
210
217
  py.gsub(match, replace).gsub(/([^\d ]*)(\d)([^\d ]*)/){ $1 + $3 + $2 }
211
218
  end
212
219
 
213
220
  def self.select_pinyin_match(matches)
214
221
  # take the longest pinyin match. Use bytes because 'è' is preferred over 'n' or 'r' or 'm'
215
- match = matches.sort{ |x, y| x.bytes.to_a.length <=> y.bytes.to_a.length }[-1]
222
+ match = matches.sort { |x, y| x.bytes.to_a.length <=> y.bytes.to_a.length }[-1]
216
223
 
217
224
  # Edge case: en/eng pyn -> py conversion is one-way only.
218
225
  match[/^(ē|é|ě|è|e)n?g?/].nil? ? match : match.chars[0]
@@ -68,19 +68,19 @@ module ZhongwenTools
68
68
  type ||= romanization?(str)
69
69
 
70
70
  if type == :py
71
- ZhongwenTools::Romanization::Pinyin.split_py(str)
71
+ ZhongwenTools::Romanization::Pinyin.split_py(str)
72
72
  elsif type == :pyn
73
- ZhongwenTools::Romanization::Pinyin.split_pyn(str)
73
+ ZhongwenTools::Romanization::Pinyin.split_pyn(str)
74
74
  elsif type == :bpmf
75
- ZhongwenTools::Romanization::ZhuyinFuhao.split(str)
75
+ ZhongwenTools::Romanization::ZhuyinFuhao.split(str)
76
76
  elsif type == :wg
77
- ZhongwenTools::Romanization::WadeGiles.split(str)
77
+ ZhongwenTools::Romanization::WadeGiles.split(str)
78
78
  elsif type == :typy
79
- ZhongwenTools::Romanization::TongyongPinyin.split(str)
79
+ ZhongwenTools::Romanization::TongyongPinyin.split(str)
80
80
  elsif type == :yale
81
- ZhongwenTools::Romanization::Yale.split(str)
81
+ ZhongwenTools::Romanization::Yale.split(str)
82
82
  elsif type == :mps2
83
- ZhongwenTools::Romanization::MPS2.split(str)
83
+ ZhongwenTools::Romanization::MPS2.split(str)
84
84
  end
85
85
  end
86
86
 
@@ -88,7 +88,7 @@ module ZhongwenTools
88
88
 
89
89
  def self.detect_romanization(str, regex)
90
90
  normalized_str = str.downcase.gsub(ZhongwenTools::Regex.punc, '').gsub(/[1-5\s\-']/, '')
91
- #TODO: ignore tonal marks from other systems wade giles, tongyong etc.
91
+ # TODO: ignore tonal marks from other systems wade giles, tongyong etc.
92
92
 
93
93
  normalized_str.scan(regex).join == normalized_str
94
94
  end
@@ -1,3 +1,3 @@
1
1
  module ZhongwenTools
2
- VERSION = '0.18.1'
2
+ VERSION = '0.18.2'
3
3
  end
data/test/test_caps.rb CHANGED
@@ -17,10 +17,10 @@ class TestCaps < Minitest::Test
17
17
 
18
18
  def test_capitalize
19
19
  assert_equal @caps[:c], ZhongwenTools::Caps.capitalize(@caps[:d])
20
+ assert_equal '"Zheng4qie1"', ZhongwenTools::Caps.capitalize('"Zheng4qie1"')
20
21
  end
21
22
 
22
23
  def setup
23
24
  @caps = { u: 'ĀLĀBÓ', d: 'ālābó', c: 'Ālābó' }
24
25
  end
25
26
  end
26
-
data/test/test_pinyin.rb CHANGED
@@ -15,15 +15,19 @@ class TestPinyin < Minitest::Test
15
15
  assert_equal w[:split_py], ZhongwenTools::Romanization::Pinyin.split_py(w[:py])
16
16
  end
17
17
 
18
- assert_equal ['fǎn', 'guāng', 'jìng'], ZhongwenTools::Romanization::Pinyin.split_py('fǎnguāngjìng')
18
+ assert_equal %w(fǎn guāng jìng), ZhongwenTools::Romanization::Pinyin.split_py('fǎnguāngjìng')
19
19
  assert_equal ['Yīng', 'guó'], ZhongwenTools::Romanization::Pinyin.split_py('Yīngguó')
20
20
  assert_equal ['Xī', 'ní'], ZhongwenTools::Romanization::Pinyin.split_py('Xīní')
21
- assert_equal ['bàn', 'gōng', 'lóu'], ZhongwenTools::Romanization::Pinyin.split_py('bàngōnglóu')
21
+ assert_equal %w(bàn gōng lóu), ZhongwenTools::Romanization::Pinyin.split_py('bàngōnglóu')
22
22
  assert_equal ['jì', 'nǚ'], ZhongwenTools::Romanization::Pinyin.split_py('jìnǚ')
23
23
  assert_equal ['sè', 'guǐ'], ZhongwenTools::Romanization::Pinyin.split_py('sèguǐ')
24
24
  assert_equal ['qǔ', 'nuǎn'], ZhongwenTools::Romanization::Pinyin.split_py('qǔnuǎn')
25
- assert_equal ['wán', '', 'r'], ZhongwenTools::Romanization::Pinyin.split_py('wányìr')
26
- assert_equal ['yīng', "ér"], ZhongwenTools::Romanization::Pinyin.split_py("yīng'ér")
25
+ assert_equal %w(wán yì r), ZhongwenTools::Romanization::Pinyin.split_py('wányìr')
26
+ assert_equal ['yīng', 'ér'], ZhongwenTools::Romanization::Pinyin.split_py("yīng'ér")
27
+ assert_equal ['xiǎn', 'gù'], ZhongwenTools::Romanization::Pinyin.split_py('xiǎngù')
28
+ assert_equal ['nián', 'gāo'], ZhongwenTools::Romanization::Pinyin.split_py('niángāo')
29
+ assert_equal %w(fú shè néng), ZhongwenTools::Romanization::Pinyin.split_py('fúshènéng')
30
+ assert_equal ['sān', 'gēng'], ZhongwenTools::Romanization::Pinyin.split_py('sāngēng')
27
31
  end
28
32
 
29
33
  def test_py?
@@ -84,17 +88,19 @@ class TestPinyin < Minitest::Test
84
88
 
85
89
  assert_equal 'yi2ge4', ZhongwenTools::Romanization::Pinyin.to_pyn('yígè')
86
90
  assert_equal 'yi4nian2', ZhongwenTools::Romanization::Pinyin.to_pyn('yìnián', :py)
91
+ assert_equal 'hei1hu1hu1', ZhongwenTools::Romanization::Pinyin.to_pyn('hēihūhū', :py)
92
+ assert_equal '"Zheng4qie1"', ZhongwenTools::Romanization::Pinyin.to_pyn('"Zhèngqiē"', :py)
87
93
  end
88
94
 
89
95
  def setup
90
96
  @hyphenated_words = [
91
- { :pyn => 'A1-la1-bo2', :py => 'Ālābó' },
92
- { :pyn => 'Mao2 Ze2-dong1', :py => 'Máo Zédōng' }
97
+ { pyn: 'A1-la1-bo2', py: 'Ālābó' },
98
+ { pyn: 'Mao2 Ze2-dong1', py: 'Máo Zédōng' }
93
99
  ]
94
100
 
95
101
  @split_words = [
96
- { :pyn => 'A1-la1-bo2', :py => 'Ālābó', :split => %w(A1 la1 bo2), split_py: %w(Ā lā bó) },
97
- { :pyn => 'Mao2 Ze2-dong1', :py => 'Máo Zédōng', :split => %w(Mao2 Ze2 dong1), :split_py => %w(Máo Zé dōng) }
102
+ { pyn: 'A1-la1-bo2', py: 'Ālābó', split: %w(A1 la1 bo2), split_py: %w(Ā lā bó) },
103
+ { pyn: 'Mao2 Ze2-dong1', py: 'Máo Zédōng', split: %w(Mao2 Ze2 dong1), split_py: %w(Máo Zé dōng) }
98
104
  ]
99
105
 
100
106
  @syllabic_nasals = [
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
- $:.push File.expand_path('../lib', __FILE__)
2
+ $LOAD_PATH.push File.expand_path('../lib', __FILE__)
3
3
  require 'zhongwen_tools/version'
4
4
 
5
5
  Gem::Specification.new do |s|
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: zhongwen_tools
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.18.1
4
+ version: 0.18.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Steven Daniels
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-04-02 00:00:00.000000000 Z
11
+ date: 2015-04-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake