zhongwen_tools 0.18.1 → 0.18.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: abe0b5477d8f04c2cabfe1054feffa3d1994b9d2
4
- data.tar.gz: a14a4bc66804d0cbe0e3892ec48639384ae0f9bf
3
+ metadata.gz: 47bccf56d8e66407103478019b7b7e7355a493c8
4
+ data.tar.gz: 355e9aa4f41356610290d76461991274d563a9ad
5
5
  SHA512:
6
- metadata.gz: 34c8c883922b2e7cf314a6866bb54f6bc4462225492449699e70f65ccf1bd364be3cf9988a2d18edd08be1103703831e81de999dfb31a87c14dad5e1ccabaf8a
7
- data.tar.gz: 23c65688e09fa36a15ef0c2add20bee70c0ce167072f73a043fc68bd34b00b5475cada95909956643855a3bb57c1f36f52e8dadb6aa7f40d85816f443f22f9df
6
+ metadata.gz: f018708bf11c96460191d42aeb1f2734708936063d05d1f55fa745d6ccf5a5e1085a5e47ff542c1387ff57b5d07700c4a723bb0f2aa05f810e6d51aced91e2d1
7
+ data.tar.gz: a2edcea2042ba9236a295800375618c4efc0b74d11713797bcca9e475d6a36dcd85157b31e15ca8499da8ae598aef8d8899d3198586bf8b4382ba0bd349e5404
data/README.md CHANGED
@@ -5,7 +5,7 @@ Methods for dealing with Chinese.
5
5
  Status](https://img.shields.io/travis/stevendaniels/zhongwen_tools.svg?style=flat-square)](https://travis-ci.org/stevendaniels/zhongwen_tools) [![Dependency Status](https://img.shields.io/gemnasium/stevendaniels/zhongwen_tools.svg?style=flat-square)](https://gemnasium.com/stevendaniels/zhongwen_tools) [![Code Climate](https://img.shields.io/codeclimate/github/stevendaniels/zhongwen_tools.svg?style=flat-square)](https://codeclimate.com/github/stevendaniels/zhongwen_tools) [![Coverage Status](https://img.shields.io/coveralls/stevendaniels/zhongwen_tools.svg?style=flat-square)](https://coveralls.io/r/stevendaniels/zhongwen_tools)
6
6
  [![Gem Version](https://img.shields.io/gem/v/zhongwen_tools.svg?style=flat-square)](http://badge.fury.io/rb/zhongwen_tools)
7
7
 
8
- ## Installation
8
+ ##INSTALLATION
9
9
 
10
10
  Install as a gem
11
11
 
@@ -1,74 +1,74 @@
1
1
  # encoding: utf-8
2
-
3
2
  module ZhongwenTools
3
+ # Public: Module for pinyin/fullwidth capitalization
4
4
  module Caps
5
+ def self.downcase(str)
6
+ regex = /(#{ZhongwenTools::Caps::CAPS.keys.join('|')})/
7
+ str.gsub(regex, ZhongwenTools::Caps::CAPS).downcase
8
+ end
5
9
 
6
- def self.downcase(str)
7
- regex = /(#{ZhongwenTools::Caps::CAPS.keys.join('|')})/
8
- str.gsub(regex, ZhongwenTools::Caps::CAPS).downcase
9
- end
10
+ def self.upcase(str)
11
+ str.gsub(/(#{ZhongwenTools::Caps::CAPS.values.join('|')})/) do
12
+ ZhongwenTools::Caps::CAPS.find { |_, v| v == Regexp.last_match[0] }[0]
13
+ end.upcase
14
+ end
10
15
 
11
- def self.upcase(str)
12
- str.gsub(/(#{ZhongwenTools::Caps::CAPS.values.join('|')})/){
13
- ZhongwenTools::Caps::CAPS.find{ |k, v| v == $1 }[0]
14
- }.upcase
15
- end
16
-
17
- def self.capitalize(str)
18
- str.sub(str[0], ZhongwenTools::Caps.upcase(str[0]))
19
- end
16
+ def self.capitalize(str)
17
+ first_letter = str[/#{Regex.py}|[ĀÁǍÀĒÉĚÈĪÍǏÌŌÓǑÒ]/][0]
18
+ str.sub(first_letter, ZhongwenTools::Caps.upcase(first_letter))
19
+ end
20
20
 
21
- CAPS = {
22
- 'Ā' => 'ā',
23
- 'Á' => 'á',
24
- 'Ǎ' => 'ǎ',
25
- 'À' => 'à',
26
- 'Ē' => 'ē',
27
- 'É' => 'é',
28
- 'Ě' => 'ě',
29
- 'È' => 'è',
30
- 'Ī' => 'ī',
31
- 'Í' => 'í',
32
- 'Ǐ' => 'ǐ',
33
- 'Ì' => 'ì',
34
- 'Ō' => 'ō',
35
- 'Ó' => 'ó',
36
- 'Ǒ' => 'ǒ',
37
- 'Ò' => 'ò',
38
- 'Ǖ' => 'ǖ', # using combining diatrical marks
39
- 'Ǘ' => 'ǘ', # using combining diatrical marks
40
- 'Ǚ' => 'ǚ', # using combining diatrical marks
41
- 'Ǜ' => 'ǜ', # using combining diatrical marks
42
- 'Ū' => 'ū',
43
- 'Ú' => 'ú',
44
- 'Ǔ' => 'ǔ',
45
- 'Ù' => 'ù',
46
- 'A' => 'a',
47
- 'B' => 'b',
48
- 'C' => 'c',
49
- 'D' => 'd',
50
- 'E' => 'e',
51
- 'F' => 'f',
52
- 'G' => 'g',
53
- 'H' => 'h',
54
- 'I' => 'i',
55
- 'J' => 'j',
56
- 'K' => 'k',
57
- 'L' => 'l',
58
- 'M' => 'm',
59
- 'N' => 'n',
60
- 'O' => 'o',
61
- 'P' => 'p',
62
- 'Q' => 'q',
63
- 'R' => 'r',
64
- 'S' => 's',
65
- 'T' => 't',
66
- 'U' => 'u',
67
- 'V' => 'v',
68
- 'W' => 'w',
69
- 'X' => 'x',
70
- 'Y' => 'y',
71
- 'Z' => 'z'
72
- }
21
+ CAPS = {
22
+ 'Ā' => 'ā',
23
+ 'Á' => 'á',
24
+ 'Ǎ' => 'ǎ',
25
+ 'À' => 'à',
26
+ 'Ē' => 'ē',
27
+ 'É' => 'é',
28
+ 'Ě' => 'ě',
29
+ 'È' => 'è',
30
+ 'Ī' => 'ī',
31
+ 'Í' => 'í',
32
+ 'Ǐ' => 'ǐ',
33
+ 'Ì' => 'ì',
34
+ 'Ō' => 'ō',
35
+ 'Ó' => 'ó',
36
+ 'Ǒ' => 'ǒ',
37
+ 'Ò' => 'ò',
38
+ 'Ǖ' => 'ǖ', # using combining diatrical marks
39
+ 'Ǘ' => 'ǘ', # using combining diatrical marks
40
+ 'Ǚ' => 'ǚ', # using combining diatrical marks
41
+ 'Ǜ' => 'ǜ', # using combining diatrical marks
42
+ 'Ū' => 'ū',
43
+ 'Ú' => 'ú',
44
+ 'Ǔ' => 'ǔ',
45
+ 'Ù' => 'ù',
46
+ 'A' => 'a',
47
+ 'B' => 'b',
48
+ 'C' => 'c',
49
+ 'D' => 'd',
50
+ 'E' => 'e',
51
+ 'F' => 'f',
52
+ 'G' => 'g',
53
+ 'H' => 'h',
54
+ 'I' => 'i',
55
+ 'J' => 'j',
56
+ 'K' => 'k',
57
+ 'L' => 'l',
58
+ 'M' => 'm',
59
+ 'N' => 'n',
60
+ 'O' => 'o',
61
+ 'P' => 'p',
62
+ 'Q' => 'q',
63
+ 'R' => 'r',
64
+ 'S' => 's',
65
+ 'T' => 't',
66
+ 'U' => 'u',
67
+ 'V' => 'v',
68
+ 'W' => 'w',
69
+ 'X' => 'x',
70
+ 'Y' => 'y',
71
+ 'Z' => 'z'
72
+ }
73
73
  end
74
74
  end
@@ -39,9 +39,9 @@ module ZhongwenTools
39
39
 
40
40
  def self.split_pyn(str)
41
41
  # FIXME: ignore punctuation
42
- regex = str[/[1-5]/].nil? ? /(#{ Regex.pinyin_toneless })/ : /(#{ Regex.pyn }|#{ Regex.pinyin_toneless })/
42
+ regex = str[/[1-5]/].nil? ? /(#{ Regex.pinyin_toneless })/ : /(#{ Regex.pyn }|#{ Regex.pinyin_toneless })/
43
43
  # NOTE: p[/[^\-]*/].to_s is 25% faster than gsub('-', '')
44
- str.scan(regex).map{ |arr| arr[0].strip[/[^\-]*/].to_s }.flatten
44
+ str.scan(regex).map { |arr| arr[0].strip[/[^\-]*/].to_s }.flatten
45
45
  end
46
46
 
47
47
  def self.split_py(str)
@@ -49,11 +49,8 @@ module ZhongwenTools
49
49
 
50
50
  results = words.map do |word|
51
51
  word, is_capitalized = normalize_pinyin(word)
52
- # NOTE: Special Case split_py("fǎnguāng") # => ["fǎn" + "guāng"]
53
- # Special Case split_py("yìnián") # => ["yì" + "nián"]
54
- # split_py("Xīní") # => ["Xī", "ní"]
55
- word = word.gsub(/(n)(g(#{ Regex.py_tones['o'] }|u))/){ "#{ $1 }-#{ $2 }" }
56
- word = word.gsub(/([#{ Regex.only_tones }])(n(#{ Regex.py_tones['v'] }|#{ Regex.py_tones['i'] }|[iu][#{ Regex.py_tones['a'] }]))/){ "#{ $1 }-#{ $2 }" }
52
+ word = normalize_n_g(word)
53
+ word = normalize_n(word)
57
54
  result = word.split(/['\-]/).flatten.map do |x|
58
55
  find_py(x)
59
56
  end
@@ -135,13 +132,29 @@ module ZhongwenTools
135
132
  { pyn: :pyn, py: :py, pinyin: :py }[romanization]
136
133
  end
137
134
 
135
+ # NOTE: Special Case split_py("fǎnguāng") # => ["fǎn" + "guāng"]
136
+ # In pinyin, sāngēng == sān gēng and sāng'ēng = sāng ēng
137
+ def self.normalize_n_g(pinyin)
138
+ regex = /(?<n_part>n)(?<g_part>g(#{Regex.py_tones['o']}|#{Regex.py_tones['u']}|#{Regex.py_tones['a']}|#{Regex.py_tones['e']}))/
139
+ pinyin.gsub(regex) do
140
+ "#{Regexp.last_match[:n_part]}-#{Regexp.last_match[:g_part]}"
141
+ end
142
+ end
143
+
144
+ def self.normalize_n(pinyin)
145
+ # Special Case split_py("yìnián") # => ["yì" + "nián"]
146
+ # split_py("Xīní") # => ["Xī", "ní"]
147
+ regex = /([#{ Regex.only_tones }])(n(#{Regex.py_tones['v']}|#{Regex.py_tones['i']}|[iu]|#{Regex.py_tones['e']}|[#{Regex.py_tones['a']}]))/
148
+ pinyin.gsub(regex) { "#{ $1 }-#{ $2 }" }
149
+ end
150
+
138
151
  def self.normalize_pinyin(pinyin)
139
152
  [Caps.downcase(pinyin), capitalized?(pinyin)]
140
153
  end
141
154
 
142
155
  def self.find_py(str)
143
156
  regex = /(#{ Regex.py }|#{ Regex.py_syllabic_nasals })/
144
- str.scan(regex).map{ |x| x.compact[0] }
157
+ str.scan(regex).map { |x| x.compact[0] }
145
158
  end
146
159
 
147
160
  def self.recapitalize(obj, capitalized)
@@ -179,21 +192,15 @@ module ZhongwenTools
179
192
  end
180
193
 
181
194
  def self.capitalized?(str)
182
- str[0] != Caps.downcase(str[0])
195
+ first_letter = str[/#{Regex.py}|[ĀÁǍÀĒÉĚÈĪÍǏÌŌÓǑÒ]|#{Regex.py_syllabic_nasals}/][0]
196
+
197
+ first_letter != Caps.downcase(first_letter)
183
198
  end
184
199
 
185
200
  def self.current_pyn(pyn, pinyin_arr)
186
- replacements = []
187
-
188
201
  pinyin_arr.each do |pinyin|
189
202
  replace = pinyin_replacement(pinyin)
190
- match = pinyin
191
- if replacements.size > 0
192
- pyn = pyn.sub(/(#{ replacements.join('.*') }.*)#{ match }/){ $1 + replace }
193
- else
194
- pyn = pyn.sub(/#{match}/){ "#{ $1 }#{ replace }" }
195
- end
196
- replacements << replace
203
+ pyn.sub!(pinyin, replace)
197
204
  end
198
205
 
199
206
  pyn.gsub("'", '')
@@ -205,14 +212,14 @@ module ZhongwenTools
205
212
  end
206
213
 
207
214
  match = select_pinyin_match(matches)
208
- replace = PYN_PY.find{ |k, v| k if v == match }[0]
215
+ replace = PYN_PY.find { |k, v| k if v == match }[0]
209
216
 
210
217
  py.gsub(match, replace).gsub(/([^\d ]*)(\d)([^\d ]*)/){ $1 + $3 + $2 }
211
218
  end
212
219
 
213
220
  def self.select_pinyin_match(matches)
214
221
  # take the longest pinyin match. Use bytes because 'è' is prefered over 'n' or 'r' or 'm'
215
- match = matches.sort{ |x, y| x.bytes.to_a.length <=> y.bytes.to_a.length }[-1]
222
+ match = matches.sort { |x, y| x.bytes.to_a.length <=> y.bytes.to_a.length }[-1]
216
223
 
217
224
  # Edge case.. en/eng pyn -> py conversion is one way only.
218
225
  match[/^(ē|é|ě|è|e)n?g?/].nil? ? match : match.chars[0]
@@ -68,19 +68,19 @@ module ZhongwenTools
68
68
  type ||= romanization?(str)
69
69
 
70
70
  if type == :py
71
- ZhongwenTools::Romanization::Pinyin.split_py(str)
71
+ ZhongwenTools::Romanization::Pinyin.split_py(str)
72
72
  elsif type == :pyn
73
- ZhongwenTools::Romanization::Pinyin.split_pyn(str)
73
+ ZhongwenTools::Romanization::Pinyin.split_pyn(str)
74
74
  elsif type == :bpmf
75
- ZhongwenTools::Romanization::ZhuyinFuhao.split(str)
75
+ ZhongwenTools::Romanization::ZhuyinFuhao.split(str)
76
76
  elsif type == :wg
77
- ZhongwenTools::Romanization::WadeGiles.split(str)
77
+ ZhongwenTools::Romanization::WadeGiles.split(str)
78
78
  elsif type == :typy
79
- ZhongwenTools::Romanization::TongyongPinyin.split(str)
79
+ ZhongwenTools::Romanization::TongyongPinyin.split(str)
80
80
  elsif type == :yale
81
- ZhongwenTools::Romanization::Yale.split(str)
81
+ ZhongwenTools::Romanization::Yale.split(str)
82
82
  elsif type == :mps2
83
- ZhongwenTools::Romanization::MPS2.split(str)
83
+ ZhongwenTools::Romanization::MPS2.split(str)
84
84
  end
85
85
  end
86
86
 
@@ -88,7 +88,7 @@ module ZhongwenTools
88
88
 
89
89
  def self.detect_romanization(str, regex)
90
90
  normalized_str = str.downcase.gsub(ZhongwenTools::Regex.punc, '').gsub(/[1-5\s\-']/, '')
91
- #TODO: ignore tonal marks from other systems wade giles, tongyong etc.
91
+ # TODO: ignore tonal marks from other systems wade giles, tongyong etc.
92
92
 
93
93
  normalized_str.scan(regex).join == normalized_str
94
94
  end
@@ -1,3 +1,3 @@
1
1
  module ZhongwenTools
2
- VERSION = '0.18.1'
2
+ VERSION = '0.18.2'
3
3
  end
data/test/test_caps.rb CHANGED
@@ -17,10 +17,10 @@ class TestCaps < Minitest::Test
17
17
 
18
18
  def test_capitalize
19
19
  assert_equal @caps[:c], ZhongwenTools::Caps.capitalize(@caps[:d])
20
+ assert_equal '"Zheng4qie1"', ZhongwenTools::Caps.capitalize('"Zheng4qie1"')
20
21
  end
21
22
 
22
23
  def setup
23
24
  @caps = { u: 'ĀLĀBÓ', d: 'ālābó', c: 'Ālābó' }
24
25
  end
25
26
  end
26
-
data/test/test_pinyin.rb CHANGED
@@ -15,15 +15,19 @@ class TestPinyin < Minitest::Test
15
15
  assert_equal w[:split_py], ZhongwenTools::Romanization::Pinyin.split_py(w[:py])
16
16
  end
17
17
 
18
- assert_equal ['fǎn', 'guāng', 'jìng'], ZhongwenTools::Romanization::Pinyin.split_py('fǎnguāngjìng')
18
+ assert_equal %w(fǎn guāng jìng), ZhongwenTools::Romanization::Pinyin.split_py('fǎnguāngjìng')
19
19
  assert_equal ['Yīng', 'guó'], ZhongwenTools::Romanization::Pinyin.split_py('Yīngguó')
20
20
  assert_equal ['Xī', 'ní'], ZhongwenTools::Romanization::Pinyin.split_py('Xīní')
21
- assert_equal ['bàn', 'gōng', 'lóu'], ZhongwenTools::Romanization::Pinyin.split_py('bàngōnglóu')
21
+ assert_equal %w(bàn gōng lóu), ZhongwenTools::Romanization::Pinyin.split_py('bàngōnglóu')
22
22
  assert_equal ['jì', 'nǚ'], ZhongwenTools::Romanization::Pinyin.split_py('jìnǚ')
23
23
  assert_equal ['sè', 'guǐ'], ZhongwenTools::Romanization::Pinyin.split_py('sèguǐ')
24
24
  assert_equal ['qǔ', 'nuǎn'], ZhongwenTools::Romanization::Pinyin.split_py('qǔnuǎn')
25
- assert_equal ['wán', '', 'r'], ZhongwenTools::Romanization::Pinyin.split_py('wányìr')
26
- assert_equal ['yīng', "ér"], ZhongwenTools::Romanization::Pinyin.split_py("yīng'ér")
25
+ assert_equal %w(wán yì r), ZhongwenTools::Romanization::Pinyin.split_py('wányìr')
26
+ assert_equal ['yīng', 'ér'], ZhongwenTools::Romanization::Pinyin.split_py("yīng'ér")
27
+ assert_equal ['xiǎn', 'gù'], ZhongwenTools::Romanization::Pinyin.split_py('xiǎngù')
28
+ assert_equal ['nián', 'gāo'], ZhongwenTools::Romanization::Pinyin.split_py('niángāo')
29
+ assert_equal %w(fú shè néng), ZhongwenTools::Romanization::Pinyin.split_py('fúshènéng')
30
+ assert_equal ['sān', 'gēng'], ZhongwenTools::Romanization::Pinyin.split_py('sāngēng')
27
31
  end
28
32
 
29
33
  def test_py?
@@ -84,17 +88,19 @@ class TestPinyin < Minitest::Test
84
88
 
85
89
  assert_equal 'yi2ge4', ZhongwenTools::Romanization::Pinyin.to_pyn('yígè')
86
90
  assert_equal 'yi4nian2', ZhongwenTools::Romanization::Pinyin.to_pyn('yìnián', :py)
91
+ assert_equal 'hei1hu1hu1', ZhongwenTools::Romanization::Pinyin.to_pyn('hēihūhū', :py)
92
+ assert_equal '"Zheng4qie1"', ZhongwenTools::Romanization::Pinyin.to_pyn('"Zhèngqiē"', :py)
87
93
  end
88
94
 
89
95
  def setup
90
96
  @hyphenated_words = [
91
- { :pyn => 'A1-la1-bo2', :py => 'Ālābó' },
92
- { :pyn => 'Mao2 Ze2-dong1', :py => 'Máo Zédōng' }
97
+ { pyn: 'A1-la1-bo2', py: 'Ālābó' },
98
+ { pyn: 'Mao2 Ze2-dong1', py: 'Máo Zédōng' }
93
99
  ]
94
100
 
95
101
  @split_words = [
96
- { :pyn => 'A1-la1-bo2', :py => 'Ālābó', :split => %w(A1 la1 bo2), split_py: %w(Ā lā bó) },
97
- { :pyn => 'Mao2 Ze2-dong1', :py => 'Máo Zédōng', :split => %w(Mao2 Ze2 dong1), :split_py => %w(Máo Zé dōng) }
102
+ { pyn: 'A1-la1-bo2', py: 'Ālābó', split: %w(A1 la1 bo2), split_py: %w(Ā lā bó) },
103
+ { pyn: 'Mao2 Ze2-dong1', py: 'Máo Zédōng', split: %w(Mao2 Ze2 dong1), split_py: %w(Máo Zé dōng) }
98
104
  ]
99
105
 
100
106
  @syllabic_nasals = [
@@ -1,5 +1,5 @@
1
1
  # -*- encoding: utf-8 -*-
2
- $:.push File.expand_path('../lib', __FILE__)
2
+ $LOAD_PATH.push File.expand_path('../lib', __FILE__)
3
3
  require 'zhongwen_tools/version'
4
4
 
5
5
  Gem::Specification.new do |s|
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: zhongwen_tools
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.18.1
4
+ version: 0.18.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Steven Daniels
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-04-02 00:00:00.000000000 Z
11
+ date: 2015-04-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake