zhongwen_tools 0.18.1 → 0.18.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/zhongwen_tools/caps.rb +66 -66
- data/lib/zhongwen_tools/romanization/pinyin.rb +27 -20
- data/lib/zhongwen_tools/romanization.rb +8 -8
- data/lib/zhongwen_tools/version.rb +1 -1
- data/test/test_caps.rb +1 -1
- data/test/test_pinyin.rb +14 -8
- data/zhongwen_tools.gemspec +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 47bccf56d8e66407103478019b7b7e7355a493c8
|
|
4
|
+
data.tar.gz: 355e9aa4f41356610290d76461991274d563a9ad
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: f018708bf11c96460191d42aeb1f2734708936063d05d1f55fa745d6ccf5a5e1085a5e47ff542c1387ff57b5d07700c4a723bb0f2aa05f810e6d51aced91e2d1
|
|
7
|
+
data.tar.gz: a2edcea2042ba9236a295800375618c4efc0b74d11713797bcca9e475d6a36dcd85157b31e15ca8499da8ae598aef8d8899d3198586bf8b4382ba0bd349e5404
|
data/README.md
CHANGED
|
@@ -5,7 +5,7 @@ Methods for dealing with Chinese.
|
|
|
5
5
|
Status](https://img.shields.io/travis/stevendaniels/zhongwen_tools.svg?style=flat-square)](https://travis-ci.org/stevendaniels/zhongwen_tools) [](https://gemnasium.com/stevendaniels/zhongwen_tools) [](https://codeclimate.com/github/stevendaniels/zhongwen_tools) [](https://coveralls.io/r/stevendaniels/zhongwen_tools)
|
|
6
6
|
[](http://badge.fury.io/rb/zhongwen_tools)
|
|
7
7
|
|
|
8
|
-
##
|
|
8
|
+
##INSTALLATION
|
|
9
9
|
|
|
10
10
|
Install as a gem
|
|
11
11
|
|
data/lib/zhongwen_tools/caps.rb
CHANGED
|
@@ -1,74 +1,74 @@
|
|
|
1
1
|
# encoding: utf-8
|
|
2
|
-
|
|
3
2
|
module ZhongwenTools
|
|
3
|
+
# Public: Module for pinyin/fullwidth capitalization
|
|
4
4
|
module Caps
|
|
5
|
+
def self.downcase(str)
|
|
6
|
+
regex = /(#{ZhongwenTools::Caps::CAPS.keys.join('|')})/
|
|
7
|
+
str.gsub(regex, ZhongwenTools::Caps::CAPS).downcase
|
|
8
|
+
end
|
|
5
9
|
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
+
def self.upcase(str)
|
|
11
|
+
str.gsub(/(#{ZhongwenTools::Caps::CAPS.values.join('|')})/) do
|
|
12
|
+
ZhongwenTools::Caps::CAPS.find { |_, v| v == Regexp.last_match[0] }[0]
|
|
13
|
+
end.upcase
|
|
14
|
+
end
|
|
10
15
|
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
ZhongwenTools::Caps
|
|
14
|
-
|
|
15
|
-
end
|
|
16
|
-
|
|
17
|
-
def self.capitalize(str)
|
|
18
|
-
str.sub(str[0], ZhongwenTools::Caps.upcase(str[0]))
|
|
19
|
-
end
|
|
16
|
+
def self.capitalize(str)
|
|
17
|
+
first_letter = str[/#{Regex.py}|[ĀÁǍÀĒÉĚÈĪÍǏÌŌÓǑÒ]/][0]
|
|
18
|
+
str.sub(first_letter, ZhongwenTools::Caps.upcase(first_letter))
|
|
19
|
+
end
|
|
20
20
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
21
|
+
CAPS = {
|
|
22
|
+
'Ā' => 'ā',
|
|
23
|
+
'Á' => 'á',
|
|
24
|
+
'Ǎ' => 'ǎ',
|
|
25
|
+
'À' => 'à',
|
|
26
|
+
'Ē' => 'ē',
|
|
27
|
+
'É' => 'é',
|
|
28
|
+
'Ě' => 'ě',
|
|
29
|
+
'È' => 'è',
|
|
30
|
+
'Ī' => 'ī',
|
|
31
|
+
'Í' => 'í',
|
|
32
|
+
'Ǐ' => 'ǐ',
|
|
33
|
+
'Ì' => 'ì',
|
|
34
|
+
'Ō' => 'ō',
|
|
35
|
+
'Ó' => 'ó',
|
|
36
|
+
'Ǒ' => 'ǒ',
|
|
37
|
+
'Ò' => 'ò',
|
|
38
|
+
'Ǖ' => 'ǖ', # using combining diatrical marks
|
|
39
|
+
'Ǘ' => 'ǘ', # using combining diatrical marks
|
|
40
|
+
'Ǚ' => 'ǚ', # using combining diatrical marks
|
|
41
|
+
'Ǜ' => 'ǜ', # using combining diatrical marks
|
|
42
|
+
'Ū' => 'ū',
|
|
43
|
+
'Ú' => 'ú',
|
|
44
|
+
'Ǔ' => 'ǔ',
|
|
45
|
+
'Ù' => 'ù',
|
|
46
|
+
'A' => 'a',
|
|
47
|
+
'B' => 'b',
|
|
48
|
+
'C' => 'c',
|
|
49
|
+
'D' => 'd',
|
|
50
|
+
'E' => 'e',
|
|
51
|
+
'F' => 'f',
|
|
52
|
+
'G' => 'g',
|
|
53
|
+
'H' => 'h',
|
|
54
|
+
'I' => 'i',
|
|
55
|
+
'J' => 'j',
|
|
56
|
+
'K' => 'k',
|
|
57
|
+
'L' => 'l',
|
|
58
|
+
'M' => 'm',
|
|
59
|
+
'N' => 'n',
|
|
60
|
+
'O' => 'o',
|
|
61
|
+
'P' => 'p',
|
|
62
|
+
'Q' => 'q',
|
|
63
|
+
'R' => 'r',
|
|
64
|
+
'S' => 's',
|
|
65
|
+
'T' => 't',
|
|
66
|
+
'U' => 'u',
|
|
67
|
+
'V' => 'v',
|
|
68
|
+
'W' => 'w',
|
|
69
|
+
'X' => 'x',
|
|
70
|
+
'Y' => 'y',
|
|
71
|
+
'Z' => 'z'
|
|
72
|
+
}
|
|
73
73
|
end
|
|
74
74
|
end
|
|
@@ -39,9 +39,9 @@ module ZhongwenTools
|
|
|
39
39
|
|
|
40
40
|
def self.split_pyn(str)
|
|
41
41
|
# FIXME: ignore punctuation
|
|
42
|
-
regex = str[/[1-5]/].nil? ?
|
|
42
|
+
regex = str[/[1-5]/].nil? ? /(#{ Regex.pinyin_toneless })/ : /(#{ Regex.pyn }|#{ Regex.pinyin_toneless })/
|
|
43
43
|
# NOTE: p[/[^\-]*/].to_s is 25% faster than gsub('-', '')
|
|
44
|
-
str.scan(regex).map{ |arr| arr[0].strip[/[^\-]*/].to_s }.flatten
|
|
44
|
+
str.scan(regex).map { |arr| arr[0].strip[/[^\-]*/].to_s }.flatten
|
|
45
45
|
end
|
|
46
46
|
|
|
47
47
|
def self.split_py(str)
|
|
@@ -49,11 +49,8 @@ module ZhongwenTools
|
|
|
49
49
|
|
|
50
50
|
results = words.map do |word|
|
|
51
51
|
word, is_capitalized = normalize_pinyin(word)
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
# split_py("Xīní") # => ["Xī", "ní"]
|
|
55
|
-
word = word.gsub(/(n)(g(#{ Regex.py_tones['o'] }|u))/){ "#{ $1 }-#{ $2 }" }
|
|
56
|
-
word = word.gsub(/([#{ Regex.only_tones }])(n(#{ Regex.py_tones['v'] }|#{ Regex.py_tones['i'] }|[iu][#{ Regex.py_tones['a'] }]))/){ "#{ $1 }-#{ $2 }" }
|
|
52
|
+
word = normalize_n_g(word)
|
|
53
|
+
word = normalize_n(word)
|
|
57
54
|
result = word.split(/['\-]/).flatten.map do |x|
|
|
58
55
|
find_py(x)
|
|
59
56
|
end
|
|
@@ -135,13 +132,29 @@ module ZhongwenTools
|
|
|
135
132
|
{ pyn: :pyn, py: :py, pinyin: :py }[romanization]
|
|
136
133
|
end
|
|
137
134
|
|
|
135
|
+
# NOTE: Special Case split_py("fǎnguāng") # => ["fǎn" + "guāng"]
|
|
136
|
+
# In pinyin, sāngēng == sān gēng and sāng'ēng = sāng ēng
|
|
137
|
+
def self.normalize_n_g(pinyin)
|
|
138
|
+
regex = /(?<n_part>n)(?<g_part>g(#{Regex.py_tones['o']}|#{Regex.py_tones['u']}|#{Regex.py_tones['a']}|#{Regex.py_tones['e']}))/
|
|
139
|
+
pinyin.gsub(regex) do
|
|
140
|
+
"#{Regexp.last_match[:n_part]}-#{Regexp.last_match[:g_part]}"
|
|
141
|
+
end
|
|
142
|
+
end
|
|
143
|
+
|
|
144
|
+
def self.normalize_n(pinyin)
|
|
145
|
+
# Special Case split_py("yìnián") # => ["yì" + "nián"]
|
|
146
|
+
# split_py("Xīní") # => ["Xī", "ní"]
|
|
147
|
+
regex = /([#{ Regex.only_tones }])(n(#{Regex.py_tones['v']}|#{Regex.py_tones['i']}|[iu]|#{Regex.py_tones['e']}|[#{Regex.py_tones['a']}]))/
|
|
148
|
+
pinyin.gsub(regex) { "#{ $1 }-#{ $2 }" }
|
|
149
|
+
end
|
|
150
|
+
|
|
138
151
|
def self.normalize_pinyin(pinyin)
|
|
139
152
|
[Caps.downcase(pinyin), capitalized?(pinyin)]
|
|
140
153
|
end
|
|
141
154
|
|
|
142
155
|
def self.find_py(str)
|
|
143
156
|
regex = /(#{ Regex.py }|#{ Regex.py_syllabic_nasals })/
|
|
144
|
-
str.scan(regex).map{ |x| x.compact[0] }
|
|
157
|
+
str.scan(regex).map { |x| x.compact[0] }
|
|
145
158
|
end
|
|
146
159
|
|
|
147
160
|
def self.recapitalize(obj, capitalized)
|
|
@@ -179,21 +192,15 @@ module ZhongwenTools
|
|
|
179
192
|
end
|
|
180
193
|
|
|
181
194
|
def self.capitalized?(str)
|
|
182
|
-
str[
|
|
195
|
+
first_letter = str[/#{Regex.py}|[ĀÁǍÀĒÉĚÈĪÍǏÌŌÓǑÒ]|#{Regex.py_syllabic_nasals}/][0]
|
|
196
|
+
|
|
197
|
+
first_letter != Caps.downcase(first_letter)
|
|
183
198
|
end
|
|
184
199
|
|
|
185
200
|
def self.current_pyn(pyn, pinyin_arr)
|
|
186
|
-
replacements = []
|
|
187
|
-
|
|
188
201
|
pinyin_arr.each do |pinyin|
|
|
189
202
|
replace = pinyin_replacement(pinyin)
|
|
190
|
-
|
|
191
|
-
if replacements.size > 0
|
|
192
|
-
pyn = pyn.sub(/(#{ replacements.join('.*') }.*)#{ match }/){ $1 + replace }
|
|
193
|
-
else
|
|
194
|
-
pyn = pyn.sub(/#{match}/){ "#{ $1 }#{ replace }" }
|
|
195
|
-
end
|
|
196
|
-
replacements << replace
|
|
203
|
+
pyn.sub!(pinyin, replace)
|
|
197
204
|
end
|
|
198
205
|
|
|
199
206
|
pyn.gsub("'", '')
|
|
@@ -205,14 +212,14 @@ module ZhongwenTools
|
|
|
205
212
|
end
|
|
206
213
|
|
|
207
214
|
match = select_pinyin_match(matches)
|
|
208
|
-
replace = PYN_PY.find{ |k, v| k if v == match }[0]
|
|
215
|
+
replace = PYN_PY.find { |k, v| k if v == match }[0]
|
|
209
216
|
|
|
210
217
|
py.gsub(match, replace).gsub(/([^\d ]*)(\d)([^\d ]*)/){ $1 + $3 + $2 }
|
|
211
218
|
end
|
|
212
219
|
|
|
213
220
|
def self.select_pinyin_match(matches)
|
|
214
221
|
# take the longest pinyin match. Use bytes because 'è' is prefered over 'n' or 'r' or 'm'
|
|
215
|
-
match = matches.sort{ |x, y| x.bytes.to_a.length <=> y.bytes.to_a.length }[-1]
|
|
222
|
+
match = matches.sort { |x, y| x.bytes.to_a.length <=> y.bytes.to_a.length }[-1]
|
|
216
223
|
|
|
217
224
|
# Edge case.. en/eng pyn -> py conversion is one way only.
|
|
218
225
|
match[/^(ē|é|ě|è|e)n?g?/].nil? ? match : match.chars[0]
|
|
@@ -68,19 +68,19 @@ module ZhongwenTools
|
|
|
68
68
|
type ||= romanization?(str)
|
|
69
69
|
|
|
70
70
|
if type == :py
|
|
71
|
-
|
|
71
|
+
ZhongwenTools::Romanization::Pinyin.split_py(str)
|
|
72
72
|
elsif type == :pyn
|
|
73
|
-
|
|
73
|
+
ZhongwenTools::Romanization::Pinyin.split_pyn(str)
|
|
74
74
|
elsif type == :bpmf
|
|
75
|
-
|
|
75
|
+
ZhongwenTools::Romanization::ZhuyinFuhao.split(str)
|
|
76
76
|
elsif type == :wg
|
|
77
|
-
|
|
77
|
+
ZhongwenTools::Romanization::WadeGiles.split(str)
|
|
78
78
|
elsif type == :typy
|
|
79
|
-
|
|
79
|
+
ZhongwenTools::Romanization::TongyongPinyin.split(str)
|
|
80
80
|
elsif type == :yale
|
|
81
|
-
|
|
81
|
+
ZhongwenTools::Romanization::Yale.split(str)
|
|
82
82
|
elsif type == :mps2
|
|
83
|
-
|
|
83
|
+
ZhongwenTools::Romanization::MPS2.split(str)
|
|
84
84
|
end
|
|
85
85
|
end
|
|
86
86
|
|
|
@@ -88,7 +88,7 @@ module ZhongwenTools
|
|
|
88
88
|
|
|
89
89
|
def self.detect_romanization(str, regex)
|
|
90
90
|
normalized_str = str.downcase.gsub(ZhongwenTools::Regex.punc, '').gsub(/[1-5\s\-']/, '')
|
|
91
|
-
#TODO: ignore tonal marks from other systems wade giles, tongyong etc.
|
|
91
|
+
# TODO: ignore tonal marks from other systems wade giles, tongyong etc.
|
|
92
92
|
|
|
93
93
|
normalized_str.scan(regex).join == normalized_str
|
|
94
94
|
end
|
data/test/test_caps.rb
CHANGED
|
@@ -17,10 +17,10 @@ class TestCaps < Minitest::Test
|
|
|
17
17
|
|
|
18
18
|
def test_capitalize
|
|
19
19
|
assert_equal @caps[:c], ZhongwenTools::Caps.capitalize(@caps[:d])
|
|
20
|
+
assert_equal '"Zheng4qie1"', ZhongwenTools::Caps.capitalize('"Zheng4qie1"')
|
|
20
21
|
end
|
|
21
22
|
|
|
22
23
|
def setup
|
|
23
24
|
@caps = { u: 'ĀLĀBÓ', d: 'ālābó', c: 'Ālābó' }
|
|
24
25
|
end
|
|
25
26
|
end
|
|
26
|
-
|
data/test/test_pinyin.rb
CHANGED
|
@@ -15,15 +15,19 @@ class TestPinyin < Minitest::Test
|
|
|
15
15
|
assert_equal w[:split_py], ZhongwenTools::Romanization::Pinyin.split_py(w[:py])
|
|
16
16
|
end
|
|
17
17
|
|
|
18
|
-
assert_equal
|
|
18
|
+
assert_equal %w(fǎn guāng jìng), ZhongwenTools::Romanization::Pinyin.split_py('fǎnguāngjìng')
|
|
19
19
|
assert_equal ['Yīng', 'guó'], ZhongwenTools::Romanization::Pinyin.split_py('Yīngguó')
|
|
20
20
|
assert_equal ['Xī', 'ní'], ZhongwenTools::Romanization::Pinyin.split_py('Xīní')
|
|
21
|
-
assert_equal
|
|
21
|
+
assert_equal %w(bàn gōng lóu), ZhongwenTools::Romanization::Pinyin.split_py('bàngōnglóu')
|
|
22
22
|
assert_equal ['jì', 'nǚ'], ZhongwenTools::Romanization::Pinyin.split_py('jìnǚ')
|
|
23
23
|
assert_equal ['sè', 'guǐ'], ZhongwenTools::Romanization::Pinyin.split_py('sèguǐ')
|
|
24
24
|
assert_equal ['qǔ', 'nuǎn'], ZhongwenTools::Romanization::Pinyin.split_py('qǔnuǎn')
|
|
25
|
-
assert_equal
|
|
26
|
-
assert_equal ['yīng',
|
|
25
|
+
assert_equal %w(wán yì r), ZhongwenTools::Romanization::Pinyin.split_py('wányìr')
|
|
26
|
+
assert_equal ['yīng', 'ér'], ZhongwenTools::Romanization::Pinyin.split_py("yīng'ér")
|
|
27
|
+
assert_equal ['xiǎn', 'gù'], ZhongwenTools::Romanization::Pinyin.split_py('xiǎngù')
|
|
28
|
+
assert_equal ['nián', 'gāo'], ZhongwenTools::Romanization::Pinyin.split_py('niángāo')
|
|
29
|
+
assert_equal %w(fú shè néng), ZhongwenTools::Romanization::Pinyin.split_py('fúshènéng')
|
|
30
|
+
assert_equal ['sān', 'gēng'], ZhongwenTools::Romanization::Pinyin.split_py('sāngēng')
|
|
27
31
|
end
|
|
28
32
|
|
|
29
33
|
def test_py?
|
|
@@ -84,17 +88,19 @@ class TestPinyin < Minitest::Test
|
|
|
84
88
|
|
|
85
89
|
assert_equal 'yi2ge4', ZhongwenTools::Romanization::Pinyin.to_pyn('yígè')
|
|
86
90
|
assert_equal 'yi4nian2', ZhongwenTools::Romanization::Pinyin.to_pyn('yìnián', :py)
|
|
91
|
+
assert_equal 'hei1hu1hu1', ZhongwenTools::Romanization::Pinyin.to_pyn('hēihūhū', :py)
|
|
92
|
+
assert_equal '"Zheng4qie1"', ZhongwenTools::Romanization::Pinyin.to_pyn('"Zhèngqiē"', :py)
|
|
87
93
|
end
|
|
88
94
|
|
|
89
95
|
def setup
|
|
90
96
|
@hyphenated_words = [
|
|
91
|
-
{ :
|
|
92
|
-
{ :
|
|
97
|
+
{ pyn: 'A1-la1-bo2', py: 'Ālābó' },
|
|
98
|
+
{ pyn: 'Mao2 Ze2-dong1', py: 'Máo Zédōng' }
|
|
93
99
|
]
|
|
94
100
|
|
|
95
101
|
@split_words = [
|
|
96
|
-
{ :
|
|
97
|
-
{ :
|
|
102
|
+
{ pyn: 'A1-la1-bo2', py: 'Ālābó', split: %w(A1 la1 bo2), split_py: %w(Ā lā bó) },
|
|
103
|
+
{ pyn: 'Mao2 Ze2-dong1', py: 'Máo Zédōng', split: %w(Mao2 Ze2 dong1), split_py: %w(Máo Zé dōng) }
|
|
98
104
|
]
|
|
99
105
|
|
|
100
106
|
@syllabic_nasals = [
|
data/zhongwen_tools.gemspec
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: zhongwen_tools
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.18.
|
|
4
|
+
version: 0.18.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Steven Daniels
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2015-04-
|
|
11
|
+
date: 2015-04-27 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rake
|