zhongwen_tools 0.18.1 → 0.18.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/zhongwen_tools/caps.rb +66 -66
- data/lib/zhongwen_tools/romanization/pinyin.rb +27 -20
- data/lib/zhongwen_tools/romanization.rb +8 -8
- data/lib/zhongwen_tools/version.rb +1 -1
- data/test/test_caps.rb +1 -1
- data/test/test_pinyin.rb +14 -8
- data/zhongwen_tools.gemspec +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 47bccf56d8e66407103478019b7b7e7355a493c8
|
4
|
+
data.tar.gz: 355e9aa4f41356610290d76461991274d563a9ad
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f018708bf11c96460191d42aeb1f2734708936063d05d1f55fa745d6ccf5a5e1085a5e47ff542c1387ff57b5d07700c4a723bb0f2aa05f810e6d51aced91e2d1
|
7
|
+
data.tar.gz: a2edcea2042ba9236a295800375618c4efc0b74d11713797bcca9e475d6a36dcd85157b31e15ca8499da8ae598aef8d8899d3198586bf8b4382ba0bd349e5404
|
data/README.md
CHANGED
@@ -5,7 +5,7 @@ Methods for dealing with Chinese.
|
|
5
5
|
Status](https://img.shields.io/travis/stevendaniels/zhongwen_tools.svg?style=flat-square)](https://travis-ci.org/stevendaniels/zhongwen_tools) [![Dependency Status](https://img.shields.io/gemnasium/stevendaniels/zhongwen_tools.svg?style=flat-square)](https://gemnasium.com/stevendaniels/zhongwen_tools) [![Code Climate](https://img.shields.io/codeclimate/github/stevendaniels/zhongwen_tools.svg?style=flat-square)](https://codeclimate.com/github/stevendaniels/zhongwen_tools) [![Coverage Status](https://img.shields.io/coveralls/stevendaniels/zhongwen_tools.svg?style=flat-square)](https://coveralls.io/r/stevendaniels/zhongwen_tools)
|
6
6
|
[![Gem Version](https://img.shields.io/gem/v/zhongwen_tools.svg?style=flat-square)](http://badge.fury.io/rb/zhongwen_tools)
|
7
7
|
|
8
|
-
##
|
8
|
+
## INSTALLATION
|
9
9
|
|
10
10
|
Install as a gem
|
11
11
|
|
data/lib/zhongwen_tools/caps.rb
CHANGED
@@ -1,74 +1,74 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
-
|
3
2
|
module ZhongwenTools
|
3
|
+
# Public: Module for pinyin/fullwidth capitalization
|
4
4
|
module Caps
|
5
|
+
def self.downcase(str)
|
6
|
+
regex = /(#{ZhongwenTools::Caps::CAPS.keys.join('|')})/
|
7
|
+
str.gsub(regex, ZhongwenTools::Caps::CAPS).downcase
|
8
|
+
end
|
5
9
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
+
def self.upcase(str)
|
11
|
+
str.gsub(/(#{ZhongwenTools::Caps::CAPS.values.join('|')})/) do
|
12
|
+
ZhongwenTools::Caps::CAPS.find { |_, v| v == Regexp.last_match[0] }[0]
|
13
|
+
end.upcase
|
14
|
+
end
|
10
15
|
|
11
|
-
|
12
|
-
|
13
|
-
ZhongwenTools::Caps
|
14
|
-
|
15
|
-
end
|
16
|
-
|
17
|
-
def self.capitalize(str)
|
18
|
-
str.sub(str[0], ZhongwenTools::Caps.upcase(str[0]))
|
19
|
-
end
|
16
|
+
def self.capitalize(str)
|
17
|
+
first_letter = str[/#{Regex.py}|[ĀÁǍÀĒÉĚÈĪÍǏÌŌÓǑÒ]/][0]
|
18
|
+
str.sub(first_letter, ZhongwenTools::Caps.upcase(first_letter))
|
19
|
+
end
|
20
20
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
21
|
+
CAPS = {
|
22
|
+
'Ā' => 'ā',
|
23
|
+
'Á' => 'á',
|
24
|
+
'Ǎ' => 'ǎ',
|
25
|
+
'À' => 'à',
|
26
|
+
'Ē' => 'ē',
|
27
|
+
'É' => 'é',
|
28
|
+
'Ě' => 'ě',
|
29
|
+
'È' => 'è',
|
30
|
+
'Ī' => 'ī',
|
31
|
+
'Í' => 'í',
|
32
|
+
'Ǐ' => 'ǐ',
|
33
|
+
'Ì' => 'ì',
|
34
|
+
'Ō' => 'ō',
|
35
|
+
'Ó' => 'ó',
|
36
|
+
'Ǒ' => 'ǒ',
|
37
|
+
'Ò' => 'ò',
|
38
|
+
'Ǖ' => 'ǖ', # using combining diatrical marks
|
39
|
+
'Ǘ' => 'ǘ', # using combining diatrical marks
|
40
|
+
'Ǚ' => 'ǚ', # using combining diatrical marks
|
41
|
+
'Ǜ' => 'ǜ', # using combining diatrical marks
|
42
|
+
'Ū' => 'ū',
|
43
|
+
'Ú' => 'ú',
|
44
|
+
'Ǔ' => 'ǔ',
|
45
|
+
'Ù' => 'ù',
|
46
|
+
'A' => 'a',
|
47
|
+
'B' => 'b',
|
48
|
+
'C' => 'c',
|
49
|
+
'D' => 'd',
|
50
|
+
'E' => 'e',
|
51
|
+
'F' => 'f',
|
52
|
+
'G' => 'g',
|
53
|
+
'H' => 'h',
|
54
|
+
'I' => 'i',
|
55
|
+
'J' => 'j',
|
56
|
+
'K' => 'k',
|
57
|
+
'L' => 'l',
|
58
|
+
'M' => 'm',
|
59
|
+
'N' => 'n',
|
60
|
+
'O' => 'o',
|
61
|
+
'P' => 'p',
|
62
|
+
'Q' => 'q',
|
63
|
+
'R' => 'r',
|
64
|
+
'S' => 's',
|
65
|
+
'T' => 't',
|
66
|
+
'U' => 'u',
|
67
|
+
'V' => 'v',
|
68
|
+
'W' => 'w',
|
69
|
+
'X' => 'x',
|
70
|
+
'Y' => 'y',
|
71
|
+
'Z' => 'z'
|
72
|
+
}
|
73
73
|
end
|
74
74
|
end
|
@@ -39,9 +39,9 @@ module ZhongwenTools
|
|
39
39
|
|
40
40
|
def self.split_pyn(str)
|
41
41
|
# FIXME: ignore punctuation
|
42
|
-
regex = str[/[1-5]/].nil? ?
|
42
|
+
regex = str[/[1-5]/].nil? ? /(#{ Regex.pinyin_toneless })/ : /(#{ Regex.pyn }|#{ Regex.pinyin_toneless })/
|
43
43
|
# NOTE: p[/[^\-]*/].to_s is 25% faster than gsub('-', '')
|
44
|
-
str.scan(regex).map{ |arr| arr[0].strip[/[^\-]*/].to_s }.flatten
|
44
|
+
str.scan(regex).map { |arr| arr[0].strip[/[^\-]*/].to_s }.flatten
|
45
45
|
end
|
46
46
|
|
47
47
|
def self.split_py(str)
|
@@ -49,11 +49,8 @@ module ZhongwenTools
|
|
49
49
|
|
50
50
|
results = words.map do |word|
|
51
51
|
word, is_capitalized = normalize_pinyin(word)
|
52
|
-
|
53
|
-
|
54
|
-
# split_py("Xīní") # => ["Xī", "ní"]
|
55
|
-
word = word.gsub(/(n)(g(#{ Regex.py_tones['o'] }|u))/){ "#{ $1 }-#{ $2 }" }
|
56
|
-
word = word.gsub(/([#{ Regex.only_tones }])(n(#{ Regex.py_tones['v'] }|#{ Regex.py_tones['i'] }|[iu][#{ Regex.py_tones['a'] }]))/){ "#{ $1 }-#{ $2 }" }
|
52
|
+
word = normalize_n_g(word)
|
53
|
+
word = normalize_n(word)
|
57
54
|
result = word.split(/['\-]/).flatten.map do |x|
|
58
55
|
find_py(x)
|
59
56
|
end
|
@@ -135,13 +132,29 @@ module ZhongwenTools
|
|
135
132
|
{ pyn: :pyn, py: :py, pinyin: :py }[romanization]
|
136
133
|
end
|
137
134
|
|
135
|
+
# NOTE: Special case: split_py("fǎnguāng") # => ["fǎn", "guāng"]
|
136
|
+
# In pinyin, sāngēng == sān gēng and sāng'ēng == sāng ēng
|
137
|
+
def self.normalize_n_g(pinyin)
|
138
|
+
regex = /(?<n_part>n)(?<g_part>g(#{Regex.py_tones['o']}|#{Regex.py_tones['u']}|#{Regex.py_tones['a']}|#{Regex.py_tones['e']}))/
|
139
|
+
pinyin.gsub(regex) do
|
140
|
+
"#{Regexp.last_match[:n_part]}-#{Regexp.last_match[:g_part]}"
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
144
|
+
def self.normalize_n(pinyin)
|
145
|
+
# Special case: split_py("yìnián") # => ["yì", "nián"]
|
146
|
+
# split_py("Xīní") # => ["Xī", "ní"]
|
147
|
+
regex = /([#{ Regex.only_tones }])(n(#{Regex.py_tones['v']}|#{Regex.py_tones['i']}|[iu]|#{Regex.py_tones['e']}|[#{Regex.py_tones['a']}]))/
|
148
|
+
pinyin.gsub(regex) { "#{ $1 }-#{ $2 }" }
|
149
|
+
end
|
150
|
+
|
138
151
|
def self.normalize_pinyin(pinyin)
|
139
152
|
[Caps.downcase(pinyin), capitalized?(pinyin)]
|
140
153
|
end
|
141
154
|
|
142
155
|
def self.find_py(str)
|
143
156
|
regex = /(#{ Regex.py }|#{ Regex.py_syllabic_nasals })/
|
144
|
-
str.scan(regex).map{ |x| x.compact[0] }
|
157
|
+
str.scan(regex).map { |x| x.compact[0] }
|
145
158
|
end
|
146
159
|
|
147
160
|
def self.recapitalize(obj, capitalized)
|
@@ -179,21 +192,15 @@ module ZhongwenTools
|
|
179
192
|
end
|
180
193
|
|
181
194
|
def self.capitalized?(str)
|
182
|
-
str[
|
195
|
+
first_letter = str[/#{Regex.py}|[ĀÁǍÀĒÉĚÈĪÍǏÌŌÓǑÒ]|#{Regex.py_syllabic_nasals}/][0]
|
196
|
+
|
197
|
+
first_letter != Caps.downcase(first_letter)
|
183
198
|
end
|
184
199
|
|
185
200
|
def self.current_pyn(pyn, pinyin_arr)
|
186
|
-
replacements = []
|
187
|
-
|
188
201
|
pinyin_arr.each do |pinyin|
|
189
202
|
replace = pinyin_replacement(pinyin)
|
190
|
-
|
191
|
-
if replacements.size > 0
|
192
|
-
pyn = pyn.sub(/(#{ replacements.join('.*') }.*)#{ match }/){ $1 + replace }
|
193
|
-
else
|
194
|
-
pyn = pyn.sub(/#{match}/){ "#{ $1 }#{ replace }" }
|
195
|
-
end
|
196
|
-
replacements << replace
|
203
|
+
pyn.sub!(pinyin, replace)
|
197
204
|
end
|
198
205
|
|
199
206
|
pyn.gsub("'", '')
|
@@ -205,14 +212,14 @@ module ZhongwenTools
|
|
205
212
|
end
|
206
213
|
|
207
214
|
match = select_pinyin_match(matches)
|
208
|
-
replace = PYN_PY.find{ |k, v| k if v == match }[0]
|
215
|
+
replace = PYN_PY.find { |k, v| k if v == match }[0]
|
209
216
|
|
210
217
|
py.gsub(match, replace).gsub(/([^\d ]*)(\d)([^\d ]*)/){ $1 + $3 + $2 }
|
211
218
|
end
|
212
219
|
|
213
220
|
def self.select_pinyin_match(matches)
|
214
221
|
# Take the longest pinyin match. Compare byte length (not character count) so that a multi-byte vowel such as 'è' is preferred over 'n', 'r' or 'm'
|
215
|
-
match = matches.sort{ |x, y| x.bytes.to_a.length <=> y.bytes.to_a.length }[-1]
|
222
|
+
match = matches.sort { |x, y| x.bytes.to_a.length <=> y.bytes.to_a.length }[-1]
|
216
223
|
|
217
224
|
# Edge case: en/eng pyn -> py conversion is one-way only.
|
218
225
|
match[/^(ē|é|ě|è|e)n?g?/].nil? ? match : match.chars[0]
|
@@ -68,19 +68,19 @@ module ZhongwenTools
|
|
68
68
|
type ||= romanization?(str)
|
69
69
|
|
70
70
|
if type == :py
|
71
|
-
|
71
|
+
ZhongwenTools::Romanization::Pinyin.split_py(str)
|
72
72
|
elsif type == :pyn
|
73
|
-
|
73
|
+
ZhongwenTools::Romanization::Pinyin.split_pyn(str)
|
74
74
|
elsif type == :bpmf
|
75
|
-
|
75
|
+
ZhongwenTools::Romanization::ZhuyinFuhao.split(str)
|
76
76
|
elsif type == :wg
|
77
|
-
|
77
|
+
ZhongwenTools::Romanization::WadeGiles.split(str)
|
78
78
|
elsif type == :typy
|
79
|
-
|
79
|
+
ZhongwenTools::Romanization::TongyongPinyin.split(str)
|
80
80
|
elsif type == :yale
|
81
|
-
|
81
|
+
ZhongwenTools::Romanization::Yale.split(str)
|
82
82
|
elsif type == :mps2
|
83
|
-
|
83
|
+
ZhongwenTools::Romanization::MPS2.split(str)
|
84
84
|
end
|
85
85
|
end
|
86
86
|
|
@@ -88,7 +88,7 @@ module ZhongwenTools
|
|
88
88
|
|
89
89
|
def self.detect_romanization(str, regex)
|
90
90
|
normalized_str = str.downcase.gsub(ZhongwenTools::Regex.punc, '').gsub(/[1-5\s\-']/, '')
|
91
|
-
#TODO: ignore tonal marks from other systems wade giles, tongyong etc.
|
91
|
+
# TODO: ignore tonal marks from other systems wade giles, tongyong etc.
|
92
92
|
|
93
93
|
normalized_str.scan(regex).join == normalized_str
|
94
94
|
end
|
data/test/test_caps.rb
CHANGED
@@ -17,10 +17,10 @@ class TestCaps < Minitest::Test
|
|
17
17
|
|
18
18
|
def test_capitalize
|
19
19
|
assert_equal @caps[:c], ZhongwenTools::Caps.capitalize(@caps[:d])
|
20
|
+
assert_equal '"Zheng4qie1"', ZhongwenTools::Caps.capitalize('"Zheng4qie1"')
|
20
21
|
end
|
21
22
|
|
22
23
|
def setup
|
23
24
|
@caps = { u: 'ĀLĀBÓ', d: 'ālābó', c: 'Ālābó' }
|
24
25
|
end
|
25
26
|
end
|
26
|
-
|
data/test/test_pinyin.rb
CHANGED
@@ -15,15 +15,19 @@ class TestPinyin < Minitest::Test
|
|
15
15
|
assert_equal w[:split_py], ZhongwenTools::Romanization::Pinyin.split_py(w[:py])
|
16
16
|
end
|
17
17
|
|
18
|
-
assert_equal
|
18
|
+
assert_equal %w(fǎn guāng jìng), ZhongwenTools::Romanization::Pinyin.split_py('fǎnguāngjìng')
|
19
19
|
assert_equal ['Yīng', 'guó'], ZhongwenTools::Romanization::Pinyin.split_py('Yīngguó')
|
20
20
|
assert_equal ['Xī', 'ní'], ZhongwenTools::Romanization::Pinyin.split_py('Xīní')
|
21
|
-
assert_equal
|
21
|
+
assert_equal %w(bàn gōng lóu), ZhongwenTools::Romanization::Pinyin.split_py('bàngōnglóu')
|
22
22
|
assert_equal ['jì', 'nǚ'], ZhongwenTools::Romanization::Pinyin.split_py('jìnǚ')
|
23
23
|
assert_equal ['sè', 'guǐ'], ZhongwenTools::Romanization::Pinyin.split_py('sèguǐ')
|
24
24
|
assert_equal ['qǔ', 'nuǎn'], ZhongwenTools::Romanization::Pinyin.split_py('qǔnuǎn')
|
25
|
-
assert_equal
|
26
|
-
assert_equal ['yīng',
|
25
|
+
assert_equal %w(wán yì r), ZhongwenTools::Romanization::Pinyin.split_py('wányìr')
|
26
|
+
assert_equal ['yīng', 'ér'], ZhongwenTools::Romanization::Pinyin.split_py("yīng'ér")
|
27
|
+
assert_equal ['xiǎn', 'gù'], ZhongwenTools::Romanization::Pinyin.split_py('xiǎngù')
|
28
|
+
assert_equal ['nián', 'gāo'], ZhongwenTools::Romanization::Pinyin.split_py('niángāo')
|
29
|
+
assert_equal %w(fú shè néng), ZhongwenTools::Romanization::Pinyin.split_py('fúshènéng')
|
30
|
+
assert_equal ['sān', 'gēng'], ZhongwenTools::Romanization::Pinyin.split_py('sāngēng')
|
27
31
|
end
|
28
32
|
|
29
33
|
def test_py?
|
@@ -84,17 +88,19 @@ class TestPinyin < Minitest::Test
|
|
84
88
|
|
85
89
|
assert_equal 'yi2ge4', ZhongwenTools::Romanization::Pinyin.to_pyn('yígè')
|
86
90
|
assert_equal 'yi4nian2', ZhongwenTools::Romanization::Pinyin.to_pyn('yìnián', :py)
|
91
|
+
assert_equal 'hei1hu1hu1', ZhongwenTools::Romanization::Pinyin.to_pyn('hēihūhū', :py)
|
92
|
+
assert_equal '"Zheng4qie1"', ZhongwenTools::Romanization::Pinyin.to_pyn('"Zhèngqiē"', :py)
|
87
93
|
end
|
88
94
|
|
89
95
|
def setup
|
90
96
|
@hyphenated_words = [
|
91
|
-
{ :
|
92
|
-
{ :
|
97
|
+
{ pyn: 'A1-la1-bo2', py: 'Ālābó' },
|
98
|
+
{ pyn: 'Mao2 Ze2-dong1', py: 'Máo Zédōng' }
|
93
99
|
]
|
94
100
|
|
95
101
|
@split_words = [
|
96
|
-
{ :
|
97
|
-
{ :
|
102
|
+
{ pyn: 'A1-la1-bo2', py: 'Ālābó', split: %w(A1 la1 bo2), split_py: %w(Ā lā bó) },
|
103
|
+
{ pyn: 'Mao2 Ze2-dong1', py: 'Máo Zédōng', split: %w(Mao2 Ze2 dong1), split_py: %w(Máo Zé dōng) }
|
98
104
|
]
|
99
105
|
|
100
106
|
@syllabic_nasals = [
|
data/zhongwen_tools.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zhongwen_tools
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.18.
|
4
|
+
version: 0.18.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Steven Daniels
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-04-
|
11
|
+
date: 2015-04-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|