zhongwen_tools 0.16.5 → 0.17.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -5
- data/lib/zhongwen_tools/regex.rb +5 -5
- data/lib/zhongwen_tools/romanization/mps2.rb +22 -0
- data/lib/zhongwen_tools/romanization/pinyin.rb +12 -13
- data/lib/zhongwen_tools/romanization/tongyong_pinyin.rb +29 -0
- data/lib/zhongwen_tools/romanization/wade_giles.rb +29 -0
- data/lib/zhongwen_tools/romanization/yale.rb +22 -0
- data/lib/zhongwen_tools/romanization/zhuyin_fuhao.rb +31 -0
- data/lib/zhongwen_tools/romanization.rb +40 -94
- data/lib/zhongwen_tools/ruby_19.rb +2 -1
- data/lib/zhongwen_tools/string_extension.rb +4 -0
- data/lib/zhongwen_tools/version.rb +1 -1
- data/test/test_pinyin.rb +2 -0
- data/test/test_regex.rb +6 -1
- data/zhongwen_tools.gemspec +4 -0
- metadata +21 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 5253f60895b1fcdea86c8f43061cd5f8c647f854
|
|
4
|
+
data.tar.gz: 75afec0bbf2e89ccbf22fbbffb76222496745805
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: bf20813d7c304375d47ba1a4555d69f14364339f26f0b7afa51bca059775a1816f9cc7af4d4f91115f60b8e467346d31f049248b1cd501890805cfedb7d41627
|
|
7
|
+
data.tar.gz: 9cc3eb9986dd62767e0d51a8257d1f0f2525956862c80a8f81debfcbf650a258f45315570f17f45c94195593e2ecb47752c777c05df2d0e85babcd5781d8fa62
|
data/README.md
CHANGED
|
@@ -116,7 +116,6 @@ You can monkey patch the String class.
|
|
|
116
116
|
|
|
117
117
|
'金枪鱼'.to_zhhk #=> '吞拿魚'
|
|
118
118
|
|
|
119
|
-
|
|
120
119
|
#### Integer Extensions
|
|
121
120
|
|
|
122
121
|
You can also monkey patch the Integer class!
|
|
@@ -150,11 +149,8 @@ The core functionality of ZhongwenTools excludes converting between
|
|
|
150
149
|
simplified and traditional Chinese. You can use it by requiring
|
|
151
150
|
'zhongwen_tools/core' instead of 'zhongwen_tools'
|
|
152
151
|
|
|
153
|
-
require 'zhongwen_tools/core'
|
|
152
|
+
require 'zhongwen_tools/core'
|
|
154
153
|
require 'zhongwen_tools/core_ext/string'
|
|
155
154
|
|
|
156
155
|
'ni3 hao3'.to_pinyin #=> 'nǐ hǎo'
|
|
157
156
|
'你們好'.to_zhs #=> NoMethodError
|
|
158
|
-
|
|
159
|
-
##TODO:
|
|
160
|
-
1. create a generic ZhongwenTools::Romanization.split method for convenience
|
data/lib/zhongwen_tools/regex.rb
CHANGED
|
@@ -33,11 +33,11 @@ module ZhongwenTools
|
|
|
33
33
|
end
|
|
34
34
|
|
|
35
35
|
def self.zh
|
|
36
|
-
/
|
|
36
|
+
/\p{Han}/
|
|
37
37
|
end
|
|
38
38
|
|
|
39
39
|
def self.punc
|
|
40
|
-
/
|
|
40
|
+
/\p{Punct}/
|
|
41
41
|
end
|
|
42
42
|
|
|
43
43
|
def self.zh_punc
|
|
@@ -74,7 +74,7 @@ module ZhongwenTools
|
|
|
74
74
|
#
|
|
75
75
|
# Returns a Regex.
|
|
76
76
|
def self.bopomofo
|
|
77
|
-
/
|
|
77
|
+
/\p{Bopomofo}/
|
|
78
78
|
end
|
|
79
79
|
|
|
80
80
|
private
|
|
@@ -86,6 +86,7 @@ module ZhongwenTools
|
|
|
86
86
|
{
|
|
87
87
|
nl_regex: /([nN]eng?|[lnLN](a(i|ng?|o)?|e(i|ng)?|i(ang|a[on]?|e|ng?|u)?|o(ng?|u)|u(o|i|an?|n)?|ve?))/,
|
|
88
88
|
bpm_regex: /([mM]iu|[pmPM]ou|[bpmBPM](o|e(i|ng?)?|a(ng?|i|o)?|i(e|ng?|a[no])?|u))/,
|
|
89
|
+
y_regex: /[yY](a(o|ng?)?|e|i(n|ng)?|o(u|ng)?|u(e|a?n)?)/,
|
|
89
90
|
f_regex: /([fF](ou?|[ae](ng?|i)?|u))/,
|
|
90
91
|
dt_regex: /([dD](e(i|ng?)|i(a[on]?|u))|[dtDT](a(i|ng?|o)?|e(i|ng)?|i(a[on]?|e|ng|u)?|o(ng?|u)|u(o|i|an?|n)?))/,
|
|
91
92
|
gkh_regex: /([ghkGHK](a(i|ng?|o)?|e(i|ng?)?|o(u|ng)|u(a(i|ng?)?|i|n|o)?))/,
|
|
@@ -94,8 +95,7 @@ module ZhongwenTools
|
|
|
94
95
|
r_regex: /([rR]([ae]ng?|i|e|ao|ou|ong|u[oin]|ua?n?))/,
|
|
95
96
|
jqx_regex: /([jqxJQX](i(a(o|ng?)?|[eu]|ong|ng?)?|u(e|a?n)?))/,
|
|
96
97
|
aeo_regex: /(([aA](i|o|ng?)?|[oO]u?|[eE](i|ng?|r)?))/,
|
|
97
|
-
w_regex: /([wW](a(i|ng?)?|o|e(i|ng?)?|u))
|
|
98
|
-
y_regex: /[yY](a(o|ng?)?|e|in?g?|o(u|ng)?|u(e|a?n)?)/
|
|
98
|
+
w_regex: /([wW](a(i|ng?)?|o|e(i|ng?)?|u))/
|
|
99
99
|
}
|
|
100
100
|
end
|
|
101
101
|
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
module ZhongwenTools
|
|
2
|
+
module Romanization
|
|
3
|
+
module MPS2
|
|
4
|
+
def self.to_mps2(*args)
|
|
5
|
+
str, from = args
|
|
6
|
+
from ||= ZhongwenTools::Romanization.romanization?(str)
|
|
7
|
+
|
|
8
|
+
ZhongwenTools::Romanization.convert str, :mps2, from.to_sym
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def self.mps2?(str)
|
|
12
|
+
regex = ZhongwenTools::Romanization.detect_regex(:mps2)
|
|
13
|
+
ZhongwenTools::Romanization.detect_romanization(str, regex)
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def self.split(str)
|
|
17
|
+
regex = /(#{ ZhongwenTools::Romanization.detect_regex(:mps2) }*)/
|
|
18
|
+
ZhongwenTools::Romanization.split_romanization(str, regex)
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
@@ -5,7 +5,6 @@ require 'zhongwen_tools/romanization'
|
|
|
5
5
|
|
|
6
6
|
module ZhongwenTools
|
|
7
7
|
module Romanization
|
|
8
|
-
|
|
9
8
|
def self.convert_to_py(str, from)
|
|
10
9
|
str = convert_romanization(str, from, :pyn) if from != :pyn
|
|
11
10
|
ZhongwenTools::Romanization::Pinyin.convert_pyn_to_pinyin(str)
|
|
@@ -31,7 +30,7 @@ module ZhongwenTools
|
|
|
31
30
|
str, from = args
|
|
32
31
|
from ||= ZhongwenTools::Romanization.romanization? str
|
|
33
32
|
|
|
34
|
-
#_convert_romanization str, _set_type(type.to_sym), _set_type(from)
|
|
33
|
+
# _convert_romanization str, _set_type(type.to_sym), _set_type(from)
|
|
35
34
|
ZhongwenTools::Romanization.convert str, py_type(romanization), (py_type(from) || from)
|
|
36
35
|
end
|
|
37
36
|
end
|
|
@@ -40,7 +39,7 @@ module ZhongwenTools
|
|
|
40
39
|
# FIXME: ignore punctuation
|
|
41
40
|
regex = str[/[1-5]/].nil? ? /(#{ZhongwenTools::Regex.pinyin_toneless})/ : /(#{ZhongwenTools::Regex.pyn}|#{ZhongwenTools::Regex.pinyin_toneless})/
|
|
42
41
|
|
|
43
|
-
str.scan(regex).map{ |arr| arr[0].strip.gsub('-','') }.flatten
|
|
42
|
+
str.scan(regex).map{ |arr| arr[0].strip.gsub('-', '') }.flatten
|
|
44
43
|
end
|
|
45
44
|
|
|
46
45
|
def self.split_py(str)
|
|
@@ -49,7 +48,9 @@ module ZhongwenTools
|
|
|
49
48
|
results = words.map do |word|
|
|
50
49
|
word, is_capitalized = normalize_pinyin(word)
|
|
51
50
|
# NOTE: Special Case "fǎnguāng" should be "fǎn" + "guāng"
|
|
51
|
+
# Special Case "yìnián" should be "yì" + "nián"
|
|
52
52
|
word = word.gsub('ngu', 'n-gu')
|
|
53
|
+
.gsub(/([#{ ZhongwenTools::Regex.only_tones }])(ni[#{ ZhongwenTools::Regex.py_tones['a'] }])/){ "#{ $1 }-#{ $2 }" }
|
|
53
54
|
result = word.split(/['\-]/).flatten.map do |x|
|
|
54
55
|
find_py(x)
|
|
55
56
|
end
|
|
@@ -89,7 +90,7 @@ module ZhongwenTools
|
|
|
89
90
|
# Returns Boolean.
|
|
90
91
|
def self.pyn?(str)
|
|
91
92
|
# FIXME: use strip_punctuation method
|
|
92
|
-
normalized_str = ZhongwenTools::Caps.downcase(str.gsub(ZhongwenTools::Regex.punc,'').gsub(/[\s\-]/,''))
|
|
93
|
+
normalized_str = ZhongwenTools::Caps.downcase(str.gsub(ZhongwenTools::Regex.punc, '').gsub(/[\s\-]/, ''))
|
|
93
94
|
pyn_arr = split_pyn(normalized_str).map{ |p| p }
|
|
94
95
|
|
|
95
96
|
pyn_matches_properly?(pyn_arr, normalized_str) &&
|
|
@@ -126,7 +127,6 @@ module ZhongwenTools
|
|
|
126
127
|
{ pyn: :pyn, py: :py, pinyin: :py }[romanization]
|
|
127
128
|
end
|
|
128
129
|
|
|
129
|
-
|
|
130
130
|
def self.normalize_pinyin(pinyin)
|
|
131
131
|
[ZhongwenTools::Caps.downcase(pinyin), capitalized?(pinyin)]
|
|
132
132
|
end
|
|
@@ -180,9 +180,9 @@ module ZhongwenTools
|
|
|
180
180
|
replace = pinyin_replacement(pinyin)
|
|
181
181
|
match = pinyin
|
|
182
182
|
if replacements.size > 0
|
|
183
|
-
pyn = pyn.sub(/(#{replacements.join('.*')}.*)#{match}/){ $1 + replace }
|
|
183
|
+
pyn = pyn.sub(/(#{ replacements.join('.*') }.*)#{ match }/){ $1 + replace }
|
|
184
184
|
else
|
|
185
|
-
pyn = pyn.sub(/#{match}/){ "#{$1}#{replace}"}
|
|
185
|
+
pyn = pyn.sub(/#{match}/){ "#{ $1 }#{ replace }" }
|
|
186
186
|
end
|
|
187
187
|
replacements << replace
|
|
188
188
|
end
|
|
@@ -195,20 +195,19 @@ module ZhongwenTools
|
|
|
195
195
|
py.include? x
|
|
196
196
|
end
|
|
197
197
|
match = select_pinyin_match(matches)
|
|
198
|
-
replace = PYN_PY.find{|k,v| k if v == match}[0]
|
|
198
|
+
replace = PYN_PY.find{ |k, v| k if v == match }[0]
|
|
199
199
|
|
|
200
|
-
py.gsub(match, replace).gsub(/([^\d ]*)(\d)([^\d ]*)/){$1 + $3 + $2}
|
|
200
|
+
py.gsub(match, replace).gsub(/([^\d ]*)(\d)([^\d ]*)/){ $1 + $3 + $2 }
|
|
201
201
|
end
|
|
202
202
|
|
|
203
203
|
def self.select_pinyin_match(matches)
|
|
204
204
|
# take the longest pinyin match. Use bytes because 'è' is preferred over 'n' or 'r' or 'm'
|
|
205
|
-
match = matches.sort{|x,y| x.bytes.to_a.length <=> y.bytes.to_a.length}[-1]
|
|
205
|
+
match = matches.sort{ |x, y| x.bytes.to_a.length <=> y.bytes.to_a.length }[-1]
|
|
206
206
|
|
|
207
207
|
# Edge case.. en/eng pyn -> py conversion is one way only.
|
|
208
208
|
match[/^(ē|é|ě|è|e)n?g?/].nil? ? match : match.chars[0]
|
|
209
209
|
end
|
|
210
210
|
|
|
211
|
-
|
|
212
211
|
# Internal: Replaces numbered pinyin with actual pinyin. Pinyin separated with hyphens are combined as one word.
|
|
213
212
|
#
|
|
214
213
|
# str - A String to replace with actual pinyin
|
|
@@ -229,8 +228,8 @@ module ZhongwenTools
|
|
|
229
228
|
# And finally, correct those apostrophes at the very end.
|
|
230
229
|
# It's like magic.
|
|
231
230
|
str.gsub(regex) do
|
|
232
|
-
($3.nil? ? "#{PYN_PY[$1]}" : ($2 == '' &&
|
|
233
|
-
end.gsub("-'",
|
|
231
|
+
($3.nil? ? "#{ PYN_PY[$1] }" : ($2 == '' && %w(a e o).include?($3[0,1]))? "'#{ PYN_PY["#{ $3 }#{ $6 }"]}#{ $4 }#{ $5 }" : "#{ $2 }#{ PYN_PY["#{ $3 }#{ $6 }"] }#{ $4 }#{ $5 }") + (($7.to_s.length > 1) ? '-' : '')
|
|
232
|
+
end.gsub("-'", '-').sub(/^'/, '')
|
|
234
233
|
end
|
|
235
234
|
end
|
|
236
235
|
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
module ZhongwenTools
|
|
2
|
+
module Romanization
|
|
3
|
+
module TongyongPinyin
|
|
4
|
+
def self.to_typy(*args)
|
|
5
|
+
str, from = args
|
|
6
|
+
from ||= ZhongwenTools::Romanization.romanization?(str)
|
|
7
|
+
|
|
8
|
+
ZhongwenTools::Romanization.convert str, :typy, from.to_sym
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def self.typy?(str)
|
|
12
|
+
regex = ZhongwenTools::Romanization.detect_regex(:typy)
|
|
13
|
+
ZhongwenTools::Romanization.detect_romanization(str, regex)
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def self.split(str)
|
|
17
|
+
regex = /(#{ ZhongwenTools::Romanization.detect_regex(:typy) }*)/
|
|
18
|
+
ZhongwenTools::Romanization.split_romanization(str, regex)
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
class << self
|
|
22
|
+
[:tongyong, :tongyong_pinyin].each do |m|
|
|
23
|
+
alias_method "to_#{ m }".to_sym, :to_typy
|
|
24
|
+
alias_method "#{ m }?", :typy?
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
module ZhongwenTools
|
|
2
|
+
module Romanization
|
|
3
|
+
module WadeGiles
|
|
4
|
+
def self.to_wg(*args)
|
|
5
|
+
str, from = args
|
|
6
|
+
from ||= ZhongwenTools::Romanization.romanization?(str)
|
|
7
|
+
|
|
8
|
+
ZhongwenTools::Romanization.convert str, :wg, from.to_sym
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def self.wg?(str)
|
|
12
|
+
regex = ZhongwenTools::Romanization.detect_regex(:wg)
|
|
13
|
+
ZhongwenTools::Romanization.detect_romanization(str, regex)
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def self.split(str)
|
|
17
|
+
regex = /(#{ ZhongwenTools::Romanization.detect_regex(:wg) }*)/
|
|
18
|
+
ZhongwenTools::Romanization.split_romanization(str, regex)
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
class << self
|
|
22
|
+
[:wade_giles, :wadegiles].each do |m|
|
|
23
|
+
alias_method "to_#{ m }".to_sym, :to_wg
|
|
24
|
+
alias_method "#{ m }?", :wg?
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
module ZhongwenTools
|
|
2
|
+
module Romanization
|
|
3
|
+
module Yale
|
|
4
|
+
def self.to_yale(*args)
|
|
5
|
+
str, from = args
|
|
6
|
+
from ||= ZhongwenTools::Romanization.romanization?(str)
|
|
7
|
+
|
|
8
|
+
ZhongwenTools::Romanization.convert str, :yale, from.to_sym
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def self.yale?(str)
|
|
12
|
+
regex = ZhongwenTools::Romanization.detect_regex(:yale)
|
|
13
|
+
ZhongwenTools::Romanization.detect_romanization(str, regex)
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def self.split(str)
|
|
17
|
+
regex = /(#{ ZhongwenTools::Romanization.detect_regex(:yale) }*)/
|
|
18
|
+
ZhongwenTools::Romanization.split_romanization(str, regex)
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
module ZhongwenTools
|
|
2
|
+
module Romanization
|
|
3
|
+
module ZhuyinFuhao
|
|
4
|
+
def self.to_bpmf(*args)
|
|
5
|
+
str, from = args
|
|
6
|
+
from ||= ZhongwenTools::Romanization.romanization?(str)
|
|
7
|
+
|
|
8
|
+
ZhongwenTools::Romanization.convert str, :bpmf, from.to_sym
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def self.bpmf?(str)
|
|
12
|
+
regex = ZhongwenTools::Regex.bopomofo
|
|
13
|
+
|
|
14
|
+
ZhongwenTools::Romanization.detect_romanization(str, regex)
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def self.split(str)
|
|
18
|
+
regex = /([#{ZhongwenTools::Regex.bopomofo}]*)/
|
|
19
|
+
|
|
20
|
+
ZhongwenTools::Romanization.split_romanization(str, regex)
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
class << self
|
|
24
|
+
[:zhuyin_fuhao, :zhuyinfuhao, :zyfh, :zhyfh, :bopomofo].each do |m|
|
|
25
|
+
alias_method "to_#{ m }".to_sym, :to_bpmf
|
|
26
|
+
alias_method "#{ m }?", :bpmf?
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
# encoding: utf-8
|
|
2
2
|
require 'zhongwen_tools/romanization/pinyin'
|
|
3
3
|
require 'zhongwen_tools/romanization/pinyin_table'
|
|
4
|
+
require 'zhongwen_tools/romanization/zhuyin_fuhao'
|
|
5
|
+
require 'zhongwen_tools/romanization/tongyong_pinyin'
|
|
6
|
+
require 'zhongwen_tools/romanization/wade_giles'
|
|
7
|
+
require 'zhongwen_tools/romanization/yale'
|
|
8
|
+
require 'zhongwen_tools/romanization/mps2'
|
|
4
9
|
require 'zhongwen_tools/romanization/romanization_table'
|
|
5
10
|
|
|
6
11
|
# NOTE: Creates several dynamic Modules and their associated methods.
|
|
@@ -29,7 +34,12 @@ module ZhongwenTools
|
|
|
29
34
|
# belongs to another romanization system p a romanization
|
|
30
35
|
# system, use the romanization modules specific function.
|
|
31
36
|
#
|
|
32
|
-
#
|
|
37
|
+
# Zhuyin Fuhao, Tongyong Pinyin, Wade Giles, MPS2 or Yale.
|
|
38
|
+
# http://en.wikipedia.org/wiki/Tongyong_Pinyin
|
|
39
|
+
# http://pinyin.info/romanization/tongyong/
|
|
40
|
+
# http://en.wikipedia.org/wiki/Wade%E2%80%93Giles
|
|
41
|
+
# http://en.wikipedia.org/wiki/Bopomofo
|
|
42
|
+
# http://pinyin.info/romanization/bopomofo/index.html # str - a String to test.
|
|
33
43
|
#
|
|
34
44
|
# Examples
|
|
35
45
|
# romanization?('hao3') #=> :pyn
|
|
@@ -56,17 +66,44 @@ module ZhongwenTools
|
|
|
56
66
|
end
|
|
57
67
|
end
|
|
58
68
|
|
|
59
|
-
def split(str, type = nil)
|
|
69
|
+
def self.split(str, type = nil)
|
|
60
70
|
type ||= romanization?(str)
|
|
61
71
|
|
|
62
72
|
if type == :py
|
|
73
|
+
ZhongwenTools::Romanization::Pinyin.split_py(str)
|
|
63
74
|
elsif type == :pyn
|
|
75
|
+
ZhongwenTools::Romanization::Pinyin.split_pyn(str)
|
|
76
|
+
elsif type == :bpmf
|
|
77
|
+
ZhongwenTools::Romanization::ZhuyinFuhao.split(str)
|
|
78
|
+
elsif type == :wg
|
|
79
|
+
ZhongwenTools::Romanization::WadeGiles.split(str)
|
|
80
|
+
elsif type == :typy
|
|
81
|
+
ZhongwenTools::Romanization::TongyongPinyin.split(str)
|
|
82
|
+
elsif type == :yale
|
|
83
|
+
ZhongwenTools::Romanization::Yale.split(str)
|
|
84
|
+
elsif type == :mps2
|
|
85
|
+
ZhongwenTools::Romanization::MPS2.split(str)
|
|
64
86
|
end
|
|
65
|
-
|
|
66
87
|
end
|
|
67
88
|
|
|
68
89
|
private
|
|
69
90
|
|
|
91
|
+
def self.detect_romanization(str, regex)
|
|
92
|
+
normalized_str = str.downcase.gsub(ZhongwenTools::Regex.punc, '').gsub(/[1-5\s\-']/, '')
|
|
93
|
+
#TODO: ignore tonal marks from other systems wade giles, tongyong etc.
|
|
94
|
+
|
|
95
|
+
normalized_str.scan(regex).join == normalized_str
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
def self.split_romanization(str, regex)
|
|
99
|
+
# TODO: ignore tonal marks from other systems wade giles, tongyong etc.
|
|
100
|
+
results = str.scan(regex).map do |arr|
|
|
101
|
+
arr[0].strip.gsub('-','')
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
results.flatten - ['']
|
|
105
|
+
end
|
|
106
|
+
|
|
70
107
|
def self.convert_romanization(str, from, to)
|
|
71
108
|
# NOTE: extract/refactor tokens cause tests to fail.
|
|
72
109
|
if from == :pyn
|
|
@@ -104,7 +141,6 @@ module ZhongwenTools
|
|
|
104
141
|
replace = token_replacement(token, from).fetch(to){ search }
|
|
105
142
|
replace = fix_capitalization(str, token, replace)
|
|
106
143
|
|
|
107
|
-
|
|
108
144
|
[search, replace]
|
|
109
145
|
end
|
|
110
146
|
|
|
@@ -127,82 +163,6 @@ module ZhongwenTools
|
|
|
127
163
|
result || {}
|
|
128
164
|
end
|
|
129
165
|
|
|
130
|
-
|
|
131
|
-
# <module_name>::<romanization_type>?(str)
|
|
132
|
-
#
|
|
133
|
-
# Public: Checks if a String is a romanization:
|
|
134
|
-
# Zhuyin Fuhao, Tongyong Pinyin, Wade Giles, MSP2 or Yale.
|
|
135
|
-
# http://en.wikipedia.org/wiki/Tongyong_Pinyin
|
|
136
|
-
# http://pinyin.info/romanization/tongyong/
|
|
137
|
-
# http://en.wikipedia.org/wiki/Wade%E2%80%93Giles
|
|
138
|
-
# http://en.wikipedia.org/wiki/Bopomofo
|
|
139
|
-
# http://pinyin.info/romanization/bopomofo/index.html
|
|
140
|
-
#
|
|
141
|
-
# str - a String. Optional if the object calling the method is a String.
|
|
142
|
-
#
|
|
143
|
-
# Examples
|
|
144
|
-
#
|
|
145
|
-
# typy?('chuei niou') #=> true
|
|
146
|
-
# wg?('Mao2 Tse2 Tung1') #=> true
|
|
147
|
-
# bpmf?('ㄊㄥ') #=> true
|
|
148
|
-
#
|
|
149
|
-
# Returns a boolean.
|
|
150
|
-
def self.create_detect_method(romanization_module, name)
|
|
151
|
-
romanization_module.define_singleton_method("#{name}?") do |str|
|
|
152
|
-
|
|
153
|
-
regex = romanization_module == :ZhuyinFuhao ? ZhongwenTools::Regex.bopomofo : ZhongwenTools::Romanization.detect_regex(name.to_sym)
|
|
154
|
-
normalized_str = str.downcase.gsub(ZhongwenTools::Regex.punc,'').gsub(/[1-5\s\-']/,'')
|
|
155
|
-
#TODO: ignore tonal marks from other systems wade giles, tongyong etc.
|
|
156
|
-
normalized_str.scan(regex).join == normalized_str
|
|
157
|
-
end
|
|
158
|
-
end
|
|
159
|
-
|
|
160
|
-
# <module_name>::to_<romanization_type>(str)
|
|
161
|
-
# Public: Converts to the given romanization from pyn (pinyin using numbers instead of tone marks.
|
|
162
|
-
#
|
|
163
|
-
# str = a String to be converted
|
|
164
|
-
#
|
|
165
|
-
# Examples:
|
|
166
|
-
#
|
|
167
|
-
#
|
|
168
|
-
#
|
|
169
|
-
# ZhongwenTools::Romanization::ZhuyinFuhao.to_zyfh('Mao2 Ze2-dong1') # => 'ㄇㄠ2 ㄗㄜ2ㄉㄨㄥ1'
|
|
170
|
-
#
|
|
171
|
-
# Returns a String.
|
|
172
|
-
def self.create_convert_method(romanization_module, romanization_name, name)
|
|
173
|
-
romanization_module.define_singleton_method("to_#{ name }") do |*args|
|
|
174
|
-
str, from = args
|
|
175
|
-
from ||= ZhongwenTools::Romanization.romanization?(str)
|
|
176
|
-
|
|
177
|
-
ZhongwenTools::Romanization.convert str, romanization_name, from.to_sym
|
|
178
|
-
end
|
|
179
|
-
end
|
|
180
|
-
|
|
181
|
-
# <module_name>::split(str)
|
|
182
|
-
# Public: splits the romanization's string.
|
|
183
|
-
#
|
|
184
|
-
# str - a String to be split
|
|
185
|
-
#
|
|
186
|
-
# Examples
|
|
187
|
-
#
|
|
188
|
-
#
|
|
189
|
-
# split('zhong1guo2')
|
|
190
|
-
# # => ['zhong1', 'guo2']
|
|
191
|
-
#
|
|
192
|
-
# Returns an Array of Strings.
|
|
193
|
-
def self.create_split_method(romanization_module, name)
|
|
194
|
-
regex = romanization_module == :ZhuyinFuhao ? /([#{ZhongwenTools::Regex.bopomofo}]*)/ : /(#{ZhongwenTools::Romanization.detect_regex(name.to_sym)}*)/
|
|
195
|
-
|
|
196
|
-
romanization_module.define_singleton_method("split") do |str|
|
|
197
|
-
# TODO: ignore tonal marks from other systems wade giles, tongyong etc.
|
|
198
|
-
results = str.scan(regex).map do |arr|
|
|
199
|
-
arr[0].strip.gsub('-','')
|
|
200
|
-
end
|
|
201
|
-
|
|
202
|
-
results.flatten - ['']
|
|
203
|
-
end
|
|
204
|
-
end
|
|
205
|
-
|
|
206
166
|
# Internal: Produces a Regexp for a romanization type.
|
|
207
167
|
#
|
|
208
168
|
# type - a Symbol for the romanization type.
|
|
@@ -252,19 +212,5 @@ module ZhongwenTools
|
|
|
252
212
|
TongyongPinyin: %w(typy tongyong tongyong_pinyin),
|
|
253
213
|
MPS2: ['mps2']
|
|
254
214
|
}
|
|
255
|
-
|
|
256
|
-
RomanizationTypes.each do |module_name, names|
|
|
257
|
-
romanization_module = self.const_set(module_name, Module.new) unless self.const_defined?(module_name)
|
|
258
|
-
romanization_module ||= self.const_get(module_name)
|
|
259
|
-
|
|
260
|
-
romanization_name = names.first.to_sym
|
|
261
|
-
|
|
262
|
-
names.each do |name|
|
|
263
|
-
create_convert_method(romanization_module, romanization_name, name)
|
|
264
|
-
end
|
|
265
|
-
|
|
266
|
-
create_detect_method(romanization_module, romanization_name)
|
|
267
|
-
create_split_method(romanization_module, romanization_name)
|
|
268
|
-
end
|
|
269
215
|
end
|
|
270
216
|
end
|
data/test/test_pinyin.rb
CHANGED
|
@@ -54,6 +54,8 @@ class TestPinyin < Minitest::Test
|
|
|
54
54
|
@words.each do |word|
|
|
55
55
|
assert_equal word[:pyn], ZhongwenTools::Romanization::Pinyin.to_pyn(word[:py])
|
|
56
56
|
end
|
|
57
|
+
assert_equal 'yi2ge4', ZhongwenTools::Romanization::Pinyin.to_pyn('yígè')
|
|
58
|
+
assert_equal 'yi4nian2', ZhongwenTools::Romanization::Pinyin.to_pyn('yìnián', :py)
|
|
57
59
|
end
|
|
58
60
|
|
|
59
61
|
def setup
|
data/test/test_regex.rb
CHANGED
|
@@ -30,7 +30,12 @@ class TestRegex < Minitest::Test
|
|
|
30
30
|
refute '.'[ZhongwenTools::Regex.zh_punc]
|
|
31
31
|
assert '.'[ZhongwenTools::Regex.punc]
|
|
32
32
|
assert '。'[ZhongwenTools::Regex.zh_punc]
|
|
33
|
-
|
|
33
|
+
assert '。'[ZhongwenTools::Regex.punc]
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def test_bopomofo
|
|
37
|
+
assert "ㄅ"[ZhongwenTools::Regex.bopomofo]
|
|
38
|
+
# ㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩ
|
|
34
39
|
end
|
|
35
40
|
|
|
36
41
|
def test_zh
|
data/zhongwen_tools.gemspec
CHANGED
|
@@ -27,4 +27,8 @@ Gem::Specification.new do |s|
|
|
|
27
27
|
s.add_development_dependency('pry', '~> 0.9', '>= 0.9.12')
|
|
28
28
|
s.add_development_dependency('minitest-reporters', '~> 1.0', '>= 1.0.4')
|
|
29
29
|
end
|
|
30
|
+
|
|
31
|
+
if RUBY_VERSION >= '2.1'
|
|
32
|
+
s.add_development_dependency('memory_profiler', '0.0.4')
|
|
33
|
+
end
|
|
30
34
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: zhongwen_tools
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.17.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Steven Daniels
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2014-
|
|
11
|
+
date: 2014-12-31 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rake
|
|
@@ -144,6 +144,20 @@ dependencies:
|
|
|
144
144
|
- - ">="
|
|
145
145
|
- !ruby/object:Gem::Version
|
|
146
146
|
version: 1.0.4
|
|
147
|
+
- !ruby/object:Gem::Dependency
|
|
148
|
+
name: memory_profiler
|
|
149
|
+
requirement: !ruby/object:Gem::Requirement
|
|
150
|
+
requirements:
|
|
151
|
+
- - '='
|
|
152
|
+
- !ruby/object:Gem::Version
|
|
153
|
+
version: 0.0.4
|
|
154
|
+
type: :development
|
|
155
|
+
prerelease: false
|
|
156
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
157
|
+
requirements:
|
|
158
|
+
- - '='
|
|
159
|
+
- !ruby/object:Gem::Version
|
|
160
|
+
version: 0.0.4
|
|
147
161
|
description: Chinese tools for romanization conversions and other helpful string functions
|
|
148
162
|
for Chinese.
|
|
149
163
|
email:
|
|
@@ -167,9 +181,14 @@ files:
|
|
|
167
181
|
- lib/zhongwen_tools/number/number_table.rb
|
|
168
182
|
- lib/zhongwen_tools/regex.rb
|
|
169
183
|
- lib/zhongwen_tools/romanization.rb
|
|
184
|
+
- lib/zhongwen_tools/romanization/mps2.rb
|
|
170
185
|
- lib/zhongwen_tools/romanization/pinyin.rb
|
|
171
186
|
- lib/zhongwen_tools/romanization/pinyin_table.rb
|
|
172
187
|
- lib/zhongwen_tools/romanization/romanization_table.rb
|
|
188
|
+
- lib/zhongwen_tools/romanization/tongyong_pinyin.rb
|
|
189
|
+
- lib/zhongwen_tools/romanization/wade_giles.rb
|
|
190
|
+
- lib/zhongwen_tools/romanization/yale.rb
|
|
191
|
+
- lib/zhongwen_tools/romanization/zhuyin_fuhao.rb
|
|
173
192
|
- lib/zhongwen_tools/ruby_19.rb
|
|
174
193
|
- lib/zhongwen_tools/script.rb
|
|
175
194
|
- lib/zhongwen_tools/script/conversion_data
|