zhongwen_tools 0.12.4 → 0.15.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +1 -1
- data/README.md +74 -165
- data/Rakefile +0 -1
- data/lib/zhongwen_tools/{string/caps.rb → caps.rb} +19 -1
- data/lib/zhongwen_tools/core.rb +19 -0
- data/lib/zhongwen_tools/core_ext/integer.rb +8 -0
- data/lib/zhongwen_tools/core_ext/string.rb +10 -0
- data/lib/zhongwen_tools/fullwidth.rb +102 -0
- data/lib/zhongwen_tools/integer_extension.rb +31 -0
- data/lib/zhongwen_tools/number/number_table.rb +44 -0
- data/lib/zhongwen_tools/number.rb +221 -0
- data/lib/zhongwen_tools/regex.rb +38 -22
- data/lib/zhongwen_tools/romanization/pinyin.rb +231 -0
- data/lib/zhongwen_tools/romanization/{pyn_to_py.rb → pinyin_table.rb} +2 -1
- data/lib/zhongwen_tools/romanization/romanization_table.rb +425 -0
- data/lib/zhongwen_tools/romanization.rb +199 -136
- data/lib/zhongwen_tools/{string/ruby19.rb → ruby_19.rb} +1 -2
- data/lib/zhongwen_tools/{conversion → script}/conversion_data +0 -0
- data/lib/zhongwen_tools/{conversion.rb → script.rb} +21 -34
- data/lib/zhongwen_tools/string_extension.rb +136 -0
- data/lib/zhongwen_tools/unicode.rb +25 -0
- data/lib/zhongwen_tools/uri.rb +14 -0
- data/lib/zhongwen_tools/version.rb +1 -1
- data/lib/zhongwen_tools/zhongwen.rb +29 -0
- data/lib/zhongwen_tools.rb +2 -3
- data/test/test_caps.rb +26 -0
- data/test/test_core.rb +13 -0
- data/test/test_fullwidth.rb +30 -0
- data/test/test_helper.rb +4 -12
- data/test/test_helpers/unload_zhongwen_tools_script.rb +5 -0
- data/test/test_integer_extension.rb +34 -0
- data/test/test_number.rb +79 -0
- data/test/test_pinyin.rb +68 -0
- data/test/test_regex.rb +41 -0
- data/test/test_romanization.rb +110 -133
- data/test/{test_conversion.rb → test_script.rb} +41 -44
- data/test/test_string_extension.rb +94 -0
- data/test/test_unicode.rb +27 -0
- data/test/test_uri.rb +16 -0
- data/test/test_zhongwen.rb +37 -0
- data/zhongwen_tools.gemspec +1 -1
- metadata +93 -52
- data/Gemfile.1.8.7 +0 -8
- data/lib/zhongwen_tools/conversion/string.rb +0 -19
- data/lib/zhongwen_tools/integer.rb +0 -28
- data/lib/zhongwen_tools/numbers.rb +0 -195
- data/lib/zhongwen_tools/regex/ruby18.rb +0 -15
- data/lib/zhongwen_tools/romanization/conversion_table.rb +0 -425
- data/lib/zhongwen_tools/romanization/detect.rb +0 -141
- data/lib/zhongwen_tools/romanization/string.rb +0 -36
- data/lib/zhongwen_tools/string/fullwidth.rb +0 -85
- data/lib/zhongwen_tools/string/ruby18.rb +0 -96
- data/lib/zhongwen_tools/string.rb +0 -164
- data/test/test_integer.rb +0 -31
- data/test/test_numbers.rb +0 -68
- data/test/test_string.rb +0 -133
@@ -0,0 +1,221 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'zhongwen_tools/regex'
|
3
|
+
require 'zhongwen_tools/zhongwen'
|
4
|
+
require 'zhongwen_tools/romanization/pinyin'
|
5
|
+
require 'zhongwen_tools/number/number_table'
|
6
|
+
|
7
|
+
# Number.to_pyn, to_i, to_zhs, etc.
|
8
|
+
module ZhongwenTools
|
9
|
+
module Number
|
10
|
+
# Public: checks whether an object is a number this module can handle.
#
# obj - The object to check. Integers always qualify; a String qualifies
#       when it consists solely of ASCII digits and Chinese numerals
#       (an empty String therefore also qualifies).
#
# Examples
#
#   number?(12)     # => true
#   number?('十二') # => true
#   number?('abc')  # => false
#
# Returns true or false.
def self.number?(obj)
  # Integer also covers the legacy Fixnum/Bignum subclasses. Referencing
  # Fixnum directly raises NameError on Ruby >= 3.2, where it was removed.
  return true if obj.is_a?(Integer)
  return false unless obj.is_a?(String)

  regex = /([\d]|#{ZhongwenTools::Regex.zh_numbers}){1,}/
  obj.gsub(regex, '') == ''
end
|
22
|
+
|
23
|
+
#needs to be a class method
|
24
|
+
# Defines the module-level converters to_i, to_zhs, to_zht and to_pyn.
# Each accepts (obj, from, separator); when `from` is omitted (or falsy)
# the source type is detected with number_type.
%w(i zhs zht pyn).each do |action|
  target = action.to_sym

  define_singleton_method(:"to_#{ action }") do |*args|
    obj, from, separator = args
    source = (from || number_type(obj)).to_sym

    convert(obj, target, source, separator)
  end
end
|
32
|
+
|
33
|
+
# Public: converts a number to Chinese numerals.
#
# obj  - The number to convert.
# type - :zht for traditional output; anything else yields simplified
#        (default: :zhs).
# from - Optional source type, passed through to to_zht / to_zhs.
#
# Returns whatever to_zht or to_zhs returns.
def self.to_zh(obj, type = :zhs, from = nil)
  return to_zht(obj, from) if type.to_sym == :zht

  to_zhs(obj, from)
end
|
42
|
+
|
43
|
+
private
|
44
|
+
|
45
|
+
# Internal: converts a number from one representation to another.
#
# obj       - A String or Integer to convert.
# to        - The target type: :zhs, :zht, :i or :pyn.
# from      - The source type, handed to convert_from.
# separator - Accepted for interface compatibility; not used by this
#             method (default: '').
#
# Returns an Integer when `to` is :i, otherwise a String.
# Raises ArgumentError for an unsupported target type or object class.
def self.convert(obj, to, from, separator = '')
  fail ArgumentError, "unsupported target type: #{to.inspect}" unless [:zhs, :zht, :i, :pyn].include?(to.to_sym)
  # Integer covers Fixnum/Bignum on old Rubies; the Fixnum constant itself
  # no longer exists on Ruby >= 3.2 and must not be referenced.
  fail ArgumentError, "unsupported object: #{obj.class}" unless obj.is_a?(String) || obj.is_a?(Integer)

  number = convert_from from, to, obj

  if to == :i
    combine_integers(number)
  elsif to == :pyn
    # Drop a "ling2" that directly precedes one of these syllables, then
    # collapse repeated hyphens and trim a trailing one.
    regex = /#{ %w(yi4 wan4 qian1 bai2 shi2).map{ |x| 'ling2\-' + x }.join('|')}/
    finalize_number(number, '-').gsub(regex, '').gsub(/\-+/, '-').gsub(/\-$/, '')
  else
    finalize_number(number)
  end
end
|
60
|
+
|
61
|
+
# Internal: guesses which representation a number is written in.
#
# obj - An Integer or a String.
#
# Returns :i for Integers, :zht or :zhs for Chinese strings (as decided by
# zht?), and :pyn for anything else.
def self.number_type(obj)
  # Integer also matches Fixnum/Bignum on old Rubies; the Fixnum constant
  # was removed in Ruby 3.2, so it must not be named here.
  return :i if obj.is_a?(Integer)

  # Anything that is not Chinese is assumed to be numbered pinyin; other
  # romanizations are not detected here.
  return :pyn unless ZhongwenTools::Zhongwen.zh?(obj)

  zht?(obj) ? :zht : :zhs
end
|
81
|
+
|
82
|
+
# Internal: true when the whole string consists of characters matched by
# Regex.zht_numbers (an empty string also counts).
def self.zht?(str)
  zht_prefix = str.slice(/#{ZhongwenTools::Regex.zht_numbers }*/)

  zht_prefix == str
end
|
85
|
+
|
86
|
+
# Internal: the logical opposite of zht?.
def self.zhs?(str)
  zht?(str) ? false : true
end
|
89
|
+
|
90
|
+
# Internal: dispatches to the converter for the given source type.
#
# from   - :zht, :zhs, :i or :pyn.
# to     - The target type, passed through.
# number - The value to convert.
#
# Returns the converter's result, or nil for an unknown source type.
def self.convert_from(from, to, number)
  case from
  when :zht, :zhs then convert_from_zh(to, number)
  when :i         then convert_from_integer(to, number)
  when :pyn       then convert_from_pyn(to, number)
  end
end
|
101
|
+
|
102
|
+
# Internal: converts numbered pinyin into the target representation.
#
# to  - The key to read from each number-table entry.
# pyn - A String of numbered pinyin.
#
# Returns an Array with one element per pinyin syllable. A syllable is
# returned unchanged when it has no table entry or the entry lacks `to`.
def self.convert_from_pyn(to, pyn)
  pyns = ZhongwenTools::Romanization::Pinyin.split_pyn(pyn)

  pyns.map do |p|
    # convert_number returns nil for unknown syllables; fall back to an
    # empty Hash so the fetch default applies instead of raising on nil.
    (convert_number(p) || {}).fetch(to) { p }
  end
end
|
111
|
+
|
112
|
+
# Internal: converts a Chinese numeral string character by character.
#
# to     - The key to read from each number-table entry.
# number - A String of Chinese numerals.
#
# Returns an Array with one element per character. A character is
# returned unchanged when it has no table entry or the entry lacks `to`.
def self.convert_from_zh(to, number)
  number.chars.map do |zh|
    # convert_number returns nil for unknown characters; fall back to an
    # empty Hash so the fetch default applies instead of raising on nil.
    (convert_number(zh) || {}).fetch(to) { zh }
  end
end
|
119
|
+
|
120
|
+
# Internal: folds the per-character values of a Chinese number into one
# Integer.
#
# integers - An Array of Integers, e.g. [3, 100, 2, 10, 1] for 三百二十一.
#
# Examples
#
#   combine_integers([1, 9, 9, 8])     # => 1998 (digit-only "year" form)
#   combine_integers([10, 2])          # => 12
#   combine_integers([1, 100, 0, 5])   # => 105
#   combine_integers([2, 10, 10_000])  # => 200_000
#
# Returns an Integer.
def self.combine_integers(integers)
  return combine_year(integers) if year?(integers)

  total = 0    # value of the completed 10_000 / 100_000_000 groups
  section = 0  # value accumulated inside the current group
  pending = 0  # latest plain value not yet scaled by a multiplier

  integers.each do |num|
    if number_multiplier?(num) && num >= 10_000
      section += pending
      if num == 100_000_000
        # The largest multiplier scales everything read so far.
        scaled = total + section
        total = (scaled.zero? ? 1 : scaled) * num
      else
        total += (section.zero? ? 1 : section) * num
      end
      section = 0
      pending = 0
    elsif number_multiplier?(num)
      # A bare multiplier (e.g. a leading 10) counts as 1 * multiplier.
      section += (pending.zero? ? 1 : pending) * num
      pending = 0
    else
      section += pending
      pending = num
    end
  end

  total + section + pending
end
|
140
|
+
|
141
|
+
# Internal: true when every element is below 10, i.e. the number is
# written digit by digit (as years are). An empty Array also qualifies.
def self.year?(integers)
  integers.reject { |digit| digit < 10 }.empty?
end
|
144
|
+
|
145
|
+
# Internal: concatenates the digits and reads the result as one Integer,
# e.g. [1, 9, 9, 8] becomes 1998.
def self.combine_year(integers)
  integers.inject('') { |text, digit| text + digit.to_s }.to_i
end
|
148
|
+
|
149
|
+
# Internal: adds curr_num times the following element to result when that
# following element is a multiplier.
#
# Returns a two-element Array: the (possibly updated) result and i.
def self.combine_integer(integers, result, curr_num, i)
  following = integers[i + 1]
  result += following * curr_num if number_multiplier?(following)

  [result, i]
end
|
157
|
+
|
158
|
+
# Internal: multiplies number by curr_num when curr_num is a multiplier,
# otherwise adds curr_num to it.
def self.adjust_integer(number, curr_num)
  return number * curr_num if number_multiplier?(curr_num)

  number + curr_num
end
|
161
|
+
|
162
|
+
# Internal: true for the values treated as multipliers:
# 10, 100, 1_000, 10_000 and 100_000_000.
def self.number_multiplier?(number)
  case number
  when 10, 100, 1_000, 10_000, 100_000_000 then true
  else false
  end
end
|
165
|
+
|
166
|
+
|
167
|
+
# Internal: converts an Integer into an Array of converted pieces, most
# significant first.
#
# to  - The target type handed to convert_integer / convert_wan_level.
# int - The Integer to convert.
#
# Returns an Array.
def self.convert_from_integer(to, int)
  # FIXME: this will fail for numbers over 1 billion.
  digits = convert_integer_to_reversed_array_of_integers(int)
  pieces = []

  digits.each_with_index do |digit, place|
    # NOTE(review): `wan` is local to this block, so it is nil at the start
    # of every iteration and never accumulates across places - confirm
    # whether that is intended.
    wan = wan_level(wan, place)

    if place.zero?
      pieces << convert_integer(digit, to) unless digit == 0
      next
    end

    pieces << convert_wan_level(place, to)
    # wan_ok? decides whether the digit itself is emitted for this place.
    pieces << convert_integer(digit, to) if wan_ok?(digit, wan, place)
  end

  pieces.reverse!
end
|
186
|
+
|
187
|
+
# Internal: splits an integer into its decimal digits, least significant
# first, e.g. 123 becomes [3, 2, 1].
def self.convert_integer_to_reversed_array_of_integers(int)
  int.to_s.reverse.each_char.map { |char| char.to_i }
end
|
190
|
+
|
191
|
+
# Internal: decides whether the digit at decimal place i is emitted.
# Every digit other than 1 is; a 1 is suppressed only when
# 10**i / 10_000**wan equals 10.
def self.wan_ok?(num, wan, i)
  return true unless num == 1

  (10**i / 10_000**wan) != 10
end
|
194
|
+
|
195
|
+
# Internal: returns the wan level for place i - the given level (0 when
# nil or false), plus one when i + 1 is a multiple of 5.
def self.wan_level(wan, i)
  level = wan || 0

  ((i + 1) % 5).zero? ? level + 1 : level
end
|
201
|
+
|
202
|
+
# Internal: converts the place value 10**i, retrying with the value
# divided by 10_000 and then by 10_000**2 when no conversion is found.
#
# Returns the first truthy conversion, else the result of the last try.
def self.convert_wan_level(i, to)
  place_value = 10**i
  converted = nil

  [1, 10_000, 10_000**2].each do |divisor|
    converted = convert_integer(place_value / divisor, to)
    break if converted
  end

  converted
end
|
205
|
+
|
206
|
+
# Internal: looks an Integer up in NUMBERS_TABLE.
#
# int - The Integer to look up (matched against each entry's :i value).
# to  - The key to read from the matching entry.
#
# Returns the entry's value for `to`, 0 when the entry lacks that key, or
# nil when the table has no entry for `int`. Returning nil (rather than
# raising on nil) lets callers chain lookups with `||`.
def self.convert_integer(int, to)
  entry = NUMBERS_TABLE.find { |x| x[:i] == int }

  entry && entry.fetch(to) { 0 }
end
|
209
|
+
|
210
|
+
# Internal: finds the NUMBERS_TABLE entry whose :zhs, :zht or :pyn value
# equals the given token.
#
# Returns the matching entry, or nil when there is none.
def self.convert_number(number)
  NUMBERS_TABLE.find do |row|
    row.values_at(:zhs, :zht, :pyn).include?(number)
  end
end
|
213
|
+
|
214
|
+
# Internal: joins the converted pieces and removes every 零 that is
# directly followed by a multiplier character.
#
# number    - An Array of converted pieces.
# separator - The String placed between pieces (default: '').
#
# Returns a String.
def self.finalize_number(number, separator = '')
  # NOTE: Figuring out usage of "liang" vs. "er" is pretty
  #       difficult, so always use "er" instead.
  joined = number.join(separator)

  joined.gsub(/零#{ZhongwenTools::Regex.zh_number_multiple}/u, '')
end
|
220
|
+
end
|
221
|
+
end
|
data/lib/zhongwen_tools/regex.rb
CHANGED
@@ -1,51 +1,68 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
+
|
2
3
|
module ZhongwenTools
|
3
4
|
module Regex
|
4
|
-
|
5
|
-
|
6
|
-
def pyn
|
7
|
-
/(#{pyn_regexes.values.join('|')}|r)([1-5])?([\s\-]+)?/
|
5
|
+
# Public: a Regex for one numbered-pinyin syllable: a syllable from
# pyn_regexes (or a bare "r"), a tone digit 1-5, then optional whitespace
# or hyphens.
def self.pyn
  syllables = pyn_regexes.values.join('|')

  /(#{syllables}|r)([1-5])([\s\-]+)?/
end
|
9
8
|
|
10
|
-
def py
|
9
|
+
# Public: a Regex for tone-marked pinyin. It is built from pyn_regexes by
# replacing each plain vowel with its py_tones replacement.
def self.py
  # FIXME: need to detect Ālābó
  syllables = pyn_regexes.values.map do |syllable_regex|
    # to_s[7..-2] drops the "(?-mix:" prefix and the closing ")".
    syllable_regex.to_s[7..-2].gsub(/[aeiouv]/, py_tones)
  end

  /(#{syllables.join('|')}([\s\-])?)/
end
|
15
14
|
|
16
|
-
def pinyin_num
|
15
|
+
# Public: a Regex for one numbered-pinyin syllable, capturing in order the
# whole syllable, the initial, the vowels, an n/g ending, an r suffix,
# the tone digit and any trailing hyphens.
def self.pinyin_num
  /(([BPMFDTNLGKHZCSRJQXWYbpmfdtnlgkhzcsrjqxwy]?[h]?)(A[io]?|a[io]?|i[aeu]?o?|Ei?|ei?|Ou?|ou?|u[aoe]?i?|ve?)?(n?g?)(r?)([1-5])(\-+)?)/
end
|
19
18
|
|
20
|
-
def
|
19
|
+
# Public: a Regex for one pinyin syllable without a tone digit: a syllable
# from pyn_regexes (or a bare "r"), then optional whitespace or hyphens.
def self.pinyin_toneless
  syllables = pyn_regexes.values.join('|')

  /(#{syllables}|r)([\s\-]+)?/
end
|
22
|
+
|
23
|
+
# Public: a Regex character class, by its name meant to match full-width
# characters.
# NOTE(review): the class as shown contains only ASCII characters, an
# unescaped "/" and the sequence "#$&" - presumably the full-width forms
# were normalised away when this text was extracted. Confirm against the
# real file before changing it.
def self.fullwidth
|
21
24
|
/[0-9A-Za-z%.:#$&+-/\=;<>]/
|
22
25
|
end
|
23
26
|
|
24
|
-
def capital_letters
|
25
|
-
/(#{Regexp.union(ZhongwenTools::
|
27
|
+
# Public: a Regex matching any key of ZhongwenTools::Caps::CAPS.
def self.capital_letters
  capitals = Regexp.union(ZhongwenTools::Caps::CAPS.keys)

  /(#{capitals})/
end
|
27
30
|
|
28
|
-
def lowercase_letters
|
29
|
-
/(#{Regexp.union(ZhongwenTools::
|
31
|
+
# Public: a Regex matching any value of ZhongwenTools::Caps::CAPS.
def self.lowercase_letters
  lowercase = Regexp.union(ZhongwenTools::Caps::CAPS.values)

  /(#{lowercase})/
end
|
31
34
|
|
32
|
-
def zh
|
35
|
+
# Public: a Regex matching a single Chinese character, built from the
# listed Unicode ranges plus U+3005 and U+3007.
#
# U+3005 and U+3007 are listed without a "|" between them: inside a
# character class "|" is a literal, so the pipe character itself used to
# match as Chinese.
def self.zh
  /[\u2E80-\u2E99]|[\u2E9B-\u2EF3]|[\u2F00-\u2FD5]|[\u3005\u3007]|[\u3021-\u3029]|[\u3038-\u303B]|[\u3400-\u4DB5]|[\u4E00-\u9FCC]|[\uF900-\uFA6D]|[\uFA70-\uFAD9]/
end
|
35
38
|
|
36
|
-
def punc
|
39
|
+
# Public: a Regex matching a single punctuation character from the listed
# Basic Latin, Latin-1 and Greek code points.
def self.punc
  /[\u0021-\u0023]|[\u0025-\u002A]|[\u002C-\u002F]|[\u003A\u003B\u003F\u0040]|[\u005B-\u005D\u005F\u007B\u007D\u00A1\u00A7\u00AB\u00B6\u00B7\u00BB\u00BF\u037E\u0387]/
end
|
39
42
|
|
40
|
-
def zh_punc
|
43
|
+
# Public: a Regex matching a single punctuation character from the listed
# CJK, full-width and other non-Latin code points.
def self.zh_punc
  # TODO: includes non-zh punctuation codes. Should only include punctuation in CJK ranges.
  /[\u2E00-\u2E2E]|[\u2E30-\u2E3B]|[\u3001-\u3003]|[\u3008-\u3011]|[\u3014-\u301F]|[\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF]|[\uA60D-\uA60F]|[\uA673\uA67E]|[\uA6F2-\uA6F7]|[\uA874-\uA877]|[\uA8CE\uA8CF]|[\uA8F8-\uA8FA]|[\uA92E\uA92F\uA95F]|[\uA9C1-\uA9CD]|[\uA9DE\uA9DF]|[\uAA5C-\uAA5F]|[\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F]|[\uFE10-\uFE19]|[\uFE30-\uFE52]|[\uFE54-\uFE61]|[\uFE63\uFE68\uFE6A\uFE6B]|[\uFF01-\uFF03]|[\uFF05-\uFF0A]|[\uFF0C-\uFF0F]|[\uFF1A\uFF1B\uFF1F\uFF20]|[\uFF3B-\uFF3D]|[\uFF3F\uFF5B\uFF5D]|[\uFF5F-\uFF65]/
end
|
44
47
|
|
45
|
-
def zh_numbers
|
48
|
+
# Public: a Regex matching a single Chinese numeral character, simplified
# or traditional.
def self.zh_numbers
  # TODO: include numbers like yotta, etc.
  #       垓 秭 穰 溝 澗 正 載 --> beyond 100,000,000!
  #       Regional: Dong Guai
  /[〇零一壹幺二贰貳两兩三弎叁參仨四肆䦉五伍六陆陸七柒八捌九玖十拾廿卅百佰千仟万萬亿億]/
end
|
54
|
+
|
55
|
+
# Public: a Regex matching a single numeral character used in simplified
# Chinese.
def self.zhs_numbers
  # TODO: check if 佰,仟 are the financial numbers in zhs
  /[〇零一壹幺二贰两三弎叁仨四肆䦉五伍六陆七柒八捌九玖十拾廿卅百佰千仟万亿]/
end
|
59
|
+
|
60
|
+
# Public: a Regex matching a single numeral character used in traditional
# Chinese.
#
# 百 is included because zhs_numbers and zh_numbers both list it; without
# it a traditional number containing 百 was not matched in full.
def self.zht_numbers
  /[〇零一壹幺二貳兩三弎參仨四肆䦉五伍六陸七柒八捌九玖十拾廿卅百佰千仟萬億]/
end
|
63
|
+
|
64
|
+
# Public: a Regex matching a single Chinese multiplier character (tens,
# hundreds, thousands, ten-thousands, hundred-millions), simplified or
# traditional. Each character is listed once; the class used to repeat 仟.
def self.zh_number_multiple
  /[拾十百佰千仟万萬亿億]/
end
|
50
67
|
|
51
68
|
# Public: A Regex for bopomofo, a.k.a. Zhuyin Fuhao 注音符号.
|
@@ -56,12 +73,13 @@ module ZhongwenTools
|
|
56
73
|
# bopomofo #=> <Regex>
|
57
74
|
#
|
58
75
|
# Returns a Regex.
|
59
|
-
def bopomofo
|
76
|
+
# Matches a single bopomofo (Zhuyin Fuhao) symbol.
def self.bopomofo
  /[ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩ]/
end
|
62
79
|
|
63
80
|
private
|
64
|
-
|
81
|
+
|
82
|
+
def self.pyn_regexes
|
65
83
|
# http://stackoverflow.com/questions/20736291/regex-for-matching-pinyin
|
66
84
|
# https://www.debuggex.com/r/_9kbxA6f00gIGiVo
|
67
85
|
# NOTE: you might need to change the order of these regexes for more accurate matching of some pinyin.
|
@@ -81,7 +99,7 @@ module ZhongwenTools
|
|
81
99
|
}
|
82
100
|
end
|
83
101
|
|
84
|
-
def py_tones
|
102
|
+
def self.py_tones
|
85
103
|
py_tones = {
|
86
104
|
'a' => '[āáǎàa]',
|
87
105
|
'e' => '[ēéěèe]',
|
@@ -93,5 +111,3 @@ module ZhongwenTools
|
|
93
111
|
end
|
94
112
|
end
|
95
113
|
end
|
96
|
-
|
97
|
-
require File.expand_path("../regex/ruby18", __FILE__) if RUBY_VERSION < '1.9'
|
@@ -0,0 +1,231 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'zhongwen_tools/regex'
|
3
|
+
require 'zhongwen_tools/caps'
|
4
|
+
require 'zhongwen_tools/romanization'
|
5
|
+
|
6
|
+
module ZhongwenTools
|
7
|
+
module Romanization
|
8
|
+
|
9
|
+
# Internal: converts a romanized string to tone-marked pinyin, going
# through numbered pinyin first unless the input is already :pyn.
#
# str  - The String to convert.
# from - The source romanization type.
#
# Returns the result of Pinyin.convert_pyn_to_pinyin.
def self.convert_to_py(str, from)
  numbered = from == :pyn ? str : convert_romanization(str, from, :pyn)

  ZhongwenTools::Romanization::Pinyin.convert_pyn_to_pinyin(numbered)
end
|
13
|
+
|
14
|
+
# Internal: converts a romanized string to numbered pinyin.
#
# str  - The String to convert.
# from - The source romanization type; :py uses the pinyin converter and
#        anything else goes through convert_romanization.
#
# Returns the converted String, with hyphens re-added between syllables
# when the input was hyphenated.
def self.convert_to_pyn(str, from)
  # Keep a copy so the hyphenation check sees the untouched input.
  original = str.dup

  converted =
    if from == :py
      ZhongwenTools::Romanization::Pinyin.convert_pinyin_to_pyn(str)
    else
      convert_romanization(str, from, :pyn)
    end

  return converted unless hyphenated?(original)

  ZhongwenTools::Romanization::Pinyin.add_hyphens_to_pyn(converted)
end
|
27
|
+
|
28
|
+
module Pinyin
|
29
|
+
# Defines Pinyin.to_pinyin, Pinyin.to_py and Pinyin.to_pyn. Each accepts
# (str, from); when `from` is omitted (or falsy) it is taken from
# Romanization.romanization?.
%w(pinyin py pyn).each do |romanization|
  define_singleton_method(:"to_#{romanization}") do |*args|
    str, from = args
    from ||= ZhongwenTools::Romanization.romanization? str

    # Pinyin aliases are normalised; other types are passed on unchanged.
    source = py_type(from) || from

    ZhongwenTools::Romanization.convert(str, py_type(romanization), source)
  end
end
|
38
|
+
|
39
|
+
# Public: splits pinyin into syllables. The numbered-pinyin Regex is used
# when the string contains a digit 1-5, the toneless one otherwise.
#
# Returns an Array of syllable Strings with surrounding whitespace and
# hyphens removed.
def self.split_pyn(str)
  # FIXME: ignore punctuation
  syllable = str[/[1-5]/] ? ZhongwenTools::Regex.pyn : ZhongwenTools::Regex.pinyin_toneless

  str.scan(/(#{syllable})/).map { |captures| captures.first.strip.delete('-') }
end
|
45
|
+
|
46
|
+
# Public: splits tone-marked pinyin into syllables. Words are separated
# on spaces and then on apostrophes and hyphens; a capitalized word gets
# its capital back on the first syllable.
#
# Returns a flat Array of syllables.
def self.split_py(str)
  per_word = str.split(' ').map do |raw_word|
    word, is_capitalized = normalize_pinyin(raw_word)
    syllables = word.split(/['\-]/).flatten.map { |part| find_py(part) }

    recapitalize(syllables.flatten, is_capitalized)
  end

  per_word.flatten
end
|
60
|
+
|
61
|
+
# Public: checks if a string is pinyin.
|
62
|
+
# http://en.wikipedia.org/wiki/Pinyin
|
63
|
+
#
|
64
|
+
# Examples
|
65
|
+
# py?('nǐ hǎo')
|
66
|
+
# # => true
|
67
|
+
#
|
68
|
+
# Returns Boolean.
|
69
|
+
def self.py?(str)
  # NOTE: py regex does not include capitals with tones.
  # Whatever remains after removing punctuation, pinyin syllables,
  # whitespace and hyphens must be empty.
  ignorable = /(#{ ZhongwenTools::Regex.punc }|#{ ZhongwenTools::Regex.py }|[\s\-])/
  leftover = ZhongwenTools::Caps.downcase(str).gsub(ignorable, '')

  leftover.strip == ''
end
|
75
|
+
|
76
|
+
# Public: checks if a string is pinyin.
|
77
|
+
#
|
78
|
+
# Examples
|
79
|
+
# pyn?('pin1-yin1')
|
80
|
+
# # => true
|
81
|
+
#
|
82
|
+
# Returns Boolean.
|
83
|
+
def self.pyn?(str)
  # FIXME: use strip_punctuation method
  stripped = str.gsub(ZhongwenTools::Regex.punc, '').gsub(/[\s\-]/, '')
  normalized_str = ZhongwenTools::Caps.downcase(stripped)
  pyn_arr = split_pyn(normalized_str)

  # The syllables must rebuild the whole string, and each must be known.
  pyn_matches_properly?(pyn_arr, normalized_str) &&
    are_all_pyn_syllables_complete?(pyn_arr)
end
|
91
|
+
|
92
|
+
# Public: joins the syllables of every space-separated word with hyphens.
#
# Returns a String with the words separated by single spaces.
def self.add_hyphens_to_pyn(str)
  str.split(' ').map { |word| split_pyn(word).join('-') }.join(' ')
end
|
99
|
+
|
100
|
+
private
|
101
|
+
|
102
|
+
# Internal: true when the syllables, joined back together, reproduce the
# normalized string exactly (nothing was left unmatched).
def self.pyn_matches_properly?(pyn_arr, normalized_str)
  rebuilt = pyn_arr.join('')

  rebuilt == normalized_str
end
|
105
|
+
|
106
|
+
# Internal: true when every syllable, with its tone digits removed, is
# listed as a :pyn value in ROMANIZATIONS_TABLE.
def self.are_all_pyn_syllables_complete?(pyn_arr)
  known = ROMANIZATIONS_TABLE.map { |row| row[:pyn] }

  pyn_arr.reject { |syllable| known.include?(syllable.gsub(/[1-5]/, '')) }.empty?
end
|
115
|
+
|
116
|
+
# Internal: normalises a pinyin type name, case-insensitively.
#
# Returns :pyn for "pyn", :py for "py" or "pinyin", and nil otherwise.
def self.py_type(romanization)
  case romanization.to_s.downcase
  when 'pyn'          then :pyn
  when 'py', 'pinyin' then :py
  end
end
|
121
|
+
|
122
|
+
|
123
|
+
# Internal: lower-cases the pinyin and reports whether it was capitalized.
#
# Returns a two-element Array: [downcased String, Boolean].
def self.normalize_pinyin(pinyin)
  downcased = ZhongwenTools::Caps.downcase(pinyin)
  was_capitalized = capitalized?(pinyin)

  [downcased, was_capitalized]
end
|
126
|
+
|
127
|
+
# Internal: scans the string with Regex.py and returns, for every match,
# its first non-nil capture group.
def self.find_py(str)
  str.scan(ZhongwenTools::Regex.py).map { |groups| groups.compact.first }
end
|
131
|
+
|
132
|
+
# Internal: restores capitalization that normalize_pinyin removed.
#
# obj         - A String, or an Array whose first element is capitalized.
# capitalized - When falsy, obj is returned untouched.
#
# Returns the recapitalized String or flat Array, or nil when obj is
# neither exactly a String nor exactly an Array.
def self.recapitalize(obj, capitalized)
  return obj unless capitalized

  if obj.instance_of?(String)
    ZhongwenTools::Caps.capitalize(obj)
  elsif obj.instance_of?(Array)
    head = ZhongwenTools::Caps.capitalize(obj[0])
    [head, obj[1..-1]].flatten
  end
end
|
141
|
+
|
142
|
+
# Internal: converts real pinyin to pinyin number string.
|
143
|
+
#
|
144
|
+
# pinyin - A String for the pinyin.
|
145
|
+
#
|
146
|
+
# Examples
|
147
|
+
#
|
148
|
+
# convert_pinyin_to_pyn('Nǐ hǎo ma') #=> 'Ni3 hao3 ma5?'
|
149
|
+
#
|
150
|
+
# Returns a String in pinyin number format.
|
151
|
+
def self.convert_pinyin_to_pyn(pinyin)
  converted_words = pinyin.split(' ').map do |raw_word|
    # NOTE: a fully upper-case word is converted the same way as a word
    #       that is merely capitalized.
    word, is_capitalized = normalize_pinyin(raw_word)
    numbered = current_pyn(word, split_py(word))

    recapitalize(numbered, is_capitalized)
  end

  converted_words.join(' ')
end
|
166
|
+
|
167
|
+
# Internal: true when the first character changes under Caps.downcase,
# i.e. the string starts with a capital letter.
def self.capitalized?(str)
  first = str[0]

  first != ZhongwenTools::Caps.downcase(str[0])
end
|
170
|
+
|
171
|
+
# Internal: replaces each tone-marked syllable in a word with its
# numbered form, left to right, then strips apostrophes.
#
# pyn        - The (lower-cased) word to rewrite.
# pinyin_arr - The word's tone-marked syllables, in order.
#
# Returns the rewritten String.
def self.current_pyn(pyn, pinyin_arr)
  done = []

  pinyin_arr.each do |syllable|
    numbered = pinyin_replacement(syllable)

    pyn =
      if done.empty?
        pyn.sub(/#{syllable}/) { "#{Regexp.last_match(1)}#{numbered}" }
      else
        # Anchor after the syllables already converted so that only a
        # later occurrence of this syllable is replaced.
        pyn.sub(/(#{done.join('.*')}.*)#{syllable}/) { Regexp.last_match(1) + numbered }
      end

    done << numbered
  end

  pyn.gsub("'", '')
end
|
186
|
+
|
187
|
+
# Internal: converts one tone-marked syllable to numbered pinyin using
# the PYN_PY table, then moves the tone digit to the end of the syllable.
def self.pinyin_replacement(py)
  candidates = PYN_PY.values.select { |toned| py.include?(toned) }
  match = select_pinyin_match(candidates)
  replace = PYN_PY.find { |_numbered, toned| toned == match }.first

  with_digit = py.gsub(match, replace)
  with_digit.gsub(/([^\d ]*)(\d)([^\d ]*)/) do
    Regexp.last_match(1) + Regexp.last_match(3) + Regexp.last_match(2)
  end
end
|
196
|
+
|
197
|
+
# Internal: picks which candidate to replace. The candidate with the most
# bytes wins, so a tone-marked vowel such as 'è' beats 'n', 'r' or 'm'.
def self.select_pinyin_match(matches)
  longest = matches.sort { |a, b| a.bytesize <=> b.bytesize }.last

  # Edge case.. en/eng pyn -> py conversion is one way only, so only the
  # leading vowel is used for matches that start with an e.
  longest[/^(ē|é|ě|è|e)n?g?/] ? longest.chars.first : longest
end
|
204
|
+
|
205
|
+
|
206
|
+
# Internal: Replaces numbered pinyin with actual pinyin. Pinyin separated with hyphens are combined as one word.
|
207
|
+
#
|
208
|
+
# str - A String to replace with actual pinyin
|
209
|
+
#
|
210
|
+
# Examples
|
211
|
+
#
|
212
|
+
# convert_pyn_to_pinyin 'Ni3 hao3 ma5?' # => "Nǐ hǎo ma?"
|
213
|
+
#
|
214
|
+
#
|
215
|
+
# Returns a string with actual pinyin
|
216
|
+
def self.convert_pyn_to_pinyin(str)
  # Using gsub is ~8x faster than using scan and each.
  converted = str.gsub(Regex.pinyin_num) do
    whole, initial, vowels, ending, r_suffix, tone, hyphens = Regexp.last_match.captures

    syllable =
      if vowels.nil?
        # Pinyin without vowels, e.g. m, ng: look the whole match up.
        "#{PYN_PY[whole]}"
      elsif initial == '' && ['a', 'e', 'o'].include?(vowels[0, 1])
        # Syllables starting with a, e or o get an apostrophe
        # (http://www.pinyin.info/romanization/hanyu/apostrophes.html).
        "'#{PYN_PY["#{vowels}#{tone}"]}#{ending}#{r_suffix}"
      else
        "#{initial}#{PYN_PY["#{vowels}#{tone}"]}#{ending}#{r_suffix}"
      end

    # A run of two or more hyphens becomes one; a single hyphen is dropped.
    syllable + (hyphens.to_s.length > 1 ? '-' : '')
  end

  # Correct the apostrophes after a hyphen and at the very start.
  converted.gsub("-'", "-").sub(/^'/, '')
end
|
229
|
+
end
|
230
|
+
end
|
231
|
+
end
|
@@ -2,7 +2,8 @@
|
|
2
2
|
|
3
3
|
# NOTE: This table works for pyn -> pinyin conversion, but it introduces
|
4
4
|
# mistakes when converting pinyin to pyn. In practice, pinyin can't
|
5
|
-
# be converted to pyn
|
5
|
+
# be converted to pyn with complete accuracy unless it is properly
|
6
|
+
# formatted.
|
6
7
|
module ZhongwenTools
|
7
8
|
module Romanization
|
8
9
|
PYN_PY = {
|