zhongwen_tools 0.12.4 → 0.15.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +1 -1
- data/README.md +74 -165
- data/Rakefile +0 -1
- data/lib/zhongwen_tools/{string/caps.rb → caps.rb} +19 -1
- data/lib/zhongwen_tools/core.rb +19 -0
- data/lib/zhongwen_tools/core_ext/integer.rb +8 -0
- data/lib/zhongwen_tools/core_ext/string.rb +10 -0
- data/lib/zhongwen_tools/fullwidth.rb +102 -0
- data/lib/zhongwen_tools/integer_extension.rb +31 -0
- data/lib/zhongwen_tools/number/number_table.rb +44 -0
- data/lib/zhongwen_tools/number.rb +221 -0
- data/lib/zhongwen_tools/regex.rb +38 -22
- data/lib/zhongwen_tools/romanization/pinyin.rb +231 -0
- data/lib/zhongwen_tools/romanization/{pyn_to_py.rb → pinyin_table.rb} +2 -1
- data/lib/zhongwen_tools/romanization/romanization_table.rb +425 -0
- data/lib/zhongwen_tools/romanization.rb +199 -136
- data/lib/zhongwen_tools/{string/ruby19.rb → ruby_19.rb} +1 -2
- data/lib/zhongwen_tools/{conversion → script}/conversion_data +0 -0
- data/lib/zhongwen_tools/{conversion.rb → script.rb} +21 -34
- data/lib/zhongwen_tools/string_extension.rb +136 -0
- data/lib/zhongwen_tools/unicode.rb +25 -0
- data/lib/zhongwen_tools/uri.rb +14 -0
- data/lib/zhongwen_tools/version.rb +1 -1
- data/lib/zhongwen_tools/zhongwen.rb +29 -0
- data/lib/zhongwen_tools.rb +2 -3
- data/test/test_caps.rb +26 -0
- data/test/test_core.rb +13 -0
- data/test/test_fullwidth.rb +30 -0
- data/test/test_helper.rb +4 -12
- data/test/test_helpers/unload_zhongwen_tools_script.rb +5 -0
- data/test/test_integer_extension.rb +34 -0
- data/test/test_number.rb +79 -0
- data/test/test_pinyin.rb +68 -0
- data/test/test_regex.rb +41 -0
- data/test/test_romanization.rb +110 -133
- data/test/{test_conversion.rb → test_script.rb} +41 -44
- data/test/test_string_extension.rb +94 -0
- data/test/test_unicode.rb +27 -0
- data/test/test_uri.rb +16 -0
- data/test/test_zhongwen.rb +37 -0
- data/zhongwen_tools.gemspec +1 -1
- metadata +93 -52
- data/Gemfile.1.8.7 +0 -8
- data/lib/zhongwen_tools/conversion/string.rb +0 -19
- data/lib/zhongwen_tools/integer.rb +0 -28
- data/lib/zhongwen_tools/numbers.rb +0 -195
- data/lib/zhongwen_tools/regex/ruby18.rb +0 -15
- data/lib/zhongwen_tools/romanization/conversion_table.rb +0 -425
- data/lib/zhongwen_tools/romanization/detect.rb +0 -141
- data/lib/zhongwen_tools/romanization/string.rb +0 -36
- data/lib/zhongwen_tools/string/fullwidth.rb +0 -85
- data/lib/zhongwen_tools/string/ruby18.rb +0 -96
- data/lib/zhongwen_tools/string.rb +0 -164
- data/test/test_integer.rb +0 -31
- data/test/test_numbers.rb +0 -68
- data/test/test_string.rb +0 -133
@@ -1,111 +1,120 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
-
require 'zhongwen_tools/
|
3
|
-
require 'zhongwen_tools/romanization/
|
4
|
-
require 'zhongwen_tools/romanization/
|
5
|
-
|
6
|
-
|
7
|
-
#
|
8
|
-
#
|
9
|
-
# Pinyin mā má mǎ mà ma
|
10
|
-
# Tongyong Pinyin ma má mǎ mà må # this will be difficult.
|
11
|
-
# Wade–Giles ma¹ ma² ma³ ma⁴ ma⁰
|
12
|
-
# Zhuyin ㄇㄚ ㄇㄚˊ ㄇㄚˇ ㄇㄚˋ •ㄇㄚ
|
2
|
+
require 'zhongwen_tools/romanization/pinyin'
|
3
|
+
require 'zhongwen_tools/romanization/pinyin_table'
|
4
|
+
require 'zhongwen_tools/romanization/romanization_table'
|
5
|
+
|
6
|
+
# NOTE: Creates several dynamic Modules and their associated methods.
|
7
|
+
# e.g. ZhongwenTools::Romanization::ZhuyinFuhao.to_bpmf
|
8
|
+
# ZhongwenTools::Romanization::WadeGiles.to_wg
|
13
9
|
module ZhongwenTools
|
14
10
|
module Romanization
|
15
|
-
|
11
|
+
def self.convert(str, to, from)
|
12
|
+
# NOTE: don't convert if it already is converted.
|
13
|
+
return str if to == from
|
16
14
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
15
|
+
if to == :py
|
16
|
+
convert_to_py(str, from)
|
17
|
+
elsif to == :pyn
|
18
|
+
convert_to_pyn(str, from)
|
19
|
+
else
|
20
|
+
convert_to_other(str, from, to)
|
21
21
|
end
|
22
22
|
end
|
23
23
|
|
24
|
-
|
25
|
-
|
26
|
-
#
|
27
|
-
#
|
28
|
-
#
|
24
|
+
# Public: Checks the romanization type for the string.
|
25
|
+
# Romanization types are like ducks. If it walks, talks, and acts
|
26
|
+
# like a duck, it is a duck. Therefore, where a String is both
|
27
|
+
# pinyin and another romanization system, it will be identified
|
28
|
+
# as pinyin. If you need to determine whether a py/pyn string
|
29
|
+
# also belongs to another romanization
|
30
|
+
# system, use the romanization module's specific function.
|
29
31
|
#
|
30
|
-
#
|
31
|
-
# Otherwise, the first argument is a String and the second argument is the :from option.
|
32
|
-
#
|
33
|
-
# Examples:
|
32
|
+
# str - a String to test.
|
34
33
|
#
|
34
|
+
# Examples
|
35
|
+
# romanization?('hao3') #=> :pyn
|
36
|
+
# romanization?('zzzz') #=> nil
|
35
37
|
#
|
36
|
-
# _romanization_options('hao3', :pyn) #=> 'hao3' :pyn
|
37
|
-
# _romanization_options('hao3') #=> 'hao3', :pyn
|
38
38
|
#
|
39
|
-
# Returns
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
39
|
+
# Returns a Symbol for the romanization system or nil if the string is not
|
40
|
+
# a romanization.
|
41
|
+
def self.romanization?(str)
|
42
|
+
if ZhongwenTools::Romanization::Pinyin.py?(str)
|
43
|
+
:py
|
44
|
+
elsif ZhongwenTools::Romanization::Pinyin.pyn?(str)
|
45
|
+
:pyn
|
46
|
+
elsif ZhongwenTools::Romanization::ZhuyinFuhao.bpmf?(str)
|
47
|
+
:bpmf
|
48
|
+
elsif ZhongwenTools::Romanization::WadeGiles.wg?(str)
|
49
|
+
:wg
|
50
|
+
elsif ZhongwenTools::Romanization::TongyongPinyin.typy?(str)
|
51
|
+
:typy
|
52
|
+
elsif ZhongwenTools::Romanization::Yale.yale?(str)
|
53
|
+
:yale
|
54
|
+
elsif ZhongwenTools::Romanization::MPS2.mps2?(str)
|
55
|
+
:mps2
|
47
56
|
end
|
48
|
-
|
49
|
-
[str, from.to_sym]
|
50
57
|
end
|
51
58
|
|
52
|
-
|
53
|
-
|
54
|
-
# str - A String to replace with actual pinyin
|
55
|
-
#
|
56
|
-
# Examples
|
57
|
-
# _to_pinyin 'Ni3 hao3 ma5?'
|
58
|
-
# # => "Nǐ hǎo ma?"
|
59
|
-
# # => 'Zhong1-guo2-ren2'
|
60
|
-
#
|
61
|
-
#
|
62
|
-
# Returns a string with actual pinyin
|
63
|
-
def _to_pinyin str
|
64
|
-
regex = Regex.pinyin_num
|
65
|
-
# Using gsub is ~8x faster than using scan and each.
|
66
|
-
# Explanation: if it's pinyin without vowels, e.g. m, ng, then convert,
|
67
|
-
# otherwise, check if it needs an apostrophe (http://www.pinyin.info/romanization/hanyu/apostrophes.html).
|
68
|
-
# If it does, add it and then convert. Otherwise, just convert.
|
69
|
-
# Oh, and if double hyphens are used, replace them with one hyphen.
|
70
|
-
# And finally, correct those apostrophes at the very end.
|
71
|
-
str.gsub(regex) do
|
72
|
-
($3.nil? ? "#{PYN_PY[$1]}" : ($2 == '' && ['a','e','o'].include?($3[0,1]))? "'#{PYN_PY["#{$3}#{$6}"]}#{$4}#{$5}" : "#{$2}#{PYN_PY["#{$3}#{$6}"]}#{$4}#{$5}") + (($7.to_s.length > 1) ? '-' : '')
|
73
|
-
end.gsub("-'","-").sub(/^'/,'')
|
74
|
-
end
|
59
|
+
def split(str, type = nil)
|
60
|
+
type ||= romanization?(str)
|
75
61
|
|
76
|
-
|
77
|
-
|
78
|
-
begin
|
79
|
-
tokens = str.send("split_#{from}").uniq
|
80
|
-
rescue
|
81
|
-
tokens = str.split(/[ \-]/).uniq
|
62
|
+
if type == :py
|
63
|
+
elsif type == :pyn
|
82
64
|
end
|
83
65
|
|
84
|
-
|
85
|
-
|
66
|
+
end
|
67
|
+
|
68
|
+
private
|
69
|
+
|
70
|
+
def self.convert_romanization(str, from, to)
|
71
|
+
# NOTE: extract/refactor tokens cause tests to fail.
|
72
|
+
if from == :pyn
|
73
|
+
tokens = ZhongwenTools::Romanization::Pinyin.split_pyn(str).uniq
|
74
|
+
else
|
75
|
+
tokens = romanization_module(from).send(:split, str).uniq
|
76
|
+
end
|
77
|
+
|
78
|
+
tokens.collect do |t|
|
79
|
+
search, replace = find_token_replacement(t, str, to, from)
|
86
80
|
str = str.gsub(search, replace)
|
87
81
|
end
|
88
82
|
|
89
83
|
str
|
90
84
|
end
|
91
85
|
|
92
|
-
def
|
86
|
+
def self.convert_to_other(str, from, to)
|
87
|
+
if from == :py
|
88
|
+
str = ZhongwenTools::Romanization::Pinyin.convert_pinyin_to_pyn(str)
|
89
|
+
from = :pyn
|
90
|
+
end
|
91
|
+
|
92
|
+
str = convert_romanization(str, from, to)
|
93
|
+
|
94
|
+
if to == :bpmf
|
95
|
+
str.gsub('-', '')
|
96
|
+
else
|
97
|
+
str
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
def self.find_token_replacement(token, str, to, from)
|
93
102
|
search = token.gsub(/[1-5].*/,'')
|
94
103
|
|
95
|
-
replace =
|
96
|
-
replace =
|
104
|
+
replace = token_replacement(token, from).fetch(to){ search }
|
105
|
+
replace = fix_capitalization(str, token, replace)
|
97
106
|
|
98
107
|
|
99
108
|
[search, replace]
|
100
109
|
end
|
101
110
|
|
102
|
-
def
|
111
|
+
def self.fix_capitalization(str, token, replace)
|
103
112
|
replace = replace.capitalize if(token.downcase != token)
|
104
113
|
|
105
114
|
replace
|
106
115
|
end
|
107
116
|
|
108
|
-
def
|
117
|
+
def self.token_replacement(token, from = nil)
|
109
118
|
token = token.downcase.gsub(/[1-5].*/,'')
|
110
119
|
result = ROMANIZATIONS_TABLE.find do |x|
|
111
120
|
if from.nil?
|
@@ -118,90 +127,144 @@ module ZhongwenTools
|
|
118
127
|
result || {}
|
119
128
|
end
|
120
129
|
|
121
|
-
def _convert_romanization str, to, from
|
122
|
-
return str if to == from
|
123
130
|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
131
|
+
# <module_name>::<romanization_type>?(str)
|
132
|
+
#
|
133
|
+
# Public: Checks if a String is a romanization:
|
134
|
+
# Zhuyin Fuhao, Tongyong Pinyin, Wade Giles, MPS2 or Yale.
|
135
|
+
# http://en.wikipedia.org/wiki/Tongyong_Pinyin
|
136
|
+
# http://pinyin.info/romanization/tongyong/
|
137
|
+
# http://en.wikipedia.org/wiki/Wade%E2%80%93Giles
|
138
|
+
# http://en.wikipedia.org/wiki/Bopomofo
|
139
|
+
# http://pinyin.info/romanization/bopomofo/index.html
|
140
|
+
#
|
141
|
+
# str - a String. Optional if the object calling the method is a String.
|
142
|
+
#
|
143
|
+
# Examples
|
144
|
+
#
|
145
|
+
# typy?('chuei niou') #=> true
|
146
|
+
# wg?('Mao2 Tse2 Tung1') #=> true
|
147
|
+
# bpmf?('ㄊㄥ') #=> true
|
148
|
+
#
|
149
|
+
# Returns a boolean.
|
150
|
+
def self.create_detect_method(romanization_module, name)
|
151
|
+
romanization_module.define_singleton_method("#{name}?") do |str|
|
152
|
+
|
153
|
+
regex = romanization_module == :ZhuyinFuhao ? ZhongwenTools::Regex.bopomofo : ZhongwenTools::Romanization.detect_regex(name.to_sym)
|
154
|
+
normalized_str = str.downcase.gsub(ZhongwenTools::Regex.punc,'').gsub(/[1-5\s\-']/,'')
|
155
|
+
#TODO: ignore tonal marks from other systems wade giles, tongyong etc.
|
156
|
+
normalized_str.scan(regex).join == normalized_str
|
157
|
+
end
|
145
158
|
end
|
146
159
|
|
147
|
-
|
148
|
-
|
149
|
-
|
160
|
+
# <module_name>::to_<romanization_type>(str)
|
161
|
+
# Public: Converts to the given romanization from pyn (pinyin using numbers instead of tone marks).
|
162
|
+
#
|
163
|
+
# str - a String to be converted
|
164
|
+
#
|
165
|
+
# Examples:
|
166
|
+
#
|
167
|
+
#
|
168
|
+
#
|
169
|
+
# ZhongwenTools::Romanization::ZhuyinFuhao.to_zyfh('Mao2 Ze2-dong1') # => 'ㄇㄠ2 ㄗㄜ2ㄉㄨㄥ1'
|
170
|
+
#
|
171
|
+
# Returns a String.
|
172
|
+
def self.create_convert_method(romanization_module, romanization_name, name)
|
173
|
+
romanization_module.define_singleton_method("to_#{ name }") do |*args|
|
174
|
+
str, from = args
|
175
|
+
from ||= ZhongwenTools::Romanization.romanization?(str)
|
150
176
|
|
151
|
-
|
152
|
-
pys = word.split(/['\-]/).flatten.map{|x| x.scan(Regex.py).map{|x| (x - [nil])[0]}}.flatten
|
153
|
-
_current_pyn(word, pys)
|
177
|
+
ZhongwenTools::Romanization.convert str, romanization_name, from.to_sym
|
154
178
|
end
|
155
|
-
|
156
|
-
pyn.join(' ')
|
157
179
|
end
|
158
180
|
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
181
|
+
# <module_name>::split(str)
|
182
|
+
# Public: splits the romanization's string.
|
183
|
+
#
|
184
|
+
# str - a String to be split
|
185
|
+
#
|
186
|
+
# Examples
|
187
|
+
#
|
188
|
+
#
|
189
|
+
# split('zhong1guo2')
|
190
|
+
# # => ['zhong1', 'guo2']
|
191
|
+
#
|
192
|
+
# Returns an Array of Strings.
|
193
|
+
def self.create_split_method(romanization_module, name)
|
194
|
+
regex = romanization_module == :ZhuyinFuhao ? /([#{ZhongwenTools::Regex.bopomofo}]*)/ : /(#{ZhongwenTools::Romanization.detect_regex(name.to_sym)}*)/
|
195
|
+
|
196
|
+
romanization_module.define_singleton_method("split") do |str|
|
197
|
+
# TODO: ignore tonal marks from other systems wade giles, tongyong etc.
|
198
|
+
results = str.scan(regex).map do |arr|
|
199
|
+
arr[0].strip.gsub('-','')
|
200
|
+
end
|
201
|
+
|
202
|
+
results.flatten - ['']
|
166
203
|
end
|
204
|
+
end
|
167
205
|
|
168
|
-
|
206
|
+
# Internal: Produces a Regexp for a romanization type.
|
207
|
+
#
|
208
|
+
# type - a Symbol for the romanization type.
|
209
|
+
#
|
210
|
+
# Examples:
|
211
|
+
#
|
212
|
+
#
|
213
|
+
# detect_regex(:typy) #=> <Regexp>
|
214
|
+
#
|
215
|
+
# Returns a Regexp.
|
216
|
+
def self.detect_regex(type)
|
217
|
+
/#{romanization_values(type).sort{|x,y| x.size <=> y.size}.reverse.join('|')}/
|
169
218
|
end
|
170
219
|
|
171
|
-
|
172
|
-
|
173
|
-
|
220
|
+
# Internal: Selects the romanization values for a particular romanization type.
|
221
|
+
#
|
222
|
+
# type - a Symbol for the romanization type.
|
223
|
+
#
|
224
|
+
# Examples:
|
225
|
+
#
|
226
|
+
#
|
227
|
+
# romanization_values(:typy) #=> ['a', ..., 'r']
|
228
|
+
#
|
229
|
+
# Returns an Array that contains the romanization's values.
|
230
|
+
def self.romanization_values(type)
|
231
|
+
results = ZhongwenTools::Romanization::ROMANIZATIONS_TABLE.map do |r|
|
232
|
+
"[#{r[type][0]}#{r[type][0].upcase}]#{r[type][1..-1]}" || r[:pyn]
|
174
233
|
end
|
175
|
-
match = select_pinyin_match(matches)
|
176
|
-
replace = PYN_PY.find{|k,v| k if v == match}[0]
|
177
234
|
|
178
|
-
|
235
|
+
results.flatten
|
179
236
|
end
|
180
237
|
|
181
|
-
def
|
182
|
-
|
183
|
-
|
238
|
+
def self.romanization_module(type)
|
239
|
+
module_name = RomanizationTypes.find{ |k,v| v.include?(type.to_s) }.first
|
240
|
+
ZhongwenTools::Romanization.const_get(module_name)
|
241
|
+
end
|
184
242
|
|
185
|
-
|
186
|
-
|
243
|
+
def self.hyphenated?(str)
|
244
|
+
!str[/\-/].nil?
|
187
245
|
end
|
188
246
|
|
247
|
+
# Internal: Creates romanization modules and their methods.
|
248
|
+
RomanizationTypes = {
|
249
|
+
ZhuyinFuhao: %w(bpmf zhuyin_fuhao zhuyinfuhao zyfh zhyfh bopomofo),
|
250
|
+
WadeGiles: %w(wg wade_giles),
|
251
|
+
Yale: ['yale'],
|
252
|
+
TongyongPinyin: %w(typy tongyong tongyong_pinyin),
|
253
|
+
MPS2: ['mps2']
|
254
|
+
}
|
189
255
|
|
190
|
-
|
191
|
-
|
192
|
-
|
256
|
+
RomanizationTypes.each do |module_name, names|
|
257
|
+
romanization_module = self.const_set(module_name, Module.new) unless self.const_defined?(module_name)
|
258
|
+
romanization_module ||= self.const_get(module_name)
|
193
259
|
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
elsif [:tongyong, :typy, :ty].include? type
|
199
|
-
:typy
|
200
|
-
elsif type == :pinyin
|
201
|
-
:py
|
260
|
+
romanization_name = names.first.to_sym
|
261
|
+
|
262
|
+
names.each do |name|
|
263
|
+
create_convert_method(romanization_module, romanization_name, name)
|
202
264
|
end
|
265
|
+
|
266
|
+
create_detect_method(romanization_module, romanization_name)
|
267
|
+
create_split_method(romanization_module, romanization_name)
|
203
268
|
end
|
204
269
|
end
|
205
270
|
end
|
206
|
-
|
207
|
-
require 'zhongwen_tools/romanization/detect'
|
File without changes
|
@@ -1,39 +1,27 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
-
require 'zhongwen_tools/string'
|
3
2
|
|
4
3
|
module ZhongwenTools
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
def to_zhs(str = nil)
|
10
|
-
str ||= self
|
11
|
-
|
12
|
-
convert(:zhs, str)
|
4
|
+
module Script
|
5
|
+
def self.zht?(str)
|
6
|
+
str == convert(:zht, str) || str == convert(:zhhk, str)
|
13
7
|
end
|
14
8
|
|
15
|
-
def
|
16
|
-
str
|
17
|
-
|
18
|
-
convert(:zht, str)
|
9
|
+
def self.zhs?(str)
|
10
|
+
str == convert(:zhs, str)
|
19
11
|
end
|
20
12
|
|
21
|
-
def
|
22
|
-
|
13
|
+
def self.to_zhs(str, type)
|
14
|
+
type = type.to_sym
|
15
|
+
fail ArgumentError unless [:zhs, :zhcn].include? type
|
23
16
|
|
24
|
-
convert(
|
17
|
+
convert(type, str)
|
25
18
|
end
|
26
19
|
|
27
|
-
def
|
28
|
-
|
20
|
+
def self.to_zht(str, type)
|
21
|
+
type = type.to_sym
|
22
|
+
fail ArgumentError unless [:zht, :zhtw, :zhhk].include? type
|
29
23
|
|
30
|
-
convert(
|
31
|
-
end
|
32
|
-
|
33
|
-
def to_zhcn(str = nil)
|
34
|
-
str ||= self
|
35
|
-
|
36
|
-
convert(:zhcn, str)
|
24
|
+
convert(type, str)
|
37
25
|
end
|
38
26
|
|
39
27
|
ZH_TYPES = {
|
@@ -42,9 +30,10 @@ module ZhongwenTools
|
|
42
30
|
:zhtw => [2,0],
|
43
31
|
:zhhk => [3,0],
|
44
32
|
:zhcn => [4,1]
|
45
|
-
}
|
33
|
+
} unless defined?(ZH_TYPES)
|
34
|
+
|
35
|
+
ZH_CONVERSION_TABLE = [] unless defined?(ZH_CONVERSION_TABLE)
|
46
36
|
|
47
|
-
ZH_CONVERSION_TABLE = []
|
48
37
|
|
49
38
|
private
|
50
39
|
# Conversion data and algorithm shamelessly stolen from chinese_convt gem.
|
@@ -56,8 +45,8 @@ module ZhongwenTools
|
|
56
45
|
# Zhongwen Tools is ~12X faster.
|
57
46
|
# + Zhongwen Tools uses Ruby's nifty str[/regex/] = replacement
|
58
47
|
# instead of indices. Conversion tests using indices fail with Ruby 1.8.
|
59
|
-
def load_table
|
60
|
-
filename = File.expand_path('../
|
48
|
+
def self.load_table
|
49
|
+
filename = File.expand_path('../script/conversion_data', __FILE__)
|
61
50
|
File.open(filename).read.split("\n&\n").each do |group|
|
62
51
|
ZH_CONVERSION_TABLE << group.split("\n").map do |type|
|
63
52
|
Hash[ type.split(',').map{ |term| term.split(':') } ]
|
@@ -67,12 +56,12 @@ module ZhongwenTools
|
|
67
56
|
nil
|
68
57
|
end
|
69
58
|
|
70
|
-
def convert(type, str)
|
59
|
+
def self.convert(type, str)
|
71
60
|
load_table if ZH_CONVERSION_TABLE.length == 0
|
72
61
|
types = ZH_TYPES[type] || ZH_TYPES[:zht]
|
73
62
|
|
74
63
|
begin
|
75
|
-
str_len =
|
64
|
+
str_len = str.chars.to_a.size
|
76
65
|
n = (str_len < 6)? str_len : 6
|
77
66
|
convert_zhongwen(str.dup, str.dup, types, n)
|
78
67
|
|
@@ -81,7 +70,7 @@ module ZhongwenTools
|
|
81
70
|
end
|
82
71
|
end
|
83
72
|
|
84
|
-
def convert_zhongwen(str0, str1, types, n)
|
73
|
+
def self.convert_zhongwen(str0, str1, types, n)
|
85
74
|
ZH_CONVERSION_TABLE.last(n).each do |group|
|
86
75
|
types.each do |t|
|
87
76
|
group[t].each do |key , value|
|
@@ -97,5 +86,3 @@ module ZhongwenTools
|
|
97
86
|
end
|
98
87
|
end
|
99
88
|
end
|
100
|
-
|
101
|
-
require 'zhongwen_tools/conversion/string'
|
@@ -0,0 +1,136 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module ZhongwenTools
|
4
|
+
module StringExtension
|
5
|
+
def capitalize
|
6
|
+
ZhongwenTools::Caps.capitalize(self)
|
7
|
+
end
|
8
|
+
|
9
|
+
def zh_downcase
|
10
|
+
ZhongwenTools::Caps.downcase(self)
|
11
|
+
end
|
12
|
+
|
13
|
+
def zh_upcase
|
14
|
+
ZhongwenTools::Caps.upcase(self)
|
15
|
+
end
|
16
|
+
|
17
|
+
def has_zh?
|
18
|
+
ZhongwenTools::Zhongwen.has_zh?(self)
|
19
|
+
end
|
20
|
+
|
21
|
+
def has_zh_punctuation?
|
22
|
+
ZhongwenTools::Zhongwen.has_zh_punctuation?(self)
|
23
|
+
end
|
24
|
+
|
25
|
+
def zh?
|
26
|
+
ZhongwenTools::Zhongwen.zh?(self)
|
27
|
+
end
|
28
|
+
|
29
|
+
def strip_zh_punctuation
|
30
|
+
ZhongwenTools::Zhongwen.strip_zh_punctuation(self)
|
31
|
+
end
|
32
|
+
|
33
|
+
def uri_encode
|
34
|
+
ZhongwenTools::URI.encode(self)
|
35
|
+
end
|
36
|
+
|
37
|
+
def uri_escape
|
38
|
+
ZhongwenTools::URI.escape(self)
|
39
|
+
end
|
40
|
+
|
41
|
+
def ascii?
|
42
|
+
ZhongwenTools::Unicode.ascii?(self)
|
43
|
+
end
|
44
|
+
|
45
|
+
def multibyte?
|
46
|
+
ZhongwenTools::Unicode.multibyte?(self)
|
47
|
+
end
|
48
|
+
|
49
|
+
def halfwidth?
|
50
|
+
ZhongwenTools::Fullwidth.halfwidth?(self)
|
51
|
+
end
|
52
|
+
|
53
|
+
def fullwidth?
|
54
|
+
ZhongwenTools::Fullwidth.fullwidth?(self)
|
55
|
+
end
|
56
|
+
|
57
|
+
def to_halfwidth
|
58
|
+
ZhongwenTools::Fullwidth.to_halfwidth(self)
|
59
|
+
end
|
60
|
+
|
61
|
+
def to_codepoint
|
62
|
+
ZhongwenTools::Unicode.to_codepoint(self)
|
63
|
+
end
|
64
|
+
|
65
|
+
def from_codepoint
|
66
|
+
ZhongwenTools::Unicode.from_codepoint(self)
|
67
|
+
end
|
68
|
+
|
69
|
+
def to_pinyin(from = nil)
|
70
|
+
ZhongwenTools::Romanization::Pinyin::to_py(self, from)
|
71
|
+
end
|
72
|
+
|
73
|
+
alias_method :to_py, :to_pinyin
|
74
|
+
|
75
|
+
def to_pyn(from = nil)
|
76
|
+
ZhongwenTools::Romanization::Pinyin::to_pyn(self, from)
|
77
|
+
end
|
78
|
+
|
79
|
+
def to_bpmf(from = nil)
|
80
|
+
ZhongwenTools::Romanization::ZhuyinFuhao::to_bpmf(self, from)
|
81
|
+
end
|
82
|
+
|
83
|
+
alias_method :to_zyfh, :to_bpmf
|
84
|
+
alias_method :to_zhyfh, :to_bpmf
|
85
|
+
alias_method :to_bopomofo, :to_bpmf
|
86
|
+
|
87
|
+
def to_wg(from = nil)
|
88
|
+
ZhongwenTools::Romanization::WadeGiles::to_wg(self, from)
|
89
|
+
end
|
90
|
+
|
91
|
+
alias_method :to_wade_giles, :to_wg
|
92
|
+
|
93
|
+
def to_yale(from = nil)
|
94
|
+
ZhongwenTools::Romanization::Yale::to_yale(self, from)
|
95
|
+
end
|
96
|
+
|
97
|
+
def to_typy(from = nil)
|
98
|
+
ZhongwenTools::Romanization::TongyongPinyin::to_typy(self, from)
|
99
|
+
end
|
100
|
+
|
101
|
+
alias_method :to_tongyong, :to_typy
|
102
|
+
alias_method :to_tongyong_pinyin, :to_typy
|
103
|
+
|
104
|
+
def to_mps2(from = nil)
|
105
|
+
ZhongwenTools::Romanization::MPS2::to_mps2(self, from)
|
106
|
+
end
|
107
|
+
|
108
|
+
def zhs?
|
109
|
+
ZhongwenTools::Script.zhs?(self)
|
110
|
+
end
|
111
|
+
|
112
|
+
def zht?
|
113
|
+
ZhongwenTools::Script.zht?(self)
|
114
|
+
end
|
115
|
+
|
116
|
+
def to_zhcn
|
117
|
+
ZhongwenTools::Script.to_zhs(self, :zhcn)
|
118
|
+
end
|
119
|
+
|
120
|
+
def to_zhhk
|
121
|
+
ZhongwenTools::Script.to_zht(self, :zhhk)
|
122
|
+
end
|
123
|
+
|
124
|
+
def to_zhs
|
125
|
+
ZhongwenTools::Script.to_zhs(self, :zhs)
|
126
|
+
end
|
127
|
+
|
128
|
+
def to_zht
|
129
|
+
ZhongwenTools::Script.to_zht(self, :zht)
|
130
|
+
end
|
131
|
+
|
132
|
+
def to_zhtw
|
133
|
+
ZhongwenTools::Script.to_zht(self, :zhtw)
|
134
|
+
end
|
135
|
+
end
|
136
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module ZhongwenTools
|
4
|
+
module Unicode
|
5
|
+
def self.to_codepoint(str)
|
6
|
+
str.chars.map{ |c| "\\u%04x" % c.unpack("U")[0] }.join
|
7
|
+
end
|
8
|
+
|
9
|
+
def self.from_codepoint(str)
|
10
|
+
results = (str.split(/\\?u/) - ['']).map do |s|
|
11
|
+
[s.hex].pack("U")
|
12
|
+
end
|
13
|
+
|
14
|
+
results.join
|
15
|
+
end
|
16
|
+
|
17
|
+
def self.ascii?(str)
|
18
|
+
str.chars.to_a.size == str.bytes.to_a.size
|
19
|
+
end
|
20
|
+
|
21
|
+
def self.multibyte?(str)
|
22
|
+
!ascii?(str)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|