zhongwen_tools 0.9.0 → 0.11.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/Rakefile +0 -41
- data/lib/zhongwen_tools.rb +3 -5
- data/lib/zhongwen_tools/conversion.rb +20 -17
- data/lib/zhongwen_tools/integer.rb +1 -1
- data/lib/zhongwen_tools/numbers.rb +2 -1
- data/lib/zhongwen_tools/regex.rb +4 -5
- data/lib/zhongwen_tools/romanization.rb +90 -120
- data/lib/zhongwen_tools/romanization/conversion_table.rb +417 -417
- data/lib/zhongwen_tools/romanization/detect.rb +68 -39
- data/lib/zhongwen_tools/romanization/pyn_to_py.rb +2 -1
- data/lib/zhongwen_tools/string.rb +3 -3
- data/lib/zhongwen_tools/version.rb +1 -1
- data/test/test_romanization.rb +39 -7
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0cbc9e019d51f46d9c6b609bbc711c4f3b7dfde5
|
4
|
+
data.tar.gz: 06a3a6ff340d5104e205fadff0989301ddc521bd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3599feaa1f681e31d8b7ed58c24c980b5317352f95d32a90a6b27f4ffdc62592c1048efcbe5dde3fbb60d1ea05bb0bb7053916833fd60ba58434599bc9fc7d18
|
7
|
+
data.tar.gz: 819c5c711845e7f2b9f9f39af4042c0c6f80ff64b7b86d3f3b486c7bc5418c669d9f3ee73e7988f849e5201d6ddd8b18f615927ed0511befb57dd2c21a5e9e52
|
data/README.md
CHANGED
data/Rakefile
CHANGED
@@ -8,46 +8,5 @@ Rake::TestTask.new do |t|
|
|
8
8
|
t.test_files = FileList['test/test*.rb']
|
9
9
|
end
|
10
10
|
|
11
|
-
|
12
|
-
|
13
11
|
desc "Run tests"
|
14
12
|
task :default => :test
|
15
|
-
|
16
|
-
|
17
|
-
def setup_18
|
18
|
-
commands = [
|
19
|
-
'cp Gemfile Gemfile.bak',
|
20
|
-
'mv .ruby-version-1.8.7 .ruby-version',
|
21
|
-
'cp Gemfile.1.8.7 Gemfile',
|
22
|
-
'bundle install --local',
|
23
|
-
]
|
24
|
-
|
25
|
-
commands.each{ |c| `#{ c }` }
|
26
|
-
end
|
27
|
-
|
28
|
-
def teardown_18
|
29
|
-
commands = [
|
30
|
-
'mv .ruby-version .ruby-version-1.8.7 ',
|
31
|
-
'cp Gemfile.bak Gemfile && rm Gemfile.bak',
|
32
|
-
'bundle install --local'
|
33
|
-
]
|
34
|
-
|
35
|
-
commands.each{ |c|
|
36
|
-
`#{ c }`
|
37
|
-
}
|
38
|
-
end
|
39
|
-
|
40
|
-
|
41
|
-
namespace :ruby_18 do
|
42
|
-
desc "Switch to 1.8.7"
|
43
|
-
task :setup do
|
44
|
-
setup_18 if File.exist?('.ruby-version-1.8.7')
|
45
|
-
puts 'Using Ruby 1.8.7'
|
46
|
-
end
|
47
|
-
|
48
|
-
desc "Teardown to 1.8.7"
|
49
|
-
task :teardown do
|
50
|
-
teardown_18 unless File.exist?('.ruby-version-1.8.7')
|
51
|
-
puts 'Using Ruby 2.1.0'
|
52
|
-
end
|
53
|
-
end
|
data/lib/zhongwen_tools.rb
CHANGED
@@ -1,9 +1,7 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
-
require
|
3
|
-
require
|
4
|
-
require
|
5
|
-
#require File.expand_path("../zhongwen_tools/romanization", __FILE__)
|
6
|
-
#require File.expand_path("../zhongwen_tools/conversion", __FILE__)
|
2
|
+
require 'zhongwen_tools/string'
|
3
|
+
require 'zhongwen_tools/numbers'
|
4
|
+
require 'zhongwen_tools/version'
|
7
5
|
|
8
6
|
module ZhongwenTools
|
9
7
|
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
+
require 'zhongwen_tools/string'
|
2
3
|
|
3
4
|
module ZhongwenTools
|
4
5
|
|
@@ -68,31 +69,33 @@ module ZhongwenTools
|
|
68
69
|
|
69
70
|
def convert(type, str)
|
70
71
|
load_table if ZH_CONVERSION_TABLE.length == 0
|
71
|
-
|
72
|
+
types = ZH_TYPES[type] || ZH_TYPES[:zht]
|
72
73
|
|
73
74
|
begin
|
74
|
-
str0 = str.dup
|
75
|
-
str1 = str.dup
|
76
75
|
str_len = ZhongwenTools::String.size(str)
|
77
|
-
n = (
|
78
|
-
|
79
|
-
arr.each do |t|
|
80
|
-
group[t].each do |key , value|
|
81
|
-
while !! q = str0.index( key )
|
82
|
-
str0[/#{key}/] = "#" * value.size
|
83
|
-
str1[/#{key}/] = value
|
84
|
-
end
|
85
|
-
end
|
86
|
-
end
|
87
|
-
end
|
88
|
-
|
89
|
-
str1
|
76
|
+
n = (str_len < 6)? str_len : 6
|
77
|
+
convert_zhongwen(str.dup, str.dup, types, n)
|
90
78
|
|
91
79
|
rescue
|
92
80
|
"[#{$!}]"
|
93
81
|
end
|
94
82
|
end
|
83
|
+
|
84
|
+
def convert_zhongwen(str0, str1, types, n)
|
85
|
+
ZH_CONVERSION_TABLE.last(n).each do |group|
|
86
|
+
types.each do |t|
|
87
|
+
group[t].each do |key , value|
|
88
|
+
until str0.index(key).nil?
|
89
|
+
str0[/#{key}/] = "#" * value.size
|
90
|
+
str1[/#{key}/] = value
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
str1
|
97
|
+
end
|
95
98
|
end
|
96
99
|
end
|
97
100
|
|
98
|
-
require
|
101
|
+
require 'zhongwen_tools/conversion/string'
|
data/lib/zhongwen_tools/regex.rb
CHANGED
@@ -4,13 +4,13 @@ module ZhongwenTools
|
|
4
4
|
extend self
|
5
5
|
|
6
6
|
def pyn
|
7
|
-
#
|
8
|
-
/(#{pyn_regexes.values.join('|')})([1-5])?([\s\-]+)?/
|
7
|
+
/(#{pyn_regexes.values.join('|')}|r)([1-5])?([\s\-]+)?/
|
9
8
|
end
|
10
9
|
|
11
10
|
def py
|
12
|
-
#
|
13
|
-
|
11
|
+
# FIXME: need to detect Ālābó
|
12
|
+
# ([ĀÁǍÀA][io]?|[io]?|[][āáǎàaēéěèeūúǔùu]?o?|[ĒÉĚÈE]i?|[]i?|[ŌÓǑÒO]u?|[]u?|u[āáǎàaēoēéěèe]?i?|[]e?)(n?g?r?)){1,}
|
13
|
+
/(#{pyn_regexes.map{|k,v| v.to_s[7..-2].gsub_with_hash(/[aeiouv]/,py_tones)}.join('|')}([\s\-])?)/
|
14
14
|
end
|
15
15
|
|
16
16
|
def fullwidth
|
@@ -85,7 +85,6 @@ module ZhongwenTools
|
|
85
85
|
'o' => '[ōóǒòo]',
|
86
86
|
'u' => '[ūúǔùu]',
|
87
87
|
'v' => '[ǖǘǚǜü]'
|
88
|
-
#([ĀÁǍÀA][io]?|[io]?|[][āáǎàaēéěèeūúǔùu]?o?|[ĒÉĚÈE]i?|[]i?|[ŌÓǑÒO]u?|[]u?|u[āáǎàaēoēéěèe]?i?|[]e?)(n?g?r?)){1,}
|
89
88
|
}
|
90
89
|
end
|
91
90
|
end
|
@@ -1,58 +1,52 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
-
require
|
3
|
-
require
|
4
|
-
require
|
5
|
-
require
|
6
|
-
|
2
|
+
require 'zhongwen_tools/string'
|
3
|
+
require 'zhongwen_tools/romanization/conversion_table'
|
4
|
+
require 'zhongwen_tools/romanization/string'
|
5
|
+
require 'zhongwen_tools/romanization/pyn_to_py'
|
6
|
+
|
7
|
+
# TODO: follow tone conventions for different systems.
|
8
|
+
# IPA mä˥˥ mä˧˥ mä˨˩˦ mä˥˩ mä
|
9
|
+
# Pinyin mā má mǎ mà ma
|
10
|
+
# Tongyong Pinyin ma má mǎ mà må # this will be difficult.
|
11
|
+
# Wade–Giles ma¹ ma² ma³ ma⁴ ma⁰
|
12
|
+
# Zhuyin ㄇㄚ ㄇㄚˊ ㄇㄚˇ ㄇㄚˋ •ㄇㄚ
|
7
13
|
module ZhongwenTools
|
8
14
|
module Romanization
|
9
15
|
extend self
|
10
16
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
def to_bopomofo *args
|
18
|
-
str, from = _romanization_options(args)
|
19
|
-
|
20
|
-
_convert_romanization str, :zyfh, from
|
21
|
-
end
|
22
|
-
|
23
|
-
def to_yale(*args)
|
24
|
-
str, from = _romanization_options(args)
|
25
|
-
_convert_romanization str, :yale, from
|
26
|
-
end
|
27
|
-
|
28
|
-
def to_wade_giles(*args)
|
29
|
-
str, from = _romanization_options(args)
|
30
|
-
_convert_romanization str, :wg, from
|
31
|
-
end
|
32
|
-
|
33
|
-
def to_typy(*args)
|
34
|
-
str, from = _romanization_options(args)
|
35
|
-
_convert_romanization str, :typy, from
|
36
|
-
end
|
37
|
-
|
38
|
-
def to_pyn(*args)
|
39
|
-
# needs to guess what the romanization type is.
|
40
|
-
str, from = _romanization_options(args)
|
41
|
-
_convert_romanization str, :pyn, from
|
17
|
+
%w(pinyin py pyn bopomofo bpmf zhuyin zyfh zhyfh zhuyin_fuhao yale wade_giles wg typy tongyong mps2).each do |type|
|
18
|
+
define_method("to_#{type}") do |*args|
|
19
|
+
str, from = _romanization_options(args)
|
20
|
+
_convert_romanization str, _set_type(type.to_sym), _set_type(from)
|
21
|
+
end
|
42
22
|
end
|
43
23
|
|
44
24
|
private
|
45
25
|
|
26
|
+
# Private: Provides romanization options for romanization methods. If no :from argument is given, then
|
27
|
+
# the method will try to guess the romanization. This can sometimes provide sub-optimal
|
28
|
+
# romanization suggestions. See lib/zhongwen_tools/romanization/detect.rb#romanization? for details.
|
29
|
+
#
|
30
|
+
# args - an Array of arguments. If the Object is a String, then the first argument should be the :from option.
|
31
|
+
# Otherwise, the first argument is a String and the second argument is the :from option.
|
32
|
+
#
|
33
|
+
# Examples:
|
34
|
+
#
|
35
|
+
#
|
36
|
+
# _romanization_options('hao3', :pyn) #=> 'hao3' :pyn
|
37
|
+
# _romanization_options('hao3') #=> 'hao3', :pyn
|
38
|
+
#
|
39
|
+
# Returns an Array. The first item is a String to be converted. The second item is a Symbol for the :from option.
|
46
40
|
def _romanization_options(args)
|
47
41
|
if self.class.to_s != 'String'
|
48
42
|
str = args[0]
|
49
|
-
from =
|
43
|
+
from = args[1] || str.romanization? || :pyn
|
50
44
|
else
|
51
45
|
str = self
|
52
|
-
from =
|
46
|
+
from = args[0] || str.romanization? || :pyn
|
53
47
|
end
|
54
48
|
|
55
|
-
[str, from]
|
49
|
+
[str, from.to_sym]
|
56
50
|
end
|
57
51
|
|
58
52
|
# Private: Replaces numbered pinyin with actual pinyin. Pinyin separated with hyphens are combined as one word.
|
@@ -70,8 +64,12 @@ module ZhongwenTools
|
|
70
64
|
# TODO: move regex to ZhongwenTools::Regex
|
71
65
|
regex = /(([BPMFDTNLGKHZCSRJQXWYbpmfdtnlgkhzcsrjqxwy]?[h]?)(A[io]?|a[io]?|i[aeu]?o?|Ei?|ei?|Ou?|ou?|u[aoe]?i?|ve?)?(n?g?)(r?)([1-5])(\-+)?)/
|
72
66
|
|
73
|
-
#
|
74
|
-
# Explanation: if it's pinyin without vowels, e.g. m, ng, then convert,
|
67
|
+
# Using gsub is ~8x faster than using scan and each.
|
68
|
+
# Explanation: if it's pinyin without vowels, e.g. m, ng, then convert,
|
69
|
+
# otherwise, check if it needs an apostrophe (http://www.pinyin.info/romanization/hanyu/apostrophes.html).
|
70
|
+
# If it does, add it and then convert. Otherwise, just convert.
|
71
|
+
# Oh, and if double hyphens are used, replace them with one hyphen.
|
72
|
+
# And finally, correct those apostrophes at the very end.
|
75
73
|
str.gsub(regex) do
|
76
74
|
($3.nil? ? "#{PYN_PY[$1]}" : ($2 == '' && ['a','e','o'].include?($3[0,1]))? "'#{PYN_PY["#{$3}#{$6}"]}#{$4}#{$5}" : "#{$2}#{PYN_PY["#{$3}#{$6}"]}#{$4}#{$5}") + (($7.to_s.length > 1) ? '-' : '')
|
77
75
|
end.gsub("-'","-").sub(/^'/,'')
|
@@ -88,10 +86,10 @@ module ZhongwenTools
|
|
88
86
|
tokens.collect do |t|
|
89
87
|
search = t.gsub(/[1-5].*/,'')
|
90
88
|
|
91
|
-
if from.nil?
|
92
|
-
replace = (_replacement(t) || {}).fetch(
|
89
|
+
if from.nil?
|
90
|
+
replace = (_replacement(t) || {}).fetch(to){search}
|
93
91
|
else
|
94
|
-
replace = (_replacement(t,
|
92
|
+
replace = (_replacement(t, from) || {}).fetch(to){search}
|
95
93
|
end
|
96
94
|
|
97
95
|
replace = _fix_capitalization(str, t, replace)
|
@@ -121,32 +119,28 @@ module ZhongwenTools
|
|
121
119
|
def _convert_romanization str, to, from
|
122
120
|
return str if to == from
|
123
121
|
|
124
|
-
|
125
|
-
if
|
122
|
+
result =
|
123
|
+
if to == :py
|
124
|
+
raise NotImplementedError, 'method not implemented' if from != :pyn
|
125
|
+
# convert to pyn first.
|
126
|
+
# TODO: test :zyfh -> py
|
127
|
+
# str = _to_romanization str, to, :pyn if from != :pyn
|
126
128
|
_to_pinyin str
|
129
|
+
|
130
|
+
elsif to == :pyn
|
131
|
+
if from == :py
|
132
|
+
_convert_pinyin_to_pyn(str)
|
133
|
+
else
|
134
|
+
_to_romanization str, :pyn, from
|
135
|
+
end
|
127
136
|
else
|
128
|
-
|
129
|
-
#convert to pyn first.
|
130
|
-
end
|
131
|
-
elsif to == :zyfh
|
132
|
-
if from == :py
|
133
|
-
#need to convert pinyin to pyn
|
134
|
-
raise NotImplementedError, 'method not implemented'
|
135
|
-
end
|
136
|
-
_to_romanization(str, to, from).gsub('-','')
|
137
|
-
elsif to == :pyn
|
138
|
-
if from == :py
|
139
|
-
_convert_pinyin_to_pyn(str)
|
140
|
-
else
|
141
|
-
raise NotImplementedError, 'method not implemented'
|
142
|
-
end
|
143
|
-
else
|
144
|
-
if from == :pyn
|
137
|
+
str = _to_romanization str, to, :pyn if from != :pyn
|
145
138
|
_to_romanization str, to, from
|
146
|
-
else
|
147
|
-
raise NotImplementedError, 'method not implemented'
|
148
139
|
end
|
149
|
-
|
140
|
+
|
141
|
+
# TODO: check to see if wade giles, yale etc. can have hyphens.
|
142
|
+
result = result.gsub('-','') if to == :zyfh
|
143
|
+
result
|
150
144
|
end
|
151
145
|
|
152
146
|
def _convert_pinyin_to_pyn(pinyin)
|
@@ -155,75 +149,51 @@ module ZhongwenTools
|
|
155
149
|
words = pinyin.split(' ')
|
156
150
|
|
157
151
|
pyn = words.map do |word|
|
158
|
-
pys = word.split(/['\-]/).flatten.map{|x| x.scan(
|
159
|
-
|
152
|
+
pys = word.split(/['\-]/).flatten.map{|x| x.scan(Regex.py).map{|x| (x - [nil])[0]}}.flatten
|
153
|
+
_current_pyn(word, pys)
|
154
|
+
end
|
155
|
+
|
156
|
+
pyn.join(' ')
|
157
|
+
end
|
160
158
|
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
end.sort{|x,y| x.length <=> y.length}[-1]
|
159
|
+
def _current_pyn(pyn, pinyin_arr)
|
160
|
+
pinyin_arr.each do |pinyin|
|
161
|
+
pyn = pyn.sub(pinyin, pinyin_replacement(pinyin))
|
162
|
+
end
|
166
163
|
|
167
|
-
|
168
|
-
|
164
|
+
pyn.gsub("'",'')
|
165
|
+
end
|
169
166
|
|
170
|
-
|
171
|
-
|
167
|
+
def pinyin_replacement(py)
|
168
|
+
#take the longest pinyin match.
|
169
|
+
match = PYN_PY.values.select do |x|
|
170
|
+
py.include? x
|
171
|
+
end.sort{|x,y| x.length <=> y.length}[-1]
|
172
172
|
|
173
|
-
|
174
|
-
|
173
|
+
# Edge case.. en/eng pyn -> py conversion is one way only.
|
174
|
+
match = match[/(ē|é|ě|è)n?g?/].nil? ? match : match.chars[0]
|
175
175
|
|
176
|
-
|
177
|
-
end
|
176
|
+
replace = PYN_PY.find{|k,v| k if v == match}[0]
|
178
177
|
|
179
|
-
|
178
|
+
py.gsub(match, replace).gsub(/([^\d ]*)(\d)([^\d ]*)/){$1 + $3 + $2}
|
180
179
|
end
|
181
180
|
|
182
181
|
|
183
182
|
def _set_type(type)
|
184
183
|
type = type.to_s.downcase.to_sym
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
when :bopomofo
|
189
|
-
:zyfh
|
190
|
-
when :bpmf
|
191
|
-
type = :zyfh
|
192
|
-
when :zhyfh
|
193
|
-
type = :zyfh
|
194
|
-
when :zyfh
|
184
|
+
return type if [:zyfh, :wg, :typy, :py, :mps2, :yale, :pyn].include? type
|
185
|
+
|
186
|
+
if [:zhuyinfuhao, :zhuyin, :zhuyin_fuhao, :bopomofo, :bpmf, :zhyfh].include? type
|
195
187
|
:zyfh
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
:yale
|
200
|
-
when :tongyong
|
201
|
-
type = :typy
|
202
|
-
when :wg
|
203
|
-
type = :wg
|
204
|
-
when :typy
|
188
|
+
elsif [:wade_giles, 'wade-giles'.to_sym].include? type
|
189
|
+
:wg
|
190
|
+
elsif [:tongyong, :typy, :ty].include? type
|
205
191
|
:typy
|
206
|
-
|
207
|
-
type = :typy
|
208
|
-
when :pyn
|
192
|
+
elsif type == :pinyin
|
209
193
|
:py
|
210
|
-
when :pinyin
|
211
|
-
type = :py
|
212
|
-
when :py
|
213
|
-
type = :py
|
214
|
-
when :msp2
|
215
|
-
:msp2
|
216
|
-
else
|
217
|
-
nil
|
218
194
|
end
|
219
195
|
end
|
220
|
-
|
221
|
-
alias_method :to_py, :to_pinyin
|
222
|
-
alias_method :to_zhyfh, :to_bopomofo
|
223
|
-
alias_method :to_zhuyin, :to_bopomofo
|
224
|
-
alias_method :to_zhuyin_fuhao, :to_bopomofo
|
225
|
-
alias_method :to_bpmf, :to_bopomofo
|
226
|
-
alias_method :to_wg, :to_wade_giles
|
227
|
-
alias_method :to_tongyong, :to_typy
|
228
196
|
end
|
229
197
|
end
|
198
|
+
|
199
|
+
require 'zhongwen_tools/romanization/detect'
|