zhongwen_tools 0.12.1 → 0.12.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/zhongwen_tools/regex.rb +4 -0
- data/lib/zhongwen_tools/romanization.rb +28 -23
- data/lib/zhongwen_tools/romanization/detect.rb +6 -2
- data/lib/zhongwen_tools/version.rb +1 -1
- data/test/test_romanization.rb +13 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 17555fc2b7ad68dc9185b7f0ae0eea1226a799eb
|
4
|
+
data.tar.gz: 4b53940c086bf4b839fd5cf22e5a6bb380692b26
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7ac4b646e848da7548a3b9f8915b3202099186c913acd78a6130826bef2bda1bcd3ae18b8ab5d19409d30c59722fd6dd708b863d57b197e4336fb81ae4e20785
|
7
|
+
data.tar.gz: adc22130db84d0320b5763484435d94913d5359c1983a9631202cfe6f3181173e1185b0a1962a99521bc2d142a6df1930d504c1ef2159fecbd8509b59ce17856
|
data/lib/zhongwen_tools/regex.rb
CHANGED
@@ -13,6 +13,10 @@ module ZhongwenTools
|
|
13
13
|
/(#{pyn_regexes.map{|k,v| v.to_s[7..-2].gsub_with_hash(/[aeiouv]/,py_tones)}.join('|')}([\s\-])?)/
|
14
14
|
end
|
15
15
|
|
16
|
+
def pinyin_num
|
17
|
+
/(([BPMFDTNLGKHZCSRJQXWYbpmfdtnlgkhzcsrjqxwy]?[h]?)(A[io]?|a[io]?|i[aeu]?o?|Ei?|ei?|Ou?|ou?|u[aoe]?i?|ve?)?(n?g?)(r?)([1-5])(\-+)?)/
|
18
|
+
end
|
19
|
+
|
16
20
|
def fullwidth
|
17
21
|
/[0-9A-Za-z%.:#$&+-/\=;<>]/
|
18
22
|
end
|
@@ -61,9 +61,7 @@ module ZhongwenTools
|
|
61
61
|
#
|
62
62
|
# Returns a string with actual pinyin
|
63
63
|
def _to_pinyin str
|
64
|
-
|
65
|
-
regex = /(([BPMFDTNLGKHZCSRJQXWYbpmfdtnlgkhzcsrjqxwy]?[h]?)(A[io]?|a[io]?|i[aeu]?o?|Ei?|ei?|Ou?|ou?|u[aoe]?i?|ve?)?(n?g?)(r?)([1-5])(\-+)?)/
|
66
|
-
|
64
|
+
regex = Regex.pinyin_num
|
67
65
|
# Using gsub is ~8x faster than using scan and each.
|
68
66
|
# Explanation: if it's pinyin without vowels, e.g. m, ng, then convert,
|
69
67
|
# otherwise, check if it needs an apostrophe (http://www.pinyin.info/romanization/hanyu/apostrophes.html).
|
@@ -76,31 +74,31 @@ module ZhongwenTools
|
|
76
74
|
end
|
77
75
|
|
78
76
|
def _to_romanization str, to, from
|
79
|
-
|
80
|
-
convert_from = _set_type from
|
81
|
-
|
77
|
+
# NOTE: extract/refactor tokens cause tests to fail.
|
82
78
|
begin
|
83
|
-
tokens =
|
79
|
+
tokens = str.send("split_#{from}").uniq
|
84
80
|
rescue
|
85
81
|
tokens = str.split(/[ \-]/).uniq
|
86
82
|
end
|
87
83
|
|
88
84
|
tokens.collect do |t|
|
89
|
-
search = t
|
90
|
-
|
91
|
-
if from.nil?
|
92
|
-
replace = (_replacement(t) || {}).fetch(to){search}
|
93
|
-
else
|
94
|
-
replace = (_replacement(t, from) || {}).fetch(to){search}
|
95
|
-
end
|
96
|
-
|
97
|
-
replace = _fix_capitalization(str, t, replace)
|
85
|
+
search, replace = _token_search_replace(t, str, to, from)
|
98
86
|
str = str.gsub(search, replace)
|
99
87
|
end
|
100
88
|
|
101
89
|
str
|
102
90
|
end
|
103
91
|
|
92
|
+
def _token_search_replace(token, str, to, from)
|
93
|
+
search = token.gsub(/[1-5].*/,'')
|
94
|
+
|
95
|
+
replace = _replacement(token, from).fetch(to){ search }
|
96
|
+
replace = _fix_capitalization(str, token, replace)
|
97
|
+
|
98
|
+
|
99
|
+
[search, replace]
|
100
|
+
end
|
101
|
+
|
104
102
|
def _fix_capitalization(str, token, replace)
|
105
103
|
replace = replace.capitalize if(token.downcase != token)
|
106
104
|
|
@@ -109,13 +107,15 @@ module ZhongwenTools
|
|
109
107
|
|
110
108
|
def _replacement(token, from = nil)
|
111
109
|
token = token.downcase.gsub(/[1-5].*/,'')
|
112
|
-
ROMANIZATIONS_TABLE.find do |x|
|
110
|
+
result = ROMANIZATIONS_TABLE.find do |x|
|
113
111
|
if from.nil?
|
114
112
|
x.values.include?(token)
|
115
113
|
else
|
116
114
|
x[from] == token
|
117
115
|
end
|
118
116
|
end
|
117
|
+
|
118
|
+
result || {}
|
119
119
|
end
|
120
120
|
|
121
121
|
def _convert_romanization str, to, from
|
@@ -132,6 +132,10 @@ module ZhongwenTools
|
|
132
132
|
_to_romanization str, :pyn, from
|
133
133
|
end
|
134
134
|
else
|
135
|
+
if from == :py
|
136
|
+
str = _convert_pinyin_to_pyn(str)
|
137
|
+
from = :pyn
|
138
|
+
end
|
135
139
|
_to_romanization str, to, from
|
136
140
|
end
|
137
141
|
|
@@ -142,7 +146,6 @@ module ZhongwenTools
|
|
142
146
|
|
143
147
|
def _convert_pinyin_to_pyn(pinyin)
|
144
148
|
# TODO: should method check to make sure pinyin is accurate?
|
145
|
-
pyn = []
|
146
149
|
words = pinyin.split(' ')
|
147
150
|
|
148
151
|
pyn = words.map do |word|
|
@@ -169,16 +172,18 @@ module ZhongwenTools
|
|
169
172
|
matches = PYN_PY.values.select do |x|
|
170
173
|
py.include? x
|
171
174
|
end
|
175
|
+
match = select_pinyin_match(matches)
|
176
|
+
replace = PYN_PY.find{|k,v| k if v == match}[0]
|
172
177
|
|
178
|
+
py.gsub(match, replace).gsub(/([^\d ]*)(\d)([^\d ]*)/){$1 + $3 + $2}
|
179
|
+
end
|
180
|
+
|
181
|
+
def select_pinyin_match(matches)
|
173
182
|
# take the longest pinyin match. Use bytes because 'è' is prefered over 'n' or 'r' or 'm'
|
174
183
|
match = matches.sort{|x,y| x.bytes.to_a.length <=> y.bytes.to_a.length}[-1]
|
175
184
|
|
176
185
|
# Edge case.. en/eng pyn -> py conversion is one way only.
|
177
|
-
match
|
178
|
-
|
179
|
-
replace = PYN_PY.find{|k,v| k if v == match}[0]
|
180
|
-
|
181
|
-
py.gsub(match, replace).gsub(/([^\d ]*)(\d)([^\d ]*)/){$1 + $3 + $2}
|
186
|
+
match[/^(ē|é|ě|è|e)n?g?/].nil? ? match : match.chars[0]
|
182
187
|
end
|
183
188
|
|
184
189
|
|
@@ -103,7 +103,7 @@ module ZhongwenTools
|
|
103
103
|
def romanization?(str = nil)
|
104
104
|
str ||= self
|
105
105
|
|
106
|
-
[:pyn, :py, :zyfh, :wg, :typy, :yale, :
|
106
|
+
[:pyn, :py, :zyfh, :wg, :typy, :yale, :mps2].find do |type|
|
107
107
|
self.send("#{type}?", str)
|
108
108
|
end
|
109
109
|
end
|
@@ -126,7 +126,11 @@ module ZhongwenTools
|
|
126
126
|
#
|
127
127
|
# Returns a Regexp.
|
128
128
|
def detect_regex(type)
|
129
|
-
/#{
|
129
|
+
/#{regex_values(type).sort{|x,y| x.size <=> y.size}.reverse.join('|')}/
|
130
|
+
end
|
131
|
+
|
132
|
+
def regex_values(type)
|
133
|
+
ROMANIZATIONS_TABLE.map{ |r| "[#{r[type][0]}#{r[type][0].upcase}]#{r[type][1..-1]}" || r[:pyn] }.flatten
|
130
134
|
end
|
131
135
|
end
|
132
136
|
end
|
data/test/test_romanization.rb
CHANGED
@@ -71,6 +71,19 @@ class TestRomanization < Minitest::Test
|
|
71
71
|
assert 'ㄋㄧ3 ㄏㄠ3'.zyfh?
|
72
72
|
|
73
73
|
assert_equal 'ㄋㄧ3 ㄏㄠ3', 'ni3 hau3'.to_bpmf(:yale)
|
74
|
+
|
75
|
+
t = :bopomofo
|
76
|
+
@romanizations.each do |rom|
|
77
|
+
rom.each do |type, entry|
|
78
|
+
#if type == :bopomofo
|
79
|
+
assert_equal rom[t].downcase, entry.send("to_#{t}", type).downcase, "to_#{t}(#{type}) should convert to #{t}."
|
80
|
+
assert_equal rom[t].downcase, entry.send("to_#{t}").downcase, "to_#{t}(#{type}) should convert to #{t}, but it isn't detected properly"
|
81
|
+
#else
|
82
|
+
#assert_equal rom[:t], entry.to_pinyin(type), "to_pinyin(#{type}) should convert to pinyin."
|
83
|
+
#assert_equal rom[:t], entry.to_pinyin, "to_pinyin(#{type}) should convert to pinyin, but it isn't detected properly" unless type == :typy
|
84
|
+
#end
|
85
|
+
end
|
86
|
+
end
|
74
87
|
end
|
75
88
|
|
76
89
|
def test_wade_giles
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zhongwen_tools
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.12.1
|
4
|
+
version: 0.12.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Steven Daniels
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-06-
|
11
|
+
date: 2014-06-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|