zhongwen_tools 0.12.1 → 0.12.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/zhongwen_tools/regex.rb +4 -0
- data/lib/zhongwen_tools/romanization.rb +28 -23
- data/lib/zhongwen_tools/romanization/detect.rb +6 -2
- data/lib/zhongwen_tools/version.rb +1 -1
- data/test/test_romanization.rb +13 -0
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 17555fc2b7ad68dc9185b7f0ae0eea1226a799eb
|
|
4
|
+
data.tar.gz: 4b53940c086bf4b839fd5cf22e5a6bb380692b26
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7ac4b646e848da7548a3b9f8915b3202099186c913acd78a6130826bef2bda1bcd3ae18b8ab5d19409d30c59722fd6dd708b863d57b197e4336fb81ae4e20785
|
|
7
|
+
data.tar.gz: adc22130db84d0320b5763484435d94913d5359c1983a9631202cfe6f3181173e1185b0a1962a99521bc2d142a6df1930d504c1ef2159fecbd8509b59ce17856
|
data/lib/zhongwen_tools/regex.rb
CHANGED
|
@@ -13,6 +13,10 @@ module ZhongwenTools
|
|
|
13
13
|
/(#{pyn_regexes.map{|k,v| v.to_s[7..-2].gsub_with_hash(/[aeiouv]/,py_tones)}.join('|')}([\s\-])?)/
|
|
14
14
|
end
|
|
15
15
|
|
|
16
|
+
def pinyin_num
|
|
17
|
+
/(([BPMFDTNLGKHZCSRJQXWYbpmfdtnlgkhzcsrjqxwy]?[h]?)(A[io]?|a[io]?|i[aeu]?o?|Ei?|ei?|Ou?|ou?|u[aoe]?i?|ve?)?(n?g?)(r?)([1-5])(\-+)?)/
|
|
18
|
+
end
|
|
19
|
+
|
|
16
20
|
def fullwidth
|
|
17
21
|
/[0-9A-Za-z%.:#$&+-/\=;<>]/
|
|
18
22
|
end
|
|
@@ -61,9 +61,7 @@ module ZhongwenTools
|
|
|
61
61
|
#
|
|
62
62
|
# Returns a string with actual pinyin
|
|
63
63
|
def _to_pinyin str
|
|
64
|
-
|
|
65
|
-
regex = /(([BPMFDTNLGKHZCSRJQXWYbpmfdtnlgkhzcsrjqxwy]?[h]?)(A[io]?|a[io]?|i[aeu]?o?|Ei?|ei?|Ou?|ou?|u[aoe]?i?|ve?)?(n?g?)(r?)([1-5])(\-+)?)/
|
|
66
|
-
|
|
64
|
+
regex = Regex.pinyin_num
|
|
67
65
|
# Using gsub is ~8x faster than using scan and each.
|
|
68
66
|
# Explanation: if it's pinyin without vowels, e.g. m, ng, then convert,
|
|
69
67
|
# otherwise, check if it needs an apostrophe (http://www.pinyin.info/romanization/hanyu/apostrophes.html).
|
|
@@ -76,31 +74,31 @@ module ZhongwenTools
|
|
|
76
74
|
end
|
|
77
75
|
|
|
78
76
|
def _to_romanization str, to, from
|
|
79
|
-
|
|
80
|
-
convert_from = _set_type from
|
|
81
|
-
|
|
77
|
+
# NOTE: extract/refactor tokens cause tests to fail.
|
|
82
78
|
begin
|
|
83
|
-
tokens =
|
|
79
|
+
tokens = str.send("split_#{from}").uniq
|
|
84
80
|
rescue
|
|
85
81
|
tokens = str.split(/[ \-]/).uniq
|
|
86
82
|
end
|
|
87
83
|
|
|
88
84
|
tokens.collect do |t|
|
|
89
|
-
search = t
|
|
90
|
-
|
|
91
|
-
if from.nil?
|
|
92
|
-
replace = (_replacement(t) || {}).fetch(to){search}
|
|
93
|
-
else
|
|
94
|
-
replace = (_replacement(t, from) || {}).fetch(to){search}
|
|
95
|
-
end
|
|
96
|
-
|
|
97
|
-
replace = _fix_capitalization(str, t, replace)
|
|
85
|
+
search, replace = _token_search_replace(t, str, to, from)
|
|
98
86
|
str = str.gsub(search, replace)
|
|
99
87
|
end
|
|
100
88
|
|
|
101
89
|
str
|
|
102
90
|
end
|
|
103
91
|
|
|
92
|
+
def _token_search_replace(token, str, to, from)
|
|
93
|
+
search = token.gsub(/[1-5].*/,'')
|
|
94
|
+
|
|
95
|
+
replace = _replacement(token, from).fetch(to){ search }
|
|
96
|
+
replace = _fix_capitalization(str, token, replace)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
[search, replace]
|
|
100
|
+
end
|
|
101
|
+
|
|
104
102
|
def _fix_capitalization(str, token, replace)
|
|
105
103
|
replace = replace.capitalize if(token.downcase != token)
|
|
106
104
|
|
|
@@ -109,13 +107,15 @@ module ZhongwenTools
|
|
|
109
107
|
|
|
110
108
|
def _replacement(token, from = nil)
|
|
111
109
|
token = token.downcase.gsub(/[1-5].*/,'')
|
|
112
|
-
ROMANIZATIONS_TABLE.find do |x|
|
|
110
|
+
result = ROMANIZATIONS_TABLE.find do |x|
|
|
113
111
|
if from.nil?
|
|
114
112
|
x.values.include?(token)
|
|
115
113
|
else
|
|
116
114
|
x[from] == token
|
|
117
115
|
end
|
|
118
116
|
end
|
|
117
|
+
|
|
118
|
+
result || {}
|
|
119
119
|
end
|
|
120
120
|
|
|
121
121
|
def _convert_romanization str, to, from
|
|
@@ -132,6 +132,10 @@ module ZhongwenTools
|
|
|
132
132
|
_to_romanization str, :pyn, from
|
|
133
133
|
end
|
|
134
134
|
else
|
|
135
|
+
if from == :py
|
|
136
|
+
str = _convert_pinyin_to_pyn(str)
|
|
137
|
+
from = :pyn
|
|
138
|
+
end
|
|
135
139
|
_to_romanization str, to, from
|
|
136
140
|
end
|
|
137
141
|
|
|
@@ -142,7 +146,6 @@ module ZhongwenTools
|
|
|
142
146
|
|
|
143
147
|
def _convert_pinyin_to_pyn(pinyin)
|
|
144
148
|
# TODO: should method check to make sure pinyin is accurate?
|
|
145
|
-
pyn = []
|
|
146
149
|
words = pinyin.split(' ')
|
|
147
150
|
|
|
148
151
|
pyn = words.map do |word|
|
|
@@ -169,16 +172,18 @@ module ZhongwenTools
|
|
|
169
172
|
matches = PYN_PY.values.select do |x|
|
|
170
173
|
py.include? x
|
|
171
174
|
end
|
|
175
|
+
match = select_pinyin_match(matches)
|
|
176
|
+
replace = PYN_PY.find{|k,v| k if v == match}[0]
|
|
172
177
|
|
|
178
|
+
py.gsub(match, replace).gsub(/([^\d ]*)(\d)([^\d ]*)/){$1 + $3 + $2}
|
|
179
|
+
end
|
|
180
|
+
|
|
181
|
+
def select_pinyin_match(matches)
|
|
173
182
|
# take the longest pinyin match. Use bytes because 'è' is prefered over 'n' or 'r' or 'm'
|
|
174
183
|
match = matches.sort{|x,y| x.bytes.to_a.length <=> y.bytes.to_a.length}[-1]
|
|
175
184
|
|
|
176
185
|
# Edge case.. en/eng pyn -> py conversion is one way only.
|
|
177
|
-
match
|
|
178
|
-
|
|
179
|
-
replace = PYN_PY.find{|k,v| k if v == match}[0]
|
|
180
|
-
|
|
181
|
-
py.gsub(match, replace).gsub(/([^\d ]*)(\d)([^\d ]*)/){$1 + $3 + $2}
|
|
186
|
+
match[/^(ē|é|ě|è|e)n?g?/].nil? ? match : match.chars[0]
|
|
182
187
|
end
|
|
183
188
|
|
|
184
189
|
|
|
@@ -103,7 +103,7 @@ module ZhongwenTools
|
|
|
103
103
|
def romanization?(str = nil)
|
|
104
104
|
str ||= self
|
|
105
105
|
|
|
106
|
-
[:pyn, :py, :zyfh, :wg, :typy, :yale, :
|
|
106
|
+
[:pyn, :py, :zyfh, :wg, :typy, :yale, :mps2].find do |type|
|
|
107
107
|
self.send("#{type}?", str)
|
|
108
108
|
end
|
|
109
109
|
end
|
|
@@ -126,7 +126,11 @@ module ZhongwenTools
|
|
|
126
126
|
#
|
|
127
127
|
# Returns a Regexp.
|
|
128
128
|
def detect_regex(type)
|
|
129
|
-
/#{
|
|
129
|
+
/#{regex_values(type).sort{|x,y| x.size <=> y.size}.reverse.join('|')}/
|
|
130
|
+
end
|
|
131
|
+
|
|
132
|
+
def regex_values(type)
|
|
133
|
+
ROMANIZATIONS_TABLE.map{ |r| "[#{r[type][0]}#{r[type][0].upcase}]#{r[type][1..-1]}" || r[:pyn] }.flatten
|
|
130
134
|
end
|
|
131
135
|
end
|
|
132
136
|
end
|
data/test/test_romanization.rb
CHANGED
|
@@ -71,6 +71,19 @@ class TestRomanization < Minitest::Test
|
|
|
71
71
|
assert 'ㄋㄧ3 ㄏㄠ3'.zyfh?
|
|
72
72
|
|
|
73
73
|
assert_equal 'ㄋㄧ3 ㄏㄠ3', 'ni3 hau3'.to_bpmf(:yale)
|
|
74
|
+
|
|
75
|
+
t = :bopomofo
|
|
76
|
+
@romanizations.each do |rom|
|
|
77
|
+
rom.each do |type, entry|
|
|
78
|
+
#if type == :bopomofo
|
|
79
|
+
assert_equal rom[t].downcase, entry.send("to_#{t}", type).downcase, "to_#{t}(#{type}) should convert to #{t}."
|
|
80
|
+
assert_equal rom[t].downcase, entry.send("to_#{t}").downcase, "to_#{t}(#{type}) should convert to #{t}, but it isn't detected properly"
|
|
81
|
+
#else
|
|
82
|
+
#assert_equal rom[:t], entry.to_pinyin(type), "to_pinyin(#{type}) should convert to pinyin."
|
|
83
|
+
#assert_equal rom[:t], entry.to_pinyin, "to_pinyin(#{type}) should convert to pinyin, but it isn't detected properly" unless type == :typy
|
|
84
|
+
#end
|
|
85
|
+
end
|
|
86
|
+
end
|
|
74
87
|
end
|
|
75
88
|
|
|
76
89
|
def test_wade_giles
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: zhongwen_tools
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.12.1
|
|
4
|
+
version: 0.12.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Steven Daniels
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2014-06-
|
|
11
|
+
date: 2014-06-18 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rake
|