zhongwen_tools 0.11.1 → 0.12.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/zhongwen_tools/romanization.rb +19 -18
- data/lib/zhongwen_tools/romanization/conversion_table.rb +2 -0
- data/lib/zhongwen_tools/romanization/detect.rb +1 -1
- data/lib/zhongwen_tools/romanization/string.rb +15 -1
- data/lib/zhongwen_tools/string.rb +2 -1
- data/lib/zhongwen_tools/string/fullwidth.rb +5 -1
- data/lib/zhongwen_tools/version.rb +1 -1
- data/test/test_romanization.rb +20 -3
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 38e857f5b289cca5e024238a437b3e69ca74a443
|
4
|
+
data.tar.gz: cdd2214ad7fb466252e5416f485a261b447dcaf6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1185558e187c41e55870236bae4261c3be2f4227acd9fe3b7c3d53cfeeec17ed7a7dae6d516ef44055872cb1fbc2f71c31973caa02d4ad71984790a8070ac2f3
|
7
|
+
data.tar.gz: f7cb7d3b2c486e9faebb56d751b4077bbce7f63d5629dcc00c94470432d61e2460a2f2b41535cf767abd64cb3a5a79dcedb491da36fb377b9f630bad4db4b427
|
@@ -75,14 +75,16 @@ module ZhongwenTools
|
|
75
75
|
end.gsub("-'","-").sub(/^'/,'')
|
76
76
|
end
|
77
77
|
|
78
|
-
# http://en.wikipedia.org/wiki/Pinyin
|
79
|
-
# http://talkbank.org/pinyin/Trad_chart_IPA.php
|
80
|
-
# for ipa
|
81
78
|
def _to_romanization str, to, from
|
82
79
|
convert_to = _set_type to
|
83
80
|
convert_from = _set_type from
|
84
81
|
|
85
|
-
|
82
|
+
begin
|
83
|
+
tokens = self.send("split_#{from}").uniq
|
84
|
+
rescue
|
85
|
+
tokens = str.split(/[ \-]/).uniq
|
86
|
+
end
|
87
|
+
|
86
88
|
tokens.collect do |t|
|
87
89
|
search = t.gsub(/[1-5].*/,'')
|
88
90
|
|
@@ -121,20 +123,15 @@ module ZhongwenTools
|
|
121
123
|
|
122
124
|
result =
|
123
125
|
if to == :py
|
124
|
-
|
125
|
-
# convert to pyn first.
|
126
|
-
# TODO: test :zyfh -> py
|
127
|
-
# str = _to_romanization str, to, :pyn if from != :pyn
|
126
|
+
str = _to_romanization str, :pyn, from if from != :pyn
|
128
127
|
_to_pinyin str
|
129
|
-
|
130
128
|
elsif to == :pyn
|
131
129
|
if from == :py
|
132
130
|
_convert_pinyin_to_pyn(str)
|
133
131
|
else
|
134
|
-
|
132
|
+
_to_romanization str, :pyn, from
|
135
133
|
end
|
136
134
|
else
|
137
|
-
str = _to_romanization str, to, :pyn if from != :pyn
|
138
135
|
_to_romanization str, to, from
|
139
136
|
end
|
140
137
|
|
@@ -149,7 +146,6 @@ module ZhongwenTools
|
|
149
146
|
words = pinyin.split(' ')
|
150
147
|
|
151
148
|
pyn = words.map do |word|
|
152
|
-
#binding.pry if word == "Wǒmen"
|
153
149
|
pys = word.split(/['\-]/).flatten.map{|x| x.scan(Regex.py).map{|x| (x - [nil])[0]}}.flatten
|
154
150
|
_current_pyn(word, pys)
|
155
151
|
end
|
@@ -158,22 +154,27 @@ module ZhongwenTools
|
|
158
154
|
end
|
159
155
|
|
160
156
|
def _current_pyn(pyn, pinyin_arr)
|
157
|
+
replacements = []
|
161
158
|
pinyin_arr.each do |pinyin|
|
162
|
-
|
159
|
+
replace = pinyin_replacement(pinyin)
|
160
|
+
match = pinyin
|
161
|
+
pyn = pyn.sub(/(#{replacements.join('.*')}.*)#{match}/){ $1 + replace}
|
162
|
+
replacements << replace
|
163
163
|
end
|
164
164
|
|
165
165
|
pyn.gsub("'",'')
|
166
166
|
end
|
167
167
|
|
168
168
|
def pinyin_replacement(py)
|
169
|
-
|
170
|
-
match = PYN_PY.values.select do |x|
|
169
|
+
matches = PYN_PY.values.select do |x|
|
171
170
|
py.include? x
|
172
|
-
end
|
171
|
+
end
|
172
|
+
|
173
|
+
# Take the longest pinyin match. Use byte length because 'è' is preferred over 'n', 'r', or 'm'.
|
174
|
+
match = matches.sort{|x,y| x.bytes.to_a.length <=> y.bytes.to_a.length}[-1]
|
173
175
|
|
174
|
-
#binding.pry
|
175
176
|
# Edge case: en/eng pyn -> py conversion is one-way only.
|
176
|
-
match = match[
|
177
|
+
match = match[/^(ē|é|ě|è|e)n?g?/].nil? ? match : match.chars[0]
|
177
178
|
|
178
179
|
replace = PYN_PY.find{|k,v| k if v == match}[0]
|
179
180
|
|
@@ -2,6 +2,8 @@
|
|
2
2
|
module ZhongwenTools
|
3
3
|
module Romanization
|
4
4
|
# TODO: remove excess values, i.e. keys whose value == :pyn
|
5
|
+
# TODO: http://en.wikipedia.org/wiki/Jyutping
|
6
|
+
# TODO: http://en.wikipedia.org/wiki/Simplified_Wade
|
5
7
|
ROMANIZATIONS_TABLE = [{:zyfh => " ㄚ", :wg => "a", :mps2 => "a", :yale => "a", :typy => "a", :pyn => "a"},
|
6
8
|
{ :zyfh => "ㄞ", :wg => "ai", :mps2 => "ai", :yale => "ai", :typy => "ai", :pyn => "ai"},
|
7
9
|
{ :zyfh => "ㄢ", :wg => "an", :mps2 => "an", :yale => "an", :typy => "an", :pyn => "an"},
|
@@ -126,7 +126,7 @@ module ZhongwenTools
|
|
126
126
|
#
|
127
127
|
# Returns a Regexp.
|
128
128
|
def detect_regex(type)
|
129
|
-
/#{ROMANIZATIONS_TABLE.map{ |r| r[type] || r[:pyn] }.sort{|x,y| x.size <=> y.size}.reverse.join('|')}/
|
129
|
+
/#{ROMANIZATIONS_TABLE.map{ |r| "[#{r[type][0]}#{r[type][0].upcase}]#{r[type][1..-1]}" || r[:pyn] }.flatten.sort{|x,y| x.size <=> y.size}.reverse.join('|')}/
|
130
130
|
end
|
131
131
|
end
|
132
132
|
end
|
@@ -16,8 +16,22 @@ module ZhongwenTools
|
|
16
16
|
def split_pyn(str = nil)
|
17
17
|
str ||= self
|
18
18
|
puts "WARNING: string is not valid pinyin-num format. #{str}" unless str.pyn?
|
19
|
+
# FIXME: ignore punctuation
|
20
|
+
str.scan(/(#{Regex.pyn})/).map{ |arr| arr[0].strip.gsub('-','') }.flatten
|
21
|
+
end
|
22
|
+
|
23
|
+
def split_zyfh(str = nil)
|
24
|
+
str ||= self
|
25
|
+
|
26
|
+
str.scan(/([#{Regex.bopomofo}]*)/).map{ |arr| arr[0].strip.gsub('-','') }.flatten - ['']
|
27
|
+
end
|
19
28
|
|
20
|
-
|
29
|
+
%w(typy wg yale mps2).each do |type|
|
30
|
+
define_method("split_#{type}") do |str = nil|
|
31
|
+
str ||= self
|
32
|
+
# TODO: ignore tonal marks from other systems (Wade-Giles, Tongyong Pinyin, etc.).
|
33
|
+
str.scan(/(#{detect_regex(type.to_sym)}*)/).map{ |arr| arr[0].strip.gsub('-','') }.flatten - ['']
|
34
|
+
end
|
21
35
|
end
|
22
36
|
end
|
23
37
|
end
|
data/test/test_romanization.rb
CHANGED
@@ -22,6 +22,18 @@ class TestRomanization < Minitest::Test
|
|
22
22
|
assert @alabo[:py].py?
|
23
23
|
assert 'Ā-lā-bó'.py?
|
24
24
|
assert 'Zhong1 wen2'.to_pinyin.py?
|
25
|
+
|
26
|
+
@romanizations.each do |rom|
|
27
|
+
rom.each do |type, entry|
|
28
|
+
if type == :bopomofo
|
29
|
+
assert_equal rom[:py].downcase, entry.to_pinyin(type).downcase, "to_pinyin(#{type}) should convert to pinyin."
|
30
|
+
assert_equal rom[:py].downcase, entry.to_pinyin.downcase, "to_pinyin(#{type}) should convert to pinyin, but it isn't detected properly"
|
31
|
+
else
|
32
|
+
assert_equal rom[:py], entry.to_pinyin(type), "to_pinyin(#{type}) should convert to pinyin."
|
33
|
+
assert_equal rom[:py], entry.to_pinyin, "to_pinyin(#{type}) should convert to pinyin, but it isn't detected properly" unless type == :typy
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
25
37
|
end
|
26
38
|
|
27
39
|
def test_pyn
|
@@ -42,6 +54,9 @@ class TestRomanization < Minitest::Test
|
|
42
54
|
|
43
55
|
|
44
56
|
assert_equal 'Wo3men5', "Wǒmen".to_pyn(:py)
|
57
|
+
assert_equal 'hao3xue2', 'hǎoxué'.to_pyn(:py)
|
58
|
+
assert_equal 'tai4re4', 'tàirè'.to_pyn(:py)
|
59
|
+
assert_equal 'tai4tai5', "tàitai".to_pyn(:py)
|
45
60
|
#assert_equal 'Wu1-lu2-ha1-nuo4-fu1', 'Wūlúhānuòfū'.to_pyn(:py)
|
46
61
|
#"007:Dàpò Liàngzǐ Wēijī", "007: Da4po4 Liang4zi3 Wei1ji1"
|
47
62
|
end
|
@@ -122,10 +137,12 @@ class TestRomanization < Minitest::Test
|
|
122
137
|
@romanizations = [
|
123
138
|
# FIXME: bopomofo, tongyong pinyin, wade-giles tones are all wrong.
|
124
139
|
# TODO: test IPA
|
125
|
-
{ :pyn => 'ni3
|
126
|
-
{ :pyn => '
|
127
|
-
{ :pyn => 'chui1 niu3', :py =>
|
140
|
+
{ :pyn => 'ni3 hao3', :py => 'nǐ hǎo', :bopomofo => 'ㄋㄧ3 ㄏㄠ3', :yale => 'ni3 hau3', :typy => 'ni3 hao3', :wg => 'ni3 hao3'},#, :ipa => ''}
|
141
|
+
{ :pyn => 'Zhong1guo2', :py => 'Zhōngguó', :bopomofo => 'ㄓㄨㄥ1ㄍㄨㄛ2', :yale => 'Jung1gwo2', :typy => 'Jhong1guo2', :wg => 'Chung1kuo2'},#, :ipa => ''}
|
142
|
+
{ :pyn => 'chui1 niu3', :py => "chuī niǔ", :bopomofo => "ㄔㄨㄟ1 ㄋㄧㄡ3", :yale => "chwei1 nyou3", :typy => "chuei1 niou3", :wg => "ch`ui1 niu3"},#, :ipa => ''}
|
143
|
+
{ :pyn => 'Mao2 Ze2-dong1', :py => 'Máo Zédōng', :bopomofo => 'ㄇㄠ2 ㄗㄜ2ㄉㄨㄥ1', :yale => 'Mau2 Dze2-dung1', :typy => 'Mao2 Ze2-dong1', :wg => 'Mao2 Tse2-tung1'},#, :ipa => ''}
|
128
144
|
]
|
145
|
+
|
129
146
|
@str = 'ni3 hao3'
|
130
147
|
@mzd = 'Mao2 Ze2 dong1'
|
131
148
|
@mzd2 = 'Mao2 Ze2-dong1'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zhongwen_tools
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.12.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Steven Daniels
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-06-
|
11
|
+
date: 2014-06-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|