zhongwen_tools 0.11.1 → 0.12.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/zhongwen_tools/romanization.rb +19 -18
- data/lib/zhongwen_tools/romanization/conversion_table.rb +2 -0
- data/lib/zhongwen_tools/romanization/detect.rb +1 -1
- data/lib/zhongwen_tools/romanization/string.rb +15 -1
- data/lib/zhongwen_tools/string.rb +2 -1
- data/lib/zhongwen_tools/string/fullwidth.rb +5 -1
- data/lib/zhongwen_tools/version.rb +1 -1
- data/test/test_romanization.rb +20 -3
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 38e857f5b289cca5e024238a437b3e69ca74a443
|
|
4
|
+
data.tar.gz: cdd2214ad7fb466252e5416f485a261b447dcaf6
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 1185558e187c41e55870236bae4261c3be2f4227acd9fe3b7c3d53cfeeec17ed7a7dae6d516ef44055872cb1fbc2f71c31973caa02d4ad71984790a8070ac2f3
|
|
7
|
+
data.tar.gz: f7cb7d3b2c486e9faebb56d751b4077bbce7f63d5629dcc00c94470432d61e2460a2f2b41535cf767abd64cb3a5a79dcedb491da36fb377b9f630bad4db4b427
|
|
@@ -75,14 +75,16 @@ module ZhongwenTools
|
|
|
75
75
|
end.gsub("-'","-").sub(/^'/,'')
|
|
76
76
|
end
|
|
77
77
|
|
|
78
|
-
# http://en.wikipedia.org/wiki/Pinyin
|
|
79
|
-
# http://talkbank.org/pinyin/Trad_chart_IPA.php
|
|
80
|
-
# for ipa
|
|
81
78
|
def _to_romanization str, to, from
|
|
82
79
|
convert_to = _set_type to
|
|
83
80
|
convert_from = _set_type from
|
|
84
81
|
|
|
85
|
-
|
|
82
|
+
begin
|
|
83
|
+
tokens = self.send("split_#{from}").uniq
|
|
84
|
+
rescue
|
|
85
|
+
tokens = str.split(/[ \-]/).uniq
|
|
86
|
+
end
|
|
87
|
+
|
|
86
88
|
tokens.collect do |t|
|
|
87
89
|
search = t.gsub(/[1-5].*/,'')
|
|
88
90
|
|
|
@@ -121,20 +123,15 @@ module ZhongwenTools
|
|
|
121
123
|
|
|
122
124
|
result =
|
|
123
125
|
if to == :py
|
|
124
|
-
|
|
125
|
-
# convert to pyn first.
|
|
126
|
-
# TODO: test :zyfh -> py
|
|
127
|
-
# str = _to_romanization str, to, :pyn if from != :pyn
|
|
126
|
+
str = _to_romanization str, :pyn, from if from != :pyn
|
|
128
127
|
_to_pinyin str
|
|
129
|
-
|
|
130
128
|
elsif to == :pyn
|
|
131
129
|
if from == :py
|
|
132
130
|
_convert_pinyin_to_pyn(str)
|
|
133
131
|
else
|
|
134
|
-
|
|
132
|
+
_to_romanization str, :pyn, from
|
|
135
133
|
end
|
|
136
134
|
else
|
|
137
|
-
str = _to_romanization str, to, :pyn if from != :pyn
|
|
138
135
|
_to_romanization str, to, from
|
|
139
136
|
end
|
|
140
137
|
|
|
@@ -149,7 +146,6 @@ module ZhongwenTools
|
|
|
149
146
|
words = pinyin.split(' ')
|
|
150
147
|
|
|
151
148
|
pyn = words.map do |word|
|
|
152
|
-
#binding.pry if word == "Wǒmen"
|
|
153
149
|
pys = word.split(/['\-]/).flatten.map{|x| x.scan(Regex.py).map{|x| (x - [nil])[0]}}.flatten
|
|
154
150
|
_current_pyn(word, pys)
|
|
155
151
|
end
|
|
@@ -158,22 +154,27 @@ module ZhongwenTools
|
|
|
158
154
|
end
|
|
159
155
|
|
|
160
156
|
def _current_pyn(pyn, pinyin_arr)
|
|
157
|
+
replacements = []
|
|
161
158
|
pinyin_arr.each do |pinyin|
|
|
162
|
-
|
|
159
|
+
replace = pinyin_replacement(pinyin)
|
|
160
|
+
match = pinyin
|
|
161
|
+
pyn = pyn.sub(/(#{replacements.join('.*')}.*)#{match}/){ $1 + replace}
|
|
162
|
+
replacements << replace
|
|
163
163
|
end
|
|
164
164
|
|
|
165
165
|
pyn.gsub("'",'')
|
|
166
166
|
end
|
|
167
167
|
|
|
168
168
|
def pinyin_replacement(py)
|
|
169
|
-
|
|
170
|
-
match = PYN_PY.values.select do |x|
|
|
169
|
+
matches = PYN_PY.values.select do |x|
|
|
171
170
|
py.include? x
|
|
172
|
-
end
|
|
171
|
+
end
|
|
172
|
+
|
|
173
|
+
# Take the longest pinyin match. Use bytes because 'è' is preferred over 'n' or 'r' or 'm'.
|
|
174
|
+
match = matches.sort{|x,y| x.bytes.to_a.length <=> y.bytes.to_a.length}[-1]
|
|
173
175
|
|
|
174
|
-
#binding.pry
|
|
175
176
|
# Edge case: en/eng pyn -> py conversion is one-way only.
|
|
176
|
-
match = match[
|
|
177
|
+
match = match[/^(ē|é|ě|è|e)n?g?/].nil? ? match : match.chars[0]
|
|
177
178
|
|
|
178
179
|
replace = PYN_PY.find{|k,v| k if v == match}[0]
|
|
179
180
|
|
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
module ZhongwenTools
|
|
3
3
|
module Romanization
|
|
4
4
|
# TODO: remove excess values, i.e. keys whose value == :pyn
|
|
5
|
+
# TODO: http://en.wikipedia.org/wiki/Jyutping
|
|
6
|
+
# TODO: http://en.wikipedia.org/wiki/Simplified_Wade
|
|
5
7
|
ROMANIZATIONS_TABLE = [{:zyfh => " ㄚ", :wg => "a", :mps2 => "a", :yale => "a", :typy => "a", :pyn => "a"},
|
|
6
8
|
{ :zyfh => "ㄞ", :wg => "ai", :mps2 => "ai", :yale => "ai", :typy => "ai", :pyn => "ai"},
|
|
7
9
|
{ :zyfh => "ㄢ", :wg => "an", :mps2 => "an", :yale => "an", :typy => "an", :pyn => "an"},
|
|
@@ -126,7 +126,7 @@ module ZhongwenTools
|
|
|
126
126
|
#
|
|
127
127
|
# Returns a Regexp.
|
|
128
128
|
def detect_regex(type)
|
|
129
|
-
/#{ROMANIZATIONS_TABLE.map{ |r| r[type] || r[:pyn] }.sort{|x,y| x.size <=> y.size}.reverse.join('|')}/
|
|
129
|
+
/#{ROMANIZATIONS_TABLE.map{ |r| "[#{r[type][0]}#{r[type][0].upcase}]#{r[type][1..-1]}" || r[:pyn] }.flatten.sort{|x,y| x.size <=> y.size}.reverse.join('|')}/
|
|
130
130
|
end
|
|
131
131
|
end
|
|
132
132
|
end
|
|
@@ -16,8 +16,22 @@ module ZhongwenTools
|
|
|
16
16
|
def split_pyn(str = nil)
|
|
17
17
|
str ||= self
|
|
18
18
|
puts "WARNING: string is not valid pinyin-num format. #{str}" unless str.pyn?
|
|
19
|
+
# FIXME: ignore punctuation
|
|
20
|
+
str.scan(/(#{Regex.pyn})/).map{ |arr| arr[0].strip.gsub('-','') }.flatten
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def split_zyfh(str = nil)
|
|
24
|
+
str ||= self
|
|
25
|
+
|
|
26
|
+
str.scan(/([#{Regex.bopomofo}]*)/).map{ |arr| arr[0].strip.gsub('-','') }.flatten - ['']
|
|
27
|
+
end
|
|
19
28
|
|
|
20
|
-
|
|
29
|
+
%w(typy wg yale mps2).each do |type|
|
|
30
|
+
define_method("split_#{type}") do |str = nil|
|
|
31
|
+
str ||= self
|
|
32
|
+
# TODO: ignore tonal marks from other systems (Wade-Giles, Tongyong Pinyin, etc.).
|
|
33
|
+
str.scan(/(#{detect_regex(type.to_sym)}*)/).map{ |arr| arr[0].strip.gsub('-','') }.flatten - ['']
|
|
34
|
+
end
|
|
21
35
|
end
|
|
22
36
|
end
|
|
23
37
|
end
|
data/test/test_romanization.rb
CHANGED
|
@@ -22,6 +22,18 @@ class TestRomanization < Minitest::Test
|
|
|
22
22
|
assert @alabo[:py].py?
|
|
23
23
|
assert 'Ā-lā-bó'.py?
|
|
24
24
|
assert 'Zhong1 wen2'.to_pinyin.py?
|
|
25
|
+
|
|
26
|
+
@romanizations.each do |rom|
|
|
27
|
+
rom.each do |type, entry|
|
|
28
|
+
if type == :bopomofo
|
|
29
|
+
assert_equal rom[:py].downcase, entry.to_pinyin(type).downcase, "to_pinyin(#{type}) should convert to pinyin."
|
|
30
|
+
assert_equal rom[:py].downcase, entry.to_pinyin.downcase, "to_pinyin(#{type}) should convert to pinyin, but it isn't detected properly"
|
|
31
|
+
else
|
|
32
|
+
assert_equal rom[:py], entry.to_pinyin(type), "to_pinyin(#{type}) should convert to pinyin."
|
|
33
|
+
assert_equal rom[:py], entry.to_pinyin, "to_pinyin(#{type}) should convert to pinyin, but it isn't detected properly" unless type == :typy
|
|
34
|
+
end
|
|
35
|
+
end
|
|
36
|
+
end
|
|
25
37
|
end
|
|
26
38
|
|
|
27
39
|
def test_pyn
|
|
@@ -42,6 +54,9 @@ class TestRomanization < Minitest::Test
|
|
|
42
54
|
|
|
43
55
|
|
|
44
56
|
assert_equal 'Wo3men5', "Wǒmen".to_pyn(:py)
|
|
57
|
+
assert_equal 'hao3xue2', 'hǎoxué'.to_pyn(:py)
|
|
58
|
+
assert_equal 'tai4re4', 'tàirè'.to_pyn(:py)
|
|
59
|
+
assert_equal 'tai4tai5', "tàitai".to_pyn(:py)
|
|
45
60
|
#assert_equal 'Wu1-lu2-ha1-nuo4-fu1', 'Wūlúhānuòfū'.to_pyn(:py)
|
|
46
61
|
#"007:Dàpò Liàngzǐ Wēijī", "007: Da4po4 Liang4zi3 Wei1ji1"
|
|
47
62
|
end
|
|
@@ -122,10 +137,12 @@ class TestRomanization < Minitest::Test
|
|
|
122
137
|
@romanizations = [
|
|
123
138
|
# FIXME: bopomofo, tongyong pinyin, wade-giles tones are all wrong.
|
|
124
139
|
# TODO: test IPA
|
|
125
|
-
{ :pyn => 'ni3
|
|
126
|
-
{ :pyn => '
|
|
127
|
-
{ :pyn => 'chui1 niu3', :py =>
|
|
140
|
+
{ :pyn => 'ni3 hao3', :py => 'nǐ hǎo', :bopomofo => 'ㄋㄧ3 ㄏㄠ3', :yale => 'ni3 hau3', :typy => 'ni3 hao3', :wg => 'ni3 hao3'},#, :ipa => ''}
|
|
141
|
+
{ :pyn => 'Zhong1guo2', :py => 'Zhōngguó', :bopomofo => 'ㄓㄨㄥ1ㄍㄨㄛ2', :yale => 'Jung1gwo2', :typy => 'Jhong1guo2', :wg => 'Chung1kuo2'},#, :ipa => ''}
|
|
142
|
+
{ :pyn => 'chui1 niu3', :py => "chuī niǔ", :bopomofo => "ㄔㄨㄟ1 ㄋㄧㄡ3", :yale => "chwei1 nyou3", :typy => "chuei1 niou3", :wg => "ch`ui1 niu3"},#, :ipa => ''}
|
|
143
|
+
{ :pyn => 'Mao2 Ze2-dong1', :py => 'Máo Zédōng', :bopomofo => 'ㄇㄠ2 ㄗㄜ2ㄉㄨㄥ1', :yale => 'Mau2 Dze2-dung1', :typy => 'Mao2 Ze2-dong1', :wg => 'Mao2 Tse2-tung1'},#, :ipa => ''}
|
|
128
144
|
]
|
|
145
|
+
|
|
129
146
|
@str = 'ni3 hao3'
|
|
130
147
|
@mzd = 'Mao2 Ze2 dong1'
|
|
131
148
|
@mzd2 = 'Mao2 Ze2-dong1'
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: zhongwen_tools
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.12.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Steven Daniels
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2014-06-
|
|
11
|
+
date: 2014-06-11 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rake
|