arnebrasseur-pinyin 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +12 -0
- data/Manifest.txt +31 -0
- data/README.txt +50 -0
- data/TODO +23 -0
- data/examples/cgiform/cgiform.rb +24 -0
- data/examples/cgiform/template.rhtml +69 -0
- data/examples/hello.rb +12 -0
- data/lib/pinyin.rb +90 -0
- data/lib/pinyin/conversion.rb +51 -0
- data/lib/pinyin/conversions.rb +75 -0
- data/lib/pinyin/conversions/hanyu.rb +77 -0
- data/lib/pinyin/data/comparison.csv +410 -0
- data/lib/pinyin/data/final.csv +10 -0
- data/lib/pinyin/data/initial.csv +7 -0
- data/lib/pinyin/data/paladiy.txt +421 -0
- data/lib/pinyin/data/rules.yaml +24 -0
- data/lib/pinyin/data/valid_pinyin.yaml +454 -0
- data/lib/pinyin/exception.rb +14 -0
- data/lib/pinyin/groundwork.rb +183 -0
- data/lib/pinyin/string.rb +16 -0
- data/lib/pinyin/support.rb +12 -0
- data/lib/pinyin/tones.rb +47 -0
- data/lib/pinyin/tones/accents.rb +62 -0
- data/lib/pinyin/tones/marks.rb +30 -0
- data/lib/pinyin/tones/no_tones.rb +6 -0
- data/lib/pinyin/tones/numbers.rb +25 -0
- data/rakefile +22 -0
- data/script/update +4 -0
- data/test/test_comparison.rb +35 -0
- data/test/test_hanyu_coverage.rb +35 -0
- metadata +102 -0
@@ -0,0 +1,14 @@
|
|
1
|
+
module Pinyin
  # All exceptions arising from this module inherit from Pinyin::Error
  Error = Class.new StandardError

  #
  # Error that carries the +input+ and the +position+ it was constructed with,
  # so a rescuer can inspect both.
  #
  class ParseError < Error
    attr_reader :input, :position

    # input::    value stored as #input
    # position:: value stored as #position
    def initialize(input, position)
      @input = input
      @position = position
      # Hand a real message to StandardError; without this call #message
      # would only ever be the class name.
      super("parse error in #{input.inspect} at position #{position.inspect}")
    end
  end
end
|
14
|
+
|
@@ -0,0 +1,183 @@
|
|
1
|
+
# Classes and constants used throughout the module
|
2
|
+
# * Initial
|
3
|
+
# * Final
|
4
|
+
# * TonelessSyllable
|
5
|
+
# * Syllable
|
6
|
+
# * ILLEGAL_COMBINATIONS
|
7
|
+
|
8
|
+
module Pinyin
|
9
|
+
|
10
|
+
#
|
11
|
+
# A Chinese initial (start of a syllable)
|
12
|
+
#
|
13
|
+
|
14
|
+
class Initial
  # Name of this initial; identical to the name of the constant that holds it
  attr_reader :name

  def initialize(n)
    @name = n
  end

  # Every initial, in table order. Each instance is also published as a
  # constant of this class (Initial::Bo, Initial::Zhi, ...).
  All = %w(
    Empty Bo Po Mo Fo De Te Ne Le Ge Ke He
    Ji Qi Xi Zhi Chi Shi Ri Zi Ci Si
  ).map { |label| const_set(label, new(label)) }

  # The instances created above are the only ones: +new+ is private from here on
  class << self
    private :new
  end

  Group_0 = [Empty]
  Group_1 = [Bo, Po, Mo, Fo]      #Bilabial and Labio-dental
  Group_2 = [De, Te, Ne, Le]      #Plosive, nasal and lateral approximant alveolar
  Group_3 = [Ge, Ke, He]          #Velar
  Group_4 = [Ji, Qi, Xi]          #Alveolo-palatal
  Group_5 = [Zhi, Chi, Shi, Ri]   #Retroflex
  Group_6 = [Zi, Ci, Si]          #Fricative and affricate alveolar

  Groups = [Group_0, Group_1, Group_2, Group_3, Group_4, Group_5, Group_6]

  # Joins this initial with the final +f+ into a TonelessSyllable
  def +(f)
    TonelessSyllable.new(self, f)
  end

  def inspect
    "<#{self.class.name}::#{name}>"
  end
end
|
47
|
+
|
48
|
+
#
|
49
|
+
# A Chinese final (end of a syllable)
|
50
|
+
#
|
51
|
+
|
52
|
+
class Final
  # Name of this final; identical to the name of the constant that holds it
  attr_reader :name

  def initialize(n)
    @name = n
  end

  # Every final, in table order. Each instance is also published as a
  # constant of this class (Final::A, Final::Iong, ...).
  All = %w(
    Empty A O E Ee Ai Ei Ao Ou An En Ang Eng Ong Er
    I Ia Io Ie Iai Iao Iu Ian In Iang Ing
    U Ua Uo Uai Ui Uan Un Uang Ueng V Ue Van Vn Iong
  ).map { |label| const_set(label, new(label)) }

  # The instances created above are the only ones: +new+ is private from here on
  class << self
    private :new
  end

  Group_0 = [Empty]
  Group_A = [A, O, E, Ee, Ai, Ei, Ao, Ou, An, En, Ang, Eng, Ong, Er]
  Group_I = [I, Ia, Io, Ie, Iai, Iao, Iu, Ian, In, Iang, Ing]
  Group_U = [U, Ua, Uo, Uai, Ui, Uan, Un, Uang, Ueng]
  Group_V = [V, Ue, Van, Vn, Iong]

  Groups = [Group_0, Group_A, Group_I, Group_U, Group_V]

  def inspect
    "<#{self.class.name}::#{name}>"
  end
end
|
79
|
+
|
80
|
+
|
81
|
+
#
|
82
|
+
# Combination of an initial and a final
|
83
|
+
# Not to be confused with a syllable that has the neutral tone
|
84
|
+
#
|
85
|
+
|
86
|
+
class TonelessSyllable
  attr_accessor :initial, :final

  class << self
    # True when initial +i+ and final +f+ fall into one of the
    # [initial group, final group] pairs listed in ILLEGAL_COMBINATIONS
    def illegal?(i, f)
      ILLEGAL_COMBINATIONS.any? do |initials, finals|
        initials.include?(i) && finals.include?(f)
      end
    end
  end

  def initialize(initial, final)
    self.initial = initial
    self.final = final
  end

  # Adds a tone, producing a full Syllable
  def +(tone)
    Syllable.new(initial, final, tone)
  end

  def inspect
    "<#{self.class.name} <initial=#{initial.name}, final=#{final.name}>>"
  end

  alias_method :to_s, :inspect
end
|
108
|
+
|
109
|
+
|
110
|
+
#
|
111
|
+
# Syllable : initial, final and tone
|
112
|
+
#
|
113
|
+
|
114
|
+
class Syllable < TonelessSyllable
  attr_accessor :tone

  # Same as TonelessSyllable, plus the +tone+
  def initialize(initial, final, tone)
    super(initial, final)
    self.tone = tone
  end

  def inspect
    "<#{self.class.name} <initial=#{initial.name}, final=#{final.name}, tone=#{tone}>>"
  end

  # Re-aliased here so that to_s follows the inspect defined in this class
  alias_method :to_s, :inspect
end
|
128
|
+
|
129
|
+
#
|
130
|
+
# Some groups of initials and finals may not be combined
|
131
|
+
# This list is not exhaustive but is sufficient to resolve ambiguity
|
132
|
+
#
|
133
|
+
|
134
|
+
ILLEGAL_COMBINATIONS =
  # Initial groups 0 through 4 paired with the empty final
  Initial::Groups[0..4].map { |initials| [initials, Final::Group_0] } +

  # Initial group 4 (Ji, Qi, Xi) paired with the U and A groups of finals
  [Final::Group_U, Final::Group_A].map { |finals| [Initial::Group_4, finals] } +

  # Initial groups 3, 5 and 6 paired with the I group of finals
  [Initial::Group_3, Initial::Group_5, Initial::Group_6].map { |initials| [initials, Final::Group_I] } +

  # Initial groups 1 and 3 paired with the V group of finals
  [Initial::Group_1, Initial::Group_3].map { |initials| [initials, Final::Group_V] } +

  #2008.05.26 lo is also valid!
  # Group_2 is therefore no longer excluded here; bo, po, mo, fo (and lo)
  # are the valid -o combinations
  Initial::Groups[3..6].map { |initials| [initials, [Final::O]] } +

  # TODO: Ong is actually the same as Ueng, in Hanyu Pinyin : -ong or weng
  [[[Initial::Empty], [Final::Ong]]]
|
162
|
+
|
163
|
+
class << self

  #
  # Yields a block for any valid initial/final pair
  #
  # The pairs are read from data/valid_pinyin.yaml next to this file; its
  # top-level keys are looked up as Final constants and the keys nested
  # below them as Initial constants.
  #
  def valid_combinations
    require 'yaml'
    path = File.join(File.dirname(__FILE__), 'data', 'valid_pinyin.yaml')
    YAML::load(IO.read(path)).each do |final_name, initials|
      final = Final.const_get(final_name)
      initials.each do |initial_name, _pinyin|
        yield(Initial.const_get(initial_name), final)
      end
    end
  end
end
|
182
|
+
|
183
|
+
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
class String
  # Memoizes pretty_tones conversions, keyed by the downcased numbered syllable
  PINYIN_CACHE={}

  # Replaces every tone-numbered syllable in the string (e.g. "hao3") by what
  # Pinyin.HanyuWriter(:accents) produces for it; all other text is kept.
  # "u:" is read as "ü".
  def pretty_tones
    # Up to six letters: zhuang, chuang and shuang are the longest syllables
    gsub('u:', 'ü').gsub(/[A-Za-züÜ]{1,6}\d/) do |match|
      key = match.downcase
      PINYIN_CACHE[key] ||= (Pinyin.HanyuWriter(:accents) << Pinyin.HanyuReader(:numbers).parse(key))
    end
  end

  # Collects every tone-numbered syllable in the string, passes each one
  # through Pinyin.ZhuyinWriter(:marks) and joins the results with spaces.
  # Text that is not part of a numbered syllable is dropped.
  def bpmf
    # A five-letter limit made this pick up "huang1" out of "zhuang1",
    # converting the wrong syllable; six letters covers the longest ones.
    syllables = gsub('u:', 'ü').scan(/[A-Za-züÜ]{1,6}\d/)
    syllables.map do |syllable|
      Pinyin.ZhuyinWriter(:marks) <<
        (Pinyin.HanyuReader(:numbers) << syllable.downcase)
    end.join(' ')
  end
end
|
data/lib/pinyin/tones.rb
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
module Pinyin
  #
  # Base class for Tone classes
  #
  # The class methods defined here are the fallback behaviour: no tone is
  # written and every syllable reads as the neutral tone.
  #
  class Tone
    VALID_TONES = 1..5
    MAX_TONE = NEUTRAL_TONE = 5

    class <<self
      # Returns syllable +s+ unchanged; tone +t+ is ignored
      def add_tone(s,t)
        s
      end

      # Always NEUTRAL_TONE
      def peek_tone(s)
        NEUTRAL_TONE
      end

      # Always [NEUTRAL_TONE, s]
      def pop_tone(s)
        [NEUTRAL_TONE, s]
      end

      private

      # Maps any tone number onto VALID_TONES: values already in range are
      # returned as they are, everything else wraps modulo MAX_TONE, with 0
      # standing for the neutral tone.
      def normalize(t)
        return t if VALID_TONES === t

        # The wrapped value must be returned explicitly: a trailing
        # "t = NEUTRAL_TONE if t == 0" evaluates to nil whenever t != 0.
        wrapped = t % MAX_TONE
        wrapped == 0 ? NEUTRAL_TONE : wrapped
      end

    end
  end
end
|
35
|
+
|
36
|
+
require "pinyin/tones/marks"
|
37
|
+
require "pinyin/tones/numbers"
|
38
|
+
require "pinyin/tones/accents"
|
39
|
+
require "pinyin/tones/no_tones"
|
40
|
+
|
41
|
+
module Pinyin
  module Tones
    # Same values as the constants of the same name on Pinyin::Tone
    VALID_TONES = 1..5
    MAX_TONE = NEUTRAL_TONE = 5

    # Every tone notation class loaded by the requires above
    All = [Numbers, Marks, Accents, NoTones]
  end
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
module Pinyin
  module Tones
    #
    # Tone notation that puts a diacritic on one vowel of the syllable.
    # The letter 'v' stands for 'ü' internally.
    #
    class Accents < Tone
      class <<self

        # Unicode code points per vowel, indexed by tone % MAX_TONE:
        # index 0 is the bare vowel (neutral tone), 1..4 carry the tone mark.
        UNICODE_TONE_GLYPHS={
          :a=>[97, 257, 225, 462, 224],
          :e=>[101, 275, 233, 283, 232],
          :i=>[105, 299, 237, 464, 236],
          :o=>[111, 333, 243, 466, 242],
          :u=>[117, 363, 250, 468, 249],
          :v=>[252, 470, 472, 474, 476]
        }

        # The UTF-8 string for vowel +letter+ (String or Symbol) carrying +tone+
        def tone_glyph(letter,tone)
          if (u=UNICODE_TONE_GLYPHS[letter.to_sym][tone%MAX_TONE])
            [u].pack('U')
          end
        end

        # Returns a copy of +syll+ with the mark for +tone+ on the proper
        # vowel: 'a' if there is one, else 'e', else 'o', else the last of
        # i/u/ü (so "liu" becomes "liù" and "gui" becomes "guì").
        # +syll+ itself is left untouched.
        def add_tone(syll, tone)
          plain = syll.gsub('ü','v')
          pos = plain.index('a') || plain.index('e') || plain.index('o') || plain.rindex(/[iuv]/)
          return plain unless pos

          marked = plain[0, pos] + tone_glyph(plain[pos, 1], tone) + plain[(pos + 1)..-1]
          # A 'v' that did not receive the mark (as in "lve") goes back to 'ü'
          marked.gsub('v','ü')
        end

        # The tone of +syll+, NEUTRAL_TONE when no vowel carries a mark
        def peek_tone(syll)
          pop_tone(syll).first
        end

        # Returns [tone, syllable with the marked vowel replaced by its bare
        # letter]. A marked ü comes back as 'v'.
        def pop_tone(syll)
          unpacked = syll.unpack('U*')
          # Marked vowels take precedence: the bare 'i' of "liù" must not be
          # taken for a neutral tone before the 'ù' is seen. The bare-vowel
          # pass afterwards keeps the handling of unmarked syllables as it
          # was (in particular a lone 'ü' is rewritten to 'v').
          vowel, tone_index, pos = locate_glyph(unpacked, true) || locate_glyph(unpacked, false)
          return [NEUTRAL_TONE, syll] unless vowel

          # unpack instead of String#[]: the latter gives a String on Ruby 1.9+
          unpacked[pos] = vowel.to_s.unpack('U').first
          [normalize(tone_index), unpacked.pack('U*')]
        end

        private

        # Looks for the first code point in +codepoints+ that is a marked
        # (+accented+ true) or bare (+accented+ false) vowel glyph.
        # Returns [vowel symbol, index in its glyph list, position in
        # +codepoints+] or nil.
        def locate_glyph(codepoints, accented)
          each_tone_glyph do |vowel, tones|
            wanted = accented ? tones[1..-1] : tones[0, 1]
            glyph = codepoints.find {|c| wanted.include?(c)}
            return [vowel, tones.index(glyph), codepoints.index(glyph)] if glyph
          end
          nil
        end

        # Yields each vowel symbol together with its list of glyph code points
        def each_tone_glyph
          [:a,:e,:i,:o,:u,:v].each do |v| #Order is significant
            yield v, UNICODE_TONE_GLYPHS[v]
          end
        end

      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Pinyin
  module Tones
    #
    # Tone notation that appends a separate tone mark character to the
    # syllable; the first tone has no mark.
    #
    class Marks < Tone
      class <<self

        # Indexed by tone % 5: neutral tone first, then tones 1 to 4
        GLYPHS=['˙', '', 'ˊ', 'ˇ', 'ˋ']

        # +syll+ followed by the mark for +tone+
        def add_tone(syll,tone)
          syll + GLYPHS[normalize(tone) % 5]
        end

        # The tone whose mark occurs in +syll+; 1 when there is no mark.
        # Written with "then": the "when ... :" form only parses on Ruby 1.8.
        def peek_tone(syll)
          case syll
          when /ˊ/ then 2
          when /ˇ/ then 3
          when /ˋ/ then 4
          when /˙/ then NEUTRAL_TONE
          else
            1
          end
        end

        # Returns [tone, leading part of +syll+ up to the first tone mark];
        # the second element is nil when +syll+ is empty or starts with a mark
        def pop_tone(syll)
          [ peek_tone(syll), syll[/\A[^#{GLYPHS.join}]+/] ]
        end

      end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Pinyin
  module Tones
    #
    # Tone notation that appends the tone number to the syllable
    #
    class Numbers < Tone
      class << self

        # +syll+ followed by the normalized tone number
        def add_tone(syll, tone)
          syll + normalize(tone).to_s
        end

        # The normalized value of the trailing digit of +syll+,
        # NEUTRAL_TONE when it does not end in a digit
        def peek_tone(syll)
          digit = syll[/(\d)\Z/, 1]
          digit ? normalize(Integer(digit)) : NEUTRAL_TONE
        end

        # Returns [tone, leading run of non-digit characters of +syll+]
        def pop_tone(syll)
          toneless = syll[/\A\D+/]
          [peek_tone(syll), toneless]
        end

      end
    end
  end
end
|
data/rakefile
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'hoe'
|
3
|
+
|
4
|
+
$:.unshift './lib'
|
5
|
+
|
6
|
+
require 'pinyin'
|
7
|
+
|
8
|
+
# Gem specification, declared through Hoe
Hoe.new('pinyin', Pinyin::VERSION) do |hoe|
  # Identity
  hoe.rubyforge_name = 'pinyin'
  hoe.author         = 'Arne Brasseur'
  hoe.email          = 'pinyin@arnebrasseur.net'
  hoe.url            = 'http://rubyforge.org/projects/pinyin'

  # Descriptive texts; description and changes are lifted from the text files
  hoe.summary     = 'A conversion library for Chinese transcription methods like Hanyu Pinyin, Bopomofo and Wade-Giles.'
  hoe.description = hoe.paragraphs_of('README.txt', 2).join
  hoe.changes     = hoe.paragraphs_of('History.txt', 0..1).join("\n\n")

  # Runtime dependency
  hoe.extra_deps << ['facets', '>= 2.4.0']

  hoe.spec_extras = {
    :extra_rdoc_files => ['README.txt', 'History.txt'],
    :rdoc_options => ['--main', 'README.txt'],
    :platform => Gem::Platform::RUBY
  }
end
|
data/script/update
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'pinyin'
|
2
|
+
require 'test/unit'
|
3
|
+
require 'csv'
|
4
|
+
|
5
|
+
|
6
|
+
# This test uses the chart from pinyin.info to compare all implemented conversion types
|
7
|
+
# Since I can't find another reference of the hanyu pinyin 'lo', I have removed it from the table
|
8
|
+
|
9
|
+
class TestCompare < Test::Unit::TestCase
  # Comparison table; the first row holds the name of each column
  CHART = CSV.parse(IO.read(File.dirname(__FILE__)+'/../lib/pinyin/data/comparison.csv'))
  COMPARE = [:hanyu, :wadegiles, :zhuyin, :tongyong]

  # Test all combinations, including parsing/unparsing the same type
  def test_do_comparisons
    COMPARE.each do |from|
      COMPARE.each { |to| compare(from, to) }
    end
  end

  # Converts every entry of the +from+ column and checks the outcome against
  # the +to+ column of the same row
  def compare(from, to)
    reader = Pinyin::Reader.new(from, :no_tones)
    writer = Pinyin::Writer.new(to, :no_tones)

    header = CHART[0]
    ifrom = header.index(from.to_s)
    ito = header.index(to.to_s)

    CHART[1..-1].each do |vals|
      expected = vals[ito].strip
      converted = writer << (reader << vals[ifrom].strip)
      assert_equal(expected, converted, "Converting from #{from} to #{to} value #{vals[ito]}")
    end
  end
end
|