pinyin 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,455 @@
1
+ # Valid pinyin syllables by final and initial
2
+ # Hpricoted from http://en.wikipedia.org/wiki/Pinyin_table
3
+ # Hand-edited for Ueng/weng which is under Ong in the table
4
+ #
5
+ # Note that, in order to only use 7-bit ASCII characters,
6
+ # the u with two dots (umlaut) is written v, and the e with
7
+ # a little hat on top (circumflex) is written Ee (plain E is the ordinary e final). This applies to the keys only; the values below keep ü and ê.
8
+ ---
9
+ V:
10
+ Ne: nü
11
+ Qi: qu
12
+ Empty: yu
13
+ Le: lü
14
+ Xi: xu
15
+ Ji: ju
16
+ Ian:
17
+ Ne: nian
18
+ De: dian
19
+ Qi: qian
20
+ Mo: mian
21
+ Bo: bian
22
+ Empty: yan
23
+ Le: lian
24
+ Xi: xian
25
+ Te: tian
26
+ Ji: jian
27
+ Po: pian
28
+ Ee:
29
+ Empty: ê
30
+ Ua:
31
+ Empty: wa
32
+ Chi: chua
33
+ Ke: kua
34
+ Zhi: zhua
35
+ He: hua
36
+ Ge: gua
37
+ Shi: shua
38
+ Iu:
39
+ Ne: niu
40
+ De: diu
41
+ Qi: qiu
42
+ Mo: miu
43
+ Empty: you
44
+ Le: liu
45
+ Xi: xiu
46
+ Ji: jiu
47
+ Iao:
48
+ Ne: niao
49
+ De: diao
50
+ Qi: qiao
51
+ Mo: miao
52
+ Bo: biao
53
+ Empty: yao
54
+ Le: liao
55
+ Xi: xiao
56
+ Te: tiao
57
+ Ji: jiao
58
+ Po: piao
59
+ A:
60
+ Si: sa
61
+ Ne: na
62
+ De: da
63
+ Mo: ma
64
+ Bo: ba
65
+ Empty: a
66
+ Le: la
67
+ Zi: za
68
+ Chi: cha
69
+ Ke: ka
70
+ Zhi: zha
71
+ Ci: ca
72
+ Te: ta
73
+ He: ha
74
+ Ge: ga
75
+ Shi: sha
76
+ Fo: fa
77
+ Po: pa
78
+ Vn:
79
+ Qi: qun
80
+ Empty: yun
81
+ Xi: xun
82
+ Ji: jun
83
+ Uan:
84
+ Si: suan
85
+ Ri: ruan
86
+ Ne: nuan
87
+ De: duan
88
+ Empty: wan
89
+ Le: luan
90
+ Zi: zuan
91
+ Chi: chuan
92
+ Ke: kuan
93
+ Zhi: zhuan
94
+ Ci: cuan
95
+ Te: tuan
96
+ He: huan
97
+ Ge: guan
98
+ Shi: shuan
99
+ Ing:
100
+ Ne: ning
101
+ De: ding
102
+ Qi: qing
103
+ Mo: ming
104
+ Bo: bing
105
+ Empty: ying
106
+ Le: ling
107
+ Xi: xing
108
+ Te: ting
109
+ Ji: jing
110
+ Po: ping
111
+ Ia:
112
+ Qi: qia
113
+ Empty: ya
114
+ Le: lia
115
+ Xi: xia
116
+ Ji: jia
117
+ Er:
118
+ Empty: er
119
+ An:
120
+ Si: san
121
+ Ri: ran
122
+ Ne: nan
123
+ De: dan
124
+ Mo: man
125
+ Bo: ban
126
+ Empty: an
127
+ Le: lan
128
+ Zi: zan
129
+ Chi: chan
130
+ Ke: kan
131
+ Zhi: zhan
132
+ Ci: can
133
+ Te: tan
134
+ He: han
135
+ Ge: gan
136
+ Shi: shan
137
+ Fo: fan
138
+ Po: pan
139
+ Empty:
140
+ Si: si
141
+ Ri: ri
142
+ Zi: zi
143
+ Chi: chi
144
+ Zhi: zhi
145
+ Ci: ci
146
+ Shi: shi
147
+ Van:
148
+ Qi: quan
149
+ Empty: yuan
150
+ Xi: xuan
151
+ Ji: juan
152
+ Un:
153
+ Si: sun
154
+ Ri: run
155
+ De: dun
156
+ Empty: wen
157
+ Le: lun
158
+ Zi: zun
159
+ Chi: chun
160
+ Ke: kun
161
+ Zhi: zhun
162
+ Ci: cun
163
+ Te: tun
164
+ He: hun
165
+ Ge: gun
166
+ Shi: shun
167
+ Ao:
168
+ Si: sao
169
+ Ri: rao
170
+ Ne: nao
171
+ De: dao
172
+ Mo: mao
173
+ Bo: bao
174
+ Empty: ao
175
+ Le: lao
176
+ Zi: zao
177
+ Chi: chao
178
+ Ke: kao
179
+ Zhi: zhao
180
+ Ci: cao
181
+ Te: tao
182
+ He: hao
183
+ Ge: gao
184
+ Shi: shao
185
+ Po: pao
186
+ Uo:
187
+ Si: suo
188
+ Ri: ruo
189
+ Ne: nuo
190
+ De: duo
191
+ Empty: wo
192
+ Le: luo
193
+ Zi: zuo
194
+ Chi: chuo
195
+ Ke: kuo
196
+ Zhi: zhuo
197
+ Ci: cuo
198
+ Te: tuo
199
+ He: huo
200
+ Ge: guo
201
+ Shi: shuo
202
+ Ang:
203
+ Si: sang
204
+ Ri: rang
205
+ Ne: nang
206
+ De: dang
207
+ Mo: mang
208
+ Bo: bang
209
+ Empty: ang
210
+ Le: lang
211
+ Zi: zang
212
+ Chi: chang
213
+ Ke: kang
214
+ Zhi: zhang
215
+ Ci: cang
216
+ Te: tang
217
+ He: hang
218
+ Ge: gang
219
+ Shi: shang
220
+ Fo: fang
221
+ Po: pang
222
+ Ei:
223
+ Ne: nei
224
+ De: dei
225
+ Mo: mei
226
+ Bo: bei
227
+ Empty: ei
228
+ Le: lei
229
+ Zi: zei
230
+ Zhi: zhei
231
+ He: hei
232
+ Ge: gei
233
+ Shi: shei
234
+ Fo: fei
235
+ Po: pei
236
+ O:
237
+ Mo: mo
238
+ Bo: bo
239
+ Empty: o
240
+ Fo: fo
241
+ Po: po
242
+ Ue:
243
+ Ne: nüe
244
+ Qi: que
245
+ Empty: yue
246
+ Le: lüe
247
+ Xi: xue
248
+ Ji: jue
249
+ In:
250
+ Ne: nin
251
+ Qi: qin
252
+ Mo: min
253
+ Bo: bin
254
+ Empty: yin
255
+ Le: lin
256
+ Xi: xin
257
+ Ji: jin
258
+ Po: pin
259
+ E:
260
+ Si: se
261
+ Ri: re
262
+ Ne: ne
263
+ De: de
264
+ Mo: me
265
+ Empty: e
266
+ Le: le
267
+ Zi: ze
268
+ Chi: che
269
+ Ke: ke
270
+ Zhi: zhe
271
+ Ci: ce
272
+ Te: te
273
+ He: he
274
+ Ge: ge
275
+ Shi: she
276
+ Iang:
277
+ Ne: niang
278
+ Qi: qiang
279
+ Empty: yang
280
+ Le: liang
281
+ Xi: xiang
282
+ Ji: jiang
283
+ Iai:
284
+ Empty: yai
285
+ Ie:
286
+ Ne: nie
287
+ De: die
288
+ Qi: qie
289
+ Mo: mie
290
+ Bo: bie
291
+ Empty: ye
292
+ Le: lie
293
+ Xi: xie
294
+ Te: tie
295
+ Ji: jie
296
+ Po: pie
297
+ Io:
298
+ Empty: yo
299
+ Ou:
300
+ Si: sou
301
+ Ri: rou
302
+ Ne: nou
303
+ De: dou
304
+ Mo: mou
305
+ Empty: ou
306
+ Le: lou
307
+ Zi: zou
308
+ Chi: chou
309
+ Ke: kou
310
+ Zhi: zhou
311
+ Ci: cou
312
+ Te: tou
313
+ He: hou
314
+ Ge: gou
315
+ Shi: shou
316
+ Fo: fou
317
+ Po: pou
318
+ Uai:
319
+ Empty: wai
320
+ Chi: chuai
321
+ Ke: kuai
322
+ Zhi: zhuai
323
+ He: huai
324
+ Ge: guai
325
+ Shi: shuai
326
+ Ueng:
327
+ Empty: weng
328
+ Ong:
329
+ Si: song
330
+ Ri: rong
331
+ Ne: nong
332
+ De: dong
333
+ Le: long
334
+ Zi: zong
335
+ Chi: chong
336
+ Ke: kong
337
+ Zhi: zhong
338
+ Ci: cong
339
+ Te: tong
340
+ He: hong
341
+ Ge: gong
342
+ Eng:
343
+ Si: seng
344
+ Ri: reng
345
+ Ne: neng
346
+ De: deng
347
+ Mo: meng
348
+ Bo: beng
349
+ Empty: eng
350
+ Le: leng
351
+ Zi: zeng
352
+ Chi: cheng
353
+ Ke: keng
354
+ Zhi: zheng
355
+ Ci: ceng
356
+ Te: teng
357
+ He: heng
358
+ Ge: geng
359
+ Shi: sheng
360
+ Fo: feng
361
+ Po: peng
362
+ Ai:
363
+ Si: sai
364
+ Ne: nai
365
+ De: dai
366
+ Mo: mai
367
+ Bo: bai
368
+ Empty: ai
369
+ Le: lai
370
+ Zi: zai
371
+ Chi: chai
372
+ Ke: kai
373
+ Zhi: zhai
374
+ Ci: cai
375
+ Te: tai
376
+ He: hai
377
+ Ge: gai
378
+ Shi: shai
379
+ Po: pai
380
+ Iong:
381
+ Qi: qiong
382
+ Empty: yong
383
+ Xi: xiong
384
+ Ji: jiong
385
+ Uang:
386
+ Empty: wang
387
+ Chi: chuang
388
+ Ke: kuang
389
+ Zhi: zhuang
390
+ He: huang
391
+ Ge: guang
392
+ Shi: shuang
393
+ Ui:
394
+ Si: sui
395
+ Ri: rui
396
+ De: dui
397
+ Empty: wei
398
+ Zi: zui
399
+ Chi: chui
400
+ Ke: kui
401
+ Zhi: zhui
402
+ Ci: cui
403
+ Te: tui
404
+ He: hui
405
+ Ge: gui
406
+ Shi: shui
407
+ I:
408
+ Ne: ni
409
+ De: di
410
+ Qi: qi
411
+ Mo: mi
412
+ Bo: bi
413
+ Empty: yi
414
+ Le: li
415
+ Xi: xi
416
+ Te: ti
417
+ Ji: ji
418
+ Po: pi
419
+ En:
420
+ Si: sen
421
+ Ri: ren
422
+ Ne: nen
423
+ Mo: men
424
+ Bo: ben
425
+ Empty: en
426
+ Zi: zen
427
+ Chi: chen
428
+ Ke: ken
429
+ Zhi: zhen
430
+ Ci: cen
431
+ He: hen
432
+ Ge: gen
433
+ Shi: shen
434
+ Fo: fen
435
+ Po: pen
436
+ U:
437
+ Si: su
438
+ Ri: ru
439
+ Ne: nu
440
+ De: du
441
+ Mo: mu
442
+ Bo: bu
443
+ Empty: wu
444
+ Le: lu
445
+ Zi: zu
446
+ Chi: chu
447
+ Ke: ku
448
+ Zhi: zhu
449
+ Ci: cu
450
+ Te: tu
451
+ He: hu
452
+ Ge: gu
453
+ Shi: shu
454
+ Fo: fu
455
+ Po: pu
data/lib/exception.rb ADDED
@@ -0,0 +1,14 @@
1
module Pinyin
  # All exceptions arising from this module inherit from Pinyin::Error,
  # so callers can rescue the whole family with a single clause.
  Error = Class.new StandardError

  # Raised when a piece of input cannot be turned into a syllable.
  #
  # input::    the offending piece of input
  # position:: where that piece was found in the complete input
  class ParseError < Error
    attr_reader :input, :position

    # input::    the piece of input that could not be parsed
    # position:: its position in the complete input
    # message::  optional human readable message; when omitted a default
    #            naming +input+ and +position+ is used, so the error stays
    #            descriptive even when it is raised without a message
    #
    # A message passed to Kernel#raise together with the exception object
    # (raise ParseError.new(s, pos), "...") still overrides this one.
    def initialize(input, position, message = nil)
      @input = input
      @position = position
      super(message || "Illegal syllable <#{input}> at position #{position}.")
    end
  end
end
14
+
data/lib/groundwork.rb ADDED
@@ -0,0 +1,148 @@
1
+ # Classes and constants used throughout the module
2
+ # * Initial
3
+ # * Final
4
+ # * TonelessSyllable
5
+ # * Syllable
6
+ # * ILLEGAL_COMBINATIONS
7
+
8
module Pinyin
  # A Chinese initial (start of a syllable).
  #
  # The instances are created once, while this class body is evaluated, and
  # each one is bound to a constant of the same name (Initial::Bo,
  # Initial::Zhi, ...). Initial.new is made private right afterwards, so code
  # outside the class has to use those constants.
  class Initial
    # The name of this initial; equal to the name of its constant.
    attr_reader :name

    # n:: the name of the initial
    def initialize(n)
      @name = n
    end

    # Every initial, in declaration order. Frozen: this is a shared table.
    All = %w(
      Empty Bo Po Mo Fo De Te Ne Le Ge Ke He
      Ji Qi Xi Zhi Chi Shi Ri Zi Ci Si
    ).map { |c| const_set c, Initial.new(c) }.freeze

    # Has to come after All: the block above still calls Initial.new.
    private_class_method :new

    Group_0 = [Empty].freeze
    Group_1 = [Bo, Po, Mo, Fo].freeze      # Bilabial and Labio-dental
    Group_2 = [De, Te, Ne, Le].freeze      # Plosive, nasal and lateral approximant alveolar
    Group_3 = [Ge, Ke, He].freeze          # Velar
    Group_4 = [Ji, Qi, Xi].freeze          # Alveolo-palatal
    Group_5 = [Zhi, Chi, Shi, Ri].freeze   # Retroflex
    Group_6 = [Zi, Ci, Si].freeze          # Fricative and affricate alveolar

    # All groups, in order; together they contain every entry of All.
    Groups = [Group_0, Group_1, Group_2, Group_3, Group_4, Group_5, Group_6].freeze

    # Combine this initial with a final.
    #
    # f:: the Final to append
    #
    # Returns a new TonelessSyllable. No legality check is done here; see
    # TonelessSyllable.illegal? for that.
    def +(f)
      TonelessSyllable.new(self, f)
    end

    # E.g. "<Pinyin::Initial::Bo>"
    def inspect
      "<#{self.class.name}::#{@name}>"
    end
  end


  # A Chinese final (end of a syllable).
  #
  # Built the same way as Initial: one instance per name, bound to a constant
  # of that name, with Final.new made private afterwards.
  class Final
    # The name of this final; equal to the name of its constant.
    attr_reader :name

    # n:: the name of the final
    def initialize(n)
      @name = n
    end

    # Every final, in declaration order. Frozen: this is a shared table.
    All = %w(
      Empty A O E Ee Ai Ei Ao Ou An En Ang Eng Ong Er
      I Ia Io Ie Iai Iao Iu Ian In Iang Ing
      U Ua Uo Uai Ui Uan Un Uang Ueng V Ue Van Vn Iong
    ).map { |c| const_set c, Final.new(c) }.freeze

    # Has to come after All: the block above still calls Final.new.
    private_class_method :new

    # Note that Iong is listed in Group_V, not in Group_I.
    Group_0 = [Empty].freeze
    Group_A = [A, O, E, Ee, Ai, Ei, Ao, Ou, An, En, Ang, Eng, Ong, Er].freeze
    Group_I = [I, Ia, Io, Ie, Iai, Iao, Iu, Ian, In, Iang, Ing].freeze
    Group_U = [U, Ua, Uo, Uai, Ui, Uan, Un, Uang, Ueng].freeze
    Group_V = [V, Ue, Van, Vn, Iong].freeze

    # All groups, in order; together they contain every entry of All.
    Groups = [Group_0, Group_A, Group_I, Group_U, Group_V].freeze

    # E.g. "<Pinyin::Final::Ang>"
    def inspect
      "<#{self.class.name}::#{name}>"
    end
  end


  # Combination of an initial and a final.
  # Not to be confused with a syllable that has the neutral tone.
  class TonelessSyllable
    attr_accessor :initial, :final

    # initial:: an Initial
    # final::   a Final
    #
    # The pair is stored as given; it is not checked against
    # ILLEGAL_COMBINATIONS.
    def initialize(initial, final)
      self.initial = initial
      self.final = final
    end

    # Attach a tone.
    #
    # tone:: the tone, stored as given
    #
    # Returns a new Syllable with this initial and final.
    def +(tone)
      Syllable.new(initial, final, tone)
    end

    # E.g. "<Pinyin::TonelessSyllable <initial=Bo, final=A>>"
    def inspect
      "<#{self.class.name} <initial=#{initial.name}, final=#{final.name}>>"
    end

    # True when ILLEGAL_COMBINATIONS contains a pair of groups such that
    # +i+ is in the group of initials and +f+ is in the group of finals.
    #
    # i:: an Initial
    # f:: a Final
    def self.illegal?(i, f)
      ILLEGAL_COMBINATIONS.any? { |in_gr, fin_gr| in_gr.include?(i) && fin_gr.include?(f) }
    end

    alias :to_s :inspect
  end


  # Syllable : initial, final and tone
  class Syllable < TonelessSyllable
    attr_accessor :tone

    # initial:: an Initial
    # final::   a Final
    # tone::    the tone, stored as given
    def initialize(initial, final, tone)
      super(initial, final)
      self.tone = tone
    end

    # E.g. "<Pinyin::Syllable <initial=Bo, final=A, tone=3>>"
    def inspect
      "<#{self.class.name} <initial=#{initial.name}, final=#{final.name}, tone=#{tone}>>"
    end

    # Repeated on purpose: the alias inherited from TonelessSyllable is bound
    # to the parent's inspect, not to the one defined just above.
    alias :to_s :inspect
  end


  # Some groups of initials and finals may not be combined.
  # This list is not exhaustive but is sufficient to resolve ambiguity.
  #
  # Each entry is a pair [initials, finals]; every initial of the first array
  # is illegal with every final of the second. The table and its pairs are
  # frozen because they are shared by every caller of
  # TonelessSyllable.illegal?.
  only_o = [Final::O].freeze
  ILLEGAL_COMBINATIONS = [
    [Initial::Group_0, Final::Group_0],
    [Initial::Group_1, Final::Group_0],
    [Initial::Group_2, Final::Group_0],
    [Initial::Group_3, Final::Group_0],
    [Initial::Group_4, Final::Group_0],

    [Initial::Group_4, Final::Group_U],
    [Initial::Group_4, Final::Group_A],

    [Initial::Group_3, Final::Group_I],
    [Initial::Group_5, Final::Group_I],
    [Initial::Group_6, Final::Group_I],

    [Initial::Group_1, Final::Group_V],
    [Initial::Group_3, Final::Group_V],

    # Only bo, po, mo and fo are valid -o combinations
    [Initial::Group_2, only_o],
    [Initial::Group_3, only_o],
    [Initial::Group_4, only_o],
    [Initial::Group_5, only_o],
    [Initial::Group_6, only_o],

    # Some say ong and ueng is actually the same final, zhuyin uses the same
    # representation, but ueng only has standalone form weng
    [[Initial::Empty].freeze, [Final::Ong].freeze]
  ].map { |pair| pair.freeze }.freeze
end
data/lib/pinyin.rb ADDED
@@ -0,0 +1,71 @@
1
+ # Handle several romanization systems for Mandarin Chinese
2
+ #
3
+ # Author:: Arne Brasseur (pinyin@arnebrasseur.net)
4
+ # Copyright:: Copyright (c) 2007, Arne Brasseur
5
+ # Licence:: GNU General Public License, latest version
6
+
7
+ $: << File.dirname(__FILE__)
8
+
9
+ require 'support'
10
+ require 'groundwork'
11
+ require 'exception'
12
+
13
+ require 'tones'
14
+ Pinyin::Tones::All.each{|m| require 'tones/'+m}
15
+
16
+ require 'conversions'
17
+
18
+
19
module Pinyin
  # Turns romanized text into Syllable objects.
  class Reader
    # conv:: name of the romanization system; kept as a String and handed to
    #        Conversions.parse unchanged
    # tone:: name of the tone notation; camelized (String#camelize from
    #        support.rb) and looked up as a constant under Tones
    def initialize(conv, tone)
      @conv = conv.to_s
      @tone = Tones.const_get tone.to_s.camelize
    end

    # Parse a string into an array of Syllable objects.
    #
    # str:: the text to parse
    #
    # Each [token, position] pair from Conversions.tokenize has its tone
    # split off and the remainder parsed by Conversions.parse.
    #
    # Raises ParseError (carrying the token and its position) when no tone,
    # initial or final could be determined for a token.
    def parse(str)
      Conversions.tokenize(str).map do |s, pos|
        tone, syll = @tone.pop_tone(s)
        tsyll = Conversions.parse(@conv, syll)
        # Conversions.parse is not visible from here; guard against a nil
        # result so that an unparseable token ends in the ParseError below
        # rather than in a NoMethodError on nil.
        ini = tsyll && tsyll.initial
        fin = tsyll && tsyll.final
        unless tone && fin && ini
          raise ParseError.new(s, pos), "Illegal syllable <#{s}> in input <#{str}> at position #{pos}."
        end
        Syllable.new(ini, fin, tone)
      end
    end

    alias :<< :parse
  end

  # Turns Syllable objects back into romanized text.
  class Writer
    # conv:: name of the romanization system; kept as a String and handed to
    #        Conversions.unparse unchanged
    # tone:: name of the tone notation; camelized and looked up as a constant
    #        under Tones
    def initialize(conv, tone)
      @conv = conv.to_s
      @tone = Tones.const_get tone.to_s.camelize
    end

    # Write out one syllable or a collection of syllables.
    #
    # py:: a single syllable, or any object responding to +map+ (such as an
    #      Array of syllables)
    #
    # Returns a String. Syllables of a collection are joined by one space.
    def unparse(py)
      write_one = lambda { |syll| @tone.add_tone(Conversions.unparse(@conv, syll), syll.tone) }
      if py.respond_to? :map
        py.map(&write_one).join(' ')
      else
        write_one.call(py)
      end
    end

    alias :<< :unparse
  end

  # Converts text from one romanization / tone notation to another by
  # chaining a Reader and a Writer.
  class Converter
    # from, from_tone:: arguments for the Reader
    # to, to_tone::     arguments for the Writer
    def initialize(from, from_tone, to, to_tone)
      @reader = Reader.new(from, from_tone)
      @writer = Writer.new(to, to_tone)
    end

    # str:: text in the source notation
    #
    # Returns the text in the target notation. ParseError raised by the
    # Reader is not rescued here.
    def convert(str)
      @writer.unparse @reader.parse(str)
    end

    alias :<< :convert
  end
end
70
+
71
+
data/lib/support.rb ADDED
@@ -0,0 +1,16 @@
1
class String
  # Turn an underscore separated name into CamelCase: every part between
  # underscores is capitalized (first letter upper case, rest lower case)
  # and the parts are joined without a separator.
  #
  #   "wade_giles".camelize  # => "WadeGiles"
  def camelize
    split(/_/).map { |part| part.capitalize }.join
  end

  # Fallback for Ruby versions that have no String#chars of their own:
  # returns the characters of a UTF-8 string as an array of one-character
  # strings.
  #
  # It is only defined when String#chars is missing, so the core method is
  # never overridden. The core method also copes with strings that are not
  # valid UTF-8, where unpack('U*') raises ArgumentError.
  unless method_defined?(:chars)
    def chars
      unpack('U*').map { |codepoint| [codepoint].pack('U') }
    end
  end
end
10
+
11
class Object
  # Yield +s+ to the block, then return +s+ itself. The value of the block
  # is discarded.
  #
  # Lets an object be built and configured in a single expression:
  #
  #   returning([]) { |list| list << 1 }  # => [1]
  #
  # s:: the object to yield and return
  #
  # Raises LocalJumpError when called without a block.
  def returning(s)
    yield(s)
    s
  end
end