pinyin 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,455 @@
1
+ # Valid pinyin syllables by final and initial
2
+ # Hpricoted from http://en.wikipedia.org/wiki/Pinyin_table
3
+ # Hand-edited for Ueng/weng which is under Ong in the table
4
+ #
5
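+ # Note that, in order to only use 7-bit ASCII characters in the names
6
+ # of the finals (the keys), the u with two dots (umlaut) is written V,
7
+ # and the e with a little hat on top (circumflex) is written Ee.
+ # The syllables themselves (the values) may still contain ü and ê.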
8
+ ---
9
+ V:
10
+ Ne: nü
11
+ Qi: qu
12
+ Empty: yu
13
+ Le: lü
14
+ Xi: xu
15
+ Ji: ju
16
+ Ian:
17
+ Ne: nian
18
+ De: dian
19
+ Qi: qian
20
+ Mo: mian
21
+ Bo: bian
22
+ Empty: yan
23
+ Le: lian
24
+ Xi: xian
25
+ Te: tian
26
+ Ji: jian
27
+ Po: pian
28
+ Ee:
29
+ Empty: ê
30
+ Ua:
31
+ Empty: wa
32
+ Chi: chua
33
+ Ke: kua
34
+ Zhi: zhua
35
+ He: hua
36
+ Ge: gua
37
+ Shi: shua
38
+ Iu:
39
+ Ne: niu
40
+ De: diu
41
+ Qi: qiu
42
+ Mo: miu
43
+ Empty: you
44
+ Le: liu
45
+ Xi: xiu
46
+ Ji: jiu
47
+ Iao:
48
+ Ne: niao
49
+ De: diao
50
+ Qi: qiao
51
+ Mo: miao
52
+ Bo: biao
53
+ Empty: yao
54
+ Le: liao
55
+ Xi: xiao
56
+ Te: tiao
57
+ Ji: jiao
58
+ Po: piao
59
+ A:
60
+ Si: sa
61
+ Ne: na
62
+ De: da
63
+ Mo: ma
64
+ Bo: ba
65
+ Empty: a
66
+ Le: la
67
+ Zi: za
68
+ Chi: cha
69
+ Ke: ka
70
+ Zhi: zha
71
+ Ci: ca
72
+ Te: ta
73
+ He: ha
74
+ Ge: ga
75
+ Shi: sha
76
+ Fo: fa
77
+ Po: pa
78
+ Vn:
79
+ Qi: qun
80
+ Empty: yun
81
+ Xi: xun
82
+ Ji: jun
83
+ Uan:
84
+ Si: suan
85
+ Ri: ruan
86
+ Ne: nuan
87
+ De: duan
88
+ Empty: wan
89
+ Le: luan
90
+ Zi: zuan
91
+ Chi: chuan
92
+ Ke: kuan
93
+ Zhi: zhuan
94
+ Ci: cuan
95
+ Te: tuan
96
+ He: huan
97
+ Ge: guan
98
+ Shi: shuan
99
+ Ing:
100
+ Ne: ning
101
+ De: ding
102
+ Qi: qing
103
+ Mo: ming
104
+ Bo: bing
105
+ Empty: ying
106
+ Le: ling
107
+ Xi: xing
108
+ Te: ting
109
+ Ji: jing
110
+ Po: ping
111
+ Ia:
112
+ Qi: qia
113
+ Empty: ya
114
+ Le: lia
115
+ Xi: xia
116
+ Ji: jia
117
+ Er:
118
+ Empty: er
119
+ An:
120
+ Si: san
121
+ Ri: ran
122
+ Ne: nan
123
+ De: dan
124
+ Mo: man
125
+ Bo: ban
126
+ Empty: an
127
+ Le: lan
128
+ Zi: zan
129
+ Chi: chan
130
+ Ke: kan
131
+ Zhi: zhan
132
+ Ci: can
133
+ Te: tan
134
+ He: han
135
+ Ge: gan
136
+ Shi: shan
137
+ Fo: fan
138
+ Po: pan
139
+ Empty:
140
+ Si: si
141
+ Ri: ri
142
+ Zi: zi
143
+ Chi: chi
144
+ Zhi: zhi
145
+ Ci: ci
146
+ Shi: shi
147
+ Van:
148
+ Qi: quan
149
+ Empty: yuan
150
+ Xi: xuan
151
+ Ji: juan
152
+ Un:
153
+ Si: sun
154
+ Ri: run
155
+ De: dun
156
+ Empty: wen
157
+ Le: lun
158
+ Zi: zun
159
+ Chi: chun
160
+ Ke: kun
161
+ Zhi: zhun
162
+ Ci: cun
163
+ Te: tun
164
+ He: hun
165
+ Ge: gun
166
+ Shi: shun
167
+ Ao:
168
+ Si: sao
169
+ Ri: rao
170
+ Ne: nao
171
+ De: dao
172
+ Mo: mao
173
+ Bo: bao
174
+ Empty: ao
175
+ Le: lao
176
+ Zi: zao
177
+ Chi: chao
178
+ Ke: kao
179
+ Zhi: zhao
180
+ Ci: cao
181
+ Te: tao
182
+ He: hao
183
+ Ge: gao
184
+ Shi: shao
185
+ Po: pao
186
+ Uo:
187
+ Si: suo
188
+ Ri: ruo
189
+ Ne: nuo
190
+ De: duo
191
+ Empty: wo
192
+ Le: luo
193
+ Zi: zuo
194
+ Chi: chuo
195
+ Ke: kuo
196
+ Zhi: zhuo
197
+ Ci: cuo
198
+ Te: tuo
199
+ He: huo
200
+ Ge: guo
201
+ Shi: shuo
202
+ Ang:
203
+ Si: sang
204
+ Ri: rang
205
+ Ne: nang
206
+ De: dang
207
+ Mo: mang
208
+ Bo: bang
209
+ Empty: ang
210
+ Le: lang
211
+ Zi: zang
212
+ Chi: chang
213
+ Ke: kang
214
+ Zhi: zhang
215
+ Ci: cang
216
+ Te: tang
217
+ He: hang
218
+ Ge: gang
219
+ Shi: shang
220
+ Fo: fang
221
+ Po: pang
222
+ Ei:
223
+ Ne: nei
224
+ De: dei
225
+ Mo: mei
226
+ Bo: bei
227
+ Empty: ei
228
+ Le: lei
229
+ Zi: zei
230
+ Zhi: zhei
231
+ He: hei
232
+ Ge: gei
233
+ Shi: shei
234
+ Fo: fei
235
+ Po: pei
236
+ O:
237
+ Mo: mo
238
+ Bo: bo
239
+ Empty: o
240
+ Fo: fo
241
+ Po: po
242
+ Ue:
243
+ Ne: nüe
244
+ Qi: que
245
+ Empty: yue
246
+ Le: lüe
247
+ Xi: xue
248
+ Ji: jue
249
+ In:
250
+ Ne: nin
251
+ Qi: qin
252
+ Mo: min
253
+ Bo: bin
254
+ Empty: yin
255
+ Le: lin
256
+ Xi: xin
257
+ Ji: jin
258
+ Po: pin
259
+ E:
260
+ Si: se
261
+ Ri: re
262
+ Ne: ne
263
+ De: de
264
+ Mo: me
265
+ Empty: e
266
+ Le: le
267
+ Zi: ze
268
+ Chi: che
269
+ Ke: ke
270
+ Zhi: zhe
271
+ Ci: ce
272
+ Te: te
273
+ He: he
274
+ Ge: ge
275
+ Shi: she
276
+ Iang:
277
+ Ne: niang
278
+ Qi: qiang
279
+ Empty: yang
280
+ Le: liang
281
+ Xi: xiang
282
+ Ji: jiang
283
+ Iai:
284
+ Empty: yai
285
+ Ie:
286
+ Ne: nie
287
+ De: die
288
+ Qi: qie
289
+ Mo: mie
290
+ Bo: bie
291
+ Empty: ye
292
+ Le: lie
293
+ Xi: xie
294
+ Te: tie
295
+ Ji: jie
296
+ Po: pie
297
+ Io:
298
+ Empty: yo
299
+ Ou:
300
+ Si: sou
301
+ Ri: rou
302
+ Ne: nou
303
+ De: dou
304
+ Mo: mou
305
+ Empty: ou
306
+ Le: lou
307
+ Zi: zou
308
+ Chi: chou
309
+ Ke: kou
310
+ Zhi: zhou
311
+ Ci: cou
312
+ Te: tou
313
+ He: hou
314
+ Ge: gou
315
+ Shi: shou
316
+ Fo: fou
317
+ Po: pou
318
+ Uai:
319
+ Empty: wai
320
+ Chi: chuai
321
+ Ke: kuai
322
+ Zhi: zhuai
323
+ He: huai
324
+ Ge: guai
325
+ Shi: shuai
326
+ Ueng:
327
+ Empty: weng
328
+ Ong:
329
+ Si: song
330
+ Ri: rong
331
+ Ne: nong
332
+ De: dong
333
+ Le: long
334
+ Zi: zong
335
+ Chi: chong
336
+ Ke: kong
337
+ Zhi: zhong
338
+ Ci: cong
339
+ Te: tong
340
+ He: hong
341
+ Ge: gong
342
+ Eng:
343
+ Si: seng
344
+ Ri: reng
345
+ Ne: neng
346
+ De: deng
347
+ Mo: meng
348
+ Bo: beng
349
+ Empty: eng
350
+ Le: leng
351
+ Zi: zeng
352
+ Chi: cheng
353
+ Ke: keng
354
+ Zhi: zheng
355
+ Ci: ceng
356
+ Te: teng
357
+ He: heng
358
+ Ge: geng
359
+ Shi: sheng
360
+ Fo: feng
361
+ Po: peng
362
+ Ai:
363
+ Si: sai
364
+ Ne: nai
365
+ De: dai
366
+ Mo: mai
367
+ Bo: bai
368
+ Empty: ai
369
+ Le: lai
370
+ Zi: zai
371
+ Chi: chai
372
+ Ke: kai
373
+ Zhi: zhai
374
+ Ci: cai
375
+ Te: tai
376
+ He: hai
377
+ Ge: gai
378
+ Shi: shai
379
+ Po: pai
380
+ Iong:
381
+ Qi: qiong
382
+ Empty: yong
383
+ Xi: xiong
384
+ Ji: jiong
385
+ Uang:
386
+ Empty: wang
387
+ Chi: chuang
388
+ Ke: kuang
389
+ Zhi: zhuang
390
+ He: huang
391
+ Ge: guang
392
+ Shi: shuang
393
+ Ui:
394
+ Si: sui
395
+ Ri: rui
396
+ De: dui
397
+ Empty: wei
398
+ Zi: zui
399
+ Chi: chui
400
+ Ke: kui
401
+ Zhi: zhui
402
+ Ci: cui
403
+ Te: tui
404
+ He: hui
405
+ Ge: gui
406
+ Shi: shui
407
+ I:
408
+ Ne: ni
409
+ De: di
410
+ Qi: qi
411
+ Mo: mi
412
+ Bo: bi
413
+ Empty: yi
414
+ Le: li
415
+ Xi: xi
416
+ Te: ti
417
+ Ji: ji
418
+ Po: pi
419
+ En:
420
+ Si: sen
421
+ Ri: ren
422
+ Ne: nen
423
+ Mo: men
424
+ Bo: ben
425
+ Empty: en
426
+ Zi: zen
427
+ Chi: chen
428
+ Ke: ken
429
+ Zhi: zhen
430
+ Ci: cen
431
+ He: hen
432
+ Ge: gen
433
+ Shi: shen
434
+ Fo: fen
435
+ Po: pen
436
+ U:
437
+ Si: su
438
+ Ri: ru
439
+ Ne: nu
440
+ De: du
441
+ Mo: mu
442
+ Bo: bu
443
+ Empty: wu
444
+ Le: lu
445
+ Zi: zu
446
+ Chi: chu
447
+ Ke: ku
448
+ Zhi: zhu
449
+ Ci: cu
450
+ Te: tu
451
+ He: hu
452
+ Ge: gu
453
+ Shi: shu
454
+ Fo: fu
455
+ Po: pu
data/lib/exception.rb ADDED
@@ -0,0 +1,14 @@
1
module Pinyin
  # All exceptions arising from this module inherit from Pinyin::Error
  Error = Class.new StandardError

  # Raised when a piece of input cannot be parsed.
  #
  # input::    the offending piece of input
  # position:: where that piece was found in the original input
  # message::  optional human readable description; when omitted a default
  #            built from +input+ and +position+ is used, so the exception
  #            never reports just its class name
  #
  # The two-argument form together with <tt>raise ParseError.new(i, p), "msg"</tt>
  # keeps working: +raise+ replaces the message on a copy of the exception
  # and the copy retains +input+ and +position+.
  class ParseError < Error
    attr_reader :input, :position

    def initialize(input, position, message = nil)
      # Pass the message explicitly: a bare +super+ would forward all three
      # arguments to StandardError#initialize, which accepts at most one.
      super(message || "Illegal input <#{input}> at position #{position}.")
      @input = input
      @position = position
    end
  end
end
14
+
data/lib/groundwork.rb ADDED
@@ -0,0 +1,148 @@
1
+ # Classes and constants used throughout the module
2
+ # * Initial
3
+ # * Final
4
+ # * TonelessSyllable
5
+ # * Syllable
6
+ # * ILLEGAL_COMBINATIONS
7
+
8
module Pinyin
  # A Chinese initial (start of a syllable).
  #
  # The only instances are the ones created below while building +All+;
  # each is published as a constant named after itself (Initial::Bo,
  # Initial::Zhi, ...) and +new+ is made private right afterwards.
  class Initial
    attr_reader :name

    def initialize(label)
      @name = label
    end

    # Every initial in declaration order. const_set returns the value it
    # stores, so the mapped array holds the Initial instances themselves.
    All = %w(
      Empty Bo Po Mo Fo De Te Ne Le Ge Ke He
      Ji Qi Xi Zhi Chi Shi Ri Zi Ci Si
    ).map { |label| const_set(label, new(label)) }

    private_class_method :new

    Group_0 = [Empty]
    Group_1 = [Bo, Po, Mo, Fo]       # Bilabial and Labio-dental
    Group_2 = [De, Te, Ne, Le]       # Plosive, nasal and lateral approximant alveolar
    Group_3 = [Ge, Ke, He]           # Velar
    Group_4 = [Ji, Qi, Xi]           # Alveolo-palatal
    Group_5 = [Zhi, Chi, Shi, Ri]    # Retroflex
    Group_6 = [Zi, Ci, Si]           # Fricative and affricate alveolar

    Groups = [Group_0, Group_1, Group_2, Group_3, Group_4, Group_5, Group_6]

    # initial + final => TonelessSyllable
    def +(f)
      TonelessSyllable.new(self, f)
    end

    def inspect
      "<#{self.class.name}::#{@name}>"
    end
  end


  # A Chinese final (end of a syllable).
  #
  # Built the same way as Initial: a closed set of instances, each one
  # reachable as a constant (Final::A, Final::Iong, ...), with +new+ private.
  class Final
    attr_reader :name

    def initialize(label)
      @name = label
    end

    # Every final in declaration order.
    All = %w(
      Empty A O E Ee Ai Ei Ao Ou An En Ang Eng Ong Er
      I Ia Io Ie Iai Iao Iu Ian In Iang Ing
      U Ua Uo Uai Ui Uan Un Uang Ueng V Ue Van Vn Iong
    ).map { |label| const_set(label, new(label)) }

    private_class_method :new

    Group_0 = [Empty]
    Group_A = [A, O, E, Ee, Ai, Ei, Ao, Ou, An, En, Ang, Eng, Ong, Er]
    Group_I = [I, Ia, Io, Ie, Iai, Iao, Iu, Ian, In, Iang, Ing]
    Group_U = [U, Ua, Uo, Uai, Ui, Uan, Un, Uang, Ueng]
    Group_V = [V, Ue, Van, Vn, Iong]

    Groups = [Group_0, Group_A, Group_I, Group_U, Group_V]

    def inspect
      "<#{self.class.name}::#{name}>"
    end
  end


  # Combination of an initial and a final.
  # Not to be confused with a syllable that has the neutral tone.
  class TonelessSyllable
    attr_accessor :initial, :final

    def initialize(initial, final)
      # Assign through the writers, as the accessors are the public contract.
      self.initial, self.final = initial, final
    end

    # toneless syllable + tone => Syllable
    def +(tone)
      Syllable.new(initial, final, tone)
    end

    def inspect
      "<#{self.class.name} <initial=#{initial.name}, final=#{final.name}>>"
    end

    # True when the pair (i, f) falls inside any of the initial-group /
    # final-group pairs listed in ILLEGAL_COMBINATIONS.
    def self.illegal?(i, f)
      ILLEGAL_COMBINATIONS.each do |initials, finals|
        return true if initials.include?(i) && finals.include?(f)
      end
      false
    end

    alias_method :to_s, :inspect
  end


  # Syllable : initial, final and tone
  class Syllable < TonelessSyllable
    attr_accessor :tone

    def initialize(initial, final, tone)
      super(initial, final)
      self.tone = tone
    end

    def inspect
      "<#{self.class.name} <initial=#{initial.name}, final=#{final.name}, tone=#{tone}>>"
    end

    # The alias inherited from TonelessSyllable still points at the parent's
    # inspect, so it is declared again to pick up the version above.
    alias_method :to_s, :inspect
  end


  # Some groups of initials and finals may not be combined.
  # This list is not exhaustive but is sufficient to resolve ambiguity.
  # Each entry is a pair [initials, finals].
  ILLEGAL_COMBINATIONS =
    # Initial groups 0 to 4 are listed as illegal with the Empty final
    Initial::Groups[0..4].map { |grp| [grp, Final::Group_0] } +
    [Final::Group_U, Final::Group_A].map { |fins| [Initial::Group_4, fins] } +
    [Initial::Group_3, Initial::Group_5, Initial::Group_6].map { |grp| [grp, Final::Group_I] } +
    [Initial::Group_1, Initial::Group_3].map { |grp| [grp, Final::Group_V] } +
    # Only bo, po, mo and fo are valid -o combinations
    Initial::Groups[2..6].map { |grp| [grp, [Final::O]] } +
    # Some say ong and ueng is actually the same final, zhuyin uses the same
    # representation, but ueng only has standalone form weng
    [[[Initial::Empty], [Final::Ong]]]
end
data/lib/pinyin.rb ADDED
@@ -0,0 +1,71 @@
1
+ # Handle several romanization systems for Mandarin Chinese
2
+ #
3
+ # Author:: Arne Brasseur (pinyin@arnebrasseur.net)
4
+ # Copyright:: Copyright (c) 2007, Arne Brasseur
5
+ # Licence:: GNU General Public License, latest version
6
+
7
+ $: << File.dirname(__FILE__)
8
+
9
+ require 'support'
10
+ require 'groundwork'
11
+ require 'exception'
12
+
13
+ require 'tones'
14
+ Pinyin::Tones::All.each{|m| require 'tones/'+m}
15
+
16
+ require 'conversions'
17
+
18
+
19
module Pinyin
  # Turns a romanized string into an array of Syllable objects.
  class Reader
    # conv:: name of the romanization; kept as a string and handed to
    #        Conversions.parse on every call
    # tone:: name of the tone notation; camelized and looked up as a
    #        constant inside Tones
    def initialize(conv, tone)
      @conv = conv.to_s #Conversions.const_get conv.to_s.camelize
      @tone = Tones.const_get tone.to_s.camelize
    end

    # Tokenizes +str+ and converts every token into a Syllable.
    # Raises ParseError (carrying the token and its position) when the tone,
    # the initial or the final of a token cannot be determined.
    def parse(str)
      Conversions.tokenize(str).map do |s, pos|
        tone, syll = @tone.pop_tone(s)
        tsyll = Conversions.parse(@conv, syll)
        # NOTE(review): Conversions.parse is defined outside this file;
        # presumably it can return nil for an unknown syllable. Guard the
        # accessors so that case reaches the ParseError below instead of
        # failing with a NoMethodError on nil.
        ini = tsyll && tsyll.initial
        fin = tsyll && tsyll.final
        unless tone && fin && ini
          raise ParseError.new(s, pos), "Illegal syllable <#{s}> in input <#{str}> at position #{pos}."
        end
        Syllable.new(ini, fin, tone)
      end
    end

    alias :<< :parse
  end

  # Turns Syllable objects back into a romanized string.
  class Writer
    # Same arguments as Reader#initialize.
    def initialize(conv, tone)
      @conv = conv.to_s #Conversions.const_get conv.to_s.camelize
      @tone = Tones.const_get tone.to_s.camelize
    end

    # +py+ is either a single syllable or anything responding to +map+;
    # a collection is rendered syllable by syllable and joined with spaces.
    def unparse(py)
      conv = lambda { |syll| @tone.add_tone(Conversions.unparse(@conv, syll), syll.tone) }
      if py.respond_to? :map
        py.map(&conv).join(' ')
      else
        conv.call(py)
      end
    end

    alias :<< :unparse
  end

  # Reads a string in one romanization / tone notation and writes it out
  # in another by chaining a Reader and a Writer.
  class Converter
    def initialize(from, from_tone, to, to_tone)
      @reader = Reader.new(from, from_tone)
      @writer = Writer.new(to, to_tone)
    end

    def convert(str)
      @writer.unparse @reader.parse(str)
    end

    alias :<< :convert
  end
end
70
+
71
+
data/lib/support.rb ADDED
@@ -0,0 +1,16 @@
1
class String
  # Converts an underscore separated name to CamelCase:
  # every part is capitalized (first letter upper case, the rest lower case)
  # and the parts are joined without a separator.
  def camelize
    split(/_/).map { |part| part.capitalize }.join
  end

  # Fallback for runtimes whose String has no Array-returning #chars.
  # Where the built-in already yields an Array it is left untouched: the
  # unpack('U*') implementation below only understands UTF-8 and raises
  # ArgumentError on malformed input.
  unless ''.respond_to?(:chars) && ''.chars.is_a?(Array)
    # Splits the string into an array of single-character strings,
    # treating the bytes as UTF-8.
    def chars
      unpack('U*').map { |codepoint| [codepoint].pack('U') }
    end
  end
end
10
+
11
class Object
  # Hands +s+ to the given block and then returns +s+ itself; whatever
  # the block evaluates to is discarded.
  #
  #   returning([]) { |list| list << 1 }   #=> [1]
  def returning(s)
    yield s
    return s
  end
end