pinyin 0.0.1 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +12 -0
- data/Manifest.txt +31 -0
- data/README.txt +725 -0
- data/Rakefile +36 -0
- data/TODO +2 -1
- data/examples/cgiform/cgiform.rb +0 -0
- data/examples/cgiform/template.rhtml +0 -0
- data/examples/hello.rb +0 -0
- data/lib/pinyin.rb +31 -12
- data/lib/pinyin/conversion.rb +51 -0
- data/lib/{conversions.rb → pinyin/conversions.rb} +7 -6
- data/lib/pinyin/conversions/hanyu.rb +77 -0
- data/lib/{data → pinyin/data}/comparison.csv +0 -0
- data/lib/{data → pinyin/data}/final.csv +0 -0
- data/lib/{data → pinyin/data}/initial.csv +0 -0
- data/lib/{data → pinyin/data}/paladiy.txt +0 -0
- data/lib/{data → pinyin/data}/rules.yaml +0 -0
- data/lib/{data → pinyin/data}/valid_pinyin.yaml +2 -3
- data/lib/{exception.rb → pinyin/exception.rb} +0 -0
- data/lib/{groundwork.rb → pinyin/groundwork.rb} +40 -5
- data/lib/pinyin/string.rb +14 -0
- data/lib/{support.rb → pinyin/support.rb} +1 -5
- data/lib/pinyin/tones.rb +47 -0
- data/lib/{tones → pinyin/tones}/accents.rb +14 -11
- data/lib/{tones → pinyin/tones}/marks.rb +10 -5
- data/lib/pinyin/tones/no_tones.rb +6 -0
- data/lib/pinyin/tones/numbers.rb +25 -0
- data/rakefile +17 -34
- data/script/update +4 -0
- data/test/{comparison_test.rb → test_comparison.rb} +1 -1
- data/test/{hanyu_coverage.rb → test_hanyu_coverage.rb} +3 -1
- metadata +92 -61
- data/lib/tones.rb +0 -19
- data/lib/tones/no_tones.rb +0 -16
- data/lib/tones/numbers.rb +0 -24
data/Rakefile
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
|
3
|
+
require 'rake'
|
4
|
+
require 'rake/testtask'
|
5
|
+
require 'hoe'
|
6
|
+
|
7
|
+
$:.unshift './lib'
|
8
|
+
|
9
|
+
require 'pinyin'
|
10
|
+
|
11
|
+
Hoe.new('pinyin', Pinyin::VERSION) do |p|
|
12
|
+
p.rubyforge_name = 'pinyin'
|
13
|
+
p.summary = "A conversion library for Chinese transcription methods like Hanyu Pinyin, Bopomofo and Wade-Giles"
|
14
|
+
p.description = p.paragraphs_of('README', 2).join
|
15
|
+
p.url = "http://rubyforge.org/projects/pinyin"
|
16
|
+
p.changes = p.paragraphs_of('History.txt', 0..1).join("\n\n")
|
17
|
+
p.email = "pinyin@arnebrasseur.net"
|
18
|
+
p.author = 'Arne Brasseur'
|
19
|
+
p.remote_rdoc_dir=""
|
20
|
+
p.spec_extras = {
|
21
|
+
:extra_rdoc_files => ["README", "History.txt"],
|
22
|
+
:rdoc_options => ["--main", "README"],
|
23
|
+
:platform => Gem::Platform::RUBY
|
24
|
+
}
|
25
|
+
end
|
26
|
+
|
27
|
+
task :default => [:test_units]
|
28
|
+
|
29
|
+
namespace "test" do
|
30
|
+
Rake::TestTask.new("pinyin") do |t|
|
31
|
+
$: << File.dirname(__FILE__) + '/lib'
|
32
|
+
t.pattern = 'test/*.rb'
|
33
|
+
t.verbose = true
|
34
|
+
t.warning = true
|
35
|
+
end
|
36
|
+
end
|
data/TODO
CHANGED
@@ -11,6 +11,7 @@
|
|
11
11
|
- Yale
|
12
12
|
|
13
13
|
- Research some rare pinyin syllables : lo, yo, ^e, yai
|
14
|
+
- Get a definitive answer about ong/ueng/weng
|
14
15
|
- Add a general README as rdoc start page
|
15
16
|
- Add a README to the data/ directory with info on sources, contents and purposes
|
16
17
|
- More tests
|
@@ -19,4 +20,4 @@
|
|
19
20
|
!More
|
20
21
|
The core lib basically does translation on the syllable level. It can handle strings with syllables nicely separated by spaces. Successive layers should make it possible to convert a sentence with punctuation into a different system. It should be possible to write compound words together in Hanyu, and have the syllables separated by dashes when converting to WG. For instance:
|
21
22
|
|
22
|
-
Wǒ de péngyǒu, shì dàifu. => Wǒ te p`éng-yǔ, shih tài-fu.
|
23
|
+
Wǒ de péngyǒu, shì dàifu. => Wǒ te p`éng-yǔ, shih tài-fu.>>>>>>> .r211
|
data/examples/cgiform/cgiform.rb
CHANGED
File without changes
|
File without changes
|
data/examples/hello.rb
CHANGED
File without changes
|
data/lib/pinyin.rb
CHANGED
@@ -6,21 +6,24 @@
|
|
6
6
|
|
7
7
|
$: << File.dirname(__FILE__)
|
8
8
|
|
9
|
-
require
|
10
|
-
require 'groundwork'
|
11
|
-
require 'exception'
|
9
|
+
require "facets/string/camelcase"
|
12
10
|
|
13
|
-
require '
|
14
|
-
|
15
|
-
|
16
|
-
require 'conversions'
|
11
|
+
require 'pinyin/support'
|
12
|
+
require 'pinyin/groundwork'
|
13
|
+
require 'pinyin/exception'
|
17
14
|
|
15
|
+
require 'pinyin/tones'
|
16
|
+
require 'pinyin/conversion'
|
17
|
+
require 'pinyin/conversions'
|
18
|
+
require 'pinyin/conversions/hanyu'
|
18
19
|
|
19
20
|
module Pinyin
|
21
|
+
VERSION = "0.1.4"
|
22
|
+
|
20
23
|
class Reader
|
21
24
|
def initialize(conv, tone)
|
22
|
-
@conv = conv.to_s
|
23
|
-
@tone = Tones.const_get tone.to_s.
|
25
|
+
@conv = conv.to_s
|
26
|
+
@tone = Tones.const_get tone.to_s.camelcase
|
24
27
|
end
|
25
28
|
|
26
29
|
def parse(str)
|
@@ -28,9 +31,13 @@ module Pinyin
|
|
28
31
|
tone,syll = @tone.pop_tone(s)
|
29
32
|
tsyll = Conversions.parse(@conv,syll)
|
30
33
|
ini, fin = tsyll.initial, tsyll.final
|
31
|
-
|
34
|
+
unless tone && fin && ini
|
35
|
+
raise ParseError.new(s,pos),"Illegal syllable <#{s}> in input <#{str}> at position #{pos}."
|
36
|
+
end
|
32
37
|
Syllable.new(ini, fin, tone)
|
33
38
|
end
|
39
|
+
rescue Object => e
|
40
|
+
raise ParseError.new(str,0), "Parsing of #{str.inspect} failed : #{e}"
|
34
41
|
end
|
35
42
|
|
36
43
|
alias :<< :parse
|
@@ -38,8 +45,8 @@ module Pinyin
|
|
38
45
|
|
39
46
|
class Writer
|
40
47
|
def initialize(conv, tone)
|
41
|
-
@conv = conv.to_s
|
42
|
-
@tone = Tones.const_get tone.to_s.
|
48
|
+
@conv = conv.to_s
|
49
|
+
@tone = Tones.const_get tone.to_s.camelcase
|
43
50
|
end
|
44
51
|
|
45
52
|
def unparse(py)
|
@@ -66,6 +73,18 @@ module Pinyin
|
|
66
73
|
|
67
74
|
alias :<< :convert
|
68
75
|
end
|
76
|
+
|
77
|
+
class <<self
|
78
|
+
Conversions::All.each do |c|
|
79
|
+
define_method "#{c.to_s.camelcase}Reader" do |tone|
|
80
|
+
Reader.new(c, tone)
|
81
|
+
end
|
82
|
+
|
83
|
+
define_method "#{c.to_s.camelcase}Writer" do |tone|
|
84
|
+
Writer.new(c, tone)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
69
88
|
end
|
70
89
|
|
71
90
|
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module Pinyin
|
2
|
+
|
3
|
+
#
|
4
|
+
# Base class for conversions like Hanyu pinyin,
|
5
|
+
# Wade-Giles, etc.
|
6
|
+
#
|
7
|
+
class Conversion
|
8
|
+
|
9
|
+
# Separator between syllables in the same word
|
10
|
+
# For Wade-Giles this is a dash, Hanyu pinyin
|
11
|
+
# uses a single quote in certain situations
|
12
|
+
attr_reader :syllable_separator
|
13
|
+
|
14
|
+
# The tone handling object
|
15
|
+
attr_reader :tones
|
16
|
+
|
17
|
+
# An optional lambda that preprocesses input
|
18
|
+
attr_reader :preprocessor
|
19
|
+
|
20
|
+
# The name of this conversion, the same name used
|
21
|
+
# in the data file and that is also available as
|
22
|
+
# a method name on Initial and Final objects.
|
23
|
+
#
|
24
|
+
# By default the underscorized class name
|
25
|
+
attr_reader :name
|
26
|
+
|
27
|
+
def initialize(tone = :numbers, options = {})
|
28
|
+
@preprocessor = options[:preprocessor] || lambda {|s| s}
|
29
|
+
|
30
|
+
if Tone === tone
|
31
|
+
@tone = tone
|
32
|
+
else
|
33
|
+
@tone = Pinyin::Tones.const_get(tone.to_s.camelcase)
|
34
|
+
end
|
35
|
+
|
36
|
+
@name = self.class.name.underscore
|
37
|
+
end
|
38
|
+
|
39
|
+
# Converts a string into an array of strings and
|
40
|
+
# syllable objects.
|
41
|
+
def parse(string)
|
42
|
+
end
|
43
|
+
|
44
|
+
# Converts an array of strings and syllable objects
|
45
|
+
# into a string
|
46
|
+
def unparse(array)
|
47
|
+
end
|
48
|
+
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
@@ -12,7 +12,7 @@ module Pinyin
|
|
12
12
|
klazz=Pinyin.const_get c
|
13
13
|
begin
|
14
14
|
CSV.open(DATA_DIR+c.downcase+'.csv', 'r').each do |name, *values|
|
15
|
-
All << name unless All.index name || name =~ /name|standalone/i
|
15
|
+
All << name.to_s unless All.index name || name =~ /name|standalone/i
|
16
16
|
klazz.class_eval {attr_accessor name.to_sym}
|
17
17
|
values.each_with_index do |v,i|
|
18
18
|
klazz::All[i].send(name+'=', v)
|
@@ -63,12 +63,13 @@ module Pinyin
|
|
63
63
|
end
|
64
64
|
|
65
65
|
private
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
66
|
+
def self.apply_rules(type, string)
|
67
|
+
returning string.dup do |s|
|
68
|
+
@@rules[type] && @@rules[type].each do |rule|
|
69
|
+
s.gsub!(Regexp.new(rule['match']),rule['subst'])
|
70
|
+
end
|
70
71
|
end
|
71
72
|
end
|
72
|
-
|
73
|
+
|
73
74
|
end
|
74
75
|
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
module Pinyin
|
2
|
+
module Conversions
|
3
|
+
class Hanyu
|
4
|
+
def initialize(tone = :numbers, options = {})
|
5
|
+
@options = options
|
6
|
+
@options[:preprocess] ||= lambda {|s| s.gsub(/u:|Ü/, 'ü').downcase }
|
7
|
+
|
8
|
+
if Class === tone
|
9
|
+
@tone = tone
|
10
|
+
else
|
11
|
+
@tone = Pinyin::Tones.const_get(tone.to_s.camelcase)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
def valid_character_regexp
|
16
|
+
@valid_character_regexp ||= valid_character_regexp!
|
17
|
+
end
|
18
|
+
|
19
|
+
def valid_character_regexp!
|
20
|
+
valid_chars = []
|
21
|
+
Pinyin.valid_combinations do |i,f|
|
22
|
+
1.upto(5) do |tone|
|
23
|
+
valid_chars += @tone.add_tone(Conversions.unparse(:hanyu,TonelessSyllable.new(i,f)), tone).chars
|
24
|
+
end
|
25
|
+
end
|
26
|
+
valid_chars.sort!.uniq!
|
27
|
+
Regexp.new(valid_chars.map{|ch| Regexp.escape(ch)}.join('|'))
|
28
|
+
end
|
29
|
+
|
30
|
+
def parse(string)
|
31
|
+
result = []
|
32
|
+
looking_at = []
|
33
|
+
string.chars.each do |ch|
|
34
|
+
head, syll = parse_tail(looking_at)
|
35
|
+
looking_at << ch
|
36
|
+
if syll && !parse_tail(looking_at)
|
37
|
+
puts "-> #{syll.inspect}"
|
38
|
+
result << head.to_s unless head.empty?
|
39
|
+
result << syll
|
40
|
+
looking_at = [ch]
|
41
|
+
end
|
42
|
+
end
|
43
|
+
result
|
44
|
+
end
|
45
|
+
|
46
|
+
def parse_tail(chars)
|
47
|
+
7.downto(1) do |i|
|
48
|
+
head = chars[0...-i]
|
49
|
+
tail = chars[-i..-1]
|
50
|
+
syll = parse_syllable( tail )
|
51
|
+
return head, syll if syll
|
52
|
+
end
|
53
|
+
nil
|
54
|
+
end
|
55
|
+
|
56
|
+
def parse_syllable(tone_syll)
|
57
|
+
tone_syll = tone_syll.to_s
|
58
|
+
tone_syll = @options[:preprocess].call(tone_syll) if @options[:preprocess]
|
59
|
+
# p tone_syll
|
60
|
+
tone, syll = @tone.pop_tone(tone_syll)
|
61
|
+
if tone && syll
|
62
|
+
ini_fini = Conversions.parse(:hanyu,syll)
|
63
|
+
if ini_fini
|
64
|
+
p tone, syll, ini_fini
|
65
|
+
ini, fini = ini_fini.initial, ini_fini.final
|
66
|
+
end
|
67
|
+
|
68
|
+
return Syllable.new(ini, fini, tone) if tone && ini && fini
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
# self.gsub('u:','ü').gsub(/[A-Za-züÜ]{1,5}\d/) do |m|
|
73
|
+
#Pinyin.HanyuWriter(:accents) << Pinyin.HanyuReader(:numbers).parse(m.downcase)
|
74
|
+
#end
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
@@ -2,9 +2,8 @@
|
|
2
2
|
# Hpricoted from http://en.wikipedia.org/wiki/Pinyin_table
|
3
3
|
# Hand-edited for Ueng/weng which is under Ong in the table
|
4
4
|
#
|
5
|
-
#
|
6
|
-
#
|
7
|
-
# a little hat on top (circumflex) is written E.
|
5
|
+
# TODO : This is actually wrong, Ong should be removed in favor
|
6
|
+
# of -ueng/weng
|
8
7
|
---
|
9
8
|
V:
|
10
9
|
Ne: nü
|
File without changes
|
@@ -6,7 +6,11 @@
|
|
6
6
|
# * ILLEGAL_COMBINATIONS
|
7
7
|
|
8
8
|
module Pinyin
|
9
|
+
|
10
|
+
#
|
9
11
|
# A Chinese initial (start of a syllable)
|
12
|
+
#
|
13
|
+
|
10
14
|
class Initial
|
11
15
|
attr :name
|
12
16
|
def initialize(n)
|
@@ -41,8 +45,10 @@ module Pinyin
|
|
41
45
|
end
|
42
46
|
end
|
43
47
|
|
44
|
-
|
48
|
+
#
|
45
49
|
# A Chinese final (end of a syllable)
|
50
|
+
#
|
51
|
+
|
46
52
|
class Final
|
47
53
|
attr :name
|
48
54
|
def initialize(n)
|
@@ -72,8 +78,11 @@ module Pinyin
|
|
72
78
|
end
|
73
79
|
|
74
80
|
|
81
|
+
#
|
75
82
|
# Combination of an initial and a final
|
76
83
|
# Not to be confused with a syllable that has the neutral tone
|
84
|
+
#
|
85
|
+
|
77
86
|
class TonelessSyllable
|
78
87
|
attr_accessor :initial, :final
|
79
88
|
|
@@ -98,7 +107,10 @@ module Pinyin
|
|
98
107
|
end
|
99
108
|
|
100
109
|
|
110
|
+
#
|
101
111
|
# Syllable : initial, final and tone
|
112
|
+
#
|
113
|
+
|
102
114
|
class Syllable < TonelessSyllable
|
103
115
|
attr_accessor :tone
|
104
116
|
|
@@ -114,9 +126,11 @@ module Pinyin
|
|
114
126
|
alias :to_s :inspect
|
115
127
|
end
|
116
128
|
|
117
|
-
|
129
|
+
#
|
118
130
|
# Some groups of initials and finals may not be combined
|
119
131
|
# This list is not exhaustive but is sufficient to resolve ambiguity
|
132
|
+
#
|
133
|
+
|
120
134
|
ILLEGAL_COMBINATIONS=
|
121
135
|
[
|
122
136
|
[Initial::Group_0, Final::Group_0],
|
@@ -135,14 +149,35 @@ module Pinyin
|
|
135
149
|
[Initial::Group_1, Final::Group_V],
|
136
150
|
[Initial::Group_3, Final::Group_V],
|
137
151
|
|
138
|
-
|
152
|
+
#2008.05.26 lo is also valid!
|
153
|
+
#[Initial::Group_2, [Final::O]], #Only bo, po, mo and fo are valid -o combinations
|
139
154
|
[Initial::Group_3, [Final::O]],
|
140
155
|
[Initial::Group_4, [Final::O]],
|
141
156
|
[Initial::Group_5, [Final::O]],
|
142
157
|
[Initial::Group_6, [Final::O]],
|
143
158
|
|
144
|
-
[[Initial::Empty], [Final::Ong]]
|
145
|
-
|
159
|
+
[[Initial::Empty], [Final::Ong]]
|
160
|
+
# TODO: Ong is actually the same as Ueng, in Hanyu Pinyin : -ong or weng
|
146
161
|
]
|
147
162
|
|
163
|
+
class <<self
|
164
|
+
|
165
|
+
|
166
|
+
#
|
167
|
+
# Yields each valid initial/final pair to the given block
|
168
|
+
#
|
169
|
+
|
170
|
+
def valid_combinations
|
171
|
+
require 'yaml'
|
172
|
+
inp = YAML::load(IO.read(File.join(File.dirname(__FILE__), 'data', 'valid_pinyin.yaml')))
|
173
|
+
inp.each do |final, initials|
|
174
|
+
final = Final.const_get(final)
|
175
|
+
initials.each do |initial, pinyin|
|
176
|
+
initial = Initial.const_get(initial)
|
177
|
+
yield(initial, final)
|
178
|
+
end
|
179
|
+
end
|
180
|
+
end
|
181
|
+
end
|
182
|
+
|
148
183
|
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
class String
|
2
|
+
def pretty_tones
|
3
|
+
self.gsub('u:','ü').gsub(/[A-Za-züÜ]{1,5}\d/) do |m|
|
4
|
+
Pinyin.HanyuWriter(:accents) << Pinyin.HanyuReader(:numbers).parse(m.downcase)
|
5
|
+
end
|
6
|
+
end
|
7
|
+
|
8
|
+
def bpmf
|
9
|
+
self.gsub('u:','ü').scan(/[A-Za-züÜ]{1,5}\d/).map do |m|
|
10
|
+
Pinyin.ZhuyinWriter(:marks) <<
|
11
|
+
(Pinyin.HanyuReader(:numbers) << m.downcase)
|
12
|
+
end.join(' ')
|
13
|
+
end
|
14
|
+
end
|