pinyin 0.0.1 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,36 @@
1
+ require 'rubygems'
2
+
3
+ require 'rake'
4
+ require 'rake/testtask'
5
+ require 'hoe'
6
+
7
+ $:.unshift './lib'
8
+
9
+ require 'pinyin'
10
+
11
# Gem specification for the pinyin gem, declared through Hoe.
# The version is read from Pinyin::VERSION, and the description and
# changes are pulled from paragraphs of README and History.txt.
Hoe.new('pinyin', Pinyin::VERSION) do |hoe|
  # Project identity
  hoe.rubyforge_name = 'pinyin'
  hoe.summary = "A conversion library for Chinese transcription methods like Hanyu Pinyin, Bopomofo and Wade-Giles"
  hoe.description = hoe.paragraphs_of('README', 2).join
  hoe.url = "http://rubyforge.org/projects/pinyin"
  hoe.changes = hoe.paragraphs_of('History.txt', 0..1).join("\n\n")

  # Maintainer
  hoe.email = "pinyin@arnebrasseur.net"
  hoe.author = 'Arne Brasseur'

  # Documentation and packaging
  hoe.remote_rdoc_dir = ""
  rdoc_sources = ["README", "History.txt"]
  hoe.spec_extras = {
    :extra_rdoc_files => rdoc_sources,
    :rdoc_options => ["--main", "README"],
    :platform => Gem::Platform::RUBY
  }
end
26
+
27
# Running plain `rake` runs the unit tests defined below.
# NOTE(review): this used to depend on :test_units, which is not defined
# anywhere in this Rakefile; the only test task declared here is
# test:pinyin, so the default task now points at it.
task :default => ['test:pinyin']

namespace "test" do
  # Defines the test:pinyin task, which runs every Ruby file directly
  # under test/ verbosely and with Ruby warnings enabled.
  Rake::TestTask.new("pinyin") do |t|
    # Hand lib/ to the test run through the task's own load-path list
    # instead of mutating the load path of the rake process.
    t.libs << File.join(File.dirname(__FILE__), 'lib')
    t.pattern = 'test/*.rb'
    t.verbose = true
    t.warning = true
  end
end
data/TODO CHANGED
@@ -11,6 +11,7 @@
11
11
  - Yale
12
12
 
13
13
  - Research some rare pinyin syllables : lo, yo ^e, yai
14
+ - Get a definitive answer about ong/ueng/weng
14
15
  - Add a general README as rdoc start page
15
16
  - Add a README to the data/ directory with info on sources, contents and purposes
16
17
  - More tests
@@ -19,4 +20,4 @@
19
20
  !More
20
21
  The core lib basically does translation on the syllable level. It can handle strings with syllables nicely separated by spaces. Successive layers should make it possible to convert a sentence with punctuation into a different system. It should be possible to write compound words together in Hanyu, and have the syllables separated by dashes when converting to WG. For instance:
21
22
 
22
- Wǒ de péngyǒu, shì dàifu. => Wǒ te p`éng-yǔ, shih tài-fu.
23
+ Wǒ de péngyǒu, shì dàifu. => Wǒ te p`éng-yǔ, shih tài-fu.
File without changes
File without changes
File without changes
@@ -6,21 +6,24 @@
6
6
 
7
7
  $: << File.dirname(__FILE__)
8
8
 
9
- require 'support'
10
- require 'groundwork'
11
- require 'exception'
9
+ require "facets/string/camelcase"
12
10
 
13
- require 'tones'
14
- Pinyin::Tones::All.each{|m| require 'tones/'+m}
15
-
16
- require 'conversions'
11
+ require 'pinyin/support'
12
+ require 'pinyin/groundwork'
13
+ require 'pinyin/exception'
17
14
 
15
+ require 'pinyin/tones'
16
+ require 'pinyin/conversion'
17
+ require 'pinyin/conversions'
18
+ require 'pinyin/conversions/hanyu'
18
19
 
19
20
  module Pinyin
21
+ VERSION = "0.1.4"
22
+
20
23
  class Reader
21
24
  def initialize(conv, tone)
22
- @conv = conv.to_s #Conversions.const_get conv.to_s.camelize
23
- @tone = Tones.const_get tone.to_s.camelize
25
+ @conv = conv.to_s
26
+ @tone = Tones.const_get tone.to_s.camelcase
24
27
  end
25
28
 
26
29
  def parse(str)
@@ -28,9 +31,13 @@ module Pinyin
28
31
  tone,syll = @tone.pop_tone(s)
29
32
  tsyll = Conversions.parse(@conv,syll)
30
33
  ini, fin = tsyll.initial, tsyll.final
31
- raise ParseError.new(s,pos),"Illegal syllable <#{s}> in input <#{str}> at position #{pos}." unless tone && fin && ini
34
+ unless tone && fin && ini
35
+ raise ParseError.new(s,pos),"Illegal syllable <#{s}> in input <#{str}> at position #{pos}."
36
+ end
32
37
  Syllable.new(ini, fin, tone)
33
38
  end
39
+ rescue Object => e
40
+ raise ParseError.new(str,0), "Parsing of #{str.inspect} failed : #{e}"
34
41
  end
35
42
 
36
43
  alias :<< :parse
@@ -38,8 +45,8 @@ module Pinyin
38
45
 
39
46
  class Writer
40
47
  def initialize(conv, tone)
41
- @conv = conv.to_s #Conversions.const_get conv.to_s.camelize
42
- @tone = Tones.const_get tone.to_s.camelize
48
+ @conv = conv.to_s
49
+ @tone = Tones.const_get tone.to_s.camelcase
43
50
  end
44
51
 
45
52
  def unparse(py)
@@ -66,6 +73,18 @@ module Pinyin
66
73
 
67
74
  alias :<< :convert
68
75
  end
76
+
77
class << self
  # For every conversion listed in Conversions::All, define a pair of
  # module-level factory methods named after it, e.g. a conversion whose
  # camelcased name is "Hanyu" gets HanyuReader(tone) and HanyuWriter(tone).
  # Each factory builds a Reader or Writer for that conversion with the
  # given tone system.
  Conversions::All.each do |conversion|
    prefix = conversion.to_s.camelcase

    [['Reader', Reader], ['Writer', Writer]].each do |suffix, klass|
      define_method("#{prefix}#{suffix}") do |tone|
        klass.new(conversion, tone)
      end
    end
  end
end
69
88
  end
70
89
 
71
90
 
@@ -0,0 +1,51 @@
1
module Pinyin

  #
  # Base class for conversions like Hanyu pinyin,
  # Wade-Giles, etc.
  #
  class Conversion

    # Separator between syllables in the same word.
    # For Wade-Giles this is a dash, Hanyu pinyin
    # uses a single quote in certain situations.
    # This base class never assigns it, so it is nil here.
    attr_reader :syllable_separator

    # The tone handling object
    attr_reader :tones

    # A lambda that preprocesses input; defaults to
    # the identity function
    attr_reader :preprocessor

    # The name of this conversion, the same name used
    # in the data file and that is also available as
    # a method name on Initial and Final objects.
    #
    # By default the underscorized class name, without
    # any enclosing module names
    attr_reader :name

    # tone    :: either the name of a tone system (Symbol or String,
    #            looked up as a constant under Pinyin::Tones) or an
    #            already-resolved tone handling object, used as given.
    # options :: :preprocessor => a callable applied to input.
    def initialize(tone = :numbers, options = {})
      @preprocessor = options[:preprocessor] || lambda {|s| s}

      # The reader is called +tones+, while earlier code stored the value
      # in @tone only, which left the reader permanently nil. Keep both
      # instance variables in sync.
      @tones = resolve_tones(tone)
      @tone  = @tones

      @name = underscore(self.class.name.to_s.split('::').last.to_s)
    end

    # Converts a string into an array of strings and
    # syllable objects. Not implemented in the base class.
    def parse(string)
    end

    # Converts an array of strings and syllable objects
    # into a string. Not implemented in the base class.
    def unparse(array)
    end

    private

    # Names are resolved under Pinyin::Tones; anything else is taken to
    # be a tone handling object already.
    def resolve_tones(tone)
      case tone
      when Symbol, String
        Pinyin::Tones.const_get(tone.to_s.camelcase)
      else
        tone
      end
    end

    # CamelCase -> snake_case using only core String methods, so no
    # String#underscore extension has to be loaded.
    def underscore(camel_cased)
      camel_cased.
        gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2').
        gsub(/([a-z\d])([A-Z])/, '\1_\2').
        downcase
    end

  end
end
51
+
@@ -12,7 +12,7 @@ module Pinyin
12
12
  klazz=Pinyin.const_get c
13
13
  begin
14
14
  CSV.open(DATA_DIR+c.downcase+'.csv', 'r').each do |name, *values|
15
- All << name unless All.index name || name =~ /name|standalone/i
15
+ All << name.to_s unless All.index name || name =~ /name|standalone/i
16
16
  klazz.class_eval {attr_accessor name.to_sym}
17
17
  values.each_with_index do |v,i|
18
18
  klazz::All[i].send(name+'=', v)
@@ -63,12 +63,13 @@ module Pinyin
63
63
  end
64
64
 
65
65
  private
66
- def self.apply_rules(type, string)
67
- returning string.dup do |s|
68
- @@rules[type] && @@rules[type].each do |rule|
69
- s.gsub!(Regexp.new(rule['match']),rule['subst'])
66
+ def self.apply_rules(type, string)
67
+ returning string.dup do |s|
68
+ @@rules[type] && @@rules[type].each do |rule|
69
+ s.gsub!(Regexp.new(rule['match']),rule['subst'])
70
+ end
70
71
  end
71
72
  end
72
- end
73
+
73
74
  end
74
75
  end
@@ -0,0 +1,77 @@
1
module Pinyin
  module Conversions
    #
    # Splits running Hanyu Pinyin text into syllable objects and the
    # plain-text fragments between them.
    #
    class Hanyu
      # Longest tail of the input that is probed for a syllable
      # (letters plus tone mark/number).
      MAX_SYLLABLE_LENGTH = 7

      # tone    :: a tone handler class, or the name of one (looked up
      #            as a constant under Pinyin::Tones).
      # options :: :preprocess => callable applied to every candidate
      #            syllable before parsing; by default "u:" and "Ü" are
      #            rewritten to "ü" and the text is downcased.
      def initialize(tone = :numbers, options = {})
        # Work on a copy so the caller's hash is not modified.
        @options = options.dup
        @options[:preprocess] ||= lambda {|s| s.gsub(/u:|Ü/, 'ü').downcase }

        if Class === tone
          @tone = tone
        else
          @tone = Pinyin::Tones.const_get(tone.to_s.camelcase)
        end
      end

      # Memoized form of valid_character_regexp!
      def valid_character_regexp
        @valid_character_regexp ||= valid_character_regexp!
      end

      # Builds a regexp matching any single character that occurs in
      # some valid syllable written with any of the tones 1 to 5.
      def valid_character_regexp!
        valid_chars = []
        Pinyin.valid_combinations do |i,f|
          1.upto(5) do |tone|
            toned = @tone.add_tone(Conversions.unparse(:hanyu,TonelessSyllable.new(i,f)), tone)
            valid_chars.concat(toned.chars.to_a)
          end
        end
        alternatives = valid_chars.uniq.sort.map {|ch| Regexp.escape(ch) }
        Regexp.new(alternatives.join('|'))
      end

      # Converts a string into an array of Syllable objects and plain
      # strings (the text found between syllables), in input order.
      def parse(string)
        result = []
        looking_at = []
        string.chars.each do |ch|
          head, syll = parse_tail(looking_at)
          looking_at << ch
          # A syllable ended just before +ch+ when the buffer used to end
          # in a syllable and no longer does once +ch+ is appended.
          next unless syll && !parse_tail(looking_at)
          emit(result, head, syll)
          looking_at = [ch]
        end

        # Flush what is still buffered; without this the last syllable
        # and any trailing text of the input would be lost.
        head, syll = parse_tail(looking_at)
        if syll
          emit(result, head, syll)
        elsif !looking_at.empty?
          result << looking_at.join
        end
        result
      end

      # Looks for the longest syllable at the end of +chars+ (an array of
      # single-character strings). Returns [head, syllable], where head is
      # the array of characters before the syllable, or nil when the
      # buffer does not end in a syllable.
      def parse_tail(chars)
        [MAX_SYLLABLE_LENGTH, chars.length].min.downto(1) do |i|
          syll = parse_syllable(chars[-i..-1])
          return chars[0...-i], syll if syll
        end
        nil
      end

      # Parses a single candidate syllable, given as a string or as an
      # array of characters. Returns a Syllable, or nil when the text has
      # no tone or is not a known initial/final combination.
      def parse_syllable(tone_syll)
        # Array#to_s only joins on Ruby 1.8; join explicitly.
        text = tone_syll.respond_to?(:join) ? tone_syll.join : tone_syll.to_s
        text = @options[:preprocess].call(text) if @options[:preprocess]

        tone, syll = @tone.pop_tone(text)
        return nil unless tone && syll

        ini_fini = Conversions.parse(:hanyu, syll)
        return nil unless ini_fini

        ini, fini = ini_fini.initial, ini_fini.final
        return nil unless ini && fini

        Syllable.new(ini, fini, tone)
      end

      private

      # Appends the text preceding a syllable (if any) and the syllable
      # itself to +result+.
      def emit(result, head, syll)
        result << head.join unless head.empty?
        result << syll
      end
    end
  end
end
File without changes
File without changes
File without changes
File without changes
File without changes
@@ -2,9 +2,8 @@
2
2
  # Hpricoted from http://en.wikipedia.org/wiki/Pinyin_table
3
3
  # Hand-edited for Ueng/weng which is under Ong in the table
4
4
  #
5
- # Note that, in order to only use 7-bit ASCII characters,
6
- # the u with two dots (umlaut) is written v, and the e with
7
- # a little hat on top (circumflex) is written E.
5
+ # TODO : This is actually wrong, Ong should be removed in favor
6
+ # of -ueng/weng
8
7
  ---
9
8
  V:
10
9
  Ne: nü
@@ -6,7 +6,11 @@
6
6
  # * ILLEGAL_COMBINATIONS
7
7
 
8
8
  module Pinyin
9
+
10
+ #
9
11
  # A Chinese initial (start of a syllable)
12
+ #
13
+
10
14
  class Initial
11
15
  attr :name
12
16
  def initialize(n)
@@ -41,8 +45,10 @@ module Pinyin
41
45
  end
42
46
  end
43
47
 
44
-
48
+ #
45
49
  # A Chinese final (end of a syllable)
50
+ #
51
+
46
52
  class Final
47
53
  attr :name
48
54
  def initialize(n)
@@ -72,8 +78,11 @@ module Pinyin
72
78
  end
73
79
 
74
80
 
81
+ #
75
82
  # Combination of an initial and a final
76
83
  # Not to be confused with a syllable that has the neutral tone
84
+ #
85
+
77
86
  class TonelessSyllable
78
87
  attr_accessor :initial, :final
79
88
 
@@ -98,7 +107,10 @@ module Pinyin
98
107
  end
99
108
 
100
109
 
110
+ #
101
111
  # Syllable : initial, final and tone
112
+ #
113
+
102
114
  class Syllable < TonelessSyllable
103
115
  attr_accessor :tone
104
116
 
@@ -114,9 +126,11 @@ module Pinyin
114
126
  alias :to_s :inspect
115
127
  end
116
128
 
117
-
129
+ #
118
130
  # Some groups of initials and finals may not be combined
119
131
  # This list is not exhaustive but is sufficient to resolve ambiguity
132
+ #
133
+
120
134
  ILLEGAL_COMBINATIONS=
121
135
  [
122
136
  [Initial::Group_0, Final::Group_0],
@@ -135,14 +149,35 @@ module Pinyin
135
149
  [Initial::Group_1, Final::Group_V],
136
150
  [Initial::Group_3, Final::Group_V],
137
151
 
138
- [Initial::Group_2, [Final::O]], #Only bo, po, mo and fo are valid -o combinations
152
+ #2008.05.26 lo is also valid!
153
+ #[Initial::Group_2, [Final::O]], #Only bo, po, mo and fo are valid -o combinations
139
154
  [Initial::Group_3, [Final::O]],
140
155
  [Initial::Group_4, [Final::O]],
141
156
  [Initial::Group_5, [Final::O]],
142
157
  [Initial::Group_6, [Final::O]],
143
158
 
144
- [[Initial::Empty], [Final::Ong]] # Some say ong and ueng is actually the same final, zhuyin uses the same representation, but ueng only has standalone form weng
145
-
159
+ [[Initial::Empty], [Final::Ong]]
160
+ # TODO: Ong is actually the same as Ueng, in Hanyu Pinyin : -ong or weng
146
161
  ]
147
162
 
163
class << self

  #
  # Yields a block for any valid initial/final pair.
  #
  # The pairs are read from data/valid_pinyin.yaml next to this file:
  # its top-level keys name Final constants, and the entries beneath
  # each of them name Initial constants.
  #
  def valid_combinations
    require 'yaml'
    table_path = File.join(File.dirname(__FILE__), 'data', 'valid_pinyin.yaml')
    table = YAML::load(IO.read(table_path))

    table.each do |final_name, by_initial|
      final = Final.const_get(final_name)
      by_initial.each do |initial_name, _pinyin|
        yield(Initial.const_get(initial_name), final)
      end
    end
  end
end
182
+
148
183
  end
@@ -0,0 +1,14 @@
1
class String
  # A Hanyu Pinyin syllable written with a trailing tone number.
  # Up to six letters are allowed: the longest syllables (zhuang,
  # chuang, shuang) have six, and a limit of five made "zhuang1" match
  # as "huang1".
  PINYIN_NUMBERED_SYLLABLE = /[A-Za-züÜ]{1,6}\d/

  # Returns a copy of the string in which every tone-numbered syllable
  # is replaced by its rendering from Pinyin.HanyuWriter(:accents).
  # "u:" is read as "ü". Text that is not a numbered syllable is kept.
  def pretty_tones
    gsub('u:', 'ü').gsub(PINYIN_NUMBERED_SYLLABLE) do |m|
      Pinyin.HanyuWriter(:accents) << Pinyin.HanyuReader(:numbers).parse(m.downcase)
    end
  end

  # Returns the tone-numbered syllables of the string rendered by
  # Pinyin.ZhuyinWriter(:marks), joined by single spaces.
  # "u:" is read as "ü". Text that is not a numbered syllable is dropped.
  def bpmf
    gsub('u:', 'ü').scan(PINYIN_NUMBERED_SYLLABLE).map do |m|
      Pinyin.ZhuyinWriter(:marks) <<
        (Pinyin.HanyuReader(:numbers) << m.downcase)
    end.join(' ')
  end
end
@@ -1,14 +1,10 @@
1
1
  class String
2
- def camelize
3
- self.split(/_/).map{|p|p.capitalize}.join
4
- end
5
-
6
2
  def chars
7
3
  self.unpack('U*').map{|c| [c].pack('U')}
8
4
  end
9
5
  end
10
6
 
11
- class Object
7
+ module Kernel
12
8
  def returning(s)
13
9
  yield(s)
14
10
  s