arnebrasseur-pinyin 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,12 @@
1
+ == 0.1.3 / 2008-07-18
2
+
3
+ * made compatible with the latest release of Facets
4
+
5
+ == 0.1.0 / 2007-12-14
6
+
7
+ * Converted to Hoe, bugfixes
8
+
9
+ == 0.0.1 / 2007-07-26
10
+
11
+ * Birthday!
12
+
@@ -0,0 +1,31 @@
1
+ History.txt
2
+ Manifest.txt
3
+ README.txt
4
+ rakefile
5
+ TODO
6
+ examples/cgiform/cgiform.rb
7
+ examples/cgiform/template.rhtml
8
+ examples/hello.rb
9
+ lib/pinyin.rb
10
+ lib/pinyin/conversion.rb
11
+ lib/pinyin/conversions.rb
12
+ lib/pinyin/conversions/hanyu.rb
13
+ lib/pinyin/data/comparison.csv
14
+ lib/pinyin/data/final.csv
15
+ lib/pinyin/data/initial.csv
16
+ lib/pinyin/data/paladiy.txt
17
+ lib/pinyin/data/rules.yaml
18
+ lib/pinyin/data/valid_pinyin.yaml
19
+ lib/pinyin/exception.rb
20
+ lib/pinyin/groundwork.rb
21
+ lib/pinyin/string.rb
22
+ lib/pinyin/support.rb
23
+ lib/pinyin/tones.rb
24
+ lib/pinyin/tones/accents.rb
25
+ lib/pinyin/tones/marks.rb
26
+ lib/pinyin/tones/no_tones.rb
27
+ lib/pinyin/tones/numbers.rb
28
+ rakefile
29
+ script/update
30
+ test/test_comparison.rb
31
+ test/test_hanyu_coverage.rb
@@ -0,0 +1,50 @@
1
+ pinyin
2
+ by Arne Brasseur
3
+
4
+ == DESCRIPTION:
5
+
6
+ Pinyin can convert between various systems for phonetically
7
+ writing Mandarin Chinese. It can also handle various representations
8
+ of tones, so it can be used to convert pinyin with numbers
9
+ to pinyin with tones.
10
+
11
+ Supported formats include Hanyu Pinyin, Bopomofo, Wade-Giles
12
+ and International Phonetic Alphabet (IPA).
13
+
14
+ == FEATURES/PROBLEMS:
15
+
16
+ == SYNOPSIS:
17
+
18
+ require 'pinyin'
19
+
20
+ reader = Pinyin::Reader.new(:hanyu, :numbers)
21
+ reader << "wo3 ai4 ni3"
22
+ # => [<Pinyin::Syllable <initial=Empty, final=Uo, tone=3>>,
23
+ # <Pinyin::Syllable <initial=Empty, final=Ai, tone=4>>,
24
+ # <Pinyin::Syllable <initial=Ne, final=I, tone=3>>]
25
+
26
+ writer = Pinyin::Writer.new(:zhuyin, :marks)
27
+
28
+ writer << (reader << "wo3 ai4 ni3")
29
+ # => "ㄨㄛˇ ㄞˋ ㄋㄧˇ"
30
+
31
+ require 'pinyin/string'
32
+
33
+ "wo3 ai4 ni3".pretty_tones
34
+ # => "wǒ ài nǐ"
35
+
36
+ == REQUIREMENTS:
37
+
38
+ * $KCODE has to be set to "UTF8" for everything to work correctly
39
+ * Facets
40
+
41
+ == INSTALL:
42
+
43
+ * gem install pinyin
44
+
45
+ == LICENSE:
46
+ Copyright (c) 2004-2007, Arne Brasseur. (http://www.arnebrasseur.net)
47
+
48
+ Available as Free Software under the GPLv3 License, see LICENSE.txt for
49
+ details
50
+
data/TODO ADDED
@@ -0,0 +1,23 @@
1
+ !Core
2
+
3
+ - Additional tone systems
4
+ - Superscript numbers (for wade giles)
5
+ - IPA tone notation
6
+
7
+ - Additional transcription systems
8
+ - MPS2 (or whatever it is called)
9
+ - Palladiy (To make things interesting)
10
+ - Gwoyueh
11
+ - Yale
12
+
13
+ - Research some rare pinyin syllables : lo, yo ^e, yai
14
+ - Get a definitive answer about ong/ueng/weng
15
+ - Add a general README as rdoc start page
16
+ - Add a README to the data/ directory with info on sources, contents and purposes
17
+ - More tests
18
+ - Add remembering of parameters to cgiform example, other examples
19
+
20
+ !More
21
+ The core lib basically does translation on the syllable level. It can handle strings with syllables nicely separated by spaces. Successive layers should make it possible to convert a sentence with punctuation into a different system. It should be possible to write compound words together in Hanyu, and have the syllables separated by dashes when converting to WG. For instance:
22
+
23
+ Wǒ de péngyǒu, shì dàifu. => Wǒ te p`éng-yǔ, shih tài-fu.
@@ -0,0 +1,24 @@
1
#!/usr/bin/ruby -w

# CGI front end for the Pinyin library.
#
# Reads the text to convert plus the source/target transcription and tone
# systems from the request, converts the text and renders template.rhtml.
# The template sees +params+ and +@converted+ through the binding.

require 'cgi'
require 'erb'

$: << File.dirname(__FILE__)+'/../../lib'
require 'pinyin'

cgi = CGI.new("xhtml1")

# CGI#params maps every field name to an Array of submitted values, so the
# single value of each field is taken with +first+.
params = cgi.params
text = params['pinyin'].first.to_s

begin
  unless text.empty?
    reader = Pinyin::Reader.new(params['from'].first, params['from_tone'].first)
    writer = Pinyin::Writer.new(params['to'].first, params['to_tone'].first)
    @converted = writer << (reader << text)
  end
rescue StandardError => e
  # Send exactly one response on failure. Plain text keeps the echoed
  # user input from being interpreted as HTML.
  cgi.out("text/plain; charset=utf-8") { "#{e}\n#{params['pinyin'].inspect}" }
  exit
end

cgi.out("text/html; charset=utf-8") do
  # Resolve the template next to this script rather than in the current
  # working directory of the web server.
  ERB.new(IO.read(File.join(File.dirname(__FILE__), 'template.rhtml'))).result(binding)
end
@@ -0,0 +1,69 @@
1
<!doctype html>
<html>
  <head>
    <title>Ruby Pinyin CGIForm example</title>
    <style type='text/css'>
      body {
        font-family: sans-serif;
      }

      div#wrap {
        width: 40%;
        margin: 0 auto;
      }

      table {
        width: 100%;
      }
      div#converted_text {
        border: 1px dotted #000;
      }

      textarea {
        width: 100%;
        height: 10em;
        margin: 0 auto;
      }
    </style>
  </head>
  <body>
    <div id='wrap'>
      <h2>Pinyin example application</h2>
      <h3>Enter some pinyin text and choose your format</h3>
      <%# The form wraps the table: a form element is not allowed directly inside a table. %>
      <form method='post'>
        <table>
          <tr>
            <td colspan='2'>
              <%# Submitted text is escaped before being echoed back into the page. %>
              <textarea name='pinyin'><%= CGI.escapeHTML(params['pinyin'].first.to_s) %></textarea>
            </td>
          </tr>

          <tr><td>From</td><td>To</td></tr>
          <% Pinyin::Conversions::All.each do |f| %>
          <tr>
            <td><label><input type='radio' name='from' value='<%=f%>'> <%=f.capitalize%></label></td>
            <td><label><input type='radio' name='to' value='<%=f%>'> <%=f.capitalize%></label></td>
          </tr>
          <% end %>
          <tr><td>From tone</td><td>To tone</td></tr>
          <% Pinyin::Tones::All.each do |f| %>
          <tr>
            <td><label><input type='radio' name='from_tone' value='<%=f%>'> <%=f.capitalize%></label></td>
            <td><label><input type='radio' name='to_tone' value='<%=f%>'> <%=f.capitalize%></label></td>
          </tr>
          <% end %>
          <tr>
            <td><input type='submit'></td>
            <td>&nbsp;</td>
          </tr>
        </table>
      </form>
      <% if @converted %>
      <h2>Converted:</h2>
      <div id='converted_text'>
        <%= CGI.escapeHTML(@converted.to_s) %>
      </div>
      <% end %>
    </div>
  </body>
</html>
@@ -0,0 +1,12 @@
1
# Example: chain two converters to turn Hanyu Pinyin with tone numbers into
# Wade-Giles with accents, and that in turn into Zhuyin with tone marks.
# All three renderings are printed, one per line.
$: << File.join(File.dirname(__FILE__), '../lib')

require 'pinyin'

hanyu_to_wadegiles = Pinyin::Converter.new(:hanyu, :numbers, :wadegiles, :accents)
wadegiles_to_zhuyin = Pinyin::Converter.new(:wadegiles, :accents, :zhuyin, :marks)

source_text = 'wo3 de2 peng2 you3 shi4 dai4 fu'
in_wadegiles = hanyu_to_wadegiles.convert(source_text)
in_zhuyin = wadegiles_to_zhuyin.convert(in_wadegiles)

[source_text, in_wadegiles, in_zhuyin].each { |rendering| puts rendering }
@@ -0,0 +1,90 @@
1
+ # Handle several romanization systems for Mandarin Chinese
2
+ #
3
+ # Author:: Arne Brasseur (pinyin@arnebrasseur.net)
4
+ # Copyright:: Copyright (c) 2007, Arne Brasseur
5
+ # Licence:: GNU General Public License, latest version
6
+
7
+ $: << File.dirname(__FILE__)
8
+
9
+ require "facets/string/camelcase"
10
+
11
+ require 'pinyin/support'
12
+ require 'pinyin/groundwork'
13
+ require 'pinyin/exception'
14
+
15
+ require 'pinyin/tones'
16
+ require 'pinyin/conversion'
17
+ require 'pinyin/conversions'
18
+ require 'pinyin/conversions/hanyu'
19
+
20
+ module Pinyin
21
+ VERSION = "0.1.5"
22
+
23
# Parses text written in one transcription system into Syllable objects.
class Reader
  # conv:: name of the transcription system; stored as a String
  # tone:: name of the tone notation; its camel-cased form is looked up
  #        as a constant in Tones
  def initialize(conv, tone)
    @conv = conv.to_s
    @tone = Tones.const_get tone.to_s.camelcase
  end

  # Tokenizes +str+ and converts every token into a Syllable.
  #
  # Returns an Array of Syllable objects.
  #
  # Raises ParseError carrying the offending token and its position when a
  # token has no tone, initial or final. Any other failure while parsing is
  # reported as a ParseError for the whole input at position 0.
  def parse(str)
    Conversions.tokenize(str).map do |s, pos|
      tone, syll = @tone.pop_tone(s)
      tsyll = Conversions.parse(@conv, syll)
      # Conversions.parse yields nil for an unknown syllable; treat that as
      # an illegal syllable instead of failing on nil.
      ini, fin = tsyll.initial, tsyll.final if tsyll
      unless tone && fin && ini
        raise ParseError.new(s, pos), "Illegal syllable <#{s}> in input <#{str}> at position #{pos}."
      end
      Syllable.new(ini, fin, tone)
    end
  rescue ParseError
    # Already precise: keep the token and position instead of re-wrapping.
    raise
  rescue StandardError => e
    # Only ordinary errors are wrapped; Interrupt, SystemExit and the like
    # propagate untouched.
    raise ParseError.new(str, 0), "Parsing of #{str.inspect} failed : #{e}"
  end

  alias :<< :parse
end
45
+
46
# Renders Syllable objects as text in one transcription system.
class Writer
  # conv:: name of the transcription system; stored as a String
  # tone:: name of the tone notation; its camel-cased form is looked up
  #        as a constant in Tones
  def initialize(conv, tone)
    @conv = conv.to_s
    @tone = Tones.const_get(tone.to_s.camelcase)
  end

  # Renders +py+. Anything that responds to +map+ is rendered element by
  # element and joined with single spaces; anything else is rendered as one
  # syllable.
  def unparse(py)
    return render(py) unless py.respond_to?(:map)

    py.map { |syll| render(syll) }.join(' ')
  end

  alias :<< :unparse

  private

  # Spells one syllable in the target system, then adds its tone.
  def render(syll)
    toneless = Conversions.unparse(@conv, syll)
    @tone.add_tone(toneless, syll.tone)
  end
end
63
+
64
# Couples a Reader and a Writer to translate text from one transcription
# and tone notation into another.
class Converter
  # from, from_tone:: passed to Reader.new
  # to, to_tone::     passed to Writer.new
  def initialize(from, from_tone, to, to_tone)
    @reader = Reader.new(from, from_tone)
    @writer = Writer.new(to, to_tone)
  end

  # Parses +str+ with the reader and renders the result with the writer.
  def convert(str)
    syllables = @reader.parse(str)
    @writer.unparse(syllables)
  end

  alias :<< :convert
end
76
+
77
# For every entry of Conversions::All, define two module-level factory
# methods named after its camel-cased form plus "Reader" / "Writer".
# Each takes a tone notation and builds the matching object.
class << self
  Conversions::All.each do |system|
    prefix = system.to_s.camelcase

    [['Reader', Reader], ['Writer', Writer]].each do |suffix, klass|
      define_method("#{prefix}#{suffix}") { |tone| klass.new(system, tone) }
    end
  end
end
88
+ end
89
+
90
+
@@ -0,0 +1,51 @@
1
module Pinyin

  #
  # Base class for conversions like Hanyu pinyin,
  # Wade-Giles, etc.
  #
  class Conversion

    # Separator between syllables in the same word
    # For Wade-Giles this is a dash, Hanyu pinyin
    # uses a single quote in certain situations
    attr_reader :syllable_separator

    # The tone handling object
    attr_reader :tones

    # An optional lambda that preprocesses input
    attr_reader :preprocessor

    # The name of this conversion, the same name used
    # in the data file and that is also available as
    # a method name on Initial and Final objects.
    #
    # By default the underscorized class name
    attr_reader :name

    # tone::    either a Tone object, used as is, or the name of a constant
    #           in Pinyin::Tones (given in underscored form, e.g. :numbers)
    # options:: :preprocessor - a callable applied to input; defaults to
    #           the identity lambda
    def initialize(tone = :numbers, options = {})
      @preprocessor = options[:preprocessor] || lambda {|s| s}

      # Store under @tones so that the +tones+ reader actually returns the
      # tone handling object.
      @tones = if Tone === tone
                 tone
               else
                 Pinyin::Tones.const_get(tone.to_s.camelcase)
               end
      # Kept for code that reads the previously used instance variable.
      @tone = @tones

      @name = self.class.name.underscore
    end

    # Converts a string into an array of strings and
    # syllable objects. Empty here (returns nil).
    def parse(string)
    end

    # Converts an array of strings and syllable objects
    # into a string. Empty here (returns nil).
    def unparse(array)
    end

  end
end
51
+
@@ -0,0 +1,75 @@
1
+ require 'csv'
2
+ require 'yaml'
3
+
4
+ module Pinyin
5
+ module Conversions
6
# Names taken from the first column of the data files, excluding rows
# whose name matches /name|standalone/i.
All=[]

DATA_DIR=File.dirname(__FILE__)+'/data/'

# Load the various representations for initials and finals.
# Each CSV row is a name followed by one value per entry of the class's
# All list: the name becomes an accessor on the class, and the i-th value
# is assigned to the i-th entry.
%w(Initial Final).each do |c|
  klazz=Pinyin.const_get c
  begin
    # CSV.foreach closes the file once all rows have been read.
    CSV.foreach(DATA_DIR+c.downcase+'.csv') do |name, *values|
      # Parenthesized so that the regexp test is part of the condition and
      # not of the argument to include?.
      All << name.to_s unless All.include?(name) || name =~ /name|standalone/i
      klazz.class_eval {attr_accessor name.to_sym}
      values.each_with_index do |v,i|
        klazz::All[i].send(name+'=', v)
      end
    end
  rescue
    # Interpolate: an exception cannot be concatenated to a String.
    puts "Bad data in #{c.downcase}.csv : #{$!}"
    raise
  end

end

#Substitution rules
@@rules=YAML::load(IO.read(DATA_DIR+'rules.yaml'))
30
+
31
# Finds the toneless syllable that +string+ spells in the system +type+.
#
# A final whose "<type>_standalone" form equals +string+ wins and is paired
# with Initial::Empty. Otherwise every legal initial/final pair is spelled
# out (with the substitution rules applied) and the first match is used.
#
# Returns a TonelessSyllable, or nil when nothing matches.
def self.parse(type, string)
  standalone_reader = "#{type}_standalone"
  lone_final = Final::All.find do |candidate|
    candidate.respond_to?(standalone_reader) && candidate.send(standalone_reader) == string
  end
  return TonelessSyllable.new(Initial::Empty, lone_final) if lone_final

  Initial::All.each do |ini|
    Final::All.each do |fin|
      next if TonelessSyllable.illegal?(ini, fin)

      spelled = apply_rules(type, (ini.send(type) || '') + (fin.send(type) || ''))
      return TonelessSyllable.new(ini, fin) if spelled == string
    end
  end
  nil
end
43
+
44
# Spells the toneless syllable +tsyll+ in the system +type+.
#
# With an initial spelling present, initial and final are concatenated and
# the substitution rules applied. Without one, the final's
# "<type>_standalone" form is returned as is when it exists; otherwise the
# rules are applied to the final's regular spelling.
def self.unparse(type, tsyll)
  initial_spelling = tsyll.initial.send(type)
  if initial_spelling
    return apply_rules(type, initial_spelling + (tsyll.final.send(type) || ''))
  end

  final = tsyll.final
  standalone_reader = type.to_s + '_standalone'
  standalone = final.send(standalone_reader) if final.respond_to?(standalone_reader)
  return standalone if standalone

  apply_rules(type, final.send(type))
end
53
+
54
# Splits +str+ into syllable tokens separated by spaces or single quotes.
#
# Returns an Array of [token, position] pairs, where position is the offset
# of the token within +str+. Runs of separators, and separators at the
# start or end, are skipped. +str+ itself is not modified.
def self.tokenize(str)
  tokens = []
  str.scan(/[^' ]+/) do |token|
    # The match offset counts the separators too, so positions stay exact
    # for every token rather than only the first.
    tokens << [token.strip, Regexp.last_match.begin(0)]
  end
  tokens
end
64
+
65
# Returns a copy of +string+ with the substitution rules registered for
# +type+ applied in order; with no rules for +type+ the copy is returned
# unchanged. Each rule supplies a 'match' pattern and a 'subst' replacement.
#
# Note: a bare +private+ does not affect methods defined with +def self.+,
# so this method is callable from outside the module.
def self.apply_rules(type, string)
  result = string.dup
  rules = @@rules[type]
  if rules
    rules.each do |rule|
      result.gsub!(Regexp.new(rule['match']), rule['subst'])
    end
  end
  result
end
73
+
74
+ end
75
+ end