ting 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,16 @@
1
+ == 0.2 / 2010-04-04
2
+
3
+ * Added support for superscript numeral tones and IPA tone marks
4
+
5
+ == 0.1.3 / 2008-07-18
6
+
7
+ * Made compatible with the latest release of Facets
8
+
9
+ == 0.1.0 / 2007-12-14
10
+
11
+ * Converted to Hoe, bugfixes
12
+
13
+ == 0.0.1 / 2007-07-26
14
+
15
+ * Birthday!
16
+
@@ -0,0 +1,94 @@
1
+ = Ting
2
+
3
+ Ting can convert between various systems for phonetically
4
+ writing Mandarin Chinese. It can also handle various representations
5
+ of tones, so it can be used to convert pinyin with numbers
6
+ to pinyin with tones.
7
+
8
+ Hanyu Pinyin, Bopomofo, Wade-Giles, Tongyong Pinyin
9
+ and International Phonetic Alphabet (IPA) are supported.
10
+
11
+ == SYNOPSIS
12
+
13
+ To parse your strings create a +Reader+ object. Ting.reader() takes two
14
+ parameters: the transliteration format, and the way that tones are represented.
15
+
16
+ To some extent these can be mixed and matched.
17
+
18
+ To generate pinyin/wade-giles/etc. create a +Writer+ object with Ting.writer(), which takes the same two parameters.
19
+
20
+ === Formats
21
+
22
+ * +:hanyu+ Hanyu Pinyin
23
+ * +:zhuyin+ Zhuyin Fuhao (a.k.a. Bopomofo)
24
+ * +:wadegiles+ Wade Giles
25
+ * +:ipa+ International Phonetic Alphabet
26
+ * +:tongyong+ Tongyong Pinyin
27
+
28
+ === Tones
29
+
30
+ * +:numbers+ Simply put a number after the syllable, easy to type
31
+ * +:accents+ Use diacritics, follows the Hanyu Pinyin rules, there needs to be at least one vowel to apply this to, not usable with IPA or Bopomofo
32
+ * +:supernum+ Superscript numerals, typically used for Wade-Giles
33
+ * +:marks+ Tone mark after the syllable, typically used for Bopomofo
34
+ * +:ipa+ IPA tone marks
35
+ * +:no_tones+ Use no tones
36
+
37
+ == Examples
38
+
39
+ Parse Hanyu Pinyin
40
+
41
+ require 'ting'
42
+
43
+ reader = Ting.reader(:hanyu, :numbers)
44
+ reader << "wo3 ai4 ni3"
45
+ # => [<Ting::Syllable <initial=Empty, final=Uo, tone=3>>,
46
+ # <Ting::Syllable <initial=Empty, final=Ai, tone=4>>,
47
+ # <Ting::Syllable <initial=Ne, final=I, tone=3>>]
48
+
49
+ Generate Bopomofo
50
+
51
+ zhuyin = Ting.writer(:zhuyin, :marks)
52
+ zhuyin << (reader << "wo3 ai4 ni3")
53
+ # => "ㄨㄛˇ ㄞˋ ㄋㄧˇ"
54
+
55
+ Generate Wade-Giles
56
+
57
+ wadegiles = Ting.writer(:wadegiles, :supernum)
58
+ wadegiles << (reader << "yi1 ge5 bu2 gou4")
59
+ # => "i¹ ko pu² kou⁴"
60
+
61
+ Generate IPA
62
+
63
+ ipa = Ting.writer(:ipa, :ipa)
64
+ ipa << (reader << "you3 peng2 zi4 yuan2 fang1 lai2")
65
+ # => "iou˧˩˧ pʰeŋ˧˥ ts˥˩ yɛn˧˥ faŋ˥˥ lai˧˥"
66
+
67
+ Since this is such a common use case, there is a convenience method to add diacritics to pinyin.
68
+
69
+ require 'ting/string'
70
+
71
+ "wo3 ai4 ni3".pretty_tones
72
+ # => "wǒ ài nǐ"
73
+
74
+ Note that syllables need to be separated by spaces; feeding "peng2you3" to the parser
75
+ does not work. The String#pretty_tones method does handle these things a bit more gracefully.
76
+
77
+ If you need to parse input that does not conform, consider using a regexp to scan for valid
78
+ syllables, then feed the syllables to the parser one by one. Have a look at #pretty_tones for
79
+ an example of how to do this.
80
+
81
+ == REQUIREMENTS
82
+
83
+ * $KCODE has to be set to "UTF8" for everything to work correctly
84
+
85
+ == INSTALL
86
+
87
+ * gem install ting
88
+
89
+ == LICENSE
90
+
91
+ Copyright (c) 2004-2010, Arne Brasseur. (http://www.arnebrasseur.net)
92
+
93
+ Available as Free Software under the GPLv3 License, see LICENSE.txt for details
94
+
@@ -0,0 +1,15 @@
1
+ require 'rubygems'
2
+
3
+ require 'rake'
4
+ require 'rake/testtask'
5
+
6
# Running `rake` without arguments runs the test suite.
task :default => [:"test:ting"]

namespace "test" do
  # test:ting -- runs every Ruby file directly under test/, verbosely and with
  # interpreter warnings switched on.
  Rake::TestTask.new("ting") do |test_task|
    # Put the project's lib/ directory on the load path of the rake process.
    $LOAD_PATH.push(File.dirname(__FILE__) + '/lib')
    test_task.verbose = true
    test_task.warning = true
    test_task.pattern = 'test/*.rb'
  end
end
data/TODO ADDED
@@ -0,0 +1,15 @@
1
+ - Additional transcription systems
2
+ - MPS2 (Mandarin Phonetic Symbols II)
3
+ - Palladiy (To make things interesting)
4
+ - Gwoyeu Romatzyh
5
+ - Yale
6
+
7
+ - Research some rare pinyin syllables: lo, yo, ^e, yai
8
+ - Get a definitive answer about ong/ueng/weng
9
+ - Add a README to the data/ directory with info on sources, contents and purposes
10
+ - More tests
11
+ - Add remembering of parameters to cgiform example, other examples
12
+
13
+ The core lib basically does translation on the syllable level. It can handle strings with syllables nicely separated by spaces. Successive layers should make it possible to convert a sentence with punctuation into a different system. It should be possible to write compound words together in Hanyu, and have the syllables separated by dashes when converting to WG. For instance:
14
+
15
+ Wǒ de péngyǒu, shì dàifu. => Wǒ te p`éng-yǔ, shih tài-fu.
@@ -0,0 +1,24 @@
1
+ #!/usr/bin/ruby -w
2
+
3
+
4
+ require 'cgi'
5
+ require 'erb'
6
+
7
+ $: << File.dirname(__FILE__)+'/../../lib'
8
+ require 'pinyin'
9
+
10
# CGI example: converts the submitted text between two transcription systems
# and renders template.rhtml with the outcome.
cgi = CGI.new("xhtml1")
params = cgi.params

# CGI#params maps every field name to an Array of submitted values; only the
# first value of each field is used, always as a String.
field = lambda { |name| (params[name] || []).first.to_s }

@converted = nil
@error = nil

begin
  unless field.call('pinyin').empty?
    reader = Pinyin::Reader.new(field.call('from'), field.call('from_tone'))
    writer = Pinyin::Writer.new(field.call('to'), field.call('to_tone'))
    @converted = writer << (reader << field.call('pinyin'))
  end
rescue StandardError => e
  # Remember the failure instead of calling cgi.out here: a response may only
  # be written once, and the message has to be escaped before it is shown.
  @error = e.to_s
end

# Locate the template next to this script rather than in the current directory.
template_path = File.join(File.dirname(__FILE__), 'template.rhtml')

cgi.out("text/html; charset=utf-8") do
  page = ERB.new(IO.read(template_path)).result(binding)
  if @error
    # Show the (escaped) error message at the top of the page body.
    page = page.sub('<body>') { |tag| "#{tag}\n<p class='error'>#{CGI.escapeHTML(@error)}</p>" }
  end
  page
end
@@ -0,0 +1,69 @@
1
<!doctype html>
<%# Conversion form for the CGI example. It is rendered with a binding that
    provides `params` (the CGI parameter hash) and, after a successful
    conversion, @converted. Request-derived values are HTML-escaped on output. %>
<html>
  <head>
    <title>Ruby Pinyin CGIForm example</title>
    <style type='text/css'>
      body {
        font-family: sans-serif;
      }

      div#wrap {
        width: 40%;
        margin: 0 auto;
      }

      table {
        width: 100%;
      }
      div#converted_text {
        border: 1px dotted #000;
      }

      textarea {
        width: 100%;
        height: 10em;
        margin: 0 auto;
      }
    </style>
  </head>
  <body>
    <div id='wrap'>
      <h2>Pinyin example application</h2>
      <h3>Enter some pinyin text and choose your format</h3>
      <%# The form wraps the table because a form element may not be a direct child of a table. %>
      <form method='post'>
        <table>
          <tr>
            <td colspan='2'>
              <textarea name='pinyin'><%= CGI.escapeHTML(Array(params['pinyin']).first.to_s) %></textarea>
            </td>
          </tr>

          <tr><td>From</td><td>To</td></tr>
          <% Pinyin::Conversions::All.each do |f| %>
          <tr>
            <td><label><input type='radio' name='from' value='<%= CGI.escapeHTML(f.to_s) %>'> <%= CGI.escapeHTML(f.to_s.capitalize) %></label></td>
            <td><label><input type='radio' name='to' value='<%= CGI.escapeHTML(f.to_s) %>'> <%= CGI.escapeHTML(f.to_s.capitalize) %></label></td>
          </tr>
          <% end %>
          <tr><td>From tone</td><td>To tone</td></tr>
          <% Pinyin::Tones::All.each do |f| %>
          <tr>
            <td><label><input type='radio' name='from_tone' value='<%= CGI.escapeHTML(f.to_s) %>'> <%= CGI.escapeHTML(f.to_s.capitalize) %></label></td>
            <td><label><input type='radio' name='to_tone' value='<%= CGI.escapeHTML(f.to_s) %>'> <%= CGI.escapeHTML(f.to_s.capitalize) %></label></td>
          </tr>
          <% end %>
          <tr>
            <td><input type='submit'></td>
            <td>&nbsp;</td>
          </tr>
        </table>
      </form>
      <% if @converted %>
      <h2>Converted:</h2>
      <div id='converted_text'>
        <%= CGI.escapeHTML(@converted.to_s) %>
      </div>
      <% end %>
    </div>
  </body>
</html>
@@ -0,0 +1,12 @@
1
+ $: << File.join(File.dirname(__FILE__), '../lib')
2
+
3
+ require 'pinyin'
4
+
5
# Chain two converters: Hanyu Pinyin with tone numbers -> Wade-Giles with
# accents -> Zhuyin with tone marks.
to_wadegiles = Pinyin::Converter.new(:hanyu, :numbers, :wadegiles, :accents)
to_zhuyin = Pinyin::Converter.new(:wadegiles, :accents, :zhuyin, :marks)

pinyin = 'wo3 de2 peng2 you3 shi4 dai4 fu'
wadegiles = to_wadegiles.convert(pinyin)
zhuyin = to_zhuyin.convert(wadegiles)

# Print the same sentence in all three systems, one per line.
[pinyin, wadegiles, zhuyin].each { |line| puts line }
@@ -0,0 +1,93 @@
1
+ # Handle several romanization systems for Mandarin Chinese
2
+ #
3
+ # Author:: Arne Brasseur (arne@arnebrasseur.net)
4
+ # Copyright:: Copyright (c) 2007-2010, Arne Brasseur
5
+ # Licence:: GNU General Public License, v3
6
+
7
+ $: << File.dirname(__FILE__)
8
+
9
+ require 'ting/support'
10
+ require 'ting/groundwork'
11
+ require 'ting/exception'
12
+
13
+ require 'ting/tones'
14
+ require 'ting/conversion'
15
+ require 'ting/conversions'
16
+ require 'ting/conversions/hanyu'
17
+
18
module Ting
  VERSION = "0.2.0"

  # Turns a string in one transcription system into an array of Syllable
  # objects.
  class Reader
    # conv:: name of the transcription system; stored as a String and handed
    #        to Conversions.parse
    # tone:: name of the tone handler; camel-cased and looked up in Tones
    def initialize(conv, tone)
      @conv = conv.to_s
      @tone = Tones.const_get tone.to_s.camelcase
      @cache = {}
    end

    # Parses +str+ (split by Conversions.tokenize) into an array of Syllable
    # objects. Results are memoized per input string.
    #
    # Raises ParseError when a syllable cannot be resolved or when any other
    # StandardError occurs while parsing.
    def parse(str)
      @cache[str] ||= Conversions.tokenize(str).map do |s, pos|
        tone, syll = @tone.pop_tone(s)
        tsyll = Conversions.parse(@conv, syll)
        ini, fin = tsyll.initial, tsyll.final
        unless tone && fin && ini
          raise ParseError.new(s, pos), "Illegal syllable <#{s}> in input <#{str}> at position #{pos}."
        end
        Syllable.new(ini, fin, tone)
      end
    rescue StandardError => e
      # Only ordinary errors are reported as parse failures. Interrupt,
      # SystemExit and other non-StandardError exceptions must propagate
      # untouched instead of being disguised as a ParseError.
      raise ParseError.new(str, 0), "Parsing of #{str.inspect} failed : #{e}"
    end

    alias :<< :parse
  end

  # Turns Syllable objects into a string in one transcription system.
  class Writer
    # conv:: name of the transcription system; stored as a String and handed
    #        to Conversions.unparse
    # tone:: name of the tone handler; camel-cased and looked up in Tones
    def initialize(conv, tone)
      @conv = conv.to_s
      @tone = Tones.const_get tone.to_s.camelcase
      @cache = {}
    end

    # Renders +py+ as a string. When +py+ responds to +map+ every element is
    # rendered and the results are joined with single spaces; otherwise +py+
    # is rendered as one syllable. Results are memoized per argument.
    def generate(py)
      render = lambda { |syll| @tone.add_tone(Conversions.unparse(@conv, syll), syll.tone) }
      @cache[py] ||= if py.respond_to? :map
        py.map(&render).join(' ')
      else
        render.call(py)
      end
    end

    alias :<< :generate
    alias :unparse :generate
  end

  # Couples a Reader and a Writer to convert a string from one system to
  # another in a single call.
  class Converter
    def initialize(from, from_tone, to, to_tone)
      @reader = Reader.new(from, from_tone)
      @writer = Writer.new(to, to_tone)
    end

    # Parses +str+ with the reader and renders the result with the writer.
    def convert(str)
      @writer.unparse @reader.parse(str)
    end

    alias :<< :convert
  end

  class <<self
    # Readers and writers already handed out, keyed by [format, tones].
    READERS={}
    WRITERS={}

    # Returns the shared Reader for this format/tone combination, creating it
    # on first use.
    def reader(format, tones)
      READERS[[format, tones]] ||= Reader.new(format, tones)
    end

    # Returns the shared Writer for this format/tone combination, creating it
    # on first use.
    def writer(format, tones)
      WRITERS[[format, tones]] ||= Writer.new(format, tones)
    end
  end

end


Pinyin = Ting #legacy support
@@ -0,0 +1,51 @@
1
module Ting

  #
  # Base class for conversions like Hanyu pinyin,
  # Wade-Giles, etc.
  #
  class Conversion

    # Separator between syllables in the same word
    # For Wade-Giles this is a dash, Hanyu pinyin
    # uses a single quote in certain situations.
    # This base class never assigns it.
    attr_reader :syllable_separator

    # The tone handling object
    attr_reader :tones

    # An optional lambda that preprocesses input
    attr_reader :preprocessor

    # The name of this conversion, the same name used
    # in the data file and that is also available as
    # a method name on Initial and Final objects.
    #
    # By default the underscorized class name
    attr_reader :name

    # tone::    either a Tone instance, which is used as is, or the name of a
    #           constant inside Ting::Tones (camel-cased before the lookup)
    # options:: :preprocessor may hold a callable; it defaults to a lambda
    #           that returns its argument unchanged
    def initialize(tone = :numbers, options = {})
      @preprocessor = options[:preprocessor] || lambda {|s| s}

      # Assign the variable behind the +tones+ reader. Only @tone used to be
      # set here, which left +tones+ returning nil.
      @tones = if Tone === tone
        tone
      else
        Ting::Tones.const_get(tone.to_s.camelcase)
      end
      # Still set for code that reads the old instance variable directly.
      @tone = @tones

      @name = self.class.name.underscore
    end

    # Converts a string into an array of strings and
    # syllable objects.
    # The base class has no implementation and returns nil.
    def parse(string)
    end

    # Converts an array of strings and syllable objects
    # into a string
    # The base class has no implementation and returns nil.
    def unparse(array)
    end

  end
end
51
+
@@ -0,0 +1,75 @@
1
+ require 'csv'
2
+ require 'yaml'
3
+
4
module Ting
  # Lookup data and helpers shared by all transcription systems.
  #
  # Loading this module reads initial.csv, final.csv and rules.yaml from
  # DATA_DIR and adds one attribute per CSV row to Initial / Final.
  module Conversions
    # Row names collected from the CSV files (see the filter below).
    All=[]

    DATA_DIR=File.dirname(__FILE__)+'/data/'

    #Load various representations for initials and finals
    # Each row is read as: attribute name, then one value per entry of
    # klazz::All, in the same order.
    %w(Initial Final).each do |c|
      klazz=Ting.const_get c
      begin
        # CSV.foreach closes the file after the last row; the handle returned
        # by CSV.open was never closed.
        CSV.foreach(DATA_DIR+c.downcase+'.csv') do |name, *values|
          # Rows whose name matches /name|standalone/i are not listed in All.
          # The parentheses matter: `All.index name || ...` is parsed as
          # `All.index(name || ...)`, which skipped this filter entirely.
          All << name.to_s unless All.index(name) || name =~ /name|standalone/i
          klazz.class_eval {attr_accessor name.to_sym}
          values.each_with_index do |v,i|
            klazz::All[i].send(name+'=', v)
          end
        end
      rescue
        # Interpolate rather than concatenate: String + Exception raises a
        # TypeError on Ruby 1.9 and later, hiding the original error.
        puts "Bad data in #{c.downcase}.csv : #{$!}"
        raise
      end

    end

    #Substitution rules
    @@rules=YAML::load(IO.read(DATA_DIR+'rules.yaml'))

    # Finds the TonelessSyllable whose spelling in system +type+ equals
    # +string+. A final whose "<type>_standalone" form matches wins; otherwise
    # every legal initial/final pair is tried. Returns nil without a match.
    def self.parse(type, string)
      standalone_reader = "#{type}_standalone"
      lone_final = Final::All.find do |f|
        f.respond_to?(standalone_reader) && f.send(standalone_reader) == string
      end
      return TonelessSyllable.new(Initial::Empty, lone_final) if lone_final

      Initial::All.each do |ini|
        Final::All.each do |fin|
          next if TonelessSyllable.illegal?(ini, fin)
          spelling = apply_rules(type, (ini.send(type) || '') + (fin.send(type) || ''))
          return TonelessSyllable.new(ini, fin) if spelling == string
        end
      end
      nil
    end

    # Spells +tsyll+ in system +type+: initial plus final with the rules
    # applied when the initial has a form in this system, else the final's
    # standalone form when it has one, else the final alone with the rules
    # applied.
    def self.unparse(type, tsyll)
      initial = tsyll.initial.send(type)
      return apply_rules(type, initial + (tsyll.final.send(type) || '')) if initial

      standalone_reader = type.to_s+'_standalone'
      if tsyll.final.respond_to?(standalone_reader)
        standalone = tsyll.final.send(standalone_reader)
        return standalone if standalone
      end

      apply_rules(type, tsyll.final.send(type))
    end

    # Splits +str+ on spaces and single quotes and returns an array of
    # [syllable, offset] pairs, where offset is the index in +str+ at which
    # the syllable starts. Runs of separators and leading separators are
    # skipped instead of ending the scan early.
    def self.tokenize(str)
      tokens = []
      str.scan(/[^' ]+/) do |syllable|
        tokens << [syllable.strip, Regexp.last_match.begin(0)]
      end
      tokens
    end

    # Returns a copy of +string+ with every substitution rule registered for
    # +type+ applied in order.
    #
    # Internal helper. It stays publicly callable, as before: the bare
    # `private` that used to precede it has no effect on `def self.` methods.
    def self.apply_rules(type, string)
      result = string.dup
      (@@rules[type] || []).each do |rule|
        result.gsub!(Regexp.new(rule['match']),rule['subst'])
      end
      result
    end

  end
end