ting 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +16 -0
- data/README.rdoc +94 -0
- data/Rakefile +15 -0
- data/TODO +15 -0
- data/examples/cgiform/cgiform.rb +24 -0
- data/examples/cgiform/template.rhtml +69 -0
- data/examples/hello.rb +12 -0
- data/lib/ting.rb +93 -0
- data/lib/ting/conversion.rb +51 -0
- data/lib/ting/conversions.rb +75 -0
- data/lib/ting/conversions/hanyu.rb +77 -0
- data/lib/ting/data/comparison.csv +410 -0
- data/lib/ting/data/final.csv +10 -0
- data/lib/ting/data/initial.csv +7 -0
- data/lib/ting/data/paladiy.txt +421 -0
- data/lib/ting/data/rules.yaml +24 -0
- data/lib/ting/data/valid_pinyin.yaml +454 -0
- data/lib/ting/exception.rb +17 -0
- data/lib/ting/groundwork.rb +177 -0
- data/lib/ting/string.rb +17 -0
- data/lib/ting/support.rb +19 -0
- data/lib/ting/tones.rb +65 -0
- data/lib/ting/tones/accents.rb +62 -0
- data/lib/ting/tones/ipa.rb +24 -0
- data/lib/ting/tones/marks.rb +30 -0
- data/lib/ting/tones/no_tones.rb +7 -0
- data/lib/ting/tones/numbers.rb +25 -0
- data/lib/ting/tones/supernum.rb +24 -0
- data/test/test_comparison.rb +35 -0
- data/test/test_hanyu_coverage.rb +35 -0
- metadata +95 -0
data/History.txt
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
== 0.2 / 2010-04-04
|
2
|
+
|
3
|
+
* Added support for superscript numeral tones and IPA tone marks
|
4
|
+
|
5
|
+
== 0.1.3 / 2008-07-18
|
6
|
+
|
7
|
+
* made compatible with the latest release of Facets
|
8
|
+
|
9
|
+
== 0.1.0 / 2007-12-14
|
10
|
+
|
11
|
+
* Converted to Hoe, bugfixes
|
12
|
+
|
13
|
+
== 0.0.1 / 2007-07-26
|
14
|
+
|
15
|
+
* Birthday!
|
16
|
+
|
data/README.rdoc
ADDED
@@ -0,0 +1,94 @@
|
|
1
|
+
= Ting
|
2
|
+
|
3
|
+
Ting can convert between various systems for phonetically
|
4
|
+
writing Mandarin Chinese. It can also handle various representations
|
5
|
+
of tones, so it can be used to convert pinyin with numbers
|
6
|
+
to pinyin with tones.
|
7
|
+
|
8
|
+
Hanyu Pinyin, Bopomofo, Wade-Giles, Tongyong Pinyin
|
9
|
+
and International Phonetic Alphabet (IPA) are supported.
|
10
|
+
|
11
|
+
== SYNOPSIS
|
12
|
+
|
13
|
+
To parse your strings create a +Reader+ object. Ting.reader() takes two
|
14
|
+
parameters: the transliteration format and the way tones are represented.
|
15
|
+
|
16
|
+
To some extent these can be mixed and matched.
|
17
|
+
|
18
|
+
To generate pinyin/wade-giles/etc. create a +Writer+ object with Ting.writer(), which takes the same two parameters.
|
19
|
+
|
20
|
+
=== Formats
|
21
|
+
|
22
|
+
* +:hanyu+ Hanyu Pinyin
|
23
|
+
* +:zhuyin+ Zhuyin Fuhao (a.k.a. Bopomofo)
|
24
|
+
* +:wadegiles+ Wade Giles
|
25
|
+
* +:ipa+ International Phonetic Alphabet
|
26
|
+
* +:tongyong+ Tongyong Pinyin
|
27
|
+
|
28
|
+
=== Tones
|
29
|
+
|
30
|
+
* +:numbers+ Simply put a number after the syllable, easy to type
|
31
|
+
* +:accents+ Use diacritics, follows the Hanyu Pinyin rules, there needs to be at least one vowel to apply this to, not usable with IPA or Bopomofo
|
32
|
+
* +:supernum+ Superscript numerals, typically used for Wade-Giles
|
33
|
+
* +:marks+ Tone mark after the syllable, typically used for Bopomofo
|
34
|
+
* +:ipa+ IPA tone marks
|
35
|
+
* +:no_tones+ Use no tones
|
36
|
+
|
37
|
+
== Examples
|
38
|
+
|
39
|
+
Parse Hanyu Pinyin
|
40
|
+
|
41
|
+
require 'ting'
|
42
|
+
|
43
|
+
reader = Ting.reader(:hanyu, :numbers)
|
44
|
+
reader << "wo3 ai4 ni3"
|
45
|
+
# => [<Ting::Syllable <initial=Empty, final=Uo, tone=3>>,
|
46
|
+
# <Ting::Syllable <initial=Empty, final=Ai, tone=4>>,
|
47
|
+
# <Ting::Syllable <initial=Ne, final=I, tone=3>>]
|
48
|
+
|
49
|
+
Generate Bopomofo
|
50
|
+
|
51
|
+
zhuyin = Ting.writer(:zhuyin, :marks)
|
52
|
+
zhuyin << (reader << "wo3 ai4 ni3")
|
53
|
+
# => "ㄨㄛˇ ㄞˋ ㄋㄧˇ"
|
54
|
+
|
55
|
+
Generate Wade-Giles
|
56
|
+
|
57
|
+
wadegiles = Ting.writer(:wadegiles, :supernum)
|
58
|
+
wadegiles << (reader << "yi1 ge5 bu2 gou4")
|
59
|
+
# => "i¹ ko pu² kou⁴"
|
60
|
+
|
61
|
+
Generate IPA
|
62
|
+
|
63
|
+
ipa = Ting.writer(:ipa, :ipa)
|
64
|
+
ipa << (reader << "you3 peng2 zi4 yuan2 fang1 lai2")
|
65
|
+
# => "iou˧˩˧ pʰeŋ˧˥ ts˥˩ yɛn˧˥ faŋ˥˥ lai˧˥"
|
66
|
+
|
67
|
+
Since this is such a common use case, there is a convenience method to add diacritics to pinyin.
|
68
|
+
|
69
|
+
require 'ting/string'
|
70
|
+
|
71
|
+
"wo3 ai4 ni3".pretty_tones
|
72
|
+
# => "wǒ ài nǐ"
|
73
|
+
|
74
|
+
Note that syllables need to be separated by spaces, feeding "peng2you3" to the parser
|
75
|
+
does not work. The String#pretty_tones method does handle these things a bit more gracefully.
|
76
|
+
|
77
|
+
If you need to parse input that does not conform, consider using a regexp to scan for valid
|
78
|
+
syllables, then feed the syllables to the parser one by one. Have a look at #pretty_tones for
|
79
|
+
an example of how to do this.
|
80
|
+
|
81
|
+
== REQUIREMENTS
|
82
|
+
|
83
|
+
* $KCODE has to be set to "UTF8" for everything to work correctly
|
84
|
+
|
85
|
+
== INSTALL
|
86
|
+
|
87
|
+
* gem install ting
|
88
|
+
|
89
|
+
== LICENSE
|
90
|
+
|
91
|
+
Copyright (c) 2004-2010, Arne Brasseur. (http://www.arnebrasseur.net)
|
92
|
+
|
93
|
+
Available as Free Software under the GPLv3 License, see LICENSE.txt for details
|
94
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# Build tasks for the ting gem: plain `rake` runs the test files under test/.
require 'rubygems'

require 'rake'
require 'rake/testtask'

# The default task delegates to the namespaced test task defined below.
task :default => [:"test:ting"]

namespace :test do
  Rake::TestTask.new(:ting) do |test_task|
    # Make lib/ reachable from the rake process itself.
    $LOAD_PATH << File.dirname(__FILE__) + '/lib'
    test_task.pattern = 'test/*.rb'
    test_task.verbose = true
    test_task.warning = true
  end
end
|
data/TODO
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
- Additional transcription systems
|
2
|
+
- MPS2 (Mandarin Phonetic Symbols II)
|
3
|
+
- Palladiy (To make things interesting)
|
4
|
+
- Gwoyeu Romatzyh
|
5
|
+
- Yale
|
6
|
+
|
7
|
+
- Research some rare pinyin syllables : lo, yo ^e, yai
|
8
|
+
- Get a definitive answer about ong/ueng/weng
|
9
|
+
- Add a README to the data/ directory with info on sources, contents and purposes
|
10
|
+
- More tests
|
11
|
+
- Add remembering of parameters to cgiform example, other examples
|
12
|
+
|
13
|
+
The core lib basically does translation on the syllable level. It can handle strings with syllables nicely separated by spaces. Successive layers should make it possible to convert a sentence with punctuation into a different system. It should be possible to write compound words together in Hanyu, and have the syllables separated by dashes when converting to WG. For instance:
|
14
|
+
|
15
|
+
Wǒ de péngyǒu, shì dàifu. => Wǒ te p`éng-yǔ, shih tài-fu.
|
@@ -0,0 +1,24 @@
|
|
1
|
+
#!/usr/bin/ruby -w

# CGI example: converts the submitted text from one transcription system and
# tone notation to another, then renders the result through template.rhtml.

require 'cgi'
require 'erb'

$: << File.dirname(__FILE__) + '/../../lib'
# The library entry point is lib/ting.rb; it also defines the legacy
# Pinyin alias that the template refers to.
require 'ting'

cgi = CGI.new('xhtml1')

# CGI#params maps every field name to an Array of submitted values, so the
# single value of each field is taken with #first.
params = cgi.params
text = params['pinyin'].first

begin
  unless text.nil? || text.empty?
    reader = Ting::Reader.new(params['from'].first, params['from_tone'].first)
    writer = Ting::Writer.new(params['to'].first, params['to_tone'].first)
    @converted = writer << (reader << text)
  end
rescue StandardError => e
  # Send exactly one response: the error message plus the offending input,
  # as plain text so nothing the user typed is interpreted as markup.
  cgi.out('text/plain; charset=utf-8') { "#{e}\n#{params['pinyin'].inspect}" }
  exit
end

cgi.out('text/html; charset=utf-8') do
  # Resolve the template next to this script, not the working directory.
  ERB.new(IO.read(File.join(File.dirname(__FILE__), 'template.rhtml'))).result(binding)
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
<!doctype html>
<html>
  <head>
    <meta charset='utf-8'>
    <title>Ruby Pinyin CGIForm example</title>
    <style type='text/css'>
      body {
        font-family: sans-serif;
      }

      div#wrap {
        width: 40%;
        margin: 0 auto;
      }

      table {
        width: 100%;
      }
      div#converted_text {
        border: 1px dotted #000;
      }

      textarea {
        width: 100%;
        height: 10em;
        margin: 0 auto;
      }
    </style>
  </head>
  <body>
    <div id='wrap'>
      <h2>Pinyin example application</h2>
      <h3>Enter some pinyin text and choose your format</h3>
      <%# The form wraps the table: a form is not valid directly inside a table element. %>
      <form method='post'>
        <table>
          <tr>
            <td colspan='2'>
              <%# Echoed input is HTML-escaped so submitted markup cannot break out of the textarea. %>
              <textarea name='pinyin'><%= CGI.escapeHTML(params['pinyin'].first.to_s) %></textarea>
            </td>
          </tr>

          <tr><td>From</td><td>To</td></tr>
          <% Pinyin::Conversions::All.each do |f| %>
          <tr>
            <td><label><input type='radio' name='from' value='<%= f %>'> <%= f.capitalize %></label></td>
            <td><label><input type='radio' name='to' value='<%= f %>'> <%= f.capitalize %></label></td>
          </tr>
          <% end %>
          <tr><td>From tone</td><td>To tone</td></tr>
          <% Pinyin::Tones::All.each do |f| %>
          <tr>
            <td><label><input type='radio' name='from_tone' value='<%= f %>'> <%= f.capitalize %></label></td>
            <td><label><input type='radio' name='to_tone' value='<%= f %>'> <%= f.capitalize %></label></td>
          </tr>
          <% end %>
          <tr>
            <td><input type='submit'></td>
            <td> </td>
          </tr>
        </table>
      </form>
      <% if @converted %>
      <h2>Converted:</h2>
      <div id='converted_text'>
        <%# The converted text derives from user input, so it is escaped as well. %>
        <%= CGI.escapeHTML(@converted.to_s) %>
      </div>
      <% end %>
    </div>
  </body>
</html>
|
data/examples/hello.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# Minimal example: converts Hanyu Pinyin with tone numbers to Wade-Giles with
# accents, then that result to Zhuyin with tone marks, and prints all three.
$: << File.join(File.dirname(__FILE__), '../lib')

# The library entry point is lib/ting.rb; requiring 'pinyin' only works when
# some other file of that name happens to be on the load path.
require 'ting'

conv1 = Ting::Converter.new(:hanyu, :numbers, :wadegiles, :accents)
conv2 = Ting::Converter.new(:wadegiles, :accents, :zhuyin, :marks)

pinyin = 'wo3 de2 peng2 you3 shi4 dai4 fu'
wadegiles = conv1 << pinyin
zhuyin = conv2 << wadegiles

puts pinyin, wadegiles, zhuyin
|
data/lib/ting.rb
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
# Handle several romanization systems for Mandarin Chinese
|
2
|
+
#
|
3
|
+
# Author:: Arne Brasseur (arne@arnebrasseur.net)
|
4
|
+
# Copyright:: Copyright (c) 2007-2010, Arne Brasseur
|
5
|
+
# Licence:: GNU General Public License, v3
|
6
|
+
|
7
|
+
$: << File.dirname(__FILE__)
|
8
|
+
|
9
|
+
require 'ting/support'
|
10
|
+
require 'ting/groundwork'
|
11
|
+
require 'ting/exception'
|
12
|
+
|
13
|
+
require 'ting/tones'
|
14
|
+
require 'ting/conversion'
|
15
|
+
require 'ting/conversions'
|
16
|
+
require 'ting/conversions/hanyu'
|
17
|
+
|
18
|
+
module Ting
|
19
|
+
VERSION = "0.2.0"
|
20
|
+
|
21
|
+
# Parses transcribed text into Syllable objects.
#
# +conv+ names the transcription system (e.g. :hanyu) and +tone+ the tone
# notation (e.g. :numbers); the latter is resolved to a constant in Tones.
class Reader
  def initialize(conv, tone)
    @conv = conv.to_s
    @tone = Tones.const_get tone.to_s.camelcase
    @cache = {} # input string => parsed syllables
  end

  # Splits +str+ into tokens and converts each token into a Syllable.
  # Results are memoized per input string.
  #
  # Raises ParseError carrying the offending token and its position when a
  # token is not a legal syllable, and ParseError for the whole input (at
  # position 0) when any other StandardError occurs while parsing.
  def parse(str)
    @cache[str] ||= Conversions.tokenize(str).map do |s, pos|
      tone, syll = @tone.pop_tone(s)
      tsyll = Conversions.parse(@conv, syll)
      # Conversions.parse yields nil when no initial/final pair matches;
      # treat that like any other illegal syllable.
      ini = tsyll && tsyll.initial
      fin = tsyll && tsyll.final
      unless tone && fin && ini
        raise ParseError.new(s, pos), "Illegal syllable <#{s}> in input <#{str}> at position #{pos}."
      end
      Syllable.new(ini, fin, tone)
    end
  rescue ParseError
    # Already specific: keep the offending token and position intact.
    raise
  rescue StandardError => e
    # Only ordinary errors are wrapped; signals and exit requests pass through.
    raise ParseError.new(str, 0), "Parsing of #{str.inspect} failed : #{e}"
  end

  alias :<< :parse
end
|
44
|
+
|
45
|
+
# Renders Syllable objects as text in a given transcription system and tone
# notation.
class Writer
  def initialize(conv, tone)
    @conv = conv.to_s
    @tone = Tones.const_get(tone.to_s.camelcase)
    @cache = {}
  end

  # Accepts a single syllable or anything that responds to #map; a
  # collection is rendered as space-separated syllables. Results are
  # memoized per argument.
  def generate(py)
    render = lambda do |syll|
      @tone.add_tone(Conversions.unparse(@conv, syll), syll.tone)
    end
    @cache[py] ||= py.respond_to?(:map) ? py.map(&render).join(' ') : render.call(py)
  end

  alias :<< :generate
  alias :unparse :generate
end
|
64
|
+
|
65
|
+
# Pairs a Reader with a Writer so text in one system/tone notation can be
# converted to another in a single call.
class Converter
  def initialize(from, from_tone, to, to_tone)
    @reader, @writer = Reader.new(from, from_tone), Writer.new(to, to_tone)
  end

  # Parses +str+ with the source reader and renders the resulting syllables
  # with the target writer.
  def convert(str)
    syllables = @reader.parse(str)
    @writer.unparse(syllables)
  end

  alias :<< :convert
end
|
77
|
+
|
78
|
+
class << self
  # Shared Reader/Writer instances, keyed by their [format, tones] pair.
  READERS = {}
  WRITERS = {}

  # Returns the Reader for +format+ and +tones+, creating it on first use.
  def reader(format, tones)
    key = [format, tones]
    READERS[key] ||= Reader.new(format, tones)
  end

  # Returns the Writer for +format+ and +tones+, creating it on first use.
  def writer(format, tones)
    key = [format, tones]
    WRITERS[key] ||= Writer.new(format, tones)
  end
end
|
89
|
+
|
90
|
+
end
|
91
|
+
|
92
|
+
|
93
|
+
Pinyin = Ting #legacy support
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module Ting
|
2
|
+
|
3
|
+
#
|
4
|
+
# Base class for conversions like Hanyu pinyin,
# Wade-Giles, etc.
#
class Conversion

  # Separator between syllables in the same word
  # For Wade-Giles this is a dash, Hanyu pinyin
  # uses a single quote in certain situations
  attr_reader :syllable_separator

  # The tone handling object. initialize stores it in @tone, so the reader
  # is written out explicitly: attr_reader :tones would read @tones, which
  # is never assigned, and always return nil.
  def tones
    @tone
  end

  # An optional lambda that preprocesses input
  attr_reader :preprocessor

  # The name of this conversion, the same name used
  # in the data file and that is also available as
  # a method name on Initial and Final objects.
  #
  # By default the underscorized class name
  attr_reader :name

  # +tone+ is either a tone object or the name of a constant in Ting::Tones
  # (given in underscored form, e.g. :numbers). +options+ may carry a
  # :preprocessor lambda; without one the identity function is used.
  def initialize(tone = :numbers, options = {})
    @preprocessor = options[:preprocessor] || lambda {|s| s}

    # NOTE(review): `Tone` has to resolve from inside Ting::Conversion -
    # confirm where that constant is defined.
    if Tone === tone
      @tone = tone
    else
      @tone = Ting::Tones.const_get(tone.to_s.camelcase)
    end

    @name = self.class.name.underscore
  end

  # Converts a string into an array of strings and
  # syllable objects. Not implemented in this base class (returns nil).
  def parse(string)
  end

  # Converts an array of strings and syllable objects
  # into a string. Not implemented in this base class (returns nil).
  def unparse(array)
  end

end
|
50
|
+
end
|
51
|
+
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'yaml'
|
3
|
+
|
4
|
+
module Ting
|
5
|
+
module Conversions
|
6
|
+
# Names of the available conversions, collected from the data files below.
All=[]

DATA_DIR=File.dirname(__FILE__)+'/data/'

# Load the various representations for initials and finals. Each CSV row is
# a column name followed by one value per entry of Initial::All / Final::All;
# the name becomes an accessor on the class and is filled in for every entry.
%w(Initial Final).each do |c|
  klazz = Ting.const_get c
  begin
    # CSV.foreach closes the file once iteration is finished.
    CSV.foreach(DATA_DIR + c.downcase + '.csv') do |name, *values|
      # The parentheses matter: without them Ruby reads the condition as
      # All.index(name || ...), so the regexp filter was never consulted and
      # the "name" / "*_standalone" rows ended up in All.
      All << name.to_s unless All.index(name) || name =~ /name|standalone/i
      klazz.class_eval { attr_accessor name.to_sym }
      values.each_with_index do |v, i|
        klazz::All[i].send(name + '=', v)
      end
    end
  rescue
    # $! is an exception object; interpolate it, since String#+ with a
    # non-string raises TypeError on Ruby 1.9 and later.
    puts "Bad data in #{c.downcase}.csv : #{$!}"
    raise
  end
end

#Substitution rules
@@rules=YAML::load(IO.read(DATA_DIR+'rules.yaml'))
|
30
|
+
|
31
|
+
# Finds the syllable whose +type+ transcription equals +string+.
#
# A final whose "<type>_standalone" form matches wins and is paired with the
# empty initial; otherwise every legal initial/final pair is tried in order.
# Returns a TonelessSyllable, or nil when nothing matches.
def self.parse(type, string)
  standalone_reader = "#{type}_standalone"
  lone_final = Final::All.find do |f|
    f.respond_to?(standalone_reader) && f.send(standalone_reader) == string
  end
  return TonelessSyllable.new(Initial::Empty, lone_final) if lone_final

  Initial::All.each do |ini|
    Final::All.each do |fin|
      next if TonelessSyllable.illegal?(ini, fin)
      spelled = apply_rules(type, (ini.send(type) || '') + (fin.send(type) || ''))
      return TonelessSyllable.new(ini, fin) if spelled == string
    end
  end
  nil
end
|
43
|
+
|
44
|
+
# Spells +tsyll+ in the transcription system +type+.
#
# With an initial present, initial and final are joined and run through the
# substitution rules. Without one, the final's "<type>_standalone" form is
# returned as-is when available; otherwise the plain final goes through the
# rules.
def self.unparse(type, tsyll)
  initial_part = tsyll.initial.send(type)
  return apply_rules(type, initial_part + (tsyll.final.send(type) || '')) if initial_part

  standalone_name = type.to_s + '_standalone'
  standalone = tsyll.final.send(standalone_name) if tsyll.final.respond_to?(standalone_name)
  standalone || apply_rules(type, tsyll.final.send(type))
end
|
53
|
+
|
54
|
+
# Splits +str+ into tokens separated by spaces or single quotes.
#
# Returns an array of [token, position] pairs, where position is the offset
# at which the token starts in +str+. Leading, trailing and repeated
# separators are skipped, so they neither shift positions nor cut the input
# short. Tokens are stripped of surrounding whitespace; +str+ itself is not
# modified.
def self.tokenize(str)
  tokens = []
  str.scan(/[^' ]+/) do |token|
    # Inside the block the last match describes the token just found.
    tokens << [token.strip, Regexp.last_match.begin(0)]
  end
  tokens
end
|
64
|
+
|
65
|
+
# NOTE: a bare `private` only affects instance methods, so the singleton
# method below stays callable as Conversions.apply_rules.
private
# Returns a copy of +string+ with the substitution rules registered for
# +type+ applied in order; the argument itself is left untouched. When no
# rules exist for +type+ the copy is returned unchanged.
def self.apply_rules(type, string)
  result = string.dup
  rules = @@rules[type]
  rules.each { |rule| result.gsub!(Regexp.new(rule['match']), rule['subst']) } if rules
  result
end
|
73
|
+
|
74
|
+
end
|
75
|
+
end
|