ting 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +16 -0
- data/README.rdoc +94 -0
- data/Rakefile +15 -0
- data/TODO +15 -0
- data/examples/cgiform/cgiform.rb +24 -0
- data/examples/cgiform/template.rhtml +69 -0
- data/examples/hello.rb +12 -0
- data/lib/ting.rb +93 -0
- data/lib/ting/conversion.rb +51 -0
- data/lib/ting/conversions.rb +75 -0
- data/lib/ting/conversions/hanyu.rb +77 -0
- data/lib/ting/data/comparison.csv +410 -0
- data/lib/ting/data/final.csv +10 -0
- data/lib/ting/data/initial.csv +7 -0
- data/lib/ting/data/paladiy.txt +421 -0
- data/lib/ting/data/rules.yaml +24 -0
- data/lib/ting/data/valid_pinyin.yaml +454 -0
- data/lib/ting/exception.rb +17 -0
- data/lib/ting/groundwork.rb +177 -0
- data/lib/ting/string.rb +17 -0
- data/lib/ting/support.rb +19 -0
- data/lib/ting/tones.rb +65 -0
- data/lib/ting/tones/accents.rb +62 -0
- data/lib/ting/tones/ipa.rb +24 -0
- data/lib/ting/tones/marks.rb +30 -0
- data/lib/ting/tones/no_tones.rb +7 -0
- data/lib/ting/tones/numbers.rb +25 -0
- data/lib/ting/tones/supernum.rb +24 -0
- data/test/test_comparison.rb +35 -0
- data/test/test_hanyu_coverage.rb +35 -0
- metadata +95 -0
data/History.txt
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
== 0.2 / 2010-04-04
|
2
|
+
|
3
|
+
* Added support for superscript numeral tones and IPA tone marks
|
4
|
+
|
5
|
+
== 0.1.3 / 2008-07-18
|
6
|
+
|
7
|
+
* made compatible with the latest release of Facets
|
8
|
+
|
9
|
+
== 0.1.0 / 2007-12-14
|
10
|
+
|
11
|
+
* Converted to Hoe, bugfixes
|
12
|
+
|
13
|
+
== 0.0.1 / 2007-07-26
|
14
|
+
|
15
|
+
* Birthday!
|
16
|
+
|
data/README.rdoc
ADDED
@@ -0,0 +1,94 @@
|
|
1
|
+
= Ting
|
2
|
+
|
3
|
+
Ting can convert between various systems for phonetically
|
4
|
+
writing Mandarin Chinese. It can also handle various representations
|
5
|
+
of tones, so it can be used to convert pinyin with numbers
|
6
|
+
to pinyin with tones.
|
7
|
+
|
8
|
+
Hanyu Pinyin, Bopomofo, Wade-Giles, Tongyong Pinyin
|
9
|
+
and International Phonetic Alphabet (IPA) are supported.
|
10
|
+
|
11
|
+
== SYNOPSIS
|
12
|
+
|
13
|
+
To parse your strings create a +Reader+ object. Ting.reader() takes two
|
14
|
+
parameters : the transliteration format, and the way that tones are represented.
|
15
|
+
|
16
|
+
To some extent these can be mixed and matched.
|
17
|
+
|
18
|
+
To generate pinyin/wade-giles/etc. create a +Writer+ object. Use Ting.writer()
|
19
|
+
|
20
|
+
=== Formats
|
21
|
+
|
22
|
+
* +:hanyu+ Hanyu Pinyin
|
23
|
+
* +:zhuyin+ Zhuyin Fuhao (a.k.a. Bopomofo)
|
24
|
+
* +:wadegiles+ Wade Giles
|
25
|
+
* +:ipa+ International Phonetic Alphabet
|
26
|
+
* +:tongyong+ Tongyong Pinyin
|
27
|
+
|
28
|
+
=== Tones
|
29
|
+
|
30
|
+
* +:numbers+ Simply put a number after the syllable, easy to type
|
31
|
+
* +:accents+ Use diacritics, follows the Hanyu Pinyin rules, there needs to be at least one vowel to apply this to, not usable with IPA or Bopomofo
|
32
|
+
* +:supernum+ Superscript numerals, typically used for Wade-Giles
|
33
|
+
* +:marks+ Tone mark after the syllable, typically used for Bopomofo
|
34
|
+
* +:ipa+ IPA tone marks
|
35
|
+
* +:no_tones+ Use no tones
|
36
|
+
|
37
|
+
== Examples
|
38
|
+
|
39
|
+
Parse Hanyu Pinyin
|
40
|
+
|
41
|
+
require 'ting'
|
42
|
+
|
43
|
+
reader = Ting.reader(:hanyu, :numbers)
|
44
|
+
reader << "wo3 ai4 ni3"
|
45
|
+
# => [<Ting::Syllable <initial=Empty, final=Uo, tone=3>>,
|
46
|
+
# <Ting::Syllable <initial=Empty, final=Ai, tone=4>>,
|
47
|
+
# <Ting::Syllable <initial=Ne, final=I, tone=3>>]
|
48
|
+
|
49
|
+
Generate Bopomofo
|
50
|
+
|
51
|
+
zhuyin = Ting.writer(:zhuyin, :marks)
|
52
|
+
zhuyin << (reader << "wo3 ai4 ni3")
|
53
|
+
# => "ㄨㄛˇ ㄞˋ ㄋㄧˇ"
|
54
|
+
|
55
|
+
Generate Wade-Giles
|
56
|
+
|
57
|
+
wadegiles = Ting.writer(:wadegiles, :supernum)
|
58
|
+
wadegiles << (reader << "yi1 ge5 bu2 gou4")
|
59
|
+
# => "i¹ ko pu² kou⁴"
|
60
|
+
|
61
|
+
Generate IPA
|
62
|
+
|
63
|
+
ipa = Ting.writer(:ipa, :ipa)
|
64
|
+
ipa << (reader << "you3 peng2 zi4 yuan2 fang1 lai2")
|
65
|
+
# => "iou˧˩˧ pʰeŋ˧˥ ts˥˩ yɛn˧˥ faŋ˥˥ lai˧˥"
|
66
|
+
|
67
|
+
Since this is such a common use case, there is a convenience method to add diacritics to pinyin.
|
68
|
+
|
69
|
+
require 'ting/string'
|
70
|
+
|
71
|
+
"wo3 ai4 ni3".pretty_tones
|
72
|
+
# => "wǒ ài nǐ"
|
73
|
+
|
74
|
+
Note that syllables need to be separated by spaces, feeding "peng2you3" to the parser
|
75
|
+
does not work. The String#pretty_tones method does handle these things a bit more gracefully.
|
76
|
+
|
77
|
+
If you need to parse input that does not conform, consider using a regexp to scan for valid
|
78
|
+
syllables, then feed the syllables to the parser one by one. Have a look at #pretty_tones for
|
79
|
+
an example of how to do this.
|
80
|
+
|
81
|
+
== REQUIREMENTS
|
82
|
+
|
83
|
+
* $KCODE has to be set to "UTF8" for everything to work correctly
|
84
|
+
|
85
|
+
== INSTALL
|
86
|
+
|
87
|
+
* gem install ting
|
88
|
+
|
89
|
+
== LICENSE
|
90
|
+
|
91
|
+
Copyright (c) 2004-2010, Arne Brasseur. (http://www.arnebrasseur.net)
|
92
|
+
|
93
|
+
Available as Free Software under the GPLv3 License, see LICENSE.txt for details
|
94
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
|
3
|
+
require 'rake'
|
4
|
+
require 'rake/testtask'
|
5
|
+
|
6
|
+
# Running plain `rake` executes the test suite.
task :default => [:"test:ting"]

namespace "test" do
  # Defines test:ting, which runs every Ruby file directly under test/
  # with verbose output and interpreter warnings switched on.
  Rake::TestTask.new("ting") do |test_task|
    $LOAD_PATH.push(File.dirname(__FILE__) + '/lib')
    test_task.warning = true
    test_task.verbose = true
    test_task.pattern = 'test/*.rb'
  end
end
|
data/TODO
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
- Additional transcription systems
|
2
|
+
- MSP2 (or how do you call that)
|
3
|
+
- Palladiy (To make things interesting)
|
4
|
+
- Gwoyueh
|
5
|
+
- Yale
|
6
|
+
|
7
|
+
- Research some rare pinyin syllables : lo, yo ^e, yai
|
8
|
+
- Get a definitive answer about ong/ueng/weng
|
9
|
+
- Add a README to the data/ directory with info on sources, contents and purposes
|
10
|
+
- More tests
|
11
|
+
- Add remembering of parameters to cgiform example, other examples
|
12
|
+
|
13
|
+
The core lib basically does translation on the syllable level. It can handle strings with syllables nicely separated by spaces. Successive layers should make it possible to convert a sentence with punctuation into a different system. It should be possible to write compound words together in Hanyu, and have the syllables separated by dashes when converting to WG. For instance:
|
14
|
+
|
15
|
+
Wǒ de péngyǒu, shì dàifu. => Wǒ te p`éng-yǔ, shih tài-fu.
|
@@ -0,0 +1,24 @@
|
|
1
|
+
#!/usr/bin/ruby -w

# CGI example: converts the submitted text between transcription systems
# and renders template.rhtml with the result in @converted.

require 'cgi'
require 'erb'

$: << File.dirname(__FILE__)+'/../../lib'
# The library file is ting.rb; it also defines the legacy Pinyin alias
# that the template refers to.
require 'ting'

cgi = CGI.new("xhtml1")
params = cgi.params

begin
  # CGI#params maps every field name to an Array of values, so take the
  # first value of each field instead of passing the Array itself.
  input = params['pinyin'].first
  if input && input != ''
    reader = Ting::Reader.new(params['from'].first, params['from_tone'].first)
    writer = Ting::Writer.new(params['to'].first, params['to_tone'].first)
    @converted = writer << (reader << input)
  end
rescue
  # Send exactly one response for a failed request. Plain text keeps the
  # echoed user input from being interpreted as markup.
  cgi.out("text/plain; charset=utf-8") { "#{$!}\n#{params['pinyin'].inspect}" }
  exit
end

cgi.out("text/html; charset=utf-8") do
  # Locate the template next to this script rather than in the current
  # working directory of the web server.
  ERB.new(IO.read(File.join(File.dirname(__FILE__), 'template.rhtml'))).result(binding)
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
<!doctype html>
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<title>Ruby Pinyin CGIForm example</title>
|
5
|
+
<style type='text/css'>
|
6
|
+
body {
|
7
|
+
font-family: sans-serif;
|
8
|
+
}
|
9
|
+
|
10
|
+
div#wrap {
|
11
|
+
width: 40%;
|
12
|
+
margin: 0 auto;
|
13
|
+
}
|
14
|
+
|
15
|
+
table {
|
16
|
+
width: 100%;
|
17
|
+
}
|
18
|
+
div#converted_text {
|
19
|
+
border: 1px dotted #000;
|
20
|
+
}
|
21
|
+
|
22
|
+
textarea {
|
23
|
+
width: 100%;
|
24
|
+
height: 10em;
|
25
|
+
margin: 0 auto;
|
26
|
+
}
|
27
|
+
</style>
|
28
|
+
</head>
|
29
|
+
<body>
|
30
|
+
<div id='wrap'>
|
31
|
+
<h2>Pinyin example application</h2>
|
32
|
+
<h3>Enter some pinyin text and choose your format</h3>
|
33
|
+
<table>
|
34
|
+
<form method='post'>
|
35
|
+
<tr>
|
36
|
+
<td colspan='2'>
|
37
|
+
<textarea name='pinyin'><%# escape the echoed input so submitted markup is not rendered %><%= ERB::Util.html_escape(params['pinyin'].first) if params['pinyin'] != [] %></textarea>
|
38
|
+
</td>
|
39
|
+
</tr>
|
40
|
+
|
41
|
+
<tr><td>From</td><td>To</td></tr>
|
42
|
+
<% Pinyin::Conversions::All.each do |f|%>
|
43
|
+
<tr>
|
44
|
+
<td><input type='radio' name='from' value='<%=f%>'><%=f.capitalize%></input></td>
|
45
|
+
<td><input type='radio' name='to' value='<%=f%>'><%=f.capitalize%></input></td>
|
46
|
+
</tr>
|
47
|
+
<% end %>
|
48
|
+
<tr><td>From tone</td><td>To tone</td></tr>
|
49
|
+
<% Pinyin::Tones::All.each do |f|%>
|
50
|
+
<tr>
|
51
|
+
<td><input type='radio' name='from_tone' value='<%=f%>'><%=f.capitalize%></input></td>
|
52
|
+
<td><input type='radio' name='to_tone' value='<%=f%>'><%=f.capitalize%></input></td>
|
53
|
+
</tr>
|
54
|
+
<% end %>
|
55
|
+
<tr>
|
56
|
+
<td><input type='submit'></input></td>
|
57
|
+
<td> </td>
|
58
|
+
</tr>
|
59
|
+
</form>
|
60
|
+
</table>
|
61
|
+
<% if @converted %>
|
62
|
+
<h2>Converted:</h2>
|
63
|
+
<div id='converted_text'>
|
64
|
+
<%# the converted text is derived from user input, so escape it %><%= ERB::Util.html_escape(@converted) %>
|
65
|
+
</div>
|
66
|
+
<% end %>
|
67
|
+
</div>
|
68
|
+
</body>
|
69
|
+
</html>
|
data/examples/hello.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# Example: convert a sentence from Hanyu Pinyin (numbered tones) to
# Wade-Giles (accents), then from Wade-Giles on to Zhuyin (tone marks).
$: << File.join(File.dirname(__FILE__), '../lib')

# The library file is ting.rb; there is no pinyin.rb to require.
require 'ting'

conv1 = Ting::Converter.new(:hanyu, :numbers, :wadegiles, :accents)
conv2 = Ting::Converter.new(:wadegiles, :accents, :zhuyin, :marks)

pinyin    = 'wo3 de2 peng2 you3 shi4 dai4 fu'
wadegiles = conv1 << pinyin
zhuyin    = conv2 << wadegiles

puts pinyin, wadegiles, zhuyin
|
data/lib/ting.rb
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
# Handle several romanization systems for Mandarin Chinese
|
2
|
+
#
|
3
|
+
# Author:: Arne Brasseur (arne@arnebrasseur.net)
|
4
|
+
# Copyright:: Copyright (c) 2007-2010, Arne Brasseur
|
5
|
+
# Licence:: GNU General Public License, v3
|
6
|
+
|
7
|
+
$: << File.dirname(__FILE__)
|
8
|
+
|
9
|
+
require 'ting/support'
|
10
|
+
require 'ting/groundwork'
|
11
|
+
require 'ting/exception'
|
12
|
+
|
13
|
+
require 'ting/tones'
|
14
|
+
require 'ting/conversion'
|
15
|
+
require 'ting/conversions'
|
16
|
+
require 'ting/conversions/hanyu'
|
17
|
+
|
18
|
+
module Ting
|
19
|
+
VERSION = "0.2.0"
|
20
|
+
|
21
|
+
# Parses transliterated text into an Array of Syllable objects.
#
# +conv+ names the transcription system (e.g. :hanyu) and +tone+ names a
# module under Tones (e.g. :numbers) that splits the tone off a syllable.
class Reader
  def initialize(conv, tone)
    @conv = conv.to_s
    @tone = Tones.const_get tone.to_s.camelcase
    @cache = {}
  end

  # Parses +str+ and returns an Array of Syllable objects. Results are
  # cached per input string.
  #
  # Raises ParseError when a syllable cannot be recognised (the error then
  # carries the offending syllable and its position) or when any other
  # StandardError occurs while parsing (wrapped, position 0).
  def parse(str)
    cached = @cache[str]
    return cached if cached

    @cache[str] = Conversions.tokenize(str).map do |s, pos|
      tone, syll = @tone.pop_tone(s)
      tsyll = Conversions.parse(@conv, syll)
      # Conversions.parse returns nil for an unknown syllable; treat that
      # as an illegal syllable instead of failing on nil.initial.
      ini, fin = tsyll ? [tsyll.initial, tsyll.final] : [nil, nil]
      unless tone && fin && ini
        raise ParseError.new(s,pos),"Illegal syllable <#{s}> in input <#{str}> at position #{pos}."
      end
      Syllable.new(ini, fin, tone)
    end
  rescue ParseError
    # Already precise: keep the syllable and position it reports.
    raise
  rescue StandardError => e
    # Only wrap ordinary errors; interrupts and exit requests pass through.
    raise ParseError.new(str,0), "Parsing of #{str.inspect} failed : #{e}"
  end

  alias :<< :parse
end
|
44
|
+
|
45
|
+
# Renders Syllable objects as text in one transcription system, using
# the tone module named by +tone+ to attach the tone.
class Writer
  def initialize(conv, tone)
    @cache = {}
    @tone  = Tones.const_get(tone.to_s.camelcase)
    @conv  = conv.to_s
  end

  # Accepts either a single syllable or anything that responds to #map.
  # A collection is rendered syllable by syllable and joined with spaces.
  # Results are cached per argument.
  def generate(py)
    cached = @cache[py]
    return cached if cached

    render = lambda do |syllable|
      @tone.add_tone(Conversions.unparse(@conv, syllable), syllable.tone)
    end

    @cache[py] = py.respond_to?(:map) ? py.map(&render).join(' ') : render.call(py)
  end

  alias :<< :generate
  alias :unparse :generate
end
|
64
|
+
|
65
|
+
# Couples a Reader and a Writer so text can be converted from one
# transcription/tone combination to another in a single call.
class Converter
  def initialize(from, from_tone, to, to_tone)
    @writer = Writer.new(to, to_tone)
    @reader = Reader.new(from, from_tone)
  end

  # Parses +str+ with the reader and renders the result with the writer.
  def convert(str)
    syllables = @reader.parse(str)
    @writer.unparse(syllables)
  end

  alias :<< :convert
end
|
77
|
+
|
78
|
+
class << self
  # Instances handed out so far, keyed by [format, tones].
  READERS = {}
  WRITERS = {}

  # Returns the shared Reader for this format/tone pair, creating it on
  # first use.
  def reader(format, tones)
    key = [format, tones]
    READERS[key] ||= Reader.new(format, tones)
  end

  # Returns the shared Writer for this format/tone pair, creating it on
  # first use.
  def writer(format, tones)
    key = [format, tones]
    WRITERS[key] ||= Writer.new(format, tones)
  end
end
|
89
|
+
|
90
|
+
end
|
91
|
+
|
92
|
+
|
93
|
+
Pinyin = Ting #legacy support
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module Ting
|
2
|
+
|
3
|
+
#
# Base class for conversions like Hanyu pinyin,
# Wade-Giles, etc.
#
class Conversion

  # Separator between syllables in the same word
  # For Wade-Giles this is a dash, Hanyu pinyin
  # uses a single quote in certain situations.
  # Not assigned by this base class.
  attr_reader :syllable_separator

  # The tone handling object
  attr_reader :tones

  # An optional lambda that preprocesses input
  attr_reader :preprocessor

  # The name of this conversion, the same name used
  # in the data file and that is also available as
  # a method name on Initial and Final objects.
  #
  # By default the underscorized class name
  attr_reader :name

  # +tone+ is either a Tone instance or the name of a module under
  # Ting::Tones (default :numbers). +options+ may carry a :preprocessor
  # lambda; when it is absent the identity lambda is used.
  def initialize(tone = :numbers, options = {})
    @preprocessor = options[:preprocessor] || lambda {|s| s}

    # Store the handler under @tones so the public +tones+ reader returns
    # it. @tone is kept as well in case subclasses read the old name.
    @tones = @tone = if Tone === tone
      tone
    else
      Ting::Tones.const_get(tone.to_s.camelcase)
    end

    @name = self.class.name.underscore
  end

  # Converts a string into an array of strings and
  # syllable objects. No-op here.
  def parse(string)
  end

  # Converts an array of strings and syllable objects
  # into a string. No-op here.
  def unparse(array)
  end

end
|
50
|
+
end
|
51
|
+
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'yaml'
|
3
|
+
|
4
|
+
module Ting
|
5
|
+
module Conversions
|
6
|
+
# Names of the conversions found in the data files.
All=[]

DATA_DIR=File.dirname(__FILE__)+'/data/'

#Load various representations for initials and finals
#
# Each row of initial.csv / final.csv is a name followed by one value per
# entry in Initial::All / Final::All. The name becomes an accessor on the
# class and each value is assigned to the entry at the same index.
%w(Initial Final).each do |c|
  klazz=Ting.const_get c
  begin
    # CSV.foreach closes the file once iteration finishes.
    CSV.foreach(DATA_DIR+c.downcase+'.csv') do |name, *values|
      # Parenthesised so the regexp really filters: without the parens
      # the || became part of the argument to All.index and rows matching
      # /name|standalone/ were registered as conversions too.
      All << name.to_s unless All.index(name) || name =~ /name|standalone/i
      klazz.class_eval {attr_accessor name.to_sym}
      values.each_with_index do |v,i|
        klazz::All[i].send(name+'=', v)
      end
    end
  rescue
    # Interpolate the exception: String + Exception raises a TypeError
    # that would hide the original problem.
    puts "Bad data in #{c.downcase}.csv : #{$!}"
    raise
  end

end

#Substitution rules
@@rules=YAML::load(IO.read(DATA_DIR+'rules.yaml'))
|
30
|
+
|
31
|
+
# Finds the initial/final pair whose spelling in conversion +type+ equals
# +string+. A final whose "<type>_standalone" form matches wins outright
# and is paired with Initial::Empty. Returns a TonelessSyllable, or nil
# when nothing matches.
def self.parse(type, string)
  standalone_reader = "#{type}_standalone"
  lone_final = Final::All.find do |candidate|
    candidate.respond_to?(standalone_reader) && candidate.send(standalone_reader) == string
  end
  return TonelessSyllable.new(Initial::Empty, lone_final) if lone_final

  Initial::All.each do |ini|
    Final::All.each do |fin|
      next if TonelessSyllable.illegal?(ini, fin)
      spelling = apply_rules(type, (ini.send(type) || '') + (fin.send(type) || ''))
      return TonelessSyllable.new(ini, fin) if spelling == string
    end
  end
  nil
end
|
43
|
+
|
44
|
+
# Spells +tsyll+ in conversion +type+. With an initial, the initial and
# final spellings are concatenated and run through the substitution
# rules. Without one, the final's "<type>_standalone" form is returned
# as is when it has one; otherwise the final alone goes through the rules.
def self.unparse(type, tsyll)
  onset = tsyll.initial.send(type)
  return apply_rules(type, onset + (tsyll.final.send(type) || '')) if onset

  final = tsyll.final
  standalone_reader = type.to_s + '_standalone'
  standalone = final.respond_to?(standalone_reader) && final.send(standalone_reader)
  return standalone if standalone

  apply_rules(type, final.send(type))
end
|
53
|
+
|
54
|
+
# Splits +str+ into syllable tokens on spaces and apostrophes.
#
# Returns an Array of [token, position] pairs, where position is the
# offset of the token within +str+. Separators count towards the offset,
# and leading or repeated separators are skipped rather than ending the
# scan early. +str+ itself is not modified.
def self.tokenize(str)
  tokens = []
  str.scan(/[^' ]+/) do |token|
    # $~ is the MatchData of the current match; begin(0) is where it starts.
    tokens << [token.strip, $~.begin(0)]
  end
  tokens
end
|
64
|
+
|
65
|
+
private
# Returns a copy of +string+ after applying, in order, every substitution
# rule registered for conversion +type+. Each rule is a hash with a
# 'match' regexp source and a 'subst' replacement. The input string is
# left untouched.
def self.apply_rules(type, string)
  result = string.dup
  rules = @@rules[type]
  if rules
    rules.each do |rule|
      pattern = Regexp.new(rule['match'])
      result.gsub!(pattern, rule['subst'])
    end
  end
  result
end
|
73
|
+
|
74
|
+
end
|
75
|
+
end
|