arnebrasseur-pinyin 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +12 -0
- data/Manifest.txt +31 -0
- data/README.txt +50 -0
- data/TODO +23 -0
- data/examples/cgiform/cgiform.rb +24 -0
- data/examples/cgiform/template.rhtml +69 -0
- data/examples/hello.rb +12 -0
- data/lib/pinyin.rb +90 -0
- data/lib/pinyin/conversion.rb +51 -0
- data/lib/pinyin/conversions.rb +75 -0
- data/lib/pinyin/conversions/hanyu.rb +77 -0
- data/lib/pinyin/data/comparison.csv +410 -0
- data/lib/pinyin/data/final.csv +10 -0
- data/lib/pinyin/data/initial.csv +7 -0
- data/lib/pinyin/data/paladiy.txt +421 -0
- data/lib/pinyin/data/rules.yaml +24 -0
- data/lib/pinyin/data/valid_pinyin.yaml +454 -0
- data/lib/pinyin/exception.rb +14 -0
- data/lib/pinyin/groundwork.rb +183 -0
- data/lib/pinyin/string.rb +16 -0
- data/lib/pinyin/support.rb +12 -0
- data/lib/pinyin/tones.rb +47 -0
- data/lib/pinyin/tones/accents.rb +62 -0
- data/lib/pinyin/tones/marks.rb +30 -0
- data/lib/pinyin/tones/no_tones.rb +6 -0
- data/lib/pinyin/tones/numbers.rb +25 -0
- data/rakefile +22 -0
- data/script/update +4 -0
- data/test/test_comparison.rb +35 -0
- data/test/test_hanyu_coverage.rb +35 -0
- metadata +102 -0
data/History.txt
ADDED
data/Manifest.txt
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
History.txt
|
2
|
+
Manifest.txt
|
3
|
+
README.txt
|
4
|
+
rakefile
|
5
|
+
TODO
|
6
|
+
examples/cgiform/cgiform.rb
|
7
|
+
examples/cgiform/template.rhtml
|
8
|
+
examples/hello.rb
|
9
|
+
lib/pinyin.rb
|
10
|
+
lib/pinyin/conversion.rb
|
11
|
+
lib/pinyin/conversions.rb
|
12
|
+
lib/pinyin/conversions/hanyu.rb
|
13
|
+
lib/pinyin/data/comparison.csv
|
14
|
+
lib/pinyin/data/final.csv
|
15
|
+
lib/pinyin/data/initial.csv
|
16
|
+
lib/pinyin/data/paladiy.txt
|
17
|
+
lib/pinyin/data/rules.yaml
|
18
|
+
lib/pinyin/data/valid_pinyin.yaml
|
19
|
+
lib/pinyin/exception.rb
|
20
|
+
lib/pinyin/groundwork.rb
|
21
|
+
lib/pinyin/string.rb
|
22
|
+
lib/pinyin/support.rb
|
23
|
+
lib/pinyin/tones.rb
|
24
|
+
lib/pinyin/tones/accents.rb
|
25
|
+
lib/pinyin/tones/marks.rb
|
26
|
+
lib/pinyin/tones/no_tones.rb
|
27
|
+
lib/pinyin/tones/numbers.rb
|
28
|
+
rakefile
|
29
|
+
script/update
|
30
|
+
test/test_comparison.rb
|
31
|
+
test/test_hanyu_coverage.rb
|
data/README.txt
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
pinyin
|
2
|
+
by Arne Brasseur
|
3
|
+
|
4
|
+
== DESCRIPTION:
|
5
|
+
|
6
|
+
Pinyin can convert between various systems for phonetically
|
7
|
+
writing Mandarin Chinese. It can also handle various representations
|
8
|
+
of tones, so it can be used to convert pinyin with numbers
|
9
|
+
to pinyin with tones.
|
10
|
+
|
11
|
+
Supported formats include Hanyu Pinyin, Bopomofo, Wade-Giles
|
12
|
+
and International Phonetic Alphabet (IPA).
|
13
|
+
|
14
|
+
== FEATURES/PROBLEMS:
|
15
|
+
|
16
|
+
== SYNOPSIS:
|
17
|
+
|
18
|
+
require 'pinyin'
|
19
|
+
|
20
|
+
reader = Pinyin::Reader.new(:hanyu, :numbers)
|
21
|
+
reader << "wo3 ai4 ni3"
|
22
|
+
# => [<Pinyin::Syllable <initial=Empty, final=Uo, tone=3>>,
|
23
|
+
# <Pinyin::Syllable <initial=Empty, final=Ai, tone=4>>,
|
24
|
+
# <Pinyin::Syllable <initial=Ne, final=I, tone=3>>]
|
25
|
+
|
26
|
+
writer = Pinyin::Writer.new(:zhuyin, :marks)
|
27
|
+
|
28
|
+
writer << (reader << "wo3 ai4 ni3")
|
29
|
+
# => "ㄨㄛˇ ㄞˋ ㄋㄧˇ"
|
30
|
+
|
31
|
+
require 'pinyin/string'
|
32
|
+
|
33
|
+
"wo3 ai4 ni3".pretty_tones
|
34
|
+
# => "wǒ ài nǐ"
|
35
|
+
|
36
|
+
== REQUIREMENTS:
|
37
|
+
|
38
|
+
* $KCODE has to be set to "UTF8" for everything to work correctly
|
39
|
+
* Facets
|
40
|
+
|
41
|
+
== INSTALL:
|
42
|
+
|
43
|
+
* gem install pinyin
|
44
|
+
|
45
|
+
== LICENSE:
|
46
|
+
Copyright (c) 2004-2007, Arne Brasseur. (http://www.arnebrasseur.net)
|
47
|
+
|
48
|
+
Available as Free Software under the GPLv3 License, see LICENSE.txt for
|
49
|
+
details
|
50
|
+
|
data/TODO
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
!Core
|
2
|
+
|
3
|
+
- Additional tone systems
|
4
|
+
- Superscript numbers (for wade giles)
|
5
|
+
- IPA tone notation
|
6
|
+
|
7
|
+
- Additional transcription systems
|
8
|
+
- MSP2 (or how do you call that)
|
9
|
+
- Palladiy (To make things interesting)
|
10
|
+
- Gwoyueh
|
11
|
+
- Yale
|
12
|
+
|
13
|
+
- Research some rare pinyin syllables : lo, yo ^e, yai
|
14
|
+
- Get a definitive answer about ong/ueng/weng
|
15
|
+
- Add a general README as rdoc start page
|
16
|
+
- Add a README to the data/ directory with info on sources, contents and purposes
|
17
|
+
- More tests
|
18
|
+
- Add remembering of parameters to cgiform example, other examples
|
19
|
+
|
20
|
+
!More
|
21
|
+
The core lib basically does translation on the syllable level. It can handle strings with syllables nicely separated by spaces. Successive layers should make it possible to convert a sentence with punctuation into a different system. It should be possible to write compound words together in Hanyu, and have the syllables separated by dashes when converting to WG. For instance:
|
22
|
+
|
23
|
+
Wǒ de péngyǒu, shì dàifu. => Wǒ te p`éng-yǔ, shih tài-fu.
|
@@ -0,0 +1,24 @@
|
|
1
|
+
#!/usr/bin/ruby -w

# CGI front end for the Pinyin converter: reads the submitted text and the
# selected source/target systems, converts the text and renders
# template.rhtml with the result available as @converted.

require 'cgi'
require 'erb'

$: << File.dirname(__FILE__)+'/../../lib'
require 'pinyin'

cgi=CGI.new("xhtml1")

params=cgi.params
begin
  # CGI#params maps every field name to an array of values, so look at the
  # first submitted value instead of comparing the array itself to ''.
  text = params['pinyin'].first
  if text && text != ''
    reader = Pinyin::Reader.new(params['from'].first, params['from_tone'].first)
    writer = Pinyin::Writer.new(params['to'].first, params['to_tone'].first)
    @converted = writer << (reader << text)
  end
rescue
  # A CGI response has exactly one header block, so report the failure in a
  # single response and stop instead of emitting further output afterwards.
  # Plain text keeps any user input echoed in the message from being
  # interpreted as markup.
  cgi.out("text/plain; charset=utf-8") { "#{$!}\n#{params['pinyin'].inspect}" }
  exit
end

cgi.out("text/html; charset=utf-8") do
  # Locate the template next to this script so the current working directory
  # of the web server does not matter.
  ERB.new(IO.read(File.join(File.dirname(__FILE__), 'template.rhtml'))).result(binding)
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
<!doctype html>
<%# Form page for the CGI example. Rendered with the binding of cgiform.rb,
    so `params` and `@converted` come from there. Everything that originates
    from the request is passed through ERB::Util.html_escape before it is
    written into the page. %>
<html>
  <head>
    <title>Ruby Pinyin CGIForm example</title>
    <style type='text/css'>
      body {
        font-family: sans-serif;
      }

      div#wrap {
        width: 40%;
        margin: 0 auto;
      }

      table {
        width: 100%;
      }
      div#converted_text {
        border: 1px dotted #000;
      }

      textarea {
        width: 100%;
        height: 10em;
        margin: 0 auto;
      }
    </style>
  </head>
  <body>
    <div id='wrap'>
      <h2>Pinyin example application</h2>
      <h3>Enter some pinyin text and choose your format</h3>
      <table>
        <form method='post'>
          <tr>
            <td colspan='2'>
              <textarea name='pinyin'><%= ERB::Util.html_escape(params['pinyin'].first) %></textarea>
            </td>
          </tr>

          <tr><td>From</td><td>To</td></tr>
          <% Pinyin::Conversions::All.each do |f|%>
          <tr>
            <td><input type='radio' name='from' value='<%=f%>'><%=f.capitalize%></input></td>
            <td><input type='radio' name='to' value='<%=f%>'><%=f.capitalize%></input></td>
          </tr>
          <% end %>
          <tr><td>From tone</td><td>To tone</td></tr>
          <% Pinyin::Tones::All.each do |f|%>
          <tr>
            <td><input type='radio' name='from_tone' value='<%=f%>'><%=f.capitalize%></input></td>
            <td><input type='radio' name='to_tone' value='<%=f%>'><%=f.capitalize%></input></td>
          </tr>
          <% end %>
          <tr>
            <td><input type='submit'></input></td>
            <td> </td>
          </tr>
        </form>
      </table>
      <% if @converted %>
      <h2>Converted:</h2>
      <div id='converted_text'>
        <%= ERB::Util.html_escape(@converted) %>
      </div>
      <% end %>
    </div>
  </body>
</html>
|
data/examples/hello.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# Minimal walkthrough: Hanyu Pinyin with tone numbers is converted to
# Wade-Giles with accents, and that result is converted on to Zhuyin with
# tone marks. All three strings are printed.

# Make the bundled library loadable when run from the examples directory.
$: << File.join(File.dirname(__FILE__), '../lib')

require 'pinyin'

pinyin = 'wo3 de2 peng2 you3 shi4 dai4 fu'

hanyu_to_wadegiles = Pinyin::Converter.new(:hanyu, :numbers, :wadegiles, :accents)
wadegiles_to_zhuyin = Pinyin::Converter.new(:wadegiles, :accents, :zhuyin, :marks)

wadegiles = hanyu_to_wadegiles.convert(pinyin)
zhuyin = wadegiles_to_zhuyin.convert(wadegiles)

puts pinyin, wadegiles, zhuyin
|
data/lib/pinyin.rb
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
# Handle several romanization systems for Mandarin Chinese
|
2
|
+
#
|
3
|
+
# Author:: Arne Brasseur (pinyin@arnebrasseur.net)
|
4
|
+
# Copyright:: Copyright (c) 2007, Arne Brasseur
|
5
|
+
# Licence:: GNU General Public License, latest version
|
6
|
+
|
7
|
+
$: << File.dirname(__FILE__)
|
8
|
+
|
9
|
+
require "facets/string/camelcase"
|
10
|
+
|
11
|
+
require 'pinyin/support'
|
12
|
+
require 'pinyin/groundwork'
|
13
|
+
require 'pinyin/exception'
|
14
|
+
|
15
|
+
require 'pinyin/tones'
|
16
|
+
require 'pinyin/conversion'
|
17
|
+
require 'pinyin/conversions'
|
18
|
+
require 'pinyin/conversions/hanyu'
|
19
|
+
|
20
|
+
module Pinyin
|
21
|
+
VERSION = "0.1.5"
|
22
|
+
|
23
|
+
# Turns a string written in one transcription system into Syllable objects.
class Reader
  # conv:: name of the transcription system (anything responding to to_s)
  # tone:: name of the tone notation; camelcased and looked up in Tones
  def initialize(conv, tone)
    @conv = conv.to_s
    @tone = Tones.const_get tone.to_s.camelcase
  end

  # Parses +str+ and returns one Syllable per token.
  #
  # Raises ParseError when a token cannot be read. For an unreadable
  # syllable the error carries that syllable and its position; any other
  # failure is reported against the whole input at position 0.
  def parse(str)
    Conversions.tokenize(str).map do |s, pos|
      tone,syll = @tone.pop_tone(s)
      tsyll = Conversions.parse(@conv,syll)
      # Conversions.parse yields nil when nothing matches; treat that as an
      # illegal syllable instead of letting it fail with a NoMethodError.
      ini = tsyll && tsyll.initial
      fin = tsyll && tsyll.final
      unless tone && fin && ini
        raise ParseError.new(s,pos),"Illegal syllable <#{s}> in input <#{str}> at position #{pos}."
      end
      Syllable.new(ini, fin, tone)
    end
  rescue ParseError
    # Already describes the offending syllable and its position; wrapping it
    # again would replace that with position 0.
    raise
  rescue StandardError => e
    raise ParseError.new(str,0), "Parsing of #{str.inspect} failed : #{e}"
  end

  alias :<< :parse
end
|
45
|
+
|
46
|
+
# Renders syllable objects as text in one transcription system with one
# tone notation.
class Writer
  # conv:: name of the transcription system (anything responding to to_s)
  # tone:: name of the tone notation; camelcased and looked up in Tones
  def initialize(conv, tone)
    @conv = conv.to_s
    @tone = Tones.const_get(tone.to_s.camelcase)
  end

  # Accepts either a single syllable or anything responding to +map+.
  # A collection is rendered syllable by syllable and joined with spaces.
  def unparse(py)
    return render_syllable(py) unless py.respond_to?(:map)

    py.map { |syll| render_syllable(syll) }.join(' ')
  end

  alias :<< :unparse

  private

  # Writes one syllable in the target system and attaches its tone.
  def render_syllable(syll)
    @tone.add_tone(Conversions.unparse(@conv, syll), syll.tone)
  end
end
|
63
|
+
|
64
|
+
# Couples a Reader and a Writer so text can be taken from one
# system/tone notation to another in a single call.
class Converter
  # from, from_tone:: passed to Reader.new
  # to, to_tone::     passed to Writer.new
  def initialize(from, from_tone, to, to_tone)
    @reader, @writer = Reader.new(from, from_tone), Writer.new(to, to_tone)
  end

  # Parses +str+ with the reader and hands the result to the writer.
  def convert(str)
    syllables = @reader.parse(str)
    @writer.unparse(syllables)
  end

  alias :<< :convert
end
|
76
|
+
|
77
|
+
# For every entry in Conversions::All, define two module-level factory
# methods named "<Camelcased entry>Reader" and "<Camelcased entry>Writer".
# Each takes a tone notation and builds the matching Reader or Writer.
class <<self
  Conversions::All.each do |system|
    prefix = system.to_s.camelcase

    define_method("#{prefix}Reader") { |tone| Reader.new(system, tone) }
    define_method("#{prefix}Writer") { |tone| Writer.new(system, tone) }
  end
end
|
88
|
+
end
|
89
|
+
|
90
|
+
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module Pinyin
|
2
|
+
|
3
|
+
#
|
4
|
+
# Base class for conversions like Hanyu pinyin,
|
5
|
+
# Wade-Giles, etc.
|
6
|
+
#
|
7
|
+
class Conversion

  # Separator between syllables in the same word
  # For Wade-Giles this is a dash, Hanyu pinyin
  # uses a single quote in certain situations
  attr_reader :syllable_separator

  # The tone handling object.
  #
  # The constructor stores it in @tone, so read that variable here; a plain
  # attr_reader :tones would look at @tones, which is never assigned.
  def tones
    @tone
  end

  # An optional lambda that preprocesses input
  attr_reader :preprocessor

  # The name of this conversion, the same name used
  # in the data file and that is also available as
  # a method name on Initial and Final objects.
  #
  # By default the underscorized class name
  attr_reader :name

  # tone::    either a Tone instance, used as is, or a name that is
  #           camelcased and looked up in Pinyin::Tones (default :numbers)
  # options:: :preprocessor => a callable applied to input; defaults to a
  #           lambda that returns its argument unchanged
  def initialize(tone = :numbers, options = {})
    @preprocessor = options[:preprocessor] || lambda {|s| s}

    # NOTE(review): Tone is not defined in this file; presumably it is
    # provided by the tone handling code - confirm.
    if Tone === tone
      @tone = tone
    else
      @tone = Pinyin::Tones.const_get(tone.to_s.camelcase)
    end

    @name = self.class.name.underscore
  end

  # Converts a string into an array of strings and
  # syllable objects.
  #
  # Empty in this base class.
  def parse(string)
  end

  # Converts an array of strings and syllable objects
  # into a string
  #
  # Empty in this base class.
  def unparse(array)
  end

end
|
50
|
+
end
|
51
|
+
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'yaml'
|
3
|
+
|
4
|
+
module Pinyin
|
5
|
+
module Conversions
|
6
|
+
# Names of the transcription systems found in the data files; filled in by
# the loading loop below.
All=[]

DATA_DIR=File.dirname(__FILE__)+'/data/'

#Load various representations for initials and finals
#
# Each CSV row is a column name followed by one value per entry of
# Initial::All / Final::All. The row name becomes an accessor on the class
# and every entry receives its value for that row.
%w(Initial Final).each do |c|
  klazz=Pinyin.const_get c
  begin
    # foreach closes the file once all rows have been read
    CSV.foreach(DATA_DIR+c.downcase+'.csv') do |name, *values|
      # The parentheses matter: without them the whole `||` expression is
      # passed to the lookup and the name/standalone filter never applies.
      All << name.to_s unless All.include?(name) || name =~ /name|standalone/i
      klazz.class_eval {attr_accessor name.to_sym}
      values.each_with_index do |v,i|
        klazz::All[i].send(name+'=', v)
      end
    end
  rescue
    # Interpolate the exception; String#+ does not accept an exception object.
    puts "Bad data in #{c.downcase}.csv : #{$!}"
    raise
  end

end

#Substitution rules
@@rules=YAML::load(IO.read(DATA_DIR+'rules.yaml'))
|
30
|
+
|
31
|
+
# Finds the initial/final combination that is written as +string+ in the
# system +type+.
#
# A final whose "<type>_standalone" form equals +string+ wins and is paired
# with Initial::Empty. Otherwise every legal initial/final pair is rendered
# (with the substitution rules applied) and the first one equal to +string+
# is returned as a TonelessSyllable. Returns nil when nothing matches.
def self.parse(type, string)
  standalone_reader = "#{type}_standalone"
  standalone_final = Final::All.find do |f|
    f.respond_to?(standalone_reader) && f.send(standalone_reader) == string
  end
  return TonelessSyllable.new(Initial::Empty, standalone_final) if standalone_final

  Initial::All.each do |ini|
    Final::All.each do |fin|
      next if TonelessSyllable.illegal?(ini, fin)

      written = apply_rules(type, (ini.send(type) || '') + (fin.send(type) || ''))
      return TonelessSyllable.new(ini, fin) if written == string
    end
  end
  nil
end
|
43
|
+
|
44
|
+
# Writes the toneless syllable +tsyll+ in the system +type+.
#
# With an initial that has a form in this system, initial and final are
# concatenated and the substitution rules applied. Without one, the final's
# "<type>_standalone" form is returned untouched when it exists; failing
# that the final's ordinary form goes through the substitution rules.
def self.unparse(type, tsyll)
  initial_text = tsyll.initial.send(type)
  if initial_text
    return apply_rules(type, initial_text + (tsyll.final.send(type) || ''))
  end

  standalone_reader = type.to_s+'_standalone'
  standalone = tsyll.final.respond_to?(standalone_reader) && tsyll.final.send(standalone_reader)
  standalone || apply_rules(type, tsyll.final.send(type))
end
|
53
|
+
|
54
|
+
# Splits +str+ into syllable tokens on spaces and single quotes.
#
# Returns an array of [token, position] pairs, where position is the offset
# of the token's first character within +str+. Leading, trailing and
# repeated separators are skipped, so no empty tokens are produced and the
# text after a doubled separator is still tokenized.
def self.tokenize(str)
  tokens = []
  # scan sets $~ for each match, which gives the exact start offset even
  # when several separators precede the token.
  str.scan(/[^' ]+/) { |syll| tokens << [syll, $~.begin(0)] }
  tokens
end
|
64
|
+
|
65
|
+
private
|
66
|
+
# Returns a copy of +string+ after running every substitution rule
# registered for +type+ over it, in order. Each rule supplies a 'match'
# pattern and a 'subst' replacement. The argument itself is left untouched;
# when there are no rules for +type+ the copy is returned unchanged.
def self.apply_rules(type, string)
  result = string.dup
  (@@rules[type] || []).each do |rule|
    result.gsub!(Regexp.new(rule['match']), rule['subst'])
  end
  result
end
|
73
|
+
|
74
|
+
end
|
75
|
+
end
|