pinyin 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/TODO +22 -0
- data/examples/cgiform/cgiform.rb +24 -0
- data/examples/cgiform/template.rhtml +69 -0
- data/examples/hello.rb +12 -0
- data/lib/conversions.rb +74 -0
- data/lib/data/comparison.csv +410 -0
- data/lib/data/final.csv +10 -0
- data/lib/data/initial.csv +7 -0
- data/lib/data/paladiy.txt +421 -0
- data/lib/data/rules.yaml +24 -0
- data/lib/data/valid_pinyin.yaml +455 -0
- data/lib/exception.rb +14 -0
- data/lib/groundwork.rb +148 -0
- data/lib/pinyin.rb +71 -0
- data/lib/support.rb +16 -0
- data/lib/tones/accents.rb +59 -0
- data/lib/tones/marks.rb +25 -0
- data/lib/tones/no_tones.rb +16 -0
- data/lib/tones/numbers.rb +24 -0
- data/lib/tones.rb +19 -0
- data/rakefile +39 -0
- data/test/comparison_test.rb +35 -0
- data/test/hanyu_coverage.rb +33 -0
- metadata +74 -0
data/TODO
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
!Core
|
2
|
+
|
3
|
+
- Additional tone systems
|
4
|
+
- Superscript numbers (for wade giles)
|
5
|
+
- IPA tone notation
|
6
|
+
|
7
|
+
- Additional transcription systems
|
8
|
+
- MPS2 (Mandarin Phonetic Symbols II)
|
9
|
+
- Palladiy (To make things interesting)
|
10
|
+
- Gwoyeu Romatzyh
|
11
|
+
- Yale
|
12
|
+
|
13
|
+
- Research some rare pinyin syllables: lo, yo, ê (^e), yai
|
14
|
+
- Add a general README as rdoc start page
|
15
|
+
- Add a README to the data/ directory with info on sources, contents and purposes
|
16
|
+
- More tests
|
17
|
+
- Add remembering of parameters to cgiform example, other examples
|
18
|
+
|
19
|
+
!More
|
20
|
+
The core lib basically does translation on the syllable level. It can handle strings with syllables neatly separated by spaces. Successive layers should make it possible to convert a sentence with punctuation into a different system. It should be possible to write compound words together in Hanyu, and have the syllables separated by dashes when converting to WG. For instance:
|
21
|
+
|
22
|
+
Wǒ de péngyǒu, shì dàifu. => Wǒ te p`éng-yǔ, shih tài-fu.
|
@@ -0,0 +1,24 @@
|
|
1
|
+
#!/usr/bin/ruby -w

# CGI example: reads a text in one transcription system / tone notation from
# the submitted form, converts it to another one and renders template.rhtml
# with the result.

require 'cgi'
require 'erb'

$: << File.dirname(__FILE__)+'/../../lib'
require 'pinyin'

cgi = CGI.new("xhtml1")

# CGI#params maps every field name to an Array of submitted values, so the
# single value of each field is taken with #first.
params = cgi.params

begin
  text = params['pinyin'].first
  unless text.nil? || text.empty?
    # NOTE(review): Reader/Writer are assumed to take a single system name and
    # a single tone name, as Pinyin::Converter does in examples/hello.rb.
    reader = Pinyin::Reader.new(params['from'].first, params['from_tone'].first)
    writer = Pinyin::Writer.new(params['to'].first, params['to_tone'].first)
    @converted = writer << (reader << text)
  end
rescue => e
  # Send exactly one response for a failed conversion and stop. The body is
  # served as plain text so that the echoed input cannot be interpreted as
  # markup by the browser.
  cgi.out("text/plain; charset=utf-8") { "#{e}\n#{params['pinyin'].inspect}" }
  exit
end

# Resolve the template next to this script instead of relying on the
# process' current working directory.
template_path = File.join(File.dirname(__FILE__), 'template.rhtml')

cgi.out("text/html; charset=utf-8") do
  # The template is evaluated with this script's binding, so it can see
  # `params` and `@converted`.
  ERB.new(File.read(template_path)).result(binding)
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
<!doctype html>
<%# Template for the CGI form example. It is evaluated with the binding of the %>
<%# rendering script and reads `params` (field name => Array of values) and %>
<%# `@converted` (the conversion result, when there is one). %>
<%# Every value that originates from the request is HTML-escaped before output. %>
<html>
<head>
<title>Ruby Pinyin CGIForm example</title>
<style type='text/css'>
body {
  font-family: sans-serif;
}

div#wrap {
  width: 40%;
  margin: 0 auto;
}

table {
  width: 100%;
}
div#converted_text {
  border: 1px dotted #000;
}

textarea {
  width: 100%;
  height: 10em;
  margin: 0 auto;
}
</style>
</head>
<body>
<div id='wrap'>
<h2>Pinyin example application</h2>
<h3>Enter some pinyin text and choose your format</h3>
<%# The form wraps the table: a <form> is not allowed directly inside <table>. %>
<form method='post'>
<table>
  <tr>
    <td colspan='2'>
      <textarea name='pinyin'><%= ERB::Util.html_escape(params['pinyin'].first.to_s) %></textarea>
    </td>
  </tr>

  <tr><td>From</td><td>To</td></tr>
  <% Pinyin::Conversions::All.each do |f| %>
  <tr>
    <td><label><input type='radio' name='from' value='<%= ERB::Util.html_escape(f) %>'> <%= ERB::Util.html_escape(f.capitalize) %></label></td>
    <td><label><input type='radio' name='to' value='<%= ERB::Util.html_escape(f) %>'> <%= ERB::Util.html_escape(f.capitalize) %></label></td>
  </tr>
  <% end %>
  <tr><td>From tone</td><td>To tone</td></tr>
  <% Pinyin::Tones::All.each do |f| %>
  <tr>
    <td><label><input type='radio' name='from_tone' value='<%= ERB::Util.html_escape(f) %>'> <%= ERB::Util.html_escape(f.capitalize) %></label></td>
    <td><label><input type='radio' name='to_tone' value='<%= ERB::Util.html_escape(f) %>'> <%= ERB::Util.html_escape(f.capitalize) %></label></td>
  </tr>
  <% end %>
  <tr>
    <td><input type='submit'></td>
    <td> </td>
  </tr>
</table>
</form>
<% if @converted %>
<h2>Converted:</h2>
<div id='converted_text'>
  <%= ERB::Util.html_escape(@converted.to_s) %>
</div>
<% end %>
</div>
</body>
</html>
|
data/examples/hello.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# Minimal usage example: converts one sentence through two converters in a
# row and prints the input together with both results.

$LOAD_PATH << File.join(File.dirname(__FILE__), '../lib')

require 'pinyin'

# NOTE(review): the four arguments look like (source system, source tone
# notation, target system, target tone notation) -- confirm against
# Pinyin::Converter.
hanyu_to_wadegiles = Pinyin::Converter.new(:hanyu, :numbers, :wadegiles, :accents)
wadegiles_to_zhuyin = Pinyin::Converter.new(:wadegiles, :accents, :zhuyin, :marks)

source_text = 'wo3 de2 peng2 you3 shi4 dai4 fu'

# The output of the first converter is fed straight into the second one.
wadegiles_text = hanyu_to_wadegiles << source_text
zhuyin_text = wadegiles_to_zhuyin << wadegiles_text

[source_text, wadegiles_text, zhuyin_text].each { |line| puts line }
|
data/lib/conversions.rb
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'yaml'
|
3
|
+
|
4
|
+
module Pinyin
  # Data-driven conversion between the written forms of a syllable in the
  # supported transcription systems.
  #
  # At load time this module reads initial.csv and final.csv from DATA_DIR
  # and adds, for every row, an accessor named after the first column to
  # Pinyin::Initial / Pinyin::Final, filling it in on the objects in the
  # class' All list. It also loads the substitution rules from rules.yaml.
  module Conversions
    # Names taken from the first column of the CSV files, without duplicates
    # and without the rows whose name matches /name|standalone/i.
    All=[]

    DATA_DIR=File.dirname(__FILE__)+'/data/'

    # Load various representations for initials and finals
    %w(Initial Final).each do |c|
      klazz = Pinyin.const_get(c)
      begin
        # CSV.foreach closes the file once iteration is done.
        CSV.foreach(DATA_DIR + c.downcase + '.csv') do |name, *values|
          # Explicit parentheses are required here: `All.index name || ...`
          # is parsed as `All.index(name || ...)`, which disables the filter.
          All << name unless All.include?(name) || name =~ /name|standalone/i
          klazz.class_eval { attr_accessor name.to_sym }
          # The i-th value of a row belongs to the i-th entry of klazz::All.
          values.each_with_index do |v, i|
            klazz::All[i].send(name + '=', v)
          end
        end
      rescue => e
        # Interpolate the exception: String#+ with an exception object raises
        # a TypeError on Ruby >= 1.9 and would hide the real problem.
        puts "Bad data in #{c.downcase}.csv : #{e}"
        raise
      end
    end

    # Substitution rules: per type, a list of entries with a 'match' pattern
    # and a 'subst' replacement (see apply_rules).
    @@rules=YAML::load(IO.read(DATA_DIR+'rules.yaml'))

    # Finds the syllable whose written form in +type+ equals +string+.
    #
    # type   - name of the representation; used as an accessor name on
    #          Initial and Final objects.
    # string - written form of a single syllable, without tone.
    #
    # Returns a TonelessSyllable, or nil when no combination matches.
    def self.parse(type, string)
      standalone_reader = "#{type}_standalone"

      # Finals that have a dedicated standalone spelling take precedence.
      lone_final = Final::All.find do |f|
        f.respond_to?(standalone_reader) && f.send(standalone_reader) == string
      end
      return TonelessSyllable.new(Initial::Empty, lone_final) if lone_final

      # Otherwise try every legal initial/final pair and compare its
      # rule-adjusted spelling with the input.
      Initial::All.each do |ini|
        Final::All.each do |fin|
          next if TonelessSyllable.illegal?(ini, fin)
          candidate = apply_rules(type, (ini.send(type) || '') + (fin.send(type) || ''))
          return TonelessSyllable.new(ini, fin) if candidate == string
        end
      end
      nil
    end

    # Builds the written form of +tsyll+ in the representation +type+.
    #
    # type  - name of the representation (anything responding to to_s).
    # tsyll - object responding to #initial and #final.
    #
    # Returns the spelling as a String.
    def self.unparse(type, tsyll)
      initial = tsyll.initial.send(type)
      return apply_rules(type, initial + (tsyll.final.send(type) || '')) if initial

      # No written initial: prefer the final's standalone spelling if any.
      standalone_reader = "#{type}_standalone"
      standalone = tsyll.final.send(standalone_reader) if tsyll.final.respond_to?(standalone_reader)
      standalone || apply_rules(type, tsyll.final.send(type))
    end

    # Splits +str+ into tokens separated by spaces or apostrophes.
    #
    # Runs of separators, as well as leading separators, are skipped; an
    # empty token used to end the scan and silently drop the remaining text.
    #
    # Returns an Array of [token, offset] pairs, where token has surrounding
    # whitespace stripped and offset is where the unstripped token starts in
    # +str+. Separators are included in the offset.
    # NOTE(review): offsets previously ignored separator characters; this
    # assumes callers use them as positions in the input -- confirm.
    def self.tokenize(str)
      tokens = []
      str.scan(/[^' ]+/) do |token|
        tokens << [token.strip, Regexp.last_match.begin(0)]
      end
      tokens
    end

    # NOTE: a bare `private` does not apply to methods defined with
    # `def self.`, so apply_rules has always been publicly callable; it is
    # left that way on purpose to keep existing callers working.

    # Applies the substitution rules registered for +type+ to a copy of
    # +string+, in the order in which they are listed.
    #
    # Returns the resulting String; +string+ itself is not modified.
    def self.apply_rules(type, string)
      result = string.dup
      (@@rules[type] || []).each do |rule|
        result.gsub!(Regexp.new(rule['match']), rule['subst'])
      end
      result
    end
  end
end
|