pinyin 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/TODO +22 -0
- data/examples/cgiform/cgiform.rb +24 -0
- data/examples/cgiform/template.rhtml +69 -0
- data/examples/hello.rb +12 -0
- data/lib/conversions.rb +74 -0
- data/lib/data/comparison.csv +410 -0
- data/lib/data/final.csv +10 -0
- data/lib/data/initial.csv +7 -0
- data/lib/data/paladiy.txt +421 -0
- data/lib/data/rules.yaml +24 -0
- data/lib/data/valid_pinyin.yaml +455 -0
- data/lib/exception.rb +14 -0
- data/lib/groundwork.rb +148 -0
- data/lib/pinyin.rb +71 -0
- data/lib/support.rb +16 -0
- data/lib/tones/accents.rb +59 -0
- data/lib/tones/marks.rb +25 -0
- data/lib/tones/no_tones.rb +16 -0
- data/lib/tones/numbers.rb +24 -0
- data/lib/tones.rb +19 -0
- data/rakefile +39 -0
- data/test/comparison_test.rb +35 -0
- data/test/hanyu_coverage.rb +33 -0
- metadata +74 -0
data/TODO
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
!Core
|
2
|
+
|
3
|
+
- Additional tone systems
|
4
|
+
- Superscript numbers (for wade giles)
|
5
|
+
- IPA tone notation
|
6
|
+
|
7
|
+
- Additional transcription systems
|
8
|
+
- MPS II (Mandarin Phonetic Symbols II)
|
9
|
+
- Palladiy (To make things interesting)
|
10
|
+
- Gwoyeu Romatzyh
|
11
|
+
- Yale
|
12
|
+
|
13
|
+
- Research some rare pinyin syllables: lo, yo, ê, yai
|
14
|
+
- Add a general README as rdoc start page
|
15
|
+
- Add a README to the data/ directory with info on sources, contents and purposes
|
16
|
+
- More tests
|
17
|
+
- Add remembering of parameters to cgiform example, other examples
|
18
|
+
|
19
|
+
!More
|
20
|
+
The core lib basically does translation on the syllable level. It can handle strings with syllables neatly separated by spaces. Successive layers should make it possible to convert a sentence with punctuation into a different system. It should be possible to write compound words together in Hanyu, and have the syllables separated by dashes when converting to Wade-Giles (WG). For instance:
|
21
|
+
|
22
|
+
Wǒ de péngyǒu, shì dàifu. => Wǒ te p`éng-yǔ, shih tài-fu.
|
@@ -0,0 +1,24 @@
|
|
1
|
+
#!/usr/bin/ruby -w

# CGI front end for the Pinyin library: reads the submitted text together with
# the chosen source/target systems and tone notations, converts the text, and
# renders template.rhtml (which reads +params+ and +@converted+ via +binding+).

require 'cgi'
require 'erb'

$LOAD_PATH << File.join(File.dirname(__FILE__), '..', '..', 'lib')
require 'pinyin'

cgi = CGI.new("xhtml1")
params = cgi.params

# CGI#params maps every field name to an Array of submitted values, so each
# field is reduced to its first value (nil when the field was not submitted)
# instead of handing the whole Array on.
# NOTE(review): assumes Pinyin::Reader/Writer accept system and tone names as
# Strings -- confirm against lib/pinyin.rb.
field = lambda { |key| params[key].first }

@converted = nil
failure = nil
text = field.call('pinyin')

begin
  # Nothing to convert on the initial GET or when the textarea was left empty.
  unless text.nil? || text.empty?
    writer = Pinyin::Writer.new(field.call('to'), field.call('to_tone'))
    reader = Pinyin::Reader.new(field.call('from'), field.call('from_tone'))
    @converted = writer << (reader << text)
  end
rescue StandardError => e
  failure = e
end

if failure
  # Emit exactly one response per request. The diagnostics go out as plain
  # text so that the echoed input cannot be interpreted as markup.
  cgi.out("text/plain; charset=utf-8") do
    "#{failure}\n#{params['pinyin'].inspect}"
  end
else
  # Resolve the template next to this script; the web server's working
  # directory is not guaranteed to be the script directory.
  template = File.join(File.dirname(__FILE__), 'template.rhtml')
  cgi.out("text/html; charset=utf-8") do
    ERB.new(IO.read(template)).result(binding)
  end
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
<!doctype html>
<%# Form for the cgiform.rb example. Rendered with the caller's binding: reads the local params (field name to Array of values) and @converted. %>
<%# Every value echoed back into the page is passed through ERB::Util.html_escape. %>
<html>
  <head>
    <title>Ruby Pinyin CGIForm example</title>
    <style type='text/css'>
      body {
        font-family: sans-serif;
      }

      div#wrap {
        width: 40%;
        margin: 0 auto;
      }

      table {
        width: 100%;
      }
      div#converted_text {
        border: 1px dotted #000;
      }

      textarea {
        width: 100%;
        height: 10em;
        margin: 0 auto;
      }
    </style>
  </head>
  <body>
    <div id='wrap'>
      <h2>Pinyin example application</h2>
      <h3>Enter some pinyin text and choose your format</h3>
      <%# The form wraps the table: a form element is not a valid direct child of a table. %>
      <form method='post'>
        <table>
          <tr>
            <td colspan='2'>
              <%# html_escape turns nil (no submission yet) into an empty string. %>
              <textarea name='pinyin'><%= ERB::Util.html_escape(params['pinyin'].first) %></textarea>
            </td>
          </tr>

          <tr><td>From</td><td>To</td></tr>
          <% Pinyin::Conversions::All.each do |f| %>
          <tr>
            <td><label><input type='radio' name='from' value='<%= ERB::Util.html_escape(f) %>'> <%= ERB::Util.html_escape(f.capitalize) %></label></td>
            <td><label><input type='radio' name='to' value='<%= ERB::Util.html_escape(f) %>'> <%= ERB::Util.html_escape(f.capitalize) %></label></td>
          </tr>
          <% end %>
          <tr><td>From tone</td><td>To tone</td></tr>
          <% Pinyin::Tones::All.each do |f| %>
          <tr>
            <td><label><input type='radio' name='from_tone' value='<%= ERB::Util.html_escape(f) %>'> <%= ERB::Util.html_escape(f.capitalize) %></label></td>
            <td><label><input type='radio' name='to_tone' value='<%= ERB::Util.html_escape(f) %>'> <%= ERB::Util.html_escape(f.capitalize) %></label></td>
          </tr>
          <% end %>
          <tr>
            <td><input type='submit'></td>
            <td>&nbsp;</td>
          </tr>
        </table>
      </form>
      <% if @converted %>
      <h2>Converted:</h2>
      <div id='converted_text'>
        <%= ERB::Util.html_escape(@converted) %>
      </div>
      <% end %>
    </div>
  </body>
</html>
|
data/examples/hello.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# Minimal usage example: convert one sentence through two chained converters
# and print every intermediate form.
$LOAD_PATH << File.join(File.dirname(__FILE__), '../lib')

require 'pinyin'

hanyu_text = 'wo3 de2 peng2 you3 shi4 dai4 fu'

# NOTE(review): the four arguments look like (source system, source tone
# notation, target system, target tone notation) -- confirm in lib/pinyin.rb.
to_wadegiles = Pinyin::Converter.new(:hanyu, :numbers, :wadegiles, :accents)
to_zhuyin    = Pinyin::Converter.new(:wadegiles, :accents, :zhuyin, :marks)

# The output of the first converter is the input of the second.
wadegiles_text = to_wadegiles << hanyu_text
zhuyin_text    = to_zhuyin << wadegiles_text

puts hanyu_text, wadegiles_text, zhuyin_text
|
data/lib/conversions.rb
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'yaml'
|
3
|
+
|
4
|
+
module Pinyin
  # Table-driven conversion of toneless syllables between transcription
  # systems. On load, initial.csv and final.csv under DATA_DIR are read and
  # one accessor per CSV row name is added to Pinyin::Initial / Pinyin::Final;
  # the per-system rewrite rules come from rules.yaml.
  module Conversions
    # Names of the transcription systems found in the data files, in order of
    # first appearance. Filled while the CSV files are read below.
    All = []

    DATA_DIR = File.dirname(__FILE__) + '/data/'

    # Load the various representations for initials and finals.
    # Each CSV row is "<name>,<value for All[0]>,<value for All[1]>,...".
    %w(Initial Final).each do |c|
      klazz = Pinyin.const_get c
      csv_name = c.downcase + '.csv'
      begin
        # CSV.foreach closes the file once the block has run for every row.
        CSV.foreach(DATA_DIR + csv_name) do |name, *values|
          # Explicit parentheses matter: without them the whole `a || b`
          # expression becomes the argument of All.index and the
          # name/standalone filter is never evaluated.
          All << name unless All.index(name) || name =~ /name|standalone/i
          klazz.class_eval { attr_accessor name.to_sym }
          values.each_with_index do |v, i|
            klazz::All[i].send("#{name}=", v)
          end
        end
      rescue
        # Interpolate the exception; String + Exception raises TypeError and
        # would hide the original error. Reported on stderr, then re-raised.
        warn "Bad data in #{csv_name} : #{$!}"
        raise
      end
    end

    # Substitution rules, looked up by transcription system in apply_rules.
    @@rules = YAML::load(IO.read(DATA_DIR + 'rules.yaml'))

    # Finds the syllable whose representation in system +type+ equals +string+.
    #
    # A final whose "<type>_standalone" form equals +string+ wins and is paired
    # with Initial::Empty. Otherwise every initial/final combination not
    # rejected by TonelessSyllable.illegal? is rendered (with the rules
    # applied) and compared with +string+.
    #
    # Returns a TonelessSyllable, or nil when nothing matches.
    def self.parse(type, string)
      standalone_reader = "#{type}_standalone"
      lone_final = Final::All.find do |f|
        f.respond_to?(standalone_reader) && f.send(standalone_reader) == string
      end
      return TonelessSyllable.new(Initial::Empty, lone_final) if lone_final

      Initial::All.find do |ini|
        Final::All.find do |candidate|
          next if TonelessSyllable.illegal?(ini, candidate)
          rendered = apply_rules(type, (ini.send(type) || '') + (candidate.send(type) || ''))
          # Leaves parse itself, not just the enclosing block.
          return TonelessSyllable.new(ini, candidate) if rendered == string
        end
      end
    end

    # Renders the toneless syllable +tsyll+ in system +type+.
    #
    # When the initial has a representation, initial and final are joined and
    # the rules applied. Otherwise the final's "<type>_standalone" form is
    # returned untouched if there is one, else the rules are applied to the
    # plain final.
    def self.unparse(type, tsyll)
      initial_part = tsyll.initial.send(type)
      if initial_part
        return apply_rules(type, initial_part + (tsyll.final.send(type) || ''))
      end

      standalone_reader = type.to_s + '_standalone'
      standalone = tsyll.final.respond_to?(standalone_reader) && tsyll.final.send(standalone_reader)
      return standalone if standalone

      apply_rules(type, tsyll.final.send(type))
    end

    # Splits +str+ on single spaces/apostrophes and returns an array of
    # [token, offset] pairs. +str+ itself is not modified.
    #
    # NOTE(review): the offset only sums the lengths of the preceding tokens;
    # the separators are not counted, so it drifts from the real index in
    # +str+. Scanning also stops at the first empty token (leading or repeated
    # separator). Left unchanged because callers are not visible here.
    def self.tokenize(str)
      returning [] do |ary|
        rest, pos = str.dup, 0
        while (s = rest.slice!(/[^' ]*/)) && s != ""
          ary << [s.strip, pos]
          pos += s.length
          rest.slice!(/[' ]/)
        end
      end
    end

    # NOTE(review): a bare `private` does not affect `def self.` methods, so
    # apply_rules stays publicly callable. Kept as is; switching to
    # private_class_method could break callers in other files.
    private

    # Returns a copy of +string+ after applying, in order, every
    # match/subst rule registered for +type+; a plain copy when there are none.
    # `returning` is presumably the project's helper that yields its argument
    # and then returns it -- confirm in support.rb.
    def self.apply_rules(type, string)
      returning string.dup do |s|
        @@rules[type] && @@rules[type].each do |rule|
          s.gsub!(Regexp.new(rule['match']), rule['subst'])
        end
      end
    end
  end
end
|