RubyGems - arnebrasseur-pinyin - Versions diffs - 0.1.5 - Mend

arnebrasseur-pinyin 0.1.5

Files changed (31) hide show

data/History.txt +12 -0
data/Manifest.txt +31 -0
data/README.txt +50 -0
data/TODO +23 -0
data/examples/cgiform/cgiform.rb +24 -0
data/examples/cgiform/template.rhtml +69 -0
data/examples/hello.rb +12 -0
data/lib/pinyin.rb +90 -0
data/lib/pinyin/conversion.rb +51 -0
data/lib/pinyin/conversions.rb +75 -0
data/lib/pinyin/conversions/hanyu.rb +77 -0
data/lib/pinyin/data/comparison.csv +410 -0
data/lib/pinyin/data/final.csv +10 -0
data/lib/pinyin/data/initial.csv +7 -0
data/lib/pinyin/data/paladiy.txt +421 -0
data/lib/pinyin/data/rules.yaml +24 -0
data/lib/pinyin/data/valid_pinyin.yaml +454 -0
data/lib/pinyin/exception.rb +14 -0
data/lib/pinyin/groundwork.rb +183 -0
data/lib/pinyin/string.rb +16 -0
data/lib/pinyin/support.rb +12 -0
data/lib/pinyin/tones.rb +47 -0
data/lib/pinyin/tones/accents.rb +62 -0
data/lib/pinyin/tones/marks.rb +30 -0
data/lib/pinyin/tones/no_tones.rb +6 -0
data/lib/pinyin/tones/numbers.rb +25 -0
data/rakefile +22 -0
data/script/update +4 -0
data/test/test_comparison.rb +35 -0
data/test/test_hanyu_coverage.rb +35 -0
metadata +102 -0

data/History.txt ADDED

@@ -0,0 +1,12 @@
+== 0.1.3 / 2008-07-18
+* made compatible with the latest release of Facets
+== 0.1.0 / 2007-12-14
+* Converted to Hoe, bugfixes
+== 0.0.1 / 2007-07-26
+* Birthday!

data/Manifest.txt ADDED

@@ -0,0 +1,31 @@
+History.txt
+Manifest.txt
+README.txt
+rakefile
+TODO
+examples/cgiform/cgiform.rb
+examples/cgiform/template.rhtml
+examples/hello.rb
+lib/pinyin.rb
+lib/pinyin/conversion.rb
+lib/pinyin/conversions.rb
+lib/pinyin/conversions/hanyu.rb
+lib/pinyin/data/comparison.csv
+lib/pinyin/data/final.csv
+lib/pinyin/data/initial.csv
+lib/pinyin/data/paladiy.txt
+lib/pinyin/data/rules.yaml
+lib/pinyin/data/valid_pinyin.yaml
+lib/pinyin/exception.rb
+lib/pinyin/groundwork.rb
+lib/pinyin/string.rb
+lib/pinyin/support.rb
+lib/pinyin/tones.rb
+lib/pinyin/tones/accents.rb
+lib/pinyin/tones/marks.rb
+lib/pinyin/tones/no_tones.rb
+lib/pinyin/tones/numbers.rb
+rakefile
+script/update
+test/test_comparison.rb
+test/test_hanyu_coverage.rb

data/README.txt ADDED

@@ -0,0 +1,50 @@
+pinyin
+    by Arne Brasseur
+== DESCRIPTION:
+Pinyin can convert between various systems for phonetically
+writing Mandarin Chinese. It can also handle various representations
+of tones, so it can be used to convert pinyin with tone numbers
+to pinyin with tone marks.
+Supported formats include Hanyu Pinyin, Bopomofo, Wade-Giles
+and International Phonetic Alphabet (IPA).
+== FEATURES/PROBLEMS:
+== SYNOPSIS:
+   require 'pinyin'
+   reader = Pinyin::Reader.new(:hanyu, :numbers)
+   reader << "wo3 ai4 ni3"
+    # => [<Pinyin::Syllable <initial=Empty, final=Uo, tone=3>>,
+    #     <Pinyin::Syllable <initial=Empty, final=Ai, tone=4>>,
+    #     <Pinyin::Syllable <initial=Ne, final=I, tone=3>>]
+   writer = Pinyin::Writer.new(:zhuyin, :marks)
+   writer << (reader << "wo3 ai4 ni3")
+   # => "ㄨㄛˇ ㄞˋ ㄋㄧˇ"
+   require 'pinyin/string'
+   "wo3 ai4 ni3".pretty_tones
+   # => "wǒ ài nǐ"
+== REQUIREMENTS:
+* $KCODE has to be set to "UTF8" for everything to work correctly
+* Facets
+== INSTALL:
+* gem install pinyin
+== LICENSE:
+Copyright (c) 2004-2007, Arne Brasseur. (http://www.arnebrasseur.net)
+Available as Free Software under the GPLv3 License, see LICENSE.txt for
+details

data/TODO ADDED

@@ -0,0 +1,23 @@
+!Core
+- Additional tone systems
+  - Superscript numbers (for wade giles)
+  - IPA tone notation
+- Additional transcription systems
+  - MPS2 (or whatever it is called)
+  - Palladiy (To make things interesting)
+  - Gwoyueh
+  - Yale
+- Research some rare pinyin syllables : lo, yo ^e, yai
+- Get a definitive answer about ong/ueng/weng
+- Add a general README as rdoc start page
+- Add a README to the data/ directory with info on sources, contents and purposes
+- More tests
+- Add remembering of parameters to cgiform example, other examples
+!More
+The core lib basically does translation on the syllable level. It can handle strings with syllables nicely separated by spaces. Successive layers should make it possible to convert a sentence with punctuation into a different system. It should be possible to write compound words together in Hanyu, and have the syllables separated by dashes when converting to WG. For instance:
+Wǒ de péngyǒu, shì dàifu.  =>  Wǒ te p`éng-yǔ, shih tài-fu.

data/examples/cgiform/cgiform.rb ADDED

@@ -0,0 +1,24 @@
+#!/usr/bin/ruby -w
+require 'cgi'
+require 'erb'
+$: << File.dirname(__FILE__)+'/../../lib'
+require 'pinyin'
+cgi=CGI.new("xhtml1")
+params=cgi.params
+begin
+  if params['pinyin'] && params['pinyin'] != '' && params['pinyin'] != []
+    @converted = Pinyin::Writer.new(params['to'], params['to_tone']) << (Pinyin::Reader.new(params['from'],params['from_tone']) << params['pinyin'].first)
+  end
+rescue
+  cgi.out{$!.to_s}
+  cgi.out{params['pinyin'].inspect}
+end
+cgi.out("text/html; charset=utf-8") do
+  ERB.new(IO.read('template.rhtml')).result(binding)
+end

data/examples/cgiform/template.rhtml ADDED

@@ -0,0 +1,69 @@
+<!doctype html>
+<html>
+  <head>
+    <title>Ruby Pinyin CGIForm example</title>
+    <style type='text/css'>
+      body {
+        font-family: sans-serif;
+      }
+      div#wrap {
+        width: 40%;
+        margin: 0 auto;
+      }
+      table {
+        width: 100%;
+      }
+      div#converted_text {
+        border: 1px dotted #000;
+      }
+      textarea {
+        width: 100%;
+        height: 10em;
+        margin: 0 auto;
+      }
+    </style>
+  </head>
+  <body>
+    <%# Rendered by cgiform.rb, which supplies params (the CGI form fields) and @converted. %>
+    <div id='wrap'>
+    <h2>Pinyin example application</h2>
+    <h3>Enter some pinyin text and choose your format</h3>
+    <%# The form wraps the table: a form element may not sit directly inside a table. %>
+    <form method='post'>
+    <table>
+        <tr>
+          <td colspan='2'>
+            <%# Escape the echoed input so submitted markup cannot break out of the textarea. %>
+            <textarea name='pinyin'><%= CGI.escapeHTML(params['pinyin'].first.to_s) %></textarea>
+          </td>
+        </tr>
+        <tr><td>From</td><td>To</td></tr>
+        <% Pinyin::Conversions::All.each do |f|%>
+          <tr>
+            <td><label><input type='radio' name='from' value='<%=f%>'> <%=f.capitalize%></label></td>
+            <td><label><input type='radio' name='to' value='<%=f%>'> <%=f.capitalize%></label></td>
+          </tr>
+        <% end %>
+        <tr><td>From tone</td><td>To tone</td></tr>
+        <% Pinyin::Tones::All.each do |f|%>
+          <tr>
+            <td><label><input type='radio' name='from_tone' value='<%=f%>'> <%=f.capitalize%></label></td>
+            <td><label><input type='radio' name='to_tone' value='<%=f%>'> <%=f.capitalize%></label></td>
+          </tr>
+        <% end %>
+        <tr>
+          <td><input type='submit'></td>
+          <td>&nbsp;</td>
+        </tr>
+    </table>
+    </form>
+    <% if @converted %>
+      <h2>Converted:</h2>
+      <div id='converted_text'>
+        <%# The converted text is derived from user input, so escape it as well. %>
+        <%= CGI.escapeHTML(@converted.to_s) %>
+      </div>
+    <% end %>
+    </div>
+  </body>
+</html>

data/examples/hello.rb ADDED

@@ -0,0 +1,12 @@
+$: << File.join(File.dirname(__FILE__), '../lib')
+require 'pinyin'
+conv1 = Pinyin::Converter.new(:hanyu, :numbers, :wadegiles, :accents)
+conv2 = Pinyin::Converter.new(:wadegiles, :accents, :zhuyin, :marks)
+pinyin    = 'wo3 de2 peng2 you3 shi4 dai4 fu'
+wadegiles = conv1 << pinyin
+zhuyin    = conv2 << wadegiles
+puts pinyin, wadegiles, zhuyin

data/lib/pinyin.rb ADDED

@@ -0,0 +1,90 @@
+# Handle several romanization systems for Mandarin Chinese
+#
+# Author::     Arne Brasseur (pinyin@arnebrasseur.net)
+# Copyright::  Copyright (c) 2007, Arne Brasseur
+# Licence::    GNU General Public License, latest version
+$: << File.dirname(__FILE__)
+require "facets/string/camelcase"
+require 'pinyin/support'
+require 'pinyin/groundwork'
+require 'pinyin/exception'
+require 'pinyin/tones'
+require 'pinyin/conversion'
+require 'pinyin/conversions'
+require 'pinyin/conversions/hanyu'
+module Pinyin
+  VERSION = "0.1.5"
+  class Reader
+    def initialize(conv, tone)
+      @conv = conv.to_s
+      @tone = Tones.const_get tone.to_s.camelcase
+    end
+    def parse(str)
+      Conversions.tokenize(str).map do |s, pos|
+        tone,syll = @tone.pop_tone(s)
+        tsyll = Conversions.parse(@conv,syll)
+        ini, fin = tsyll.initial, tsyll.final
+        unless tone && fin && ini
+          raise ParseError.new(s,pos),"Illegal syllable <#{s}> in input <#{str}> at position #{pos}."
+        end
+        Syllable.new(ini, fin, tone)
+      end
+    rescue Object => e
+      raise ParseError.new(str,0), "Parsing of #{str.inspect} failed : #{e}"
+    end
+    alias :<< :parse
+  end
+  class Writer
+    def initialize(conv, tone)
+      @conv = conv.to_s
+      @tone = Tones.const_get tone.to_s.camelcase
+    end
+    def unparse(py)
+      conv=lambda {|syll| @tone.add_tone(Conversions.unparse(@conv,syll),syll.tone)}
+      if py.respond_to? :map
+        py.map(&conv).join(' ')
+      else
+        conv.call(py)
+      end
+    end
+    alias :<< :unparse
+  end
+  class Converter
+    def initialize(from, from_tone, to, to_tone)
+      @reader = Reader.new(from, from_tone)
+      @writer = Writer.new(to, to_tone)
+    end
+    def convert(str)
+      @writer.unparse @reader.parse(str)
+    end
+    alias :<< :convert
+  end
+  class <<self
+    Conversions::All.each do |c|
+      define_method "#{c.to_s.camelcase}Reader" do |tone|
+        Reader.new(c, tone)
+      end
+      define_method "#{c.to_s.camelcase}Writer" do |tone|
+        Writer.new(c, tone)
+      end
+    end
+  end
+end

data/lib/pinyin/conversion.rb ADDED

@@ -0,0 +1,51 @@
+module Pinyin
+  #
+  # Base class for conversions like Hanyu pinyin,
+  # Wade-Giles, etc.
+  #
+  class Conversion
+    # Separator between syllables in the same word
+    # For Wade-Giles this is a dash, Hanyu pinyin
+    # uses a single quote in certain situations
+    attr_reader :syllable_separator
+    # The tone handling object
+    attr_reader :tones
+    # An optional lambda that preprocesses input
+    attr_reader :preprocessor
+    # The name of this conversion, the same name used
+    # in the data file and that is also available as
+    # a method name on Initial and Final objects.
+    #
+    # By default the underscorized class name
+    attr_reader :name
+    def initialize(tone = :numbers, options = {})
+      @preprocessor = options[:preprocessor] || lambda {|s| s}
+      if Tone === tone
+        @tone = tone
+      else
+        @tone = Pinyin::Tones.const_get(tone.to_s.camelcase)
+      end
+      @name = self.class.name.underscore
+    end
+    # Converts a string into an array of strings and
+    # syllable objects.
+    def parse(string)
+    end
+    # Converts an array of strings and syllable objects
+    # into a string
+    def unparse(array)
+    end
+  end
+end

data/lib/pinyin/conversions.rb ADDED

@@ -0,0 +1,75 @@
+require 'csv'
+require 'yaml'
+module Pinyin
+  module Conversions
+    All=[]
+    DATA_DIR=File.dirname(__FILE__)+'/data/'
+    #Load various representations for initials and finals
+    %w(Initial Final).each do |c|
+      klazz=Pinyin.const_get c
+      begin
+        CSV.open(DATA_DIR+c.downcase+'.csv', 'r').each do |name, *values|
+          All << name.to_s unless All.index name || name =~ /name|standalone/i
+          klazz.class_eval {attr_accessor name.to_sym}
+          values.each_with_index do |v,i|
+            klazz::All[i].send(name+'=', v)
+          end
+        end
+      rescue
+        puts "Bad data in #{c.downcase}.csv : " + $!
+        raise
+      end
+    end
+    #Substitution rules
+    @@rules=YAML::load(IO.read(DATA_DIR+'rules.yaml'))
+    def self.parse(type, string)
+      if (fin = Final::All.find {|f| f.respond_to?("#{type}_standalone") && f.send("#{type}_standalone") == string})
+        TonelessSyllable.new(Initial::Empty, fin)
+      else
+        Initial::All.find do |ini|
+          Final::All.find do |fin|
+            next                                  if TonelessSyllable.illegal?(ini,fin)
+            return TonelessSyllable.new(ini,fin)  if apply_rules(type, (ini.send(type)||'') + (fin.send(type)||'')) == string
+          end
+        end
+      end
+    end
+    def self.unparse(type, tsyll)
+      if tsyll.initial.send(type)
+        apply_rules(type, tsyll.initial.send(type) + (tsyll.final.send(type) || ''))
+      elsif tsyll.final.respond_to?(type.to_s+'_standalone') && standalone = tsyll.final.send(type.to_s+'_standalone')
+        standalone
+      else
+        apply_rules(type, tsyll.final.send(type))
+      end
+    end
+    def self.tokenize(str)
+      returning [] do |ary|
+        str,pos = str.dup, 0
+        while s=str.slice!(/[^' ]*/) and s != ""
+          ary << [s.strip, pos]
+          pos+=s.length
+          str.slice!(/[' ]/)
+        end
+      end
+    end
+    private
+      def self.apply_rules(type, string)
+        returning string.dup do |s|
+          @@rules[type] && @@rules[type].each do |rule|
+            s.gsub!(Regexp.new(rule['match']),rule['subst'])
+          end
+        end
+      end
+  end
+end