hanzi 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/hanzi.rb ADDED
@@ -0,0 +1,81 @@
1
+ # encoding: utf-8
2
+
3
# Converts Chinese text (hanzi) into tone-numbered pinyin by looking words up
# in a CC-CEDICT dictionary file.
#
# Everything is exposed as class-level methods; the dictionary is loaded
# lazily on the first conversion and then kept in memory.
class Hanzi
  # Longest word, in characters, that is tried at each position of the text.
  MAX_WORD_LENGTH = 4

  # One dictionary line in CC-CEDICT format:
  #   Traditional Simplified [pin1 yin1] /English equivalent 1/equivalent 2/
  # Captures, in order: traditional, simplified, pinyin, and everything
  # between the first and the last slash.
  ENTRY_PATTERN = %r{\A(\S+)\s+(\S+)\s+\[([^\]]*)\]\s*/(.*)/}

  class << self
    # The loaded dictionary: an Array of Hashes with the keys :traditional,
    # :simplified, :pinyin and :english. nil until load_data has run (or
    # until a caller assigns its own entries).
    attr_accessor :data

    # Reads the dictionary file into +data+. Does nothing when +data+ is
    # already set.
    #
    # Lines starting with '#' are skipped, as are lines that do not match
    # ENTRY_PATTERN. +data+ is only assigned once the whole file has been
    # read, so a failed read does not leave a partial dictionary behind.
    #
    # Raises the usual SystemCallError (e.g. Errno::ENOENT) when the file
    # cannot be opened.
    def load_data
      return if @data

      file_path = File.expand_path('../../lib/data/cedict_ts.u8', __FILE__)
      entries = []
      # File.foreach closes the file when the block finishes; the explicit
      # encoding keeps parsing independent of the process locale.
      File.foreach(file_path, encoding: 'utf-8') do |line|
        next if line.start_with?('#')

        entry = parse_entry(line)
        entries << entry if entry
      end
      @data = entries
    end

    # Returns +text+ with every dictionary word replaced by its pinyin
    # (syllables joined without spaces).
    #
    # Characters outside U+4E00..U+9FFF, and characters inside that range
    # with no dictionary entry, are copied through unchanged. At each
    # position the longest word (up to MAX_WORD_LENGTH characters) that has
    # an entry wins.
    #
    # text    - the String to convert.
    # options - accepted for compatibility; currently unused.
    def to_pinyin(text, options={})
      load_data if @data.nil?

      result = ''.dup
      pos = 0
      while pos < text.length
        char = text[pos]
        # Both locals are reassigned on every pass so that a match from an
        # earlier iteration can never leak into this one.
        entry, length = chinese?(char) ? longest_match(text, pos) : nil

        if entry
          result << entry[:pinyin].delete(' ')
          pos += length
        else
          result << char
          pos += 1
        end
      end

      result
    end

    private

    # Returns the first entry in +data+ whose simplified or traditional form
    # equals +text+, or nil when there is none.
    def find_match(text)
      word_index[text]
    end

    # Turns one dictionary line into an entry Hash, or nil when the line is
    # not in the expected format.
    def parse_entry(line)
      fields = ENTRY_PATTERN.match(line)
      return nil unless fields

      {
        :traditional => fields[1],
        :simplified  => fields[2],
        :pinyin      => fields[3].downcase,
        :english     => fields[4]
      }
    end

    # True when +char+ lies in the U+4E00..U+9FFF code point range.
    def chinese?(char)
      code = char.ord
      code >= 0x4E00 && code <= 0x9FFF
    end

    # Finds the longest dictionary word that starts at +pos+.
    # Returns [entry, length_in_characters], or nil when nothing matches.
    def longest_match(text, pos)
      # Never ask for more characters than are left; otherwise the same
      # truncated candidate would be looked up several times.
      longest = [MAX_WORD_LENGTH, text.length - pos].min
      longest.downto(1) do |length|
        entry = find_match(text[pos, length])
        return [entry, length] if entry
      end
      nil
    end

    # Word => entry lookup table over +data+, so that each lookup is a hash
    # access instead of a scan over every entry. It is rebuilt whenever
    # +data+ has been replaced or has changed size since the last build.
    def word_index
      unless @word_index && @indexed_data.equal?(@data) && @indexed_size == @data.size
        @word_index = build_word_index(@data)
        @indexed_data = @data
        @indexed_size = @data.size
      end
      @word_index
    end

    # Builds the lookup table. `||=` keeps the earliest entry for each word,
    # which is the entry a front-to-back scan of +entries+ would return.
    def build_word_index(entries)
      entries.each_with_object({}) do |entry, index|
        index[entry[:simplified]] ||= entry
        index[entry[:traditional]] ||= entry
      end
    end
  end
end
data/test/helper.rb ADDED
@@ -0,0 +1,17 @@
1
# Shared test bootstrap: activates the bundled gems, loads test/unit and puts
# both lib/ and this directory on the load path before requiring the library.
require 'rubygems'
require 'bundler'

begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  # Report the problem, then exit with the status code Bundler supplies.
  $stderr.puts bundler_error.message, "Run `bundle install` to install missing gems"
  exit bundler_error.status_code
end

require 'test/unit'

# A single unshift with two arguments leaves them at the front in the order
# given: this directory first, then lib/.
test_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(test_dir, File.join(File.dirname(__FILE__), '..', 'lib'))
require 'hanzi'

# Reopened with an empty body; nothing is added here.
class Test::Unit::TestCase
end
data/test/test_hanzi.rb ADDED
@@ -0,0 +1,37 @@
1
+ # encoding: utf-8
2
+
3
+ require 'helper'
4
+
5
# Tests for Hanzi: dictionary loading and hanzi-to-pinyin conversion.
class TestHanzi < Test::Unit::TestCase
  # Loading must leave at least one entry in Hanzi.data.
  def test_should_init_data
    Hanzi.load_data
    assert Hanzi.data.count > 0
  end

  # A multi-character word is converted with its tone numbers.
  def test_convert_with_tones
    assert_equal 'wei4shen2me5', Hanzi.to_pinyin('为什么')
  end

  def test_second_word
    assert_equal 'zou3hong2', Hanzi.to_pinyin('走红')
  end

  # Traditional characters are looked up as well as simplified ones.
  def test_can_convert_traditional
    assert_equal 'jian3dan1', Hanzi.to_pinyin('簡單')
  end

  # Non-Chinese text around the word is expected to come back unchanged.
  def test_can_convert_with_surrounding_english
    assert_equal 'no! wei4shen2me5!', Hanzi.to_pinyin('no! 为什么!')
  end

  # A whole sentence, including punctuation that must stay as it is.
  def test_can_convert_sentence_of_hanzi
    assert_equal 'ni3hao3, wo3shi4kang1yu4chen2。', Hanzi.to_pinyin('你好, 我是康昱辰。')
  end
end
metadata ADDED
@@ -0,0 +1,95 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: hanzi
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Steve Jackson
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-01-12 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rdoc
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: '3.12'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: '3.12'
30
+ - !ruby/object:Gem::Dependency
31
+ name: jeweler
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ~>
36
+ - !ruby/object:Gem::Version
37
+ version: 1.8.4
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: 1.8.4
46
+ description: Convert Hanzi to pinyin. Unlike other similar gems, this includes tones
47
+ and can accurately translate common words.
48
+ email: steven.j.jackson@gmail.com
49
+ executables: []
50
+ extensions: []
51
+ extra_rdoc_files:
52
+ - LICENSE.txt
53
+ - README.rdoc
54
+ files:
55
+ - .document
56
+ - Gemfile
57
+ - LICENSE.txt
58
+ - README.rdoc
59
+ - Rakefile
60
+ - VERSION
61
+ - hanzi.gemspec
62
+ - lib/data/cedict_ts.u8
63
+ - lib/hanzi.rb
64
+ - test/helper.rb
65
+ - test/test_hanzi.rb
66
+ homepage: http://github.com/stevejackson/hanzi
67
+ licenses:
68
+ - MIT
69
+ post_install_message:
70
+ rdoc_options: []
71
+ require_paths:
72
+ - lib
73
+ required_ruby_version: !ruby/object:Gem::Requirement
74
+ none: false
75
+ requirements:
76
+ - - ! '>='
77
+ - !ruby/object:Gem::Version
78
+ version: '0'
79
+ segments:
80
+ - 0
81
+ hash: 2598915566647370745
82
+ required_rubygems_version: !ruby/object:Gem::Requirement
83
+ none: false
84
+ requirements:
85
+ - - ! '>='
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
88
+ requirements: []
89
+ rubyforge_project:
90
+ rubygems_version: 1.8.24
91
+ signing_key:
92
+ specification_version: 3
93
+ summary: Convert Hanzi to pinyin. Unlike other similar gems, this includes tones and
94
+ can accurately translate common words.
95
+ test_files: []