hanzi 0.4.0 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +3 -0
- data/README.rdoc +4 -0
- data/VERSION +1 -1
- data/hanzi.gemspec +9 -3
- data/lib/hanzi.rb +39 -11
- data/test/test_hanzi.rb +28 -0
- metadata +36 -4
data/Gemfile
CHANGED
data/README.rdoc
CHANGED
@@ -41,6 +41,10 @@ To run tests:
|
|
41
41
|
|
42
42
|
rake test
|
43
43
|
|
44
|
+
There is a script for profiling:
|
45
|
+
|
46
|
+
ruby test/profile_speed.rb # generates output in /tmp
|
47
|
+
|
44
48
|
== Contributing to hanzi
|
45
49
|
|
46
50
|
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.4.0
|
1
|
+
0.4.1
|
data/hanzi.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "hanzi"
|
8
|
-
s.version = "0.4.0"
|
8
|
+
s.version = "0.4.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Steve Jackson"]
|
12
|
-
s.date = "2013-
|
12
|
+
s.date = "2013-02-06"
|
13
13
|
s.description = "Convert Hanzi to pinyin. Unlike other similar gems, this includes tones and can accurately translate common words."
|
14
14
|
s.email = "steven.j.jackson@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -32,22 +32,28 @@ Gem::Specification.new do |s|
|
|
32
32
|
s.homepage = "http://github.com/stevejackson/hanzi"
|
33
33
|
s.licenses = ["MIT"]
|
34
34
|
s.require_paths = ["lib"]
|
35
|
-
s.rubygems_version = "1.8.
|
35
|
+
s.rubygems_version = "1.8.23"
|
36
36
|
s.summary = "Convert Hanzi to pinyin. Unlike other similar gems, this includes tones and can accurately translate common words."
|
37
37
|
|
38
38
|
if s.respond_to? :specification_version then
|
39
39
|
s.specification_version = 3
|
40
40
|
|
41
41
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
42
|
+
s.add_runtime_dependency(%q<fast_trie>, [">= 0"])
|
42
43
|
s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
|
43
44
|
s.add_development_dependency(%q<jeweler>, ["~> 1.8.4"])
|
45
|
+
s.add_development_dependency(%q<ruby-prof>, [">= 0"])
|
44
46
|
else
|
47
|
+
s.add_dependency(%q<fast_trie>, [">= 0"])
|
45
48
|
s.add_dependency(%q<rdoc>, ["~> 3.12"])
|
46
49
|
s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
|
50
|
+
s.add_dependency(%q<ruby-prof>, [">= 0"])
|
47
51
|
end
|
48
52
|
else
|
53
|
+
s.add_dependency(%q<fast_trie>, [">= 0"])
|
49
54
|
s.add_dependency(%q<rdoc>, ["~> 3.12"])
|
50
55
|
s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
|
56
|
+
s.add_dependency(%q<ruby-prof>, [">= 0"])
|
51
57
|
end
|
52
58
|
end
|
53
59
|
|
data/lib/hanzi.rb
CHANGED
@@ -1,14 +1,18 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
+
require 'trie'
|
2
3
|
|
3
4
|
class Hanzi
|
4
5
|
class << self
|
5
6
|
attr_accessor :data
|
7
|
+
attr_accessor :data_trie
|
6
8
|
|
7
9
|
def load_data
|
8
10
|
return if @data
|
9
11
|
@data = []
|
12
|
+
@data_trie = Trie.new
|
10
13
|
|
11
14
|
file_path = File.expand_path('../../lib/data/cedict_ts.u8', __FILE__)
|
15
|
+
index = 0
|
12
16
|
File.open(file_path).each_line do |line|
|
13
17
|
next if line.start_with?('#')
|
14
18
|
line = line.force_encoding('utf-8')
|
@@ -27,9 +31,26 @@ class Hanzi
|
|
27
31
|
line = line[line.index('/'), line.rindex('/')]
|
28
32
|
line_data[:english] = line[1, line.rindex('/') - 1]
|
29
33
|
|
34
|
+
existing_count_simplified = 0
|
35
|
+
if find_first_hanzi_match(line_data[:simplified])
|
36
|
+
existing_count_simplified = matching_entries(line_data[:simplified]).count
|
37
|
+
end
|
38
|
+
@data_trie.add(line_data[:simplified] + existing_count_simplified.to_s, index)
|
39
|
+
|
40
|
+
if line_data[:simplified] != line_data[:traditional]
|
41
|
+
existing_count_traditional = 0
|
42
|
+
if find_first_hanzi_match(line_data[:traditional])
|
43
|
+
existing_count_traditional = matching_entries(line_data[:traditional]).count
|
44
|
+
end
|
45
|
+
|
46
|
+
@data_trie.add(line_data[:traditional] + existing_count_traditional.to_s, index)
|
47
|
+
end
|
48
|
+
|
30
49
|
@data << line_data
|
31
|
-
end
|
32
50
|
|
51
|
+
|
52
|
+
index += 1
|
53
|
+
end
|
33
54
|
end
|
34
55
|
|
35
56
|
def to_pinyin(text, options={})
|
@@ -52,7 +73,7 @@ class Hanzi
|
|
52
73
|
match = nil
|
53
74
|
match_length = 0
|
54
75
|
4.downto(1) do |length|
|
55
|
-
match =
|
76
|
+
match = find_first_hanzi_match(text[pos, length])
|
56
77
|
match_length = length
|
57
78
|
break if match
|
58
79
|
end
|
@@ -73,37 +94,44 @@ class Hanzi
|
|
73
94
|
def to_english(text)
|
74
95
|
load_data if @data.nil?
|
75
96
|
|
76
|
-
entry =
|
97
|
+
entry = find_first_hanzi_match(text)
|
77
98
|
entry[:english] if entry && entry[:english]
|
78
99
|
end
|
79
100
|
|
80
101
|
def to_simplified(text)
|
81
102
|
load_data if @data.nil?
|
82
103
|
|
83
|
-
entry =
|
104
|
+
entry = find_first_hanzi_match(text)
|
84
105
|
entry[:simplified] if entry && entry[:simplified]
|
85
106
|
end
|
86
107
|
|
87
108
|
def to_traditional(text)
|
88
109
|
load_data if @data.nil?
|
89
110
|
|
90
|
-
entry =
|
111
|
+
entry = find_first_hanzi_match(text)
|
91
112
|
entry[:traditional] if entry && entry[:traditional]
|
92
113
|
end
|
93
114
|
|
94
115
|
def matching_entries(text)
|
95
116
|
load_data if @data.nil?
|
96
117
|
|
97
|
-
|
98
|
-
|
118
|
+
results = []
|
119
|
+
index = 0
|
120
|
+
loop do
|
121
|
+
id = @data_trie.get(text + index.to_s)
|
122
|
+
break if !id
|
123
|
+
|
124
|
+
results << @data[id]
|
125
|
+
index += 1
|
99
126
|
end
|
127
|
+
|
128
|
+
results
|
100
129
|
end
|
101
130
|
|
102
131
|
private
|
103
|
-
def
|
104
|
-
|
105
|
-
|
106
|
-
end
|
132
|
+
def find_first_hanzi_match(text)
|
133
|
+
id = @data_trie.get(text + "0")
|
134
|
+
@data[id] if id
|
107
135
|
end
|
108
136
|
|
109
137
|
end
|
data/test/test_hanzi.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
3
|
require 'helper'
|
4
|
+
require 'benchmark'
|
4
5
|
|
5
6
|
class TestHanzi < Test::Unit::TestCase
|
6
7
|
|
@@ -71,4 +72,31 @@ class TestHanzi < Test::Unit::TestCase
|
|
71
72
|
assert_equal nil, result
|
72
73
|
end
|
73
74
|
|
75
|
+
def test_speed
|
76
|
+
entries = words = ["果断", "过度", "过渡", "过奖", "过滤", "过失", "过问", "过瘾", "过于", "嗨", "海拔",
|
77
|
+
"海滨", "含糊", "寒暄", "含义", "罕见", "捍卫", "航空", "行列", "航天", "航行", "豪迈", "毫米",
|
78
|
+
"毫无", "耗费", "好客", "号召", "呵", "和蔼", "合并", "合成", "合乎", "合伙", "和解", "和睦", "和气",
|
79
|
+
"合身", "合算", "和谐", "嘿", "痕迹", "狠心", "恨不得", "哼", "哄", "烘", "轰动", "红包", "宏观",
|
80
|
+
"洪水", "宏伟", "喉咙", "吼", "后代", "后顾之忧", "后勤", "候选", "忽略", "呼啸", "呼吁", "胡乱",
|
81
|
+
"湖泊", "互联网", "华丽", "华侨", "化肥", "划分", "画蛇添足", "化石", "话筒", "化验", "化妆", "怀孕",
|
82
|
+
"欢乐", "环节", "还原", "缓和", "患者", "荒凉", "慌忙", "荒谬", "荒唐", "黄昏", "恍然大悟", "辉煌",
|
83
|
+
"挥霍", "回报", "回避", "回顾", "回收", "悔恨", "毁灭", "汇报", "贿赂", "会晤", "昏迷", "浑身", "混合",
|
84
|
+
"混乱", "混淆", "混浊", "活该", "活力", "火箭", "火焰", "火药", "货币", "或许", "基地", "机动", "饥饿",
|
85
|
+
"激发", "机构", "机关", "基金", "激励", "机灵", "机密", "激情", "讥笑", "机械", "基因", "机遇", "机智",
|
86
|
+
"即便", "级别", "疾病", "嫉妒", "极端", "急功近利", "籍贯", "即将", "急剧", "急切", "集团", "极限",
|
87
|
+
"吉祥", "急于求成", "及早", "急躁", "给予", "继承", "季度", "忌讳", "计较", "寂静", "季军", "技能",
|
88
|
+
"技巧", "寄托", "继往开来", "迹象", "记性", "纪要", "记载", "家常", "加工", "家伙", "加剧", "家属",
|
89
|
+
"空虚", "孔", "恐吓", "恐惧", "空白", "空隙", "口气", "口腔", "口头", "口音", "枯竭", "枯燥", "苦尽甘来"]
|
90
|
+
|
91
|
+
Hanzi.load_data
|
92
|
+
|
93
|
+
time = Benchmark.realtime {
|
94
|
+
entries.each do |word|
|
95
|
+
Hanzi.matching_entries(word)
|
96
|
+
end
|
97
|
+
}
|
98
|
+
|
99
|
+
assert time < 0.01, "Lookups took #{time}s, should be less than 0.01s"
|
100
|
+
end
|
101
|
+
|
74
102
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hanzi
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.4.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,8 +9,24 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-02-06 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: fast_trie
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
14
30
|
- !ruby/object:Gem::Dependency
|
15
31
|
name: rdoc
|
16
32
|
requirement: !ruby/object:Gem::Requirement
|
@@ -43,6 +59,22 @@ dependencies:
|
|
43
59
|
- - ~>
|
44
60
|
- !ruby/object:Gem::Version
|
45
61
|
version: 1.8.4
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: ruby-prof
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
46
78
|
description: Convert Hanzi to pinyin. Unlike other similar gems, this includes tones
|
47
79
|
and can accurately translate common words.
|
48
80
|
email: steven.j.jackson@gmail.com
|
@@ -78,7 +110,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
78
110
|
version: '0'
|
79
111
|
segments:
|
80
112
|
- 0
|
81
|
-
hash:
|
113
|
+
hash: 2045449454900323420
|
82
114
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
83
115
|
none: false
|
84
116
|
requirements:
|
@@ -87,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
87
119
|
version: '0'
|
88
120
|
requirements: []
|
89
121
|
rubyforge_project:
|
90
|
-
rubygems_version: 1.8.
|
122
|
+
rubygems_version: 1.8.23
|
91
123
|
signing_key:
|
92
124
|
specification_version: 3
|
93
125
|
summary: Convert Hanzi to pinyin. Unlike other similar gems, this includes tones and
|