hanzi 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +3 -0
- data/README.rdoc +4 -0
- data/VERSION +1 -1
- data/hanzi.gemspec +9 -3
- data/lib/hanzi.rb +39 -11
- data/test/test_hanzi.rb +28 -0
- metadata +36 -4
data/Gemfile
CHANGED
data/README.rdoc
CHANGED
@@ -41,6 +41,10 @@ To run tests:
|
|
41
41
|
|
42
42
|
rake test
|
43
43
|
|
44
|
+
There is a script for profiling:
|
45
|
+
|
46
|
+
ruby test/profile_speed.rb # generates output in /tmp
|
47
|
+
|
44
48
|
== Contributing to hanzi
|
45
49
|
|
46
50
|
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.4.0
|
1
|
+
0.4.1
|
data/hanzi.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "hanzi"
|
8
|
-
s.version = "0.4.0"
|
8
|
+
s.version = "0.4.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Steve Jackson"]
|
12
|
-
s.date = "2013-
|
12
|
+
s.date = "2013-02-06"
|
13
13
|
s.description = "Convert Hanzi to pinyin. Unlike other similar gems, this includes tones and can accurately translate common words."
|
14
14
|
s.email = "steven.j.jackson@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -32,22 +32,28 @@ Gem::Specification.new do |s|
|
|
32
32
|
s.homepage = "http://github.com/stevejackson/hanzi"
|
33
33
|
s.licenses = ["MIT"]
|
34
34
|
s.require_paths = ["lib"]
|
35
|
-
s.rubygems_version = "1.8.
|
35
|
+
s.rubygems_version = "1.8.23"
|
36
36
|
s.summary = "Convert Hanzi to pinyin. Unlike other similar gems, this includes tones and can accurately translate common words."
|
37
37
|
|
38
38
|
if s.respond_to? :specification_version then
|
39
39
|
s.specification_version = 3
|
40
40
|
|
41
41
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
42
|
+
s.add_runtime_dependency(%q<fast_trie>, [">= 0"])
|
42
43
|
s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
|
43
44
|
s.add_development_dependency(%q<jeweler>, ["~> 1.8.4"])
|
45
|
+
s.add_development_dependency(%q<ruby-prof>, [">= 0"])
|
44
46
|
else
|
47
|
+
s.add_dependency(%q<fast_trie>, [">= 0"])
|
45
48
|
s.add_dependency(%q<rdoc>, ["~> 3.12"])
|
46
49
|
s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
|
50
|
+
s.add_dependency(%q<ruby-prof>, [">= 0"])
|
47
51
|
end
|
48
52
|
else
|
53
|
+
s.add_dependency(%q<fast_trie>, [">= 0"])
|
49
54
|
s.add_dependency(%q<rdoc>, ["~> 3.12"])
|
50
55
|
s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
|
56
|
+
s.add_dependency(%q<ruby-prof>, [">= 0"])
|
51
57
|
end
|
52
58
|
end
|
53
59
|
|
data/lib/hanzi.rb
CHANGED
@@ -1,14 +1,18 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
+
require 'trie'
|
2
3
|
|
3
4
|
class Hanzi
|
4
5
|
class << self
|
5
6
|
attr_accessor :data
|
7
|
+
attr_accessor :data_trie
|
6
8
|
|
7
9
|
def load_data
|
8
10
|
return if @data
|
9
11
|
@data = []
|
12
|
+
@data_trie = Trie.new
|
10
13
|
|
11
14
|
file_path = File.expand_path('../../lib/data/cedict_ts.u8', __FILE__)
|
15
|
+
index = 0
|
12
16
|
File.open(file_path).each_line do |line|
|
13
17
|
next if line.start_with?('#')
|
14
18
|
line = line.force_encoding('utf-8')
|
@@ -27,9 +31,26 @@ class Hanzi
|
|
27
31
|
line = line[line.index('/'), line.rindex('/')]
|
28
32
|
line_data[:english] = line[1, line.rindex('/') - 1]
|
29
33
|
|
34
|
+
existing_count_simplified = 0
|
35
|
+
if find_first_hanzi_match(line_data[:simplified])
|
36
|
+
existing_count_simplified = matching_entries(line_data[:simplified]).count
|
37
|
+
end
|
38
|
+
@data_trie.add(line_data[:simplified] + existing_count_simplified.to_s, index)
|
39
|
+
|
40
|
+
if line_data[:simplified] != line_data[:traditional]
|
41
|
+
existing_count_traditional = 0
|
42
|
+
if find_first_hanzi_match(line_data[:traditional])
|
43
|
+
existing_count_traditional = matching_entries(line_data[:traditional]).count
|
44
|
+
end
|
45
|
+
|
46
|
+
@data_trie.add(line_data[:traditional] + existing_count_traditional.to_s, index)
|
47
|
+
end
|
48
|
+
|
30
49
|
@data << line_data
|
31
|
-
end
|
32
50
|
|
51
|
+
|
52
|
+
index += 1
|
53
|
+
end
|
33
54
|
end
|
34
55
|
|
35
56
|
def to_pinyin(text, options={})
|
@@ -52,7 +73,7 @@ class Hanzi
|
|
52
73
|
match = nil
|
53
74
|
match_length = 0
|
54
75
|
4.downto(1) do |length|
|
55
|
-
match =
|
76
|
+
match = find_first_hanzi_match(text[pos, length])
|
56
77
|
match_length = length
|
57
78
|
break if match
|
58
79
|
end
|
@@ -73,37 +94,44 @@ class Hanzi
|
|
73
94
|
def to_english(text)
|
74
95
|
load_data if @data.nil?
|
75
96
|
|
76
|
-
entry =
|
97
|
+
entry = find_first_hanzi_match(text)
|
77
98
|
entry[:english] if entry && entry[:english]
|
78
99
|
end
|
79
100
|
|
80
101
|
def to_simplified(text)
|
81
102
|
load_data if @data.nil?
|
82
103
|
|
83
|
-
entry =
|
104
|
+
entry = find_first_hanzi_match(text)
|
84
105
|
entry[:simplified] if entry && entry[:simplified]
|
85
106
|
end
|
86
107
|
|
87
108
|
def to_traditional(text)
|
88
109
|
load_data if @data.nil?
|
89
110
|
|
90
|
-
entry =
|
111
|
+
entry = find_first_hanzi_match(text)
|
91
112
|
entry[:traditional] if entry && entry[:traditional]
|
92
113
|
end
|
93
114
|
|
94
115
|
def matching_entries(text)
|
95
116
|
load_data if @data.nil?
|
96
117
|
|
97
|
-
|
98
|
-
|
118
|
+
results = []
|
119
|
+
index = 0
|
120
|
+
loop do
|
121
|
+
id = @data_trie.get(text + index.to_s)
|
122
|
+
break if !id
|
123
|
+
|
124
|
+
results << @data[id]
|
125
|
+
index += 1
|
99
126
|
end
|
127
|
+
|
128
|
+
results
|
100
129
|
end
|
101
130
|
|
102
131
|
private
|
103
|
-
def
|
104
|
-
|
105
|
-
|
106
|
-
end
|
132
|
+
def find_first_hanzi_match(text)
|
133
|
+
id = @data_trie.get(text + "0")
|
134
|
+
@data[id] if id
|
107
135
|
end
|
108
136
|
|
109
137
|
end
|
data/test/test_hanzi.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
3
|
require 'helper'
|
4
|
+
require 'benchmark'
|
4
5
|
|
5
6
|
class TestHanzi < Test::Unit::TestCase
|
6
7
|
|
@@ -71,4 +72,31 @@ class TestHanzi < Test::Unit::TestCase
|
|
71
72
|
assert_equal nil, result
|
72
73
|
end
|
73
74
|
|
75
|
+
def test_speed
|
76
|
+
entries = words = ["果断", "过度", "过渡", "过奖", "过滤", "过失", "过问", "过瘾", "过于", "嗨", "海拔",
|
77
|
+
"海滨", "含糊", "寒暄", "含义", "罕见", "捍卫", "航空", "行列", "航天", "航行", "豪迈", "毫米",
|
78
|
+
"毫无", "耗费", "好客", "号召", "呵", "和蔼", "合并", "合成", "合乎", "合伙", "和解", "和睦", "和气",
|
79
|
+
"合身", "合算", "和谐", "嘿", "痕迹", "狠心", "恨不得", "哼", "哄", "烘", "轰动", "红包", "宏观",
|
80
|
+
"洪水", "宏伟", "喉咙", "吼", "后代", "后顾之忧", "后勤", "候选", "忽略", "呼啸", "呼吁", "胡乱",
|
81
|
+
"湖泊", "互联网", "华丽", "华侨", "化肥", "划分", "画蛇添足", "化石", "话筒", "化验", "化妆", "怀孕",
|
82
|
+
"欢乐", "环节", "还原", "缓和", "患者", "荒凉", "慌忙", "荒谬", "荒唐", "黄昏", "恍然大悟", "辉煌",
|
83
|
+
"挥霍", "回报", "回避", "回顾", "回收", "悔恨", "毁灭", "汇报", "贿赂", "会晤", "昏迷", "浑身", "混合",
|
84
|
+
"混乱", "混淆", "混浊", "活该", "活力", "火箭", "火焰", "火药", "货币", "或许", "基地", "机动", "饥饿",
|
85
|
+
"激发", "机构", "机关", "基金", "激励", "机灵", "机密", "激情", "讥笑", "机械", "基因", "机遇", "机智",
|
86
|
+
"即便", "级别", "疾病", "嫉妒", "极端", "急功近利", "籍贯", "即将", "急剧", "急切", "集团", "极限",
|
87
|
+
"吉祥", "急于求成", "及早", "急躁", "给予", "继承", "季度", "忌讳", "计较", "寂静", "季军", "技能",
|
88
|
+
"技巧", "寄托", "继往开来", "迹象", "记性", "纪要", "记载", "家常", "加工", "家伙", "加剧", "家属",
|
89
|
+
"空虚", "孔", "恐吓", "恐惧", "空白", "空隙", "口气", "口腔", "口头", "口音", "枯竭", "枯燥", "苦尽甘来"]
|
90
|
+
|
91
|
+
Hanzi.load_data
|
92
|
+
|
93
|
+
time = Benchmark.realtime {
|
94
|
+
entries.each do |word|
|
95
|
+
Hanzi.matching_entries(word)
|
96
|
+
end
|
97
|
+
}
|
98
|
+
|
99
|
+
assert time < 0.01, "Lookups took #{time}s, should be less than 0.01s"
|
100
|
+
end
|
101
|
+
|
74
102
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hanzi
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.4.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,8 +9,24 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-02-06 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: fast_trie
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
14
30
|
- !ruby/object:Gem::Dependency
|
15
31
|
name: rdoc
|
16
32
|
requirement: !ruby/object:Gem::Requirement
|
@@ -43,6 +59,22 @@ dependencies:
|
|
43
59
|
- - ~>
|
44
60
|
- !ruby/object:Gem::Version
|
45
61
|
version: 1.8.4
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: ruby-prof
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
46
78
|
description: Convert Hanzi to pinyin. Unlike other similar gems, this includes tones
|
47
79
|
and can accurately translate common words.
|
48
80
|
email: steven.j.jackson@gmail.com
|
@@ -78,7 +110,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
78
110
|
version: '0'
|
79
111
|
segments:
|
80
112
|
- 0
|
81
|
-
hash:
|
113
|
+
hash: 2045449454900323420
|
82
114
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
83
115
|
none: false
|
84
116
|
requirements:
|
@@ -87,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
87
119
|
version: '0'
|
88
120
|
requirements: []
|
89
121
|
rubyforge_project:
|
90
|
-
rubygems_version: 1.8.
|
122
|
+
rubygems_version: 1.8.23
|
91
123
|
signing_key:
|
92
124
|
specification_version: 3
|
93
125
|
summary: Convert Hanzi to pinyin. Unlike other similar gems, this includes tones and
|