hanzi 0.4.0 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile CHANGED
@@ -1,6 +1,9 @@
1
1
  source "http://rubygems.org"
2
2
 
3
+ gem 'fast_trie'
4
+
3
5
  group :development do
4
6
  gem "rdoc", "~> 3.12"
5
7
  gem "jeweler", "~> 1.8.4"
8
+ gem 'ruby-prof'
6
9
  end
@@ -41,6 +41,10 @@ To run tests:
41
41
 
42
42
  rake test
43
43
 
44
+ There is a script for profiling:
45
+
46
+ ruby test/profile_speed.rb # generates output in /tmp
47
+
44
48
  == Contributing to hanzi
45
49
 
46
50
  * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.4.0
1
+ 0.4.1
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "hanzi"
8
- s.version = "0.4.0"
8
+ s.version = "0.4.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Steve Jackson"]
12
- s.date = "2013-01-23"
12
+ s.date = "2013-02-06"
13
13
  s.description = "Convert Hanzi to pinyin. Unlike other similar gems, this includes tones and can accurately translate common words."
14
14
  s.email = "steven.j.jackson@gmail.com"
15
15
  s.extra_rdoc_files = [
@@ -32,22 +32,28 @@ Gem::Specification.new do |s|
32
32
  s.homepage = "http://github.com/stevejackson/hanzi"
33
33
  s.licenses = ["MIT"]
34
34
  s.require_paths = ["lib"]
35
- s.rubygems_version = "1.8.24"
35
+ s.rubygems_version = "1.8.23"
36
36
  s.summary = "Convert Hanzi to pinyin. Unlike other similar gems, this includes tones and can accurately translate common words."
37
37
 
38
38
  if s.respond_to? :specification_version then
39
39
  s.specification_version = 3
40
40
 
41
41
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
42
+ s.add_runtime_dependency(%q<fast_trie>, [">= 0"])
42
43
  s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
43
44
  s.add_development_dependency(%q<jeweler>, ["~> 1.8.4"])
45
+ s.add_development_dependency(%q<ruby-prof>, [">= 0"])
44
46
  else
47
+ s.add_dependency(%q<fast_trie>, [">= 0"])
45
48
  s.add_dependency(%q<rdoc>, ["~> 3.12"])
46
49
  s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
50
+ s.add_dependency(%q<ruby-prof>, [">= 0"])
47
51
  end
48
52
  else
53
+ s.add_dependency(%q<fast_trie>, [">= 0"])
49
54
  s.add_dependency(%q<rdoc>, ["~> 3.12"])
50
55
  s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
56
+ s.add_dependency(%q<ruby-prof>, [">= 0"])
51
57
  end
52
58
  end
53
59
 
@@ -1,14 +1,18 @@
1
1
  # encoding: utf-8
2
+ require 'trie'
2
3
 
3
4
  class Hanzi
4
5
  class << self
5
6
  attr_accessor :data
7
+ attr_accessor :data_trie
6
8
 
7
9
  def load_data
8
10
  return if @data
9
11
  @data = []
12
+ @data_trie = Trie.new
10
13
 
11
14
  file_path = File.expand_path('../../lib/data/cedict_ts.u8', __FILE__)
15
+ index = 0
12
16
  File.open(file_path).each_line do |line|
13
17
  next if line.start_with?('#')
14
18
  line = line.force_encoding('utf-8')
@@ -27,9 +31,26 @@ class Hanzi
27
31
  line = line[line.index('/'), line.rindex('/')]
28
32
  line_data[:english] = line[1, line.rindex('/') - 1]
29
33
 
34
+ existing_count_simplified = 0
35
+ if find_first_hanzi_match(line_data[:simplified])
36
+ existing_count_simplified = matching_entries(line_data[:simplified]).count
37
+ end
38
+ @data_trie.add(line_data[:simplified] + existing_count_simplified.to_s, index)
39
+
40
+ if line_data[:simplified] != line_data[:traditional]
41
+ existing_count_traditional = 0
42
+ if find_first_hanzi_match(line_data[:traditional])
43
+ existing_count_traditional = matching_entries(line_data[:traditional]).count
44
+ end
45
+
46
+ @data_trie.add(line_data[:traditional] + existing_count_traditional.to_s, index)
47
+ end
48
+
30
49
  @data << line_data
31
- end
32
50
 
51
+
52
+ index += 1
53
+ end
33
54
  end
34
55
 
35
56
  def to_pinyin(text, options={})
@@ -52,7 +73,7 @@ class Hanzi
52
73
  match = nil
53
74
  match_length = 0
54
75
  4.downto(1) do |length|
55
- match = find_hanzi_match(text[pos, length])
76
+ match = find_first_hanzi_match(text[pos, length])
56
77
  match_length = length
57
78
  break if match
58
79
  end
@@ -73,37 +94,44 @@ class Hanzi
73
94
  def to_english(text)
74
95
  load_data if @data.nil?
75
96
 
76
- entry = find_hanzi_match(text)
97
+ entry = find_first_hanzi_match(text)
77
98
  entry[:english] if entry && entry[:english]
78
99
  end
79
100
 
80
101
  def to_simplified(text)
81
102
  load_data if @data.nil?
82
103
 
83
- entry = find_hanzi_match(text)
104
+ entry = find_first_hanzi_match(text)
84
105
  entry[:simplified] if entry && entry[:simplified]
85
106
  end
86
107
 
87
108
  def to_traditional(text)
88
109
  load_data if @data.nil?
89
110
 
90
- entry = find_hanzi_match(text)
111
+ entry = find_first_hanzi_match(text)
91
112
  entry[:traditional] if entry && entry[:traditional]
92
113
  end
93
114
 
94
115
  def matching_entries(text)
95
116
  load_data if @data.nil?
96
117
 
97
- entries = @data.select do |word|
98
- word[:simplified] == text || word[:traditional] == text
118
+ results = []
119
+ index = 0
120
+ loop do
121
+ id = @data_trie.get(text + index.to_s)
122
+ break if !id
123
+
124
+ results << @data[id]
125
+ index += 1
99
126
  end
127
+
128
+ results
100
129
  end
101
130
 
102
131
  private
103
- def find_hanzi_match(text)
104
- entry = @data.find do |word|
105
- word[:simplified] == text || word[:traditional] == text
106
- end
132
+ def find_first_hanzi_match(text)
133
+ id = @data_trie.get(text + "0")
134
+ @data[id] if id
107
135
  end
108
136
 
109
137
  end
@@ -1,6 +1,7 @@
1
1
  # encoding: utf-8
2
2
 
3
3
  require 'helper'
4
+ require 'benchmark'
4
5
 
5
6
  class TestHanzi < Test::Unit::TestCase
6
7
 
@@ -71,4 +72,31 @@ class TestHanzi < Test::Unit::TestCase
71
72
  assert_equal nil, result
72
73
  end
73
74
 
75
+ def test_speed
76
+ entries = words = ["果断", "过度", "过渡", "过奖", "过滤", "过失", "过问", "过瘾", "过于", "嗨", "海拔",
77
+ "海滨", "含糊", "寒暄", "含义", "罕见", "捍卫", "航空", "行列", "航天", "航行", "豪迈", "毫米",
78
+ "毫无", "耗费", "好客", "号召", "呵", "和蔼", "合并", "合成", "合乎", "合伙", "和解", "和睦", "和气",
79
+ "合身", "合算", "和谐", "嘿", "痕迹", "狠心", "恨不得", "哼", "哄", "烘", "轰动", "红包", "宏观",
80
+ "洪水", "宏伟", "喉咙", "吼", "后代", "后顾之忧", "后勤", "候选", "忽略", "呼啸", "呼吁", "胡乱",
81
+ "湖泊", "互联网", "华丽", "华侨", "化肥", "划分", "画蛇添足", "化石", "话筒", "化验", "化妆", "怀孕",
82
+ "欢乐", "环节", "还原", "缓和", "患者", "荒凉", "慌忙", "荒谬", "荒唐", "黄昏", "恍然大悟", "辉煌",
83
+ "挥霍", "回报", "回避", "回顾", "回收", "悔恨", "毁灭", "汇报", "贿赂", "会晤", "昏迷", "浑身", "混合",
84
+ "混乱", "混淆", "混浊", "活该", "活力", "火箭", "火焰", "火药", "货币", "或许", "基地", "机动", "饥饿",
85
+ "激发", "机构", "机关", "基金", "激励", "机灵", "机密", "激情", "讥笑", "机械", "基因", "机遇", "机智",
86
+ "即便", "级别", "疾病", "嫉妒", "极端", "急功近利", "籍贯", "即将", "急剧", "急切", "集团", "极限",
87
+ "吉祥", "急于求成", "及早", "急躁", "给予", "继承", "季度", "忌讳", "计较", "寂静", "季军", "技能",
88
+ "技巧", "寄托", "继往开来", "迹象", "记性", "纪要", "记载", "家常", "加工", "家伙", "加剧", "家属",
89
+ "空虚", "孔", "恐吓", "恐惧", "空白", "空隙", "口气", "口腔", "口头", "口音", "枯竭", "枯燥", "苦尽甘来"]
90
+
91
+ Hanzi.load_data
92
+
93
+ time = Benchmark.realtime {
94
+ entries.each do |word|
95
+ Hanzi.matching_entries(word)
96
+ end
97
+ }
98
+
99
+ assert time < 0.01, "Lookups took #{time}s, should be less than 0.01s"
100
+ end
101
+
74
102
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hanzi
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.4.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,8 +9,24 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-01-23 00:00:00.000000000 Z
12
+ date: 2013-02-06 00:00:00.000000000 Z
13
13
  dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: fast_trie
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
14
30
  - !ruby/object:Gem::Dependency
15
31
  name: rdoc
16
32
  requirement: !ruby/object:Gem::Requirement
@@ -43,6 +59,22 @@ dependencies:
43
59
  - - ~>
44
60
  - !ruby/object:Gem::Version
45
61
  version: 1.8.4
62
+ - !ruby/object:Gem::Dependency
63
+ name: ruby-prof
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
46
78
  description: Convert Hanzi to pinyin. Unlike other similar gems, this includes tones
47
79
  and can accurately translate common words.
48
80
  email: steven.j.jackson@gmail.com
@@ -78,7 +110,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
78
110
  version: '0'
79
111
  segments:
80
112
  - 0
81
- hash: -3371334032433433338
113
+ hash: 2045449454900323420
82
114
  required_rubygems_version: !ruby/object:Gem::Requirement
83
115
  none: false
84
116
  requirements:
@@ -87,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
87
119
  version: '0'
88
120
  requirements: []
89
121
  rubyforge_project:
90
- rubygems_version: 1.8.24
122
+ rubygems_version: 1.8.23
91
123
  signing_key:
92
124
  specification_version: 3
93
125
  summary: Convert Hanzi to pinyin. Unlike other similar gems, this includes tones and