hanzi 0.4.0 → 0.4.1

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
data/Gemfile CHANGED
@@ -1,6 +1,9 @@
1
1
  source "http://rubygems.org"
2
2
 
3
+ gem 'fast_trie'
4
+
3
5
  group :development do
4
6
  gem "rdoc", "~> 3.12"
5
7
  gem "jeweler", "~> 1.8.4"
8
+ gem 'ruby-prof'
6
9
  end
@@ -41,6 +41,10 @@ To run tests:
41
41
 
42
42
  rake test
43
43
 
44
+ There is a script for profiling:
45
+
46
+ ruby test/profile_speed.rb # generates output in /tmp
47
+
44
48
  == Contributing to hanzi
45
49
 
46
50
  * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.4.0
1
+ 0.4.1
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "hanzi"
8
- s.version = "0.4.0"
8
+ s.version = "0.4.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Steve Jackson"]
12
- s.date = "2013-01-23"
12
+ s.date = "2013-02-06"
13
13
  s.description = "Convert Hanzi to pinyin. Unlike other similar gems, this includes tones and can accurately translate common words."
14
14
  s.email = "steven.j.jackson@gmail.com"
15
15
  s.extra_rdoc_files = [
@@ -32,22 +32,28 @@ Gem::Specification.new do |s|
32
32
  s.homepage = "http://github.com/stevejackson/hanzi"
33
33
  s.licenses = ["MIT"]
34
34
  s.require_paths = ["lib"]
35
- s.rubygems_version = "1.8.24"
35
+ s.rubygems_version = "1.8.23"
36
36
  s.summary = "Convert Hanzi to pinyin. Unlike other similar gems, this includes tones and can accurately translate common words."
37
37
 
38
38
  if s.respond_to? :specification_version then
39
39
  s.specification_version = 3
40
40
 
41
41
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
42
+ s.add_runtime_dependency(%q<fast_trie>, [">= 0"])
42
43
  s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
43
44
  s.add_development_dependency(%q<jeweler>, ["~> 1.8.4"])
45
+ s.add_development_dependency(%q<ruby-prof>, [">= 0"])
44
46
  else
47
+ s.add_dependency(%q<fast_trie>, [">= 0"])
45
48
  s.add_dependency(%q<rdoc>, ["~> 3.12"])
46
49
  s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
50
+ s.add_dependency(%q<ruby-prof>, [">= 0"])
47
51
  end
48
52
  else
53
+ s.add_dependency(%q<fast_trie>, [">= 0"])
49
54
  s.add_dependency(%q<rdoc>, ["~> 3.12"])
50
55
  s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
56
+ s.add_dependency(%q<ruby-prof>, [">= 0"])
51
57
  end
52
58
  end
53
59
 
@@ -1,14 +1,18 @@
1
1
  # encoding: utf-8
2
+ require 'trie'
2
3
 
3
4
  class Hanzi
4
5
  class << self
5
6
  attr_accessor :data
7
+ attr_accessor :data_trie
6
8
 
7
9
  def load_data
8
10
  return if @data
9
11
  @data = []
12
+ @data_trie = Trie.new
10
13
 
11
14
  file_path = File.expand_path('../../lib/data/cedict_ts.u8', __FILE__)
15
+ index = 0
12
16
  File.open(file_path).each_line do |line|
13
17
  next if line.start_with?('#')
14
18
  line = line.force_encoding('utf-8')
@@ -27,9 +31,26 @@ class Hanzi
27
31
  line = line[line.index('/'), line.rindex('/')]
28
32
  line_data[:english] = line[1, line.rindex('/') - 1]
29
33
 
34
+ existing_count_simplified = 0
35
+ if find_first_hanzi_match(line_data[:simplified])
36
+ existing_count_simplified = matching_entries(line_data[:simplified]).count
37
+ end
38
+ @data_trie.add(line_data[:simplified] + existing_count_simplified.to_s, index)
39
+
40
+ if line_data[:simplified] != line_data[:traditional]
41
+ existing_count_traditional = 0
42
+ if find_first_hanzi_match(line_data[:traditional])
43
+ existing_count_traditional = matching_entries(line_data[:traditional]).count
44
+ end
45
+
46
+ @data_trie.add(line_data[:traditional] + existing_count_traditional.to_s, index)
47
+ end
48
+
30
49
  @data << line_data
31
- end
32
50
 
51
+
52
+ index += 1
53
+ end
33
54
  end
34
55
 
35
56
  def to_pinyin(text, options={})
@@ -52,7 +73,7 @@ class Hanzi
52
73
  match = nil
53
74
  match_length = 0
54
75
  4.downto(1) do |length|
55
- match = find_hanzi_match(text[pos, length])
76
+ match = find_first_hanzi_match(text[pos, length])
56
77
  match_length = length
57
78
  break if match
58
79
  end
@@ -73,37 +94,44 @@ class Hanzi
73
94
  def to_english(text)
74
95
  load_data if @data.nil?
75
96
 
76
- entry = find_hanzi_match(text)
97
+ entry = find_first_hanzi_match(text)
77
98
  entry[:english] if entry && entry[:english]
78
99
  end
79
100
 
80
101
  def to_simplified(text)
81
102
  load_data if @data.nil?
82
103
 
83
- entry = find_hanzi_match(text)
104
+ entry = find_first_hanzi_match(text)
84
105
  entry[:simplified] if entry && entry[:simplified]
85
106
  end
86
107
 
87
108
  def to_traditional(text)
88
109
  load_data if @data.nil?
89
110
 
90
- entry = find_hanzi_match(text)
111
+ entry = find_first_hanzi_match(text)
91
112
  entry[:traditional] if entry && entry[:traditional]
92
113
  end
93
114
 
94
115
  def matching_entries(text)
95
116
  load_data if @data.nil?
96
117
 
97
- entries = @data.select do |word|
98
- word[:simplified] == text || word[:traditional] == text
118
+ results = []
119
+ index = 0
120
+ loop do
121
+ id = @data_trie.get(text + index.to_s)
122
+ break if !id
123
+
124
+ results << @data[id]
125
+ index += 1
99
126
  end
127
+
128
+ results
100
129
  end
101
130
 
102
131
  private
103
- def find_hanzi_match(text)
104
- entry = @data.find do |word|
105
- word[:simplified] == text || word[:traditional] == text
106
- end
132
+ def find_first_hanzi_match(text)
133
+ id = @data_trie.get(text + "0")
134
+ @data[id] if id
107
135
  end
108
136
 
109
137
  end
@@ -1,6 +1,7 @@
1
1
  # encoding: utf-8
2
2
 
3
3
  require 'helper'
4
+ require 'benchmark'
4
5
 
5
6
  class TestHanzi < Test::Unit::TestCase
6
7
 
@@ -71,4 +72,31 @@ class TestHanzi < Test::Unit::TestCase
71
72
  assert_equal nil, result
72
73
  end
73
74
 
75
+ def test_speed
76
+ entries = words = ["果断", "过度", "过渡", "过奖", "过滤", "过失", "过问", "过瘾", "过于", "嗨", "海拔",
77
+ "海滨", "含糊", "寒暄", "含义", "罕见", "捍卫", "航空", "行列", "航天", "航行", "豪迈", "毫米",
78
+ "毫无", "耗费", "好客", "号召", "呵", "和蔼", "合并", "合成", "合乎", "合伙", "和解", "和睦", "和气",
79
+ "合身", "合算", "和谐", "嘿", "痕迹", "狠心", "恨不得", "哼", "哄", "烘", "轰动", "红包", "宏观",
80
+ "洪水", "宏伟", "喉咙", "吼", "后代", "后顾之忧", "后勤", "候选", "忽略", "呼啸", "呼吁", "胡乱",
81
+ "湖泊", "互联网", "华丽", "华侨", "化肥", "划分", "画蛇添足", "化石", "话筒", "化验", "化妆", "怀孕",
82
+ "欢乐", "环节", "还原", "缓和", "患者", "荒凉", "慌忙", "荒谬", "荒唐", "黄昏", "恍然大悟", "辉煌",
83
+ "挥霍", "回报", "回避", "回顾", "回收", "悔恨", "毁灭", "汇报", "贿赂", "会晤", "昏迷", "浑身", "混合",
84
+ "混乱", "混淆", "混浊", "活该", "活力", "火箭", "火焰", "火药", "货币", "或许", "基地", "机动", "饥饿",
85
+ "激发", "机构", "机关", "基金", "激励", "机灵", "机密", "激情", "讥笑", "机械", "基因", "机遇", "机智",
86
+ "即便", "级别", "疾病", "嫉妒", "极端", "急功近利", "籍贯", "即将", "急剧", "急切", "集团", "极限",
87
+ "吉祥", "急于求成", "及早", "急躁", "给予", "继承", "季度", "忌讳", "计较", "寂静", "季军", "技能",
88
+ "技巧", "寄托", "继往开来", "迹象", "记性", "纪要", "记载", "家常", "加工", "家伙", "加剧", "家属",
89
+ "空虚", "孔", "恐吓", "恐惧", "空白", "空隙", "口气", "口腔", "口头", "口音", "枯竭", "枯燥", "苦尽甘来"]
90
+
91
+ Hanzi.load_data
92
+
93
+ time = Benchmark.realtime {
94
+ entries.each do |word|
95
+ Hanzi.matching_entries(word)
96
+ end
97
+ }
98
+
99
+ assert time < 0.01, "Lookups took #{time}s, should be less than 0.01s"
100
+ end
101
+
74
102
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hanzi
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.4.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,8 +9,24 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-01-23 00:00:00.000000000 Z
12
+ date: 2013-02-06 00:00:00.000000000 Z
13
13
  dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: fast_trie
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
14
30
  - !ruby/object:Gem::Dependency
15
31
  name: rdoc
16
32
  requirement: !ruby/object:Gem::Requirement
@@ -43,6 +59,22 @@ dependencies:
43
59
  - - ~>
44
60
  - !ruby/object:Gem::Version
45
61
  version: 1.8.4
62
+ - !ruby/object:Gem::Dependency
63
+ name: ruby-prof
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
46
78
  description: Convert Hanzi to pinyin. Unlike other similar gems, this includes tones
47
79
  and can accurately translate common words.
48
80
  email: steven.j.jackson@gmail.com
@@ -78,7 +110,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
78
110
  version: '0'
79
111
  segments:
80
112
  - 0
81
- hash: -3371334032433433338
113
+ hash: 2045449454900323420
82
114
  required_rubygems_version: !ruby/object:Gem::Requirement
83
115
  none: false
84
116
  requirements:
@@ -87,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
87
119
  version: '0'
88
120
  requirements: []
89
121
  rubyforge_project:
90
- rubygems_version: 1.8.24
122
+ rubygems_version: 1.8.23
91
123
  signing_key:
92
124
  specification_version: 3
93
125
  summary: Convert Hanzi to pinyin. Unlike other similar gems, this includes tones and