igo-ruby 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +13 -0
- data/Gemfile.lock +28 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +19 -0
- data/Rakefile +53 -0
- data/VERSION +1 -0
- data/lib/igo-ruby.rb +9 -0
- data/lib/igo/dictionary.rb +187 -0
- data/lib/igo/tagger.rb +144 -0
- data/lib/igo/trie.rb +203 -0
- data/lib/igo/util.rb +74 -0
- data/spec/igo-ruby_spec.rb +7 -0
- data/spec/spec_helper.rb +12 -0
- data/test/test.rb +9 -0
- metadata +146 -0
data/Gemfile
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
source "http://rubygems.org"
|
2
|
+
# Add dependencies required to use your gem here.
|
3
|
+
# Example:
|
4
|
+
# gem "activesupport", ">= 2.3.5"
|
5
|
+
|
6
|
+
# Add dependencies to develop your gem here.
|
7
|
+
# Include everything needed to run rake, tests, features, etc.
|
8
|
+
group :development do
|
9
|
+
gem "rspec", "~> 2.1.0"
|
10
|
+
gem "bundler", "~> 1.0.0"
|
11
|
+
gem "jeweler", "~> 1.5.1"
|
12
|
+
gem "rcov", ">= 0"
|
13
|
+
end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
diff-lcs (1.1.2)
|
5
|
+
git (1.2.5)
|
6
|
+
jeweler (1.5.1)
|
7
|
+
bundler (~> 1.0.0)
|
8
|
+
git (>= 1.2.5)
|
9
|
+
rake
|
10
|
+
rake (0.8.7)
|
11
|
+
rcov (0.9.9)
|
12
|
+
rspec (2.1.0)
|
13
|
+
rspec-core (~> 2.1.0)
|
14
|
+
rspec-expectations (~> 2.1.0)
|
15
|
+
rspec-mocks (~> 2.1.0)
|
16
|
+
rspec-core (2.1.0)
|
17
|
+
rspec-expectations (2.1.0)
|
18
|
+
diff-lcs (~> 1.1.2)
|
19
|
+
rspec-mocks (2.1.0)
|
20
|
+
|
21
|
+
PLATFORMS
|
22
|
+
ruby
|
23
|
+
|
24
|
+
DEPENDENCIES
|
25
|
+
bundler (~> 1.0.0)
|
26
|
+
jeweler (~> 1.5.1)
|
27
|
+
rcov
|
28
|
+
rspec (~> 2.1.0)
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2010 kyow
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
= igo-ruby
|
2
|
+
|
3
|
+
igo-ruby is a Ruby port of the Igo Japanese morphological analyzer.
|
4
|
+
|
5
|
+
== Contributing to igo-ruby
|
6
|
+
|
7
|
+
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
|
8
|
+
* Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it
|
9
|
+
* Fork the project
|
10
|
+
* Start a feature/bugfix branch
|
11
|
+
* Commit and push until you are happy with your contribution
|
12
|
+
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
|
13
|
+
* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or if changing them is otherwise necessary, that is fine, but please isolate the change in its own commit so I can cherry-pick around it.
|
14
|
+
|
15
|
+
== Copyright
|
16
|
+
|
17
|
+
Copyright (c) 2010 kyow. See LICENSE.txt for
|
18
|
+
further details.
|
19
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
begin
|
4
|
+
Bundler.setup(:default, :development)
|
5
|
+
rescue Bundler::BundlerError => e
|
6
|
+
$stderr.puts e.message
|
7
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
+
exit e.status_code
|
9
|
+
end
|
10
|
+
require 'rake'
|
11
|
+
|
12
|
+
require 'jeweler'
|
13
|
+
Jeweler::Tasks.new do |gem|
|
14
|
+
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
15
|
+
gem.name = "igo-ruby"
|
16
|
+
gem.homepage = "http://github.com/kyow/igo-ruby"
|
17
|
+
gem.license = "MIT"
|
18
|
+
gem.summary = %Q{Ruby port of Igo Japanese morphological analyzer.}
|
19
|
+
gem.description = %Q{Ruby port of Igo Japanese morphological analyzer.}
|
20
|
+
gem.email = "24signals@gmail.com"
|
21
|
+
gem.authors = ["K.Nishi"]
|
22
|
+
# Include your dependencies below. Runtime dependencies are required when using your gem,
|
23
|
+
# and development dependencies are only needed for development (ie running rake tasks, tests, etc)
|
24
|
+
# gem.add_runtime_dependency 'jabber4r', '> 0.1'
|
25
|
+
# gem.add_development_dependency 'rspec', '> 1.2.3'
|
26
|
+
|
27
|
+
gem.files = Rake::FileList.new('lib/**/*.rb', '[A-Z]*')
|
28
|
+
gem.required_rubygems_version = ">1.3.6"
|
29
|
+
end
|
30
|
+
Jeweler::RubygemsDotOrgTasks.new
|
31
|
+
|
32
|
+
require 'rspec/core'
|
33
|
+
require 'rspec/core/rake_task'
|
34
|
+
RSpec::Core::RakeTask.new(:spec) do |spec|
|
35
|
+
spec.pattern = FileList['spec/**/*_spec.rb']
|
36
|
+
end
|
37
|
+
|
38
|
+
RSpec::Core::RakeTask.new(:rcov) do |spec|
|
39
|
+
spec.pattern = 'spec/**/*_spec.rb'
|
40
|
+
spec.rcov = true
|
41
|
+
end
|
42
|
+
|
43
|
+
task :default => :spec
|
44
|
+
|
45
|
+
require 'rake/rdoctask'
|
46
|
+
Rake::RDocTask.new do |rdoc|
|
47
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
48
|
+
|
49
|
+
rdoc.rdoc_dir = 'rdoc'
|
50
|
+
rdoc.title = "igo-ruby #{version}"
|
51
|
+
rdoc.rdoc_files.include('README*')
|
52
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
53
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
data/lib/igo-ruby.rb
ADDED
@@ -0,0 +1,187 @@
|
|
1
|
+
#require 'trie'
|
2
|
+
#require 'util'
|
3
|
+
#require 'nkf'
|
4
|
+
|
5
|
+
# Dictionary
|
6
|
+
|
7
|
+
# One candidate word in the lattice that the tagger searches.
#
# cost     - accumulated cost of the cheapest path ending here (starts at 0)
# prev     - previous node on that path (nil until the node is linked)
# word_id  - id used to look the word up in the word dictionary
# start    - index of the first code point of the word in the analysed text
# length   - number of code points covered by the word
# left_id  - id used on the left side of a connection-cost lookup
# right_id - id used on the right side of a connection-cost lookup
# is_space - true when the node was produced for a whitespace category
class ViterbiNode
  attr_accessor :cost, :prev, :word_id, :start, :length, :left_id, :right_id, :is_space

  def initialize(word_id, start, length, left_id, right_id, is_space)
    @cost = 0
    @prev = nil
    @word_id = word_id
    @start = start
    @length = length
    @left_id = left_id
    @right_id = right_id
    @is_space = is_space
  end

  # Builds the sentinel node used for both beginning and end of sentence:
  # every id and position is 0 and it is not a space node.
  def self.make_BOSEOS
    new(0, 0, 0, 0, 0, false)
  end
end
|
25
|
+
|
26
|
+
# Maps character codes to their Category and answers whether two character
# codes are compatible, using the "char.category" and "code2category" files
# found in the dictionary directory.
class CharCategory
  # data_dir - path of the directory holding the binary dictionary files.
  def initialize(data_dir)
    @categories = CharCategory.read_categories(data_dir)
    fmis = FileMappedInputStream.new(data_dir + "/code2category")
    begin
      # The file is split in two equal halves of 4-byte integers:
      # first the code -> category index table, then the compatibility masks.
      half = fmis.size / 4 / 2
      @char2id = fmis.get_int_array(half)
      @eql_masks = fmis.get_int_array(half)
    ensure
      # Close the stream even when a read fails.
      fmis.close
    end
  end

  # Returns the Category registered for the character code +code+.
  def category(code)
    @categories[@char2id[code]]
  end

  # True when the masks of the two character codes share at least one bit.
  def compatible?(code1, code2)
    (@eql_masks[code1] & @eql_masks[code2]) != 0
  end

  # Reads "char.category" as a flat integer array and turns every complete
  # group of four integers (id, length, invoke flag, group flag) into a
  # Category. The flags are true when the stored integer equals 1.
  def self.read_categories(data_dir)
    data = FileMappedInputStream::get_int_array(data_dir + "/char.category")
    (0...(data.size / 4)).map do |i|
      base = i * 4
      Category.new(data[base], data[base + 1], data[base + 2] == 1, data[base + 3] == 1)
    end
  end
end
|
54
|
+
|
55
|
+
# Value object describing one character category.
#
# id     - category id (also used as a trie id when searching unknown words)
# length - maximum number of characters grouped into one unknown word
# invoke - when false, unknown-word search is skipped if known words were found
# group  - when true, a whole run of compatible characters is also proposed
class Category
  attr_reader :id, :length, :invoke, :group

  # i  - category id
  # l  - length limit
  # iv - invoke flag
  # g  - group flag
  def initialize(i, l, iv, g)
    @id = i
    @length = l
    @invoke = iv
    @group = g
  end
end
|
65
|
+
|
66
|
+
# Connection-cost table loaded from "matrix.bin".
class Matrix
  # data_dir - path of the directory holding the binary dictionary files.
  #
  # The file starts with two integers (left size, right size) followed by
  # left_size * right_size 16-bit costs.
  def initialize(data_dir)
    fmis = FileMappedInputStream.new(data_dir + "/matrix.bin")
    begin
      @left_size = fmis.get_int
      @right_size = fmis.get_int
      @matrix = fmis.get_short_array(@left_size * @right_size)
    ensure
      # Close the stream even when a read fails.
      fmis.close
    end
  end

  # Returns the cost of connecting a node whose id is +left_id+ with one whose
  # id is +right_id+.
  # NOTE(review): the row stride is @right_size although the row is selected
  # by right_id; kept exactly as in the original - confirm against the
  # dictionary builder if the two sizes can ever differ.
  def link_cost(left_id, right_id)
    @matrix[right_id * @right_size + left_id]
  end
end
|
79
|
+
|
80
|
+
# Proposes lattice nodes for words that are not in the dictionary, based on
# the character category of the character at the search position.
class Unknown
  # data_dir - path of the directory holding the binary dictionary files.
  def initialize(data_dir)
    @category = CharCategory.new(data_dir)
    # Category id of the ASCII space; nodes of this category are flagged as spaces.
    @space_id = @category.category(' '.unpack("U*")[0]).id
  end

  # Appends unknown-word candidates starting at +start+ to +result+.
  #
  # text   - UTF-8 text being analysed
  # start  - code-point index at which candidates start
  # wdic   - object responding to search_from_trie_id (the word dictionary)
  # result - array of nodes already found at +start+; new nodes are added by
  #          wdic.search_from_trie_id
  def search(text, start, wdic, result)
    txt = text.unpack("U*")
    length = txt.size
    ch = txt[start]
    ct = @category.category(ch)

    # Known words were found and this category does not ask for unknown-word
    # processing in that case.
    return if !result.empty? && !ct.invoke

    is_space = (ct.id == @space_id)
    limit = [length, ct.length + start].min

    # Candidates of 1..ct.length characters, stopping at the first character
    # that is not compatible with the leading one.
    (start...limit).each do |i|
      wdic.search_from_trie_id(ct.id, start, (i - start) + 1, is_space, result)

      # Compare code points: indexing the String here would hand a byte or a
      # one-character String to the mask table instead of a character code.
      return if (i + 1) != limit && !@category.compatible?(ch, txt[i + 1])
    end

    return unless ct.group && limit < length

    # Grouping: propose the whole run of compatible characters as one word.
    (limit...length).each do |i|
      unless @category.compatible?(ch, txt[i])
        wdic.search_from_trie_id(ct.id, start, i - start, is_space, result)
        return
      end
    end
    wdic.search_from_trie_id(ct.id, start, length - start, is_space, result)
  end
end
|
132
|
+
|
133
|
+
# Word dictionary: finds the words that start at a given position of a text
# and exposes their cost and raw feature data.
class WordDic
  # data_dir - path of the directory holding the binary dictionary files
  #            ("word2id", "word.dat", "word.ary.idx" and "word.inf").
  def initialize(data_dir)
    @trie = Searcher.new(data_dir + "/word2id")
    @data = FileMappedInputStream.get_string(data_dir + "/word.dat")
    @indices = FileMappedInputStream.get_int_array(data_dir + "/word.ary.idx")

    fmis = FileMappedInputStream.new(data_dir + "/word.inf")
    begin
      # Each word occupies one 4-byte offset and three 2-byte values.
      word_count = fmis.size / (4 + 2 + 2 + 2)
      @data_offsets = fmis.get_int_array(word_count)
      @left_ids = fmis.get_short_array(word_count)
      @right_ids = fmis.get_short_array(word_count)
      @costs = fmis.get_short_array(word_count)
    ensure
      # Close the stream even when a read fails.
      fmis.close
    end
  end

  # Returns the cost stored for +word_id+.
  def cost(word_id)
    @costs[word_id]
  end

  # Pushes onto +result+ one ViterbiNode per dictionary word that is a prefix
  # of +text+ starting at +start+. The trie reports each match through the
  # callback as (start, length, trie_id).
  def search(text, start, result)
    on_match = lambda do |pos, offset, trie_id|
      (@indices[trie_id]...@indices[trie_id + 1]).each do |i|
        result.push(ViterbiNode.new(i, pos, offset, @left_ids[i], @right_ids[i], false))
      end
    end
    @trie.each_common_prefix(text, start, on_match)
  end

  # Pushes onto +result+ one ViterbiNode for every word registered under
  # +trie_id+, using the given start, length and space flag.
  def search_from_trie_id(trie_id, start, word_length, is_space, result)
    (@indices[trie_id]...@indices[trie_id + 1]).each do |i|
      result.push(ViterbiNode.new(i, start, word_length, @left_ids[i], @right_ids[i], is_space))
    end
  end

  # Returns the raw bytes stored for +word_id+. Offsets are doubled before
  # use, i.e. they count 2-byte units of the data file.
  def word_data(word_id)
    first = @data_offsets[word_id] * 2
    last = @data_offsets[word_id + 1] * 2
    # start/length form: an empty entry at offset 0 yields "" instead of the
    # whole data block (a 0..-1 range would wrap around).
    @data.slice(first, last - first)
  end
end
|
187
|
+
|
data/lib/igo/tagger.rb
ADDED
@@ -0,0 +1,144 @@
|
|
1
|
+
require 'igo/dictionary'
|
2
|
+
require 'igo/trie'
|
3
|
+
|
4
|
+
module Igo

  # One morpheme of an analysed text.
  #
  # surface - the morpheme as it appears in the text
  # feature - dictionary data of the word, converted to UTF-8
  # start   - code-point index of the morpheme in the text
  class Morpheme
    attr_accessor :surface, :feature, :start

    def initialize(surface, feature, start)
      @surface = surface
      @feature = feature
      @start = start
    end
  end

  # Performs morphological analysis.
  class Tagger
    # Returns a fresh one-element list holding the beginning-of-sentence node.
    def self.__BOS_NODES
      [ViterbiNode.make_BOSEOS]
    end

    # dir - path of the directory holding the binary dictionary files.
    def initialize(dir)
      @wdc = WordDic.new(dir)
      @unk = Unknown.new(dir)
      @mtx = Matrix.new(dir)
    end

    # Analyses +text+ and appends one Morpheme per word to +result+, which is
    # returned. Relies on NKF being loaded by the caller to convert the
    # dictionary data from UTF-16 to UTF-8.
    def parse(text, result=[])
      vn = impl(text, result)
      txt = text.unpack("U*")
      while vn
        surface = txt.slice(vn.start, vn.length).pack("U*")
        feature = NKF.nkf('-W16L0 --utf8', @wdc.word_data(vn.word_id))
        result.push(Morpheme.new(surface, feature, vn.start))
        vn = vn.prev
      end
      result
    end

    # Splits +text+ into words and appends each surface string to +result+,
    # which is returned.
    def wakati(text, result=[])
      vn = impl(text, result)
      txt = text.unpack("U*")
      while vn
        result.push(txt.slice(vn.start, vn.length).pack("U*"))
        vn = vn.prev
      end
      result
    end

    private

    # Builds the word lattice for +text+, picks the cheapest path and returns
    # its first node; following +prev+ from there walks the words in text
    # order. Returns nil for an empty text. +result+ is accepted for
    # interface compatibility and not used here.
    def impl(text, result=[])
      len = text.unpack("U*").size

      # node_ary[i] holds the nodes of all candidate words ending at position i.
      node_ary = [Tagger.__BOS_NODES]
      len.times { node_ary.push([]) }

      (0...len).each do |i|
        prevs = node_ary[i]
        # No word ends here, so no word can start here either.
        next if prevs.empty?

        per_result = []
        @wdc.search(text, i, per_result)
        @unk.search(text, i, @wdc, per_result)

        per_result.each do |vn|
          if vn.is_space
            # Whitespace is skipped: the words before it become predecessors
            # of whatever follows it. They are appended rather than assigned,
            # so candidates already ending at that position are kept and the
            # predecessor array is not shared between two positions.
            node_ary[i + vn.length].concat(prevs)
          else
            node_ary[i + vn.length].push(set_min_cost_node(vn, prevs))
          end
        end
      end

      cur = set_min_cost_node(ViterbiNode.make_BOSEOS, node_ary[len]).prev

      # The best path is linked from the end back to the start; reverse the
      # links in place, dropping the beginning-of-sentence node.
      head = nil
      while cur.prev
        tmp = cur.prev
        cur.prev = head
        head = cur
        cur = tmp
      end
      head
    end

    # Links +vn+ to the cheapest node of +prevs+ (path cost plus connection
    # cost), stores the resulting total including the word's own cost in
    # +vn.cost+, and returns +vn+. +prevs+ must not be empty.
    def set_min_cost_node(vn, prevs)
      first = vn.prev = prevs[0]
      vn.cost = first.cost + @mtx.link_cost(first.right_id, vn.left_id)

      prevs[1..-1].each do |candidate|
        cost = candidate.cost + @mtx.link_cost(candidate.right_id, vn.left_id)
        if cost < vn.cost
          vn.cost = cost
          vn.prev = candidate
        end
      end
      vn.cost += @wdc.cost(vn.word_id)
      vn
    end
  end

end
|
data/lib/igo/trie.rb
ADDED
@@ -0,0 +1,203 @@
|
|
1
|
+
require 'igo/util'
|
2
|
+
|
3
|
+
# Core extension used by KeyStream.
class String
  # True when the receiver begins with +prefix+ (converted with to_s, so nil
  # behaves like the empty string and always matches).
  def starts_with?(prefix)
    prefix = prefix.to_s
    self[0, prefix.length] == prefix
  end
end
|
9
|
+
|
10
|
+
# Helpers and constants for the BASE and CHECK arrays of the double-array trie.
class Node
  class Base
    # Decodes a negative BASE value into the non-negative id it stores
    # (-1 -> 0, -2 -> 1, ...).
    def self.ids(nid)
      (-1 * nid) - 1
    end
  end

  class Chck
    # Code read at the end of a key; also marks a terminal slot in CHECK.
    TERMINATE_CODE = 0
    TERMINATE_CHAR = TERMINATE_CODE.chr
    VACANT_CODE = 1
    CODE_LIMIT = 0xffff
  end
end
|
24
|
+
|
25
|
+
# Reads a UTF-8 key one code point at a time while walking the trie.
class KeyStream

  # key   - UTF-8 string to read
  # start - code-point index at which reading begins
  #
  # The key is decoded once here; it is expected not to be modified while the
  # stream is in use.
  def initialize(key, start = 0)
    @s = key
    @cur = start
    @codes = key.unpack("U*")
    @len = @codes.size
  end

  # Compares the unread parts of two streams; returns -1, 0 or 1.
  def compare_to(ks)
    rest <=> ks.rest
  end

  # Checks the character at the current position against the part of
  # +prefix+ selected by +beg+ and +len+. Returns false when fewer than +len+
  # characters are left.
  # NOTE(review): only a single character of the key is compared and the
  # slice length is len - beg; this looks like an incomplete port of a
  # multi-character comparison. Behaviour is kept as is - confirm against
  # the layout of the string passed as +prefix+.
  def start_with(prefix, beg, len)
    return false if @len - @cur < len

    word = @codes[@cur]
    if word.nil?
      prefix.slice(beg, len - beg) == nil
    else
      [word].pack("U*").starts_with?(prefix.slice(beg, len - beg))
    end
  end

  # Returns the part of the key from the current position onwards.
  # NOTE(review): @cur counts code points while String#slice counts bytes on
  # Ruby 1.8 - presumably intended for character-indexed strings; verify.
  def rest
    @s.slice(@cur, @s.length)
  end

  # Returns the next code point and advances, or Node::Chck::TERMINATE_CODE
  # once the end of the key has been reached.
  def read
    return Node::Chck::TERMINATE_CODE if eos?

    code = @codes[@cur]
    @cur += 1
    code
  end

  # True when every code point has been read.
  def eos?
    @cur == @len
  end
end
|
83
|
+
|
84
|
+
# Class for searching the DoubleArray trie
|
85
|
+
# Looks keys up in a double-array trie loaded from a binary file.
class Searcher
  # path - file holding, in this order: node count, tail-index count and tail
  #        size (one integer each), then the BEGS, BASE, LENS, CHECK arrays
  #        and the TAIL string.
  def initialize(path)
    fmis = FileMappedInputStream.new(path)
    begin
      node_size = fmis.get_int
      tind_size = fmis.get_int
      tail_size = fmis.get_int
      @key_set_size = tind_size
      @begs = fmis.get_int_array(tind_size)
      @base = fmis.get_int_array(node_size)
      @lens = fmis.get_short_array(tind_size)
      @chck = fmis.get_char_array(node_size)
      @tail = fmis.get_string(tail_size)
    ensure
      # Close the stream even when a read fails.
      fmis.close
    end
  end

  # Number of tail-index entries read from the file.
  def size
    @key_set_size
  end

  # Returns the id stored for +key+, or -1 when the key is not in the trie.
  def search(key)
    node = @base[0]
    kin = KeyStream.new(key)

    loop do
      code = kin.read
      idx = node + code
      node = @base[idx]

      # The slot belongs to another key: report a miss. (Without this the
      # loop kept reading past the end of the key and never returned.)
      return -1 unless @chck[idx] == code

      next if node >= 0

      # A negative BASE value encodes the key id; the rest of the key, if
      # any, has to match the tail stored for it.
      return Node::Base.ids(node) if kin.eos? || key_exists?(kin, node)
      return -1
    end
  end

  # Calls +callback+.call(start, length, id) for keys of the trie that are a
  # prefix of +key+ read from code-point index +start+. Returns nil.
  def each_common_prefix(key, start, callback)
    node = @base[0]
    offset = -1
    kin = KeyStream.new(key, start)

    loop do
      code = kin.read
      offset += 1

      # TERMINATE_CODE is 0, so the terminal slot of a node is the node itself.
      if @chck[node] == Node::Chck::TERMINATE_CODE
        callback.call(start, offset, Node::Base.ids(@base[node]))
        return if code == Node::Chck::TERMINATE_CODE
      end

      idx = node + code
      node = @base[idx]

      return unless @chck[idx] == code
      next if node >= 0

      call_if_key_including(kin, node, start, offset, callback)
      return
    end
  end

  private

  # Reports the key encoded by the negative +node+ when the stream accepts
  # its tail via KeyStream#start_with.
  def call_if_key_including(kin, node, start, offset, callback)
    node_id = Node::Base.ids(node)
    if kin.start_with(@tail, @begs[node_id], @lens[node_id])
      callback.call(start, offset + @lens[node_id] + 1, node_id)
    end
  end

  # True when the unread part of +kin+ equals the tail slice taken for the
  # key encoded by the negative +node+.
  # NOTE(review): the second argument of String#slice is a length, yet
  # beg + @lens[nid] is passed, and @tail is read as raw 2-byte units while
  # the key is UTF-8. Kept unchanged apart from the constant lookup - confirm
  # the intended units before relying on tail matching.
  def key_exists?(kin, node)
    nid = Node::Base.ids(node)
    beg = @begs[nid]
    s = @tail.slice(beg, beg + @lens[nid])
    kin.rest == s
  end
end
|
203
|
+
|
data/lib/igo/util.rb
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
# Utilities
|
2
|
+
|
3
|
+
# Sequential reader for the binary dictionary files. Values are decoded with
# the platform's native byte order.
class FileMappedInputStream
  # Opens +path+ for binary reading. File.open is used so that the path is
  # never interpreted as a command, and read-only mode means dictionary files
  # do not need to be writable.
  def initialize(path)
    @path = path
    @cur = 0
    @file = File.open(path, "rb")
  end

  # Reads one native integer.
  def get_int
    @file.read(4).unpack("i*")[0]
  end

  # Reads +count+ native integers (4 bytes each are requested).
  def get_int_array(count)
    @file.read(count * 4).unpack("i*")
  end

  # Reads the whole file at +path+ as an array of integers.
  def self.get_int_array(path)
    fmis = new(path)
    begin
      fmis.get_int_array(File.size(path) / 4)
    ensure
      fmis.close
    end
  end

  # Reads +count+ signed 16-bit values.
  def get_short_array(count)
    @file.read(count * 2).unpack("s*")
  end

  # Reads +count+ unsigned short values.
  def get_char_array(count)
    @file.read(count * 2).unpack("S!*")
  end

  # Reads +count+ 2-byte units and returns them undecoded as a byte string.
  def get_string(count)
    @file.read(count * 2)
  end

  # Reads the whole file at +path+ as an undecoded byte string.
  def self.get_string(path)
    fmis = new(path)
    begin
      fmis.get_string(File.size(path) / 2)
    ensure
      fmis.close
    end
  end

  # Size of the underlying file in bytes.
  def size
    File.size(@path)
  end

  # Closes the underlying file.
  def close
    @file.close
  end

  # Reads the whole file at +path+ as an array of unsigned short values.
  def self.get_char_array(path)
    fmis = new(path)
    begin
      fmis.get_char_array(fmis.size / 2)
    ensure
      fmis.close
    end
  end

  private

  # Reads +size+ bytes at the position tracked in @cur and advances it.
  # Not used by the public readers, which rely on the file's own position.
  def __map(size)
    @file.pos = @cur
    @cur += size
    @file.read(size)
  end
end
|
74
|
+
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
2
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
3
|
+
require 'rspec'
|
4
|
+
require 'igo-ruby'
|
5
|
+
|
6
|
+
# Requires supporting files with custom matchers and macros, etc,
|
7
|
+
# in ./support/ and its subdirectories.
|
8
|
+
Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
|
9
|
+
|
10
|
+
RSpec.configure do |config|
|
11
|
+
|
12
|
+
end
|
data/test/test.rb
ADDED
metadata
ADDED
@@ -0,0 +1,146 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: igo-ruby
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 27
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 0
|
10
|
+
version: 0.1.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- K.Nishi
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2010-12-11 00:00:00 +09:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
prerelease: false
|
23
|
+
version_requirements: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - ~>
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
hash: 11
|
29
|
+
segments:
|
30
|
+
- 2
|
31
|
+
- 1
|
32
|
+
- 0
|
33
|
+
version: 2.1.0
|
34
|
+
name: rspec
|
35
|
+
requirement: *id001
|
36
|
+
type: :development
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
prerelease: false
|
39
|
+
version_requirements: &id002 !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ~>
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
hash: 23
|
45
|
+
segments:
|
46
|
+
- 1
|
47
|
+
- 0
|
48
|
+
- 0
|
49
|
+
version: 1.0.0
|
50
|
+
name: bundler
|
51
|
+
requirement: *id002
|
52
|
+
type: :development
|
53
|
+
- !ruby/object:Gem::Dependency
|
54
|
+
prerelease: false
|
55
|
+
version_requirements: &id003 !ruby/object:Gem::Requirement
|
56
|
+
none: false
|
57
|
+
requirements:
|
58
|
+
- - ~>
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
hash: 1
|
61
|
+
segments:
|
62
|
+
- 1
|
63
|
+
- 5
|
64
|
+
- 1
|
65
|
+
version: 1.5.1
|
66
|
+
name: jeweler
|
67
|
+
requirement: *id003
|
68
|
+
type: :development
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
prerelease: false
|
71
|
+
version_requirements: &id004 !ruby/object:Gem::Requirement
|
72
|
+
none: false
|
73
|
+
requirements:
|
74
|
+
- - ">="
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
hash: 3
|
77
|
+
segments:
|
78
|
+
- 0
|
79
|
+
version: "0"
|
80
|
+
name: rcov
|
81
|
+
requirement: *id004
|
82
|
+
type: :development
|
83
|
+
description: Ruby port of Igo Japanese morphological analyzer.
|
84
|
+
email: 24signals@gmail.com
|
85
|
+
executables: []
|
86
|
+
|
87
|
+
extensions: []
|
88
|
+
|
89
|
+
extra_rdoc_files:
|
90
|
+
- LICENSE.txt
|
91
|
+
- README.rdoc
|
92
|
+
files:
|
93
|
+
- Gemfile
|
94
|
+
- Gemfile.lock
|
95
|
+
- LICENSE.txt
|
96
|
+
- README.rdoc
|
97
|
+
- Rakefile
|
98
|
+
- VERSION
|
99
|
+
- lib/igo-ruby.rb
|
100
|
+
- lib/igo/dictionary.rb
|
101
|
+
- lib/igo/tagger.rb
|
102
|
+
- lib/igo/trie.rb
|
103
|
+
- lib/igo/util.rb
|
104
|
+
- spec/igo-ruby_spec.rb
|
105
|
+
- spec/spec_helper.rb
|
106
|
+
- test/test.rb
|
107
|
+
has_rdoc: true
|
108
|
+
homepage: http://github.com/kyow/igo-ruby
|
109
|
+
licenses:
|
110
|
+
- MIT
|
111
|
+
post_install_message:
|
112
|
+
rdoc_options: []
|
113
|
+
|
114
|
+
require_paths:
|
115
|
+
- lib
|
116
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
117
|
+
none: false
|
118
|
+
requirements:
|
119
|
+
- - ">="
|
120
|
+
- !ruby/object:Gem::Version
|
121
|
+
hash: 3
|
122
|
+
segments:
|
123
|
+
- 0
|
124
|
+
version: "0"
|
125
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
126
|
+
none: false
|
127
|
+
requirements:
|
128
|
+
- - ">"
|
129
|
+
- !ruby/object:Gem::Version
|
130
|
+
hash: 23
|
131
|
+
segments:
|
132
|
+
- 1
|
133
|
+
- 3
|
134
|
+
- 6
|
135
|
+
version: 1.3.6
|
136
|
+
requirements: []
|
137
|
+
|
138
|
+
rubyforge_project:
|
139
|
+
rubygems_version: 1.3.7
|
140
|
+
signing_key:
|
141
|
+
specification_version: 3
|
142
|
+
summary: Ruby port of Igo Japanese morphological analyzer.
|
143
|
+
test_files:
|
144
|
+
- spec/igo-ruby_spec.rb
|
145
|
+
- spec/spec_helper.rb
|
146
|
+
- test/test.rb
|