okura 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
# Top-level namespace of the okura gem.
module Okura
  # Gem version string. Frozen so that the shared constant cannot be
  # mutated in place by accident.
  Version = '0.0.0'.freeze
end
@@ -0,0 +1,216 @@
1
+ module Okura
2
+ module WordDic
3
# Word dictionary implemented as a plain hash-based trie keyed on the
# characters of each word's surface.
class Naive
  # One trie node: child nodes keyed by character plus the words whose
  # surface ends exactly at this node.
  class TrieNode
    def initialize
      @nodes = {}
      @leafs = []
    end

    # Inserts +word+, consuming word.surface from character index +i+ on.
    def add(word, i = 0)
      surface = word.surface
      return @leafs.push(word) if i == surface.length

      key = surface[i]
      child = (@nodes[key] ||= TrieNode.new)
      child.add(word, i + 1)
    end

    # Appends to +res+ every word stored on the path spelled by +str+
    # starting at index +i+ (this node's own words included), and
    # returns +res+.
    def find_all(str, i, res = [])
      res.concat(@leafs)
      child = i < str.length ? @nodes[str[i]] : nil
      child.find_all(str, i + 1, res) unless child.nil?
      res
    end
  end

  def initialize
    @size = 0
    @root = TrieNode.new
  end

  # Number of words registered through #define.
  attr_reader :size

  # Registers +word+ (anything responding to #surface).
  def define(word)
    @size += 1
    @root.add(word)
  end

  # -> [Word]
  # Words whose surface is a prefix of str[i..-1].
  def possible_words(str, i)
    @root.find_all(str, i)
  end

  # Same value as #size.
  def word_size
    @size
  end
end
45
# Word dictionary backed by a double-array trie built over the bytes of
# each word's surface.
class DoubleArray
  # Words -> [Integer] -> [Integer]
  #
  # words : object responding to #group(id) and #word_size
  # base  : double-array BASE table
  # check : double-array CHECK table (parent node id of each cell)
  def initialize(words, base, check)
    @words, @base, @check = words, base, check
  end

  # Walks the trie along the bytes of str[i..-1] and returns, flattened
  # one level, @words.group(id) for every data id met on the way.
  # Raises RuntimeError if a data cell holds a non-negative value.
  def possible_words(str, i)
    ret = []
    cur = 1 # root node id
    str[i..-1].bytes.each { |c|
      next_index = @base[cur] + c + 1
      break unless @check[next_index] == cur
      cur = next_index
      # check EOS node: the data cell sits at offset +0 of this node's base
      eos_index = @base[cur]
      if @check[eos_index] == cur
        raise "@base[#{eos_index}] should < 0 but #{@base[eos_index]}" unless @base[eos_index] < 0
        ret.push(-@base[eos_index] - 1)
      end
    }
    ret.map { |x| @words.group(x) }.flatten(1)
  end

  # Delegates to the underlying words object.
  def word_size
    @words.word_size
  end

  # Collects words into a byte trie and turns it into a DoubleArray.
  class Builder
    # Mutable BASE/CHECK tables filled from a Builder::Node tree.
    class DAData
      def initialize(root)
        # offset |    +0      |    +1    |   +2     | ...
        # data   | -data_id-1 | child(0) | child(1) | ...
        #
        # base[0]  = -last free cell
        # check[0] = -first free cell
        # 1        = root node id
        #
        # For a free cell i: base[i] = -previous free cell,
        # check[i] = -next free cell (0 meaning "none").
        @base = [0, 0]
        @check = [0, 0]
        @length = 2
        @base[1] = construct!(root)
      end
      attr_reader :base
      attr_reader :check
      attr_reader :length

      # Lays out +node+ (whose own cell id is +parent+) and, recursively,
      # its children. Returns the offset s to be stored as base[parent].
      def construct!(node, parent = 1)
        # base[parent_node_id] should == s
        # -base[s+0] : data id
        # s+1+c      : child node id for char c
        # check[m]   : parent node id for node m
        s = find_free_space_for(node)
        if node.has_data?
          alloc!(s, parent)
          @base[s] = -node.data_id - 1
        end
        # Reserve every child cell first so that the recursive calls
        # below cannot claim them.
        node.children.each { |c, _cn| alloc!(child_index(s, c), parent) }
        node.children.each { |c, cn|
          idx = child_index(s, c)
          @base[idx] = construct!(cn, idx)
        }
        s
      end

      def child_index(base, c)
        base + c + 1
      end

      # Takes cell +index+ out of the free list and marks it as owned by
      # +parent+. Raises (via assert) if the cell is not free.
      def alloc!(index, parent)
        assert index > 0
        assert free?(index)
        if length <= index
          expand!(index + 1)
        end
        assert has_free_cell?

        prev_free = -@base[index]
        next_free = -@check[index]
        @base[next_free] = -prev_free
        @check[prev_free] = -next_free
        @base[index] = 0 # dummy value
        @check[index] = parent
        assert !free?(index)
      end

      # Grows the tables to +size+ cells, appending each new cell to the
      # tail of the free list. No-op when already large enough.
      def expand!(size)
        if size <= length
          return
        end
        (length...size).each { |i|
          if has_free_cell?
            @base[i] = @base[0]
            @check[i] = 0
            @check[-@base[0]] = -i
            @base[0] = -i
          else
            @base[i] = 0
            @check[i] = 0
            @base[0] = -i
            @check[0] = -i
          end
        }
        @length = size
      end

      def free?(index)
        length <= index || @check[index] <= 0
      end

      # Returns an offset s > 0 such that every cell +node+ needs is free,
      # or 0 when the node needs no cells at all.
      def find_free_space_for(node)
        alloc_indexes = node.children.keys.map { |c| c + 1 }
        alloc_indexes += [0] if node.has_data?
        return 0 if alloc_indexes.empty?
        min = alloc_indexes.min
        i = -@check[0]
        while i != 0
          assert free?(i)
          if 0 < i - min && alloc_indexes.all? { |idx| free?(idx + i - min) }
            return i - min
          end
          i = -@check[i]
        end
        # free space not found: place the node past the current end
        return [length - min, 1].max
      end

      def has_free_cell?
        @base[0] != 0
      end

      def assert(cond)
        raise unless cond
      end
    end

    # Node of the intermediate byte trie.
    class Node
      def initialize
        @data_id = nil
        @children = {}
      end
      attr_reader :data_id
      attr_reader :children

      def has_data?
        !!data_id
      end

      # Stores +data_id+ at the end of the path bytes[idx..-1].
      def add(bytes, idx, data_id)
        if idx == bytes.length
          @data_id = data_id
        else
          c = bytes[idx]
          (@children[c] ||= Node.new).add(bytes, idx + 1, data_id)
        end
      end
    end

    def initialize
      @root = Node.new
      @words = Okura::Words::Builder.new
    end

    # Registers +word+; its group id becomes the data id stored in the trie.
    def define(word)
      word_group_id = @words.add word
      key = word.surface.bytes.to_a
      @root.add key, 0, word_group_id
    end

    # -> WordDic::DoubleArray
    # The tables are constructed once, inside #data_for_serialize.
    def build
      DoubleArray.new(*data_for_serialize)
    end

    # -> [ Words, [Integer], [Integer] ]
    def data_for_serialize
      da = DAData.new @root
      [@words.build, da.base, da.check]
    end

    # [ Words, [Integer], [Integer] ] -> WordDic::DoubleArray
    # Writes nothing to stdout.
    def self.build_from_serialized(data)
      words, base, check = data
      DoubleArray.new words, base, check
    end
  end
end
215
+ end
216
+ end
data/lib/okura.rb ADDED
@@ -0,0 +1,388 @@
1
+ # coding: utf-8
2
+ require 'okura/word_dic'
3
+
4
+ module Okura
5
# Builds a word lattice for a string from a dictionary and a connection
# matrix, and extracts the minimum-cost segmentation from it.
class Tagger
  # dic : object responding to possible_words(str, i)
  # mat : connection-cost matrix handed to every Nodes lattice
  def initialize(dic, mat)
    @dic, @mat = dic, mat
  end
  attr_reader :dic
  attr_reader :mat

  # -> [String] | nil
  # Surfaces of the nodes on the minimum-cost path of +str+, or nil when
  # the lattice reports no path.
  # +mat+ is never read (the matrix given to #initialize is used); it is
  # kept, now optional, only so existing two-argument calls keep working.
  def wakati(str, mat = nil)
    mincost_path = parse(str).mincost_path
    return nil if mincost_path.nil?
    mincost_path.map { |node| node.word.surface }
  end

  # -> Nodes
  # Lattice with a BOS/EOS node at position 0 and at position
  # chars.length+1, and one node at i+1 for every dictionary word
  # starting at character i.
  def parse(str)
    chars = str.split(//)
    nodes = Nodes.new(chars.length + 2, @mat)
    nodes.add(0, Node.mk_bos_eos)
    nodes.add(chars.length + 1, Node.mk_bos_eos)
    str.length.times { |i|
      @dic.possible_words(str, i).each { |w|
        nodes.add(i + 1, Node.new(w))
      }
    }
    nodes
  end
end
31
# Word lattice: for every position it records the nodes that begin
# there and the nodes that end there.
class Nodes
  # len : number of positions in the lattice
  # mat : object responding to cost(right_id, left_id) -> Integer | nil
  def initialize(len, mat)
    @mat = mat
    @begins = (0...len).map { [] }
    @ends = (0...len).map { [] }
  end

  # Nodes beginning at position +i+.
  def [](i)
    @begins[i]
  end

  def length
    @begins.length
  end

  # -> [Node] | nil
  # Minimum-cost chain of nodes ending at the first node of the last
  # position, in left-to-right order. Returns [] for an empty lattice
  # and nil when the last position has no node or that node cannot be
  # reached from position 0 (disconnected lattice).
  # Overwrites total_cost / nearest_prev on the nodes it visits.
  def mincost_path
    return [] if length == 0
    # calc cost
    self[0].each { |n|
      n.total_cost = n.word.cost
      n.nearest_prev = nil
    }
    (1...length).each { |i|
      prevs = @ends[i - 1]
      curs = @begins[i]
      prevs.each { |prev|
        # Nodes that dead-end partway never get a total_cost assigned.
        next if prev.total_cost.nil?
        curs.each { |cur|
          join_cost = @mat.cost(prev.word.right.id, cur.word.left.id)
          # a nil connection cost means the two nodes cannot be joined
          next if join_cost.nil?
          cost = prev.total_cost + join_cost + cur.word.cost
          if !cur.total_cost || cost < cur.total_cost
            cur.total_cost = cost
            cur.nearest_prev = prev
          end
        }
      }
    }
    # disconnected: nothing to walk back from
    goal = self[-1][0]
    return nil if goal.nil? || goal.total_cost.nil?
    # calc mincost path by following nearest_prev back to the start
    ret = []
    cur = goal
    until cur.nil?
      ret.push cur
      cur = cur.nearest_prev
    end
    ret.reverse
  end

  # Registers +node+ as beginning at +i+ and ending at i+node.length-1.
  def add(i, node)
    @begins[i].push node
    @ends[i + node.length - 1].push node
  end
end
85
# Lattice node: wraps one word and carries the best cost and
# back-pointer filled in while searching the lattice.
class Node
  # Builds a sentinel node around a 'BOS/EOS' word; its #length is
  # fixed at 1 regardless of the surface.
  def self.mk_bos_eos
    feature = Features::BOS_EOS
    sentinel = Node.new(Word.new('BOS/EOS', feature, feature, 0))
    sentinel.define_singleton_method(:length) { 1 }
    sentinel
  end

  def initialize(word)
    @word = word
    @nearest_prev = nil
    @total_cost = nil
  end

  # The wrapped word.
  attr_reader :word
  # Predecessor on the cheapest known path (nil until assigned).
  attr_accessor :nearest_prev
  # Cost of the cheapest known path up to this node (nil until assigned).
  attr_accessor :total_cost

  # Number of characters in the word's surface.
  def length
    word.surface.length
  end

  def to_s
    "Node(#{word},#{total_cost})"
  end
end
107
# Column-oriented word store: words are kept as parallel arrays indexed
# by word id and grouped by surface.
class Words
  # Many strings packed into one string plus an array of start offsets.
  class CompactStringArray
    # str     : concatenation of every stored string
    # indices : start offset of each string inside +str+
    def initialize(str, indices)
      @str = str
      @indices = indices
    end

    # String stored under +id+; raises 'bad id' when +id+ is not below
    # the number of stored strings.
    def get(id)
      count = @indices.length
      raise 'bad id' if id >= count
      start = @indices[id]
      stop = id + 1 < count ? @indices[id + 1] : @str.length
      @str[start...stop]
    end

    def [](id)
      get(id)
    end

    # Accumulates strings and produces a CompactStringArray.
    class Builder
      def initialize
        @indices = []
        @surfaces = []
        @size = 0
      end

      def build
        Okura::Words::CompactStringArray.new(@surfaces.join(''), @indices)
      end

      # Appends +surface+ and returns the id assigned to it.
      def add(surface)
        new_id = @indices.length
        @indices << @size
        @surfaces << surface
        @size += surface.size
        new_id
      end
    end
  end

  # Accumulates Word objects and produces a Words instance.
  class Builder
    def initialize
      # group id -> [word id]
      @groups = []
      @next_group_id = 0
      # surface -> group id
      @group_ids = {}
      @surfaces = Okura::Words::CompactStringArray::Builder.new
      @left_features = Features.new
      @right_features = Features.new
      @surface_ids = []
      @left_ids = []
      @right_ids = []
      @costs = []
    end

    # Stores +word+ and returns the id of the group shared by every
    # word with the same surface.
    def add(word)
      surface = word.surface
      known = @group_ids.has_key?(surface)
      gid = known ? @group_ids[surface] : add_group!(surface)
      wid = add_word!(gid, word)
      if known
        @groups[gid].push(wid)
      else
        @group_ids[surface] = gid
        @groups[gid] = [wid]
      end
      gid
    end

    def build
      Okura::Words.new(
        @groups,
        @surfaces.build,
        @left_features,
        @right_features,
        @surface_ids,
        @left_ids,
        @right_ids,
        @costs
      )
    end

    private

    # The group id is the id the surface receives in the string store.
    def add_group!(surface)
      @surfaces.add(surface)
    end

    # Appends one row to every per-word column; returns the new word id.
    def add_word!(group_id, word)
      new_wid = @surface_ids.length
      left = word.left
      @surface_ids.push(group_id)
      @left_ids.push(left.id)
      @right_ids.push(word.right.id)
      @left_features.add(left.id, left.text)
      @right_features.add(word.right.id, word.right.text)
      @costs.push(word.cost)
      new_wid
    end
  end

  def initialize(groups, surfaces, left_features, right_features, surface_ids, left_ids, right_ids, costs)
    # group id -> [word id]
    @groups = groups
    @surfaces = surfaces
    @left_features = left_features
    @right_features = right_features
    @surface_ids = surface_ids
    @left_ids = left_ids
    @right_ids = right_ids
    @costs = costs
  end

  # -> [Word]
  # Rebuilds a Word for every word id in group +group_id+.
  def group(group_id)
    @groups[group_id].map do |wid|
      surface = @surfaces[@surface_ids[wid]]
      left = @left_features[@left_ids[wid]]
      right = @right_features[@right_ids[wid]]
      Word.new(surface, left, right, @costs[wid])
    end
  end

  # Total number of word ids over all groups.
  def word_size
    @groups.map { |ids| ids.size }.inject(0) { |total, n| total + n }
  end
end
215
# Dictionary entry: a surface string, its left and right features and
# a cost.
class Word
  # Raises RuntimeError ("bad feature: ...") when +left+ does not
  # respond to #text.
  def initialize(surface, left, right, cost)
    raise "bad feature: #{left.inspect}" unless left.respond_to? :text
    @surface, @left, @right, @cost = surface, left, right, cost
  end
  # String
  attr_reader :surface
  # Feature
  attr_reader :left
  # Feature
  attr_reader :right
  # Integer
  attr_reader :cost

  # Two words are equal when surface, left, right and cost are all
  # equal. Objects lacking any of those readers compare as not equal
  # instead of raising NoMethodError.
  def ==(other)
    return false unless [:surface, :left, :right, :cost].all? { |m| other.respond_to?(m) }
    [surface, left, right, cost] ==
      [other.surface, other.left, other.right, other.cost]
  end
  # Hash, Array#uniq etc. compare keys with #eql?, so it has to agree
  # with the #hash override below.
  alias eql? ==

  def hash
    [surface, left, right, cost].hash
  end

  def to_s
    "Word(#{surface},#{left.id},#{right.id},#{cost})"
  end
end
239
# Feature attached to a word: a numeric id plus descriptive text.
class Feature
  def initialize(id, text)
    @id, @text = id, text
  end
  attr_reader :id
  attr_reader :text

  def to_s
    "Feature(#{id},#{text})"
  end

  # Features are identified by id alone; the text is ignored. Objects
  # without an #id reader compare as not equal instead of raising
  # NoMethodError.
  def ==(other)
    other.respond_to?(:id) && id == other.id
  end
  # Keep #eql? consistent with the #hash override so that features
  # behave as Hash keys and under Array#uniq.
  alias eql? ==

  def hash
    id.hash
  end
end
255
# Registry of Feature objects looked up by id.
class Features
  def initialize
    @map_id = {}
  end

  # Integer -> Feature
  # Feature registered under +id+, or nil when there is none.
  def from_id(id)
    @map_id[id]
  end

  # Same lookup as #from_id.
  def [](id)
    from_id(id)
  end

  # Registers (or replaces) the feature for +id+ and returns it.
  def add(id, text)
    feature = Feature.new(id, text)
    @map_id[id] = feature
  end

  # Number of distinct ids registered.
  def size
    @map_id.size
  end

  # Feature with id 0 and text 'BOS/EOS'.
  BOS_EOS = Feature.new(0, 'BOS/EOS')
end
274
# Combines a known-word dictionary with an unknown-word dictionary.
class Dic
  attr_reader :word_dic
  attr_reader :unk_dic

  def initialize(word_dic, unk_dic)
    @word_dic = word_dic
    @unk_dic = unk_dic
  end

  # -> [Word]
  # Candidates from the word dictionary followed by those from the
  # unknown-word dictionary, which is told whether the word dictionary
  # found anything. The array returned by the word dictionary is
  # extended in place.
  def possible_words(str, i)
    candidates = @word_dic.possible_words(str, i)
    found = !candidates.empty?
    candidates.concat(@unk_dic.possible_words(str, i, found))
    candidates
  end
end
287
# Generates candidate words for text not covered by the normal
# dictionary, from per-character-type word templates.
class UnkDic
  # char_types : object responding to type_for(codepoint) and named(name)
  def initialize(char_types)
    @char_types = char_types
    # CharType.name => [Word]
    @templates = {}
  end

  # -> [Word]
  # Candidates starting at character +i+ of +str+, all derived from the
  # character type of str[i]. Returns [] when the normal dictionary
  # already matched and that type is not invoke?.
  def possible_words(str, i, found_in_normal_dic)
    type = @char_types.type_for(str[i].ord)
    return [] if found_in_normal_dic && !type.invoke?

    found = []
    # the single first character
    collect_result(found, type, str[i..i]) if type.length > 0

    # grow the run while following characters are accepted by the type,
    # emitting each prefix no longer than type.length
    run = 1
    str[(i + 1)..-1].each_codepoint do |cp|
      break unless type.accept?(cp)
      run += 1
      collect_result(found, type, str[i...(i + run)]) if run <= type.length
    end
    # group types also emit the whole run when it exceeds type.length
    collect_result(found, type, str[i...(i + run)]) if type.group? && run > type.length

    found
  end

  # String -> Feature -> Feature -> Integer ->
  # Adds a template for +type_name+; the lookup through named fails for
  # an unknown type name.
  def define(type_name, left, right, cost)
    @char_types.named(type_name)
    template = Word.new('', left, right, cost)
    (@templates[type_name] ||= []).push(template)
  end

  # Copy of the template list registered for +type_name+.
  def word_templates_for(type_name)
    @templates[type_name].dup
  end

  # Total number of templates over all types.
  def rule_size
    @templates.values.inject(0) { |total, list| total + list.size }
  end

  private

  # Pushes onto +ret+ one Word per template of +type+, using +surface+.
  def collect_result(ret, type, surface)
    (@templates[type.name] || []).each do |tp|
      ret.push(Word.new(surface, tp.left, tp.right, tp.cost))
    end
  end
end
331
# Registry of character types and of the codepoint -> type mapping.
class CharTypes
  def initialize
    @types = {}
    @mapping = {}
    @compat_mapping = {}
  end

  # Type mapped to +charcode+, falling back to the type named
  # 'DEFAULT'. Raises RuntimeError naming the codepoint when neither
  # exists.
  def type_for(charcode)
    # DEFAULT is read straight from @types here: #default_type raises
    # through #named when it is missing, which would pre-empt the more
    # specific message below.
    @mapping[charcode] || @types['DEFAULT'] ||
      (raise "Char type for 0x#{charcode.to_s(16)} is not defined," +
             " and DEFAULT type is not defined too")
  end

  # Creates, registers and returns a CharType called +name+.
  def define_type(name, invoke, group, length)
    @types[name] = CharType.new(name, invoke, group, length)
  end

  # Maps +charcode+ to +type+ and records the codepoint on +type+ and on
  # every type in +compat_types+.
  def define_map(charcode, type, compat_types)
    @mapping[charcode] = type
    type.add charcode
    compat_types.each { |ct| ct.add charcode }
  end

  # Type registered as +name+; raises RuntimeError when undefined.
  def named(name)
    @types[name] || (raise "Undefined char type: #{name}")
  end

  # Type registered as 'DEFAULT'; raises RuntimeError when undefined.
  def default_type
    named 'DEFAULT'
  end
end
357
# A character category: its name, two flags, a length setting and the
# set of codepoints it accepts.
class CharType
  attr_reader :name
  attr_reader :length

  def initialize(name, invoke, group, length)
    @name = name
    @invoke = invoke
    @group = group
    @length = length
    @accept_charcodes = {}
  end

  # Marks +charcode+ as accepted by this type.
  def add(charcode)
    @accept_charcodes.store(charcode, true)
  end

  # The group flag given at construction.
  def group?
    @group
  end

  # The invoke flag given at construction.
  def invoke?
    @invoke
  end

  # true when +charcode+ was added, nil otherwise.
  def accept?(charcode)
    @accept_charcodes[charcode]
  end
end
373
# Connection-cost table stored as one flat array, addressed by a right
# id and a left id.
class Matrix
  attr_reader :rsize
  attr_reader :lsize

  # Every cell starts out as nil.
  def initialize(rsize, lsize)
    @mat = Array.new(lsize * rsize)
    @lsize = lsize
    @rsize = rsize
  end

  # Feature.id -> Feature.id -> Int
  # Cost stored for (rid, lid), or nil when none was set.
  def cost(rid, lid)
    @mat[offset(rid, lid)]
  end

  # Stores +cost+ for (rid, lid).
  def set(rid, lid, cost)
    @mat[offset(rid, lid)] = cost
  end

  private

  # Flat index of (rid, lid): rows of lsize cells, one row per rid.
  def offset(rid, lid)
    rid * lsize + lid
  end
end
388
+ end