okura 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,3 @@
1
# Top-level namespace of the okura gem.
module Okura
  # Version string of the gem. Frozen so that the shared constant cannot be
  # mutated in place by callers.
  Version = '0.0.0'.freeze
end
@@ -0,0 +1,216 @@
1
+ module Okura
2
+ module WordDic
3
# Straightforward word dictionary backed by a character trie.
# Words are any objects responding to #surface (a String).
class Naive
  # One trie node. Children are keyed by word.surface[i]; the words whose
  # surface ends exactly at this node are kept in @leafs.
  class TrieNode
    def initialize
      @nodes = {}
      @leafs = []
    end

    # Inserts word below this node, consuming its surface from index i on.
    def add(word, i = 0)
      if i == word.surface.length
        @leafs.push word
      else
        fst = word.surface[i]
        (@nodes[fst] ||= TrieNode.new).add(word, i + 1)
      end
    end

    # Appends to res every word stored on the path spelled by str[i..-1],
    # shortest surface first, and returns res.
    def find_all(str, i, res = [])
      res.concat @leafs
      return res unless i < str.length
      node = @nodes[str[i]]
      node.find_all(str, i + 1, res) if node
      res
    end
  end

  def initialize
    @size = 0
    @root = TrieNode.new
  end

  # Number of words registered through #define.
  attr_reader :size

  # Registers word in the dictionary.
  def define(word)
    @size += 1
    @root.add word
  end

  # -> [Word]
  # All registered words whose surface is a prefix of str starting at index i.
  def possible_words(str, i)
    @root.find_all str, i
  end

  # Same value as #size.
  def word_size
    @size
  end
end
45
# Word dictionary backed by a double-array trie over the bytes of each
# surface. Node 1 is the root; for a node n, @base[n] is the offset of its
# cell block and @check[m] holds the owning node id of cell m.
class DoubleArray
  # Words -> [Integer] -> [Integer]
  def initialize(words, base, check)
    @words, @base, @check = words, base, check
  end

  # -> [Word]
  # Walks the trie along the bytes of str[i..-1] and returns the words of
  # every word group whose surface is a prefix of that substring.
  def possible_words(str, i)
    group_ids = []
    cur = 1
    str[i..-1].bytes.each { |c|
      next_index = @base[cur] + c + 1
      break unless @check[next_index] == cur
      cur = next_index
      # check EOS node
      eos_index = @base[cur]
      if @check[eos_index] == cur
        raise "@base[#{eos_index}] should < 0 but #{@base[eos_index]}" unless @base[eos_index] < 0
        # data cells store -data_id-1, see Builder::DAData
        group_ids.push(-@base[eos_index] - 1)
      end
    }
    group_ids.flat_map { |gid| @words.group(gid) }
  end

  def word_size
    @words.word_size
  end

  # Collects words into a byte trie and compiles it into a DoubleArray.
  class Builder
    # Mutable base/check arrays constructed from a Node trie.
    class DAData
      def initialize(root)
        # offset |    +0      |     +1   |    +2    | ...
        # data   | -data_id-1 | child(0) | child(1) | ...
        #
        # base[0]  = -last free cell
        # check[0] = -first free cell
        # 1        = root node id
        @base = [0, 0]
        @check = [0, 0]
        @length = 2
        @base[1] = construct!(root)
      end

      attr_reader :base
      attr_reader :check
      attr_reader :length

      # Lays out node (whose own cell id is parent) and, recursively, all of
      # its descendants. Returns the base offset chosen for node.
      def construct!(node, parent = 1)
        # base[parent_node_id] should == s
        # -base[s+0]  : data id
        # s+1+c       : child node id for char c
        # check[m]    : parent node id for node m
        s = find_free_space_for node
        if node.has_data?
          alloc! s, parent
          @base[s] = -node.data_id - 1
        end
        # Reserve every child cell before descending so that the layout of a
        # subtree cannot take a cell belonging to one of its siblings.
        node.children.each_key { |c| alloc! child_index(s, c), parent }
        node.children.each { |c, child|
          idx = child_index(s, c)
          @base[idx] = construct! child, idx
        }
        s
      end

      def child_index(base, c)
        base + c + 1
      end

      # Removes cell index from the free list and marks it as owned by parent.
      def alloc!(index, parent)
        assert index > 0
        assert free?(index)
        expand!(index + 1) if length <= index
        assert has_free_cell?

        # Free cells form a doubly linked list: base = -previous free cell,
        # check = -next free cell, with cell 0 acting as the list head.
        prev_free = -@base[index]
        next_free = -@check[index]
        @base[next_free] = -prev_free
        @check[prev_free] = -next_free
        @base[index] = 0 # dummy value
        @check[index] = parent
        assert !free?(index)
      end

      # Grows the arrays to size cells, appending each new cell to the tail
      # of the free list. Does nothing when size <= length.
      def expand!(size)
        return if size <= length
        (length...size).each { |i|
          if has_free_cell?
            @base[i] = @base[0]
            @check[i] = 0
            @check[-@base[0]] = -i
            @base[0] = -i
          else
            @base[i] = 0
            @check[i] = 0
            @base[0] = -i
            @check[0] = -i
          end
        }
        @length = size
      end

      def free?(index)
        length <= index || @check[index] <= 0
      end

      # Returns a base offset at which the data cell (if any) and all child
      # cells of node are free. Returns 0 for a node that needs no cell.
      def find_free_space_for(node)
        alloc_indexes = node.children.keys.map { |c| c + 1 }
        alloc_indexes += [0] if node.has_data?
        return 0 if alloc_indexes.empty?
        min = alloc_indexes.min
        i = -@check[0]
        while i != 0
          assert free?(i)
          if 0 < i - min && alloc_indexes.all? { |idx| free?(idx + i - min) }
            return i - min
          end
          i = -@check[i]
        end
        # free space not found
        [length - min, 1].max
      end

      def has_free_cell?
        @base[0] != 0
      end

      def assert(cond)
        raise unless cond
      end
    end

    # Node of the intermediate byte trie.
    class Node
      def initialize
        @data_id = nil
        @children = {}
      end

      attr_reader :data_id
      attr_reader :children

      def has_data?
        !!data_id
      end

      # Stores data_id at the node reached by following bytes[idx..-1].
      def add(bytes, idx, data_id)
        if idx == bytes.length
          @data_id = data_id
        else
          c = bytes[idx]
          (@children[c] ||= Node.new).add(bytes, idx + 1, data_id)
        end
      end
    end

    def initialize
      @root = Node.new
      @words = Okura::Words::Builder.new
    end

    # Registers word; the id returned by the Words builder becomes the data
    # id stored in the trie under the bytes of word.surface.
    def define(word)
      word_group_id = @words.add word
      key = word.surface.bytes.to_a
      @root.add key, 0, word_group_id
    end

    # -> WordDic::DoubleArray
    # The arrays are constructed once, inside #data_for_serialize.
    def build
      DoubleArray.new(*data_for_serialize)
    end

    # -> [ Words, [Integer], [Integer] ]
    def data_for_serialize
      da = DAData.new @root
      [@words.build, da.base, da.check]
    end

    # [ Words, [Integer], [Integer] ] -> WordDic::DoubleArray
    def self.build_from_serialized(data)
      words, base, check = data
      DoubleArray.new words, base, check
    end
  end
end
215
+ end
216
+ end
data/lib/okura.rb ADDED
@@ -0,0 +1,388 @@
1
+ # coding: utf-8
2
+ require 'okura/word_dic'
3
+
4
+ module Okura
5
# Builds a word lattice for a string from a dictionary and a connection
# cost matrix.
class Tagger
  def initialize(dic, mat)
    @dic, @mat = dic, mat
  end

  attr_reader :dic
  attr_reader :mat

  # -> [String] | nil
  # Surfaces of the nodes on the minimum cost path of str, or nil when
  # Nodes#mincost_path returns nil.
  # The mat argument is ignored (the matrix given to #initialize is used);
  # it is kept as an optional parameter so that two-argument callers still work.
  def wakati(str, mat = nil)
    mincost_path = parse(str).mincost_path
    return nil if mincost_path.nil?
    mincost_path.map { |node| node.word.surface }
  end

  # -> Nodes
  # Slot 0 and slot chars.length+1 hold the BOS/EOS nodes; words starting at
  # character i go into slot i+1.
  def parse(str)
    chars = str.split(//)
    nodes = Nodes.new(chars.length + 2, @mat)
    nodes.add(0, Node.mk_bos_eos)
    nodes.add(chars.length + 1, Node.mk_bos_eos)
    # Iterate over the same character count that sized the lattice.
    chars.length.times { |i|
      @dic.possible_words(str, i).each { |w|
        nodes.add(i + 1, Node.new(w))
      }
    }
    nodes
  end
end
31
# Word lattice: for each position, the nodes beginning there (@begins) and
# the nodes whose last position it is (@ends).
class Nodes
  def initialize(len, mat)
    @mat = mat
    @begins = (0...len).map { [] }
    @ends = (0...len).map { [] }
  end

  # Nodes that begin at position i.
  def [](i)
    @begins[i]
  end

  def length
    @begins.length
  end

  # Matrix -> [Node] | nil
  # Cheapest node sequence from position 0 to the first node of the last
  # position. Returns nil when that final node cannot be reached, and []
  # when the lattice has no positions or the last position holds no node.
  def mincost_path
    return [] if length == 0
    # calc cost
    self[0].each { |n|
      n.total_cost = n.word.cost
      n.nearest_prev = nil
    }
    (1...length).each { |i|
      curs = @begins[i]
      @ends[i - 1].each { |prev|
        # Nodes on a dead-end path never get a total_cost assigned.
        next if prev.total_cost.nil?
        curs.each { |cur|
          join_cost = @mat.cost(prev.word.right.id, cur.word.left.id)
          next if join_cost.nil?
          cost = prev.total_cost + join_cost + cur.word.cost
          if !cur.total_cost || cost < cur.total_cost
            cur.total_cost = cost
            cur.nearest_prev = prev
          end
        }
      }
    }
    # calc mincost path
    last = self[-1][0]
    return [] if last.nil?
    # disconnected: no chain of connectable nodes reaches the final node
    return nil if last.total_cost.nil?
    ret = []
    cur = last
    until cur.nil?
      ret.push cur
      cur = cur.nearest_prev
    end
    ret.reverse
  end

  # Registers node as beginning at position i and ending node.length-1
  # positions later.
  def add(i, node)
    @begins[i].push node
    @ends[i + node.length - 1].push node
  end
end
85
# Lattice node wrapping one candidate Word together with the bookkeeping
# written by Nodes#mincost_path.
class Node
  def initialize(word)
    @word = word
    @nearest_prev = nil
    @total_cost = nil
  end

  attr_reader :word
  # Predecessor on the cheapest known path to this node (nil until computed).
  attr_accessor :nearest_prev
  # Cost of the cheapest known path ending at this node (nil until computed).
  attr_accessor :total_cost

  # Number of lattice positions the node spans.
  def length
    word.surface.length
  end

  def to_s
    "Node(#{word},#{total_cost})"
  end

  # Creates a sentinel node for the beginning/end of a sentence. Its word has
  # the surface 'BOS/EOS' and cost 0, but the node always occupies exactly one
  # lattice position, hence the singleton #length override.
  def self.mk_bos_eos
    f = Features::BOS_EOS
    node = Node.new Word.new('BOS/EOS', f, f, 0)
    def node.length; 1; end
    node
  end
end
107
# Column-oriented storage of words. Words sharing a surface form a group;
# Word objects are materialised on demand by #group.
class Words
  # Many strings packed into one String plus the start offset of each.
  class CompactStringArray
    def initialize(str, indices)
      @str = str
      @indices = indices
    end

    # Returns the id-th string.
    # Raises RuntimeError('bad id') unless 0 <= id < number of strings; a
    # negative id would otherwise index from the end and yield a wrong slice.
    def get(id)
      raise 'bad id' unless id >= 0 && id < @indices.length
      from = @indices[id]
      to = (id + 1 < @indices.length) ? @indices[id + 1] : @str.length
      @str[from...to]
    end

    def [](id)
      get id
    end

    class Builder
      def initialize
        @indices = []
        @surfaces = []
        @size = 0
      end

      def build
        CompactStringArray.new @surfaces.join(''), @indices
      end

      # Appends surface and returns its id (ids are assigned 0, 1, 2, ...).
      def add(surface)
        id = @indices.length
        @indices.push @size
        @surfaces.push surface
        @size += surface.size
        id
      end
    end
  end

  class Builder
    def initialize
      # group id -> [word id]
      @groups = []
      # surface -> group id
      @group_ids = {}
      @surfaces = CompactStringArray::Builder.new
      @left_features = Features.new
      @right_features = Features.new
      # The following arrays are all indexed by word id.
      @surface_ids = []
      @left_ids = []
      @right_ids = []
      @costs = []
    end

    # Adds word and returns the id of the group of its surface; the group is
    # created on the first occurrence of a surface.
    def add(word)
      gid = @group_ids[word.surface]
      if gid
        @groups[gid].push add_word!(gid, word)
      else
        gid = add_group!(word.surface)
        wid = add_word!(gid, word)
        @group_ids[word.surface] = gid
        @groups[gid] = [wid]
      end
      gid
    end

    def build
      Words.new(
        @groups, @surfaces.build, @left_features, @right_features,
        @surface_ids, @left_ids, @right_ids, @costs
      )
    end

    private

    # The id of the surface inside the string array doubles as the group id.
    def add_group!(surface)
      @surfaces.add surface
    end

    def add_word!(group_id, word)
      wid = @surface_ids.length
      @surface_ids.push group_id
      @left_ids.push word.left.id
      @right_ids.push word.right.id
      @left_features.add word.left.id, word.left.text
      @right_features.add word.right.id, word.right.text
      @costs.push word.cost
      wid
    end
  end

  def initialize(groups, surfaces, left_features, right_features, surface_ids, left_ids, right_ids, costs)
    # group id -> [word id]
    @groups = groups
    @surfaces = surfaces
    @left_features = left_features
    @right_features = right_features
    @surface_ids = surface_ids
    @left_ids = left_ids
    @right_ids = right_ids
    @costs = costs
  end

  # -> [Word]
  # Builds the Word objects of the given group.
  def group(group_id)
    @groups[group_id].map { |wid|
      Word.new(
        @surfaces[@surface_ids[wid]],
        @left_features[@left_ids[wid]],
        @right_features[@right_ids[wid]],
        @costs[wid]
      )
    }
  end

  # Total number of words over all groups.
  def word_size
    @groups.inject(0) { |a, x| a + x.size }
  end
end
215
# Dictionary entry: a surface string, its left/right features and a cost.
class Word
  # Raises RuntimeError when left or right does not respond to #text.
  def initialize(surface, left, right, cost)
    raise "bad feature: #{left.inspect}" unless left.respond_to? :text
    raise "bad feature: #{right.inspect}" unless right.respond_to? :text
    @surface, @left, @right, @cost = surface, left, right, cost
  end

  # String
  attr_reader :surface
  # Feature
  attr_reader :left
  # Feature
  attr_reader :right
  # Integer
  attr_reader :cost

  # Two words are equal when surface, left, right and cost are all equal.
  # Objects lacking one of those readers (nil, for instance) compare as not
  # equal instead of raising NoMethodError.
  def ==(other)
    return false unless [:surface, :left, :right, :cost].all? { |m| other.respond_to?(m) }
    [surface, left, right, cost] ==
      [other.surface, other.left, other.right, other.cost]
  end

  # Hash lookup uses #eql? together with #hash, so both must agree with #==.
  alias eql? ==

  def hash
    [surface, left, right, cost].hash
  end

  def to_s
    "Word(#{surface},#{left.id},#{right.id},#{cost})"
  end
end
239
# A feature: a numeric id plus its descriptive text.
# Identity is determined by the id alone.
class Feature
  def initialize(id, text)
    @id, @text = id, text
  end

  attr_reader :id
  attr_reader :text

  def to_s
    "Feature(#{id},#{text})"
  end

  # Equal when the ids are equal. Objects without an #id reader compare as
  # not equal instead of raising NoMethodError.
  def ==(other)
    other.respond_to?(:id) && id == other.id
  end

  # Hash lookup uses #eql? together with #hash, so both must agree with #==.
  alias eql? ==

  def hash
    id.hash
  end
end
255
# Registry of Feature objects addressed by id.
class Features
  def initialize
    # Feature#id => Feature
    @map_id = {}
  end

  # Integer -> Feature
  # Returns nil when no feature was added under id.
  def from_id(id)
    @map_id[id]
  end

  # Alias of #from_id.
  def [](id)
    from_id id
  end

  # Creates a Feature from id and text and stores it under id, replacing any
  # previous entry with the same id.
  def add(id, text)
    @map_id[id] = Feature.new id, text
  end

  def size
    @map_id.size
  end

  # Feature shared by the sentence boundary (BOS/EOS) nodes.
  BOS_EOS = Feature.new 0, 'BOS/EOS'
end
274
# Combines the regular word dictionary with the unknown-word dictionary.
class Dic
  def initialize(word_dic, unk_dic)
    @word_dic, @unk_dic = word_dic, unk_dic
  end

  attr_reader :word_dic
  attr_reader :unk_dic

  # -> [Word]
  # Words of the regular dictionary followed by the unknown-word candidates.
  # The unknown-word dictionary is told whether the regular one found
  # anything. A new array is returned, so the array handed out by the word
  # dictionary is never modified.
  def possible_words(str, i)
    known = @word_dic.possible_words str, i
    known + @unk_dic.possible_words(str, i, !known.empty?)
  end
end
287
# Produces candidate words for text based on character types: each character
# type name has a list of word templates, and a candidate is a template
# combined with a surface cut out of the input string.
class UnkDic
  # CharTypes ->
  def initialize(char_types)
    @char_types = char_types
    # CharType.name => [Word]
    @templates = {}
  end

  # -> [Word]
  # Candidates starting at str[i], determined by the type of that character:
  # * nothing when the regular dictionary already found a word and the type
  #   is not invoke?;
  # * one candidate set per length 1..type.length, as long as the following
  #   characters are accepted by the type;
  # * for group? types, additionally the whole run of accepted characters
  #   when that run is longer than type.length.
  def possible_words(str, i, found_in_normal_dic)
    first_char_type = @char_types.type_for str[i].ord
    return [] if found_in_normal_dic && !first_char_type.invoke?

    ret = []
    collect_result ret, first_char_type, str[i..i] if first_char_type.length > 0

    # l counts the accepted characters seen so far, including str[i].
    l = 1
    str[(i + 1)..-1].each_codepoint { |cp|
      break unless first_char_type.accept? cp
      l += 1
      collect_result ret, first_char_type, str[i...(i + l)] if first_char_type.length >= l
    }
    collect_result ret, first_char_type, str[i...(i + l)] if first_char_type.group? && first_char_type.length < l

    ret
  end

  # String -> Feature -> Feature -> Integer ->
  # Adds a word template for the character type type_name.
  # Raises (via CharTypes#named) when no such type is defined.
  def define(type_name, left, right, cost)
    @char_types.named type_name
    (@templates[type_name] ||= []).push Word.new('', left, right, cost)
  end

  # Copy of the templates of type_name; empty when none were defined.
  def word_templates_for(type_name)
    (@templates[type_name] || []).dup
  end

  # Total number of templates over all character types.
  def rule_size
    @templates.values.inject(0) { |sum, t| sum + t.size }
  end

  private

  # Pushes onto ret one Word per template of type, each carrying surface.
  def collect_result(ret, type, surface)
    (@templates[type.name] || []).each { |tp|
      ret.push Word.new(surface, tp.left, tp.right, tp.cost)
    }
  end
end
331
# Registry of character types and of the mapping from character code to type.
class CharTypes
  def initialize
    # type name => CharType
    @types = {}
    # character code => CharType
    @mapping = {}
    @compat_mapping = {}
  end

  # Type mapped to charcode, falling back to the type named 'DEFAULT'.
  # Raises RuntimeError when neither exists. The default is looked up
  # directly rather than through #default_type, because #default_type raises
  # its own generic error before the message below could ever be produced.
  def type_for(charcode)
    @mapping[charcode] || @types['DEFAULT'] ||
      (raise "Char type for 0x#{charcode.to_s(16)} is not defined,"+
        " and DEFAULT type is not defined too")
  end

  # Creates a CharType, registers it under name and returns it.
  def define_type(name, invoke, group, length)
    @types[name] = CharType.new(name, invoke, group, length)
  end

  # Maps charcode to type and makes type as well as every type in
  # compat_types accept charcode.
  def define_map(charcode, type, compat_types)
    @mapping[charcode] = type
    type.add charcode
    compat_types.each { |ct| ct.add charcode }
  end

  # Type registered under name; raises RuntimeError when there is none.
  def named(name)
    @types[name] || (raise "Undefined char type: #{name}")
  end

  # Type registered under 'DEFAULT'; raises RuntimeError when there is none.
  def default_type
    named 'DEFAULT'
  end
end
357
# A character type: a name, the invoke/group flags, a length and the set of
# character codes it accepts.
class CharType
  def initialize(name, invoke, group, length)
    @name, @invoke, @group, @length = name, invoke, group, length
    # character code => true
    @accept_charcodes = {}
  end

  attr_reader :name
  attr_reader :length

  # Marks charcode as accepted by this type.
  def add(charcode)
    @accept_charcodes[charcode] = true
  end

  # The group flag exactly as given to #initialize.
  def group?
    @group
  end

  # The invoke flag exactly as given to #initialize.
  def invoke?
    @invoke
  end

  # true when charcode was added through #add, nil otherwise.
  def accept?(charcode)
    @accept_charcodes[charcode]
  end
end
373
# Connection cost table addressed by (right feature id, left feature id),
# stored row-major in a flat array with one row per right id.
class Matrix
  def initialize(rsize, lsize)
    @mat = [nil] * (lsize * rsize)
    @lsize, @rsize = lsize, rsize
  end

  attr_reader :rsize
  attr_reader :lsize

  # Feature.id -> Feature.id -> Int
  # Returns nil when no cost was set, and also when an id lies outside the
  # table: without that check an out-of-range lid or a negative id would
  # silently read the cell of a different pair.
  def cost(rid, lid)
    return nil unless valid_ids?(rid, lid)
    @mat[rid * lsize + lid]
  end

  # Stores cost for the pair and returns it.
  # Raises IndexError when an id lies outside the table, instead of
  # overwriting the cell of a different pair or growing the array.
  def set(rid, lid, cost)
    unless valid_ids?(rid, lid)
      raise IndexError, "feature id out of range: (#{rid},#{lid}) for #{rsize}x#{lsize} matrix"
    end
    @mat[rid * lsize + lid] = cost
  end

  private

  def valid_ids?(rid, lid)
    rid >= 0 && rid < rsize && lid >= 0 && lid < lsize
  end
end
388
+ end