okura 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/okura/console.rb +23 -0
- data/lib/okura/parser.rb +131 -0
- data/lib/okura/serializer.rb +317 -0
- data/lib/okura/version.rb +3 -0
- data/lib/okura/word_dic.rb +216 -0
- data/lib/okura.rb +388 -0
- data/test/okura_spec.rb +721 -0
- data/test/spec_helper.rb +6 -0
- data/test/words_spec.rb +36 -0
- metadata +86 -0
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'okura'
|
2
|
+
require 'okura/serializer'
|
3
|
+
|
4
|
+
module Okura
  # Minimal interactive shell for trying out a compiled dictionary.
  class Console
    # Builds a tagger from +dict_dir+ through
    # Okura::Serializer::FormatInfo.create_tagger and then reads lines from
    # $stdin until end of input.
    #
    # For every line the stripped text is parsed; for each index in
    # 0...nodes.length the nodes stored at that index are printed as
    # "surface<TAB>right-feature-text cost", followed by an empty line. After
    # that the nodes of the minimum-cost path are printed as
    # "surface<TAB>right-feature-text". The prompt 'okura> ' is printed before
    # every read.
    #
    # Always returns 0.
    def run_console(dict_dir)
      tagger = Okura::Serializer::FormatInfo.create_tagger(dict_dir)
      prompt = 'okura> '

      print prompt
      while (line = $stdin.gets)
        lattice = tagger.parse(line.strip)

        # All candidate nodes, one group per index.
        lattice.length.times do |index|
          candidates = lattice[index].map do |node|
            word = node.word
            "#{word.surface}\t#{word.right.text} #{word.cost}"
          end
          puts candidates
          puts
        end

        # The nodes chosen by the minimum-cost path.
        lattice.mincost_path.each do |node|
          word = node.word
          puts "#{word.surface}\t#{word.right.text}"
        end

        print prompt
      end

      0
    end
  end
end
|
data/lib/okura/parser.rb
ADDED
@@ -0,0 +1,131 @@
|
|
1
|
+
module Okura
  # Line-oriented parsers for the text sources of a dictionary.
  module Parser
    # Raises a RuntimeError whose message is "parse error: " followed by +line+.
    #
    # Available both as an instance method (for code that mixes Parser in) and
    # as Okura::Parser.parse_error. The classes nested below do not include
    # Parser, so they have to use the module-level form.
    def parse_error(line)
      raise "parse error: #{line}"
    end
    extend self

    # Shared behaviour for parsers that turn every line of an IO into one row.
    # An including class must implement #parse(line) returning an Array; its
    # elements are handed to the block of #each as separate arguments.
    module Base
      include Enumerable

      # io - object responding to #each_line.
      def initialize(io)
        @io = io
      end

      # Calls the block with the parsed fields of every line of the IO.
      # Without a block an Enumerator over the same rows is returned.
      def each(&block)
        # The block-less Enumerator.new(obj) constructor is rejected by
        # current Ruby versions; to_enum works on old and new ones alike.
        return to_enum(:each) unless block

        @io.each_line { |line| block.call(*parse(line)) }
      end
    end

    # Rows of the connection matrix source. The first line of the IO is
    # consumed by the constructor and provides the two sizes; every following
    # line yields [rid, lid, cost] as integers.
    class Matrix
      include Base

      attr_reader :rid_size
      attr_reader :lid_size

      def initialize(io)
        @io = io
        # String#split without arguments splits on runs of whitespace, so
        # repeated blanks do not produce empty (zero) fields.
        @rid_size, @lid_size = io.readline.split.map(&:to_i)
      end

      # Returns [rid, lid, cost]; fields missing from the line are nil.
      def parse(line)
        rid, lid, cost = line.split.map(&:to_i)
        [rid, lid, cost]
      end
    end

    # Rows of a comma separated word source: only the first four columns are
    # used and returned as [surface (String), integer, integer, integer].
    class Word
      include Base

      COLUMN_CONVERTERS = [:to_s, :to_i, :to_i, :to_i].freeze

      # A line with fewer than four columns yields a correspondingly shorter
      # row (an empty line yields an empty row).
      def parse(line)
        line.split(/,/).first(COLUMN_CONVERTERS.size).zip(COLUMN_CONVERTERS).map do |value, converter|
          value.public_send(converter)
        end
      end
    end

    # Rows of a feature id table: "<id> <text>" becomes [id (Integer), text].
    class Feature
      include Base

      def parse(line)
        id_text, name = line.strip.split(/ /, 2)
        [id_text.to_i, name]
      end
    end

    # Rows of the unknown-word source: the first four comma separated columns
    # become [type name, lid, rid, cost]; everything after them is ignored.
    class UnkDic
      include Base

      def parse(line)
        type_name, lid, rid, cost = line.split(/,/, 5)
        [type_name, lid.to_i, rid.to_i, cost.to_i]
      end
    end

    # Callback based parser for the character type definition source.
    #
    # Three kinds of lines are recognised after '#' comments are removed:
    # * "0xXXXX TYPE [COMPAT...]"          -> on_mapping_single callbacks
    # * "0xXXXX..0xYYYY TYPE [COMPAT...]"  -> on_mapping_range callbacks
    # * "NAME a b n"                       -> on_chartype_def callbacks
    # Lines that are empty after comment removal are skipped; anything else
    # raises via Parser.parse_error.
    class CharType
      def initialize
        @callbacks = {
          :mapping_single => [],
          :mapping_range => [],
          :define_type => []
        }
      end

      # Registers a block called with (code point, type name, compatible type names).
      def on_mapping_single(&b)
        @callbacks[:mapping_single] << b
      end

      # Registers a block called with (first code point, last code point,
      # type name, compatible type names).
      def on_mapping_range(&b)
        @callbacks[:mapping_range] << b
      end

      # Registers a block called with (name, second column == '1',
      # third column == '1', fourth column as Integer).
      def on_chartype_def(&b)
        @callbacks[:define_type] << b
      end

      # Feeds every line of +io+ to #parse.
      def parse_all(io)
        io.each_line { |line| parse line }
      end

      # Parses a single line and fires the matching callbacks.
      # Raises RuntimeError ("parse error: ...") for malformed lines.
      def parse(line)
        cols = line.gsub(/\s*#.*$/, '').split(/\s+/)
        return if cols.empty?

        case cols[0]
        when /^0x([0-9a-fA-F]{4})(?:\.\.0x([0-9a-fA-F]{4}))?$/
          # Copy the captures right away so later matches cannot clobber them.
          from_hex, to_hex = $1, $2
          Parser.parse_error line unless cols.size >= 2
          type = cols[1]
          compat_types = cols[2..-1]
          from = from_hex.to_i(16)
          if to_hex
            to = to_hex.to_i(16)
            @callbacks[:mapping_range].each { |c| c.call from, to, type, compat_types }
          else
            @callbacks[:mapping_single].each { |c| c.call from, type, compat_types }
          end
        when /^\w+$/
          Parser.parse_error line unless cols.size == 4
          @callbacks[:define_type].each do |c|
            c.call cols[0], (cols[1] == '1'), (cols[2] == '1'), Integer(cols[3])
          end
        else
          Parser.parse_error line
        end
      end
    end
  end
end
|
@@ -0,0 +1,317 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
require 'okura'
|
3
|
+
require 'okura/parser'
|
4
|
+
|
5
|
+
module Okura
|
6
|
+
module Serializer
|
7
|
+
# Describes the compiled format of each dictionary file and controls both
# compilation (text sources -> binary files) and loading.
#
# word_dic, unk_dic, features, char_types and matrix each hold the name of a
# serializer class inside the matching Okura::Serializer sub-module (for
# example :Marshal or :DoubleArray). encoding is the encoding of the text
# sources read by #compile_dict.
class FormatInfo
  def initialize
    @word_dic = :DoubleArray
    @unk_dic = :Marshal
    @features = :Marshal
    @char_types = :Marshal
    @matrix = :Marshal
    @encoding = 'EUC-JP'
  end

  attr_accessor :word_dic
  attr_accessor :unk_dic
  attr_accessor :features
  attr_accessor :char_types
  attr_accessor :matrix
  attr_accessor :encoding

  # Compiles the sources found in +src_dir+ into +bin_dir+.
  #
  # Reads left-id.def, every *.csv file, char.def, unk.def and matrix.def and
  # writes format-info, left-id.bin, word_dic.bin, char_types.bin,
  # unk_dic.bin and matrix.bin.
  def compile_dict(src_dir, bin_dir)
    open_dest(bin_dir, 'format-info') { |dest| compile(dest) }

    features_l = open_src(src_dir, 'left-id.def') do |src|
      open_dest(bin_dir, 'left-id.bin') do |dest|
        serializer_for('Features', features).compile(src, dest)
      end
    end

    # Sorted so the build does not depend on the order in which the file
    # system happens to enumerate the directory.
    word_src_files =
      Dir.chdir(src_dir) { Dir.glob('*.csv') }.sort.map { |file| File.join(src_dir, file) }
    open_dest(bin_dir, 'word_dic.bin') do |dest|
      serializer_for('WordDic', word_dic).compile(features_l, word_src_files, encoding, dest)
    end

    # Named differently from the char_types accessor, which holds the
    # serializer name rather than the compiled table.
    compiled_char_types = open_src(src_dir, 'char.def') do |src|
      open_dest(bin_dir, 'char_types.bin') do |dest|
        serializer_for('CharTypes', char_types).compile(src, dest)
      end
    end

    open_src(src_dir, 'unk.def') do |src|
      open_dest(bin_dir, 'unk_dic.bin') do |dest|
        serializer_for('UnkDic', unk_dic).compile(compiled_char_types, features_l, src, dest)
      end
    end

    open_src(src_dir, 'matrix.def') do |src|
      open_dest(bin_dir, 'matrix.bin') do |dest|
        serializer_for('Matrix', matrix).compile(src, dest)
      end
    end
  end

  # Loads the compiled dictionary found in +bin_dir+ (using the format-info
  # file stored there) and creates a Tagger.
  def self.create_tagger(bin_dir)
    format_info = File.open(File.join(bin_dir, 'format-info')) { |f| load(f) }
    format_info.create_tagger(bin_dir)
  end

  # Loads the binary files in +bin_dir+ with the serializers named by this
  # object and returns an Okura::Tagger built from them.
  def create_tagger(bin_dir)
    # NOTE(review): the loaded feature table is not used below; the load is
    # kept so that behaviour (including failing on a missing left-id.bin)
    # stays unchanged.
    open_bin(bin_dir, 'left-id.bin') do |bin|
      serializer_for('Features', features).load(bin)
    end
    wd = open_bin(bin_dir, 'word_dic.bin') do |f|
      serializer_for('WordDic', word_dic).load(f)
    end
    ud = open_bin(bin_dir, 'unk_dic.bin') do |f|
      serializer_for('UnkDic', unk_dic).load(f)
    end
    mat = open_bin(bin_dir, 'matrix.bin') do |f|
      serializer_for('Matrix', matrix).load(f)
    end
    Okura::Tagger.new(Okura::Dic.new(wd, ud), mat)
  end

  # Serializes this FormatInfo object to +io+ as YAML.
  def compile(io)
    YAML.dump({
      word_dic: word_dic,
      unk_dic: unk_dic,
      features: features,
      char_types: char_types,
      matrix: matrix,
      encoding: encoding
    }, io)
  end

  # Restores a FormatInfo object serialized by #compile.
  #
  # Data written without an :encoding entry keeps the default encoding.
  # Raises ArgumentError when the YAML document is not a mapping.
  def self.load(io)
    # safe_load refuses arbitrary objects; Symbol has to be permitted because
    # both the keys and the serializer names are symbols. The keyword form
    # needs Psych 3.1 or newer.
    data = YAML.safe_load(io, permitted_classes: [Symbol])
    raise ArgumentError, 'format-info does not contain a mapping' unless data.is_a?(Hash)

    fi = new
    fi.word_dic = data[:word_dic]
    fi.unk_dic = data[:unk_dic]
    fi.features = data[:features]
    fi.char_types = data[:char_types]
    fi.matrix = data[:matrix]
    fi.encoding = data[:encoding] if data[:encoding]
    fi
  end

  private

  # Opens a text source, transcoding from the configured encoding to UTF-8.
  def open_src(dir, filename, &block)
    File.open(File.join(dir, filename), "r:#{encoding}:UTF-8", &block)
  end

  # Opens a compiled file for binary writing.
  def open_dest(dir, filename, &block)
    File.open(File.join(dir, filename), 'wb:ASCII-8BIT', &block)
  end

  # Opens a compiled file for binary reading.
  def open_bin(dir, filename, &block)
    File.open(File.join(dir, filename), 'rb:ASCII-8BIT', &block)
  end

  # Instantiates Okura::Serializer::<data_type_name>::<format_type_name>.
  # The lookups do not fall back to enclosing or top-level constants, so a
  # name read from a format-info file can only select a serializer defined
  # directly in the corresponding module.
  def serializer_for(data_type_name, format_type_name)
    data_type = Okura::Serializer.const_get(data_type_name, false)
    format_type = data_type.const_get(format_type_name, false)
    format_type.new
  end
end
|
117
|
+
module Features
  # Stores the feature table as a Marshal dump.
  class Marshal
    # Parses +input+ with Okura::Parser::Feature, adds every (id, text) pair
    # to a new Okura::Features, dumps it to +output+ and returns the table.
    def compile(input, output)
      table = Okura::Features.new
      Okura::Parser::Feature.new(input).each do |id, text|
        table.add id, text
      end
      ::Marshal.dump(table, output)
      table
    end

    # Returns the object stored in +io+.
    def load(io)
      ::Marshal.load(io)
    end
  end
end
|
133
|
+
module WordDic
  # Calls the block once per entry of +inputs+. String entries are treated as
  # file paths and opened for reading with transcoding from +encoding+ to
  # UTF-8; any other entry is passed to the block as it is.
  def self.each_input(inputs, encoding, &block)
    inputs.each do |source|
      if source.is_a?(String)
        File.open(source, "r:#{encoding}:UTF-8", &block)
      else
        block.call source
      end
    end
  end

  # Word dictionary stored as a Marshal dump of an Okura::WordDic::Naive.
  class Naive
    # Defines one Okura::Word per parsed row of every input and dumps the
    # resulting dictionary to +output+. Both the left and the right feature
    # are looked up in +features+.
    def compile(features, inputs, encoding, output)
      dictionary = Okura::WordDic::Naive.new
      Okura::Serializer::WordDic.each_input(inputs, encoding) do |source|
        Okura::Parser::Word.new(source).each do |surface, lid, rid, cost|
          left = features.from_id(lid)
          right = features.from_id(rid)
          dictionary.define Okura::Word.new(surface, left, right, cost)
        end
      end
      Marshal.dump(dictionary, output)
    end

    # Returns the object stored in +io+.
    def load(io)
      Marshal.load(io)
    end
  end

  # Word dictionary stored in a custom binary layout: the word table first,
  # then the surface strings, then the base and check arrays.
  class DoubleArray
    # Builds the dictionary with Okura::WordDic::DoubleArray::Builder and
    # writes its serialization data to +output+. Progress messages are
    # printed with puts. Raises if base and check differ in length.
    def compile(features, inputs, encoding, output)
      puts 'loading'
      builder = Okura::WordDic::DoubleArray::Builder.new
      Okura::Serializer::WordDic.each_input(inputs, encoding) do |source|
        Okura::Parser::Word.new(source).each do |surface, lid, rid, cost|
          left = features.from_id(lid)
          right = features.from_id(rid)
          builder.define Okura::Word.new(surface, left, right, cost)
        end
      end

      writer = Okura::Serializer::BinaryWriter.new output
      words, base, check = builder.data_for_serialize
      raise 'base.length!=check.length' unless base.length == check.length

      # The word table exposes its state only through instance variables;
      # the order below is the order #load reads them back in.
      puts 'serialize words'
      [:@groups, :@left_features, :@right_features].each do |ivar|
        writer.write_object words.instance_variable_get(ivar)
      end
      [:@left_ids, :@right_ids, :@costs, :@surface_ids].each do |ivar|
        writer.write_int32_array words.instance_variable_get(ivar)
      end

      puts 'serialize surfaces'
      surfaces = words.instance_variable_get(:@surfaces)
      writer.write_object surfaces.instance_variable_get(:@str)
      writer.write_int32_array surfaces.instance_variable_get(:@indices)

      puts 'serialize DAT indices'
      writer.write_int32_array base
      writer.write_int32_array check
    end

    # Reads the layout written by #compile and rebuilds the dictionary via
    # Builder.build_from_serialized.
    def load(io)
      reader = Okura::Serializer::BinaryReader.new io

      groups = reader.read_object
      left_features = reader.read_object
      right_features = reader.read_object
      left_ids = reader.read_int32_array
      right_ids = reader.read_int32_array
      costs = reader.read_int32_array
      surface_ids = reader.read_int32_array

      surface_text = reader.read_object
      surface_indices = reader.read_int32_array
      surfaces = Okura::Words::CompactStringArray.new surface_text, surface_indices

      words = Okura::Words.new(
        groups, surfaces, left_features, right_features, surface_ids, left_ids, right_ids, costs
      )

      base = reader.read_int32_array
      check = reader.read_int32_array
      Okura::WordDic::DoubleArray::Builder.build_from_serialized [words, base, check]
    end
  end
end
|
229
|
+
module CharTypes
  # Stores the character type table as a Marshal dump.
  class Marshal
    # Parses +input+ with Okura::Parser::CharType, feeding type definitions
    # and code point mappings (single ones and every member of a range) into
    # a new Okura::CharTypes. Dumps the table to +output+ and returns it.
    def compile(input, output)
      table = Okura::CharTypes.new

      # Shared by both mapping callbacks: type names are resolved through the
      # table for every mapped code point.
      map_char = lambda do |char, type, compat_names|
        table.define_map char, table.named(type), compat_names.map { |name| table.named(name) }
      end

      parser = Okura::Parser::CharType.new
      parser.on_chartype_def do |name, invoke, group, length|
        table.define_type(name, invoke, group, length)
      end
      parser.on_mapping_single do |char, type, compat_names|
        map_char.call(char, type, compat_names)
      end
      parser.on_mapping_range do |from, to, type, compat_names|
        from.upto(to) { |char| map_char.call(char, type, compat_names) }
      end
      parser.parse_all input

      ::Marshal.dump(table, output)
      table
    end

    # Returns the object stored in +io+.
    def load(io)
      ::Marshal.load(io)
    end
  end
end
|
256
|
+
module UnkDic
  # Stores the unknown-word dictionary as a Marshal dump.
  class Marshal
    # Creates an Okura::UnkDic for +char_types+, defines one entry per row
    # parsed from +input+ (both ids are resolved through +features+) and
    # dumps the dictionary to +output+.
    def compile(char_types, features, input, output)
      dictionary = Okura::UnkDic.new char_types
      Okura::Parser::UnkDic.new(input).each do |type_name, lid, rid, cost|
        left = features.from_id(lid)
        right = features.from_id(rid)
        dictionary.define type_name, left, right, cost
      end
      ::Marshal.dump(dictionary, output)
    end

    # Returns the object stored in +io+.
    def load(io)
      ::Marshal.load(io)
    end
  end
end
|
271
|
+
module Matrix
  # Stores the connection cost matrix as a Marshal dump.
  class Marshal
    # Creates an Okura::Matrix with the sizes reported by the parser, sets
    # the cost of every (rid, lid) row parsed from +input+ and dumps the
    # matrix to +output+.
    def compile(input, output)
      rows = Okura::Parser::Matrix.new input
      costs = Okura::Matrix.new rows.rid_size, rows.lid_size
      rows.each do |rid, lid, cost|
        costs.set(rid, lid, cost)
      end
      ::Marshal.dump(costs, output)
    end

    # Returns the object stored in +io+.
    def load(io)
      ::Marshal.load(io)
    end
  end
end
|
286
|
+
# Reads the primitives written by BinaryWriter from an IO.
#
# Integers are decoded with the 'l' pack directive (32-bit signed, native
# byte order), so data must be read on a machine with the same byte order
# as the one that wrote it.
class BinaryReader
  # io - object responding to #read(length); #read_object passes it to
  #      Marshal.load.
  def initialize(io)
    @io = io
  end

  # Reads one 32-bit signed integer.
  # Raises EOFError if fewer than four bytes are left.
  def read_int32
    read_exactly(4).unpack('l').first
  end

  # Reads a length-prefixed array of 32-bit signed integers.
  # Raises EOFError if the data ends before the announced number of
  # elements has been read.
  def read_int32_array
    size = read_int32
    read_exactly(4 * size).unpack('l*')
  end

  # Reads one Marshal-dumped object.
  def read_object
    ::Marshal.load @io
  end

  private

  # Reads exactly +length+ bytes. A short or missing read is reported as
  # EOFError instead of surfacing later as a NoMethodError on nil or as a
  # silently truncated array.
  def read_exactly(length)
    data = @io.read(length)
    got = data ? data.bytesize : 0
    if got < length
      raise EOFError, "unexpected end of data: wanted #{length} bytes, got #{got}"
    end
    data
  end
end
|
301
|
+
# Writes the primitives understood by BinaryReader to an IO.
class BinaryWriter
  # io - object responding to #write; #write_object passes it to
  #      Marshal.dump.
  def initialize(io)
    @io = io
  end

  # Writes +value+ packed with the 'l' directive (32-bit signed integer,
  # native byte order).
  def write_int32(value)
    packed = [value].pack('l')
    @io.write(packed)
  end

  # Writes the number of elements followed by the elements themselves, all
  # packed with the 'l' directive.
  def write_int32_array(value)
    write_int32(value.length)
    packed = value.pack('l*')
    @io.write(packed)
  end

  # Writes +obj+ as a Marshal dump.
  def write_object(obj)
    Marshal.dump(obj, @io)
  end
end
|
316
|
+
end
|
317
|
+
end
|