okura 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/okura/console.rb +23 -0
- data/lib/okura/parser.rb +131 -0
- data/lib/okura/serializer.rb +317 -0
- data/lib/okura/version.rb +3 -0
- data/lib/okura/word_dic.rb +216 -0
- data/lib/okura.rb +388 -0
- data/test/okura_spec.rb +721 -0
- data/test/spec_helper.rb +6 -0
- data/test/words_spec.rb +36 -0
- metadata +86 -0
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'okura'
|
2
|
+
require 'okura/serializer'
|
3
|
+
|
4
|
+
module Okura
|
5
|
+
# Minimal interactive console around an Okura tagger.
class Console
  # Builds a tagger through Okura::Serializer::FormatInfo.create_tagger and
  # then reads standard input line by line until EOF.
  #
  # For every input line the stripped text is handed to the tagger. Each
  # entry nodes[i] of the result is printed as "surface<TAB>right-text cost"
  # lines followed by an empty line, and afterwards the result of
  # nodes.mincost_path is printed as "surface<TAB>right-text" lines.
  #
  # dict_dir:: directory passed on to create_tagger
  #
  # Returns 0 once standard input is exhausted.
  def run_console dict_dir
    tagger = Okura::Serializer::FormatInfo.create_tagger(dict_dir)
    print 'okura> '
    while (line = $stdin.gets)
      nodes = tagger.parse(line.strip)
      nodes.length.times do |index|
        rows = nodes[index].map do |node|
          word = node.word
          "#{word.surface}\t#{word.right.text} #{word.cost}"
        end
        puts rows
        puts
      end
      nodes.mincost_path.each do |node|
        word = node.word
        puts "#{word.surface}\t#{word.right.text}"
      end
      print 'okura> '
    end
    0
  end
end
|
23
|
+
end
|
data/lib/okura/parser.rb
ADDED
@@ -0,0 +1,131 @@
|
|
1
|
+
module Okura
|
2
|
+
module Parser
|
3
|
+
|
4
|
+
# Aborts parsing by raising a RuntimeError whose message is
# 'parse error: ' followed by the offending line.
def parse_error line
  message = 'parse error: ' + line
  raise message
end
|
7
|
+
|
8
|
+
# Shared behaviour for the line-oriented parsers. The including class must
# provide +parse(line)+, returning an array with the fields of one line.
module Base
  include Enumerable

  # io:: object responding to +each_line+ that supplies the source text
  def initialize io
    @io = io
  end

  # Calls the block once per line of the underlying IO, passing the fields
  # returned by +parse+ as separate block arguments.
  #
  # Without a block an Enumerator over the same values is returned.
  def each &b
    # Enumerator.new(obj) without a block is rejected by current Ruby
    # versions (ArgumentError); enum_for builds the equivalent enumerator.
    return enum_for(:each) unless b

    @io.each_line do |line|
      b.call(*parse(line))
    end
  end
end
|
24
|
+
|
25
|
+
# Parser for the matrix definition source. The first line of the input is
# consumed by the constructor and supplies the two size values; every
# following line is parsed into three integers.
class Matrix
  include Base

  attr_reader :rid_size, :lid_size

  # Reads the header line of +io+ and stores its first two integer fields
  # as rid_size and lid_size.
  def initialize io
    @io = io
    header_fields = io.readline.split(/\s/).map { |field| field.to_i }
    @rid_size, @lid_size = header_fields
  end

  # Returns [rid, lid, cost] for one line; fields missing from the line
  # come back as nil.
  def parse line
    line.split(/\s/).map(&:to_i).values_at(0, 1, 2)
  end
end
|
41
|
+
|
42
|
+
# Parser for comma-separated word dictionary lines.
class Word
  include Base

  # Returns the first four comma-separated columns of +line+: the first
  # one as a String, the remaining ones converted with to_i. Lines with
  # fewer than four columns yield a correspondingly shorter array.
  def parse line
    columns = line.split(/,/).first(4)
    columns.each_with_index.map do |value, position|
      position.zero? ? value.to_s : value.to_i
    end
  end
end
|
50
|
+
|
51
|
+
# Parser for feature definition lines of the form "<id> <name>".
class Feature
  include Base

  # Splits the stripped line at the first single space and returns
  # [id, name] with the id converted via to_i.
  def parse line
    id_text, name = line.strip.split(/ /, 2)
    [id_text.to_i, name]
  end
end
|
59
|
+
|
60
|
+
# Parser for comma-separated unknown-word definition lines.
class UnkDic
  include Base

  # Returns [type, lid, rid, cost]: the first column unchanged and the next
  # three columns converted with to_i. Anything from the fifth column on is
  # ignored.
  def parse line
    fields = line.split(/,/, 5)
    type_name = fields[0]
    left_id, right_id, cost = fields.values_at(1, 2, 3).map { |text| text.to_i }
    [type_name, left_id, right_id, cost]
  end
end
|
68
|
+
|
69
|
+
# Event-style parser for character type definitions.
#
# Callers register callbacks with on_mapping_single, on_mapping_range and
# on_chartype_def and then feed lines through parse or parse_all. Text from
# a '#' to the end of a line is treated as a comment and ignored.
class CharType
  def initialize
    @callbacks={
      :mapping_single=>[],
      :mapping_range=>[],
      :define_type=>[]
    }
  end

  # Registers a block called as (code_point, type, compat_types) for a
  # line that maps a single code point such as "0x0020 SPACE".
  def on_mapping_single &b
    @callbacks[:mapping_single] << b
  end

  # Registers a block called as (from, to, type, compat_types) for a line
  # that maps a code point range such as "0x0041..0x005A ALPHA".
  def on_mapping_range &b
    @callbacks[:mapping_range] << b
  end

  # Registers a block called as (name, flag1, flag2, length) for a type
  # definition line "<name> <0|1> <0|1> <integer>"; the two flags are true
  # exactly when the corresponding column is '1'.
  def on_chartype_def &b
    @callbacks[:define_type] << b
  end

  # Parses every line of +io+.
  def parse_all io
    io.each_line {|line|
      parse line
    }
  end

  # Parses one line and fires the matching callbacks. Lines that are empty
  # once comments are removed are skipped.
  #
  # Raises RuntimeError ('parse error: <line>') for malformed lines.
  def parse line
    cols=line.gsub(/\s*#.*$/,'').split(/\s+/)
    return if cols.empty?
    case cols[0]
    when /^0x([0-9a-fA-F]{4})(?:\.\.0x([0-9a-fA-F]{4}))?$/
      # mapping: capture the match groups right away, before anything else
      # can overwrite the last-match state
      from_hex=Regexp.last_match(1)
      to_hex=Regexp.last_match(2)
      parse_error line unless cols.size >= 2
      type=cols[1]
      compat_types=cols[2..-1]
      from=from_hex.to_i(16)
      if to_hex
        # mapping(range)
        to=to_hex.to_i(16)
        @callbacks[:mapping_range].each{|c|
          c.call from,to,type,compat_types
        }
      else
        # mapping(single)
        @callbacks[:mapping_single].each{|c|
          c.call from,type,compat_types
        }
      end
    when /^\w+$/
      # typedef
      parse_error line unless cols.size == 4
      @callbacks[:define_type].each{|c|
        c.call cols[0],(cols[1]=='1'),(cols[2]=='1'),Integer(cols[3])
      }
    else
      # error
      parse_error line
    end
  end

  private

  # Raises the parse failure for +line+. Defined on the class itself
  # because an instance method of the enclosing module is not reachable
  # from a class that merely nests inside that module.
  def parse_error line
    raise "parse error: #{line}"
  end
end
|
130
|
+
end
|
131
|
+
end
|
@@ -0,0 +1,317 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
require 'okura'
|
3
|
+
require 'okura/parser'
|
4
|
+
|
5
|
+
module Okura
|
6
|
+
module Serializer
|
7
|
+
# Describes the compiled format of each dictionary file and controls both
# compiling dictionary sources and loading the compiled results.
#
# Each format attribute (word_dic, unk_dic, features, char_types, matrix)
# names the serializer class to use inside the corresponding
# Okura::Serializer::<DataType> module; encoding is the encoding the
# source files are read with.
class FormatInfo
  def initialize
    @word_dic=:DoubleArray
    @unk_dic=:Marshal
    @features=:Marshal
    @char_types=:Marshal
    @matrix=:Marshal
    @encoding='EUC-JP'
  end
  attr_accessor :word_dic
  attr_accessor :unk_dic
  attr_accessor :features
  attr_accessor :char_types
  attr_accessor :matrix
  attr_accessor :encoding

  # Compiles the dictionary sources found in +src_dir+ and writes the
  # compiled files (plus a 'format-info' file describing this object)
  # into +bin_dir+.
  def compile_dict src_dir,bin_dir
    open_dest(bin_dir,'format-info'){|dest| compile dest}
    features_l=open_src(src_dir,'left-id.def'){|src|
      open_dest(bin_dir,'left-id.bin'){|dest|
        serializer_for('Features',features).compile(src,dest)
      }
    }

    # every *.csv file directly inside src_dir is a word dictionary source
    word_src_files=
      Dir.chdir(src_dir){ Dir.glob('*.csv') }.
      map{|file|File.join(src_dir,file)}
    open_dest(bin_dir,'word_dic.bin'){|dest|
      serializer_for('WordDic',word_dic).compile(features_l,word_src_files,encoding,dest)
    }

    # named differently from the char_types accessor so the format name
    # stays reachable inside the block
    compiled_char_types=open_src(src_dir,'char.def'){|src|
      open_dest(bin_dir,'char_types.bin'){|dest|
        serializer_for('CharTypes',char_types).compile(src,dest)
      }
    }

    open_src(src_dir,'unk.def'){|src|
      open_dest(bin_dir,'unk_dic.bin'){|dest|
        serializer_for('UnkDic',unk_dic).compile(compiled_char_types,features_l,src,dest)
      }
    }

    open_src(src_dir,'matrix.def'){|src|
      open_dest(bin_dir,'matrix.bin'){|dest|
        serializer_for('Matrix',matrix).compile(src,dest)
      }
    }
  end

  # Loads the compiled dictionary found in +bin_dir+ (using the
  # 'format-info' file stored there) and returns a tagger.
  def self.create_tagger bin_dir
    format_info=File.open(File.join(bin_dir,'format-info')){|f| self.load f }
    format_info.create_tagger bin_dir
  end

  # Loads the compiled dictionary files from +bin_dir+ with the formats
  # configured on this object and returns an Okura::Tagger.
  def create_tagger bin_dir
    # left-id.bin is still read here although its result is not used below
    features_l=open_bin(bin_dir,'left-id.bin'){|bin|
      serializer_for('Features',features).load(bin)
    }
    wd=open_bin(bin_dir,'word_dic.bin'){|f|
      serializer_for('WordDic',word_dic).load(f)
    }
    ud=open_bin(bin_dir,'unk_dic.bin'){|f|
      serializer_for('UnkDic',unk_dic).load(f)
    }
    mat=open_bin(bin_dir,'matrix.bin'){|f|
      serializer_for('Matrix',matrix).load(f)
    }
    dic=Okura::Dic.new wd,ud
    tagger=Okura::Tagger.new dic,mat
    tagger
  end

  # Serializes this FormatInfo object to +io+ as YAML. The encoding
  # attribute is not part of the serialized data.
  def compile io
    YAML.dump({
      word_dic: word_dic,
      unk_dic: unk_dic,
      features: features,
      char_types: char_types,
      matrix: matrix
    },io)
  end

  # Restores a FormatInfo object previously written by #compile.
  def self.load io
    # Keys and values are Symbols. YAML.load refuses Symbols by default on
    # Psych 4 (Ruby 3.1+), so they are permitted explicitly while every
    # other non-basic class stays rejected. The permitted_classes keyword
    # requires Psych 3.1 or newer.
    data=YAML.safe_load(io,permitted_classes: [Symbol])
    fi=FormatInfo.new
    fi.word_dic=data[:word_dic]
    fi.unk_dic=data[:unk_dic]
    fi.features=data[:features]
    fi.char_types=data[:char_types]
    fi.matrix=data[:matrix]
    fi
  end

  private

  # Opens a dictionary source file, transcoding from +encoding+ to UTF-8.
  def open_src dir,filename,&block
    File.open(File.join(dir,filename),"r:#{encoding}:UTF-8",&block)
  end

  # Opens a compiled output file for binary writing.
  def open_dest dir,filename,&block
    File.open(File.join(dir,filename),'wb:ASCII-8BIT',&block)
  end

  # Opens a compiled file for binary reading.
  def open_bin dir,filename,&block
    File.open(File.join(dir,filename),'rb:ASCII-8BIT',&block)
  end

  # Instantiates Okura::Serializer::<data_type_name>::<format_type_name>.
  def serializer_for data_type_name,format_type_name
    data_type=Okura::Serializer.const_get data_type_name
    format_type=data_type.const_get format_type_name
    format_type.new
  end
end
|
117
|
+
# Serializers for feature tables.
module Features
  # Stores the feature table with Ruby's built-in Marshal format.
  class Marshal
    # Parses the feature definitions from +input+, adds each (id, text)
    # pair to a new Okura::Features, dumps it to +output+ and returns the
    # table.
    def compile(input, output)
      parser = Okura::Parser::Feature.new(input)
      table = Okura::Features.new
      parser.each do |id, text|
        table.add(id, text)
      end
      ::Marshal.dump(table, output)
      table
    end

    # Reads back an object previously dumped by #compile.
    def load(io)
      ::Marshal.load io
    end
  end
end
|
133
|
+
# Serializers for the word dictionary.
module WordDic
  # Calls the block once per entry of +inputs+. String entries are treated
  # as file paths and opened for reading (transcoding from +encoding+ to
  # UTF-8); any other entry is passed to the block unchanged.
  def self.each_input inputs, encoding, &block
    inputs.each do |input|
      if String === input
        File.open(input, "r:#{encoding}:UTF-8", &block)
      else
        block.call input
      end
    end
  end

  # Stores an Okura::WordDic::Naive dictionary with Marshal.
  class Naive
    # Defines one Okura::Word per parsed source line on a new naive
    # dictionary and dumps that dictionary to +output+.
    def compile(features, inputs, encoding, output)
      dic = Okura::WordDic::Naive.new
      Okura::Serializer::WordDic.each_input(inputs, encoding) do |input|
        Okura::Parser::Word.new(input).each do |surface, lid, rid, cost|
          dic.define Okura::Word.new(
            surface,
            features.from_id(lid),
            features.from_id(rid),
            cost
          )
        end
      end
      Marshal.dump(dic, output)
    end

    # Reads back an object previously dumped by #compile.
    def load(io)
      Marshal.load io
    end
  end

  # Stores a double-array dictionary as a sequence of marshalled objects
  # and int32 arrays. #compile and #load must keep the same field order.
  class DoubleArray
    # Feeds every parsed word into a DoubleArray builder, then writes the
    # builder's serialization data to +output+. Progress messages are
    # printed to standard output.
    def compile(features, inputs, encoding, output)
      puts 'loading'
      dic = Okura::WordDic::DoubleArray::Builder.new
      Okura::Serializer::WordDic.each_input(inputs, encoding) do |input|
        Okura::Parser::Word.new(input).each do |surface, lid, rid, cost|
          dic.define Okura::Word.new(
            surface,
            features.from_id(lid),
            features.from_id(rid),
            cost
          )
        end
      end

      writer = Okura::Serializer::BinaryWriter.new output
      words, base, check = dic.data_for_serialize
      raise 'base.length!=check.length' if base.length != check.length

      puts 'serialize words'
      # the words object keeps its tables in instance variables; they are
      # read directly and written in a fixed order
      writer.write_object words.instance_variable_get(:@groups)
      writer.write_object words.instance_variable_get(:@left_features)
      writer.write_object words.instance_variable_get(:@right_features)
      writer.write_int32_array words.instance_variable_get(:@left_ids)
      writer.write_int32_array words.instance_variable_get(:@right_ids)
      writer.write_int32_array words.instance_variable_get(:@costs)
      writer.write_int32_array words.instance_variable_get(:@surface_ids)

      puts 'serialize surfaces'
      surfaces = words.instance_variable_get(:@surfaces)
      writer.write_object surfaces.instance_variable_get(:@str)
      writer.write_int32_array surfaces.instance_variable_get(:@indices)

      puts 'serialize DAT indices'
      writer.write_int32_array base
      writer.write_int32_array check
    end

    # Reads the fields in the order #compile wrote them and rebuilds the
    # dictionary through Builder.build_from_serialized.
    def load(io)
      reader = Okura::Serializer::BinaryReader.new io

      groups = reader.read_object
      left_features = reader.read_object
      right_features = reader.read_object
      left_ids = reader.read_int32_array
      right_ids = reader.read_int32_array
      costs = reader.read_int32_array
      surface_ids = reader.read_int32_array

      str = reader.read_object
      indices = reader.read_int32_array
      surfaces = Okura::Words::CompactStringArray.new str, indices

      words = Okura::Words.new(
        groups, surfaces, left_features, right_features,
        surface_ids, left_ids, right_ids, costs
      )

      base = reader.read_int32_array
      check = reader.read_int32_array
      Okura::WordDic::DoubleArray::Builder.build_from_serialized [words, base, check]
    end
  end
end
|
229
|
+
# Serializers for character type tables.
module CharTypes
  # Stores an Okura::CharTypes table with Ruby's built-in Marshal format.
  class Marshal
    # Parses the character type definitions from +input+ into a new
    # Okura::CharTypes, dumps it to +output+ and returns the table.
    def compile(input, output)
      cts = Okura::CharTypes.new

      # shared by the single and range mapping callbacks
      add_mapping = lambda do |char, type, ctypes|
        cts.define_map char, cts.named(type), ctypes.map { |ct| cts.named(ct) }
      end

      parser = Okura::Parser::CharType.new
      parser.on_chartype_def do |name, invoke, group, length|
        cts.define_type(name, invoke, group, length)
      end
      parser.on_mapping_single do |char, type, ctypes|
        add_mapping.call(char, type, ctypes)
      end
      parser.on_mapping_range do |from, to, type, ctypes|
        # a range line maps every code point from +from+ to +to+ inclusive
        from.upto(to) { |char| add_mapping.call(char, type, ctypes) }
      end
      parser.parse_all input

      ::Marshal.dump(cts, output)
      cts
    end

    # Reads back an object previously dumped by #compile.
    def load(io)
      ::Marshal.load io
    end
  end
end
|
256
|
+
# Serializers for the unknown-word dictionary.
module UnkDic
  # Stores an Okura::UnkDic with Ruby's built-in Marshal format.
  class Marshal
    # Builds an Okura::UnkDic on top of +char_types+, defines one entry per
    # parsed line of +input+ (resolving both ids through +features+) and
    # dumps the dictionary to +output+.
    def compile(char_types, features, input, output)
      unk = Okura::UnkDic.new char_types
      Okura::Parser::UnkDic.new(input).each do |type_name, lid, rid, cost|
        left = features.from_id(lid)
        right = features.from_id(rid)
        unk.define type_name, left, right, cost
      end
      ::Marshal.dump(unk, output)
    end

    # Reads back an object previously dumped by #compile.
    def load(io)
      ::Marshal.load io
    end
  end
end
|
271
|
+
# Serializers for the connection matrix.
module Matrix
  # Stores an Okura::Matrix with Ruby's built-in Marshal format.
  class Marshal
    # Creates an Okura::Matrix sized from the parser's rid_size and
    # lid_size, sets one cell per parsed (rid, lid, cost) line of +input+
    # and dumps the matrix to +output+.
    def compile(input, output)
      parser = Okura::Parser::Matrix.new(input)
      mat = Okura::Matrix.new(parser.rid_size, parser.lid_size)
      parser.each do |rid, lid, cost|
        mat.set rid, lid, cost
      end
      ::Marshal.dump(mat, output)
    end

    # Reads back an object previously dumped by #compile.
    def load(io)
      ::Marshal.load io
    end
  end
end
|
286
|
+
# Reads the primitive values written by BinaryWriter from an IO-like
# object: native-endian 32-bit signed integers, length-prefixed int32
# arrays and marshalled Ruby objects.
class BinaryReader
  # io:: object responding to +read+ (and usable by Marshal.load)
  def initialize io
    @io=io
  end

  # Reads one 32-bit signed integer.
  #
  # Raises EOFError if fewer than 4 bytes are available.
  def read_int32
    read_exactly(4).unpack('l').first
  end

  # Reads an int32 element count followed by that many int32 values and
  # returns them as an Array.
  #
  # Raises EOFError if the stream ends before the announced number of
  # values has been read.
  def read_int32_array
    size=read_int32
    read_exactly(4*size).unpack('l*')
  end

  # Reads one marshalled Ruby object.
  def read_object
    Marshal.load @io
  end

  private

  # Reads exactly +byte_count+ bytes. A short read means the file is
  # truncated; failing here prevents a silently shortened array or a
  # NoMethodError on nil further down.
  def read_exactly byte_count
    data=@io.read(byte_count)
    if data.nil? || data.bytesize != byte_count
      got=data ? data.bytesize : 0
      raise EOFError,"unexpected end of data: wanted #{byte_count} bytes, got #{got}"
    end
    data
  end
end
|
301
|
+
# Writes primitive values to an IO-like object: native-endian 32-bit signed
# integers, length-prefixed int32 arrays and marshalled Ruby objects.
class BinaryWriter
  # io:: object responding to +write+ (and usable by Marshal.dump)
  def initialize io
    @io = io
  end

  # Writes +value+ as one 32-bit signed integer.
  def write_int32 value
    packed = [value].pack('l')
    @io.write(packed)
  end

  # Writes the element count as an int32, followed by every element of
  # +value+ as an int32.
  def write_int32_array value
    write_int32(value.length)
    payload = value.pack('l*')
    @io.write(payload)
  end

  # Writes +obj+ in Marshal format.
  def write_object obj
    Marshal.dump(obj, @io)
  end
end
|
316
|
+
end
|
317
|
+
end
|