okura 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,23 @@
1
+ require 'okura'
2
+ require 'okura/serializer'
3
+
4
module Okura
  # Minimal interactive read-eval-print loop around a Tagger.
  class Console
    # Reads lines from $stdin until EOF, parses each one with a tagger built
    # from +dict_dir+ and prints the result.
    #
    # For every input line it prints, per position, the candidate nodes
    # (surface, right feature text, cost) followed by a blank line, and then
    # the nodes returned by +mincost_path+ (surface, right feature text).
    #
    # @param dict_dir directory handed to FormatInfo.create_tagger
    # @return [Integer] always 0
    def run_console(dict_dir)
      tagger = Okura::Serializer::FormatInfo.create_tagger(dict_dir)
      prompt = 'okura> '

      print prompt
      while (input = $stdin.gets)
        lattice = tagger.parse(input.strip)

        # Dump every candidate node, one group per position.
        lattice.length.times do |pos|
          puts lattice[pos].map { |node|
            "#{node.word.surface}\t#{node.word.right.text} #{node.word.cost}"
          }
          puts
        end

        # Then the path reported by mincost_path.
        lattice.mincost_path.each do |node|
          puts "#{node.word.surface}\t#{node.word.right.text}"
        end

        print prompt
      end

      0
    end
  end
end
@@ -0,0 +1,131 @@
1
+ module Okura
2
+ module Parser
3
+
4
# Aborts parsing by raising a RuntimeError whose message is
# "parse error: " followed by the offending line.
def parse_error(line)
  message = 'parse error: ' + line
  raise RuntimeError, message
end
7
+
8
# Shared behaviour for the line-oriented source parsers.
#
# An including class must implement +parse(line)+ and return an array; its
# elements are splatted into the block given to #each.
module Base
  include Enumerable

  # @param io [#each_line] source whose lines are handed to +parse+
  def initialize(io)
    @io = io
  end

  # Calls the block once per line of the wrapped IO with the parsed fields.
  #
  # Without a block an Enumerator over the same values is returned.  It is
  # built with to_enum because Enumerator.new(obj) without a block was
  # deprecated in Ruby 2.0 and raises ArgumentError on Ruby 3.
  def each(&b)
    return to_enum(:each) unless b

    @io.each_line do |line|
      b.call(*parse(line))
    end
  end
end
24
+
25
# Parser for the connection-cost matrix source.
#
# The first line holds two integers (stored as rid_size and lid_size);
# every following line holds three integers: rid, lid and cost.
class Matrix
  include Base

  attr_reader :rid_size
  attr_reader :lid_size

  # Consumes the header line of +io+ immediately, so that iteration starts
  # at the first cost line.
  #
  # @param io [#readline, #each_line]
  def initialize(io)
    @io = io
    # split(' ') collapses runs of whitespace and ignores leading blanks;
    # split(/\s/) would yield empty fields there, which to_i turns into 0.
    @rid_size, @lid_size = io.readline.split(' ').map(&:to_i)
  end

  # @param line [String] one "rid lid cost" line
  # @return [Array] [rid, lid, cost]; a missing column is nil
  def parse(line)
    rid, lid, cost = line.split(' ').map(&:to_i)
    [rid, lid, cost]
  end
end
41
+
42
# Parser for comma-separated word dictionary sources.
class Word
  include Base

  # Takes the first four comma-separated columns of +line+ and converts
  # them: the first with to_s, the remaining ones with to_i.  Columns
  # beyond the fourth are ignored; a shorter line gives a shorter array.
  #
  # @param line [String]
  # @return [Array] up to four elements
  def parse(line)
    leading = line.split(/,/).first(4)
    leading.each_with_index.map do |value, index|
      index.zero? ? value.to_s : value.to_i
    end
  end
end
50
+
51
# Parser for feature definition sources: one "<id> <name>" entry per line.
class Feature
  include Base

  # Splits the stripped line at its first space.
  #
  # @param line [String]
  # @return [Array] [id, name] where id is the to_i of the leading token and
  #   name is the rest of the line (nil when the line contains no space)
  def parse(line)
    id_token, label = line.strip.split(/ /, 2)
    [id_token.to_i, label]
  end
end
59
+
60
# Parser for the unknown-word dictionary source (comma separated).
class UnkDic
  include Base

  # Reads the first four columns: a type name and three integers.
  # Whatever follows the fourth comma is ignored.
  #
  # @param line [String]
  # @return [Array] [type_name, lid, rid, cost]
  def parse(line)
    type_name, left, right, cost, _rest = line.split(/,/, 5)
    [type_name, left.to_i, right.to_i, cost.to_i]
  end
end
68
+
69
# Event-driven parser for character type definitions.
#
# Callers register callbacks with the on_* methods and then feed lines via
# #parse or #parse_all.  Three kinds of lines are recognised after removing
# a trailing "# comment":
#
#   0xHHHH TYPE [COMPAT...]           -> mapping_single(from, type, compat_types)
#   0xHHHH..0xHHHH TYPE [COMPAT...]   -> mapping_range(from, to, type, compat_types)
#   NAME F1 F2 N                      -> define_type(name, F1 == '1', F2 == '1', Integer(N))
#
# Blank and comment-only lines are ignored; anything else raises
# RuntimeError ("parse error: <line>").
class CharType
  def initialize
    @callbacks = {
      :mapping_single => [],
      :mapping_range => [],
      :define_type => []
    }
  end

  # Registers a block called with (char, type, compat_types).
  def on_mapping_single(&b)
    @callbacks[:mapping_single] << b
  end

  # Registers a block called with (from, to, type, compat_types).
  def on_mapping_range(&b)
    @callbacks[:mapping_range] << b
  end

  # Registers a block called with (name, flag1, flag2, number).
  def on_chartype_def(&b)
    @callbacks[:define_type] << b
  end

  # Parses every line of +io+.
  def parse_all(io)
    io.each_line do |line|
      parse line
    end
  end

  # Parses a single line and fires the matching callbacks.
  #
  # @raise [RuntimeError] when the line is malformed
  # @raise [ArgumentError] when the last column of a type definition is not
  #   an integer (raised by Kernel#Integer)
  def parse(line)
    # strip so that indented lines do not produce an empty first column
    cols = line.gsub(/\s*#.*$/, '').strip.split(/\s+/)
    return if cols.empty?

    case cols[0]
    when /^0x([0-9a-fA-F]{4})(?:\.\.0x([0-9a-fA-F]{4}))?$/
      # keep the captures before any later match can overwrite $1 / $2
      from_hex = $1
      to_hex = $2
      parse_error line unless cols.size >= 2
      type = cols[1]
      compat_types = cols[2..-1]
      from = from_hex.to_i(16)
      if to_hex
        # mapping (range)
        to = to_hex.to_i(16)
        @callbacks[:mapping_range].each do |c|
          c.call from, to, type, compat_types
        end
      else
        # mapping (single code point)
        @callbacks[:mapping_single].each do |c|
          c.call from, type, compat_types
        end
      end
    when /^\w+$/
      # type definition
      parse_error line unless cols.size == 4
      @callbacks[:define_type].each do |c|
        c.call cols[0], (cols[1] == '1'), (cols[2] == '1'), Integer(cols[3])
      end
    else
      parse_error line
    end
  end

  private

  # Defined here because this class does not include the module that
  # declares parse_error, so the call would otherwise end in NoMethodError.
  def parse_error(line)
    raise 'parse error: ' + line
  end
end
130
+ end
131
+ end
@@ -0,0 +1,317 @@
1
+ require 'yaml'
2
+ require 'okura'
3
+ require 'okura/parser'
4
+
5
+ module Okura
6
+ module Serializer
7
# Describes the compiled form of the dictionary files and controls both
# compiling the sources and loading the compiled result.
#
# Each of word_dic / unk_dic / features / char_types / matrix names the
# serializer class (a constant inside the matching Okura::Serializer
# module) used for that part; +encoding+ is the external encoding used
# when reading the source files.
class FormatInfo
  def initialize
    @word_dic = :DoubleArray
    @unk_dic = :Marshal
    @features = :Marshal
    @char_types = :Marshal
    @matrix = :Marshal
    @encoding = 'EUC-JP'
  end

  attr_accessor :word_dic
  attr_accessor :unk_dic
  attr_accessor :features
  attr_accessor :char_types
  attr_accessor :matrix
  attr_accessor :encoding

  # Compiles the sources found in +src_dir+ and writes the binary files
  # (plus the 'format-info' description of this object) into +bin_dir+.
  def compile_dict(src_dir, bin_dir)
    open_dest(bin_dir, 'format-info') { |dest| compile dest }

    features_l = open_src(src_dir, 'left-id.def') do |src|
      open_dest(bin_dir, 'left-id.bin') do |dest|
        serializer_for('Features', features).compile(src, dest)
      end
    end

    # Every *.csv file directly inside src_dir is a word source.
    word_src_files =
      Dir.chdir(src_dir) { Dir.glob('*.csv') }
         .map { |file| File.join(src_dir, file) }
    open_dest(bin_dir, 'word_dic.bin') do |dest|
      serializer_for('WordDic', word_dic).compile(features_l, word_src_files, encoding, dest)
    end

    compiled_char_types = open_src(src_dir, 'char.def') do |src|
      open_dest(bin_dir, 'char_types.bin') do |dest|
        serializer_for('CharTypes', char_types).compile(src, dest)
      end
    end

    open_src(src_dir, 'unk.def') do |src|
      open_dest(bin_dir, 'unk_dic.bin') do |dest|
        serializer_for('UnkDic', unk_dic).compile(compiled_char_types, features_l, src, dest)
      end
    end

    open_src(src_dir, 'matrix.def') do |src|
      open_dest(bin_dir, 'matrix.bin') do |dest|
        serializer_for('Matrix', matrix).compile(src, dest)
      end
    end
  end

  # Loads the compiled dictionary found in +bin_dir+ and builds a Tagger.
  # The format is read from the 'format-info' file in that directory.
  def self.create_tagger(bin_dir)
    format_info = File.open(File.join(bin_dir, 'format-info')) { |f| load f }
    format_info.create_tagger bin_dir
  end

  # Builds a Tagger from the compiled files in +bin_dir+ using the formats
  # configured on this object.
  def create_tagger(bin_dir)
    # The loaded features are not used below; the read is kept so that the
    # set of files touched stays as before.
    open_bin(bin_dir, 'left-id.bin') do |bin|
      serializer_for('Features', features).load(bin)
    end
    wd = open_bin(bin_dir, 'word_dic.bin') do |f|
      serializer_for('WordDic', word_dic).load(f)
    end
    ud = open_bin(bin_dir, 'unk_dic.bin') do |f|
      serializer_for('UnkDic', unk_dic).load(f)
    end
    mat = open_bin(bin_dir, 'matrix.bin') do |f|
      serializer_for('Matrix', matrix).load(f)
    end
    dic = Okura::Dic.new wd, ud
    Okura::Tagger.new dic, mat
  end

  # Serializes this FormatInfo as YAML onto +io+.
  #
  # The encoding is written as well, so that a reloaded object reports the
  # encoding the dictionary was compiled with.
  def compile(io)
    YAML.dump({
      word_dic: word_dic,
      unk_dic: unk_dic,
      features: features,
      char_types: char_types,
      matrix: matrix,
      encoding: encoding
    }, io)
  end

  # Restores a FormatInfo previously written by #compile.
  #
  # Data without an :encoding entry keeps the default encoding.
  # NOTE(review): this parses YAML from the dictionary directory; only load
  # dictionaries from a trusted location.
  def self.load(io)
    data = YAML.load(io)
    fi = new
    fi.word_dic = data[:word_dic]
    fi.unk_dic = data[:unk_dic]
    fi.features = data[:features]
    fi.char_types = data[:char_types]
    fi.matrix = data[:matrix]
    fi.encoding = data[:encoding] if data[:encoding]
    fi
  end

  private

  # Opens a source file, transcoding from +encoding+ to UTF-8.
  def open_src(dir, filename, &block)
    File.open(File.join(dir, filename), "r:#{encoding}:UTF-8", &block)
  end

  # Opens a compiled file for binary writing.
  def open_dest(dir, filename, &block)
    File.open(File.join(dir, filename), 'wb:ASCII-8BIT', &block)
  end

  # Opens a compiled file for binary reading.
  def open_bin(dir, filename, &block)
    File.open(File.join(dir, filename), 'rb:ASCII-8BIT', &block)
  end

  # Instantiates Okura::Serializer::<data_type_name>::<format_type_name>.
  def serializer_for(data_type_name, format_type_name)
    data_type = Okura::Serializer.const_get data_type_name
    format_type = data_type.const_get format_type_name
    format_type.new
  end
end
117
# Serializers for the feature table.
module Features
  # Stores an Okura::Features object with Ruby's Marshal.
  class Marshal
    # Reads (id, text) pairs from +input+, adds them to a new
    # Okura::Features and dumps that object onto +output+.
    #
    # @return the populated Okura::Features
    def compile(input, output)
      source = Okura::Parser::Feature.new(input)
      table = Okura::Features.new
      source.each do |id, text|
        table.add(id, text)
      end
      ::Marshal.dump(table, output)
      table
    end

    # @return the object previously dumped by #compile
    def load(io)
      ::Marshal.load(io)
    end
  end
end
133
+ module WordDic
134
# Calls the block once per entry of +inputs+.
#
# A String entry is treated as a file path: the file is opened for
# reading, transcoded from +encoding+ to UTF-8, and passed to the block.
# Any other entry is passed to the block unchanged.
def self.each_input(inputs, encoding, &block)
  inputs.each do |input|
    if String === input
      mode = "r:#{encoding}:UTF-8"
      File.open(input, mode, &block)
    else
      block.call(input)
    end
  end
end
144
# Stores an Okura::WordDic::Naive dictionary with Ruby's Marshal.
class Naive
  # Builds the dictionary from every word in +inputs+ and dumps it onto
  # +output+.
  #
  # @param features object whose +from_id+ resolves left/right ids
  # @param inputs entries accepted by Serializer::WordDic.each_input
  # @param encoding encoding of inputs that are file paths
  # @return the result of Marshal.dump
  def compile(features, inputs, encoding, output)
    dictionary = Okura::WordDic::Naive.new
    Okura::Serializer::WordDic.each_input(inputs, encoding) do |input|
      Okura::Parser::Word.new(input).each do |surface, lid, rid, cost|
        left = features.from_id(lid)
        right = features.from_id(rid)
        dictionary.define Okura::Word.new(surface, left, right, cost)
      end
    end
    ::Marshal.dump(dictionary, output)
  end

  # @return the object previously dumped by #compile
  def load(io)
    ::Marshal.load(io)
  end
end
165
# Stores a double-array word dictionary in a hand-rolled binary layout.
#
# Layout, in write order: groups, left_features, right_features (objects);
# left_ids, right_ids, costs, surface_ids (int32 arrays); the surfaces'
# str (object) and indices (int32 array); base and check (int32 arrays).
# #load reads the same fields in the same order.
class DoubleArray
  # Builds the dictionary from +inputs+ and writes it onto +output+.
  # Progress messages are printed to standard output.
  #
  # @param features object whose +from_id+ resolves left/right ids
  # @param inputs entries accepted by Serializer::WordDic.each_input
  # @param encoding encoding of inputs that are file paths
  # @raise [RuntimeError] when base and check differ in length
  def compile(features, inputs, encoding, output)
    puts 'loading'
    builder = Okura::WordDic::DoubleArray::Builder.new
    Okura::Serializer::WordDic.each_input(inputs, encoding) do |input|
      Okura::Parser::Word.new(input).each do |surface, lid, rid, cost|
        left = features.from_id(lid)
        right = features.from_id(rid)
        builder.define Okura::Word.new(surface, left, right, cost)
      end
    end

    writer = Okura::Serializer::BinaryWriter.new output
    words, base, check = builder.data_for_serialize
    raise 'base.length!=check.length' unless base.length == check.length

    puts 'serialize words'
    # The word table exposes no readers for these fields here, so its
    # instance variables are read directly.
    [:@groups, :@left_features, :@right_features].each do |ivar|
      writer.write_object words.instance_variable_get(ivar)
    end
    [:@left_ids, :@right_ids, :@costs, :@surface_ids].each do |ivar|
      writer.write_int32_array words.instance_variable_get(ivar)
    end

    puts 'serialize surfaces'
    surfaces = words.instance_variable_get(:@surfaces)
    writer.write_object surfaces.instance_variable_get(:@str)
    writer.write_int32_array surfaces.instance_variable_get(:@indices)

    puts 'serialize DAT indices'
    writer.write_int32_array base
    writer.write_int32_array check
  end

  # Reads the layout written by #compile and rebuilds the dictionary via
  # Builder.build_from_serialized.
  def load(io)
    reader = Okura::Serializer::BinaryReader.new io

    # Read order must mirror the write order of #compile.
    groups = reader.read_object
    left_features = reader.read_object
    right_features = reader.read_object
    left_ids = reader.read_int32_array
    right_ids = reader.read_int32_array
    costs = reader.read_int32_array
    surface_ids = reader.read_int32_array

    str = reader.read_object
    indices = reader.read_int32_array
    surfaces = Okura::Words::CompactStringArray.new str, indices

    words = Okura::Words.new(
      groups, surfaces, left_features, right_features,
      surface_ids, left_ids, right_ids, costs
    )

    base = reader.read_int32_array
    check = reader.read_int32_array
    Okura::WordDic::DoubleArray::Builder.build_from_serialized [words, base, check]
  end
end
228
+ end
229
# Serializers for the character type table.
module CharTypes
  # Stores an Okura::CharTypes object with Ruby's Marshal.
  class Marshal
    # Parses +input+ with Okura::Parser::CharType, fills a new
    # Okura::CharTypes from the parser callbacks and dumps it onto +output+.
    #
    # @return the populated Okura::CharTypes
    def compile(input, output)
      table = Okura::CharTypes.new
      # Registers one code point with its type and compatible types,
      # all looked up by name in the table.
      register = lambda do |char, type, compat_names|
        table.define_map char, table.named(type), compat_names.map { |n| table.named(n) }
      end

      parser = Okura::Parser::CharType.new
      parser.on_chartype_def do |name, invoke, group, length|
        table.define_type(name, invoke, group, length)
      end
      parser.on_mapping_single do |char, type, ctypes|
        register.call(char, type, ctypes)
      end
      parser.on_mapping_range do |from, to, type, ctypes|
        from.upto(to) { |char| register.call(char, type, ctypes) }
      end
      parser.parse_all input

      ::Marshal.dump(table, output)
      table
    end

    # @return the object previously dumped by #compile
    def load(io)
      ::Marshal.load(io)
    end
  end
end
256
# Serializers for the unknown-word dictionary.
module UnkDic
  # Stores an Okura::UnkDic object with Ruby's Marshal.
  class Marshal
    # Builds an Okura::UnkDic for +char_types+ from the entries in +input+
    # and dumps it onto +output+.
    #
    # @param features object whose +from_id+ resolves left/right ids
    # @return the result of Marshal.dump
    def compile(char_types, features, input, output)
      dictionary = Okura::UnkDic.new(char_types)
      Okura::Parser::UnkDic.new(input).each do |type_name, lid, rid, cost|
        left = features.from_id(lid)
        right = features.from_id(rid)
        dictionary.define(type_name, left, right, cost)
      end
      ::Marshal.dump(dictionary, output)
    end

    # @return the object previously dumped by #compile
    def load(io)
      ::Marshal.load(io)
    end
  end
end
271
# Serializers for the connection-cost matrix.
module Matrix
  # Stores an Okura::Matrix object with Ruby's Marshal.
  class Marshal
    # Creates an Okura::Matrix sized from the parser's rid_size/lid_size,
    # sets every (rid, lid, cost) entry read from +input+ and dumps the
    # matrix onto +output+.
    #
    # @return the result of Marshal.dump
    def compile(input, output)
      source = Okura::Parser::Matrix.new(input)
      matrix = Okura::Matrix.new(source.rid_size, source.lid_size)
      source.each do |rid, lid, cost|
        matrix.set(rid, lid, cost)
      end
      ::Marshal.dump(matrix, output)
    end

    # @return the object previously dumped by #compile
    def load(io)
      ::Marshal.load(io)
    end
  end
end
286
# Reads the primitives written by BinaryWriter from an IO.
#
# Integers use the 'l' pack directive (signed 32-bit, native byte order),
# so a file must be read on a machine with the byte order it was written
# with.
class BinaryReader
  # @param io [#read] binary input
  def initialize(io)
    @io = io
  end

  # @return [Integer] the next signed 32-bit integer
  # @raise [EOFError] when fewer than 4 bytes remain
  def read_int32
    read_exactly(4).unpack('l').first
  end

  # Reads a length-prefixed array: one int32 count followed by that many
  # int32 values.
  #
  # @return [Array<Integer>]
  # @raise [EOFError] when the input ends before the announced length
  def read_int32_array
    size = read_int32
    read_exactly(4 * size).unpack('l*')
  end

  # @return the next Marshal-dumped object
  def read_object
    ::Marshal.load(@io)
  end

  private

  # IO#read returns nil or a short string at end of input; without this
  # check a truncated file ends in NoMethodError on nil or in a silently
  # shortened array.
  def read_exactly(length)
    data = @io.read(length)
    got = data ? data.bytesize : 0
    unless got == length
      raise EOFError, "unexpected end of input (wanted #{length} bytes, got #{got})"
    end
    data
  end
end
301
# Writes the primitives understood by BinaryReader onto an IO.
#
# Integers use the 'l' pack directive (signed 32-bit, native byte order).
class BinaryWriter
  # @param io [#write] binary output
  def initialize(io)
    @io = io
  end

  # Writes +value+ as one signed 32-bit integer.
  def write_int32(value)
    packed = [value].pack('l')
    @io.write(packed)
  end

  # Writes the element count as an int32, then every element as an int32.
  def write_int32_array(value)
    write_int32(value.length)
    payload = value.pack('l*')
    @io.write(payload)
  end

  # Writes +obj+ in Marshal format.
  def write_object(obj)
    ::Marshal.dump(obj, @io)
  end
end
316
+ end
317
+ end