okura 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,23 @@
1
+ require 'okura'
2
+ require 'okura/serializer'
3
+
4
+ module Okura
5
# Minimal interactive console: reads text from standard input line by line and
# prints the analysis produced by a tagger built from a compiled dictionary.
class Console
  # Runs the read/print loop until standard input reaches end of file.
  #
  # dict_dir is handed to Okura::Serializer::FormatInfo.create_tagger as is.
  # For every input line (surrounding whitespace stripped) this prints:
  #   1. for each index of the parse result, one line per node at that index
  #      ("surface<TAB>right-text cost"), followed by a blank line;
  #   2. one line per node of the result's mincost_path ("surface<TAB>right-text").
  # The prompt 'okura> ' is printed before each read.
  #
  # Always returns 0.
  def run_console(dict_dir)
    tagger = Okura::Serializer::FormatInfo.create_tagger(dict_dir)
    prompt = 'okura> '

    print prompt
    while (line = $stdin.gets)
      lattice = tagger.parse(line.strip)

      lattice.length.times do |index|
        candidates = lattice[index].map do |node|
          "#{node.word.surface}\t#{node.word.right.text} #{node.word.cost}"
        end
        puts candidates
        puts
      end

      lattice.mincost_path.each do |node|
        puts "#{node.word.surface}\t#{node.word.right.text}"
      end

      print prompt
    end

    0
  end
end
23
+ end
@@ -0,0 +1,131 @@
1
+ module Okura
2
# Line-oriented parsers for the textual dictionary source files.
module Parser

  # Raises a RuntimeError whose message is 'parse error: ' followed by +line+.
  def parse_error line
    raise 'parse error: '+line
  end

  # Make parse_error callable as Parser.parse_error as well, so that classes
  # nested in this namespace which do not mix Parser in (CharType) can reach it.
  extend self

  # Shared behaviour for parsers that turn each line of an IO into a record.
  # Including classes must implement parse(line) returning an Array; its
  # elements are passed to the block of #each as separate arguments.
  module Base
    include Enumerable

    # io must respond to each_line.
    def initialize io
      @io=io
    end

    # Yields the parsed fields of every line of the underlying IO.
    # Without a block, returns an Enumerator over the same records.
    def each &b
      # Enumerator.new(obj) without a block is not accepted by current Ruby
      # versions; to_enum is the supported way to build the enumerator.
      return to_enum(:each) unless b

      @io.each_line {|line|
        b.call(*parse(line))
      }
    end
  end

  # Parser for the connection-cost matrix: a header line holding two sizes,
  # followed by lines of "rid lid cost".
  class Matrix
    include Base

    attr_reader :rid_size
    attr_reader :lid_size

    # Consumes the header line immediately; the remaining lines are read by #each.
    def initialize io
      super
      # split without arguments splits on runs of whitespace, so repeated
      # spaces or tabs do not produce empty fields (which to_i would read as 0).
      @rid_size,@lid_size=io.readline.split.map(&:to_i)
    end

    # Returns [rid, lid, cost] as Integers (missing fields become nil).
    def parse line
      rid,lid,cost=line.split.map(&:to_i)
      [rid,lid,cost]
    end
  end

  # Parser for word definitions: comma separated, the first four columns are
  # used (the first as a String, the next three as Integers).
  # NOTE(review): columns are split on every comma, so a quoted first column
  # that itself contains a comma is not handled - confirm against the data.
  class Word
    include Base
    def parse line
      line.split(/,/).first(4).each_with_index.map{|col,i|
        i.zero? ? col.to_s : col.to_i
      }
    end
  end

  # Parser for feature definitions: "<id> <text>" separated by the first space.
  class Feature
    include Base
    # Returns [id, text]; text is nil when the line contains no space.
    def parse line
      id_s,name=line.strip.split(/ /,2)
      [id_s.to_i,name]
    end
  end

  # Parser for unknown-word definitions: "type,lid,rid,cost,..." - anything
  # after the fourth comma is ignored.
  class UnkDic
    include Base
    # Returns [type, lid, rid, cost] with the three numeric fields as Integers.
    def parse line
      type_s,lid_s,rid_s,cost_s,_rest=line.split(/,/,5)
      [type_s,lid_s.to_i,rid_s.to_i,cost_s.to_i]
    end
  end

  # Event-style parser for character-type definitions. Callers register
  # callbacks and then feed lines; each recognised line invokes the callbacks
  # registered for its kind, in registration order.
  class CharType
    # "0xHHHH" or "0xHHHH..0xHHHH" (exactly four hex digits each).
    MAPPING_PATTERN=/^0x([0-9a-fA-F]{4})(?:\.\.0x([0-9a-fA-F]{4}))?$/
    TYPE_NAME_PATTERN=/^\w+$/

    def initialize
      @callbacks={
        :mapping_single=>[],
        :mapping_range=>[],
        :define_type=>[]
      }
    end

    # Registers a callback called with (code_point, type, compat_types).
    def on_mapping_single &b
      @callbacks[:mapping_single] << b
    end

    # Registers a callback called with (from, to, type, compat_types).
    def on_mapping_range &b
      @callbacks[:mapping_range] << b
    end

    # Registers a callback called with (name, flag1, flag2, integer), where the
    # flags are true when columns 2 and 3 are the string '1'.
    def on_chartype_def &b
      @callbacks[:define_type] << b
    end

    # Parses every line of io.
    def parse_all io
      io.each_line {|line|
        parse line
      }
    end

    # Parses one line. Text from '#' to the end of the line is discarded and
    # lines that are empty afterwards are ignored.
    # Raises RuntimeError ('parse error: <line>') for malformed lines.
    def parse line
      cols=line.gsub(/\s*#.*$/,'').split(/\s+/)
      return if cols.empty?

      if (mapping=MAPPING_PATTERN.match(cols[0]))
        # mapping
        Parser.parse_error line unless cols.size >= 2
        type=cols[1]
        compat_types=cols[2..-1]
        from=mapping[1].to_i(16)
        if mapping[2]
          # mapping(range)
          to=mapping[2].to_i(16)
          @callbacks[:mapping_range].each{|c|
            c.call from,to,type,compat_types
          }
        else
          # mapping(single)
          @callbacks[:mapping_single].each{|c|
            c.call from,type,compat_types
          }
        end
      elsif TYPE_NAME_PATTERN.match(cols[0])
        Parser.parse_error line unless cols.size == 4
        # typedef
        @callbacks[:define_type].each{|c|
          c.call cols[0],(cols[1]=='1'),(cols[2]=='1'),Integer(cols[3])
        }
      else
        # error
        Parser.parse_error line
      end
    end
  end
end
131
+ end
@@ -0,0 +1,317 @@
1
+ require 'yaml'
2
+ require 'okura'
3
+ require 'okura/parser'
4
+
5
+ module Okura
6
+ module Serializer
7
# Represents the compiled format of the dictionary files and controls
# compiling and loading.
class FormatInfo
  attr_accessor :word_dic, :unk_dic, :features, :char_types, :matrix, :encoding

  # Sets the defaults: DoubleArray for the word dictionary, Marshal for every
  # other part, and 'EUC-JP' as the source encoding.
  def initialize
    @word_dic = :DoubleArray
    @unk_dic = :Marshal
    @features = :Marshal
    @char_types = :Marshal
    @matrix = :Marshal
    @encoding = 'EUC-JP'
  end

  # Compiles the sources found in src_dir and writes the results to bin_dir.
  #
  # Reads left-id.def, every *.csv file, char.def, unk.def and matrix.def;
  # writes format-info, left-id.bin, word_dic.bin, char_types.bin,
  # unk_dic.bin and matrix.bin.
  def compile_dict(src_dir, bin_dir)
    open_dest(bin_dir, 'format-info') { |out| compile(out) }

    left_features = open_src(src_dir, 'left-id.def') do |src|
      open_dest(bin_dir, 'left-id.bin') do |out|
        serializer_for('Features', features).compile(src, out)
      end
    end

    csv_names = Dir.chdir(src_dir) { Dir.glob('*.csv') }
    csv_paths = csv_names.map { |name| File.join(src_dir, name) }
    open_dest(bin_dir, 'word_dic.bin') do |out|
      serializer_for('WordDic', word_dic).compile(left_features, csv_paths, encoding, out)
    end

    compiled_char_types = open_src(src_dir, 'char.def') do |src|
      open_dest(bin_dir, 'char_types.bin') do |out|
        serializer_for('CharTypes', @char_types).compile(src, out)
      end
    end

    open_src(src_dir, 'unk.def') do |src|
      open_dest(bin_dir, 'unk_dic.bin') do |out|
        serializer_for('UnkDic', unk_dic).compile(compiled_char_types, left_features, src, out)
      end
    end

    open_src(src_dir, 'matrix.def') do |src|
      open_dest(bin_dir, 'matrix.bin') do |out|
        serializer_for('Matrix', matrix).compile(src, out)
      end
    end
  end

  # Loads the compiled dictionary in bin_dir and creates a Tagger, using the
  # format description stored in the 'format-info' file of that directory.
  def self.create_tagger(bin_dir)
    info_path = File.join(bin_dir, 'format-info')
    format_info = File.open(info_path) { |file| self.load(file) }
    format_info.create_tagger(bin_dir)
  end

  # Loads the compiled dictionary in bin_dir according to this format
  # description and returns an Okura::Tagger.
  def create_tagger(bin_dir)
    # The result of this load is not used below; the read is kept so the
    # method touches the same files as before.
    open_bin(bin_dir, 'left-id.bin') do |bin|
      serializer_for('Features', features).load(bin)
    end

    word_dictionary = open_bin(bin_dir, 'word_dic.bin') do |bin|
      serializer_for('WordDic', word_dic).load(bin)
    end
    unknown_dictionary = open_bin(bin_dir, 'unk_dic.bin') do |bin|
      serializer_for('UnkDic', unk_dic).load(bin)
    end
    cost_matrix = open_bin(bin_dir, 'matrix.bin') do |bin|
      serializer_for('Matrix', matrix).load(bin)
    end

    Okura::Tagger.new(Okura::Dic.new(word_dictionary, unknown_dictionary), cost_matrix)
  end

  # Serializes this FormatInfo object to io as YAML.
  # The encoding attribute is not part of the serialized data.
  def compile(io)
    settings = {
      word_dic: word_dic,
      unk_dic: unk_dic,
      features: features,
      char_types: char_types,
      matrix: matrix
    }
    YAML.dump(settings, io)
  end

  # Restores a serialized FormatInfo object from io.
  def self.load(io)
    data = YAML.load(io)
    info = FormatInfo.new
    [:word_dic, :unk_dic, :features, :char_types, :matrix].each do |key|
      info.public_send("#{key}=", data[key])
    end
    info
  end

  private

  # Opens a source file for reading, transcoding from #encoding to UTF-8.
  def open_src(dir, filename, &block)
    path = File.join(dir, filename)
    File.open(path, "r:#{encoding}:UTF-8", &block)
  end

  # Opens an output file for binary writing.
  def open_dest(dir, filename, &block)
    path = File.join(dir, filename)
    File.open(path, 'wb:ASCII-8BIT', &block)
  end

  # Opens a compiled file for binary reading.
  def open_bin(dir, filename, &block)
    path = File.join(dir, filename)
    File.open(path, 'rb:ASCII-8BIT', &block)
  end

  # Instantiates Okura::Serializer::<data_type_name>::<format_type_name>.
  def serializer_for(data_type_name, format_type_name)
    Okura::Serializer.const_get(data_type_name).const_get(format_type_name).new
  end
end
117
# Serializers for the feature table.
module Features
  # Stores the feature table using Ruby's built-in Marshal format.
  class Marshal
    # Parses feature definitions from input, registers each (id, text) pair
    # in a new Okura::Features, dumps it to output and returns it.
    def compile(input, output)
      source = Okura::Parser::Feature.new(input)
      table = Okura::Features.new
      source.each do |id, text|
        table.add(id, text)
      end
      ::Marshal.dump(table, output)
      table
    end

    # Reads back an object previously written by #compile.
    def load(io)
      ::Marshal.load(io)
    end
  end
end
133
# Serializers for the word dictionary.
module WordDic
  # Calls block once per entry of inputs. String entries are treated as file
  # paths and opened for reading (transcoding from encoding to UTF-8); any
  # other entry is passed to the block unchanged.
  def self.each_input inputs,encoding,&block
    inputs.each{|input|
      case input
      when String
        File.open(input,"r:#{encoding}:UTF-8",&block)
      else
        block.call input
      end
    }
  end

  # Internal helper shared by the serializers below: parses every input and
  # defines one Okura::Word per record on dic. Returns dic.
  def self.define_words dic,features,inputs,encoding
    each_input(inputs,encoding){|input|
      Okura::Parser::Word.new(input).each{|surface,lid,rid,cost|
        dic.define Okura::Word.new(
          surface,
          features.from_id(lid),
          features.from_id(rid),
          cost
        )
      }
    }
    dic
  end

  # Builds an Okura::WordDic::Naive and stores it with Marshal.
  class Naive
    def compile(features,inputs,encoding,output)
      dic=WordDic.define_words(Okura::WordDic::Naive.new,features,inputs,encoding)
      # ::Marshal, like the sibling serializers, so the lookup cannot be
      # captured by a serializer class named Marshal in an enclosing namespace.
      ::Marshal.dump(dic,output)
    end
    def load(io)
      ::Marshal.load(io)
    end
  end

  # Builds the dictionary with Okura::WordDic::DoubleArray::Builder and writes
  # it in a custom layout: word tables first, then the base and check arrays.
  class DoubleArray
    # Progress messages are printed to standard output while compiling.
    def compile(features,inputs,encoding,output)
      puts 'loading'
      dic=WordDic.define_words(Okura::WordDic::DoubleArray::Builder.new,features,inputs,encoding)

      writer=Okura::Serializer::BinaryWriter.new output
      words,base,check=dic.data_for_serialize
      raise 'base.length!=check.length' if base.length!=check.length
      puts 'serialize words'
      # instance_eval is used to reach the instance variables of the words
      # object (and of its @surfaces); the write order here must match the
      # read order in #load.
      words.instance_eval do
        writer.write_object @groups
        writer.write_object @left_features
        writer.write_object @right_features
        writer.write_int32_array @left_ids
        writer.write_int32_array @right_ids
        writer.write_int32_array @costs
        writer.write_int32_array @surface_ids
        puts 'serialize surfaces'
        @surfaces.instance_eval do
          writer.write_object @str
          writer.write_int32_array @indices
        end
      end
      puts 'serialize DAT indices'
      writer.write_int32_array base
      writer.write_int32_array check
    end

    # Reads the fields in the order written by #compile and rebuilds the
    # dictionary through Builder.build_from_serialized.
    def load(io)
      reader=Okura::Serializer::BinaryReader.new io
      words=begin
        groups=reader.read_object
        left_features=reader.read_object
        right_features=reader.read_object
        left_ids=reader.read_int32_array
        right_ids=reader.read_int32_array
        costs=reader.read_int32_array
        surface_ids=reader.read_int32_array
        surfaces=begin
          str=reader.read_object
          indices=reader.read_int32_array
          Okura::Words::CompactStringArray.new str,indices
        end
        Okura::Words.new(
          groups,surfaces,left_features,right_features,surface_ids,left_ids,right_ids,costs
        )
      end
      base=reader.read_int32_array
      check=reader.read_int32_array
      Okura::WordDic::DoubleArray::Builder.build_from_serialized [words,base,check]
    end
  end
end
229
# Serializers for the character-type table.
module CharTypes
  # Stores the character-type table using Ruby's built-in Marshal format.
  class Marshal
    # Parses character-type definitions from input into a new
    # Okura::CharTypes, dumps it to output and returns it.
    def compile(input, output)
      table = Okura::CharTypes.new

      # Shared by single and range mappings: maps one character to its type
      # and compatible types, all looked up by name in the table.
      add_mapping = lambda do |char, type, compat_names|
        table.define_map char, table.named(type), compat_names.map { |name| table.named(name) }
      end

      source = Okura::Parser::CharType.new
      source.on_chartype_def do |name, invoke, group, length|
        table.define_type(name, invoke, group, length)
      end
      source.on_mapping_single do |char, type, compat_names|
        add_mapping.call(char, type, compat_names)
      end
      source.on_mapping_range do |first, last, type, compat_names|
        first.upto(last) do |char|
          add_mapping.call(char, type, compat_names)
        end
      end
      source.parse_all(input)

      ::Marshal.dump(table, output)
      table
    end

    # Reads back an object previously written by #compile.
    def load(io)
      ::Marshal.load(io)
    end
  end
end
256
# Serializers for the unknown-word dictionary.
module UnkDic
  # Stores the unknown-word dictionary using Ruby's built-in Marshal format.
  class Marshal
    # Parses unknown-word definitions from input, defines each one on a new
    # Okura::UnkDic built from char_types (left/right ids are resolved through
    # features.from_id), and dumps the dictionary to output.
    # Returns whatever ::Marshal.dump returns.
    def compile(char_types, features, input, output)
      dic = Okura::UnkDic.new(char_types)
      source = Okura::Parser::UnkDic.new(input)
      source.each do |type_name, lid, rid, cost|
        left = features.from_id(lid)
        right = features.from_id(rid)
        dic.define(type_name, left, right, cost)
      end
      ::Marshal.dump(dic, output)
    end

    # Reads back an object previously written by #compile.
    def load(io)
      ::Marshal.load(io)
    end
  end
end
271
# Serializers for the connection-cost matrix.
module Matrix
  # Stores the matrix using Ruby's built-in Marshal format.
  class Marshal
    # Parses the matrix definition from input into a new Okura::Matrix sized
    # from the parser's rid_size/lid_size, sets every (rid, lid, cost) entry,
    # and dumps the matrix to output.
    # Returns whatever ::Marshal.dump returns.
    def compile(input, output)
      source = Okura::Parser::Matrix.new(input)
      table = Okura::Matrix.new(source.rid_size, source.lid_size)
      source.each do |rid, lid, cost|
        table.set(rid, lid, cost)
      end
      ::Marshal.dump(table, output)
    end

    # Reads back an object previously written by #compile.
    def load(io)
      ::Marshal.load(io)
    end
  end
end
286
# Reads the primitives written by BinaryWriter from an IO.
# Integers use the pack directive 'l' (32-bit signed, native byte order), so
# data is only readable on a platform with the byte order it was written on.
class BinaryReader
  INT32_BYTES = 4

  def initialize io
    @io=io
  end

  # Reads one 32-bit signed integer.
  # Raises EOFError when fewer than four bytes are available.
  def read_int32
    read_exactly(INT32_BYTES).unpack('l').first
  end

  # Reads a length-prefixed array: one int32 element count followed by that
  # many int32 values.
  # Raises EOFError when the input ends before the announced data is complete.
  def read_int32_array
    size=read_int32
    read_exactly(INT32_BYTES*size).unpack('l*')
  end

  # Reads one Marshal-dumped object from the current position.
  def read_object
    ::Marshal.load @io
  end

  private

  # Reads exactly length bytes. IO#read returns nil or a shorter string at end
  # of input; that case is reported as EOFError instead of surfacing later as
  # a nil error or a silently shortened array.
  def read_exactly length
    bytes=@io.read(length)
    if bytes.nil? || bytes.bytesize != length
      got=bytes ? bytes.bytesize : 0
      raise EOFError,"unexpected end of input (wanted #{length} bytes, got #{got})"
    end
    bytes
  end
end
301
# Writes the primitives understood by BinaryReader to an IO.
# Integers use the pack directive 'l' (32-bit signed, native byte order).
class BinaryWriter
  def initialize(io)
    @io = io
  end

  # Writes one 32-bit signed integer; returns the result of io.write.
  def write_int32(value)
    packed = [value].pack('l')
    @io.write(packed)
  end

  # Writes the element count as an int32 followed by every element as an
  # int32; returns the result of the final io.write.
  def write_int32_array(value)
    write_int32(value.length)
    payload = value.pack('l*')
    @io.write(payload)
  end

  # Writes obj in Marshal format at the current position.
  def write_object(obj)
    ::Marshal.dump(obj, @io)
  end
end
316
+ end
317
+ end