okura 0.0.0 → 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/okura +83 -0
- data/lib/okura.rb +4 -4
- data/lib/okura/serializer.rb +22 -14
- data/test/okura_spec.rb +20 -22
- metadata +4 -2
data/bin/okura
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
#!/usr/bin/env ruby
# -*- filetype:ruby -*-
#
# Command-line front end for Okura.
#
#   okura command [args]
#   okura --version
#
# Every sub-command is a lambda stored in the `commands` table under its
# (String) name and receives the arguments that follow the command name.
# A sub-command reports bad arguments by raising IllegalOption; the dispatcher
# prints the message to stderr and the process exits with status 1.

require 'rubygems'

# Raised by a sub-command that was invoked with invalid arguments.
# The exception message is the text shown to the user.
class IllegalOption < StandardError
end

commands={
  # console dict_dir
  #   Runs Okura::Console against dict_dir.
  'console' => lambda {|args|
    # Validate first so a usage error is reported without loading the library.
    raise IllegalOption, 'Usage: console dict_dir' unless args.length==1
    require 'okura/console'
    Okura::Console.new.run_console args[0]
  },
  # compile [--word_dic=TYPE] [--encoding=ENCODING] [--dry] dict_dir dest_dir
  #   Prints the effective settings, then (unless --dry is given) compiles
  #   dict_dir into dest_dir, creating dest_dir when it does not exist.
  'compile' => lambda {|args|
    require 'okura/serializer'
    require 'optparse'
    format_info=Okura::Serializer::FormatInfo.new
    dry=false
    opt=OptionParser.new
    opt.on('--word_dic=TYPE'){|t| format_info.word_dic=t.to_sym}
    opt.on('--dry'){|d| dry=d}
    opt.on('--encoding=ENCODING'){|encoding| format_info.encoding=encoding}
    begin
      # parse! strips the recognised options, leaving positional arguments.
      opt.parse! args
    rescue OptionParser::ParseError => e
      # Unknown/malformed options are a usage error, not a crash.
      raise IllegalOption, e.message
    end
    raise IllegalOption, 'Usage: compile dict_dir dest_dir' unless args.length==2
    dict_dir,bin_dir=args
    puts <<-EOS
Settings
  - File Encoding: #{format_info.encoding}
  - Format:
    - word_dic: #{format_info.word_dic}
    - unk_dic: #{format_info.unk_dic}
    - features: #{format_info.features}
    - char_types: #{format_info.char_types}
    - matrix: #{format_info.matrix}
    EOS
    next if dry
    # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
    Dir.mkdir bin_dir unless File.exist? bin_dir
    format_info.compile_dict(dict_dir,bin_dir)
  },
  # help
  #   Prints the usage summary to stdout.
  'help' => lambda {|args|
    puts <<-EOS
USAGE:
  okura command [args]
  okura --version
command:
  compile src_dir dest_dir
  console dict_dir
  help
    EOS
  }
}

# Dispatches a non-empty argument vector and returns the process exit status:
# 0 on success, 1 for an unknown command or an IllegalOption.
run=lambda {|argv|
  command=argv.first
  if command == '--version'
    require 'okura/version'
    puts "Okura #{Okura::Version}"
    next 0
  end
  unless commands.key? command
    puts "unknown command: #{command}"
    # The table is keyed by String; a Symbol lookup would return nil.
    commands['help'].call []
    next 1
  end
  begin
    commands[command].call argv.drop(1)
    0
  rescue IllegalOption => e
    $stderr.puts e.message
    1
  end
}

if ARGV.empty?
  commands['help'].call []
else
  exit run.call(ARGV)
end
|
82
|
+
|
83
|
+
|
data/lib/okura.rb
CHANGED
@@ -113,8 +113,8 @@ module Okura
|
|
113
113
|
def get id
|
114
114
|
raise 'bad id' unless id < @indices.length
|
115
115
|
from=@indices[id]
|
116
|
-
to=(id+1 < @indices.length) ? @indices[id+1] : @str.
|
117
|
-
|
116
|
+
to=(id+1 < @indices.length) ? @indices[id+1] : @str.bytesize
|
117
|
+
(from...to).map{|i|@str.getbyte(i)}.pack('C*').force_encoding 'UTF-8'
|
118
118
|
end
|
119
119
|
def [](id)
|
120
120
|
get id
|
@@ -132,7 +132,7 @@ module Okura
|
|
132
132
|
id=@indices.length
|
133
133
|
@indices.push @size
|
134
134
|
@surfaces.push surface
|
135
|
-
@size+=surface.
|
135
|
+
@size+=surface.bytesize
|
136
136
|
id
|
137
137
|
end
|
138
138
|
end
|
@@ -258,7 +258,7 @@ module Okura
|
|
258
258
|
end
|
259
259
|
# Integer -> Feature
|
260
260
|
def from_id id
|
261
|
-
@map_id[id]
|
261
|
+
@map_id[id] || (raise "Features: ID undefined (#{id})")
|
262
262
|
end
|
263
263
|
def [](id)
|
264
264
|
from_id id
|
data/lib/okura/serializer.rb
CHANGED
@@ -30,11 +30,17 @@ module Okura
|
|
30
30
|
}
|
31
31
|
}
|
32
32
|
|
33
|
+
features_r=open_src(src_dir,'right-id.def'){|src|
|
34
|
+
open_dest(bin_dir,'right-id.bin'){|dest|
|
35
|
+
serializer_for('Features',features).compile(src,dest)
|
36
|
+
}
|
37
|
+
}
|
38
|
+
|
33
39
|
word_src_files=
|
34
40
|
Dir.chdir(src_dir){ Dir.glob('*.csv') }.
|
35
41
|
map{|file|File.join(src_dir,file)}
|
36
42
|
open_dest(bin_dir,'word_dic.bin'){|dest|
|
37
|
-
serializer_for('WordDic',word_dic).compile(features_l,word_src_files,encoding,dest)
|
43
|
+
serializer_for('WordDic',word_dic).compile(features_l,features_r,word_src_files,encoding,dest)
|
38
44
|
}
|
39
45
|
|
40
46
|
char_types=open_src(src_dir,'char.def'){|src|
|
@@ -45,7 +51,7 @@ module Okura
|
|
45
51
|
|
46
52
|
open_src(src_dir,'unk.def'){|src|
|
47
53
|
open_dest(bin_dir,'unk_dic.bin'){|dest|
|
48
|
-
serializer_for('UnkDic',unk_dic).compile(char_types,features_l,src,dest)
|
54
|
+
serializer_for('UnkDic',unk_dic).compile(char_types,features_l,features_r,src,dest)
|
49
55
|
}
|
50
56
|
}
|
51
57
|
|
@@ -64,6 +70,9 @@ module Okura
|
|
64
70
|
features_l=open_bin(bin_dir,'left-id.bin'){|bin|
|
65
71
|
serializer_for('Features',features).load(bin)
|
66
72
|
}
|
73
|
+
features_r=open_bin(bin_dir,'right-id.bin'){|bin|
|
74
|
+
serializer_for('Features',features).load(bin)
|
75
|
+
}
|
67
76
|
wd=open_bin(bin_dir,'word_dic.bin'){|f|
|
68
77
|
serializer_for('WordDic',word_dic).load(f)
|
69
78
|
}
|
@@ -142,15 +151,15 @@ module Okura
|
|
142
151
|
}
|
143
152
|
end
|
144
153
|
class Naive
|
145
|
-
def compile(
|
154
|
+
def compile(features_l,features_r,inputs,encoding,output)
|
146
155
|
dic=Okura::WordDic::Naive.new
|
147
156
|
Okura::Serializer::WordDic.each_input(inputs,encoding){|input|
|
148
157
|
parser=Okura::Parser::Word.new(input)
|
149
158
|
parser.each{|surface,lid,rid,cost|
|
150
159
|
word=Okura::Word.new(
|
151
160
|
surface,
|
152
|
-
|
153
|
-
|
161
|
+
features_l.from_id(lid),
|
162
|
+
features_r.from_id(rid),
|
154
163
|
cost
|
155
164
|
)
|
156
165
|
dic.define word
|
@@ -163,16 +172,16 @@ module Okura
|
|
163
172
|
end
|
164
173
|
end
|
165
174
|
class DoubleArray
|
166
|
-
def compile(
|
167
|
-
puts 'loading'
|
175
|
+
def compile(features_l,features_r,inputs,encoding,output)
|
176
|
+
puts 'loading...'
|
168
177
|
dic=Okura::WordDic::DoubleArray::Builder.new
|
169
178
|
Okura::Serializer::WordDic.each_input(inputs,encoding){|input|
|
170
179
|
parser=Okura::Parser::Word.new(input)
|
171
180
|
parser.each{|surface,lid,rid,cost|
|
172
181
|
word=Okura::Word.new(
|
173
182
|
surface,
|
174
|
-
|
175
|
-
|
183
|
+
features_l.from_id(lid),
|
184
|
+
features_r.from_id(rid),
|
176
185
|
cost
|
177
186
|
)
|
178
187
|
dic.define word
|
@@ -182,7 +191,7 @@ module Okura
|
|
182
191
|
writer=Okura::Serializer::BinaryWriter.new output
|
183
192
|
words,base,check=dic.data_for_serialize
|
184
193
|
raise 'base.length!=check.length' if base.length!=check.length
|
185
|
-
puts '
|
194
|
+
puts 'writing words...'
|
186
195
|
words.instance_eval do
|
187
196
|
writer.write_object @groups
|
188
197
|
writer.write_object @left_features
|
@@ -191,13 +200,12 @@ module Okura
|
|
191
200
|
writer.write_int32_array @right_ids
|
192
201
|
writer.write_int32_array @costs
|
193
202
|
writer.write_int32_array @surface_ids
|
194
|
-
puts 'serialize surfaces'
|
195
203
|
@surfaces.instance_eval do
|
196
204
|
writer.write_object @str
|
197
205
|
writer.write_int32_array @indices
|
198
206
|
end
|
199
207
|
end
|
200
|
-
puts '
|
208
|
+
puts 'writing word index...'
|
201
209
|
writer.write_int32_array base
|
202
210
|
writer.write_int32_array check
|
203
211
|
end
|
@@ -255,11 +263,11 @@ module Okura
|
|
255
263
|
end
|
256
264
|
module UnkDic
|
257
265
|
class Marshal
|
258
|
-
def compile(char_types,
|
266
|
+
def compile(char_types,features_l,features_r,input,output)
|
259
267
|
unk=Okura::UnkDic.new char_types
|
260
268
|
parser=Okura::Parser::UnkDic.new input
|
261
269
|
parser.each{|type_name,lid,rid,cost|
|
262
|
-
unk.define type_name,
|
270
|
+
unk.define type_name,features_l.from_id(lid),features_r.from_id(rid),cost
|
263
271
|
}
|
264
272
|
::Marshal.dump(unk,output)
|
265
273
|
end
|
data/test/okura_spec.rb
CHANGED
@@ -191,16 +191,12 @@ Z,9,10,5244,記号,空白,*,*,*,*,*
|
|
191
191
|
tagger.dic.word_dic.word_size.should == 3
|
192
192
|
tagger.mat.cost(0,1).should == 5
|
193
193
|
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
u1=tagger.dic.unk_dic.word_templates_for('A')
|
201
|
-
u1.left.name.should == 'F5'
|
202
|
-
u1.right.name.should == 'F6'
|
203
|
-
}
|
194
|
+
w2=tagger.dic.word_dic.possible_words('w2',0)[0]
|
195
|
+
w2.left.text.should == 'F5'
|
196
|
+
w2.right.text.should == 'F6'
|
197
|
+
u1=tagger.dic.unk_dic.word_templates_for('A')[0]
|
198
|
+
u1.left.text.should == 'F5'
|
199
|
+
u1.right.text.should == 'F6'
|
204
200
|
}
|
205
201
|
end
|
206
202
|
end
|
@@ -272,17 +268,18 @@ TYPE3 0 1 3
|
|
272
268
|
# subject : Serializer class
|
273
269
|
it 'コンパイルして復元できる' do
|
274
270
|
serializer=subject.new
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
271
|
+
features_l=Okura::Features.new
|
272
|
+
features_l.add 854,f(854)
|
273
|
+
features_l.add 645,f(645)
|
274
|
+
features_r=Okura::Features.new
|
275
|
+
features_r.add 458,f(458)
|
276
|
+
features_r.add 546,f(546)
|
280
277
|
out=StringIO.new
|
281
278
|
src=<<-EOS
|
282
279
|
あがなう,854,458,6636,動詞,自立,*,*,五段・ワ行促音便,基本形,あがなう,アガナウ,アガナウ,あがなう/購う/贖う,
|
283
280
|
あがめる,645,546,1234,動詞,自立,*,*,一段,基本形,あがめる,アガメル,アガメル,あがめる/崇める,
|
284
281
|
EOS
|
285
|
-
serializer.compile(
|
282
|
+
serializer.compile(features_l,features_r,[as_io(src)],'UTF-8',out)
|
286
283
|
out.rewind
|
287
284
|
wd=serializer.load(out)
|
288
285
|
|
@@ -307,13 +304,14 @@ TYPE3 0 1 3
|
|
307
304
|
cts.define_type 'Z',false,true,0
|
308
305
|
cts.define_map 0x0001,cts.named('A'),[]
|
309
306
|
cts.define_map 0x0002,cts.named('Z'),[]
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
307
|
+
features_l=Okura::Features.new
|
308
|
+
features_l.add 5,'F5'
|
309
|
+
features_l.add 9,'F9'
|
310
|
+
features_r=Okura::Features.new
|
311
|
+
features_r.add 6,'F6'
|
312
|
+
features_r.add 10,'F10'
|
315
313
|
out=StringIO.new
|
316
|
-
serializer.compile(cts,
|
314
|
+
serializer.compile(cts,features_l,features_r,as_io(<<-EOS),out)
|
317
315
|
A,5,6,3274,記号,一般,*,*,*,*,*
|
318
316
|
Z,9,10,5244,記号,空白,*,*,*,*,*
|
319
317
|
EOS
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: okura
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -45,7 +45,8 @@ dependencies:
|
|
45
45
|
version: 0.5.4
|
46
46
|
description: Pure ruby morpheme analyzer, using MeCab format dic
|
47
47
|
email: discommunicative@gmail.com
|
48
|
-
executables:
|
48
|
+
executables:
|
49
|
+
- okura
|
49
50
|
extensions: []
|
50
51
|
extra_rdoc_files: []
|
51
52
|
files:
|
@@ -58,6 +59,7 @@ files:
|
|
58
59
|
- test/okura_spec.rb
|
59
60
|
- test/spec_helper.rb
|
60
61
|
- test/words_spec.rb
|
62
|
+
- bin/okura
|
61
63
|
homepage: https://github.com/todesking/okura
|
62
64
|
licenses: []
|
63
65
|
post_install_message:
|