okura 0.0.0 → 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/okura +83 -0
- data/lib/okura.rb +4 -4
- data/lib/okura/serializer.rb +22 -14
- data/test/okura_spec.rb +20 -22
- metadata +4 -2
data/bin/okura
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- filetype:ruby -*-
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
|
6
|
+
commands={
|
7
|
+
'console' => lambda {|args|
|
8
|
+
require 'okura/console'
|
9
|
+
raise IllegalOption.new('Usage: console dict_dir') unless args.length==1
|
10
|
+
dict_dir=args[0]
|
11
|
+
Okura::Console.new.run_console dict_dir
|
12
|
+
},
|
13
|
+
'compile' => lambda {|args|
|
14
|
+
require 'okura/serializer'
|
15
|
+
require 'optparse'
|
16
|
+
format_info=Okura::Serializer::FormatInfo.new
|
17
|
+
opt=OptionParser.new
|
18
|
+
dry=false
|
19
|
+
opt.on('--word_dic=TYPE'){|t| format_info.word_dic=t.to_sym}
|
20
|
+
opt.on('--dry'){|d|dry=d}
|
21
|
+
opt.on('--encoding=ENCODING'){|encoding| format_info.encoding=encoding}
|
22
|
+
opt.parse! args
|
23
|
+
raise IllegalOption.new('Usage: compile dict_dir dest_dir') unless args.length==2
|
24
|
+
dict_dir=args[0]
|
25
|
+
bin_dir=args[1]
|
26
|
+
puts <<-EOS
|
27
|
+
Settings
|
28
|
+
- File Encoding: #{format_info.encoding}
|
29
|
+
- Format:
|
30
|
+
- word_dic: #{format_info.word_dic}
|
31
|
+
- unk_dic: #{format_info.unk_dic}
|
32
|
+
- features: #{format_info.features}
|
33
|
+
- char_types: #{format_info.char_types}
|
34
|
+
- matrix: #{format_info.matrix}
|
35
|
+
EOS
|
36
|
+
next if dry
|
37
|
+
Dir.mkdir bin_dir unless File.exist? bin_dir # File.exists? was removed in Ruby 3.2
|
38
|
+
format_info.compile_dict(dict_dir,bin_dir)
|
39
|
+
},
|
40
|
+
'help' => lambda {|args|
|
41
|
+
puts <<-EOS
|
42
|
+
USAGE:
|
43
|
+
okura command [args]
|
44
|
+
okura --version
|
45
|
+
command:
|
46
|
+
compile src_dir dest_dir
|
47
|
+
console dict_dir
|
48
|
+
help
|
49
|
+
EOS
|
50
|
+
}
|
51
|
+
}
|
52
|
+
|
53
|
+
class IllegalOption < StandardError
|
54
|
+
def initialize msg
|
55
|
+
@message=msg
|
56
|
+
end
|
57
|
+
attr_reader :message
|
58
|
+
end
|
59
|
+
|
60
|
+
if $*.empty?
|
61
|
+
commands['help'].call []
|
62
|
+
else
|
63
|
+
command=$*.first
|
64
|
+
if command == '--version'
|
65
|
+
require 'okura/version'
|
66
|
+
puts "Okura #{Okura::Version}"
|
67
|
+
exit 0
|
68
|
+
end
|
69
|
+
unless commands.has_key? command
|
70
|
+
puts "unknown command: #{command}"
|
71
|
+
commands['help'].call [] # keys are Strings; commands[:help] would be nil
|
72
|
+
exit 1
|
73
|
+
end
|
74
|
+
begin
|
75
|
+
commands[command].call $*[1..-1]
|
76
|
+
exit 0
|
77
|
+
rescue IllegalOption
|
78
|
+
$stderr.puts $!.message
|
79
|
+
exit 1
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
|
data/lib/okura.rb
CHANGED
@@ -113,8 +113,8 @@ module Okura
|
|
113
113
|
def get id
|
114
114
|
raise 'bad id' unless id < @indices.length
|
115
115
|
from=@indices[id]
|
116
|
-
to=(id+1 < @indices.length) ? @indices[id+1] : @str.
|
117
|
-
|
116
|
+
to=(id+1 < @indices.length) ? @indices[id+1] : @str.bytesize
|
117
|
+
(from...to).map{|i|@str.getbyte(i)}.pack('C*').force_encoding 'UTF-8'
|
118
118
|
end
|
119
119
|
def [](id)
|
120
120
|
get id
|
@@ -132,7 +132,7 @@ module Okura
|
|
132
132
|
id=@indices.length
|
133
133
|
@indices.push @size
|
134
134
|
@surfaces.push surface
|
135
|
-
@size+=surface.
|
135
|
+
@size+=surface.bytesize
|
136
136
|
id
|
137
137
|
end
|
138
138
|
end
|
@@ -258,7 +258,7 @@ module Okura
|
|
258
258
|
end
|
259
259
|
# Integer -> Feature
|
260
260
|
def from_id id
|
261
|
-
@map_id[id]
|
261
|
+
@map_id[id] || (raise "Features: ID undefined (#{id})")
|
262
262
|
end
|
263
263
|
def [](id)
|
264
264
|
from_id id
|
data/lib/okura/serializer.rb
CHANGED
@@ -30,11 +30,17 @@ module Okura
|
|
30
30
|
}
|
31
31
|
}
|
32
32
|
|
33
|
+
features_r=open_src(src_dir,'right-id.def'){|src|
|
34
|
+
open_dest(bin_dir,'right-id.bin'){|dest|
|
35
|
+
serializer_for('Features',features).compile(src,dest)
|
36
|
+
}
|
37
|
+
}
|
38
|
+
|
33
39
|
word_src_files=
|
34
40
|
Dir.chdir(src_dir){ Dir.glob('*.csv') }.
|
35
41
|
map{|file|File.join(src_dir,file)}
|
36
42
|
open_dest(bin_dir,'word_dic.bin'){|dest|
|
37
|
-
serializer_for('WordDic',word_dic).compile(features_l,word_src_files,encoding,dest)
|
43
|
+
serializer_for('WordDic',word_dic).compile(features_l,features_r,word_src_files,encoding,dest)
|
38
44
|
}
|
39
45
|
|
40
46
|
char_types=open_src(src_dir,'char.def'){|src|
|
@@ -45,7 +51,7 @@ module Okura
|
|
45
51
|
|
46
52
|
open_src(src_dir,'unk.def'){|src|
|
47
53
|
open_dest(bin_dir,'unk_dic.bin'){|dest|
|
48
|
-
serializer_for('UnkDic',unk_dic).compile(char_types,features_l,src,dest)
|
54
|
+
serializer_for('UnkDic',unk_dic).compile(char_types,features_l,features_r,src,dest)
|
49
55
|
}
|
50
56
|
}
|
51
57
|
|
@@ -64,6 +70,9 @@ module Okura
|
|
64
70
|
features_l=open_bin(bin_dir,'left-id.bin'){|bin|
|
65
71
|
serializer_for('Features',features).load(bin)
|
66
72
|
}
|
73
|
+
features_r=open_bin(bin_dir,'right-id.bin'){|bin|
|
74
|
+
serializer_for('Features',features).load(bin)
|
75
|
+
}
|
67
76
|
wd=open_bin(bin_dir,'word_dic.bin'){|f|
|
68
77
|
serializer_for('WordDic',word_dic).load(f)
|
69
78
|
}
|
@@ -142,15 +151,15 @@ module Okura
|
|
142
151
|
}
|
143
152
|
end
|
144
153
|
class Naive
|
145
|
-
def compile(
|
154
|
+
def compile(features_l,features_r,inputs,encoding,output)
|
146
155
|
dic=Okura::WordDic::Naive.new
|
147
156
|
Okura::Serializer::WordDic.each_input(inputs,encoding){|input|
|
148
157
|
parser=Okura::Parser::Word.new(input)
|
149
158
|
parser.each{|surface,lid,rid,cost|
|
150
159
|
word=Okura::Word.new(
|
151
160
|
surface,
|
152
|
-
|
153
|
-
|
161
|
+
features_l.from_id(lid),
|
162
|
+
features_r.from_id(rid),
|
154
163
|
cost
|
155
164
|
)
|
156
165
|
dic.define word
|
@@ -163,16 +172,16 @@ module Okura
|
|
163
172
|
end
|
164
173
|
end
|
165
174
|
class DoubleArray
|
166
|
-
def compile(
|
167
|
-
puts 'loading'
|
175
|
+
def compile(features_l,features_r,inputs,encoding,output)
|
176
|
+
puts 'loading...'
|
168
177
|
dic=Okura::WordDic::DoubleArray::Builder.new
|
169
178
|
Okura::Serializer::WordDic.each_input(inputs,encoding){|input|
|
170
179
|
parser=Okura::Parser::Word.new(input)
|
171
180
|
parser.each{|surface,lid,rid,cost|
|
172
181
|
word=Okura::Word.new(
|
173
182
|
surface,
|
174
|
-
|
175
|
-
|
183
|
+
features_l.from_id(lid),
|
184
|
+
features_r.from_id(rid),
|
176
185
|
cost
|
177
186
|
)
|
178
187
|
dic.define word
|
@@ -182,7 +191,7 @@ module Okura
|
|
182
191
|
writer=Okura::Serializer::BinaryWriter.new output
|
183
192
|
words,base,check=dic.data_for_serialize
|
184
193
|
raise 'base.length!=check.length' if base.length!=check.length
|
185
|
-
puts '
|
194
|
+
puts 'writing words...'
|
186
195
|
words.instance_eval do
|
187
196
|
writer.write_object @groups
|
188
197
|
writer.write_object @left_features
|
@@ -191,13 +200,12 @@ module Okura
|
|
191
200
|
writer.write_int32_array @right_ids
|
192
201
|
writer.write_int32_array @costs
|
193
202
|
writer.write_int32_array @surface_ids
|
194
|
-
puts 'serialize surfaces'
|
195
203
|
@surfaces.instance_eval do
|
196
204
|
writer.write_object @str
|
197
205
|
writer.write_int32_array @indices
|
198
206
|
end
|
199
207
|
end
|
200
|
-
puts '
|
208
|
+
puts 'writing word index...'
|
201
209
|
writer.write_int32_array base
|
202
210
|
writer.write_int32_array check
|
203
211
|
end
|
@@ -255,11 +263,11 @@ module Okura
|
|
255
263
|
end
|
256
264
|
module UnkDic
|
257
265
|
class Marshal
|
258
|
-
def compile(char_types,
|
266
|
+
def compile(char_types,features_l,features_r,input,output)
|
259
267
|
unk=Okura::UnkDic.new char_types
|
260
268
|
parser=Okura::Parser::UnkDic.new input
|
261
269
|
parser.each{|type_name,lid,rid,cost|
|
262
|
-
unk.define type_name,
|
270
|
+
unk.define type_name,features_l.from_id(lid),features_r.from_id(rid),cost
|
263
271
|
}
|
264
272
|
::Marshal.dump(unk,output)
|
265
273
|
end
|
data/test/okura_spec.rb
CHANGED
@@ -191,16 +191,12 @@ Z,9,10,5244,記号,空白,*,*,*,*,*
|
|
191
191
|
tagger.dic.word_dic.word_size.should == 3
|
192
192
|
tagger.mat.cost(0,1).should == 5
|
193
193
|
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
u1=tagger.dic.unk_dic.word_templates_for('A')
|
201
|
-
u1.left.name.should == 'F5'
|
202
|
-
u1.right.name.should == 'F6'
|
203
|
-
}
|
194
|
+
w2=tagger.dic.word_dic.possible_words('w2',0)[0]
|
195
|
+
w2.left.text.should == 'F5'
|
196
|
+
w2.right.text.should == 'F6'
|
197
|
+
u1=tagger.dic.unk_dic.word_templates_for('A')[0]
|
198
|
+
u1.left.text.should == 'F5'
|
199
|
+
u1.right.text.should == 'F6'
|
204
200
|
}
|
205
201
|
end
|
206
202
|
end
|
@@ -272,17 +268,18 @@ TYPE3 0 1 3
|
|
272
268
|
# subject : Serializer class
|
273
269
|
it 'コンパイルして復元できる' do
|
274
270
|
serializer=subject.new
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
271
|
+
features_l=Okura::Features.new
|
272
|
+
features_l.add 854,f(854)
|
273
|
+
features_l.add 645,f(645)
|
274
|
+
features_r=Okura::Features.new
|
275
|
+
features_r.add 458,f(458)
|
276
|
+
features_r.add 546,f(546)
|
280
277
|
out=StringIO.new
|
281
278
|
src=<<-EOS
|
282
279
|
あがなう,854,458,6636,動詞,自立,*,*,五段・ワ行促音便,基本形,あがなう,アガナウ,アガナウ,あがなう/購う/贖う,
|
283
280
|
あがめる,645,546,1234,動詞,自立,*,*,一段,基本形,あがめる,アガメル,アガメル,あがめる/崇める,
|
284
281
|
EOS
|
285
|
-
serializer.compile(
|
282
|
+
serializer.compile(features_l,features_r,[as_io(src)],'UTF-8',out)
|
286
283
|
out.rewind
|
287
284
|
wd=serializer.load(out)
|
288
285
|
|
@@ -307,13 +304,14 @@ TYPE3 0 1 3
|
|
307
304
|
cts.define_type 'Z',false,true,0
|
308
305
|
cts.define_map 0x0001,cts.named('A'),[]
|
309
306
|
cts.define_map 0x0002,cts.named('Z'),[]
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
307
|
+
features_l=Okura::Features.new
|
308
|
+
features_l.add 5,'F5'
|
309
|
+
features_l.add 9,'F9'
|
310
|
+
features_r=Okura::Features.new
|
311
|
+
features_r.add 6,'F6'
|
312
|
+
features_r.add 10,'F10'
|
315
313
|
out=StringIO.new
|
316
|
-
serializer.compile(cts,
|
314
|
+
serializer.compile(cts,features_l,features_r,as_io(<<-EOS),out)
|
317
315
|
A,5,6,3274,記号,一般,*,*,*,*,*
|
318
316
|
Z,9,10,5244,記号,空白,*,*,*,*,*
|
319
317
|
EOS
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: okura
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.0
|
4
|
+
version: 0.0.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -45,7 +45,8 @@ dependencies:
|
|
45
45
|
version: 0.5.4
|
46
46
|
description: Pure ruby morpheme analyzer, using MeCab format dic
|
47
47
|
email: discommunicative@gmail.com
|
48
|
-
executables:
|
48
|
+
executables:
|
49
|
+
- okura
|
49
50
|
extensions: []
|
50
51
|
extra_rdoc_files: []
|
51
52
|
files:
|
@@ -58,6 +59,7 @@ files:
|
|
58
59
|
- test/okura_spec.rb
|
59
60
|
- test/spec_helper.rb
|
60
61
|
- test/words_spec.rb
|
62
|
+
- bin/okura
|
61
63
|
homepage: https://github.com/todesking/okura
|
62
64
|
licenses: []
|
63
65
|
post_install_message:
|