okura 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/okura/console.rb +23 -0
- data/lib/okura/parser.rb +131 -0
- data/lib/okura/serializer.rb +317 -0
- data/lib/okura/version.rb +3 -0
- data/lib/okura/word_dic.rb +216 -0
- data/lib/okura.rb +388 -0
- data/test/okura_spec.rb +721 -0
- data/test/spec_helper.rb +6 -0
- data/test/words_spec.rb +36 -0
- metadata +86 -0
data/test/okura_spec.rb
ADDED
@@ -0,0 +1,721 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require File.join(File.dirname(__FILE__),'spec_helper.rb')
|
3
|
+
require File.join(File.dirname(__FILE__),'..','lib','okura')
|
4
|
+
require File.join(File.dirname(__FILE__),'..','lib','okura','parser')
|
5
|
+
require File.join(File.dirname(__FILE__),'..','lib','okura','serializer')
|
6
|
+
|
7
|
+
require 'tmpdir'
|
8
|
+
|
9
|
+
# Creates two temporary directories and yields them to the caller's block
# as (src_dir, bin_dir). Dir.mktmpdir removes each directory when its block
# finishes, so nothing is left behind after the call.
#
# The previous signature captured the block as an unused `&block` parameter
# while the body used `yield`; the capture is dropped. Callers still pass a
# block exactly as before.
#
# @yieldparam src_dir [String] path of the first temporary directory
# @yieldparam bin_dir [String] path of the second temporary directory
# @return [Object] whatever the given block returns
def with_dict_dir
  Dir.mktmpdir do |src_dir|
    Dir.mktmpdir do |bin_dir|
      yield src_dir, bin_dir
    end
  end
end
|
16
|
+
|
17
|
+
# Writes +content+ to the file +filename+ inside +dir+, creating the file
# or truncating an existing one (mode 'w').
#
# @param dir [String] directory that will contain the file
# @param filename [String] name of the file, relative to +dir+
# @param content [String] text to write
def set_content(dir, filename, content)
  target = File.join(dir, filename)
  File.open(target, 'w') do |io|
    io.write(content)
  end
end
|
22
|
+
|
23
|
+
# Wraps a string in a StringIO so it can be handed to code expecting an IO.
#
# @param str [String] text to expose as a stream
# @return [StringIO]
def as_io(str)
  StringIO.new(str)
end
|
26
|
+
# Builds an Okura::Word for use in expectations.
#
# +l+ and +r+ may be given either as objects responding to #id (used as-is)
# or as anything else (wrapped via the #f helper).
#
# @param surface [String] surface form of the word
# @param l [Object] left feature, or a value to pass to #f
# @param r [Object] right feature, or a value to pass to #f
# @param cost [Integer] word cost
# @return [Okura::Word]
def w(surface, l, r, cost)
  left  = l.respond_to?(:id) ? l : f(l)
  right = r.respond_to?(:id) ? r : f(r)
  Okura::Word.new(surface, left, right, cost)
end
|
31
|
+
# Builds an Okura::Feature; the name defaults to "F" followed by the id.
#
# @param id [Object] feature id
# @param name [String] feature name (defaults to "F#{id}")
# @return [Okura::Feature]
def f(id, name = "F#{id}")
  Okura::Feature.new(id, name)
end
|
34
|
+
# Shorthand for Okura::Node.new; all arguments are forwarded unchanged.
#
# @return [Okura::Node]
def n(*args)
  Okura::Node.new(*args)
end
|
37
|
+
|
38
|
+
# Specs for the text-format parsers under Okura::Parser. Each example feeds
# an in-memory definition (via as_io) to a parser and checks what it yields.
describe Okura::Parser do
  describe 'Matrix' do
    it 'MeCab形式のMatrixファイルを読める' do
      matrix_src = as_io(<<-EOS)
2 3
0 0 0
0 1 1
1 0 2
1 1 3
1 2 10
      EOS
      parser = Okura::Parser::Matrix.new(matrix_src)

      # The first line carries the two sizes; the rest are yielded as triples.
      parser.rid_size.should == 2
      parser.lid_size.should == 3
      parser.each.to_a.should == [
        [0, 0, 0],
        [0, 1, 1],
        [1, 0, 2],
        [1, 1, 3],
        [1, 2, 10]
      ]
    end
  end

  describe 'Word' do
    it 'MeCab形式の単語ファイルを読める' do
      word_src = as_io(<<-EOS)
あがなう,854,458,6636,動詞,自立,*,*,五段・ワ行促音便,基本形,あがなう,アガナウ,アガナウ,あがなう/購う/贖う,
あがめる,645,546,1234,動詞,自立,*,*,一段,基本形,あがめる,アガメル,アガメル,あがめる/崇める,
      EOS
      parser = Okura::Parser::Word.new(word_src)

      # Only the first four columns of every parsed row are compared.
      leading_columns = parser.each.to_a.map { |row| row[0..3] }
      leading_columns.should == [
        ['あがなう', 854, 458, 6636],
        ['あがめる', 645, 546, 1234]
      ]
    end

    # Declared without a body, so RSpec reports it as pending.
    it 'ダブルクオートでエスケープされた単語定義も扱える'
  end

  describe 'Feature' do
    it 'MeCab形式の品詞ファイルを読める' do
      feature_src = as_io(<<-EOS)
0 BOS/EOS,*,*,*,*,*,BOS/EOS
1 その他,間投,*,*,*,*,*
      EOS
      parser = Okura::Parser::Feature.new(feature_src)

      parser.each.to_a.should == [
        [0, 'BOS/EOS,*,*,*,*,*,BOS/EOS'],
        [1, 'その他,間投,*,*,*,*,*']
      ]
    end
  end

  describe 'CharType' do
    it 'MeCab形式の文字種定義ファイルを読める' do
      parser = Okura::Parser::CharType.new

      # Record every callback invocation so the whole event stream can be
      # compared after parsing.
      events = { single: [], range: [], type: [] }
      parser.on_mapping_single do |code, type, ctypes|
        events[:single] << [code, type, ctypes]
      end
      parser.on_mapping_range do |from, to, type, ctypes|
        events[:range] << [from, to, type, ctypes]
      end
      parser.on_chartype_def do |name, invoke, group, length|
        events[:type] << [name, invoke, group, length]
      end

      parser.parse_all as_io(<<-EOS)
DEFAULT 0 1 0 # DEFAULT is a mandatory category!
KATAKANA 1 0 2

0x000D SPACE # CR
0x003A..0x0040 SYMBOL
# KANJI
0x5146 KANJINUMERIC KANJI
      EOS

      events[:single].should == [
        [0x000D, 'SPACE', []],
        [0x5146, 'KANJINUMERIC', %w(KANJI)]
      ]
      events[:range].should == [
        [0x003A, 0x0040, 'SYMBOL', []]
      ]
      events[:type].should == [
        ['DEFAULT', false, true, 0],
        ['KATAKANA', true, false, 2]
      ]
    end
  end

  describe 'UnkDic' do
    it '未知語の定義を読める' do
      unk_src = as_io(<<-EOS)
A,5,6,3274,記号,一般,*,*,*,*,*
Z,9,10,5244,記号,空白,*,*,*,*,*
      EOS
      parser = Okura::Parser::UnkDic.new(unk_src)

      parser.to_a.should == [
        ['A', 5, 6, 3274],
        ['Z', 9, 10, 5244]
      ]
    end
  end
end
|
129
|
+
|
130
|
+
# Round-trip specs for the serializers: each example compiles a textual
# definition into an in-memory (or temp-dir) binary form, loads it back and
# checks that the loaded object answers queries as expected.
describe 'Compile and load' do
  describe Okura::Serializer::FormatInfo do
    it 'シリアライズして復元できる' do
      info = Okura::Serializer::FormatInfo.new
      info.word_dic = :Naive
      info.features = :Marshal
      info.char_types = :Marshal
      info.unk_dic = :Marshal
      info.matrix = :Marshal

      out = StringIO.new
      info.compile(out)
      out.rewind # read back from the start of what was just written

      loaded = Okura::Serializer::FormatInfo.load(out)
      loaded.word_dic.should == :Naive
      loaded.features.should == :Marshal
      loaded.char_types.should == :Marshal
      loaded.unk_dic.should == :Marshal
      loaded.matrix.should == :Marshal
    end

    it '設定に基づいて辞書をコンパイル/ロードできる' do
      with_dict_dir do |src_dir, bin_dir|
        # Lay out a minimal source dictionary directory.
        set_content(src_dir, 'w1.csv', <<-EOS)
w1,1,2,1000,
        EOS
        set_content(src_dir, 'w2.csv', <<-EOS)
w2,5,6,2000,
w3,9,10,3000,
        EOS
        set_content(src_dir, 'left-id.def', <<-EOS)
1 F1
5 F5
9 F9
        EOS
        set_content(src_dir, 'right-id.def', <<-EOS)
2 F2
6 F6
10 F10
        EOS
        set_content(src_dir, 'char.def', <<-EOS)
A 0 0 1
Z 1 1 3
        EOS
        set_content(src_dir, 'unk.def', <<-EOS)
A,5,6,3274,記号,一般,*,*,*,*,*
Z,9,10,5244,記号,空白,*,*,*,*,*
        EOS
        set_content(src_dir, 'matrix.def', <<-EOS)
2 3
0 0 10
0 1 5
        EOS

        fi = Okura::Serializer::FormatInfo.new
        fi.encoding = 'UTF-8'
        fi.compile_dict(src_dir, bin_dir)

        tagger = Okura::Serializer::FormatInfo.create_tagger(bin_dir)

        tagger.dic.unk_dic.rule_size.should == 2
        tagger.dic.word_dic.word_size.should == 3
        tagger.mat.cost(0, 1).should == 5

        pending {
          w2 = tagger.dic.word_dic.possible_words('w2', 0)[0]
          w2.left.name.should == 'F5'
          w2.right.name.should == 'F6'
        }
        # Was misspelled `penging`, which would raise NoMethodError instead of
        # marking the block as pending.
        # NOTE(review): with a block-form `pending` directly above, this second
        # block may never be reached; confirm against the RSpec version in use.
        # NOTE(review): other examples call `.first` on the result of
        # word_templates_for; `u1.left` here may need the same. Confirm.
        pending {
          u1 = tagger.dic.unk_dic.word_templates_for('A')
          u1.left.name.should == 'F5'
          u1.right.name.should == 'F6'
        }
      end
    end
  end

  describe Okura::Serializer::Features::Marshal do
    it 'コンパイルして復元できる' do
      serializer = Okura::Serializer::Features::Marshal.new
      out = StringIO.new
      serializer.compile(as_io(<<-EOS), out)
0 BOS/EOS,*,*,*,*,*,BOS/EOS
1 その他,間投,*,*,*,*,*
      EOS
      out.rewind

      features = serializer.load(out)
      features.from_id(0).text.should == 'BOS/EOS,*,*,*,*,*,BOS/EOS'
      features.from_id(1).text.should == 'その他,間投,*,*,*,*,*'
    end
  end

  describe Okura::Serializer::CharTypes::Marshal do
    it 'コンパイルして復元できる' do
      serializer = Okura::Serializer::CharTypes::Marshal.new
      out = StringIO.new
      serializer.compile(as_io(<<-EOS), out)
DEFAULT 0 1 0 # DEFAULT is a mandatory category!
TYPE1 1 0 0
TYPE2 0 1 0
TYPE3 0 1 3

# comment

0x0021 TYPE1
0x0022 TYPE2 # comment
0x0023..0x0040 TYPE3
0x0099 TYPE1 TYPE2 # 互換カテゴリ
0xABCd TYPE1 DEFAULT
      EOS
      out.rewind

      cts = serializer.load(out)

      # Code point -> type lookups, including both ends of the range mapping
      # and an unmapped code point (0x41).
      cts.type_for(0x21).name.should == 'TYPE1'
      cts.type_for(0x22).name.should == 'TYPE2'
      cts.type_for(0x23).name.should == 'TYPE3'
      cts.type_for(0x40).name.should == 'TYPE3'
      cts.type_for(0x41).name.should == 'DEFAULT'
      cts.type_for(0x99).name.should == 'TYPE1'

      t1, t2, t3 = cts.named('TYPE1'), cts.named('TYPE2'), cts.named('TYPE3')

      t1.name.should == 'TYPE1'

      t1.invoke?.should be_true
      t2.invoke?.should be_false

      t1.group?.should be_false
      t2.group?.should be_true

      t2.length.should == 0
      t3.length.should == 3

      t1.should be_accept(0x21)
      t1.should_not be_accept(0x22)
      t2.should be_accept(0x22)

      t1.should be_accept(0x99)
    end
  end

  shared_examples_for 'WordDic serializer' do
    # subject : Serializer class
    it 'コンパイルして復元できる' do
      serializer = subject.new
      features = Okura::Features.new
      features.add 854, f(854)
      features.add 458, f(458)
      features.add 645, f(645)
      features.add 546, f(546)
      out = StringIO.new
      src = <<-EOS
あがなう,854,458,6636,動詞,自立,*,*,五段・ワ行促音便,基本形,あがなう,アガナウ,アガナウ,あがなう/購う/贖う,
あがめる,645,546,1234,動詞,自立,*,*,一段,基本形,あがめる,アガメル,アガメル,あがめる/崇める,
      EOS
      serializer.compile(features, [as_io(src)], 'UTF-8', out)
      out.rewind
      wd = serializer.load(out)

      wd.possible_words('あがなう', 0).should == [w('あがなう', f(854), f(458), 6636)]
      wd.possible_words('あがめる', 0).should == [w('あがめる', f(645), f(546), 1234)]
      wd.possible_words('あがめる', 1).should == []
    end
  end

  describe Okura::Serializer::WordDic::Naive do
    subject { Okura::Serializer::WordDic::Naive }
    it_should_behave_like 'WordDic serializer'
  end

  describe Okura::Serializer::WordDic::DoubleArray do
    subject { Okura::Serializer::WordDic::DoubleArray }
    it_should_behave_like 'WordDic serializer'
  end

  describe Okura::Serializer::UnkDic::Marshal do
    it 'コンパイルして復元できる' do
      serializer = Okura::Serializer::UnkDic::Marshal.new
      cts = Okura::CharTypes.new
      cts.define_type 'A', true, false, 10
      cts.define_type 'Z', false, true, 0
      cts.define_map 0x0001, cts.named('A'), []
      cts.define_map 0x0002, cts.named('Z'), []
      features = Okura::Features.new
      features.add 5, 'F5'
      features.add 6, 'F6'
      features.add 9, 'F9'
      features.add 10, 'F10'
      out = StringIO.new
      serializer.compile(cts, features, as_io(<<-EOS), out)
A,5,6,3274,記号,一般,*,*,*,*,*
Z,9,10,5244,記号,空白,*,*,*,*,*
      EOS
      out.rewind

      unk = serializer.load(out)
      unk.word_templates_for('A').first.cost.should == 3274
      unk.word_templates_for('Z').first.cost.should == 5244
    end
  end

  describe Okura::Serializer::Matrix::Marshal do
    it 'コンパイルして復元できる' do
      serializer = Okura::Serializer::Matrix::Marshal.new
      out = StringIO.new
      serializer.compile(as_io(<<-EOS), out)
2 3
0 0 0
0 1 1
1 0 2
1 1 3
1 2 10
      EOS
      out.rewind

      mat = serializer.load(out)
      mat.cost(0, 0).should == 0
      mat.cost(1, 2).should == 10
    end
  end
end
|
347
|
+
|
348
|
+
# Spec for Okura::Matrix: values stored with #set are returned by #cost.
describe Okura::Matrix do
  describe '#cost' do
    it '渡された二つのFeature idを元にコストを返せる' do
      matrix = Okura::Matrix.new(2, 2)
      matrix.set(0, 0, 0)
      matrix.set(0, 1, 1)
      matrix.set(1, 0, 2)
      matrix.set(1, 1, 3)

      matrix.cost(1, 1).should == 3
    end
  end
end
|
361
|
+
|
362
|
+
# Shared examples for word dictionaries. The including group must set
# `subject` to a builder responding to #define(word) and #build.
shared_examples_for 'WordDic' do
  # Local override of the file-level helper: only the surface varies, both
  # features are f(1) and the cost is 1.
  def w(surface)
    Okura::Word.new(surface, f(1), f(1), 1)
  end

  describe '#possible_words' do
    it '登録された単語のサイズを取得できる' do
      subject.build.word_size.should == 0
      subject.define(w('aaa'))
      subject.define(w('bbb'))
      subject.build.word_size.should == 2
    end

    it '同じ表記の単語を複数登録できる' do
      first = Okura::Word.new('w', f(1), f(2), 100)
      second = Okura::Word.new('w', f(10), f(20), 200)
      # `first` is deliberately registered twice.
      subject.define(first)
      subject.define(first)
      subject.define(second)

      dic = subject.build

      dic.possible_words('w', 0).should == [first, first, second]
    end

    it '文字列と位置から、辞書に登録された単語を返せる' do
      subject.define(w('aaa'))
      subject.define(w('bbb'))
      subject.define(w('aa'))
      subject.define(w('aaaa'))
      subject.define(w('aaaaa'))

      dic = subject.build

      dic.possible_words('bbbaaa', 0).should == [w('bbb')]
      dic.possible_words('bbbaaa', 1).should == []
      dic.possible_words('bbbaaa', 3).should == [w('aa'), w('aaa')]
    end

    it 'マルチバイト文字にも対応している' do
      subject.define(w('ニワトリ'))
      dic = subject.build

      dic.possible_words('ニワトリ', 0).should == [w('ニワトリ')]
      dic.possible_words('ニワトリ', 1).should == []
    end

    # Registers every surface in +words+, builds the dictionary and expects
    # a lookup of +str+ at position 0 to return exactly the +dest+ surfaces.
    def matches(words, str, dest)
      words.each { |surface| subject.define(w(surface)) }
      built = subject.build
      found = built.possible_words(str, 0)
      found.should == dest.map { |surface| w(surface) }
    end

    it { matches %w[],        '',        %w[] }
    it { matches %w[],        'aaa',     %w[] }
    it { matches %w[a],       '',        %w[] }
    it { matches %w[a],       'a',       %w[a] }
    it { matches %w[a],       'aa',      %w[a] }
    it { matches %w[a],       'b',       %w[] }
    it { matches %w[aa],      'a',       %w[] }
    it { matches %w[aa],      'aa',      %w[aa] }
    it { matches %w[aa],      'aaa',     %w[aa] }
    it { matches %w[aa],      'ab',      %w[] }
    it { matches %w[a aa],    'a',       %w[a] }
    it { matches %w[a aa],    'aa',      %w[a aa] }
    it { matches %w[a aa],    'aaa',     %w[a aa] }
    it { matches %w[a aa],    'aab',     %w[a aa] }
    it { matches %w[a aa ab], 'aab',     %w[a aa] }
    it { matches %w[a aa ab], 'ab',      %w[a ab] }
    it { matches %w[a aa ab], 'aa',      %w[a aa] }
    it { matches %w[a b],     'ba',      %w[b] }
    it { matches %w[アイウ],   'アイウ',   %w[アイウ] }
    it { matches %w[ア アイ],  'アイウ',   %w[ア アイ] }
    it { matches %w[ア アイ],  'aアイウ',  %w[] }
  end
end
|
434
|
+
|
435
|
+
# Runs the shared 'WordDic' examples against Okura::WordDic::Naive.
describe Okura::WordDic::Naive do
  # Adapter giving the Naive dictionary the define/build interface the
  # shared examples expect: #define forwards to the dictionary and #build
  # returns that same dictionary.
  class NaiveBuilder
    def initialize
      @dic = Okura::WordDic::Naive.new
    end

    def define(*args)
      @dic.define(*args)
    end

    def build
      @dic
    end
  end

  subject { NaiveBuilder.new }

  it_should_behave_like 'WordDic'
end
|
450
|
+
|
451
|
+
# Runs the shared 'WordDic' examples against the DoubleArray builder.
describe Okura::WordDic::DoubleArray do
  subject { Okura::WordDic::DoubleArray::Builder.new }

  # Accessors for a built dictionary's instance variables. None of the
  # examples visible in this group call them.

  # @return the dictionary's @base instance variable
  def base(dic)
    dic.instance_variable_get(:@base)
  end

  # @return the dictionary's @check instance variable
  def check(dic)
    dic.instance_variable_get(:@check)
  end

  # @return the dictionary's @words instance variable
  def words(dic)
    dic.instance_variable_get(:@words)
  end

  it_should_behave_like 'WordDic'
end
|
464
|
+
|
465
|
+
# Placeholder example group: no examples are defined for Okura::Features yet.
describe Okura::Features do
end
|
467
|
+
|
468
|
+
# Specs for Okura::CharTypes: fallback behaviour of #type_for for unmapped
# characters, and compatible-category handling in #define_map.
describe Okura::CharTypes do
  describe '#type_for' do
    describe '文字に対するCharTypeが定義されていない場合' do
      describe '文字種DEFAULTが定義されている場合' do
        subject do
          cts = Okura::CharTypes.new
          cts.define_type 'DEFAULT', false, false, 0
          cts
        end

        it 'CharType#default_typeが返る' do
          subject.type_for('a'.ord).name.should == subject.default_type.name
        end
      end

      describe '文字種DEFAULTが定義されてない場合' do
        # The original assigned the new object to an unused local (`cts=`);
        # the block's value is the same without it.
        subject { Okura::CharTypes.new }

        it 'エラーになる' do
          expect { subject.type_for('a'.ord) }.to raise_error
        end
      end
    end
  end

  describe '#define_map' do
    describe '互換カテゴリが指定された場合' do
      subject do
        cts = Okura::CharTypes.new
        cts.define_type 'A', true, true, 10
        cts.define_type 'B', true, true, 10
        # Code point 1 maps to 'A' with 'B' passed in the third argument.
        cts.define_map 1, cts.named('A'), [cts.named('B')]
        cts
      end

      it '互換カテゴリが正しく認識される' do
        subject.named('A').accept?(1).should be_true
        subject.named('B').accept?(1).should be_true
      end
    end
  end
end
|
505
|
+
|
506
|
+
# Specs for Okura::UnkDic#possible_words, the unknown-word extractor.
# The examples call possible_words(text, position, flag). Judging by the
# group descriptions, the third argument says whether the regular dictionary
# already found a word at that position (hedged: inferred from the spec
# names, confirm against the implementation).
describe Okura::UnkDic do
  describe '#possible_words' do
    describe '互換カテゴリ' do
      subject do
        cts = Okura::CharTypes.new
        cts.define_type 'KATAKANA', false, true, 0
        cts.define_type 'HIRAGANA', false, true, 0
        cts.define_map 'ア'.ord, cts.named('KATAKANA'), []
        # 'ー' is mapped to HIRAGANA with KATAKANA passed in the third argument.
        cts.define_map 'ー'.ord, cts.named('HIRAGANA'), [cts.named('KATAKANA')]
        ud = Okura::UnkDic.new(cts)
        ud.define 'KATAKANA', f(10), f(20), 1000
        ud.define 'HIRAGANA', f(1), f(2), 1000
        ud
      end

      it '互換カテゴリを正しく解釈する' do
        subject.possible_words('アーー', 0, false).should == [w('アーー', 10, 20, 1000)]
      end
    end

    describe '未知語定義' do
      describe '同一文字種に複数の未知語定義があった場合' do
        subject do
          cts = Okura::CharTypes.new
          cts.define_type 'A', true, true, 0
          cts.define_map 'A'.ord, cts.named('A'), []
          ud = Okura::UnkDic.new(cts)
          ud.define 'A', f(10), f(20), 1000
          ud.define 'A', f(11), f(21), 1111
          ud
        end

        it 'すべての定義から未知語を抽出する' do
          subject.possible_words('A', 0, false).should == [
            w('A', 10, 20, 1000),
            w('A', 11, 21, 1111)
          ]
        end
      end
    end
  end

  describe '#possible_words: 文字コードによる挙動:' do
    subject do
      cts = Okura::CharTypes.new
      cts.define_type 'A', true, true, 0
      cts.define_map 'あ'.ord, cts.named('A'), []
      ud = Okura::UnkDic.new(cts)
      ud.define 'A', f(10), f(20), 1000
      ud
    end

    describe 'UTF8文字列が来たとき' do
      it '正しく解析できる' do
        subject.possible_words('あいう'.encode('UTF-8'), 0, false).map(&:surface).should == %w(あ)
      end
    end

    describe 'UTF8じゃない文字列が来たとき' do
      it 'エラーになる' do
        expect { subject.possible_words('あいう'.encode('SHIFT_JIS'), 0, false) }.to raise_error
      end
    end
  end

  describe '#possible_words: 先頭文字のカテゴリによる挙動:' do
    # Builds a CharTypes in which 'A' maps to the type under test and 'Z'
    # maps to the unrelated type 'ZZZZ'. The type names mirror the three
    # define_type arguments: T<arg2><arg3><arg4> with true written as 1.
    def create_chartypes(typename_under_test)
      cts = Okura::CharTypes.new
      cts.define_type 'T000', false, false, 0
      cts.define_type 'T012', false, true, 2
      cts.define_type 'T100', true, false, 0
      cts.define_type 'T102', true, false, 2
      cts.define_type 'T110', true, true, 0
      cts.define_type 'T112', true, true, 2
      cts.define_type 'ZZZZ', true, true, 2

      cts.define_map 'A'.ord, cts.named(typename_under_test), []
      cts.define_map 'Z'.ord, cts.named('ZZZZ'), []

      cts
    end

    # Builds an UnkDic with a single unknown-word definition for the type
    # under test. The second feature used to be passed as a bare `(20)`,
    # i.e. an Integer rather than a feature; it is now wrapped with f(), in
    # line with every other `define` call in this file.
    def create_subject(typename_under_test)
      udic = Okura::UnkDic.new(create_chartypes(typename_under_test))
      udic.define typename_under_test, f(10), f(20), 1000
      udic
    end

    describe 'invoke=0のとき' do
      subject { create_subject 'T012' }

      describe '辞書に単語がある場合' do
        it '未知語を抽出しない' do
          subject.possible_words('AAA', 0, true).should be_empty
        end
      end
    end

    describe 'invoke=1のとき' do
      describe '辞書に単語がある場合' do
        subject { create_subject 'T102' }

        it 'も、未知語を抽出する' do
          subject.possible_words('AAAZ', 0, true).should_not be_empty
        end
      end

      describe '先頭文字のカテゴリに対応する未知語定義がなかった場合' do
        subject { create_subject 'T112' }

        it '未知語を抽出しない' do
          # 'Z' maps to type ZZZZ, for which create_subject adds no definition.
          subject.possible_words('ZZ', 0, false).should be_empty
        end
      end

      describe '辞書に単語がない場合' do
        describe 'group=0のとき' do
          describe 'length=0のとき' do
            subject { create_subject 'T100' }

            it '未知語を抽出しない' do
              subject.possible_words('AAAZ', 0, false).should be_empty
            end
          end

          describe 'length=2のとき' do
            subject { create_subject 'T102' }

            it '2文字までの同種文字列を未知語とする' do
              subject.possible_words('AAAZ', 0, false).map(&:surface).should == %w(A AA)
            end
          end
        end

        describe 'group=1のとき' do
          describe 'length=0のとき' do
            subject { create_subject 'T110' }

            it '同種の文字列を長さ制限なしでまとめて未知語とする' do
              subject.possible_words('AAAAAZ', 0, false).map(&:surface).should == %w(AAAAA)
            end

            it '連続が一文字の場合も未知語として取れる' do
              subject.possible_words('AZZZ', 0, false).map(&:surface).should == %w(A)
            end

            it '1文字しかなくても正しく扱える' do
              subject.possible_words('A', 0, false).map(&:surface).should == %w(A)
            end
          end

          describe 'length=2のとき' do
            subject { create_subject 'T112' }

            it 'length=0の結果に加え、2文字までの同種文字列を未知語とする' do
              subject.possible_words('AAAAAZ', 0, false).map(&:surface).should == %w(A AA AAAAA)
            end

            it '1文字しかなくても正しく扱える' do
              subject.possible_words('A', 0, false).map(&:surface).should == %w(A)
            end

            it '2文字しかなくても正しく扱える' do
              subject.possible_words('AA', 0, false).map(&:surface).should == %w(A AA)
            end

            it '3文字しかなくても正しく扱える' do
              subject.possible_words('AAA', 0, false).map(&:surface).should == %w(A AA AAA)
            end
          end
        end
      end
    end
  end
end
|
654
|
+
|
655
|
+
# Spec for Okura::Tagger#parse using a small Naive dictionary.
describe Okura::Tagger do
  describe '#parse' do
    it '文字列を解析してNodesを返せる' do
      word_dic = Okura::WordDic::Naive.new
      word_dic.define(w('a', 1, 1, 0))
      word_dic.define(w('aa', 1, 1, 10))
      word_dic.define(w('b', 2, 2, 3))
      # The tagger's second constructor argument is nil in this example.
      tagger = Okura::Tagger.new(word_dic, nil)

      lattice = tagger.parse('aab')

      bos_eos = w('BOS/EOS', 0, 0, 0)
      # Indexes 0 and 4 hold BOS/EOS; indexes 1..3 cover the 3-char input.
      lattice[0][0].word.should == bos_eos
      lattice[4][0].word.should == bos_eos
      lattice[1].size.should == 2
      lattice[3][0].word.should == w('b', 2, 2, 3)
    end
  end
end
|
673
|
+
|
674
|
+
# Spec for the BOS/EOS sentinel node built by Okura::Node.mk_bos_eos.
describe Okura::Node do
  # The group was labelled '#make_bos_eos', but the method exercised below
  # is the class-level Okura::Node.mk_bos_eos; the label now names it.
  describe '.mk_bos_eos' do
    describe '#length' do
      it 'returns 1' do
        Okura::Node.mk_bos_eos.length.should == 1
      end
    end
  end
end
|
683
|
+
|
684
|
+
# Specs for Okura::Nodes#mincost_path: nodes are added by hand at given
# positions and the cheapest BOS..EOS path is inspected.
describe Okura::Nodes do
  describe '#mincost_path' do
    it '最小コストのパスを返せる' do
      costs = Okura::Matrix.new(2, 2)
      costs.set(0, 1, 10)
      costs.set(1, 0, 10)

      lattice = Okura::Nodes.new(3, costs)
      lattice.add(0, Okura::Node.mk_bos_eos)
      lattice.add(1, n(w('a', 1, 1, 10)))
      lattice.add(1, n(w('b', 1, 1, 0)))
      lattice.add(2, Okura::Node.mk_bos_eos)

      # 'b' has word cost 0 against 'a' with 10, so the path runs through 'b'.
      path = lattice.mincost_path
      path.length.should == 3
      path[0].word.surface.should == 'BOS/EOS'
      path[1].word.surface.should == 'b'
      path[2].word.surface.should == 'BOS/EOS'
    end

    it '単語長が1を超えても動く' do
      costs = Okura::Matrix.new(2, 2)
      costs.set(0, 1, 10)
      costs.set(1, 0, 10)
      costs.set(1, 1, 10)

      lattice = Okura::Nodes.new(4, costs)
      lattice.add(0, Okura::Node.mk_bos_eos)
      lattice.add(1, n(w('a', 1, 1, 10)))
      lattice.add(1, n(w('bb', 1, 1, 0)))
      lattice.add(2, n(w('a', 1, 1, 10)))
      lattice.add(3, Okura::Node.mk_bos_eos)

      # The two-character 'bb' at position 1 is expected in the 3-node path.
      path = lattice.mincost_path
      path.length.should == 3
      path[0].word.surface.should == 'BOS/EOS'
      path[1].word.surface.should == 'bb'
      path[2].word.surface.should == 'BOS/EOS'
    end
  end
end
|