okura 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,721 @@
1
# -*- coding: utf-8 -*-
require 'stringio'
require 'tmpdir'

require File.join(File.dirname(__FILE__),'spec_helper.rb')
require File.join(File.dirname(__FILE__),'..','lib','okura')
require File.join(File.dirname(__FILE__),'..','lib','okura','parser')
require File.join(File.dirname(__FILE__),'..','lib','okura','serializer')
8
+
9
# Yields two freshly created temporary directories to the block:
# first a "source" directory, then a "binary" directory.
#
# Both directories are created with Dir.mktmpdir's block form, so they
# are removed again once the block returns.
#
# @yieldparam src_dir [String] path of the first temporary directory
# @yieldparam bin_dir [String] path of the second temporary directory
# @return [Object] whatever the block returns
def with_dict_dir
  Dir.mktmpdir do |src_dir|
    Dir.mktmpdir do |bin_dir|
      yield src_dir, bin_dir
    end
  end
end
16
+
17
# Writes +content+ to the file +filename+ inside +dir+, creating the
# file or truncating an existing one (mode 'w').
#
# @param dir [String] directory that will contain the file
# @param filename [String] name of the file relative to +dir+
# @param content [String] text to write
# @return [Integer] the value returned by IO#write
def set_content(dir, filename, content)
  File.open(File.join(dir, filename), 'w') do |file|
    file.write(content)
  end
end
22
+
23
# Wraps a string in a StringIO so it can be handed to code that expects
# an IO-like object.
#
# @param str [String] the text to expose as an IO
# @return [StringIO]
def as_io(str)
  StringIO.new(str)
end
26
# Builds an Okura::Word.
#
# +l+ and +r+ may be given either as objects that respond to +id+
# (used as-is) or as anything else (for example a bare Integer), in
# which case they are converted with the +f+ helper first.
#
# @param surface [String] surface form of the word
# @param l [Object] left feature, or a value to pass to +f+
# @param r [Object] right feature, or a value to pass to +f+
# @param cost [Integer] word cost
# @return [Okura::Word]
def w(surface, l, r, cost)
  l = f(l) unless l.respond_to?(:id)
  r = f(r) unless r.respond_to?(:id)
  Okura::Word.new(surface, l, r, cost)
end
31
# Builds an Okura::Feature.
#
# @param id [Integer] feature id
# @param name [String] feature name; defaults to "F" followed by the id
# @return [Okura::Feature]
def f(id, name = "F#{id}")
  Okura::Feature.new(id, name)
end
34
# Builds an Okura::Node, forwarding all arguments to Okura::Node.new.
#
# The splat is passed inside parentheses so Ruby does not have to guess
# whether `*` is an argument prefix or a multiplication.
#
# @param args [Array] arguments for Okura::Node.new
# @return [Okura::Node]
def n(*args)
  Okura::Node.new(*args)
end
37
+
38
# Specs for the MeCab-format text parsers under Okura::Parser.
# Heredoc bodies are kept flush-left so the parsed text carries no
# leading whitespace.
describe Okura::Parser do
  describe 'Matrix' do
    it 'MeCab形式のMatrixファイルを読める' do
      parser = Okura::Parser::Matrix.new as_io(<<-EOS)
2 3
0 0 0
0 1 1
1 0 2
1 1 3
1 2 10
      EOS
      parser.rid_size.should == 2
      parser.lid_size.should == 3
      parser.each.to_a.should == [
        [0, 0, 0],
        [0, 1, 1],
        [1, 0, 2],
        [1, 1, 3],
        [1, 2, 10]
      ]
    end
  end

  describe 'Word' do
    it 'MeCab形式の単語ファイルを読める' do
      parser = Okura::Parser::Word.new as_io(<<-EOS)
あがなう,854,458,6636,動詞,自立,*,*,五段・ワ行促音便,基本形,あがなう,アガナウ,アガナウ,あがなう/購う/贖う,
あがめる,645,546,1234,動詞,自立,*,*,一段,基本形,あがめる,アガメル,アガメル,あがめる/崇める,
      EOS
      # Only the first four columns (surface, left id, right id, cost) are checked.
      parser.each.to_a.map { |x| x[0..3] }.should == [
        ['あがなう', 854, 458, 6636],
        ['あがめる', 645, 546, 1234]
      ]
    end

    # Declared without a body, so RSpec reports it as pending.
    it 'ダブルクオートでエスケープされた単語定義も扱える'
  end

  describe 'Feature' do
    it 'MeCab形式の品詞ファイルを読める' do
      parser = Okura::Parser::Feature.new as_io(<<-EOS)
0 BOS/EOS,*,*,*,*,*,BOS/EOS
1 その他,間投,*,*,*,*,*
      EOS
      parser.each.to_a.should == [
        [0, 'BOS/EOS,*,*,*,*,*,BOS/EOS'],
        [1, 'その他,間投,*,*,*,*,*']
      ]
    end
  end

  describe 'CharType' do
    it 'MeCab形式の文字種定義ファイルを読める' do
      parser = Okura::Parser::CharType.new
      # Collect every callback invocation so the order and arguments can be asserted.
      h = { single: [], range: [], type: [] }
      parser.on_mapping_single { |code, type, ctypes| h[:single] << [code, type, ctypes] }
      parser.on_mapping_range { |from, to, type, ctypes| h[:range] << [from, to, type, ctypes] }
      parser.on_chartype_def { |name, invoke, group, length| h[:type] << [name, invoke, group, length] }

      parser.parse_all as_io(<<-EOS)
DEFAULT 0 1 0 # DEFAULT is a mandatory category!
KATAKANA 1 0 2

0x000D SPACE # CR
0x003A..0x0040 SYMBOL
# KANJI
0x5146 KANJINUMERIC KANJI
      EOS

      h[:single].should == [
        [0x000D, 'SPACE', []],
        [0x5146, 'KANJINUMERIC', %w(KANJI)]
      ]
      h[:range].should == [
        [0x003A, 0x0040, 'SYMBOL', []]
      ]
      h[:type].should == [
        ['DEFAULT', false, true, 0],
        ['KATAKANA', true, false, 2]
      ]
    end
  end

  describe 'UnkDic' do
    it '未知語の定義を読める' do
      parser = Okura::Parser::UnkDic.new as_io(<<-EOS)
A,5,6,3274,記号,一般,*,*,*,*,*
Z,9,10,5244,記号,空白,*,*,*,*,*
      EOS
      parser.to_a.should == [
        ['A', 5, 6, 3274],
        ['Z', 9, 10, 5244]
      ]
    end
  end
end
129
+
130
# Round-trip specs for the serializers: each example compiles a textual
# definition into an in-memory or on-disk form and loads it back.
# Heredoc bodies are kept flush-left so the compiled text carries no
# leading whitespace.
describe 'Compile and load' do
  describe Okura::Serializer::FormatInfo do
    it 'シリアライズして復元できる' do
      info = Okura::Serializer::FormatInfo.new
      info.word_dic = :Naive
      info.features = :Marshal
      info.char_types = :Marshal
      info.unk_dic = :Marshal
      info.matrix = :Marshal

      out = StringIO.new
      info.compile(out)
      out.rewind

      loaded = Okura::Serializer::FormatInfo.load(out)
      loaded.word_dic.should == :Naive
      loaded.features.should == :Marshal
      loaded.char_types.should == :Marshal
      loaded.unk_dic.should == :Marshal
      loaded.matrix.should == :Marshal
    end

    it '設定に基づいて辞書をコンパイル/ロードできる' do
      with_dict_dir do |src_dir, bin_dir|
        set_content(src_dir, 'w1.csv', <<-EOS)
w1,1,2,1000,
        EOS
        set_content(src_dir, 'w2.csv', <<-EOS)
w2,5,6,2000,
w3,9,10,3000,
        EOS
        set_content(src_dir, 'left-id.def', <<-EOS)
1 F1
5 F5
9 F9
        EOS
        set_content(src_dir, 'right-id.def', <<-EOS)
2 F2
6 F6
10 F10
        EOS
        set_content(src_dir, 'char.def', <<-EOS)
A 0 0 1
Z 1 1 3
        EOS
        set_content(src_dir, 'unk.def', <<-EOS)
A,5,6,3274,記号,一般,*,*,*,*,*
Z,9,10,5244,記号,空白,*,*,*,*,*
        EOS
        set_content(src_dir, 'matrix.def', <<-EOS)
2 3
0 0 10
0 1 5
        EOS

        fi = Okura::Serializer::FormatInfo.new
        fi.encoding = 'UTF-8'
        fi.compile_dict(src_dir, bin_dir)

        tagger = Okura::Serializer::FormatInfo.create_tagger(bin_dir)

        tagger.dic.unk_dic.rule_size.should == 2
        tagger.dic.word_dic.word_size.should == 3
        tagger.mat.cost(0, 1).should == 5

        pending {
          w2 = tagger.dic.word_dic.possible_words('w2', 0)[0]
          w2.left.name.should == 'F5'
          w2.right.name.should == 'F6'
        }
        # Was misspelled `penging`, which would raise NoMethodError if reached.
        # NOTE(review): the UnkDic::Marshal example below calls `.first` on the
        # result of word_templates_for; confirm whether `u1.left` is intended
        # here before un-pending this block.
        pending {
          u1 = tagger.dic.unk_dic.word_templates_for('A')
          u1.left.name.should == 'F5'
          u1.right.name.should == 'F6'
        }
      end
    end
  end

  describe Okura::Serializer::Features::Marshal do
    it 'コンパイルして復元できる' do
      serializer = Okura::Serializer::Features::Marshal.new
      out = StringIO.new
      serializer.compile(as_io(<<-EOS), out)
0 BOS/EOS,*,*,*,*,*,BOS/EOS
1 その他,間投,*,*,*,*,*
      EOS
      out.rewind

      features = serializer.load(out)
      features.from_id(0).text.should == 'BOS/EOS,*,*,*,*,*,BOS/EOS'
      features.from_id(1).text.should == 'その他,間投,*,*,*,*,*'
    end
  end

  describe Okura::Serializer::CharTypes::Marshal do
    it 'コンパイルして復元できる' do
      serializer = Okura::Serializer::CharTypes::Marshal.new
      out = StringIO.new
      serializer.compile(as_io(<<-EOS), out)
DEFAULT 0 1 0 # DEFAULT is a mandatory category!
TYPE1 1 0 0
TYPE2 0 1 0
TYPE3 0 1 3

# comment

0x0021 TYPE1
0x0022 TYPE2 # comment
0x0023..0x0040 TYPE3
0x0099 TYPE1 TYPE2 # 互換カテゴリ
0xABCd TYPE1 DEFAULT
      EOS
      out.rewind

      cts = serializer.load(out)

      cts.type_for(0x21).name.should == 'TYPE1'
      cts.type_for(0x22).name.should == 'TYPE2'
      cts.type_for(0x23).name.should == 'TYPE3'
      cts.type_for(0x40).name.should == 'TYPE3'
      cts.type_for(0x41).name.should == 'DEFAULT'
      cts.type_for(0x99).name.should == 'TYPE1'

      t1, t2, t3 = cts.named('TYPE1'), cts.named('TYPE2'), cts.named('TYPE3')

      t1.name.should == 'TYPE1'

      t1.invoke?.should be_true
      t2.invoke?.should be_false

      t1.group?.should be_false
      t2.group?.should be_true

      t2.length.should == 0
      t3.length.should == 3

      t1.should be_accept(0x21)
      t1.should_not be_accept(0x22)
      t2.should be_accept(0x22)

      t1.should be_accept(0x99)
    end
  end

  shared_examples_for 'WordDic serializer' do
    # subject : Serializer class
    it 'コンパイルして復元できる' do
      serializer = subject.new
      features = Okura::Features.new
      features.add 854, f(854)
      features.add 458, f(458)
      features.add 645, f(645)
      features.add 546, f(546)
      out = StringIO.new
      src = <<-EOS
あがなう,854,458,6636,動詞,自立,*,*,五段・ワ行促音便,基本形,あがなう,アガナウ,アガナウ,あがなう/購う/贖う,
あがめる,645,546,1234,動詞,自立,*,*,一段,基本形,あがめる,アガメル,アガメル,あがめる/崇める,
      EOS
      serializer.compile(features, [as_io(src)], 'UTF-8', out)
      out.rewind
      wd = serializer.load(out)

      wd.possible_words('あがなう', 0).should == [w('あがなう', f(854), f(458), 6636)]
      wd.possible_words('あがめる', 0).should == [w('あがめる', f(645), f(546), 1234)]
      wd.possible_words('あがめる', 1).should == []
    end
  end

  describe Okura::Serializer::WordDic::Naive do
    subject { Okura::Serializer::WordDic::Naive }
    it_should_behave_like 'WordDic serializer'
  end

  describe Okura::Serializer::WordDic::DoubleArray do
    subject { Okura::Serializer::WordDic::DoubleArray }
    it_should_behave_like 'WordDic serializer'
  end

  describe Okura::Serializer::UnkDic::Marshal do
    it 'コンパイルして復元できる' do
      serializer = Okura::Serializer::UnkDic::Marshal.new
      cts = Okura::CharTypes.new
      cts.define_type 'A', true, false, 10
      cts.define_type 'Z', false, true, 0
      cts.define_map 0x0001, cts.named('A'), []
      cts.define_map 0x0002, cts.named('Z'), []
      features = Okura::Features.new
      features.add 5, 'F5'
      features.add 6, 'F6'
      features.add 9, 'F9'
      features.add 10, 'F10'
      out = StringIO.new
      serializer.compile(cts, features, as_io(<<-EOS), out)
A,5,6,3274,記号,一般,*,*,*,*,*
Z,9,10,5244,記号,空白,*,*,*,*,*
      EOS
      out.rewind

      unk = serializer.load(out)
      unk.word_templates_for('A').first.cost.should == 3274
      unk.word_templates_for('Z').first.cost.should == 5244
    end
  end

  describe Okura::Serializer::Matrix::Marshal do
    it 'コンパイルして復元できる' do
      serializer = Okura::Serializer::Matrix::Marshal.new
      out = StringIO.new
      serializer.compile(as_io(<<-EOS), out)
2 3
0 0 0
0 1 1
1 0 2
1 1 3
1 2 10
      EOS
      out.rewind

      mat = serializer.load(out)
      mat.cost(0, 0).should == 0
      mat.cost(1, 2).should == 10
    end
  end
end
347
+
348
# Spec for Okura::Matrix: values stored with #set are read back with #cost.
describe Okura::Matrix do
  describe '#cost' do
    it '渡された二つのFeature idを元にコストを返せる' do
      m = Okura::Matrix.new 2, 2
      m.set(0, 0, 0)
      m.set(0, 1, 1)
      m.set(1, 0, 2)
      m.set(1, 1, 3)

      m.cost(1, 1).should == 3
    end
  end
end
361
+
362
# Shared examples for word dictionaries.
#
# The including group must provide a `subject` that acts as a dictionary
# builder: it responds to `define(word)` and to `build`, and the object
# returned by `build` responds to `word_size` and `possible_words`.
shared_examples_for 'WordDic' do
  # subject = dict builder

  # Local shorthand: a word with the given surface and fixed features/cost.
  # Shadows the four-argument top-level `w` helper inside these examples.
  def w(surface)
    Okura::Word.new surface, f(1), f(1), 1
  end

  describe '#possible_words' do
    it '登録された単語のサイズを取得できる' do
      subject.build.word_size.should == 0
      subject.define w('aaa')
      subject.define w('bbb')
      subject.build.word_size.should == 2
    end

    it '同じ表記の単語を複数登録できる' do
      w1 = Okura::Word.new 'w', f(1), f(2), 100
      w2 = Okura::Word.new 'w', f(10), f(20), 200
      subject.define w1
      subject.define w1
      subject.define w2

      wd = subject.build

      wd.possible_words('w', 0).should == [w1, w1, w2]
    end

    it '文字列と位置から、辞書に登録された単語を返せる' do
      subject.define w('aaa')
      subject.define w('bbb')
      subject.define w('aa')
      subject.define w('aaaa')
      subject.define w('aaaaa')

      wd = subject.build

      wd.possible_words('bbbaaa', 0).should == [w('bbb')]
      wd.possible_words('bbbaaa', 1).should == []
      wd.possible_words('bbbaaa', 3).should == [w('aa'), w('aaa')]
    end

    it 'マルチバイト文字にも対応している' do
      subject.define w('ニワトリ')
      wd = subject.build

      wd.possible_words('ニワトリ', 0).should == [w('ニワトリ')]
      wd.possible_words('ニワトリ', 1).should == []
    end

    # Defines every surface in +words+, builds the dictionary, and expects
    # possible_words(str, 0) to equal the words named in +dest+, in order.
    def matches(words, str, dest)
      words.each { |word| subject.define w(word) }
      dic = subject.build
      dic.possible_words(str, 0).should == dest.map { |d| w(d) }
    end

    #             defined words  input       expected matches at position 0
    it { matches(%w(),           '',         %w()) }
    it { matches(%w(),           'aaa',      %w()) }
    it { matches(%w(a),          '',         %w()) }
    it { matches(%w(a),          'a',        %w(a)) }
    it { matches(%w(a),          'aa',       %w(a)) }
    it { matches(%w(a),          'b',        %w()) }
    it { matches(%w(aa),         'a',        %w()) }
    it { matches(%w(aa),         'aa',       %w(aa)) }
    it { matches(%w(aa),         'aaa',      %w(aa)) }
    it { matches(%w(aa),         'ab',       %w()) }
    it { matches(%w(a aa),       'a',        %w(a)) }
    it { matches(%w(a aa),       'aa',       %w(a aa)) }
    it { matches(%w(a aa),       'aaa',      %w(a aa)) }
    it { matches(%w(a aa),       'aab',      %w(a aa)) }
    it { matches(%w(a aa ab),    'aab',      %w(a aa)) }
    it { matches(%w(a aa ab),    'ab',       %w(a ab)) }
    it { matches(%w(a aa ab),    'aa',       %w(a aa)) }
    it { matches(%w(a b),        'ba',       %w(b)) }
    it { matches(%w(アイウ),     'アイウ',   %w(アイウ)) }
    it { matches(%w(ア アイ),    'アイウ',   %w(ア アイ)) }
    it { matches(%w(ア アイ),    'aアイウ',  %w()) }
  end
end
434
+
435
# Runs the shared 'WordDic' examples against Okura::WordDic::Naive.
describe Okura::WordDic::Naive do
  # Adapter giving Okura::WordDic::Naive the builder interface
  # (`define` / `build`) that the shared 'WordDic' examples expect.
  class NaiveBuilder
    def initialize
      @wd = Okura::WordDic::Naive.new
    end

    # Forwards the definition straight to the wrapped dictionary.
    def define(*args)
      @wd.define(*args)
    end

    # The wrapped dictionary itself is the built result.
    def build
      @wd
    end
  end

  subject { NaiveBuilder.new }
  it_should_behave_like 'WordDic'
end
450
+
451
# Runs the shared 'WordDic' examples against the DoubleArray builder.
describe Okura::WordDic::DoubleArray do
  subject { Okura::WordDic::DoubleArray::Builder.new }

  # Inspection helpers that read the dictionary's @base, @check and @words
  # instance variables. No example in this group calls them.
  def base(dic)
    dic.instance_variable_get(:@base)
  end

  def check(dic)
    dic.instance_variable_get(:@check)
  end

  def words(dic)
    dic.instance_variable_get(:@words)
  end

  it_should_behave_like 'WordDic'
end
464
+
465
# TODO: no examples have been written for Okura::Features yet.
describe Okura::Features do
end
467
+
468
# Specs for Okura::CharTypes: the DEFAULT fallback in #type_for and
# compatible categories registered through #define_map.
describe Okura::CharTypes do
  describe '#type_for' do
    describe '文字に対するCharTypeが定義されていない場合' do
      describe '文字種DEFAULTが定義されている場合' do
        subject {
          cts = Okura::CharTypes.new
          cts.define_type 'DEFAULT', false, false, 0
          cts
        }
        it 'CharType#default_typeが返る' do
          subject.type_for('a'.ord).name.should == subject.default_type.name
        end
      end

      describe '文字種DEFAULTが定義されてない場合' do
        # No types are defined at all, so there is no DEFAULT to fall back on.
        subject { Okura::CharTypes.new }
        it 'エラーになる' do
          expect { subject.type_for('a'.ord) }.to raise_error
        end
      end
    end
  end

  describe '#define_map' do
    describe '互換カテゴリが指定された場合' do
      subject {
        cts = Okura::CharTypes.new
        cts.define_type 'A', true, true, 10
        cts.define_type 'B', true, true, 10
        # Code point 1 is mapped to 'A' with 'B' as a compatible category.
        cts.define_map 1, cts.named('A'), [cts.named('B')]
        cts
      }
      it '互換カテゴリが正しく認識される' do
        subject.named('A').accept?(1).should be_true
        subject.named('B').accept?(1).should be_true
      end
    end
  end
end
505
+
506
# Specs for Okura::UnkDic#possible_words (unknown-word extraction).
#
# The calls below pass (string, position, flag). Judging from the
# example descriptions the flag says whether the regular dictionary
# already found a word at that position. NOTE(review): confirm against
# the UnkDic implementation.
describe Okura::UnkDic do
  describe '#possible_words' do
    describe '互換カテゴリ' do
      subject {
        cts = Okura::CharTypes.new
        cts.define_type 'KATAKANA', false, true, 0
        cts.define_type 'HIRAGANA', false, true, 0
        cts.define_map 'ア'.ord, cts.named('KATAKANA'), []
        # 'ー' is HIRAGANA with KATAKANA as a compatible category.
        cts.define_map 'ー'.ord, cts.named('HIRAGANA'), [cts.named('KATAKANA')]
        ud = Okura::UnkDic.new cts
        ud.define 'KATAKANA', f(10), f(20), 1000
        ud.define 'HIRAGANA', f(1), f(2), 1000
        ud
      }
      it '互換カテゴリを正しく解釈する' do
        subject.possible_words('アーー', 0, false).should == [w('アーー', 10, 20, 1000)]
      end
    end

    describe '未知語定義' do
      describe '同一文字種に複数の未知語定義があった場合' do
        subject do
          cts = Okura::CharTypes.new
          cts.define_type 'A', true, true, 0
          cts.define_map 'A'.ord, cts.named('A'), []
          ud = Okura::UnkDic.new cts
          ud.define 'A', f(10), f(20), 1000
          ud.define 'A', f(11), f(21), 1111
          ud
        end
        it 'すべての定義から未知語を抽出する' do
          subject.possible_words('A', 0, false).should == [
            w('A', 10, 20, 1000),
            w('A', 11, 21, 1111)
          ]
        end
      end
    end
  end

  describe '#possible_words: 文字コードによる挙動:' do
    subject do
      cts = Okura::CharTypes.new
      cts.define_type 'A', true, true, 0
      cts.define_map 'あ'.ord, cts.named('A'), []
      ud = Okura::UnkDic.new cts
      ud.define 'A', f(10), f(20), 1000
      ud
    end

    describe 'UTF8文字列が来たとき' do
      it '正しく解析できる' do
        subject.possible_words('あいう'.encode('UTF-8'), 0, false).map(&:surface).should == %w(あ)
      end
    end

    describe 'UTF8じゃない文字列が来たとき' do
      it 'エラーになる' do
        expect { subject.possible_words('あいう'.encode('SHIFT_JIS'), 0, false) }.to raise_error
      end
    end
  end

  describe '#possible_words: 先頭文字のカテゴリによる挙動:' do
    # Builds a CharTypes table in which 'A' maps to the type under test
    # and 'Z' maps to the unrelated type 'ZZZZ'.
    # Type names encode the define_type arguments: T<invoke><group><length>.
    def create_chartypes(typename_under_test)
      cts = Okura::CharTypes.new
      cts.define_type 'T000', false, false, 0
      cts.define_type 'T012', false, true, 2
      cts.define_type 'T100', true, false, 0
      cts.define_type 'T102', true, false, 2
      cts.define_type 'T110', true, true, 0
      cts.define_type 'T112', true, true, 2
      cts.define_type 'ZZZZ', true, true, 2

      cts.define_map 'A'.ord, cts.named(typename_under_test), []
      cts.define_map 'Z'.ord, cts.named('ZZZZ'), []

      cts
    end

    # Builds an UnkDic with a single unknown-word definition for the
    # type under test. 'ZZZZ' deliberately gets no definition.
    def create_subject(typename_under_test)
      udic = Okura::UnkDic.new create_chartypes(typename_under_test)
      # The right feature was a bare `(20)`; every other define call in
      # this file passes a Feature, so it is wrapped with f() here too.
      udic.define typename_under_test, f(10), f(20), 1000
      udic
    end

    describe 'invoke=0のとき' do
      subject { create_subject 'T012' }
      describe '辞書に単語がある場合' do
        it '未知語を抽出しない' do
          subject.possible_words('AAA', 0, true).should be_empty
        end
      end
    end

    describe 'invoke=1のとき' do
      describe '辞書に単語がある場合' do
        subject { create_subject 'T102' }
        it 'も、未知語を抽出する' do
          subject.possible_words('AAAZ', 0, true).should_not be_empty
        end
      end

      describe '先頭文字のカテゴリに対応する未知語定義がなかった場合' do
        subject { create_subject 'T112' }
        it '未知語を抽出しない' do
          subject.possible_words('ZZ', 0, false).should be_empty
        end
      end

      describe '辞書に単語がない場合' do
        describe 'group=0のとき' do
          describe 'length=0のとき' do
            subject { create_subject 'T100' }
            it '未知語を抽出しない' do
              subject.possible_words('AAAZ', 0, false).should be_empty
            end
          end

          describe 'length=2のとき' do
            subject { create_subject 'T102' }
            it '2文字までの同種文字列を未知語とする' do
              subject.possible_words('AAAZ', 0, false).map(&:surface).should == %w(A AA)
            end
          end
        end

        describe 'group=1のとき' do
          describe 'length=0のとき' do
            subject { create_subject 'T110' }
            it '同種の文字列を長さ制限なしでまとめて未知語とする' do
              subject.possible_words('AAAAAZ', 0, false).map(&:surface).should == %w(AAAAA)
            end
            it '連続が一文字の場合も未知語として取れる' do
              subject.possible_words('AZZZ', 0, false).map(&:surface).should == %w(A)
            end
            it '1文字しかなくても正しく扱える' do
              subject.possible_words('A', 0, false).map(&:surface).should == %w(A)
            end
          end

          describe 'length=2のとき' do
            subject { create_subject 'T112' }
            it 'length=0の結果に加え、2文字までの同種文字列を未知語とする' do
              subject.possible_words('AAAAAZ', 0, false).map(&:surface).should == %w(A AA AAAAA)
            end
            it '1文字しかなくても正しく扱える' do
              subject.possible_words('A', 0, false).map(&:surface).should == %w(A)
            end
            it '2文字しかなくても正しく扱える' do
              subject.possible_words('AA', 0, false).map(&:surface).should == %w(A AA)
            end
            it '3文字しかなくても正しく扱える' do
              subject.possible_words('AAA', 0, false).map(&:surface).should == %w(A AA AAA)
            end
          end
        end
      end
    end
  end
end
654
+
655
# Spec for Okura::Tagger#parse using a Naive word dictionary and no
# second constructor argument (nil).
describe Okura::Tagger do
  describe '#parse' do
    it '文字列を解析してNodesを返せる' do
      dic = Okura::WordDic::Naive.new
      dic.define w('a', 1, 1, 0)
      dic.define w('aa', 1, 1, 10)
      dic.define w('b', 2, 2, 3)
      tagger = Okura::Tagger.new dic, nil

      nodes = tagger.parse('aab')

      # Index 0 and index 4 hold the BOS/EOS sentinels around the three
      # characters of the input.
      nodes[0][0].word.should == w('BOS/EOS', 0, 0, 0)
      nodes[4][0].word.should == w('BOS/EOS', 0, 0, 0)
      # Two nodes are expected at index 1; the dictionary defines both
      # 'a' and 'aa'.
      nodes[1].size.should == 2
      nodes[3][0].word.should == w('b', 2, 2, 3)
    end
  end
end
673
+
674
# Spec for the BOS/EOS sentinel node built by Okura::Node.mk_bos_eos.
describe Okura::Node do
  # The group label previously read '#make_bos_eos', which does not match
  # the method exercised below (called on the class as `mk_bos_eos`).
  describe '.mk_bos_eos' do
    describe '#length' do
      it 'returns 1' do
        Okura::Node.mk_bos_eos.length.should == 1
      end
    end
  end
end
683
+
684
# Specs for Okura::Nodes#mincost_path: the cheapest BOS-to-EOS path
# through nodes added at explicit positions.
describe Okura::Nodes do
  describe '#mincost_path' do
    it '最小コストのパスを返せる' do
      mat = Okura::Matrix.new 2, 2
      mat.set(0, 1, 10)
      mat.set(1, 0, 10)
      nodes = Okura::Nodes.new 3, mat
      nodes.add(0, Okura::Node.mk_bos_eos)
      nodes.add(1, n(w('a', 1, 1, 10)))
      nodes.add(1, n(w('b', 1, 1, 0)))
      nodes.add(2, Okura::Node.mk_bos_eos)

      mcp = nodes.mincost_path
      mcp.length.should == 3
      mcp[0].word.surface.should == 'BOS/EOS'
      # 'b' (word cost 0) is expected over 'a' (word cost 10).
      mcp[1].word.surface.should == 'b'
      mcp[2].word.surface.should == 'BOS/EOS'
    end

    it '単語長が1を超えても動く' do
      mat = Okura::Matrix.new 2, 2
      mat.set(0, 1, 10)
      mat.set(1, 0, 10)
      mat.set(1, 1, 10)
      nodes = Okura::Nodes.new 4, mat
      nodes.add(0, Okura::Node.mk_bos_eos)
      nodes.add(1, n(w('a', 1, 1, 10)))
      nodes.add(1, n(w('bb', 1, 1, 0)))
      nodes.add(2, n(w('a', 1, 1, 10)))
      nodes.add(3, Okura::Node.mk_bos_eos)

      mcp = nodes.mincost_path
      # The two-character word 'bb' is expected as the only node between
      # BOS and EOS, giving a path of three nodes.
      mcp.length.should == 3
      mcp[0].word.surface.should == 'BOS/EOS'
      mcp[1].word.surface.should == 'bb'
      mcp[2].word.surface.should == 'BOS/EOS'
    end
  end
end