okura 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,721 @@
1
+ #-*- coding:utf-8
2
+ require File.join(File.dirname(__FILE__),'spec_helper.rb')
3
+ require File.join(File.dirname(__FILE__),'..','lib','okura')
4
+ require File.join(File.dirname(__FILE__),'..','lib','okura','parser')
5
+ require File.join(File.dirname(__FILE__),'..','lib','okura','serializer')
6
+
7
+ require 'tmpdir'
8
+
9
# Yields two fresh temporary directories to the block. In this spec the first
# receives dictionary source files and the second the compiled output.
#
# Both directories are created with Dir.mktmpdir in block form, so they are
# removed again when the block returns or raises.
#
# @yieldparam src_dir [String] path of the first temporary directory
# @yieldparam bin_dir [String] path of the second temporary directory
# @return [Object] the value returned by the block
def with_dict_dir
  Dir.mktmpdir do |src_dir|
    Dir.mktmpdir do |bin_dir|
      yield src_dir, bin_dir
    end
  end
end
16
+
17
# Creates (or truncates) +filename+ inside +dir+ and writes +content+ to it.
#
# @return [Integer] number of bytes written
def set_content(dir, filename, content)
  target = File.join(dir, filename)
  File.write(target, content)
end
22
+
23
# Wraps +str+ in a StringIO so it can be passed where an IO is expected.
def as_io(str)
  StringIO.new(str)
end
26
# Builds an Okura::Word. +l+ and +r+ may each be an object that responds to
# #id (used as-is) or a bare id, which is converted with #f first.
def w(surface, l, r, cost)
  left, right = [l, r].map { |side| side.respond_to?(:id) ? side : f(side) }
  Okura::Word.new(surface, left, right, cost)
end
31
# Shorthand for Okura::Feature.new; +name+ defaults to "F" followed by the id.
def f(id, name = "F#{id}")
  Okura::Feature.new(id, name)
end
34
# Shorthand for Okura::Node.new; every argument is forwarded unchanged.
def n(*args)
  # Parenthesized so the splat cannot be read as a binary `*`.
  Okura::Node.new(*args)
end
37
+
38
+ describe Okura::Parser do
39
+ describe 'Matrix' do
40
+ it 'MeCab形式のMatrixファイルを読める' do
41
+ parser=Okura::Parser::Matrix.new as_io(<<-EOS)
42
+ 2 3
43
+ 0 0 0
44
+ 0 1 1
45
+ 1 0 2
46
+ 1 1 3
47
+ 1 2 10
48
+ EOS
49
+ parser.rid_size.should == 2
50
+ parser.lid_size.should == 3
51
+ parser.each.to_a.should == [
52
+ [0,0,0],
53
+ [0,1,1],
54
+ [1,0,2],
55
+ [1,1,3],
56
+ [1,2,10]
57
+ ]
58
+ end
59
+ end
60
+ describe 'Word' do
61
+ it 'MeCab形式の単語ファイルを読める' do
62
+ parser=Okura::Parser::Word.new as_io(<<-EOS)
63
+ あがなう,854,458,6636,動詞,自立,*,*,五段・ワ行促音便,基本形,あがなう,アガナウ,アガナウ,あがなう/購う/贖う,
64
+ あがめる,645,546,1234,動詞,自立,*,*,一段,基本形,あがめる,アガメル,アガメル,あがめる/崇める,
65
+ EOS
66
+ parser.each.to_a.map{|x|x[0..3]}.should == [
67
+ ['あがなう',854,458,6636],
68
+ ['あがめる',645,546,1234]
69
+ ]
70
+ end
71
+ it 'ダブルクオートでエスケープされた単語定義も扱える'
72
+ end
73
+ describe 'Feature' do
74
+ it 'MeCab形式の品詞ファイルを読める' do
75
+ parser=Okura::Parser::Feature.new as_io(<<-EOS)
76
+ 0 BOS/EOS,*,*,*,*,*,BOS/EOS
77
+ 1 その他,間投,*,*,*,*,*
78
+ EOS
79
+ parser.each.to_a.should == [
80
+ [0,'BOS/EOS,*,*,*,*,*,BOS/EOS'],
81
+ [1,'その他,間投,*,*,*,*,*']
82
+ ]
83
+ end
84
+ end
85
+ describe 'CharType' do
86
+ it 'MeCab形式の文字種定義ファイルを読める' do
87
+ parser=Okura::Parser::CharType.new
88
+ h={single:[],range:[],type:[]}
89
+ parser.on_mapping_single {|code,type,ctypes| h[:single]<<[code,type,ctypes]}
90
+ parser.on_mapping_range{|from,to,type,ctypes| h[:range]<<[from,to,type,ctypes]}
91
+ parser.on_chartype_def{|name,invoke,group,length| h[:type]<<[name,invoke,group,length]}
92
+
93
+ parser.parse_all as_io(<<-EOS)
94
+ DEFAULT 0 1 0 # DEFAULT is a mandatory category!
95
+ KATAKANA 1 0 2
96
+
97
+ 0x000D SPACE # CR
98
+ 0x003A..0x0040 SYMBOL
99
+ # KANJI
100
+ 0x5146 KANJINUMERIC KANJI
101
+ EOS
102
+
103
+ h[:single].should == [
104
+ [0x000D, 'SPACE', []],
105
+ [0x5146, 'KANJINUMERIC', %w(KANJI)]
106
+ ]
107
+ h[:range].should == [
108
+ [0x003A, 0x0040, 'SYMBOL', []]
109
+ ]
110
+ h[:type].should == [
111
+ ['DEFAULT', false, true, 0],
112
+ ['KATAKANA', true, false, 2]
113
+ ]
114
+ end
115
+ end
116
+ describe 'UnkDic' do
117
+ it '未知語の定義を読める' do
118
+ parser=Okura::Parser::UnkDic.new as_io(<<-EOS)
119
+ A,5,6,3274,記号,一般,*,*,*,*,*
120
+ Z,9,10,5244,記号,空白,*,*,*,*,*
121
+ EOS
122
+ parser.to_a.should == [
123
+ ['A',5,6,3274],
124
+ ['Z',9,10,5244]
125
+ ]
126
+ end
127
+ end
128
+ end
129
+
130
+ describe 'Compile and load' do
131
+ describe Okura::Serializer::FormatInfo do
132
+ it 'シリアライズして復元できる' do
133
+ info=Okura::Serializer::FormatInfo.new
134
+ info.word_dic=:Naive
135
+ info.features=:Marshal
136
+ info.char_types=:Marshal
137
+ info.unk_dic=:Marshal
138
+ info.matrix=:Marshal
139
+
140
+ out=StringIO.new
141
+ info.compile(out)
142
+ out.rewind
143
+
144
+ loaded=Okura::Serializer::FormatInfo.load(out)
145
+ loaded.word_dic.should == :Naive
146
+ loaded.features.should == :Marshal
147
+ loaded.char_types.should == :Marshal
148
+ loaded.unk_dic.should == :Marshal
149
+ loaded.matrix.should == :Marshal
150
+ end
151
# Writes a small set of MeCab-style dictionary source files into a temporary
# directory, compiles them with FormatInfo#compile_dict, loads a tagger from
# the compiled output and checks dictionary sizes and one connection cost.
it '設定に基づいて辞書をコンパイル/ロードできる' do
  with_dict_dir do |src_dir, bin_dir|
    set_content(src_dir, 'w1.csv', <<-EOS)
w1,1,2,1000,
    EOS
    set_content(src_dir, 'w2.csv', <<-EOS)
w2,5,6,2000,
w3,9,10,3000,
    EOS
    set_content(src_dir, 'left-id.def', <<-EOS)
1 F1
5 F5
9 F9
    EOS
    set_content(src_dir, 'right-id.def', <<-EOS)
2 F2
6 F6
10 F10
    EOS
    set_content(src_dir, 'char.def', <<-EOS)
A 0 0 1
Z 1 1 3
    EOS
    set_content(src_dir, 'unk.def', <<-EOS)
A,5,6,3274,記号,一般,*,*,*,*,*
Z,9,10,5244,記号,空白,*,*,*,*,*
    EOS
    set_content(src_dir, 'matrix.def', <<-EOS)
2 3
0 0 10
0 1 5
    EOS

    fi = Okura::Serializer::FormatInfo.new
    fi.encoding = 'UTF-8'
    fi.compile_dict(src_dir, bin_dir)

    tagger = Okura::Serializer::FormatInfo.create_tagger(bin_dir)

    tagger.dic.unk_dic.rule_size.should == 2
    tagger.dic.word_dic.word_size.should == 3
    tagger.mat.cost(0, 1).should == 5

    # Feature-name resolution on loaded words is not asserted yet.
    pending {
      w2 = tagger.dic.word_dic.possible_words('w2', 0)[0]
      w2.left.name.should == 'F5'
      w2.right.name.should == 'F6'
    }
    # NOTE(review): elsewhere in this file word_templates_for(...) is followed
    # by .first; calling #left directly on its result may need the same
    # treatment before this block can be un-pended — confirm.
    pending {
      u1 = tagger.dic.unk_dic.word_templates_for('A')
      u1.left.name.should == 'F5'
      u1.right.name.should == 'F6'
    }
  end
end
206
+ end
207
+ describe Okura::Serializer::Features::Marshal do
208
+ it 'コンパイルして復元できる' do
209
+ serializer=Okura::Serializer::Features::Marshal.new
210
+ out=StringIO.new
211
+ serializer.compile(as_io(<<-EOS),out)
212
+ 0 BOS/EOS,*,*,*,*,*,BOS/EOS
213
+ 1 その他,間投,*,*,*,*,*
214
+ EOS
215
+ out.rewind
216
+
217
+ features=serializer.load(out)
218
+ features.from_id(0).text.should == 'BOS/EOS,*,*,*,*,*,BOS/EOS'
219
+ features.from_id(1).text.should == 'その他,間投,*,*,*,*,*'
220
+ end
221
+ end
222
+ describe Okura::Serializer::CharTypes::Marshal do
223
+ it 'コンパイルして復元できる' do
224
+ serializer=Okura::Serializer::CharTypes::Marshal.new
225
+ out=StringIO.new
226
+ serializer.compile(as_io(<<-EOS),out)
227
+ DEFAULT 0 1 0 # DEFAULT is a mandatory category!
228
+ TYPE1 1 0 0
229
+ TYPE2 0 1 0
230
+ TYPE3 0 1 3
231
+
232
+ # comment
233
+
234
+ 0x0021 TYPE1
235
+ 0x0022 TYPE2 # comment
236
+ 0x0023..0x0040 TYPE3
237
+ 0x0099 TYPE1 TYPE2 # 互換カテゴリ
238
+ 0xABCd TYPE1 DEFAULT
239
+ EOS
240
+ out.rewind
241
+
242
+ cts=serializer.load(out)
243
+
244
+ cts.type_for(0x21).name.should == 'TYPE1'
245
+ cts.type_for(0x22).name.should == 'TYPE2'
246
+ cts.type_for(0x23).name.should == 'TYPE3'
247
+ cts.type_for(0x40).name.should == 'TYPE3'
248
+ cts.type_for(0x41).name.should == 'DEFAULT'
249
+ cts.type_for(0x99).name.should == 'TYPE1'
250
+
251
+ t1,t2,t3=cts.named('TYPE1'), cts.named('TYPE2'), cts.named('TYPE3')
252
+
253
+ t1.name.should == 'TYPE1'
254
+
255
+ t1.invoke?.should be_true
256
+ t2.invoke?.should be_false
257
+
258
+ t1.group?.should be_false
259
+ t2.group?.should be_true
260
+
261
+ t2.length.should == 0
262
+ t3.length.should == 3
263
+
264
+ t1.should be_accept(0x21)
265
+ t1.should_not be_accept(0x22)
266
+ t2.should be_accept(0x22)
267
+
268
+ t1.should be_accept(0x99)
269
+ end
270
+ end
271
+ shared_examples_for 'WordDic serializer' do
272
+ # subject : Serializer class
273
+ it 'コンパイルして復元できる' do
274
+ serializer=subject.new
275
+ features=Okura::Features.new
276
+ features.add 854,f(854)
277
+ features.add 458,f(458)
278
+ features.add 645,f(645)
279
+ features.add 546,f(546)
280
+ out=StringIO.new
281
+ src=<<-EOS
282
+ あがなう,854,458,6636,動詞,自立,*,*,五段・ワ行促音便,基本形,あがなう,アガナウ,アガナウ,あがなう/購う/贖う,
283
+ あがめる,645,546,1234,動詞,自立,*,*,一段,基本形,あがめる,アガメル,アガメル,あがめる/崇める,
284
+ EOS
285
+ serializer.compile(features,[as_io(src)],'UTF-8',out)
286
+ out.rewind
287
+ wd=serializer.load(out)
288
+
289
+ wd.possible_words('あがなう',0).should == [w('あがなう',f(854),f(458),6636)]
290
+ wd.possible_words('あがめる',0).should == [w('あがめる',f(645),f(546),1234)]
291
+ wd.possible_words('あがめる',1).should == []
292
+ end
293
+ end
294
+ describe Okura::Serializer::WordDic::Naive do
295
+ subject { Okura::Serializer::WordDic::Naive }
296
+ it_should_behave_like 'WordDic serializer'
297
+ end
298
+ describe Okura::Serializer::WordDic::DoubleArray do
299
+ subject { Okura::Serializer::WordDic::DoubleArray }
300
+ it_should_behave_like 'WordDic serializer'
301
+ end
302
+ describe Okura::Serializer::UnkDic::Marshal do
303
+ it 'コンパイルして復元できる' do
304
+ serializer=Okura::Serializer::UnkDic::Marshal.new
305
+ cts=Okura::CharTypes.new
306
+ cts.define_type 'A',true,false,10
307
+ cts.define_type 'Z',false,true,0
308
+ cts.define_map 0x0001,cts.named('A'),[]
309
+ cts.define_map 0x0002,cts.named('Z'),[]
310
+ features=Okura::Features.new
311
+ features.add 5,'F5'
312
+ features.add 6,'F6'
313
+ features.add 9,'F9'
314
+ features.add 10,'F10'
315
+ out=StringIO.new
316
+ serializer.compile(cts,features,as_io(<<-EOS),out)
317
+ A,5,6,3274,記号,一般,*,*,*,*,*
318
+ Z,9,10,5244,記号,空白,*,*,*,*,*
319
+ EOS
320
+ out.rewind
321
+
322
+ unk=serializer.load(out)
323
+ unk.word_templates_for('A').first.cost.should == 3274
324
+ unk.word_templates_for('Z').first.cost.should == 5244
325
+ end
326
+ end
327
+ describe Okura::Serializer::Matrix::Marshal do
328
+ it 'コンパイルして復元できる' do
329
+ serializer=Okura::Serializer::Matrix::Marshal.new
330
+ out=StringIO.new
331
+ serializer.compile(as_io(<<-EOS),out)
332
+ 2 3
333
+ 0 0 0
334
+ 0 1 1
335
+ 1 0 2
336
+ 1 1 3
337
+ 1 2 10
338
+ EOS
339
+ out.rewind
340
+
341
+ mat=serializer.load(out)
342
+ mat.cost(0,0).should == 0
343
+ mat.cost(1,2).should == 10
344
+ end
345
+ end
346
+ end
347
+
348
describe Okura::Matrix do
  describe '#cost' do
    # Fills every cell of a 2x2 matrix, then reads one cell back.
    it '渡された二つのFeature idを元にコストを返せる' do
      matrix = Okura::Matrix.new(2, 2)
      [[0, 0, 0], [0, 1, 1], [1, 0, 2], [1, 1, 3]].each do |first_id, second_id, cost|
        matrix.set(first_id, second_id, cost)
      end

      matrix.cost(1, 1).should == 3
    end
  end
end
361
+
362
+ shared_examples_for 'WordDic' do
363
+ # subject = dict builder
364
# Single-argument variant of the file-level #w helper for these shared
# examples: builds a word for +surface+ with feature id 1 on both sides and
# cost 1.
def w(surface)
  left = f(1)
  right = f(1)
  Okura::Word.new(surface, left, right, 1)
end
367
+
368
+ describe '#possible_words' do
369
+ it '登録された単語のサイズを取得できる' do
370
+ subject.build.word_size.should == 0
371
+ subject.define w('aaa')
372
+ subject.define w('bbb')
373
+ subject.build.word_size.should == 2
374
+ end
375
+ it '同じ表記の単語を複数登録できる' do
376
+ w1=Okura::Word.new 'w',f(1),f(2),100
377
+ w2=Okura::Word.new 'w',f(10),f(20),200
378
+ subject.define w1
379
+ subject.define w1
380
+ subject.define w2
381
+
382
+ wd=subject.build
383
+
384
+ wd.possible_words('w',0).should == [w1,w1,w2]
385
+ end
386
+ it '文字列と位置から、辞書に登録された単語を返せる' do
387
+ subject.define w('aaa')
388
+ subject.define w('bbb')
389
+ subject.define w('aa')
390
+ subject.define w('aaaa')
391
+ subject.define w('aaaaa')
392
+
393
+ wd=subject.build
394
+
395
+ wd.possible_words('bbbaaa',0).should == [w('bbb')]
396
+ wd.possible_words('bbbaaa',1).should == []
397
+ wd.possible_words('bbbaaa',3).should == [w('aa'),w('aaa')]
398
+ end
399
+ it 'マルチバイト文字にも対応している' do
400
+ subject.define w('ニワトリ')
401
+ wd=subject.build
402
+
403
+ wd.possible_words('ニワトリ',0).should == [w('ニワトリ')]
404
+ wd.possible_words('ニワトリ',1).should == []
405
+ end
406
# Defines each entry of +words+ on the builder under test, builds the
# dictionary, and expects a lookup of +str+ at position 0 to equal the words
# named in +dest+, in that order.
def matches(words, str, dest)
  words.each { |surface| subject.define w(surface) }
  built = subject.build
  found = built.possible_words(str, 0)
  found.should == dest.map { |surface| w(surface) }
end
411
+ it { matches %w() , '' , %w() }
412
+ it { matches %w() , 'aaa' , %w() }
413
+ it { matches %w(a) , '' , %w() }
414
+ it { matches %w(a) , 'a' , %w(a) }
415
+ it { matches %w(a) , 'aa' , %w(a) }
416
+ it { matches %w(a) , 'b' , %w() }
417
+ it { matches %w(aa) , 'a' , %w() }
418
+ it { matches %w(aa) , 'aa' , %w(aa) }
419
+ it { matches %w(aa) , 'aaa' , %w(aa) }
420
+ it { matches %w(aa) , 'ab' , %w() }
421
+ it { matches %w(a aa) , 'a' , %w(a) }
422
+ it { matches %w(a aa) , 'aa' , %w(a aa) }
423
+ it { matches %w(a aa) , 'aaa' , %w(a aa) }
424
+ it { matches %w(a aa) , 'aab' , %w(a aa) }
425
+ it { matches %w(a aa ab) , 'aab' , %w(a aa) }
426
+ it { matches %w(a aa ab) , 'ab' , %w(a ab) }
427
+ it { matches %w(a aa ab) , 'aa' , %w(a aa) }
428
+ it { matches %w(a b) , 'ba' , %w(b) }
429
+ it { matches %w(アイウ) , 'アイウ' , %w(アイウ) }
430
+ it { matches %w(ア アイ) , 'アイウ' , %w(ア アイ) }
431
+ it { matches %w(ア アイ) , 'aアイウ' , %w() }
432
+ end
433
+ end
434
+
435
+ describe Okura::WordDic::Naive do
436
# Adapter that gives Okura::WordDic::Naive the define/build interface the
# shared 'WordDic' examples call on their subject.
class NaiveBuilder
  def initialize
    @wd = Okura::WordDic::Naive.new
  end

  # Forwards all arguments to the wrapped dictionary's #define.
  def define(*args)
    # Parenthesized so the splat cannot be read as a binary `*`.
    @wd.define(*args)
  end

  # Returns the wrapped dictionary itself; there is no separate build step.
  def build
    @wd
  end
end
447
+ subject { NaiveBuilder.new }
448
+ it_should_behave_like 'WordDic'
449
+ end
450
+
451
+ describe Okura::WordDic::DoubleArray do
452
+ subject { Okura::WordDic::DoubleArray::Builder.new }
453
# Test-only peek at +dic+'s @base instance variable.
# NOTE(review): presumably the double-array BASE table — confirm.
def base(dic)
  dic.instance_variable_get(:@base)
end
456
# Test-only peek at +dic+'s @check instance variable.
# NOTE(review): presumably the double-array CHECK table — confirm.
def check(dic)
  dic.instance_variable_get(:@check)
end
459
# Test-only peek at +dic+'s @words instance variable.
def words(dic)
  dic.instance_variable_get(:@words)
end
462
+ it_should_behave_like 'WordDic'
463
+ end
464
+
465
+ describe Okura::Features do
466
+ end
467
+
468
+ describe Okura::CharTypes do
469
+ describe '#type_for' do
470
+ describe '文字に対するCharTypeが定義されていない場合' do
471
+ describe '文字種DEFAULTが定義されている場合' do
472
+ subject {
473
+ cts=Okura::CharTypes.new
474
+ cts.define_type 'DEFAULT',false,false,0
475
+ cts
476
+ }
477
+ it 'CharType#default_typeが返る' do
478
+ subject.type_for('a'.ord).name.should == subject.default_type.name
479
+ end
480
+ end
481
# With no char type defined at all (so no DEFAULT), looking up a character
# is expected to raise.
describe '文字種DEFAULTが定義されてない場合' do
  subject { Okura::CharTypes.new }

  it 'エラーになる' do
    # NOTE(review): bare raise_error accepts any exception; narrow it once the
    # intended error class is known.
    expect { subject.type_for('a'.ord) }.to raise_error
  end
end
487
+ end
488
+ end
489
+ describe '#define_map' do
490
+ describe '互換カテゴリが指定された場合' do
491
+ subject {
492
+ cts=Okura::CharTypes.new
493
+ cts.define_type 'A',true,true,10
494
+ cts.define_type 'B',true,true,10
495
+ cts.define_map 1,cts.named('A'),[cts.named('B')]
496
+ cts
497
+ }
498
+ it '互換カテゴリが正しく認識される' do
499
+ subject.named('A').accept?(1).should be_true
500
+ subject.named('B').accept?(1).should be_true
501
+ end
502
+ end
503
+ end
504
+ end
505
+
506
+ describe Okura::UnkDic do
507
+ describe '#possible_words' do
508
+ describe '互換カテゴリ' do
509
+ subject {
510
+ cts=Okura::CharTypes.new
511
+ cts.define_type 'KATAKANA',false,true,0
512
+ cts.define_type 'HIRAGANA',false,true,0
513
+ cts.define_map 'ア'.ord,cts.named('KATAKANA'),[]
514
+ cts.define_map 'ー'.ord,cts.named('HIRAGANA'),[cts.named('KATAKANA')]
515
+ ud=Okura::UnkDic.new cts
516
+ ud.define 'KATAKANA',f(10),f(20),1000
517
+ ud.define 'HIRAGANA',f(1),f(2),1000
518
+ ud
519
+ }
520
+ it '互換カテゴリを正しく解釈する' do
521
+ subject.possible_words('アーー',0,false).should == [w('アーー',10,20,1000)]
522
+ end
523
+ end
524
+ describe '未知語定義' do
525
+ describe '同一文字種に複数の未知語定義があった場合' do
526
+ subject do
527
+ cts=Okura::CharTypes.new
528
+ cts.define_type 'A',true,true,0
529
+ cts.define_map 'A'.ord,cts.named('A'),[]
530
+ ud=Okura::UnkDic.new cts
531
+ ud.define 'A',f(10),f(20),1000
532
+ ud.define 'A',f(11),f(21),1111
533
+ ud
534
+ end
535
+ it 'すべての定義から未知語を抽出する' do
536
+ subject.possible_words('A',0,false).should == [
537
+ w('A',10,20,1000),
538
+ w('A',11,21,1111)
539
+ ]
540
+ end
541
+ end
542
+ end
543
+ end
544
+ describe '#possible_words: 文字コードによる挙動:' do
545
+ subject do
546
+ cts=Okura::CharTypes.new
547
+ cts.define_type 'A',true,true,0
548
+ cts.define_map 'あ'.ord,cts.named('A'),[]
549
+ ud=Okura::UnkDic.new cts
550
+ ud.define 'A',f(10),f(20),1000
551
+ ud
552
+ end
553
+ describe 'UTF8文字列が来たとき' do
554
+ it '正しく解析できる' do
555
+ subject.possible_words('あいう'.encode('UTF-8'),0,false).map(&:surface).should == %w(あ)
556
+ end
557
+ end
558
+ describe 'UTF8じゃない文字列が来たとき' do
559
+ it 'エラーになる' do
560
+ expect { subject.possible_words('あいう'.encode('SHIFT_JIS'),0,false) }.to raise_error
561
+ end
562
+ end
563
+ end
564
+ describe '#possible_words: 先頭文字のカテゴリによる挙動:' do
565
# Builds an Okura::CharTypes with a fixed set of types, maps 'A' to the type
# named +typename_under_test+ and 'Z' to 'ZZZZ'.
#
# Each row below is [name, invoke, group, length]; the T### names spell out
# those three settings.
def create_chartypes(typename_under_test)
  cts = Okura::CharTypes.new

  [
    ['T000', false, false, 0],
    ['T012', false, true,  2],
    ['T100', true,  false, 0],
    ['T102', true,  false, 2],
    ['T110', true,  true,  0],
    ['T112', true,  true,  2],
    ['ZZZZ', true,  true,  2]
  ].each do |name, invoke, group, length|
    cts.define_type name, invoke, group, length
  end

  { 'A' => typename_under_test, 'Z' => 'ZZZZ' }.each do |char, type_name|
    cts.define_map char.ord, cts.named(type_name), []
  end

  cts
end
580
# Builds an Okura::UnkDic over the char types from #create_chartypes and
# registers one unknown-word definition (cost 1000) for +typename_under_test+.
def create_subject(typename_under_test)
  udic = Okura::UnkDic.new(create_chartypes(typename_under_test))
  # Left and right are both wrapped with #f, like every other UnkDic#define
  # call in this spec; a bare Integer is not a feature.
  udic.define typename_under_test, f(10), f(20), 1000
  udic
end
585
+ describe 'invoke=0のとき' do
586
+ subject { create_subject 'T012' }
587
+ describe '辞書に単語がある場合' do
588
+ it '未知語を抽出しない' do
589
+ subject.possible_words('AAA',0,true).should be_empty
590
+ end
591
+ end
592
+ end
593
+ describe 'invoke=1のとき' do
594
+ describe '辞書に単語がある場合' do
595
+ subject { create_subject 'T102' }
596
+ it 'も、未知語を抽出する' do
597
+ subject.possible_words('AAAZ',0,true).should_not be_empty
598
+ end
599
+ end
600
+ describe '先頭文字のカテゴリに対応する未知語定義がなかった場合' do
601
+ subject { create_subject 'T112' }
602
+ it '未知語を抽出しない' do
603
+ subject.possible_words('ZZ',0,false).should be_empty
604
+ end
605
+ end
606
+ describe '辞書に単語がない場合' do
607
+ describe 'group=0のとき' do
608
+ describe 'length=0のとき' do
609
+ subject { create_subject 'T100' }
610
+ it '未知語を抽出しない' do
611
+ subject.possible_words('AAAZ',0,false).should be_empty
612
+ end
613
+ end
614
+ describe 'length=2のとき' do
615
+ subject { create_subject 'T102' }
616
+ it '2文字までの同種文字列を未知語とする' do
617
+ subject.possible_words('AAAZ',0,false).map(&:surface).should == %w(A AA)
618
+ end
619
+ end
620
+ end
621
+ describe 'group=1のとき' do
622
+ describe 'length=0のとき' do
623
+ subject { create_subject 'T110' }
624
+ it '同種の文字列を長さ制限なしでまとめて未知語とする' do
625
+ subject.possible_words('AAAAAZ',0,false).map(&:surface).should == %w(AAAAA)
626
+ end
627
+ it '連続が一文字の場合も未知語として取れる' do
628
+ subject.possible_words('AZZZ',0,false).map(&:surface).should == %w(A)
629
+ end
630
+ it '1文字しかなくても正しく扱える' do
631
+ subject.possible_words('A',0,false).map(&:surface).should == %w(A)
632
+ end
633
+ end
634
+ describe 'length=2のとき' do
635
+ subject { create_subject 'T112' }
636
+ it 'length=0の結果に加え、2文字までの同種文字列を未知語とする' do
637
+ subject.possible_words('AAAAAZ',0,false).map(&:surface).should == %w(A AA AAAAA)
638
+ end
639
+ it '1文字しかなくても正しく扱える' do
640
+ subject.possible_words('A',0,false).map(&:surface).should == %w(A)
641
+ end
642
+ it '2文字しかなくても正しく扱える' do
643
+ subject.possible_words('AA',0,false).map(&:surface).should == %w(A AA)
644
+ end
645
+ it '3文字しかなくても正しく扱える' do
646
+ subject.possible_words('AAA',0,false).map(&:surface).should == %w(A AA AAA)
647
+ end
648
+ end
649
+ end
650
+ end
651
+ end
652
+ end
653
+ end
654
+
655
+ describe Okura::Tagger do
656
+ describe '#parse' do
657
+ it '文字列を解析してNodesを返せる' do
658
+ dic=Okura::WordDic::Naive.new
659
+ dic.define w('a',1,1,0)
660
+ dic.define w('aa',1,1,10)
661
+ dic.define w('b',2,2,3)
662
+ tagger=Okura::Tagger.new dic,nil
663
+
664
+ nodes=tagger.parse('aab')
665
+
666
+ nodes[0][0].word.should == w('BOS/EOS',0,0,0)
667
+ nodes[4][0].word.should == w('BOS/EOS',0,0,0)
668
+ nodes[1].size.should == 2
669
+ nodes[3][0].word.should == w('b',2,2,3)
670
+ end
671
+ end
672
+ end
673
+
674
describe Okura::Node do
  # Labelled after the class-level factory that the example actually calls.
  describe '.mk_bos_eos' do
    describe '#length' do
      it 'returns 1' do
        Okura::Node.mk_bos_eos.length.should == 1
      end
    end
  end
end
683
+
684
+ describe Okura::Nodes do
685
+ describe '#mincost_path' do
686
+ it '最小コストのパスを返せる' do
687
+ mat=Okura::Matrix.new 2,2
688
+ mat.set(0,1,10)
689
+ mat.set(1,0,10)
690
+ nodes=Okura::Nodes.new 3,mat
691
+ nodes.add(0,Okura::Node.mk_bos_eos)
692
+ nodes.add(1,n(w('a',1,1,10)))
693
+ nodes.add(1,n(w('b',1,1,0)))
694
+ nodes.add(2,Okura::Node.mk_bos_eos)
695
+
696
+ mcp=nodes.mincost_path
697
+ mcp.length.should == 3
698
+ mcp[0].word.surface.should == 'BOS/EOS'
699
+ mcp[1].word.surface.should == 'b'
700
+ mcp[2].word.surface.should == 'BOS/EOS'
701
+ end
702
+ it '単語長が1を超えても動く' do
703
+ mat=Okura::Matrix.new 2,2
704
+ mat.set(0,1,10)
705
+ mat.set(1,0,10)
706
+ mat.set(1,1,10)
707
+ nodes=Okura::Nodes.new 4,mat
708
+ nodes.add(0,Okura::Node.mk_bos_eos)
709
+ nodes.add(1,n(w('a',1,1,10)))
710
+ nodes.add(1,n(w('bb',1,1,0)))
711
+ nodes.add(2,n(w('a',1,1,10)))
712
+ nodes.add(3,Okura::Node.mk_bos_eos)
713
+
714
+ mcp=nodes.mincost_path
715
+ mcp.length.should == 3
716
+ mcp[0].word.surface.should == 'BOS/EOS'
717
+ mcp[1].word.surface.should == 'bb'
718
+ mcp[2].word.surface.should == 'BOS/EOS'
719
+ end
720
+ end
721
+ end