okura 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/okura/console.rb +23 -0
- data/lib/okura/parser.rb +131 -0
- data/lib/okura/serializer.rb +317 -0
- data/lib/okura/version.rb +3 -0
- data/lib/okura/word_dic.rb +216 -0
- data/lib/okura.rb +388 -0
- data/test/okura_spec.rb +721 -0
- data/test/spec_helper.rb +6 -0
- data/test/words_spec.rb +36 -0
- metadata +86 -0
data/test/okura_spec.rb
ADDED
@@ -0,0 +1,721 @@
|
|
1
|
+
#-*- coding:utf-8
|
2
|
+
require File.join(File.dirname(__FILE__),'spec_helper.rb')
|
3
|
+
require File.join(File.dirname(__FILE__),'..','lib','okura')
|
4
|
+
require File.join(File.dirname(__FILE__),'..','lib','okura','parser')
|
5
|
+
require File.join(File.dirname(__FILE__),'..','lib','okura','serializer')
|
6
|
+
|
7
|
+
require 'tmpdir'
|
8
|
+
|
9
|
+
# Creates two temporary directories and yields them as (src_dir, bin_dir).
# Both are created with the block form of Dir.mktmpdir, so they only exist
# for the duration of the block. Returns whatever the block returns.
#
# The original signature captured `&block` but never used it; the block is
# invoked with `yield`, so the explicit capture is dropped.
def with_dict_dir
  Dir.mktmpdir do |src_dir|
    Dir.mktmpdir do |bin_dir|
      yield src_dir, bin_dir
    end
  end
end
|
16
|
+
|
17
|
+
# Writes +content+ to the file +filename+ inside +dir+, opening it in 'w'
# mode. Returns the value of File#write.
def set_content(dir, filename, content)
  path = File.join(dir, filename)
  File.open(path, 'w') do |out|
    out.write(content)
  end
end
|
22
|
+
|
23
|
+
# Wraps a string in a StringIO so it can be handed to code expecting an IO.
def as_io(str)
  StringIO.new(str)
end
|
26
|
+
# Builds an Okura::Word. +l+ and +r+ may be given either as objects that
# respond to :id or as raw values, which are converted with the f helper.
def w(surface, l, r, cost)
  left  = l.respond_to?(:id) ? l : f(l)
  right = r.respond_to?(:id) ? r : f(r)
  Okura::Word.new(surface, left, right, cost)
end
|
31
|
+
# Builds an Okura::Feature; the name defaults to "F<id>".
def f(id, name = "F#{id}")
  Okura::Feature.new(id, name)
end
|
34
|
+
# Shorthand for Okura::Node.new, forwarding all arguments unchanged.
def n(*args)
  Okura::Node.new(*args)
end
|
37
|
+
|
38
|
+
# Specs for the parsers under Okura::Parser. Every fixture is fed in-memory
# through the as_io helper; heredoc bodies are flush-left fixture text.
describe Okura::Parser do
  describe 'Matrix' do
    it 'MeCab形式のMatrixファイルを読める' do
      parser = Okura::Parser::Matrix.new(as_io(<<-EOS))
2 3
0 0 0
0 1 1
1 0 2
1 1 3
1 2 10
      EOS

      parser.rid_size.should == 2
      parser.lid_size.should == 3
      parser.each.to_a.should == [
        [0, 0, 0],
        [0, 1, 1],
        [1, 0, 2],
        [1, 1, 3],
        [1, 2, 10]
      ]
    end
  end

  describe 'Word' do
    it 'MeCab形式の単語ファイルを読める' do
      parser = Okura::Parser::Word.new(as_io(<<-EOS))
あがなう,854,458,6636,動詞,自立,*,*,五段・ワ行促音便,基本形,あがなう,アガナウ,アガナウ,あがなう/購う/贖う,
あがめる,645,546,1234,動詞,自立,*,*,一段,基本形,あがめる,アガメル,アガメル,あがめる/崇める,
      EOS

      # Only the first four columns of each parsed row are compared.
      leading_columns = parser.each.to_a.map { |row| row[0..3] }
      leading_columns.should == [
        ['あがなう', 854, 458, 6636],
        ['あがめる', 645, 546, 1234]
      ]
    end

    # Declared without a body: placeholder for an example not written yet.
    it 'ダブルクオートでエスケープされた単語定義も扱える'
  end

  describe 'Feature' do
    it 'MeCab形式の品詞ファイルを読める' do
      parser = Okura::Parser::Feature.new(as_io(<<-EOS))
0 BOS/EOS,*,*,*,*,*,BOS/EOS
1 その他,間投,*,*,*,*,*
      EOS

      parser.each.to_a.should == [
        [0, 'BOS/EOS,*,*,*,*,*,BOS/EOS'],
        [1, 'その他,間投,*,*,*,*,*']
      ]
    end
  end

  describe 'CharType' do
    it 'MeCab形式の文字種定義ファイルを読める' do
      parser = Okura::Parser::CharType.new

      # Record every callback invocation so the three kinds of definition
      # can be asserted on separately.
      seen = { single: [], range: [], type: [] }
      parser.on_mapping_single { |code, type, ctypes| seen[:single] << [code, type, ctypes] }
      parser.on_mapping_range { |from, to, type, ctypes| seen[:range] << [from, to, type, ctypes] }
      parser.on_chartype_def { |name, invoke, group, length| seen[:type] << [name, invoke, group, length] }

      parser.parse_all(as_io(<<-EOS))
DEFAULT 0 1 0 # DEFAULT is a mandatory category!
KATAKANA 1 0 2

0x000D SPACE # CR
0x003A..0x0040 SYMBOL
# KANJI
0x5146 KANJINUMERIC KANJI
      EOS

      seen[:single].should == [
        [0x000D, 'SPACE', []],
        [0x5146, 'KANJINUMERIC', ['KANJI']]
      ]
      seen[:range].should == [
        [0x003A, 0x0040, 'SYMBOL', []]
      ]
      seen[:type].should == [
        ['DEFAULT', false, true, 0],
        ['KATAKANA', true, false, 2]
      ]
    end
  end

  describe 'UnkDic' do
    it '未知語の定義を読める' do
      parser = Okura::Parser::UnkDic.new(as_io(<<-EOS))
A,5,6,3274,記号,一般,*,*,*,*,*
Z,9,10,5244,記号,空白,*,*,*,*,*
      EOS

      parser.to_a.should == [
        ['A', 5, 6, 3274],
        ['Z', 9, 10, 5244]
      ]
    end
  end
end
|
129
|
+
|
130
|
+
# Round-trip specs for the serializers: each example compiles source text
# into an output stream, rewinds it, loads it back and checks the result.
describe 'Compile and load' do
  describe Okura::Serializer::FormatInfo do
    it 'シリアライズして復元できる' do
      info = Okura::Serializer::FormatInfo.new
      info.word_dic = :Naive
      info.features = :Marshal
      info.char_types = :Marshal
      info.unk_dic = :Marshal
      info.matrix = :Marshal

      out = StringIO.new
      info.compile(out)
      out.rewind

      loaded = Okura::Serializer::FormatInfo.load(out)
      loaded.word_dic.should == :Naive
      loaded.features.should == :Marshal
      loaded.char_types.should == :Marshal
      loaded.unk_dic.should == :Marshal
      loaded.matrix.should == :Marshal
    end

    it '設定に基づいて辞書をコンパイル/ロードできる' do
      with_dict_dir do |src_dir, bin_dir|
        # Lay out a tiny source dictionary in src_dir.
        set_content(src_dir, 'w1.csv', <<-EOS)
w1,1,2,1000,
        EOS
        set_content(src_dir, 'w2.csv', <<-EOS)
w2,5,6,2000,
w3,9,10,3000,
        EOS
        set_content(src_dir, 'left-id.def', <<-EOS)
1 F1
5 F5
9 F9
        EOS
        set_content(src_dir, 'right-id.def', <<-EOS)
2 F2
6 F6
10 F10
        EOS
        set_content(src_dir, 'char.def', <<-EOS)
A 0 0 1
Z 1 1 3
        EOS
        set_content(src_dir, 'unk.def', <<-EOS)
A,5,6,3274,記号,一般,*,*,*,*,*
Z,9,10,5244,記号,空白,*,*,*,*,*
        EOS
        set_content(src_dir, 'matrix.def', <<-EOS)
2 3
0 0 10
0 1 5
        EOS

        format_info = Okura::Serializer::FormatInfo.new
        format_info.encoding = 'UTF-8'
        format_info.compile_dict(src_dir, bin_dir)

        tagger = Okura::Serializer::FormatInfo.create_tagger(bin_dir)

        tagger.dic.unk_dic.rule_size.should == 2
        tagger.dic.word_dic.word_size.should == 3
        tagger.mat.cost(0, 1).should == 5

        pending {
          w2 = tagger.dic.word_dic.possible_words('w2', 0)[0]
          w2.left.name.should == 'F5'
          w2.right.name.should == 'F6'
        }
        # Was misspelled `penging`, which would raise NoMethodError if reached.
        pending {
          # word_templates_for is used as a collection elsewhere in this
          # file (`.first.cost`), so take the first template here as well.
          u1 = tagger.dic.unk_dic.word_templates_for('A').first
          u1.left.name.should == 'F5'
          u1.right.name.should == 'F6'
        }
      end
    end
  end

  describe Okura::Serializer::Features::Marshal do
    it 'コンパイルして復元できる' do
      serializer = Okura::Serializer::Features::Marshal.new
      out = StringIO.new
      serializer.compile(as_io(<<-EOS), out)
0 BOS/EOS,*,*,*,*,*,BOS/EOS
1 その他,間投,*,*,*,*,*
      EOS
      out.rewind

      features = serializer.load(out)
      features.from_id(0).text.should == 'BOS/EOS,*,*,*,*,*,BOS/EOS'
      features.from_id(1).text.should == 'その他,間投,*,*,*,*,*'
    end
  end

  describe Okura::Serializer::CharTypes::Marshal do
    it 'コンパイルして復元できる' do
      serializer = Okura::Serializer::CharTypes::Marshal.new
      out = StringIO.new
      serializer.compile(as_io(<<-EOS), out)
DEFAULT 0 1 0 # DEFAULT is a mandatory category!
TYPE1 1 0 0
TYPE2 0 1 0
TYPE3 0 1 3

# comment

0x0021 TYPE1
0x0022 TYPE2 # comment
0x0023..0x0040 TYPE3
0x0099 TYPE1 TYPE2 # 互換カテゴリ
0xABCd TYPE1 DEFAULT
      EOS
      out.rewind

      cts = serializer.load(out)

      # Code point -> type lookups, including the range 0x23..0x40 and an
      # unmapped code point (0x41), which is expected to resolve to DEFAULT.
      cts.type_for(0x21).name.should == 'TYPE1'
      cts.type_for(0x22).name.should == 'TYPE2'
      cts.type_for(0x23).name.should == 'TYPE3'
      cts.type_for(0x40).name.should == 'TYPE3'
      cts.type_for(0x41).name.should == 'DEFAULT'
      cts.type_for(0x99).name.should == 'TYPE1'

      t1, t2, t3 = %w(TYPE1 TYPE2 TYPE3).map { |type_name| cts.named(type_name) }

      t1.name.should == 'TYPE1'

      t1.invoke?.should be_true
      t2.invoke?.should be_false

      t1.group?.should be_false
      t2.group?.should be_true

      t2.length.should == 0
      t3.length.should == 3

      t1.should be_accept(0x21)
      t1.should_not be_accept(0x22)
      t2.should be_accept(0x22)

      t1.should be_accept(0x99)
    end
  end

  shared_examples_for 'WordDic serializer' do
    # subject : Serializer class
    it 'コンパイルして復元できる' do
      serializer = subject.new
      features = Okura::Features.new
      [854, 458, 645, 546].each { |id| features.add(id, f(id)) }
      out = StringIO.new
      src = <<-EOS
あがなう,854,458,6636,動詞,自立,*,*,五段・ワ行促音便,基本形,あがなう,アガナウ,アガナウ,あがなう/購う/贖う,
あがめる,645,546,1234,動詞,自立,*,*,一段,基本形,あがめる,アガメル,アガメル,あがめる/崇める,
      EOS
      serializer.compile(features, [as_io(src)], 'UTF-8', out)
      out.rewind
      wd = serializer.load(out)

      wd.possible_words('あがなう', 0).should == [w('あがなう', f(854), f(458), 6636)]
      wd.possible_words('あがめる', 0).should == [w('あがめる', f(645), f(546), 1234)]
      wd.possible_words('あがめる', 1).should == []
    end
  end

  describe Okura::Serializer::WordDic::Naive do
    subject { Okura::Serializer::WordDic::Naive }
    it_should_behave_like 'WordDic serializer'
  end

  describe Okura::Serializer::WordDic::DoubleArray do
    subject { Okura::Serializer::WordDic::DoubleArray }
    it_should_behave_like 'WordDic serializer'
  end

  describe Okura::Serializer::UnkDic::Marshal do
    it 'コンパイルして復元できる' do
      serializer = Okura::Serializer::UnkDic::Marshal.new

      cts = Okura::CharTypes.new
      cts.define_type('A', true, false, 10)
      cts.define_type('Z', false, true, 0)
      cts.define_map(0x0001, cts.named('A'), [])
      cts.define_map(0x0002, cts.named('Z'), [])

      features = Okura::Features.new
      features.add(5, 'F5')
      features.add(6, 'F6')
      features.add(9, 'F9')
      features.add(10, 'F10')

      out = StringIO.new
      serializer.compile(cts, features, as_io(<<-EOS), out)
A,5,6,3274,記号,一般,*,*,*,*,*
Z,9,10,5244,記号,空白,*,*,*,*,*
      EOS
      out.rewind

      unk = serializer.load(out)
      unk.word_templates_for('A').first.cost.should == 3274
      unk.word_templates_for('Z').first.cost.should == 5244
    end
  end

  describe Okura::Serializer::Matrix::Marshal do
    it 'コンパイルして復元できる' do
      serializer = Okura::Serializer::Matrix::Marshal.new
      out = StringIO.new
      serializer.compile(as_io(<<-EOS), out)
2 3
0 0 0
0 1 1
1 0 2
1 1 3
1 2 10
      EOS
      out.rewind

      mat = serializer.load(out)
      mat.cost(0, 0).should == 0
      mat.cost(1, 2).should == 10
    end
  end
end
|
347
|
+
|
348
|
+
# Checks that a value stored with Matrix#set is returned by Matrix#cost.
describe Okura::Matrix do
  describe '#cost' do
    it '渡された二つのFeature idを元にコストを返せる' do
      m = Okura::Matrix.new(2, 2)
      entries = [
        [0, 0, 0],
        [0, 1, 1],
        [1, 0, 2],
        [1, 1, 3]
      ]
      entries.each { |first_id, second_id, cost| m.set(first_id, second_id, cost) }

      m.cost(1, 1).should == 3
    end
  end
end
|
361
|
+
|
362
|
+
# Shared behaviour for word dictionaries. The including group must provide
# a subject that responds to #define(word) and #build.
shared_examples_for 'WordDic' do
  # subject = dict builder

  # One-argument variant of the word helper used by these examples: every
  # word gets feature 1 on both sides and a cost of 1.
  def w(surface)
    Okura::Word.new(surface, f(1), f(1), 1)
  end

  describe '#possible_words' do
    it '登録された単語のサイズを取得できる' do
      subject.build.word_size.should == 0
      subject.define(w('aaa'))
      subject.define(w('bbb'))
      subject.build.word_size.should == 2
    end

    it '同じ表記の単語を複数登録できる' do
      w1 = Okura::Word.new('w', f(1), f(2), 100)
      w2 = Okura::Word.new('w', f(10), f(20), 200)
      # w1 is deliberately registered twice.
      [w1, w1, w2].each { |word| subject.define(word) }

      wd = subject.build

      wd.possible_words('w', 0).should == [w1, w1, w2]
    end

    it '文字列と位置から、辞書に登録された単語を返せる' do
      %w(aaa bbb aa aaaa aaaaa).each { |surface| subject.define(w(surface)) }

      wd = subject.build

      wd.possible_words('bbbaaa', 0).should == [w('bbb')]
      wd.possible_words('bbbaaa', 1).should == []
      wd.possible_words('bbbaaa', 3).should == [w('aa'), w('aaa')]
    end

    it 'マルチバイト文字にも対応している' do
      subject.define(w('ニワトリ'))
      wd = subject.build

      wd.possible_words('ニワトリ', 0).should == [w('ニワトリ')]
      wd.possible_words('ニワトリ', 1).should == []
    end

    # Defines +words+, builds the dictionary and asserts that looking up
    # +str+ at position 0 yields exactly the words named in +dest+.
    def matches(words, str, dest)
      words.each { |word| subject.define(w(word)) }
      dic = subject.build
      expected = dest.map { |d| w(d) }
      dic.possible_words(str, 0).should == expected
    end

    it { matches(%w(), '', %w()) }
    it { matches(%w(), 'aaa', %w()) }
    it { matches(%w(a), '', %w()) }
    it { matches(%w(a), 'a', %w(a)) }
    it { matches(%w(a), 'aa', %w(a)) }
    it { matches(%w(a), 'b', %w()) }
    it { matches(%w(aa), 'a', %w()) }
    it { matches(%w(aa), 'aa', %w(aa)) }
    it { matches(%w(aa), 'aaa', %w(aa)) }
    it { matches(%w(aa), 'ab', %w()) }
    it { matches(%w(a aa), 'a', %w(a)) }
    it { matches(%w(a aa), 'aa', %w(a aa)) }
    it { matches(%w(a aa), 'aaa', %w(a aa)) }
    it { matches(%w(a aa), 'aab', %w(a aa)) }
    it { matches(%w(a aa ab), 'aab', %w(a aa)) }
    it { matches(%w(a aa ab), 'ab', %w(a ab)) }
    it { matches(%w(a aa ab), 'aa', %w(a aa)) }
    it { matches(%w(a b), 'ba', %w(b)) }
    it { matches(%w(アイウ), 'アイウ', %w(アイウ)) }
    it { matches(%w(ア アイ), 'アイウ', %w(ア アイ)) }
    it { matches(%w(ア アイ), 'aアイウ', %w()) }
  end
end
|
434
|
+
|
435
|
+
# Runs the shared 'WordDic' examples against Okura::WordDic::Naive.
describe Okura::WordDic::Naive do
  # Adapter giving the naive dictionary the define/build interface that the
  # shared examples expect; #build simply hands back the wrapped dictionary.
  class NaiveBuilder
    def initialize
      @wd = Okura::WordDic::Naive.new
    end

    def define(*args)
      @wd.define(*args)
    end

    def build
      @wd
    end
  end

  subject { NaiveBuilder.new }

  it_should_behave_like 'WordDic'
end
|
450
|
+
|
451
|
+
# Runs the shared 'WordDic' examples against the double-array builder.
describe Okura::WordDic::DoubleArray do
  subject { Okura::WordDic::DoubleArray::Builder.new }

  # Helpers that read a dictionary's @base, @check and @words instance
  # variables directly.
  def base(dic)
    dic.instance_variable_get(:@base)
  end

  def check(dic)
    dic.instance_variable_get(:@check)
  end

  def words(dic)
    dic.instance_variable_get(:@words)
  end

  it_should_behave_like 'WordDic'
end
|
464
|
+
|
465
|
+
# Empty placeholder group: no examples are defined for Okura::Features.
describe(Okura::Features) do
end
|
467
|
+
|
468
|
+
# Specs for Okura::CharTypes: fallback behaviour of #type_for and handling
# of additional ("compatible") types passed to #define_map.
describe Okura::CharTypes do
  describe '#type_for' do
    describe '文字に対するCharTypeが定義されていない場合' do
      describe '文字種DEFAULTが定義されている場合' do
        subject {
          char_types = Okura::CharTypes.new
          char_types.define_type('DEFAULT', false, false, 0)
          char_types
        }

        it 'CharType#default_typeが返る' do
          subject.type_for('a'.ord).name.should == subject.default_type.name
        end
      end

      describe '文字種DEFAULTが定義されてない場合' do
        # The original assigned the new object to an unused local here.
        subject { Okura::CharTypes.new }

        it 'エラーになる' do
          expect { subject.type_for('a'.ord) }.to raise_error
        end
      end
    end
  end

  describe '#define_map' do
    describe '互換カテゴリが指定された場合' do
      subject {
        char_types = Okura::CharTypes.new
        char_types.define_type('A', true, true, 10)
        char_types.define_type('B', true, true, 10)
        # Code point 1 is mapped to A with B listed as an additional type.
        char_types.define_map(1, char_types.named('A'), [char_types.named('B')])
        char_types
      }

      it '互換カテゴリが正しく認識される' do
        subject.named('A').accept?(1).should be_true
        subject.named('B').accept?(1).should be_true
      end
    end
  end
end
|
505
|
+
|
506
|
+
# Specs for Okura::UnkDic#possible_words. The third argument passed to
# possible_words in these examples is true in the groups labelled
# "辞書に単語がある場合" and false otherwise.
describe Okura::UnkDic do
  describe '#possible_words' do
    describe '互換カテゴリ' do
      subject {
        cts = Okura::CharTypes.new
        cts.define_type('KATAKANA', false, true, 0)
        cts.define_type('HIRAGANA', false, true, 0)
        cts.define_map('ア'.ord, cts.named('KATAKANA'), [])
        # 'ー' is mapped to HIRAGANA with KATAKANA as an additional type.
        cts.define_map('ー'.ord, cts.named('HIRAGANA'), [cts.named('KATAKANA')])
        ud = Okura::UnkDic.new(cts)
        ud.define('KATAKANA', f(10), f(20), 1000)
        ud.define('HIRAGANA', f(1), f(2), 1000)
        ud
      }

      it '互換カテゴリを正しく解釈する' do
        subject.possible_words('アーー', 0, false).should == [w('アーー', 10, 20, 1000)]
      end
    end

    describe '未知語定義' do
      describe '同一文字種に複数の未知語定義があった場合' do
        subject do
          cts = Okura::CharTypes.new
          cts.define_type('A', true, true, 0)
          cts.define_map('A'.ord, cts.named('A'), [])
          ud = Okura::UnkDic.new(cts)
          ud.define('A', f(10), f(20), 1000)
          ud.define('A', f(11), f(21), 1111)
          ud
        end

        it 'すべての定義から未知語を抽出する' do
          subject.possible_words('A', 0, false).should == [
            w('A', 10, 20, 1000),
            w('A', 11, 21, 1111)
          ]
        end
      end
    end
  end

  describe '#possible_words: 文字コードによる挙動:' do
    subject do
      cts = Okura::CharTypes.new
      cts.define_type('A', true, true, 0)
      cts.define_map('あ'.ord, cts.named('A'), [])
      ud = Okura::UnkDic.new(cts)
      ud.define('A', f(10), f(20), 1000)
      ud
    end

    describe 'UTF8文字列が来たとき' do
      it '正しく解析できる' do
        subject.possible_words('あいう'.encode('UTF-8'), 0, false).map(&:surface).should == %w(あ)
      end
    end

    describe 'UTF8じゃない文字列が来たとき' do
      it 'エラーになる' do
        expect { subject.possible_words('あいう'.encode('SHIFT_JIS'), 0, false) }.to raise_error
      end
    end
  end

  describe '#possible_words: 先頭文字のカテゴリによる挙動:' do
    # Builds a CharTypes table in which 'A' maps to the type under test and
    # 'Z' maps to ZZZZ. Type names T<i><g><l> encode the three values passed
    # to define_type after the name (e.g. T012 => false, true, 2).
    def create_chartypes(typename_under_test)
      cts = Okura::CharTypes.new
      cts.define_type('T000', false, false, 0)
      cts.define_type('T012', false, true, 2)
      cts.define_type('T100', true, false, 0)
      cts.define_type('T102', true, false, 2)
      cts.define_type('T110', true, true, 0)
      cts.define_type('T112', true, true, 2)
      cts.define_type('ZZZZ', true, true, 2)

      cts.define_map('A'.ord, cts.named(typename_under_test), [])
      cts.define_map('Z'.ord, cts.named('ZZZZ'), [])

      cts
    end

    # Builds an UnkDic with a single unknown-word definition for the type
    # under test; no definition is registered for ZZZZ.
    def create_subject(typename_under_test)
      udic = Okura::UnkDic.new(create_chartypes(typename_under_test))
      # The right-hand feature was a bare `(20)` (an Integer); every other
      # define call in this file passes a Feature built with f(...).
      udic.define(typename_under_test, f(10), f(20), 1000)
      udic
    end

    describe 'invoke=0のとき' do
      subject { create_subject 'T012' }

      describe '辞書に単語がある場合' do
        it '未知語を抽出しない' do
          subject.possible_words('AAA', 0, true).should be_empty
        end
      end
    end

    describe 'invoke=1のとき' do
      describe '辞書に単語がある場合' do
        subject { create_subject 'T102' }

        it 'も、未知語を抽出する' do
          subject.possible_words('AAAZ', 0, true).should_not be_empty
        end
      end

      describe '先頭文字のカテゴリに対応する未知語定義がなかった場合' do
        subject { create_subject 'T112' }

        it '未知語を抽出しない' do
          subject.possible_words('ZZ', 0, false).should be_empty
        end
      end

      describe '辞書に単語がない場合' do
        describe 'group=0のとき' do
          describe 'length=0のとき' do
            subject { create_subject 'T100' }

            it '未知語を抽出しない' do
              subject.possible_words('AAAZ', 0, false).should be_empty
            end
          end

          describe 'length=2のとき' do
            subject { create_subject 'T102' }

            it '2文字までの同種文字列を未知語とする' do
              subject.possible_words('AAAZ', 0, false).map(&:surface).should == %w(A AA)
            end
          end
        end

        describe 'group=1のとき' do
          describe 'length=0のとき' do
            subject { create_subject 'T110' }

            it '同種の文字列を長さ制限なしでまとめて未知語とする' do
              subject.possible_words('AAAAAZ', 0, false).map(&:surface).should == %w(AAAAA)
            end

            it '連続が一文字の場合も未知語として取れる' do
              subject.possible_words('AZZZ', 0, false).map(&:surface).should == %w(A)
            end

            it '1文字しかなくても正しく扱える' do
              subject.possible_words('A', 0, false).map(&:surface).should == %w(A)
            end
          end

          describe 'length=2のとき' do
            subject { create_subject 'T112' }

            it 'length=0の結果に加え、2文字までの同種文字列を未知語とする' do
              subject.possible_words('AAAAAZ', 0, false).map(&:surface).should == %w(A AA AAAAA)
            end

            it '1文字しかなくても正しく扱える' do
              subject.possible_words('A', 0, false).map(&:surface).should == %w(A)
            end

            it '2文字しかなくても正しく扱える' do
              subject.possible_words('AA', 0, false).map(&:surface).should == %w(A AA)
            end

            it '3文字しかなくても正しく扱える' do
              subject.possible_words('AAA', 0, false).map(&:surface).should == %w(A AA AAA)
            end
          end
        end
      end
    end
  end
end
|
654
|
+
|
655
|
+
# Checks the structure returned by Tagger#parse for a three-character input
# using a small naive dictionary; nil is passed as the second constructor
# argument.
describe Okura::Tagger do
  describe '#parse' do
    it '文字列を解析してNodesを返せる' do
      dic = Okura::WordDic::Naive.new
      dic.define(w('a', 1, 1, 0))
      dic.define(w('aa', 1, 1, 10))
      dic.define(w('b', 2, 2, 3))
      tagger = Okura::Tagger.new(dic, nil)

      nodes = tagger.parse('aab')

      # Positions 0 and 4 are expected to hold the BOS/EOS word.
      nodes[0][0].word.should == w('BOS/EOS', 0, 0, 0)
      nodes[4][0].word.should == w('BOS/EOS', 0, 0, 0)
      nodes[1].size.should == 2
      nodes[3][0].word.should == w('b', 2, 2, 3)
    end
  end
end
|
673
|
+
|
674
|
+
# Checks the length reported by the node built by Okura::Node.mk_bos_eos.
describe Okura::Node do
  # The group label previously read '#make_bos_eos', which does not match
  # the class method the example actually calls.
  describe '.mk_bos_eos' do
    describe '#length' do
      it 'returns 1' do
        Okura::Node.mk_bos_eos.length.should == 1
      end
    end
  end
end
|
683
|
+
|
684
|
+
# Specs for Nodes#mincost_path: nodes are added by position and the
# returned path is checked by length and by the surface of each word.
describe Okura::Nodes do
  describe '#mincost_path' do
    it '最小コストのパスを返せる' do
      mat = Okura::Matrix.new(2, 2)
      mat.set(0, 1, 10)
      mat.set(1, 0, 10)

      nodes = Okura::Nodes.new(3, mat)
      nodes.add(0, Okura::Node.mk_bos_eos)
      nodes.add(1, n(w('a', 1, 1, 10)))
      nodes.add(1, n(w('b', 1, 1, 0)))
      nodes.add(2, Okura::Node.mk_bos_eos)

      mcp = nodes.mincost_path

      # Of the two candidates at position 1, 'b' has the lower word cost.
      mcp.length.should == 3
      mcp[0].word.surface.should == 'BOS/EOS'
      mcp[1].word.surface.should == 'b'
      mcp[2].word.surface.should == 'BOS/EOS'
    end

    it '単語長が1を超えても動く' do
      mat = Okura::Matrix.new(2, 2)
      mat.set(0, 1, 10)
      mat.set(1, 0, 10)
      mat.set(1, 1, 10)

      nodes = Okura::Nodes.new(4, mat)
      nodes.add(0, Okura::Node.mk_bos_eos)
      nodes.add(1, n(w('a', 1, 1, 10)))
      nodes.add(1, n(w('bb', 1, 1, 0)))
      nodes.add(2, n(w('a', 1, 1, 10)))
      nodes.add(3, Okura::Node.mk_bos_eos)

      mcp = nodes.mincost_path

      mcp.length.should == 3
      mcp[0].word.surface.should == 'BOS/EOS'
      mcp[1].word.surface.should == 'bb'
      mcp[2].word.surface.should == 'BOS/EOS'
    end
  end
end
|