ruby-sfst 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +1 -0
- data/Manifest +31 -0
- data/README.rdoc +25 -0
- data/Rakefile +22 -0
- data/ext/sfst_machine/alphabet.C +807 -0
- data/ext/sfst_machine/alphabet.h +281 -0
- data/ext/sfst_machine/basic.C +84 -0
- data/ext/sfst_machine/basic.h +24 -0
- data/ext/sfst_machine/compact.C +616 -0
- data/ext/sfst_machine/compact.h +98 -0
- data/ext/sfst_machine/determinise.C +304 -0
- data/ext/sfst_machine/extconf.rb +4 -0
- data/ext/sfst_machine/fst-compiler.C +2375 -0
- data/ext/sfst_machine/fst-compiler.h +113 -0
- data/ext/sfst_machine/fst-compiler.yy +213 -0
- data/ext/sfst_machine/fst.C +966 -0
- data/ext/sfst_machine/fst.h +365 -0
- data/ext/sfst_machine/interface.C +1838 -0
- data/ext/sfst_machine/interface.h +94 -0
- data/ext/sfst_machine/make-compact.C +328 -0
- data/ext/sfst_machine/make-compact.h +34 -0
- data/ext/sfst_machine/mem.h +74 -0
- data/ext/sfst_machine/operators.C +1131 -0
- data/ext/sfst_machine/sfst_machine.cc +411 -0
- data/ext/sfst_machine/utf8-scanner.C +2197 -0
- data/ext/sfst_machine/utf8-scanner.ll +179 -0
- data/ext/sfst_machine/utf8.C +146 -0
- data/ext/sfst_machine/utf8.h +19 -0
- data/lib/sfst.rb +99 -0
- data/ruby-sfst.gemspec +34 -0
- data/test/test_sfst.fst +3 -0
- data/test/test_sfst.rb +119 -0
- metadata +100 -0
@@ -0,0 +1,616 @@
|
|
1
|
+
/*******************************************************************/
|
2
|
+
/* */
|
3
|
+
/* FILE compact.C */
|
4
|
+
/* MODULE compact */
|
5
|
+
/* PROGRAM SFST */
|
6
|
+
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
|
7
|
+
/* */
|
8
|
+
/* PURPOSE Code needed for analysing data */
|
9
|
+
/* */
|
10
|
+
/*******************************************************************/
|
11
|
+
|
12
|
+
#include <stdio.h>
|
13
|
+
#include <math.h>
|
14
|
+
|
15
|
+
#include <limits.h>
|
16
|
+
|
17
|
+
#include "compact.h"
|
18
|
+
|
19
|
+
using std::equal_range;
|
20
|
+
using std::vector;
|
21
|
+
using std::pair;
|
22
|
+
|
23
|
+
const int BUFFER_SIZE=1000;
|
24
|
+
|
25
|
+
|
26
|
+
/*******************************************************************/
|
27
|
+
/* */
|
28
|
+
/* CompactTransducer::convert */
|
29
|
+
/* */
|
30
|
+
/*******************************************************************/
|
31
|
+
|
32
|
+
void CompactTransducer::convert( CAnalysis &cana, Analysis &ana )

{
  // Translate a compact analysis (a sequence of arc numbers) into the
  // sequence of labels attached to those arcs.
  // "cana" is the sequence of arc numbers
  // "ana"  receives one label per arc; it is resized to match "cana"

  const size_t count = cana.size();
  ana.resize(count);

  size_t pos = 0;
  while (pos < count) {
    const unsigned int arc = cana[pos];
    ana[pos] = label[arc];
    ++pos;
  }
}
|
39
|
+
|
40
|
+
|
41
|
+
/*******************************************************************/
|
42
|
+
/* */
|
43
|
+
/* CompactTransducer::analyze */
|
44
|
+
/* */
|
45
|
+
/*******************************************************************/
|
46
|
+
|
47
|
+
void CompactTransducer::analyze(unsigned int n, vector<Character> &input,
|
48
|
+
size_t ipos, CAnalysis &ca,
|
49
|
+
vector<CAnalysis> &analyses )
|
50
|
+
{
|
51
|
+
// "n" is the number of the current transducer node/state
|
52
|
+
// "input" is the sequence of input symbols
|
53
|
+
// "ipos" is the input position currently analysed
|
54
|
+
// "ca" stores the incomplete analysis string
|
55
|
+
// "analyses" stores the analyses found so far
|
56
|
+
|
57
|
+
if (analyses.size() > 10000)
|
58
|
+
return; // limit the maximal number of analyses
|
59
|
+
|
60
|
+
// Is the input string fully analyzed and the current node a final node?
|
61
|
+
if (finalp[n] && ipos == input.size())
|
62
|
+
// store the new analysis
|
63
|
+
analyses.push_back(ca);
|
64
|
+
|
65
|
+
// follow the epsilon transitions
|
66
|
+
// first_arc[n] is the number of the first outgoing transition of node n
|
67
|
+
// first_arc[n+1]-1 is the number of the last outgoing transition of node n
|
68
|
+
// first_arc[n+1] is the number of the first outgoing transition of node n+1
|
69
|
+
unsigned int i;
|
70
|
+
for( i=first_arc[n];
|
71
|
+
i<first_arc[n+1] && label[i].upper_char() == Label::epsilon;
|
72
|
+
i++)
|
73
|
+
{
|
74
|
+
ca.push_back(i);
|
75
|
+
analyze(target_node[i], input, ipos, ca, analyses);
|
76
|
+
ca.pop_back();
|
77
|
+
}
|
78
|
+
|
79
|
+
// follow the non-epsilon transitions
|
80
|
+
|
81
|
+
// scan the next input symbol
|
82
|
+
if (ipos < input.size()) {
|
83
|
+
// find the set of arcs with matching upper character in the sorted list
|
84
|
+
pair<Label*,Label*>range =
|
85
|
+
equal_range(label+i, label+first_arc[n+1], Label(input[ipos]));
|
86
|
+
unsigned int to = (unsigned int)(range.second - label);
|
87
|
+
|
88
|
+
// follow the non-epsilon transitions
|
89
|
+
for( i=range.first-label; i<to; i++) {
|
90
|
+
ca.push_back(i);
|
91
|
+
analyze(target_node[i], input, ipos+1, ca, analyses);
|
92
|
+
ca.pop_back();
|
93
|
+
}
|
94
|
+
}
|
95
|
+
}
|
96
|
+
|
97
|
+
|
98
|
+
/*******************************************************************/
|
99
|
+
/* */
|
100
|
+
/* CompactTransducer::analyze_string */
|
101
|
+
/* */
|
102
|
+
/*******************************************************************/
|
103
|
+
|
104
|
+
void CompactTransducer::analyze_string( char *s, vector<CAnalysis> &analyses )
|
105
|
+
|
106
|
+
{
|
107
|
+
// "s" input string to be analyzed
|
108
|
+
// "analyses" is the data structure in which the results are stored
|
109
|
+
// and returned
|
110
|
+
|
111
|
+
vector<Character> input;
|
112
|
+
alphabet.string2symseq( s, input );
|
113
|
+
|
114
|
+
analyses.clear();
|
115
|
+
CAnalysis ca; // data structure where the current incomplete analysis
|
116
|
+
// is stored
|
117
|
+
analyze(0, input, 0, ca, analyses); // start the analysis
|
118
|
+
|
119
|
+
if (analyses.size() > 10000)
|
120
|
+
fprintf(stderr,"Warning: Only the first 10000 analyses considered for \"%s\"!\n", s);
|
121
|
+
|
122
|
+
if (simplest_only && analyses.size() > 1)
|
123
|
+
disambiguate( analyses ); // select the simplest analyses
|
124
|
+
}
|
125
|
+
|
126
|
+
|
127
|
+
|
128
|
+
/*******************************************************************/
|
129
|
+
/* */
|
130
|
+
/* CompactTransducer::~CompactTransducer */
|
131
|
+
/* */
|
132
|
+
/*******************************************************************/
|
133
|
+
|
134
|
+
CompactTransducer::~CompactTransducer()

{
  // Release all arrays owned by the transducer.
  // delete[] on a NULL pointer is a no-op, so arrays which were never
  // allocated (NULL members) are handled as well.

  // per-node arrays
  delete[] first_arc;
  delete[] finalp;
  delete[] final_logprob;

  // per-arc arrays
  delete[] target_node;
  delete[] label;
  delete[] arc_logprob;
}
|
144
|
+
|
145
|
+
|
146
|
+
/*******************************************************************/
|
147
|
+
/* */
|
148
|
+
/* CompactTransducer::CompactTransducer */
|
149
|
+
/* */
|
150
|
+
/*******************************************************************/
|
151
|
+
|
152
|
+
CompactTransducer::CompactTransducer()
  // Dummy constructor: creates an empty transducer without nodes, arcs
  // or probability tables; both output options are switched off.
  : number_of_nodes(0), finalp(NULL), first_arc(NULL),
    number_of_arcs(0), label(NULL), target_node(NULL),
    final_logprob((float*)NULL), arc_logprob((float*)NULL),
    both_layers(false), simplest_only(false)

{
}
|
165
|
+
|
166
|
+
|
167
|
+
/*******************************************************************/
|
168
|
+
/* */
|
169
|
+
/* CompactTransducer::read_finalp */
|
170
|
+
/* */
|
171
|
+
/*******************************************************************/
|
172
|
+
|
173
|
+
void CompactTransducer::read_finalp( FILE *file )
|
174
|
+
|
175
|
+
{
|
176
|
+
int k=0;
|
177
|
+
unsigned char n=0;
|
178
|
+
for( size_t i=0; i<number_of_nodes; i++ ) {
|
179
|
+
if (k == 0) {
|
180
|
+
n = (unsigned char)fgetc(file);
|
181
|
+
k = 8;
|
182
|
+
}
|
183
|
+
k--;
|
184
|
+
if (n & (1 << k))
|
185
|
+
finalp[i] = 1;
|
186
|
+
else
|
187
|
+
finalp[i] = 0;
|
188
|
+
}
|
189
|
+
}
|
190
|
+
|
191
|
+
|
192
|
+
/*******************************************************************/
|
193
|
+
/* */
|
194
|
+
/* CompactTransducer::read_first_arcs */
|
195
|
+
/* */
|
196
|
+
/*******************************************************************/
|
197
|
+
|
198
|
+
void CompactTransducer::read_first_arcs( FILE *file )
|
199
|
+
|
200
|
+
{
|
201
|
+
int k=0;
|
202
|
+
unsigned int n=0;
|
203
|
+
size_t bits=(size_t)ceil(log(number_of_arcs+1)/log(2));
|
204
|
+
|
205
|
+
for( size_t i=0; i<=number_of_nodes; i++ ) {
|
206
|
+
first_arc[i] = n >> (sizeof(n)*8 - bits);
|
207
|
+
n <<= bits;
|
208
|
+
k -= bits;
|
209
|
+
if (k < 0) {
|
210
|
+
read_num(&n,sizeof(n),file);
|
211
|
+
first_arc[i] |= n >> (sizeof(n)*8 + k);
|
212
|
+
n <<= -k;
|
213
|
+
k += sizeof(n) * 8;
|
214
|
+
}
|
215
|
+
}
|
216
|
+
}
|
217
|
+
|
218
|
+
|
219
|
+
/*******************************************************************/
|
220
|
+
/* */
|
221
|
+
/* CompactTransducer::read_target_nodes */
|
222
|
+
/* */
|
223
|
+
/*******************************************************************/
|
224
|
+
|
225
|
+
void CompactTransducer::read_target_nodes( FILE *file )
|
226
|
+
|
227
|
+
{
|
228
|
+
int k=0;
|
229
|
+
unsigned int n=0;
|
230
|
+
size_t bits=(size_t)ceil(log(number_of_nodes)/log(2));
|
231
|
+
|
232
|
+
for( size_t i=0; i<number_of_arcs; i++ ) {
|
233
|
+
target_node[i] = n >> (sizeof(n)*8 - bits);
|
234
|
+
n <<= bits;
|
235
|
+
k -= bits;
|
236
|
+
if (k < 0) {
|
237
|
+
read_num(&n,sizeof(n),file);
|
238
|
+
target_node[i] |= n >> (sizeof(n)*8 + k);
|
239
|
+
n <<= -k;
|
240
|
+
k += sizeof(n) * 8;
|
241
|
+
}
|
242
|
+
}
|
243
|
+
}
|
244
|
+
|
245
|
+
|
246
|
+
/*******************************************************************/
|
247
|
+
/* */
|
248
|
+
/* CompactTransducer::read_labels */
|
249
|
+
/* */
|
250
|
+
/*******************************************************************/
|
251
|
+
|
252
|
+
void CompactTransducer::read_labels( FILE *file )
|
253
|
+
|
254
|
+
{
|
255
|
+
size_t N=0;
|
256
|
+
Label Num2Label[alphabet.size()];
|
257
|
+
for( Alphabet::const_iterator it=alphabet.begin();
|
258
|
+
it != alphabet.end(); it++ )
|
259
|
+
{
|
260
|
+
Label l=*it;
|
261
|
+
Num2Label[N++] = l;
|
262
|
+
}
|
263
|
+
|
264
|
+
int k=0;
|
265
|
+
unsigned int n=0;
|
266
|
+
size_t bits=(size_t)ceil(log(alphabet.size())/log(2));
|
267
|
+
|
268
|
+
for( size_t i=0; i<number_of_arcs; i++ ) {
|
269
|
+
unsigned int l = n >> (sizeof(n)*8 - bits);
|
270
|
+
n <<= bits;
|
271
|
+
k -= bits;
|
272
|
+
if (k < 0) {
|
273
|
+
read_num(&n,sizeof(n),file);
|
274
|
+
l |= n >> (sizeof(n)*8 + k);
|
275
|
+
n <<= -k;
|
276
|
+
k += sizeof(n) * 8;
|
277
|
+
}
|
278
|
+
label[i] = Num2Label[l];
|
279
|
+
}
|
280
|
+
}
|
281
|
+
|
282
|
+
|
283
|
+
/*******************************************************************/
|
284
|
+
/* */
|
285
|
+
/* CompactTransducer::read_probs */
|
286
|
+
/* */
|
287
|
+
/*******************************************************************/
|
288
|
+
|
289
|
+
void CompactTransducer::read_probs( FILE *file )
|
290
|
+
|
291
|
+
{
|
292
|
+
size_t n,m;
|
293
|
+
fread(&n, sizeof(n), 1, file);
|
294
|
+
if (fread(&m, sizeof(n), 1, file) != 1 ||
|
295
|
+
n != node_count() || m != arc_count())
|
296
|
+
{
|
297
|
+
fprintf(stderr,"Error: incompatible probability file!\n");
|
298
|
+
exit(1);
|
299
|
+
}
|
300
|
+
final_logprob = new float[n];
|
301
|
+
arc_logprob = new float[m];
|
302
|
+
fread(final_logprob, sizeof(float), n, file);
|
303
|
+
if (fread(arc_logprob, sizeof(float), n, file) != n) {
|
304
|
+
fprintf(stderr,"Error: in probability file!\n");
|
305
|
+
exit(1);
|
306
|
+
}
|
307
|
+
}
|
308
|
+
|
309
|
+
|
310
|
+
/*******************************************************************/
|
311
|
+
/* */
|
312
|
+
/* CompactTransducer::CompactTransducer */
|
313
|
+
/* */
|
314
|
+
/*******************************************************************/
|
315
|
+
|
316
|
+
CompactTransducer::CompactTransducer( FILE *file, FILE *pfile )

{
  // Read a compact transducer from "file" and, if "pfile" is not NULL,
  // its stochastic parameters from "pfile".
  // Throws a C string (const char*) if "file" does not start with the
  // marker character 'c'.

  both_layers = false;
  simplest_only = false;

  // Put the object into a well-defined empty state first, so that the
  // destructor never sees uninitialised pointers when reading fails.
  number_of_nodes = 0;
  number_of_arcs = 0;
  finalp = NULL;
  first_arc = NULL;
  label = NULL;
  target_node = NULL;
  arc_logprob = final_logprob = (float*)NULL;

  if (fgetc(file) != 'c')
    throw "Error: wrong file format (not a compact transducer)\n";

  alphabet.read(file);

  read_num(&number_of_nodes,sizeof(number_of_nodes),file);
  read_num(&number_of_arcs,sizeof(number_of_arcs),file);

  if (!ferror(file)) {
    // memory allocation
    finalp = new char[number_of_nodes];
    first_arc = new unsigned[number_of_nodes+1];
    label = new Label[number_of_arcs];
    target_node = new unsigned[number_of_arcs];

    // reading the data (the order is given by the file layout)
    read_finalp(file);
    read_first_arcs(file);
    read_labels(file);
    read_target_nodes(file);
  }

  // the probability tables stay NULL unless a probability file is given
  if (pfile != NULL)
    read_probs(pfile);
}
|
349
|
+
|
350
|
+
|
351
|
+
/*******************************************************************/
|
352
|
+
/* */
|
353
|
+
/* CompactTransducer::longest_match2 */
|
354
|
+
/* */
|
355
|
+
/*******************************************************************/
|
356
|
+
|
357
|
+
void CompactTransducer::longest_match2(unsigned int n, char *string, int l,
|
358
|
+
CAnalysis &ca, int &bl, CAnalysis &ba)
|
359
|
+
{
|
360
|
+
// n: transducer state
|
361
|
+
// string: rest string
|
362
|
+
// l: length of current analysis
|
363
|
+
// bl: length of the currently longest match
|
364
|
+
// ca: current analysis
|
365
|
+
// ba: best analysis
|
366
|
+
|
367
|
+
if (finalp[n] && l > bl) {
|
368
|
+
// store the new analysis
|
369
|
+
bl = l;
|
370
|
+
ba = ca; // copy the arc vector
|
371
|
+
}
|
372
|
+
|
373
|
+
// follow the epsilon transitions
|
374
|
+
unsigned int i;
|
375
|
+
for( i=first_arc[n];
|
376
|
+
i<first_arc[n+1] && label[i].upper_char() == Label::epsilon;
|
377
|
+
i++)
|
378
|
+
{
|
379
|
+
ca.push_back(i);
|
380
|
+
longest_match2(target_node[i], string, l, ca, bl, ba);
|
381
|
+
ca.pop_back();
|
382
|
+
}
|
383
|
+
|
384
|
+
// follow the non-epsilon transitions
|
385
|
+
char *end=string;
|
386
|
+
int c=alphabet.next_code(end, false);
|
387
|
+
l += end-string;
|
388
|
+
if (c != EOF) {
|
389
|
+
// find the set of arcs with matching upper character in the sort list
|
390
|
+
pair<Label*,Label*>range =
|
391
|
+
equal_range(label+i, label+first_arc[n+1], Label((Character)c));
|
392
|
+
unsigned int to = (unsigned int)(range.second - label);
|
393
|
+
for( i=range.first-label; i<to; i++) {
|
394
|
+
ca.push_back(i);
|
395
|
+
longest_match2(target_node[i], end, l, ca, bl, ba);
|
396
|
+
ca.pop_back();
|
397
|
+
}
|
398
|
+
}
|
399
|
+
}
|
400
|
+
|
401
|
+
|
402
|
+
/*******************************************************************/
|
403
|
+
/* */
|
404
|
+
/* CompactTransducer::print_analysis */
|
405
|
+
/* */
|
406
|
+
/*******************************************************************/
|
407
|
+
|
408
|
+
char *CompactTransducer::print_analysis( CAnalysis &cana )

{
  // Return the printed form of the compact analysis "cana".
  // The arc numbers are first converted into labels; the result is
  // whatever alphabet.print_analysis returns for these labels and the
  // current setting of "both_layers".

  Analysis labels;
  convert(cana, labels);

  char *text = alphabet.print_analysis( labels, both_layers );
  return text;
}
|
415
|
+
|
416
|
+
|
417
|
+
/*******************************************************************/
|
418
|
+
/* */
|
419
|
+
/* CompactTransducer::longest_match */
|
420
|
+
/* */
|
421
|
+
/*******************************************************************/
|
422
|
+
|
423
|
+
const char *CompactTransducer::longest_match( char* &string )
|
424
|
+
|
425
|
+
{
|
426
|
+
vector<char> analysis;
|
427
|
+
CAnalysis ca, ba;
|
428
|
+
int l=0;
|
429
|
+
longest_match2(0, string, 0, ca, l, ba);
|
430
|
+
|
431
|
+
// no match? return the next character
|
432
|
+
if (ba.size() == 0) {
|
433
|
+
int c=alphabet.next_code(string, false);
|
434
|
+
return alphabet.code2symbol(c);
|
435
|
+
}
|
436
|
+
|
437
|
+
string += l;
|
438
|
+
return print_analysis( ba );
|
439
|
+
}
|
440
|
+
|
441
|
+
|
442
|
+
/*******************************************************************/
|
443
|
+
/* */
|
444
|
+
/* CompactTransducer::disambiguate */
|
445
|
+
/* */
|
446
|
+
/*******************************************************************/
|
447
|
+
|
448
|
+
void CompactTransducer::disambiguate( vector<CAnalysis> &analyses )

{
  // Keep only those analyses whose score (as computed by
  // alphabet.compute_score) is maximal; their relative order is preserved.

  const size_t total = analyses.size();

  // pass 1: score every analysis and remember the best score
  vector<int> score;
  Analysis ana;
  int bestscore = INT_MIN;
  for( size_t i=0; i<total; i++ ) {
    convert(analyses[i], ana);
    const int current = alphabet.compute_score(ana);
    score.push_back(current);
    if (current > bestscore)
      bestscore = current;
  }

  // pass 2: move the best analyses to the front and cut off the rest
  size_t kept = 0;
  for( size_t i=0; i<total; i++ ) {
    if (score[i] != bestscore)
      continue;
    analyses[kept] = analyses[i];
    ++kept;
  }
  analyses.resize(kept);
}
|
470
|
+
|
471
|
+
|
472
|
+
/*******************************************************************/
|
473
|
+
/* */
|
474
|
+
/* CompactTransducer::train2 */
|
475
|
+
/* */
|
476
|
+
/*******************************************************************/
|
477
|
+
|
478
|
+
bool CompactTransducer::train2( char *s, vector<double> &arcfreq,
|
479
|
+
vector<double> &finalfreq )
|
480
|
+
{
|
481
|
+
vector<CAnalysis> analyses;
|
482
|
+
vector<Label> input;
|
483
|
+
alphabet.string2labelseq( s, input );
|
484
|
+
|
485
|
+
CAnalysis ca; // data structure where the analysis is stored
|
486
|
+
unsigned int n=0;
|
487
|
+
bool failure=false;
|
488
|
+
for( size_t i=0; i<input.size(); i++ ) {
|
489
|
+
failure = true;
|
490
|
+
for( unsigned int k=first_arc[n]; k<first_arc[n+1]; k++) {
|
491
|
+
if (label[k] == input[i]) {
|
492
|
+
ca.push_back(k);
|
493
|
+
n = target_node[k];
|
494
|
+
failure = false;
|
495
|
+
break;
|
496
|
+
}
|
497
|
+
}
|
498
|
+
if (failure)
|
499
|
+
break;
|
500
|
+
}
|
501
|
+
if (failure || !finalp[n]) {
|
502
|
+
fprintf(stderr,"Warning: The following input is not covered:\n%s\n", s);
|
503
|
+
return false;
|
504
|
+
}
|
505
|
+
|
506
|
+
for( size_t k=0; k<ca.size(); k++ )
|
507
|
+
arcfreq[ca[k]]++;
|
508
|
+
finalfreq[target_node[ca.back()]]++;
|
509
|
+
|
510
|
+
return true;
|
511
|
+
}
|
512
|
+
|
513
|
+
|
514
|
+
/*******************************************************************/
|
515
|
+
/* */
|
516
|
+
/* CompactTransducer::train */
|
517
|
+
/* */
|
518
|
+
/*******************************************************************/
|
519
|
+
|
520
|
+
bool CompactTransducer::train( char *s, vector<double> &arcfreq,
|
521
|
+
vector<double> &finalfreq )
|
522
|
+
{
|
523
|
+
vector<CAnalysis> analyses;
|
524
|
+
vector<Character> input;
|
525
|
+
alphabet.string2symseq( s, input );
|
526
|
+
|
527
|
+
CAnalysis ca; // data structure where the current incomplete analysis
|
528
|
+
// is stored
|
529
|
+
analyze(0, input, 0, ca, analyses); // start the analysis
|
530
|
+
|
531
|
+
if (analyses.size() > 10000)
|
532
|
+
return true; // ignore inputs with more than 10000 analyses
|
533
|
+
else if (analyses.size() == 0)
|
534
|
+
return false;
|
535
|
+
|
536
|
+
if (simplest_only && analyses.size() > 1)
|
537
|
+
disambiguate( analyses ); // select the simplest analyses
|
538
|
+
|
539
|
+
if (analyses.size() > 0) {
|
540
|
+
double incr = 1.0 / analyses.size();
|
541
|
+
CAnalysis arcs;
|
542
|
+
|
543
|
+
for( size_t i=0; i<analyses.size(); i++ ) {
|
544
|
+
CAnalysis &arcs=analyses[i];
|
545
|
+
for( size_t k=0; k<arcs.size(); k++ )
|
546
|
+
arcfreq[arcs[k]] += incr;
|
547
|
+
finalfreq[target_node[arcs.back()]] += incr;
|
548
|
+
}
|
549
|
+
}
|
550
|
+
return true;
|
551
|
+
}
|
552
|
+
|
553
|
+
|
554
|
+
/*******************************************************************/
|
555
|
+
/* */
|
556
|
+
/* CompactTransducer::estimate_probs */
|
557
|
+
/* */
|
558
|
+
/*******************************************************************/
|
559
|
+
|
560
|
+
void CompactTransducer::estimate_probs( vector<double> &arcfreq,
					vector<double> &finalfreq )
{
  // Turn the frequencies into probabilities, in place: for every node
  // the final frequency and the frequencies of its outgoing arcs are
  // divided by their total, so that they sum up to one.  Nodes with a
  // total of zero are left with zero values.

  const size_t nodes = finalfreq.size();
  for( size_t node=0; node<nodes; node++ ) {
    // outgoing arcs of this node: [from, to)
    const size_t from = first_arc[node];
    const size_t to = first_arc[node+1];

    // total mass of the node: final frequency first, then the arcs
    double total = finalfreq[node];
    for( size_t a=from; a<to; a++ )
      total += arcfreq[a];

    // avoid a division by zero for nodes which were never visited
    if (total == 0.0)
      total = 1.0;

    finalfreq[node] = finalfreq[node] / total;
    for( size_t a=from; a<to; a++ )
      arcfreq[a] = arcfreq[a] / total;
  }
}
|
575
|
+
|
576
|
+
|
577
|
+
|
578
|
+
/*******************************************************************/
|
579
|
+
/* */
|
580
|
+
/* CompactTransducer::compute_probs */
|
581
|
+
/* */
|
582
|
+
/*******************************************************************/
|
583
|
+
|
584
|
+
void CompactTransducer::compute_probs( vector<CAnalysis> &analyses,
				       vector<double> &prob )
{
  // Compute the probability of each analysis and sort the analyses by
  // decreasing probability.
  // "analyses" is reordered in place
  // "prob"     receives the probabilities in the new order, normalised
  //            by the sum over all analyses
  // The probability of an analysis is exp() of the sum of the
  // arc_logprob values of its arcs plus the final_logprob value of the
  // node in which it ends.
  // NOTE(review): arc_logprob and final_logprob are dereferenced
  // without a NULL check; presumably the caller only uses this function
  // on transducers loaded with a probability file - confirm.

  prob.resize(analyses.size());
  double sum=0.0;
  for( size_t i=0; i<analyses.size(); i++ ) {
    CAnalysis &a=analyses[i];

    // compute the probability
    double logprob=0.0;
    for( size_t k=0; k<a.size(); k++ )
      logprob += arc_logprob[a[k]];

    // An empty path (empty input, final start node) ends in the start
    // node 0; calling back() on an empty vector would be undefined.
    unsigned int last_node = a.empty() ? 0 : target_node[a.back()];
    logprob += final_logprob[last_node];
    prob[i] = exp(logprob);
    sum += prob[i];
  }

  // sort the analyses: repeatedly pick the remaining analysis with the
  // highest probability (the first one in case of ties)
  vector<CAnalysis> oldanalyses(analyses);
  vector<double> oldprob(prob);
  for( size_t i=0; i<analyses.size(); i++ ) {
    prob[i] = -1.0;
    size_t n=0;
    for( size_t k=0; k<oldanalyses.size(); k++ )
      if (prob[i] < oldprob[k]) {
	prob[i] = oldprob[k];
	n = k;
      }
    analyses[i] = oldanalyses[n];
    oldprob[n] = -1.0;   // mark as used; real probabilities are >= 0
    prob[i] /= sum; // normalization
  }
}
|
@@ -0,0 +1,98 @@
|
|
1
|
+
/*******************************************************************/
|
2
|
+
/* */
|
3
|
+
/* FILE compact.h */
|
4
|
+
/* MODULE compact */
|
5
|
+
/* PROGRAM SFST */
|
6
|
+
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
|
7
|
+
/* */
|
8
|
+
/* PURPOSE finite state tools */
|
9
|
+
/* */
|
10
|
+
/*******************************************************************/
|
11
|
+
|
12
|
+
#ifndef _COMPACT_H_
|
13
|
+
#define _COMPACT_H_
|
14
|
+
|
15
|
+
#include "alphabet.h"
|
16
|
+
|
17
|
+
#include <vector>
|
18
|
+
|
19
|
+
typedef std::vector<unsigned int> CAnalysis;
|
20
|
+
|
21
|
+
class CompactTransducer {
|
22
|
+
|
23
|
+
protected:
|
24
|
+
|
25
|
+
// the following data structures are used to store the nodes
|
26
|
+
|
27
|
+
unsigned int number_of_nodes; // number of nodes in the transducer
|
28
|
+
char *finalp; // finalp[i] is 1 if node i is final and 0 otherwise
|
29
|
+
unsigned int *first_arc; // first_arc[i] is the number of the first
|
30
|
+
// arc outgoing from node i
|
31
|
+
|
32
|
+
// the following data structures are used to store the transition arcs
|
33
|
+
|
34
|
+
unsigned int number_of_arcs; // total number of arcs in the transducer
|
35
|
+
Label *label; // the label (character pair) of arc i
|
36
|
+
unsigned int *target_node; // target node of arc i
|
37
|
+
|
38
|
+
// the following data structures are used to store the stochastic parameters
|
39
|
+
float *final_logprob;
|
40
|
+
float *arc_logprob;
|
41
|
+
|
42
|
+
// functions needed to read the transducer from a file
|
43
|
+
|
44
|
+
void read_finalp( FILE *file );
|
45
|
+
void read_first_arcs( FILE *file );
|
46
|
+
void read_target_nodes( FILE *file );
|
47
|
+
void read_labels( FILE *file );
|
48
|
+
void read_probs( FILE *file );
|
49
|
+
|
50
|
+
// functions needed to analyze data with the transducer
|
51
|
+
|
52
|
+
void analyze( unsigned int n, std::vector<Character> &ch, size_t ipos,
|
53
|
+
CAnalysis&, std::vector<CAnalysis>&);
|
54
|
+
|
55
|
+
// function selecting the simplest morphological analysis
|
56
|
+
|
57
|
+
int compute_score( CAnalysis &ana );
|
58
|
+
void disambiguate( std::vector<CAnalysis> &analyses );
|
59
|
+
|
60
|
+
// functions for longest-match analysis of input data
|
61
|
+
|
62
|
+
void longest_match2(unsigned int, char*, int, CAnalysis&, int&, CAnalysis&);
|
63
|
+
|
64
|
+
void convert( CAnalysis &cana, Analysis &ana );
|
65
|
+
|
66
|
+
public:
|
67
|
+
size_t node_count() { return number_of_nodes; };
|
68
|
+
size_t arc_count() { return number_of_arcs; };
|
69
|
+
|
70
|
+
bool both_layers; // print surface and analysis symbols
|
71
|
+
bool simplest_only; // print only the simplest analyses
|
72
|
+
|
73
|
+
Alphabet alphabet; // data structure which maps symbols to numeric codes
|
74
|
+
CompactTransducer(); // dummy constructor
|
75
|
+
CompactTransducer( FILE*, FILE *pfile=NULL ); // reads a (stochastic) transducer
|
76
|
+
~CompactTransducer(); // destroys a transducer
|
77
|
+
|
78
|
+
// the analysis function returns the set of analyses for the string "s"
|
79
|
+
// in the argument "analyses"
|
80
|
+
void analyze_string( char *s, std::vector<CAnalysis > &analyses );
|
81
|
+
|
82
|
+
void compute_probs( std::vector<CAnalysis> &analyses, std::vector<double> &prob );
|
83
|
+
char *print_analysis( CAnalysis &ana );
|
84
|
+
|
85
|
+
// longest-match analysis
|
86
|
+
const char *longest_match( char*& );
|
87
|
+
|
88
|
+
// EM training
|
89
|
+
bool train2( char *s, std::vector<double> &arcfreq, std::vector<double> &finalfreq );
|
90
|
+
bool train( char *s, std::vector<double> &arcfreq, std::vector<double> &finalfreq );
|
91
|
+
void estimate_probs( std::vector<double> &arcfreq, std::vector<double> &finalfreq );
|
92
|
+
|
93
|
+
// robust analysis
|
94
|
+
float robust_analyze_string( char *string, std::vector<CAnalysis> &analyses,
|
95
|
+
float ErrorsAllowed );
|
96
|
+
};
|
97
|
+
|
98
|
+
#endif
|