ruby-sfst 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +1 -0
- data/Manifest +31 -0
- data/README.rdoc +25 -0
- data/Rakefile +22 -0
- data/ext/sfst_machine/alphabet.C +807 -0
- data/ext/sfst_machine/alphabet.h +281 -0
- data/ext/sfst_machine/basic.C +84 -0
- data/ext/sfst_machine/basic.h +24 -0
- data/ext/sfst_machine/compact.C +616 -0
- data/ext/sfst_machine/compact.h +98 -0
- data/ext/sfst_machine/determinise.C +304 -0
- data/ext/sfst_machine/extconf.rb +4 -0
- data/ext/sfst_machine/fst-compiler.C +2375 -0
- data/ext/sfst_machine/fst-compiler.h +113 -0
- data/ext/sfst_machine/fst-compiler.yy +213 -0
- data/ext/sfst_machine/fst.C +966 -0
- data/ext/sfst_machine/fst.h +365 -0
- data/ext/sfst_machine/interface.C +1838 -0
- data/ext/sfst_machine/interface.h +94 -0
- data/ext/sfst_machine/make-compact.C +328 -0
- data/ext/sfst_machine/make-compact.h +34 -0
- data/ext/sfst_machine/mem.h +74 -0
- data/ext/sfst_machine/operators.C +1131 -0
- data/ext/sfst_machine/sfst_machine.cc +411 -0
- data/ext/sfst_machine/utf8-scanner.C +2197 -0
- data/ext/sfst_machine/utf8-scanner.ll +179 -0
- data/ext/sfst_machine/utf8.C +146 -0
- data/ext/sfst_machine/utf8.h +19 -0
- data/lib/sfst.rb +99 -0
- data/ruby-sfst.gemspec +34 -0
- data/test/test_sfst.fst +3 -0
- data/test/test_sfst.rb +119 -0
- metadata +100 -0
@@ -0,0 +1,616 @@
|
|
1
|
+
/*******************************************************************/
|
2
|
+
/* */
|
3
|
+
/* FILE compact.C */
|
4
|
+
/* MODULE compact */
|
5
|
+
/* PROGRAM SFST */
|
6
|
+
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
|
7
|
+
/* */
|
8
|
+
/* PURPOSE Code needed for analysing data */
|
9
|
+
/* */
|
10
|
+
/*******************************************************************/
|
11
|
+
|
12
|
+
#include <stdio.h>
|
13
|
+
#include <math.h>
|
14
|
+
|
15
|
+
#include <limits.h>
|
16
|
+
|
17
|
+
#include "compact.h"
|
18
|
+
|
19
|
+
using std::equal_range;
|
20
|
+
using std::vector;
|
21
|
+
using std::pair;
|
22
|
+
|
23
|
+
// NOTE(review): not referenced anywhere in the visible part of this file;
// presumably a leftover buffer length -- confirm before removing.
const int BUFFER_SIZE=1000;
|
24
|
+
|
25
|
+
|
26
|
+
/*******************************************************************/
|
27
|
+
/* */
|
28
|
+
/* CompactTransducer::convert */
|
29
|
+
/* */
|
30
|
+
/*******************************************************************/
|
31
|
+
|
32
|
+
void CompactTransducer::convert( CAnalysis &cana, Analysis &ana )

{
  // "cana" is an analysis given as a sequence of arc numbers
  // "ana"  receives the label of each of these arcs, in the same order

  const size_t length = cana.size();
  ana.resize(length);

  size_t pos = 0;
  while (pos < length) {
    ana[pos] = label[cana[pos]];
    ++pos;
  }
}
|
39
|
+
|
40
|
+
|
41
|
+
/*******************************************************************/
|
42
|
+
/* */
|
43
|
+
/* CompactTransducer::analyze */
|
44
|
+
/* */
|
45
|
+
/*******************************************************************/
|
46
|
+
|
47
|
+
void CompactTransducer::analyze(unsigned int n, vector<Character> &input,
|
48
|
+
size_t ipos, CAnalysis &ca,
|
49
|
+
vector<CAnalysis> &analyses )
|
50
|
+
{
|
51
|
+
// "n" is the number of the current transducer node/state
|
52
|
+
// "input" is the sequence of input symbols
|
53
|
+
// "ipos" is the input position currently analysed
|
54
|
+
// "ca" stores the incomplete analysis string
|
55
|
+
// "analyses" stores the analyses found so far
|
56
|
+
|
57
|
+
if (analyses.size() > 10000)
|
58
|
+
return; // limit the maximal number of analyses
|
59
|
+
|
60
|
+
// Is the input string fully analyzed and the current node a final node?
|
61
|
+
if (finalp[n] && ipos == input.size())
|
62
|
+
// store the new analysis
|
63
|
+
analyses.push_back(ca);
|
64
|
+
|
65
|
+
// follow the epsilon transitions
|
66
|
+
// first_arc[n] is the number of the first outgoing transition of node n
|
67
|
+
// first_arc[n+1]-1 is the number of the last outgoing transition of node n
|
68
|
+
// first_arc[n+1] is the number of the first outgoing transition of node n+1
|
69
|
+
unsigned int i;
|
70
|
+
for( i=first_arc[n];
|
71
|
+
i<first_arc[n+1] && label[i].upper_char() == Label::epsilon;
|
72
|
+
i++)
|
73
|
+
{
|
74
|
+
ca.push_back(i);
|
75
|
+
analyze(target_node[i], input, ipos, ca, analyses);
|
76
|
+
ca.pop_back();
|
77
|
+
}
|
78
|
+
|
79
|
+
// follow the non-epsilon transitions
|
80
|
+
|
81
|
+
// scan the next input symbol
|
82
|
+
if (ipos < input.size()) {
|
83
|
+
// find the set of arcs with matching upper character in the sorted list
|
84
|
+
pair<Label*,Label*>range =
|
85
|
+
equal_range(label+i, label+first_arc[n+1], Label(input[ipos]));
|
86
|
+
unsigned int to = (unsigned int)(range.second - label);
|
87
|
+
|
88
|
+
// follow the non-epsilon transitions
|
89
|
+
for( i=range.first-label; i<to; i++) {
|
90
|
+
ca.push_back(i);
|
91
|
+
analyze(target_node[i], input, ipos+1, ca, analyses);
|
92
|
+
ca.pop_back();
|
93
|
+
}
|
94
|
+
}
|
95
|
+
}
|
96
|
+
|
97
|
+
|
98
|
+
/*******************************************************************/
|
99
|
+
/* */
|
100
|
+
/* CompactTransducer::analyze_string */
|
101
|
+
/* */
|
102
|
+
/*******************************************************************/
|
103
|
+
|
104
|
+
void CompactTransducer::analyze_string( char *s, vector<CAnalysis> &analyses )
|
105
|
+
|
106
|
+
{
|
107
|
+
// "s" input string to be analyzed
|
108
|
+
// "analyses" is the data structure in which the results are stored
|
109
|
+
// and returned
|
110
|
+
|
111
|
+
vector<Character> input;
|
112
|
+
alphabet.string2symseq( s, input );
|
113
|
+
|
114
|
+
analyses.clear();
|
115
|
+
CAnalysis ca; // data structure where the current incomplete analysis
|
116
|
+
// is stored
|
117
|
+
analyze(0, input, 0, ca, analyses); // start the analysis
|
118
|
+
|
119
|
+
if (analyses.size() > 10000)
|
120
|
+
fprintf(stderr,"Warning: Only the first 10000 analyses considered for \"%s\"!\n", s);
|
121
|
+
|
122
|
+
if (simplest_only && analyses.size() > 1)
|
123
|
+
disambiguate( analyses ); // select the simplest analyses
|
124
|
+
}
|
125
|
+
|
126
|
+
|
127
|
+
|
128
|
+
/*******************************************************************/
|
129
|
+
/* */
|
130
|
+
/* CompactTransducer::~CompactTransducer */
|
131
|
+
/* */
|
132
|
+
/*******************************************************************/
|
133
|
+
|
134
|
+
CompactTransducer::~CompactTransducer()

{
  // per-arc arrays
  delete[] arc_logprob;
  delete[] target_node;
  delete[] label;

  // per-node arrays
  delete[] final_logprob;
  delete[] first_arc;
  delete[] finalp;
}
|
144
|
+
|
145
|
+
|
146
|
+
/*******************************************************************/
|
147
|
+
/* */
|
148
|
+
/* CompactTransducer::CompactTransducer */
|
149
|
+
/* */
|
150
|
+
/*******************************************************************/
|
151
|
+
|
152
|
+
CompactTransducer::CompactTransducer()
  // creates a transducer without nodes, arcs or probabilities
  : number_of_nodes(0),
    finalp(NULL),
    first_arc(NULL),
    number_of_arcs(0),
    label(NULL),
    target_node(NULL),
    final_logprob(NULL),
    arc_logprob(NULL),
    both_layers(false),
    simplest_only(false)
{
}
|
165
|
+
|
166
|
+
|
167
|
+
/*******************************************************************/
|
168
|
+
/* */
|
169
|
+
/* CompactTransducer::read_finalp */
|
170
|
+
/* */
|
171
|
+
/*******************************************************************/
|
172
|
+
|
173
|
+
void CompactTransducer::read_finalp( FILE *file )
|
174
|
+
|
175
|
+
{
|
176
|
+
int k=0;
|
177
|
+
unsigned char n=0;
|
178
|
+
for( size_t i=0; i<number_of_nodes; i++ ) {
|
179
|
+
if (k == 0) {
|
180
|
+
n = (unsigned char)fgetc(file);
|
181
|
+
k = 8;
|
182
|
+
}
|
183
|
+
k--;
|
184
|
+
if (n & (1 << k))
|
185
|
+
finalp[i] = 1;
|
186
|
+
else
|
187
|
+
finalp[i] = 0;
|
188
|
+
}
|
189
|
+
}
|
190
|
+
|
191
|
+
|
192
|
+
/*******************************************************************/
|
193
|
+
/* */
|
194
|
+
/* CompactTransducer::read_first_arcs */
|
195
|
+
/* */
|
196
|
+
/*******************************************************************/
|
197
|
+
|
198
|
+
void CompactTransducer::read_first_arcs( FILE *file )
|
199
|
+
|
200
|
+
{
|
201
|
+
int k=0;
|
202
|
+
unsigned int n=0;
|
203
|
+
size_t bits=(size_t)ceil(log(number_of_arcs+1)/log(2));
|
204
|
+
|
205
|
+
for( size_t i=0; i<=number_of_nodes; i++ ) {
|
206
|
+
first_arc[i] = n >> (sizeof(n)*8 - bits);
|
207
|
+
n <<= bits;
|
208
|
+
k -= bits;
|
209
|
+
if (k < 0) {
|
210
|
+
read_num(&n,sizeof(n),file);
|
211
|
+
first_arc[i] |= n >> (sizeof(n)*8 + k);
|
212
|
+
n <<= -k;
|
213
|
+
k += sizeof(n) * 8;
|
214
|
+
}
|
215
|
+
}
|
216
|
+
}
|
217
|
+
|
218
|
+
|
219
|
+
/*******************************************************************/
|
220
|
+
/* */
|
221
|
+
/* CompactTransducer::read_target_nodes */
|
222
|
+
/* */
|
223
|
+
/*******************************************************************/
|
224
|
+
|
225
|
+
void CompactTransducer::read_target_nodes( FILE *file )
|
226
|
+
|
227
|
+
{
|
228
|
+
int k=0;
|
229
|
+
unsigned int n=0;
|
230
|
+
size_t bits=(size_t)ceil(log(number_of_nodes)/log(2));
|
231
|
+
|
232
|
+
for( size_t i=0; i<number_of_arcs; i++ ) {
|
233
|
+
target_node[i] = n >> (sizeof(n)*8 - bits);
|
234
|
+
n <<= bits;
|
235
|
+
k -= bits;
|
236
|
+
if (k < 0) {
|
237
|
+
read_num(&n,sizeof(n),file);
|
238
|
+
target_node[i] |= n >> (sizeof(n)*8 + k);
|
239
|
+
n <<= -k;
|
240
|
+
k += sizeof(n) * 8;
|
241
|
+
}
|
242
|
+
}
|
243
|
+
}
|
244
|
+
|
245
|
+
|
246
|
+
/*******************************************************************/
|
247
|
+
/* */
|
248
|
+
/* CompactTransducer::read_labels */
|
249
|
+
/* */
|
250
|
+
/*******************************************************************/
|
251
|
+
|
252
|
+
void CompactTransducer::read_labels( FILE *file )
|
253
|
+
|
254
|
+
{
|
255
|
+
size_t N=0;
|
256
|
+
Label Num2Label[alphabet.size()];
|
257
|
+
for( Alphabet::const_iterator it=alphabet.begin();
|
258
|
+
it != alphabet.end(); it++ )
|
259
|
+
{
|
260
|
+
Label l=*it;
|
261
|
+
Num2Label[N++] = l;
|
262
|
+
}
|
263
|
+
|
264
|
+
int k=0;
|
265
|
+
unsigned int n=0;
|
266
|
+
size_t bits=(size_t)ceil(log(alphabet.size())/log(2));
|
267
|
+
|
268
|
+
for( size_t i=0; i<number_of_arcs; i++ ) {
|
269
|
+
unsigned int l = n >> (sizeof(n)*8 - bits);
|
270
|
+
n <<= bits;
|
271
|
+
k -= bits;
|
272
|
+
if (k < 0) {
|
273
|
+
read_num(&n,sizeof(n),file);
|
274
|
+
l |= n >> (sizeof(n)*8 + k);
|
275
|
+
n <<= -k;
|
276
|
+
k += sizeof(n) * 8;
|
277
|
+
}
|
278
|
+
label[i] = Num2Label[l];
|
279
|
+
}
|
280
|
+
}
|
281
|
+
|
282
|
+
|
283
|
+
/*******************************************************************/
|
284
|
+
/* */
|
285
|
+
/* CompactTransducer::read_probs */
|
286
|
+
/* */
|
287
|
+
/*******************************************************************/
|
288
|
+
|
289
|
+
void CompactTransducer::read_probs( FILE *file )
|
290
|
+
|
291
|
+
{
|
292
|
+
size_t n,m;
|
293
|
+
fread(&n, sizeof(n), 1, file);
|
294
|
+
if (fread(&m, sizeof(n), 1, file) != 1 ||
|
295
|
+
n != node_count() || m != arc_count())
|
296
|
+
{
|
297
|
+
fprintf(stderr,"Error: incompatible probability file!\n");
|
298
|
+
exit(1);
|
299
|
+
}
|
300
|
+
final_logprob = new float[n];
|
301
|
+
arc_logprob = new float[m];
|
302
|
+
fread(final_logprob, sizeof(float), n, file);
|
303
|
+
if (fread(arc_logprob, sizeof(float), n, file) != n) {
|
304
|
+
fprintf(stderr,"Error: in probability file!\n");
|
305
|
+
exit(1);
|
306
|
+
}
|
307
|
+
}
|
308
|
+
|
309
|
+
|
310
|
+
/*******************************************************************/
|
311
|
+
/* */
|
312
|
+
/* CompactTransducer::CompactTransducer */
|
313
|
+
/* */
|
314
|
+
/*******************************************************************/
|
315
|
+
|
316
|
+
CompactTransducer::CompactTransducer( FILE *file, FILE *pfile )

{
  // Reads a compact transducer from "file" and, if "pfile" is not NULL,
  // its probabilities from "pfile".
  // Throws a C string if "file" does not start with the format marker
  // 'c' or if a read error occurred while reading the header.

  both_layers = false;
  simplest_only = false;

  // start from a well-defined empty state so that no array pointer is
  // ever left uninitialized
  number_of_nodes = 0;
  number_of_arcs = 0;
  finalp = NULL;
  first_arc = NULL;
  label = NULL;
  target_node = NULL;
  arc_logprob = final_logprob = (float*)NULL;

  if (fgetc(file) != 'c')
    throw "Error: wrong file format (not a compact transducer)\n";

  alphabet.read(file);

  read_num(&number_of_nodes,sizeof(number_of_nodes),file);
  read_num(&number_of_arcs,sizeof(number_of_arcs),file);

  // without a valid header the arrays cannot be sized
  if (ferror(file))
    throw "Error: unable to read the compact transducer file\n";

  // memory allocation
  finalp = new char[number_of_nodes];
  first_arc = new unsigned[number_of_nodes+1];
  label = new Label[number_of_arcs];
  target_node = new unsigned[number_of_arcs];

  // reading the data (in the order in which it is stored)
  read_finalp(file);
  read_first_arcs(file);
  read_labels(file);
  read_target_nodes(file);

  if (pfile != NULL)
    read_probs(pfile);
}
|
349
|
+
|
350
|
+
|
351
|
+
/*******************************************************************/
|
352
|
+
/* */
|
353
|
+
/* CompactTransducer::longest_match2 */
|
354
|
+
/* */
|
355
|
+
/*******************************************************************/
|
356
|
+
|
357
|
+
void CompactTransducer::longest_match2(unsigned int n, char *string, int l,
|
358
|
+
CAnalysis &ca, int &bl, CAnalysis &ba)
|
359
|
+
{
|
360
|
+
// n: transducer state
|
361
|
+
// string: rest string
|
362
|
+
// l: length of current analysis
|
363
|
+
// bl: length of the currently longest match
|
364
|
+
// ca: current analysis
|
365
|
+
// ba: best analysis
|
366
|
+
|
367
|
+
if (finalp[n] && l > bl) {
|
368
|
+
// store the new analysis
|
369
|
+
bl = l;
|
370
|
+
ba = ca; // copy the arc vector
|
371
|
+
}
|
372
|
+
|
373
|
+
// follow the epsilon transitions
|
374
|
+
unsigned int i;
|
375
|
+
for( i=first_arc[n];
|
376
|
+
i<first_arc[n+1] && label[i].upper_char() == Label::epsilon;
|
377
|
+
i++)
|
378
|
+
{
|
379
|
+
ca.push_back(i);
|
380
|
+
longest_match2(target_node[i], string, l, ca, bl, ba);
|
381
|
+
ca.pop_back();
|
382
|
+
}
|
383
|
+
|
384
|
+
// follow the non-epsilon transitions
|
385
|
+
char *end=string;
|
386
|
+
int c=alphabet.next_code(end, false);
|
387
|
+
l += end-string;
|
388
|
+
if (c != EOF) {
|
389
|
+
// find the set of arcs with matching upper character in the sort list
|
390
|
+
pair<Label*,Label*>range =
|
391
|
+
equal_range(label+i, label+first_arc[n+1], Label((Character)c));
|
392
|
+
unsigned int to = (unsigned int)(range.second - label);
|
393
|
+
for( i=range.first-label; i<to; i++) {
|
394
|
+
ca.push_back(i);
|
395
|
+
longest_match2(target_node[i], end, l, ca, bl, ba);
|
396
|
+
ca.pop_back();
|
397
|
+
}
|
398
|
+
}
|
399
|
+
}
|
400
|
+
|
401
|
+
|
402
|
+
/*******************************************************************/
|
403
|
+
/* */
|
404
|
+
/* CompactTransducer::print_analysis */
|
405
|
+
/* */
|
406
|
+
/*******************************************************************/
|
407
|
+
|
408
|
+
char *CompactTransducer::print_analysis( CAnalysis &cana )

{
  // "cana" is an analysis given as a sequence of arc numbers.
  // It is converted into a label sequence and handed to the alphabet
  // for printing; both_layers is passed on unchanged.

  Analysis labels;
  convert(cana, labels);

  char *text = alphabet.print_analysis( labels, both_layers );
  return text;
}
|
415
|
+
|
416
|
+
|
417
|
+
/*******************************************************************/
|
418
|
+
/* */
|
419
|
+
/* CompactTransducer::longest_match */
|
420
|
+
/* */
|
421
|
+
/*******************************************************************/
|
422
|
+
|
423
|
+
const char *CompactTransducer::longest_match( char* &string )
|
424
|
+
|
425
|
+
{
|
426
|
+
vector<char> analysis;
|
427
|
+
CAnalysis ca, ba;
|
428
|
+
int l=0;
|
429
|
+
longest_match2(0, string, 0, ca, l, ba);
|
430
|
+
|
431
|
+
// no match? return the next character
|
432
|
+
if (ba.size() == 0) {
|
433
|
+
int c=alphabet.next_code(string, false);
|
434
|
+
return alphabet.code2symbol(c);
|
435
|
+
}
|
436
|
+
|
437
|
+
string += l;
|
438
|
+
return print_analysis( ba );
|
439
|
+
}
|
440
|
+
|
441
|
+
|
442
|
+
/*******************************************************************/
|
443
|
+
/* */
|
444
|
+
/* CompactTransducer::disambiguate */
|
445
|
+
/* */
|
446
|
+
/*******************************************************************/
|
447
|
+
|
448
|
+
void CompactTransducer::disambiguate( vector<CAnalysis> &analyses )

{
  // Keeps only the analyses with the highest score (as computed by
  // alphabet.compute_score); their relative order is preserved.

  const size_t total = analyses.size();

  // first pass: score every analysis and remember the maximum
  vector<int> score;
  Analysis labels;
  int best = INT_MIN;
  for( size_t i=0; i<total; i++ ) {
    convert(analyses[i], labels);
    const int s = alphabet.compute_score(labels);
    score.push_back(s);
    if (s > best)
      best = s;
  }

  // second pass: move the best analyses to the front, drop the rest
  size_t kept = 0;
  for( size_t i=0; i<total; i++ ) {
    if (score[i] != best)
      continue;
    analyses[kept] = analyses[i];
    kept++;
  }
  analyses.resize(kept);
}
|
470
|
+
|
471
|
+
|
472
|
+
/*******************************************************************/
|
473
|
+
/* */
|
474
|
+
/* CompactTransducer::train2 */
|
475
|
+
/* */
|
476
|
+
/*******************************************************************/
|
477
|
+
|
478
|
+
bool CompactTransducer::train2( char *s, vector<double> &arcfreq,
|
479
|
+
vector<double> &finalfreq )
|
480
|
+
{
|
481
|
+
vector<CAnalysis> analyses;
|
482
|
+
vector<Label> input;
|
483
|
+
alphabet.string2labelseq( s, input );
|
484
|
+
|
485
|
+
CAnalysis ca; // data structure where the analysis is stored
|
486
|
+
unsigned int n=0;
|
487
|
+
bool failure=false;
|
488
|
+
for( size_t i=0; i<input.size(); i++ ) {
|
489
|
+
failure = true;
|
490
|
+
for( unsigned int k=first_arc[n]; k<first_arc[n+1]; k++) {
|
491
|
+
if (label[k] == input[i]) {
|
492
|
+
ca.push_back(k);
|
493
|
+
n = target_node[k];
|
494
|
+
failure = false;
|
495
|
+
break;
|
496
|
+
}
|
497
|
+
}
|
498
|
+
if (failure)
|
499
|
+
break;
|
500
|
+
}
|
501
|
+
if (failure || !finalp[n]) {
|
502
|
+
fprintf(stderr,"Warning: The following input is not covered:\n%s\n", s);
|
503
|
+
return false;
|
504
|
+
}
|
505
|
+
|
506
|
+
for( size_t k=0; k<ca.size(); k++ )
|
507
|
+
arcfreq[ca[k]]++;
|
508
|
+
finalfreq[target_node[ca.back()]]++;
|
509
|
+
|
510
|
+
return true;
|
511
|
+
}
|
512
|
+
|
513
|
+
|
514
|
+
/*******************************************************************/
|
515
|
+
/* */
|
516
|
+
/* CompactTransducer::train */
|
517
|
+
/* */
|
518
|
+
/*******************************************************************/
|
519
|
+
|
520
|
+
bool CompactTransducer::train( char *s, vector<double> &arcfreq,
|
521
|
+
vector<double> &finalfreq )
|
522
|
+
{
|
523
|
+
vector<CAnalysis> analyses;
|
524
|
+
vector<Character> input;
|
525
|
+
alphabet.string2symseq( s, input );
|
526
|
+
|
527
|
+
CAnalysis ca; // data structure where the current incomplete analysis
|
528
|
+
// is stored
|
529
|
+
analyze(0, input, 0, ca, analyses); // start the analysis
|
530
|
+
|
531
|
+
if (analyses.size() > 10000)
|
532
|
+
return true; // ignore inputs with more than 10000 analyses
|
533
|
+
else if (analyses.size() == 0)
|
534
|
+
return false;
|
535
|
+
|
536
|
+
if (simplest_only && analyses.size() > 1)
|
537
|
+
disambiguate( analyses ); // select the simplest analyses
|
538
|
+
|
539
|
+
if (analyses.size() > 0) {
|
540
|
+
double incr = 1.0 / analyses.size();
|
541
|
+
CAnalysis arcs;
|
542
|
+
|
543
|
+
for( size_t i=0; i<analyses.size(); i++ ) {
|
544
|
+
CAnalysis &arcs=analyses[i];
|
545
|
+
for( size_t k=0; k<arcs.size(); k++ )
|
546
|
+
arcfreq[arcs[k]] += incr;
|
547
|
+
finalfreq[target_node[arcs.back()]] += incr;
|
548
|
+
}
|
549
|
+
}
|
550
|
+
return true;
|
551
|
+
}
|
552
|
+
|
553
|
+
|
554
|
+
/*******************************************************************/
|
555
|
+
/* */
|
556
|
+
/* CompactTransducer::estimate_probs */
|
557
|
+
/* */
|
558
|
+
/*******************************************************************/
|
559
|
+
|
560
|
+
void CompactTransducer::estimate_probs( vector<double> &arcfreq,
					vector<double> &finalfreq )
{
  // Turns frequencies into probabilities, node by node: the final
  // frequency of a node and the frequencies of its outgoing arcs are
  // divided by their common sum.

  for( size_t node=0; node<finalfreq.size(); node++ ) {
    // outgoing arcs of this node: first ... last-1
    const size_t first = first_arc[node];
    const size_t last = first_arc[node+1];

    double total = finalfreq[node];
    for( size_t a=first; a<last; a++ )
      total += arcfreq[a];

    // unseen node: divide by 1 so that all values stay 0
    if (total == 0.0)
      total = 1.0;

    finalfreq[node] /= total;
    for( size_t a=first; a<last; a++ )
      arcfreq[a] /= total;
  }
}
|
575
|
+
|
576
|
+
|
577
|
+
|
578
|
+
/*******************************************************************/
|
579
|
+
/* */
|
580
|
+
/* CompactTransducer::compute_probs */
|
581
|
+
/* */
|
582
|
+
/*******************************************************************/
|
583
|
+
|
584
|
+
void CompactTransducer::compute_probs( vector<CAnalysis> &analyses,
				       vector<double> &prob )
{
  // Computes the probability of each analysis from arc_logprob and
  // final_logprob, sorts "analyses" by decreasing probability and
  // stores the probabilities, normalized to sum 1, in "prob".
  // NOTE(review): reads arc_logprob/final_logprob without checking for
  // NULL; presumably only called after probabilities were loaded.

  prob.resize(analyses.size());
  double sum=0.0;
  for( size_t i=0; i<analyses.size(); i++ ) {
    CAnalysis &a=analyses[i];

    // compute the probability
    double logprob=0.0;
    for( size_t k=0; k<a.size(); k++ )
      logprob += arc_logprob[a[k]];
    // an analysis without arcs ends in node 0, where the search started
    unsigned int last = a.empty() ? 0 : target_node[a.back()];
    logprob += final_logprob[last];
    prob[i] = exp(logprob);
    sum += prob[i];
  }

  // sort the analyses (selection sort; ties keep their original order)
  vector<CAnalysis> oldanalyses(analyses);
  vector<double> oldprob(prob);
  for( size_t i=0; i<analyses.size(); i++ ) {
    prob[i] = -1.0;
    size_t n=0;
    for( size_t k=0; k<oldanalyses.size(); k++ )
      if (prob[i] < oldprob[k]) {
	prob[i] = oldprob[k];
	n = k;
      }
    analyses[i] = oldanalyses[n];
    oldprob[n] = -1.0; // mark as used
    if (sum > 0.0)
      prob[i] /= sum; // normalization (skipped if every probability is 0)
  }
}
|
@@ -0,0 +1,98 @@
|
|
1
|
+
/*******************************************************************/
|
2
|
+
/* */
|
3
|
+
/* FILE compact.h */
|
4
|
+
/* MODULE compact */
|
5
|
+
/* PROGRAM SFST */
|
6
|
+
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
|
7
|
+
/* */
|
8
|
+
/* PURPOSE finite state tools */
|
9
|
+
/* */
|
10
|
+
/*******************************************************************/
|
11
|
+
|
12
|
+
#ifndef _COMPACT_H_
|
13
|
+
#define _COMPACT_H_
|
14
|
+
|
15
|
+
#include "alphabet.h"
|
16
|
+
|
17
|
+
#include <vector>
|
18
|
+
|
19
|
+
typedef std::vector<unsigned int> CAnalysis;
|
20
|
+
|
21
|
+
class CompactTransducer {
|
22
|
+
|
23
|
+
protected:
|
24
|
+
|
25
|
+
// the following data structures are used to store the nodes
|
26
|
+
|
27
|
+
unsigned int number_of_nodes; // number of nodes in the transducer
|
28
|
+
char *finalp; // finalp[i] is 1 if node i is final and 0 otherwise
|
29
|
+
unsigned int *first_arc; // first_arc[i] is the number of the first
|
30
|
+
// arc outgoing from node i
|
31
|
+
|
32
|
+
// the following data structures are used to store the transition arcs
|
33
|
+
|
34
|
+
unsigned int number_of_arcs; // total number of arcs in the transducer
|
35
|
+
Label *label; // the label (character pair) of arc i
|
36
|
+
unsigned int *target_node; // target node of arc i
|
37
|
+
|
38
|
+
// the following data structures are used to store the stochastic parameters
|
39
|
+
float *final_logprob;
|
40
|
+
float *arc_logprob;
|
41
|
+
|
42
|
+
// functions needed to read the transducer from a file
|
43
|
+
|
44
|
+
void read_finalp( FILE *file );
|
45
|
+
void read_first_arcs( FILE *file );
|
46
|
+
void read_target_nodes( FILE *file );
|
47
|
+
void read_labels( FILE *file );
|
48
|
+
void read_probs( FILE *file );
|
49
|
+
|
50
|
+
// functions needed to analyze data with the transducer
|
51
|
+
|
52
|
+
void analyze( unsigned int n, std::vector<Character> &ch, size_t ipos,
|
53
|
+
CAnalysis&, std::vector<CAnalysis>&);
|
54
|
+
|
55
|
+
// function selecting the simplest morphological analysis
|
56
|
+
|
57
|
+
int compute_score( CAnalysis &ana );
|
58
|
+
void disambiguate( std::vector<CAnalysis> &analyses );
|
59
|
+
|
60
|
+
// functions for longest-match analysis of input data
|
61
|
+
|
62
|
+
void longest_match2(unsigned int, char*, int, CAnalysis&, int&, CAnalysis&);
|
63
|
+
|
64
|
+
void convert( CAnalysis &cana, Analysis &ana );
|
65
|
+
|
66
|
+
public:
|
67
|
+
size_t node_count() { return number_of_nodes; };
|
68
|
+
size_t arc_count() { return number_of_arcs; };
|
69
|
+
|
70
|
+
bool both_layers; // print surface and analysis symbols
|
71
|
+
bool simplest_only; // print only the simplest analyses
|
72
|
+
|
73
|
+
Alphabet alphabet; // data structure which maps symbols to numeric codes
|
74
|
+
CompactTransducer(); // dummy constructor
|
75
|
+
CompactTransducer( FILE*, FILE *pfile=NULL ); // reads a (stochastic) transducer
|
76
|
+
~CompactTransducer(); // destroys a transducer
|
77
|
+
|
78
|
+
// the analysis function returns the set of analyses for the string "s"
|
79
|
+
// in the argument "analyses"
|
80
|
+
void analyze_string( char *s, std::vector<CAnalysis > &analyses );
|
81
|
+
|
82
|
+
void compute_probs( std::vector<CAnalysis> &analyses, std::vector<double> &prob );
|
83
|
+
char *print_analysis( CAnalysis &ana );
|
84
|
+
|
85
|
+
// longest-match analysis
|
86
|
+
const char *longest_match( char*& );
|
87
|
+
|
88
|
+
// EM training
|
89
|
+
bool train2( char *s, std::vector<double> &arcfreq, std::vector<double> &finalfreq );
|
90
|
+
bool train( char *s, std::vector<double> &arcfreq, std::vector<double> &finalfreq );
|
91
|
+
void estimate_probs( std::vector<double> &arcfreq, std::vector<double> &finalfreq );
|
92
|
+
|
93
|
+
// robust analysis
|
94
|
+
float robust_analyze_string( char *string, std::vector<CAnalysis> &analyses,
|
95
|
+
float ErrorsAllowed );
|
96
|
+
};
|
97
|
+
|
98
|
+
#endif
|