ruby-sfst 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,616 @@
1
+ /*******************************************************************/
2
+ /* */
3
+ /* FILE compact.C */
4
+ /* MODULE compact */
5
+ /* PROGRAM SFST */
6
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
7
+ /* */
8
+ /* PURPOSE Code needed for analysing data */
9
+ /* */
10
+ /*******************************************************************/
11
+
12
+ #include <stdio.h>
13
+ #include <math.h>
14
+
15
+ #include <limits.h>
16
+
17
+ #include "compact.h"
18
+
19
+ using std::equal_range;
20
+ using std::vector;
21
+ using std::pair;
22
+
23
+ const int BUFFER_SIZE=1000;
24
+
25
+
26
+ /*******************************************************************/
27
+ /* */
28
+ /* CompactTransducer::convert */
29
+ /* */
30
+ /*******************************************************************/
31
+
32
+ void CompactTransducer::convert( CAnalysis &cana, Analysis &ana )
33
+
34
+ {
35
+ ana.resize(cana.size());
36
+ for( size_t i=0; i<cana.size(); i++ )
37
+ ana[i] = label[cana[i]];
38
+ }
39
+
40
+
41
+ /*******************************************************************/
42
+ /* */
43
+ /* CompactTransducer::analyze */
44
+ /* */
45
+ /*******************************************************************/
46
+
47
+ void CompactTransducer::analyze(unsigned int n, vector<Character> &input,
48
+ size_t ipos, CAnalysis &ca,
49
+ vector<CAnalysis> &analyses )
50
+ {
51
+ // "n" is the number of the current transducer node/state
52
+ // "input" is the sequence of input symbols
53
+ // "ipos" is the input position currently analysed
54
+ // "ca" stores the incomplete analysis string
55
+ // "analyses" stores the analyses found so far
56
+
57
+ if (analyses.size() > 10000)
58
+ return; // limit the maximal number of analyses
59
+
60
+ // Is the input string fully analyzed and the current node a final node?
61
+ if (finalp[n] && ipos == input.size())
62
+ // store the new analysis
63
+ analyses.push_back(ca);
64
+
65
+ // follow the epsilon transitions
66
+ // first_arc[n] is the number of the first outgoing transition of node n
67
+ // first_arc[n+1]-1 is the number of the last outgoing transition of node n
68
+ // first_arc[n+1] is the number of the first outgoing transition of node n+1
69
+ unsigned int i;
70
+ for( i=first_arc[n];
71
+ i<first_arc[n+1] && label[i].upper_char() == Label::epsilon;
72
+ i++)
73
+ {
74
+ ca.push_back(i);
75
+ analyze(target_node[i], input, ipos, ca, analyses);
76
+ ca.pop_back();
77
+ }
78
+
79
+ // follow the non-epsilon transitions
80
+
81
+ // scan the next input symbol
82
+ if (ipos < input.size()) {
83
+ // find the set of arcs with matching upper character in the sorted list
84
+ pair<Label*,Label*>range =
85
+ equal_range(label+i, label+first_arc[n+1], Label(input[ipos]));
86
+ unsigned int to = (unsigned int)(range.second - label);
87
+
88
+ // follow the non-epsilon transitions
89
+ for( i=range.first-label; i<to; i++) {
90
+ ca.push_back(i);
91
+ analyze(target_node[i], input, ipos+1, ca, analyses);
92
+ ca.pop_back();
93
+ }
94
+ }
95
+ }
96
+
97
+
98
+ /*******************************************************************/
99
+ /* */
100
+ /* CompactTransducer::analyze_string */
101
+ /* */
102
+ /*******************************************************************/
103
+
104
+ void CompactTransducer::analyze_string( char *s, vector<CAnalysis> &analyses )
105
+
106
+ {
107
+ // "s" input string to be analyzed
108
+ // "analyses" is the data structure in which the results are stored
109
+ // and returned
110
+
111
+ vector<Character> input;
112
+ alphabet.string2symseq( s, input );
113
+
114
+ analyses.clear();
115
+ CAnalysis ca; // data structure where the current incomplete analysis
116
+ // is stored
117
+ analyze(0, input, 0, ca, analyses); // start the analysis
118
+
119
+ if (analyses.size() > 10000)
120
+ fprintf(stderr,"Warning: Only the first 10000 analyses considered for \"%s\"!\n", s);
121
+
122
+ if (simplest_only && analyses.size() > 1)
123
+ disambiguate( analyses ); // select the simplest analyses
124
+ }
125
+
126
+
127
+
128
+ /*******************************************************************/
129
+ /* */
130
+ /* CompactTransducer::~CompactTransducer */
131
+ /* */
132
+ /*******************************************************************/
133
+
134
+ CompactTransducer::~CompactTransducer()
135
+
136
+ {
137
+ delete[] finalp;
138
+ delete[] first_arc;
139
+ delete[] label;
140
+ delete[] target_node;
141
+ delete[] final_logprob;
142
+ delete[] arc_logprob;
143
+ }
144
+
145
+
146
+ /*******************************************************************/
147
+ /* */
148
+ /* CompactTransducer::CompactTransducer */
149
+ /* */
150
+ /*******************************************************************/
151
+
152
+ CompactTransducer::CompactTransducer()
153
+
154
+ {
155
+ both_layers = false;
156
+ simplest_only = false;
157
+ number_of_nodes = 0;
158
+ number_of_arcs = 0;
159
+ finalp = NULL;
160
+ first_arc = NULL;
161
+ label = NULL;
162
+ target_node = NULL;
163
+ arc_logprob = final_logprob = (float*)NULL;
164
+ }
165
+
166
+
167
+ /*******************************************************************/
168
+ /* */
169
+ /* CompactTransducer::read_finalp */
170
+ /* */
171
+ /*******************************************************************/
172
+
173
+ void CompactTransducer::read_finalp( FILE *file )
174
+
175
+ {
176
+ int k=0;
177
+ unsigned char n=0;
178
+ for( size_t i=0; i<number_of_nodes; i++ ) {
179
+ if (k == 0) {
180
+ n = (unsigned char)fgetc(file);
181
+ k = 8;
182
+ }
183
+ k--;
184
+ if (n & (1 << k))
185
+ finalp[i] = 1;
186
+ else
187
+ finalp[i] = 0;
188
+ }
189
+ }
190
+
191
+
192
+ /*******************************************************************/
193
+ /* */
194
+ /* CompactTransducer::read_first_arcs */
195
+ /* */
196
+ /*******************************************************************/
197
+
198
+ void CompactTransducer::read_first_arcs( FILE *file )
199
+
200
+ {
201
+ int k=0;
202
+ unsigned int n=0;
203
+ size_t bits=(size_t)ceil(log(number_of_arcs+1)/log(2));
204
+
205
+ for( size_t i=0; i<=number_of_nodes; i++ ) {
206
+ first_arc[i] = n >> (sizeof(n)*8 - bits);
207
+ n <<= bits;
208
+ k -= bits;
209
+ if (k < 0) {
210
+ read_num(&n,sizeof(n),file);
211
+ first_arc[i] |= n >> (sizeof(n)*8 + k);
212
+ n <<= -k;
213
+ k += sizeof(n) * 8;
214
+ }
215
+ }
216
+ }
217
+
218
+
219
+ /*******************************************************************/
220
+ /* */
221
+ /* CompactTransducer::read_target_nodes */
222
+ /* */
223
+ /*******************************************************************/
224
+
225
+ void CompactTransducer::read_target_nodes( FILE *file )
226
+
227
+ {
228
+ int k=0;
229
+ unsigned int n=0;
230
+ size_t bits=(size_t)ceil(log(number_of_nodes)/log(2));
231
+
232
+ for( size_t i=0; i<number_of_arcs; i++ ) {
233
+ target_node[i] = n >> (sizeof(n)*8 - bits);
234
+ n <<= bits;
235
+ k -= bits;
236
+ if (k < 0) {
237
+ read_num(&n,sizeof(n),file);
238
+ target_node[i] |= n >> (sizeof(n)*8 + k);
239
+ n <<= -k;
240
+ k += sizeof(n) * 8;
241
+ }
242
+ }
243
+ }
244
+
245
+
246
+ /*******************************************************************/
247
+ /* */
248
+ /* CompactTransducer::read_labels */
249
+ /* */
250
+ /*******************************************************************/
251
+
252
+ void CompactTransducer::read_labels( FILE *file )
253
+
254
+ {
255
+ size_t N=0;
256
+ Label Num2Label[alphabet.size()];
257
+ for( Alphabet::const_iterator it=alphabet.begin();
258
+ it != alphabet.end(); it++ )
259
+ {
260
+ Label l=*it;
261
+ Num2Label[N++] = l;
262
+ }
263
+
264
+ int k=0;
265
+ unsigned int n=0;
266
+ size_t bits=(size_t)ceil(log(alphabet.size())/log(2));
267
+
268
+ for( size_t i=0; i<number_of_arcs; i++ ) {
269
+ unsigned int l = n >> (sizeof(n)*8 - bits);
270
+ n <<= bits;
271
+ k -= bits;
272
+ if (k < 0) {
273
+ read_num(&n,sizeof(n),file);
274
+ l |= n >> (sizeof(n)*8 + k);
275
+ n <<= -k;
276
+ k += sizeof(n) * 8;
277
+ }
278
+ label[i] = Num2Label[l];
279
+ }
280
+ }
281
+
282
+
283
+ /*******************************************************************/
284
+ /* */
285
+ /* CompactTransducer::read_probs */
286
+ /* */
287
+ /*******************************************************************/
288
+
289
+ void CompactTransducer::read_probs( FILE *file )
290
+
291
+ {
292
+ size_t n,m;
293
+ fread(&n, sizeof(n), 1, file);
294
+ if (fread(&m, sizeof(n), 1, file) != 1 ||
295
+ n != node_count() || m != arc_count())
296
+ {
297
+ fprintf(stderr,"Error: incompatible probability file!\n");
298
+ exit(1);
299
+ }
300
+ final_logprob = new float[n];
301
+ arc_logprob = new float[m];
302
+ fread(final_logprob, sizeof(float), n, file);
303
+ if (fread(arc_logprob, sizeof(float), n, file) != n) {
304
+ fprintf(stderr,"Error: in probability file!\n");
305
+ exit(1);
306
+ }
307
+ }
308
+
309
+
310
+ /*******************************************************************/
311
+ /* */
312
+ /* CompactTransducer::CompactTransducer */
313
+ /* */
314
+ /*******************************************************************/
315
+
316
+ CompactTransducer::CompactTransducer( FILE *file, FILE *pfile )
317
+
318
+ {
319
+ both_layers = false;
320
+ simplest_only = false;
321
+
322
+ if (fgetc(file) != 'c')
323
+ throw "Error: wrong file format (not a compact transducer)\n";
324
+
325
+ alphabet.read(file);
326
+
327
+ read_num(&number_of_nodes,sizeof(number_of_nodes),file);
328
+ read_num(&number_of_arcs,sizeof(number_of_arcs),file);
329
+
330
+ if (!ferror(file)) {
331
+ // memory allocation
332
+ finalp = new char[number_of_nodes];
333
+ first_arc = new unsigned[number_of_nodes+1];
334
+ label = new Label[number_of_arcs];
335
+ target_node = new unsigned[number_of_arcs];
336
+
337
+ // reading the data
338
+ read_finalp(file);
339
+ read_first_arcs(file);
340
+ read_labels(file);
341
+ read_target_nodes(file);
342
+ }
343
+
344
+ if (pfile == NULL)
345
+ arc_logprob = final_logprob = (float*)NULL;
346
+ else
347
+ read_probs(pfile);
348
+ }
349
+
350
+
351
+ /*******************************************************************/
352
+ /* */
353
+ /* CompactTransducer::longest_match2 */
354
+ /* */
355
+ /*******************************************************************/
356
+
357
+ void CompactTransducer::longest_match2(unsigned int n, char *string, int l,
358
+ CAnalysis &ca, int &bl, CAnalysis &ba)
359
+ {
360
+ // n: transducer state
361
+ // string: rest string
362
+ // l: length of current analysis
363
+ // bl: length of the currently longest match
364
+ // ca: current analysis
365
+ // ba: best analysis
366
+
367
+ if (finalp[n] && l > bl) {
368
+ // store the new analysis
369
+ bl = l;
370
+ ba = ca; // copy the arc vector
371
+ }
372
+
373
+ // follow the epsilon transitions
374
+ unsigned int i;
375
+ for( i=first_arc[n];
376
+ i<first_arc[n+1] && label[i].upper_char() == Label::epsilon;
377
+ i++)
378
+ {
379
+ ca.push_back(i);
380
+ longest_match2(target_node[i], string, l, ca, bl, ba);
381
+ ca.pop_back();
382
+ }
383
+
384
+ // follow the non-epsilon transitions
385
+ char *end=string;
386
+ int c=alphabet.next_code(end, false);
387
+ l += end-string;
388
+ if (c != EOF) {
389
+ // find the set of arcs with matching upper character in the sort list
390
+ pair<Label*,Label*>range =
391
+ equal_range(label+i, label+first_arc[n+1], Label((Character)c));
392
+ unsigned int to = (unsigned int)(range.second - label);
393
+ for( i=range.first-label; i<to; i++) {
394
+ ca.push_back(i);
395
+ longest_match2(target_node[i], end, l, ca, bl, ba);
396
+ ca.pop_back();
397
+ }
398
+ }
399
+ }
400
+
401
+
402
+ /*******************************************************************/
403
+ /* */
404
+ /* CompactTransducer::print_analysis */
405
+ /* */
406
+ /*******************************************************************/
407
+
408
+ char *CompactTransducer::print_analysis( CAnalysis &cana )
409
+
410
+ {
411
+ Analysis ana;
412
+ convert(cana, ana);
413
+ return alphabet.print_analysis( ana, both_layers );
414
+ }
415
+
416
+
417
+ /*******************************************************************/
418
+ /* */
419
+ /* CompactTransducer::longest_match */
420
+ /* */
421
+ /*******************************************************************/
422
+
423
+ const char *CompactTransducer::longest_match( char* &string )
424
+
425
+ {
426
+ vector<char> analysis;
427
+ CAnalysis ca, ba;
428
+ int l=0;
429
+ longest_match2(0, string, 0, ca, l, ba);
430
+
431
+ // no match? return the next character
432
+ if (ba.size() == 0) {
433
+ int c=alphabet.next_code(string, false);
434
+ return alphabet.code2symbol(c);
435
+ }
436
+
437
+ string += l;
438
+ return print_analysis( ba );
439
+ }
440
+
441
+
442
+ /*******************************************************************/
443
+ /* */
444
+ /* CompactTransducer::disambiguate */
445
+ /* */
446
+ /*******************************************************************/
447
+
448
+ void CompactTransducer::disambiguate( vector<CAnalysis> &analyses )
449
+
450
+ {
451
+ // compute the scores
452
+ int bestscore=INT_MIN;
453
+ vector<int> score;
454
+ Analysis ana;
455
+
456
+ for( size_t i=0; i<analyses.size(); i++ ) {
457
+ convert(analyses[i], ana);
458
+ score.push_back(alphabet.compute_score(ana));
459
+ if (bestscore < score[i])
460
+ bestscore = score[i];
461
+ }
462
+
463
+ // delete suboptimal analyses
464
+ size_t k=0;
465
+ for( size_t i=0; i<analyses.size(); i++ )
466
+ if (score[i] == bestscore)
467
+ analyses[k++] = analyses[i];
468
+ analyses.resize(k);
469
+ }
470
+
471
+
472
+ /*******************************************************************/
473
+ /* */
474
+ /* CompactTransducer::train2 */
475
+ /* */
476
+ /*******************************************************************/
477
+
478
+ bool CompactTransducer::train2( char *s, vector<double> &arcfreq,
479
+ vector<double> &finalfreq )
480
+ {
481
+ vector<CAnalysis> analyses;
482
+ vector<Label> input;
483
+ alphabet.string2labelseq( s, input );
484
+
485
+ CAnalysis ca; // data structure where the analysis is stored
486
+ unsigned int n=0;
487
+ bool failure=false;
488
+ for( size_t i=0; i<input.size(); i++ ) {
489
+ failure = true;
490
+ for( unsigned int k=first_arc[n]; k<first_arc[n+1]; k++) {
491
+ if (label[k] == input[i]) {
492
+ ca.push_back(k);
493
+ n = target_node[k];
494
+ failure = false;
495
+ break;
496
+ }
497
+ }
498
+ if (failure)
499
+ break;
500
+ }
501
+ if (failure || !finalp[n]) {
502
+ fprintf(stderr,"Warning: The following input is not covered:\n%s\n", s);
503
+ return false;
504
+ }
505
+
506
+ for( size_t k=0; k<ca.size(); k++ )
507
+ arcfreq[ca[k]]++;
508
+ finalfreq[target_node[ca.back()]]++;
509
+
510
+ return true;
511
+ }
512
+
513
+
514
+ /*******************************************************************/
515
+ /* */
516
+ /* CompactTransducer::train */
517
+ /* */
518
+ /*******************************************************************/
519
+
520
+ bool CompactTransducer::train( char *s, vector<double> &arcfreq,
521
+ vector<double> &finalfreq )
522
+ {
523
+ vector<CAnalysis> analyses;
524
+ vector<Character> input;
525
+ alphabet.string2symseq( s, input );
526
+
527
+ CAnalysis ca; // data structure where the current incomplete analysis
528
+ // is stored
529
+ analyze(0, input, 0, ca, analyses); // start the analysis
530
+
531
+ if (analyses.size() > 10000)
532
+ return true; // ignore inputs with more than 10000 analyses
533
+ else if (analyses.size() == 0)
534
+ return false;
535
+
536
+ if (simplest_only && analyses.size() > 1)
537
+ disambiguate( analyses ); // select the simplest analyses
538
+
539
+ if (analyses.size() > 0) {
540
+ double incr = 1.0 / analyses.size();
541
+ CAnalysis arcs;
542
+
543
+ for( size_t i=0; i<analyses.size(); i++ ) {
544
+ CAnalysis &arcs=analyses[i];
545
+ for( size_t k=0; k<arcs.size(); k++ )
546
+ arcfreq[arcs[k]] += incr;
547
+ finalfreq[target_node[arcs.back()]] += incr;
548
+ }
549
+ }
550
+ return true;
551
+ }
552
+
553
+
554
+ /*******************************************************************/
555
+ /* */
556
+ /* CompactTransducer::estimate_probs */
557
+ /* */
558
+ /*******************************************************************/
559
+
560
+ void CompactTransducer::estimate_probs( vector<double> &arcfreq,
561
+ vector<double> &finalfreq )
562
+ {
563
+ // turn frequencies into probabilities
564
+ for( size_t n=0; n<finalfreq.size(); n++ ) {
565
+ double sum = finalfreq[n];
566
+ for( size_t a=first_arc[n]; a<first_arc[n+1]; a++ )
567
+ sum += arcfreq[a];
568
+ if (sum == 0.0)
569
+ sum = 1.0;
570
+ finalfreq[n] = finalfreq[n] / sum;
571
+ for( size_t a=first_arc[n]; a<first_arc[n+1]; a++ )
572
+ arcfreq[a] = arcfreq[a] / sum;
573
+ }
574
+ }
575
+
576
+
577
+
578
+ /*******************************************************************/
579
+ /* */
580
+ /* CompactTransducer::compute_probs */
581
+ /* */
582
+ /*******************************************************************/
583
+
584
+ void CompactTransducer::compute_probs( vector<CAnalysis> &analyses,
585
+ vector<double> &prob )
586
+ {
587
+ prob.resize(analyses.size());
588
+ double sum=0.0;
589
+ for( size_t i=0; i<analyses.size(); i++ ) {
590
+ CAnalysis &a=analyses[i];
591
+
592
+ // compute the probability
593
+ double logprob=0.0;
594
+ for( size_t k=0; k<a.size(); k++ )
595
+ logprob += arc_logprob[a[k]];
596
+ logprob += final_logprob[target_node[a.back()]];
597
+ prob[i] = exp(logprob);
598
+ sum += prob[i];
599
+ }
600
+
601
+ // sort the analyses
602
+ vector<CAnalysis> oldanalyses(analyses);
603
+ vector<double> oldprob(prob);
604
+ for( size_t i=0; i<analyses.size(); i++ ) {
605
+ prob[i] = -1.0;
606
+ int n=0;
607
+ for( size_t k=0; k<oldanalyses.size(); k++ )
608
+ if (prob[i] < oldprob[k]) {
609
+ prob[i] = oldprob[k];
610
+ n = k;
611
+ }
612
+ analyses[i] = oldanalyses[n];
613
+ oldprob[n] = -1.0;
614
+ prob[i] /= sum; // normalization
615
+ }
616
+ }
@@ -0,0 +1,98 @@
1
+ /*******************************************************************/
2
+ /* */
3
+ /* FILE compact.h */
4
+ /* MODULE compact */
5
+ /* PROGRAM SFST */
6
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
7
+ /* */
8
+ /* PURPOSE finite state tools */
9
+ /* */
10
+ /*******************************************************************/
11
+
12
+ #ifndef _COMPACT_H_
13
+ #define _COMPACT_H_
14
+
15
+ #include "alphabet.h"
16
+
17
+ #include <vector>
18
+
19
+ typedef std::vector<unsigned int> CAnalysis;
20
+
21
+ class CompactTransducer {
22
+
23
+ protected:
24
+
25
+ // the following data structures are used to store the nodes
26
+
27
+ unsigned int number_of_nodes; // number of nodes in the transducer
28
+ char *finalp; // finalp[i] is 1 if node i is final and 0 otherwise
29
+ unsigned int *first_arc; // first_arc[i] is the number of the first
30
+ // arc outgoing from node i
31
+
32
+ // the following data structures are used to store the transition arcs
33
+
34
+ unsigned int number_of_arcs; // total number of arcs in the transducer
35
+ Label *label; // the label (character pair) of arc i
36
+ unsigned int *target_node; // target node of arc i
37
+
38
+ // the following data structures are used to store the stochastic parameters
39
+ float *final_logprob;
40
+ float *arc_logprob;
41
+
42
+ // functions needed to read the transducer from a file
43
+
44
+ void read_finalp( FILE *file );
45
+ void read_first_arcs( FILE *file );
46
+ void read_target_nodes( FILE *file );
47
+ void read_labels( FILE *file );
48
+ void read_probs( FILE *file );
49
+
50
+ // functions needed to analyze data with the transducer
51
+
52
+ void analyze( unsigned int n, std::vector<Character> &ch, size_t ipos,
53
+ CAnalysis&, std::vector<CAnalysis>&);
54
+
55
+ // function selecting the simplest morphological analysis
56
+
57
+ int compute_score( CAnalysis &ana );
58
+ void disambiguate( std::vector<CAnalysis> &analyses );
59
+
60
+ // functions for longest-match analysis of input data
61
+
62
+ void longest_match2(unsigned int, char*, int, CAnalysis&, int&, CAnalysis&);
63
+
64
+ void convert( CAnalysis &cana, Analysis &ana );
65
+
66
+ public:
67
+ size_t node_count() { return number_of_nodes; };
68
+ size_t arc_count() { return number_of_arcs; };
69
+
70
+ bool both_layers; // print surface and analysis symbols
71
+ bool simplest_only; // print only the simplest analyses
72
+
73
+ Alphabet alphabet; // data structure which maps symbols to numeric codes
74
+ CompactTransducer(); // dummy constructor
75
+ CompactTransducer( FILE*, FILE *pfile=NULL ); // reads a (stochastic) transducer
76
+ ~CompactTransducer(); // destroys a transducer
77
+
78
+ // the analysis function returns the set of analyses for the string "s"
79
+ // in the argument "analyses"
80
+ void analyze_string( char *s, std::vector<CAnalysis > &analyses );
81
+
82
+ void compute_probs( std::vector<CAnalysis> &analyses, std::vector<double> &prob );
83
+ char *print_analysis( CAnalysis &ana );
84
+
85
+ // longest-match analysis
86
+ const char *longest_match( char*& );
87
+
88
+ // EM training
89
+ bool train2( char *s, std::vector<double> &arcfreq, std::vector<double> &finalfreq );
90
+ bool train( char *s, std::vector<double> &arcfreq, std::vector<double> &finalfreq );
91
+ void estimate_probs( std::vector<double> &arcfreq, std::vector<double> &finalfreq );
92
+
93
+ // robust analysis
94
+ float robust_analyze_string( char *string, std::vector<CAnalysis> &analyses,
95
+ float ErrorsAllowed );
96
+ };
97
+
98
+ #endif