ruby-sfst 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,616 @@
1
+ /*******************************************************************/
2
+ /* */
3
+ /* FILE compact.C */
4
+ /* MODULE compact */
5
+ /* PROGRAM SFST */
6
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
7
+ /* */
8
+ /* PURPOSE Code needed for analysing data */
9
+ /* */
10
+ /*******************************************************************/
11
+
12
+ #include <stdio.h>
13
+ #include <math.h>
14
+
15
+ #include <limits.h>
16
+
17
+ #include "compact.h"
18
+
19
+ using std::equal_range;
20
+ using std::vector;
21
+ using std::pair;
22
+
23
+ const int BUFFER_SIZE=1000;
24
+
25
+
26
+ /*******************************************************************/
27
+ /* */
28
+ /* CompactTransducer::convert */
29
+ /* */
30
+ /*******************************************************************/
31
+
32
+ void CompactTransducer::convert( CAnalysis &cana, Analysis &ana )
33
+
34
+ {
35
+ ana.resize(cana.size());
36
+ for( size_t i=0; i<cana.size(); i++ )
37
+ ana[i] = label[cana[i]];
38
+ }
39
+
40
+
41
+ /*******************************************************************/
42
+ /* */
43
+ /* CompactTransducer::analyze */
44
+ /* */
45
+ /*******************************************************************/
46
+
47
+ void CompactTransducer::analyze(unsigned int n, vector<Character> &input,
48
+ size_t ipos, CAnalysis &ca,
49
+ vector<CAnalysis> &analyses )
50
+ {
51
+ // "n" is the number of the current transducer node/state
52
+ // "input" is the sequence of input symbols
53
+ // "ipos" is the input position currently analysed
54
+ // "ca" stores the incomplete analysis string
55
+ // "analyses" stores the analyses found so far
56
+
57
+ if (analyses.size() > 10000)
58
+ return; // limit the maximal number of analyses
59
+
60
+ // Is the input string fully analyzed and the current node a final node?
61
+ if (finalp[n] && ipos == input.size())
62
+ // store the new analysis
63
+ analyses.push_back(ca);
64
+
65
+ // follow the epsilon transitions
66
+ // first_arc[n] is the number of the first outgoing transition of node n
67
+ // first_arc[n+1]-1 is the number of the last outgoing transition of node n
68
+ // first_arc[n+1] is the number of the first outgoing transition of node n+1
69
+ unsigned int i;
70
+ for( i=first_arc[n];
71
+ i<first_arc[n+1] && label[i].upper_char() == Label::epsilon;
72
+ i++)
73
+ {
74
+ ca.push_back(i);
75
+ analyze(target_node[i], input, ipos, ca, analyses);
76
+ ca.pop_back();
77
+ }
78
+
79
+ // follow the non-epsilon transitions
80
+
81
+ // scan the next input symbol
82
+ if (ipos < input.size()) {
83
+ // find the set of arcs with matching upper character in the sorted list
84
+ pair<Label*,Label*>range =
85
+ equal_range(label+i, label+first_arc[n+1], Label(input[ipos]));
86
+ unsigned int to = (unsigned int)(range.second - label);
87
+
88
+ // follow the non-epsilon transitions
89
+ for( i=range.first-label; i<to; i++) {
90
+ ca.push_back(i);
91
+ analyze(target_node[i], input, ipos+1, ca, analyses);
92
+ ca.pop_back();
93
+ }
94
+ }
95
+ }
96
+
97
+
98
+ /*******************************************************************/
99
+ /* */
100
+ /* CompactTransducer::analyze_string */
101
+ /* */
102
+ /*******************************************************************/
103
+
104
+ void CompactTransducer::analyze_string( char *s, vector<CAnalysis> &analyses )
105
+
106
+ {
107
+ // "s" input string to be analyzed
108
+ // "analyses" is the data structure in which the results are stored
109
+ // and returned
110
+
111
+ vector<Character> input;
112
+ alphabet.string2symseq( s, input );
113
+
114
+ analyses.clear();
115
+ CAnalysis ca; // data structure where the current incomplete analysis
116
+ // is stored
117
+ analyze(0, input, 0, ca, analyses); // start the analysis
118
+
119
+ if (analyses.size() > 10000)
120
+ fprintf(stderr,"Warning: Only the first 10000 analyses considered for \"%s\"!\n", s);
121
+
122
+ if (simplest_only && analyses.size() > 1)
123
+ disambiguate( analyses ); // select the simplest analyses
124
+ }
125
+
126
+
127
+
128
+ /*******************************************************************/
129
+ /* */
130
+ /* CompactTransducer::~CompactTransducer */
131
+ /* */
132
+ /*******************************************************************/
133
+
134
+ CompactTransducer::~CompactTransducer()
135
+
136
+ {
137
+ delete[] finalp;
138
+ delete[] first_arc;
139
+ delete[] label;
140
+ delete[] target_node;
141
+ delete[] final_logprob;
142
+ delete[] arc_logprob;
143
+ }
144
+
145
+
146
+ /*******************************************************************/
147
+ /* */
148
+ /* CompactTransducer::CompactTransducer */
149
+ /* */
150
+ /*******************************************************************/
151
+
152
+ CompactTransducer::CompactTransducer()
153
+
154
+ {
155
+ both_layers = false;
156
+ simplest_only = false;
157
+ number_of_nodes = 0;
158
+ number_of_arcs = 0;
159
+ finalp = NULL;
160
+ first_arc = NULL;
161
+ label = NULL;
162
+ target_node = NULL;
163
+ arc_logprob = final_logprob = (float*)NULL;
164
+ }
165
+
166
+
167
+ /*******************************************************************/
168
+ /* */
169
+ /* CompactTransducer::read_finalp */
170
+ /* */
171
+ /*******************************************************************/
172
+
173
+ void CompactTransducer::read_finalp( FILE *file )
174
+
175
+ {
176
+ int k=0;
177
+ unsigned char n=0;
178
+ for( size_t i=0; i<number_of_nodes; i++ ) {
179
+ if (k == 0) {
180
+ n = (unsigned char)fgetc(file);
181
+ k = 8;
182
+ }
183
+ k--;
184
+ if (n & (1 << k))
185
+ finalp[i] = 1;
186
+ else
187
+ finalp[i] = 0;
188
+ }
189
+ }
190
+
191
+
192
+ /*******************************************************************/
193
+ /* */
194
+ /* CompactTransducer::read_first_arcs */
195
+ /* */
196
+ /*******************************************************************/
197
+
198
+ void CompactTransducer::read_first_arcs( FILE *file )
199
+
200
+ {
201
+ int k=0;
202
+ unsigned int n=0;
203
+ size_t bits=(size_t)ceil(log(number_of_arcs+1)/log(2));
204
+
205
+ for( size_t i=0; i<=number_of_nodes; i++ ) {
206
+ first_arc[i] = n >> (sizeof(n)*8 - bits);
207
+ n <<= bits;
208
+ k -= bits;
209
+ if (k < 0) {
210
+ read_num(&n,sizeof(n),file);
211
+ first_arc[i] |= n >> (sizeof(n)*8 + k);
212
+ n <<= -k;
213
+ k += sizeof(n) * 8;
214
+ }
215
+ }
216
+ }
217
+
218
+
219
+ /*******************************************************************/
220
+ /* */
221
+ /* CompactTransducer::read_target_nodes */
222
+ /* */
223
+ /*******************************************************************/
224
+
225
+ void CompactTransducer::read_target_nodes( FILE *file )
226
+
227
+ {
228
+ int k=0;
229
+ unsigned int n=0;
230
+ size_t bits=(size_t)ceil(log(number_of_nodes)/log(2));
231
+
232
+ for( size_t i=0; i<number_of_arcs; i++ ) {
233
+ target_node[i] = n >> (sizeof(n)*8 - bits);
234
+ n <<= bits;
235
+ k -= bits;
236
+ if (k < 0) {
237
+ read_num(&n,sizeof(n),file);
238
+ target_node[i] |= n >> (sizeof(n)*8 + k);
239
+ n <<= -k;
240
+ k += sizeof(n) * 8;
241
+ }
242
+ }
243
+ }
244
+
245
+
246
+ /*******************************************************************/
247
+ /* */
248
+ /* CompactTransducer::read_labels */
249
+ /* */
250
+ /*******************************************************************/
251
+
252
+ void CompactTransducer::read_labels( FILE *file )
253
+
254
+ {
255
+ size_t N=0;
256
+ Label Num2Label[alphabet.size()];
257
+ for( Alphabet::const_iterator it=alphabet.begin();
258
+ it != alphabet.end(); it++ )
259
+ {
260
+ Label l=*it;
261
+ Num2Label[N++] = l;
262
+ }
263
+
264
+ int k=0;
265
+ unsigned int n=0;
266
+ size_t bits=(size_t)ceil(log(alphabet.size())/log(2));
267
+
268
+ for( size_t i=0; i<number_of_arcs; i++ ) {
269
+ unsigned int l = n >> (sizeof(n)*8 - bits);
270
+ n <<= bits;
271
+ k -= bits;
272
+ if (k < 0) {
273
+ read_num(&n,sizeof(n),file);
274
+ l |= n >> (sizeof(n)*8 + k);
275
+ n <<= -k;
276
+ k += sizeof(n) * 8;
277
+ }
278
+ label[i] = Num2Label[l];
279
+ }
280
+ }
281
+
282
+
283
+ /*******************************************************************/
284
+ /* */
285
+ /* CompactTransducer::read_probs */
286
+ /* */
287
+ /*******************************************************************/
288
+
289
+ void CompactTransducer::read_probs( FILE *file )
290
+
291
+ {
292
+ size_t n,m;
293
+ fread(&n, sizeof(n), 1, file);
294
+ if (fread(&m, sizeof(n), 1, file) != 1 ||
295
+ n != node_count() || m != arc_count())
296
+ {
297
+ fprintf(stderr,"Error: incompatible probability file!\n");
298
+ exit(1);
299
+ }
300
+ final_logprob = new float[n];
301
+ arc_logprob = new float[m];
302
+ fread(final_logprob, sizeof(float), n, file);
303
+ if (fread(arc_logprob, sizeof(float), n, file) != n) {
304
+ fprintf(stderr,"Error: in probability file!\n");
305
+ exit(1);
306
+ }
307
+ }
308
+
309
+
310
+ /*******************************************************************/
311
+ /* */
312
+ /* CompactTransducer::CompactTransducer */
313
+ /* */
314
+ /*******************************************************************/
315
+
316
+ CompactTransducer::CompactTransducer( FILE *file, FILE *pfile )
317
+
318
+ {
319
+ both_layers = false;
320
+ simplest_only = false;
321
+
322
+ if (fgetc(file) != 'c')
323
+ throw "Error: wrong file format (not a compact transducer)\n";
324
+
325
+ alphabet.read(file);
326
+
327
+ read_num(&number_of_nodes,sizeof(number_of_nodes),file);
328
+ read_num(&number_of_arcs,sizeof(number_of_arcs),file);
329
+
330
+ if (!ferror(file)) {
331
+ // memory allocation
332
+ finalp = new char[number_of_nodes];
333
+ first_arc = new unsigned[number_of_nodes+1];
334
+ label = new Label[number_of_arcs];
335
+ target_node = new unsigned[number_of_arcs];
336
+
337
+ // reading the data
338
+ read_finalp(file);
339
+ read_first_arcs(file);
340
+ read_labels(file);
341
+ read_target_nodes(file);
342
+ }
343
+
344
+ if (pfile == NULL)
345
+ arc_logprob = final_logprob = (float*)NULL;
346
+ else
347
+ read_probs(pfile);
348
+ }
349
+
350
+
351
+ /*******************************************************************/
352
+ /* */
353
+ /* CompactTransducer::longest_match2 */
354
+ /* */
355
+ /*******************************************************************/
356
+
357
+ void CompactTransducer::longest_match2(unsigned int n, char *string, int l,
358
+ CAnalysis &ca, int &bl, CAnalysis &ba)
359
+ {
360
+ // n: transducer state
361
+ // string: rest string
362
+ // l: length of current analysis
363
+ // bl: length of the currently longest match
364
+ // ca: current analysis
365
+ // ba: best analysis
366
+
367
+ if (finalp[n] && l > bl) {
368
+ // store the new analysis
369
+ bl = l;
370
+ ba = ca; // copy the arc vector
371
+ }
372
+
373
+ // follow the epsilon transitions
374
+ unsigned int i;
375
+ for( i=first_arc[n];
376
+ i<first_arc[n+1] && label[i].upper_char() == Label::epsilon;
377
+ i++)
378
+ {
379
+ ca.push_back(i);
380
+ longest_match2(target_node[i], string, l, ca, bl, ba);
381
+ ca.pop_back();
382
+ }
383
+
384
+ // follow the non-epsilon transitions
385
+ char *end=string;
386
+ int c=alphabet.next_code(end, false);
387
+ l += end-string;
388
+ if (c != EOF) {
389
+ // find the set of arcs with matching upper character in the sort list
390
+ pair<Label*,Label*>range =
391
+ equal_range(label+i, label+first_arc[n+1], Label((Character)c));
392
+ unsigned int to = (unsigned int)(range.second - label);
393
+ for( i=range.first-label; i<to; i++) {
394
+ ca.push_back(i);
395
+ longest_match2(target_node[i], end, l, ca, bl, ba);
396
+ ca.pop_back();
397
+ }
398
+ }
399
+ }
400
+
401
+
402
+ /*******************************************************************/
403
+ /* */
404
+ /* CompactTransducer::print_analysis */
405
+ /* */
406
+ /*******************************************************************/
407
+
408
+ char *CompactTransducer::print_analysis( CAnalysis &cana )
409
+
410
+ {
411
+ Analysis ana;
412
+ convert(cana, ana);
413
+ return alphabet.print_analysis( ana, both_layers );
414
+ }
415
+
416
+
417
+ /*******************************************************************/
418
+ /* */
419
+ /* CompactTransducer::longest_match */
420
+ /* */
421
+ /*******************************************************************/
422
+
423
+ const char *CompactTransducer::longest_match( char* &string )
424
+
425
+ {
426
+ vector<char> analysis;
427
+ CAnalysis ca, ba;
428
+ int l=0;
429
+ longest_match2(0, string, 0, ca, l, ba);
430
+
431
+ // no match? return the next character
432
+ if (ba.size() == 0) {
433
+ int c=alphabet.next_code(string, false);
434
+ return alphabet.code2symbol(c);
435
+ }
436
+
437
+ string += l;
438
+ return print_analysis( ba );
439
+ }
440
+
441
+
442
+ /*******************************************************************/
443
+ /* */
444
+ /* CompactTransducer::disambiguate */
445
+ /* */
446
+ /*******************************************************************/
447
+
448
+ void CompactTransducer::disambiguate( vector<CAnalysis> &analyses )
449
+
450
+ {
451
+ // compute the scores
452
+ int bestscore=INT_MIN;
453
+ vector<int> score;
454
+ Analysis ana;
455
+
456
+ for( size_t i=0; i<analyses.size(); i++ ) {
457
+ convert(analyses[i], ana);
458
+ score.push_back(alphabet.compute_score(ana));
459
+ if (bestscore < score[i])
460
+ bestscore = score[i];
461
+ }
462
+
463
+ // delete suboptimal analyses
464
+ size_t k=0;
465
+ for( size_t i=0; i<analyses.size(); i++ )
466
+ if (score[i] == bestscore)
467
+ analyses[k++] = analyses[i];
468
+ analyses.resize(k);
469
+ }
470
+
471
+
472
+ /*******************************************************************/
473
+ /* */
474
+ /* CompactTransducer::train2 */
475
+ /* */
476
+ /*******************************************************************/
477
+
478
+ bool CompactTransducer::train2( char *s, vector<double> &arcfreq,
479
+ vector<double> &finalfreq )
480
+ {
481
+ vector<CAnalysis> analyses;
482
+ vector<Label> input;
483
+ alphabet.string2labelseq( s, input );
484
+
485
+ CAnalysis ca; // data structure where the analysis is stored
486
+ unsigned int n=0;
487
+ bool failure=false;
488
+ for( size_t i=0; i<input.size(); i++ ) {
489
+ failure = true;
490
+ for( unsigned int k=first_arc[n]; k<first_arc[n+1]; k++) {
491
+ if (label[k] == input[i]) {
492
+ ca.push_back(k);
493
+ n = target_node[k];
494
+ failure = false;
495
+ break;
496
+ }
497
+ }
498
+ if (failure)
499
+ break;
500
+ }
501
+ if (failure || !finalp[n]) {
502
+ fprintf(stderr,"Warning: The following input is not covered:\n%s\n", s);
503
+ return false;
504
+ }
505
+
506
+ for( size_t k=0; k<ca.size(); k++ )
507
+ arcfreq[ca[k]]++;
508
+ finalfreq[target_node[ca.back()]]++;
509
+
510
+ return true;
511
+ }
512
+
513
+
514
+ /*******************************************************************/
515
+ /* */
516
+ /* CompactTransducer::train */
517
+ /* */
518
+ /*******************************************************************/
519
+
520
+ bool CompactTransducer::train( char *s, vector<double> &arcfreq,
521
+ vector<double> &finalfreq )
522
+ {
523
+ vector<CAnalysis> analyses;
524
+ vector<Character> input;
525
+ alphabet.string2symseq( s, input );
526
+
527
+ CAnalysis ca; // data structure where the current incomplete analysis
528
+ // is stored
529
+ analyze(0, input, 0, ca, analyses); // start the analysis
530
+
531
+ if (analyses.size() > 10000)
532
+ return true; // ignore inputs with more than 10000 analyses
533
+ else if (analyses.size() == 0)
534
+ return false;
535
+
536
+ if (simplest_only && analyses.size() > 1)
537
+ disambiguate( analyses ); // select the simplest analyses
538
+
539
+ if (analyses.size() > 0) {
540
+ double incr = 1.0 / analyses.size();
541
+ CAnalysis arcs;
542
+
543
+ for( size_t i=0; i<analyses.size(); i++ ) {
544
+ CAnalysis &arcs=analyses[i];
545
+ for( size_t k=0; k<arcs.size(); k++ )
546
+ arcfreq[arcs[k]] += incr;
547
+ finalfreq[target_node[arcs.back()]] += incr;
548
+ }
549
+ }
550
+ return true;
551
+ }
552
+
553
+
554
+ /*******************************************************************/
555
+ /* */
556
+ /* CompactTransducer::estimate_probs */
557
+ /* */
558
+ /*******************************************************************/
559
+
560
+ void CompactTransducer::estimate_probs( vector<double> &arcfreq,
561
+ vector<double> &finalfreq )
562
+ {
563
+ // turn frequencies into probabilities
564
+ for( size_t n=0; n<finalfreq.size(); n++ ) {
565
+ double sum = finalfreq[n];
566
+ for( size_t a=first_arc[n]; a<first_arc[n+1]; a++ )
567
+ sum += arcfreq[a];
568
+ if (sum == 0.0)
569
+ sum = 1.0;
570
+ finalfreq[n] = finalfreq[n] / sum;
571
+ for( size_t a=first_arc[n]; a<first_arc[n+1]; a++ )
572
+ arcfreq[a] = arcfreq[a] / sum;
573
+ }
574
+ }
575
+
576
+
577
+
578
+ /*******************************************************************/
579
+ /* */
580
+ /* CompactTransducer::compute_probs */
581
+ /* */
582
+ /*******************************************************************/
583
+
584
+ void CompactTransducer::compute_probs( vector<CAnalysis> &analyses,
585
+ vector<double> &prob )
586
+ {
587
+ prob.resize(analyses.size());
588
+ double sum=0.0;
589
+ for( size_t i=0; i<analyses.size(); i++ ) {
590
+ CAnalysis &a=analyses[i];
591
+
592
+ // compute the probability
593
+ double logprob=0.0;
594
+ for( size_t k=0; k<a.size(); k++ )
595
+ logprob += arc_logprob[a[k]];
596
+ logprob += final_logprob[target_node[a.back()]];
597
+ prob[i] = exp(logprob);
598
+ sum += prob[i];
599
+ }
600
+
601
+ // sort the analyses
602
+ vector<CAnalysis> oldanalyses(analyses);
603
+ vector<double> oldprob(prob);
604
+ for( size_t i=0; i<analyses.size(); i++ ) {
605
+ prob[i] = -1.0;
606
+ int n=0;
607
+ for( size_t k=0; k<oldanalyses.size(); k++ )
608
+ if (prob[i] < oldprob[k]) {
609
+ prob[i] = oldprob[k];
610
+ n = k;
611
+ }
612
+ analyses[i] = oldanalyses[n];
613
+ oldprob[n] = -1.0;
614
+ prob[i] /= sum; // normalization
615
+ }
616
+ }
@@ -0,0 +1,98 @@
1
+ /*******************************************************************/
2
+ /* */
3
+ /* FILE compact.h */
4
+ /* MODULE compact */
5
+ /* PROGRAM SFST */
6
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
7
+ /* */
8
+ /* PURPOSE finite state tools */
9
+ /* */
10
+ /*******************************************************************/
11
+
12
+ #ifndef _COMPACT_H_
13
+ #define _COMPACT_H_
14
+
15
+ #include "alphabet.h"
16
+
17
+ #include <vector>
18
+
19
+ typedef std::vector<unsigned int> CAnalysis;
20
+
21
+ class CompactTransducer {
22
+
23
+ protected:
24
+
25
+ // the following data structures are used to store the nodes
26
+
27
+ unsigned int number_of_nodes; // number of nodes in the transducer
28
+ char *finalp; // finalp[i] is 1 if node i is final and 0 otherwise
29
+ unsigned int *first_arc; // first_arc[i] is the number of the first
30
+ // arc outgoing from node i
31
+
32
+ // the following data structures are used to store the transition arcs
33
+
34
+ unsigned int number_of_arcs; // total number of arcs in the transducer
35
+ Label *label; // the label (character pair) of arc i
36
+ unsigned int *target_node; // target node of arc i
37
+
38
+ // the following data structures are used to store the stochastic parameters
39
+ float *final_logprob;
40
+ float *arc_logprob;
41
+
42
+ // functions needed to read the transducer from a file
43
+
44
+ void read_finalp( FILE *file );
45
+ void read_first_arcs( FILE *file );
46
+ void read_target_nodes( FILE *file );
47
+ void read_labels( FILE *file );
48
+ void read_probs( FILE *file );
49
+
50
+ // functions needed to analyze data with the transducer
51
+
52
+ void analyze( unsigned int n, std::vector<Character> &ch, size_t ipos,
53
+ CAnalysis&, std::vector<CAnalysis>&);
54
+
55
+ // function selecting the simplest morphological analysis
56
+
57
+ int compute_score( CAnalysis &ana );
58
+ void disambiguate( std::vector<CAnalysis> &analyses );
59
+
60
+ // functions for longest-match analysis of input data
61
+
62
+ void longest_match2(unsigned int, char*, int, CAnalysis&, int&, CAnalysis&);
63
+
64
+ void convert( CAnalysis &cana, Analysis &ana );
65
+
66
+ public:
67
+ size_t node_count() { return number_of_nodes; };
68
+ size_t arc_count() { return number_of_arcs; };
69
+
70
+ bool both_layers; // print surface and analysis symbols
71
+ bool simplest_only; // print only the simplest analyses
72
+
73
+ Alphabet alphabet; // data structure which maps symbols to numeric codes
74
+ CompactTransducer(); // dummy constructor
75
+ CompactTransducer( FILE*, FILE *pfile=NULL ); // reads a (stochastic) transducer
76
+ ~CompactTransducer(); // destroys a transducer
77
+
78
+ // the analysis function returns the set of analyses for the string "s"
79
+ // in the argument "analyses"
80
+ void analyze_string( char *s, std::vector<CAnalysis > &analyses );
81
+
82
+ void compute_probs( std::vector<CAnalysis> &analyses, std::vector<double> &prob );
83
+ char *print_analysis( CAnalysis &ana );
84
+
85
+ // longest-match analysis
86
+ const char *longest_match( char*& );
87
+
88
+ // EM training
89
+ bool train2( char *s, std::vector<double> &arcfreq, std::vector<double> &finalfreq );
90
+ bool train( char *s, std::vector<double> &arcfreq, std::vector<double> &finalfreq );
91
+ void estimate_probs( std::vector<double> &arcfreq, std::vector<double> &finalfreq );
92
+
93
+ // robust analysis
94
+ float robust_analyze_string( char *string, std::vector<CAnalysis> &analyses,
95
+ float ErrorsAllowed );
96
+ };
97
+
98
+ #endif