ruby-sfst 0.4.3 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +1 -0
  3. data/COPYING +280 -0
  4. data/Gemfile +3 -0
  5. data/Gemfile.lock +54 -0
  6. data/README.md +1 -1
  7. data/Rakefile +9 -18
  8. data/bin/console +7 -0
  9. data/bin/setup +6 -0
  10. data/ext/sfst/alphabet.cc +879 -0
  11. data/ext/sfst/alphabet.h +302 -0
  12. data/ext/sfst/basic.cc +85 -0
  13. data/ext/{sfst_machine → sfst}/basic.h +7 -4
  14. data/ext/sfst/compact.cc +629 -0
  15. data/ext/sfst/compact.h +100 -0
  16. data/ext/sfst/determinise.cc +279 -0
  17. data/ext/{sfst_machine → sfst}/extconf.rb +2 -1
  18. data/ext/sfst/fst.cc +1150 -0
  19. data/ext/sfst/fst.h +374 -0
  20. data/ext/sfst/hopcroft.cc +681 -0
  21. data/ext/sfst/interface.cc +1921 -0
  22. data/ext/sfst/interface.h +171 -0
  23. data/ext/sfst/make-compact.cc +323 -0
  24. data/ext/{sfst_machine → sfst}/make-compact.h +15 -13
  25. data/ext/sfst/mem.h +80 -0
  26. data/ext/sfst/operators.cc +1273 -0
  27. data/ext/{sfst_machine → sfst}/sfst_machine.cc +89 -78
  28. data/ext/sfst/sgi.h +72 -0
  29. data/ext/sfst/utf8.cc +149 -0
  30. data/ext/{sfst_machine → sfst}/utf8.h +7 -4
  31. data/lib/sfst.rb +2 -1
  32. data/lib/sfst/version.rb +1 -1
  33. data/ruby-sfst.gemspec +23 -23
  34. metadata +107 -35
  35. data/ext/sfst_machine/alphabet.cc +0 -812
  36. data/ext/sfst_machine/alphabet.h +0 -273
  37. data/ext/sfst_machine/basic.cc +0 -84
  38. data/ext/sfst_machine/compact.cc +0 -616
  39. data/ext/sfst_machine/compact.h +0 -98
  40. data/ext/sfst_machine/determinise.cc +0 -303
  41. data/ext/sfst_machine/fst.cc +0 -1000
  42. data/ext/sfst_machine/fst.h +0 -369
  43. data/ext/sfst_machine/interface.cc +0 -1842
  44. data/ext/sfst_machine/interface.h +0 -93
  45. data/ext/sfst_machine/make-compact.cc +0 -327
  46. data/ext/sfst_machine/mem.h +0 -74
  47. data/ext/sfst_machine/operators.cc +0 -1131
  48. data/ext/sfst_machine/sgi.h +0 -44
  49. data/ext/sfst_machine/utf8.cc +0 -146
  50. data/test/test_sfst.fst +0 -3
  51. data/test/test_sfst.rb +0 -114
@@ -0,0 +1,302 @@
1
+ /*******************************************************************/
2
+ /* */
3
+ /* FILE alphabet.h */
4
+ /* MODULE alphabet */
5
+ /* PROGRAM SFST */
6
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
7
+ /* */
8
+ /* PURPOSE finite state tools */
9
+ /* */
10
+ /*******************************************************************/
11
+
12
+ #ifndef _ALPHABET_H_
13
+ #define _ALPHABET_H_
14
+
15
+ #include <stdio.h>
16
+
17
+ #include "basic.h"
18
+
19
+ #include <set>
20
+ using std::set;
21
+
22
+ #include <vector>
23
+ using std::vector;
24
+
25
+ #include <iostream>
26
+ using std::ostream;
27
+
28
+ #include <cstring>
29
+
30
+ #include "sgi.h"
31
+
32
+ #define SFSTVersion "1.4.7a"
33
+
34
+ namespace SFST {
35
+
36
// Integer type holding a symbol code. It is "unsigned short" unless the
// macro CODE_DATA_TYPE names another integer type, in which case the
// unsigned variant of that type is used.
#ifdef CODE_DATA_TYPE
typedef unsigned CODE_DATA_TYPE Character;
#else
typedef unsigned short Character;  // data type of the symbol codes
#endif

// Selects whether an action applies to the analysis level (lower),
// to the surface level (upper), or to both levels.
enum Level { upper, lower, both };
45
+
46
+
47
+ /***************** class Label ***********************************/
48
+
49
+ class Label {
50
+
51
+ private:
52
+ // data structure where the two symbols are stored
53
+ struct {
54
+ Character lower;
55
+ Character upper;
56
+ } label;
57
+
58
+ public:
59
+ static const Character epsilon=0; // code of the empty symbol
60
+
61
+ // new label with two identical symbols
62
+ Label( Character c=epsilon ) { label.lower = label.upper = c; };
63
+
64
+ // new label with two different symbols
65
+ Label( Character c1, Character c2 )
66
+ { label.lower = c1; label.upper = c2; };
67
+
68
+ // returns the indicated symbol of the label
69
+ Character get_char( Level l ) const
70
+ { return ((l==upper)? label.upper: label.lower); };
71
+
72
+ // returns the "upper" symbol of the label (i.e. the surface symbol)
73
+ Character upper_char() const { return label.upper; };
74
+
75
+ // returns the "lower" symbol of the label (i.e. the analysis symbol)
76
+ Character lower_char() const { return label.lower; };
77
+
78
+ // replaces symbols in a label
79
+ Label replace_char( Character c, Character nc ) const {
80
+ Label l = *this;
81
+ if (l.label.lower == c)
82
+ l.label.lower = nc;
83
+ if (l.label.upper == c)
84
+ l.label.upper = nc;
85
+ return l;
86
+ };
87
+
88
+ // operators checking the equality of labels
89
+ int operator==( Label l ) const
90
+ { return (label.lower==l.label.lower && label.upper==l.label.upper); };
91
+ int operator!=( Label l ) const
92
+ { return !(l == *this); };
93
+
94
+ // comparison operator needed for sorting labels in compact.C
95
+ int operator<( Label l ) const {
96
+ if (upper_char() < l.upper_char())
97
+ return true;
98
+ if (upper_char() > l.upper_char())
99
+ return false;
100
+ if (lower_char() < l.lower_char())
101
+ return true;
102
+ return false;
103
+ };
104
+ int operator>( Label l ) const {
105
+ if (upper_char() > l.upper_char())
106
+ return true;
107
+ if (upper_char() < l.upper_char())
108
+ return false;
109
+ if (lower_char() > l.lower_char())
110
+ return true;
111
+ return false;
112
+ };
113
+
114
+ // check whether the label is epsilon (i.e. both symbols are epsilon)
115
+ // transitions with epsilon labels are epsilon transitions
116
+ int is_epsilon() const
117
+ { return (label.upper == epsilon && label.lower == epsilon); };
118
+
119
+ // check whether the "upper" symbol is epsilon
120
+ int upper_is_epsilon() const
121
+ { return (label.upper == epsilon); };
122
+
123
+ // check whether the "lower" symbol is epsilon
124
+ int lower_is_epsilon() const
125
+ { return (label.lower == epsilon); };
126
+
127
+ // hash function needed to store labels in a hash table
128
+ struct label_hash {
129
+ size_t operator() ( const Label l ) const {
130
+ return (size_t)l.lower_char() ^
131
+ ((size_t)l.upper_char() << 16) ^
132
+ ((size_t)l.upper_char() >> 16);
133
+ }
134
+ };
135
+
136
+ // comparison function needed to store labels in a map table
137
+ struct label_cmp {
138
+ bool operator() ( const Label l1, const Label l2 ) const {
139
+ return (l1.lower_char() < l2.lower_char() ||
140
+ (l1.lower_char() == l2.lower_char() &&
141
+ l1.upper_char() < l2.upper_char()));
142
+ }
143
+ };
144
+
145
+ // comparison operator needed to store labels in a hash table
146
+ struct label_eq {
147
+ bool operator() ( const Label l1, const Label l2 ) const {
148
+ return (l1.lower_char() == l2.lower_char() &&
149
+ l1.upper_char() == l2.upper_char());
150
+ }
151
+ };
152
+ };
153
+
154
+ typedef vector<Label> Analysis;
155
+
156
+
157
+ /***************** class Alphabet *******************************/
158
+
159
+ class Alphabet {
160
+
161
+ // string comparison operators needed to stored strings in a hash table
162
+ struct eqstr {
163
+ bool operator()(const char* s1, const char* s2) const {
164
+ return strcmp(s1, s2) == 0;
165
+ }
166
+ };
167
+
168
+ // data structure storing labels without repetitions (i.e. as a set)
169
+ typedef set<Label, Label::label_cmp> LabelSet;
170
+
171
+ // hash table used to map the symbols to their codes
172
+ typedef hash_map<const char*, Character, hash<const char*>,eqstr> SymbolMap;
173
+
174
+ public: // HFST addition
175
+ // hash table used to map the codes back to the symbols
176
+ typedef hash_map<Character, char*> CharMap;
177
+
178
+ // HFST addition
179
+ bool operator==(const Alphabet &alpha) const;
180
+
181
+ private:
182
+ SymbolMap sm; // maps symbols to codes
183
+ CharMap cm; // maps codes to symbols
184
+ LabelSet ls; // set of labels known to the alphabet
185
+
186
+ // add a new symbol with symbol code c
187
+ void add( const char *symbol, Character c );
188
+
189
+ public:
190
+ bool utf8;
191
+
192
+ // iterators over the set of known labels
193
+ typedef LabelSet::iterator iterator;
194
+ typedef LabelSet::const_iterator const_iterator;
195
+ Alphabet();
196
+ ~Alphabet() { clear(); };
197
+ const_iterator begin() const { return ls.begin(); };
198
+ const_iterator end() const { return ls.end(); };
199
+ size_t size() const { return ls.size(); };
200
+
201
+ // HFST additions
202
+ CharMap get_char_map(void) { return cm; };
203
+ void print(void);
204
+
205
+
206
+ void clear();
207
+ void clear_char_pairs() { ls.clear(); };
208
+
209
+ // lookup a label in the alphabet
210
+ iterator find( Label l ) { return ls.find(l); };
211
+
212
+ // insert a label in the alphabet
213
+ void insert( Label l ) { if (!l.is_epsilon()) ls.insert(l); };
214
+
215
+ // insert the known symbols from another alphabet
216
+ void insert_symbols( const Alphabet& );
217
+
218
+ // insert the labels and known symbols from another alphabet
219
+ void copy( const Alphabet &a, Level level=both );
220
+
221
+ // create the alphabet of a transducer obtained by a composition operation
222
+ void compose( const Alphabet &la, const Alphabet &ua );
223
+
224
+ // add a symbol to the alphabet and return its code
225
+ Character add_symbol(const char *symbol);
226
+
227
+ // add a symbol to the alphabet with a given code
228
+ void add_symbol(const char *symbol, Character c );
229
+
230
+ // create a new marker symbol and return its code
231
+ Character new_marker( void );
232
+ void delete_markers();
233
+
234
+ // compute the complement of a symbol set
235
+ void complement( vector<Character> &sym );
236
+
237
+ // return the code of the argument symbol
238
+ int symbol2code( const char *s ) const {
239
+ SymbolMap::const_iterator p = sm.find(s);
240
+ if (p != sm.end()) return p->second;
241
+ return EOF;
242
+ };
243
+
244
+ // return the symbol for the given symbol code
245
+ const char *code2symbol( Character c ) const {
246
+ CharMap::const_iterator p=cm.find(c);
247
+ if (p == cm.end())
248
+ return NULL;
249
+ else
250
+ return p->second;
251
+ };
252
+
253
+ // write the symbol for the given symbol code into a string
254
+ void write_char( Character c, char *buffer, int *pos,
255
+ bool with_brackets=true ) const;
256
+
257
+ // write the symbol pair of a given label into a string
258
+ void write_label( Label l, char *buffer, int *pos,
259
+ bool with_brackets=true ) const;
260
+
261
+ // write the symbol for the given symbol code into a buffer and return
262
+ // a pointer to it
263
+ // the flag "with_brackets" indicates whether the angle brackets
264
+ // surrounding multi-character symbols are to be printed or not
265
+ const char *write_char( Character c, bool with_brackets=true ) const;
266
+
267
+ // write the symbol pair of a given label into a string
268
+ // and return a pointer to it
269
+ const char *write_label( Label l, bool with_brackets=true ) const;
270
+
271
+ // scan the next multi-character symbol in the argument string
272
+ int next_mcsym( char*&, bool insert=true );
273
+
274
+ // scan the next symbol in the argument string
275
+ int next_code( char*&, bool extended=true, bool insert=true );
276
+
277
+ // convert a character string into a symbol or label sequence
278
+ void string2symseq( char*, vector<Character>& );
279
+ void string2labelseq( char*, vector<Label>& );
280
+
281
+ // scan the next label in the argument string
282
+ Label next_label( char*&, bool extended=true );
283
+
284
+ // store the alphabet in the argument file (in binary form)
285
+ void store( FILE* ) const;
286
+
287
+ // read the alphabet from the argument file
288
+ void read( FILE* );
289
+
290
+ // disambiguation and printing of analyses
291
+ int compute_score( Analysis &ana );
292
+ void disambiguate( vector<Analysis> &analyses );
293
+ char *print_analysis( Analysis &ana, bool both_layers );
294
+
295
+ friend ostream &operator<<(ostream&, const Alphabet&);
296
+ };
297
+
298
+ // write the alphabet to the output stream (in readable form)
299
+ ostream &operator<<(ostream&, const Alphabet&);
300
+ }
301
+
302
+ #endif
@@ -0,0 +1,85 @@
1
+
2
+ /*******************************************************************/
3
+ /* */
4
+ /* FILE basic.C */
5
+ /* MODULE basic */
6
+ /* PROGRAM SFST */
7
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
8
+ /* */
9
+ /* PURPOSE */
10
+ /* */
11
+ /*******************************************************************/
12
+
13
+ #include <stdlib.h>
14
+ #include <string.h>
15
+
16
+ #include "basic.h"
17
+
18
+ namespace SFST {
19
+
20
+ bool Switch_Bytes=false;
21
+
22
+
23
+ /*******************************************************************/
24
+ /* */
25
+ /* fst_strdup */
26
+ /* */
27
+ /*******************************************************************/
28
+
29
// Returns a malloc'ed copy of the NUL-terminated string pString; the caller
// releases it with free(). A NULL argument yields NULL instead of being
// passed to strlen(). If the allocation fails, an error message is printed
// to stderr and the process exits with status 1.
char* fst_strdup(const char* pString)

{
  if (pString == NULL)
    return NULL;

  // length including the terminating NUL, computed once and reused for the copy
  const size_t bytes = strlen(pString) + 1;
  char* pStringCopy = (char*)malloc(bytes);
  if (pStringCopy == NULL) {
    fprintf(stderr, "\nError: out of memory (malloc failed)\naborted.\n");
    exit(1);
  }
  memcpy(pStringCopy, pString, bytes);
  return pStringCopy;
}
40
+
41
+
42
+ /*******************************************************************/
43
+ /* */
44
+ /* read_string */
45
+ /* */
46
+ /*******************************************************************/
47
+
48
// Reads a NUL-terminated string from "file" into "buffer", which has room
// for "size" bytes. The buffer is always NUL-terminated on return when
// size > 0.
// Returns 1 if a terminating NUL byte was read; returns 0 if end of file
// (or a read error) came first, if the string did not fit into the buffer
// (it is then truncated to size-1 characters), or if size <= 0 (the buffer
// is then left untouched).
int read_string( char *buffer, int size, FILE *file )

{
  // without this guard, buffer[size-1] below would be written out of bounds
  if (size <= 0)
    return 0;

  for( int i=0; i<size; i++ ) {
    int c=fgetc(file);
    if (c == EOF || c == 0) {
      buffer[i] = 0;
      return (c==0);
    }
    buffer[i] = (char)c;
  }
  // buffer exhausted before the terminator was seen: truncate
  buffer[size-1] = 0;
  return 0;
}
62
+
63
+
64
+ /*******************************************************************/
65
+ /* */
66
+ /* read_num */
67
+ /* */
68
+ /*******************************************************************/
69
+
70
+ size_t read_num( void *p, size_t n, FILE *file )
71
+
72
+ {
73
+ char *pp=(char*)p;
74
+ size_t result=fread( pp, 1, n, file );
75
+ if (Switch_Bytes) {
76
+ size_t e=n/2;
77
+ for( size_t i=0; i<e; i++ ) {
78
+ char tmp=pp[i];
79
+ pp[i] = pp[--n];
80
+ pp[n] = tmp;
81
+ }
82
+ }
83
+ return result;
84
+ }
85
+ }
@@ -15,10 +15,13 @@
15
15
 
16
16
  #include <stdio.h>
17
17
 
18
- extern bool Switch_Bytes;
18
+ namespace SFST {
19
19
 
20
- char* fst_strdup(const char* pString);
21
- int read_string( char *buffer, int size, FILE *file );
22
- size_t read_num( void *p, size_t size, FILE *file );
20
+ extern bool Switch_Bytes;
23
21
 
22
+ char* fst_strdup(const char* pString);
23
+ int read_string( char *buffer, int size, FILE *file );
24
+ size_t read_num( void *p, size_t size, FILE *file );
25
+
26
+ }
24
27
  #endif
@@ -0,0 +1,629 @@
1
+ /*******************************************************************/
2
+ /* */
3
+ /* FILE compact.C */
4
+ /* MODULE compact */
5
+ /* PROGRAM SFST */
6
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
7
+ /* */
8
+ /* PURPOSE Code needed for analysing data */
9
+ /* */
10
+ /*******************************************************************/
11
+
12
+ #include <stdio.h>
13
+ #include <math.h>
14
+
15
+ #include <limits.h>
16
+
17
+ #include "compact.h"
18
+
19
+ namespace SFST {
20
+
21
+ using std::equal_range;
22
+ using std::vector;
23
+ using std::pair;
24
+
25
+ class label_less {
26
+ public:
27
+ bool operator()(const Label l1, const Label l2) const {
28
+ return l1.upper_char() < l2.upper_char();
29
+ }
30
+ };
31
+
32
+ const int BUFFER_SIZE=1000;
33
+
34
+
35
+ /*******************************************************************/
36
+ /* */
37
+ /* CompactTransducer::convert */
38
+ /* */
39
+ /*******************************************************************/
40
+
41
+ void CompactTransducer::convert( CAnalysis &cana, Analysis &ana )
42
+
43
+ {
44
+ ana.resize(cana.size());
45
+ for( size_t i=0; i<cana.size(); i++ )
46
+ ana[i] = label[cana[i]];
47
+ }
48
+
49
+
50
+ /*******************************************************************/
51
+ /* */
52
+ /* CompactTransducer::analyze */
53
+ /* */
54
+ /*******************************************************************/
55
+
56
+ void CompactTransducer::analyze(unsigned int n, vector<Character> &input,
57
+ size_t ipos, CAnalysis &ca,
58
+ vector<CAnalysis> &analyses )
59
+ {
60
+ // "n" is the number of the current transducer node/state
61
+ // "input" is the sequence of input symbols
62
+ // "ipos" is the input position currently analysed
63
+ // "ca" stores the incomplete analysis string
64
+ // "analyses" stores the analyses found so far
65
+
66
+ if (analyses.size() > 10000)
67
+ return; // limit the maximal number of analyses
68
+
69
+ // Is the input string fully analyzed and the current node a final node?
70
+ if (finalp[n] && ipos == input.size())
71
+ // store the new analysis
72
+ analyses.push_back(ca);
73
+
74
+ // follow the epsilon transitions
75
+ // first_arc[n] is the number of the first outgoing transition of node n
76
+ // first_arc[n+1]-1 is the number of the last outgoing transition of node n
77
+ // first_arc[n+1] is the number of the first outgoing transition of node n+1
78
+ unsigned int i;
79
+ for( i=first_arc[n];
80
+ i<first_arc[n+1] && label[i].upper_char() == Label::epsilon;
81
+ i++)
82
+ {
83
+ ca.push_back(i);
84
+ analyze(target_node[i], input, ipos, ca, analyses);
85
+ ca.pop_back();
86
+ }
87
+
88
+ // follow the non-epsilon transitions
89
+
90
+ // scan the next input symbol
91
+ if (ipos < input.size()) {
92
+ // find the set of arcs with matching upper character in the sorted list
93
+ pair<Label*,Label*>range =
94
+ equal_range(label+i, label+first_arc[n+1], Label(input[ipos]),
95
+ label_less());
96
+ unsigned int to = (unsigned int)(range.second - label);
97
+
98
+ // follow the non-epsilon transitions
99
+ for( i=(unsigned)(range.first-label); i<to; i++) {
100
+ ca.push_back(i);
101
+ analyze(target_node[i], input, ipos+1, ca, analyses);
102
+ ca.pop_back();
103
+ }
104
+ }
105
+ }
106
+
107
+
108
+ /*******************************************************************/
109
+ /* */
110
+ /* CompactTransducer::analyze_string */
111
+ /* */
112
+ /*******************************************************************/
113
+
114
+ void CompactTransducer::analyze_string( char *s, vector<CAnalysis> &analyses )
115
+
116
+ {
117
+ // "s" input string to be analyzed
118
+ // "analyses" is the data structure in which the results are stored
119
+ // and returned
120
+
121
+ vector<Character> input;
122
+
123
+ alphabet.string2symseq( s, input );
124
+
125
+ analyses.clear();
126
+ CAnalysis ca; // data structure where the current incomplete analysis
127
+ // is stored
128
+ analyze(0, input, 0, ca, analyses); // start the analysis
129
+
130
+ if (analyses.size() > 10000)
131
+ fprintf(stderr,"Warning: Only the first 10000 analyses considered for \"%s\"!\n", s);
132
+
133
+ if (simplest_only && analyses.size() > 1)
134
+ disambiguate( analyses ); // select the simplest analyses
135
+ }
136
+
137
+
138
+
139
+ /*******************************************************************/
140
+ /* */
141
+ /* CompactTransducer::~CompactTransducer */
142
+ /* */
143
+ /*******************************************************************/
144
+
145
+ CompactTransducer::~CompactTransducer()
146
+
147
+ {
148
+ delete[] finalp;
149
+ delete[] first_arc;
150
+ delete[] label;
151
+ delete[] target_node;
152
+ delete[] final_logprob;
153
+ delete[] arc_logprob;
154
+ }
155
+
156
+
157
+ /*******************************************************************/
158
+ /* */
159
+ /* CompactTransducer::CompactTransducer */
160
+ /* */
161
+ /*******************************************************************/
162
+
163
+ CompactTransducer::CompactTransducer()
164
+
165
+ {
166
+ both_layers = false;
167
+ simplest_only = false;
168
+ number_of_nodes = 0;
169
+ number_of_arcs = 0;
170
+ finalp = NULL;
171
+ first_arc = NULL;
172
+ label = NULL;
173
+ target_node = NULL;
174
+ arc_logprob = final_logprob = (float*)NULL;
175
+ }
176
+
177
+
178
+ /*******************************************************************/
179
+ /* */
180
+ /* CompactTransducer::read_finalp */
181
+ /* */
182
+ /*******************************************************************/
183
+
184
+ void CompactTransducer::read_finalp( FILE *file )
185
+
186
+ {
187
+ int k=0;
188
+ unsigned char n=0;
189
+ for( size_t i=0; i<number_of_nodes; i++ ) {
190
+ if (k == 0) {
191
+ n = (unsigned char)fgetc(file);
192
+ k = 8;
193
+ }
194
+ k--;
195
+ if (n & (1 << k))
196
+ finalp[i] = 1;
197
+ else
198
+ finalp[i] = 0;
199
+ }
200
+ }
201
+
202
+
203
+ /*******************************************************************/
204
+ /* */
205
+ /* CompactTransducer::read_first_arcs */
206
+ /* */
207
+ /*******************************************************************/
208
+
209
+ void CompactTransducer::read_first_arcs( FILE *file )
210
+
211
+ {
212
+ int k=0;
213
+ unsigned int n=0;
214
+ int bits=(int)ceil(log(number_of_arcs+1)/log(2));
215
+
216
+ for( size_t i=0; i<=number_of_nodes; i++ ) {
217
+ first_arc[i] = n >> (sizeof(n)*8 - bits);
218
+ n <<= bits;
219
+ k -= bits;
220
+ if (k < 0) {
221
+ read_num(&n,sizeof(n),file);
222
+ first_arc[i] |= n >> (sizeof(n)*8 + k);
223
+ n <<= -k;
224
+ k += (int)sizeof(n) * 8;
225
+ }
226
+ }
227
+ }
228
+
229
+
230
+ /*******************************************************************/
231
+ /* */
232
+ /* CompactTransducer::read_target_nodes */
233
+ /* */
234
+ /*******************************************************************/
235
+
236
+ void CompactTransducer::read_target_nodes( FILE *file )
237
+
238
+ {
239
+ int k=0;
240
+ unsigned int n=0;
241
+ int bits=(int)ceil(log(number_of_nodes)/log(2));
242
+
243
+ for( size_t i=0; i<number_of_arcs; i++ ) {
244
+ target_node[i] = n >> (sizeof(n)*8 - bits);
245
+ n <<= bits;
246
+ k -= bits;
247
+ if (k < 0) {
248
+ read_num(&n,sizeof(n),file);
249
+ target_node[i] |= n >> (sizeof(n)*8 + k);
250
+ n <<= -k;
251
+ k += (int)sizeof(n) * 8;
252
+ }
253
+ }
254
+ }
255
+
256
+
257
+ /*******************************************************************/
258
+ /* */
259
+ /* CompactTransducer::read_labels */
260
+ /* */
261
+ /*******************************************************************/
262
+
263
+ void CompactTransducer::read_labels( FILE *file )
264
+
265
+ {
266
+ size_t N=0;
267
+ vector<Label> Num2Label(alphabet.size());
268
+ for( Alphabet::const_iterator it=alphabet.begin();
269
+ it != alphabet.end(); it++ )
270
+ {
271
+ Label l=*it;
272
+ Num2Label[N++] = l;
273
+ }
274
+
275
+ int k=0;
276
+ unsigned int n=0;
277
+ int bits=(int)ceil(log((double)alphabet.size())/log(2));
278
+
279
+ for( size_t i=0; i<number_of_arcs; i++ ) {
280
+ unsigned int l = n >> (sizeof(n)*8 - bits);
281
+ n <<= bits;
282
+ k -= bits;
283
+ if (k < 0) {
284
+ read_num(&n,sizeof(n),file);
285
+ l |= n >> (sizeof(n)*8 + k);
286
+ n <<= -k;
287
+ k += (int)sizeof(n) * 8;
288
+ }
289
+ label[i] = Num2Label[l];
290
+ }
291
+ }
292
+
293
+
294
+ /*******************************************************************/
295
+ /* */
296
+ /* CompactTransducer::read_probs */
297
+ /* */
298
+ /*******************************************************************/
299
+
300
+ void CompactTransducer::read_probs( FILE *file )
301
+
302
+ {
303
+ size_t n,m;
304
+ fread(&n, sizeof(n), 1, file);
305
+ if (fread(&m, sizeof(n), 1, file) != 1 ||
306
+ n != node_count() || m != arc_count())
307
+ {
308
+ fprintf(stderr,"Error: incompatible probability file!\n");
309
+ exit(1);
310
+ }
311
+ final_logprob = new float[n];
312
+ arc_logprob = new float[m];
313
+ fread(final_logprob, sizeof(float), n, file);
314
+ if (fread(arc_logprob, sizeof(float), n, file) != n) {
315
+ fprintf(stderr,"Error: in probability file!\n");
316
+ exit(1);
317
+ }
318
+ }
319
+
320
+
321
+ /*******************************************************************/
322
+ /* */
323
+ /* CompactTransducer::CompactTransducer */
324
+ /* */
325
+ /*******************************************************************/
326
+
327
+ CompactTransducer::CompactTransducer( FILE *file, FILE *pfile )
328
+
329
+ {
330
+ both_layers = false;
331
+ simplest_only = false;
332
+
333
+ if (fgetc(file) != 'c')
334
+ throw "Error: wrong file format (not a compact transducer)\n";
335
+
336
+ alphabet.read(file);
337
+
338
+ read_num(&number_of_nodes,sizeof(number_of_nodes),file);
339
+ read_num(&number_of_arcs,sizeof(number_of_arcs),file);
340
+
341
+ if (!ferror(file)) {
342
+ // memory allocation
343
+ finalp = new char[number_of_nodes];
344
+ first_arc = new unsigned[number_of_nodes+1];
345
+ label = new Label[number_of_arcs];
346
+ target_node = new unsigned[number_of_arcs];
347
+
348
+ // reading the data
349
+ read_finalp(file);
350
+ read_first_arcs(file);
351
+ read_labels(file);
352
+ read_target_nodes(file);
353
+ }
354
+
355
+ if (pfile == NULL)
356
+ arc_logprob = final_logprob = (float*)NULL;
357
+ else
358
+ read_probs(pfile);
359
+ }
360
+
361
+
362
+ /*******************************************************************/
363
+ /* */
364
+ /* CompactTransducer::longest_match2 */
365
+ /* */
366
+ /*******************************************************************/
367
+
368
+ void CompactTransducer::longest_match2(unsigned int n, char *string, int l,
369
+ CAnalysis &ca, int &bl, CAnalysis &ba)
370
+ {
371
+ // n: transducer state
372
+ // string: rest string
373
+ // l: length of current analysis
374
+ // bl: length of the currently longest match
375
+ // ca: current analysis
376
+ // ba: best analysis
377
+
378
+ if (finalp[n] && l > bl) {
379
+ // store the new analysis
380
+ bl = l;
381
+ ba = ca; // copy the arc vector
382
+ }
383
+
384
+ // follow the epsilon transitions
385
+ unsigned int i;
386
+ for( i=first_arc[n];
387
+ i<first_arc[n+1] && label[i].upper_char() == Label::epsilon;
388
+ i++)
389
+ {
390
+ ca.push_back(i);
391
+ longest_match2(target_node[i], string, l, ca, bl, ba);
392
+ ca.pop_back();
393
+ }
394
+
395
+ // follow the non-epsilon transitions
396
+ char *end=string;
397
+ int c=alphabet.next_code(end, false, false);
398
+ l += (int)(end - string);
399
+ if (c != EOF) {
400
+ // find the set of arcs with matching upper character in the sort list
401
+ pair<Label*,Label*>range =
402
+ equal_range(label+i, label+first_arc[n+1], Label((Character)c),
403
+ label_less());
404
+ unsigned int to = (unsigned int)(range.second - label);
405
+ for( i=(unsigned)(range.first-label); i<to; i++) {
406
+ ca.push_back(i);
407
+ longest_match2(target_node[i], end, l, ca, bl, ba);
408
+ ca.pop_back();
409
+ }
410
+ }
411
+ }
412
+
413
+
414
+ /*******************************************************************/
415
+ /* */
416
+ /* CompactTransducer::print_analysis */
417
+ /* */
418
+ /*******************************************************************/
419
+
420
+ char *CompactTransducer::print_analysis( CAnalysis &cana )
421
+
422
+ {
423
+ Analysis ana;
424
+ convert(cana, ana);
425
+ return alphabet.print_analysis( ana, both_layers );
426
+ }
427
+
428
+
429
+ /*******************************************************************/
430
+ /* */
431
+ /* CompactTransducer::longest_match */
432
+ /* */
433
+ /*******************************************************************/
434
+
435
+ const char *CompactTransducer::longest_match( char* &string )
436
+
437
+ {
438
+ vector<char> analysis;
439
+ CAnalysis ca, ba;
440
+ int l=0;
441
+ longest_match2(0, string, 0, ca, l, ba);
442
+
443
+ // no match? return the next character
444
+ if (ba.size() == 0) {
445
+ int c=alphabet.next_code(string, false, false);
446
+ return alphabet.code2symbol((Character)c);
447
+ }
448
+
449
+ string += l;
450
+ return print_analysis( ba );
451
+ }
452
+
453
+
454
+ /*******************************************************************/
455
+ /* */
456
+ /* CompactTransducer::disambiguate */
457
+ /* */
458
+ /*******************************************************************/
459
+
460
+ void CompactTransducer::disambiguate( vector<CAnalysis> &analyses )
461
+
462
+ {
463
+ // compute the scores
464
+ int bestscore=INT_MIN;
465
+ vector<int> score;
466
+ Analysis ana;
467
+
468
+ for( size_t i=0; i<analyses.size(); i++ ) {
469
+ convert(analyses[i], ana);
470
+ score.push_back(alphabet.compute_score(ana));
471
+ if (bestscore < score[i])
472
+ bestscore = score[i];
473
+ }
474
+
475
+ // delete suboptimal analyses
476
+ size_t k=0;
477
+ for( size_t i=0; i<analyses.size(); i++ )
478
+ if (score[i] == bestscore)
479
+ analyses[k++] = analyses[i];
480
+ analyses.resize(k);
481
+ }
482
+
483
+
484
+ /*******************************************************************/
485
+ /* */
486
+ /* CompactTransducer::train2 */
487
+ /* */
488
+ /*******************************************************************/
489
+
490
+ bool CompactTransducer::train2( char *s, vector<double> &arcfreq,
491
+ vector<double> &finalfreq )
492
+ {
493
+ vector<CAnalysis> analyses;
494
+ vector<Label> input;
495
+ alphabet.string2labelseq( s, input );
496
+
497
+ CAnalysis ca; // data structure where the analysis is stored
498
+ unsigned int n=0;
499
+ bool failure=false;
500
+ for( size_t i=0; i<input.size(); i++ ) {
501
+ failure = true;
502
+ for( unsigned int k=first_arc[n]; k<first_arc[n+1]; k++) {
503
+ if (label[k] == input[i]) {
504
+ ca.push_back(k);
505
+ n = target_node[k];
506
+ failure = false;
507
+ break;
508
+ }
509
+ }
510
+ if (failure)
511
+ break;
512
+ }
513
+ if (failure || !finalp[n]) {
514
+ fprintf(stderr,"Warning: The following input is not covered:\n%s\n", s);
515
+ return false;
516
+ }
517
+
518
+ for( size_t k=0; k<ca.size(); k++ )
519
+ arcfreq[ca[k]]++;
520
+ finalfreq[target_node[ca.back()]]++;
521
+
522
+ return true;
523
+ }
524
+
525
+
526
+ /*******************************************************************/
527
+ /* */
528
+ /* CompactTransducer::train */
529
+ /* */
530
+ /*******************************************************************/
531
+
532
+ bool CompactTransducer::train( char *s, vector<double> &arcfreq,
533
+ vector<double> &finalfreq )
534
+ {
535
+ vector<CAnalysis> analyses;
536
+ vector<Character> input;
537
+ alphabet.string2symseq( s, input );
538
+
539
+ CAnalysis ca; // data structure where the current incomplete analysis
540
+ // is stored
541
+ analyze(0, input, 0, ca, analyses); // start the analysis
542
+
543
+ if (analyses.size() > 10000)
544
+ return true; // ignore inputs with more than 10000 analyses
545
+ else if (analyses.size() == 0)
546
+ return false;
547
+
548
+ if (simplest_only && analyses.size() > 1)
549
+ disambiguate( analyses ); // select the simplest analyses
550
+
551
+ if (analyses.size() > 0) {
552
+ double incr = 1.0 / (double)analyses.size();
553
+ CAnalysis arcs;
554
+
555
+ for( size_t i=0; i<analyses.size(); i++ ) {
556
+ CAnalysis &arcs=analyses[i];
557
+ for( size_t k=0; k<arcs.size(); k++ )
558
+ arcfreq[arcs[k]] += incr;
559
+ finalfreq[target_node[arcs.back()]] += incr;
560
+ }
561
+ }
562
+ return true;
563
+ }
564
+
565
+
566
+ /*******************************************************************/
567
+ /* */
568
+ /* CompactTransducer::estimate_probs */
569
+ /* */
570
+ /*******************************************************************/
571
+
572
+ void CompactTransducer::estimate_probs( vector<double> &arcfreq,
573
+ vector<double> &finalfreq )
574
+ {
575
+ // turn frequencies into probabilities
576
+ for( size_t n=0; n<finalfreq.size(); n++ ) {
577
+ double sum = finalfreq[n];
578
+ for( size_t a=first_arc[n]; a<first_arc[n+1]; a++ )
579
+ sum += arcfreq[a];
580
+ if (sum == 0.0)
581
+ sum = 1.0;
582
+ finalfreq[n] = finalfreq[n] / sum;
583
+ for( size_t a=first_arc[n]; a<first_arc[n+1]; a++ )
584
+ arcfreq[a] = arcfreq[a] / sum;
585
+ }
586
+ }
587
+
588
+
589
+
590
+ /*******************************************************************/
591
+ /* */
592
+ /* CompactTransducer::compute_probs */
593
+ /* */
594
+ /*******************************************************************/
595
+
596
+ void CompactTransducer::compute_probs( vector<CAnalysis> &analyses,
597
+ vector<double> &prob )
598
+ {
599
+ prob.resize(analyses.size());
600
+ double sum=0.0;
601
+ for( size_t i=0; i<analyses.size(); i++ ) {
602
+ CAnalysis &a=analyses[i];
603
+
604
+ // compute the probability
605
+ double logprob=0.0;
606
+ for( size_t k=0; k<a.size(); k++ )
607
+ logprob += arc_logprob[a[k]];
608
+ logprob += final_logprob[target_node[a.back()]];
609
+ prob[i] = exp(logprob);
610
+ sum += prob[i];
611
+ }
612
+
613
+ // sort the analyses
614
+ vector<CAnalysis> oldanalyses(analyses);
615
+ vector<double> oldprob(prob);
616
+ for( size_t i=0; i<analyses.size(); i++ ) {
617
+ prob[i] = -1.0;
618
+ size_t n=0;
619
+ for( size_t k=0; k<oldanalyses.size(); k++ )
620
+ if (prob[i] < oldprob[k]) {
621
+ prob[i] = oldprob[k];
622
+ n = k;
623
+ }
624
+ analyses[i] = oldanalyses[n];
625
+ oldprob[n] = -1.0;
626
+ prob[i] /= sum; // normalization
627
+ }
628
+ }
629
+ }