ruby-sfst 0.4.3 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +1 -0
  3. data/COPYING +280 -0
  4. data/Gemfile +3 -0
  5. data/Gemfile.lock +54 -0
  6. data/README.md +1 -1
  7. data/Rakefile +9 -18
  8. data/bin/console +7 -0
  9. data/bin/setup +6 -0
  10. data/ext/sfst/alphabet.cc +879 -0
  11. data/ext/sfst/alphabet.h +302 -0
  12. data/ext/sfst/basic.cc +85 -0
  13. data/ext/{sfst_machine → sfst}/basic.h +7 -4
  14. data/ext/sfst/compact.cc +629 -0
  15. data/ext/sfst/compact.h +100 -0
  16. data/ext/sfst/determinise.cc +279 -0
  17. data/ext/{sfst_machine → sfst}/extconf.rb +2 -1
  18. data/ext/sfst/fst.cc +1150 -0
  19. data/ext/sfst/fst.h +374 -0
  20. data/ext/sfst/hopcroft.cc +681 -0
  21. data/ext/sfst/interface.cc +1921 -0
  22. data/ext/sfst/interface.h +171 -0
  23. data/ext/sfst/make-compact.cc +323 -0
  24. data/ext/{sfst_machine → sfst}/make-compact.h +15 -13
  25. data/ext/sfst/mem.h +80 -0
  26. data/ext/sfst/operators.cc +1273 -0
  27. data/ext/{sfst_machine → sfst}/sfst_machine.cc +89 -78
  28. data/ext/sfst/sgi.h +72 -0
  29. data/ext/sfst/utf8.cc +149 -0
  30. data/ext/{sfst_machine → sfst}/utf8.h +7 -4
  31. data/lib/sfst.rb +2 -1
  32. data/lib/sfst/version.rb +1 -1
  33. data/ruby-sfst.gemspec +23 -23
  34. metadata +107 -35
  35. data/ext/sfst_machine/alphabet.cc +0 -812
  36. data/ext/sfst_machine/alphabet.h +0 -273
  37. data/ext/sfst_machine/basic.cc +0 -84
  38. data/ext/sfst_machine/compact.cc +0 -616
  39. data/ext/sfst_machine/compact.h +0 -98
  40. data/ext/sfst_machine/determinise.cc +0 -303
  41. data/ext/sfst_machine/fst.cc +0 -1000
  42. data/ext/sfst_machine/fst.h +0 -369
  43. data/ext/sfst_machine/interface.cc +0 -1842
  44. data/ext/sfst_machine/interface.h +0 -93
  45. data/ext/sfst_machine/make-compact.cc +0 -327
  46. data/ext/sfst_machine/mem.h +0 -74
  47. data/ext/sfst_machine/operators.cc +0 -1131
  48. data/ext/sfst_machine/sgi.h +0 -44
  49. data/ext/sfst_machine/utf8.cc +0 -146
  50. data/test/test_sfst.fst +0 -3
  51. data/test/test_sfst.rb +0 -114
@@ -1,273 +0,0 @@
1
- /*******************************************************************/
2
- /* */
3
- /* FILE alphabet.h */
4
- /* MODULE alphabet */
5
- /* PROGRAM SFST */
6
- /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
7
- /* */
8
- /* PURPOSE finite state tools */
9
- /* */
10
- /*******************************************************************/
11
-
12
- #ifndef _ALPHABET_H_
13
- #define _ALPHABET_H_
14
-
15
- #include <stdio.h>
16
-
17
- #include "basic.h"
18
-
19
- #include <set>
20
- #include <vector>
21
-
22
- #include <iostream>
23
-
24
- #include <cstring>
25
-
26
- #include "sgi.h"
27
-
28
// Character is the integer type used for symbol codes. It defaults to
// unsigned short and can be widened by defining CODE_DATA_TYPE.
#ifndef CODE_DATA_TYPE
typedef unsigned short Character;
#else
typedef unsigned CODE_DATA_TYPE Character;
#endif

// Selects the layer of a label an operation refers to:
// the surface level (upper) or the analysis level (lower).
typedef enum {upper, lower} Level;

// String representing the empty symbol (usually "<>"); defined elsewhere.
extern char EpsilonString[];


/*****************  class Label  ***********************************/

// A Label is a pair of symbol codes: a lower (analysis) symbol and an
// upper (surface) symbol.
class Label {

 private:
  // storage for the two symbol codes
  struct {
    Character lower;
    Character upper;
  } label;

 public:
  // code of the empty symbol
  static const Character epsilon=0;

  // build a label whose two symbols are both c (epsilon by default)
  Label( Character c=epsilon ) {
    label.lower = c;
    label.upper = c;
  }

  // build a label with lower symbol c1 and upper symbol c2
  Label( Character c1, Character c2 ) {
    label.lower = c1;
    label.upper = c2;
  }

  // symbol on the requested level
  Character get_char( Level l ) const {
    if (l == upper)
      return label.upper;
    return label.lower;
  }

  // the "upper" symbol (i.e. the surface symbol)
  Character upper_char() const { return label.upper; }

  // the "lower" symbol (i.e. the analysis symbol)
  Character lower_char() const { return label.lower; }

  // copy of this label in which every occurrence of c is replaced by nc
  Label replace_char( Character c, Character nc ) const {
    const Character lo = (label.lower == c) ? nc : label.lower;
    const Character up = (label.upper == c) ? nc : label.upper;
    return Label(lo, up);
  }

  // two labels are equal if both symbols agree
  int operator==( Label l ) const {
    if (label.lower != l.label.lower)
      return 0;
    return (label.upper == l.label.upper);
  }

  int operator!=( Label l ) const {
    return !(*this == l);
  }

  // ordering used for sorting labels; only the upper symbols are compared
  int operator<( Label l ) const {
    return (label.upper < l.label.upper);
  }

  // true if both symbols are epsilon
  // (transitions with such labels are epsilon transitions)
  int is_epsilon() const {
    return (upper_is_epsilon() && lower_is_epsilon());
  }

  // true if the "upper" symbol is epsilon
  int upper_is_epsilon() const { return (label.upper == epsilon); }

  // true if the "lower" symbol is epsilon
  int lower_is_epsilon() const { return (label.lower == epsilon); }

  // hash function needed to store labels in a hash table
  struct label_hash {
    size_t operator() ( const Label l ) const {
      const size_t lo = (size_t)l.lower_char();
      const size_t up = (size_t)l.upper_char();
      return lo ^ (up << 16) ^ (up >> 16);
    }
  };

  // strict ordering of labels (lower symbol first, then upper symbol),
  // usable as the comparison of an ordered container
  struct label_cmp {
    bool operator() ( const Label l1, const Label l2 ) const {
      if (l1.lower_char() != l2.lower_char())
        return l1.lower_char() < l2.lower_char();
      return l1.upper_char() < l2.upper_char();
    }
  };

  // equality predicate needed to store labels in a hash table
  struct label_eq {
    bool operator() ( const Label l1, const Label l2 ) const {
      if (l1.lower_char() != l2.lower_char())
        return false;
      return l1.upper_char() == l2.upper_char();
    }
  };
};
132
-
133
- typedef std::vector<Label> Analysis;
134
-
135
-
136
- /***************** class Alphabet *******************************/
137
-
138
- class Alphabet {
139
-
140
- // string comparison operators needed to stored strings in a hash table
141
- struct eqstr {
142
- bool operator()(const char* s1, const char* s2) const {
143
- return strcmp(s1, s2) == 0;
144
- }
145
- };
146
-
147
- // data structure storing labels without repetitions (i.e. as a set)
148
- typedef std::set<Label, Label::label_cmp> LabelSet;
149
-
150
- // hash table used to map the symbols to their codes
151
- typedef hash_map<const char*, Character, hash<const char*>,eqstr> SymbolMap;
152
-
153
- // hash table used to map the codes back to the symbols
154
- typedef hash_map<Character, char*> CharMap;
155
-
156
- private:
157
- SymbolMap sm; // maps symbols to codes
158
-
159
- CharMap cm; // maps codes to symbols
160
- LabelSet ls; // set of labels known to the alphabet
161
-
162
- // add a new symbol with symbol code c
163
- void add( const char *symbol, Character c );
164
-
165
- public:
166
- bool utf8;
167
-
168
- // iterators over the set of known labels
169
- typedef LabelSet::iterator iterator;
170
- typedef LabelSet::const_iterator const_iterator;
171
- Alphabet();
172
- ~Alphabet() { clear(); };
173
- const_iterator begin() const { return ls.begin(); };
174
- const_iterator end() const { return ls.end(); };
175
- size_t size() const { return ls.size(); };
176
-
177
- void clear();
178
- void clear_char_pairs() { ls.clear(); };
179
-
180
- // lookup a label in the alphabet
181
- iterator find( Label l ) { return ls.find(l); };
182
-
183
- // insert a label in the alphabet
184
- void insert( Label l ) { if (!l.is_epsilon()) ls.insert(l); };
185
-
186
- // insert the known symbols from another alphabet
187
- void insert_symbols( const Alphabet& );
188
-
189
- // insert the labels and known symbols from another alphabet
190
- void copy( const Alphabet& );
191
-
192
- // create the alphabet of a transducer obtained by a composition operation
193
- void compose( const Alphabet &la, const Alphabet &ua );
194
-
195
- // add a symbol to the alphabet and return its code
196
- Character add_symbol(const char *symbol);
197
-
198
- // add a symbol to the alphabet with a given code
199
- void add_symbol(const char *symbol, Character c );
200
-
201
- // create a new marker symbol and return its code
202
- Character new_marker( void );
203
- void delete_markers();
204
-
205
- // compute the complement of a symbol set
206
- void complement( std::vector<Character> &sym );
207
-
208
- // return the code of the argument symbol
209
- int symbol2code( const char *s ) const {
210
- SymbolMap::const_iterator p = sm.find(s);
211
- if (p != sm.end()) return p->second;
212
- return EOF;
213
- };
214
-
215
- // return the symbol for the given symbol code
216
- const char *code2symbol( Character c ) const {
217
- CharMap::const_iterator p=cm.find(c);
218
- if (p == cm.end())
219
- return NULL;
220
- else
221
- return p->second;
222
- };
223
-
224
- // write the symbol for the given symbol code into a string
225
- void write_char( Character c, char *buffer, int *pos,
226
- bool with_brackets=true ) const;
227
-
228
- // write the symbol pair of a given label into a string
229
- void write_label( Label l, char *buffer, int *pos,
230
- bool with_brackets=true ) const;
231
-
232
- // write the symbol for the given symbol code into a buffer and return
233
- // a pointer to it
234
- // the flag "with_brackets" indicates whether the angle brackets
235
- // surrounding multi-character symbols are to be printed or not
236
- const char *write_char( Character c, bool with_brackets=true ) const;
237
-
238
- // write the symbol pair of a given label into a string
239
- // and return a pointer to it
240
- const char *write_label( Label l, bool with_brackets=true ) const;
241
-
242
- // scan the next multi-character symbol in the argument string
243
- int next_mcsym( char*&, bool insert=true );
244
-
245
- // scan the next symbol in the argument string
246
- int next_code( char*&, bool extended=true, bool insert=true );
247
-
248
- // convert a character string into a symbol or label sequence
249
- void string2symseq( char*, std::vector<Character>& );
250
- void string2labelseq( char*, std::vector<Label>& );
251
-
252
- // scan the next label in the argument string
253
- Label next_label( char*&, bool extended=true );
254
-
255
- // store the alphabet in the argument file (in binary form)
256
- void store( FILE* ) const;
257
-
258
- // read the alphabet from the argument file
259
- void read( FILE* );
260
-
261
- // disambiguation and printing of analyses
262
- int compute_score( Analysis &ana );
263
- void disambiguate( std::vector<Analysis> &analyses );
264
- char *print_analysis( Analysis &ana, bool both_layers );
265
-
266
- friend std::ostream &operator<<(std::ostream&, const Alphabet&);
267
- };
268
-
269
- // write the alphabet to the output stream (in readable form)
270
- std::ostream &operator<<(std::ostream&, const Alphabet&);
271
-
272
-
273
- #endif
@@ -1,84 +0,0 @@
1
-
2
- /*******************************************************************/
3
- /* */
4
- /* FILE basic.C */
5
- /* MODULE basic */
6
- /* PROGRAM SFST */
7
- /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
8
- /* */
9
- /* PURPOSE */
10
- /* */
11
- /*******************************************************************/
12
-
13
- #include <stdlib.h>
14
- #include <string.h>
15
-
16
- #include "basic.h"
17
-
18
- bool Switch_Bytes=false;
19
-
20
-
21
-
22
- /*******************************************************************/
23
- /* */
24
- /* fst_strdup */
25
- /* */
26
- /*******************************************************************/
27
-
28
// Returns a malloc()ed copy of the NUL-terminated string pString.
// If the allocation fails, an error message is printed to stderr and the
// program exits with status 1.
char* fst_strdup(const char* pString)

{
  const size_t bytes = strlen(pString) + 1;  // including the terminating NUL
  char* copy = (char*)malloc(bytes);
  if (copy != NULL) {
    memcpy(copy, pString, bytes);
    return copy;
  }

  fprintf(stderr, "\nError: out of memory (malloc failed)\naborted.\n");
  exit(1);
}
39
-
40
-
41
- /*******************************************************************/
42
- /* */
43
- /* read_string */
44
- /* */
45
- /*******************************************************************/
46
-
47
// Reads a NUL-terminated string from file into buffer, which has room for
// size bytes. The result in buffer is always NUL-terminated when size > 0.
//
// Returns 1 if a terminating NUL byte was read from the file.
// Returns 0 if the end of the file was reached first, if size bytes were
// read without finding a terminator (the last byte read is then overwritten
// by the terminator), or if size is not positive (nothing is read or
// written in that case).
int read_string( char *buffer, int size, FILE *file )

{
  // without this guard, buffer[size-1] below would write before the buffer
  if (size <= 0)
    return 0;

  for( int i=0; i<size; i++ ) {
    int c=fgetc(file);
    if (c == EOF || c == 0) {
      buffer[i] = 0;
      return (c==0);
    }
    buffer[i] = (char)c;
  }

  // buffer full and no terminator seen: truncate
  buffer[size-1] = 0;
  return 0;
}
61
-
62
-
63
- /*******************************************************************/
64
- /* */
65
- /* read_num */
66
- /* */
67
- /*******************************************************************/
68
-
69
- size_t read_num( void *p, size_t n, FILE *file )
70
-
71
- {
72
- char *pp=(char*)p;
73
- size_t result=fread( pp, 1, n, file );
74
- if (Switch_Bytes) {
75
- size_t e=n/2;
76
- for( size_t i=0; i<e; i++ ) {
77
- char tmp=pp[i];
78
- pp[i] = pp[--n];
79
- pp[n] = tmp;
80
- }
81
- }
82
- return result;
83
- }
84
-
@@ -1,616 +0,0 @@
1
- /*******************************************************************/
2
- /* */
3
- /* FILE compact.C */
4
- /* MODULE compact */
5
- /* PROGRAM SFST */
6
- /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
7
- /* */
8
- /* PURPOSE Code needed for analysing data */
9
- /* */
10
- /*******************************************************************/
11
-
12
- #include <stdio.h>
13
- #include <math.h>
14
-
15
- #include <limits.h>
16
-
17
- #include "compact.h"
18
-
19
- using std::equal_range;
20
- using std::vector;
21
- using std::pair;
22
-
23
- const int BUFFER_SIZE=1000;
24
-
25
-
26
- /*******************************************************************/
27
- /* */
28
- /* CompactTransducer::convert */
29
- /* */
30
- /*******************************************************************/
31
-
32
- void CompactTransducer::convert( CAnalysis &cana, Analysis &ana )
33
-
34
- {
35
- ana.resize(cana.size());
36
- for( size_t i=0; i<cana.size(); i++ )
37
- ana[i] = label[cana[i]];
38
- }
39
-
40
-
41
- /*******************************************************************/
42
- /* */
43
- /* CompactTransducer::analyze */
44
- /* */
45
- /*******************************************************************/
46
-
47
- void CompactTransducer::analyze(unsigned int n, vector<Character> &input,
48
- size_t ipos, CAnalysis &ca,
49
- vector<CAnalysis> &analyses )
50
- {
51
- // "n" is the number of the current transducer node/state
52
- // "input" is the sequence of input symbols
53
- // "ipos" is the input position currently analysed
54
- // "ca" stores the incomplete analysis string
55
- // "analyses" stores the analyses found so far
56
-
57
- if (analyses.size() > 10000)
58
- return; // limit the maximal number of analyses
59
-
60
- // Is the input string fully analyzed and the current node a final node?
61
- if (finalp[n] && ipos == input.size())
62
- // store the new analysis
63
- analyses.push_back(ca);
64
-
65
- // follow the epsilon transitions
66
- // first_arc[n] is the number of the first outgoing transition of node n
67
- // first_arc[n+1]-1 is the number of the last outgoing transition of node n
68
- // first_arc[n+1] is the number of the first outgoing transition of node n+1
69
- unsigned int i;
70
- for( i=first_arc[n];
71
- i<first_arc[n+1] && label[i].upper_char() == Label::epsilon;
72
- i++)
73
- {
74
- ca.push_back(i);
75
- analyze(target_node[i], input, ipos, ca, analyses);
76
- ca.pop_back();
77
- }
78
-
79
- // follow the non-epsilon transitions
80
-
81
- // scan the next input symbol
82
- if (ipos < input.size()) {
83
- // find the set of arcs with matching upper character in the sorted list
84
- pair<Label*,Label*>range =
85
- equal_range(label+i, label+first_arc[n+1], Label(input[ipos]));
86
- unsigned int to = (unsigned int)(range.second - label);
87
-
88
- // follow the non-epsilon transitions
89
- for( i=range.first-label; i<to; i++) {
90
- ca.push_back(i);
91
- analyze(target_node[i], input, ipos+1, ca, analyses);
92
- ca.pop_back();
93
- }
94
- }
95
- }
96
-
97
-
98
- /*******************************************************************/
99
- /* */
100
- /* CompactTransducer::analyze_string */
101
- /* */
102
- /*******************************************************************/
103
-
104
- void CompactTransducer::analyze_string( char *s, vector<CAnalysis> &analyses )
105
-
106
- {
107
- // "s" input string to be analyzed
108
- // "analyses" is the data structure in which the results are stored
109
- // and returned
110
-
111
- vector<Character> input;
112
- alphabet.string2symseq( s, input );
113
-
114
- analyses.clear();
115
- CAnalysis ca; // data structure where the current incomplete analysis
116
- // is stored
117
- analyze(0, input, 0, ca, analyses); // start the analysis
118
-
119
- if (analyses.size() > 10000)
120
- fprintf(stderr,"Warning: Only the first 10000 analyses considered for \"%s\"!\n", s);
121
-
122
- if (simplest_only && analyses.size() > 1)
123
- disambiguate( analyses ); // select the simplest analyses
124
- }
125
-
126
-
127
-
128
- /*******************************************************************/
129
- /* */
130
- /* CompactTransducer::~CompactTransducer */
131
- /* */
132
- /*******************************************************************/
133
-
134
- CompactTransducer::~CompactTransducer()
135
-
136
- {
137
- delete[] finalp;
138
- delete[] first_arc;
139
- delete[] label;
140
- delete[] target_node;
141
- delete[] final_logprob;
142
- delete[] arc_logprob;
143
- }
144
-
145
-
146
- /*******************************************************************/
147
- /* */
148
- /* CompactTransducer::CompactTransducer */
149
- /* */
150
- /*******************************************************************/
151
-
152
- CompactTransducer::CompactTransducer()
153
-
154
- {
155
- both_layers = false;
156
- simplest_only = false;
157
- number_of_nodes = 0;
158
- number_of_arcs = 0;
159
- finalp = NULL;
160
- first_arc = NULL;
161
- label = NULL;
162
- target_node = NULL;
163
- arc_logprob = final_logprob = (float*)NULL;
164
- }
165
-
166
-
167
- /*******************************************************************/
168
- /* */
169
- /* CompactTransducer::read_finalp */
170
- /* */
171
- /*******************************************************************/
172
-
173
- void CompactTransducer::read_finalp( FILE *file )
174
-
175
- {
176
- int k=0;
177
- unsigned char n=0;
178
- for( size_t i=0; i<number_of_nodes; i++ ) {
179
- if (k == 0) {
180
- n = (unsigned char)fgetc(file);
181
- k = 8;
182
- }
183
- k--;
184
- if (n & (1 << k))
185
- finalp[i] = 1;
186
- else
187
- finalp[i] = 0;
188
- }
189
- }
190
-
191
-
192
- /*******************************************************************/
193
- /* */
194
- /* CompactTransducer::read_first_arcs */
195
- /* */
196
- /*******************************************************************/
197
-
198
- void CompactTransducer::read_first_arcs( FILE *file )
199
-
200
- {
201
- int k=0;
202
- unsigned int n=0;
203
- size_t bits=(size_t)ceil(log(number_of_arcs+1)/log(2));
204
-
205
- for( size_t i=0; i<=number_of_nodes; i++ ) {
206
- first_arc[i] = n >> (sizeof(n)*8 - bits);
207
- n <<= bits;
208
- k -= bits;
209
- if (k < 0) {
210
- read_num(&n,sizeof(n),file);
211
- first_arc[i] |= n >> (sizeof(n)*8 + k);
212
- n <<= -k;
213
- k += sizeof(n) * 8;
214
- }
215
- }
216
- }
217
-
218
-
219
- /*******************************************************************/
220
- /* */
221
- /* CompactTransducer::read_target_nodes */
222
- /* */
223
- /*******************************************************************/
224
-
225
- void CompactTransducer::read_target_nodes( FILE *file )
226
-
227
- {
228
- int k=0;
229
- unsigned int n=0;
230
- size_t bits=(size_t)ceil(log(number_of_nodes)/log(2));
231
-
232
- for( size_t i=0; i<number_of_arcs; i++ ) {
233
- target_node[i] = n >> (sizeof(n)*8 - bits);
234
- n <<= bits;
235
- k -= bits;
236
- if (k < 0) {
237
- read_num(&n,sizeof(n),file);
238
- target_node[i] |= n >> (sizeof(n)*8 + k);
239
- n <<= -k;
240
- k += sizeof(n) * 8;
241
- }
242
- }
243
- }
244
-
245
-
246
- /*******************************************************************/
247
- /* */
248
- /* CompactTransducer::read_labels */
249
- /* */
250
- /*******************************************************************/
251
-
252
- void CompactTransducer::read_labels( FILE *file )
253
-
254
- {
255
- size_t N=0;
256
- Label Num2Label[alphabet.size()];
257
- for( Alphabet::const_iterator it=alphabet.begin();
258
- it != alphabet.end(); it++ )
259
- {
260
- Label l=*it;
261
- Num2Label[N++] = l;
262
- }
263
-
264
- int k=0;
265
- unsigned int n=0;
266
- size_t bits=(size_t)ceil(log(alphabet.size())/log(2));
267
-
268
- for( size_t i=0; i<number_of_arcs; i++ ) {
269
- unsigned int l = n >> (sizeof(n)*8 - bits);
270
- n <<= bits;
271
- k -= bits;
272
- if (k < 0) {
273
- read_num(&n,sizeof(n),file);
274
- l |= n >> (sizeof(n)*8 + k);
275
- n <<= -k;
276
- k += sizeof(n) * 8;
277
- }
278
- label[i] = Num2Label[l];
279
- }
280
- }
281
-
282
-
283
- /*******************************************************************/
284
- /* */
285
- /* CompactTransducer::read_probs */
286
- /* */
287
- /*******************************************************************/
288
-
289
- void CompactTransducer::read_probs( FILE *file )
290
-
291
- {
292
- size_t n,m;
293
- fread(&n, sizeof(n), 1, file);
294
- if (fread(&m, sizeof(n), 1, file) != 1 ||
295
- n != node_count() || m != arc_count())
296
- {
297
- fprintf(stderr,"Error: incompatible probability file!\n");
298
- exit(1);
299
- }
300
- final_logprob = new float[n];
301
- arc_logprob = new float[m];
302
- fread(final_logprob, sizeof(float), n, file);
303
- if (fread(arc_logprob, sizeof(float), n, file) != n) {
304
- fprintf(stderr,"Error: in probability file!\n");
305
- exit(1);
306
- }
307
- }
308
-
309
-
310
- /*******************************************************************/
311
- /* */
312
- /* CompactTransducer::CompactTransducer */
313
- /* */
314
- /*******************************************************************/
315
-
316
- CompactTransducer::CompactTransducer( FILE *file, FILE *pfile )
317
-
318
- {
319
- both_layers = false;
320
- simplest_only = false;
321
-
322
- if (fgetc(file) != 'c')
323
- throw "Error: wrong file format (not a compact transducer)\n";
324
-
325
- alphabet.read(file);
326
-
327
- read_num(&number_of_nodes,sizeof(number_of_nodes),file);
328
- read_num(&number_of_arcs,sizeof(number_of_arcs),file);
329
-
330
- if (!ferror(file)) {
331
- // memory allocation
332
- finalp = new char[number_of_nodes];
333
- first_arc = new unsigned[number_of_nodes+1];
334
- label = new Label[number_of_arcs];
335
- target_node = new unsigned[number_of_arcs];
336
-
337
- // reading the data
338
- read_finalp(file);
339
- read_first_arcs(file);
340
- read_labels(file);
341
- read_target_nodes(file);
342
- }
343
-
344
- if (pfile == NULL)
345
- arc_logprob = final_logprob = (float*)NULL;
346
- else
347
- read_probs(pfile);
348
- }
349
-
350
-
351
- /*******************************************************************/
352
- /* */
353
- /* CompactTransducer::longest_match2 */
354
- /* */
355
- /*******************************************************************/
356
-
357
- void CompactTransducer::longest_match2(unsigned int n, char *string, int l,
358
- CAnalysis &ca, int &bl, CAnalysis &ba)
359
- {
360
- // n: transducer state
361
- // string: rest string
362
- // l: length of current analysis
363
- // bl: length of the currently longest match
364
- // ca: current analysis
365
- // ba: best analysis
366
-
367
- if (finalp[n] && l > bl) {
368
- // store the new analysis
369
- bl = l;
370
- ba = ca; // copy the arc vector
371
- }
372
-
373
- // follow the epsilon transitions
374
- unsigned int i;
375
- for( i=first_arc[n];
376
- i<first_arc[n+1] && label[i].upper_char() == Label::epsilon;
377
- i++)
378
- {
379
- ca.push_back(i);
380
- longest_match2(target_node[i], string, l, ca, bl, ba);
381
- ca.pop_back();
382
- }
383
-
384
- // follow the non-epsilon transitions
385
- char *end=string;
386
- int c=alphabet.next_code(end, false, false);
387
- l += end-string;
388
- if (c != EOF) {
389
- // find the set of arcs with matching upper character in the sort list
390
- pair<Label*,Label*>range =
391
- equal_range(label+i, label+first_arc[n+1], Label((Character)c));
392
- unsigned int to = (unsigned int)(range.second - label);
393
- for( i=range.first-label; i<to; i++) {
394
- ca.push_back(i);
395
- longest_match2(target_node[i], end, l, ca, bl, ba);
396
- ca.pop_back();
397
- }
398
- }
399
- }
400
-
401
-
402
- /*******************************************************************/
403
- /* */
404
- /* CompactTransducer::print_analysis */
405
- /* */
406
- /*******************************************************************/
407
-
408
- char *CompactTransducer::print_analysis( CAnalysis &cana )
409
-
410
- {
411
- Analysis ana;
412
- convert(cana, ana);
413
- return alphabet.print_analysis( ana, both_layers );
414
- }
415
-
416
-
417
- /*******************************************************************/
418
- /* */
419
- /* CompactTransducer::longest_match */
420
- /* */
421
- /*******************************************************************/
422
-
423
- const char *CompactTransducer::longest_match( char* &string )
424
-
425
- {
426
- vector<char> analysis;
427
- CAnalysis ca, ba;
428
- int l=0;
429
- longest_match2(0, string, 0, ca, l, ba);
430
-
431
- // no match? return the next character
432
- if (ba.size() == 0) {
433
- int c=alphabet.next_code(string, false, false);
434
- return alphabet.code2symbol(c);
435
- }
436
-
437
- string += l;
438
- return print_analysis( ba );
439
- }
440
-
441
-
442
- /*******************************************************************/
443
- /* */
444
- /* CompactTransducer::disambiguate */
445
- /* */
446
- /*******************************************************************/
447
-
448
- void CompactTransducer::disambiguate( vector<CAnalysis> &analyses )
449
-
450
- {
451
- // compute the scores
452
- int bestscore=INT_MIN;
453
- vector<int> score;
454
- Analysis ana;
455
-
456
- for( size_t i=0; i<analyses.size(); i++ ) {
457
- convert(analyses[i], ana);
458
- score.push_back(alphabet.compute_score(ana));
459
- if (bestscore < score[i])
460
- bestscore = score[i];
461
- }
462
-
463
- // delete suboptimal analyses
464
- size_t k=0;
465
- for( size_t i=0; i<analyses.size(); i++ )
466
- if (score[i] == bestscore)
467
- analyses[k++] = analyses[i];
468
- analyses.resize(k);
469
- }
470
-
471
-
472
- /*******************************************************************/
473
- /* */
474
- /* CompactTransducer::train2 */
475
- /* */
476
- /*******************************************************************/
477
-
478
- bool CompactTransducer::train2( char *s, vector<double> &arcfreq,
479
- vector<double> &finalfreq )
480
- {
481
- vector<CAnalysis> analyses;
482
- vector<Label> input;
483
- alphabet.string2labelseq( s, input );
484
-
485
- CAnalysis ca; // data structure where the analysis is stored
486
- unsigned int n=0;
487
- bool failure=false;
488
- for( size_t i=0; i<input.size(); i++ ) {
489
- failure = true;
490
- for( unsigned int k=first_arc[n]; k<first_arc[n+1]; k++) {
491
- if (label[k] == input[i]) {
492
- ca.push_back(k);
493
- n = target_node[k];
494
- failure = false;
495
- break;
496
- }
497
- }
498
- if (failure)
499
- break;
500
- }
501
- if (failure || !finalp[n]) {
502
- fprintf(stderr,"Warning: The following input is not covered:\n%s\n", s);
503
- return false;
504
- }
505
-
506
- for( size_t k=0; k<ca.size(); k++ )
507
- arcfreq[ca[k]]++;
508
- finalfreq[target_node[ca.back()]]++;
509
-
510
- return true;
511
- }
512
-
513
-
514
- /*******************************************************************/
515
- /* */
516
- /* CompactTransducer::train */
517
- /* */
518
- /*******************************************************************/
519
-
520
- bool CompactTransducer::train( char *s, vector<double> &arcfreq,
521
- vector<double> &finalfreq )
522
- {
523
- vector<CAnalysis> analyses;
524
- vector<Character> input;
525
- alphabet.string2symseq( s, input );
526
-
527
- CAnalysis ca; // data structure where the current incomplete analysis
528
- // is stored
529
- analyze(0, input, 0, ca, analyses); // start the analysis
530
-
531
- if (analyses.size() > 10000)
532
- return true; // ignore inputs with more than 10000 analyses
533
- else if (analyses.size() == 0)
534
- return false;
535
-
536
- if (simplest_only && analyses.size() > 1)
537
- disambiguate( analyses ); // select the simplest analyses
538
-
539
- if (analyses.size() > 0) {
540
- double incr = 1.0 / analyses.size();
541
- CAnalysis arcs;
542
-
543
- for( size_t i=0; i<analyses.size(); i++ ) {
544
- CAnalysis &arcs=analyses[i];
545
- for( size_t k=0; k<arcs.size(); k++ )
546
- arcfreq[arcs[k]] += incr;
547
- finalfreq[target_node[arcs.back()]] += incr;
548
- }
549
- }
550
- return true;
551
- }
552
-
553
-
554
- /*******************************************************************/
555
- /* */
556
- /* CompactTransducer::estimate_probs */
557
- /* */
558
- /*******************************************************************/
559
-
560
- void CompactTransducer::estimate_probs( vector<double> &arcfreq,
561
- vector<double> &finalfreq )
562
- {
563
- // turn frequencies into probabilities
564
- for( size_t n=0; n<finalfreq.size(); n++ ) {
565
- double sum = finalfreq[n];
566
- for( size_t a=first_arc[n]; a<first_arc[n+1]; a++ )
567
- sum += arcfreq[a];
568
- if (sum == 0.0)
569
- sum = 1.0;
570
- finalfreq[n] = finalfreq[n] / sum;
571
- for( size_t a=first_arc[n]; a<first_arc[n+1]; a++ )
572
- arcfreq[a] = arcfreq[a] / sum;
573
- }
574
- }
575
-
576
-
577
-
578
- /*******************************************************************/
579
- /* */
580
- /* CompactTransducer::compute_probs */
581
- /* */
582
- /*******************************************************************/
583
-
584
- void CompactTransducer::compute_probs( vector<CAnalysis> &analyses,
585
- vector<double> &prob )
586
- {
587
- prob.resize(analyses.size());
588
- double sum=0.0;
589
- for( size_t i=0; i<analyses.size(); i++ ) {
590
- CAnalysis &a=analyses[i];
591
-
592
- // compute the probability
593
- double logprob=0.0;
594
- for( size_t k=0; k<a.size(); k++ )
595
- logprob += arc_logprob[a[k]];
596
- logprob += final_logprob[target_node[a.back()]];
597
- prob[i] = exp(logprob);
598
- sum += prob[i];
599
- }
600
-
601
- // sort the analyses
602
- vector<CAnalysis> oldanalyses(analyses);
603
- vector<double> oldprob(prob);
604
- for( size_t i=0; i<analyses.size(); i++ ) {
605
- prob[i] = -1.0;
606
- int n=0;
607
- for( size_t k=0; k<oldanalyses.size(); k++ )
608
- if (prob[i] < oldprob[k]) {
609
- prob[i] = oldprob[k];
610
- n = k;
611
- }
612
- analyses[i] = oldanalyses[n];
613
- oldprob[n] = -1.0;
614
- prob[i] /= sum; // normalization
615
- }
616
- }