ruby-sfst 0.4.3 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +1 -0
  3. data/COPYING +280 -0
  4. data/Gemfile +3 -0
  5. data/Gemfile.lock +54 -0
  6. data/README.md +1 -1
  7. data/Rakefile +9 -18
  8. data/bin/console +7 -0
  9. data/bin/setup +6 -0
  10. data/ext/sfst/alphabet.cc +879 -0
  11. data/ext/sfst/alphabet.h +302 -0
  12. data/ext/sfst/basic.cc +85 -0
  13. data/ext/{sfst_machine → sfst}/basic.h +7 -4
  14. data/ext/sfst/compact.cc +629 -0
  15. data/ext/sfst/compact.h +100 -0
  16. data/ext/sfst/determinise.cc +279 -0
  17. data/ext/{sfst_machine → sfst}/extconf.rb +2 -1
  18. data/ext/sfst/fst.cc +1150 -0
  19. data/ext/sfst/fst.h +374 -0
  20. data/ext/sfst/hopcroft.cc +681 -0
  21. data/ext/sfst/interface.cc +1921 -0
  22. data/ext/sfst/interface.h +171 -0
  23. data/ext/sfst/make-compact.cc +323 -0
  24. data/ext/{sfst_machine → sfst}/make-compact.h +15 -13
  25. data/ext/sfst/mem.h +80 -0
  26. data/ext/sfst/operators.cc +1273 -0
  27. data/ext/{sfst_machine → sfst}/sfst_machine.cc +89 -78
  28. data/ext/sfst/sgi.h +72 -0
  29. data/ext/sfst/utf8.cc +149 -0
  30. data/ext/{sfst_machine → sfst}/utf8.h +7 -4
  31. data/lib/sfst.rb +2 -1
  32. data/lib/sfst/version.rb +1 -1
  33. data/ruby-sfst.gemspec +23 -23
  34. metadata +107 -35
  35. data/ext/sfst_machine/alphabet.cc +0 -812
  36. data/ext/sfst_machine/alphabet.h +0 -273
  37. data/ext/sfst_machine/basic.cc +0 -84
  38. data/ext/sfst_machine/compact.cc +0 -616
  39. data/ext/sfst_machine/compact.h +0 -98
  40. data/ext/sfst_machine/determinise.cc +0 -303
  41. data/ext/sfst_machine/fst.cc +0 -1000
  42. data/ext/sfst_machine/fst.h +0 -369
  43. data/ext/sfst_machine/interface.cc +0 -1842
  44. data/ext/sfst_machine/interface.h +0 -93
  45. data/ext/sfst_machine/make-compact.cc +0 -327
  46. data/ext/sfst_machine/mem.h +0 -74
  47. data/ext/sfst_machine/operators.cc +0 -1131
  48. data/ext/sfst_machine/sgi.h +0 -44
  49. data/ext/sfst_machine/utf8.cc +0 -146
  50. data/test/test_sfst.fst +0 -3
  51. data/test/test_sfst.rb +0 -114
@@ -1,273 +0,0 @@
1
- /*******************************************************************/
2
- /* */
3
- /* FILE alphabet.h */
4
- /* MODULE alphabet */
5
- /* PROGRAM SFST */
6
- /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
7
- /* */
8
- /* PURPOSE finite state tools */
9
- /* */
10
- /*******************************************************************/
11
-
12
- #ifndef _ALPHABET_H_
13
- #define _ALPHABET_H_
14
-
15
- #include <stdio.h>
16
-
17
- #include "basic.h"
18
-
19
- #include <set>
20
- #include <vector>
21
-
22
- #include <iostream>
23
-
24
- #include <cstring>
25
-
26
- #include "sgi.h"
27
-
28
// Integer type holding a symbol code. It is "unsigned short" unless the
// build defines CODE_DATA_TYPE, in which case "unsigned CODE_DATA_TYPE"
// is used instead.
#ifdef CODE_DATA_TYPE
typedef unsigned CODE_DATA_TYPE Character;
#else
typedef unsigned short Character;
#endif

// Selects the side of a label on which an action is performed:
// the analysis level (lower) or the surface level (upper).
enum Level { upper, lower };

// Symbol that represents the empty string (usually "<>").
// Only declared here; the definition lives in another translation unit.
extern char EpsilonString[];
40
-
41
-
42
- /***************** class Label ***********************************/
43
-
44
- class Label {
45
-
46
- private:
47
- // data structure where the two symbols are stored
48
- struct {
49
- Character lower;
50
- Character upper;
51
- } label;
52
-
53
- public:
54
- static const Character epsilon=0; // code of the empty symbol
55
-
56
- // new label with two identical symbols
57
- Label( Character c=epsilon ) { label.lower = label.upper = c; };
58
-
59
- // new label with two different symbols
60
- Label( Character c1, Character c2 )
61
- { label.lower = c1; label.upper = c2; };
62
-
63
- // returns the indicated symbol of the label
64
- Character get_char( Level l ) const
65
- { return ((l==upper)? label.upper: label.lower); };
66
-
67
- // returns the "upper" symbol of the label (i.e. the surface symbol)
68
- Character upper_char() const { return label.upper; };
69
-
70
- // returns the "lower" symbol of the label (i.e. the analysis symbol)
71
- Character lower_char() const { return label.lower; };
72
-
73
- // replaces symbols in a label
74
- Label replace_char( Character c, Character nc ) const {
75
- Label l = *this;
76
- if (l.label.lower == c)
77
- l.label.lower = nc;
78
- if (l.label.upper == c)
79
- l.label.upper = nc;
80
- return l;
81
- };
82
-
83
- // operators checking the equality of labels
84
- int operator==( Label l ) const
85
- { return (label.lower==l.label.lower && label.upper==l.label.upper); };
86
- int operator!=( Label l ) const
87
- { return !(l == *this); };
88
-
89
- // comparison operator needed for sorting labels
90
- int operator<( Label l ) const {
91
- return (upper_char() < l.upper_char()); };
92
-
93
- // check whether the label is epsilon (i.e. both symbols are epsilon)
94
- // transitions with epsilon labels are epsilon transitions
95
- int is_epsilon() const
96
- { return (label.upper == epsilon && label.lower == epsilon); };
97
-
98
- // check whether the "upper" symbol is epsilon
99
- int upper_is_epsilon() const
100
- { return (label.upper == epsilon); };
101
-
102
- // check whether the "lower" symbol is epsilon
103
- int lower_is_epsilon() const
104
- { return (label.lower == epsilon); };
105
-
106
- // hash function needed to store labels in a hash table
107
- struct label_hash {
108
- size_t operator() ( const Label l ) const {
109
- return (size_t)l.lower_char() ^
110
- ((size_t)l.upper_char() << 16) ^
111
- ((size_t)l.upper_char() >> 16);
112
- }
113
- };
114
-
115
- // hash function needed to store labels in a hash table
116
- struct label_cmp {
117
- bool operator() ( const Label l1, const Label l2 ) const {
118
- return (l1.lower_char() < l2.lower_char() ||
119
- (l1.lower_char() == l2.lower_char() &&
120
- l1.upper_char() < l2.upper_char()));
121
- }
122
- };
123
-
124
- // comparison operator needed to store labels in a hash table
125
- struct label_eq {
126
- bool operator() ( const Label l1, const Label l2 ) const {
127
- return (l1.lower_char() == l2.lower_char() &&
128
- l1.upper_char() == l2.upper_char());
129
- }
130
- };
131
- };
132
-
133
- typedef std::vector<Label> Analysis;
134
-
135
-
136
- /***************** class Alphabet *******************************/
137
-
138
- class Alphabet {
139
-
140
- // string comparison operators needed to stored strings in a hash table
141
- struct eqstr {
142
- bool operator()(const char* s1, const char* s2) const {
143
- return strcmp(s1, s2) == 0;
144
- }
145
- };
146
-
147
- // data structure storing labels without repetitions (i.e. as a set)
148
- typedef std::set<Label, Label::label_cmp> LabelSet;
149
-
150
- // hash table used to map the symbols to their codes
151
- typedef hash_map<const char*, Character, hash<const char*>,eqstr> SymbolMap;
152
-
153
- // hash table used to map the codes back to the symbols
154
- typedef hash_map<Character, char*> CharMap;
155
-
156
- private:
157
- SymbolMap sm; // maps symbols to codes
158
-
159
- CharMap cm; // maps codes to symbols
160
- LabelSet ls; // set of labels known to the alphabet
161
-
162
- // add a new symbol with symbol code c
163
- void add( const char *symbol, Character c );
164
-
165
- public:
166
- bool utf8;
167
-
168
- // iterators over the set of known labels
169
- typedef LabelSet::iterator iterator;
170
- typedef LabelSet::const_iterator const_iterator;
171
- Alphabet();
172
- ~Alphabet() { clear(); };
173
- const_iterator begin() const { return ls.begin(); };
174
- const_iterator end() const { return ls.end(); };
175
- size_t size() const { return ls.size(); };
176
-
177
- void clear();
178
- void clear_char_pairs() { ls.clear(); };
179
-
180
- // lookup a label in the alphabet
181
- iterator find( Label l ) { return ls.find(l); };
182
-
183
- // insert a label in the alphabet
184
- void insert( Label l ) { if (!l.is_epsilon()) ls.insert(l); };
185
-
186
- // insert the known symbols from another alphabet
187
- void insert_symbols( const Alphabet& );
188
-
189
- // insert the labels and known symbols from another alphabet
190
- void copy( const Alphabet& );
191
-
192
- // create the alphabet of a transducer obtained by a composition operation
193
- void compose( const Alphabet &la, const Alphabet &ua );
194
-
195
- // add a symbol to the alphabet and return its code
196
- Character add_symbol(const char *symbol);
197
-
198
- // add a symbol to the alphabet with a given code
199
- void add_symbol(const char *symbol, Character c );
200
-
201
- // create a new marker symbol and return its code
202
- Character new_marker( void );
203
- void delete_markers();
204
-
205
- // compute the complement of a symbol set
206
- void complement( std::vector<Character> &sym );
207
-
208
- // return the code of the argument symbol
209
- int symbol2code( const char *s ) const {
210
- SymbolMap::const_iterator p = sm.find(s);
211
- if (p != sm.end()) return p->second;
212
- return EOF;
213
- };
214
-
215
- // return the symbol for the given symbol code
216
- const char *code2symbol( Character c ) const {
217
- CharMap::const_iterator p=cm.find(c);
218
- if (p == cm.end())
219
- return NULL;
220
- else
221
- return p->second;
222
- };
223
-
224
- // write the symbol for the given symbol code into a string
225
- void write_char( Character c, char *buffer, int *pos,
226
- bool with_brackets=true ) const;
227
-
228
- // write the symbol pair of a given label into a string
229
- void write_label( Label l, char *buffer, int *pos,
230
- bool with_brackets=true ) const;
231
-
232
- // write the symbol for the given symbol code into a buffer and return
233
- // a pointer to it
234
- // the flag "with_brackets" indicates whether the angle brackets
235
- // surrounding multi-character symbols are to be printed or not
236
- const char *write_char( Character c, bool with_brackets=true ) const;
237
-
238
- // write the symbol pair of a given label into a string
239
- // and return a pointer to it
240
- const char *write_label( Label l, bool with_brackets=true ) const;
241
-
242
- // scan the next multi-character symbol in the argument string
243
- int next_mcsym( char*&, bool insert=true );
244
-
245
- // scan the next symbol in the argument string
246
- int next_code( char*&, bool extended=true, bool insert=true );
247
-
248
- // convert a character string into a symbol or label sequence
249
- void string2symseq( char*, std::vector<Character>& );
250
- void string2labelseq( char*, std::vector<Label>& );
251
-
252
- // scan the next label in the argument string
253
- Label next_label( char*&, bool extended=true );
254
-
255
- // store the alphabet in the argument file (in binary form)
256
- void store( FILE* ) const;
257
-
258
- // read the alphabet from the argument file
259
- void read( FILE* );
260
-
261
- // disambiguation and printing of analyses
262
- int compute_score( Analysis &ana );
263
- void disambiguate( std::vector<Analysis> &analyses );
264
- char *print_analysis( Analysis &ana, bool both_layers );
265
-
266
- friend std::ostream &operator<<(std::ostream&, const Alphabet&);
267
- };
268
-
269
- // write the alphabet to the output stream (in readable form)
270
- std::ostream &operator<<(std::ostream&, const Alphabet&);
271
-
272
-
273
- #endif
@@ -1,84 +0,0 @@
1
-
2
- /*******************************************************************/
3
- /* */
4
- /* FILE basic.C */
5
- /* MODULE basic */
6
- /* PROGRAM SFST */
7
- /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
8
- /* */
9
- /* PURPOSE */
10
- /* */
11
- /*******************************************************************/
12
-
13
- #include <stdlib.h>
14
- #include <string.h>
15
-
16
- #include "basic.h"
17
-
18
- bool Switch_Bytes=false;
19
-
20
-
21
-
22
- /*******************************************************************/
23
- /* */
24
- /* fst_strdup */
25
- /* */
26
- /*******************************************************************/
27
-
28
// Returns a freshly malloc'ed copy of the NUL-terminated string pString.
// If the allocation fails, an error message is printed to stderr and the
// process exits with status 1, so the function never returns NULL.
char* fst_strdup(const char* pString)
{
  const size_t bytes = strlen(pString) + 1;   // text plus terminating NUL
  char* copy = (char*)malloc(bytes);
  if (copy == NULL) {
    fprintf(stderr, "\nError: out of memory (malloc failed)\naborted.\n");
    exit(1);
  }
  memcpy(copy, pString, bytes);
  return copy;
}
39
-
40
-
41
- /*******************************************************************/
42
- /* */
43
- /* read_string */
44
- /* */
45
- /*******************************************************************/
46
-
47
// Reads a NUL-terminated string from file into buffer (capacity: size bytes).
//
// Reading stops at the first NUL byte, at EOF, or after size bytes have been
// consumed. The buffer is always NUL-terminated when size > 0; if the string
// does not fit, it is truncated (the last byte read is overwritten by the
// terminator).
//
// Returns 1 if a terminating NUL byte was read, 0 otherwise (EOF reached,
// string truncated, or size <= 0).
int read_string( char *buffer, int size, FILE *file )
{
  // Without this guard, size <= 0 would fall through to buffer[size-1] = 0
  // below and write in front of the buffer.
  if (size <= 0)
    return 0;

  for( int i=0; i<size; i++ ) {
    int c=fgetc(file);
    if (c == EOF || c == 0) {
      buffer[i] = 0;
      return (c==0);
    }
    buffer[i] = (char)c;
  }

  // buffer full and no terminator seen: truncate
  buffer[size-1] = 0;
  return 0;
}
61
-
62
-
63
- /*******************************************************************/
64
- /* */
65
- /* read_num */
66
- /* */
67
- /*******************************************************************/
68
-
69
- size_t read_num( void *p, size_t n, FILE *file )
70
-
71
- {
72
- char *pp=(char*)p;
73
- size_t result=fread( pp, 1, n, file );
74
- if (Switch_Bytes) {
75
- size_t e=n/2;
76
- for( size_t i=0; i<e; i++ ) {
77
- char tmp=pp[i];
78
- pp[i] = pp[--n];
79
- pp[n] = tmp;
80
- }
81
- }
82
- return result;
83
- }
84
-
@@ -1,616 +0,0 @@
1
- /*******************************************************************/
2
- /* */
3
- /* FILE compact.C */
4
- /* MODULE compact */
5
- /* PROGRAM SFST */
6
- /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
7
- /* */
8
- /* PURPOSE Code needed for analysing data */
9
- /* */
10
- /*******************************************************************/
11
-
12
- #include <stdio.h>
13
- #include <math.h>
14
-
15
- #include <limits.h>
16
-
17
- #include "compact.h"
18
-
19
- using std::equal_range;
20
- using std::vector;
21
- using std::pair;
22
-
23
- const int BUFFER_SIZE=1000;
24
-
25
-
26
- /*******************************************************************/
27
- /* */
28
- /* CompactTransducer::convert */
29
- /* */
30
- /*******************************************************************/
31
-
32
- void CompactTransducer::convert( CAnalysis &cana, Analysis &ana )
33
-
34
- {
35
- ana.resize(cana.size());
36
- for( size_t i=0; i<cana.size(); i++ )
37
- ana[i] = label[cana[i]];
38
- }
39
-
40
-
41
- /*******************************************************************/
42
- /* */
43
- /* CompactTransducer::analyze */
44
- /* */
45
- /*******************************************************************/
46
-
47
- void CompactTransducer::analyze(unsigned int n, vector<Character> &input,
48
- size_t ipos, CAnalysis &ca,
49
- vector<CAnalysis> &analyses )
50
- {
51
- // "n" is the number of the current transducer node/state
52
- // "input" is the sequence of input symbols
53
- // "ipos" is the input position currently analysed
54
- // "ca" stores the incomplete analysis string
55
- // "analyses" stores the analyses found so far
56
-
57
- if (analyses.size() > 10000)
58
- return; // limit the maximal number of analyses
59
-
60
- // Is the input string fully analyzed and the current node a final node?
61
- if (finalp[n] && ipos == input.size())
62
- // store the new analysis
63
- analyses.push_back(ca);
64
-
65
- // follow the epsilon transitions
66
- // first_arc[n] is the number of the first outgoing transition of node n
67
- // first_arc[n+1]-1 is the number of the last outgoing transition of node n
68
- // first_arc[n+1] is the number of the first outgoing transition of node n+1
69
- unsigned int i;
70
- for( i=first_arc[n];
71
- i<first_arc[n+1] && label[i].upper_char() == Label::epsilon;
72
- i++)
73
- {
74
- ca.push_back(i);
75
- analyze(target_node[i], input, ipos, ca, analyses);
76
- ca.pop_back();
77
- }
78
-
79
- // follow the non-epsilon transitions
80
-
81
- // scan the next input symbol
82
- if (ipos < input.size()) {
83
- // find the set of arcs with matching upper character in the sorted list
84
- pair<Label*,Label*>range =
85
- equal_range(label+i, label+first_arc[n+1], Label(input[ipos]));
86
- unsigned int to = (unsigned int)(range.second - label);
87
-
88
- // follow the non-epsilon transitions
89
- for( i=range.first-label; i<to; i++) {
90
- ca.push_back(i);
91
- analyze(target_node[i], input, ipos+1, ca, analyses);
92
- ca.pop_back();
93
- }
94
- }
95
- }
96
-
97
-
98
- /*******************************************************************/
99
- /* */
100
- /* CompactTransducer::analyze_string */
101
- /* */
102
- /*******************************************************************/
103
-
104
- void CompactTransducer::analyze_string( char *s, vector<CAnalysis> &analyses )
105
-
106
- {
107
- // "s" input string to be analyzed
108
- // "analyses" is the data structure in which the results are stored
109
- // and returned
110
-
111
- vector<Character> input;
112
- alphabet.string2symseq( s, input );
113
-
114
- analyses.clear();
115
- CAnalysis ca; // data structure where the current incomplete analysis
116
- // is stored
117
- analyze(0, input, 0, ca, analyses); // start the analysis
118
-
119
- if (analyses.size() > 10000)
120
- fprintf(stderr,"Warning: Only the first 10000 analyses considered for \"%s\"!\n", s);
121
-
122
- if (simplest_only && analyses.size() > 1)
123
- disambiguate( analyses ); // select the simplest analyses
124
- }
125
-
126
-
127
-
128
- /*******************************************************************/
129
- /* */
130
- /* CompactTransducer::~CompactTransducer */
131
- /* */
132
- /*******************************************************************/
133
-
134
- CompactTransducer::~CompactTransducer()
135
-
136
- {
137
- delete[] finalp;
138
- delete[] first_arc;
139
- delete[] label;
140
- delete[] target_node;
141
- delete[] final_logprob;
142
- delete[] arc_logprob;
143
- }
144
-
145
-
146
- /*******************************************************************/
147
- /* */
148
- /* CompactTransducer::CompactTransducer */
149
- /* */
150
- /*******************************************************************/
151
-
152
- CompactTransducer::CompactTransducer()
153
-
154
- {
155
- both_layers = false;
156
- simplest_only = false;
157
- number_of_nodes = 0;
158
- number_of_arcs = 0;
159
- finalp = NULL;
160
- first_arc = NULL;
161
- label = NULL;
162
- target_node = NULL;
163
- arc_logprob = final_logprob = (float*)NULL;
164
- }
165
-
166
-
167
- /*******************************************************************/
168
- /* */
169
- /* CompactTransducer::read_finalp */
170
- /* */
171
- /*******************************************************************/
172
-
173
- void CompactTransducer::read_finalp( FILE *file )
174
-
175
- {
176
- int k=0;
177
- unsigned char n=0;
178
- for( size_t i=0; i<number_of_nodes; i++ ) {
179
- if (k == 0) {
180
- n = (unsigned char)fgetc(file);
181
- k = 8;
182
- }
183
- k--;
184
- if (n & (1 << k))
185
- finalp[i] = 1;
186
- else
187
- finalp[i] = 0;
188
- }
189
- }
190
-
191
-
192
- /*******************************************************************/
193
- /* */
194
- /* CompactTransducer::read_first_arcs */
195
- /* */
196
- /*******************************************************************/
197
-
198
- void CompactTransducer::read_first_arcs( FILE *file )
199
-
200
- {
201
- int k=0;
202
- unsigned int n=0;
203
- size_t bits=(size_t)ceil(log(number_of_arcs+1)/log(2));
204
-
205
- for( size_t i=0; i<=number_of_nodes; i++ ) {
206
- first_arc[i] = n >> (sizeof(n)*8 - bits);
207
- n <<= bits;
208
- k -= bits;
209
- if (k < 0) {
210
- read_num(&n,sizeof(n),file);
211
- first_arc[i] |= n >> (sizeof(n)*8 + k);
212
- n <<= -k;
213
- k += sizeof(n) * 8;
214
- }
215
- }
216
- }
217
-
218
-
219
- /*******************************************************************/
220
- /* */
221
- /* CompactTransducer::read_target_nodes */
222
- /* */
223
- /*******************************************************************/
224
-
225
- void CompactTransducer::read_target_nodes( FILE *file )
226
-
227
- {
228
- int k=0;
229
- unsigned int n=0;
230
- size_t bits=(size_t)ceil(log(number_of_nodes)/log(2));
231
-
232
- for( size_t i=0; i<number_of_arcs; i++ ) {
233
- target_node[i] = n >> (sizeof(n)*8 - bits);
234
- n <<= bits;
235
- k -= bits;
236
- if (k < 0) {
237
- read_num(&n,sizeof(n),file);
238
- target_node[i] |= n >> (sizeof(n)*8 + k);
239
- n <<= -k;
240
- k += sizeof(n) * 8;
241
- }
242
- }
243
- }
244
-
245
-
246
- /*******************************************************************/
247
- /* */
248
- /* CompactTransducer::read_labels */
249
- /* */
250
- /*******************************************************************/
251
-
252
- void CompactTransducer::read_labels( FILE *file )
253
-
254
- {
255
- size_t N=0;
256
- Label Num2Label[alphabet.size()];
257
- for( Alphabet::const_iterator it=alphabet.begin();
258
- it != alphabet.end(); it++ )
259
- {
260
- Label l=*it;
261
- Num2Label[N++] = l;
262
- }
263
-
264
- int k=0;
265
- unsigned int n=0;
266
- size_t bits=(size_t)ceil(log(alphabet.size())/log(2));
267
-
268
- for( size_t i=0; i<number_of_arcs; i++ ) {
269
- unsigned int l = n >> (sizeof(n)*8 - bits);
270
- n <<= bits;
271
- k -= bits;
272
- if (k < 0) {
273
- read_num(&n,sizeof(n),file);
274
- l |= n >> (sizeof(n)*8 + k);
275
- n <<= -k;
276
- k += sizeof(n) * 8;
277
- }
278
- label[i] = Num2Label[l];
279
- }
280
- }
281
-
282
-
283
- /*******************************************************************/
284
- /* */
285
- /* CompactTransducer::read_probs */
286
- /* */
287
- /*******************************************************************/
288
-
289
- void CompactTransducer::read_probs( FILE *file )
290
-
291
- {
292
- size_t n,m;
293
- fread(&n, sizeof(n), 1, file);
294
- if (fread(&m, sizeof(n), 1, file) != 1 ||
295
- n != node_count() || m != arc_count())
296
- {
297
- fprintf(stderr,"Error: incompatible probability file!\n");
298
- exit(1);
299
- }
300
- final_logprob = new float[n];
301
- arc_logprob = new float[m];
302
- fread(final_logprob, sizeof(float), n, file);
303
- if (fread(arc_logprob, sizeof(float), n, file) != n) {
304
- fprintf(stderr,"Error: in probability file!\n");
305
- exit(1);
306
- }
307
- }
308
-
309
-
310
- /*******************************************************************/
311
- /* */
312
- /* CompactTransducer::CompactTransducer */
313
- /* */
314
- /*******************************************************************/
315
-
316
- CompactTransducer::CompactTransducer( FILE *file, FILE *pfile )
317
-
318
- {
319
- both_layers = false;
320
- simplest_only = false;
321
-
322
- if (fgetc(file) != 'c')
323
- throw "Error: wrong file format (not a compact transducer)\n";
324
-
325
- alphabet.read(file);
326
-
327
- read_num(&number_of_nodes,sizeof(number_of_nodes),file);
328
- read_num(&number_of_arcs,sizeof(number_of_arcs),file);
329
-
330
- if (!ferror(file)) {
331
- // memory allocation
332
- finalp = new char[number_of_nodes];
333
- first_arc = new unsigned[number_of_nodes+1];
334
- label = new Label[number_of_arcs];
335
- target_node = new unsigned[number_of_arcs];
336
-
337
- // reading the data
338
- read_finalp(file);
339
- read_first_arcs(file);
340
- read_labels(file);
341
- read_target_nodes(file);
342
- }
343
-
344
- if (pfile == NULL)
345
- arc_logprob = final_logprob = (float*)NULL;
346
- else
347
- read_probs(pfile);
348
- }
349
-
350
-
351
- /*******************************************************************/
352
- /* */
353
- /* CompactTransducer::longest_match2 */
354
- /* */
355
- /*******************************************************************/
356
-
357
- void CompactTransducer::longest_match2(unsigned int n, char *string, int l,
358
- CAnalysis &ca, int &bl, CAnalysis &ba)
359
- {
360
- // n: transducer state
361
- // string: rest string
362
- // l: length of current analysis
363
- // bl: length of the currently longest match
364
- // ca: current analysis
365
- // ba: best analysis
366
-
367
- if (finalp[n] && l > bl) {
368
- // store the new analysis
369
- bl = l;
370
- ba = ca; // copy the arc vector
371
- }
372
-
373
- // follow the epsilon transitions
374
- unsigned int i;
375
- for( i=first_arc[n];
376
- i<first_arc[n+1] && label[i].upper_char() == Label::epsilon;
377
- i++)
378
- {
379
- ca.push_back(i);
380
- longest_match2(target_node[i], string, l, ca, bl, ba);
381
- ca.pop_back();
382
- }
383
-
384
- // follow the non-epsilon transitions
385
- char *end=string;
386
- int c=alphabet.next_code(end, false, false);
387
- l += end-string;
388
- if (c != EOF) {
389
- // find the set of arcs with matching upper character in the sort list
390
- pair<Label*,Label*>range =
391
- equal_range(label+i, label+first_arc[n+1], Label((Character)c));
392
- unsigned int to = (unsigned int)(range.second - label);
393
- for( i=range.first-label; i<to; i++) {
394
- ca.push_back(i);
395
- longest_match2(target_node[i], end, l, ca, bl, ba);
396
- ca.pop_back();
397
- }
398
- }
399
- }
400
-
401
-
402
- /*******************************************************************/
403
- /* */
404
- /* CompactTransducer::print_analysis */
405
- /* */
406
- /*******************************************************************/
407
-
408
- char *CompactTransducer::print_analysis( CAnalysis &cana )
409
-
410
- {
411
- Analysis ana;
412
- convert(cana, ana);
413
- return alphabet.print_analysis( ana, both_layers );
414
- }
415
-
416
-
417
- /*******************************************************************/
418
- /* */
419
- /* CompactTransducer::longest_match */
420
- /* */
421
- /*******************************************************************/
422
-
423
- const char *CompactTransducer::longest_match( char* &string )
424
-
425
- {
426
- vector<char> analysis;
427
- CAnalysis ca, ba;
428
- int l=0;
429
- longest_match2(0, string, 0, ca, l, ba);
430
-
431
- // no match? return the next character
432
- if (ba.size() == 0) {
433
- int c=alphabet.next_code(string, false, false);
434
- return alphabet.code2symbol(c);
435
- }
436
-
437
- string += l;
438
- return print_analysis( ba );
439
- }
440
-
441
-
442
- /*******************************************************************/
443
- /* */
444
- /* CompactTransducer::disambiguate */
445
- /* */
446
- /*******************************************************************/
447
-
448
- void CompactTransducer::disambiguate( vector<CAnalysis> &analyses )
449
-
450
- {
451
- // compute the scores
452
- int bestscore=INT_MIN;
453
- vector<int> score;
454
- Analysis ana;
455
-
456
- for( size_t i=0; i<analyses.size(); i++ ) {
457
- convert(analyses[i], ana);
458
- score.push_back(alphabet.compute_score(ana));
459
- if (bestscore < score[i])
460
- bestscore = score[i];
461
- }
462
-
463
- // delete suboptimal analyses
464
- size_t k=0;
465
- for( size_t i=0; i<analyses.size(); i++ )
466
- if (score[i] == bestscore)
467
- analyses[k++] = analyses[i];
468
- analyses.resize(k);
469
- }
470
-
471
-
472
- /*******************************************************************/
473
- /* */
474
- /* CompactTransducer::train2 */
475
- /* */
476
- /*******************************************************************/
477
-
478
- bool CompactTransducer::train2( char *s, vector<double> &arcfreq,
479
- vector<double> &finalfreq )
480
- {
481
- vector<CAnalysis> analyses;
482
- vector<Label> input;
483
- alphabet.string2labelseq( s, input );
484
-
485
- CAnalysis ca; // data structure where the analysis is stored
486
- unsigned int n=0;
487
- bool failure=false;
488
- for( size_t i=0; i<input.size(); i++ ) {
489
- failure = true;
490
- for( unsigned int k=first_arc[n]; k<first_arc[n+1]; k++) {
491
- if (label[k] == input[i]) {
492
- ca.push_back(k);
493
- n = target_node[k];
494
- failure = false;
495
- break;
496
- }
497
- }
498
- if (failure)
499
- break;
500
- }
501
- if (failure || !finalp[n]) {
502
- fprintf(stderr,"Warning: The following input is not covered:\n%s\n", s);
503
- return false;
504
- }
505
-
506
- for( size_t k=0; k<ca.size(); k++ )
507
- arcfreq[ca[k]]++;
508
- finalfreq[target_node[ca.back()]]++;
509
-
510
- return true;
511
- }
512
-
513
-
514
- /*******************************************************************/
515
- /* */
516
- /* CompactTransducer::train */
517
- /* */
518
- /*******************************************************************/
519
-
520
- bool CompactTransducer::train( char *s, vector<double> &arcfreq,
521
- vector<double> &finalfreq )
522
- {
523
- vector<CAnalysis> analyses;
524
- vector<Character> input;
525
- alphabet.string2symseq( s, input );
526
-
527
- CAnalysis ca; // data structure where the current incomplete analysis
528
- // is stored
529
- analyze(0, input, 0, ca, analyses); // start the analysis
530
-
531
- if (analyses.size() > 10000)
532
- return true; // ignore inputs with more than 10000 analyses
533
- else if (analyses.size() == 0)
534
- return false;
535
-
536
- if (simplest_only && analyses.size() > 1)
537
- disambiguate( analyses ); // select the simplest analyses
538
-
539
- if (analyses.size() > 0) {
540
- double incr = 1.0 / analyses.size();
541
- CAnalysis arcs;
542
-
543
- for( size_t i=0; i<analyses.size(); i++ ) {
544
- CAnalysis &arcs=analyses[i];
545
- for( size_t k=0; k<arcs.size(); k++ )
546
- arcfreq[arcs[k]] += incr;
547
- finalfreq[target_node[arcs.back()]] += incr;
548
- }
549
- }
550
- return true;
551
- }
552
-
553
-
554
- /*******************************************************************/
555
- /* */
556
- /* CompactTransducer::estimate_probs */
557
- /* */
558
- /*******************************************************************/
559
-
560
- void CompactTransducer::estimate_probs( vector<double> &arcfreq,
561
- vector<double> &finalfreq )
562
- {
563
- // turn frequencies into probabilities
564
- for( size_t n=0; n<finalfreq.size(); n++ ) {
565
- double sum = finalfreq[n];
566
- for( size_t a=first_arc[n]; a<first_arc[n+1]; a++ )
567
- sum += arcfreq[a];
568
- if (sum == 0.0)
569
- sum = 1.0;
570
- finalfreq[n] = finalfreq[n] / sum;
571
- for( size_t a=first_arc[n]; a<first_arc[n+1]; a++ )
572
- arcfreq[a] = arcfreq[a] / sum;
573
- }
574
- }
575
-
576
-
577
-
578
- /*******************************************************************/
579
- /* */
580
- /* CompactTransducer::compute_probs */
581
- /* */
582
- /*******************************************************************/
583
-
584
- void CompactTransducer::compute_probs( vector<CAnalysis> &analyses,
585
- vector<double> &prob )
586
- {
587
- prob.resize(analyses.size());
588
- double sum=0.0;
589
- for( size_t i=0; i<analyses.size(); i++ ) {
590
- CAnalysis &a=analyses[i];
591
-
592
- // compute the probability
593
- double logprob=0.0;
594
- for( size_t k=0; k<a.size(); k++ )
595
- logprob += arc_logprob[a[k]];
596
- logprob += final_logprob[target_node[a.back()]];
597
- prob[i] = exp(logprob);
598
- sum += prob[i];
599
- }
600
-
601
- // sort the analyses
602
- vector<CAnalysis> oldanalyses(analyses);
603
- vector<double> oldprob(prob);
604
- for( size_t i=0; i<analyses.size(); i++ ) {
605
- prob[i] = -1.0;
606
- int n=0;
607
- for( size_t k=0; k<oldanalyses.size(); k++ )
608
- if (prob[i] < oldprob[k]) {
609
- prob[i] = oldprob[k];
610
- n = k;
611
- }
612
- analyses[i] = oldanalyses[n];
613
- oldprob[n] = -1.0;
614
- prob[i] /= sum; // normalization
615
- }
616
- }