ruby-sfst 0.4.3 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -0
- data/COPYING +280 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +54 -0
- data/README.md +1 -1
- data/Rakefile +9 -18
- data/bin/console +7 -0
- data/bin/setup +6 -0
- data/ext/sfst/alphabet.cc +879 -0
- data/ext/sfst/alphabet.h +302 -0
- data/ext/sfst/basic.cc +85 -0
- data/ext/{sfst_machine → sfst}/basic.h +7 -4
- data/ext/sfst/compact.cc +629 -0
- data/ext/sfst/compact.h +100 -0
- data/ext/sfst/determinise.cc +279 -0
- data/ext/{sfst_machine → sfst}/extconf.rb +2 -1
- data/ext/sfst/fst.cc +1150 -0
- data/ext/sfst/fst.h +374 -0
- data/ext/sfst/hopcroft.cc +681 -0
- data/ext/sfst/interface.cc +1921 -0
- data/ext/sfst/interface.h +171 -0
- data/ext/sfst/make-compact.cc +323 -0
- data/ext/{sfst_machine → sfst}/make-compact.h +15 -13
- data/ext/sfst/mem.h +80 -0
- data/ext/sfst/operators.cc +1273 -0
- data/ext/{sfst_machine → sfst}/sfst_machine.cc +89 -78
- data/ext/sfst/sgi.h +72 -0
- data/ext/sfst/utf8.cc +149 -0
- data/ext/{sfst_machine → sfst}/utf8.h +7 -4
- data/lib/sfst.rb +2 -1
- data/lib/sfst/version.rb +1 -1
- data/ruby-sfst.gemspec +23 -23
- metadata +107 -35
- data/ext/sfst_machine/alphabet.cc +0 -812
- data/ext/sfst_machine/alphabet.h +0 -273
- data/ext/sfst_machine/basic.cc +0 -84
- data/ext/sfst_machine/compact.cc +0 -616
- data/ext/sfst_machine/compact.h +0 -98
- data/ext/sfst_machine/determinise.cc +0 -303
- data/ext/sfst_machine/fst.cc +0 -1000
- data/ext/sfst_machine/fst.h +0 -369
- data/ext/sfst_machine/interface.cc +0 -1842
- data/ext/sfst_machine/interface.h +0 -93
- data/ext/sfst_machine/make-compact.cc +0 -327
- data/ext/sfst_machine/mem.h +0 -74
- data/ext/sfst_machine/operators.cc +0 -1131
- data/ext/sfst_machine/sgi.h +0 -44
- data/ext/sfst_machine/utf8.cc +0 -146
- data/test/test_sfst.fst +0 -3
- data/test/test_sfst.rb +0 -114
data/ext/sfst/alphabet.h
ADDED
@@ -0,0 +1,302 @@
|
|
1
|
+
/*******************************************************************/
|
2
|
+
/* */
|
3
|
+
/* FILE alphabet.h */
|
4
|
+
/* MODULE alphabet */
|
5
|
+
/* PROGRAM SFST */
|
6
|
+
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
|
7
|
+
/* */
|
8
|
+
/* PURPOSE finite state tools */
|
9
|
+
/* */
|
10
|
+
/*******************************************************************/
|
11
|
+
|
12
|
+
#ifndef _ALPHABET_H_
|
13
|
+
#define _ALPHABET_H_
|
14
|
+
|
15
|
+
#include <stdio.h>
|
16
|
+
|
17
|
+
#include "basic.h"
|
18
|
+
|
19
|
+
#include <set>
|
20
|
+
using std::set;
|
21
|
+
|
22
|
+
#include <vector>
|
23
|
+
using std::vector;
|
24
|
+
|
25
|
+
#include <iostream>
|
26
|
+
using std::ostream;
|
27
|
+
|
28
|
+
#include <cstring>
|
29
|
+
|
30
|
+
#include "sgi.h"
|
31
|
+
|
32
|
+
#define SFSTVersion "1.4.7a"
|
33
|
+
|
34
|
+
namespace SFST {
|
35
|
+
|
36
|
+
#ifndef CODE_DATA_TYPE
|
37
|
+
typedef unsigned short Character; // data type of the symbol codes
|
38
|
+
#else
|
39
|
+
typedef unsigned CODE_DATA_TYPE Character;
|
40
|
+
#endif
|
41
|
+
|
42
|
+
// data type used to indicate whether some action is to be performed
|
43
|
+
// on the analysis level (lower) or the surface level (upper)
|
44
|
+
typedef enum {upper, lower, both} Level;
|
45
|
+
|
46
|
+
|
47
|
+
/***************** class Label ***********************************/
|
48
|
+
|
49
|
+
class Label {
|
50
|
+
|
51
|
+
private:
|
52
|
+
// data structure where the two symbols are stored
|
53
|
+
struct {
|
54
|
+
Character lower;
|
55
|
+
Character upper;
|
56
|
+
} label;
|
57
|
+
|
58
|
+
public:
|
59
|
+
static const Character epsilon=0; // code of the empty symbol
|
60
|
+
|
61
|
+
// new label with two identical symbols
|
62
|
+
Label( Character c=epsilon ) { label.lower = label.upper = c; };
|
63
|
+
|
64
|
+
// new label with two different symbols
|
65
|
+
Label( Character c1, Character c2 )
|
66
|
+
{ label.lower = c1; label.upper = c2; };
|
67
|
+
|
68
|
+
// returns the indicated symbol of the label
|
69
|
+
Character get_char( Level l ) const
|
70
|
+
{ return ((l==upper)? label.upper: label.lower); };
|
71
|
+
|
72
|
+
// returns the "upper" symbol of the label (i.e. the surface symbol)
|
73
|
+
Character upper_char() const { return label.upper; };
|
74
|
+
|
75
|
+
// returns the "lower" symbol of the label (i.e. the analysis symbol)
|
76
|
+
Character lower_char() const { return label.lower; };
|
77
|
+
|
78
|
+
// replaces symbols in a label
|
79
|
+
Label replace_char( Character c, Character nc ) const {
|
80
|
+
Label l = *this;
|
81
|
+
if (l.label.lower == c)
|
82
|
+
l.label.lower = nc;
|
83
|
+
if (l.label.upper == c)
|
84
|
+
l.label.upper = nc;
|
85
|
+
return l;
|
86
|
+
};
|
87
|
+
|
88
|
+
// operators checking the equality of labels
|
89
|
+
int operator==( Label l ) const
|
90
|
+
{ return (label.lower==l.label.lower && label.upper==l.label.upper); };
|
91
|
+
int operator!=( Label l ) const
|
92
|
+
{ return !(l == *this); };
|
93
|
+
|
94
|
+
// comparison operator needed for sorting labels in compact.C
|
95
|
+
int operator<( Label l ) const {
|
96
|
+
if (upper_char() < l.upper_char())
|
97
|
+
return true;
|
98
|
+
if (upper_char() > l.upper_char())
|
99
|
+
return false;
|
100
|
+
if (lower_char() < l.lower_char())
|
101
|
+
return true;
|
102
|
+
return false;
|
103
|
+
};
|
104
|
+
int operator>( Label l ) const {
|
105
|
+
if (upper_char() > l.upper_char())
|
106
|
+
return true;
|
107
|
+
if (upper_char() < l.upper_char())
|
108
|
+
return false;
|
109
|
+
if (lower_char() > l.lower_char())
|
110
|
+
return true;
|
111
|
+
return false;
|
112
|
+
};
|
113
|
+
|
114
|
+
// check whether the label is epsilon (i.e. both symbols are epsilon)
|
115
|
+
// transitions with epsilon labels are epsilon transitions
|
116
|
+
int is_epsilon() const
|
117
|
+
{ return (label.upper == epsilon && label.lower == epsilon); };
|
118
|
+
|
119
|
+
// check whether the "upper" symbol is epsilon
|
120
|
+
int upper_is_epsilon() const
|
121
|
+
{ return (label.upper == epsilon); };
|
122
|
+
|
123
|
+
// check whether the "lower" symbol is epsilon
|
124
|
+
int lower_is_epsilon() const
|
125
|
+
{ return (label.lower == epsilon); };
|
126
|
+
|
127
|
+
// hash function needed to store labels in a hash table
|
128
|
+
struct label_hash {
|
129
|
+
size_t operator() ( const Label l ) const {
|
130
|
+
return (size_t)l.lower_char() ^
|
131
|
+
((size_t)l.upper_char() << 16) ^
|
132
|
+
((size_t)l.upper_char() >> 16);
|
133
|
+
}
|
134
|
+
};
|
135
|
+
|
136
|
+
// comparison function needed to store labels in a map table
|
137
|
+
struct label_cmp {
|
138
|
+
bool operator() ( const Label l1, const Label l2 ) const {
|
139
|
+
return (l1.lower_char() < l2.lower_char() ||
|
140
|
+
(l1.lower_char() == l2.lower_char() &&
|
141
|
+
l1.upper_char() < l2.upper_char()));
|
142
|
+
}
|
143
|
+
};
|
144
|
+
|
145
|
+
// comparison operator needed to store labels in a hash table
|
146
|
+
struct label_eq {
|
147
|
+
bool operator() ( const Label l1, const Label l2 ) const {
|
148
|
+
return (l1.lower_char() == l2.lower_char() &&
|
149
|
+
l1.upper_char() == l2.upper_char());
|
150
|
+
}
|
151
|
+
};
|
152
|
+
};
|
153
|
+
|
154
|
+
typedef vector<Label> Analysis;
|
155
|
+
|
156
|
+
|
157
|
+
/***************** class Alphabet *******************************/
|
158
|
+
|
159
|
+
class Alphabet {
|
160
|
+
|
161
|
+
// string comparison operators needed to stored strings in a hash table
|
162
|
+
struct eqstr {
|
163
|
+
bool operator()(const char* s1, const char* s2) const {
|
164
|
+
return strcmp(s1, s2) == 0;
|
165
|
+
}
|
166
|
+
};
|
167
|
+
|
168
|
+
// data structure storing labels without repetitions (i.e. as a set)
|
169
|
+
typedef set<Label, Label::label_cmp> LabelSet;
|
170
|
+
|
171
|
+
// hash table used to map the symbols to their codes
|
172
|
+
typedef hash_map<const char*, Character, hash<const char*>,eqstr> SymbolMap;
|
173
|
+
|
174
|
+
public: // HFST addition
|
175
|
+
// hash table used to map the codes back to the symbols
|
176
|
+
typedef hash_map<Character, char*> CharMap;
|
177
|
+
|
178
|
+
// HFST addition
|
179
|
+
bool operator==(const Alphabet &alpha) const;
|
180
|
+
|
181
|
+
private:
|
182
|
+
SymbolMap sm; // maps symbols to codes
|
183
|
+
CharMap cm; // maps codes to symbols
|
184
|
+
LabelSet ls; // set of labels known to the alphabet
|
185
|
+
|
186
|
+
// add a new symbol with symbol code c
|
187
|
+
void add( const char *symbol, Character c );
|
188
|
+
|
189
|
+
public:
|
190
|
+
bool utf8;
|
191
|
+
|
192
|
+
// iterators over the set of known labels
|
193
|
+
typedef LabelSet::iterator iterator;
|
194
|
+
typedef LabelSet::const_iterator const_iterator;
|
195
|
+
Alphabet();
|
196
|
+
~Alphabet() { clear(); };
|
197
|
+
const_iterator begin() const { return ls.begin(); };
|
198
|
+
const_iterator end() const { return ls.end(); };
|
199
|
+
size_t size() const { return ls.size(); };
|
200
|
+
|
201
|
+
// HFST additions
|
202
|
+
CharMap get_char_map(void) { return cm; };
|
203
|
+
void print(void);
|
204
|
+
|
205
|
+
|
206
|
+
void clear();
|
207
|
+
void clear_char_pairs() { ls.clear(); };
|
208
|
+
|
209
|
+
// lookup a label in the alphabet
|
210
|
+
iterator find( Label l ) { return ls.find(l); };
|
211
|
+
|
212
|
+
// insert a label in the alphabet
|
213
|
+
void insert( Label l ) { if (!l.is_epsilon()) ls.insert(l); };
|
214
|
+
|
215
|
+
// insert the known symbols from another alphabet
|
216
|
+
void insert_symbols( const Alphabet& );
|
217
|
+
|
218
|
+
// insert the labels and known symbols from another alphabet
|
219
|
+
void copy( const Alphabet &a, Level level=both );
|
220
|
+
|
221
|
+
// create the alphabet of a transducer obtained by a composition operation
|
222
|
+
void compose( const Alphabet &la, const Alphabet &ua );
|
223
|
+
|
224
|
+
// add a symbol to the alphabet and return its code
|
225
|
+
Character add_symbol(const char *symbol);
|
226
|
+
|
227
|
+
// add a symbol to the alphabet with a given code
|
228
|
+
void add_symbol(const char *symbol, Character c );
|
229
|
+
|
230
|
+
// create a new marker symbol and return its code
|
231
|
+
Character new_marker( void );
|
232
|
+
void delete_markers();
|
233
|
+
|
234
|
+
// compute the complement of a symbol set
|
235
|
+
void complement( vector<Character> &sym );
|
236
|
+
|
237
|
+
// return the code of the argument symbol
|
238
|
+
int symbol2code( const char *s ) const {
|
239
|
+
SymbolMap::const_iterator p = sm.find(s);
|
240
|
+
if (p != sm.end()) return p->second;
|
241
|
+
return EOF;
|
242
|
+
};
|
243
|
+
|
244
|
+
// return the symbol for the given symbol code
|
245
|
+
const char *code2symbol( Character c ) const {
|
246
|
+
CharMap::const_iterator p=cm.find(c);
|
247
|
+
if (p == cm.end())
|
248
|
+
return NULL;
|
249
|
+
else
|
250
|
+
return p->second;
|
251
|
+
};
|
252
|
+
|
253
|
+
// write the symbol for the given symbol code into a string
|
254
|
+
void write_char( Character c, char *buffer, int *pos,
|
255
|
+
bool with_brackets=true ) const;
|
256
|
+
|
257
|
+
// write the symbol pair of a given label into a string
|
258
|
+
void write_label( Label l, char *buffer, int *pos,
|
259
|
+
bool with_brackets=true ) const;
|
260
|
+
|
261
|
+
// write the symbol for the given symbol code into a buffer and return
|
262
|
+
// a pointer to it
|
263
|
+
// the flag "with_brackets" indicates whether the angle brackets
|
264
|
+
// surrounding multi-character symbols are to be printed or not
|
265
|
+
const char *write_char( Character c, bool with_brackets=true ) const;
|
266
|
+
|
267
|
+
// write the symbol pair of a given label into a string
|
268
|
+
// and return a pointer to it
|
269
|
+
const char *write_label( Label l, bool with_brackets=true ) const;
|
270
|
+
|
271
|
+
// scan the next multi-character symbol in the argument string
|
272
|
+
int next_mcsym( char*&, bool insert=true );
|
273
|
+
|
274
|
+
// scan the next symbol in the argument string
|
275
|
+
int next_code( char*&, bool extended=true, bool insert=true );
|
276
|
+
|
277
|
+
// convert a character string into a symbol or label sequence
|
278
|
+
void string2symseq( char*, vector<Character>& );
|
279
|
+
void string2labelseq( char*, vector<Label>& );
|
280
|
+
|
281
|
+
// scan the next label in the argument string
|
282
|
+
Label next_label( char*&, bool extended=true );
|
283
|
+
|
284
|
+
// store the alphabet in the argument file (in binary form)
|
285
|
+
void store( FILE* ) const;
|
286
|
+
|
287
|
+
// read the alphabet from the argument file
|
288
|
+
void read( FILE* );
|
289
|
+
|
290
|
+
// disambiguation and printing of analyses
|
291
|
+
int compute_score( Analysis &ana );
|
292
|
+
void disambiguate( vector<Analysis> &analyses );
|
293
|
+
char *print_analysis( Analysis &ana, bool both_layers );
|
294
|
+
|
295
|
+
friend ostream &operator<<(ostream&, const Alphabet&);
|
296
|
+
};
|
297
|
+
|
298
|
+
// write the alphabet to the output stream (in readable form)
|
299
|
+
ostream &operator<<(ostream&, const Alphabet&);
|
300
|
+
}
|
301
|
+
|
302
|
+
#endif
|
data/ext/sfst/basic.cc
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
|
2
|
+
/*******************************************************************/
|
3
|
+
/* */
|
4
|
+
/* FILE basic.C */
|
5
|
+
/* MODULE basic */
|
6
|
+
/* PROGRAM SFST */
|
7
|
+
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
|
8
|
+
/* */
|
9
|
+
/* PURPOSE */
|
10
|
+
/* */
|
11
|
+
/*******************************************************************/
|
12
|
+
|
13
|
+
#include <stdlib.h>
|
14
|
+
#include <string.h>
|
15
|
+
|
16
|
+
#include "basic.h"
|
17
|
+
|
18
|
+
namespace SFST {
|
19
|
+
|
20
|
+
bool Switch_Bytes=false;
|
21
|
+
|
22
|
+
|
23
|
+
/*******************************************************************/
|
24
|
+
/* */
|
25
|
+
/* fst_strdup */
|
26
|
+
/* */
|
27
|
+
/*******************************************************************/
|
28
|
+
|
29
|
+
// Returns a freshly malloc'ed copy of the NUL-terminated string pString.
// If the allocation fails, an error message is printed to stderr and the
// program exits with status 1.
char* fst_strdup(const char* pString)

{
  const size_t nbytes = strlen(pString) + 1;  // includes the terminating NUL
  char* pStringCopy = (char*)malloc(nbytes);
  if (pStringCopy != NULL) {
    memcpy(pStringCopy, pString, nbytes);
    return pStringCopy;
  }
  fprintf(stderr, "\nError: out of memory (malloc failed)\naborted.\n");
  exit(1);
}
|
40
|
+
|
41
|
+
|
42
|
+
/*******************************************************************/
|
43
|
+
/* */
|
44
|
+
/* read_string */
|
45
|
+
/* */
|
46
|
+
/*******************************************************************/
|
47
|
+
|
48
|
+
// Reads a NUL-terminated string from "file" into "buffer", which has room
// for "size" bytes (terminator included).
//
// Returns 1 if a NUL byte terminated the string in the file.
// Returns 0 if end-of-file was reached first, or if "size" bytes were
// consumed without finding a terminator; in the latter case the last byte
// read is overwritten so that the buffer is still NUL-terminated.
// A non-positive "size" reads nothing, leaves the buffer alone and
// returns 0.
int read_string( char *buffer, int size, FILE *file )

{
  // Without this guard the final "buffer[size-1] = 0" below would write
  // in front of the buffer when size is 0 (or negative).
  if (size <= 0)
    return 0;

  for( int i=0; i<size; i++ ) {
    int c=fgetc(file);
    if (c == EOF || c == 0) {
      buffer[i] = 0;
      return (c==0);
    }
    buffer[i] = (char)c;
  }
  // buffer exhausted before a terminator was seen: truncate
  buffer[size-1] = 0;
  return 0;
}
|
62
|
+
|
63
|
+
|
64
|
+
/*******************************************************************/
|
65
|
+
/* */
|
66
|
+
/* read_num */
|
67
|
+
/* */
|
68
|
+
/*******************************************************************/
|
69
|
+
|
70
|
+
size_t read_num( void *p, size_t n, FILE *file )
|
71
|
+
|
72
|
+
{
|
73
|
+
char *pp=(char*)p;
|
74
|
+
size_t result=fread( pp, 1, n, file );
|
75
|
+
if (Switch_Bytes) {
|
76
|
+
size_t e=n/2;
|
77
|
+
for( size_t i=0; i<e; i++ ) {
|
78
|
+
char tmp=pp[i];
|
79
|
+
pp[i] = pp[--n];
|
80
|
+
pp[n] = tmp;
|
81
|
+
}
|
82
|
+
}
|
83
|
+
return result;
|
84
|
+
}
|
85
|
+
}
|
@@ -15,10 +15,13 @@
|
|
15
15
|
|
16
16
|
#include <stdio.h>
|
17
17
|
|
18
|
-
|
18
|
+
namespace SFST {
|
19
19
|
|
20
|
-
|
21
|
-
int read_string( char *buffer, int size, FILE *file );
|
22
|
-
size_t read_num( void *p, size_t size, FILE *file );
|
20
|
+
extern bool Switch_Bytes;
|
23
21
|
|
22
|
+
char* fst_strdup(const char* pString);
|
23
|
+
int read_string( char *buffer, int size, FILE *file );
|
24
|
+
size_t read_num( void *p, size_t size, FILE *file );
|
25
|
+
|
26
|
+
}
|
24
27
|
#endif
|
data/ext/sfst/compact.cc
ADDED
@@ -0,0 +1,629 @@
|
|
1
|
+
/*******************************************************************/
|
2
|
+
/* */
|
3
|
+
/* FILE compact.C */
|
4
|
+
/* MODULE compact */
|
5
|
+
/* PROGRAM SFST */
|
6
|
+
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
|
7
|
+
/* */
|
8
|
+
/* PURPOSE Code needed for analysing data */
|
9
|
+
/* */
|
10
|
+
/*******************************************************************/
|
11
|
+
|
12
|
+
#include <stdio.h>
|
13
|
+
#include <math.h>
|
14
|
+
|
15
|
+
#include <limits.h>
|
16
|
+
|
17
|
+
#include "compact.h"
|
18
|
+
|
19
|
+
namespace SFST {
|
20
|
+
|
21
|
+
using std::equal_range;
|
22
|
+
using std::vector;
|
23
|
+
using std::pair;
|
24
|
+
|
25
|
+
class label_less {
|
26
|
+
public:
|
27
|
+
bool operator()(const Label l1, const Label l2) const {
|
28
|
+
return l1.upper_char() < l2.upper_char();
|
29
|
+
}
|
30
|
+
};
|
31
|
+
|
32
|
+
const int BUFFER_SIZE=1000;
|
33
|
+
|
34
|
+
|
35
|
+
/*******************************************************************/
|
36
|
+
/* */
|
37
|
+
/* CompactTransducer::convert */
|
38
|
+
/* */
|
39
|
+
/*******************************************************************/
|
40
|
+
|
41
|
+
// Translates an analysis given as a sequence of arc numbers ("cana") into
// the corresponding sequence of arc labels, which is stored in "ana".
void CompactTransducer::convert( CAnalysis &cana, Analysis &ana )

{
  const size_t count = cana.size();
  ana.resize(count);
  size_t pos = 0;
  while (pos < count) {
    ana[pos] = label[cana[pos]];  // label of the arc with that number
    pos++;
  }
}
|
48
|
+
|
49
|
+
|
50
|
+
/*******************************************************************/
|
51
|
+
/* */
|
52
|
+
/* CompactTransducer::analyze */
|
53
|
+
/* */
|
54
|
+
/*******************************************************************/
|
55
|
+
|
56
|
+
void CompactTransducer::analyze(unsigned int n, vector<Character> &input,
|
57
|
+
size_t ipos, CAnalysis &ca,
|
58
|
+
vector<CAnalysis> &analyses )
|
59
|
+
{
|
60
|
+
// "n" is the number of the current transducer node/state
|
61
|
+
// "input" is the sequence of input symbols
|
62
|
+
// "ipos" is the input position currently analysed
|
63
|
+
// "ca" stores the incomplete analysis string
|
64
|
+
// "analyses" stores the analyses found so far
|
65
|
+
|
66
|
+
if (analyses.size() > 10000)
|
67
|
+
return; // limit the maximal number of analyses
|
68
|
+
|
69
|
+
// Is the input string fully analyzed and the current node a final node?
|
70
|
+
if (finalp[n] && ipos == input.size())
|
71
|
+
// store the new analysis
|
72
|
+
analyses.push_back(ca);
|
73
|
+
|
74
|
+
// follow the epsilon transitions
|
75
|
+
// first_arc[n] is the number of the first outgoing transition of node n
|
76
|
+
// first_arc[n+1]-1 is the number of the last outgoing transition of node n
|
77
|
+
// first_arc[n+1] is the number of the first outgoing transition of node n+1
|
78
|
+
unsigned int i;
|
79
|
+
for( i=first_arc[n];
|
80
|
+
i<first_arc[n+1] && label[i].upper_char() == Label::epsilon;
|
81
|
+
i++)
|
82
|
+
{
|
83
|
+
ca.push_back(i);
|
84
|
+
analyze(target_node[i], input, ipos, ca, analyses);
|
85
|
+
ca.pop_back();
|
86
|
+
}
|
87
|
+
|
88
|
+
// follow the non-epsilon transitions
|
89
|
+
|
90
|
+
// scan the next input symbol
|
91
|
+
if (ipos < input.size()) {
|
92
|
+
// find the set of arcs with matching upper character in the sorted list
|
93
|
+
pair<Label*,Label*>range =
|
94
|
+
equal_range(label+i, label+first_arc[n+1], Label(input[ipos]),
|
95
|
+
label_less());
|
96
|
+
unsigned int to = (unsigned int)(range.second - label);
|
97
|
+
|
98
|
+
// follow the non-epsilon transitions
|
99
|
+
for( i=(unsigned)(range.first-label); i<to; i++) {
|
100
|
+
ca.push_back(i);
|
101
|
+
analyze(target_node[i], input, ipos+1, ca, analyses);
|
102
|
+
ca.pop_back();
|
103
|
+
}
|
104
|
+
}
|
105
|
+
}
|
106
|
+
|
107
|
+
|
108
|
+
/*******************************************************************/
|
109
|
+
/* */
|
110
|
+
/* CompactTransducer::analyze_string */
|
111
|
+
/* */
|
112
|
+
/*******************************************************************/
|
113
|
+
|
114
|
+
void CompactTransducer::analyze_string( char *s, vector<CAnalysis> &analyses )
|
115
|
+
|
116
|
+
{
|
117
|
+
// "s" input string to be analyzed
|
118
|
+
// "analyses" is the data structure in which the results are stored
|
119
|
+
// and returned
|
120
|
+
|
121
|
+
vector<Character> input;
|
122
|
+
|
123
|
+
alphabet.string2symseq( s, input );
|
124
|
+
|
125
|
+
analyses.clear();
|
126
|
+
CAnalysis ca; // data structure where the current incomplete analysis
|
127
|
+
// is stored
|
128
|
+
analyze(0, input, 0, ca, analyses); // start the analysis
|
129
|
+
|
130
|
+
if (analyses.size() > 10000)
|
131
|
+
fprintf(stderr,"Warning: Only the first 10000 analyses considered for \"%s\"!\n", s);
|
132
|
+
|
133
|
+
if (simplest_only && analyses.size() > 1)
|
134
|
+
disambiguate( analyses ); // select the simplest analyses
|
135
|
+
}
|
136
|
+
|
137
|
+
|
138
|
+
|
139
|
+
/*******************************************************************/
|
140
|
+
/* */
|
141
|
+
/* CompactTransducer::~CompactTransducer */
|
142
|
+
/* */
|
143
|
+
/*******************************************************************/
|
144
|
+
|
145
|
+
// Releases the arrays owned by the transducer.
CompactTransducer::~CompactTransducer()

{
  // per-arc arrays
  delete[] arc_logprob;
  delete[] target_node;
  delete[] label;

  // per-node arrays
  delete[] final_logprob;
  delete[] first_arc;
  delete[] finalp;
}
|
155
|
+
|
156
|
+
|
157
|
+
/*******************************************************************/
|
158
|
+
/* */
|
159
|
+
/* CompactTransducer::CompactTransducer */
|
160
|
+
/* */
|
161
|
+
/*******************************************************************/
|
162
|
+
|
163
|
+
// Creates an empty transducer: no nodes, no arcs, no arrays allocated,
// and both option flags switched off.
CompactTransducer::CompactTransducer()

{
  // option flags
  simplest_only = false;
  both_layers = false;

  // sizes
  number_of_arcs = 0;
  number_of_nodes = 0;

  // arrays (none allocated yet)
  finalp = NULL;
  first_arc = NULL;
  label = NULL;
  target_node = NULL;
  final_logprob = (float*)NULL;
  arc_logprob = (float*)NULL;
}
|
176
|
+
|
177
|
+
|
178
|
+
/*******************************************************************/
|
179
|
+
/* */
|
180
|
+
/* CompactTransducer::read_finalp */
|
181
|
+
/* */
|
182
|
+
/*******************************************************************/
|
183
|
+
|
184
|
+
void CompactTransducer::read_finalp( FILE *file )
|
185
|
+
|
186
|
+
{
|
187
|
+
int k=0;
|
188
|
+
unsigned char n=0;
|
189
|
+
for( size_t i=0; i<number_of_nodes; i++ ) {
|
190
|
+
if (k == 0) {
|
191
|
+
n = (unsigned char)fgetc(file);
|
192
|
+
k = 8;
|
193
|
+
}
|
194
|
+
k--;
|
195
|
+
if (n & (1 << k))
|
196
|
+
finalp[i] = 1;
|
197
|
+
else
|
198
|
+
finalp[i] = 0;
|
199
|
+
}
|
200
|
+
}
|
201
|
+
|
202
|
+
|
203
|
+
/*******************************************************************/
|
204
|
+
/* */
|
205
|
+
/* CompactTransducer::read_first_arcs */
|
206
|
+
/* */
|
207
|
+
/*******************************************************************/
|
208
|
+
|
209
|
+
void CompactTransducer::read_first_arcs( FILE *file )
|
210
|
+
|
211
|
+
{
|
212
|
+
int k=0;
|
213
|
+
unsigned int n=0;
|
214
|
+
int bits=(int)ceil(log(number_of_arcs+1)/log(2));
|
215
|
+
|
216
|
+
for( size_t i=0; i<=number_of_nodes; i++ ) {
|
217
|
+
first_arc[i] = n >> (sizeof(n)*8 - bits);
|
218
|
+
n <<= bits;
|
219
|
+
k -= bits;
|
220
|
+
if (k < 0) {
|
221
|
+
read_num(&n,sizeof(n),file);
|
222
|
+
first_arc[i] |= n >> (sizeof(n)*8 + k);
|
223
|
+
n <<= -k;
|
224
|
+
k += (int)sizeof(n) * 8;
|
225
|
+
}
|
226
|
+
}
|
227
|
+
}
|
228
|
+
|
229
|
+
|
230
|
+
/*******************************************************************/
|
231
|
+
/* */
|
232
|
+
/* CompactTransducer::read_target_nodes */
|
233
|
+
/* */
|
234
|
+
/*******************************************************************/
|
235
|
+
|
236
|
+
void CompactTransducer::read_target_nodes( FILE *file )
|
237
|
+
|
238
|
+
{
|
239
|
+
int k=0;
|
240
|
+
unsigned int n=0;
|
241
|
+
int bits=(int)ceil(log(number_of_nodes)/log(2));
|
242
|
+
|
243
|
+
for( size_t i=0; i<number_of_arcs; i++ ) {
|
244
|
+
target_node[i] = n >> (sizeof(n)*8 - bits);
|
245
|
+
n <<= bits;
|
246
|
+
k -= bits;
|
247
|
+
if (k < 0) {
|
248
|
+
read_num(&n,sizeof(n),file);
|
249
|
+
target_node[i] |= n >> (sizeof(n)*8 + k);
|
250
|
+
n <<= -k;
|
251
|
+
k += (int)sizeof(n) * 8;
|
252
|
+
}
|
253
|
+
}
|
254
|
+
}
|
255
|
+
|
256
|
+
|
257
|
+
/*******************************************************************/
|
258
|
+
/* */
|
259
|
+
/* CompactTransducer::read_labels */
|
260
|
+
/* */
|
261
|
+
/*******************************************************************/
|
262
|
+
|
263
|
+
void CompactTransducer::read_labels( FILE *file )
|
264
|
+
|
265
|
+
{
|
266
|
+
size_t N=0;
|
267
|
+
vector<Label> Num2Label(alphabet.size());
|
268
|
+
for( Alphabet::const_iterator it=alphabet.begin();
|
269
|
+
it != alphabet.end(); it++ )
|
270
|
+
{
|
271
|
+
Label l=*it;
|
272
|
+
Num2Label[N++] = l;
|
273
|
+
}
|
274
|
+
|
275
|
+
int k=0;
|
276
|
+
unsigned int n=0;
|
277
|
+
int bits=(int)ceil(log((double)alphabet.size())/log(2));
|
278
|
+
|
279
|
+
for( size_t i=0; i<number_of_arcs; i++ ) {
|
280
|
+
unsigned int l = n >> (sizeof(n)*8 - bits);
|
281
|
+
n <<= bits;
|
282
|
+
k -= bits;
|
283
|
+
if (k < 0) {
|
284
|
+
read_num(&n,sizeof(n),file);
|
285
|
+
l |= n >> (sizeof(n)*8 + k);
|
286
|
+
n <<= -k;
|
287
|
+
k += (int)sizeof(n) * 8;
|
288
|
+
}
|
289
|
+
label[i] = Num2Label[l];
|
290
|
+
}
|
291
|
+
}
|
292
|
+
|
293
|
+
|
294
|
+
/*******************************************************************/
|
295
|
+
/* */
|
296
|
+
/* CompactTransducer::read_probs */
|
297
|
+
/* */
|
298
|
+
/*******************************************************************/
|
299
|
+
|
300
|
+
void CompactTransducer::read_probs( FILE *file )
|
301
|
+
|
302
|
+
{
|
303
|
+
size_t n,m;
|
304
|
+
fread(&n, sizeof(n), 1, file);
|
305
|
+
if (fread(&m, sizeof(n), 1, file) != 1 ||
|
306
|
+
n != node_count() || m != arc_count())
|
307
|
+
{
|
308
|
+
fprintf(stderr,"Error: incompatible probability file!\n");
|
309
|
+
exit(1);
|
310
|
+
}
|
311
|
+
final_logprob = new float[n];
|
312
|
+
arc_logprob = new float[m];
|
313
|
+
fread(final_logprob, sizeof(float), n, file);
|
314
|
+
if (fread(arc_logprob, sizeof(float), n, file) != n) {
|
315
|
+
fprintf(stderr,"Error: in probability file!\n");
|
316
|
+
exit(1);
|
317
|
+
}
|
318
|
+
}
|
319
|
+
|
320
|
+
|
321
|
+
/*******************************************************************/
|
322
|
+
/* */
|
323
|
+
/* CompactTransducer::CompactTransducer */
|
324
|
+
/* */
|
325
|
+
/*******************************************************************/
|
326
|
+
|
327
|
+
// Reads a compact transducer from "file" and, if "pfile" is not NULL,
// the associated probabilities from "pfile".
//
// Throws a C string (const char*) if the file does not start with the
// marker byte 'c' of the compact transducer format.
CompactTransducer::CompactTransducer( FILE *file, FILE *pfile )

{
  both_layers = false;
  simplest_only = false;

  // Put all sizes and array pointers into a defined state first. If the
  // stream is in an error state below, the arrays are never allocated,
  // but the destructor still runs delete[] on every pointer.
  number_of_nodes = 0;
  number_of_arcs = 0;
  finalp = NULL;
  first_arc = NULL;
  label = NULL;
  target_node = NULL;
  arc_logprob = final_logprob = (float*)NULL;

  if (fgetc(file) != 'c')
    throw "Error: wrong file format (not a compact transducer)\n";

  alphabet.read(file);

  read_num(&number_of_nodes,sizeof(number_of_nodes),file);
  read_num(&number_of_arcs,sizeof(number_of_arcs),file);

  if (!ferror(file)) {
    // memory allocation
    finalp = new char[number_of_nodes];
    first_arc = new unsigned[number_of_nodes+1];
    label = new Label[number_of_arcs];
    target_node = new unsigned[number_of_arcs];

    // reading the data
    read_finalp(file);
    read_first_arcs(file);
    read_labels(file);
    read_target_nodes(file);
  }

  if (pfile == NULL)
    arc_logprob = final_logprob = (float*)NULL;
  else
    read_probs(pfile);
}
|
360
|
+
|
361
|
+
|
362
|
+
/*******************************************************************/
|
363
|
+
/* */
|
364
|
+
/* CompactTransducer::longest_match2 */
|
365
|
+
/* */
|
366
|
+
/*******************************************************************/
|
367
|
+
|
368
|
+
void CompactTransducer::longest_match2(unsigned int n, char *string, int l,
|
369
|
+
CAnalysis &ca, int &bl, CAnalysis &ba)
|
370
|
+
{
|
371
|
+
// n: transducer state
|
372
|
+
// string: rest string
|
373
|
+
// l: length of current analysis
|
374
|
+
// bl: length of the currently longest match
|
375
|
+
// ca: current analysis
|
376
|
+
// ba: best analysis
|
377
|
+
|
378
|
+
if (finalp[n] && l > bl) {
|
379
|
+
// store the new analysis
|
380
|
+
bl = l;
|
381
|
+
ba = ca; // copy the arc vector
|
382
|
+
}
|
383
|
+
|
384
|
+
// follow the epsilon transitions
|
385
|
+
unsigned int i;
|
386
|
+
for( i=first_arc[n];
|
387
|
+
i<first_arc[n+1] && label[i].upper_char() == Label::epsilon;
|
388
|
+
i++)
|
389
|
+
{
|
390
|
+
ca.push_back(i);
|
391
|
+
longest_match2(target_node[i], string, l, ca, bl, ba);
|
392
|
+
ca.pop_back();
|
393
|
+
}
|
394
|
+
|
395
|
+
// follow the non-epsilon transitions
|
396
|
+
char *end=string;
|
397
|
+
int c=alphabet.next_code(end, false, false);
|
398
|
+
l += (int)(end - string);
|
399
|
+
if (c != EOF) {
|
400
|
+
// find the set of arcs with matching upper character in the sort list
|
401
|
+
pair<Label*,Label*>range =
|
402
|
+
equal_range(label+i, label+first_arc[n+1], Label((Character)c),
|
403
|
+
label_less());
|
404
|
+
unsigned int to = (unsigned int)(range.second - label);
|
405
|
+
for( i=(unsigned)(range.first-label); i<to; i++) {
|
406
|
+
ca.push_back(i);
|
407
|
+
longest_match2(target_node[i], end, l, ca, bl, ba);
|
408
|
+
ca.pop_back();
|
409
|
+
}
|
410
|
+
}
|
411
|
+
}
|
412
|
+
|
413
|
+
|
414
|
+
/*******************************************************************/
|
415
|
+
/* */
|
416
|
+
/* CompactTransducer::print_analysis */
|
417
|
+
/* */
|
418
|
+
/*******************************************************************/
|
419
|
+
|
420
|
+
// Converts the analysis "cana" (a sequence of arc numbers) into a label
// sequence and returns the string which the alphabet produces for it;
// the member flag both_layers is passed on to the alphabet.
char *CompactTransducer::print_analysis( CAnalysis &cana )

{
  Analysis labels;
  convert(cana, labels);
  char *printed = alphabet.print_analysis( labels, both_layers );
  return printed;
}
|
427
|
+
|
428
|
+
|
429
|
+
/*******************************************************************/
|
430
|
+
/* */
|
431
|
+
/* CompactTransducer::longest_match */
|
432
|
+
/* */
|
433
|
+
/*******************************************************************/
|
434
|
+
|
435
|
+
const char *CompactTransducer::longest_match( char* &string )
|
436
|
+
|
437
|
+
{
|
438
|
+
vector<char> analysis;
|
439
|
+
CAnalysis ca, ba;
|
440
|
+
int l=0;
|
441
|
+
longest_match2(0, string, 0, ca, l, ba);
|
442
|
+
|
443
|
+
// no match? return the next character
|
444
|
+
if (ba.size() == 0) {
|
445
|
+
int c=alphabet.next_code(string, false, false);
|
446
|
+
return alphabet.code2symbol((Character)c);
|
447
|
+
}
|
448
|
+
|
449
|
+
string += l;
|
450
|
+
return print_analysis( ba );
|
451
|
+
}
|
452
|
+
|
453
|
+
|
454
|
+
/*******************************************************************/
|
455
|
+
/* */
|
456
|
+
/* CompactTransducer::disambiguate */
|
457
|
+
/* */
|
458
|
+
/*******************************************************************/
|
459
|
+
|
460
|
+
// Keeps only those analyses whose score (as computed by the alphabet) is
// maximal; the relative order of the remaining analyses is preserved.
void CompactTransducer::disambiguate( vector<CAnalysis> &analyses )

{
  const size_t total = analyses.size();

  // pass 1: score every analysis and remember the best score
  vector<int> score;
  int bestscore=INT_MIN;
  Analysis ana;
  for( size_t i=0; i<total; i++ ) {
    convert(analyses[i], ana);
    const int current = alphabet.compute_score(ana);
    score.push_back(current);
    if (current > bestscore)
      bestscore = current;
  }

  // pass 2: move the optimal analyses to the front and drop the rest
  size_t kept=0;
  for( size_t i=0; i<total; i++ ) {
    if (score[i] != bestscore)
      continue;
    analyses[kept] = analyses[i];
    kept++;
  }
  analyses.resize(kept);
}
|
482
|
+
|
483
|
+
|
484
|
+
/*******************************************************************/
|
485
|
+
/* */
|
486
|
+
/* CompactTransducer::train2 */
|
487
|
+
/* */
|
488
|
+
/*******************************************************************/
|
489
|
+
|
490
|
+
bool CompactTransducer::train2( char *s, vector<double> &arcfreq,
|
491
|
+
vector<double> &finalfreq )
|
492
|
+
{
|
493
|
+
vector<CAnalysis> analyses;
|
494
|
+
vector<Label> input;
|
495
|
+
alphabet.string2labelseq( s, input );
|
496
|
+
|
497
|
+
CAnalysis ca; // data structure where the analysis is stored
|
498
|
+
unsigned int n=0;
|
499
|
+
bool failure=false;
|
500
|
+
for( size_t i=0; i<input.size(); i++ ) {
|
501
|
+
failure = true;
|
502
|
+
for( unsigned int k=first_arc[n]; k<first_arc[n+1]; k++) {
|
503
|
+
if (label[k] == input[i]) {
|
504
|
+
ca.push_back(k);
|
505
|
+
n = target_node[k];
|
506
|
+
failure = false;
|
507
|
+
break;
|
508
|
+
}
|
509
|
+
}
|
510
|
+
if (failure)
|
511
|
+
break;
|
512
|
+
}
|
513
|
+
if (failure || !finalp[n]) {
|
514
|
+
fprintf(stderr,"Warning: The following input is not covered:\n%s\n", s);
|
515
|
+
return false;
|
516
|
+
}
|
517
|
+
|
518
|
+
for( size_t k=0; k<ca.size(); k++ )
|
519
|
+
arcfreq[ca[k]]++;
|
520
|
+
finalfreq[target_node[ca.back()]]++;
|
521
|
+
|
522
|
+
return true;
|
523
|
+
}
|
524
|
+
|
525
|
+
|
526
|
+
/*******************************************************************/
|
527
|
+
/* */
|
528
|
+
/* CompactTransducer::train */
|
529
|
+
/* */
|
530
|
+
/*******************************************************************/
|
531
|
+
|
532
|
+
bool CompactTransducer::train( char *s, vector<double> &arcfreq,
|
533
|
+
vector<double> &finalfreq )
|
534
|
+
{
|
535
|
+
vector<CAnalysis> analyses;
|
536
|
+
vector<Character> input;
|
537
|
+
alphabet.string2symseq( s, input );
|
538
|
+
|
539
|
+
CAnalysis ca; // data structure where the current incomplete analysis
|
540
|
+
// is stored
|
541
|
+
analyze(0, input, 0, ca, analyses); // start the analysis
|
542
|
+
|
543
|
+
if (analyses.size() > 10000)
|
544
|
+
return true; // ignore inputs with more than 10000 analyses
|
545
|
+
else if (analyses.size() == 0)
|
546
|
+
return false;
|
547
|
+
|
548
|
+
if (simplest_only && analyses.size() > 1)
|
549
|
+
disambiguate( analyses ); // select the simplest analyses
|
550
|
+
|
551
|
+
if (analyses.size() > 0) {
|
552
|
+
double incr = 1.0 / (double)analyses.size();
|
553
|
+
CAnalysis arcs;
|
554
|
+
|
555
|
+
for( size_t i=0; i<analyses.size(); i++ ) {
|
556
|
+
CAnalysis &arcs=analyses[i];
|
557
|
+
for( size_t k=0; k<arcs.size(); k++ )
|
558
|
+
arcfreq[arcs[k]] += incr;
|
559
|
+
finalfreq[target_node[arcs.back()]] += incr;
|
560
|
+
}
|
561
|
+
}
|
562
|
+
return true;
|
563
|
+
}
|
564
|
+
|
565
|
+
|
566
|
+
/*******************************************************************/
|
567
|
+
/* */
|
568
|
+
/* CompactTransducer::estimate_probs */
|
569
|
+
/* */
|
570
|
+
/*******************************************************************/
|
571
|
+
|
572
|
+
// Normalises the frequency counts node by node, in place.
// For every node index below finalfreq.size(), the node's final count and
// the counts of its arcs (indices first_arc[node] .. first_arc[node+1]-1)
// are divided by their common total. A total of exactly zero is replaced
// by one, so nodes without any counts keep all-zero values.
void CompactTransducer::estimate_probs( vector<double> &arcfreq,
                                        vector<double> &finalfreq )
{
  for( size_t node=0; node<finalfreq.size(); node++ ) {
    const size_t arc_begin = first_arc[node];
    const size_t arc_end = first_arc[node+1];

    // total mass leaving this node: stopping here plus every outgoing arc
    double total = finalfreq[node];
    for( size_t arc=arc_begin; arc<arc_end; arc++ )
      total += arcfreq[arc];

    if (total == 0.0)
      total = 1.0; // nothing observed; avoid dividing by zero

    finalfreq[node] /= total;
    for( size_t arc=arc_begin; arc<arc_end; arc++ )
      arcfreq[arc] /= total;
  }
}
|
587
|
+
|
588
|
+
|
589
|
+
|
590
|
+
/*******************************************************************/
|
591
|
+
/* */
|
592
|
+
/* CompactTransducer::compute_probs */
|
593
|
+
/* */
|
594
|
+
/*******************************************************************/
|
595
|
+
|
596
|
+
// Computes a normalised probability for every analysis and reorders
// `analyses` by decreasing probability.
//
// The unnormalised probability of an analysis is exp() of the sum of
// arc_logprob over its arcs plus final_logprob of the node it ends in.
// On return, prob[i] belongs to analyses[i] and has been divided by the
// sum of all unnormalised probabilities. Analyses with equal probability
// keep their original relative order.
void CompactTransducer::compute_probs( vector<CAnalysis> &analyses,
                                       vector<double> &prob )
{
  prob.resize(analyses.size());
  double sum=0.0;
  for( size_t i=0; i<analyses.size(); i++ ) {
    CAnalysis &a=analyses[i];

    // compute the probability
    double logprob=0.0;
    for( size_t k=0; k<a.size(); k++ )
      logprob += arc_logprob[a[k]];

    // An analysis without arcs has no last arc; calling back() on it
    // would be undefined behaviour. Such a path ends where it started.
    // NOTE(review): assumes the start node is 0, as in the analyze(0, ...)
    // call used for training -- confirm for other producers of analyses.
    size_t final_node = 0;
    if (a.size() > 0)
      final_node = target_node[a.back()];
    logprob += final_logprob[final_node];

    prob[i] = exp(logprob);
    sum += prob[i];
  }

  // sort the analyses: repeatedly pick the most probable remaining entry
  vector<CAnalysis> oldanalyses(analyses);
  vector<double> oldprob(prob);
  for( size_t i=0; i<analyses.size(); i++ ) {
    prob[i] = -1.0;
    size_t n=0;
    for( size_t k=0; k<oldanalyses.size(); k++ )
      if (prob[i] < oldprob[k]) {
        prob[i] = oldprob[k];
        n = k;
      }
    analyses[i] = oldanalyses[n];
    oldprob[n] = -1.0; // mark as consumed; real probabilities are never negative
    prob[i] /= sum; // normalization
  }
}
|
629
|
+
}
|