ruby-sfst 0.4.3 → 0.4.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -0
- data/COPYING +280 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +54 -0
- data/README.md +1 -1
- data/Rakefile +9 -18
- data/bin/console +7 -0
- data/bin/setup +6 -0
- data/ext/sfst/alphabet.cc +879 -0
- data/ext/sfst/alphabet.h +302 -0
- data/ext/sfst/basic.cc +85 -0
- data/ext/{sfst_machine → sfst}/basic.h +7 -4
- data/ext/sfst/compact.cc +629 -0
- data/ext/sfst/compact.h +100 -0
- data/ext/sfst/determinise.cc +279 -0
- data/ext/{sfst_machine → sfst}/extconf.rb +2 -1
- data/ext/sfst/fst.cc +1150 -0
- data/ext/sfst/fst.h +374 -0
- data/ext/sfst/hopcroft.cc +681 -0
- data/ext/sfst/interface.cc +1921 -0
- data/ext/sfst/interface.h +171 -0
- data/ext/sfst/make-compact.cc +323 -0
- data/ext/{sfst_machine → sfst}/make-compact.h +15 -13
- data/ext/sfst/mem.h +80 -0
- data/ext/sfst/operators.cc +1273 -0
- data/ext/{sfst_machine → sfst}/sfst_machine.cc +89 -78
- data/ext/sfst/sgi.h +72 -0
- data/ext/sfst/utf8.cc +149 -0
- data/ext/{sfst_machine → sfst}/utf8.h +7 -4
- data/lib/sfst.rb +2 -1
- data/lib/sfst/version.rb +1 -1
- data/ruby-sfst.gemspec +23 -23
- metadata +107 -35
- data/ext/sfst_machine/alphabet.cc +0 -812
- data/ext/sfst_machine/alphabet.h +0 -273
- data/ext/sfst_machine/basic.cc +0 -84
- data/ext/sfst_machine/compact.cc +0 -616
- data/ext/sfst_machine/compact.h +0 -98
- data/ext/sfst_machine/determinise.cc +0 -303
- data/ext/sfst_machine/fst.cc +0 -1000
- data/ext/sfst_machine/fst.h +0 -369
- data/ext/sfst_machine/interface.cc +0 -1842
- data/ext/sfst_machine/interface.h +0 -93
- data/ext/sfst_machine/make-compact.cc +0 -327
- data/ext/sfst_machine/mem.h +0 -74
- data/ext/sfst_machine/operators.cc +0 -1131
- data/ext/sfst_machine/sgi.h +0 -44
- data/ext/sfst_machine/utf8.cc +0 -146
- data/test/test_sfst.fst +0 -3
- data/test/test_sfst.rb +0 -114
data/ext/sfst/alphabet.h
ADDED
@@ -0,0 +1,302 @@
|
|
1
|
+
/*******************************************************************/
|
2
|
+
/* */
|
3
|
+
/* FILE alphabet.h */
|
4
|
+
/* MODULE alphabet */
|
5
|
+
/* PROGRAM SFST */
|
6
|
+
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
|
7
|
+
/* */
|
8
|
+
/* PURPOSE finite state tools */
|
9
|
+
/* */
|
10
|
+
/*******************************************************************/
|
11
|
+
|
12
|
+
#ifndef _ALPHABET_H_
|
13
|
+
#define _ALPHABET_H_
|
14
|
+
|
15
|
+
#include <stdio.h>
|
16
|
+
|
17
|
+
#include "basic.h"
|
18
|
+
|
19
|
+
#include <set>
|
20
|
+
using std::set;
|
21
|
+
|
22
|
+
#include <vector>
|
23
|
+
using std::vector;
|
24
|
+
|
25
|
+
#include <iostream>
|
26
|
+
using std::ostream;
|
27
|
+
|
28
|
+
#include <cstring>
|
29
|
+
|
30
|
+
#include "sgi.h"
|
31
|
+
|
32
|
+
#define SFSTVersion "1.4.7a"
|
33
|
+
|
34
|
+
namespace SFST {
|
35
|
+
|
36
|
+
#ifndef CODE_DATA_TYPE
|
37
|
+
typedef unsigned short Character; // data type of the symbol codes
|
38
|
+
#else
|
39
|
+
typedef unsigned CODE_DATA_TYPE Character;
|
40
|
+
#endif
|
41
|
+
|
42
|
+
// data type used to indicate whether some action is to be performed
|
43
|
+
// on the analysis level (lower) or the surface level (upper)
|
44
|
+
typedef enum {upper, lower, both} Level;
|
45
|
+
|
46
|
+
|
47
|
+
/***************** class Label ***********************************/
|
48
|
+
|
49
|
+
class Label {
|
50
|
+
|
51
|
+
private:
|
52
|
+
// data structure where the two symbols are stored
|
53
|
+
struct {
|
54
|
+
Character lower;
|
55
|
+
Character upper;
|
56
|
+
} label;
|
57
|
+
|
58
|
+
public:
|
59
|
+
static const Character epsilon=0; // code of the empty symbol
|
60
|
+
|
61
|
+
// new label with two identical symbols
|
62
|
+
Label( Character c=epsilon ) { label.lower = label.upper = c; };
|
63
|
+
|
64
|
+
// new label with two different symbols
|
65
|
+
Label( Character c1, Character c2 )
|
66
|
+
{ label.lower = c1; label.upper = c2; };
|
67
|
+
|
68
|
+
// returns the indicated symbol of the label
|
69
|
+
Character get_char( Level l ) const
|
70
|
+
{ return ((l==upper)? label.upper: label.lower); };
|
71
|
+
|
72
|
+
// returns the "upper" symbol of the label (i.e. the surface symbol)
|
73
|
+
Character upper_char() const { return label.upper; };
|
74
|
+
|
75
|
+
// returns the "lower" symbol of the label (i.e. the analysis symbol)
|
76
|
+
Character lower_char() const { return label.lower; };
|
77
|
+
|
78
|
+
// replaces symbols in a label
|
79
|
+
Label replace_char( Character c, Character nc ) const {
|
80
|
+
Label l = *this;
|
81
|
+
if (l.label.lower == c)
|
82
|
+
l.label.lower = nc;
|
83
|
+
if (l.label.upper == c)
|
84
|
+
l.label.upper = nc;
|
85
|
+
return l;
|
86
|
+
};
|
87
|
+
|
88
|
+
// operators checking the equality of labels
|
89
|
+
int operator==( Label l ) const
|
90
|
+
{ return (label.lower==l.label.lower && label.upper==l.label.upper); };
|
91
|
+
int operator!=( Label l ) const
|
92
|
+
{ return !(l == *this); };
|
93
|
+
|
94
|
+
// comparison operator needed for sorting labels in compact.C
|
95
|
+
int operator<( Label l ) const {
|
96
|
+
if (upper_char() < l.upper_char())
|
97
|
+
return true;
|
98
|
+
if (upper_char() > l.upper_char())
|
99
|
+
return false;
|
100
|
+
if (lower_char() < l.lower_char())
|
101
|
+
return true;
|
102
|
+
return false;
|
103
|
+
};
|
104
|
+
int operator>( Label l ) const {
|
105
|
+
if (upper_char() > l.upper_char())
|
106
|
+
return true;
|
107
|
+
if (upper_char() < l.upper_char())
|
108
|
+
return false;
|
109
|
+
if (lower_char() > l.lower_char())
|
110
|
+
return true;
|
111
|
+
return false;
|
112
|
+
};
|
113
|
+
|
114
|
+
// check whether the label is epsilon (i.e. both symbols are epsilon)
|
115
|
+
// transitions with epsilon labels are epsilon transitions
|
116
|
+
int is_epsilon() const
|
117
|
+
{ return (label.upper == epsilon && label.lower == epsilon); };
|
118
|
+
|
119
|
+
// check whether the "upper" symbol is epsilon
|
120
|
+
int upper_is_epsilon() const
|
121
|
+
{ return (label.upper == epsilon); };
|
122
|
+
|
123
|
+
// check whether the "lower" symbol is epsilon
|
124
|
+
int lower_is_epsilon() const
|
125
|
+
{ return (label.lower == epsilon); };
|
126
|
+
|
127
|
+
// hash function needed to store labels in a hash table
|
128
|
+
struct label_hash {
|
129
|
+
size_t operator() ( const Label l ) const {
|
130
|
+
return (size_t)l.lower_char() ^
|
131
|
+
((size_t)l.upper_char() << 16) ^
|
132
|
+
((size_t)l.upper_char() >> 16);
|
133
|
+
}
|
134
|
+
};
|
135
|
+
|
136
|
+
// comparison function needed to store labels in a map table
|
137
|
+
struct label_cmp {
|
138
|
+
bool operator() ( const Label l1, const Label l2 ) const {
|
139
|
+
return (l1.lower_char() < l2.lower_char() ||
|
140
|
+
(l1.lower_char() == l2.lower_char() &&
|
141
|
+
l1.upper_char() < l2.upper_char()));
|
142
|
+
}
|
143
|
+
};
|
144
|
+
|
145
|
+
// comparison operator needed to store labels in a hash table
|
146
|
+
struct label_eq {
|
147
|
+
bool operator() ( const Label l1, const Label l2 ) const {
|
148
|
+
return (l1.lower_char() == l2.lower_char() &&
|
149
|
+
l1.upper_char() == l2.upper_char());
|
150
|
+
}
|
151
|
+
};
|
152
|
+
};
|
153
|
+
|
154
|
+
typedef vector<Label> Analysis;
|
155
|
+
|
156
|
+
|
157
|
+
/***************** class Alphabet *******************************/
|
158
|
+
|
159
|
+
class Alphabet {
|
160
|
+
|
161
|
+
// string comparison operators needed to stored strings in a hash table
|
162
|
+
struct eqstr {
|
163
|
+
bool operator()(const char* s1, const char* s2) const {
|
164
|
+
return strcmp(s1, s2) == 0;
|
165
|
+
}
|
166
|
+
};
|
167
|
+
|
168
|
+
// data structure storing labels without repetitions (i.e. as a set)
|
169
|
+
typedef set<Label, Label::label_cmp> LabelSet;
|
170
|
+
|
171
|
+
// hash table used to map the symbols to their codes
|
172
|
+
typedef hash_map<const char*, Character, hash<const char*>,eqstr> SymbolMap;
|
173
|
+
|
174
|
+
public: // HFST addition
|
175
|
+
// hash table used to map the codes back to the symbols
|
176
|
+
typedef hash_map<Character, char*> CharMap;
|
177
|
+
|
178
|
+
// HFST addition
|
179
|
+
bool operator==(const Alphabet &alpha) const;
|
180
|
+
|
181
|
+
private:
|
182
|
+
SymbolMap sm; // maps symbols to codes
|
183
|
+
CharMap cm; // maps codes to symbols
|
184
|
+
LabelSet ls; // set of labels known to the alphabet
|
185
|
+
|
186
|
+
// add a new symbol with symbol code c
|
187
|
+
void add( const char *symbol, Character c );
|
188
|
+
|
189
|
+
public:
|
190
|
+
bool utf8;
|
191
|
+
|
192
|
+
// iterators over the set of known labels
|
193
|
+
typedef LabelSet::iterator iterator;
|
194
|
+
typedef LabelSet::const_iterator const_iterator;
|
195
|
+
Alphabet();
|
196
|
+
~Alphabet() { clear(); };
|
197
|
+
const_iterator begin() const { return ls.begin(); };
|
198
|
+
const_iterator end() const { return ls.end(); };
|
199
|
+
size_t size() const { return ls.size(); };
|
200
|
+
|
201
|
+
// HFST additions
|
202
|
+
CharMap get_char_map(void) { return cm; };
|
203
|
+
void print(void);
|
204
|
+
|
205
|
+
|
206
|
+
void clear();
|
207
|
+
void clear_char_pairs() { ls.clear(); };
|
208
|
+
|
209
|
+
// lookup a label in the alphabet
|
210
|
+
iterator find( Label l ) { return ls.find(l); };
|
211
|
+
|
212
|
+
// insert a label in the alphabet
|
213
|
+
void insert( Label l ) { if (!l.is_epsilon()) ls.insert(l); };
|
214
|
+
|
215
|
+
// insert the known symbols from another alphabet
|
216
|
+
void insert_symbols( const Alphabet& );
|
217
|
+
|
218
|
+
// insert the labels and known symbols from another alphabet
|
219
|
+
void copy( const Alphabet &a, Level level=both );
|
220
|
+
|
221
|
+
// create the alphabet of a transducer obtained by a composition operation
|
222
|
+
void compose( const Alphabet &la, const Alphabet &ua );
|
223
|
+
|
224
|
+
// add a symbol to the alphabet and return its code
|
225
|
+
Character add_symbol(const char *symbol);
|
226
|
+
|
227
|
+
// add a symbol to the alphabet with a given code
|
228
|
+
void add_symbol(const char *symbol, Character c );
|
229
|
+
|
230
|
+
// create a new marker symbol and return its code
|
231
|
+
Character new_marker( void );
|
232
|
+
void delete_markers();
|
233
|
+
|
234
|
+
// compute the complement of a symbol set
|
235
|
+
void complement( vector<Character> &sym );
|
236
|
+
|
237
|
+
// return the code of the argument symbol
|
238
|
+
int symbol2code( const char *s ) const {
|
239
|
+
SymbolMap::const_iterator p = sm.find(s);
|
240
|
+
if (p != sm.end()) return p->second;
|
241
|
+
return EOF;
|
242
|
+
};
|
243
|
+
|
244
|
+
// return the symbol for the given symbol code
|
245
|
+
const char *code2symbol( Character c ) const {
|
246
|
+
CharMap::const_iterator p=cm.find(c);
|
247
|
+
if (p == cm.end())
|
248
|
+
return NULL;
|
249
|
+
else
|
250
|
+
return p->second;
|
251
|
+
};
|
252
|
+
|
253
|
+
// write the symbol for the given symbol code into a string
|
254
|
+
void write_char( Character c, char *buffer, int *pos,
|
255
|
+
bool with_brackets=true ) const;
|
256
|
+
|
257
|
+
// write the symbol pair of a given label into a string
|
258
|
+
void write_label( Label l, char *buffer, int *pos,
|
259
|
+
bool with_brackets=true ) const;
|
260
|
+
|
261
|
+
// write the symbol for the given symbol code into a buffer and return
|
262
|
+
// a pointer to it
|
263
|
+
// the flag "with_brackets" indicates whether the angle brackets
|
264
|
+
// surrounding multi-character symbols are to be printed or not
|
265
|
+
const char *write_char( Character c, bool with_brackets=true ) const;
|
266
|
+
|
267
|
+
// write the symbol pair of a given label into a string
|
268
|
+
// and return a pointer to it
|
269
|
+
const char *write_label( Label l, bool with_brackets=true ) const;
|
270
|
+
|
271
|
+
// scan the next multi-character symbol in the argument string
|
272
|
+
int next_mcsym( char*&, bool insert=true );
|
273
|
+
|
274
|
+
// scan the next symbol in the argument string
|
275
|
+
int next_code( char*&, bool extended=true, bool insert=true );
|
276
|
+
|
277
|
+
// convert a character string into a symbol or label sequence
|
278
|
+
void string2symseq( char*, vector<Character>& );
|
279
|
+
void string2labelseq( char*, vector<Label>& );
|
280
|
+
|
281
|
+
// scan the next label in the argument string
|
282
|
+
Label next_label( char*&, bool extended=true );
|
283
|
+
|
284
|
+
// store the alphabet in the argument file (in binary form)
|
285
|
+
void store( FILE* ) const;
|
286
|
+
|
287
|
+
// read the alphabet from the argument file
|
288
|
+
void read( FILE* );
|
289
|
+
|
290
|
+
// disambiguation and printing of analyses
|
291
|
+
int compute_score( Analysis &ana );
|
292
|
+
void disambiguate( vector<Analysis> &analyses );
|
293
|
+
char *print_analysis( Analysis &ana, bool both_layers );
|
294
|
+
|
295
|
+
friend ostream &operator<<(ostream&, const Alphabet&);
|
296
|
+
};
|
297
|
+
|
298
|
+
// write the alphabet to the output stream (in readable form)
|
299
|
+
ostream &operator<<(ostream&, const Alphabet&);
|
300
|
+
}
|
301
|
+
|
302
|
+
#endif
|
data/ext/sfst/basic.cc
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
|
2
|
+
/*******************************************************************/
|
3
|
+
/* */
|
4
|
+
/* FILE basic.C */
|
5
|
+
/* MODULE basic */
|
6
|
+
/* PROGRAM SFST */
|
7
|
+
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
|
8
|
+
/* */
|
9
|
+
/* PURPOSE */
|
10
|
+
/* */
|
11
|
+
/*******************************************************************/
|
12
|
+
|
13
|
+
#include <stdlib.h>
|
14
|
+
#include <string.h>
|
15
|
+
|
16
|
+
#include "basic.h"
|
17
|
+
|
18
|
+
namespace SFST {
|
19
|
+
|
20
|
+
bool Switch_Bytes=false;
|
21
|
+
|
22
|
+
|
23
|
+
/*******************************************************************/
|
24
|
+
/* */
|
25
|
+
/* fst_strdup */
|
26
|
+
/* */
|
27
|
+
/*******************************************************************/
|
28
|
+
|
29
|
+
// Returns a malloc'ed copy of the NUL-terminated string pString.
// The caller owns the copy and releases it with free().
// If the allocation fails, an error is printed to stderr and the
// process terminates with exit status 1.
char* fst_strdup(const char* pString)
{
  const size_t bytes_needed = strlen(pString) + 1;  // +1 for the terminator
  char* duplicate = (char*)malloc(bytes_needed);

  if (duplicate != NULL) {
    strcpy(duplicate, pString);
    return duplicate;
  }

  fprintf(stderr, "\nError: out of memory (malloc failed)\naborted.\n");
  exit(1);
}
|
40
|
+
|
41
|
+
|
42
|
+
/*******************************************************************/
|
43
|
+
/* */
|
44
|
+
/* read_string */
|
45
|
+
/* */
|
46
|
+
/*******************************************************************/
|
47
|
+
|
48
|
+
// Reads a NUL-terminated string from "file" into "buffer".
//
//   buffer  destination with room for at least "size" bytes
//   size    capacity of "buffer" in bytes
//   file    stream to read from
//
// Returns 1 if a terminating NUL byte was read from the file, and 0 if
// the end of file was reached first or the string did not fit. The buffer
// is always NUL-terminated when size > 0. If the string does not fit, the
// first size-1 characters are kept; "size" characters have already been
// consumed from the file at that point.
int read_string( char *buffer, int size, FILE *file )
{
  // Without room for even the terminator nothing can be stored; the
  // trailing "buffer[size-1] = 0" below would write before the buffer.
  if (buffer == NULL || size <= 0)
    return 0;

  for( int i=0; i<size; i++ ) {
    int c=fgetc(file);
    if (c == EOF || c == 0) {
      buffer[i] = 0;
      return (c==0);
    }
    buffer[i] = (char)c;
  }

  // buffer full and no terminator seen: truncate
  buffer[size-1] = 0;
  return 0;
}
|
62
|
+
|
63
|
+
|
64
|
+
/*******************************************************************/
|
65
|
+
/* */
|
66
|
+
/* read_num */
|
67
|
+
/* */
|
68
|
+
/*******************************************************************/
|
69
|
+
|
70
|
+
size_t read_num( void *p, size_t n, FILE *file )
|
71
|
+
|
72
|
+
{
|
73
|
+
char *pp=(char*)p;
|
74
|
+
size_t result=fread( pp, 1, n, file );
|
75
|
+
if (Switch_Bytes) {
|
76
|
+
size_t e=n/2;
|
77
|
+
for( size_t i=0; i<e; i++ ) {
|
78
|
+
char tmp=pp[i];
|
79
|
+
pp[i] = pp[--n];
|
80
|
+
pp[n] = tmp;
|
81
|
+
}
|
82
|
+
}
|
83
|
+
return result;
|
84
|
+
}
|
85
|
+
}
|
@@ -15,10 +15,13 @@
|
|
15
15
|
|
16
16
|
#include <stdio.h>
|
17
17
|
|
18
|
-
|
18
|
+
namespace SFST {
|
19
19
|
|
20
|
-
|
21
|
-
int read_string( char *buffer, int size, FILE *file );
|
22
|
-
size_t read_num( void *p, size_t size, FILE *file );
|
20
|
+
extern bool Switch_Bytes;
|
23
21
|
|
22
|
+
char* fst_strdup(const char* pString);
|
23
|
+
int read_string( char *buffer, int size, FILE *file );
|
24
|
+
size_t read_num( void *p, size_t size, FILE *file );
|
25
|
+
|
26
|
+
}
|
24
27
|
#endif
|
data/ext/sfst/compact.cc
ADDED
@@ -0,0 +1,629 @@
|
|
1
|
+
/*******************************************************************/
|
2
|
+
/* */
|
3
|
+
/* FILE compact.C */
|
4
|
+
/* MODULE compact */
|
5
|
+
/* PROGRAM SFST */
|
6
|
+
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
|
7
|
+
/* */
|
8
|
+
/* PURPOSE Code needed for analysing data */
|
9
|
+
/* */
|
10
|
+
/*******************************************************************/
|
11
|
+
|
12
|
+
#include <stdio.h>
|
13
|
+
#include <math.h>
|
14
|
+
|
15
|
+
#include <limits.h>
|
16
|
+
|
17
|
+
#include "compact.h"
|
18
|
+
|
19
|
+
namespace SFST {
|
20
|
+
|
21
|
+
using std::equal_range;
|
22
|
+
using std::vector;
|
23
|
+
using std::pair;
|
24
|
+
|
25
|
+
class label_less {
|
26
|
+
public:
|
27
|
+
bool operator()(const Label l1, const Label l2) const {
|
28
|
+
return l1.upper_char() < l2.upper_char();
|
29
|
+
}
|
30
|
+
};
|
31
|
+
|
32
|
+
const int BUFFER_SIZE=1000;
|
33
|
+
|
34
|
+
|
35
|
+
/*******************************************************************/
|
36
|
+
/* */
|
37
|
+
/* CompactTransducer::convert */
|
38
|
+
/* */
|
39
|
+
/*******************************************************************/
|
40
|
+
|
41
|
+
// Translates a compact analysis (a sequence of arc numbers) into the
// sequence of labels of those arcs. "ana" is resized to the length of
// "cana" and overwritten.
void CompactTransducer::convert( CAnalysis &cana, Analysis &ana )

{
  const size_t length = cana.size();
  ana.resize(length);
  for( size_t pos=0; pos!=length; ++pos ) {
    const unsigned int arc = cana[pos];
    ana[pos] = label[arc];
  }
}
|
48
|
+
|
49
|
+
|
50
|
+
/*******************************************************************/
|
51
|
+
/* */
|
52
|
+
/* CompactTransducer::analyze */
|
53
|
+
/* */
|
54
|
+
/*******************************************************************/
|
55
|
+
|
56
|
+
void CompactTransducer::analyze(unsigned int n, vector<Character> &input,
|
57
|
+
size_t ipos, CAnalysis &ca,
|
58
|
+
vector<CAnalysis> &analyses )
|
59
|
+
{
|
60
|
+
// "n" is the number of the current transducer node/state
|
61
|
+
// "input" is the sequence of input symbols
|
62
|
+
// "ipos" is the input position currently analysed
|
63
|
+
// "ca" stores the incomplete analysis string
|
64
|
+
// "analyses" stores the analyses found so far
|
65
|
+
|
66
|
+
if (analyses.size() > 10000)
|
67
|
+
return; // limit the maximal number of analyses
|
68
|
+
|
69
|
+
// Is the input string fully analyzed and the current node a final node?
|
70
|
+
if (finalp[n] && ipos == input.size())
|
71
|
+
// store the new analysis
|
72
|
+
analyses.push_back(ca);
|
73
|
+
|
74
|
+
// follow the epsilon transitions
|
75
|
+
// first_arc[n] is the number of the first outgoing transition of node n
|
76
|
+
// first_arc[n+1]-1 is the number of the last outgoing transition of node n
|
77
|
+
// first_arc[n+1] is the number of the first outgoing transition of node n+1
|
78
|
+
unsigned int i;
|
79
|
+
for( i=first_arc[n];
|
80
|
+
i<first_arc[n+1] && label[i].upper_char() == Label::epsilon;
|
81
|
+
i++)
|
82
|
+
{
|
83
|
+
ca.push_back(i);
|
84
|
+
analyze(target_node[i], input, ipos, ca, analyses);
|
85
|
+
ca.pop_back();
|
86
|
+
}
|
87
|
+
|
88
|
+
// follow the non-epsilon transitions
|
89
|
+
|
90
|
+
// scan the next input symbol
|
91
|
+
if (ipos < input.size()) {
|
92
|
+
// find the set of arcs with matching upper character in the sorted list
|
93
|
+
pair<Label*,Label*>range =
|
94
|
+
equal_range(label+i, label+first_arc[n+1], Label(input[ipos]),
|
95
|
+
label_less());
|
96
|
+
unsigned int to = (unsigned int)(range.second - label);
|
97
|
+
|
98
|
+
// follow the non-epsilon transitions
|
99
|
+
for( i=(unsigned)(range.first-label); i<to; i++) {
|
100
|
+
ca.push_back(i);
|
101
|
+
analyze(target_node[i], input, ipos+1, ca, analyses);
|
102
|
+
ca.pop_back();
|
103
|
+
}
|
104
|
+
}
|
105
|
+
}
|
106
|
+
|
107
|
+
|
108
|
+
/*******************************************************************/
|
109
|
+
/* */
|
110
|
+
/* CompactTransducer::analyze_string */
|
111
|
+
/* */
|
112
|
+
/*******************************************************************/
|
113
|
+
|
114
|
+
void CompactTransducer::analyze_string( char *s, vector<CAnalysis> &analyses )
|
115
|
+
|
116
|
+
{
|
117
|
+
// "s" input string to be analyzed
|
118
|
+
// "analyses" is the data structure in which the results are stored
|
119
|
+
// and returned
|
120
|
+
|
121
|
+
vector<Character> input;
|
122
|
+
|
123
|
+
alphabet.string2symseq( s, input );
|
124
|
+
|
125
|
+
analyses.clear();
|
126
|
+
CAnalysis ca; // data structure where the current incomplete analysis
|
127
|
+
// is stored
|
128
|
+
analyze(0, input, 0, ca, analyses); // start the analysis
|
129
|
+
|
130
|
+
if (analyses.size() > 10000)
|
131
|
+
fprintf(stderr,"Warning: Only the first 10000 analyses considered for \"%s\"!\n", s);
|
132
|
+
|
133
|
+
if (simplest_only && analyses.size() > 1)
|
134
|
+
disambiguate( analyses ); // select the simplest analyses
|
135
|
+
}
|
136
|
+
|
137
|
+
|
138
|
+
|
139
|
+
/*******************************************************************/
|
140
|
+
/* */
|
141
|
+
/* CompactTransducer::~CompactTransducer */
|
142
|
+
/* */
|
143
|
+
/*******************************************************************/
|
144
|
+
|
145
|
+
// Releases all arrays owned by the transducer.
// (delete[] on a NULL pointer is a no-op, so unset arrays are fine.)
CompactTransducer::~CompactTransducer()

{
  // per-node data
  delete[] finalp;
  delete[] first_arc;
  delete[] final_logprob;

  // per-arc data
  delete[] label;
  delete[] target_node;
  delete[] arc_logprob;
}
|
155
|
+
|
156
|
+
|
157
|
+
/*******************************************************************/
|
158
|
+
/* */
|
159
|
+
/* CompactTransducer::CompactTransducer */
|
160
|
+
/* */
|
161
|
+
/*******************************************************************/
|
162
|
+
|
163
|
+
// Creates an empty transducer: no nodes, no arcs, no arrays allocated,
// and both option flags switched off.
CompactTransducer::CompactTransducer()

{
  // option flags
  simplest_only = false;
  both_layers = false;

  // sizes
  number_of_arcs = 0;
  number_of_nodes = 0;

  // arrays
  finalp = NULL;
  first_arc = NULL;
  label = NULL;
  target_node = NULL;
  final_logprob = (float*)NULL;
  arc_logprob = (float*)NULL;
}
|
176
|
+
|
177
|
+
|
178
|
+
/*******************************************************************/
|
179
|
+
/* */
|
180
|
+
/* CompactTransducer::read_finalp */
|
181
|
+
/* */
|
182
|
+
/*******************************************************************/
|
183
|
+
|
184
|
+
void CompactTransducer::read_finalp( FILE *file )
|
185
|
+
|
186
|
+
{
|
187
|
+
int k=0;
|
188
|
+
unsigned char n=0;
|
189
|
+
for( size_t i=0; i<number_of_nodes; i++ ) {
|
190
|
+
if (k == 0) {
|
191
|
+
n = (unsigned char)fgetc(file);
|
192
|
+
k = 8;
|
193
|
+
}
|
194
|
+
k--;
|
195
|
+
if (n & (1 << k))
|
196
|
+
finalp[i] = 1;
|
197
|
+
else
|
198
|
+
finalp[i] = 0;
|
199
|
+
}
|
200
|
+
}
|
201
|
+
|
202
|
+
|
203
|
+
/*******************************************************************/
|
204
|
+
/* */
|
205
|
+
/* CompactTransducer::read_first_arcs */
|
206
|
+
/* */
|
207
|
+
/*******************************************************************/
|
208
|
+
|
209
|
+
void CompactTransducer::read_first_arcs( FILE *file )
|
210
|
+
|
211
|
+
{
|
212
|
+
int k=0;
|
213
|
+
unsigned int n=0;
|
214
|
+
int bits=(int)ceil(log(number_of_arcs+1)/log(2));
|
215
|
+
|
216
|
+
for( size_t i=0; i<=number_of_nodes; i++ ) {
|
217
|
+
first_arc[i] = n >> (sizeof(n)*8 - bits);
|
218
|
+
n <<= bits;
|
219
|
+
k -= bits;
|
220
|
+
if (k < 0) {
|
221
|
+
read_num(&n,sizeof(n),file);
|
222
|
+
first_arc[i] |= n >> (sizeof(n)*8 + k);
|
223
|
+
n <<= -k;
|
224
|
+
k += (int)sizeof(n) * 8;
|
225
|
+
}
|
226
|
+
}
|
227
|
+
}
|
228
|
+
|
229
|
+
|
230
|
+
/*******************************************************************/
|
231
|
+
/* */
|
232
|
+
/* CompactTransducer::read_target_nodes */
|
233
|
+
/* */
|
234
|
+
/*******************************************************************/
|
235
|
+
|
236
|
+
void CompactTransducer::read_target_nodes( FILE *file )
|
237
|
+
|
238
|
+
{
|
239
|
+
int k=0;
|
240
|
+
unsigned int n=0;
|
241
|
+
int bits=(int)ceil(log(number_of_nodes)/log(2));
|
242
|
+
|
243
|
+
for( size_t i=0; i<number_of_arcs; i++ ) {
|
244
|
+
target_node[i] = n >> (sizeof(n)*8 - bits);
|
245
|
+
n <<= bits;
|
246
|
+
k -= bits;
|
247
|
+
if (k < 0) {
|
248
|
+
read_num(&n,sizeof(n),file);
|
249
|
+
target_node[i] |= n >> (sizeof(n)*8 + k);
|
250
|
+
n <<= -k;
|
251
|
+
k += (int)sizeof(n) * 8;
|
252
|
+
}
|
253
|
+
}
|
254
|
+
}
|
255
|
+
|
256
|
+
|
257
|
+
/*******************************************************************/
|
258
|
+
/* */
|
259
|
+
/* CompactTransducer::read_labels */
|
260
|
+
/* */
|
261
|
+
/*******************************************************************/
|
262
|
+
|
263
|
+
void CompactTransducer::read_labels( FILE *file )
|
264
|
+
|
265
|
+
{
|
266
|
+
size_t N=0;
|
267
|
+
vector<Label> Num2Label(alphabet.size());
|
268
|
+
for( Alphabet::const_iterator it=alphabet.begin();
|
269
|
+
it != alphabet.end(); it++ )
|
270
|
+
{
|
271
|
+
Label l=*it;
|
272
|
+
Num2Label[N++] = l;
|
273
|
+
}
|
274
|
+
|
275
|
+
int k=0;
|
276
|
+
unsigned int n=0;
|
277
|
+
int bits=(int)ceil(log((double)alphabet.size())/log(2));
|
278
|
+
|
279
|
+
for( size_t i=0; i<number_of_arcs; i++ ) {
|
280
|
+
unsigned int l = n >> (sizeof(n)*8 - bits);
|
281
|
+
n <<= bits;
|
282
|
+
k -= bits;
|
283
|
+
if (k < 0) {
|
284
|
+
read_num(&n,sizeof(n),file);
|
285
|
+
l |= n >> (sizeof(n)*8 + k);
|
286
|
+
n <<= -k;
|
287
|
+
k += (int)sizeof(n) * 8;
|
288
|
+
}
|
289
|
+
label[i] = Num2Label[l];
|
290
|
+
}
|
291
|
+
}
|
292
|
+
|
293
|
+
|
294
|
+
/*******************************************************************/
|
295
|
+
/* */
|
296
|
+
/* CompactTransducer::read_probs */
|
297
|
+
/* */
|
298
|
+
/*******************************************************************/
|
299
|
+
|
300
|
+
void CompactTransducer::read_probs( FILE *file )
|
301
|
+
|
302
|
+
{
|
303
|
+
size_t n,m;
|
304
|
+
fread(&n, sizeof(n), 1, file);
|
305
|
+
if (fread(&m, sizeof(n), 1, file) != 1 ||
|
306
|
+
n != node_count() || m != arc_count())
|
307
|
+
{
|
308
|
+
fprintf(stderr,"Error: incompatible probability file!\n");
|
309
|
+
exit(1);
|
310
|
+
}
|
311
|
+
final_logprob = new float[n];
|
312
|
+
arc_logprob = new float[m];
|
313
|
+
fread(final_logprob, sizeof(float), n, file);
|
314
|
+
if (fread(arc_logprob, sizeof(float), n, file) != n) {
|
315
|
+
fprintf(stderr,"Error: in probability file!\n");
|
316
|
+
exit(1);
|
317
|
+
}
|
318
|
+
}
|
319
|
+
|
320
|
+
|
321
|
+
/*******************************************************************/
|
322
|
+
/* */
|
323
|
+
/* CompactTransducer::CompactTransducer */
|
324
|
+
/* */
|
325
|
+
/*******************************************************************/
|
326
|
+
|
327
|
+
// Reads a compact transducer from "file" and, if "pfile" is not NULL,
// the matching probabilities from "pfile".
// Throws a C string if "file" does not start with the marker byte 'c'.
CompactTransducer::CompactTransducer( FILE *file, FILE *pfile )

{
  both_layers = false;
  simplest_only = false;

  // Start from a well-defined empty state. The arrays below are only
  // allocated when reading succeeds, and the destructor calls delete[] on
  // all of them, so none of the pointers may be left uninitialised.
  number_of_nodes = 0;
  number_of_arcs = 0;
  finalp = NULL;
  first_arc = NULL;
  label = NULL;
  target_node = NULL;
  arc_logprob = final_logprob = (float*)NULL;

  if (fgetc(file) != 'c')
    throw "Error: wrong file format (not a compact transducer)\n";

  alphabet.read(file);

  read_num(&number_of_nodes,sizeof(number_of_nodes),file);
  read_num(&number_of_arcs,sizeof(number_of_arcs),file);

  if (!ferror(file)) {
    // memory allocation
    finalp = new char[number_of_nodes];
    first_arc = new unsigned[number_of_nodes+1];
    label = new Label[number_of_arcs];
    target_node = new unsigned[number_of_arcs];

    // reading the data
    read_finalp(file);
    read_first_arcs(file);
    read_labels(file);
    read_target_nodes(file);
  }

  // the probabilities are optional
  if (pfile != NULL)
    read_probs(pfile);
}
|
360
|
+
|
361
|
+
|
362
|
+
/*******************************************************************/
|
363
|
+
/* */
|
364
|
+
/* CompactTransducer::longest_match2 */
|
365
|
+
/* */
|
366
|
+
/*******************************************************************/
|
367
|
+
|
368
|
+
void CompactTransducer::longest_match2(unsigned int n, char *string, int l,
|
369
|
+
CAnalysis &ca, int &bl, CAnalysis &ba)
|
370
|
+
{
|
371
|
+
// n: transducer state
|
372
|
+
// string: rest string
|
373
|
+
// l: length of current analysis
|
374
|
+
// bl: length of the currently longest match
|
375
|
+
// ca: current analysis
|
376
|
+
// ba: best analysis
|
377
|
+
|
378
|
+
if (finalp[n] && l > bl) {
|
379
|
+
// store the new analysis
|
380
|
+
bl = l;
|
381
|
+
ba = ca; // copy the arc vector
|
382
|
+
}
|
383
|
+
|
384
|
+
// follow the epsilon transitions
|
385
|
+
unsigned int i;
|
386
|
+
for( i=first_arc[n];
|
387
|
+
i<first_arc[n+1] && label[i].upper_char() == Label::epsilon;
|
388
|
+
i++)
|
389
|
+
{
|
390
|
+
ca.push_back(i);
|
391
|
+
longest_match2(target_node[i], string, l, ca, bl, ba);
|
392
|
+
ca.pop_back();
|
393
|
+
}
|
394
|
+
|
395
|
+
// follow the non-epsilon transitions
|
396
|
+
char *end=string;
|
397
|
+
int c=alphabet.next_code(end, false, false);
|
398
|
+
l += (int)(end - string);
|
399
|
+
if (c != EOF) {
|
400
|
+
// find the set of arcs with matching upper character in the sort list
|
401
|
+
pair<Label*,Label*>range =
|
402
|
+
equal_range(label+i, label+first_arc[n+1], Label((Character)c),
|
403
|
+
label_less());
|
404
|
+
unsigned int to = (unsigned int)(range.second - label);
|
405
|
+
for( i=(unsigned)(range.first-label); i<to; i++) {
|
406
|
+
ca.push_back(i);
|
407
|
+
longest_match2(target_node[i], end, l, ca, bl, ba);
|
408
|
+
ca.pop_back();
|
409
|
+
}
|
410
|
+
}
|
411
|
+
}
|
412
|
+
|
413
|
+
|
414
|
+
/*******************************************************************/
|
415
|
+
/* */
|
416
|
+
/* CompactTransducer::print_analysis */
|
417
|
+
/* */
|
418
|
+
/*******************************************************************/
|
419
|
+
|
420
|
+
// Converts the compact analysis "cana" into a label sequence and returns
// the string produced for it by the alphabet's print_analysis; the
// both_layers flag of the transducer is passed through.
char *CompactTransducer::print_analysis( CAnalysis &cana )

{
  Analysis labels;
  convert(cana, labels);
  char *text = alphabet.print_analysis( labels, both_layers );
  return text;
}
|
427
|
+
|
428
|
+
|
429
|
+
/*******************************************************************/
|
430
|
+
/* */
|
431
|
+
/* CompactTransducer::longest_match */
|
432
|
+
/* */
|
433
|
+
/*******************************************************************/
|
434
|
+
|
435
|
+
const char *CompactTransducer::longest_match( char* &string )
|
436
|
+
|
437
|
+
{
|
438
|
+
vector<char> analysis;
|
439
|
+
CAnalysis ca, ba;
|
440
|
+
int l=0;
|
441
|
+
longest_match2(0, string, 0, ca, l, ba);
|
442
|
+
|
443
|
+
// no match? return the next character
|
444
|
+
if (ba.size() == 0) {
|
445
|
+
int c=alphabet.next_code(string, false, false);
|
446
|
+
return alphabet.code2symbol((Character)c);
|
447
|
+
}
|
448
|
+
|
449
|
+
string += l;
|
450
|
+
return print_analysis( ba );
|
451
|
+
}
|
452
|
+
|
453
|
+
|
454
|
+
/*******************************************************************/
|
455
|
+
/* */
|
456
|
+
/* CompactTransducer::disambiguate */
|
457
|
+
/* */
|
458
|
+
/*******************************************************************/
|
459
|
+
|
460
|
+
// Keeps only the analyses with the highest score, as computed by the
// alphabet's compute_score, preserving their relative order.
void CompactTransducer::disambiguate( vector<CAnalysis> &analyses )

{
  const size_t total = analyses.size();

  // compute the score of every analysis and remember the maximum
  vector<int> score;
  Analysis labels;
  int bestscore=INT_MIN;
  for( size_t i=0; i<total; i++ ) {
    convert(analyses[i], labels);
    const int current = alphabet.compute_score(labels);
    score.push_back(current);
    if (current > bestscore)
      bestscore = current;
  }

  // move the optimal analyses to the front and cut off the rest
  size_t kept=0;
  for( size_t i=0; i<total; i++ ) {
    if (score[i] != bestscore)
      continue;
    analyses[kept] = analyses[i];
    ++kept;
  }
  analyses.resize(kept);
}
|
482
|
+
|
483
|
+
|
484
|
+
/*******************************************************************/
|
485
|
+
/* */
|
486
|
+
/* CompactTransducer::train2 */
|
487
|
+
/* */
|
488
|
+
/*******************************************************************/
|
489
|
+
|
490
|
+
bool CompactTransducer::train2( char *s, vector<double> &arcfreq,
|
491
|
+
vector<double> &finalfreq )
|
492
|
+
{
|
493
|
+
vector<CAnalysis> analyses;
|
494
|
+
vector<Label> input;
|
495
|
+
alphabet.string2labelseq( s, input );
|
496
|
+
|
497
|
+
CAnalysis ca; // data structure where the analysis is stored
|
498
|
+
unsigned int n=0;
|
499
|
+
bool failure=false;
|
500
|
+
for( size_t i=0; i<input.size(); i++ ) {
|
501
|
+
failure = true;
|
502
|
+
for( unsigned int k=first_arc[n]; k<first_arc[n+1]; k++) {
|
503
|
+
if (label[k] == input[i]) {
|
504
|
+
ca.push_back(k);
|
505
|
+
n = target_node[k];
|
506
|
+
failure = false;
|
507
|
+
break;
|
508
|
+
}
|
509
|
+
}
|
510
|
+
if (failure)
|
511
|
+
break;
|
512
|
+
}
|
513
|
+
if (failure || !finalp[n]) {
|
514
|
+
fprintf(stderr,"Warning: The following input is not covered:\n%s\n", s);
|
515
|
+
return false;
|
516
|
+
}
|
517
|
+
|
518
|
+
for( size_t k=0; k<ca.size(); k++ )
|
519
|
+
arcfreq[ca[k]]++;
|
520
|
+
finalfreq[target_node[ca.back()]]++;
|
521
|
+
|
522
|
+
return true;
|
523
|
+
}
|
524
|
+
|
525
|
+
|
526
|
+
/*******************************************************************/
|
527
|
+
/* */
|
528
|
+
/* CompactTransducer::train */
|
529
|
+
/* */
|
530
|
+
/*******************************************************************/
|
531
|
+
|
532
|
+
bool CompactTransducer::train( char *s, vector<double> &arcfreq,
|
533
|
+
vector<double> &finalfreq )
|
534
|
+
{
|
535
|
+
vector<CAnalysis> analyses;
|
536
|
+
vector<Character> input;
|
537
|
+
alphabet.string2symseq( s, input );
|
538
|
+
|
539
|
+
CAnalysis ca; // data structure where the current incomplete analysis
|
540
|
+
// is stored
|
541
|
+
analyze(0, input, 0, ca, analyses); // start the analysis
|
542
|
+
|
543
|
+
if (analyses.size() > 10000)
|
544
|
+
return true; // ignore inputs with more than 10000 analyses
|
545
|
+
else if (analyses.size() == 0)
|
546
|
+
return false;
|
547
|
+
|
548
|
+
if (simplest_only && analyses.size() > 1)
|
549
|
+
disambiguate( analyses ); // select the simplest analyses
|
550
|
+
|
551
|
+
if (analyses.size() > 0) {
|
552
|
+
double incr = 1.0 / (double)analyses.size();
|
553
|
+
CAnalysis arcs;
|
554
|
+
|
555
|
+
for( size_t i=0; i<analyses.size(); i++ ) {
|
556
|
+
CAnalysis &arcs=analyses[i];
|
557
|
+
for( size_t k=0; k<arcs.size(); k++ )
|
558
|
+
arcfreq[arcs[k]] += incr;
|
559
|
+
finalfreq[target_node[arcs.back()]] += incr;
|
560
|
+
}
|
561
|
+
}
|
562
|
+
return true;
|
563
|
+
}
|
564
|
+
|
565
|
+
|
566
|
+
/*******************************************************************/
|
567
|
+
/* */
|
568
|
+
/* CompactTransducer::estimate_probs */
|
569
|
+
/* */
|
570
|
+
/*******************************************************************/
|
571
|
+
|
572
|
+
// Normalises the frequency counts in place, one node at a time.
// For node n the total is finalfreq[n] plus arcfreq of all arcs in
// [first_arc[n], first_arc[n+1]). finalfreq[n] and those arcfreq entries
// are divided by that total. A total of exactly zero is replaced by one,
// so the entries of such a node stay unchanged by the division.
void CompactTransducer::estimate_probs( vector<double> &arcfreq,
                                        vector<double> &finalfreq )
{
  for( size_t node=0; node<finalfreq.size(); node++ ) {
    // arcs leaving this node
    const size_t begin = first_arc[node];
    const size_t end = first_arc[node+1];

    // total frequency mass of the node
    double total = finalfreq[node];
    for( size_t arc=begin; arc<end; arc++ )
      total += arcfreq[arc];

    // avoid a division by zero for nodes without any counts
    if (total == 0.0)
      total = 1.0;

    finalfreq[node] /= total;
    for( size_t arc=begin; arc<end; arc++ )
      arcfreq[arc] /= total;
  }
}
|
587
|
+
|
588
|
+
|
589
|
+
|
590
|
+
/*******************************************************************/
|
591
|
+
/* */
|
592
|
+
/* CompactTransducer::compute_probs */
|
593
|
+
/* */
|
594
|
+
/*******************************************************************/
|
595
|
+
|
596
|
+
// Computes a normalised probability for every analysis and reorders
// "analyses" by decreasing probability; prob[i] then belongs to
// analyses[i]. The unnormalised probability of an analysis is exp() of
// the sum of arc_logprob over its arcs plus final_logprob of the target
// node of its last arc. Analyses with equal probability keep their
// original relative order.
// NOTE(review): path.back() requires every analysis to contain at least
// one arc, and the final division assumes a non-zero probability sum --
// confirm that callers guarantee both.
void CompactTransducer::compute_probs( vector<CAnalysis> &analyses,
                                       vector<double> &prob )
{
  const size_t count = analyses.size();
  prob.resize(count);

  // unnormalised probability of each analysis and their total
  double total = 0.0;
  for( size_t i=0; i<count; i++ ) {
    const CAnalysis &path = analyses[i];

    double lp = 0.0;
    for( size_t j=0; j<path.size(); j++ )
      lp += arc_logprob[path[j]];
    lp += final_logprob[target_node[path.back()]];

    const double p = exp(lp);
    prob[i] = p;
    total += p;
  }

  // selection sort: repeatedly pick the most probable analysis that has
  // not been placed yet
  const vector<CAnalysis> unsorted(analyses);
  vector<double> remaining(prob);
  for( size_t rank=0; rank<count; rank++ ) {
    double best = -1.0;
    size_t pick = 0;
    for( size_t c=0; c<unsorted.size(); c++ ) {
      if (remaining[c] > best) {
        best = remaining[c];
        pick = c;
      }
    }
    analyses[rank] = unsorted[pick];
    remaining[pick] = -1.0;      // mark as used
    prob[rank] = best / total;   // normalization
  }
}
|
629
|
+
}
|