ruby-sfst 0.4.3 → 0.4.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -0
- data/COPYING +280 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +54 -0
- data/README.md +1 -1
- data/Rakefile +9 -18
- data/bin/console +7 -0
- data/bin/setup +6 -0
- data/ext/sfst/alphabet.cc +879 -0
- data/ext/sfst/alphabet.h +302 -0
- data/ext/sfst/basic.cc +85 -0
- data/ext/{sfst_machine → sfst}/basic.h +7 -4
- data/ext/sfst/compact.cc +629 -0
- data/ext/sfst/compact.h +100 -0
- data/ext/sfst/determinise.cc +279 -0
- data/ext/{sfst_machine → sfst}/extconf.rb +2 -1
- data/ext/sfst/fst.cc +1150 -0
- data/ext/sfst/fst.h +374 -0
- data/ext/sfst/hopcroft.cc +681 -0
- data/ext/sfst/interface.cc +1921 -0
- data/ext/sfst/interface.h +171 -0
- data/ext/sfst/make-compact.cc +323 -0
- data/ext/{sfst_machine → sfst}/make-compact.h +15 -13
- data/ext/sfst/mem.h +80 -0
- data/ext/sfst/operators.cc +1273 -0
- data/ext/{sfst_machine → sfst}/sfst_machine.cc +89 -78
- data/ext/sfst/sgi.h +72 -0
- data/ext/sfst/utf8.cc +149 -0
- data/ext/{sfst_machine → sfst}/utf8.h +7 -4
- data/lib/sfst.rb +2 -1
- data/lib/sfst/version.rb +1 -1
- data/ruby-sfst.gemspec +23 -23
- metadata +107 -35
- data/ext/sfst_machine/alphabet.cc +0 -812
- data/ext/sfst_machine/alphabet.h +0 -273
- data/ext/sfst_machine/basic.cc +0 -84
- data/ext/sfst_machine/compact.cc +0 -616
- data/ext/sfst_machine/compact.h +0 -98
- data/ext/sfst_machine/determinise.cc +0 -303
- data/ext/sfst_machine/fst.cc +0 -1000
- data/ext/sfst_machine/fst.h +0 -369
- data/ext/sfst_machine/interface.cc +0 -1842
- data/ext/sfst_machine/interface.h +0 -93
- data/ext/sfst_machine/make-compact.cc +0 -327
- data/ext/sfst_machine/mem.h +0 -74
- data/ext/sfst_machine/operators.cc +0 -1131
- data/ext/sfst_machine/sgi.h +0 -44
- data/ext/sfst_machine/utf8.cc +0 -146
- data/test/test_sfst.fst +0 -3
- data/test/test_sfst.rb +0 -114
data/ext/sfst_machine/alphabet.h
DELETED
@@ -1,273 +0,0 @@
|
|
1
|
-
/*******************************************************************/
|
2
|
-
/* */
|
3
|
-
/* FILE alphabet.h */
|
4
|
-
/* MODULE alphabet */
|
5
|
-
/* PROGRAM SFST */
|
6
|
-
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
|
7
|
-
/* */
|
8
|
-
/* PURPOSE finite state tools */
|
9
|
-
/* */
|
10
|
-
/*******************************************************************/
|
11
|
-
|
12
|
-
#ifndef _ALPHABET_H_
|
13
|
-
#define _ALPHABET_H_
|
14
|
-
|
15
|
-
#include <stdio.h>
|
16
|
-
|
17
|
-
#include "basic.h"
|
18
|
-
|
19
|
-
#include <set>
|
20
|
-
#include <vector>
|
21
|
-
|
22
|
-
#include <iostream>
|
23
|
-
|
24
|
-
#include <cstring>
|
25
|
-
|
26
|
-
#include "sgi.h"
|
27
|
-
|
28
|
-
#ifndef CODE_DATA_TYPE
|
29
|
-
typedef unsigned short Character; // data type of the symbol codes
|
30
|
-
#else
|
31
|
-
typedef unsigned CODE_DATA_TYPE Character;
|
32
|
-
#endif
|
33
|
-
|
34
|
-
// data type used to indicate whether some action is to be performed
|
35
|
-
// on the analysis level (lower) or the surface level (upper)
|
36
|
-
typedef enum {upper, lower} Level;
|
37
|
-
|
38
|
-
extern char EpsilonString[]; // holds the symbol representing the empty string
|
39
|
-
// which is usually "<>"
|
40
|
-
|
41
|
-
|
42
|
-
/***************** class Label ***********************************/
|
43
|
-
|
44
|
-
class Label {
|
45
|
-
|
46
|
-
private:
|
47
|
-
// data structure where the two symbols are stored
|
48
|
-
struct {
|
49
|
-
Character lower;
|
50
|
-
Character upper;
|
51
|
-
} label;
|
52
|
-
|
53
|
-
public:
|
54
|
-
static const Character epsilon=0; // code of the empty symbol
|
55
|
-
|
56
|
-
// new label with two identical symbols
|
57
|
-
Label( Character c=epsilon ) { label.lower = label.upper = c; };
|
58
|
-
|
59
|
-
// new label with two different symbols
|
60
|
-
Label( Character c1, Character c2 )
|
61
|
-
{ label.lower = c1; label.upper = c2; };
|
62
|
-
|
63
|
-
// returns the indicated symbol of the label
|
64
|
-
Character get_char( Level l ) const
|
65
|
-
{ return ((l==upper)? label.upper: label.lower); };
|
66
|
-
|
67
|
-
// returns the "upper" symbol of the label (i.e. the surface symbol)
|
68
|
-
Character upper_char() const { return label.upper; };
|
69
|
-
|
70
|
-
// returns the "lower" symbol of the label (i.e. the analysis symbol)
|
71
|
-
Character lower_char() const { return label.lower; };
|
72
|
-
|
73
|
-
// replaces symbols in a label
|
74
|
-
Label replace_char( Character c, Character nc ) const {
|
75
|
-
Label l = *this;
|
76
|
-
if (l.label.lower == c)
|
77
|
-
l.label.lower = nc;
|
78
|
-
if (l.label.upper == c)
|
79
|
-
l.label.upper = nc;
|
80
|
-
return l;
|
81
|
-
};
|
82
|
-
|
83
|
-
// operators checking the equality of labels
|
84
|
-
int operator==( Label l ) const
|
85
|
-
{ return (label.lower==l.label.lower && label.upper==l.label.upper); };
|
86
|
-
int operator!=( Label l ) const
|
87
|
-
{ return !(l == *this); };
|
88
|
-
|
89
|
-
// comparison operator needed for sorting labels
|
90
|
-
int operator<( Label l ) const {
|
91
|
-
return (upper_char() < l.upper_char()); };
|
92
|
-
|
93
|
-
// check whether the label is epsilon (i.e. both symbols are epsilon)
|
94
|
-
// transitions with epsilon labels are epsilon transitions
|
95
|
-
int is_epsilon() const
|
96
|
-
{ return (label.upper == epsilon && label.lower == epsilon); };
|
97
|
-
|
98
|
-
// check whether the "upper" symbol is epsilon
|
99
|
-
int upper_is_epsilon() const
|
100
|
-
{ return (label.upper == epsilon); };
|
101
|
-
|
102
|
-
// check whether the "lower" symbol is epsilon
|
103
|
-
int lower_is_epsilon() const
|
104
|
-
{ return (label.lower == epsilon); };
|
105
|
-
|
106
|
-
// hash function needed to store labels in a hash table
|
107
|
-
struct label_hash {
|
108
|
-
size_t operator() ( const Label l ) const {
|
109
|
-
return (size_t)l.lower_char() ^
|
110
|
-
((size_t)l.upper_char() << 16) ^
|
111
|
-
((size_t)l.upper_char() >> 16);
|
112
|
-
}
|
113
|
-
};
|
114
|
-
|
115
|
-
// hash function needed to store labels in a hash table
|
116
|
-
struct label_cmp {
|
117
|
-
bool operator() ( const Label l1, const Label l2 ) const {
|
118
|
-
return (l1.lower_char() < l2.lower_char() ||
|
119
|
-
(l1.lower_char() == l2.lower_char() &&
|
120
|
-
l1.upper_char() < l2.upper_char()));
|
121
|
-
}
|
122
|
-
};
|
123
|
-
|
124
|
-
// comparison operator needed to store labels in a hash table
|
125
|
-
struct label_eq {
|
126
|
-
bool operator() ( const Label l1, const Label l2 ) const {
|
127
|
-
return (l1.lower_char() == l2.lower_char() &&
|
128
|
-
l1.upper_char() == l2.upper_char());
|
129
|
-
}
|
130
|
-
};
|
131
|
-
};
|
132
|
-
|
133
|
-
typedef std::vector<Label> Analysis;
|
134
|
-
|
135
|
-
|
136
|
-
/***************** class Alphabet *******************************/
|
137
|
-
|
138
|
-
class Alphabet {
|
139
|
-
|
140
|
-
// string comparison operators needed to stored strings in a hash table
|
141
|
-
struct eqstr {
|
142
|
-
bool operator()(const char* s1, const char* s2) const {
|
143
|
-
return strcmp(s1, s2) == 0;
|
144
|
-
}
|
145
|
-
};
|
146
|
-
|
147
|
-
// data structure storing labels without repetitions (i.e. as a set)
|
148
|
-
typedef std::set<Label, Label::label_cmp> LabelSet;
|
149
|
-
|
150
|
-
// hash table used to map the symbols to their codes
|
151
|
-
typedef hash_map<const char*, Character, hash<const char*>,eqstr> SymbolMap;
|
152
|
-
|
153
|
-
// hash table used to map the codes back to the symbols
|
154
|
-
typedef hash_map<Character, char*> CharMap;
|
155
|
-
|
156
|
-
private:
|
157
|
-
SymbolMap sm; // maps symbols to codes
|
158
|
-
|
159
|
-
CharMap cm; // maps codes to symbols
|
160
|
-
LabelSet ls; // set of labels known to the alphabet
|
161
|
-
|
162
|
-
// add a new symbol with symbol code c
|
163
|
-
void add( const char *symbol, Character c );
|
164
|
-
|
165
|
-
public:
|
166
|
-
bool utf8;
|
167
|
-
|
168
|
-
// iterators over the set of known labels
|
169
|
-
typedef LabelSet::iterator iterator;
|
170
|
-
typedef LabelSet::const_iterator const_iterator;
|
171
|
-
Alphabet();
|
172
|
-
~Alphabet() { clear(); };
|
173
|
-
const_iterator begin() const { return ls.begin(); };
|
174
|
-
const_iterator end() const { return ls.end(); };
|
175
|
-
size_t size() const { return ls.size(); };
|
176
|
-
|
177
|
-
void clear();
|
178
|
-
void clear_char_pairs() { ls.clear(); };
|
179
|
-
|
180
|
-
// lookup a label in the alphabet
|
181
|
-
iterator find( Label l ) { return ls.find(l); };
|
182
|
-
|
183
|
-
// insert a label in the alphabet
|
184
|
-
void insert( Label l ) { if (!l.is_epsilon()) ls.insert(l); };
|
185
|
-
|
186
|
-
// insert the known symbols from another alphabet
|
187
|
-
void insert_symbols( const Alphabet& );
|
188
|
-
|
189
|
-
// insert the labels and known symbols from another alphabet
|
190
|
-
void copy( const Alphabet& );
|
191
|
-
|
192
|
-
// create the alphabet of a transducer obtained by a composition operation
|
193
|
-
void compose( const Alphabet &la, const Alphabet &ua );
|
194
|
-
|
195
|
-
// add a symbol to the alphabet and return its code
|
196
|
-
Character add_symbol(const char *symbol);
|
197
|
-
|
198
|
-
// add a symbol to the alphabet with a given code
|
199
|
-
void add_symbol(const char *symbol, Character c );
|
200
|
-
|
201
|
-
// create a new marker symbol and return its code
|
202
|
-
Character new_marker( void );
|
203
|
-
void delete_markers();
|
204
|
-
|
205
|
-
// compute the complement of a symbol set
|
206
|
-
void complement( std::vector<Character> &sym );
|
207
|
-
|
208
|
-
// return the code of the argument symbol
|
209
|
-
int symbol2code( const char *s ) const {
|
210
|
-
SymbolMap::const_iterator p = sm.find(s);
|
211
|
-
if (p != sm.end()) return p->second;
|
212
|
-
return EOF;
|
213
|
-
};
|
214
|
-
|
215
|
-
// return the symbol for the given symbol code
|
216
|
-
const char *code2symbol( Character c ) const {
|
217
|
-
CharMap::const_iterator p=cm.find(c);
|
218
|
-
if (p == cm.end())
|
219
|
-
return NULL;
|
220
|
-
else
|
221
|
-
return p->second;
|
222
|
-
};
|
223
|
-
|
224
|
-
// write the symbol for the given symbol code into a string
|
225
|
-
void write_char( Character c, char *buffer, int *pos,
|
226
|
-
bool with_brackets=true ) const;
|
227
|
-
|
228
|
-
// write the symbol pair of a given label into a string
|
229
|
-
void write_label( Label l, char *buffer, int *pos,
|
230
|
-
bool with_brackets=true ) const;
|
231
|
-
|
232
|
-
// write the symbol for the given symbol code into a buffer and return
|
233
|
-
// a pointer to it
|
234
|
-
// the flag "with_brackets" indicates whether the angle brackets
|
235
|
-
// surrounding multi-character symbols are to be printed or not
|
236
|
-
const char *write_char( Character c, bool with_brackets=true ) const;
|
237
|
-
|
238
|
-
// write the symbol pair of a given label into a string
|
239
|
-
// and return a pointer to it
|
240
|
-
const char *write_label( Label l, bool with_brackets=true ) const;
|
241
|
-
|
242
|
-
// scan the next multi-character symbol in the argument string
|
243
|
-
int next_mcsym( char*&, bool insert=true );
|
244
|
-
|
245
|
-
// scan the next symbol in the argument string
|
246
|
-
int next_code( char*&, bool extended=true, bool insert=true );
|
247
|
-
|
248
|
-
// convert a character string into a symbol or label sequence
|
249
|
-
void string2symseq( char*, std::vector<Character>& );
|
250
|
-
void string2labelseq( char*, std::vector<Label>& );
|
251
|
-
|
252
|
-
// scan the next label in the argument string
|
253
|
-
Label next_label( char*&, bool extended=true );
|
254
|
-
|
255
|
-
// store the alphabet in the argument file (in binary form)
|
256
|
-
void store( FILE* ) const;
|
257
|
-
|
258
|
-
// read the alphabet from the argument file
|
259
|
-
void read( FILE* );
|
260
|
-
|
261
|
-
// disambiguation and printing of analyses
|
262
|
-
int compute_score( Analysis &ana );
|
263
|
-
void disambiguate( std::vector<Analysis> &analyses );
|
264
|
-
char *print_analysis( Analysis &ana, bool both_layers );
|
265
|
-
|
266
|
-
friend std::ostream &operator<<(std::ostream&, const Alphabet&);
|
267
|
-
};
|
268
|
-
|
269
|
-
// write the alphabet to the output stream (in readable form)
|
270
|
-
std::ostream &operator<<(std::ostream&, const Alphabet&);
|
271
|
-
|
272
|
-
|
273
|
-
#endif
|
data/ext/sfst_machine/basic.cc
DELETED
@@ -1,84 +0,0 @@
|
|
1
|
-
|
2
|
-
/*******************************************************************/
|
3
|
-
/* */
|
4
|
-
/* FILE basic.C */
|
5
|
-
/* MODULE basic */
|
6
|
-
/* PROGRAM SFST */
|
7
|
-
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
|
8
|
-
/* */
|
9
|
-
/* PURPOSE */
|
10
|
-
/* */
|
11
|
-
/*******************************************************************/
|
12
|
-
|
13
|
-
#include <stdlib.h>
|
14
|
-
#include <string.h>
|
15
|
-
|
16
|
-
#include "basic.h"
|
17
|
-
|
18
|
-
// When true, read_num() reverses the byte order of every value it reads.
bool Switch_Bytes=false;
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
/*******************************************************************/
|
23
|
-
/* */
|
24
|
-
/* fst_strdup */
|
25
|
-
/* */
|
26
|
-
/*******************************************************************/
|
27
|
-
|
28
|
-
char* fst_strdup(const char* pString)

{
  // Returns a malloc'ed copy of the NUL-terminated string pString.
  // Terminates the program with exit(1) if the allocation fails.
  const size_t bytes = strlen(pString) + 1;   // text plus terminating NUL
  char* duplicate = (char*)malloc(bytes);
  if (duplicate == NULL) {
    fprintf(stderr, "\nError: out of memory (malloc failed)\naborted.\n");
    exit(1);
  }
  memcpy(duplicate, pString, bytes);
  return duplicate;
}
|
39
|
-
|
40
|
-
|
41
|
-
/*******************************************************************/
|
42
|
-
/* */
|
43
|
-
/* read_string */
|
44
|
-
/* */
|
45
|
-
/*******************************************************************/
|
46
|
-
|
47
|
-
int read_string( char *buffer, int size, FILE *file )

{
  // Reads bytes from "file" into "buffer" until a NUL byte or EOF is seen
  // or "size" bytes have been stored. The buffer is always NUL-terminated
  // when size > 0.
  // Returns 1 if the string was ended by a NUL byte read from the file,
  // 0 otherwise (EOF, truncated string, or no room in the buffer).

  // Without this guard the truncation path below would write buffer[-1].
  if (size <= 0)
    return 0;

  for( int i=0; i<size; i++ ) {
    int c=fgetc(file);
    if (c == EOF || c == 0) {
      buffer[i] = 0;
      return (c==0);
    }
    buffer[i] = (char)c;
  }

  // the buffer is full: overwrite the last byte with the terminator;
  // the rest of the string remains unread in the file
  buffer[size-1] = 0;
  return 0;
}
|
61
|
-
|
62
|
-
|
63
|
-
/*******************************************************************/
|
64
|
-
/* */
|
65
|
-
/* read_num */
|
66
|
-
/* */
|
67
|
-
/*******************************************************************/
|
68
|
-
|
69
|
-
size_t read_num( void *p, size_t n, FILE *file )
|
70
|
-
|
71
|
-
{
|
72
|
-
char *pp=(char*)p;
|
73
|
-
size_t result=fread( pp, 1, n, file );
|
74
|
-
if (Switch_Bytes) {
|
75
|
-
size_t e=n/2;
|
76
|
-
for( size_t i=0; i<e; i++ ) {
|
77
|
-
char tmp=pp[i];
|
78
|
-
pp[i] = pp[--n];
|
79
|
-
pp[n] = tmp;
|
80
|
-
}
|
81
|
-
}
|
82
|
-
return result;
|
83
|
-
}
|
84
|
-
|
data/ext/sfst_machine/compact.cc
DELETED
@@ -1,616 +0,0 @@
|
|
1
|
-
/*******************************************************************/
|
2
|
-
/* */
|
3
|
-
/* FILE compact.C */
|
4
|
-
/* MODULE compact */
|
5
|
-
/* PROGRAM SFST */
|
6
|
-
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
|
7
|
-
/* */
|
8
|
-
/* PURPOSE Code needed for analysing data */
|
9
|
-
/* */
|
10
|
-
/*******************************************************************/
|
11
|
-
|
12
|
-
#include <stdio.h>
|
13
|
-
#include <math.h>
|
14
|
-
|
15
|
-
#include <limits.h>
|
16
|
-
|
17
|
-
#include "compact.h"
|
18
|
-
|
19
|
-
using std::equal_range;
|
20
|
-
using std::vector;
|
21
|
-
using std::pair;
|
22
|
-
|
23
|
-
const int BUFFER_SIZE=1000;
|
24
|
-
|
25
|
-
|
26
|
-
/*******************************************************************/
|
27
|
-
/* */
|
28
|
-
/* CompactTransducer::convert */
|
29
|
-
/* */
|
30
|
-
/*******************************************************************/
|
31
|
-
|
32
|
-
void CompactTransducer::convert( CAnalysis &cana, Analysis &ana )

{
  // Translates a sequence of arc numbers (cana) into the sequence of
  // labels attached to those arcs (ana). ana is resized to match.
  const size_t count = cana.size();
  ana.resize(count);
  for( size_t pos=0; pos<count; pos++ ) {
    ana[pos] = label[cana[pos]];
  }
}
|
39
|
-
|
40
|
-
|
41
|
-
/*******************************************************************/
|
42
|
-
/* */
|
43
|
-
/* CompactTransducer::analyze */
|
44
|
-
/* */
|
45
|
-
/*******************************************************************/
|
46
|
-
|
47
|
-
void CompactTransducer::analyze(unsigned int n, vector<Character> &input,
|
48
|
-
size_t ipos, CAnalysis &ca,
|
49
|
-
vector<CAnalysis> &analyses )
|
50
|
-
{
|
51
|
-
// "n" is the number of the current transducer node/state
|
52
|
-
// "input" is the sequence of input symbols
|
53
|
-
// "ipos" is the input position currently analysed
|
54
|
-
// "ca" stores the incomplete analysis string
|
55
|
-
// "analyses" stores the analyses found so far
|
56
|
-
|
57
|
-
if (analyses.size() > 10000)
|
58
|
-
return; // limit the maximal number of analyses
|
59
|
-
|
60
|
-
// Is the input string fully analyzed and the current node a final node?
|
61
|
-
if (finalp[n] && ipos == input.size())
|
62
|
-
// store the new analysis
|
63
|
-
analyses.push_back(ca);
|
64
|
-
|
65
|
-
// follow the epsilon transitions
|
66
|
-
// first_arc[n] is the number of the first outgoing transition of node n
|
67
|
-
// first_arc[n+1]-1 is the number of the last outgoing transition of node n
|
68
|
-
// first_arc[n+1] is the number of the first outgoing transition of node n+1
|
69
|
-
unsigned int i;
|
70
|
-
for( i=first_arc[n];
|
71
|
-
i<first_arc[n+1] && label[i].upper_char() == Label::epsilon;
|
72
|
-
i++)
|
73
|
-
{
|
74
|
-
ca.push_back(i);
|
75
|
-
analyze(target_node[i], input, ipos, ca, analyses);
|
76
|
-
ca.pop_back();
|
77
|
-
}
|
78
|
-
|
79
|
-
// follow the non-epsilon transitions
|
80
|
-
|
81
|
-
// scan the next input symbol
|
82
|
-
if (ipos < input.size()) {
|
83
|
-
// find the set of arcs with matching upper character in the sorted list
|
84
|
-
pair<Label*,Label*>range =
|
85
|
-
equal_range(label+i, label+first_arc[n+1], Label(input[ipos]));
|
86
|
-
unsigned int to = (unsigned int)(range.second - label);
|
87
|
-
|
88
|
-
// follow the non-epsilon transitions
|
89
|
-
for( i=range.first-label; i<to; i++) {
|
90
|
-
ca.push_back(i);
|
91
|
-
analyze(target_node[i], input, ipos+1, ca, analyses);
|
92
|
-
ca.pop_back();
|
93
|
-
}
|
94
|
-
}
|
95
|
-
}
|
96
|
-
|
97
|
-
|
98
|
-
/*******************************************************************/
|
99
|
-
/* */
|
100
|
-
/* CompactTransducer::analyze_string */
|
101
|
-
/* */
|
102
|
-
/*******************************************************************/
|
103
|
-
|
104
|
-
void CompactTransducer::analyze_string( char *s, vector<CAnalysis> &analyses )
|
105
|
-
|
106
|
-
{
|
107
|
-
// "s" input string to be analyzed
|
108
|
-
// "analyses" is the data structure in which the results are stored
|
109
|
-
// and returned
|
110
|
-
|
111
|
-
vector<Character> input;
|
112
|
-
alphabet.string2symseq( s, input );
|
113
|
-
|
114
|
-
analyses.clear();
|
115
|
-
CAnalysis ca; // data structure where the current incomplete analysis
|
116
|
-
// is stored
|
117
|
-
analyze(0, input, 0, ca, analyses); // start the analysis
|
118
|
-
|
119
|
-
if (analyses.size() > 10000)
|
120
|
-
fprintf(stderr,"Warning: Only the first 10000 analyses considered for \"%s\"!\n", s);
|
121
|
-
|
122
|
-
if (simplest_only && analyses.size() > 1)
|
123
|
-
disambiguate( analyses ); // select the simplest analyses
|
124
|
-
}
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
/*******************************************************************/
|
129
|
-
/* */
|
130
|
-
/* CompactTransducer::~CompactTransducer */
|
131
|
-
/* */
|
132
|
-
/*******************************************************************/
|
133
|
-
|
134
|
-
CompactTransducer::~CompactTransducer()

{
  // free the probability tables
  delete[] arc_logprob;
  delete[] final_logprob;

  // free the arc tables
  delete[] target_node;
  delete[] label;

  // free the node tables
  delete[] first_arc;
  delete[] finalp;
}
|
144
|
-
|
145
|
-
|
146
|
-
/*******************************************************************/
|
147
|
-
/* */
|
148
|
-
/* CompactTransducer::CompactTransducer */
|
149
|
-
/* */
|
150
|
-
/*******************************************************************/
|
151
|
-
|
152
|
-
CompactTransducer::CompactTransducer()

{
  // an empty transducer: no nodes, no arcs, no tables allocated
  number_of_nodes = 0;
  number_of_arcs = 0;

  finalp = NULL;
  first_arc = NULL;
  label = NULL;
  target_node = NULL;

  final_logprob = (float*)NULL;
  arc_logprob = final_logprob;

  // analysis options are switched off by default
  both_layers = false;
  simplest_only = false;
}
|
165
|
-
|
166
|
-
|
167
|
-
/*******************************************************************/
|
168
|
-
/* */
|
169
|
-
/* CompactTransducer::read_finalp */
|
170
|
-
/* */
|
171
|
-
/*******************************************************************/
|
172
|
-
|
173
|
-
void CompactTransducer::read_finalp( FILE *file )
|
174
|
-
|
175
|
-
{
|
176
|
-
int k=0;
|
177
|
-
unsigned char n=0;
|
178
|
-
for( size_t i=0; i<number_of_nodes; i++ ) {
|
179
|
-
if (k == 0) {
|
180
|
-
n = (unsigned char)fgetc(file);
|
181
|
-
k = 8;
|
182
|
-
}
|
183
|
-
k--;
|
184
|
-
if (n & (1 << k))
|
185
|
-
finalp[i] = 1;
|
186
|
-
else
|
187
|
-
finalp[i] = 0;
|
188
|
-
}
|
189
|
-
}
|
190
|
-
|
191
|
-
|
192
|
-
/*******************************************************************/
|
193
|
-
/* */
|
194
|
-
/* CompactTransducer::read_first_arcs */
|
195
|
-
/* */
|
196
|
-
/*******************************************************************/
|
197
|
-
|
198
|
-
void CompactTransducer::read_first_arcs( FILE *file )
|
199
|
-
|
200
|
-
{
|
201
|
-
int k=0;
|
202
|
-
unsigned int n=0;
|
203
|
-
size_t bits=(size_t)ceil(log(number_of_arcs+1)/log(2));
|
204
|
-
|
205
|
-
for( size_t i=0; i<=number_of_nodes; i++ ) {
|
206
|
-
first_arc[i] = n >> (sizeof(n)*8 - bits);
|
207
|
-
n <<= bits;
|
208
|
-
k -= bits;
|
209
|
-
if (k < 0) {
|
210
|
-
read_num(&n,sizeof(n),file);
|
211
|
-
first_arc[i] |= n >> (sizeof(n)*8 + k);
|
212
|
-
n <<= -k;
|
213
|
-
k += sizeof(n) * 8;
|
214
|
-
}
|
215
|
-
}
|
216
|
-
}
|
217
|
-
|
218
|
-
|
219
|
-
/*******************************************************************/
|
220
|
-
/* */
|
221
|
-
/* CompactTransducer::read_target_nodes */
|
222
|
-
/* */
|
223
|
-
/*******************************************************************/
|
224
|
-
|
225
|
-
void CompactTransducer::read_target_nodes( FILE *file )
|
226
|
-
|
227
|
-
{
|
228
|
-
int k=0;
|
229
|
-
unsigned int n=0;
|
230
|
-
size_t bits=(size_t)ceil(log(number_of_nodes)/log(2));
|
231
|
-
|
232
|
-
for( size_t i=0; i<number_of_arcs; i++ ) {
|
233
|
-
target_node[i] = n >> (sizeof(n)*8 - bits);
|
234
|
-
n <<= bits;
|
235
|
-
k -= bits;
|
236
|
-
if (k < 0) {
|
237
|
-
read_num(&n,sizeof(n),file);
|
238
|
-
target_node[i] |= n >> (sizeof(n)*8 + k);
|
239
|
-
n <<= -k;
|
240
|
-
k += sizeof(n) * 8;
|
241
|
-
}
|
242
|
-
}
|
243
|
-
}
|
244
|
-
|
245
|
-
|
246
|
-
/*******************************************************************/
|
247
|
-
/* */
|
248
|
-
/* CompactTransducer::read_labels */
|
249
|
-
/* */
|
250
|
-
/*******************************************************************/
|
251
|
-
|
252
|
-
void CompactTransducer::read_labels( FILE *file )
|
253
|
-
|
254
|
-
{
|
255
|
-
size_t N=0;
|
256
|
-
Label Num2Label[alphabet.size()];
|
257
|
-
for( Alphabet::const_iterator it=alphabet.begin();
|
258
|
-
it != alphabet.end(); it++ )
|
259
|
-
{
|
260
|
-
Label l=*it;
|
261
|
-
Num2Label[N++] = l;
|
262
|
-
}
|
263
|
-
|
264
|
-
int k=0;
|
265
|
-
unsigned int n=0;
|
266
|
-
size_t bits=(size_t)ceil(log(alphabet.size())/log(2));
|
267
|
-
|
268
|
-
for( size_t i=0; i<number_of_arcs; i++ ) {
|
269
|
-
unsigned int l = n >> (sizeof(n)*8 - bits);
|
270
|
-
n <<= bits;
|
271
|
-
k -= bits;
|
272
|
-
if (k < 0) {
|
273
|
-
read_num(&n,sizeof(n),file);
|
274
|
-
l |= n >> (sizeof(n)*8 + k);
|
275
|
-
n <<= -k;
|
276
|
-
k += sizeof(n) * 8;
|
277
|
-
}
|
278
|
-
label[i] = Num2Label[l];
|
279
|
-
}
|
280
|
-
}
|
281
|
-
|
282
|
-
|
283
|
-
/*******************************************************************/
|
284
|
-
/* */
|
285
|
-
/* CompactTransducer::read_probs */
|
286
|
-
/* */
|
287
|
-
/*******************************************************************/
|
288
|
-
|
289
|
-
void CompactTransducer::read_probs( FILE *file )
|
290
|
-
|
291
|
-
{
|
292
|
-
size_t n,m;
|
293
|
-
fread(&n, sizeof(n), 1, file);
|
294
|
-
if (fread(&m, sizeof(n), 1, file) != 1 ||
|
295
|
-
n != node_count() || m != arc_count())
|
296
|
-
{
|
297
|
-
fprintf(stderr,"Error: incompatible probability file!\n");
|
298
|
-
exit(1);
|
299
|
-
}
|
300
|
-
final_logprob = new float[n];
|
301
|
-
arc_logprob = new float[m];
|
302
|
-
fread(final_logprob, sizeof(float), n, file);
|
303
|
-
if (fread(arc_logprob, sizeof(float), n, file) != n) {
|
304
|
-
fprintf(stderr,"Error: in probability file!\n");
|
305
|
-
exit(1);
|
306
|
-
}
|
307
|
-
}
|
308
|
-
|
309
|
-
|
310
|
-
/*******************************************************************/
|
311
|
-
/* */
|
312
|
-
/* CompactTransducer::CompactTransducer */
|
313
|
-
/* */
|
314
|
-
/*******************************************************************/
|
315
|
-
|
316
|
-
CompactTransducer::CompactTransducer( FILE *file, FILE *pfile )

{
  // Reads a compact transducer from "file" and, if pfile is not NULL, the
  // matching probability tables from "pfile".
  // Throws a C string if "file" does not start with the format marker 'c'.

  both_layers = false;
  simplest_only = false;

  // Start from a well-defined empty state: if a read error prevents the
  // tables from being allocated below, the destructor must still be able
  // to delete[] every pointer safely.
  number_of_nodes = 0;
  number_of_arcs = 0;
  finalp = NULL;
  first_arc = NULL;
  label = NULL;
  target_node = NULL;
  arc_logprob = final_logprob = (float*)NULL;

  if (fgetc(file) != 'c')
    throw "Error: wrong file format (not a compact transducer)\n";

  alphabet.read(file);

  read_num(&number_of_nodes,sizeof(number_of_nodes),file);
  read_num(&number_of_arcs,sizeof(number_of_arcs),file);

  if (!ferror(file)) {
    // memory allocation
    finalp = new char[number_of_nodes];
    first_arc = new unsigned[number_of_nodes+1];
    label = new Label[number_of_arcs];
    target_node = new unsigned[number_of_arcs];

    // reading the data (in the order in which it is stored)
    read_finalp(file);
    read_first_arcs(file);
    read_labels(file);
    read_target_nodes(file);
  }

  // the probability tables are optional and stay NULL without a file
  if (pfile != NULL)
    read_probs(pfile);
}
|
349
|
-
|
350
|
-
|
351
|
-
/*******************************************************************/
|
352
|
-
/* */
|
353
|
-
/* CompactTransducer::longest_match2 */
|
354
|
-
/* */
|
355
|
-
/*******************************************************************/
|
356
|
-
|
357
|
-
void CompactTransducer::longest_match2(unsigned int n, char *string, int l,
|
358
|
-
CAnalysis &ca, int &bl, CAnalysis &ba)
|
359
|
-
{
|
360
|
-
// n: transducer state
|
361
|
-
// string: rest string
|
362
|
-
// l: length of current analysis
|
363
|
-
// bl: length of the currently longest match
|
364
|
-
// ca: current analysis
|
365
|
-
// ba: best analysis
|
366
|
-
|
367
|
-
if (finalp[n] && l > bl) {
|
368
|
-
// store the new analysis
|
369
|
-
bl = l;
|
370
|
-
ba = ca; // copy the arc vector
|
371
|
-
}
|
372
|
-
|
373
|
-
// follow the epsilon transitions
|
374
|
-
unsigned int i;
|
375
|
-
for( i=first_arc[n];
|
376
|
-
i<first_arc[n+1] && label[i].upper_char() == Label::epsilon;
|
377
|
-
i++)
|
378
|
-
{
|
379
|
-
ca.push_back(i);
|
380
|
-
longest_match2(target_node[i], string, l, ca, bl, ba);
|
381
|
-
ca.pop_back();
|
382
|
-
}
|
383
|
-
|
384
|
-
// follow the non-epsilon transitions
|
385
|
-
char *end=string;
|
386
|
-
int c=alphabet.next_code(end, false, false);
|
387
|
-
l += end-string;
|
388
|
-
if (c != EOF) {
|
389
|
-
// find the set of arcs with matching upper character in the sort list
|
390
|
-
pair<Label*,Label*>range =
|
391
|
-
equal_range(label+i, label+first_arc[n+1], Label((Character)c));
|
392
|
-
unsigned int to = (unsigned int)(range.second - label);
|
393
|
-
for( i=range.first-label; i<to; i++) {
|
394
|
-
ca.push_back(i);
|
395
|
-
longest_match2(target_node[i], end, l, ca, bl, ba);
|
396
|
-
ca.pop_back();
|
397
|
-
}
|
398
|
-
}
|
399
|
-
}
|
400
|
-
|
401
|
-
|
402
|
-
/*******************************************************************/
|
403
|
-
/* */
|
404
|
-
/* CompactTransducer::print_analysis */
|
405
|
-
/* */
|
406
|
-
/*******************************************************************/
|
407
|
-
|
408
|
-
char *CompactTransducer::print_analysis( CAnalysis &cana )

{
  // Converts the arc sequence into labels and lets the alphabet print them;
  // both_layers is passed on to the alphabet unchanged.
  Analysis labels;
  convert(cana, labels);
  char *text = alphabet.print_analysis( labels, both_layers );
  return text;
}
|
415
|
-
|
416
|
-
|
417
|
-
/*******************************************************************/
|
418
|
-
/* */
|
419
|
-
/* CompactTransducer::longest_match */
|
420
|
-
/* */
|
421
|
-
/*******************************************************************/
|
422
|
-
|
423
|
-
const char *CompactTransducer::longest_match( char* &string )
|
424
|
-
|
425
|
-
{
|
426
|
-
vector<char> analysis;
|
427
|
-
CAnalysis ca, ba;
|
428
|
-
int l=0;
|
429
|
-
longest_match2(0, string, 0, ca, l, ba);
|
430
|
-
|
431
|
-
// no match? return the next character
|
432
|
-
if (ba.size() == 0) {
|
433
|
-
int c=alphabet.next_code(string, false, false);
|
434
|
-
return alphabet.code2symbol(c);
|
435
|
-
}
|
436
|
-
|
437
|
-
string += l;
|
438
|
-
return print_analysis( ba );
|
439
|
-
}
|
440
|
-
|
441
|
-
|
442
|
-
/*******************************************************************/
|
443
|
-
/* */
|
444
|
-
/* CompactTransducer::disambiguate */
|
445
|
-
/* */
|
446
|
-
/*******************************************************************/
|
447
|
-
|
448
|
-
void CompactTransducer::disambiguate( vector<CAnalysis> &analyses )

{
  // Keeps only those analyses whose score (as computed by the alphabet)
  // equals the highest score; their relative order is preserved.

  // score every analysis and remember the maximum
  vector<int> score;
  Analysis ana;
  int bestscore=INT_MIN;

  for( size_t i=0; i<analyses.size(); i++ ) {
    convert(analyses[i], ana);
    const int current = alphabet.compute_score(ana);
    score.push_back(current);
    if (current > bestscore)
      bestscore = current;
  }

  // compact the vector in place, dropping the suboptimal analyses
  size_t kept=0;
  for( size_t i=0; i<analyses.size(); i++ ) {
    if (score[i] != bestscore)
      continue;
    analyses[kept++] = analyses[i];
  }
  analyses.resize(kept);
}
|
470
|
-
|
471
|
-
|
472
|
-
/*******************************************************************/
|
473
|
-
/* */
|
474
|
-
/* CompactTransducer::train2 */
|
475
|
-
/* */
|
476
|
-
/*******************************************************************/
|
477
|
-
|
478
|
-
bool CompactTransducer::train2( char *s, vector<double> &arcfreq,
|
479
|
-
vector<double> &finalfreq )
|
480
|
-
{
|
481
|
-
vector<CAnalysis> analyses;
|
482
|
-
vector<Label> input;
|
483
|
-
alphabet.string2labelseq( s, input );
|
484
|
-
|
485
|
-
CAnalysis ca; // data structure where the analysis is stored
|
486
|
-
unsigned int n=0;
|
487
|
-
bool failure=false;
|
488
|
-
for( size_t i=0; i<input.size(); i++ ) {
|
489
|
-
failure = true;
|
490
|
-
for( unsigned int k=first_arc[n]; k<first_arc[n+1]; k++) {
|
491
|
-
if (label[k] == input[i]) {
|
492
|
-
ca.push_back(k);
|
493
|
-
n = target_node[k];
|
494
|
-
failure = false;
|
495
|
-
break;
|
496
|
-
}
|
497
|
-
}
|
498
|
-
if (failure)
|
499
|
-
break;
|
500
|
-
}
|
501
|
-
if (failure || !finalp[n]) {
|
502
|
-
fprintf(stderr,"Warning: The following input is not covered:\n%s\n", s);
|
503
|
-
return false;
|
504
|
-
}
|
505
|
-
|
506
|
-
for( size_t k=0; k<ca.size(); k++ )
|
507
|
-
arcfreq[ca[k]]++;
|
508
|
-
finalfreq[target_node[ca.back()]]++;
|
509
|
-
|
510
|
-
return true;
|
511
|
-
}
|
512
|
-
|
513
|
-
|
514
|
-
/*******************************************************************/
|
515
|
-
/* */
|
516
|
-
/* CompactTransducer::train */
|
517
|
-
/* */
|
518
|
-
/*******************************************************************/
|
519
|
-
|
520
|
-
bool CompactTransducer::train( char *s, vector<double> &arcfreq,
|
521
|
-
vector<double> &finalfreq )
|
522
|
-
{
|
523
|
-
vector<CAnalysis> analyses;
|
524
|
-
vector<Character> input;
|
525
|
-
alphabet.string2symseq( s, input );
|
526
|
-
|
527
|
-
CAnalysis ca; // data structure where the current incomplete analysis
|
528
|
-
// is stored
|
529
|
-
analyze(0, input, 0, ca, analyses); // start the analysis
|
530
|
-
|
531
|
-
if (analyses.size() > 10000)
|
532
|
-
return true; // ignore inputs with more than 10000 analyses
|
533
|
-
else if (analyses.size() == 0)
|
534
|
-
return false;
|
535
|
-
|
536
|
-
if (simplest_only && analyses.size() > 1)
|
537
|
-
disambiguate( analyses ); // select the simplest analyses
|
538
|
-
|
539
|
-
if (analyses.size() > 0) {
|
540
|
-
double incr = 1.0 / analyses.size();
|
541
|
-
CAnalysis arcs;
|
542
|
-
|
543
|
-
for( size_t i=0; i<analyses.size(); i++ ) {
|
544
|
-
CAnalysis &arcs=analyses[i];
|
545
|
-
for( size_t k=0; k<arcs.size(); k++ )
|
546
|
-
arcfreq[arcs[k]] += incr;
|
547
|
-
finalfreq[target_node[arcs.back()]] += incr;
|
548
|
-
}
|
549
|
-
}
|
550
|
-
return true;
|
551
|
-
}
|
552
|
-
|
553
|
-
|
554
|
-
/*******************************************************************/
|
555
|
-
/* */
|
556
|
-
/* CompactTransducer::estimate_probs */
|
557
|
-
/* */
|
558
|
-
/*******************************************************************/
|
559
|
-
|
560
|
-
void CompactTransducer::estimate_probs( vector<double> &arcfreq,
                                        vector<double> &finalfreq )
{
  // Turns frequencies into probabilities: for every node the final
  // frequency and the frequencies of its outgoing arcs are divided by
  // their sum. A sum of zero is replaced by 1, leaving the zeros as is.
  const size_t nodes = finalfreq.size();
  for( size_t node=0; node<nodes; node++ ) {
    // outgoing arcs of this node
    const size_t from = first_arc[node];
    const size_t to = first_arc[node+1];

    double total = finalfreq[node];
    for( size_t arc=from; arc<to; arc++ )
      total += arcfreq[arc];
    if (total == 0.0)
      total = 1.0;

    finalfreq[node] /= total;
    for( size_t arc=from; arc<to; arc++ )
      arcfreq[arc] /= total;
  }
}
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
/*******************************************************************/
|
579
|
-
/* */
|
580
|
-
/* CompactTransducer::compute_probs */
|
581
|
-
/* */
|
582
|
-
/*******************************************************************/
|
583
|
-
|
584
|
-
void CompactTransducer::compute_probs( vector<CAnalysis> &analyses,
                                       vector<double> &prob )
{
  // Computes the probability of every analysis from arc_logprob and
  // final_logprob, sorts the analyses by decreasing probability and stores
  // the probabilities, normalized by their sum, in "prob".

  prob.resize(analyses.size());
  double sum=0.0;
  for( size_t i=0; i<analyses.size(); i++ ) {
    CAnalysis &a=analyses[i];

    // compute the probability
    double logprob=0.0;
    for( size_t k=0; k<a.size(); k++ )
      logprob += arc_logprob[a[k]];

    // An analysis without arcs is taken to end in node 0, the node in
    // which the analyses found by analyze_string start; calling back() on
    // it would be undefined behaviour.
    unsigned int last = a.empty() ? 0 : target_node[a.back()];
    logprob += final_logprob[last];
    prob[i] = exp(logprob);
    sum += prob[i];
  }

  // sort the analyses: repeatedly pick the first remaining analysis with
  // the highest probability and mark it as used with -1
  vector<CAnalysis> oldanalyses(analyses);
  vector<double> oldprob(prob);
  for( size_t i=0; i<analyses.size(); i++ ) {
    prob[i] = -1.0;
    size_t best=0;
    for( size_t k=0; k<oldanalyses.size(); k++ )
      if (prob[i] < oldprob[k]) {
        prob[i] = oldprob[k];
        best = k;
      }
    analyses[i] = oldanalyses[best];
    oldprob[best] = -1.0;
    prob[i] /= sum; // normalization
  }
}
|