ruby-sfst 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +1 -0
- data/Manifest +31 -0
- data/README.rdoc +25 -0
- data/Rakefile +22 -0
- data/ext/sfst_machine/alphabet.C +807 -0
- data/ext/sfst_machine/alphabet.h +281 -0
- data/ext/sfst_machine/basic.C +84 -0
- data/ext/sfst_machine/basic.h +24 -0
- data/ext/sfst_machine/compact.C +616 -0
- data/ext/sfst_machine/compact.h +98 -0
- data/ext/sfst_machine/determinise.C +304 -0
- data/ext/sfst_machine/extconf.rb +4 -0
- data/ext/sfst_machine/fst-compiler.C +2375 -0
- data/ext/sfst_machine/fst-compiler.h +113 -0
- data/ext/sfst_machine/fst-compiler.yy +213 -0
- data/ext/sfst_machine/fst.C +966 -0
- data/ext/sfst_machine/fst.h +365 -0
- data/ext/sfst_machine/interface.C +1838 -0
- data/ext/sfst_machine/interface.h +94 -0
- data/ext/sfst_machine/make-compact.C +328 -0
- data/ext/sfst_machine/make-compact.h +34 -0
- data/ext/sfst_machine/mem.h +74 -0
- data/ext/sfst_machine/operators.C +1131 -0
- data/ext/sfst_machine/sfst_machine.cc +411 -0
- data/ext/sfst_machine/utf8-scanner.C +2197 -0
- data/ext/sfst_machine/utf8-scanner.ll +179 -0
- data/ext/sfst_machine/utf8.C +146 -0
- data/ext/sfst_machine/utf8.h +19 -0
- data/lib/sfst.rb +99 -0
- data/ruby-sfst.gemspec +34 -0
- data/test/test_sfst.fst +3 -0
- data/test/test_sfst.rb +119 -0
- metadata +100 -0
@@ -0,0 +1,365 @@
|
|
1
|
+
/*******************************************************************/
|
2
|
+
/* */
|
3
|
+
/* FILE fst.h */
|
4
|
+
/* MODULE fst */
|
5
|
+
/* PROGRAM SFST */
|
6
|
+
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
|
7
|
+
/* */
|
8
|
+
/* PURPOSE finite state tools */
|
9
|
+
/* */
|
10
|
+
/*******************************************************************/
|
11
|
+
|
12
|
+
#ifndef _FST_H_
|
13
|
+
#define _FST_H_
|
14
|
+
|
15
|
+
#include "alphabet.h"
|
16
|
+
|
17
|
+
|
18
|
+
/*******************************************************************/
|
19
|
+
/* include commands */
|
20
|
+
/*******************************************************************/
|
21
|
+
|
22
|
+
#include <string>
|
23
|
+
|
24
|
+
#include <vector>
|
25
|
+
|
26
|
+
#include "mem.h"
|
27
|
+
|
28
|
+
// Type of the visit marks: used for Node::visited and for the
// Transducer::vmark generation counter.
typedef unsigned short VType;
|
29
|
+
|
30
|
+
// Declared here only; the definition lives in another translation unit.
// NOTE(review): presumably a flag that suppresses informational output --
// confirm at the definition.
extern int Quiet;
|
31
|
+
|
32
|
+
class Node;
|
33
|
+
class Arc;
|
34
|
+
class Arcs;
|
35
|
+
class Transducer;
|
36
|
+
|
37
|
+
|
38
|
+
struct hashf {
|
39
|
+
size_t operator()(const Node *n) const { return (size_t) n; }
|
40
|
+
};
|
41
|
+
struct equalf {
|
42
|
+
int operator()(const Node *n1, const Node *n2) const { return n1==n2; }
|
43
|
+
};
|
44
|
+
// Hash set of Node pointers; hashing and equality both work on the pointer
// value itself (see hashf / equalf).
typedef __gnu_cxx::hash_set<Node*, hashf, equalf> NodeHashSet;
|
45
|
+
|
46
|
+
|
47
|
+
|
48
|
+
/***************** class Arc *************************************/
|
49
|
+
|
50
|
+
class Arc {
|
51
|
+
|
52
|
+
private:
|
53
|
+
Label l;
|
54
|
+
Node *target;
|
55
|
+
Arc *next;
|
56
|
+
|
57
|
+
public:
|
58
|
+
void init( Label ll, Node *node ) { l=ll; target=node; };
|
59
|
+
Label label( void ) const { return l; };
|
60
|
+
Node *target_node( void ) { return target; };
|
61
|
+
const Node *target_node( void ) const { return target; };
|
62
|
+
|
63
|
+
friend class Arcs;
|
64
|
+
friend class ArcsIter;
|
65
|
+
};
|
66
|
+
|
67
|
+
|
68
|
+
/***************** class Arcs ************************************/
|
69
|
+
|
70
|
+
class Arcs {
|
71
|
+
|
72
|
+
private:
|
73
|
+
Arc *first_arcp;
|
74
|
+
Arc *first_epsilon_arcp;
|
75
|
+
|
76
|
+
public:
|
77
|
+
void init( void ) { first_arcp = first_epsilon_arcp = NULL; };
|
78
|
+
Arcs( void ) { init(); };
|
79
|
+
Node *target_node( Label l );
|
80
|
+
const Node *target_node( Label l ) const;
|
81
|
+
void add_arc( Label, Node*, Transducer* );
|
82
|
+
int remove_arc( Arc* );
|
83
|
+
bool is_empty( void ) const { return !(first_arcp || first_epsilon_arcp); };
|
84
|
+
bool epsilon_transition_exists( void ) const { return first_epsilon_arcp != NULL; };
|
85
|
+
bool non_epsilon_transition_exists( void ) const { return first_arcp != NULL; };
|
86
|
+
int size( void ) const;
|
87
|
+
|
88
|
+
friend class ArcsIter;
|
89
|
+
};
|
90
|
+
|
91
|
+
|
92
|
+
/***************** class ArcsIter ********************************/
|
93
|
+
|
94
|
+
class ArcsIter {
|
95
|
+
|
96
|
+
// ArcsIter iterates over the arcs starting with epsilon arcs
|
97
|
+
|
98
|
+
private:
|
99
|
+
Arc *current_arcp;
|
100
|
+
Arc *more_arcs;
|
101
|
+
|
102
|
+
public:
|
103
|
+
typedef enum {all,non_eps,eps} IterType;
|
104
|
+
|
105
|
+
ArcsIter( const Arcs *arcs, IterType type=all ) {
|
106
|
+
more_arcs = NULL;
|
107
|
+
if (type == all) {
|
108
|
+
if (arcs->first_epsilon_arcp) {
|
109
|
+
current_arcp = arcs->first_epsilon_arcp;
|
110
|
+
more_arcs = arcs->first_arcp;
|
111
|
+
}
|
112
|
+
else
|
113
|
+
current_arcp = arcs->first_arcp;
|
114
|
+
}
|
115
|
+
else if (type == non_eps)
|
116
|
+
current_arcp = arcs->first_arcp;
|
117
|
+
else
|
118
|
+
current_arcp = arcs->first_epsilon_arcp;
|
119
|
+
};
|
120
|
+
|
121
|
+
void operator++( int ) {
|
122
|
+
if (current_arcp) {
|
123
|
+
current_arcp = current_arcp->next;
|
124
|
+
if (!current_arcp && more_arcs) {
|
125
|
+
current_arcp = more_arcs;
|
126
|
+
more_arcs = NULL;
|
127
|
+
}
|
128
|
+
}
|
129
|
+
};
|
130
|
+
operator Arc*( void ) { return current_arcp; };
|
131
|
+
|
132
|
+
};
|
133
|
+
|
134
|
+
|
135
|
+
/***************** class Node ************************************/
|
136
|
+
|
137
|
+
class Node {
|
138
|
+
|
139
|
+
private:
|
140
|
+
bool final;
|
141
|
+
VType visited;
|
142
|
+
Arcs arcsp;
|
143
|
+
Node *forwardp;
|
144
|
+
|
145
|
+
public:
|
146
|
+
Node( void ) { init(); };
|
147
|
+
void init( void );
|
148
|
+
bool is_final( void ) const { return final; };
|
149
|
+
void set_final( bool flag ) { final = flag; };
|
150
|
+
void set_forward( Node *node ) { forwardp = node; };
|
151
|
+
const Node *target_node( Label l ) const { return arcs()->target_node(l); };
|
152
|
+
Node *target_node( Label l ) { return arcs()->target_node(l); };
|
153
|
+
void add_arc( Label l, Node *n, Transducer *a ) { arcs()->add_arc(l, n, a); };
|
154
|
+
Arcs *arcs( void ) { return &arcsp; };
|
155
|
+
const Arcs *arcs( void ) const { return &arcsp; };
|
156
|
+
Node *forward( void ) { return forwardp; };
|
157
|
+
bool was_visited( VType vmark ) {
|
158
|
+
if (visited == vmark)
|
159
|
+
return true;
|
160
|
+
visited = vmark;
|
161
|
+
return false;
|
162
|
+
};
|
163
|
+
bool check_visited( VType vm ) // leaves the visited flag unchanged
|
164
|
+
{ return (visited==vm); };
|
165
|
+
};
|
166
|
+
|
167
|
+
|
168
|
+
/***************** class Node2Int *********************************/
|
169
|
+
|
170
|
+
class Node2Int {
|
171
|
+
|
172
|
+
struct hashf {
|
173
|
+
size_t operator()(const Node *node) const {
|
174
|
+
return (size_t)node;
|
175
|
+
}
|
176
|
+
};
|
177
|
+
struct equalf {
|
178
|
+
int operator()(const Node *n1, const Node *n2) const {
|
179
|
+
return (n1 == n2);
|
180
|
+
}
|
181
|
+
};
|
182
|
+
typedef __gnu_cxx::hash_map<Node*, int, hashf, equalf> NL;
|
183
|
+
|
184
|
+
private:
|
185
|
+
int current_number;
|
186
|
+
NL number;
|
187
|
+
|
188
|
+
public:
|
189
|
+
int &operator[]( Node *node ) {
|
190
|
+
NL::iterator it=number.find(node);
|
191
|
+
if (it == number.end())
|
192
|
+
return number.insert(NL::value_type(node, 0)).first->second;
|
193
|
+
return it->second;
|
194
|
+
};
|
195
|
+
};
|
196
|
+
|
197
|
+
|
198
|
+
/***************** class NodeNumbering ****************************/
|
199
|
+
|
200
|
+
class NodeNumbering {
|
201
|
+
|
202
|
+
private:
|
203
|
+
std::vector<Node*> nodes;
|
204
|
+
Node2Int nummap;
|
205
|
+
void number_node( Node*, Transducer& );
|
206
|
+
|
207
|
+
public:
|
208
|
+
NodeNumbering( Transducer& );
|
209
|
+
int operator[]( Node *node ) { return nummap[node]; };
|
210
|
+
size_t number_of_nodes( void ) { return nodes.size(); };
|
211
|
+
Node *get_node( size_t n ) { return nodes[n]; };
|
212
|
+
};
|
213
|
+
|
214
|
+
|
215
|
+
/***************** class PairMapping ****************************/
|
216
|
+
|
217
|
+
class PairMapping {
|
218
|
+
// This class is used to map a node pair from two transducers
|
219
|
+
// to a single node in another transducer
|
220
|
+
|
221
|
+
typedef std::pair<Node*, Node*> NodePair;
|
222
|
+
|
223
|
+
private:
|
224
|
+
struct hashf {
|
225
|
+
size_t operator()(const NodePair p) const {
|
226
|
+
return (size_t)p.first ^ (size_t)p.second;
|
227
|
+
}
|
228
|
+
};
|
229
|
+
struct equalf {
|
230
|
+
int operator()(const NodePair p1, const NodePair p2) const {
|
231
|
+
return (p1.first==p2.first && p1.second == p2.second);
|
232
|
+
}
|
233
|
+
};
|
234
|
+
typedef __gnu_cxx::hash_map<NodePair, Node*, hashf, equalf> PairMap;
|
235
|
+
PairMap pm;
|
236
|
+
|
237
|
+
public:
|
238
|
+
typedef PairMap::iterator iterator;
|
239
|
+
iterator begin( void ) { return pm.begin(); };
|
240
|
+
iterator end( void ) { return pm.end(); };
|
241
|
+
iterator find( Node *n1, Node *n2 )
|
242
|
+
{ return pm.find( NodePair(n1,n2) ); };
|
243
|
+
Node* &operator[]( NodePair p ) { return pm.operator[](p); };
|
244
|
+
|
245
|
+
};
|
246
|
+
|
247
|
+
|
248
|
+
/***************** class Transducer *******************************/
|
249
|
+
|
250
|
+
class Transducer {
|
251
|
+
|
252
|
+
private:
|
253
|
+
bool deterministic;
|
254
|
+
bool minimised;
|
255
|
+
Node root;
|
256
|
+
Mem mem;
|
257
|
+
|
258
|
+
typedef std::set<Label, Label::label_cmp> LabelSet;
|
259
|
+
typedef __gnu_cxx::hash_map<Character, char*> SymbolMap;
|
260
|
+
|
261
|
+
void reverse_node( Node *old_node, Transducer *new_node );
|
262
|
+
Label recode_label( Label, bool lswitch, bool recode, Alphabet& );
|
263
|
+
Node *copy_nodes( Node *n, Transducer *a,
|
264
|
+
bool lswitch=false, bool recode=false );
|
265
|
+
void rec_cat_nodes( Node*, Node* );
|
266
|
+
bool productive_node( Node* );
|
267
|
+
bool prune_nodes( Node* );
|
268
|
+
void negate_nodes( Node*, Node* );
|
269
|
+
bool compare_nodes( Node *node, Node *node2, Transducer &a2 );
|
270
|
+
void map_nodes( Node *node, Node *node2, Transducer *a, Level level );
|
271
|
+
void freely_insert_at_node( Node *node, Label l );
|
272
|
+
int print_strings_node(Node *node, char *buffer, int pos, FILE *file, bool);
|
273
|
+
bool infinitely_ambiguous_node( Node* );
|
274
|
+
bool is_cyclic_node( Node*, NodeHashSet &visited );
|
275
|
+
bool is_automaton_node( Node* );
|
276
|
+
bool generate1( Node*, Node2Int&, char*, int, char*, int, FILE* );
|
277
|
+
void store_symbols( Node*, SymbolMap&, LabelSet& );
|
278
|
+
|
279
|
+
void splice_nodes(Node*, Node*, Label sl, Transducer*, Transducer*);
|
280
|
+
void splice_arc( Node*, Node*, Node*, Transducer* );
|
281
|
+
void enumerate_paths_node( Node*, std::vector<Label>&, NodeHashSet&,
|
282
|
+
std::vector<Transducer*>& );
|
283
|
+
void replace_char2( Node*, Node*, Character, Character, Transducer* );
|
284
|
+
Node *create_node( std::vector<Node*>&, char*, size_t line );
|
285
|
+
void read_transducer_binary( FILE* );
|
286
|
+
void read_transducer_text( FILE* );
|
287
|
+
|
288
|
+
public:
|
289
|
+
VType vmark;
|
290
|
+
void incr_vmark( void ) {
|
291
|
+
if (++vmark == 0)
|
292
|
+
throw "Overflow of generation counter!";
|
293
|
+
};
|
294
|
+
Alphabet alphabet; // The set of all labels, i.e. character pairs
|
295
|
+
|
296
|
+
Transducer( void ) : root(), mem()
|
297
|
+
{ vmark = 0; deterministic = minimised = false; };
|
298
|
+
// convertion of a string to an transducer
|
299
|
+
Transducer( char *s, const Alphabet *a=NULL, bool extended=false );
|
300
|
+
// reads a word list from a file and stores it in the transducer
|
301
|
+
Transducer( std::istream&, const Alphabet *a=NULL, bool verbose=false );
|
302
|
+
// reads a transducer from a binary or text file
|
303
|
+
Transducer( FILE*, bool binary=true );
|
304
|
+
// turns a sequence of labels into a transducer
|
305
|
+
Transducer( std::vector<Label>& );
|
306
|
+
|
307
|
+
Node *root_node( void ) { return &root; }; // returns the root node
|
308
|
+
const Node *root_node( void ) const { return &root; }; // returns the root node
|
309
|
+
Node *new_node( void ); // memory alocation for a new node
|
310
|
+
Arc *new_arc( Label l, Node *target ); // memory alocation for a new arc
|
311
|
+
void add_string( char *s, bool extended=false );
|
312
|
+
void complete_alphabet( void );
|
313
|
+
void minimise_alphabet( void );
|
314
|
+
void prune( void ); // remove unnecessary arcs
|
315
|
+
|
316
|
+
int print_strings( FILE*, bool with_brackets=true ); //enumerate all strings
|
317
|
+
|
318
|
+
bool analyze_string( char *s, FILE *file, bool with_brackets=true );
|
319
|
+
bool generate_string( char *s, FILE *file, bool with_brackets=true );
|
320
|
+
bool generate( FILE *file, bool separate=false );
|
321
|
+
|
322
|
+
void clear( void ); // clears the transducer. The resulting transducer
|
323
|
+
// is like one created with Transducer()
|
324
|
+
// copy duplicates an transducer
|
325
|
+
// if called with a non-zero argument, upper and lower level are switched
|
326
|
+
Transducer ©( bool lswitch=false, const Alphabet *al=NULL );
|
327
|
+
Transducer &switch_levels( void ) { return copy( true ); };
|
328
|
+
Transducer &splice( Label l, Transducer *a);
|
329
|
+
Transducer &freely_insert( Label l );
|
330
|
+
Transducer &replace_char( Character c, Character nc );
|
331
|
+
Transducer &level( Level );
|
332
|
+
Transducer &lower_level( void ) // creates an transducer for the "lower" language
|
333
|
+
{ return level(lower); };
|
334
|
+
Transducer &upper_level( void ) // creates an transducer for the "upper" language
|
335
|
+
{ return level(upper); };
|
336
|
+
Transducer &determinise( void ); // creates a deterministic transducer
|
337
|
+
Transducer &minimise( bool verbose=true ); // creates a minimised transducer
|
338
|
+
void store( FILE* ); // stores the transducer in binary format
|
339
|
+
void store_lowmem( FILE* );
|
340
|
+
void read( FILE* ); // reads an transducer in binary format
|
341
|
+
bool enumerate_paths( std::vector<Transducer*>& );
|
342
|
+
|
343
|
+
Transducer &reverse( void ); // reverse language
|
344
|
+
Transducer &operator|( Transducer& ); // union, disjunction
|
345
|
+
Transducer &operator+( Transducer& ); // concatenation
|
346
|
+
Transducer &operator/( Transducer& ); // subtraction
|
347
|
+
Transducer &operator&( Transducer& ); // intersection, conjunction
|
348
|
+
Transducer &operator||( Transducer& ); // composition
|
349
|
+
Transducer &operator!( void ); // complement, negation
|
350
|
+
Transducer &kleene_star( void );
|
351
|
+
bool operator==( Transducer& ); // minimises its arguments first
|
352
|
+
|
353
|
+
bool is_cyclic( void );
|
354
|
+
bool is_automaton( void );
|
355
|
+
bool is_infinitely_ambiguous( void );
|
356
|
+
bool is_empty( void ); // For efficiency reasons, these functions
|
357
|
+
bool generates_empty_string( void );// are better called after minimisation
|
358
|
+
|
359
|
+
friend class NodeNumbering;
|
360
|
+
friend class EdgeCount;
|
361
|
+
friend class MakeCompactTransducer;
|
362
|
+
friend std::ostream &operator<<(std::ostream&, Transducer&);
|
363
|
+
};
|
364
|
+
|
365
|
+
#endif
|