ruby-sfst 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +1 -0
- data/Manifest +31 -0
- data/README.rdoc +25 -0
- data/Rakefile +22 -0
- data/ext/sfst_machine/alphabet.C +807 -0
- data/ext/sfst_machine/alphabet.h +281 -0
- data/ext/sfst_machine/basic.C +84 -0
- data/ext/sfst_machine/basic.h +24 -0
- data/ext/sfst_machine/compact.C +616 -0
- data/ext/sfst_machine/compact.h +98 -0
- data/ext/sfst_machine/determinise.C +304 -0
- data/ext/sfst_machine/extconf.rb +4 -0
- data/ext/sfst_machine/fst-compiler.C +2375 -0
- data/ext/sfst_machine/fst-compiler.h +113 -0
- data/ext/sfst_machine/fst-compiler.yy +213 -0
- data/ext/sfst_machine/fst.C +966 -0
- data/ext/sfst_machine/fst.h +365 -0
- data/ext/sfst_machine/interface.C +1838 -0
- data/ext/sfst_machine/interface.h +94 -0
- data/ext/sfst_machine/make-compact.C +328 -0
- data/ext/sfst_machine/make-compact.h +34 -0
- data/ext/sfst_machine/mem.h +74 -0
- data/ext/sfst_machine/operators.C +1131 -0
- data/ext/sfst_machine/sfst_machine.cc +411 -0
- data/ext/sfst_machine/utf8-scanner.C +2197 -0
- data/ext/sfst_machine/utf8-scanner.ll +179 -0
- data/ext/sfst_machine/utf8.C +146 -0
- data/ext/sfst_machine/utf8.h +19 -0
- data/lib/sfst.rb +99 -0
- data/ruby-sfst.gemspec +34 -0
- data/test/test_sfst.fst +3 -0
- data/test/test_sfst.rb +119 -0
- metadata +100 -0
@@ -0,0 +1,365 @@
|
|
1
|
+
/*******************************************************************/
|
2
|
+
/* */
|
3
|
+
/* FILE fst.h */
|
4
|
+
/* MODULE fst */
|
5
|
+
/* PROGRAM SFST */
|
6
|
+
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
|
7
|
+
/* */
|
8
|
+
/* PURPOSE finite state tools */
|
9
|
+
/* */
|
10
|
+
/*******************************************************************/
|
11
|
+
|
12
|
+
#ifndef _FST_H_
|
13
|
+
#define _FST_H_
|
14
|
+
|
15
|
+
#include "alphabet.h"
|
16
|
+
|
17
|
+
|
18
|
+
/*******************************************************************/
|
19
|
+
/* include commands */
|
20
|
+
/*******************************************************************/
|
21
|
+
|
22
|
+
#include <string>
|
23
|
+
|
24
|
+
#include <vector>
|
25
|
+
|
26
|
+
#include "mem.h"
|
27
|
+
|
28
|
+
// Integer type of the visit marks stored in Node::visited and Transducer::vmark.
typedef unsigned short VType;
|
29
|
+
|
30
|
+
// Declared here, defined elsewhere.
// NOTE(review): presumably a global switch that suppresses diagnostic output -- confirm.
extern int Quiet;
|
31
|
+
|
32
|
+
class Node;
|
33
|
+
class Arc;
|
34
|
+
class Arcs;
|
35
|
+
class Transducer;
|
36
|
+
|
37
|
+
|
38
|
+
struct hashf {
|
39
|
+
size_t operator()(const Node *n) const { return (size_t) n; }
|
40
|
+
};
|
41
|
+
struct equalf {
|
42
|
+
int operator()(const Node *n1, const Node *n2) const { return n1==n2; }
|
43
|
+
};
|
44
|
+
// Hash set of Node pointers; hashing and equality are by pointer identity
// (see the hashf and equalf functors).
typedef __gnu_cxx::hash_set<Node*, hashf, equalf> NodeHashSet;
|
45
|
+
|
46
|
+
|
47
|
+
|
48
|
+
/***************** class Arc *************************************/
|
49
|
+
|
50
|
+
class Arc {
|
51
|
+
|
52
|
+
private:
|
53
|
+
Label l;
|
54
|
+
Node *target;
|
55
|
+
Arc *next;
|
56
|
+
|
57
|
+
public:
|
58
|
+
void init( Label ll, Node *node ) { l=ll; target=node; };
|
59
|
+
Label label( void ) const { return l; };
|
60
|
+
Node *target_node( void ) { return target; };
|
61
|
+
const Node *target_node( void ) const { return target; };
|
62
|
+
|
63
|
+
friend class Arcs;
|
64
|
+
friend class ArcsIter;
|
65
|
+
};
|
66
|
+
|
67
|
+
|
68
|
+
/***************** class Arcs ************************************/
|
69
|
+
|
70
|
+
class Arcs {
|
71
|
+
|
72
|
+
private:
|
73
|
+
Arc *first_arcp;
|
74
|
+
Arc *first_epsilon_arcp;
|
75
|
+
|
76
|
+
public:
|
77
|
+
void init( void ) { first_arcp = first_epsilon_arcp = NULL; };
|
78
|
+
Arcs( void ) { init(); };
|
79
|
+
Node *target_node( Label l );
|
80
|
+
const Node *target_node( Label l ) const;
|
81
|
+
void add_arc( Label, Node*, Transducer* );
|
82
|
+
int remove_arc( Arc* );
|
83
|
+
bool is_empty( void ) const { return !(first_arcp || first_epsilon_arcp); };
|
84
|
+
bool epsilon_transition_exists( void ) const { return first_epsilon_arcp != NULL; };
|
85
|
+
bool non_epsilon_transition_exists( void ) const { return first_arcp != NULL; };
|
86
|
+
int size( void ) const;
|
87
|
+
|
88
|
+
friend class ArcsIter;
|
89
|
+
};
|
90
|
+
|
91
|
+
|
92
|
+
/***************** class ArcsIter ********************************/
|
93
|
+
|
94
|
+
class ArcsIter {
|
95
|
+
|
96
|
+
// ArcsIter iterates over the arcs starting with epsilon arcs
|
97
|
+
|
98
|
+
private:
|
99
|
+
Arc *current_arcp;
|
100
|
+
Arc *more_arcs;
|
101
|
+
|
102
|
+
public:
|
103
|
+
typedef enum {all,non_eps,eps} IterType;
|
104
|
+
|
105
|
+
ArcsIter( const Arcs *arcs, IterType type=all ) {
|
106
|
+
more_arcs = NULL;
|
107
|
+
if (type == all) {
|
108
|
+
if (arcs->first_epsilon_arcp) {
|
109
|
+
current_arcp = arcs->first_epsilon_arcp;
|
110
|
+
more_arcs = arcs->first_arcp;
|
111
|
+
}
|
112
|
+
else
|
113
|
+
current_arcp = arcs->first_arcp;
|
114
|
+
}
|
115
|
+
else if (type == non_eps)
|
116
|
+
current_arcp = arcs->first_arcp;
|
117
|
+
else
|
118
|
+
current_arcp = arcs->first_epsilon_arcp;
|
119
|
+
};
|
120
|
+
|
121
|
+
void operator++( int ) {
|
122
|
+
if (current_arcp) {
|
123
|
+
current_arcp = current_arcp->next;
|
124
|
+
if (!current_arcp && more_arcs) {
|
125
|
+
current_arcp = more_arcs;
|
126
|
+
more_arcs = NULL;
|
127
|
+
}
|
128
|
+
}
|
129
|
+
};
|
130
|
+
operator Arc*( void ) { return current_arcp; };
|
131
|
+
|
132
|
+
};
|
133
|
+
|
134
|
+
|
135
|
+
/***************** class Node ************************************/
|
136
|
+
|
137
|
+
class Node {
|
138
|
+
|
139
|
+
private:
|
140
|
+
bool final;
|
141
|
+
VType visited;
|
142
|
+
Arcs arcsp;
|
143
|
+
Node *forwardp;
|
144
|
+
|
145
|
+
public:
|
146
|
+
Node( void ) { init(); };
|
147
|
+
void init( void );
|
148
|
+
bool is_final( void ) const { return final; };
|
149
|
+
void set_final( bool flag ) { final = flag; };
|
150
|
+
void set_forward( Node *node ) { forwardp = node; };
|
151
|
+
const Node *target_node( Label l ) const { return arcs()->target_node(l); };
|
152
|
+
Node *target_node( Label l ) { return arcs()->target_node(l); };
|
153
|
+
void add_arc( Label l, Node *n, Transducer *a ) { arcs()->add_arc(l, n, a); };
|
154
|
+
Arcs *arcs( void ) { return &arcsp; };
|
155
|
+
const Arcs *arcs( void ) const { return &arcsp; };
|
156
|
+
Node *forward( void ) { return forwardp; };
|
157
|
+
bool was_visited( VType vmark ) {
|
158
|
+
if (visited == vmark)
|
159
|
+
return true;
|
160
|
+
visited = vmark;
|
161
|
+
return false;
|
162
|
+
};
|
163
|
+
bool check_visited( VType vm ) // leaves the visited flag unchanged
|
164
|
+
{ return (visited==vm); };
|
165
|
+
};
|
166
|
+
|
167
|
+
|
168
|
+
/***************** class Node2Int *********************************/
|
169
|
+
|
170
|
+
class Node2Int {
|
171
|
+
|
172
|
+
struct hashf {
|
173
|
+
size_t operator()(const Node *node) const {
|
174
|
+
return (size_t)node;
|
175
|
+
}
|
176
|
+
};
|
177
|
+
struct equalf {
|
178
|
+
int operator()(const Node *n1, const Node *n2) const {
|
179
|
+
return (n1 == n2);
|
180
|
+
}
|
181
|
+
};
|
182
|
+
typedef __gnu_cxx::hash_map<Node*, int, hashf, equalf> NL;
|
183
|
+
|
184
|
+
private:
|
185
|
+
int current_number;
|
186
|
+
NL number;
|
187
|
+
|
188
|
+
public:
|
189
|
+
int &operator[]( Node *node ) {
|
190
|
+
NL::iterator it=number.find(node);
|
191
|
+
if (it == number.end())
|
192
|
+
return number.insert(NL::value_type(node, 0)).first->second;
|
193
|
+
return it->second;
|
194
|
+
};
|
195
|
+
};
|
196
|
+
|
197
|
+
|
198
|
+
/***************** class NodeNumbering ****************************/
|
199
|
+
|
200
|
+
class NodeNumbering {
|
201
|
+
|
202
|
+
private:
|
203
|
+
std::vector<Node*> nodes;
|
204
|
+
Node2Int nummap;
|
205
|
+
void number_node( Node*, Transducer& );
|
206
|
+
|
207
|
+
public:
|
208
|
+
NodeNumbering( Transducer& );
|
209
|
+
int operator[]( Node *node ) { return nummap[node]; };
|
210
|
+
size_t number_of_nodes( void ) { return nodes.size(); };
|
211
|
+
Node *get_node( size_t n ) { return nodes[n]; };
|
212
|
+
};
|
213
|
+
|
214
|
+
|
215
|
+
/***************** class PairMapping ****************************/
|
216
|
+
|
217
|
+
class PairMapping {
|
218
|
+
// This class is used to map a node pair from two transducers
|
219
|
+
// to a single node in another transducer
|
220
|
+
|
221
|
+
typedef std::pair<Node*, Node*> NodePair;
|
222
|
+
|
223
|
+
private:
|
224
|
+
struct hashf {
|
225
|
+
size_t operator()(const NodePair p) const {
|
226
|
+
return (size_t)p.first ^ (size_t)p.second;
|
227
|
+
}
|
228
|
+
};
|
229
|
+
struct equalf {
|
230
|
+
int operator()(const NodePair p1, const NodePair p2) const {
|
231
|
+
return (p1.first==p2.first && p1.second == p2.second);
|
232
|
+
}
|
233
|
+
};
|
234
|
+
typedef __gnu_cxx::hash_map<NodePair, Node*, hashf, equalf> PairMap;
|
235
|
+
PairMap pm;
|
236
|
+
|
237
|
+
public:
|
238
|
+
typedef PairMap::iterator iterator;
|
239
|
+
iterator begin( void ) { return pm.begin(); };
|
240
|
+
iterator end( void ) { return pm.end(); };
|
241
|
+
iterator find( Node *n1, Node *n2 )
|
242
|
+
{ return pm.find( NodePair(n1,n2) ); };
|
243
|
+
Node* &operator[]( NodePair p ) { return pm.operator[](p); };
|
244
|
+
|
245
|
+
};
|
246
|
+
|
247
|
+
|
248
|
+
/***************** class Transducer *******************************/
|
249
|
+
|
250
|
+
class Transducer {
|
251
|
+
|
252
|
+
private:
|
253
|
+
bool deterministic;
|
254
|
+
bool minimised;
|
255
|
+
Node root;
|
256
|
+
Mem mem;
|
257
|
+
|
258
|
+
typedef std::set<Label, Label::label_cmp> LabelSet;
|
259
|
+
typedef __gnu_cxx::hash_map<Character, char*> SymbolMap;
|
260
|
+
|
261
|
+
void reverse_node( Node *old_node, Transducer *new_node );
|
262
|
+
Label recode_label( Label, bool lswitch, bool recode, Alphabet& );
|
263
|
+
Node *copy_nodes( Node *n, Transducer *a,
|
264
|
+
bool lswitch=false, bool recode=false );
|
265
|
+
void rec_cat_nodes( Node*, Node* );
|
266
|
+
bool productive_node( Node* );
|
267
|
+
bool prune_nodes( Node* );
|
268
|
+
void negate_nodes( Node*, Node* );
|
269
|
+
bool compare_nodes( Node *node, Node *node2, Transducer &a2 );
|
270
|
+
void map_nodes( Node *node, Node *node2, Transducer *a, Level level );
|
271
|
+
void freely_insert_at_node( Node *node, Label l );
|
272
|
+
int print_strings_node(Node *node, char *buffer, int pos, FILE *file, bool);
|
273
|
+
bool infinitely_ambiguous_node( Node* );
|
274
|
+
bool is_cyclic_node( Node*, NodeHashSet &visited );
|
275
|
+
bool is_automaton_node( Node* );
|
276
|
+
bool generate1( Node*, Node2Int&, char*, int, char*, int, FILE* );
|
277
|
+
void store_symbols( Node*, SymbolMap&, LabelSet& );
|
278
|
+
|
279
|
+
void splice_nodes(Node*, Node*, Label sl, Transducer*, Transducer*);
|
280
|
+
void splice_arc( Node*, Node*, Node*, Transducer* );
|
281
|
+
void enumerate_paths_node( Node*, std::vector<Label>&, NodeHashSet&,
|
282
|
+
std::vector<Transducer*>& );
|
283
|
+
void replace_char2( Node*, Node*, Character, Character, Transducer* );
|
284
|
+
Node *create_node( std::vector<Node*>&, char*, size_t line );
|
285
|
+
void read_transducer_binary( FILE* );
|
286
|
+
void read_transducer_text( FILE* );
|
287
|
+
|
288
|
+
public:
|
289
|
+
VType vmark;
|
290
|
+
void incr_vmark( void ) {
|
291
|
+
if (++vmark == 0)
|
292
|
+
throw "Overflow of generation counter!";
|
293
|
+
};
|
294
|
+
Alphabet alphabet; // The set of all labels, i.e. character pairs
|
295
|
+
|
296
|
+
Transducer( void ) : root(), mem()
|
297
|
+
{ vmark = 0; deterministic = minimised = false; };
|
298
|
+
// convertion of a string to an transducer
|
299
|
+
Transducer( char *s, const Alphabet *a=NULL, bool extended=false );
|
300
|
+
// reads a word list from a file and stores it in the transducer
|
301
|
+
Transducer( std::istream&, const Alphabet *a=NULL, bool verbose=false );
|
302
|
+
// reads a transducer from a binary or text file
|
303
|
+
Transducer( FILE*, bool binary=true );
|
304
|
+
// turns a sequence of labels into a transducer
|
305
|
+
Transducer( std::vector<Label>& );
|
306
|
+
|
307
|
+
Node *root_node( void ) { return &root; }; // returns the root node
|
308
|
+
const Node *root_node( void ) const { return &root; }; // returns the root node
|
309
|
+
Node *new_node( void ); // memory alocation for a new node
|
310
|
+
Arc *new_arc( Label l, Node *target ); // memory alocation for a new arc
|
311
|
+
void add_string( char *s, bool extended=false );
|
312
|
+
void complete_alphabet( void );
|
313
|
+
void minimise_alphabet( void );
|
314
|
+
void prune( void ); // remove unnecessary arcs
|
315
|
+
|
316
|
+
int print_strings( FILE*, bool with_brackets=true ); //enumerate all strings
|
317
|
+
|
318
|
+
bool analyze_string( char *s, FILE *file, bool with_brackets=true );
|
319
|
+
bool generate_string( char *s, FILE *file, bool with_brackets=true );
|
320
|
+
bool generate( FILE *file, bool separate=false );
|
321
|
+
|
322
|
+
void clear( void ); // clears the transducer. The resulting transducer
|
323
|
+
// is like one created with Transducer()
|
324
|
+
// copy duplicates an transducer
|
325
|
+
// if called with a non-zero argument, upper and lower level are switched
|
326
|
+
Transducer ©( bool lswitch=false, const Alphabet *al=NULL );
|
327
|
+
Transducer &switch_levels( void ) { return copy( true ); };
|
328
|
+
Transducer &splice( Label l, Transducer *a);
|
329
|
+
Transducer &freely_insert( Label l );
|
330
|
+
Transducer &replace_char( Character c, Character nc );
|
331
|
+
Transducer &level( Level );
|
332
|
+
Transducer &lower_level( void ) // creates an transducer for the "lower" language
|
333
|
+
{ return level(lower); };
|
334
|
+
Transducer &upper_level( void ) // creates an transducer for the "upper" language
|
335
|
+
{ return level(upper); };
|
336
|
+
Transducer &determinise( void ); // creates a deterministic transducer
|
337
|
+
Transducer &minimise( bool verbose=true ); // creates a minimised transducer
|
338
|
+
void store( FILE* ); // stores the transducer in binary format
|
339
|
+
void store_lowmem( FILE* );
|
340
|
+
void read( FILE* ); // reads an transducer in binary format
|
341
|
+
bool enumerate_paths( std::vector<Transducer*>& );
|
342
|
+
|
343
|
+
Transducer &reverse( void ); // reverse language
|
344
|
+
Transducer &operator|( Transducer& ); // union, disjunction
|
345
|
+
Transducer &operator+( Transducer& ); // concatenation
|
346
|
+
Transducer &operator/( Transducer& ); // subtraction
|
347
|
+
Transducer &operator&( Transducer& ); // intersection, conjunction
|
348
|
+
Transducer &operator||( Transducer& ); // composition
|
349
|
+
Transducer &operator!( void ); // complement, negation
|
350
|
+
Transducer &kleene_star( void );
|
351
|
+
bool operator==( Transducer& ); // minimises its arguments first
|
352
|
+
|
353
|
+
bool is_cyclic( void );
|
354
|
+
bool is_automaton( void );
|
355
|
+
bool is_infinitely_ambiguous( void );
|
356
|
+
bool is_empty( void ); // For efficiency reasons, these functions
|
357
|
+
bool generates_empty_string( void );// are better called after minimisation
|
358
|
+
|
359
|
+
friend class NodeNumbering;
|
360
|
+
friend class EdgeCount;
|
361
|
+
friend class MakeCompactTransducer;
|
362
|
+
friend std::ostream &operator<<(std::ostream&, Transducer&);
|
363
|
+
};
|
364
|
+
|
365
|
+
#endif
|