ruby-sfst 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +1 -0
- data/Manifest +31 -0
- data/README.rdoc +25 -0
- data/Rakefile +22 -0
- data/ext/sfst_machine/alphabet.C +807 -0
- data/ext/sfst_machine/alphabet.h +281 -0
- data/ext/sfst_machine/basic.C +84 -0
- data/ext/sfst_machine/basic.h +24 -0
- data/ext/sfst_machine/compact.C +616 -0
- data/ext/sfst_machine/compact.h +98 -0
- data/ext/sfst_machine/determinise.C +304 -0
- data/ext/sfst_machine/extconf.rb +4 -0
- data/ext/sfst_machine/fst-compiler.C +2375 -0
- data/ext/sfst_machine/fst-compiler.h +113 -0
- data/ext/sfst_machine/fst-compiler.yy +213 -0
- data/ext/sfst_machine/fst.C +966 -0
- data/ext/sfst_machine/fst.h +365 -0
- data/ext/sfst_machine/interface.C +1838 -0
- data/ext/sfst_machine/interface.h +94 -0
- data/ext/sfst_machine/make-compact.C +328 -0
- data/ext/sfst_machine/make-compact.h +34 -0
- data/ext/sfst_machine/mem.h +74 -0
- data/ext/sfst_machine/operators.C +1131 -0
- data/ext/sfst_machine/sfst_machine.cc +411 -0
- data/ext/sfst_machine/utf8-scanner.C +2197 -0
- data/ext/sfst_machine/utf8-scanner.ll +179 -0
- data/ext/sfst_machine/utf8.C +146 -0
- data/ext/sfst_machine/utf8.h +19 -0
- data/lib/sfst.rb +99 -0
- data/ruby-sfst.gemspec +34 -0
- data/test/test_sfst.fst +3 -0
- data/test/test_sfst.rb +119 -0
- metadata +100 -0
@@ -0,0 +1,304 @@
|
|
1
|
+
|
2
|
+
/*******************************************************************/
|
3
|
+
/* */
|
4
|
+
/* FILE determinise.C */
|
5
|
+
/* MODULE determinise */
|
6
|
+
/* PROGRAM SFST */
|
7
|
+
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
|
8
|
+
/* */
|
9
|
+
/*******************************************************************/
|
10
|
+
|
11
|
+
|
12
|
+
#include "fst.h"
|
13
|
+
|
14
|
+
using std::vector;
|
15
|
+
using std::pair;
|
16
|
+
using std::set;
|
17
|
+
using __gnu_cxx::hash_map;
|
18
|
+
|
19
|
+
/***************** class NodeSet *********************************/
|
20
|
+
|
21
|
+
class NodeSet {
|
22
|
+
// This class is used to store a set of nodes.
|
23
|
+
// Whenever a new node is added, all nodes accessible
|
24
|
+
// through epsilon transitions are added as well.
|
25
|
+
|
26
|
+
private:
|
27
|
+
set<Node*> ht;
|
28
|
+
|
29
|
+
public:
|
30
|
+
typedef set<Node*>::iterator iterator;
|
31
|
+
NodeSet() {};
|
32
|
+
void add( Node* );
|
33
|
+
bool insert(Node *node) {
|
34
|
+
pair<iterator, bool> result = ht.insert(node);
|
35
|
+
return result.second;
|
36
|
+
};
|
37
|
+
iterator begin() const { return ht.begin(); }
|
38
|
+
iterator end() const { return ht.end(); }
|
39
|
+
size_t size() const { return ht.size(); }
|
40
|
+
void clear() { ht.clear(); }
|
41
|
+
};
|
42
|
+
|
43
|
+
|
44
|
+
/***************** class NodeArray *******************************/
|
45
|
+
|
46
|
+
class NodeArray {
|
47
|
+
|
48
|
+
private:
|
49
|
+
size_t sizev;
|
50
|
+
bool final;
|
51
|
+
Node **node;
|
52
|
+
|
53
|
+
public:
|
54
|
+
NodeArray( NodeSet& );
|
55
|
+
~NodeArray() { delete[] node; };
|
56
|
+
size_t size() const { return sizev; }
|
57
|
+
bool is_final() const { return final; };
|
58
|
+
Node* &operator[]( int i ) const { return node[i]; }
|
59
|
+
};
|
60
|
+
|
61
|
+
|
62
|
+
/***************** class Transition ******************************/
|
63
|
+
|
64
|
+
class Transition {
|
65
|
+
public:
|
66
|
+
Label label;
|
67
|
+
NodeArray *nodes;
|
68
|
+
Transition(Label l, NodeArray *na) { label = l; nodes = na; };
|
69
|
+
};
|
70
|
+
|
71
|
+
|
72
|
+
/***************** class NodeMapping ****************************/
|
73
|
+
|
74
|
+
class NodeMapping {
|
75
|
+
// This class is used to map a node set from one transducer
|
76
|
+
// to a single node in another transducer
|
77
|
+
|
78
|
+
private:
|
79
|
+
struct hashf {
|
80
|
+
size_t operator()(const NodeArray *na) const {
|
81
|
+
size_t key=na->size() ^ na->is_final();
|
82
|
+
for( size_t i=0; i<na->size(); i++)
|
83
|
+
key = (key<<1) ^ (size_t)(*na)[i];
|
84
|
+
return key;
|
85
|
+
}
|
86
|
+
};
|
87
|
+
struct equalf {
|
88
|
+
int operator()(const NodeArray *na1, const NodeArray *na2) const {
|
89
|
+
if (na1->size() != na2->size() || na1->is_final() != na2->is_final())
|
90
|
+
return 0;
|
91
|
+
for( size_t i=0; i<na1->size(); i++)
|
92
|
+
if ((*na1)[i] != (*na2)[i])
|
93
|
+
return 0;
|
94
|
+
return 1;
|
95
|
+
}
|
96
|
+
};
|
97
|
+
typedef hash_map<NodeArray*, Node*, hashf, equalf> NodeMap;
|
98
|
+
NodeMap hm;
|
99
|
+
|
100
|
+
public:
|
101
|
+
typedef NodeMap::iterator iterator;
|
102
|
+
~NodeMapping();
|
103
|
+
iterator begin() { return hm.begin(); };
|
104
|
+
iterator end() { return hm.end(); };
|
105
|
+
iterator find( NodeArray *na) { return hm.find( na ); };
|
106
|
+
Node* &operator[]( NodeArray *na ) { return hm.operator[](na); };
|
107
|
+
|
108
|
+
};
|
109
|
+
|
110
|
+
|
111
|
+
/***************** class LabelMapping ****************************/
|
112
|
+
|
113
|
+
class LabelMapping {
|
114
|
+
// This class is used to map a label to a node set
|
115
|
+
|
116
|
+
private:
|
117
|
+
struct hashf {
|
118
|
+
size_t operator()(const Label l) const {
|
119
|
+
return l.lower_char() | (l.upper_char() << 16);
|
120
|
+
}
|
121
|
+
};
|
122
|
+
struct equalf {
|
123
|
+
int operator()(const Label l1, const Label l2) const {
|
124
|
+
return l1==l2;
|
125
|
+
}
|
126
|
+
};
|
127
|
+
typedef hash_map<const Label, NodeSet, hashf, equalf> LabelMap;
|
128
|
+
LabelMap lm;
|
129
|
+
|
130
|
+
public:
|
131
|
+
LabelMapping(): lm(8) {};
|
132
|
+
typedef LabelMap::iterator iterator;
|
133
|
+
iterator begin() { return lm.begin(); };
|
134
|
+
iterator end() { return lm.end(); };
|
135
|
+
size_t size() { return lm.size(); };
|
136
|
+
iterator find( Label l) { return lm.find( l ); };
|
137
|
+
NodeSet &operator[]( const Label l ) { return lm.operator[]( l ); };
|
138
|
+
|
139
|
+
};
|
140
|
+
|
141
|
+
// Forward declaration; the definition follows further down in this file.
static void determinise_node( NodeArray&, Node*, Transducer*, NodeMapping&, long );
|
142
|
+
|
143
|
+
|
144
|
+
|
145
|
+
/*******************************************************************/
|
146
|
+
/* */
|
147
|
+
/* NodeSet::add */
|
148
|
+
/* */
|
149
|
+
/*******************************************************************/
|
150
|
+
|
151
|
+
void NodeSet::add( Node *node )

{
  // Stores node and, transitively, every node which is reachable from
  // it via epsilon transitions.  An explicit work list is used instead
  // of recursion so that long chains of epsilon transitions cannot
  // exhaust the call stack.
  vector<Node*> pending;
  pending.push_back(node);

  while (!pending.empty()) {
    Node *current = pending.back();
    pending.pop_back();

    // A node that is already in the set has had its epsilon
    // successors scheduled before; nothing more to do for it.
    if (!ht.insert(current).second)
      continue;

    // new node, schedule the nodes reachable with epsilon transitions
    for( ArcsIter p(current->arcs(),ArcsIter::eps); p; p++ ) {
      Arc *arc=p;
      if (!arc->label().is_epsilon())
        break;
      pending.push_back(arc->target_node());
    }
  }
}
|
165
|
+
|
166
|
+
|
167
|
+
/*******************************************************************/
|
168
|
+
/* */
|
169
|
+
/* NodeArray::NodeArray */
|
170
|
+
/* */
|
171
|
+
/*******************************************************************/
|
172
|
+
|
173
|
+
NodeArray::NodeArray( NodeSet &ns )
  : sizev(0), final(false), node(new Node*[ns.size()])

{
  // The buffer has room for every member of ns, but only the nodes
  // with at least one non-epsilon transition are stored.  The final
  // flag, in contrast, is collected from all members of the set.
  for( NodeSet::iterator cur=ns.begin(); cur!=ns.end(); ++cur ) {
    Node *member = *cur;
    if (member->arcs()->non_epsilon_transition_exists()) {
      node[sizev] = member;
      sizev++;
    }
    if (member->is_final())
      final = true;
  }

  // sort the stored node pointers into ascending order
  std::sort(node, node+sizev);
}
|
189
|
+
|
190
|
+
|
191
|
+
/*******************************************************************/
|
192
|
+
/* */
|
193
|
+
/* NodeMapping::~NodeMapping */
|
194
|
+
/* */
|
195
|
+
/*******************************************************************/
|
196
|
+
|
197
|
+
NodeMapping::~NodeMapping()

{
  // Every NodeArray key is taken out of the table before it is freed.
  // Deleting the NodeArrays while they are still stored in the table
  // would crash the system when the table itself is destroyed.
  iterator pos = hm.begin();
  while (pos != hm.end()) {
    iterator doomed = pos;
    NodeArray *key = doomed->first;
    ++pos;  // step forward while the current entry still exists
    hm.erase(doomed);
    delete key;
  }
}
|
209
|
+
|
210
|
+
|
211
|
+
/*******************************************************************/
|
212
|
+
/* */
|
213
|
+
/* compute_transitions */
|
214
|
+
/* */
|
215
|
+
/*******************************************************************/
|
216
|
+
|
217
|
+
static void compute_transitions( NodeArray &na, vector<Transition> &t )

{
  // Collects, for each label occurring on a non-epsilon arc of a node
  // in na, the set of target nodes, and appends one Transition per
  // label to t.  The NodeArray of each Transition is allocated with
  // new and is not released here.
  LabelMapping targets_by_label;

  // visit all nodes of the current set
  const size_t node_count = na.size();
  for( size_t idx=0; idx<node_count; ++idx ) {
    Node *source = na[idx];  // old node

    // add the target of every non-epsilon arc to the node set
    // which belongs to the label of the arc
    for( ArcsIter p(source->arcs(),ArcsIter::non_eps); p; p++ ) {
      Arc *arc=p;
      targets_by_label[arc->label()].add(arc->target_node());
    }
  }

  // turn each (label, node set) pair into a Transition
  t.reserve(targets_by_label.size());
  LabelMapping::iterator entry = targets_by_label.begin();
  while (entry != targets_by_label.end()) {
    NodeArray *reached = new NodeArray( entry->second );
    t.push_back(Transition(entry->first, reached));
    ++entry;
  }
}
|
238
|
+
|
239
|
+
|
240
|
+
/*******************************************************************/
|
241
|
+
/* */
|
242
|
+
/* determinise_node */
|
243
|
+
/* */
|
244
|
+
/*******************************************************************/
|
245
|
+
|
246
|
+
static void determinise_node( NodeArray &na, Node *node, Transducer *a,
                              NodeMapping &map, long depth )
{
  // Fills in `node`, the node of transducer a which represents the
  // node set na: sets its final flag and adds one arc per outgoing
  // label.  Target node sets that are not yet contained in map get a
  // new node in a and are processed recursively.

  // print the recursion depth once it becomes very large
  if (depth > 10000)
    fprintf(stderr,"\r%ld",depth);

  node->set_final(na.is_final());

  vector<Transition> t;
  compute_transitions( na, t );

  for( size_t i=0; i<t.size(); i++ ) {
    Transition &tr = t[i];
    NodeMapping::iterator known = map.find(tr.nodes);

    if (known != map.end()) {
      // The node set has been seen before: drop the duplicate array
      // and link to the node that is already mapped to it.
      delete tr.nodes;
      node->add_arc( tr.label, known->second, a );
      continue;
    }

    // new node set: map takes over the array as a key
    Node *target_node = a->new_node();
    map[tr.nodes] = target_node;
    node->add_arc( tr.label, target_node, a );
    determinise_node( *tr.nodes, target_node, a, map, depth+1 );
  }
}
|
271
|
+
|
272
|
+
|
273
|
+
/*******************************************************************/
|
274
|
+
/* */
|
275
|
+
/* Transducer::determinise */
|
276
|
+
/* */
|
277
|
+
/*******************************************************************/
|
278
|
+
|
279
|
+
Transducer &Transducer::determinise()

{
  // Builds a new transducer by the recursive node-set construction
  // in determinise_node().  The result is allocated with new and
  // returned by reference; it is not released in this function.

  // table from node sets of this transducer to nodes of the result
  NodeMapping map;

  Transducer *a = new Transducer();
  a->alphabet.copy(alphabet);

  // The initial node set holds the root node and everything that is
  // reachable from it via epsilon transitions.  The temporary NodeSet
  // is confined to its own scope and discarded right after use.
  NodeArray *start_array;
  {
    NodeSet start_set;
    start_set.add(root_node());
    start_array = new NodeArray(start_set);
  }

  // the initial node set corresponds to the root node of the result
  map[start_array] = a->root_node();

  // recursive construction of the remaining nodes and arcs
  determinise_node( *start_array, a->root_node(), a, map, 0);

  a->deterministic = 1;
  return *a;
}
|