ruby-sfst 0.4.3 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -0
- data/COPYING +280 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +54 -0
- data/README.md +1 -1
- data/Rakefile +9 -18
- data/bin/console +7 -0
- data/bin/setup +6 -0
- data/ext/sfst/alphabet.cc +879 -0
- data/ext/sfst/alphabet.h +302 -0
- data/ext/sfst/basic.cc +85 -0
- data/ext/{sfst_machine → sfst}/basic.h +7 -4
- data/ext/sfst/compact.cc +629 -0
- data/ext/sfst/compact.h +100 -0
- data/ext/sfst/determinise.cc +279 -0
- data/ext/{sfst_machine → sfst}/extconf.rb +2 -1
- data/ext/sfst/fst.cc +1150 -0
- data/ext/sfst/fst.h +374 -0
- data/ext/sfst/hopcroft.cc +681 -0
- data/ext/sfst/interface.cc +1921 -0
- data/ext/sfst/interface.h +171 -0
- data/ext/sfst/make-compact.cc +323 -0
- data/ext/{sfst_machine → sfst}/make-compact.h +15 -13
- data/ext/sfst/mem.h +80 -0
- data/ext/sfst/operators.cc +1273 -0
- data/ext/{sfst_machine → sfst}/sfst_machine.cc +89 -78
- data/ext/sfst/sgi.h +72 -0
- data/ext/sfst/utf8.cc +149 -0
- data/ext/{sfst_machine → sfst}/utf8.h +7 -4
- data/lib/sfst.rb +2 -1
- data/lib/sfst/version.rb +1 -1
- data/ruby-sfst.gemspec +23 -23
- metadata +107 -35
- data/ext/sfst_machine/alphabet.cc +0 -812
- data/ext/sfst_machine/alphabet.h +0 -273
- data/ext/sfst_machine/basic.cc +0 -84
- data/ext/sfst_machine/compact.cc +0 -616
- data/ext/sfst_machine/compact.h +0 -98
- data/ext/sfst_machine/determinise.cc +0 -303
- data/ext/sfst_machine/fst.cc +0 -1000
- data/ext/sfst_machine/fst.h +0 -369
- data/ext/sfst_machine/interface.cc +0 -1842
- data/ext/sfst_machine/interface.h +0 -93
- data/ext/sfst_machine/make-compact.cc +0 -327
- data/ext/sfst_machine/mem.h +0 -74
- data/ext/sfst_machine/operators.cc +0 -1131
- data/ext/sfst_machine/sgi.h +0 -44
- data/ext/sfst_machine/utf8.cc +0 -146
- data/test/test_sfst.fst +0 -3
- data/test/test_sfst.rb +0 -114
data/ext/sfst_machine/compact.h
DELETED
@@ -1,98 +0,0 @@
|
|
1
|
-
/*******************************************************************/
|
2
|
-
/* */
|
3
|
-
/* FILE compact.h */
|
4
|
-
/* MODULE compact */
|
5
|
-
/* PROGRAM SFST */
|
6
|
-
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
|
7
|
-
/* */
|
8
|
-
/* PURPOSE finite state tools */
|
9
|
-
/* */
|
10
|
-
/*******************************************************************/
|
11
|
-
|
12
|
-
#ifndef _COMPACT_H_
|
13
|
-
#define _COMPACT_H_
|
14
|
-
|
15
|
-
#include "alphabet.h"
|
16
|
-
|
17
|
-
#include <vector>
|
18
|
-
|
19
|
-
typedef std::vector<unsigned int> CAnalysis;
|
20
|
-
|
21
|
-
class CompactTransducer {
|
22
|
-
|
23
|
-
protected:
|
24
|
-
|
25
|
-
// the following data structures are used to store the nodes
|
26
|
-
|
27
|
-
unsigned int number_of_nodes; // number of nodes in the transducer
|
28
|
-
char *finalp; // finalp[i] is 1 if node i is final and 0 otherwise
|
29
|
-
unsigned int *first_arc; // first_arc[i] is the number of the first
|
30
|
-
// arc outgoing from node i
|
31
|
-
|
32
|
-
// the following data structures are used to store the transition arcs
|
33
|
-
|
34
|
-
unsigned int number_of_arcs; // total number of arcs in the transducer
|
35
|
-
Label *label; // the label (character pair) of arc i
|
36
|
-
unsigned int *target_node; // target node of arc i
|
37
|
-
|
38
|
-
// the following data structures are used to store the stochastic parameters
|
39
|
-
float *final_logprob;
|
40
|
-
float *arc_logprob;
|
41
|
-
|
42
|
-
// functions needed to read the transducer from a file
|
43
|
-
|
44
|
-
void read_finalp( FILE *file );
|
45
|
-
void read_first_arcs( FILE *file );
|
46
|
-
void read_target_nodes( FILE *file );
|
47
|
-
void read_labels( FILE *file );
|
48
|
-
void read_probs( FILE *file );
|
49
|
-
|
50
|
-
// functions needed to analyze data with the transducer
|
51
|
-
|
52
|
-
void analyze( unsigned int n, std::vector<Character> &ch, size_t ipos,
|
53
|
-
CAnalysis&, std::vector<CAnalysis>&);
|
54
|
-
|
55
|
-
// function selecting the simplest morphological analysis
|
56
|
-
|
57
|
-
int compute_score( CAnalysis &ana );
|
58
|
-
void disambiguate( std::vector<CAnalysis> &analyses );
|
59
|
-
|
60
|
-
// functions for longest-match analysis of input data
|
61
|
-
|
62
|
-
void longest_match2(unsigned int, char*, int, CAnalysis&, int&, CAnalysis&);
|
63
|
-
|
64
|
-
void convert( CAnalysis &cana, Analysis &ana );
|
65
|
-
|
66
|
-
public:
|
67
|
-
size_t node_count() { return number_of_nodes; };
|
68
|
-
size_t arc_count() { return number_of_arcs; };
|
69
|
-
|
70
|
-
bool both_layers; // print surface and analysis symbols
|
71
|
-
bool simplest_only; // print only the simplest analyses
|
72
|
-
|
73
|
-
Alphabet alphabet; // data structure which maps symbols to numeric codes
|
74
|
-
CompactTransducer(); // dummy constructor
|
75
|
-
CompactTransducer( FILE*, FILE *pfile=NULL ); // reads a (stochastic) transducer
|
76
|
-
~CompactTransducer(); // destroys a transducer
|
77
|
-
|
78
|
-
// the analysis function returns the set of analyses for the string "s"
|
79
|
-
// in the argument "analyses"
|
80
|
-
void analyze_string( char *s, std::vector<CAnalysis > &analyses );
|
81
|
-
|
82
|
-
void compute_probs( std::vector<CAnalysis> &analyses, std::vector<double> &prob );
|
83
|
-
char *print_analysis( CAnalysis &ana );
|
84
|
-
|
85
|
-
// longest-match analysis
|
86
|
-
const char *longest_match( char*& );
|
87
|
-
|
88
|
-
// EM training
|
89
|
-
bool train2( char *s, std::vector<double> &arcfreq, std::vector<double> &finalfreq );
|
90
|
-
bool train( char *s, std::vector<double> &arcfreq, std::vector<double> &finalfreq );
|
91
|
-
void estimate_probs( std::vector<double> &arcfreq, std::vector<double> &finalfreq );
|
92
|
-
|
93
|
-
// robust analysis
|
94
|
-
float robust_analyze_string( char *string, std::vector<CAnalysis> &analyses,
|
95
|
-
float ErrorsAllowed );
|
96
|
-
};
|
97
|
-
|
98
|
-
#endif
|
data/ext/sfst_machine/determinise.cc
DELETED
@@ -1,303 +0,0 @@
|
|
1
|
-
|
2
|
-
/*******************************************************************/
|
3
|
-
/* */
|
4
|
-
/* FILE determinise.C */
|
5
|
-
/* MODULE determinise */
|
6
|
-
/* PROGRAM SFST */
|
7
|
-
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
|
8
|
-
/* */
|
9
|
-
/*******************************************************************/
|
10
|
-
|
11
|
-
|
12
|
-
#include "fst.h"
|
13
|
-
|
14
|
-
using std::vector;
|
15
|
-
using std::pair;
|
16
|
-
using std::set;
|
17
|
-
|
18
|
-
/***************** class NodeSet *********************************/
|
19
|
-
|
20
|
-
class NodeSet {
|
21
|
-
// This class is used to store a set of nodes.
|
22
|
-
// Whenever a new node is added, all nodes accessible
|
23
|
-
// through epsilon transitions are added as well.
|
24
|
-
|
25
|
-
private:
|
26
|
-
set<Node*> ht;
|
27
|
-
|
28
|
-
public:
|
29
|
-
typedef set<Node*>::iterator iterator;
|
30
|
-
NodeSet() {};
|
31
|
-
void add( Node* );
|
32
|
-
bool insert(Node *node) {
|
33
|
-
pair<iterator, bool> result = ht.insert(node);
|
34
|
-
return result.second;
|
35
|
-
};
|
36
|
-
iterator begin() const { return ht.begin(); }
|
37
|
-
iterator end() const { return ht.end(); }
|
38
|
-
size_t size() const { return ht.size(); }
|
39
|
-
void clear() { ht.clear(); }
|
40
|
-
};
|
41
|
-
|
42
|
-
|
43
|
-
/***************** class NodeArray *******************************/
|
44
|
-
|
45
|
-
class NodeArray {
|
46
|
-
|
47
|
-
private:
|
48
|
-
size_t sizev;
|
49
|
-
bool final;
|
50
|
-
Node **node;
|
51
|
-
|
52
|
-
public:
|
53
|
-
NodeArray( NodeSet& );
|
54
|
-
~NodeArray() { delete[] node; };
|
55
|
-
size_t size() const { return sizev; }
|
56
|
-
bool is_final() const { return final; };
|
57
|
-
Node* &operator[]( int i ) const { return node[i]; }
|
58
|
-
};
|
59
|
-
|
60
|
-
|
61
|
-
/***************** class Transition ******************************/
|
62
|
-
|
63
|
-
class Transition {
|
64
|
-
public:
|
65
|
-
Label label;
|
66
|
-
NodeArray *nodes;
|
67
|
-
Transition(Label l, NodeArray *na) { label = l; nodes = na; };
|
68
|
-
};
|
69
|
-
|
70
|
-
|
71
|
-
/***************** class NodeMapping ****************************/
|
72
|
-
|
73
|
-
class NodeMapping {
|
74
|
-
// This class is used to map a node set from one transducer
|
75
|
-
// to a single node in another transducer
|
76
|
-
|
77
|
-
private:
|
78
|
-
struct hashf {
|
79
|
-
size_t operator()(const NodeArray *na) const {
|
80
|
-
size_t key=na->size() ^ na->is_final();
|
81
|
-
for( size_t i=0; i<na->size(); i++)
|
82
|
-
key = (key<<1) ^ (size_t)(*na)[i];
|
83
|
-
return key;
|
84
|
-
}
|
85
|
-
};
|
86
|
-
struct equalf {
|
87
|
-
int operator()(const NodeArray *na1, const NodeArray *na2) const {
|
88
|
-
if (na1->size() != na2->size() || na1->is_final() != na2->is_final())
|
89
|
-
return 0;
|
90
|
-
for( size_t i=0; i<na1->size(); i++)
|
91
|
-
if ((*na1)[i] != (*na2)[i])
|
92
|
-
return 0;
|
93
|
-
return 1;
|
94
|
-
}
|
95
|
-
};
|
96
|
-
typedef hash_map<NodeArray*, Node*, hashf, equalf> NodeMap;
|
97
|
-
NodeMap hm;
|
98
|
-
|
99
|
-
public:
|
100
|
-
typedef NodeMap::iterator iterator;
|
101
|
-
~NodeMapping();
|
102
|
-
iterator begin() { return hm.begin(); };
|
103
|
-
iterator end() { return hm.end(); };
|
104
|
-
iterator find( NodeArray *na) { return hm.find( na ); };
|
105
|
-
Node* &operator[]( NodeArray *na ) { return hm.operator[](na); };
|
106
|
-
|
107
|
-
};
|
108
|
-
|
109
|
-
|
110
|
-
/***************** class LabelMapping ****************************/
|
111
|
-
|
112
|
-
class LabelMapping {
|
113
|
-
// This class is used to map a label to a node set
|
114
|
-
|
115
|
-
private:
|
116
|
-
struct hashf {
|
117
|
-
size_t operator()(const Label l) const {
|
118
|
-
return l.lower_char() | (l.upper_char() << 16);
|
119
|
-
}
|
120
|
-
};
|
121
|
-
struct equalf {
|
122
|
-
int operator()(const Label l1, const Label l2) const {
|
123
|
-
return l1==l2;
|
124
|
-
}
|
125
|
-
};
|
126
|
-
typedef hash_map<const Label, NodeSet, hashf, equalf> LabelMap;
|
127
|
-
LabelMap lm;
|
128
|
-
|
129
|
-
public:
|
130
|
-
LabelMapping(): lm(8) {};
|
131
|
-
typedef LabelMap::iterator iterator;
|
132
|
-
iterator begin() { return lm.begin(); };
|
133
|
-
iterator end() { return lm.end(); };
|
134
|
-
size_t size() { return lm.size(); };
|
135
|
-
iterator find( Label l) { return lm.find( l ); };
|
136
|
-
NodeSet &operator[]( const Label l ) { return lm.operator[]( l ); };
|
137
|
-
|
138
|
-
};
|
139
|
-
|
140
|
-
static void determinise_node( NodeArray&, Node*, Transducer*, NodeMapping&, long );
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
/*******************************************************************/
|
145
|
-
/* */
|
146
|
-
/* NodeSet::add */
|
147
|
-
/* */
|
148
|
-
/*******************************************************************/
|
149
|
-
|
150
|
-
void NodeSet::add( Node *node )
|
151
|
-
|
152
|
-
{
|
153
|
-
pair<iterator, bool> result = ht.insert(node);
|
154
|
-
if (result.second) {
|
155
|
-
// new node, add nodes reachable with epsilon transitions
|
156
|
-
for( ArcsIter p(node->arcs(),ArcsIter::eps); p; p++ ) {
|
157
|
-
Arc *arc=p;
|
158
|
-
if (!arc->label().is_epsilon())
|
159
|
-
break;
|
160
|
-
add(arc->target_node());
|
161
|
-
}
|
162
|
-
}
|
163
|
-
}
|
164
|
-
|
165
|
-
|
166
|
-
/*******************************************************************/
|
167
|
-
/* */
|
168
|
-
/* NodeArray::NodeArray */
|
169
|
-
/* */
|
170
|
-
/*******************************************************************/
|
171
|
-
|
172
|
-
NodeArray::NodeArray( NodeSet &ns )
|
173
|
-
|
174
|
-
{
|
175
|
-
sizev = 0;
|
176
|
-
NodeSet::iterator it;
|
177
|
-
|
178
|
-
final = false;
|
179
|
-
node = new Node*[ns.size()];
|
180
|
-
for( it=ns.begin(); it!=ns.end(); it++ ) {
|
181
|
-
Node *nn = *it;
|
182
|
-
if (nn->arcs()->non_epsilon_transition_exists())
|
183
|
-
node[sizev++] = nn;
|
184
|
-
final |= nn->is_final();
|
185
|
-
}
|
186
|
-
std::sort(node, node+sizev);
|
187
|
-
}
|
188
|
-
|
189
|
-
|
190
|
-
/*******************************************************************/
|
191
|
-
/* */
|
192
|
-
/* NodeMapping::~NodeMapping */
|
193
|
-
/* */
|
194
|
-
/*******************************************************************/
|
195
|
-
|
196
|
-
NodeMapping::~NodeMapping()
|
197
|
-
|
198
|
-
{
|
199
|
-
// if we delete NodeArrays without removing them from NodeMapping,
|
200
|
-
// the system will crash when NodeMapping is deleted.
|
201
|
-
for( iterator it=hm.begin(); it!=hm.end(); ) {
|
202
|
-
NodeArray *na=it->first;
|
203
|
-
iterator old = it++;
|
204
|
-
hm.erase(old);
|
205
|
-
delete na;
|
206
|
-
}
|
207
|
-
}
|
208
|
-
|
209
|
-
|
210
|
-
/*******************************************************************/
|
211
|
-
/* */
|
212
|
-
/* compute_transitions */
|
213
|
-
/* */
|
214
|
-
/*******************************************************************/
|
215
|
-
|
216
|
-
static void compute_transitions( NodeArray &na, vector<Transition> &t )
|
217
|
-
|
218
|
-
{
|
219
|
-
LabelMapping lmap;
|
220
|
-
|
221
|
-
// for all nodes in the current set
|
222
|
-
for( size_t i=0; i<na.size(); i++) {
|
223
|
-
Node *n = na[i]; // old node
|
224
|
-
|
225
|
-
// For each non-epsilon transition, add the target node
|
226
|
-
// to the respective node set.
|
227
|
-
for( ArcsIter p(n->arcs(),ArcsIter::non_eps); p; p++ ) {
|
228
|
-
Arc *arc=p;
|
229
|
-
lmap[arc->label()].add(arc->target_node());
|
230
|
-
}
|
231
|
-
}
|
232
|
-
|
233
|
-
t.reserve(lmap.size());
|
234
|
-
for( LabelMapping::iterator it=lmap.begin(); it!=lmap.end(); it++ )
|
235
|
-
t.push_back(Transition(it->first, new NodeArray( it->second )));
|
236
|
-
}
|
237
|
-
|
238
|
-
|
239
|
-
/*******************************************************************/
|
240
|
-
/* */
|
241
|
-
/* determinise_node */
|
242
|
-
/* */
|
243
|
-
/*******************************************************************/
|
244
|
-
|
245
|
-
static void determinise_node( NodeArray &na, Node *node, Transducer *a,
|
246
|
-
NodeMapping &map, long depth )
|
247
|
-
{
|
248
|
-
if (depth > 10000)
|
249
|
-
fprintf(stderr,"\r%ld",depth);
|
250
|
-
node->set_final(na.is_final());
|
251
|
-
|
252
|
-
vector<Transition> t;
|
253
|
-
compute_transitions( na, t );
|
254
|
-
|
255
|
-
for( size_t i=0; i<t.size(); i++ ) {
|
256
|
-
NodeMapping::iterator it=map.find(t[i].nodes);
|
257
|
-
if (it == map.end()) {
|
258
|
-
// new node set
|
259
|
-
Node *target_node = a->new_node();
|
260
|
-
map[t[i].nodes] = target_node;
|
261
|
-
node->add_arc( t[i].label, target_node, a );
|
262
|
-
determinise_node( *t[i].nodes, target_node, a, map, depth+1 );
|
263
|
-
}
|
264
|
-
else {
|
265
|
-
delete t[i].nodes;
|
266
|
-
node->add_arc( t[i].label, it->second, a );
|
267
|
-
}
|
268
|
-
}
|
269
|
-
}
|
270
|
-
|
271
|
-
|
272
|
-
/*******************************************************************/
|
273
|
-
/* */
|
274
|
-
/* Transducer::determinise */
|
275
|
-
/* */
|
276
|
-
/*******************************************************************/
|
277
|
-
|
278
|
-
Transducer &Transducer::determinise()
|
279
|
-
|
280
|
-
{
|
281
|
-
// initialisations
|
282
|
-
NodeMapping map;
|
283
|
-
|
284
|
-
Transducer *a = new Transducer();
|
285
|
-
a->alphabet.copy(alphabet);
|
286
|
-
|
287
|
-
// creation of the initial node set consisting of all nodes
|
288
|
-
// reachable from the start node via epsilon transitions.
|
289
|
-
NodeArray *na;
|
290
|
-
{
|
291
|
-
NodeSet ns;
|
292
|
-
ns.add(root_node());
|
293
|
-
na = new NodeArray(ns);
|
294
|
-
}
|
295
|
-
|
296
|
-
// map the node set to the new root node
|
297
|
-
map[na] = a->root_node();
|
298
|
-
|
299
|
-
// determinise the transducer recursively
|
300
|
-
determinise_node( *na, a->root_node(), a, map, 0);
|
301
|
-
a->deterministic = 1;
|
302
|
-
return *a;
|
303
|
-
}
|
data/ext/sfst_machine/fst.cc
DELETED
@@ -1,1000 +0,0 @@
|
|
1
|
-
|
2
|
-
/*******************************************************************/
|
3
|
-
/* */
|
4
|
-
/* FILE fst.C */
|
5
|
-
/* MODULE fst */
|
6
|
-
/* PROGRAM SFST */
|
7
|
-
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
|
8
|
-
/* */
|
9
|
-
/* PURPOSE basic FST functions */
|
10
|
-
/* */
|
11
|
-
/*******************************************************************/
|
12
|
-
|
13
|
-
#include "fst.h"
|
14
|
-
|
15
|
-
using std::vector;
|
16
|
-
using std::istream;
|
17
|
-
using std::ostream;
|
18
|
-
using std::cerr;
|
19
|
-
|
20
|
-
const int BUFFER_SIZE=100000;
|
21
|
-
|
22
|
-
|
23
|
-
/*******************************************************************/
|
24
|
-
/* */
|
25
|
-
/* Arcs::size */
|
26
|
-
/* */
|
27
|
-
/*******************************************************************/
|
28
|
-
|
29
|
-
int Arcs::size() const
|
30
|
-
|
31
|
-
{
|
32
|
-
int n=0;
|
33
|
-
for( Arc *p=first_arcp; p; p=p->next ) n++;
|
34
|
-
for( Arc *p=first_epsilon_arcp; p; p=p->next ) n++;
|
35
|
-
return n;
|
36
|
-
}
|
37
|
-
|
38
|
-
|
39
|
-
/*******************************************************************/
|
40
|
-
/* */
|
41
|
-
/* Arcs::target_node */
|
42
|
-
/* */
|
43
|
-
/*******************************************************************/
|
44
|
-
|
45
|
-
Node *Arcs::target_node( Label l )
|
46
|
-
|
47
|
-
{
|
48
|
-
Arc *arc;
|
49
|
-
|
50
|
-
for( arc=first_arcp; arc; arc=arc->next)
|
51
|
-
if (arc->label() == l)
|
52
|
-
return arc->target_node();
|
53
|
-
|
54
|
-
return NULL;
|
55
|
-
}
|
56
|
-
|
57
|
-
const Node *Arcs::target_node( Label l ) const
|
58
|
-
|
59
|
-
{
|
60
|
-
const Arc *arc;
|
61
|
-
|
62
|
-
for( arc=first_arcp; arc; arc=arc->next)
|
63
|
-
if (arc->label() == l)
|
64
|
-
return arc->target_node();
|
65
|
-
|
66
|
-
return NULL;
|
67
|
-
}
|
68
|
-
|
69
|
-
|
70
|
-
/*******************************************************************/
|
71
|
-
/* */
|
72
|
-
/* Arcs::add_arc */
|
73
|
-
/* */
|
74
|
-
/*******************************************************************/
|
75
|
-
|
76
|
-
void Arcs::add_arc( Label l, Node *node, Transducer *a )
|
77
|
-
|
78
|
-
{
|
79
|
-
Arc *arc=a->new_arc( l, node );
|
80
|
-
|
81
|
-
if (l.is_epsilon()) {
|
82
|
-
arc->next = first_epsilon_arcp;
|
83
|
-
first_epsilon_arcp = arc;
|
84
|
-
}
|
85
|
-
else {
|
86
|
-
arc->next = first_arcp;
|
87
|
-
first_arcp = arc;
|
88
|
-
}
|
89
|
-
}
|
90
|
-
|
91
|
-
|
92
|
-
/*******************************************************************/
|
93
|
-
/* */
|
94
|
-
/* Arcs::remove_arc */
|
95
|
-
/* */
|
96
|
-
/*******************************************************************/
|
97
|
-
|
98
|
-
int Arcs::remove_arc( Arc *arc )
|
99
|
-
|
100
|
-
{
|
101
|
-
Arc **p = (arc->label().is_epsilon()) ? &first_epsilon_arcp : &first_arcp;
|
102
|
-
for( ; *p; p=&(*p)->next )
|
103
|
-
if (*p == arc) {
|
104
|
-
*p = arc->next;
|
105
|
-
return 1;
|
106
|
-
}
|
107
|
-
return 0;
|
108
|
-
}
|
109
|
-
|
110
|
-
|
111
|
-
/*******************************************************************/
|
112
|
-
/* */
|
113
|
-
/* Node::init */
|
114
|
-
/* */
|
115
|
-
/*******************************************************************/
|
116
|
-
|
117
|
-
void Node::init()
|
118
|
-
|
119
|
-
{
|
120
|
-
final = false;
|
121
|
-
visited = 0;
|
122
|
-
arcsp.init();
|
123
|
-
forwardp = NULL;
|
124
|
-
}
|
125
|
-
|
126
|
-
|
127
|
-
/*******************************************************************/
|
128
|
-
/* */
|
129
|
-
/* Node::clear_visited */
|
130
|
-
/* */
|
131
|
-
/*******************************************************************/
|
132
|
-
|
133
|
-
void Node::clear_visited( NodeHashSet &nodeset )
|
134
|
-
|
135
|
-
{
|
136
|
-
if (nodeset.find( this ) == nodeset.end()) {
|
137
|
-
visited = 0;
|
138
|
-
nodeset.insert( this );
|
139
|
-
fprintf(stderr," %lu", nodeset.size());
|
140
|
-
for( ArcsIter p(arcs()); p; p++ ) {
|
141
|
-
Arc *arc=p;
|
142
|
-
arc->target_node()->clear_visited( nodeset );
|
143
|
-
}
|
144
|
-
}
|
145
|
-
}
|
146
|
-
|
147
|
-
|
148
|
-
/*******************************************************************/
|
149
|
-
/* */
|
150
|
-
/* NodeNumbering::number_node */
|
151
|
-
/* */
|
152
|
-
/*******************************************************************/
|
153
|
-
|
154
|
-
void NodeNumbering::number_node( Node *node, Transducer &a )
|
155
|
-
|
156
|
-
{
|
157
|
-
if (!node->was_visited( a.vmark )) {
|
158
|
-
nummap[node] = nodes.size();
|
159
|
-
nodes.push_back(node);
|
160
|
-
for( ArcsIter p(node->arcs()); p; p++ ) {
|
161
|
-
Arc *arc=p;
|
162
|
-
number_node( arc->target_node(), a );
|
163
|
-
}
|
164
|
-
}
|
165
|
-
}
|
166
|
-
|
167
|
-
|
168
|
-
/*******************************************************************/
|
169
|
-
/* */
|
170
|
-
/* NodeNumbering::NodeNumbering */
|
171
|
-
/* */
|
172
|
-
/*******************************************************************/
|
173
|
-
|
174
|
-
NodeNumbering::NodeNumbering( Transducer &a )
|
175
|
-
|
176
|
-
{
|
177
|
-
a.incr_vmark();
|
178
|
-
number_node( a.root_node(), a );
|
179
|
-
}
|
180
|
-
|
181
|
-
|
182
|
-
/*******************************************************************/
|
183
|
-
/* */
|
184
|
-
/* Transducer::new_node */
|
185
|
-
/* */
|
186
|
-
/*******************************************************************/
|
187
|
-
|
188
|
-
Node *Transducer::new_node()
|
189
|
-
|
190
|
-
{
|
191
|
-
Node *node=(Node*)mem.alloc( sizeof(Node) );
|
192
|
-
|
193
|
-
node->init();
|
194
|
-
return node;
|
195
|
-
}
|
196
|
-
|
197
|
-
|
198
|
-
/*******************************************************************/
|
199
|
-
/* */
|
200
|
-
/* Transducer::new_arc */
|
201
|
-
/* */
|
202
|
-
/*******************************************************************/
|
203
|
-
|
204
|
-
Arc *Transducer::new_arc( Label l, Node *target )
|
205
|
-
|
206
|
-
{
|
207
|
-
Arc *arc=(Arc*)mem.alloc( sizeof(Arc) );
|
208
|
-
|
209
|
-
arc->init( l, target);
|
210
|
-
return arc;
|
211
|
-
}
|
212
|
-
|
213
|
-
|
214
|
-
/*******************************************************************/
|
215
|
-
/* */
|
216
|
-
/* Transducer::add_string */
|
217
|
-
/* */
|
218
|
-
/*******************************************************************/
|
219
|
-
|
220
|
-
void Transducer::add_string( char *s, bool extended, Alphabet *a )
|
221
|
-
|
222
|
-
{
|
223
|
-
if (a == NULL)
|
224
|
-
a = &alphabet;
|
225
|
-
|
226
|
-
Node *node=root_node();
|
227
|
-
Label l;
|
228
|
-
while (!(l = a->next_label(s, extended)).is_epsilon()) {
|
229
|
-
a->insert(l);
|
230
|
-
Arcs *arcs=node->arcs();
|
231
|
-
node = arcs->target_node( l );
|
232
|
-
if (node == NULL) {
|
233
|
-
node = new_node();
|
234
|
-
arcs->add_arc( l, node, this );
|
235
|
-
}
|
236
|
-
}
|
237
|
-
node->set_final(1);
|
238
|
-
}
|
239
|
-
|
240
|
-
|
241
|
-
/*******************************************************************/
|
242
|
-
/* */
|
243
|
-
/* Transducer::Transducer */
|
244
|
-
/* */
|
245
|
-
/*******************************************************************/
|
246
|
-
|
247
|
-
Transducer::Transducer( vector<Label> &path )
|
248
|
-
: root(), mem()
|
249
|
-
{
|
250
|
-
Node *node=root_node();
|
251
|
-
|
252
|
-
vmark = 0;
|
253
|
-
deterministic = minimised = true;
|
254
|
-
for( size_t i=0; i<path.size(); i++ ) {
|
255
|
-
Arcs *arcs=node->arcs();
|
256
|
-
node = new_node();
|
257
|
-
arcs->add_arc( path[i], node, this );
|
258
|
-
}
|
259
|
-
node->set_final(1);
|
260
|
-
}
|
261
|
-
|
262
|
-
|
263
|
-
/*******************************************************************/
|
264
|
-
/* */
|
265
|
-
/* Transducer::Transducer */
|
266
|
-
/* */
|
267
|
-
/*******************************************************************/
|
268
|
-
|
269
|
-
Transducer::Transducer( istream &is, const Alphabet *a, bool verbose )
|
270
|
-
: root(), mem()
|
271
|
-
{
|
272
|
-
bool extended=false;
|
273
|
-
int n=0;
|
274
|
-
char buffer[10000];
|
275
|
-
|
276
|
-
vmark = 0;
|
277
|
-
deterministic = true;
|
278
|
-
minimised = false;
|
279
|
-
if (a) {
|
280
|
-
alphabet.copy(*a);
|
281
|
-
extended = true;
|
282
|
-
}
|
283
|
-
while (is.getline(buffer, 10000)) {
|
284
|
-
if (verbose && ++n % 10000 == 0) {
|
285
|
-
if (n == 10000)
|
286
|
-
cerr << "\n";
|
287
|
-
cerr << "\r" << n << " words";
|
288
|
-
}
|
289
|
-
// delete final whitespace characters
|
290
|
-
int l;
|
291
|
-
for( l=strlen(buffer)-1; l>=0; l-- )
|
292
|
-
if ((buffer[l] != ' ' && buffer[l] != '\t' && buffer[l] != '\r') ||
|
293
|
-
(l > 0 && buffer[l-1] == '\\'))
|
294
|
-
break;
|
295
|
-
buffer[l+1] = 0;
|
296
|
-
|
297
|
-
add_string(buffer, extended);
|
298
|
-
}
|
299
|
-
if (verbose && n >= 10000)
|
300
|
-
cerr << "\n";
|
301
|
-
}
|
302
|
-
|
303
|
-
|
304
|
-
/*******************************************************************/
|
305
|
-
/* */
|
306
|
-
/* Transducer::Transducer */
|
307
|
-
/* */
|
308
|
-
/*******************************************************************/
|
309
|
-
|
310
|
-
Transducer::Transducer( char *s, const Alphabet *a, bool extended )
|
311
|
-
: root(), mem()
|
312
|
-
{
|
313
|
-
vmark = 0;
|
314
|
-
deterministic = minimised = true;
|
315
|
-
if (a)
|
316
|
-
alphabet.copy(*a);
|
317
|
-
add_string(s, extended);
|
318
|
-
}
|
319
|
-
|
320
|
-
|
321
|
-
/*******************************************************************/
|
322
|
-
/* */
|
323
|
-
/* Transducer::clear */
|
324
|
-
/* */
|
325
|
-
/*******************************************************************/
|
326
|
-
|
327
|
-
void Transducer::clear()
|
328
|
-
|
329
|
-
{
|
330
|
-
vmark = 0;
|
331
|
-
deterministic = minimised = false;
|
332
|
-
root.init();
|
333
|
-
mem.clear();
|
334
|
-
alphabet.clear();
|
335
|
-
}
|
336
|
-
|
337
|
-
|
338
|
-
/*******************************************************************/
|
339
|
-
/* */
|
340
|
-
/* Transducer::store_symbols */
|
341
|
-
/* */
|
342
|
-
/*******************************************************************/
|
343
|
-
|
344
|
-
void Transducer::store_symbols(Node *node, SymbolMap &symbol, LabelSet &labels)
|
345
|
-
|
346
|
-
{
|
347
|
-
if (!node->was_visited( vmark )) {
|
348
|
-
Arcs *arcs=node->arcs();
|
349
|
-
for( ArcsIter p(arcs); p; p++ ) {
|
350
|
-
Arc *arc=p;
|
351
|
-
Label l=arc->label();
|
352
|
-
|
353
|
-
labels.insert(l);
|
354
|
-
|
355
|
-
Character c = l.upper_char();
|
356
|
-
if (symbol.find(c) == symbol.end()) {
|
357
|
-
const char *s = alphabet.code2symbol(c);
|
358
|
-
if (s)
|
359
|
-
symbol[c] = fst_strdup(s);
|
360
|
-
}
|
361
|
-
|
362
|
-
c = l.lower_char();
|
363
|
-
if (symbol.find(c) == symbol.end()) {
|
364
|
-
const char *s = alphabet.code2symbol(c);
|
365
|
-
if (s)
|
366
|
-
symbol[c] = fst_strdup(s);
|
367
|
-
}
|
368
|
-
|
369
|
-
store_symbols( arc->target_node(), symbol, labels );
|
370
|
-
}
|
371
|
-
}
|
372
|
-
}
|
373
|
-
|
374
|
-
|
375
|
-
/*******************************************************************/
|
376
|
-
/* */
|
377
|
-
/* Transducer::minimise_alphabet */
|
378
|
-
/* */
|
379
|
-
/*******************************************************************/
|
380
|
-
|
381
|
-
void Transducer::minimise_alphabet()
|
382
|
-
|
383
|
-
{
|
384
|
-
SymbolMap symbols;
|
385
|
-
LabelSet labels;
|
386
|
-
incr_vmark();
|
387
|
-
store_symbols(root_node(), symbols, labels);
|
388
|
-
alphabet.clear();
|
389
|
-
for( SymbolMap::iterator it=symbols.begin(); it!=symbols.end(); it++ ) {
|
390
|
-
alphabet.add_symbol( it->second, it->first );
|
391
|
-
free(it->second);
|
392
|
-
}
|
393
|
-
for( LabelSet::iterator it=labels.begin(); it!=labels.end(); it++ )
|
394
|
-
alphabet.insert(*it);
|
395
|
-
}
|
396
|
-
|
397
|
-
|
398
|
-
/*******************************************************************/
|
399
|
-
/* */
|
400
|
-
/* Transducer::minimise */
|
401
|
-
/* */
|
402
|
-
/*******************************************************************/
|
403
|
-
|
404
|
-
Transducer &Transducer::minimise( bool verbose )
|
405
|
-
|
406
|
-
{
|
407
|
-
if (minimised)
|
408
|
-
return copy();
|
409
|
-
|
410
|
-
Transducer *a1, *a2;
|
411
|
-
|
412
|
-
a1 = &reverse();
|
413
|
-
a2 = &a1->determinise();
|
414
|
-
delete a1;
|
415
|
-
|
416
|
-
a1 = &a2->reverse();
|
417
|
-
delete a2;
|
418
|
-
|
419
|
-
a2 = &a1->determinise();
|
420
|
-
delete a1;
|
421
|
-
|
422
|
-
a2->minimised = true;
|
423
|
-
a2->minimise_alphabet();
|
424
|
-
|
425
|
-
return *a2;
|
426
|
-
}
|
427
|
-
|
428
|
-
|
429
|
-
/*******************************************************************/
|
430
|
-
/* */
|
431
|
-
/* Transducer::enumerate_paths_node */
|
432
|
-
/* */
|
433
|
-
/*******************************************************************/
|
434
|
-
|
435
|
-
void Transducer::enumerate_paths_node( Node *node, vector<Label> &path,
|
436
|
-
NodeHashSet &previous,
|
437
|
-
vector<Transducer*> &result )
|
438
|
-
{
|
439
|
-
if (node->is_final())
|
440
|
-
result.push_back(new Transducer(path));
|
441
|
-
|
442
|
-
for( ArcsIter it_arc(node->arcs()); it_arc; it_arc++ ) {
|
443
|
-
Arc *arc=it_arc;
|
444
|
-
|
445
|
-
NodeHashSet::iterator it_node=previous.insert(node).first;
|
446
|
-
path.push_back(arc->label());
|
447
|
-
enumerate_paths_node( arc->target_node(), path, previous, result );
|
448
|
-
path.pop_back();
|
449
|
-
previous.erase(it_node);
|
450
|
-
}
|
451
|
-
}
|
452
|
-
|
453
|
-
|
454
|
-
/*******************************************************************/
|
455
|
-
/* */
|
456
|
-
/* Transducer::enumerate_paths */
|
457
|
-
/* */
|
458
|
-
/*******************************************************************/
|
459
|
-
|
460
|
-
bool Transducer::enumerate_paths( vector<Transducer*> &result )
|
461
|
-
|
462
|
-
{
|
463
|
-
if (is_infinitely_ambiguous())
|
464
|
-
return true;
|
465
|
-
for( size_t i=0; i<result.size(); i++ )
|
466
|
-
delete result[i];
|
467
|
-
result.clear();
|
468
|
-
|
469
|
-
vector<Label> path;
|
470
|
-
NodeHashSet previous;
|
471
|
-
enumerate_paths_node( root_node(), path, previous, result );
|
472
|
-
return false;
|
473
|
-
}
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
/*******************************************************************/
|
479
|
-
/* */
|
480
|
-
/* Transducer::print_strings_node */
|
481
|
-
/* */
|
482
|
-
/*******************************************************************/
|
483
|
-
|
484
|
-
int Transducer::print_strings_node(Node *node, char *buffer, int pos,
|
485
|
-
FILE *file, bool with_brackets )
|
486
|
-
{
|
487
|
-
int result = 0;
|
488
|
-
|
489
|
-
if (node->was_visited( vmark )) {
|
490
|
-
if (node->forward() != NULL) { // cycle detected
|
491
|
-
cerr << "Warning: cyclic analyses (cycle aborted)\n";
|
492
|
-
return 0;
|
493
|
-
}
|
494
|
-
node->set_forward(node); // used like a flag for loop detection
|
495
|
-
}
|
496
|
-
if (pos == BUFFER_SIZE)
|
497
|
-
throw "Output string in function print_strings_node is too long";
|
498
|
-
if (node->is_final()) {
|
499
|
-
buffer[pos] = '\0';
|
500
|
-
fprintf(file,"%s\n", buffer);
|
501
|
-
result = 1;
|
502
|
-
}
|
503
|
-
for( ArcsIter i(node->arcs()); i; i++ ) {
|
504
|
-
int p=pos;
|
505
|
-
Arc *arc=i;
|
506
|
-
Label l=arc->label();
|
507
|
-
alphabet.write_label(l, buffer, &p, with_brackets);
|
508
|
-
result |= print_strings_node(arc->target_node(), buffer, p,
|
509
|
-
file, with_brackets );
|
510
|
-
}
|
511
|
-
node->set_forward(NULL);
|
512
|
-
|
513
|
-
return result;
|
514
|
-
}
|
515
|
-
|
516
|
-
|
517
|
-
/*******************************************************************/
|
518
|
-
/* */
|
519
|
-
/* Transducer::print_strings */
|
520
|
-
/* */
|
521
|
-
/*******************************************************************/
|
522
|
-
|
523
|
-
int Transducer::print_strings( FILE *file, bool with_brackets )
|
524
|
-
|
525
|
-
{
|
526
|
-
char buffer[BUFFER_SIZE];
|
527
|
-
incr_vmark();
|
528
|
-
return print_strings_node( root_node(), buffer, 0, file, with_brackets );
|
529
|
-
}
|
530
|
-
|
531
|
-
|
532
|
-
/*******************************************************************/
|
533
|
-
/* */
|
534
|
-
/* Transducer::analyze_string */
|
535
|
-
/* */
|
536
|
-
/*******************************************************************/
|
537
|
-
|
538
|
-
bool Transducer::analyze_string( char *string, FILE *file, bool with_brackets )
|
539
|
-
|
540
|
-
{
|
541
|
-
vector<Character> input;
|
542
|
-
alphabet.string2symseq( string, input );
|
543
|
-
vector<Label> labels;
|
544
|
-
for( size_t i=0; i<input.size(); i++ )
|
545
|
-
labels.push_back(Label(input[i]));
|
546
|
-
|
547
|
-
Transducer a1(labels);
|
548
|
-
Transducer *a2=&(*this || a1);
|
549
|
-
Transducer *a3=&(a2->lower_level());
|
550
|
-
delete a2;
|
551
|
-
a2 = &a3->minimise();
|
552
|
-
delete a3;
|
553
|
-
|
554
|
-
a2->alphabet.copy(alphabet);
|
555
|
-
bool result = a2->print_strings( file, with_brackets );
|
556
|
-
delete a2;
|
557
|
-
return result;
|
558
|
-
}
|
559
|
-
|
560
|
-
|
561
|
-
/*******************************************************************/
|
562
|
-
/* */
|
563
|
-
/* Transducer::generate_string */
|
564
|
-
/* */
|
565
|
-
/*******************************************************************/
|
566
|
-
|
567
|
-
bool Transducer::generate_string( char *string, FILE *file, bool with_brackets)
|
568
|
-
|
569
|
-
{
|
570
|
-
Transducer a1(string, &alphabet, false);
|
571
|
-
Transducer *a2=&(a1 || *this);
|
572
|
-
Transducer *a3=&(a2->upper_level());
|
573
|
-
delete a2;
|
574
|
-
a2 = &a3->minimise();
|
575
|
-
delete a3;
|
576
|
-
|
577
|
-
a2->alphabet.copy(alphabet);
|
578
|
-
bool result = a2->print_strings( file, with_brackets );
|
579
|
-
delete a2;
|
580
|
-
return result;
|
581
|
-
}
|
582
|
-
|
583
|
-
|
584
|
-
/*******************************************************************/
|
585
|
-
/* */
|
586
|
-
/* complete */
|
587
|
-
/* */
|
588
|
-
/*******************************************************************/
|
589
|
-
|
590
|
-
// Walks the nodes reachable from "node" and inserts the label of every
// non-epsilon arc into "alphabet". Nodes for which was_visited(vmark)
// already reports true are not expanded again.
static void complete( Node *node, Alphabet &alphabet, int vmark)
{
  if (!node->was_visited( vmark )) {
    for( ArcsIter it(node->arcs()); it; it++ ) {
      Arc *current = it;
      const bool epsilon = current->label().is_epsilon();
      if (!epsilon)
        alphabet.insert(current->label());
      complete(current->target_node(), alphabet, vmark);
    }
  }
}
|
602
|
-
|
603
|
-
|
604
|
-
/*******************************************************************/
|
605
|
-
/* */
|
606
|
-
/* Transducer::complete_alphabet */
|
607
|
-
/* */
|
608
|
-
/*******************************************************************/
|
609
|
-
|
610
|
-
void Transducer::complete_alphabet()
|
611
|
-
|
612
|
-
{
|
613
|
-
incr_vmark();
|
614
|
-
complete(root_node(), alphabet, vmark);
|
615
|
-
}
|
616
|
-
|
617
|
-
|
618
|
-
/*******************************************************************/
|
619
|
-
/* */
|
620
|
-
/* print_node */
|
621
|
-
/* */
|
622
|
-
/*******************************************************************/
|
623
|
-
|
624
|
-
// Writes a textual description of "node" and of everything reachable from
// it to the stream "s":
//   - one line "source<TAB>target<TAB>lower<TAB>upper" per outgoing arc
//   - one line holding just the node number if the node is final
// Node numbers are taken from "index", symbols are rendered by "abc".
static void print_node( ostream &s, Node *node, NodeNumbering &index,
                        long vmark, Alphabet &abc )
{
  if (node->was_visited( vmark ))
    return;

  Arcs *outgoing = node->arcs();

  // first pass: print all arcs of this node
  for( ArcsIter it(outgoing); it; it++ ) {
    Arc *a = it;
    s << index[node] << "\t" << index[a->target_node()];
    // the two symbols are deliberately written in separate statements,
    // exactly as in the original code
    s << "\t" << abc.write_char(a->label().lower_char());
    s << "\t" << abc.write_char(a->label().upper_char());
    s << "\n";
  }

  if (node->is_final())
    s << index[node] << "\n";

  // second pass: descend into the target nodes
  for( ArcsIter it(outgoing); it; it++ ) {
    Arc *a = it;
    print_node( s, a->target_node(), index, vmark, abc );
  }
}
|
645
|
-
|
646
|
-
|
647
|
-
/*******************************************************************/
|
648
|
-
/* */
|
649
|
-
/* operator<< */
|
650
|
-
/* */
|
651
|
-
/*******************************************************************/
|
652
|
-
|
653
|
-
// Writes the transducer "a" in text form to the stream "s" by printing
// the root node and everything reachable from it. Returns "s".
ostream &operator<<( ostream &s, Transducer &a )
{
  NodeNumbering numbering(a);

  a.incr_vmark();  // advance the visit mark before the traversal starts
  Node *start = a.root_node();
  print_node( s, start, numbering, a.vmark, a.alphabet );

  return s;
}
|
661
|
-
|
662
|
-
|
663
|
-
/*******************************************************************/
|
664
|
-
/* */
|
665
|
-
/* store_node_info */
|
666
|
-
/* */
|
667
|
-
/*******************************************************************/
|
668
|
-
|
669
|
-
static void store_node_info( FILE *file, Node *node )
|
670
|
-
|
671
|
-
{
|
672
|
-
// write final flag
|
673
|
-
char c=node->is_final();
|
674
|
-
fwrite(&c,sizeof(c),1,file);
|
675
|
-
|
676
|
-
// write the number of arcs
|
677
|
-
int nn = node->arcs()->size();
|
678
|
-
if (nn > 65535)
|
679
|
-
throw "Error: in function store_node\n";
|
680
|
-
unsigned short n=(unsigned short)nn;
|
681
|
-
fwrite(&n,sizeof(n),1,file);
|
682
|
-
}
|
683
|
-
|
684
|
-
|
685
|
-
/*******************************************************************/
|
686
|
-
/* */
|
687
|
-
/* store_arc_label */
|
688
|
-
/* */
|
689
|
-
/*******************************************************************/
|
690
|
-
|
691
|
-
static void store_arc_label( FILE *file, Arc *arc )
|
692
|
-
|
693
|
-
{
|
694
|
-
Label l=arc->label();
|
695
|
-
Character lc=l.lower_char();
|
696
|
-
Character uc=l.upper_char();
|
697
|
-
fwrite(&lc,sizeof(lc),1,file);
|
698
|
-
fwrite(&uc,sizeof(uc),1,file);
|
699
|
-
}
|
700
|
-
|
701
|
-
|
702
|
-
/*******************************************************************/
|
703
|
-
/* */
|
704
|
-
/* store_node */
|
705
|
-
/* */
|
706
|
-
/*******************************************************************/
|
707
|
-
|
708
|
-
static void store_node( FILE *file, Node *node, NodeNumbering &index,
|
709
|
-
long vmark )
|
710
|
-
{
|
711
|
-
if (!node->was_visited( vmark )) {
|
712
|
-
|
713
|
-
store_node_info( file, node );
|
714
|
-
|
715
|
-
// write the arcs
|
716
|
-
for( ArcsIter p(node->arcs()); p; p++ ) {
|
717
|
-
Arc *arc=p;
|
718
|
-
store_arc_label( file, arc );
|
719
|
-
unsigned int t=index[arc->target_node()];
|
720
|
-
fwrite(&t,sizeof(t),1,file);
|
721
|
-
store_node(file, arc->target_node(), index, vmark );
|
722
|
-
}
|
723
|
-
}
|
724
|
-
}
|
725
|
-
|
726
|
-
|
727
|
-
/*******************************************************************/
|
728
|
-
/* */
|
729
|
-
/* store_lowmem_node */
|
730
|
-
/* */
|
731
|
-
/*******************************************************************/
|
732
|
-
|
733
|
-
static void store_lowmem_node( FILE *file, Node *node, NodeNumbering &index,
|
734
|
-
vector<unsigned int> &startpos)
|
735
|
-
{
|
736
|
-
store_node_info( file, node );
|
737
|
-
|
738
|
-
// write the arcs
|
739
|
-
for( ArcsIter p(node->arcs()); p; p++ ) {
|
740
|
-
Arc *arc=p;
|
741
|
-
store_arc_label( file, arc );
|
742
|
-
unsigned int t=startpos[index[arc->target_node()]];
|
743
|
-
fwrite(&t,sizeof(t),1,file);
|
744
|
-
}
|
745
|
-
}
|
746
|
-
|
747
|
-
|
748
|
-
/*******************************************************************/
|
749
|
-
/* */
|
750
|
-
/* Transducer::store_lowmem */
|
751
|
-
/* */
|
752
|
-
/*******************************************************************/
|
753
|
-
|
754
|
-
void Transducer::store_lowmem( FILE *file )
|
755
|
-
|
756
|
-
{
|
757
|
-
fputc('l',file);
|
758
|
-
alphabet.store(file);
|
759
|
-
|
760
|
-
// storing size of index table
|
761
|
-
NodeNumbering index(*this);
|
762
|
-
|
763
|
-
// compute the start position of the first node
|
764
|
-
unsigned int pos=(unsigned int)ftell(file);
|
765
|
-
vector<unsigned int> startpos;
|
766
|
-
for( size_t i=0; i<index.number_of_nodes(); i++ ) {
|
767
|
-
startpos.push_back(pos);
|
768
|
-
Node *node=index.get_node(i);
|
769
|
-
Arcs *arcs=node->arcs();
|
770
|
-
pos += sizeof(char) // size of final flag
|
771
|
-
+ sizeof(unsigned short) // size of number of arcs
|
772
|
-
+ arcs->size() * (sizeof(Character) * 2 + sizeof(unsigned int)); // size of n arcs
|
773
|
-
}
|
774
|
-
|
775
|
-
// storing nodes
|
776
|
-
for( size_t i=0; i<index.number_of_nodes(); i++ )
|
777
|
-
store_lowmem_node( file, index.get_node(i), index, startpos );
|
778
|
-
}
|
779
|
-
|
780
|
-
|
781
|
-
/*******************************************************************/
|
782
|
-
/* */
|
783
|
-
/* Transducer::store */
|
784
|
-
/* */
|
785
|
-
/*******************************************************************/
|
786
|
-
|
787
|
-
void Transducer::store( FILE *file )
|
788
|
-
|
789
|
-
{
|
790
|
-
fputc('a',file);
|
791
|
-
|
792
|
-
NodeNumbering index(*this);
|
793
|
-
incr_vmark();
|
794
|
-
unsigned int n=index.number_of_nodes();
|
795
|
-
fwrite(&n,sizeof(n),1,file);
|
796
|
-
store_node( file, root_node(), index, vmark );
|
797
|
-
|
798
|
-
alphabet.store(file);
|
799
|
-
}
|
800
|
-
|
801
|
-
|
802
|
-
/*******************************************************************/
|
803
|
-
/* */
|
804
|
-
/* read_node */
|
805
|
-
/* */
|
806
|
-
/*******************************************************************/
|
807
|
-
|
808
|
-
static void read_node( FILE *file, Node *node, Node **p, Transducer *a )
|
809
|
-
{
|
810
|
-
char c;
|
811
|
-
fread(&c,sizeof(c),1,file);
|
812
|
-
node->set_final(c);
|
813
|
-
|
814
|
-
unsigned short n;
|
815
|
-
fread( &n, sizeof(n), 1, file);
|
816
|
-
|
817
|
-
for( int i=0; i<n; i++ ) {
|
818
|
-
Character lc,uc;
|
819
|
-
unsigned int t;
|
820
|
-
fread(&lc,sizeof(lc),1,file);
|
821
|
-
fread(&uc,sizeof(uc),1,file);
|
822
|
-
fread(&t,sizeof(t),1,file);
|
823
|
-
if (ferror(file))
|
824
|
-
throw "Error encountered while reading transducer from file";
|
825
|
-
if (p[t])
|
826
|
-
node->add_arc( Label(lc,uc), p[t], a );
|
827
|
-
else {
|
828
|
-
p[t] = a->new_node();
|
829
|
-
node->add_arc( Label(lc,uc), p[t], a );
|
830
|
-
read_node(file, p[t], p, a );
|
831
|
-
}
|
832
|
-
}
|
833
|
-
}
|
834
|
-
|
835
|
-
|
836
|
-
/*******************************************************************/
|
837
|
-
/* */
|
838
|
-
/* Transducer::read_transducer_binary */
|
839
|
-
/* */
|
840
|
-
/*******************************************************************/
|
841
|
-
|
842
|
-
void Transducer::read_transducer_binary( FILE *file )
|
843
|
-
|
844
|
-
{
|
845
|
-
if (fgetc(file) != 'a')
|
846
|
-
throw "Error: wrong file format (not a standard transducer)\n";
|
847
|
-
|
848
|
-
vmark = deterministic = 0;
|
849
|
-
unsigned int n;
|
850
|
-
fread(&n,sizeof(n),1,file); // number of nodes
|
851
|
-
if (ferror(file))
|
852
|
-
throw "Error encountered while reading transducer from file";
|
853
|
-
|
854
|
-
Node **p=new Node*[n]; // maps indices to nodes
|
855
|
-
p[0] = root_node();
|
856
|
-
for( unsigned int i=1; i<n; i++)
|
857
|
-
p[i] = NULL;
|
858
|
-
read_node( file, root_node(), p, this );
|
859
|
-
delete[] p;
|
860
|
-
|
861
|
-
alphabet.read(file);
|
862
|
-
|
863
|
-
vmark = 1;
|
864
|
-
deterministic = minimised = 1;
|
865
|
-
}
|
866
|
-
|
867
|
-
|
868
|
-
/*******************************************************************/
|
869
|
-
/* */
|
870
|
-
/* error_message */
|
871
|
-
/* */
|
872
|
-
/*******************************************************************/
|
873
|
-
|
874
|
-
// Formats an error message mentioning "line" and throws it as a C string.
// This function never returns normally.
static void error_message( size_t line )
{
  // static storage, so the thrown pointer stays valid after the function
  // has been left
  static char text[1000];
  const unsigned int line_number = (unsigned int)line;

  sprintf(text, "Error: in line %u of text transducer file", line_number);
  throw text;
}
|
882
|
-
|
883
|
-
|
884
|
-
/*******************************************************************/
|
885
|
-
/* */
|
886
|
-
/* Transducer::create_node */
|
887
|
-
/* */
|
888
|
-
/*******************************************************************/
|
889
|
-
|
890
|
-
// Parses a non-negative decimal node number at the start of "s" and
// returns the node stored under that number in "node". The vector is
// enlarged (with NULL entries) when the number lies beyond its end, and a
// new Node is allocated if the slot is still empty. Calls error_message
// with "line" if no number could be parsed or the number is negative.
Node *Transducer::create_node( vector<Node*> &node, char *s, size_t line )
{
  char *end;
  const long idx = strtol(s, &end, 10);

  if (end == s || idx < 0)
    error_message( line );

  // make sure the slot for this node number exists
  if (idx >= (long)node.size())
    node.resize(idx+1, NULL);

  Node *&slot = node[idx];
  if (slot == NULL)
    slot = new Node;

  return slot;
}
|
905
|
-
|
906
|
-
|
907
|
-
/*******************************************************************/
|
908
|
-
/* */
|
909
|
-
/* next_string */
|
910
|
-
/* */
|
911
|
-
/*******************************************************************/
|
912
|
-
|
913
|
-
static char *next_string( char* &s, size_t line )
|
914
|
-
|
915
|
-
{
|
916
|
-
// scan the input up to the next tab or newline character
|
917
|
-
// and unquote symbols preceded by a backslash
|
918
|
-
char *p = s;
|
919
|
-
char *q = s;
|
920
|
-
while (*q!=0 && *q!='\t' && *q!='\n' && *q!='\r') {
|
921
|
-
if (*q == '\\')
|
922
|
-
q++;
|
923
|
-
*(p++) = *(q++);
|
924
|
-
}
|
925
|
-
if (p == s)
|
926
|
-
error_message(line); // no string found
|
927
|
-
|
928
|
-
char *result=s;
|
929
|
-
// skip over following whitespace
|
930
|
-
while (*q == ' ' || *q == '\t' || *q == '\n' || *q == '\r')
|
931
|
-
q++;
|
932
|
-
|
933
|
-
if (*q == 0)
|
934
|
-
s = NULL; // end of string was reached
|
935
|
-
else
|
936
|
-
s = q; // move the string pointer s
|
937
|
-
|
938
|
-
*p = 0; // mark the end of the result string
|
939
|
-
|
940
|
-
return result;
|
941
|
-
}
|
942
|
-
|
943
|
-
|
944
|
-
/*******************************************************************/
|
945
|
-
/* */
|
946
|
-
/* Transducer::read_transducer_text */
|
947
|
-
/* */
|
948
|
-
/*******************************************************************/
|
949
|
-
|
950
|
-
void Transducer::read_transducer_text( FILE *file )
|
951
|
-
|
952
|
-
{
|
953
|
-
vector<Node*> nodes;
|
954
|
-
nodes.push_back(root_node());
|
955
|
-
|
956
|
-
vmark = deterministic = 0;
|
957
|
-
char buffer[10000];
|
958
|
-
for( size_t line=0; fgets(buffer, 10000, file ); line++ ) {
|
959
|
-
char *p = buffer;
|
960
|
-
char *s = next_string(p, line);
|
961
|
-
Node *node = create_node( nodes, s, line );
|
962
|
-
if (p == NULL)
|
963
|
-
node->set_final(true);
|
964
|
-
else {
|
965
|
-
s = next_string(p, line);
|
966
|
-
Node *target = create_node( nodes, s, line );
|
967
|
-
|
968
|
-
s = next_string(p, line);
|
969
|
-
Character lc = alphabet.add_symbol(s);
|
970
|
-
s = next_string(p, line);
|
971
|
-
Character uc = alphabet.add_symbol(s);
|
972
|
-
Label l(lc,uc);
|
973
|
-
if (l == Label::epsilon)
|
974
|
-
error_message( line );
|
975
|
-
|
976
|
-
alphabet.insert(l);
|
977
|
-
node->add_arc( l, target, this );
|
978
|
-
}
|
979
|
-
}
|
980
|
-
|
981
|
-
vmark = 1;
|
982
|
-
deterministic = minimised = 1;
|
983
|
-
}
|
984
|
-
|
985
|
-
|
986
|
-
/*******************************************************************/
|
987
|
-
/* */
|
988
|
-
/* Transducer::Transducer */
|
989
|
-
/* */
|
990
|
-
/*******************************************************************/
|
991
|
-
|
992
|
-
// Constructs a transducer by reading it from "file", either in the binary
// format (binary == true) or in the text format (binary == false).
Transducer::Transducer( FILE *file, bool binary )
{
  if (!binary) {
    read_transducer_text( file );
    return;
  }
  read_transducer_binary( file );
}
|
1000
|
-
|