ruby-sfst 0.4.3 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -0
- data/COPYING +280 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +54 -0
- data/README.md +1 -1
- data/Rakefile +9 -18
- data/bin/console +7 -0
- data/bin/setup +6 -0
- data/ext/sfst/alphabet.cc +879 -0
- data/ext/sfst/alphabet.h +302 -0
- data/ext/sfst/basic.cc +85 -0
- data/ext/{sfst_machine → sfst}/basic.h +7 -4
- data/ext/sfst/compact.cc +629 -0
- data/ext/sfst/compact.h +100 -0
- data/ext/sfst/determinise.cc +279 -0
- data/ext/{sfst_machine → sfst}/extconf.rb +2 -1
- data/ext/sfst/fst.cc +1150 -0
- data/ext/sfst/fst.h +374 -0
- data/ext/sfst/hopcroft.cc +681 -0
- data/ext/sfst/interface.cc +1921 -0
- data/ext/sfst/interface.h +171 -0
- data/ext/sfst/make-compact.cc +323 -0
- data/ext/{sfst_machine → sfst}/make-compact.h +15 -13
- data/ext/sfst/mem.h +80 -0
- data/ext/sfst/operators.cc +1273 -0
- data/ext/{sfst_machine → sfst}/sfst_machine.cc +89 -78
- data/ext/sfst/sgi.h +72 -0
- data/ext/sfst/utf8.cc +149 -0
- data/ext/{sfst_machine → sfst}/utf8.h +7 -4
- data/lib/sfst.rb +2 -1
- data/lib/sfst/version.rb +1 -1
- data/ruby-sfst.gemspec +23 -23
- metadata +107 -35
- data/ext/sfst_machine/alphabet.cc +0 -812
- data/ext/sfst_machine/alphabet.h +0 -273
- data/ext/sfst_machine/basic.cc +0 -84
- data/ext/sfst_machine/compact.cc +0 -616
- data/ext/sfst_machine/compact.h +0 -98
- data/ext/sfst_machine/determinise.cc +0 -303
- data/ext/sfst_machine/fst.cc +0 -1000
- data/ext/sfst_machine/fst.h +0 -369
- data/ext/sfst_machine/interface.cc +0 -1842
- data/ext/sfst_machine/interface.h +0 -93
- data/ext/sfst_machine/make-compact.cc +0 -327
- data/ext/sfst_machine/mem.h +0 -74
- data/ext/sfst_machine/operators.cc +0 -1131
- data/ext/sfst_machine/sgi.h +0 -44
- data/ext/sfst_machine/utf8.cc +0 -146
- data/test/test_sfst.fst +0 -3
- data/test/test_sfst.rb +0 -114
data/ext/sfst/compact.h
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
/*******************************************************************/
|
2
|
+
/* */
|
3
|
+
/* FILE compact.h */
|
4
|
+
/* MODULE compact */
|
5
|
+
/* PROGRAM SFST */
|
6
|
+
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
|
7
|
+
/* */
|
8
|
+
/* PURPOSE finite state tools */
|
9
|
+
/* */
|
10
|
+
/*******************************************************************/
|
11
|
+
|
12
|
+
#ifndef _COMPACT_H_
|
13
|
+
#define _COMPACT_H_
|
14
|
+
|
15
|
+
#include "alphabet.h"
|
16
|
+
|
17
|
+
#include <vector>
|
18
|
+
|
19
|
+
namespace SFST {
|
20
|
+
|
21
|
+
typedef std::vector<unsigned int> CAnalysis;
|
22
|
+
|
23
|
+
class CompactTransducer {
|
24
|
+
|
25
|
+
protected:
|
26
|
+
|
27
|
+
// the following data structures are used to store the nodes
|
28
|
+
|
29
|
+
unsigned int number_of_nodes; // number of nodes in the transducer
|
30
|
+
char *finalp; // finalp[i] is 1 if node i is final and 0 otherwise
|
31
|
+
unsigned int *first_arc; // first_arc[i] is the number of the first
|
32
|
+
// arc outgoing from node i
|
33
|
+
|
34
|
+
// the following data structures are used to store the transition arcs
|
35
|
+
|
36
|
+
unsigned int number_of_arcs; // total number of arcs in the transducer
|
37
|
+
Label *label; // the label (character pair) of arc i
|
38
|
+
unsigned int *target_node; // target node of arc i
|
39
|
+
|
40
|
+
// the following data structures are used to store the stochastic parameters
|
41
|
+
float *final_logprob;
|
42
|
+
float *arc_logprob;
|
43
|
+
|
44
|
+
// functions needed to read the transducer from a file
|
45
|
+
|
46
|
+
void read_finalp( FILE *file );
|
47
|
+
void read_first_arcs( FILE *file );
|
48
|
+
void read_target_nodes( FILE *file );
|
49
|
+
void read_labels( FILE *file );
|
50
|
+
void read_probs( FILE *file );
|
51
|
+
|
52
|
+
// functions needed to analyze data with the transducer
|
53
|
+
|
54
|
+
void analyze( unsigned int n, std::vector<Character> &ch, size_t ipos,
|
55
|
+
CAnalysis&, std::vector<CAnalysis>&);
|
56
|
+
|
57
|
+
// function selecting the simplest morphological analysis
|
58
|
+
|
59
|
+
int compute_score( CAnalysis &ana );
|
60
|
+
void disambiguate( std::vector<CAnalysis> &analyses );
|
61
|
+
|
62
|
+
// functions for longest-match analysis of input data
|
63
|
+
|
64
|
+
void longest_match2(unsigned int, char*, int, CAnalysis&, int&, CAnalysis&);
|
65
|
+
|
66
|
+
void convert( CAnalysis &cana, Analysis &ana );
|
67
|
+
|
68
|
+
public:
|
69
|
+
size_t node_count() { return number_of_nodes; };
|
70
|
+
size_t arc_count() { return number_of_arcs; };
|
71
|
+
|
72
|
+
bool both_layers; // print surface and analysis symbols
|
73
|
+
bool simplest_only; // print only the simplest analyses
|
74
|
+
|
75
|
+
Alphabet alphabet; // data structure which maps symbols to numeric codes
|
76
|
+
CompactTransducer(); // dummy constructor
|
77
|
+
CompactTransducer( FILE*, FILE *pfile=NULL ); // reads a (stochastic) transducer
|
78
|
+
~CompactTransducer(); // destroys a transducer
|
79
|
+
|
80
|
+
// the analysis function returns the set of analyses for the string "s"
|
81
|
+
// in the argument "analyses"
|
82
|
+
void analyze_string( char *s, std::vector<CAnalysis > &analyses );
|
83
|
+
|
84
|
+
void compute_probs( std::vector<CAnalysis> &analyses, std::vector<double> &prob );
|
85
|
+
char *print_analysis( CAnalysis &ana );
|
86
|
+
|
87
|
+
// longest-match analysis
|
88
|
+
const char *longest_match( char*& );
|
89
|
+
|
90
|
+
// EM training
|
91
|
+
bool train2( char *s, std::vector<double> &arcfreq, std::vector<double> &finalfreq );
|
92
|
+
bool train( char *s, std::vector<double> &arcfreq, std::vector<double> &finalfreq );
|
93
|
+
void estimate_probs( std::vector<double> &arcfreq, std::vector<double> &finalfreq );
|
94
|
+
|
95
|
+
// robust analysis
|
96
|
+
float robust_analyze_string( char *string, std::vector<CAnalysis> &analyses,
|
97
|
+
float ErrorsAllowed );
|
98
|
+
};
|
99
|
+
}
|
100
|
+
#endif
|
@@ -0,0 +1,279 @@
|
|
1
|
+
|
2
|
+
/*******************************************************************/
|
3
|
+
/* */
|
4
|
+
/* FILE determinise.C */
|
5
|
+
/* MODULE determinise */
|
6
|
+
/* PROGRAM SFST */
|
7
|
+
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
|
8
|
+
/* */
|
9
|
+
/*******************************************************************/
|
10
|
+
|
11
|
+
|
12
|
+
#include "fst.h"
|
13
|
+
|
14
|
+
using std::vector;
|
15
|
+
using std::pair;
|
16
|
+
using std::set;
|
17
|
+
|
18
|
+
namespace SFST {
|
19
|
+
|
20
|
+
|
21
|
+
/***************** class NodeSet *********************************/
|
22
|
+
|
23
|
+
class NodeSet {
|
24
|
+
// This class is used to store a set of nodes.
|
25
|
+
// Whenever a new node is added, all nodes accessible
|
26
|
+
// through epsilon transitions are added as well.
|
27
|
+
|
28
|
+
private:
|
29
|
+
set<Node*> ht;
|
30
|
+
|
31
|
+
public:
|
32
|
+
typedef set<Node*>::iterator iterator;
|
33
|
+
NodeSet() {};
|
34
|
+
void add( Node* );
|
35
|
+
bool insert(Node *node) {
|
36
|
+
pair<iterator, bool> result = ht.insert(node);
|
37
|
+
return result.second;
|
38
|
+
};
|
39
|
+
iterator begin() const { return ht.begin(); }
|
40
|
+
iterator end() const { return ht.end(); }
|
41
|
+
size_t size() const { return ht.size(); }
|
42
|
+
void clear() { ht.clear(); }
|
43
|
+
};
|
44
|
+
|
45
|
+
typedef map<const Label, NodeSet> Label2NodeSet;
|
46
|
+
|
47
|
+
|
48
|
+
/***************** class NodeArray *******************************/
|
49
|
+
|
50
|
+
class NodeArray {
|
51
|
+
|
52
|
+
private:
|
53
|
+
size_t sizev;
|
54
|
+
bool final;
|
55
|
+
Node **node;
|
56
|
+
|
57
|
+
public:
|
58
|
+
NodeArray( NodeSet& );
|
59
|
+
~NodeArray() { delete[] node; };
|
60
|
+
size_t size() const { return sizev; }
|
61
|
+
bool is_final() const { return final; };
|
62
|
+
Node* &operator[]( size_t i ) const { return node[i]; }
|
63
|
+
};
|
64
|
+
|
65
|
+
|
66
|
+
/***************** class DTransition *****************************/
|
67
|
+
|
68
|
+
class DTransition {
|
69
|
+
public:
|
70
|
+
Label label;
|
71
|
+
NodeArray *nodes;
|
72
|
+
DTransition(Label l, NodeArray *na) { label = l; nodes = na; };
|
73
|
+
};
|
74
|
+
|
75
|
+
|
76
|
+
/***************** class NodeMapping ****************************/
|
77
|
+
|
78
|
+
class NodeMapping {
|
79
|
+
// This class is used to map a node set from one transducer
|
80
|
+
// to a single node in another transducer
|
81
|
+
|
82
|
+
private:
|
83
|
+
struct hashf {
|
84
|
+
size_t operator()(const NodeArray *na) const {
|
85
|
+
size_t key=na->size() ^ na->is_final();
|
86
|
+
for( size_t i=0; i<na->size(); i++)
|
87
|
+
key = (key<<1) ^ (size_t)(*na)[i];
|
88
|
+
return key;
|
89
|
+
}
|
90
|
+
};
|
91
|
+
struct equalf {
|
92
|
+
int operator()(const NodeArray *na1, const NodeArray *na2) const {
|
93
|
+
if (na1->size() != na2->size() || na1->is_final() != na2->is_final())
|
94
|
+
return 0;
|
95
|
+
for( size_t i=0; i<na1->size(); i++)
|
96
|
+
if ((*na1)[i] != (*na2)[i])
|
97
|
+
return 0;
|
98
|
+
return 1;
|
99
|
+
}
|
100
|
+
};
|
101
|
+
typedef hash_map<NodeArray*, Node*, hashf, equalf> NodeMap;
|
102
|
+
NodeMap hm;
|
103
|
+
|
104
|
+
public:
|
105
|
+
typedef NodeMap::iterator iterator;
|
106
|
+
~NodeMapping();
|
107
|
+
iterator begin() { return hm.begin(); };
|
108
|
+
iterator end() { return hm.end(); };
|
109
|
+
iterator find( NodeArray *na) { return hm.find( na ); };
|
110
|
+
Node* &operator[]( NodeArray *na ) { return hm.operator[](na); };
|
111
|
+
|
112
|
+
};
|
113
|
+
|
114
|
+
|
115
|
+
static void determinise_node( NodeArray&, Node*, Transducer*, NodeMapping& );
|
116
|
+
|
117
|
+
|
118
|
+
/*******************************************************************/
|
119
|
+
/* */
|
120
|
+
/* NodeSet::add */
|
121
|
+
/* */
|
122
|
+
/*******************************************************************/
|
123
|
+
|
124
|
+
// Inserts "node" and, recursively, every node reachable from it through
// epsilon transitions. Nodes already in the set are not expanded again.
void NodeSet::add( Node *node )
{
  const bool inserted = ht.insert(node).second;
  if (!inserted)
    return;

  // new node, add nodes reachable with epsilon transitions
  for( ArcsIter it(node->arcs(),ArcsIter::eps); it; it++ ) {
    Arc *eps_arc = it;
    if (!eps_arc->label().is_epsilon())
      break; // stop at the first non-epsilon label
    add(eps_arc->target_node());
  }
}
|
138
|
+
|
139
|
+
|
140
|
+
/*******************************************************************/
|
141
|
+
/* */
|
142
|
+
/* NodeArray::NodeArray */
|
143
|
+
/* */
|
144
|
+
/*******************************************************************/
|
145
|
+
|
146
|
+
// Builds the array from a node set. Only nodes with at least one
// non-epsilon transition are stored; the final flag is set if any node
// of the set (stored or not) is final.
NodeArray::NodeArray( NodeSet &ns )
{
  sizev = 0;
  final = false;
  // sized for the whole set; possibly larger than the number stored
  node = new Node*[ns.size()];

  const NodeSet::iterator stop = ns.end();
  for( NodeSet::iterator cur=ns.begin(); cur!=stop; ++cur ) {
    Node *member = *cur;
    if (member->arcs()->non_epsilon_transition_exists()) {
      node[sizev] = member;
      sizev++;
    }
    if (member->is_final()) {
      final = true;
    }
  }
}
|
162
|
+
|
163
|
+
|
164
|
+
/*******************************************************************/
|
165
|
+
/* */
|
166
|
+
/* NodeMapping::~NodeMapping */
|
167
|
+
/* */
|
168
|
+
/*******************************************************************/
|
169
|
+
|
170
|
+
// Deletes all NodeArray keys. Each entry is removed from the hash map
// before its key is deleted, because (per the original note) deleting
// NodeArrays that are still in the mapping crashes on destruction.
NodeMapping::~NodeMapping()
{
  iterator cur = hm.begin();
  while (cur != hm.end()) {
    NodeArray *key = cur->first;
    hm.erase(cur++); // advance first, then erase the previous position
    delete key;
  }
}
|
182
|
+
|
183
|
+
|
184
|
+
/*******************************************************************/
|
185
|
+
/* */
|
186
|
+
/* compute_transitions */
|
187
|
+
/* */
|
188
|
+
/*******************************************************************/
|
189
|
+
|
190
|
+
// Collects, for every label on a non-epsilon arc leaving a node of "na",
// the set of target nodes (NodeSet::add also pulls in their epsilon
// closure), and appends one DTransition per label to "t".
// Each DTransition carries a NodeArray allocated here with new.
static void compute_transitions( NodeArray &na, vector<DTransition> &t )
{
  Label2NodeSet targets_by_label;

  // for all nodes in the current set
  const size_t count = na.size();
  for( size_t k=0; k<count; k++ ) {
    Node *source = na[k]; // old node

    // file each non-epsilon target under its label
    for( ArcsIter it(source->arcs(), ArcsIter::non_eps); it; it++ ) {
      Arc *arc = it;
      NodeSet &targets = targets_by_label[arc->label()];
      targets.add(arc->target_node());
    }
  }

  t.reserve(targets_by_label.size());
  Label2NodeSet::iterator entry = targets_by_label.begin();
  for( ; entry!=targets_by_label.end(); ++entry ) {
    NodeArray *reached = new NodeArray( entry->second );
    t.push_back(DTransition(entry->first, reached));
  }
}
|
212
|
+
|
213
|
+
|
214
|
+
/*******************************************************************/
|
215
|
+
/* */
|
216
|
+
/* determinise_node */
|
217
|
+
/* */
|
218
|
+
/*******************************************************************/
|
219
|
+
|
220
|
+
// Recursively builds the deterministic counterpart of the node set "na".
//   na   - node set of the source transducer
//   node - node of the new transducer "a" that represents "na"
//   map  - node sets seen so far and the new nodes that represent them
// A node set that is already in "map" is deleted and its existing node is
// reused; a new node set is registered in "map" and then expanded.
static void determinise_node( NodeArray &na, Node *node, Transducer *a,
                              NodeMapping &map )
{
  node->set_final(na.is_final());

  vector<DTransition> t;
  compute_transitions( na, t );

  for( size_t k=0; k<t.size(); k++ ) {
    DTransition &trans = t[k];
    NodeMapping::iterator known = map.find(trans.nodes);

    if (known != map.end()) {
      // equivalent node set already mapped: drop the duplicate array
      delete trans.nodes;
      node->add_arc( trans.label, known->second, a );
      continue;
    }

    // new node set
    Node *target_node = a->new_node();
    map[trans.nodes] = target_node;
    node->add_arc( trans.label, target_node, a );
    determinise_node( *trans.nodes, target_node, a, map );
  }
}
|
243
|
+
|
244
|
+
|
245
|
+
/*******************************************************************/
|
246
|
+
/* */
|
247
|
+
/* Transducer::determinise */
|
248
|
+
/* */
|
249
|
+
/*******************************************************************/
|
250
|
+
|
251
|
+
// Returns a new, heap-allocated deterministic transducer.
// If this transducer is already flagged deterministic, the result of
// copy() is returned instead. The alphabet is copied into the result
// only if "copy_alphabet" is set.
Transducer &Transducer::determinise( bool copy_alphabet )
{
  if (deterministic)
    return copy();

  Transducer *result = new Transducer();
  if (copy_alphabet) {
    result->alphabet.copy(alphabet);
  }

  // creation of the initial node set consisting of all nodes
  // reachable from the start node via epsilon transitions.
  NodeArray *start;
  {
    NodeSet closure;
    closure.add(root_node());
    start = new NodeArray(closure);
  }

  // map the node set to the new root node
  NodeMapping map;
  map[start] = result->root_node();

  // determinise the transducer recursively
  determinise_node( *start, result->root_node(), result, map );
  result->deterministic = 1;
  return *result;
}
|
279
|
+
}
|
data/ext/sfst/fst.cc
ADDED
@@ -0,0 +1,1150 @@
|
|
1
|
+
|
2
|
+
/*******************************************************************/
|
3
|
+
/* */
|
4
|
+
/* FILE fst.C */
|
5
|
+
/* MODULE fst */
|
6
|
+
/* PROGRAM SFST */
|
7
|
+
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
|
8
|
+
/* */
|
9
|
+
/* PURPOSE basic FST functions */
|
10
|
+
/* */
|
11
|
+
/*******************************************************************/
|
12
|
+
|
13
|
+
#include "fst.h"
|
14
|
+
|
15
|
+
namespace SFST {
|
16
|
+
|
17
|
+
using std::vector;
|
18
|
+
using std::istream;
|
19
|
+
using std::ostream;
|
20
|
+
using std::cerr;
|
21
|
+
|
22
|
+
const int BUFFER_SIZE=100000;
|
23
|
+
|
24
|
+
|
25
|
+
/*******************************************************************/
|
26
|
+
/* */
|
27
|
+
/* Arcs::size */
|
28
|
+
/* */
|
29
|
+
/*******************************************************************/
|
30
|
+
|
31
|
+
int Arcs::size() const
|
32
|
+
|
33
|
+
{
|
34
|
+
int n=0;
|
35
|
+
for( Arc *p=first_arcp; p; p=p->next ) n++;
|
36
|
+
for( Arc *p=first_epsilon_arcp; p; p=p->next ) n++;
|
37
|
+
return n;
|
38
|
+
}
|
39
|
+
|
40
|
+
|
41
|
+
/*******************************************************************/
|
42
|
+
/* */
|
43
|
+
/* Arcs::target_node */
|
44
|
+
/* */
|
45
|
+
/*******************************************************************/
|
46
|
+
|
47
|
+
// Returns the target of the first arc in the first_arcp list whose label
// equals "l", or NULL if there is none. The epsilon arc list is not
// searched.
Node *Arcs::target_node( Label l )
{
  Arc *cur = first_arcp;
  while (cur) {
    if (cur->label() == l)
      return cur->target_node();
    cur = cur->next;
  }
  return NULL;
}
|
58
|
+
|
59
|
+
// Const overload: returns the target of the first arc in the first_arcp
// list whose label equals "l", or NULL if there is none. The epsilon arc
// list is not searched.
const Node *Arcs::target_node( Label l ) const
{
  const Arc *cur = first_arcp;
  while (cur) {
    if (cur->label() == l)
      return cur->target_node();
    cur = cur->next;
  }
  return NULL;
}
|
70
|
+
|
71
|
+
|
72
|
+
/*******************************************************************/
|
73
|
+
/* */
|
74
|
+
/* Transducer::new_node */
|
75
|
+
/* */
|
76
|
+
/*******************************************************************/
|
77
|
+
|
78
|
+
// Obtains storage for one Node from "mem" and initialises it with
// Node::init(). No constructor is run on the storage.
Node *Transducer::new_node()
{
  void *storage = mem.alloc( sizeof(Node) );
  Node *fresh = (Node*)storage;
  fresh->init();
  return fresh;
}
|
86
|
+
|
87
|
+
|
88
|
+
/*******************************************************************/
|
89
|
+
/* */
|
90
|
+
/* Transducer::new_arc */
|
91
|
+
/* */
|
92
|
+
/*******************************************************************/
|
93
|
+
|
94
|
+
// Obtains storage for one Arc from "mem" and initialises it with the
// given label and target node via Arc::init().
Arc *Transducer::new_arc( Label l, Node *target )
{
  void *storage = mem.alloc( sizeof(Arc) );
  Arc *fresh = (Arc*)storage;
  fresh->init( l, target );
  return fresh;
}
|
101
|
+
|
102
|
+
|
103
|
+
/*******************************************************************/
|
104
|
+
/* */
|
105
|
+
/* Arcs::add_arc */
|
106
|
+
/* */
|
107
|
+
/*******************************************************************/
|
108
|
+
|
109
|
+
// Creates an arc (label "l", target "node") through transducer "a" and
// pushes it onto the front of the epsilon list if the label is epsilon,
// otherwise onto the front of the regular arc list.
void Arcs::add_arc( Label l, Node *node, Transducer *a )
{
  Arc *fresh = a->new_arc( l, node );

  Arc *&head = l.is_epsilon() ? first_epsilon_arcp : first_arcp;
  fresh->next = head;
  head = fresh;
}
|
123
|
+
|
124
|
+
|
125
|
+
/*******************************************************************/
|
126
|
+
/* */
|
127
|
+
/* Arcs::remove_arc */
|
128
|
+
/* */
|
129
|
+
/*******************************************************************/
|
130
|
+
|
131
|
+
// Unlinks "arc" from the list it belongs to (chosen by whether its label
// is epsilon). Returns 1 if the arc was found and unlinked, 0 otherwise.
// The arc's storage is not released here.
int Arcs::remove_arc( Arc *arc )
{
  Arc **link = &first_arcp;
  if (arc->label().is_epsilon())
    link = &first_epsilon_arcp;

  while (*link) {
    if (*link == arc) {
      *link = arc->next;
      return 1;
    }
    link = &(*link)->next;
  }
  return 0;
}
|
142
|
+
|
143
|
+
|
144
|
+
/*******************************************************************/
|
145
|
+
/* */
|
146
|
+
/* Node::init */
|
147
|
+
/* */
|
148
|
+
/*******************************************************************/
|
149
|
+
|
150
|
+
void Node::init()
|
151
|
+
|
152
|
+
{
|
153
|
+
final = false;
|
154
|
+
visited = 0;
|
155
|
+
arcsp.init();
|
156
|
+
forwardp = NULL;
|
157
|
+
}
|
158
|
+
|
159
|
+
|
160
|
+
/*******************************************************************/
|
161
|
+
/* */
|
162
|
+
/* Node::clear_visited */
|
163
|
+
/* */
|
164
|
+
/*******************************************************************/
|
165
|
+
|
166
|
+
// Resets the visit mark of this node and, recursively, of every node
// reachable from it. "nodeset" records the nodes already handled so that
// each one is processed once. The running size of the set is printed to
// stderr for every newly handled node.
void Node::clear_visited( NodeHashSet &nodeset )
{
  if (nodeset.find( this ) != nodeset.end())
    return; // already handled

  visited = 0;
  nodeset.insert( this );
  fprintf(stderr," %lu", (unsigned long)nodeset.size());

  for( ArcsIter it(arcs()); it; it++ ) {
    Arc *out = it;
    out->target_node()->clear_visited( nodeset );
  }
}
|
179
|
+
|
180
|
+
|
181
|
+
/*******************************************************************/
|
182
|
+
/* */
|
183
|
+
/* Transducer::index_nodes */
|
184
|
+
/* */
|
185
|
+
/*******************************************************************/
|
186
|
+
|
187
|
+
// Depth-first numbering of the nodes reachable from "node".
// Each node not yet visited under the current vmark receives the next
// index (node_count is incremented); every arc leaving such a node
// increments transition_count. If "nodearray" is non-NULL the nodes are
// appended to it in index order.
void Transducer::index_nodes( Node *node, vector<Node*> *nodearray )
{
  if (node->was_visited( vmark ))
    return;

  node->index = (Index)node_count++;
  if (nodearray) {
    nodearray->push_back(node);
  }

  for( ArcsIter it(node->arcs()); it; it++ ) {
    Arc *out = it;
    transition_count++;
    index_nodes( out->target_node(), nodearray );
  }
}
|
202
|
+
|
203
|
+
|
204
|
+
/*******************************************************************/
|
205
|
+
/* */
|
206
|
+
/* Transducer::nodeindexing */
|
207
|
+
/* */
|
208
|
+
/*******************************************************************/
|
209
|
+
|
210
|
+
std::pair<size_t,size_t> Transducer::nodeindexing( vector<Node*> *nodearray )
|
211
|
+
|
212
|
+
{
|
213
|
+
if (!indexed) {
|
214
|
+
incr_vmark();
|
215
|
+
index_nodes( root_node(), nodearray );
|
216
|
+
indexed = true;
|
217
|
+
}
|
218
|
+
|
219
|
+
return std::pair<size_t,size_t>(node_count, transition_count);
|
220
|
+
}
|
221
|
+
|
222
|
+
|
223
|
+
/*******************************************************************/
|
224
|
+
/* */
|
225
|
+
/* Transducer::add_string */
|
226
|
+
/* */
|
227
|
+
/*******************************************************************/
|
228
|
+
|
229
|
+
void Transducer::add_string( char *s, bool extended, Alphabet *a )
|
230
|
+
|
231
|
+
{
|
232
|
+
if (a == NULL)
|
233
|
+
a = &alphabet;
|
234
|
+
|
235
|
+
Node *node=root_node();
|
236
|
+
Label l;
|
237
|
+
while (!(l = a->next_label(s, extended)).is_epsilon()) {
|
238
|
+
a->insert(l);
|
239
|
+
Arcs *arcs=node->arcs();
|
240
|
+
node = arcs->target_node( l );
|
241
|
+
if (node == NULL) {
|
242
|
+
node = new_node();
|
243
|
+
arcs->add_arc( l, node, this );
|
244
|
+
}
|
245
|
+
}
|
246
|
+
node->set_final(1);
|
247
|
+
}
|
248
|
+
|
249
|
+
|
250
|
+
/*******************************************************************/
|
251
|
+
/* */
|
252
|
+
/* Transducer::Transducer */
|
253
|
+
/* */
|
254
|
+
/*******************************************************************/
|
255
|
+
|
256
|
+
// Builds a transducer consisting of a single chain of arcs labelled with
// the elements of "path"; the last node of the chain is final.
// For an empty path the root node itself becomes final.
Transducer::Transducer( vector<Label> &path )
  : root(), mem()
{
  vmark = 0;
  indexed = false;
  node_count = transition_count = 0;
  deterministic = minimised = true;

  Node *tail = root_node();
  const size_t length = path.size();
  for( size_t k=0; k<length; k++ ) {
    Arcs *outgoing = tail->arcs();
    tail = new_node();
    outgoing->add_arc( path[k], tail, this );
  }
  tail->set_final(1);
}
|
272
|
+
|
273
|
+
|
274
|
+
/*******************************************************************/
|
275
|
+
/* */
|
276
|
+
/* Transducer::Transducer */
|
277
|
+
/* */
|
278
|
+
/*******************************************************************/
|
279
|
+
|
280
|
+
// Reads a word list from "is", one entry per line, and adds every entry
// with add_string().
//   a           - if non-NULL, copied into the transducer's alphabet and
//                 entries are parsed in "extended" mode
//   verbose     - print a progress counter to cerr every 10000 lines
//   lexcomments - strip everything from an unquoted '%' to the end of the
//                 line and skip lines that become empty
// Trailing blanks, tabs and carriage returns are removed unless they are
// preceded by a backslash.
// NOTE(review): lines are read into a 10000-byte buffer with
// istream::getline; a longer line makes getline fail and therefore ends
// the loop silently. Not changed here.
Transducer::Transducer( istream &is, const Alphabet *a, bool verbose,
                        bool lexcomments )
  : root(), mem()
{
  bool extended=false;
  int n=0;
  char buffer[10000];

  vmark = 0;
  indexed = false;
  node_count = transition_count = 0;
  deterministic = true;
  minimised = false;
  if (a) {
    alphabet.copy(*a);
    extended = true;
  }
  while (is.getline(buffer, 10000)) {
    if (verbose && ++n % 10000 == 0) {
      if (n == 10000)
        cerr << "\n";
      cerr << "\r" << n << " words";
    }

    // delete comments
    if (lexcomments) {
      size_t l = strlen(buffer);
      for( size_t i=0; i<l; i++ ) {
        if (buffer[i] == '\\' && buffer[i+1]) {
          // quoted character: skip it so that an escaped "\%" is not
          // mistaken for the start of a comment
          i++;
        }
        else if (buffer[i] == '%') {
          // comment starts here
          buffer[i] = 0;
          break;
        }
      }
      if (buffer[0] == 0)
        continue;
    }

    // delete final whitespace characters
    int l;
    for( l=(int)strlen(buffer)-1; l>=0; l-- )
      if ((buffer[l] != ' ' && buffer[l] != '\t' && buffer[l] != '\r') ||
          (l > 0 && buffer[l-1] == '\\'))
        break;
    buffer[l+1] = 0;

    add_string(buffer, extended);
  }
  if (verbose && n >= 10000)
    cerr << "\n";
}
|
332
|
+
|
333
|
+
|
334
|
+
/*******************************************************************/
|
335
|
+
/* */
|
336
|
+
/* Transducer::Transducer */
|
337
|
+
/* */
|
338
|
+
/*******************************************************************/
|
339
|
+
|
340
|
+
// Builds a transducer that contains exactly the path described by the
// string "s". If "a" is non-NULL it is copied into the transducer's
// alphabet before the string is parsed.
Transducer::Transducer( char *s, const Alphabet *a, bool extended )
  : root(), mem()
{
  vmark = 0;
  indexed = false;
  node_count = 0;
  transition_count = 0;
  minimised = true;
  deterministic = true;

  if (a) {
    alphabet.copy(*a);
  }
  add_string(s, extended);
}
|
351
|
+
|
352
|
+
|
353
|
+
/*******************************************************************/
|
354
|
+
/* */
|
355
|
+
/* Transducer::clear */
|
356
|
+
/* */
|
357
|
+
/*******************************************************************/
|
358
|
+
|
359
|
+
void Transducer::clear()
|
360
|
+
|
361
|
+
{
|
362
|
+
vmark = 0;
|
363
|
+
deterministic = minimised = false;
|
364
|
+
root.init();
|
365
|
+
mem.clear();
|
366
|
+
alphabet.clear();
|
367
|
+
}
|
368
|
+
|
369
|
+
|
370
|
+
/*******************************************************************/
|
371
|
+
/* */
|
372
|
+
/* Transducer::store_symbols */
|
373
|
+
/* */
|
374
|
+
/*******************************************************************/
|
375
|
+
|
376
|
+
// Depth-first traversal collecting what the transducer actually uses:
// every arc label goes into "labels", and for each upper and lower
// character not yet present in "symbol" a copy (fst_strdup) of its
// symbol string is stored, provided the alphabet knows the character.
// Nodes already visited under the current vmark are skipped.
void Transducer::store_symbols(Node *node, SymbolMap &symbol,
                               LabelSet &labels)
{
  if (node->was_visited( vmark ))
    return;

  Arcs *outgoing = node->arcs();
  for( ArcsIter it(outgoing); it; it++ ) {
    Arc *arc = it;
    Label l = arc->label();

    labels.insert(l);

    // same treatment for both sides of the label, upper side first
    const Character sides[2] = { l.upper_char(), l.lower_char() };
    for( int k=0; k<2; k++ ) {
      const Character c = sides[k];
      if (symbol.find(c) != symbol.end())
        continue;
      const char *name = alphabet.code2symbol(c);
      if (name)
        symbol[c] = fst_strdup(name);
    }

    store_symbols( arc->target_node(), symbol, labels );
  }
}
|
405
|
+
|
406
|
+
|
407
|
+
/*******************************************************************/
|
408
|
+
/* */
|
409
|
+
/* Transducer::minimise_alphabet */
|
410
|
+
/* */
|
411
|
+
/*******************************************************************/
|
412
|
+
|
413
|
+
void Transducer::minimise_alphabet()
|
414
|
+
|
415
|
+
{
|
416
|
+
SymbolMap symbols;
|
417
|
+
LabelSet labels;
|
418
|
+
incr_vmark();
|
419
|
+
store_symbols(root_node(), symbols, labels);
|
420
|
+
alphabet.clear();
|
421
|
+
for( SymbolMap::iterator it=symbols.begin(); it!=symbols.end(); it++ ) {
|
422
|
+
alphabet.add_symbol( it->second, it->first );
|
423
|
+
free(it->second);
|
424
|
+
}
|
425
|
+
for( LabelSet::iterator it=labels.begin(); it!=labels.end(); it++ )
|
426
|
+
alphabet.insert(*it);
|
427
|
+
}
|
428
|
+
|
429
|
+
|
430
|
+
/*******************************************************************/
|
431
|
+
/* */
|
432
|
+
/* Transducer::size_node */
|
433
|
+
/* */
|
434
|
+
/*******************************************************************/
|
435
|
+
|
436
|
+
// Counts the nodes reachable from "node" that have not been visited yet
// under the current vmark (the node itself included).
size_t Transducer::size_node( Node *node )
{
  if (node->was_visited( vmark ))
    return 0;

  size_t total = 1; // this node
  for( ArcsIter cur(node->arcs()); cur; cur++ ) {
    Arc *out = cur;
    total += size_node( out->target_node() );
  }
  return total;
}
|
449
|
+
|
450
|
+
|
451
|
+
/*******************************************************************/
|
452
|
+
/* */
|
453
|
+
/* Transducer::size */
|
454
|
+
/* */
|
455
|
+
/*******************************************************************/
|
456
|
+
|
457
|
+
size_t Transducer::size()
|
458
|
+
|
459
|
+
{
|
460
|
+
incr_vmark();
|
461
|
+
return size_node(root_node());
|
462
|
+
}
|
463
|
+
|
464
|
+
|
465
|
+
/*******************************************************************/
|
466
|
+
/* */
|
467
|
+
/* Transducer::enumerate_paths_node */
|
468
|
+
/* */
|
469
|
+
/*******************************************************************/
|
470
|
+
|
471
|
+
// Depth-first enumeration of all paths from "node" to a final node.
// "path" holds the labels from the root to "node"; whenever a final node
// is reached, a new single-path Transducer built from "path" is appended
// to "result" (the caller owns these objects).
// "previous" is updated around each recursive call but is not consulted
// here, so termination relies on the caller (see enumerate_paths).
void Transducer::enumerate_paths_node( Node *node, vector<Label> &path,
                                       NodeHashSet &previous,
                                       vector<Transducer*> &result )
{
  if (node->is_final()) {
    Transducer *single = new Transducer(path);
    result.push_back(single);
  }

  for( ArcsIter cur(node->arcs()); cur; cur++ ) {
    Arc *out = cur;

    NodeHashSet::iterator mark = previous.insert(node).first;
    path.push_back(out->label());
    enumerate_paths_node( out->target_node(), path, previous, result );
    path.pop_back();
    previous.erase(mark);
  }
}
|
488
|
+
|
489
|
+
|
490
|
+
/*******************************************************************/
|
491
|
+
/* */
|
492
|
+
/* Transducer::enumerate_paths */
|
493
|
+
/* */
|
494
|
+
/*******************************************************************/
|
495
|
+
|
496
|
+
// Replaces the contents of "result" with one transducer per path of this
// transducer. Returns true, leaving "result" untouched, if the transducer
// is infinitely ambiguous; returns false after a successful enumeration.
// Transducers previously stored in "result" are deleted.
bool Transducer::enumerate_paths( vector<Transducer*> &result )
{
  if (is_infinitely_ambiguous())
    return true;

  // discard whatever the caller passed in
  const size_t old_count = result.size();
  for( size_t k=0; k<old_count; k++ ) {
    delete result[k];
  }
  result.clear();

  NodeHashSet previous;
  vector<Label> path;
  enumerate_paths_node( root_node(), path, previous, result );
  return false;
}
|
510
|
+
|
511
|
+
|
512
|
+
|
513
|
+
|
514
|
+
/*******************************************************************/
|
515
|
+
/* */
|
516
|
+
/* Transducer::print_strings_node */
|
517
|
+
/* */
|
518
|
+
/*******************************************************************/
|
519
|
+
|
520
|
+
// Recursively prints every string that leads from "node" to a final node.
//   buffer - output buffer of BUFFER_SIZE bytes holding the string built
//            so far
//   pos    - number of bytes of "buffer" already in use
// Returns 1 if at least one string was printed, 0 otherwise.
// Throws a const char* message when the output string does not fit.
// A node that is reached again while its forward pointer is set is
// reported as a cycle and that path is abandoned.
int Transducer::print_strings_node(Node *node, char *buffer, int pos,
                                   FILE *file, bool with_brackets )
{
  int result = 0;

  if (node->was_visited( vmark )) {
    if (node->forward() != NULL) { // cycle detected
      cerr << "Warning: cyclic analyses (cycle aborted)\n";
      return 0;
    }
    node->set_forward(node); // used like a flag for loop detection
  }
  // ">=" rather than "==": write_label may advance the position by more
  // than one byte, so the limit can be stepped over; without this the
  // terminating '\0' below could be written outside the buffer.
  // NOTE(review): write_label itself is not given the buffer size here,
  // so it may still need its own bound check -- confirm in alphabet.cc.
  if (pos >= BUFFER_SIZE)
    throw "Output string in function print_strings_node is too long";
  if (node->is_final()) {
    buffer[pos] = '\0';
    fprintf(file,"%s\n", buffer);
    result = 1;
  }
  for( ArcsIter i(node->arcs()); i; i++ ) {
    int p=pos;
    Arc *arc=i;
    Label l=arc->label();
    alphabet.write_label(l, buffer, &p, with_brackets);
    result |= print_strings_node(arc->target_node(), buffer, p,
                                 file, with_brackets );
  }
  node->set_forward(NULL);

  return result;
}
|
551
|
+
|
552
|
+
|
553
|
+
/*******************************************************************/
|
554
|
+
/* */
|
555
|
+
/* Transducer::print_strings */
|
556
|
+
/* */
|
557
|
+
/*******************************************************************/
|
558
|
+
|
559
|
+
int Transducer::print_strings( FILE *file, bool with_brackets )
|
560
|
+
|
561
|
+
{
|
562
|
+
char buffer[BUFFER_SIZE];
|
563
|
+
incr_vmark();
|
564
|
+
return print_strings_node( root_node(), buffer, 0, file, with_brackets );
|
565
|
+
}
|
566
|
+
|
567
|
+
|
568
|
+
/*******************************************************************/
|
569
|
+
/* */
|
570
|
+
/* Transducer::analyze_string */
|
571
|
+
/* */
|
572
|
+
/*******************************************************************/
|
573
|
+
|
574
|
+
bool Transducer::analyze_string( char *string, FILE *file, bool with_brackets )
|
575
|
+
|
576
|
+
{
|
577
|
+
vector<Character> input;
|
578
|
+
alphabet.string2symseq( string, input );
|
579
|
+
vector<Label> labels;
|
580
|
+
for( size_t i=0; i<input.size(); i++ )
|
581
|
+
labels.push_back(Label(input[i]));
|
582
|
+
|
583
|
+
Transducer a1(labels);
|
584
|
+
Transducer *a2=&(*this || a1);
|
585
|
+
Transducer *a3=&(a2->lower_level());
|
586
|
+
delete a2;
|
587
|
+
a2 = &a3->minimise();
|
588
|
+
delete a3;
|
589
|
+
|
590
|
+
a2->alphabet.copy(alphabet);
|
591
|
+
bool result = a2->print_strings( file, with_brackets );
|
592
|
+
delete a2;
|
593
|
+
return result;
|
594
|
+
}
|
595
|
+
|
596
|
+
|
597
|
+
/*******************************************************************/
|
598
|
+
/* */
|
599
|
+
/* Transducer::generate_string */
|
600
|
+
/* */
|
601
|
+
/*******************************************************************/
|
602
|
+
|
603
|
+
bool Transducer::generate_string( char *string, FILE *file, bool with_brackets)
|
604
|
+
|
605
|
+
{
|
606
|
+
Transducer a1(string, &alphabet, false);
|
607
|
+
Transducer *a2=&(a1 || *this);
|
608
|
+
Transducer *a3=&(a2->upper_level());
|
609
|
+
delete a2;
|
610
|
+
a2 = &a3->minimise();
|
611
|
+
delete a3;
|
612
|
+
|
613
|
+
a2->alphabet.copy(alphabet);
|
614
|
+
bool result = a2->print_strings( file, with_brackets );
|
615
|
+
delete a2;
|
616
|
+
return result;
|
617
|
+
}
|
618
|
+
|
619
|
+
|
620
|
+
/*******************************************************************/
|
621
|
+
/* */
|
622
|
+
/* complete */
|
623
|
+
/* */
|
624
|
+
/*******************************************************************/
|
625
|
+
|
626
|
+
// Walks the graph reachable from `node` and inserts the label of every
// non-epsilon arc into `alphabet`.  Nodes for which was_visited(vmark)
// already reports true are not expanded again.
static void complete( Node *node, Alphabet &alphabet, VType vmark)
{
  if (!node->was_visited( vmark )) {
    for( ArcsIter it(node->arcs()); it; it++ ) {
      Arc *arc = it;
      Label label = arc->label();
      if (!label.is_epsilon())
        alphabet.insert(label);
      complete(arc->target_node(), alphabet, vmark);
    }
  }
}
|
638
|
+
|
639
|
+
|
640
|
+
/*******************************************************************/
|
641
|
+
/* */
|
642
|
+
/* Transducer::complete_alphabet */
|
643
|
+
/* */
|
644
|
+
/*******************************************************************/
|
645
|
+
|
646
|
+
void Transducer::complete_alphabet()
|
647
|
+
|
648
|
+
{
|
649
|
+
incr_vmark();
|
650
|
+
complete(root_node(), alphabet, vmark);
|
651
|
+
}
|
652
|
+
|
653
|
+
|
654
|
+
/*******************************************************************/
|
655
|
+
/* */
|
656
|
+
/* print_node */
|
657
|
+
/* */
|
658
|
+
/*******************************************************************/
|
659
|
+
|
660
|
+
// Writes `node` and everything reachable from it to `s` in a tab-separated
// text format: one line "source<TAB>target<TAB>lower<TAB>upper" per arc
// and one line holding only the node index for a final node.
// Nodes already visited under `vmark` are skipped.
static void print_node( ostream &s, Node *node, VType vmark, Alphabet &abc )
{
  if (node->was_visited( vmark ))
    return;

  Arcs *outgoing = node->arcs();

  // first all arc lines of this node ...
  for( ArcsIter it(outgoing); it; it++ ) {
    Arc *arc = it;
    s << node->index << "\t" << arc->target_node()->index;
    // the two symbols are written in separate statements on purpose
    s << "\t" << abc.write_char(arc->label().lower_char());
    s << "\t" << abc.write_char(arc->label().upper_char());
    s << "\n";
  }

  // ... then the final-state line ...
  if (node->is_final())
    s << node->index << "\n";

  // ... and only then the successor nodes
  for( ArcsIter it(outgoing); it; it++ ) {
    Arc *arc = it;
    print_node( s, arc->target_node(), vmark, abc );
  }
}
|
680
|
+
|
681
|
+
|
682
|
+
/*******************************************************************/
|
683
|
+
/* */
|
684
|
+
/* operator<< */
|
685
|
+
/* */
|
686
|
+
/*******************************************************************/
|
687
|
+
|
688
|
+
// Prints transducer `a` in text form to stream `s` and returns the stream.
// The nodes are (re)indexed first because the output refers to node indices.
ostream &operator<<( ostream &s, Transducer &a )
{
  a.nodeindexing();
  a.incr_vmark();

  Node *start = a.root_node();
  print_node( s, start, a.vmark, a.alphabet );
  return s;
}
|
696
|
+
|
697
|
+
|
698
|
+
/*******************************************************************/
|
699
|
+
/* */
|
700
|
+
/* store_node_info */
|
701
|
+
/* */
|
702
|
+
/*******************************************************************/
|
703
|
+
|
704
|
+
static void store_node_info( FILE *file, Node *node )
|
705
|
+
|
706
|
+
{
|
707
|
+
// write final flag
|
708
|
+
char c=node->is_final();
|
709
|
+
fwrite(&c,sizeof(c),1,file);
|
710
|
+
|
711
|
+
// write the number of arcs
|
712
|
+
int nn = node->arcs()->size();
|
713
|
+
if (nn > 65535)
|
714
|
+
throw "Error: in function store_node\n";
|
715
|
+
unsigned short n=(unsigned short)nn;
|
716
|
+
fwrite(&n,sizeof(n),1,file);
|
717
|
+
}
|
718
|
+
|
719
|
+
|
720
|
+
/*******************************************************************/
|
721
|
+
/* */
|
722
|
+
/* store_arc_label */
|
723
|
+
/* */
|
724
|
+
/*******************************************************************/
|
725
|
+
|
726
|
+
static void store_arc_label( FILE *file, Arc *arc )
|
727
|
+
|
728
|
+
{
|
729
|
+
Label l=arc->label();
|
730
|
+
Character lc=l.lower_char();
|
731
|
+
Character uc=l.upper_char();
|
732
|
+
fwrite(&lc,sizeof(lc),1,file);
|
733
|
+
fwrite(&uc,sizeof(uc),1,file);
|
734
|
+
}
|
735
|
+
|
736
|
+
|
737
|
+
/*******************************************************************/
|
738
|
+
/* */
|
739
|
+
/* store_node */
|
740
|
+
/* */
|
741
|
+
/*******************************************************************/
|
742
|
+
|
743
|
+
static void store_node( FILE *file, Node *node, VType vmark )
|
744
|
+
{
|
745
|
+
if (!node->was_visited( vmark )) {
|
746
|
+
|
747
|
+
store_node_info( file, node );
|
748
|
+
|
749
|
+
// write the arcs
|
750
|
+
for( ArcsIter p(node->arcs()); p; p++ ) {
|
751
|
+
Arc *arc=p;
|
752
|
+
store_arc_label( file, arc );
|
753
|
+
unsigned int t = (unsigned int)arc->target_node()->index;
|
754
|
+
fwrite(&t,sizeof(t),1,file);
|
755
|
+
store_node(file, arc->target_node(), vmark );
|
756
|
+
}
|
757
|
+
}
|
758
|
+
}
|
759
|
+
|
760
|
+
|
761
|
+
/*******************************************************************/
|
762
|
+
/* */
|
763
|
+
/* store_lowmem_node */
|
764
|
+
/* */
|
765
|
+
/*******************************************************************/
|
766
|
+
|
767
|
+
static void store_lowmem_node( FILE *file, Node *node,
|
768
|
+
vector<unsigned int> &startpos)
|
769
|
+
{
|
770
|
+
store_node_info( file, node );
|
771
|
+
|
772
|
+
// write the arcs
|
773
|
+
for( ArcsIter p(node->arcs()); p; p++ ) {
|
774
|
+
Arc *arc=p;
|
775
|
+
store_arc_label( file, arc );
|
776
|
+
unsigned int t=startpos[arc->target_node()->index];
|
777
|
+
fwrite(&t,sizeof(t),1,file);
|
778
|
+
}
|
779
|
+
}
|
780
|
+
|
781
|
+
|
782
|
+
/*******************************************************************/
|
783
|
+
/* */
|
784
|
+
/* Transducer::store_lowmem */
|
785
|
+
/* */
|
786
|
+
/*******************************************************************/
|
787
|
+
|
788
|
+
void Transducer::store_lowmem( FILE *file )
|
789
|
+
|
790
|
+
{
|
791
|
+
fputc('l',file);
|
792
|
+
alphabet.store(file);
|
793
|
+
|
794
|
+
// storing size of index table
|
795
|
+
vector<Node*> nodearray;
|
796
|
+
nodeindexing( &nodearray );
|
797
|
+
|
798
|
+
// compute the start position of the first node
|
799
|
+
unsigned int pos=(unsigned int)ftell(file);
|
800
|
+
vector<unsigned int> startpos;
|
801
|
+
for( size_t i=0; i<nodearray.size(); i++ ) {
|
802
|
+
startpos.push_back(pos);
|
803
|
+
Node *node=nodearray[i];
|
804
|
+
Arcs *arcs=node->arcs();
|
805
|
+
pos += (unsigned)(sizeof(char) // size of final flag
|
806
|
+
+ sizeof(unsigned short) // size of number of arcs
|
807
|
+
+ arcs->size() * (sizeof(Character) * 2 + sizeof(unsigned int))); // size of n arcs
|
808
|
+
}
|
809
|
+
|
810
|
+
// storing nodes
|
811
|
+
for( size_t i=0; i<nodearray.size(); i++ )
|
812
|
+
store_lowmem_node( file, nodearray[i], startpos );
|
813
|
+
}
|
814
|
+
|
815
|
+
|
816
|
+
/*******************************************************************/
|
817
|
+
/* */
|
818
|
+
/* Transducer::store */
|
819
|
+
/* */
|
820
|
+
/*******************************************************************/
|
821
|
+
|
822
|
+
void Transducer::store( FILE *file )
|
823
|
+
|
824
|
+
{
|
825
|
+
fputc('a',file);
|
826
|
+
|
827
|
+
vector<Node*> nodearray;
|
828
|
+
nodeindexing( &nodearray );
|
829
|
+
incr_vmark();
|
830
|
+
unsigned int n=(unsigned)nodearray.size();
|
831
|
+
fwrite(&n,sizeof(n),1,file);
|
832
|
+
store_node( file, root_node(), vmark );
|
833
|
+
|
834
|
+
alphabet.store(file);
|
835
|
+
}
|
836
|
+
|
837
|
+
|
838
|
+
/*******************************************************************/
|
839
|
+
/* */
|
840
|
+
/* read_node */
|
841
|
+
/* */
|
842
|
+
/*******************************************************************/
|
843
|
+
|
844
|
+
static void read_node( FILE *file, Node *node, Node **p, Transducer *a )
|
845
|
+
{
|
846
|
+
char c;
|
847
|
+
fread(&c,sizeof(c),1,file);
|
848
|
+
node->set_final(c);
|
849
|
+
|
850
|
+
unsigned short n;
|
851
|
+
fread( &n, sizeof(n), 1, file);
|
852
|
+
|
853
|
+
for( int i=0; i<n; i++ ) {
|
854
|
+
Character lc,uc;
|
855
|
+
unsigned int t;
|
856
|
+
fread(&lc,sizeof(lc),1,file);
|
857
|
+
fread(&uc,sizeof(uc),1,file);
|
858
|
+
fread(&t,sizeof(t),1,file);
|
859
|
+
if (ferror(file))
|
860
|
+
throw "Error encountered while reading transducer from file";
|
861
|
+
if (p[t])
|
862
|
+
node->add_arc( Label(lc,uc), p[t], a );
|
863
|
+
else {
|
864
|
+
p[t] = a->new_node();
|
865
|
+
node->add_arc( Label(lc,uc), p[t], a );
|
866
|
+
read_node(file, p[t], p, a );
|
867
|
+
}
|
868
|
+
}
|
869
|
+
}
|
870
|
+
|
871
|
+
|
872
|
+
/*******************************************************************/
|
873
|
+
/* */
|
874
|
+
/* Transducer::read_transducer_binary */
|
875
|
+
/* */
|
876
|
+
/*******************************************************************/
|
877
|
+
|
878
|
+
void Transducer::read_transducer_binary( FILE *file )
|
879
|
+
|
880
|
+
{
|
881
|
+
if (fgetc(file) != 'a')
|
882
|
+
throw "Error: wrong file format (not a standard transducer)\n";
|
883
|
+
|
884
|
+
vmark = deterministic = 0;
|
885
|
+
unsigned int n;
|
886
|
+
fread(&n,sizeof(n),1,file); // number of nodes
|
887
|
+
if (ferror(file))
|
888
|
+
throw "Error encountered while reading transducer from file";
|
889
|
+
|
890
|
+
Node **p=new Node*[n]; // maps indices to nodes
|
891
|
+
p[0] = root_node();
|
892
|
+
for( unsigned int i=1; i<n; i++)
|
893
|
+
p[i] = NULL;
|
894
|
+
read_node( file, root_node(), p, this );
|
895
|
+
delete[] p;
|
896
|
+
|
897
|
+
alphabet.read(file);
|
898
|
+
|
899
|
+
vmark = 1;
|
900
|
+
deterministic = minimised = 1;
|
901
|
+
}
|
902
|
+
|
903
|
+
|
904
|
+
/*******************************************************************/
|
905
|
+
/* */
|
906
|
+
/* error_message */
|
907
|
+
/* */
|
908
|
+
/*******************************************************************/
|
909
|
+
|
910
|
+
// Formats an error message that names `line` and throws it.  The text
// lives in a static buffer, so the thrown char* stays valid after the
// function has been left.  Never returns normally.
static void error_message( size_t line )
{
  static char text[1000];
  const unsigned int line_no = (unsigned int)line;

  sprintf(text, "Error: in line %u of text transducer file", line_no);
  throw text;
}
|
918
|
+
|
919
|
+
|
920
|
+
/*******************************************************************/
|
921
|
+
/* */
|
922
|
+
/* Transducer::create_node */
|
923
|
+
/* */
|
924
|
+
/*******************************************************************/
|
925
|
+
|
926
|
+
// Parses the decimal node number at the start of `s` and returns the node
// registered under that number in `node`, creating it (and growing the
// table) if necessary.  Calls error_message(line) if `s` does not start
// with a number or the number is negative.
Node *Transducer::create_node( vector<Node*> &node, char *s, size_t line )
{
  char *end;
  long n = strtol(s, &end, 10);

  if (end == s || n < 0)
    error_message( line );

  // grow the table so that index n exists
  if (n >= (long)node.size())
    node.resize(n+1, NULL);

  Node *&slot = node[n];
  if (slot == NULL)
    slot = new_node();

  return slot;
}
|
941
|
+
|
942
|
+
|
943
|
+
/*******************************************************************/
|
944
|
+
/* */
|
945
|
+
/* next_string */
|
946
|
+
/* */
|
947
|
+
/*******************************************************************/
|
948
|
+
|
949
|
+
static char *next_string( char* &s, size_t line )
|
950
|
+
|
951
|
+
{
|
952
|
+
// scan the input up to the next tab or newline character
|
953
|
+
// and unquote symbols preceded by a backslash
|
954
|
+
char *p = s;
|
955
|
+
char *q = s;
|
956
|
+
while (*q!=0 && *q!='\t' && *q!='\n' && *q!='\r') {
|
957
|
+
if (*q == '\\')
|
958
|
+
q++;
|
959
|
+
*(p++) = *(q++);
|
960
|
+
}
|
961
|
+
if (p == s)
|
962
|
+
error_message(line); // no string found
|
963
|
+
|
964
|
+
char *result=s;
|
965
|
+
// skip over following whitespace
|
966
|
+
while (*q == ' ' || *q == '\t' || *q == '\n' || *q == '\r')
|
967
|
+
q++;
|
968
|
+
|
969
|
+
if (*q == 0)
|
970
|
+
s = NULL; // end of string was reached
|
971
|
+
else
|
972
|
+
s = q; // move the string pointer s
|
973
|
+
|
974
|
+
*p = 0; // mark the end of the result string
|
975
|
+
|
976
|
+
return result;
|
977
|
+
}
|
978
|
+
|
979
|
+
|
980
|
+
/*******************************************************************/
|
981
|
+
/* */
|
982
|
+
/* Transducer::read_transducer_text */
|
983
|
+
/* */
|
984
|
+
/*******************************************************************/
|
985
|
+
|
986
|
+
void Transducer::read_transducer_text( FILE *file )
|
987
|
+
|
988
|
+
{
|
989
|
+
vector<Node*> nodes;
|
990
|
+
nodes.push_back(root_node());
|
991
|
+
|
992
|
+
vmark = deterministic = 0;
|
993
|
+
char buffer[10000];
|
994
|
+
for( size_t line=0; fgets(buffer, 10000, file ); line++ ) {
|
995
|
+
char *p = buffer;
|
996
|
+
char *s = next_string(p, line);
|
997
|
+
Node *node = create_node( nodes, s, line );
|
998
|
+
if (p == NULL)
|
999
|
+
node->set_final(true);
|
1000
|
+
else {
|
1001
|
+
s = next_string(p, line);
|
1002
|
+
Node *target = create_node( nodes, s, line );
|
1003
|
+
|
1004
|
+
s = next_string(p, line);
|
1005
|
+
Character lc = alphabet.add_symbol(s);
|
1006
|
+
s = next_string(p, line);
|
1007
|
+
Character uc = alphabet.add_symbol(s);
|
1008
|
+
Label l(lc,uc);
|
1009
|
+
if (l == Label::epsilon)
|
1010
|
+
error_message( line );
|
1011
|
+
|
1012
|
+
alphabet.insert(l);
|
1013
|
+
node->add_arc( l, target, this );
|
1014
|
+
}
|
1015
|
+
}
|
1016
|
+
|
1017
|
+
vmark = 1;
|
1018
|
+
deterministic = minimised = 1;
|
1019
|
+
}
|
1020
|
+
|
1021
|
+
|
1022
|
+
/*******************************************************************/
|
1023
|
+
/* */
|
1024
|
+
/* Transducer::Transducer */
|
1025
|
+
/* */
|
1026
|
+
/*******************************************************************/
|
1027
|
+
|
1028
|
+
// Constructs a transducer by reading it from `file`, which is expected to
// be in the binary format if `binary` is true and in the text format
// otherwise.
Transducer::Transducer( FILE *file, bool binary )
{
  // start from an empty, unindexed transducer
  indexed = false;
  transition_count = 0;
  node_count = 0;

  if (!binary)
    read_transducer_text( file );
  else
    read_transducer_binary( file );
}
|
1038
|
+
|
1039
|
+
|
1040
|
+
/* EPSILON REMOVAL ALGORITHM written by Erik Axelson starts here */
|
1041
|
+
|
1042
|
+
/*******************************************************************/
|
1043
|
+
/* */
|
1044
|
+
/* node_in_copy_tr */
|
1045
|
+
/* */
|
1046
|
+
/*******************************************************************/
|
1047
|
+
|
1048
|
+
/* Find the corresponding node in 'copy_tr' for 'node'. If needed, create a new node in 'copy_tr'
|
1049
|
+
and update 'mapper' accordingly. */
|
1050
|
+
|
1051
|
+
// Returns the node of `copy_tr` that `mapper` associates with the index
// of `node`.  If there is none yet, a new node is created in `copy_tr`
// (final if `node` is final), recorded in `mapper` and returned.
Node *node_in_copy_tr( Node *node, Transducer *copy_tr, map<int, Node*> &mapper ) {
  const int key = (int)node->index; // node index in original transducer

  map<int,Node*>::iterator found = mapper.find(key);
  if (found != mapper.end())
    return found->second;   // already has a counterpart

  Node *counterpart = copy_tr->new_node();
  if (node->is_final())
    counterpart->set_final(true);
  mapper[key] = counterpart;
  return counterpart;
}
|
1064
|
+
|
1065
|
+
|
1066
|
+
/*******************************************************************/
|
1067
|
+
/* */
|
1068
|
+
/* Transducer::copy_nodes */
|
1069
|
+
/* */
|
1070
|
+
/*******************************************************************/
|
1071
|
+
|
1072
|
+
/* Recursive epsilon removal algorithm. Copies arcs and their
|
1073
|
+
target nodes starting from search_node to node copy_tr_start_node
|
1074
|
+
in transducer copy_tr. mapper is used to associate nodes
|
1075
|
+
with nodes in copy_tr. */
|
1076
|
+
|
1077
|
+
// Copies the arcs leaving `search_node` to `copy_tr_start_node` in
// `copy_tr`, skipping epsilon arcs:
//  - an epsilon arc is followed without creating anything in copy_tr; the
//    arcs of its target are attached to the same copy_tr_start_node, which
//    also becomes final if the target is final;
//  - any other arc is re-created in copy_tr, and its target is expanded
//    recursively unless was_visited(vmark) reports it as already seen.
// `mapper` associates node indices of this transducer with nodes of copy_tr.
void Transducer::copy_nodes( Node *search_node, Transducer *copy_tr,
                             Node *copy_tr_start_node,
                             map<int, Node*> &mapper ) {

  // (the iterator lists the epsilon arcs first)
  for( ArcsIter it(search_node->arcs()); it; it++ ) {
    Arc arc=*it;
    Node *target = arc.target_node();

    if (!arc.label().is_epsilon()) {
      // counterpart of the target node in copy_tr
      Node *copy_target = node_in_copy_tr(target, copy_tr, mapper);

      copy_tr_start_node->add_arc( Label(arc.label().lower_char(),
                                         arc.label().upper_char()),
                                   copy_target,
                                   copy_tr );

      if ( !(target->was_visited(vmark)) )
        copy_nodes(target, copy_tr, copy_target, mapper);
      continue;
    }

    // Epsilon arc.  'forward' (originally NULL) serves as a flag: if it
    // already equals copy_tr_start_node we are inside an epsilon loop for
    // this start node and must not descend again.
    if (search_node->forward() == copy_tr_start_node)
      continue;

    search_node->set_forward(copy_tr_start_node); // set epsilon flag
    if (target->is_final())
      copy_tr_start_node->set_final(true);
    copy_nodes(target, copy_tr, copy_tr_start_node, mapper);
    search_node->set_forward(NULL);               // remove epsilon flag
  }
}
|
1114
|
+
|
1115
|
+
|
1116
|
+
/*******************************************************************/
|
1117
|
+
/* */
|
1118
|
+
/* Transducer::remove_epsilons */
|
1119
|
+
/* */
|
1120
|
+
/*******************************************************************/
|
1121
|
+
|
1122
|
+
// Returns a newly allocated transducer built by copy_nodes from this one,
// i.e. without its epsilon arcs.  If this transducer is flagged as
// deterministic or minimised, a plain copy is returned instead.
// The result is a reference to a heap object created here.
Transducer &Transducer::remove_epsilons()
{
  if ( deterministic || minimised )
    return this->copy();

  nodeindexing();
  incr_vmark();

  Transducer *copy_tr = new Transducer();
  copy_tr->alphabet.copy(alphabet);

  Node *root = root_node();
  Node *copy_root = copy_tr->root_node();

  // mark root node as visited
  root->was_visited(vmark);

  // the copy's root is final whenever the original root is
  if (root->is_final())
    copy_root->set_final(true);

  // associate the two root nodes (node indexing for root_node is zero)
  map<int, Node*> mapper;
  mapper[0] = copy_root;

  copy_nodes(root, copy_tr, copy_root, mapper);
  incr_vmark();

  return *copy_tr;
}
|
1147
|
+
|
1148
|
+
// EPSILON REMOVAL ALGORITHM ENDS
|
1149
|
+
|
1150
|
+
}
|