ruby-sfst 0.4.3 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +1 -0
  3. data/COPYING +280 -0
  4. data/Gemfile +3 -0
  5. data/Gemfile.lock +54 -0
  6. data/README.md +1 -1
  7. data/Rakefile +9 -18
  8. data/bin/console +7 -0
  9. data/bin/setup +6 -0
  10. data/ext/sfst/alphabet.cc +879 -0
  11. data/ext/sfst/alphabet.h +302 -0
  12. data/ext/sfst/basic.cc +85 -0
  13. data/ext/{sfst_machine → sfst}/basic.h +7 -4
  14. data/ext/sfst/compact.cc +629 -0
  15. data/ext/sfst/compact.h +100 -0
  16. data/ext/sfst/determinise.cc +279 -0
  17. data/ext/{sfst_machine → sfst}/extconf.rb +2 -1
  18. data/ext/sfst/fst.cc +1150 -0
  19. data/ext/sfst/fst.h +374 -0
  20. data/ext/sfst/hopcroft.cc +681 -0
  21. data/ext/sfst/interface.cc +1921 -0
  22. data/ext/sfst/interface.h +171 -0
  23. data/ext/sfst/make-compact.cc +323 -0
  24. data/ext/{sfst_machine → sfst}/make-compact.h +15 -13
  25. data/ext/sfst/mem.h +80 -0
  26. data/ext/sfst/operators.cc +1273 -0
  27. data/ext/{sfst_machine → sfst}/sfst_machine.cc +89 -78
  28. data/ext/sfst/sgi.h +72 -0
  29. data/ext/sfst/utf8.cc +149 -0
  30. data/ext/{sfst_machine → sfst}/utf8.h +7 -4
  31. data/lib/sfst.rb +2 -1
  32. data/lib/sfst/version.rb +1 -1
  33. data/ruby-sfst.gemspec +23 -23
  34. metadata +107 -35
  35. data/ext/sfst_machine/alphabet.cc +0 -812
  36. data/ext/sfst_machine/alphabet.h +0 -273
  37. data/ext/sfst_machine/basic.cc +0 -84
  38. data/ext/sfst_machine/compact.cc +0 -616
  39. data/ext/sfst_machine/compact.h +0 -98
  40. data/ext/sfst_machine/determinise.cc +0 -303
  41. data/ext/sfst_machine/fst.cc +0 -1000
  42. data/ext/sfst_machine/fst.h +0 -369
  43. data/ext/sfst_machine/interface.cc +0 -1842
  44. data/ext/sfst_machine/interface.h +0 -93
  45. data/ext/sfst_machine/make-compact.cc +0 -327
  46. data/ext/sfst_machine/mem.h +0 -74
  47. data/ext/sfst_machine/operators.cc +0 -1131
  48. data/ext/sfst_machine/sgi.h +0 -44
  49. data/ext/sfst_machine/utf8.cc +0 -146
  50. data/test/test_sfst.fst +0 -3
  51. data/test/test_sfst.rb +0 -114
@@ -0,0 +1,100 @@
1
+ /*******************************************************************/
2
+ /* */
3
+ /* FILE compact.h */
4
+ /* MODULE compact */
5
+ /* PROGRAM SFST */
6
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
7
+ /* */
8
+ /* PURPOSE finite state tools */
9
+ /* */
10
+ /*******************************************************************/
11
+
12
+ #ifndef _COMPACT_H_
13
+ #define _COMPACT_H_
14
+
15
+ #include "alphabet.h"
16
+
17
+ #include <vector>
18
+
19
+ namespace SFST {
20
+
21
+ typedef std::vector<unsigned int> CAnalysis;
22
+
23
+ class CompactTransducer {
24
+
25
+ protected:
26
+
27
+ // the following data structures are used to store the nodes
28
+
29
+ unsigned int number_of_nodes; // number of nodes in the transducer
30
+ char *finalp; // finalp[i] is 1 if node i is final and 0 otherwise
31
+ unsigned int *first_arc; // first_arc[i] is the number of the first
32
+ // arc outgoing from node i
33
+
34
+ // the following data structures are used to store the transition arcs
35
+
36
+ unsigned int number_of_arcs; // total number of arcs in the transducer
37
+ Label *label; // the label (character pair) of arc i
38
+ unsigned int *target_node; // target node of arc i
39
+
40
+ // the following data structures are used to store the stochastic parameters
41
+ float *final_logprob;
42
+ float *arc_logprob;
43
+
44
+ // functions needed to read the transducer from a file
45
+
46
+ void read_finalp( FILE *file );
47
+ void read_first_arcs( FILE *file );
48
+ void read_target_nodes( FILE *file );
49
+ void read_labels( FILE *file );
50
+ void read_probs( FILE *file );
51
+
52
+ // functions needed to analyze data with the transducer
53
+
54
+ void analyze( unsigned int n, std::vector<Character> &ch, size_t ipos,
55
+ CAnalysis&, std::vector<CAnalysis>&);
56
+
57
+ // function selecting the simplest morphological analysis
58
+
59
+ int compute_score( CAnalysis &ana );
60
+ void disambiguate( std::vector<CAnalysis> &analyses );
61
+
62
+ // functions for longest-match analysis of input data
63
+
64
+ void longest_match2(unsigned int, char*, int, CAnalysis&, int&, CAnalysis&);
65
+
66
+ void convert( CAnalysis &cana, Analysis &ana );
67
+
68
+ public:
69
+ size_t node_count() { return number_of_nodes; };
70
+ size_t arc_count() { return number_of_arcs; };
71
+
72
+ bool both_layers; // print surface and analysis symbols
73
+ bool simplest_only; // print only the simplest analyses
74
+
75
+ Alphabet alphabet; // data structure which maps symbols to numeric codes
76
+ CompactTransducer(); // dummy constructor
77
+ CompactTransducer( FILE*, FILE *pfile=NULL ); // reads a (stochastic) transducer
78
+ ~CompactTransducer(); // destroys a transducer
79
+
80
+ // the analysis function returns the set of analyses for the string "s"
81
+ // in the argument "analyses"
82
+ void analyze_string( char *s, std::vector<CAnalysis > &analyses );
83
+
84
+ void compute_probs( std::vector<CAnalysis> &analyses, std::vector<double> &prob );
85
+ char *print_analysis( CAnalysis &ana );
86
+
87
+ // longest-match analysis
88
+ const char *longest_match( char*& );
89
+
90
+ // EM training
91
+ bool train2( char *s, std::vector<double> &arcfreq, std::vector<double> &finalfreq );
92
+ bool train( char *s, std::vector<double> &arcfreq, std::vector<double> &finalfreq );
93
+ void estimate_probs( std::vector<double> &arcfreq, std::vector<double> &finalfreq );
94
+
95
+ // robust analysis
96
+ float robust_analyze_string( char *string, std::vector<CAnalysis> &analyses,
97
+ float ErrorsAllowed );
98
+ };
99
+ }
100
+ #endif
@@ -0,0 +1,279 @@
1
+
2
+ /*******************************************************************/
3
+ /* */
4
+ /* FILE determinise.C */
5
+ /* MODULE determinise */
6
+ /* PROGRAM SFST */
7
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
8
+ /* */
9
+ /*******************************************************************/
10
+
11
+
12
+ #include "fst.h"
13
+
14
+ using std::vector;
15
+ using std::pair;
16
+ using std::set;
17
+
18
+ namespace SFST {
19
+
20
+
21
+ /***************** class NodeSet *********************************/
22
+
23
+ class NodeSet {
24
+ // This class is used to store a set of nodes.
25
+ // Whenever a new node is added, all nodes accessible
26
+ // through epsilon transitions are added as well.
27
+
28
+ private:
29
+ set<Node*> ht;
30
+
31
+ public:
32
+ typedef set<Node*>::iterator iterator;
33
+ NodeSet() {};
34
+ void add( Node* );
35
+ bool insert(Node *node) {
36
+ pair<iterator, bool> result = ht.insert(node);
37
+ return result.second;
38
+ };
39
+ iterator begin() const { return ht.begin(); }
40
+ iterator end() const { return ht.end(); }
41
+ size_t size() const { return ht.size(); }
42
+ void clear() { ht.clear(); }
43
+ };
44
+
45
+ typedef map<const Label, NodeSet> Label2NodeSet;
46
+
47
+
48
+ /***************** class NodeArray *******************************/
49
+
50
+ class NodeArray {
51
+
52
+ private:
53
+ size_t sizev;
54
+ bool final;
55
+ Node **node;
56
+
57
+ public:
58
+ NodeArray( NodeSet& );
59
+ ~NodeArray() { delete[] node; };
60
+ size_t size() const { return sizev; }
61
+ bool is_final() const { return final; };
62
+ Node* &operator[]( size_t i ) const { return node[i]; }
63
+ };
64
+
65
+
66
+ /***************** class DTransition *****************************/
67
+
68
+ class DTransition {
69
+ public:
70
+ Label label;
71
+ NodeArray *nodes;
72
+ DTransition(Label l, NodeArray *na) { label = l; nodes = na; };
73
+ };
74
+
75
+
76
+ /***************** class NodeMapping ****************************/
77
+
78
+ class NodeMapping {
79
+ // This class is used to map a node set from one transducer
80
+ // to a single node in another transducer
81
+
82
+ private:
83
+ struct hashf {
84
+ size_t operator()(const NodeArray *na) const {
85
+ size_t key=na->size() ^ na->is_final();
86
+ for( size_t i=0; i<na->size(); i++)
87
+ key = (key<<1) ^ (size_t)(*na)[i];
88
+ return key;
89
+ }
90
+ };
91
+ struct equalf {
92
+ int operator()(const NodeArray *na1, const NodeArray *na2) const {
93
+ if (na1->size() != na2->size() || na1->is_final() != na2->is_final())
94
+ return 0;
95
+ for( size_t i=0; i<na1->size(); i++)
96
+ if ((*na1)[i] != (*na2)[i])
97
+ return 0;
98
+ return 1;
99
+ }
100
+ };
101
+ typedef hash_map<NodeArray*, Node*, hashf, equalf> NodeMap;
102
+ NodeMap hm;
103
+
104
+ public:
105
+ typedef NodeMap::iterator iterator;
106
+ ~NodeMapping();
107
+ iterator begin() { return hm.begin(); };
108
+ iterator end() { return hm.end(); };
109
+ iterator find( NodeArray *na) { return hm.find( na ); };
110
+ Node* &operator[]( NodeArray *na ) { return hm.operator[](na); };
111
+
112
+ };
113
+
114
+
115
+ static void determinise_node( NodeArray&, Node*, Transducer*, NodeMapping& );
116
+
117
+
118
+ /*******************************************************************/
119
+ /* */
120
+ /* NodeSet::add */
121
+ /* */
122
+ /*******************************************************************/
123
+
124
+ void NodeSet::add( Node *node )
125
+
126
+ {
127
+ pair<iterator, bool> result = ht.insert(node);
128
+ if (result.second) {
129
+ // new node, add nodes reachable with epsilon transitions
130
+ for( ArcsIter p(node->arcs(),ArcsIter::eps); p; p++ ) {
131
+ Arc *arc=p;
132
+ if (!arc->label().is_epsilon())
133
+ break;
134
+ add(arc->target_node());
135
+ }
136
+ }
137
+ }
138
+
139
+
140
+ /*******************************************************************/
141
+ /* */
142
+ /* NodeArray::NodeArray */
143
+ /* */
144
+ /*******************************************************************/
145
+
146
+ NodeArray::NodeArray( NodeSet &ns )
147
+
148
+ {
149
+ sizev = 0;
150
+ NodeSet::iterator it;
151
+
152
+ final = false;
153
+ node = new Node*[ns.size()];
154
+ for( it=ns.begin(); it!=ns.end(); it++ ) {
155
+ Node *nn = *it;
156
+ if (nn->arcs()->non_epsilon_transition_exists())
157
+ node[sizev++] = nn;
158
+ if (nn->is_final())
159
+ final = true;
160
+ }
161
+ }
162
+
163
+
164
+ /*******************************************************************/
165
+ /* */
166
+ /* NodeMapping::~NodeMapping */
167
+ /* */
168
+ /*******************************************************************/
169
+
170
+ NodeMapping::~NodeMapping()
171
+
172
+ {
173
+ // if we delete NodeArrays without removing them from NodeMapping,
174
+ // the system will crash when NodeMapping is deleted.
175
+ for( iterator it=hm.begin(); it!=hm.end(); ) {
176
+ NodeArray *na=it->first;
177
+ iterator old = it++;
178
+ hm.erase(old);
179
+ delete na;
180
+ }
181
+ }
182
+
183
+
184
+ /*******************************************************************/
185
+ /* */
186
+ /* compute_transitions */
187
+ /* */
188
+ /*******************************************************************/
189
+
190
+ static void compute_transitions( NodeArray &na, vector<DTransition> &t )
191
+
192
+ {
193
+ Label2NodeSet lmap;
194
+
195
+ // for all nodes in the current set
196
+ for( size_t i=0; i<na.size(); i++) {
197
+ Node *n = na[i]; // old node
198
+
199
+ // For each non-epsilon transition, add the target node
200
+ // to the respective node set.
201
+ for( ArcsIter p(n->arcs(), ArcsIter::non_eps); p; p++ ) {
202
+ Arc *arc=p;
203
+ lmap[arc->label()].add(arc->target_node());
204
+ }
205
+ }
206
+
207
+ t.reserve(lmap.size());
208
+ for( Label2NodeSet::iterator it=lmap.begin(); it!=lmap.end(); it++ ) {
209
+ t.push_back(DTransition(it->first, new NodeArray( it->second )));
210
+ }
211
+ }
212
+
213
+
214
+ /*******************************************************************/
215
+ /* */
216
+ /* determinise_node */
217
+ /* */
218
+ /*******************************************************************/
219
+
220
+ static void determinise_node( NodeArray &na, Node *node, Transducer *a,
221
+ NodeMapping &map )
222
+ {
223
+ node->set_final(na.is_final());
224
+
225
+ vector<DTransition> t;
226
+ compute_transitions( na, t );
227
+
228
+ for( size_t i=0; i<t.size(); i++ ) {
229
+ NodeMapping::iterator it=map.find(t[i].nodes);
230
+ if (it == map.end()) {
231
+ // new node set
232
+ Node *target_node = a->new_node();
233
+ map[t[i].nodes] = target_node;
234
+ node->add_arc( t[i].label, target_node, a );
235
+ determinise_node( *t[i].nodes, target_node, a, map );
236
+ }
237
+ else {
238
+ delete t[i].nodes;
239
+ node->add_arc( t[i].label, it->second, a );
240
+ }
241
+ }
242
+ }
243
+
244
+
245
+ /*******************************************************************/
246
+ /* */
247
+ /* Transducer::determinise */
248
+ /* */
249
+ /*******************************************************************/
250
+
251
+ Transducer &Transducer::determinise( bool copy_alphabet )
252
+
253
+ {
254
+ if (deterministic)
255
+ return copy();
256
+
257
+ Transducer *a = new Transducer();
258
+ if (copy_alphabet)
259
+ a->alphabet.copy(alphabet);
260
+
261
+ // creation of the initial node set consisting of all nodes
262
+ // reachable from the start node via epsilon transitions.
263
+ NodeArray *na;
264
+ {
265
+ NodeSet ns;
266
+ ns.add(root_node());
267
+ na = new NodeArray(ns);
268
+ }
269
+
270
+ // map the node set to the new root node
271
+ NodeMapping map;
272
+ map[na] = a->root_node();
273
+
274
+ // determinise the transducer recursively
275
+ determinise_node( *na, a->root_node(), a, map );
276
+ a->deterministic = 1;
277
+ return *a;
278
+ }
279
+ }
@@ -1,5 +1,6 @@
1
1
  require 'mkmf'
2
+
2
3
  CONFIG['CC'] = 'g++'
3
4
  CONFIG['CXX'] = 'g++'
4
5
  $CPPFLAGS='-Wall -O3 -Wall -Wcast-qual -Wconversion -DSGI__gnu_cxx -DREADLINE'
5
- create_makefile "sfst_machine"
6
+ create_makefile('sfst/sfst')
@@ -0,0 +1,1150 @@
1
+
2
+ /*******************************************************************/
3
+ /* */
4
+ /* FILE fst.C */
5
+ /* MODULE fst */
6
+ /* PROGRAM SFST */
7
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
8
+ /* */
9
+ /* PURPOSE basic FST functions */
10
+ /* */
11
+ /*******************************************************************/
12
+
13
+ #include "fst.h"
14
+
15
+ namespace SFST {
16
+
17
+ using std::vector;
18
+ using std::istream;
19
+ using std::ostream;
20
+ using std::cerr;
21
+
22
+ const int BUFFER_SIZE=100000;
23
+
24
+
25
+ /*******************************************************************/
26
+ /* */
27
+ /* Arcs::size */
28
+ /* */
29
+ /*******************************************************************/
30
+
31
+ int Arcs::size() const
32
+
33
+ {
34
+ int n=0;
35
+ for( Arc *p=first_arcp; p; p=p->next ) n++;
36
+ for( Arc *p=first_epsilon_arcp; p; p=p->next ) n++;
37
+ return n;
38
+ }
39
+
40
+
41
+ /*******************************************************************/
42
+ /* */
43
+ /* Arcs::target_node */
44
+ /* */
45
+ /*******************************************************************/
46
+
47
+ Node *Arcs::target_node( Label l )
48
+
49
+ {
50
+ Arc *arc;
51
+
52
+ for( arc=first_arcp; arc; arc=arc->next)
53
+ if (arc->label() == l)
54
+ return arc->target_node();
55
+
56
+ return NULL;
57
+ }
58
+
59
+ const Node *Arcs::target_node( Label l ) const
60
+
61
+ {
62
+ const Arc *arc;
63
+
64
+ for( arc=first_arcp; arc; arc=arc->next)
65
+ if (arc->label() == l)
66
+ return arc->target_node();
67
+
68
+ return NULL;
69
+ }
70
+
71
+
72
+ /*******************************************************************/
73
+ /* */
74
+ /* Transducer::new_node */
75
+ /* */
76
+ /*******************************************************************/
77
+
78
+ Node *Transducer::new_node()
79
+
80
+ {
81
+ Node *node=(Node*)mem.alloc( sizeof(Node) );
82
+
83
+ node->init();
84
+ return node;
85
+ }
86
+
87
+
88
+ /*******************************************************************/
89
+ /* */
90
+ /* Transducer::new_arc */
91
+ /* */
92
+ /*******************************************************************/
93
+
94
+ Arc *Transducer::new_arc( Label l, Node *target )
95
+
96
+ {
97
+ Arc *arc=(Arc*)mem.alloc( sizeof(Arc) );
98
+ arc->init( l, target);
99
+ return arc;
100
+ }
101
+
102
+
103
+ /*******************************************************************/
104
+ /* */
105
+ /* Arcs::add_arc */
106
+ /* */
107
+ /*******************************************************************/
108
+
109
+ void Arcs::add_arc( Label l, Node *node, Transducer *a )
110
+
111
+ {
112
+ Arc *arc=a->new_arc( l, node );
113
+
114
+ if (l.is_epsilon()) {
115
+ arc->next = first_epsilon_arcp;
116
+ first_epsilon_arcp = arc;
117
+ }
118
+ else {
119
+ arc->next = first_arcp;
120
+ first_arcp = arc;
121
+ }
122
+ }
123
+
124
+
125
+ /*******************************************************************/
126
+ /* */
127
+ /* Arcs::remove_arc */
128
+ /* */
129
+ /*******************************************************************/
130
+
131
+ int Arcs::remove_arc( Arc *arc )
132
+
133
+ {
134
+ Arc **p = (arc->label().is_epsilon()) ? &first_epsilon_arcp : &first_arcp;
135
+ for( ; *p; p=&(*p)->next )
136
+ if (*p == arc) {
137
+ *p = arc->next;
138
+ return 1;
139
+ }
140
+ return 0;
141
+ }
142
+
143
+
144
+ /*******************************************************************/
145
+ /* */
146
+ /* Node::init */
147
+ /* */
148
+ /*******************************************************************/
149
+
150
+ void Node::init()
151
+
152
+ {
153
+ final = false;
154
+ visited = 0;
155
+ arcsp.init();
156
+ forwardp = NULL;
157
+ }
158
+
159
+
160
+ /*******************************************************************/
161
+ /* */
162
+ /* Node::clear_visited */
163
+ /* */
164
+ /*******************************************************************/
165
+
166
+ void Node::clear_visited( NodeHashSet &nodeset )
167
+
168
+ {
169
+ if (nodeset.find( this ) == nodeset.end()) {
170
+ visited = 0;
171
+ nodeset.insert( this );
172
+ fprintf(stderr," %lu", (unsigned long)nodeset.size());
173
+ for( ArcsIter p(arcs()); p; p++ ) {
174
+ Arc *arc=p;
175
+ arc->target_node()->clear_visited( nodeset );
176
+ }
177
+ }
178
+ }
179
+
180
+
181
+ /*******************************************************************/
182
+ /* */
183
+ /* Transducer::index_nodes */
184
+ /* */
185
+ /*******************************************************************/
186
+
187
+ void Transducer::index_nodes( Node *node, vector<Node*> *nodearray )
188
+
189
+ {
190
+ if (!node->was_visited( vmark )) {
191
+ node->index = (Index)node_count++;
192
+ if (nodearray)
193
+ nodearray->push_back(node);
194
+
195
+ for( ArcsIter p(node->arcs()); p; p++ ) {
196
+ Arc *arc=p;
197
+ transition_count++;
198
+ index_nodes( arc->target_node(), nodearray );
199
+ }
200
+ }
201
+ }
202
+
203
+
204
+ /*******************************************************************/
205
+ /* */
206
+ /* Transducer::nodeindexing */
207
+ /* */
208
+ /*******************************************************************/
209
+
210
+ std::pair<size_t,size_t> Transducer::nodeindexing( vector<Node*> *nodearray )
211
+
212
+ {
213
+ if (!indexed) {
214
+ incr_vmark();
215
+ index_nodes( root_node(), nodearray );
216
+ indexed = true;
217
+ }
218
+
219
+ return std::pair<size_t,size_t>(node_count, transition_count);
220
+ }
221
+
222
+
223
+ /*******************************************************************/
224
+ /* */
225
+ /* Transducer::add_string */
226
+ /* */
227
+ /*******************************************************************/
228
+
229
+ void Transducer::add_string( char *s, bool extended, Alphabet *a )
230
+
231
+ {
232
+ if (a == NULL)
233
+ a = &alphabet;
234
+
235
+ Node *node=root_node();
236
+ Label l;
237
+ while (!(l = a->next_label(s, extended)).is_epsilon()) {
238
+ a->insert(l);
239
+ Arcs *arcs=node->arcs();
240
+ node = arcs->target_node( l );
241
+ if (node == NULL) {
242
+ node = new_node();
243
+ arcs->add_arc( l, node, this );
244
+ }
245
+ }
246
+ node->set_final(1);
247
+ }
248
+
249
+
250
+ /*******************************************************************/
251
+ /* */
252
+ /* Transducer::Transducer */
253
+ /* */
254
+ /*******************************************************************/
255
+
256
+ Transducer::Transducer( vector<Label> &path )
257
+ : root(), mem()
258
+ {
259
+ Node *node=root_node();
260
+
261
+ vmark = 0;
262
+ indexed = false;
263
+ node_count = transition_count = 0;
264
+ deterministic = minimised = true;
265
+ for( size_t i=0; i<path.size(); i++ ) {
266
+ Arcs *arcs=node->arcs();
267
+ node = new_node();
268
+ arcs->add_arc( path[i], node, this );
269
+ }
270
+ node->set_final(1);
271
+ }
272
+
273
+
274
+ /*******************************************************************/
275
+ /* */
276
+ /* Transducer::Transducer */
277
+ /* */
278
+ /*******************************************************************/
279
+
280
+ Transducer::Transducer( istream &is, const Alphabet *a, bool verbose,
281
+ bool lexcomments )
282
+ : root(), mem()
283
+ {
284
+ bool extended=false;
285
+ int n=0;
286
+ char buffer[10000];
287
+
288
+ vmark = 0;
289
+ indexed = false;
290
+ node_count = transition_count = 0;
291
+ deterministic = true;
292
+ minimised = false;
293
+ if (a) {
294
+ alphabet.copy(*a);
295
+ extended = true;
296
+ }
297
+ while (is.getline(buffer, 10000)) {
298
+ if (verbose && ++n % 10000 == 0) {
299
+ if (n == 10000)
300
+ cerr << "\n";
301
+ cerr << "\r" << n << " words";
302
+ }
303
+
304
+ // delete comments
305
+ if (lexcomments) {
306
+ size_t l = strlen(buffer);
307
+ for( size_t i=0; i<l; i++ )
308
+ if (buffer[i] == '\\' && buffer[i+1])
309
+ ; // quoted character
310
+ else if (buffer[i] == '%') {
311
+ // comment starts here
312
+ buffer[i] = 0;
313
+ break;
314
+ }
315
+ if (buffer[0] == 0)
316
+ continue;
317
+ }
318
+
319
+ // delete final whitespace characters
320
+ int l;
321
+ for( l=(int)strlen(buffer)-1; l>=0; l-- )
322
+ if ((buffer[l] != ' ' && buffer[l] != '\t' && buffer[l] != '\r') ||
323
+ (l > 0 && buffer[l-1] == '\\'))
324
+ break;
325
+ buffer[l+1] = 0;
326
+
327
+ add_string(buffer, extended);
328
+ }
329
+ if (verbose && n >= 10000)
330
+ cerr << "\n";
331
+ }
332
+
333
+
334
+ /*******************************************************************/
335
+ /* */
336
+ /* Transducer::Transducer */
337
+ /* */
338
+ /*******************************************************************/
339
+
340
+ Transducer::Transducer( char *s, const Alphabet *a, bool extended )
341
+ : root(), mem()
342
+ {
343
+ vmark = 0;
344
+ indexed = false;
345
+ node_count = transition_count = 0;
346
+ deterministic = minimised = true;
347
+ if (a)
348
+ alphabet.copy(*a);
349
+ add_string(s, extended);
350
+ }
351
+
352
+
353
+ /*******************************************************************/
354
+ /* */
355
+ /* Transducer::clear */
356
+ /* */
357
+ /*******************************************************************/
358
+
359
+ void Transducer::clear()
360
+
361
+ {
362
+ vmark = 0;
363
+ deterministic = minimised = false;
364
+ root.init();
365
+ mem.clear();
366
+ alphabet.clear();
367
+ }
368
+
369
+
370
+ /*******************************************************************/
371
+ /* */
372
+ /* Transducer::store_symbols */
373
+ /* */
374
+ /*******************************************************************/
375
+
376
+ void Transducer::store_symbols(Node *node, SymbolMap &symbol,
377
+ LabelSet &labels)
378
+ {
379
+ if (!node->was_visited( vmark )) {
380
+ Arcs *arcs=node->arcs();
381
+ for( ArcsIter p(arcs); p; p++ ) {
382
+ Arc *arc=p;
383
+ Label l=arc->label();
384
+
385
+ labels.insert(l);
386
+
387
+ Character c = l.upper_char();
388
+ if (symbol.find(c) == symbol.end()) {
389
+ const char *s = alphabet.code2symbol(c);
390
+ if (s)
391
+ symbol[c] = fst_strdup(s);
392
+ }
393
+
394
+ c = l.lower_char();
395
+ if (symbol.find(c) == symbol.end()) {
396
+ const char *s = alphabet.code2symbol(c);
397
+ if (s)
398
+ symbol[c] = fst_strdup(s);
399
+ }
400
+
401
+ store_symbols( arc->target_node(), symbol, labels );
402
+ }
403
+ }
404
+ }
405
+
406
+
407
+ /*******************************************************************/
408
+ /* */
409
+ /* Transducer::minimise_alphabet */
410
+ /* */
411
+ /*******************************************************************/
412
+
413
+ void Transducer::minimise_alphabet()
414
+
415
+ {
416
+ SymbolMap symbols;
417
+ LabelSet labels;
418
+ incr_vmark();
419
+ store_symbols(root_node(), symbols, labels);
420
+ alphabet.clear();
421
+ for( SymbolMap::iterator it=symbols.begin(); it!=symbols.end(); it++ ) {
422
+ alphabet.add_symbol( it->second, it->first );
423
+ free(it->second);
424
+ }
425
+ for( LabelSet::iterator it=labels.begin(); it!=labels.end(); it++ )
426
+ alphabet.insert(*it);
427
+ }
428
+
429
+
430
+ /*******************************************************************/
431
+ /* */
432
+ /* Transducer::size_node */
433
+ /* */
434
+ /*******************************************************************/
435
+
436
+ size_t Transducer::size_node( Node *node )
437
+
438
+ {
439
+ size_t result = 0;
440
+ if (!node->was_visited( vmark )) {
441
+ result++;
442
+ for( ArcsIter it(node->arcs()); it; it++ ) {
443
+ Arc *arc=it;
444
+ result += size_node( arc->target_node() );
445
+ }
446
+ }
447
+ return result;
448
+ }
449
+
450
+
451
+ /*******************************************************************/
452
+ /* */
453
+ /* Transducer::size */
454
+ /* */
455
+ /*******************************************************************/
456
+
457
+ size_t Transducer::size()
458
+
459
+ {
460
+ incr_vmark();
461
+ return size_node(root_node());
462
+ }
463
+
464
+
465
+ /*******************************************************************/
466
+ /* */
467
+ /* Transducer::enumerate_paths_node */
468
+ /* */
469
+ /*******************************************************************/
470
+
471
+ void Transducer::enumerate_paths_node( Node *node, vector<Label> &path,
472
+ NodeHashSet &previous,
473
+ vector<Transducer*> &result )
474
+ {
475
+ if (node->is_final())
476
+ result.push_back(new Transducer(path));
477
+
478
+ for( ArcsIter it(node->arcs()); it; it++ ) {
479
+ Arc *arc=it;
480
+
481
+ NodeHashSet::iterator hsit=previous.insert(node).first;
482
+ path.push_back(arc->label());
483
+ enumerate_paths_node( arc->target_node(), path, previous, result );
484
+ path.pop_back();
485
+ previous.erase(hsit);
486
+ }
487
+ }
488
+
489
+
490
+ /*******************************************************************/
491
+ /* */
492
+ /* Transducer::enumerate_paths */
493
+ /* */
494
+ /*******************************************************************/
495
+
496
+ bool Transducer::enumerate_paths( vector<Transducer*> &result )
497
+
498
+ {
499
+ if (is_infinitely_ambiguous())
500
+ return true;
501
+ for( size_t i=0; i<result.size(); i++ )
502
+ delete result[i];
503
+ result.clear();
504
+
505
+ vector<Label> path;
506
+ NodeHashSet previous;
507
+ enumerate_paths_node( root_node(), path, previous, result );
508
+ return false;
509
+ }
510
+
511
+
512
+
513
+
514
+ /*******************************************************************/
515
+ /* */
516
+ /* Transducer::print_strings_node */
517
+ /* */
518
+ /*******************************************************************/
519
+
520
+ int Transducer::print_strings_node(Node *node, char *buffer, int pos,
521
+ FILE *file, bool with_brackets )
522
+ {
523
+ int result = 0;
524
+
525
+ if (node->was_visited( vmark )) {
526
+ if (node->forward() != NULL) { // cycle detected
527
+ cerr << "Warning: cyclic analyses (cycle aborted)\n";
528
+ return 0;
529
+ }
530
+ node->set_forward(node); // used like a flag for loop detection
531
+ }
532
+ if (pos == BUFFER_SIZE)
533
+ throw "Output string in function print_strings_node is too long";
534
+ if (node->is_final()) {
535
+ buffer[pos] = '\0';
536
+ fprintf(file,"%s\n", buffer);
537
+ result = 1;
538
+ }
539
+ for( ArcsIter i(node->arcs()); i; i++ ) {
540
+ int p=pos;
541
+ Arc *arc=i;
542
+ Label l=arc->label();
543
+ alphabet.write_label(l, buffer, &p, with_brackets);
544
+ result |= print_strings_node(arc->target_node(), buffer, p,
545
+ file, with_brackets );
546
+ }
547
+ node->set_forward(NULL);
548
+
549
+ return result;
550
+ }
551
+
552
+
553
+ /*******************************************************************/
554
+ /* */
555
+ /* Transducer::print_strings */
556
+ /* */
557
+ /*******************************************************************/
558
+
559
+ int Transducer::print_strings( FILE *file, bool with_brackets )
560
+
561
+ {
562
+ char buffer[BUFFER_SIZE];
563
+ incr_vmark();
564
+ return print_strings_node( root_node(), buffer, 0, file, with_brackets );
565
+ }
566
+
567
+
568
+ /*******************************************************************/
569
+ /* */
570
+ /* Transducer::analyze_string */
571
+ /* */
572
+ /*******************************************************************/
573
+
574
+ bool Transducer::analyze_string( char *string, FILE *file, bool with_brackets )
575
+
576
+ {
577
+ vector<Character> input;
578
+ alphabet.string2symseq( string, input );
579
+ vector<Label> labels;
580
+ for( size_t i=0; i<input.size(); i++ )
581
+ labels.push_back(Label(input[i]));
582
+
583
+ Transducer a1(labels);
584
+ Transducer *a2=&(*this || a1);
585
+ Transducer *a3=&(a2->lower_level());
586
+ delete a2;
587
+ a2 = &a3->minimise();
588
+ delete a3;
589
+
590
+ a2->alphabet.copy(alphabet);
591
+ bool result = a2->print_strings( file, with_brackets );
592
+ delete a2;
593
+ return result;
594
+ }
595
+
596
+
597
+ /*******************************************************************/
598
+ /* */
599
+ /* Transducer::generate_string */
600
+ /* */
601
+ /*******************************************************************/
602
+
603
+ bool Transducer::generate_string( char *string, FILE *file, bool with_brackets)
604
+
605
+ {
606
+ Transducer a1(string, &alphabet, false);
607
+ Transducer *a2=&(a1 || *this);
608
+ Transducer *a3=&(a2->upper_level());
609
+ delete a2;
610
+ a2 = &a3->minimise();
611
+ delete a3;
612
+
613
+ a2->alphabet.copy(alphabet);
614
+ bool result = a2->print_strings( file, with_brackets );
615
+ delete a2;
616
+ return result;
617
+ }
618
+
619
+
620
+ /*******************************************************************/
621
+ /* */
622
+ /* complete */
623
+ /* */
624
+ /*******************************************************************/
625
+
626
+ static void complete( Node *node, Alphabet &alphabet, VType vmark)
627
+
628
+ {
629
+ if (node->was_visited( vmark ))
630
+ return;
631
+ for( ArcsIter p(node->arcs()); p; p++ ) {
632
+ Arc *arc=p;
633
+ if (!arc->label().is_epsilon())
634
+ alphabet.insert(arc->label());
635
+ complete(arc->target_node(), alphabet, vmark);
636
+ }
637
+ }
638
+
639
+
640
+ /*******************************************************************/
641
+ /* */
642
+ /* Transducer::complete_alphabet */
643
+ /* */
644
+ /*******************************************************************/
645
+
646
+ void Transducer::complete_alphabet()
647
+
648
+ {
649
+ incr_vmark();
650
+ complete(root_node(), alphabet, vmark);
651
+ }
652
+
653
+
654
+ /*******************************************************************/
655
+ /* */
656
+ /* print_node */
657
+ /* */
658
+ /*******************************************************************/
659
+
660
+ static void print_node( ostream &s, Node *node, VType vmark, Alphabet &abc )
661
+
662
+ {
663
+ if (!node->was_visited( vmark )) {
664
+ Arcs *arcs=node->arcs();
665
+ for( ArcsIter p(arcs); p; p++ ) {
666
+ Arc *arc=p;
667
+ s << node->index << "\t" << arc->target_node()->index;
668
+ s << "\t" << abc.write_char(arc->label().lower_char());
669
+ s << "\t" << abc.write_char(arc->label().upper_char());
670
+ s << "\n";
671
+ }
672
+ if (node->is_final())
673
+ s << node->index << "\n";
674
+ for( ArcsIter p(arcs); p; p++ ) {
675
+ Arc *arc=p;
676
+ print_node( s, arc->target_node(), vmark, abc );
677
+ }
678
+ }
679
+ }
680
+
681
+
682
+ /*******************************************************************/
683
+ /* */
684
+ /* operator<< */
685
+ /* */
686
+ /*******************************************************************/
687
+
688
+ ostream &operator<<( ostream &s, Transducer &a )
689
+
690
+ {
691
+ a.nodeindexing();
692
+ a.incr_vmark();
693
+ print_node( s, a.root_node(), a.vmark, a.alphabet );
694
+ return s;
695
+ }
696
+
697
+
698
+ /*******************************************************************/
699
+ /* */
700
+ /* store_node_info */
701
+ /* */
702
+ /*******************************************************************/
703
+
704
+ static void store_node_info( FILE *file, Node *node )
705
+
706
+ {
707
+ // write final flag
708
+ char c=node->is_final();
709
+ fwrite(&c,sizeof(c),1,file);
710
+
711
+ // write the number of arcs
712
+ int nn = node->arcs()->size();
713
+ if (nn > 65535)
714
+ throw "Error: in function store_node\n";
715
+ unsigned short n=(unsigned short)nn;
716
+ fwrite(&n,sizeof(n),1,file);
717
+ }
718
+
719
+
720
+ /*******************************************************************/
721
+ /* */
722
+ /* store_arc_label */
723
+ /* */
724
+ /*******************************************************************/
725
+
726
+ static void store_arc_label( FILE *file, Arc *arc )
727
+
728
+ {
729
+ Label l=arc->label();
730
+ Character lc=l.lower_char();
731
+ Character uc=l.upper_char();
732
+ fwrite(&lc,sizeof(lc),1,file);
733
+ fwrite(&uc,sizeof(uc),1,file);
734
+ }
735
+
736
+
737
+ /*******************************************************************/
738
+ /* */
739
+ /* store_node */
740
+ /* */
741
+ /*******************************************************************/
742
+
743
+ static void store_node( FILE *file, Node *node, VType vmark )
744
+ {
745
+ if (!node->was_visited( vmark )) {
746
+
747
+ store_node_info( file, node );
748
+
749
+ // write the arcs
750
+ for( ArcsIter p(node->arcs()); p; p++ ) {
751
+ Arc *arc=p;
752
+ store_arc_label( file, arc );
753
+ unsigned int t = (unsigned int)arc->target_node()->index;
754
+ fwrite(&t,sizeof(t),1,file);
755
+ store_node(file, arc->target_node(), vmark );
756
+ }
757
+ }
758
+ }
759
+
760
+
761
+ /*******************************************************************/
762
+ /* */
763
+ /* store_lowmem_node */
764
+ /* */
765
+ /*******************************************************************/
766
+
767
+ static void store_lowmem_node( FILE *file, Node *node,
768
+ vector<unsigned int> &startpos)
769
+ {
770
+ store_node_info( file, node );
771
+
772
+ // write the arcs
773
+ for( ArcsIter p(node->arcs()); p; p++ ) {
774
+ Arc *arc=p;
775
+ store_arc_label( file, arc );
776
+ unsigned int t=startpos[arc->target_node()->index];
777
+ fwrite(&t,sizeof(t),1,file);
778
+ }
779
+ }
780
+
781
+
782
+ /*******************************************************************/
783
+ /* */
784
+ /* Transducer::store_lowmem */
785
+ /* */
786
+ /*******************************************************************/
787
+
788
+ void Transducer::store_lowmem( FILE *file )
789
+
790
+ {
791
+ fputc('l',file);
792
+ alphabet.store(file);
793
+
794
+ // storing size of index table
795
+ vector<Node*> nodearray;
796
+ nodeindexing( &nodearray );
797
+
798
+ // compute the start position of the first node
799
+ unsigned int pos=(unsigned int)ftell(file);
800
+ vector<unsigned int> startpos;
801
+ for( size_t i=0; i<nodearray.size(); i++ ) {
802
+ startpos.push_back(pos);
803
+ Node *node=nodearray[i];
804
+ Arcs *arcs=node->arcs();
805
+ pos += (unsigned)(sizeof(char) // size of final flag
806
+ + sizeof(unsigned short) // size of number of arcs
807
+ + arcs->size() * (sizeof(Character) * 2 + sizeof(unsigned int))); // size of n arcs
808
+ }
809
+
810
+ // storing nodes
811
+ for( size_t i=0; i<nodearray.size(); i++ )
812
+ store_lowmem_node( file, nodearray[i], startpos );
813
+ }
814
+
815
+
816
+ /*******************************************************************/
817
+ /* */
818
+ /* Transducer::store */
819
+ /* */
820
+ /*******************************************************************/
821
+
822
+ void Transducer::store( FILE *file )
823
+
824
+ {
825
+ fputc('a',file);
826
+
827
+ vector<Node*> nodearray;
828
+ nodeindexing( &nodearray );
829
+ incr_vmark();
830
+ unsigned int n=(unsigned)nodearray.size();
831
+ fwrite(&n,sizeof(n),1,file);
832
+ store_node( file, root_node(), vmark );
833
+
834
+ alphabet.store(file);
835
+ }
836
+
837
+
838
+ /*******************************************************************/
839
+ /* */
840
+ /* read_node */
841
+ /* */
842
+ /*******************************************************************/
843
+
844
+ static void read_node( FILE *file, Node *node, Node **p, Transducer *a )
845
+ {
846
+ char c;
847
+ fread(&c,sizeof(c),1,file);
848
+ node->set_final(c);
849
+
850
+ unsigned short n;
851
+ fread( &n, sizeof(n), 1, file);
852
+
853
+ for( int i=0; i<n; i++ ) {
854
+ Character lc,uc;
855
+ unsigned int t;
856
+ fread(&lc,sizeof(lc),1,file);
857
+ fread(&uc,sizeof(uc),1,file);
858
+ fread(&t,sizeof(t),1,file);
859
+ if (ferror(file))
860
+ throw "Error encountered while reading transducer from file";
861
+ if (p[t])
862
+ node->add_arc( Label(lc,uc), p[t], a );
863
+ else {
864
+ p[t] = a->new_node();
865
+ node->add_arc( Label(lc,uc), p[t], a );
866
+ read_node(file, p[t], p, a );
867
+ }
868
+ }
869
+ }
870
+
871
+
872
+ /*******************************************************************/
873
+ /* */
874
+ /* Transducer::read_transducer_binary */
875
+ /* */
876
+ /*******************************************************************/
877
+
878
+ void Transducer::read_transducer_binary( FILE *file )
879
+
880
+ {
881
+ if (fgetc(file) != 'a')
882
+ throw "Error: wrong file format (not a standard transducer)\n";
883
+
884
+ vmark = deterministic = 0;
885
+ unsigned int n;
886
+ fread(&n,sizeof(n),1,file); // number of nodes
887
+ if (ferror(file))
888
+ throw "Error encountered while reading transducer from file";
889
+
890
+ Node **p=new Node*[n]; // maps indices to nodes
891
+ p[0] = root_node();
892
+ for( unsigned int i=1; i<n; i++)
893
+ p[i] = NULL;
894
+ read_node( file, root_node(), p, this );
895
+ delete[] p;
896
+
897
+ alphabet.read(file);
898
+
899
+ vmark = 1;
900
+ deterministic = minimised = 1;
901
+ }
902
+
903
+
904
+ /*******************************************************************/
905
+ /* */
906
+ /* error_message */
907
+ /* */
908
+ /*******************************************************************/
909
+
910
// Reports a syntax error in a text transducer file by throwing a
// message which contains the line number "line".
// The message lives in a static buffer, so the thrown pointer stays
// valid after the function has been left.
static void error_message( size_t line )
{
  static char message[1000];
  const unsigned int shown_line = (unsigned int)line;
  snprintf(message, sizeof(message),
           "Error: in line %u of text transducer file", shown_line);
  throw message;
}
918
+
919
+
920
+ /*******************************************************************/
921
+ /* */
922
+ /* Transducer::create_node */
923
+ /* */
924
+ /*******************************************************************/
925
+
926
+ Node *Transducer::create_node( vector<Node*> &node, char *s, size_t line )
927
+
928
+ {
929
+ char *p;
930
+ long n = strtol(s, &p, 10);
931
+
932
+ if (s == p || n < 0)
933
+ error_message( line );
934
+ if ((long)node.size() <= n)
935
+ node.resize(n+1, NULL);
936
+ if (node[n] == NULL)
937
+ node[n] = new_node(); //new Node;
938
+
939
+ return node[n];
940
+ }
941
+
942
+
943
+ /*******************************************************************/
944
+ /* */
945
+ /* next_string */
946
+ /* */
947
+ /*******************************************************************/
948
+
949
+ static char *next_string( char* &s, size_t line )
950
+
951
+ {
952
+ // scan the input up to the next tab or newline character
953
+ // and unquote symbols preceded by a backslash
954
+ char *p = s;
955
+ char *q = s;
956
+ while (*q!=0 && *q!='\t' && *q!='\n' && *q!='\r') {
957
+ if (*q == '\\')
958
+ q++;
959
+ *(p++) = *(q++);
960
+ }
961
+ if (p == s)
962
+ error_message(line); // no string found
963
+
964
+ char *result=s;
965
+ // skip over following whitespace
966
+ while (*q == ' ' || *q == '\t' || *q == '\n' || *q == '\r')
967
+ q++;
968
+
969
+ if (*q == 0)
970
+ s = NULL; // end of string was reached
971
+ else
972
+ s = q; // move the string pointer s
973
+
974
+ *p = 0; // mark the end of the result string
975
+
976
+ return result;
977
+ }
978
+
979
+
980
+ /*******************************************************************/
981
+ /* */
982
+ /* Transducer::read_transducer_text */
983
+ /* */
984
+ /*******************************************************************/
985
+
986
+ void Transducer::read_transducer_text( FILE *file )
987
+
988
+ {
989
+ vector<Node*> nodes;
990
+ nodes.push_back(root_node());
991
+
992
+ vmark = deterministic = 0;
993
+ char buffer[10000];
994
+ for( size_t line=0; fgets(buffer, 10000, file ); line++ ) {
995
+ char *p = buffer;
996
+ char *s = next_string(p, line);
997
+ Node *node = create_node( nodes, s, line );
998
+ if (p == NULL)
999
+ node->set_final(true);
1000
+ else {
1001
+ s = next_string(p, line);
1002
+ Node *target = create_node( nodes, s, line );
1003
+
1004
+ s = next_string(p, line);
1005
+ Character lc = alphabet.add_symbol(s);
1006
+ s = next_string(p, line);
1007
+ Character uc = alphabet.add_symbol(s);
1008
+ Label l(lc,uc);
1009
+ if (l == Label::epsilon)
1010
+ error_message( line );
1011
+
1012
+ alphabet.insert(l);
1013
+ node->add_arc( l, target, this );
1014
+ }
1015
+ }
1016
+
1017
+ vmark = 1;
1018
+ deterministic = minimised = 1;
1019
+ }
1020
+
1021
+
1022
+ /*******************************************************************/
1023
+ /* */
1024
+ /* Transducer::Transducer */
1025
+ /* */
1026
+ /*******************************************************************/
1027
+
1028
+ Transducer::Transducer( FILE *file, bool binary )
1029
+
1030
+ {
1031
+ indexed = false;
1032
+ node_count = transition_count = 0;
1033
+ if (binary)
1034
+ read_transducer_binary( file );
1035
+ else
1036
+ read_transducer_text( file );
1037
+ }
1038
+
1039
+
1040
+ /* EPSILON REMOVAL ALGORITHM written by Erik Axelson starts here */
1041
+
1042
+ /*******************************************************************/
1043
+ /* */
1044
+ /* node_in_copy_tr */
1045
+ /* */
1046
+ /*******************************************************************/
1047
+
1048
+ /* Find the corresponding node in 'copy_tr' for 'node'. If needed, create a new node to 'copy_tr'
1049
+ and update 'mapper' accordingly. */
1050
+
1051
+ Node *node_in_copy_tr( Node *node, Transducer *copy_tr, map<int, Node*> &mapper ) {
1052
+ int node_index = (int)node->index; // node index in original transducer
1053
+ map<int,Node*>::iterator it = mapper.find(node_index); // iterator to associated node in copy_tr
1054
+ if (it == mapper.end()) {
1055
+ Node *associated_node = copy_tr->new_node(); // create new node in copy_tr
1056
+ if (node->is_final())
1057
+ associated_node->set_final(true);
1058
+ mapper[node_index] = associated_node; // and associate it with node_index
1059
+ return associated_node;
1060
+ }
1061
+ else
1062
+ return it->second;
1063
+ }
1064
+
1065
+
1066
+ /*******************************************************************/
1067
+ /* */
1068
+ /* Transducer::copy_nodes */
1069
+ /* */
1070
+ /*******************************************************************/
1071
+
1072
+ /* Recursive epsilon removal algorithm. Copies arcs and their
1073
+ target nodes starting from search_node to node copy_tr_start_node
1074
+ in transducer copy_tr. nn and mapper are used to associate nodes
1075
+ with nodes in copy_tr. */
1076
+
1077
+ void Transducer::copy_nodes( Node *search_node, Transducer *copy_tr,
1078
+ Node *copy_tr_start_node,
1079
+ map<int, Node*> &mapper ) {
1080
+
1081
+ // go through all arcs leaving from search node
1082
+ // (the iterator lists the epsilon arcs first)
1083
+ for( ArcsIter it(search_node->arcs()); it; it++ ) {
1084
+ Arc arc=*it;
1085
+
1086
+ if (arc.label().is_epsilon()) {
1087
+ // 'forward', which is originally NULL, is used as a flag
1088
+ // for detecting epsilon transition loops
1089
+ if (search_node->forward() != copy_tr_start_node) {
1090
+ search_node->set_forward(copy_tr_start_node); // set epsilon flag
1091
+ if (arc.target_node()->is_final())
1092
+ copy_tr_start_node->set_final(true);
1093
+ copy_nodes(arc.target_node(), copy_tr, copy_tr_start_node, mapper);
1094
+ search_node->set_forward(NULL); // remove epsilon flag
1095
+ }
1096
+ }
1097
+
1098
+ else {
1099
+ // target node in copy_tr
1100
+ Node *copy_tr_end_node =
1101
+ node_in_copy_tr(arc.target_node(), copy_tr, mapper);
1102
+ // add arc to copy_tr
1103
+ copy_tr_start_node->add_arc( Label(arc.label().lower_char(),
1104
+ arc.label().upper_char()),
1105
+ copy_tr_end_node,
1106
+ copy_tr );
1107
+ // if the target node is not visited, copy nodes recursively
1108
+ if ( !(arc.target_node()->was_visited(vmark)) )
1109
+ copy_nodes(arc.target_node(), copy_tr, copy_tr_end_node, mapper);
1110
+ }
1111
+
1112
+ }
1113
+ }
1114
+
1115
+
1116
+ /*******************************************************************/
1117
+ /* */
1118
+ /* Transducer::remove_epsilons */
1119
+ /* */
1120
+ /*******************************************************************/
1121
+
1122
+ Transducer &Transducer::remove_epsilons()
1123
+
1124
+ {
1125
+ if ( deterministic || minimised )
1126
+ return this->copy();
1127
+
1128
+ nodeindexing();
1129
+ incr_vmark();
1130
+ Transducer *copy_tr = new Transducer();
1131
+ copy_tr->alphabet.copy(alphabet);
1132
+ map<int, Node*> mapper;
1133
+ // mark root node as visited
1134
+ root_node()->was_visited(vmark);
1135
+ // set copy_tr root node final, if needed
1136
+ if (root_node()->is_final())
1137
+ copy_tr->root_node()->set_final(true);
1138
+ // associate the root_nodes in this and copy_tr
1139
+ // (node indexing for root_node is zero)
1140
+ mapper[0] = copy_tr->root_node();
1141
+
1142
+ copy_nodes(root_node(), copy_tr, copy_tr->root_node(), mapper);
1143
+ incr_vmark();
1144
+
1145
+ return *copy_tr;
1146
+ }
1147
+
1148
+ // EPSILON REMOVAL ALGORITHM ENDS
1149
+
1150
+ }