ruby-sfst 0.4.3 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +1 -0
  3. data/COPYING +280 -0
  4. data/Gemfile +3 -0
  5. data/Gemfile.lock +54 -0
  6. data/README.md +1 -1
  7. data/Rakefile +9 -18
  8. data/bin/console +7 -0
  9. data/bin/setup +6 -0
  10. data/ext/sfst/alphabet.cc +879 -0
  11. data/ext/sfst/alphabet.h +302 -0
  12. data/ext/sfst/basic.cc +85 -0
  13. data/ext/{sfst_machine → sfst}/basic.h +7 -4
  14. data/ext/sfst/compact.cc +629 -0
  15. data/ext/sfst/compact.h +100 -0
  16. data/ext/sfst/determinise.cc +279 -0
  17. data/ext/{sfst_machine → sfst}/extconf.rb +2 -1
  18. data/ext/sfst/fst.cc +1150 -0
  19. data/ext/sfst/fst.h +374 -0
  20. data/ext/sfst/hopcroft.cc +681 -0
  21. data/ext/sfst/interface.cc +1921 -0
  22. data/ext/sfst/interface.h +171 -0
  23. data/ext/sfst/make-compact.cc +323 -0
  24. data/ext/{sfst_machine → sfst}/make-compact.h +15 -13
  25. data/ext/sfst/mem.h +80 -0
  26. data/ext/sfst/operators.cc +1273 -0
  27. data/ext/{sfst_machine → sfst}/sfst_machine.cc +89 -78
  28. data/ext/sfst/sgi.h +72 -0
  29. data/ext/sfst/utf8.cc +149 -0
  30. data/ext/{sfst_machine → sfst}/utf8.h +7 -4
  31. data/lib/sfst.rb +2 -1
  32. data/lib/sfst/version.rb +1 -1
  33. data/ruby-sfst.gemspec +23 -23
  34. metadata +107 -35
  35. data/ext/sfst_machine/alphabet.cc +0 -812
  36. data/ext/sfst_machine/alphabet.h +0 -273
  37. data/ext/sfst_machine/basic.cc +0 -84
  38. data/ext/sfst_machine/compact.cc +0 -616
  39. data/ext/sfst_machine/compact.h +0 -98
  40. data/ext/sfst_machine/determinise.cc +0 -303
  41. data/ext/sfst_machine/fst.cc +0 -1000
  42. data/ext/sfst_machine/fst.h +0 -369
  43. data/ext/sfst_machine/interface.cc +0 -1842
  44. data/ext/sfst_machine/interface.h +0 -93
  45. data/ext/sfst_machine/make-compact.cc +0 -327
  46. data/ext/sfst_machine/mem.h +0 -74
  47. data/ext/sfst_machine/operators.cc +0 -1131
  48. data/ext/sfst_machine/sgi.h +0 -44
  49. data/ext/sfst_machine/utf8.cc +0 -146
  50. data/test/test_sfst.fst +0 -3
  51. data/test/test_sfst.rb +0 -114
@@ -0,0 +1,100 @@
1
+ /*******************************************************************/
2
+ /* */
3
+ /* FILE compact.h */
4
+ /* MODULE compact */
5
+ /* PROGRAM SFST */
6
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
7
+ /* */
8
+ /* PURPOSE finite state tools */
9
+ /* */
10
+ /*******************************************************************/
11
+
12
+ #ifndef _COMPACT_H_
13
+ #define _COMPACT_H_
14
+
15
+ #include "alphabet.h"
16
+
17
+ #include <vector>
18
+
19
+ namespace SFST {
20
+
21
+ typedef std::vector<unsigned int> CAnalysis;
22
+
23
+ class CompactTransducer {
24
+
25
+ protected:
26
+
27
+ // the following data structures are used to store the nodes
28
+
29
+ unsigned int number_of_nodes; // number of nodes in the transducer
30
+ char *finalp; // finalp[i] is 1 if node i is final and 0 otherwise
31
+ unsigned int *first_arc; // first_arc[i] is the number of the first
32
+ // arc outgoing from node i
33
+
34
+ // the following data structures are used to store the transition arcs
35
+
36
+ unsigned int number_of_arcs; // total number of arcs in the transducer
37
+ Label *label; // the label (character pair) of arc i
38
+ unsigned int *target_node; // target node of arc i
39
+
40
+ // the following data structures are used to store the stochastic parameters
41
+ float *final_logprob;
42
+ float *arc_logprob;
43
+
44
+ // functions needed to read the transducer from a file
45
+
46
+ void read_finalp( FILE *file );
47
+ void read_first_arcs( FILE *file );
48
+ void read_target_nodes( FILE *file );
49
+ void read_labels( FILE *file );
50
+ void read_probs( FILE *file );
51
+
52
+ // functions needed to analyze data with the transducer
53
+
54
+ void analyze( unsigned int n, std::vector<Character> &ch, size_t ipos,
55
+ CAnalysis&, std::vector<CAnalysis>&);
56
+
57
+ // function selecting the simplest morphological analysis
58
+
59
+ int compute_score( CAnalysis &ana );
60
+ void disambiguate( std::vector<CAnalysis> &analyses );
61
+
62
+ // functions for longest-match analysis of input data
63
+
64
+ void longest_match2(unsigned int, char*, int, CAnalysis&, int&, CAnalysis&);
65
+
66
+ void convert( CAnalysis &cana, Analysis &ana );
67
+
68
+ public:
69
+ size_t node_count() { return number_of_nodes; };
70
+ size_t arc_count() { return number_of_arcs; };
71
+
72
+ bool both_layers; // print surface and analysis symbols
73
+ bool simplest_only; // print only the simplest analyses
74
+
75
+ Alphabet alphabet; // data structure which maps symbols to numeric codes
76
+ CompactTransducer(); // dummy constructor
77
+ CompactTransducer( FILE*, FILE *pfile=NULL ); // reads a (stochastic) transducer
78
+ ~CompactTransducer(); // destroys a transducer
79
+
80
+ // the analysis function returns the set of analyses for the string "s"
81
+ // in the argument "analyses"
82
+ void analyze_string( char *s, std::vector<CAnalysis > &analyses );
83
+
84
+ void compute_probs( std::vector<CAnalysis> &analyses, std::vector<double> &prob );
85
+ char *print_analysis( CAnalysis &ana );
86
+
87
+ // longest-match analysis
88
+ const char *longest_match( char*& );
89
+
90
+ // EM training
91
+ bool train2( char *s, std::vector<double> &arcfreq, std::vector<double> &finalfreq );
92
+ bool train( char *s, std::vector<double> &arcfreq, std::vector<double> &finalfreq );
93
+ void estimate_probs( std::vector<double> &arcfreq, std::vector<double> &finalfreq );
94
+
95
+ // robust analysis
96
+ float robust_analyze_string( char *string, std::vector<CAnalysis> &analyses,
97
+ float ErrorsAllowed );
98
+ };
99
+ }
100
+ #endif
@@ -0,0 +1,279 @@
1
+
2
+ /*******************************************************************/
3
+ /* */
4
+ /* FILE determinise.C */
5
+ /* MODULE determinise */
6
+ /* PROGRAM SFST */
7
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
8
+ /* */
9
+ /*******************************************************************/
10
+
11
+
12
+ #include "fst.h"
13
+
14
+ using std::vector;
15
+ using std::pair;
16
+ using std::set;
17
+
18
+ namespace SFST {
19
+
20
+
21
+ /***************** class NodeSet *********************************/
22
+
23
+ class NodeSet {
24
+ // This class is used to store a set of nodes.
25
+ // Whenever a new node is added, all nodes accessible
26
+ // through epsilon transitions are added as well.
27
+
28
+ private:
29
+ set<Node*> ht;
30
+
31
+ public:
32
+ typedef set<Node*>::iterator iterator;
33
+ NodeSet() {};
34
+ void add( Node* );
35
+ bool insert(Node *node) {
36
+ pair<iterator, bool> result = ht.insert(node);
37
+ return result.second;
38
+ };
39
+ iterator begin() const { return ht.begin(); }
40
+ iterator end() const { return ht.end(); }
41
+ size_t size() const { return ht.size(); }
42
+ void clear() { ht.clear(); }
43
+ };
44
+
45
+ typedef map<const Label, NodeSet> Label2NodeSet;
46
+
47
+
48
+ /***************** class NodeArray *******************************/
49
+
50
+ class NodeArray {
51
+
52
+ private:
53
+ size_t sizev;
54
+ bool final;
55
+ Node **node;
56
+
57
+ public:
58
+ NodeArray( NodeSet& );
59
+ ~NodeArray() { delete[] node; };
60
+ size_t size() const { return sizev; }
61
+ bool is_final() const { return final; };
62
+ Node* &operator[]( size_t i ) const { return node[i]; }
63
+ };
64
+
65
+
66
+ /***************** class DTransition *****************************/
67
+
68
+ class DTransition {
69
+ public:
70
+ Label label;
71
+ NodeArray *nodes;
72
+ DTransition(Label l, NodeArray *na) { label = l; nodes = na; };
73
+ };
74
+
75
+
76
+ /***************** class NodeMapping ****************************/
77
+
78
+ class NodeMapping {
79
+ // This class is used to map a node set from one transducer
80
+ // to a single node in another transducer
81
+
82
+ private:
83
+ struct hashf {
84
+ size_t operator()(const NodeArray *na) const {
85
+ size_t key=na->size() ^ na->is_final();
86
+ for( size_t i=0; i<na->size(); i++)
87
+ key = (key<<1) ^ (size_t)(*na)[i];
88
+ return key;
89
+ }
90
+ };
91
+ struct equalf {
92
+ int operator()(const NodeArray *na1, const NodeArray *na2) const {
93
+ if (na1->size() != na2->size() || na1->is_final() != na2->is_final())
94
+ return 0;
95
+ for( size_t i=0; i<na1->size(); i++)
96
+ if ((*na1)[i] != (*na2)[i])
97
+ return 0;
98
+ return 1;
99
+ }
100
+ };
101
+ typedef hash_map<NodeArray*, Node*, hashf, equalf> NodeMap;
102
+ NodeMap hm;
103
+
104
+ public:
105
+ typedef NodeMap::iterator iterator;
106
+ ~NodeMapping();
107
+ iterator begin() { return hm.begin(); };
108
+ iterator end() { return hm.end(); };
109
+ iterator find( NodeArray *na) { return hm.find( na ); };
110
+ Node* &operator[]( NodeArray *na ) { return hm.operator[](na); };
111
+
112
+ };
113
+
114
+
115
+ static void determinise_node( NodeArray&, Node*, Transducer*, NodeMapping& );
116
+
117
+
118
+ /*******************************************************************/
119
+ /* */
120
+ /* NodeSet::add */
121
+ /* */
122
+ /*******************************************************************/
123
+
124
+ void NodeSet::add( Node *node )
125
+
126
+ {
127
+ pair<iterator, bool> result = ht.insert(node);
128
+ if (result.second) {
129
+ // new node, add nodes reachable with epsilon transitions
130
+ for( ArcsIter p(node->arcs(),ArcsIter::eps); p; p++ ) {
131
+ Arc *arc=p;
132
+ if (!arc->label().is_epsilon())
133
+ break;
134
+ add(arc->target_node());
135
+ }
136
+ }
137
+ }
138
+
139
+
140
+ /*******************************************************************/
141
+ /* */
142
+ /* NodeArray::NodeArray */
143
+ /* */
144
+ /*******************************************************************/
145
+
146
+ NodeArray::NodeArray( NodeSet &ns )
147
+
148
+ {
149
+ sizev = 0;
150
+ NodeSet::iterator it;
151
+
152
+ final = false;
153
+ node = new Node*[ns.size()];
154
+ for( it=ns.begin(); it!=ns.end(); it++ ) {
155
+ Node *nn = *it;
156
+ if (nn->arcs()->non_epsilon_transition_exists())
157
+ node[sizev++] = nn;
158
+ if (nn->is_final())
159
+ final = true;
160
+ }
161
+ }
162
+
163
+
164
+ /*******************************************************************/
165
+ /* */
166
+ /* NodeMapping::~NodeMapping */
167
+ /* */
168
+ /*******************************************************************/
169
+
170
+ NodeMapping::~NodeMapping()
171
+
172
+ {
173
+ // if we delete NodeArrays without removing them from NodeMapping,
174
+ // the system will crash when NodeMapping is deleted.
175
+ for( iterator it=hm.begin(); it!=hm.end(); ) {
176
+ NodeArray *na=it->first;
177
+ iterator old = it++;
178
+ hm.erase(old);
179
+ delete na;
180
+ }
181
+ }
182
+
183
+
184
+ /*******************************************************************/
185
+ /* */
186
+ /* compute_transitions */
187
+ /* */
188
+ /*******************************************************************/
189
+
190
+ static void compute_transitions( NodeArray &na, vector<DTransition> &t )
191
+
192
+ {
193
+ Label2NodeSet lmap;
194
+
195
+ // for all nodes in the current set
196
+ for( size_t i=0; i<na.size(); i++) {
197
+ Node *n = na[i]; // old node
198
+
199
+ // For each non-epsilon transition, add the target node
200
+ // to the respective node set.
201
+ for( ArcsIter p(n->arcs(), ArcsIter::non_eps); p; p++ ) {
202
+ Arc *arc=p;
203
+ lmap[arc->label()].add(arc->target_node());
204
+ }
205
+ }
206
+
207
+ t.reserve(lmap.size());
208
+ for( Label2NodeSet::iterator it=lmap.begin(); it!=lmap.end(); it++ ) {
209
+ t.push_back(DTransition(it->first, new NodeArray( it->second )));
210
+ }
211
+ }
212
+
213
+
214
+ /*******************************************************************/
215
+ /* */
216
+ /* determinise_node */
217
+ /* */
218
+ /*******************************************************************/
219
+
220
+ static void determinise_node( NodeArray &na, Node *node, Transducer *a,
221
+ NodeMapping &map )
222
+ {
223
+ node->set_final(na.is_final());
224
+
225
+ vector<DTransition> t;
226
+ compute_transitions( na, t );
227
+
228
+ for( size_t i=0; i<t.size(); i++ ) {
229
+ NodeMapping::iterator it=map.find(t[i].nodes);
230
+ if (it == map.end()) {
231
+ // new node set
232
+ Node *target_node = a->new_node();
233
+ map[t[i].nodes] = target_node;
234
+ node->add_arc( t[i].label, target_node, a );
235
+ determinise_node( *t[i].nodes, target_node, a, map );
236
+ }
237
+ else {
238
+ delete t[i].nodes;
239
+ node->add_arc( t[i].label, it->second, a );
240
+ }
241
+ }
242
+ }
243
+
244
+
245
+ /*******************************************************************/
246
+ /* */
247
+ /* Transducer::determinise */
248
+ /* */
249
+ /*******************************************************************/
250
+
251
+ Transducer &Transducer::determinise( bool copy_alphabet )
252
+
253
+ {
254
+ if (deterministic)
255
+ return copy();
256
+
257
+ Transducer *a = new Transducer();
258
+ if (copy_alphabet)
259
+ a->alphabet.copy(alphabet);
260
+
261
+ // creation of the initial node set consisting of all nodes
262
+ // reachable from the start node via epsilon transitions.
263
+ NodeArray *na;
264
+ {
265
+ NodeSet ns;
266
+ ns.add(root_node());
267
+ na = new NodeArray(ns);
268
+ }
269
+
270
+ // map the node set to the new root node
271
+ NodeMapping map;
272
+ map[na] = a->root_node();
273
+
274
+ // determinise the transducer recursively
275
+ determinise_node( *na, a->root_node(), a, map );
276
+ a->deterministic = 1;
277
+ return *a;
278
+ }
279
+ }
@@ -1,5 +1,6 @@
1
1
  require 'mkmf'
2
+
2
3
  CONFIG['CC'] = 'g++'
3
4
  CONFIG['CXX'] = 'g++'
4
5
  $CPPFLAGS='-Wall -O3 -Wall -Wcast-qual -Wconversion -DSGI__gnu_cxx -DREADLINE'
5
- create_makefile "sfst_machine"
6
+ create_makefile('sfst/sfst')
@@ -0,0 +1,1150 @@
1
+
2
+ /*******************************************************************/
3
+ /* */
4
+ /* FILE fst.C */
5
+ /* MODULE fst */
6
+ /* PROGRAM SFST */
7
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
8
+ /* */
9
+ /* PURPOSE basic FST functions */
10
+ /* */
11
+ /*******************************************************************/
12
+
13
+ #include "fst.h"
14
+
15
+ namespace SFST {
16
+
17
+ using std::vector;
18
+ using std::istream;
19
+ using std::ostream;
20
+ using std::cerr;
21
+
22
+ const int BUFFER_SIZE=100000;
23
+
24
+
25
+ /*******************************************************************/
26
+ /* */
27
+ /* Arcs::size */
28
+ /* */
29
+ /*******************************************************************/
30
+
31
+ int Arcs::size() const
32
+
33
+ {
34
+ int n=0;
35
+ for( Arc *p=first_arcp; p; p=p->next ) n++;
36
+ for( Arc *p=first_epsilon_arcp; p; p=p->next ) n++;
37
+ return n;
38
+ }
39
+
40
+
41
+ /*******************************************************************/
42
+ /* */
43
+ /* Arcs::target_node */
44
+ /* */
45
+ /*******************************************************************/
46
+
47
+ Node *Arcs::target_node( Label l )
48
+
49
+ {
50
+ Arc *arc;
51
+
52
+ for( arc=first_arcp; arc; arc=arc->next)
53
+ if (arc->label() == l)
54
+ return arc->target_node();
55
+
56
+ return NULL;
57
+ }
58
+
59
+ const Node *Arcs::target_node( Label l ) const
60
+
61
+ {
62
+ const Arc *arc;
63
+
64
+ for( arc=first_arcp; arc; arc=arc->next)
65
+ if (arc->label() == l)
66
+ return arc->target_node();
67
+
68
+ return NULL;
69
+ }
70
+
71
+
72
+ /*******************************************************************/
73
+ /* */
74
+ /* Transducer::new_node */
75
+ /* */
76
+ /*******************************************************************/
77
+
78
+ Node *Transducer::new_node()
79
+
80
+ {
81
+ Node *node=(Node*)mem.alloc( sizeof(Node) );
82
+
83
+ node->init();
84
+ return node;
85
+ }
86
+
87
+
88
+ /*******************************************************************/
89
+ /* */
90
+ /* Transducer::new_arc */
91
+ /* */
92
+ /*******************************************************************/
93
+
94
+ Arc *Transducer::new_arc( Label l, Node *target )
95
+
96
+ {
97
+ Arc *arc=(Arc*)mem.alloc( sizeof(Arc) );
98
+ arc->init( l, target);
99
+ return arc;
100
+ }
101
+
102
+
103
+ /*******************************************************************/
104
+ /* */
105
+ /* Arcs::add_arc */
106
+ /* */
107
+ /*******************************************************************/
108
+
109
+ void Arcs::add_arc( Label l, Node *node, Transducer *a )
110
+
111
+ {
112
+ Arc *arc=a->new_arc( l, node );
113
+
114
+ if (l.is_epsilon()) {
115
+ arc->next = first_epsilon_arcp;
116
+ first_epsilon_arcp = arc;
117
+ }
118
+ else {
119
+ arc->next = first_arcp;
120
+ first_arcp = arc;
121
+ }
122
+ }
123
+
124
+
125
+ /*******************************************************************/
126
+ /* */
127
+ /* Arcs::remove_arc */
128
+ /* */
129
+ /*******************************************************************/
130
+
131
+ int Arcs::remove_arc( Arc *arc )
132
+
133
+ {
134
+ Arc **p = (arc->label().is_epsilon()) ? &first_epsilon_arcp : &first_arcp;
135
+ for( ; *p; p=&(*p)->next )
136
+ if (*p == arc) {
137
+ *p = arc->next;
138
+ return 1;
139
+ }
140
+ return 0;
141
+ }
142
+
143
+
144
+ /*******************************************************************/
145
+ /* */
146
+ /* Node::init */
147
+ /* */
148
+ /*******************************************************************/
149
+
150
+ void Node::init()
151
+
152
+ {
153
+ final = false;
154
+ visited = 0;
155
+ arcsp.init();
156
+ forwardp = NULL;
157
+ }
158
+
159
+
160
+ /*******************************************************************/
161
+ /* */
162
+ /* Node::clear_visited */
163
+ /* */
164
+ /*******************************************************************/
165
+
166
+ void Node::clear_visited( NodeHashSet &nodeset )
167
+
168
+ {
169
+ if (nodeset.find( this ) == nodeset.end()) {
170
+ visited = 0;
171
+ nodeset.insert( this );
172
+ fprintf(stderr," %lu", (unsigned long)nodeset.size());
173
+ for( ArcsIter p(arcs()); p; p++ ) {
174
+ Arc *arc=p;
175
+ arc->target_node()->clear_visited( nodeset );
176
+ }
177
+ }
178
+ }
179
+
180
+
181
+ /*******************************************************************/
182
+ /* */
183
+ /* Transducer::index_nodes */
184
+ /* */
185
+ /*******************************************************************/
186
+
187
+ void Transducer::index_nodes( Node *node, vector<Node*> *nodearray )
188
+
189
+ {
190
+ if (!node->was_visited( vmark )) {
191
+ node->index = (Index)node_count++;
192
+ if (nodearray)
193
+ nodearray->push_back(node);
194
+
195
+ for( ArcsIter p(node->arcs()); p; p++ ) {
196
+ Arc *arc=p;
197
+ transition_count++;
198
+ index_nodes( arc->target_node(), nodearray );
199
+ }
200
+ }
201
+ }
202
+
203
+
204
+ /*******************************************************************/
205
+ /* */
206
+ /* Transducer::nodeindexing */
207
+ /* */
208
+ /*******************************************************************/
209
+
210
+ std::pair<size_t,size_t> Transducer::nodeindexing( vector<Node*> *nodearray )
211
+
212
+ {
213
+ if (!indexed) {
214
+ incr_vmark();
215
+ index_nodes( root_node(), nodearray );
216
+ indexed = true;
217
+ }
218
+
219
+ return std::pair<size_t,size_t>(node_count, transition_count);
220
+ }
221
+
222
+
223
+ /*******************************************************************/
224
+ /* */
225
+ /* Transducer::add_string */
226
+ /* */
227
+ /*******************************************************************/
228
+
229
+ void Transducer::add_string( char *s, bool extended, Alphabet *a )
230
+
231
+ {
232
+ if (a == NULL)
233
+ a = &alphabet;
234
+
235
+ Node *node=root_node();
236
+ Label l;
237
+ while (!(l = a->next_label(s, extended)).is_epsilon()) {
238
+ a->insert(l);
239
+ Arcs *arcs=node->arcs();
240
+ node = arcs->target_node( l );
241
+ if (node == NULL) {
242
+ node = new_node();
243
+ arcs->add_arc( l, node, this );
244
+ }
245
+ }
246
+ node->set_final(1);
247
+ }
248
+
249
+
250
+ /*******************************************************************/
251
+ /* */
252
+ /* Transducer::Transducer */
253
+ /* */
254
+ /*******************************************************************/
255
+
256
+ Transducer::Transducer( vector<Label> &path )
257
+ : root(), mem()
258
+ {
259
+ Node *node=root_node();
260
+
261
+ vmark = 0;
262
+ indexed = false;
263
+ node_count = transition_count = 0;
264
+ deterministic = minimised = true;
265
+ for( size_t i=0; i<path.size(); i++ ) {
266
+ Arcs *arcs=node->arcs();
267
+ node = new_node();
268
+ arcs->add_arc( path[i], node, this );
269
+ }
270
+ node->set_final(1);
271
+ }
272
+
273
+
274
+ /*******************************************************************/
275
+ /* */
276
+ /* Transducer::Transducer */
277
+ /* */
278
+ /*******************************************************************/
279
+
280
+ Transducer::Transducer( istream &is, const Alphabet *a, bool verbose,
281
+ bool lexcomments )
282
+ : root(), mem()
283
+ {
284
+ bool extended=false;
285
+ int n=0;
286
+ char buffer[10000];
287
+
288
+ vmark = 0;
289
+ indexed = false;
290
+ node_count = transition_count = 0;
291
+ deterministic = true;
292
+ minimised = false;
293
+ if (a) {
294
+ alphabet.copy(*a);
295
+ extended = true;
296
+ }
297
+ while (is.getline(buffer, 10000)) {
298
+ if (verbose && ++n % 10000 == 0) {
299
+ if (n == 10000)
300
+ cerr << "\n";
301
+ cerr << "\r" << n << " words";
302
+ }
303
+
304
+ // delete comments
305
+ if (lexcomments) {
306
+ size_t l = strlen(buffer);
307
+ for( size_t i=0; i<l; i++ )
308
+ if (buffer[i] == '\\' && buffer[i+1])
309
+ ; // quoted character
310
+ else if (buffer[i] == '%') {
311
+ // comment starts here
312
+ buffer[i] = 0;
313
+ break;
314
+ }
315
+ if (buffer[0] == 0)
316
+ continue;
317
+ }
318
+
319
+ // delete final whitespace characters
320
+ int l;
321
+ for( l=(int)strlen(buffer)-1; l>=0; l-- )
322
+ if ((buffer[l] != ' ' && buffer[l] != '\t' && buffer[l] != '\r') ||
323
+ (l > 0 && buffer[l-1] == '\\'))
324
+ break;
325
+ buffer[l+1] = 0;
326
+
327
+ add_string(buffer, extended);
328
+ }
329
+ if (verbose && n >= 10000)
330
+ cerr << "\n";
331
+ }
332
+
333
+
334
+ /*******************************************************************/
335
+ /* */
336
+ /* Transducer::Transducer */
337
+ /* */
338
+ /*******************************************************************/
339
+
340
+ Transducer::Transducer( char *s, const Alphabet *a, bool extended )
341
+ : root(), mem()
342
+ {
343
+ vmark = 0;
344
+ indexed = false;
345
+ node_count = transition_count = 0;
346
+ deterministic = minimised = true;
347
+ if (a)
348
+ alphabet.copy(*a);
349
+ add_string(s, extended);
350
+ }
351
+
352
+
353
+ /*******************************************************************/
354
+ /* */
355
+ /* Transducer::clear */
356
+ /* */
357
+ /*******************************************************************/
358
+
359
+ void Transducer::clear()
360
+
361
+ {
362
+ vmark = 0;
363
+ deterministic = minimised = false;
364
+ root.init();
365
+ mem.clear();
366
+ alphabet.clear();
367
+ }
368
+
369
+
370
+ /*******************************************************************/
371
+ /* */
372
+ /* Transducer::store_symbols */
373
+ /* */
374
+ /*******************************************************************/
375
+
376
+ void Transducer::store_symbols(Node *node, SymbolMap &symbol,
377
+ LabelSet &labels)
378
+ {
379
+ if (!node->was_visited( vmark )) {
380
+ Arcs *arcs=node->arcs();
381
+ for( ArcsIter p(arcs); p; p++ ) {
382
+ Arc *arc=p;
383
+ Label l=arc->label();
384
+
385
+ labels.insert(l);
386
+
387
+ Character c = l.upper_char();
388
+ if (symbol.find(c) == symbol.end()) {
389
+ const char *s = alphabet.code2symbol(c);
390
+ if (s)
391
+ symbol[c] = fst_strdup(s);
392
+ }
393
+
394
+ c = l.lower_char();
395
+ if (symbol.find(c) == symbol.end()) {
396
+ const char *s = alphabet.code2symbol(c);
397
+ if (s)
398
+ symbol[c] = fst_strdup(s);
399
+ }
400
+
401
+ store_symbols( arc->target_node(), symbol, labels );
402
+ }
403
+ }
404
+ }
405
+
406
+
407
+ /*******************************************************************/
408
+ /* */
409
+ /* Transducer::minimise_alphabet */
410
+ /* */
411
+ /*******************************************************************/
412
+
413
+ void Transducer::minimise_alphabet()
414
+
415
+ {
416
+ SymbolMap symbols;
417
+ LabelSet labels;
418
+ incr_vmark();
419
+ store_symbols(root_node(), symbols, labels);
420
+ alphabet.clear();
421
+ for( SymbolMap::iterator it=symbols.begin(); it!=symbols.end(); it++ ) {
422
+ alphabet.add_symbol( it->second, it->first );
423
+ free(it->second);
424
+ }
425
+ for( LabelSet::iterator it=labels.begin(); it!=labels.end(); it++ )
426
+ alphabet.insert(*it);
427
+ }
428
+
429
+
430
+ /*******************************************************************/
431
+ /* */
432
+ /* Transducer::size_node */
433
+ /* */
434
+ /*******************************************************************/
435
+
436
+ size_t Transducer::size_node( Node *node )
437
+
438
+ {
439
+ size_t result = 0;
440
+ if (!node->was_visited( vmark )) {
441
+ result++;
442
+ for( ArcsIter it(node->arcs()); it; it++ ) {
443
+ Arc *arc=it;
444
+ result += size_node( arc->target_node() );
445
+ }
446
+ }
447
+ return result;
448
+ }
449
+
450
+
451
+ /*******************************************************************/
452
+ /* */
453
+ /* Transducer::size */
454
+ /* */
455
+ /*******************************************************************/
456
+
457
+ size_t Transducer::size()
458
+
459
+ {
460
+ incr_vmark();
461
+ return size_node(root_node());
462
+ }
463
+
464
+
465
+ /*******************************************************************/
466
+ /* */
467
+ /* Transducer::enumerate_paths_node */
468
+ /* */
469
+ /*******************************************************************/
470
+
471
+ void Transducer::enumerate_paths_node( Node *node, vector<Label> &path,
472
+ NodeHashSet &previous,
473
+ vector<Transducer*> &result )
474
+ {
475
+ if (node->is_final())
476
+ result.push_back(new Transducer(path));
477
+
478
+ for( ArcsIter it(node->arcs()); it; it++ ) {
479
+ Arc *arc=it;
480
+
481
+ NodeHashSet::iterator hsit=previous.insert(node).first;
482
+ path.push_back(arc->label());
483
+ enumerate_paths_node( arc->target_node(), path, previous, result );
484
+ path.pop_back();
485
+ previous.erase(hsit);
486
+ }
487
+ }
488
+
489
+
490
+ /*******************************************************************/
491
+ /* */
492
+ /* Transducer::enumerate_paths */
493
+ /* */
494
+ /*******************************************************************/
495
+
496
+ bool Transducer::enumerate_paths( vector<Transducer*> &result )
497
+
498
+ {
499
+ if (is_infinitely_ambiguous())
500
+ return true;
501
+ for( size_t i=0; i<result.size(); i++ )
502
+ delete result[i];
503
+ result.clear();
504
+
505
+ vector<Label> path;
506
+ NodeHashSet previous;
507
+ enumerate_paths_node( root_node(), path, previous, result );
508
+ return false;
509
+ }
510
+
511
+
512
+
513
+
514
+ /*******************************************************************/
515
+ /* */
516
+ /* Transducer::print_strings_node */
517
+ /* */
518
+ /*******************************************************************/
519
+
520
+ int Transducer::print_strings_node(Node *node, char *buffer, int pos,
521
+ FILE *file, bool with_brackets )
522
+ {
523
+ int result = 0;
524
+
525
+ if (node->was_visited( vmark )) {
526
+ if (node->forward() != NULL) { // cycle detected
527
+ cerr << "Warning: cyclic analyses (cycle aborted)\n";
528
+ return 0;
529
+ }
530
+ node->set_forward(node); // used like a flag for loop detection
531
+ }
532
+ if (pos == BUFFER_SIZE)
533
+ throw "Output string in function print_strings_node is too long";
534
+ if (node->is_final()) {
535
+ buffer[pos] = '\0';
536
+ fprintf(file,"%s\n", buffer);
537
+ result = 1;
538
+ }
539
+ for( ArcsIter i(node->arcs()); i; i++ ) {
540
+ int p=pos;
541
+ Arc *arc=i;
542
+ Label l=arc->label();
543
+ alphabet.write_label(l, buffer, &p, with_brackets);
544
+ result |= print_strings_node(arc->target_node(), buffer, p,
545
+ file, with_brackets );
546
+ }
547
+ node->set_forward(NULL);
548
+
549
+ return result;
550
+ }
551
+
552
+
553
+ /*******************************************************************/
554
+ /* */
555
+ /* Transducer::print_strings */
556
+ /* */
557
+ /*******************************************************************/
558
+
559
+ int Transducer::print_strings( FILE *file, bool with_brackets )
560
+
561
+ {
562
+ char buffer[BUFFER_SIZE];
563
+ incr_vmark();
564
+ return print_strings_node( root_node(), buffer, 0, file, with_brackets );
565
+ }
566
+
567
+
568
+ /*******************************************************************/
569
+ /* */
570
+ /* Transducer::analyze_string */
571
+ /* */
572
+ /*******************************************************************/
573
+
574
+ bool Transducer::analyze_string( char *string, FILE *file, bool with_brackets )
575
+
576
+ {
577
+ vector<Character> input;
578
+ alphabet.string2symseq( string, input );
579
+ vector<Label> labels;
580
+ for( size_t i=0; i<input.size(); i++ )
581
+ labels.push_back(Label(input[i]));
582
+
583
+ Transducer a1(labels);
584
+ Transducer *a2=&(*this || a1);
585
+ Transducer *a3=&(a2->lower_level());
586
+ delete a2;
587
+ a2 = &a3->minimise();
588
+ delete a3;
589
+
590
+ a2->alphabet.copy(alphabet);
591
+ bool result = a2->print_strings( file, with_brackets );
592
+ delete a2;
593
+ return result;
594
+ }
595
+
596
+
597
+ /*******************************************************************/
598
+ /* */
599
+ /* Transducer::generate_string */
600
+ /* */
601
+ /*******************************************************************/
602
+
603
+ bool Transducer::generate_string( char *string, FILE *file, bool with_brackets)
604
+
605
+ {
606
+ Transducer a1(string, &alphabet, false);
607
+ Transducer *a2=&(a1 || *this);
608
+ Transducer *a3=&(a2->upper_level());
609
+ delete a2;
610
+ a2 = &a3->minimise();
611
+ delete a3;
612
+
613
+ a2->alphabet.copy(alphabet);
614
+ bool result = a2->print_strings( file, with_brackets );
615
+ delete a2;
616
+ return result;
617
+ }
618
+
619
+
620
+ /*******************************************************************/
621
+ /* */
622
+ /* complete */
623
+ /* */
624
+ /*******************************************************************/
625
+
626
+ static void complete( Node *node, Alphabet &alphabet, VType vmark)
627
+
628
+ {
629
+ if (node->was_visited( vmark ))
630
+ return;
631
+ for( ArcsIter p(node->arcs()); p; p++ ) {
632
+ Arc *arc=p;
633
+ if (!arc->label().is_epsilon())
634
+ alphabet.insert(arc->label());
635
+ complete(arc->target_node(), alphabet, vmark);
636
+ }
637
+ }
638
+
639
+
640
+ /*******************************************************************/
641
+ /* */
642
+ /* Transducer::complete_alphabet */
643
+ /* */
644
+ /*******************************************************************/
645
+
646
+ void Transducer::complete_alphabet()
647
+
648
+ {
649
+ incr_vmark();
650
+ complete(root_node(), alphabet, vmark);
651
+ }
652
+
653
+
654
+ /*******************************************************************/
655
+ /* */
656
+ /* print_node */
657
+ /* */
658
+ /*******************************************************************/
659
+
660
+ static void print_node( ostream &s, Node *node, VType vmark, Alphabet &abc )
661
+
662
+ {
663
+ if (!node->was_visited( vmark )) {
664
+ Arcs *arcs=node->arcs();
665
+ for( ArcsIter p(arcs); p; p++ ) {
666
+ Arc *arc=p;
667
+ s << node->index << "\t" << arc->target_node()->index;
668
+ s << "\t" << abc.write_char(arc->label().lower_char());
669
+ s << "\t" << abc.write_char(arc->label().upper_char());
670
+ s << "\n";
671
+ }
672
+ if (node->is_final())
673
+ s << node->index << "\n";
674
+ for( ArcsIter p(arcs); p; p++ ) {
675
+ Arc *arc=p;
676
+ print_node( s, arc->target_node(), vmark, abc );
677
+ }
678
+ }
679
+ }
680
+
681
+
682
+ /*******************************************************************/
683
+ /* */
684
+ /* operator<< */
685
+ /* */
686
+ /*******************************************************************/
687
+
688
+ ostream &operator<<( ostream &s, Transducer &a )
689
+
690
+ {
691
+ a.nodeindexing();
692
+ a.incr_vmark();
693
+ print_node( s, a.root_node(), a.vmark, a.alphabet );
694
+ return s;
695
+ }
696
+
697
+
698
+ /*******************************************************************/
699
+ /* */
700
+ /* store_node_info */
701
+ /* */
702
+ /*******************************************************************/
703
+
704
+ static void store_node_info( FILE *file, Node *node )
705
+
706
+ {
707
+ // write final flag
708
+ char c=node->is_final();
709
+ fwrite(&c,sizeof(c),1,file);
710
+
711
+ // write the number of arcs
712
+ int nn = node->arcs()->size();
713
+ if (nn > 65535)
714
+ throw "Error: in function store_node\n";
715
+ unsigned short n=(unsigned short)nn;
716
+ fwrite(&n,sizeof(n),1,file);
717
+ }
718
+
719
+
720
+ /*******************************************************************/
721
+ /* */
722
+ /* store_arc_label */
723
+ /* */
724
+ /*******************************************************************/
725
+
726
+ static void store_arc_label( FILE *file, Arc *arc )
727
+
728
+ {
729
+ Label l=arc->label();
730
+ Character lc=l.lower_char();
731
+ Character uc=l.upper_char();
732
+ fwrite(&lc,sizeof(lc),1,file);
733
+ fwrite(&uc,sizeof(uc),1,file);
734
+ }
735
+
736
+
737
+ /*******************************************************************/
738
+ /* */
739
+ /* store_node */
740
+ /* */
741
+ /*******************************************************************/
742
+
743
+ static void store_node( FILE *file, Node *node, VType vmark )
744
+ {
745
+ if (!node->was_visited( vmark )) {
746
+
747
+ store_node_info( file, node );
748
+
749
+ // write the arcs
750
+ for( ArcsIter p(node->arcs()); p; p++ ) {
751
+ Arc *arc=p;
752
+ store_arc_label( file, arc );
753
+ unsigned int t = (unsigned int)arc->target_node()->index;
754
+ fwrite(&t,sizeof(t),1,file);
755
+ store_node(file, arc->target_node(), vmark );
756
+ }
757
+ }
758
+ }
759
+
760
+
761
+ /*******************************************************************/
762
+ /* */
763
+ /* store_lowmem_node */
764
+ /* */
765
+ /*******************************************************************/
766
+
767
+ static void store_lowmem_node( FILE *file, Node *node,
768
+ vector<unsigned int> &startpos)
769
+ {
770
+ store_node_info( file, node );
771
+
772
+ // write the arcs
773
+ for( ArcsIter p(node->arcs()); p; p++ ) {
774
+ Arc *arc=p;
775
+ store_arc_label( file, arc );
776
+ unsigned int t=startpos[arc->target_node()->index];
777
+ fwrite(&t,sizeof(t),1,file);
778
+ }
779
+ }
780
+
781
+
782
+ /*******************************************************************/
783
+ /* */
784
+ /* Transducer::store_lowmem */
785
+ /* */
786
+ /*******************************************************************/
787
+
788
+ void Transducer::store_lowmem( FILE *file )
789
+
790
+ {
791
+ fputc('l',file);
792
+ alphabet.store(file);
793
+
794
+ // storing size of index table
795
+ vector<Node*> nodearray;
796
+ nodeindexing( &nodearray );
797
+
798
+ // compute the start position of the first node
799
+ unsigned int pos=(unsigned int)ftell(file);
800
+ vector<unsigned int> startpos;
801
+ for( size_t i=0; i<nodearray.size(); i++ ) {
802
+ startpos.push_back(pos);
803
+ Node *node=nodearray[i];
804
+ Arcs *arcs=node->arcs();
805
+ pos += (unsigned)(sizeof(char) // size of final flag
806
+ + sizeof(unsigned short) // size of number of arcs
807
+ + arcs->size() * (sizeof(Character) * 2 + sizeof(unsigned int))); // size of n arcs
808
+ }
809
+
810
+ // storing nodes
811
+ for( size_t i=0; i<nodearray.size(); i++ )
812
+ store_lowmem_node( file, nodearray[i], startpos );
813
+ }
814
+
815
+
816
+ /*******************************************************************/
817
+ /* */
818
+ /* Transducer::store */
819
+ /* */
820
+ /*******************************************************************/
821
+
822
+ void Transducer::store( FILE *file )
823
+
824
+ {
825
+ fputc('a',file);
826
+
827
+ vector<Node*> nodearray;
828
+ nodeindexing( &nodearray );
829
+ incr_vmark();
830
+ unsigned int n=(unsigned)nodearray.size();
831
+ fwrite(&n,sizeof(n),1,file);
832
+ store_node( file, root_node(), vmark );
833
+
834
+ alphabet.store(file);
835
+ }
836
+
837
+
838
+ /*******************************************************************/
839
+ /* */
840
+ /* read_node */
841
+ /* */
842
+ /*******************************************************************/
843
+
844
+ static void read_node( FILE *file, Node *node, Node **p, Transducer *a )
845
+ {
846
+ char c;
847
+ fread(&c,sizeof(c),1,file);
848
+ node->set_final(c);
849
+
850
+ unsigned short n;
851
+ fread( &n, sizeof(n), 1, file);
852
+
853
+ for( int i=0; i<n; i++ ) {
854
+ Character lc,uc;
855
+ unsigned int t;
856
+ fread(&lc,sizeof(lc),1,file);
857
+ fread(&uc,sizeof(uc),1,file);
858
+ fread(&t,sizeof(t),1,file);
859
+ if (ferror(file))
860
+ throw "Error encountered while reading transducer from file";
861
+ if (p[t])
862
+ node->add_arc( Label(lc,uc), p[t], a );
863
+ else {
864
+ p[t] = a->new_node();
865
+ node->add_arc( Label(lc,uc), p[t], a );
866
+ read_node(file, p[t], p, a );
867
+ }
868
+ }
869
+ }
870
+
871
+
872
+ /*******************************************************************/
873
+ /* */
874
+ /* Transducer::read_transducer_binary */
875
+ /* */
876
+ /*******************************************************************/
877
+
878
+ void Transducer::read_transducer_binary( FILE *file )
879
+
880
+ {
881
+ if (fgetc(file) != 'a')
882
+ throw "Error: wrong file format (not a standard transducer)\n";
883
+
884
+ vmark = deterministic = 0;
885
+ unsigned int n;
886
+ fread(&n,sizeof(n),1,file); // number of nodes
887
+ if (ferror(file))
888
+ throw "Error encountered while reading transducer from file";
889
+
890
+ Node **p=new Node*[n]; // maps indices to nodes
891
+ p[0] = root_node();
892
+ for( unsigned int i=1; i<n; i++)
893
+ p[i] = NULL;
894
+ read_node( file, root_node(), p, this );
895
+ delete[] p;
896
+
897
+ alphabet.read(file);
898
+
899
+ vmark = 1;
900
+ deterministic = minimised = 1;
901
+ }
902
+
903
+
904
+ /*******************************************************************/
905
+ /* */
906
+ /* error_message */
907
+ /* */
908
+ /*******************************************************************/
909
+
910
// Throws a message (char*) reporting an error in the given line of a text
// transducer file.  This function never returns normally.
static void error_message( size_t line )
{
  // static storage, so the thrown pointer stays valid during unwinding
  static char message[1000];

  const unsigned int line_no = (unsigned int)line;
  sprintf( message, "Error: in line %u of text transducer file", line_no );

  throw message;
}
918
+
919
+
920
+ /*******************************************************************/
921
+ /* */
922
+ /* Transducer::create_node */
923
+ /* */
924
+ /*******************************************************************/
925
+
926
+ Node *Transducer::create_node( vector<Node*> &node, char *s, size_t line )
927
+
928
+ {
929
+ char *p;
930
+ long n = strtol(s, &p, 10);
931
+
932
+ if (s == p || n < 0)
933
+ error_message( line );
934
+ if ((long)node.size() <= n)
935
+ node.resize(n+1, NULL);
936
+ if (node[n] == NULL)
937
+ node[n] = new_node(); //new Node;
938
+
939
+ return node[n];
940
+ }
941
+
942
+
943
+ /*******************************************************************/
944
+ /* */
945
+ /* next_string */
946
+ /* */
947
+ /*******************************************************************/
948
+
949
+ static char *next_string( char* &s, size_t line )
950
+
951
+ {
952
+ // scan the input up to the next tab or newline character
953
+ // and unquote symbols preceded by a backslash
954
+ char *p = s;
955
+ char *q = s;
956
+ while (*q!=0 && *q!='\t' && *q!='\n' && *q!='\r') {
957
+ if (*q == '\\')
958
+ q++;
959
+ *(p++) = *(q++);
960
+ }
961
+ if (p == s)
962
+ error_message(line); // no string found
963
+
964
+ char *result=s;
965
+ // skip over following whitespace
966
+ while (*q == ' ' || *q == '\t' || *q == '\n' || *q == '\r')
967
+ q++;
968
+
969
+ if (*q == 0)
970
+ s = NULL; // end of string was reached
971
+ else
972
+ s = q; // move the string pointer s
973
+
974
+ *p = 0; // mark the end of the result string
975
+
976
+ return result;
977
+ }
978
+
979
+
980
+ /*******************************************************************/
981
+ /* */
982
+ /* Transducer::read_transducer_text */
983
+ /* */
984
+ /*******************************************************************/
985
+
986
+ void Transducer::read_transducer_text( FILE *file )
987
+
988
+ {
989
+ vector<Node*> nodes;
990
+ nodes.push_back(root_node());
991
+
992
+ vmark = deterministic = 0;
993
+ char buffer[10000];
994
+ for( size_t line=0; fgets(buffer, 10000, file ); line++ ) {
995
+ char *p = buffer;
996
+ char *s = next_string(p, line);
997
+ Node *node = create_node( nodes, s, line );
998
+ if (p == NULL)
999
+ node->set_final(true);
1000
+ else {
1001
+ s = next_string(p, line);
1002
+ Node *target = create_node( nodes, s, line );
1003
+
1004
+ s = next_string(p, line);
1005
+ Character lc = alphabet.add_symbol(s);
1006
+ s = next_string(p, line);
1007
+ Character uc = alphabet.add_symbol(s);
1008
+ Label l(lc,uc);
1009
+ if (l == Label::epsilon)
1010
+ error_message( line );
1011
+
1012
+ alphabet.insert(l);
1013
+ node->add_arc( l, target, this );
1014
+ }
1015
+ }
1016
+
1017
+ vmark = 1;
1018
+ deterministic = minimised = 1;
1019
+ }
1020
+
1021
+
1022
+ /*******************************************************************/
1023
+ /* */
1024
+ /* Transducer::Transducer */
1025
+ /* */
1026
+ /*******************************************************************/
1027
+
1028
+ Transducer::Transducer( FILE *file, bool binary )
1029
+
1030
+ {
1031
+ indexed = false;
1032
+ node_count = transition_count = 0;
1033
+ if (binary)
1034
+ read_transducer_binary( file );
1035
+ else
1036
+ read_transducer_text( file );
1037
+ }
1038
+
1039
+
1040
+ /* EPSILON REMOVAL ALGORITHM written by Erik Axelson starts here */
1041
+
1042
+ /*******************************************************************/
1043
+ /* */
1044
+ /* node_in_copy_tr */
1045
+ /* */
1046
+ /*******************************************************************/
1047
+
1048
+ /* Find the corresponding node in 'copy_tr' for 'node'. If needed, create a new node in 'copy_tr'
1049
+ and update 'mapper' accordingly. */
1050
+
1051
+ Node *node_in_copy_tr( Node *node, Transducer *copy_tr, map<int, Node*> &mapper ) {
1052
+ int node_index = (int)node->index; // node index in original transducer
1053
+ map<int,Node*>::iterator it = mapper.find(node_index); // iterator to associated node in copy_tr
1054
+ if (it == mapper.end()) {
1055
+ Node *associated_node = copy_tr->new_node(); // create new node in copy_tr
1056
+ if (node->is_final())
1057
+ associated_node->set_final(true);
1058
+ mapper[node_index] = associated_node; // and associate it with node_index
1059
+ return associated_node;
1060
+ }
1061
+ else
1062
+ return it->second;
1063
+ }
1064
+
1065
+
1066
+ /*******************************************************************/
1067
+ /* */
1068
+ /* Transducer::copy_nodes */
1069
+ /* */
1070
+ /*******************************************************************/
1071
+
1072
+ /* Recursive epsilon removal algorithm. Copies arcs and their
1073
+ target nodes starting from search_node to node copy_tr_start_node
1074
+ in transducer copy_tr. mapper is used to associate nodes
1075
+ with nodes in copy_tr. */
1076
+
1077
+ void Transducer::copy_nodes( Node *search_node, Transducer *copy_tr,
1078
+ Node *copy_tr_start_node,
1079
+ map<int, Node*> &mapper ) {
1080
+
1081
+ // go through all arcs leaving from search node
1082
+ // (the iterator lists the epsilon arcs first)
1083
+ for( ArcsIter it(search_node->arcs()); it; it++ ) {
1084
+ Arc arc=*it;
1085
+
1086
+ if (arc.label().is_epsilon()) {
1087
+ // 'forward', which is originally NULL, is used as a flag
1088
+ // for detecting epsilon transition loops
1089
+ if (search_node->forward() != copy_tr_start_node) {
1090
+ search_node->set_forward(copy_tr_start_node); // set epsilon flag
1091
+ if (arc.target_node()->is_final())
1092
+ copy_tr_start_node->set_final(true);
1093
+ copy_nodes(arc.target_node(), copy_tr, copy_tr_start_node, mapper);
1094
+ search_node->set_forward(NULL); // remove epsilon flag
1095
+ }
1096
+ }
1097
+
1098
+ else {
1099
+ // target node in copy_tr
1100
+ Node *copy_tr_end_node =
1101
+ node_in_copy_tr(arc.target_node(), copy_tr, mapper);
1102
+ // add arc to copy_tr
1103
+ copy_tr_start_node->add_arc( Label(arc.label().lower_char(),
1104
+ arc.label().upper_char()),
1105
+ copy_tr_end_node,
1106
+ copy_tr );
1107
+ // if the target node is not visited, copy nodes recursively
1108
+ if ( !(arc.target_node()->was_visited(vmark)) )
1109
+ copy_nodes(arc.target_node(), copy_tr, copy_tr_end_node, mapper);
1110
+ }
1111
+
1112
+ }
1113
+ }
1114
+
1115
+
1116
+ /*******************************************************************/
1117
+ /* */
1118
+ /* Transducer::remove_epsilons */
1119
+ /* */
1120
+ /*******************************************************************/
1121
+
1122
+ Transducer &Transducer::remove_epsilons()
1123
+
1124
+ {
1125
+ if ( deterministic || minimised )
1126
+ return this->copy();
1127
+
1128
+ nodeindexing();
1129
+ incr_vmark();
1130
+ Transducer *copy_tr = new Transducer();
1131
+ copy_tr->alphabet.copy(alphabet);
1132
+ map<int, Node*> mapper;
1133
+ // mark root node as visited
1134
+ root_node()->was_visited(vmark);
1135
+ // set copy_tr root node final, if needed
1136
+ if (root_node()->is_final())
1137
+ copy_tr->root_node()->set_final(true);
1138
+ // associate the root_nodes in this and copy_tr
1139
+ // (node indexing for root_node is zero)
1140
+ mapper[0] = copy_tr->root_node();
1141
+
1142
+ copy_nodes(root_node(), copy_tr, copy_tr->root_node(), mapper);
1143
+ incr_vmark();
1144
+
1145
+ return *copy_tr;
1146
+ }
1147
+
1148
+ // EPSILON REMOVAL ALGORITHM ENDS
1149
+
1150
+ }