ruby-sfst 0.4.3 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +1 -0
  3. data/COPYING +280 -0
  4. data/Gemfile +3 -0
  5. data/Gemfile.lock +54 -0
  6. data/README.md +1 -1
  7. data/Rakefile +9 -18
  8. data/bin/console +7 -0
  9. data/bin/setup +6 -0
  10. data/ext/sfst/alphabet.cc +879 -0
  11. data/ext/sfst/alphabet.h +302 -0
  12. data/ext/sfst/basic.cc +85 -0
  13. data/ext/{sfst_machine → sfst}/basic.h +7 -4
  14. data/ext/sfst/compact.cc +629 -0
  15. data/ext/sfst/compact.h +100 -0
  16. data/ext/sfst/determinise.cc +279 -0
  17. data/ext/{sfst_machine → sfst}/extconf.rb +2 -1
  18. data/ext/sfst/fst.cc +1150 -0
  19. data/ext/sfst/fst.h +374 -0
  20. data/ext/sfst/hopcroft.cc +681 -0
  21. data/ext/sfst/interface.cc +1921 -0
  22. data/ext/sfst/interface.h +171 -0
  23. data/ext/sfst/make-compact.cc +323 -0
  24. data/ext/{sfst_machine → sfst}/make-compact.h +15 -13
  25. data/ext/sfst/mem.h +80 -0
  26. data/ext/sfst/operators.cc +1273 -0
  27. data/ext/{sfst_machine → sfst}/sfst_machine.cc +89 -78
  28. data/ext/sfst/sgi.h +72 -0
  29. data/ext/sfst/utf8.cc +149 -0
  30. data/ext/{sfst_machine → sfst}/utf8.h +7 -4
  31. data/lib/sfst.rb +2 -1
  32. data/lib/sfst/version.rb +1 -1
  33. data/ruby-sfst.gemspec +23 -23
  34. metadata +107 -35
  35. data/ext/sfst_machine/alphabet.cc +0 -812
  36. data/ext/sfst_machine/alphabet.h +0 -273
  37. data/ext/sfst_machine/basic.cc +0 -84
  38. data/ext/sfst_machine/compact.cc +0 -616
  39. data/ext/sfst_machine/compact.h +0 -98
  40. data/ext/sfst_machine/determinise.cc +0 -303
  41. data/ext/sfst_machine/fst.cc +0 -1000
  42. data/ext/sfst_machine/fst.h +0 -369
  43. data/ext/sfst_machine/interface.cc +0 -1842
  44. data/ext/sfst_machine/interface.h +0 -93
  45. data/ext/sfst_machine/make-compact.cc +0 -327
  46. data/ext/sfst_machine/mem.h +0 -74
  47. data/ext/sfst_machine/operators.cc +0 -1131
  48. data/ext/sfst_machine/sgi.h +0 -44
  49. data/ext/sfst_machine/utf8.cc +0 -146
  50. data/test/test_sfst.fst +0 -3
  51. data/test/test_sfst.rb +0 -114
@@ -1,98 +0,0 @@
1
- /*******************************************************************/
2
- /* */
3
- /* FILE compact.h */
4
- /* MODULE compact */
5
- /* PROGRAM SFST */
6
- /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
7
- /* */
8
- /* PURPOSE finite state tools */
9
- /* */
10
- /*******************************************************************/
11
-
12
- #ifndef _COMPACT_H_
13
- #define _COMPACT_H_
14
-
15
- #include "alphabet.h"
16
-
17
- #include <vector>
18
-
19
- typedef std::vector<unsigned int> CAnalysis;
20
-
21
- class CompactTransducer {
22
-
23
- protected:
24
-
25
- // the following data structures are used to store the nodes
26
-
27
- unsigned int number_of_nodes; // number of nodes in the transducer
28
- char *finalp; // finalp[i] is 1 if node i is final and 0 otherwise
29
- unsigned int *first_arc; // first_arc[i] is the number of the first
30
- // arc outgoing from node i
31
-
32
- // the following data structures are used to store the transition arcs
33
-
34
- unsigned int number_of_arcs; // total number of arcs in the transducer
35
- Label *label; // the label (character pair) of arc i
36
- unsigned int *target_node; // target node of arc i
37
-
38
- // the following data structures are used to store the stochastic parameters
39
- float *final_logprob;
40
- float *arc_logprob;
41
-
42
- // functions needed to read the transducer from a file
43
-
44
- void read_finalp( FILE *file );
45
- void read_first_arcs( FILE *file );
46
- void read_target_nodes( FILE *file );
47
- void read_labels( FILE *file );
48
- void read_probs( FILE *file );
49
-
50
- // functions needed to analyze data with the transducer
51
-
52
- void analyze( unsigned int n, std::vector<Character> &ch, size_t ipos,
53
- CAnalysis&, std::vector<CAnalysis>&);
54
-
55
- // function selecting the simplest morphological analysis
56
-
57
- int compute_score( CAnalysis &ana );
58
- void disambiguate( std::vector<CAnalysis> &analyses );
59
-
60
- // functions for longest-match analysis of input data
61
-
62
- void longest_match2(unsigned int, char*, int, CAnalysis&, int&, CAnalysis&);
63
-
64
- void convert( CAnalysis &cana, Analysis &ana );
65
-
66
- public:
67
- size_t node_count() { return number_of_nodes; };
68
- size_t arc_count() { return number_of_arcs; };
69
-
70
- bool both_layers; // print surface and analysis symbols
71
- bool simplest_only; // print only the simplest analyses
72
-
73
- Alphabet alphabet; // data structure which maps symbols to numeric codes
74
- CompactTransducer(); // dummy constructor
75
- CompactTransducer( FILE*, FILE *pfile=NULL ); // reads a (stochastic) transducer
76
- ~CompactTransducer(); // destroys a transducer
77
-
78
- // the analysis function returns the set of analyses for the string "s"
79
- // in the argument "analyses"
80
- void analyze_string( char *s, std::vector<CAnalysis > &analyses );
81
-
82
- void compute_probs( std::vector<CAnalysis> &analyses, std::vector<double> &prob );
83
- char *print_analysis( CAnalysis &ana );
84
-
85
- // longest-match analysis
86
- const char *longest_match( char*& );
87
-
88
- // EM training
89
- bool train2( char *s, std::vector<double> &arcfreq, std::vector<double> &finalfreq );
90
- bool train( char *s, std::vector<double> &arcfreq, std::vector<double> &finalfreq );
91
- void estimate_probs( std::vector<double> &arcfreq, std::vector<double> &finalfreq );
92
-
93
- // robust analysis
94
- float robust_analyze_string( char *string, std::vector<CAnalysis> &analyses,
95
- float ErrorsAllowed );
96
- };
97
-
98
- #endif
@@ -1,303 +0,0 @@
1
-
2
- /*******************************************************************/
3
- /* */
4
- /* FILE determinise.C */
5
- /* MODULE determinise */
6
- /* PROGRAM SFST */
7
- /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
8
- /* */
9
- /*******************************************************************/
10
-
11
-
12
- #include "fst.h"
13
-
14
- using std::vector;
15
- using std::pair;
16
- using std::set;
17
-
18
- /***************** class NodeSet *********************************/
19
-
20
- class NodeSet {
21
- // This class is used to store a set of nodes.
22
- // Whenever a new node is added, all nodes accessible
23
- // through epsilon transitions are added as well.
24
-
25
- private:
26
- set<Node*> ht;
27
-
28
- public:
29
- typedef set<Node*>::iterator iterator;
30
- NodeSet() {};
31
- void add( Node* );
32
- bool insert(Node *node) {
33
- pair<iterator, bool> result = ht.insert(node);
34
- return result.second;
35
- };
36
- iterator begin() const { return ht.begin(); }
37
- iterator end() const { return ht.end(); }
38
- size_t size() const { return ht.size(); }
39
- void clear() { ht.clear(); }
40
- };
41
-
42
-
43
- /***************** class NodeArray *******************************/
44
-
45
- class NodeArray {
46
-
47
- private:
48
- size_t sizev;
49
- bool final;
50
- Node **node;
51
-
52
- public:
53
- NodeArray( NodeSet& );
54
- ~NodeArray() { delete[] node; };
55
- size_t size() const { return sizev; }
56
- bool is_final() const { return final; };
57
- Node* &operator[]( int i ) const { return node[i]; }
58
- };
59
-
60
-
61
- /***************** class Transition ******************************/
62
-
63
- class Transition {
64
- public:
65
- Label label;
66
- NodeArray *nodes;
67
- Transition(Label l, NodeArray *na) { label = l; nodes = na; };
68
- };
69
-
70
-
71
- /***************** class NodeMapping ****************************/
72
-
73
- class NodeMapping {
74
- // This class is used to map a node set from one transducer
75
- // to a single node in another transducer
76
-
77
- private:
78
- struct hashf {
79
- size_t operator()(const NodeArray *na) const {
80
- size_t key=na->size() ^ na->is_final();
81
- for( size_t i=0; i<na->size(); i++)
82
- key = (key<<1) ^ (size_t)(*na)[i];
83
- return key;
84
- }
85
- };
86
- struct equalf {
87
- int operator()(const NodeArray *na1, const NodeArray *na2) const {
88
- if (na1->size() != na2->size() || na1->is_final() != na2->is_final())
89
- return 0;
90
- for( size_t i=0; i<na1->size(); i++)
91
- if ((*na1)[i] != (*na2)[i])
92
- return 0;
93
- return 1;
94
- }
95
- };
96
- typedef hash_map<NodeArray*, Node*, hashf, equalf> NodeMap;
97
- NodeMap hm;
98
-
99
- public:
100
- typedef NodeMap::iterator iterator;
101
- ~NodeMapping();
102
- iterator begin() { return hm.begin(); };
103
- iterator end() { return hm.end(); };
104
- iterator find( NodeArray *na) { return hm.find( na ); };
105
- Node* &operator[]( NodeArray *na ) { return hm.operator[](na); };
106
-
107
- };
108
-
109
-
110
- /***************** class LabelMapping ****************************/
111
-
112
- class LabelMapping {
113
- // This class is used to map a label to a node set
114
-
115
- private:
116
- struct hashf {
117
- size_t operator()(const Label l) const {
118
- return l.lower_char() | (l.upper_char() << 16);
119
- }
120
- };
121
- struct equalf {
122
- int operator()(const Label l1, const Label l2) const {
123
- return l1==l2;
124
- }
125
- };
126
- typedef hash_map<const Label, NodeSet, hashf, equalf> LabelMap;
127
- LabelMap lm;
128
-
129
- public:
130
- LabelMapping(): lm(8) {};
131
- typedef LabelMap::iterator iterator;
132
- iterator begin() { return lm.begin(); };
133
- iterator end() { return lm.end(); };
134
- size_t size() { return lm.size(); };
135
- iterator find( Label l) { return lm.find( l ); };
136
- NodeSet &operator[]( const Label l ) { return lm.operator[]( l ); };
137
-
138
- };
139
-
140
- static void determinise_node( NodeArray&, Node*, Transducer*, NodeMapping&, long );
141
-
142
-
143
-
144
- /*******************************************************************/
145
- /* */
146
- /* NodeSet::add */
147
- /* */
148
- /*******************************************************************/
149
-
150
- void NodeSet::add( Node *node )
151
-
152
- {
153
- pair<iterator, bool> result = ht.insert(node);
154
- if (result.second) {
155
- // new node, add nodes reachable with epsilon transitions
156
- for( ArcsIter p(node->arcs(),ArcsIter::eps); p; p++ ) {
157
- Arc *arc=p;
158
- if (!arc->label().is_epsilon())
159
- break;
160
- add(arc->target_node());
161
- }
162
- }
163
- }
164
-
165
-
166
- /*******************************************************************/
167
- /* */
168
- /* NodeArray::NodeArray */
169
- /* */
170
- /*******************************************************************/
171
-
172
- NodeArray::NodeArray( NodeSet &ns )
173
-
174
- {
175
- sizev = 0;
176
- NodeSet::iterator it;
177
-
178
- final = false;
179
- node = new Node*[ns.size()];
180
- for( it=ns.begin(); it!=ns.end(); it++ ) {
181
- Node *nn = *it;
182
- if (nn->arcs()->non_epsilon_transition_exists())
183
- node[sizev++] = nn;
184
- final |= nn->is_final();
185
- }
186
- std::sort(node, node+sizev);
187
- }
188
-
189
-
190
- /*******************************************************************/
191
- /* */
192
- /* NodeMapping::~NodeMapping */
193
- /* */
194
- /*******************************************************************/
195
-
196
- NodeMapping::~NodeMapping()
197
-
198
- {
199
- // if we delete NodeArrays without removing them from NodeMapping,
200
- // the system will crash when NodeMapping is deleted.
201
- for( iterator it=hm.begin(); it!=hm.end(); ) {
202
- NodeArray *na=it->first;
203
- iterator old = it++;
204
- hm.erase(old);
205
- delete na;
206
- }
207
- }
208
-
209
-
210
- /*******************************************************************/
211
- /* */
212
- /* compute_transitions */
213
- /* */
214
- /*******************************************************************/
215
-
216
- static void compute_transitions( NodeArray &na, vector<Transition> &t )
217
-
218
- {
219
- LabelMapping lmap;
220
-
221
- // for all nodes in the current set
222
- for( size_t i=0; i<na.size(); i++) {
223
- Node *n = na[i]; // old node
224
-
225
- // For each non-epsilon transition, add the target node
226
- // to the respective node set.
227
- for( ArcsIter p(n->arcs(),ArcsIter::non_eps); p; p++ ) {
228
- Arc *arc=p;
229
- lmap[arc->label()].add(arc->target_node());
230
- }
231
- }
232
-
233
- t.reserve(lmap.size());
234
- for( LabelMapping::iterator it=lmap.begin(); it!=lmap.end(); it++ )
235
- t.push_back(Transition(it->first, new NodeArray( it->second )));
236
- }
237
-
238
-
239
- /*******************************************************************/
240
- /* */
241
- /* determinise_node */
242
- /* */
243
- /*******************************************************************/
244
-
245
- static void determinise_node( NodeArray &na, Node *node, Transducer *a,
246
- NodeMapping &map, long depth )
247
- {
248
- if (depth > 10000)
249
- fprintf(stderr,"\r%ld",depth);
250
- node->set_final(na.is_final());
251
-
252
- vector<Transition> t;
253
- compute_transitions( na, t );
254
-
255
- for( size_t i=0; i<t.size(); i++ ) {
256
- NodeMapping::iterator it=map.find(t[i].nodes);
257
- if (it == map.end()) {
258
- // new node set
259
- Node *target_node = a->new_node();
260
- map[t[i].nodes] = target_node;
261
- node->add_arc( t[i].label, target_node, a );
262
- determinise_node( *t[i].nodes, target_node, a, map, depth+1 );
263
- }
264
- else {
265
- delete t[i].nodes;
266
- node->add_arc( t[i].label, it->second, a );
267
- }
268
- }
269
- }
270
-
271
-
272
- /*******************************************************************/
273
- /* */
274
- /* Transducer::determinise */
275
- /* */
276
- /*******************************************************************/
277
-
278
- Transducer &Transducer::determinise()
279
-
280
- {
281
- // initialisations
282
- NodeMapping map;
283
-
284
- Transducer *a = new Transducer();
285
- a->alphabet.copy(alphabet);
286
-
287
- // creation of the initial node set consisting of all nodes
288
- // reachable from the start node via epsilon transitions.
289
- NodeArray *na;
290
- {
291
- NodeSet ns;
292
- ns.add(root_node());
293
- na = new NodeArray(ns);
294
- }
295
-
296
- // map the node set to the new root node
297
- map[na] = a->root_node();
298
-
299
- // determinise the transducer recursively
300
- determinise_node( *na, a->root_node(), a, map, 0);
301
- a->deterministic = 1;
302
- return *a;
303
- }
@@ -1,1000 +0,0 @@
1
-
2
- /*******************************************************************/
3
- /* */
4
- /* FILE fst.C */
5
- /* MODULE fst */
6
- /* PROGRAM SFST */
7
- /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
8
- /* */
9
- /* PURPOSE basic FST functions */
10
- /* */
11
- /*******************************************************************/
12
-
13
- #include "fst.h"
14
-
15
- using std::vector;
16
- using std::istream;
17
- using std::ostream;
18
- using std::cerr;
19
-
20
// Size (in characters) of the buffers declared with this constant
// in this file.
static const int BUFFER_SIZE = 100000;
21
-
22
-
23
- /*******************************************************************/
24
- /* */
25
- /* Arcs::size */
26
- /* */
27
- /*******************************************************************/
28
-
29
- int Arcs::size() const
30
-
31
- {
32
- int n=0;
33
- for( Arc *p=first_arcp; p; p=p->next ) n++;
34
- for( Arc *p=first_epsilon_arcp; p; p=p->next ) n++;
35
- return n;
36
- }
37
-
38
-
39
- /*******************************************************************/
40
- /* */
41
- /* Arcs::target_node */
42
- /* */
43
- /*******************************************************************/
44
-
45
- Node *Arcs::target_node( Label l )
46
-
47
- {
48
- Arc *arc;
49
-
50
- for( arc=first_arcp; arc; arc=arc->next)
51
- if (arc->label() == l)
52
- return arc->target_node();
53
-
54
- return NULL;
55
- }
56
-
57
- const Node *Arcs::target_node( Label l ) const
58
-
59
- {
60
- const Arc *arc;
61
-
62
- for( arc=first_arcp; arc; arc=arc->next)
63
- if (arc->label() == l)
64
- return arc->target_node();
65
-
66
- return NULL;
67
- }
68
-
69
-
70
- /*******************************************************************/
71
- /* */
72
- /* Arcs::add_arc */
73
- /* */
74
- /*******************************************************************/
75
-
76
- void Arcs::add_arc( Label l, Node *node, Transducer *a )
77
-
78
- {
79
- Arc *arc=a->new_arc( l, node );
80
-
81
- if (l.is_epsilon()) {
82
- arc->next = first_epsilon_arcp;
83
- first_epsilon_arcp = arc;
84
- }
85
- else {
86
- arc->next = first_arcp;
87
- first_arcp = arc;
88
- }
89
- }
90
-
91
-
92
- /*******************************************************************/
93
- /* */
94
- /* Arcs::remove_arc */
95
- /* */
96
- /*******************************************************************/
97
-
98
- int Arcs::remove_arc( Arc *arc )
99
-
100
- {
101
- Arc **p = (arc->label().is_epsilon()) ? &first_epsilon_arcp : &first_arcp;
102
- for( ; *p; p=&(*p)->next )
103
- if (*p == arc) {
104
- *p = arc->next;
105
- return 1;
106
- }
107
- return 0;
108
- }
109
-
110
-
111
- /*******************************************************************/
112
- /* */
113
- /* Node::init */
114
- /* */
115
- /*******************************************************************/
116
-
117
- void Node::init()
118
-
119
- {
120
- final = false;
121
- visited = 0;
122
- arcsp.init();
123
- forwardp = NULL;
124
- }
125
-
126
-
127
- /*******************************************************************/
128
- /* */
129
- /* Node::clear_visited */
130
- /* */
131
- /*******************************************************************/
132
-
133
- void Node::clear_visited( NodeHashSet &nodeset )
134
-
135
- {
136
- if (nodeset.find( this ) == nodeset.end()) {
137
- visited = 0;
138
- nodeset.insert( this );
139
- fprintf(stderr," %lu", nodeset.size());
140
- for( ArcsIter p(arcs()); p; p++ ) {
141
- Arc *arc=p;
142
- arc->target_node()->clear_visited( nodeset );
143
- }
144
- }
145
- }
146
-
147
-
148
- /*******************************************************************/
149
- /* */
150
- /* NodeNumbering::number_node */
151
- /* */
152
- /*******************************************************************/
153
-
154
- void NodeNumbering::number_node( Node *node, Transducer &a )
155
-
156
- {
157
- if (!node->was_visited( a.vmark )) {
158
- nummap[node] = nodes.size();
159
- nodes.push_back(node);
160
- for( ArcsIter p(node->arcs()); p; p++ ) {
161
- Arc *arc=p;
162
- number_node( arc->target_node(), a );
163
- }
164
- }
165
- }
166
-
167
-
168
- /*******************************************************************/
169
- /* */
170
- /* NodeNumbering::NodeNumbering */
171
- /* */
172
- /*******************************************************************/
173
-
174
- NodeNumbering::NodeNumbering( Transducer &a )
175
-
176
- {
177
- a.incr_vmark();
178
- number_node( a.root_node(), a );
179
- }
180
-
181
-
182
- /*******************************************************************/
183
- /* */
184
- /* Transducer::new_node */
185
- /* */
186
- /*******************************************************************/
187
-
188
- Node *Transducer::new_node()
189
-
190
- {
191
- Node *node=(Node*)mem.alloc( sizeof(Node) );
192
-
193
- node->init();
194
- return node;
195
- }
196
-
197
-
198
- /*******************************************************************/
199
- /* */
200
- /* Transducer::new_arc */
201
- /* */
202
- /*******************************************************************/
203
-
204
- Arc *Transducer::new_arc( Label l, Node *target )
205
-
206
- {
207
- Arc *arc=(Arc*)mem.alloc( sizeof(Arc) );
208
-
209
- arc->init( l, target);
210
- return arc;
211
- }
212
-
213
-
214
- /*******************************************************************/
215
- /* */
216
- /* Transducer::add_string */
217
- /* */
218
- /*******************************************************************/
219
-
220
- void Transducer::add_string( char *s, bool extended, Alphabet *a )
221
-
222
- {
223
- if (a == NULL)
224
- a = &alphabet;
225
-
226
- Node *node=root_node();
227
- Label l;
228
- while (!(l = a->next_label(s, extended)).is_epsilon()) {
229
- a->insert(l);
230
- Arcs *arcs=node->arcs();
231
- node = arcs->target_node( l );
232
- if (node == NULL) {
233
- node = new_node();
234
- arcs->add_arc( l, node, this );
235
- }
236
- }
237
- node->set_final(1);
238
- }
239
-
240
-
241
- /*******************************************************************/
242
- /* */
243
- /* Transducer::Transducer */
244
- /* */
245
- /*******************************************************************/
246
-
247
- Transducer::Transducer( vector<Label> &path )
248
- : root(), mem()
249
- {
250
- Node *node=root_node();
251
-
252
- vmark = 0;
253
- deterministic = minimised = true;
254
- for( size_t i=0; i<path.size(); i++ ) {
255
- Arcs *arcs=node->arcs();
256
- node = new_node();
257
- arcs->add_arc( path[i], node, this );
258
- }
259
- node->set_final(1);
260
- }
261
-
262
-
263
- /*******************************************************************/
264
- /* */
265
- /* Transducer::Transducer */
266
- /* */
267
- /*******************************************************************/
268
-
269
- Transducer::Transducer( istream &is, const Alphabet *a, bool verbose )
270
- : root(), mem()
271
- {
272
- bool extended=false;
273
- int n=0;
274
- char buffer[10000];
275
-
276
- vmark = 0;
277
- deterministic = true;
278
- minimised = false;
279
- if (a) {
280
- alphabet.copy(*a);
281
- extended = true;
282
- }
283
- while (is.getline(buffer, 10000)) {
284
- if (verbose && ++n % 10000 == 0) {
285
- if (n == 10000)
286
- cerr << "\n";
287
- cerr << "\r" << n << " words";
288
- }
289
- // delete final whitespace characters
290
- int l;
291
- for( l=strlen(buffer)-1; l>=0; l-- )
292
- if ((buffer[l] != ' ' && buffer[l] != '\t' && buffer[l] != '\r') ||
293
- (l > 0 && buffer[l-1] == '\\'))
294
- break;
295
- buffer[l+1] = 0;
296
-
297
- add_string(buffer, extended);
298
- }
299
- if (verbose && n >= 10000)
300
- cerr << "\n";
301
- }
302
-
303
-
304
- /*******************************************************************/
305
- /* */
306
- /* Transducer::Transducer */
307
- /* */
308
- /*******************************************************************/
309
-
310
- Transducer::Transducer( char *s, const Alphabet *a, bool extended )
311
- : root(), mem()
312
- {
313
- vmark = 0;
314
- deterministic = minimised = true;
315
- if (a)
316
- alphabet.copy(*a);
317
- add_string(s, extended);
318
- }
319
-
320
-
321
- /*******************************************************************/
322
- /* */
323
- /* Transducer::clear */
324
- /* */
325
- /*******************************************************************/
326
-
327
- void Transducer::clear()
328
-
329
- {
330
- vmark = 0;
331
- deterministic = minimised = false;
332
- root.init();
333
- mem.clear();
334
- alphabet.clear();
335
- }
336
-
337
-
338
- /*******************************************************************/
339
- /* */
340
- /* Transducer::store_symbols */
341
- /* */
342
- /*******************************************************************/
343
-
344
- void Transducer::store_symbols(Node *node, SymbolMap &symbol, LabelSet &labels)
345
-
346
- {
347
- if (!node->was_visited( vmark )) {
348
- Arcs *arcs=node->arcs();
349
- for( ArcsIter p(arcs); p; p++ ) {
350
- Arc *arc=p;
351
- Label l=arc->label();
352
-
353
- labels.insert(l);
354
-
355
- Character c = l.upper_char();
356
- if (symbol.find(c) == symbol.end()) {
357
- const char *s = alphabet.code2symbol(c);
358
- if (s)
359
- symbol[c] = fst_strdup(s);
360
- }
361
-
362
- c = l.lower_char();
363
- if (symbol.find(c) == symbol.end()) {
364
- const char *s = alphabet.code2symbol(c);
365
- if (s)
366
- symbol[c] = fst_strdup(s);
367
- }
368
-
369
- store_symbols( arc->target_node(), symbol, labels );
370
- }
371
- }
372
- }
373
-
374
-
375
- /*******************************************************************/
376
- /* */
377
- /* Transducer::minimise_alphabet */
378
- /* */
379
- /*******************************************************************/
380
-
381
- void Transducer::minimise_alphabet()
382
-
383
- {
384
- SymbolMap symbols;
385
- LabelSet labels;
386
- incr_vmark();
387
- store_symbols(root_node(), symbols, labels);
388
- alphabet.clear();
389
- for( SymbolMap::iterator it=symbols.begin(); it!=symbols.end(); it++ ) {
390
- alphabet.add_symbol( it->second, it->first );
391
- free(it->second);
392
- }
393
- for( LabelSet::iterator it=labels.begin(); it!=labels.end(); it++ )
394
- alphabet.insert(*it);
395
- }
396
-
397
-
398
- /*******************************************************************/
399
- /* */
400
- /* Transducer::minimise */
401
- /* */
402
- /*******************************************************************/
403
-
404
- Transducer &Transducer::minimise( bool verbose )
405
-
406
- {
407
- if (minimised)
408
- return copy();
409
-
410
- Transducer *a1, *a2;
411
-
412
- a1 = &reverse();
413
- a2 = &a1->determinise();
414
- delete a1;
415
-
416
- a1 = &a2->reverse();
417
- delete a2;
418
-
419
- a2 = &a1->determinise();
420
- delete a1;
421
-
422
- a2->minimised = true;
423
- a2->minimise_alphabet();
424
-
425
- return *a2;
426
- }
427
-
428
-
429
- /*******************************************************************/
430
- /* */
431
- /* Transducer::enumerate_paths_node */
432
- /* */
433
- /*******************************************************************/
434
-
435
- void Transducer::enumerate_paths_node( Node *node, vector<Label> &path,
436
- NodeHashSet &previous,
437
- vector<Transducer*> &result )
438
- {
439
- if (node->is_final())
440
- result.push_back(new Transducer(path));
441
-
442
- for( ArcsIter it_arc(node->arcs()); it_arc; it_arc++ ) {
443
- Arc *arc=it_arc;
444
-
445
- NodeHashSet::iterator it_node=previous.insert(node).first;
446
- path.push_back(arc->label());
447
- enumerate_paths_node( arc->target_node(), path, previous, result );
448
- path.pop_back();
449
- previous.erase(it_node);
450
- }
451
- }
452
-
453
-
454
- /*******************************************************************/
455
- /* */
456
- /* Transducer::enumerate_paths */
457
- /* */
458
- /*******************************************************************/
459
-
460
- bool Transducer::enumerate_paths( vector<Transducer*> &result )
461
-
462
- {
463
- if (is_infinitely_ambiguous())
464
- return true;
465
- for( size_t i=0; i<result.size(); i++ )
466
- delete result[i];
467
- result.clear();
468
-
469
- vector<Label> path;
470
- NodeHashSet previous;
471
- enumerate_paths_node( root_node(), path, previous, result );
472
- return false;
473
- }
474
-
475
-
476
-
477
-
478
- /*******************************************************************/
479
- /* */
480
- /* Transducer::print_strings_node */
481
- /* */
482
- /*******************************************************************/
483
-
484
- int Transducer::print_strings_node(Node *node, char *buffer, int pos,
485
- FILE *file, bool with_brackets )
486
- {
487
- int result = 0;
488
-
489
- if (node->was_visited( vmark )) {
490
- if (node->forward() != NULL) { // cycle detected
491
- cerr << "Warning: cyclic analyses (cycle aborted)\n";
492
- return 0;
493
- }
494
- node->set_forward(node); // used like a flag for loop detection
495
- }
496
- if (pos == BUFFER_SIZE)
497
- throw "Output string in function print_strings_node is too long";
498
- if (node->is_final()) {
499
- buffer[pos] = '\0';
500
- fprintf(file,"%s\n", buffer);
501
- result = 1;
502
- }
503
- for( ArcsIter i(node->arcs()); i; i++ ) {
504
- int p=pos;
505
- Arc *arc=i;
506
- Label l=arc->label();
507
- alphabet.write_label(l, buffer, &p, with_brackets);
508
- result |= print_strings_node(arc->target_node(), buffer, p,
509
- file, with_brackets );
510
- }
511
- node->set_forward(NULL);
512
-
513
- return result;
514
- }
515
-
516
-
517
- /*******************************************************************/
518
- /* */
519
- /* Transducer::print_strings */
520
- /* */
521
- /*******************************************************************/
522
-
523
- int Transducer::print_strings( FILE *file, bool with_brackets )
524
-
525
- {
526
- char buffer[BUFFER_SIZE];
527
- incr_vmark();
528
- return print_strings_node( root_node(), buffer, 0, file, with_brackets );
529
- }
530
-
531
-
532
- /*******************************************************************/
533
- /* */
534
- /* Transducer::analyze_string */
535
- /* */
536
- /*******************************************************************/
537
-
538
- bool Transducer::analyze_string( char *string, FILE *file, bool with_brackets )
539
-
540
- {
541
- vector<Character> input;
542
- alphabet.string2symseq( string, input );
543
- vector<Label> labels;
544
- for( size_t i=0; i<input.size(); i++ )
545
- labels.push_back(Label(input[i]));
546
-
547
- Transducer a1(labels);
548
- Transducer *a2=&(*this || a1);
549
- Transducer *a3=&(a2->lower_level());
550
- delete a2;
551
- a2 = &a3->minimise();
552
- delete a3;
553
-
554
- a2->alphabet.copy(alphabet);
555
- bool result = a2->print_strings( file, with_brackets );
556
- delete a2;
557
- return result;
558
- }
559
-
560
-
561
- /*******************************************************************/
562
- /* */
563
- /* Transducer::generate_string */
564
- /* */
565
- /*******************************************************************/
566
-
567
- bool Transducer::generate_string( char *string, FILE *file, bool with_brackets)
568
-
569
- {
570
- Transducer a1(string, &alphabet, false);
571
- Transducer *a2=&(a1 || *this);
572
- Transducer *a3=&(a2->upper_level());
573
- delete a2;
574
- a2 = &a3->minimise();
575
- delete a3;
576
-
577
- a2->alphabet.copy(alphabet);
578
- bool result = a2->print_strings( file, with_brackets );
579
- delete a2;
580
- return result;
581
- }
582
-
583
-
584
- /*******************************************************************/
585
- /* */
586
- /* complete */
587
- /* */
588
- /*******************************************************************/
589
-
590
- static void complete( Node *node, Alphabet &alphabet, int vmark)
591
-
592
- {
593
- if (node->was_visited( vmark ))
594
- return;
595
- for( ArcsIter p(node->arcs()); p; p++ ) {
596
- Arc *arc=p;
597
- if (!arc->label().is_epsilon())
598
- alphabet.insert(arc->label());
599
- complete(arc->target_node(), alphabet, vmark);
600
- }
601
- }
602
-
603
-
604
- /*******************************************************************/
605
- /* */
606
- /* Transducer::complete_alphabet */
607
- /* */
608
- /*******************************************************************/
609
-
610
- void Transducer::complete_alphabet()
611
-
612
- {
613
- incr_vmark();
614
- complete(root_node(), alphabet, vmark);
615
- }
616
-
617
-
618
- /*******************************************************************/
619
- /* */
620
- /* print_node */
621
- /* */
622
- /*******************************************************************/
623
-
624
- static void print_node( ostream &s, Node *node, NodeNumbering &index,
625
- long vmark, Alphabet &abc )
626
-
627
- {
628
- if (!node->was_visited( vmark )) {
629
- Arcs *arcs=node->arcs();
630
- for( ArcsIter p(arcs); p; p++ ) {
631
- Arc *arc=p;
632
- s << index[node] << "\t" << index[arc->target_node()];
633
- s << "\t" << abc.write_char(arc->label().lower_char());
634
- s << "\t" << abc.write_char(arc->label().upper_char());
635
- s << "\n";
636
- }
637
- if (node->is_final())
638
- s << index[node] << "\n";
639
- for( ArcsIter p(arcs); p; p++ ) {
640
- Arc *arc=p;
641
- print_node( s, arc->target_node(), index, vmark, abc );
642
- }
643
- }
644
- }
645
-
646
-
647
- /*******************************************************************/
648
- /* */
649
- /* operator<< */
650
- /* */
651
- /*******************************************************************/
652
-
653
- ostream &operator<<( ostream &s, Transducer &a )
654
-
655
- {
656
- NodeNumbering index(a);
657
- a.incr_vmark();
658
- print_node( s, a.root_node(), index, a.vmark, a.alphabet );
659
- return s;
660
- }
661
-
662
-
663
- /*******************************************************************/
664
- /* */
665
- /* store_node_info */
666
- /* */
667
- /*******************************************************************/
668
-
669
- static void store_node_info( FILE *file, Node *node )
670
-
671
- {
672
- // write final flag
673
- char c=node->is_final();
674
- fwrite(&c,sizeof(c),1,file);
675
-
676
- // write the number of arcs
677
- int nn = node->arcs()->size();
678
- if (nn > 65535)
679
- throw "Error: in function store_node\n";
680
- unsigned short n=(unsigned short)nn;
681
- fwrite(&n,sizeof(n),1,file);
682
- }
683
-
684
-
685
- /*******************************************************************/
686
- /* */
687
- /* store_arc_label */
688
- /* */
689
- /*******************************************************************/
690
-
691
- static void store_arc_label( FILE *file, Arc *arc )
692
-
693
- {
694
- Label l=arc->label();
695
- Character lc=l.lower_char();
696
- Character uc=l.upper_char();
697
- fwrite(&lc,sizeof(lc),1,file);
698
- fwrite(&uc,sizeof(uc),1,file);
699
- }
700
-
701
-
702
- /*******************************************************************/
703
- /* */
704
- /* store_node */
705
- /* */
706
- /*******************************************************************/
707
-
708
- static void store_node( FILE *file, Node *node, NodeNumbering &index,
709
- long vmark )
710
- {
711
- if (!node->was_visited( vmark )) {
712
-
713
- store_node_info( file, node );
714
-
715
- // write the arcs
716
- for( ArcsIter p(node->arcs()); p; p++ ) {
717
- Arc *arc=p;
718
- store_arc_label( file, arc );
719
- unsigned int t=index[arc->target_node()];
720
- fwrite(&t,sizeof(t),1,file);
721
- store_node(file, arc->target_node(), index, vmark );
722
- }
723
- }
724
- }
725
-
726
-
727
- /*******************************************************************/
728
- /* */
729
- /* store_lowmem_node */
730
- /* */
731
- /*******************************************************************/
732
-
733
- static void store_lowmem_node( FILE *file, Node *node, NodeNumbering &index,
734
- vector<unsigned int> &startpos)
735
- {
736
- store_node_info( file, node );
737
-
738
- // write the arcs
739
- for( ArcsIter p(node->arcs()); p; p++ ) {
740
- Arc *arc=p;
741
- store_arc_label( file, arc );
742
- unsigned int t=startpos[index[arc->target_node()]];
743
- fwrite(&t,sizeof(t),1,file);
744
- }
745
- }
746
-
747
-
748
- /*******************************************************************/
749
- /* */
750
- /* Transducer::store_lowmem */
751
- /* */
752
- /*******************************************************************/
753
-
754
- void Transducer::store_lowmem( FILE *file )
755
-
756
- {
757
- fputc('l',file);
758
- alphabet.store(file);
759
-
760
- // storing size of index table
761
- NodeNumbering index(*this);
762
-
763
- // compute the start position of the first node
764
- unsigned int pos=(unsigned int)ftell(file);
765
- vector<unsigned int> startpos;
766
- for( size_t i=0; i<index.number_of_nodes(); i++ ) {
767
- startpos.push_back(pos);
768
- Node *node=index.get_node(i);
769
- Arcs *arcs=node->arcs();
770
- pos += sizeof(char) // size of final flag
771
- + sizeof(unsigned short) // size of number of arcs
772
- + arcs->size() * (sizeof(Character) * 2 + sizeof(unsigned int)); // size of n arcs
773
- }
774
-
775
- // storing nodes
776
- for( size_t i=0; i<index.number_of_nodes(); i++ )
777
- store_lowmem_node( file, index.get_node(i), index, startpos );
778
- }
779
-
780
-
781
- /*******************************************************************/
782
- /* */
783
- /* Transducer::store */
784
- /* */
785
- /*******************************************************************/
786
-
787
- void Transducer::store( FILE *file )
788
-
789
- {
790
- fputc('a',file);
791
-
792
- NodeNumbering index(*this);
793
- incr_vmark();
794
- unsigned int n=index.number_of_nodes();
795
- fwrite(&n,sizeof(n),1,file);
796
- store_node( file, root_node(), index, vmark );
797
-
798
- alphabet.store(file);
799
- }
800
-
801
-
802
- /*******************************************************************/
803
- /* */
804
- /* read_node */
805
- /* */
806
- /*******************************************************************/
807
-
808
- static void read_node( FILE *file, Node *node, Node **p, Transducer *a )
809
- {
810
- char c;
811
- fread(&c,sizeof(c),1,file);
812
- node->set_final(c);
813
-
814
- unsigned short n;
815
- fread( &n, sizeof(n), 1, file);
816
-
817
- for( int i=0; i<n; i++ ) {
818
- Character lc,uc;
819
- unsigned int t;
820
- fread(&lc,sizeof(lc),1,file);
821
- fread(&uc,sizeof(uc),1,file);
822
- fread(&t,sizeof(t),1,file);
823
- if (ferror(file))
824
- throw "Error encountered while reading transducer from file";
825
- if (p[t])
826
- node->add_arc( Label(lc,uc), p[t], a );
827
- else {
828
- p[t] = a->new_node();
829
- node->add_arc( Label(lc,uc), p[t], a );
830
- read_node(file, p[t], p, a );
831
- }
832
- }
833
- }
834
-
835
-
836
- /*******************************************************************/
837
- /* */
838
- /* Transducer::read_transducer_binary */
839
- /* */
840
- /*******************************************************************/
841
-
842
- void Transducer::read_transducer_binary( FILE *file )
843
-
844
- {
845
- if (fgetc(file) != 'a')
846
- throw "Error: wrong file format (not a standard transducer)\n";
847
-
848
- vmark = deterministic = 0;
849
- unsigned int n;
850
- fread(&n,sizeof(n),1,file); // number of nodes
851
- if (ferror(file))
852
- throw "Error encountered while reading transducer from file";
853
-
854
- Node **p=new Node*[n]; // maps indices to nodes
855
- p[0] = root_node();
856
- for( unsigned int i=1; i<n; i++)
857
- p[i] = NULL;
858
- read_node( file, root_node(), p, this );
859
- delete[] p;
860
-
861
- alphabet.read(file);
862
-
863
- vmark = 1;
864
- deterministic = minimised = 1;
865
- }
866
-
867
-
868
- /*******************************************************************/
869
- /* */
870
- /* error_message */
871
- /* */
872
- /*******************************************************************/
873
-
874
// Formats "Error: in line <line> of text transducer file" into a static
// buffer and throws a pointer to it (exception type: char*).  This
// function never returns normally.
static void error_message( size_t line )
{
  static char message[1000];
  // bounded write; the line number is widened to unsigned long instead of
  // being truncated to unsigned int
  snprintf(message, sizeof(message),
           "Error: in line %lu of text transducer file",
           static_cast<unsigned long>(line));
  throw message;
}
882
-
883
-
884
- /*******************************************************************/
885
- /* */
886
- /* Transducer::create_node */
887
- /* */
888
- /*******************************************************************/
889
-
890
- Node *Transducer::create_node( vector<Node*> &node, char *s, size_t line )
891
-
892
- {
893
- char *p;
894
- long n = strtol(s, &p, 10);
895
-
896
- if (s == p || n < 0)
897
- error_message( line );
898
- if ((long)node.size() <= n)
899
- node.resize(n+1, NULL);
900
- if (node[n] == NULL)
901
- node[n] = new Node;
902
-
903
- return node[n];
904
- }
905
-
906
-
907
- /*******************************************************************/
908
- /* */
909
- /* next_string */
910
- /* */
911
- /*******************************************************************/
912
-
913
- static char *next_string( char* &s, size_t line )
914
-
915
- {
916
- // scan the input up to the next tab or newline character
917
- // and unquote symbols preceded by a backslash
918
- char *p = s;
919
- char *q = s;
920
- while (*q!=0 && *q!='\t' && *q!='\n' && *q!='\r') {
921
- if (*q == '\\')
922
- q++;
923
- *(p++) = *(q++);
924
- }
925
- if (p == s)
926
- error_message(line); // no string found
927
-
928
- char *result=s;
929
- // skip over following whitespace
930
- while (*q == ' ' || *q == '\t' || *q == '\n' || *q == '\r')
931
- q++;
932
-
933
- if (*q == 0)
934
- s = NULL; // end of string was reached
935
- else
936
- s = q; // move the string pointer s
937
-
938
- *p = 0; // mark the end of the result string
939
-
940
- return result;
941
- }
942
-
943
-
944
- /*******************************************************************/
945
- /* */
946
- /* Transducer::read_transducer_text */
947
- /* */
948
- /*******************************************************************/
949
-
950
- void Transducer::read_transducer_text( FILE *file )
951
-
952
- {
953
- vector<Node*> nodes;
954
- nodes.push_back(root_node());
955
-
956
- vmark = deterministic = 0;
957
- char buffer[10000];
958
- for( size_t line=0; fgets(buffer, 10000, file ); line++ ) {
959
- char *p = buffer;
960
- char *s = next_string(p, line);
961
- Node *node = create_node( nodes, s, line );
962
- if (p == NULL)
963
- node->set_final(true);
964
- else {
965
- s = next_string(p, line);
966
- Node *target = create_node( nodes, s, line );
967
-
968
- s = next_string(p, line);
969
- Character lc = alphabet.add_symbol(s);
970
- s = next_string(p, line);
971
- Character uc = alphabet.add_symbol(s);
972
- Label l(lc,uc);
973
- if (l == Label::epsilon)
974
- error_message( line );
975
-
976
- alphabet.insert(l);
977
- node->add_arc( l, target, this );
978
- }
979
- }
980
-
981
- vmark = 1;
982
- deterministic = minimised = 1;
983
- }
984
-
985
-
986
- /*******************************************************************/
987
- /* */
988
- /* Transducer::Transducer */
989
- /* */
990
- /*******************************************************************/
991
-
992
- Transducer::Transducer( FILE *file, bool binary )
993
-
994
- {
995
- if (binary)
996
- read_transducer_binary( file );
997
- else
998
- read_transducer_text( file );
999
- }
1000
-