ruby-sfst 0.4.3 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +1 -0
  3. data/COPYING +280 -0
  4. data/Gemfile +3 -0
  5. data/Gemfile.lock +54 -0
  6. data/README.md +1 -1
  7. data/Rakefile +9 -18
  8. data/bin/console +7 -0
  9. data/bin/setup +6 -0
  10. data/ext/sfst/alphabet.cc +879 -0
  11. data/ext/sfst/alphabet.h +302 -0
  12. data/ext/sfst/basic.cc +85 -0
  13. data/ext/{sfst_machine → sfst}/basic.h +7 -4
  14. data/ext/sfst/compact.cc +629 -0
  15. data/ext/sfst/compact.h +100 -0
  16. data/ext/sfst/determinise.cc +279 -0
  17. data/ext/{sfst_machine → sfst}/extconf.rb +2 -1
  18. data/ext/sfst/fst.cc +1150 -0
  19. data/ext/sfst/fst.h +374 -0
  20. data/ext/sfst/hopcroft.cc +681 -0
  21. data/ext/sfst/interface.cc +1921 -0
  22. data/ext/sfst/interface.h +171 -0
  23. data/ext/sfst/make-compact.cc +323 -0
  24. data/ext/{sfst_machine → sfst}/make-compact.h +15 -13
  25. data/ext/sfst/mem.h +80 -0
  26. data/ext/sfst/operators.cc +1273 -0
  27. data/ext/{sfst_machine → sfst}/sfst_machine.cc +89 -78
  28. data/ext/sfst/sgi.h +72 -0
  29. data/ext/sfst/utf8.cc +149 -0
  30. data/ext/{sfst_machine → sfst}/utf8.h +7 -4
  31. data/lib/sfst.rb +2 -1
  32. data/lib/sfst/version.rb +1 -1
  33. data/ruby-sfst.gemspec +23 -23
  34. metadata +107 -35
  35. data/ext/sfst_machine/alphabet.cc +0 -812
  36. data/ext/sfst_machine/alphabet.h +0 -273
  37. data/ext/sfst_machine/basic.cc +0 -84
  38. data/ext/sfst_machine/compact.cc +0 -616
  39. data/ext/sfst_machine/compact.h +0 -98
  40. data/ext/sfst_machine/determinise.cc +0 -303
  41. data/ext/sfst_machine/fst.cc +0 -1000
  42. data/ext/sfst_machine/fst.h +0 -369
  43. data/ext/sfst_machine/interface.cc +0 -1842
  44. data/ext/sfst_machine/interface.h +0 -93
  45. data/ext/sfst_machine/make-compact.cc +0 -327
  46. data/ext/sfst_machine/mem.h +0 -74
  47. data/ext/sfst_machine/operators.cc +0 -1131
  48. data/ext/sfst_machine/sgi.h +0 -44
  49. data/ext/sfst_machine/utf8.cc +0 -146
  50. data/test/test_sfst.fst +0 -3
  51. data/test/test_sfst.rb +0 -114
@@ -1,98 +0,0 @@
1
- /*******************************************************************/
2
- /* */
3
- /* FILE compact.h */
4
- /* MODULE compact */
5
- /* PROGRAM SFST */
6
- /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
7
- /* */
8
- /* PURPOSE finite state tools */
9
- /* */
10
- /*******************************************************************/
11
-
12
- #ifndef _COMPACT_H_
13
- #define _COMPACT_H_
14
-
15
- #include "alphabet.h"
16
-
17
- #include <vector>
18
-
19
- typedef std::vector<unsigned int> CAnalysis;
20
-
21
- class CompactTransducer {
22
-
23
- protected:
24
-
25
- // the following data structures are used to store the nodes
26
-
27
- unsigned int number_of_nodes; // number of nodes in the transducer
28
- char *finalp; // finalp[i] is 1 if node i is final and 0 otherwise
29
- unsigned int *first_arc; // first_arc[i] is the number of the first
30
- // arc outgoing from node i
31
-
32
- // the following data structures are used to store the transition arcs
33
-
34
- unsigned int number_of_arcs; // total number of arcs in the transducer
35
- Label *label; // the label (character pair) of arc i
36
- unsigned int *target_node; // target node of arc i
37
-
38
- // the following data structures are used to store the stochastic parameters
39
- float *final_logprob;
40
- float *arc_logprob;
41
-
42
- // functions needed to read the transducer from a file
43
-
44
- void read_finalp( FILE *file );
45
- void read_first_arcs( FILE *file );
46
- void read_target_nodes( FILE *file );
47
- void read_labels( FILE *file );
48
- void read_probs( FILE *file );
49
-
50
- // functions needed to analyze data with the transducer
51
-
52
- void analyze( unsigned int n, std::vector<Character> &ch, size_t ipos,
53
- CAnalysis&, std::vector<CAnalysis>&);
54
-
55
- // function selecting the simplest morphological analysis
56
-
57
- int compute_score( CAnalysis &ana );
58
- void disambiguate( std::vector<CAnalysis> &analyses );
59
-
60
- // functions for longest-match analysis of input data
61
-
62
- void longest_match2(unsigned int, char*, int, CAnalysis&, int&, CAnalysis&);
63
-
64
- void convert( CAnalysis &cana, Analysis &ana );
65
-
66
- public:
67
- size_t node_count() { return number_of_nodes; };
68
- size_t arc_count() { return number_of_arcs; };
69
-
70
- bool both_layers; // print surface and analysis symbols
71
- bool simplest_only; // print only the simplest analyses
72
-
73
- Alphabet alphabet; // data structure which maps symbols to numeric codes
74
- CompactTransducer(); // dummy constructor
75
- CompactTransducer( FILE*, FILE *pfile=NULL ); // reads a (stochastic) transducer
76
- ~CompactTransducer(); // destroys a transducer
77
-
78
- // the analysis function returns the set of analyses for the string "s"
79
- // in the argument "analyses"
80
- void analyze_string( char *s, std::vector<CAnalysis > &analyses );
81
-
82
- void compute_probs( std::vector<CAnalysis> &analyses, std::vector<double> &prob );
83
- char *print_analysis( CAnalysis &ana );
84
-
85
- // longest-match analysis
86
- const char *longest_match( char*& );
87
-
88
- // EM training
89
- bool train2( char *s, std::vector<double> &arcfreq, std::vector<double> &finalfreq );
90
- bool train( char *s, std::vector<double> &arcfreq, std::vector<double> &finalfreq );
91
- void estimate_probs( std::vector<double> &arcfreq, std::vector<double> &finalfreq );
92
-
93
- // robust analysis
94
- float robust_analyze_string( char *string, std::vector<CAnalysis> &analyses,
95
- float ErrorsAllowed );
96
- };
97
-
98
- #endif
@@ -1,303 +0,0 @@
1
-
2
- /*******************************************************************/
3
- /* */
4
- /* FILE determinise.C */
5
- /* MODULE determinise */
6
- /* PROGRAM SFST */
7
- /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
8
- /* */
9
- /*******************************************************************/
10
-
11
-
12
- #include "fst.h"
13
-
14
- using std::vector;
15
- using std::pair;
16
- using std::set;
17
-
18
- /***************** class NodeSet *********************************/
19
-
20
- class NodeSet {
21
- // This class is used to store a set of nodes.
22
- // Whenever a new node is added, all nodes accessible
23
- // through epsilon transitions are added as well.
24
-
25
- private:
26
- set<Node*> ht;
27
-
28
- public:
29
- typedef set<Node*>::iterator iterator;
30
- NodeSet() {};
31
- void add( Node* );
32
- bool insert(Node *node) {
33
- pair<iterator, bool> result = ht.insert(node);
34
- return result.second;
35
- };
36
- iterator begin() const { return ht.begin(); }
37
- iterator end() const { return ht.end(); }
38
- size_t size() const { return ht.size(); }
39
- void clear() { ht.clear(); }
40
- };
41
-
42
-
43
- /***************** class NodeArray *******************************/
44
-
45
- class NodeArray {
46
-
47
- private:
48
- size_t sizev;
49
- bool final;
50
- Node **node;
51
-
52
- public:
53
- NodeArray( NodeSet& );
54
- ~NodeArray() { delete[] node; };
55
- size_t size() const { return sizev; }
56
- bool is_final() const { return final; };
57
- Node* &operator[]( int i ) const { return node[i]; }
58
- };
59
-
60
-
61
- /***************** class Transition ******************************/
62
-
63
- class Transition {
64
- public:
65
- Label label;
66
- NodeArray *nodes;
67
- Transition(Label l, NodeArray *na) { label = l; nodes = na; };
68
- };
69
-
70
-
71
- /***************** class NodeMapping ****************************/
72
-
73
- class NodeMapping {
74
- // This class is used to map a node set from one transducer
75
- // to a single node in another transducer
76
-
77
- private:
78
- struct hashf {
79
- size_t operator()(const NodeArray *na) const {
80
- size_t key=na->size() ^ na->is_final();
81
- for( size_t i=0; i<na->size(); i++)
82
- key = (key<<1) ^ (size_t)(*na)[i];
83
- return key;
84
- }
85
- };
86
- struct equalf {
87
- int operator()(const NodeArray *na1, const NodeArray *na2) const {
88
- if (na1->size() != na2->size() || na1->is_final() != na2->is_final())
89
- return 0;
90
- for( size_t i=0; i<na1->size(); i++)
91
- if ((*na1)[i] != (*na2)[i])
92
- return 0;
93
- return 1;
94
- }
95
- };
96
- typedef hash_map<NodeArray*, Node*, hashf, equalf> NodeMap;
97
- NodeMap hm;
98
-
99
- public:
100
- typedef NodeMap::iterator iterator;
101
- ~NodeMapping();
102
- iterator begin() { return hm.begin(); };
103
- iterator end() { return hm.end(); };
104
- iterator find( NodeArray *na) { return hm.find( na ); };
105
- Node* &operator[]( NodeArray *na ) { return hm.operator[](na); };
106
-
107
- };
108
-
109
-
110
- /***************** class LabelMapping ****************************/
111
-
112
- class LabelMapping {
113
- // This class is used to map a label to a node set
114
-
115
- private:
116
- struct hashf {
117
- size_t operator()(const Label l) const {
118
- return l.lower_char() | (l.upper_char() << 16);
119
- }
120
- };
121
- struct equalf {
122
- int operator()(const Label l1, const Label l2) const {
123
- return l1==l2;
124
- }
125
- };
126
- typedef hash_map<const Label, NodeSet, hashf, equalf> LabelMap;
127
- LabelMap lm;
128
-
129
- public:
130
- LabelMapping(): lm(8) {};
131
- typedef LabelMap::iterator iterator;
132
- iterator begin() { return lm.begin(); };
133
- iterator end() { return lm.end(); };
134
- size_t size() { return lm.size(); };
135
- iterator find( Label l) { return lm.find( l ); };
136
- NodeSet &operator[]( const Label l ) { return lm.operator[]( l ); };
137
-
138
- };
139
-
140
- static void determinise_node( NodeArray&, Node*, Transducer*, NodeMapping&, long );
141
-
142
-
143
-
144
- /*******************************************************************/
145
- /* */
146
- /* NodeSet::add */
147
- /* */
148
- /*******************************************************************/
149
-
150
- void NodeSet::add( Node *node )
151
-
152
- {
153
- pair<iterator, bool> result = ht.insert(node);
154
- if (result.second) {
155
- // new node, add nodes reachable with epsilon transitions
156
- for( ArcsIter p(node->arcs(),ArcsIter::eps); p; p++ ) {
157
- Arc *arc=p;
158
- if (!arc->label().is_epsilon())
159
- break;
160
- add(arc->target_node());
161
- }
162
- }
163
- }
164
-
165
-
166
- /*******************************************************************/
167
- /* */
168
- /* NodeArray::NodeArray */
169
- /* */
170
- /*******************************************************************/
171
-
172
- NodeArray::NodeArray( NodeSet &ns )
173
-
174
- {
175
- sizev = 0;
176
- NodeSet::iterator it;
177
-
178
- final = false;
179
- node = new Node*[ns.size()];
180
- for( it=ns.begin(); it!=ns.end(); it++ ) {
181
- Node *nn = *it;
182
- if (nn->arcs()->non_epsilon_transition_exists())
183
- node[sizev++] = nn;
184
- final |= nn->is_final();
185
- }
186
- std::sort(node, node+sizev);
187
- }
188
-
189
-
190
- /*******************************************************************/
191
- /* */
192
- /* NodeMapping::~NodeMapping */
193
- /* */
194
- /*******************************************************************/
195
-
196
- NodeMapping::~NodeMapping()
197
-
198
- {
199
- // if we delete NodeArrays without removing them from NodeMapping,
200
- // the system will crash when NodeMapping is deleted.
201
- for( iterator it=hm.begin(); it!=hm.end(); ) {
202
- NodeArray *na=it->first;
203
- iterator old = it++;
204
- hm.erase(old);
205
- delete na;
206
- }
207
- }
208
-
209
-
210
- /*******************************************************************/
211
- /* */
212
- /* compute_transitions */
213
- /* */
214
- /*******************************************************************/
215
-
216
- static void compute_transitions( NodeArray &na, vector<Transition> &t )
217
-
218
- {
219
- LabelMapping lmap;
220
-
221
- // for all nodes in the current set
222
- for( size_t i=0; i<na.size(); i++) {
223
- Node *n = na[i]; // old node
224
-
225
- // For each non-epsilon transition, add the target node
226
- // to the respective node set.
227
- for( ArcsIter p(n->arcs(),ArcsIter::non_eps); p; p++ ) {
228
- Arc *arc=p;
229
- lmap[arc->label()].add(arc->target_node());
230
- }
231
- }
232
-
233
- t.reserve(lmap.size());
234
- for( LabelMapping::iterator it=lmap.begin(); it!=lmap.end(); it++ )
235
- t.push_back(Transition(it->first, new NodeArray( it->second )));
236
- }
237
-
238
-
239
- /*******************************************************************/
240
- /* */
241
- /* determinise_node */
242
- /* */
243
- /*******************************************************************/
244
-
245
- static void determinise_node( NodeArray &na, Node *node, Transducer *a,
246
- NodeMapping &map, long depth )
247
- {
248
- if (depth > 10000)
249
- fprintf(stderr,"\r%ld",depth);
250
- node->set_final(na.is_final());
251
-
252
- vector<Transition> t;
253
- compute_transitions( na, t );
254
-
255
- for( size_t i=0; i<t.size(); i++ ) {
256
- NodeMapping::iterator it=map.find(t[i].nodes);
257
- if (it == map.end()) {
258
- // new node set
259
- Node *target_node = a->new_node();
260
- map[t[i].nodes] = target_node;
261
- node->add_arc( t[i].label, target_node, a );
262
- determinise_node( *t[i].nodes, target_node, a, map, depth+1 );
263
- }
264
- else {
265
- delete t[i].nodes;
266
- node->add_arc( t[i].label, it->second, a );
267
- }
268
- }
269
- }
270
-
271
-
272
- /*******************************************************************/
273
- /* */
274
- /* Transducer::determinise */
275
- /* */
276
- /*******************************************************************/
277
-
278
- Transducer &Transducer::determinise()
279
-
280
- {
281
- // initialisations
282
- NodeMapping map;
283
-
284
- Transducer *a = new Transducer();
285
- a->alphabet.copy(alphabet);
286
-
287
- // creation of the initial node set consisting of all nodes
288
- // reachable from the start node via epsilon transitions.
289
- NodeArray *na;
290
- {
291
- NodeSet ns;
292
- ns.add(root_node());
293
- na = new NodeArray(ns);
294
- }
295
-
296
- // map the node set to the new root node
297
- map[na] = a->root_node();
298
-
299
- // determinise the transducer recursively
300
- determinise_node( *na, a->root_node(), a, map, 0);
301
- a->deterministic = 1;
302
- return *a;
303
- }
@@ -1,1000 +0,0 @@
1
-
2
- /*******************************************************************/
3
- /* */
4
- /* FILE fst.C */
5
- /* MODULE fst */
6
- /* PROGRAM SFST */
7
- /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
8
- /* */
9
- /* PURPOSE basic FST functions */
10
- /* */
11
- /*******************************************************************/
12
-
13
- #include "fst.h"
14
-
15
- using std::vector;
16
- using std::istream;
17
- using std::ostream;
18
- using std::cerr;
19
-
20
- const int BUFFER_SIZE=100000;
21
-
22
-
23
- /*******************************************************************/
24
- /* */
25
- /* Arcs::size */
26
- /* */
27
- /*******************************************************************/
28
-
29
- int Arcs::size() const
30
-
31
- {
32
- int n=0;
33
- for( Arc *p=first_arcp; p; p=p->next ) n++;
34
- for( Arc *p=first_epsilon_arcp; p; p=p->next ) n++;
35
- return n;
36
- }
37
-
38
-
39
- /*******************************************************************/
40
- /* */
41
- /* Arcs::target_node */
42
- /* */
43
- /*******************************************************************/
44
-
45
- Node *Arcs::target_node( Label l )
46
-
47
- {
48
- Arc *arc;
49
-
50
- for( arc=first_arcp; arc; arc=arc->next)
51
- if (arc->label() == l)
52
- return arc->target_node();
53
-
54
- return NULL;
55
- }
56
-
57
- const Node *Arcs::target_node( Label l ) const
58
-
59
- {
60
- const Arc *arc;
61
-
62
- for( arc=first_arcp; arc; arc=arc->next)
63
- if (arc->label() == l)
64
- return arc->target_node();
65
-
66
- return NULL;
67
- }
68
-
69
-
70
- /*******************************************************************/
71
- /* */
72
- /* Arcs::add_arc */
73
- /* */
74
- /*******************************************************************/
75
-
76
- void Arcs::add_arc( Label l, Node *node, Transducer *a )
77
-
78
- {
79
- Arc *arc=a->new_arc( l, node );
80
-
81
- if (l.is_epsilon()) {
82
- arc->next = first_epsilon_arcp;
83
- first_epsilon_arcp = arc;
84
- }
85
- else {
86
- arc->next = first_arcp;
87
- first_arcp = arc;
88
- }
89
- }
90
-
91
-
92
- /*******************************************************************/
93
- /* */
94
- /* Arcs::remove_arc */
95
- /* */
96
- /*******************************************************************/
97
-
98
- int Arcs::remove_arc( Arc *arc )
99
-
100
- {
101
- Arc **p = (arc->label().is_epsilon()) ? &first_epsilon_arcp : &first_arcp;
102
- for( ; *p; p=&(*p)->next )
103
- if (*p == arc) {
104
- *p = arc->next;
105
- return 1;
106
- }
107
- return 0;
108
- }
109
-
110
-
111
- /*******************************************************************/
112
- /* */
113
- /* Node::init */
114
- /* */
115
- /*******************************************************************/
116
-
117
- void Node::init()
118
-
119
- {
120
- final = false;
121
- visited = 0;
122
- arcsp.init();
123
- forwardp = NULL;
124
- }
125
-
126
-
127
- /*******************************************************************/
128
- /* */
129
- /* Node::clear_visited */
130
- /* */
131
- /*******************************************************************/
132
-
133
- void Node::clear_visited( NodeHashSet &nodeset )
134
-
135
- {
136
- if (nodeset.find( this ) == nodeset.end()) {
137
- visited = 0;
138
- nodeset.insert( this );
139
- fprintf(stderr," %lu", nodeset.size());
140
- for( ArcsIter p(arcs()); p; p++ ) {
141
- Arc *arc=p;
142
- arc->target_node()->clear_visited( nodeset );
143
- }
144
- }
145
- }
146
-
147
-
148
- /*******************************************************************/
149
- /* */
150
- /* NodeNumbering::number_node */
151
- /* */
152
- /*******************************************************************/
153
-
154
- void NodeNumbering::number_node( Node *node, Transducer &a )
155
-
156
- {
157
- if (!node->was_visited( a.vmark )) {
158
- nummap[node] = nodes.size();
159
- nodes.push_back(node);
160
- for( ArcsIter p(node->arcs()); p; p++ ) {
161
- Arc *arc=p;
162
- number_node( arc->target_node(), a );
163
- }
164
- }
165
- }
166
-
167
-
168
- /*******************************************************************/
169
- /* */
170
- /* NodeNumbering::NodeNumbering */
171
- /* */
172
- /*******************************************************************/
173
-
174
- NodeNumbering::NodeNumbering( Transducer &a )
175
-
176
- {
177
- a.incr_vmark();
178
- number_node( a.root_node(), a );
179
- }
180
-
181
-
182
- /*******************************************************************/
183
- /* */
184
- /* Transducer::new_node */
185
- /* */
186
- /*******************************************************************/
187
-
188
- Node *Transducer::new_node()
189
-
190
- {
191
- Node *node=(Node*)mem.alloc( sizeof(Node) );
192
-
193
- node->init();
194
- return node;
195
- }
196
-
197
-
198
- /*******************************************************************/
199
- /* */
200
- /* Transducer::new_arc */
201
- /* */
202
- /*******************************************************************/
203
-
204
- Arc *Transducer::new_arc( Label l, Node *target )
205
-
206
- {
207
- Arc *arc=(Arc*)mem.alloc( sizeof(Arc) );
208
-
209
- arc->init( l, target);
210
- return arc;
211
- }
212
-
213
-
214
- /*******************************************************************/
215
- /* */
216
- /* Transducer::add_string */
217
- /* */
218
- /*******************************************************************/
219
-
220
- void Transducer::add_string( char *s, bool extended, Alphabet *a )
221
-
222
- {
223
- if (a == NULL)
224
- a = &alphabet;
225
-
226
- Node *node=root_node();
227
- Label l;
228
- while (!(l = a->next_label(s, extended)).is_epsilon()) {
229
- a->insert(l);
230
- Arcs *arcs=node->arcs();
231
- node = arcs->target_node( l );
232
- if (node == NULL) {
233
- node = new_node();
234
- arcs->add_arc( l, node, this );
235
- }
236
- }
237
- node->set_final(1);
238
- }
239
-
240
-
241
- /*******************************************************************/
242
- /* */
243
- /* Transducer::Transducer */
244
- /* */
245
- /*******************************************************************/
246
-
247
- Transducer::Transducer( vector<Label> &path )
248
- : root(), mem()
249
- {
250
- Node *node=root_node();
251
-
252
- vmark = 0;
253
- deterministic = minimised = true;
254
- for( size_t i=0; i<path.size(); i++ ) {
255
- Arcs *arcs=node->arcs();
256
- node = new_node();
257
- arcs->add_arc( path[i], node, this );
258
- }
259
- node->set_final(1);
260
- }
261
-
262
-
263
- /*******************************************************************/
264
- /* */
265
- /* Transducer::Transducer */
266
- /* */
267
- /*******************************************************************/
268
-
269
- Transducer::Transducer( istream &is, const Alphabet *a, bool verbose )
270
- : root(), mem()
271
- {
272
- bool extended=false;
273
- int n=0;
274
- char buffer[10000];
275
-
276
- vmark = 0;
277
- deterministic = true;
278
- minimised = false;
279
- if (a) {
280
- alphabet.copy(*a);
281
- extended = true;
282
- }
283
- while (is.getline(buffer, 10000)) {
284
- if (verbose && ++n % 10000 == 0) {
285
- if (n == 10000)
286
- cerr << "\n";
287
- cerr << "\r" << n << " words";
288
- }
289
- // delete final whitespace characters
290
- int l;
291
- for( l=strlen(buffer)-1; l>=0; l-- )
292
- if ((buffer[l] != ' ' && buffer[l] != '\t' && buffer[l] != '\r') ||
293
- (l > 0 && buffer[l-1] == '\\'))
294
- break;
295
- buffer[l+1] = 0;
296
-
297
- add_string(buffer, extended);
298
- }
299
- if (verbose && n >= 10000)
300
- cerr << "\n";
301
- }
302
-
303
-
304
- /*******************************************************************/
305
- /* */
306
- /* Transducer::Transducer */
307
- /* */
308
- /*******************************************************************/
309
-
310
- Transducer::Transducer( char *s, const Alphabet *a, bool extended )
311
- : root(), mem()
312
- {
313
- vmark = 0;
314
- deterministic = minimised = true;
315
- if (a)
316
- alphabet.copy(*a);
317
- add_string(s, extended);
318
- }
319
-
320
-
321
- /*******************************************************************/
322
- /* */
323
- /* Transducer::clear */
324
- /* */
325
- /*******************************************************************/
326
-
327
- void Transducer::clear()
328
-
329
- {
330
- vmark = 0;
331
- deterministic = minimised = false;
332
- root.init();
333
- mem.clear();
334
- alphabet.clear();
335
- }
336
-
337
-
338
- /*******************************************************************/
339
- /* */
340
- /* Transducer::store_symbols */
341
- /* */
342
- /*******************************************************************/
343
-
344
- void Transducer::store_symbols(Node *node, SymbolMap &symbol, LabelSet &labels)
345
-
346
- {
347
- if (!node->was_visited( vmark )) {
348
- Arcs *arcs=node->arcs();
349
- for( ArcsIter p(arcs); p; p++ ) {
350
- Arc *arc=p;
351
- Label l=arc->label();
352
-
353
- labels.insert(l);
354
-
355
- Character c = l.upper_char();
356
- if (symbol.find(c) == symbol.end()) {
357
- const char *s = alphabet.code2symbol(c);
358
- if (s)
359
- symbol[c] = fst_strdup(s);
360
- }
361
-
362
- c = l.lower_char();
363
- if (symbol.find(c) == symbol.end()) {
364
- const char *s = alphabet.code2symbol(c);
365
- if (s)
366
- symbol[c] = fst_strdup(s);
367
- }
368
-
369
- store_symbols( arc->target_node(), symbol, labels );
370
- }
371
- }
372
- }
373
-
374
-
375
- /*******************************************************************/
376
- /* */
377
- /* Transducer::minimise_alphabet */
378
- /* */
379
- /*******************************************************************/
380
-
381
- void Transducer::minimise_alphabet()
382
-
383
- {
384
- SymbolMap symbols;
385
- LabelSet labels;
386
- incr_vmark();
387
- store_symbols(root_node(), symbols, labels);
388
- alphabet.clear();
389
- for( SymbolMap::iterator it=symbols.begin(); it!=symbols.end(); it++ ) {
390
- alphabet.add_symbol( it->second, it->first );
391
- free(it->second);
392
- }
393
- for( LabelSet::iterator it=labels.begin(); it!=labels.end(); it++ )
394
- alphabet.insert(*it);
395
- }
396
-
397
-
398
- /*******************************************************************/
399
- /* */
400
- /* Transducer::minimise */
401
- /* */
402
- /*******************************************************************/
403
-
404
- Transducer &Transducer::minimise( bool verbose )
405
-
406
- {
407
- if (minimised)
408
- return copy();
409
-
410
- Transducer *a1, *a2;
411
-
412
- a1 = &reverse();
413
- a2 = &a1->determinise();
414
- delete a1;
415
-
416
- a1 = &a2->reverse();
417
- delete a2;
418
-
419
- a2 = &a1->determinise();
420
- delete a1;
421
-
422
- a2->minimised = true;
423
- a2->minimise_alphabet();
424
-
425
- return *a2;
426
- }
427
-
428
-
429
- /*******************************************************************/
430
- /* */
431
- /* Transducer::enumerate_paths_node */
432
- /* */
433
- /*******************************************************************/
434
-
435
- void Transducer::enumerate_paths_node( Node *node, vector<Label> &path,
436
- NodeHashSet &previous,
437
- vector<Transducer*> &result )
438
- {
439
- if (node->is_final())
440
- result.push_back(new Transducer(path));
441
-
442
- for( ArcsIter it_arc(node->arcs()); it_arc; it_arc++ ) {
443
- Arc *arc=it_arc;
444
-
445
- NodeHashSet::iterator it_node=previous.insert(node).first;
446
- path.push_back(arc->label());
447
- enumerate_paths_node( arc->target_node(), path, previous, result );
448
- path.pop_back();
449
- previous.erase(it_node);
450
- }
451
- }
452
-
453
-
454
- /*******************************************************************/
455
- /* */
456
- /* Transducer::enumerate_paths */
457
- /* */
458
- /*******************************************************************/
459
-
460
- bool Transducer::enumerate_paths( vector<Transducer*> &result )
461
-
462
- {
463
- if (is_infinitely_ambiguous())
464
- return true;
465
- for( size_t i=0; i<result.size(); i++ )
466
- delete result[i];
467
- result.clear();
468
-
469
- vector<Label> path;
470
- NodeHashSet previous;
471
- enumerate_paths_node( root_node(), path, previous, result );
472
- return false;
473
- }
474
-
475
-
476
-
477
-
478
- /*******************************************************************/
479
- /* */
480
- /* Transducer::print_strings_node */
481
- /* */
482
- /*******************************************************************/
483
-
484
- int Transducer::print_strings_node(Node *node, char *buffer, int pos,
485
- FILE *file, bool with_brackets )
486
- {
487
- int result = 0;
488
-
489
- if (node->was_visited( vmark )) {
490
- if (node->forward() != NULL) { // cycle detected
491
- cerr << "Warning: cyclic analyses (cycle aborted)\n";
492
- return 0;
493
- }
494
- node->set_forward(node); // used like a flag for loop detection
495
- }
496
- if (pos == BUFFER_SIZE)
497
- throw "Output string in function print_strings_node is too long";
498
- if (node->is_final()) {
499
- buffer[pos] = '\0';
500
- fprintf(file,"%s\n", buffer);
501
- result = 1;
502
- }
503
- for( ArcsIter i(node->arcs()); i; i++ ) {
504
- int p=pos;
505
- Arc *arc=i;
506
- Label l=arc->label();
507
- alphabet.write_label(l, buffer, &p, with_brackets);
508
- result |= print_strings_node(arc->target_node(), buffer, p,
509
- file, with_brackets );
510
- }
511
- node->set_forward(NULL);
512
-
513
- return result;
514
- }
515
-
516
-
517
- /*******************************************************************/
518
- /* */
519
- /* Transducer::print_strings */
520
- /* */
521
- /*******************************************************************/
522
-
523
- int Transducer::print_strings( FILE *file, bool with_brackets )
524
-
525
- {
526
- char buffer[BUFFER_SIZE];
527
- incr_vmark();
528
- return print_strings_node( root_node(), buffer, 0, file, with_brackets );
529
- }
530
-
531
-
532
- /*******************************************************************/
533
- /* */
534
- /* Transducer::analyze_string */
535
- /* */
536
- /*******************************************************************/
537
-
538
- bool Transducer::analyze_string( char *string, FILE *file, bool with_brackets )
539
-
540
- {
541
- vector<Character> input;
542
- alphabet.string2symseq( string, input );
543
- vector<Label> labels;
544
- for( size_t i=0; i<input.size(); i++ )
545
- labels.push_back(Label(input[i]));
546
-
547
- Transducer a1(labels);
548
- Transducer *a2=&(*this || a1);
549
- Transducer *a3=&(a2->lower_level());
550
- delete a2;
551
- a2 = &a3->minimise();
552
- delete a3;
553
-
554
- a2->alphabet.copy(alphabet);
555
- bool result = a2->print_strings( file, with_brackets );
556
- delete a2;
557
- return result;
558
- }
559
-
560
-
561
- /*******************************************************************/
562
- /* */
563
- /* Transducer::generate_string */
564
- /* */
565
- /*******************************************************************/
566
-
567
- bool Transducer::generate_string( char *string, FILE *file, bool with_brackets)
568
-
569
- {
570
- Transducer a1(string, &alphabet, false);
571
- Transducer *a2=&(a1 || *this);
572
- Transducer *a3=&(a2->upper_level());
573
- delete a2;
574
- a2 = &a3->minimise();
575
- delete a3;
576
-
577
- a2->alphabet.copy(alphabet);
578
- bool result = a2->print_strings( file, with_brackets );
579
- delete a2;
580
- return result;
581
- }
582
-
583
-
584
- /*******************************************************************/
585
- /* */
586
- /* complete */
587
- /* */
588
- /*******************************************************************/
589
-
590
- static void complete( Node *node, Alphabet &alphabet, int vmark)
591
-
592
- {
593
- if (node->was_visited( vmark ))
594
- return;
595
- for( ArcsIter p(node->arcs()); p; p++ ) {
596
- Arc *arc=p;
597
- if (!arc->label().is_epsilon())
598
- alphabet.insert(arc->label());
599
- complete(arc->target_node(), alphabet, vmark);
600
- }
601
- }
602
-
603
-
604
- /*******************************************************************/
605
- /* */
606
- /* Transducer::complete_alphabet */
607
- /* */
608
- /*******************************************************************/
609
-
610
- void Transducer::complete_alphabet()
611
-
612
- {
613
- incr_vmark();
614
- complete(root_node(), alphabet, vmark);
615
- }
616
-
617
-
618
- /*******************************************************************/
619
- /* */
620
- /* print_node */
621
- /* */
622
- /*******************************************************************/
623
-
624
- static void print_node( ostream &s, Node *node, NodeNumbering &index,
625
- long vmark, Alphabet &abc )
626
-
627
- {
628
- if (!node->was_visited( vmark )) {
629
- Arcs *arcs=node->arcs();
630
- for( ArcsIter p(arcs); p; p++ ) {
631
- Arc *arc=p;
632
- s << index[node] << "\t" << index[arc->target_node()];
633
- s << "\t" << abc.write_char(arc->label().lower_char());
634
- s << "\t" << abc.write_char(arc->label().upper_char());
635
- s << "\n";
636
- }
637
- if (node->is_final())
638
- s << index[node] << "\n";
639
- for( ArcsIter p(arcs); p; p++ ) {
640
- Arc *arc=p;
641
- print_node( s, arc->target_node(), index, vmark, abc );
642
- }
643
- }
644
- }
645
-
646
-
647
- /*******************************************************************/
648
- /* */
649
- /* operator<< */
650
- /* */
651
- /*******************************************************************/
652
-
653
- ostream &operator<<( ostream &s, Transducer &a )
654
-
655
- {
656
- NodeNumbering index(a);
657
- a.incr_vmark();
658
- print_node( s, a.root_node(), index, a.vmark, a.alphabet );
659
- return s;
660
- }
661
-
662
-
663
- /*******************************************************************/
664
- /* */
665
- /* store_node_info */
666
- /* */
667
- /*******************************************************************/
668
-
669
- static void store_node_info( FILE *file, Node *node )
670
-
671
- {
672
- // write final flag
673
- char c=node->is_final();
674
- fwrite(&c,sizeof(c),1,file);
675
-
676
- // write the number of arcs
677
- int nn = node->arcs()->size();
678
- if (nn > 65535)
679
- throw "Error: in function store_node\n";
680
- unsigned short n=(unsigned short)nn;
681
- fwrite(&n,sizeof(n),1,file);
682
- }
683
-
684
-
685
- /*******************************************************************/
686
- /* */
687
- /* store_arc_label */
688
- /* */
689
- /*******************************************************************/
690
-
691
- static void store_arc_label( FILE *file, Arc *arc )
692
-
693
- {
694
- Label l=arc->label();
695
- Character lc=l.lower_char();
696
- Character uc=l.upper_char();
697
- fwrite(&lc,sizeof(lc),1,file);
698
- fwrite(&uc,sizeof(uc),1,file);
699
- }
700
-
701
-
702
- /*******************************************************************/
703
- /* */
704
- /* store_node */
705
- /* */
706
- /*******************************************************************/
707
-
708
- static void store_node( FILE *file, Node *node, NodeNumbering &index,
709
- long vmark )
710
- {
711
- if (!node->was_visited( vmark )) {
712
-
713
- store_node_info( file, node );
714
-
715
- // write the arcs
716
- for( ArcsIter p(node->arcs()); p; p++ ) {
717
- Arc *arc=p;
718
- store_arc_label( file, arc );
719
- unsigned int t=index[arc->target_node()];
720
- fwrite(&t,sizeof(t),1,file);
721
- store_node(file, arc->target_node(), index, vmark );
722
- }
723
- }
724
- }
725
-
726
-
727
- /*******************************************************************/
728
- /* */
729
- /* store_lowmem_node */
730
- /* */
731
- /*******************************************************************/
732
-
733
- static void store_lowmem_node( FILE *file, Node *node, NodeNumbering &index,
734
- vector<unsigned int> &startpos)
735
- {
736
- store_node_info( file, node );
737
-
738
- // write the arcs
739
- for( ArcsIter p(node->arcs()); p; p++ ) {
740
- Arc *arc=p;
741
- store_arc_label( file, arc );
742
- unsigned int t=startpos[index[arc->target_node()]];
743
- fwrite(&t,sizeof(t),1,file);
744
- }
745
- }
746
-
747
-
748
- /*******************************************************************/
749
- /* */
750
- /* Transducer::store_lowmem */
751
- /* */
752
- /*******************************************************************/
753
-
754
- void Transducer::store_lowmem( FILE *file )
755
-
756
- {
757
- fputc('l',file);
758
- alphabet.store(file);
759
-
760
- // storing size of index table
761
- NodeNumbering index(*this);
762
-
763
- // compute the start position of the first node
764
- unsigned int pos=(unsigned int)ftell(file);
765
- vector<unsigned int> startpos;
766
- for( size_t i=0; i<index.number_of_nodes(); i++ ) {
767
- startpos.push_back(pos);
768
- Node *node=index.get_node(i);
769
- Arcs *arcs=node->arcs();
770
- pos += sizeof(char) // size of final flag
771
- + sizeof(unsigned short) // size of number of arcs
772
- + arcs->size() * (sizeof(Character) * 2 + sizeof(unsigned int)); // size of n arcs
773
- }
774
-
775
- // storing nodes
776
- for( size_t i=0; i<index.number_of_nodes(); i++ )
777
- store_lowmem_node( file, index.get_node(i), index, startpos );
778
- }
779
-
780
-
781
- /*******************************************************************/
782
- /* */
783
- /* Transducer::store */
784
- /* */
785
- /*******************************************************************/
786
-
787
- void Transducer::store( FILE *file )
788
-
789
- {
790
- fputc('a',file);
791
-
792
- NodeNumbering index(*this);
793
- incr_vmark();
794
- unsigned int n=index.number_of_nodes();
795
- fwrite(&n,sizeof(n),1,file);
796
- store_node( file, root_node(), index, vmark );
797
-
798
- alphabet.store(file);
799
- }
800
-
801
-
802
- /*******************************************************************/
803
- /* */
804
- /* read_node */
805
- /* */
806
- /*******************************************************************/
807
-
808
- static void read_node( FILE *file, Node *node, Node **p, Transducer *a )
809
- {
810
- char c;
811
- fread(&c,sizeof(c),1,file);
812
- node->set_final(c);
813
-
814
- unsigned short n;
815
- fread( &n, sizeof(n), 1, file);
816
-
817
- for( int i=0; i<n; i++ ) {
818
- Character lc,uc;
819
- unsigned int t;
820
- fread(&lc,sizeof(lc),1,file);
821
- fread(&uc,sizeof(uc),1,file);
822
- fread(&t,sizeof(t),1,file);
823
- if (ferror(file))
824
- throw "Error encountered while reading transducer from file";
825
- if (p[t])
826
- node->add_arc( Label(lc,uc), p[t], a );
827
- else {
828
- p[t] = a->new_node();
829
- node->add_arc( Label(lc,uc), p[t], a );
830
- read_node(file, p[t], p, a );
831
- }
832
- }
833
- }
834
-
835
-
836
- /*******************************************************************/
837
- /* */
838
- /* Transducer::read_transducer_binary */
839
- /* */
840
- /*******************************************************************/
841
-
842
- void Transducer::read_transducer_binary( FILE *file )
843
-
844
- {
845
- if (fgetc(file) != 'a')
846
- throw "Error: wrong file format (not a standard transducer)\n";
847
-
848
- vmark = deterministic = 0;
849
- unsigned int n;
850
- fread(&n,sizeof(n),1,file); // number of nodes
851
- if (ferror(file))
852
- throw "Error encountered while reading transducer from file";
853
-
854
- Node **p=new Node*[n]; // maps indices to nodes
855
- p[0] = root_node();
856
- for( unsigned int i=1; i<n; i++)
857
- p[i] = NULL;
858
- read_node( file, root_node(), p, this );
859
- delete[] p;
860
-
861
- alphabet.read(file);
862
-
863
- vmark = 1;
864
- deterministic = minimised = 1;
865
- }
866
-
867
-
868
/*******************************************************************/
/*                                                                 */
/*  error_message                                                  */
/*                                                                 */
/*  Formats an error message which names `line` and throws it.     */
/*  This function never returns normally.                          */
/*                                                                 */
/*******************************************************************/

static void error_message( size_t line )

{
  // static storage: the thrown pointer must remain valid after the
  // stack of this function has been unwound
  static char message[1000];

  const unsigned int shown_line = (unsigned int)line;
  snprintf( message, sizeof(message),
            "Error: in line %u of text transducer file", shown_line );

  throw message;
}
882
-
883
-
884
- /*******************************************************************/
885
- /* */
886
- /* Transducer::create_node */
887
- /* */
888
- /*******************************************************************/
889
-
890
- Node *Transducer::create_node( vector<Node*> &node, char *s, size_t line )
891
-
892
- {
893
- char *p;
894
- long n = strtol(s, &p, 10);
895
-
896
- if (s == p || n < 0)
897
- error_message( line );
898
- if ((long)node.size() <= n)
899
- node.resize(n+1, NULL);
900
- if (node[n] == NULL)
901
- node[n] = new Node;
902
-
903
- return node[n];
904
- }
905
-
906
-
907
- /*******************************************************************/
908
- /* */
909
- /* next_string */
910
- /* */
911
- /*******************************************************************/
912
-
913
- static char *next_string( char* &s, size_t line )
914
-
915
- {
916
- // scan the input up to the next tab or newline character
917
- // and unquote symbols preceded by a backslash
918
- char *p = s;
919
- char *q = s;
920
- while (*q!=0 && *q!='\t' && *q!='\n' && *q!='\r') {
921
- if (*q == '\\')
922
- q++;
923
- *(p++) = *(q++);
924
- }
925
- if (p == s)
926
- error_message(line); // no string found
927
-
928
- char *result=s;
929
- // skip over following whitespace
930
- while (*q == ' ' || *q == '\t' || *q == '\n' || *q == '\r')
931
- q++;
932
-
933
- if (*q == 0)
934
- s = NULL; // end of string was reached
935
- else
936
- s = q; // move the string pointer s
937
-
938
- *p = 0; // mark the end of the result string
939
-
940
- return result;
941
- }
942
-
943
-
944
- /*******************************************************************/
945
- /* */
946
- /* Transducer::read_transducer_text */
947
- /* */
948
- /*******************************************************************/
949
-
950
- void Transducer::read_transducer_text( FILE *file )
951
-
952
- {
953
- vector<Node*> nodes;
954
- nodes.push_back(root_node());
955
-
956
- vmark = deterministic = 0;
957
- char buffer[10000];
958
- for( size_t line=0; fgets(buffer, 10000, file ); line++ ) {
959
- char *p = buffer;
960
- char *s = next_string(p, line);
961
- Node *node = create_node( nodes, s, line );
962
- if (p == NULL)
963
- node->set_final(true);
964
- else {
965
- s = next_string(p, line);
966
- Node *target = create_node( nodes, s, line );
967
-
968
- s = next_string(p, line);
969
- Character lc = alphabet.add_symbol(s);
970
- s = next_string(p, line);
971
- Character uc = alphabet.add_symbol(s);
972
- Label l(lc,uc);
973
- if (l == Label::epsilon)
974
- error_message( line );
975
-
976
- alphabet.insert(l);
977
- node->add_arc( l, target, this );
978
- }
979
- }
980
-
981
- vmark = 1;
982
- deterministic = minimised = 1;
983
- }
984
-
985
-
986
- /*******************************************************************/
987
- /* */
988
- /* Transducer::Transducer */
989
- /* */
990
- /*******************************************************************/
991
-
992
- Transducer::Transducer( FILE *file, bool binary )
993
-
994
- {
995
- if (binary)
996
- read_transducer_binary( file );
997
- else
998
- read_transducer_text( file );
999
- }
1000
-