ruby-sfst 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,94 @@
1
+ /*******************************************************************/
2
+ /* */
3
+ /* FILE interface.h */
4
+ /* MODULE interface */
5
+ /* PROGRAM SFST */
6
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
7
+ /* */
8
+ /*******************************************************************/
9
+
10
+ #ifndef _INTERFACE_H_
11
+ #define _INTERFACE_H_
12
+
13
+ #include "utf8.h"
14
+ #include "fst.h"
15
+
16
/* Rule-type selector accepted by restriction() and make_rule(). */
enum Twol_Type {
  twol_left,
  twol_right,
  twol_both
};

/* Replacement-type selector accepted by replace() and
   replace_in_context(). */
enum Repl_Type {
  repl_left,
  repl_right,
  repl_up,
  repl_down
};
19
+
20
+ typedef struct range_t {
21
+ Character character;
22
+ struct range_t *next;
23
+ } Range;
24
+
25
+ typedef struct ranges_t {
26
+ Range *range;
27
+ struct ranges_t *next;
28
+ } Ranges;
29
+
30
+
31
+ typedef struct contexts_t {
32
+ Transducer *left, *right;
33
+ struct contexts_t *next;
34
+ } Contexts;
35
+
36
+
37
+ extern bool Verbose;
38
+ extern bool UTF8;
39
+ extern char *FileName;
40
+ extern Alphabet TheAlphabet;
41
+
42
+ void error2( char *message, char *input );
43
+ Transducer *new_transducer( Range*, Range* );
44
+ Transducer *read_words( char *filename );
45
+ Transducer *read_transducer( char *filename );
46
+ Transducer *var_value( char *name );
47
+ Transducer *rvar_value( char *name );
48
+ Range *svar_value( char *name );
49
+ Range *complement_range( Range* );
50
+ Range *rsvar_value( char *name );
51
+ Character character_code( unsigned int uc );
52
+ Character symbol_code( char *s );
53
+
54
+ Range *add_value( Character, Range*);
55
+ Range *add_var_values( char *name, Range*);
56
+ Range *add_values( unsigned int, unsigned int, Range*);
57
+ Range *append_values( Range *r2, Range *r );
58
+ void add_alphabet( Transducer* );
59
+ void store_transducer( Transducer *a, char *filename );
60
+
61
+ // These functions delete their argument automata
62
+
63
+ void def_alphabet( Transducer *a );
64
+ bool def_var( char *name, Transducer *a );
65
+ bool def_rvar( char *name, Transducer *a );
66
+ bool def_svar( char *name, Range *r );
67
+ Transducer *explode( Transducer *a );
68
+ Transducer *catenate( Transducer *a1, Transducer *a2 );
69
+ Transducer *disjunction( Transducer *a1, Transducer *a2 );
70
+ Transducer *conjunction( Transducer *a1, Transducer *a2 );
71
+ Transducer *subtraction( Transducer *a1, Transducer *a2 );
72
+ Transducer *composition( Transducer *a1, Transducer *a2 );
73
+ Transducer *restriction( Transducer *a, Twol_Type type, Contexts *c, int );
74
+ Transducer *replace( Transducer *a, Repl_Type type, bool optional );
75
+ Transducer *replace_in_context( Transducer *a, Repl_Type type, Contexts *c, bool optional );
76
+ Transducer *negation( Transducer *a );
77
+ Transducer *upper_level( Transducer *a );
78
+ Transducer *lower_level( Transducer *a );
79
+ Transducer *minimise( Transducer *a );
80
+ Transducer *switch_levels( Transducer *a );
81
+ Transducer *repetition( Transducer *a );
82
+ Transducer *repetition2( Transducer *a );
83
+ Transducer *optional( Transducer *a );
84
+ Transducer *make_rule( Transducer *lc, Range *r1, Twol_Type type,
85
+ Range *r2, Transducer *rc );
86
+ Transducer *freely_insert( Transducer *a, Character c, Character c );
87
+ Transducer *make_mapping( Ranges*, Ranges* );
88
+ Ranges *add_range( Range*, Ranges* );
89
+ Contexts *make_context( Transducer *l, Transducer *r );
90
+ Contexts *add_context( Contexts *nc, Contexts *c );
91
+ Transducer *result( Transducer*, bool );
92
+ void write_to_file( Transducer*, char *filename);
93
+
94
+ #endif
@@ -0,0 +1,328 @@
1
+ /*******************************************************************/
2
+ /* */
3
+ /* FILE make-compact.C */
4
+ /* MODULE make-compact */
5
+ /* PROGRAM SFST */
6
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
7
+ /* */
8
+ /* PURPOSE Code needed for generating compact automata */
9
+ /* */
10
+ /*******************************************************************/
11
+
12
+ #include <math.h>
13
+
14
+ #include "make-compact.h"
15
+
16
+ using std::equal_range;
17
+ using std::sort;
18
+ using std::cerr;
19
+ using __gnu_cxx::hash_map;
20
+
21
+ class ARC {
22
+ public:
23
+ int cv;
24
+ Label label;
25
+ unsigned int target_node;
26
+
27
+ bool operator< ( const ARC a ) const {
28
+ return cv < a.cv;
29
+ };
30
+ };
31
+
32
+ typedef hash_map<Label, size_t, Label::label_hash, Label::label_eq> LabelNumber;
33
+
34
+
35
+ /*******************************************************************/
36
+ /* */
37
+ /* MakeCompactTransducer::sort */
38
+ /* */
39
+ /*******************************************************************/
40
+
41
+ void MakeCompactTransducer::sort( Level level )
42
+
43
+ {
44
+ for( unsigned int n=0; n<number_of_nodes; n++) {
45
+ unsigned int from=first_arc[n];
46
+ unsigned int to=first_arc[n+1];
47
+ int l=to-from;
48
+
49
+ // copy the arcs to a temporary table
50
+ ARC *arc=new ARC[l];
51
+ for( unsigned int i=from; i<to; i++) {
52
+ arc[i-from].cv = (int)label[i].get_char(level);
53
+ // make sure that epsilon arcs are stored at the beginning
54
+ // even if epsilon is not 0
55
+ if (arc[i-from].cv == (int)Label::epsilon)
56
+ arc[i-from].cv = -1;
57
+ arc[i-from].label = label[i];
58
+ arc[i-from].target_node = target_node[i];
59
+ }
60
+
61
+ // sort the table
62
+ ::sort( arc, arc+l );
63
+
64
+ // copy the arcs back to the original table
65
+ for( unsigned int i=from; i<to; i++) {
66
+ label[i] = arc[i-from].label;
67
+ target_node[i] = arc[i-from].target_node;
68
+ }
69
+
70
+ delete[] arc;
71
+ }
72
+ }
73
+
74
+
75
+ /*******************************************************************/
76
+ /* */
77
+ /* MakeCompactTransducer::count_arcs */
78
+ /* */
79
+ /*******************************************************************/
80
+
81
+ void MakeCompactTransducer::count_arcs( Node *node, NodeNumbering &index,
82
+ long vmark )
83
+ {
84
+ if (!node->was_visited( vmark )) {
85
+ unsigned int n = index[node];
86
+ finalp[n] = node->is_final();
87
+ first_arc[n] = 0;
88
+ Arcs *arcs=node->arcs();
89
+ for( ArcsIter p(arcs); p; p++ ) {
90
+ Arc *arc=p;
91
+ first_arc[n]++;
92
+ count_arcs(arc->target_node(), index, vmark);
93
+ }
94
+ }
95
+ }
96
+
97
+
98
+ /*******************************************************************/
99
+ /* */
100
+ /* MakeCompactTransducer::store_arcs */
101
+ /* */
102
+ /*******************************************************************/
103
+
104
+ void MakeCompactTransducer::store_arcs( Node *node, NodeNumbering &index,
105
+ long vmark )
106
+ {
107
+ if (!node->was_visited( vmark )) {
108
+ unsigned int n=first_arc[index[node]];
109
+ Arcs *arcs=node->arcs();
110
+ for( ArcsIter p(arcs); p; p++ ) {
111
+ Arc *arc=p;
112
+ label[n] = arc->label();
113
+ target_node[n++] = index[arc->target_node()];
114
+ store_arcs(arc->target_node(), index, vmark);
115
+ }
116
+ }
117
+ }
118
+
119
+
120
+ /*******************************************************************/
121
+ /* */
122
+ /* MakeCompactTransducer::MakeCompactTransducer */
123
+ /* */
124
+ /*******************************************************************/
125
+
126
+ MakeCompactTransducer::MakeCompactTransducer( Transducer &a, Level l )
127
+
128
+ {
129
+ if (a.is_infinitely_ambiguous()) {
130
+ cerr << "Error: resulting transducer contains an infinite loop!\n";
131
+ exit(1);
132
+ }
133
+
134
+ NodeNumbering index(a);
135
+
136
+ alphabet.copy(a.alphabet);
137
+
138
+ // memory allocation
139
+ number_of_nodes = index.number_of_nodes();
140
+ finalp = new char[number_of_nodes];
141
+ first_arc = new unsigned int[number_of_nodes+1];
142
+
143
+ // count the number of outgoing arcs for each node
144
+ // and store them in first_arc[]
145
+ a.incr_vmark();
146
+ count_arcs( a.root_node(), index, a.vmark );
147
+ for( int n=number_of_nodes; n>0; n-- )
148
+ first_arc[n] = first_arc[n-1];
149
+ first_arc[0] = 0;
150
+ for( unsigned int n=0; n<number_of_nodes; n++ )
151
+ first_arc[n+1] += first_arc[n];
152
+ number_of_arcs = first_arc[number_of_nodes];
153
+
154
+ // memory allocation
155
+ label = new Label[number_of_arcs];
156
+ target_node = new unsigned int[number_of_arcs];
157
+
158
+ // store the arcs
159
+ a.incr_vmark();
160
+ store_arcs( a.root_node(), index, a.vmark );
161
+
162
+ // sort the arcs
163
+ sort( l );
164
+ }
165
+
166
+
167
+ /*******************************************************************/
168
+ /* */
169
+ /* MakeCompactTransducer::store_finalp */
170
+ /* */
171
+ /*******************************************************************/
172
+
173
+ void MakeCompactTransducer::store_finalp( FILE *file )
174
+
175
+ {
176
+ int k=0;
177
+ unsigned char n=0;
178
+
179
+ for( size_t i=0; i<number_of_nodes; i++ ) {
180
+ n = n << 1;
181
+ if (finalp[i])
182
+ n |= 1;
183
+ if (++k == 8) {
184
+ fputc(n, file);
185
+ n = 0;
186
+ k = 0;
187
+ }
188
+ }
189
+ if (k > 0) {
190
+ n <<= 8-k;
191
+ fputc(n, file);
192
+ }
193
+ }
194
+
195
+
196
+ /*******************************************************************/
197
+ /* */
198
+ /* MakeCompactTransducer::store_first_arcs */
199
+ /* */
200
+ /* The data is encoded with the minimal number of bits needed. */
201
+ /* */
202
+ /*******************************************************************/
203
+
204
+ void MakeCompactTransducer::store_first_arcs( FILE *file )
205
+
206
+ {
207
+ int k=0;
208
+ unsigned int n=0;
209
+ // compute number of bits required for storing each item
210
+ size_t bits=(size_t)ceil(log(number_of_arcs+1)/log(2));
211
+
212
+ for( size_t i=0; i<=number_of_nodes; i++ ) {
213
+ unsigned int m=first_arc[i];
214
+ m <<= (sizeof(n)*8) - bits;
215
+ m >>= k;
216
+ n = n | m;
217
+ k += bits;
218
+ if (k >= (int)sizeof(n)*8) {
219
+ fwrite(&n, sizeof(n), 1, file);
220
+ k -= sizeof(n) * 8;
221
+ n = first_arc[i];
222
+ if (k == 0)
223
+ n = 0;
224
+ else
225
+ n = first_arc[i] << (sizeof(n) * 8 - k);
226
+ }
227
+ }
228
+ if (k > 0)
229
+ fwrite(&n, sizeof(n), 1, file);
230
+ }
231
+
232
+
233
+ /*******************************************************************/
234
+ /* */
235
+ /* MakeCompactTransducer::store_target_nodes */
236
+ /* */
237
+ /*******************************************************************/
238
+
239
+ void MakeCompactTransducer::store_target_nodes( FILE *file )
240
+
241
+ {
242
+ int k=0;
243
+ unsigned int n=0;
244
+ size_t bits=(size_t)ceil(log(number_of_nodes)/log(2));
245
+
246
+ for( size_t i=0; i<number_of_arcs; i++ ) {
247
+ unsigned int m=target_node[i];
248
+ m <<= (sizeof(n)*8) - bits;
249
+ m >>= k;
250
+ n = n | m;
251
+ k += bits;
252
+ if (k >= (int)sizeof(n)*8) {
253
+ fwrite(&n, sizeof(n), 1, file);
254
+ k -= sizeof(n)*8;
255
+ if (k == 0)
256
+ n = 0;
257
+ else
258
+ n = target_node[i] << (sizeof(n) * 8 - k);
259
+ }
260
+ }
261
+ if (k > 0)
262
+ fwrite(&n, sizeof(n), 1, file);
263
+ }
264
+
265
+
266
+ /*******************************************************************/
267
+ /* */
268
+ /* MakeCompactTransducer::store_labels */
269
+ /* */
270
+ /*******************************************************************/
271
+
272
+ void MakeCompactTransducer::store_labels( FILE *file )
273
+
274
+ {
275
+ size_t N=0;
276
+ LabelNumber LNum;
277
+ for( Alphabet::const_iterator it=alphabet.begin();
278
+ it != alphabet.end(); it++ )
279
+ {
280
+ Label l=*it;
281
+ LNum[l] = N++;
282
+ }
283
+
284
+ int k=0;
285
+ unsigned int n=0;
286
+ size_t bits=(size_t)ceil(log(alphabet.size())/log(2));
287
+
288
+ for( size_t i=0; i<number_of_arcs; i++ ) {
289
+ unsigned int l = LNum[label[i]];
290
+ unsigned int m=l;
291
+ m = m << (sizeof(n)*8) - bits;
292
+ m = m >> k;
293
+ n = n | m;
294
+ k += bits;
295
+ if (k >= (int)sizeof(n)*8) {
296
+ fwrite(&n, sizeof(n), 1, file);
297
+ k -= sizeof(n)*8;
298
+ if (k == 0)
299
+ n = 0;
300
+ else
301
+ n = l << (sizeof(n) * 8 - k);
302
+ }
303
+ }
304
+ if (k > 0)
305
+ fwrite(&n, sizeof(n), 1, file);
306
+ }
307
+
308
+
309
+ /*******************************************************************/
310
+ /* */
311
+ /* MakeCompactTransducer::store */
312
+ /* */
313
+ /*******************************************************************/
314
+
315
+ void MakeCompactTransducer::store( FILE *file )
316
+
317
+ {
318
+ fputc('c',file);
319
+ alphabet.store(file);
320
+ fwrite(&number_of_nodes, sizeof(number_of_nodes), 1, file);
321
+ fwrite(&number_of_arcs, sizeof(number_of_arcs), 1, file);
322
+ store_finalp(file);
323
+ store_first_arcs(file);
324
+ store_labels(file);
325
+ store_target_nodes(file);
326
+ if (ferror(file))
327
+ throw "Error encountered while writing transducer to file\n";
328
+ }
@@ -0,0 +1,34 @@
1
+ /*******************************************************************/
2
+ /* */
3
+ /* FILE make-compact.h */
4
+ /* MODULE make-compact */
5
+ /* PROGRAM SFST */
6
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
7
+ /* */
8
+ /*******************************************************************/
9
+
10
+ #ifndef _MAKE_COMPACT_H_
11
+ #define _MAKE_COMPACT_H_
12
+
13
+ #include "fst.h"
14
+ #include "compact.h"
15
+
16
+
17
+ class MakeCompactTransducer : CompactTransducer {
18
+
19
+ private:
20
+ void count_arcs(Node *node, NodeNumbering &index, long vmark);
21
+ void store_arcs(Node *node, NodeNumbering &index, long vmark);
22
+ void store_finalp( FILE *file );
23
+ void store_first_arcs( FILE *file );
24
+ void store_target_nodes( FILE *file );
25
+ void store_labels( FILE *file );
26
+
27
+ public:
28
+ MakeCompactTransducer( Transducer &a, Level sort=upper );
29
+
30
+ void sort( Level );
31
+ void store( FILE *file );
32
+ };
33
+
34
+ #endif
@@ -0,0 +1,74 @@
1
+ /*******************************************************************/
2
+ /* */
3
+ /* FILE mem.h */
4
+ /* MODULE mem */
5
+ /* PROGRAM SFST */
6
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
7
+ /* */
8
+ /* PURPOSE memory management functions */
9
+ /* */
10
+ /*******************************************************************/
11
+
12
+ #ifndef _MEM_H_
13
+ #define _MEM_H_
14
+
15
+ #include <stdlib.h>
16
+
17
#define MEMBUFFER_SIZE 100000


/*****************  class Mem  *************************************/

/* Simple region allocator.  Memory is handed out sequentially from
   buffers of MEMBUFFER_SIZE bytes.  Individual blocks cannot be given
   back; clear() (or the destructor) releases all buffers at once.

   NOTE(review): the class owns raw memory but has no user-defined copy
   constructor or assignment operator, so copying a Mem object would
   free the buffers twice.  Copying was not disabled here because other
   code may rely on the class being formally copyable -- confirm. */

class Mem {

 private:

  struct MemBuffer {
    /* "buffer" must stay the first member: it then starts at the address
       returned by malloc and is suitably aligned for any object */
    char buffer[MEMBUFFER_SIZE];
    struct MemBuffer *next;
  };

  /* sizeof(AlignUnit) is the granularity of all allocations; it covers
     the alignment of pointers, longs and doubles */
  union AlignUnit {
    void *p;
    long l;
    double d;
  };

  MemBuffer *first_buffer;   /* buffer currently in use (head of the list) */
  long pos;                  /* offset of the first free byte in it */

  /* Puts a new, empty buffer at the head of the list.
     Throws a C string if no memory is available. */
  void add_buffer() {
    MemBuffer *mb=(MemBuffer*)malloc(sizeof(MemBuffer));
    if (mb == NULL)
      throw "Out of memory while allocating a new memory buffer!";
    mb->next = first_buffer;
    first_buffer = mb;
    pos = 0;
  }

 public:
  Mem() { first_buffer = NULL; add_buffer(); }
  ~Mem() { clear(); }

  /* Releases all buffers.  Every block obtained from alloc() becomes
     invalid.  The object remains usable; the next alloc() creates a
     new buffer. */
  void clear() {
    while (first_buffer) {
      MemBuffer *next = first_buffer->next;
      free(first_buffer);
      first_buffer = next;
    }
    pos = 0;
  }

  /* Returns a block of at least n bytes whose address is a multiple of
     sizeof(AlignUnit).  Throws a C string if n exceeds MEMBUFFER_SIZE
     or if no memory is available. */
  void *alloc( size_t n ) {
    void *result;
    const size_t unit = sizeof(AlignUnit);

    /* reject oversized requests first: this keeps the rounding below
       from wrapping around and avoids adding a buffer that cannot be
       used for the request anyway */
    if (n > MEMBUFFER_SIZE)
      throw "Allocation of memory block larger than MEMBUFFER_SIZE attempted!";

    /* do memory alignment to multiples of the alignment unit */
    if (n % unit)
      n += unit - (n % unit);

    if (first_buffer == NULL || (size_t)pos+n > MEMBUFFER_SIZE)
      add_buffer();
    /* can only trigger if MEMBUFFER_SIZE is not a multiple of the unit */
    if ((size_t)pos+n > MEMBUFFER_SIZE)
      throw "Allocation of memory block larger than MEMBUFFER_SIZE attempted!";

    result = (void*)(first_buffer->buffer + pos);
    pos += n;
    return result;
  }

  //class MemError {};

};
73
+
74
+ #endif