ruby-sfst 0.4.3 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +1 -0
  3. data/COPYING +280 -0
  4. data/Gemfile +3 -0
  5. data/Gemfile.lock +54 -0
  6. data/README.md +1 -1
  7. data/Rakefile +9 -18
  8. data/bin/console +7 -0
  9. data/bin/setup +6 -0
  10. data/ext/sfst/alphabet.cc +879 -0
  11. data/ext/sfst/alphabet.h +302 -0
  12. data/ext/sfst/basic.cc +85 -0
  13. data/ext/{sfst_machine → sfst}/basic.h +7 -4
  14. data/ext/sfst/compact.cc +629 -0
  15. data/ext/sfst/compact.h +100 -0
  16. data/ext/sfst/determinise.cc +279 -0
  17. data/ext/{sfst_machine → sfst}/extconf.rb +2 -1
  18. data/ext/sfst/fst.cc +1150 -0
  19. data/ext/sfst/fst.h +374 -0
  20. data/ext/sfst/hopcroft.cc +681 -0
  21. data/ext/sfst/interface.cc +1921 -0
  22. data/ext/sfst/interface.h +171 -0
  23. data/ext/sfst/make-compact.cc +323 -0
  24. data/ext/{sfst_machine → sfst}/make-compact.h +15 -13
  25. data/ext/sfst/mem.h +80 -0
  26. data/ext/sfst/operators.cc +1273 -0
  27. data/ext/{sfst_machine → sfst}/sfst_machine.cc +89 -78
  28. data/ext/sfst/sgi.h +72 -0
  29. data/ext/sfst/utf8.cc +149 -0
  30. data/ext/{sfst_machine → sfst}/utf8.h +7 -4
  31. data/lib/sfst.rb +2 -1
  32. data/lib/sfst/version.rb +1 -1
  33. data/ruby-sfst.gemspec +23 -23
  34. metadata +107 -35
  35. data/ext/sfst_machine/alphabet.cc +0 -812
  36. data/ext/sfst_machine/alphabet.h +0 -273
  37. data/ext/sfst_machine/basic.cc +0 -84
  38. data/ext/sfst_machine/compact.cc +0 -616
  39. data/ext/sfst_machine/compact.h +0 -98
  40. data/ext/sfst_machine/determinise.cc +0 -303
  41. data/ext/sfst_machine/fst.cc +0 -1000
  42. data/ext/sfst_machine/fst.h +0 -369
  43. data/ext/sfst_machine/interface.cc +0 -1842
  44. data/ext/sfst_machine/interface.h +0 -93
  45. data/ext/sfst_machine/make-compact.cc +0 -327
  46. data/ext/sfst_machine/mem.h +0 -74
  47. data/ext/sfst_machine/operators.cc +0 -1131
  48. data/ext/sfst_machine/sgi.h +0 -44
  49. data/ext/sfst_machine/utf8.cc +0 -146
  50. data/test/test_sfst.fst +0 -3
  51. data/test/test_sfst.rb +0 -114
@@ -0,0 +1,171 @@
1
+ /*******************************************************************/
2
+ /* */
3
+ /* FILE interface.h */
4
+ /* MODULE interface */
5
+ /* PROGRAM SFST */
6
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
7
+ /* */
8
+ /*******************************************************************/
9
+
10
+ #ifndef _INTERFACE_H_
11
+ #define _INTERFACE_H_
12
+
13
+ #include "utf8.h"
14
+ #include "fst.h"
15
+
16
+ using std::set;
17
+ using std::cerr;
18
+
19
+ namespace SFST {
20
+
21
// Error-reporting entry points. Only the prototypes are visible in this
// header; the definitions live in another translation unit.
// NOTE(review): whether these return to the caller cannot be told from
// here -- confirm against the definitions before relying on it.

// Takes a single message string.
void error( const char *message );

// Takes a message string plus a mutable character buffer.
void error2( const char *message, char *input );
23
+
24
// Selector passed as the `type` argument of Interface::restriction(),
// Interface::make_rule() and the private Interface::result_transducer().
typedef enum {
  twol_left,
  twol_right,
  twol_both
} Twol_Type;

// Selector passed as the `type` argument of Interface::replace(),
// Interface::replace_in_context() and the private replace helpers.
// The enumerator order fixes the numeric values and must not change.
typedef enum {
  repl_left,
  repl_right,
  repl_up,
  my_repl_down,
  repl_down
} Repl_Type;
27
+
28
+ typedef struct range_t {
29
+ Character character;
30
+ struct range_t *next;
31
+ } Range;
32
+
33
+ typedef struct ranges_t {
34
+ Range *range;
35
+ struct ranges_t *next;
36
+ } Ranges;
37
+
38
+
39
+ typedef struct contexts_t {
40
+ Transducer *left, *right;
41
+ struct contexts_t *next;
42
+ } Contexts;
43
+
44
+
45
+ /***************** class Interface *******************************/
46
+
47
+ class Interface {
48
+
49
+ private:
50
+ struct ltstr {
51
+ bool operator()(const char* s1, const char* s2) const
52
+ { return strcmp(s1, s2) < 0; }
53
+ };
54
+
55
+ typedef set<char*, ltstr> RVarSet;
56
+
57
+ typedef map<char*, Transducer*, ltstr> VarMap;
58
+ typedef map<char*, Range*, ltstr> SVarMap;
59
+
60
+ Range *copy_values( const Range *r );
61
+ Transducer *one_label_transducer( Label l );
62
+ void add_pi_transitions( Transducer *t, Node *node, Alphabet &alph );
63
+ Transducer *pi_machine( Alphabet &alph );
64
+ Transducer *empty_string_transducer( void );
65
+ Transducer *cp( Range *lower_range, Range *upper_range );
66
+ Transducer *anti_cp( Range *lower_range, Range *upper_range );
67
+ Transducer *twol_right_rule( Transducer *lc, Range *lower_range,
68
+ Range *upper_range, Transducer *rc);
69
+ Transducer *twol_left_rule( Transducer *lc, Range *lower_range,
70
+ Range *upper_range, Transducer *rc );
71
+ Transducer *restriction_transducer( Transducer *l1, Transducer *l2,
72
+ Character marker );
73
+ Transducer *marker_transducer( Transducer *t, Contexts *c,
74
+ Character &marker );
75
+ Transducer *center_transducer( Transducer *t, Transducer *pi,
76
+ Transducer *mt );
77
+ Transducer *context_transducer( Transducer *t, Transducer *pi,
78
+ Transducer *mt, Contexts *c );
79
+ Transducer *insert_boundary_transducer( Character leftm, Character rightm,
80
+ Alphabet &alph );
81
+ Transducer *remove_boundary_transducer( Character leftm, Character rightm,
82
+ Alphabet &alph );
83
+ Transducer *constrain_boundary_transducer( Character leftm, Character rm,
84
+ Alphabet &alph );
85
+ Transducer *extended_left_transducer( Transducer *t, Character m1,
86
+ Character m2, Alphabet& );
87
+ Transducer *left_context( Transducer *t, Character m1, Character m2, Alphabet& );
88
+ Transducer *right_context( Transducer *t, Character m1, Character m2, Alphabet& );
89
+ Transducer *make_optional( Transducer *t, Repl_Type type );
90
+ Transducer *replace_transducer( Transducer *ct, Character lm,
91
+ Character rm, Repl_Type type );
92
+
93
+ Transducer *result_transducer( Transducer *l1, Transducer *l2,
94
+ Twol_Type type, Character marker );
95
+
96
+ VarMap VM;
97
+ SVarMap SVM;
98
+ RVarSet RS;
99
+ RVarSet RSS;
100
+
101
+ public:
102
+ bool Verbose;
103
+ bool Alphabet_Defined;
104
+ bool LexiconComments;
105
+ Alphabet TheAlphabet;
106
+
107
+ Interface( bool utf8=false, bool verbose=false ) :
108
+ Verbose(verbose), Alphabet_Defined(false), LexiconComments(false)
109
+ {
110
+ TheAlphabet.utf8 = utf8;
111
+ }
112
+
113
+ void allow_lexicon_comments() { LexiconComments = true; }
114
+
115
+ Transducer *new_transducer( Range*, Range* );
116
+ Transducer *read_words( char *filename );
117
+ Transducer *read_transducer( char *filename );
118
+ Transducer *var_value( char *name );
119
+ Transducer *rvar_value( char *name );
120
+ Range *svar_value( char *name );
121
+ Range *complement_range( Range* );
122
+ Range *rsvar_value( char *name );
123
+ Character character_code( unsigned int uc );
124
+ Character symbol_code( char *s );
125
+
126
+ bool in_range( unsigned int c, Range *r );
127
+ Transducer *make_transducer( Range *r1, Range *r2 );
128
+
129
+ Range *add_value( Character, Range*);
130
+
131
+ Range *add_var_values( char *name, Range*);
132
+ Range *add_values( unsigned int, unsigned int, Range*);
133
+ Range *append_values( Range *r2, Range *r );
134
+ void add_alphabet( Transducer* );
135
+
136
+ // These functions delete their argument automata
137
+
138
+ void def_alphabet( Transducer *a );
139
+ bool def_var( char *name, Transducer *a );
140
+ bool def_rvar( char *name, Transducer *a );
141
+ bool def_svar( char *name, Range *r );
142
+ Transducer *explode( Transducer *a );
143
+ Transducer *catenate( Transducer *a1, Transducer *a2 );
144
+ Transducer *disjunction( Transducer *a1, Transducer *a2 );
145
+ Transducer *conjunction( Transducer *a1, Transducer *a2 );
146
+ Transducer *subtraction( Transducer *a1, Transducer *a2 );
147
+ Transducer *composition( Transducer *a1, Transducer *a2 );
148
+ Transducer *restriction( Transducer *a, Twol_Type type, Contexts *c, int );
149
+ Transducer *replace( Transducer *a, Repl_Type type, bool optional );
150
+ Transducer *replace_in_context( Transducer *a, Repl_Type type,
151
+ Contexts *c, bool optional );
152
+ Transducer *negation( Transducer *a );
153
+ Transducer *upper_level( Transducer *a );
154
+ Transducer *lower_level( Transducer *a );
155
+ Transducer *minimise( Transducer *a );
156
+ Transducer *switch_levels( Transducer *a );
157
+ Transducer *repetition( Transducer *a );
158
+ Transducer *repetition2( Transducer *a );
159
+ Transducer *optional( Transducer *a );
160
+ Transducer *make_rule( Transducer *lc, Range *r1, Twol_Type type,
161
+ Range *r2, Transducer *rc );
162
+ Transducer *freely_insert( Transducer *a, Character lc, Character uc );
163
+ Transducer *make_mapping( Ranges*, Ranges* );
164
+ Ranges *add_range( Range*, Ranges* );
165
+ Contexts *make_context( Transducer *l, Transducer *r );
166
+ Contexts *add_context( Contexts *nc, Contexts *c );
167
+ Transducer *result( Transducer*, bool );
168
+ void write_to_file( Transducer*, char *filename);
169
+ };
170
+ }
171
+ #endif
@@ -0,0 +1,323 @@
1
+ /*******************************************************************/
2
+ /* */
3
+ /* FILE make-compact.C */
4
+ /* MODULE make-compact */
5
+ /* PROGRAM SFST */
6
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
7
+ /* */
8
+ /* PURPOSE Code needed for generating compact automata */
9
+ /* */
10
+ /*******************************************************************/
11
+
12
#include <math.h>

#include <algorithm>
#include <vector>

#include "make-compact.h"
15
+
16
+ namespace SFST {
17
+
18
+ class ARC {
19
+ public:
20
+ int cv;
21
+ Label label;
22
+ unsigned int target_node;
23
+
24
+ bool operator< ( const ARC a ) const {
25
+ return cv < a.cv;
26
+ };
27
+ };
28
+
29
+ typedef map<Label, size_t, Label::label_cmp> LabelNumber;
30
+
31
+
32
+ /*******************************************************************/
33
+ /* */
34
+ /* MakeCompactTransducer::sort */
35
+ /* */
36
+ /*******************************************************************/
37
+
38
+ void MakeCompactTransducer::sort( Level level )
39
+
40
+ {
41
+ for( unsigned int n=0; n<number_of_nodes; n++) {
42
+ unsigned int from=first_arc[n];
43
+ unsigned int to=first_arc[n+1];
44
+ int l=to-from;
45
+
46
+ // copy the arcs to a temporary table
47
+ ARC *arc=new ARC[l];
48
+ for( unsigned int i=from; i<to; i++) {
49
+ arc[i-from].cv = (int)label[i].get_char(level);
50
+ // make sure that epsilon arcs are stored at the beginning
51
+ // even if epsilon is not 0
52
+ if (arc[i-from].cv == (int)Label::epsilon)
53
+ arc[i-from].cv = -1;
54
+ arc[i-from].label = label[i];
55
+ arc[i-from].target_node = target_node[i];
56
+ }
57
+
58
+ // sort the table
59
+ std::sort( arc, arc+l );
60
+
61
+ // copy the arcs back to the original table
62
+ for( unsigned int i=from; i<to; i++) {
63
+ label[i] = arc[i-from].label;
64
+ target_node[i] = arc[i-from].target_node;
65
+ }
66
+
67
+ delete[] arc;
68
+ }
69
+ }
70
+
71
+
72
+ /*******************************************************************/
73
+ /* */
74
+ /* MakeCompactTransducer::count_arcs */
75
+ /* */
76
+ /*******************************************************************/
77
+
78
+ void MakeCompactTransducer::count_arcs( Node *node, VType vmark )
79
+ {
80
+ if (!node->was_visited( vmark )) {
81
+ unsigned n = (unsigned)node->index;
82
+ finalp[n] = node->is_final();
83
+ first_arc[n] = 0;
84
+ Arcs *arcs=node->arcs();
85
+ for( ArcsIter p(arcs); p; p++ ) {
86
+ Arc *arc=p;
87
+ first_arc[n]++;
88
+ count_arcs(arc->target_node(), vmark);
89
+ }
90
+ }
91
+ }
92
+
93
+
94
+ /*******************************************************************/
95
+ /* */
96
+ /* MakeCompactTransducer::store_arcs */
97
+ /* */
98
+ /*******************************************************************/
99
+
100
+ void MakeCompactTransducer::store_arcs( Node *node, VType vmark )
101
+ {
102
+ if (!node->was_visited( vmark )) {
103
+ unsigned int n=first_arc[node->index];
104
+ Arcs *arcs=node->arcs();
105
+ for( ArcsIter p(arcs); p; p++ ) {
106
+ Arc *arc=p;
107
+ label[n] = arc->label();
108
+ target_node[n++] = (unsigned)arc->target_node()->index;
109
+ store_arcs(arc->target_node(), vmark);
110
+ }
111
+ }
112
+ }
113
+
114
+
115
+ /*******************************************************************/
116
+ /* */
117
+ /* MakeCompactTransducer::MakeCompactTransducer */
118
+ /* */
119
+ /*******************************************************************/
120
+
121
+ MakeCompactTransducer::MakeCompactTransducer( Transducer &a, Level l )
122
+
123
+ {
124
+ if (a.is_infinitely_ambiguous()) {
125
+ std::cerr << "Error: resulting transducer contains an infinite loop!\n";
126
+ exit(1);
127
+ }
128
+
129
+ number_of_nodes = (unsigned)a.nodeindexing().first;
130
+ alphabet.copy(a.alphabet);
131
+
132
+ // memory allocation
133
+ finalp = new char[number_of_nodes];
134
+ first_arc = new unsigned int[number_of_nodes+1];
135
+
136
+ // count the number of outgoing arcs for each node
137
+ // and store them in first_arc[]
138
+ a.incr_vmark();
139
+ count_arcs( a.root_node(), a.vmark );
140
+ for( int n=number_of_nodes; n>0; n-- )
141
+ first_arc[n] = first_arc[n-1];
142
+ first_arc[0] = 0;
143
+ for( unsigned int n=0; n<number_of_nodes; n++ )
144
+ first_arc[n+1] += first_arc[n];
145
+ number_of_arcs = first_arc[number_of_nodes];
146
+
147
+ // memory allocation
148
+ label = new Label[number_of_arcs];
149
+ target_node = new unsigned int[number_of_arcs];
150
+
151
+ // store the arcs
152
+ a.incr_vmark();
153
+ store_arcs( a.root_node(), a.vmark );
154
+
155
+ // sort the arcs
156
+ sort( l );
157
+ }
158
+
159
+
160
+ /*******************************************************************/
161
+ /* */
162
+ /* MakeCompactTransducer::store_finalp */
163
+ /* */
164
+ /*******************************************************************/
165
+
166
+ void MakeCompactTransducer::store_finalp( FILE *file )
167
+
168
+ {
169
+ int k=0;
170
+ unsigned char n=0;
171
+
172
+ for( size_t i=0; i<number_of_nodes; i++ ) {
173
+ n = (unsigned char)(n << 1);
174
+ if (finalp[i])
175
+ n |= 1;
176
+ if (++k == 8) {
177
+ fputc(n, file);
178
+ n = 0;
179
+ k = 0;
180
+ }
181
+ }
182
+ if (k > 0) {
183
+ n = (unsigned char)(n << (8-k));
184
+ fputc(n, file);
185
+ }
186
+ }
187
+
188
+
189
+ /*******************************************************************/
190
+ /* */
191
+ /* MakeCompactTransducer::store_first_arcs */
192
+ /* */
193
+ /* The data is encoded with the minimal number of bits needed. */
194
+ /* */
195
+ /*******************************************************************/
196
+
197
+ void MakeCompactTransducer::store_first_arcs( FILE *file )
198
+
199
+ {
200
+ int k=0;
201
+ unsigned int n=0;
202
+ // compute number of bits required for storing each item
203
+ int bits=(int)ceil(log(number_of_arcs+1)/log(2));
204
+
205
+ for( size_t i=0; i<=number_of_nodes; i++ ) {
206
+ unsigned int m=first_arc[i];
207
+ m <<= (sizeof(n)*8) - bits;
208
+ m >>= k;
209
+ n = n | m;
210
+ k += bits;
211
+ if (k >= (int)sizeof(n)*8) {
212
+ fwrite(&n, sizeof(n), 1, file);
213
+ k -= (int)sizeof(n) * 8;
214
+ n = first_arc[i];
215
+ if (k == 0)
216
+ n = 0;
217
+ else
218
+ n = first_arc[i] << (sizeof(n) * 8 - k);
219
+ }
220
+ }
221
+ if (k > 0)
222
+ fwrite(&n, sizeof(n), 1, file);
223
+ }
224
+
225
+
226
+ /*******************************************************************/
227
+ /* */
228
+ /* MakeCompactTransducer::store_target_nodes */
229
+ /* */
230
+ /*******************************************************************/
231
+
232
+ void MakeCompactTransducer::store_target_nodes( FILE *file )
233
+
234
+ {
235
+ int k=0;
236
+ unsigned int n=0;
237
+ int bits=(int)ceil(log(number_of_nodes)/log(2));
238
+
239
+ for( size_t i=0; i<number_of_arcs; i++ ) {
240
+ unsigned int m=target_node[i];
241
+ m <<= (sizeof(n)*8) - bits;
242
+ m >>= k;
243
+ n = n | m;
244
+ k += bits;
245
+ if (k >= (int)sizeof(n)*8) {
246
+ fwrite(&n, sizeof(n), 1, file);
247
+ k -= (int)sizeof(n)*8;
248
+ if (k == 0)
249
+ n = 0;
250
+ else
251
+ n = target_node[i] << (sizeof(n) * 8 - k);
252
+ }
253
+ }
254
+ if (k > 0)
255
+ fwrite(&n, sizeof(n), 1, file);
256
+ }
257
+
258
+
259
+ /*******************************************************************/
260
+ /* */
261
+ /* MakeCompactTransducer::store_labels */
262
+ /* */
263
+ /*******************************************************************/
264
+
265
+ void MakeCompactTransducer::store_labels( FILE *file )
266
+
267
+ {
268
+ size_t N=0;
269
+ LabelNumber LNum;
270
+ for( Alphabet::const_iterator it=alphabet.begin();
271
+ it != alphabet.end(); it++ )
272
+ {
273
+ Label l=*it;
274
+ LNum[l] = N++;
275
+ }
276
+
277
+ int k=0;
278
+ unsigned int n=0;
279
+ int bits=(int)ceil(log((double)alphabet.size())/log(2));
280
+
281
+ for( size_t i=0; i<number_of_arcs; i++ ) {
282
+ unsigned int l = (unsigned)LNum[label[i]];
283
+ unsigned int m=l;
284
+ m <<= (sizeof(n)*8) - bits;
285
+ m >>= k;
286
+ n = n | m;
287
+ k += bits;
288
+ if (k >= (int)sizeof(n)*8) {
289
+ fwrite(&n, sizeof(n), 1, file);
290
+ k -= (int)sizeof(n)*8;
291
+ if (k == 0)
292
+ n = 0;
293
+ else
294
+ n = l << (sizeof(n) * 8 - k);
295
+ }
296
+ }
297
+ if (k > 0)
298
+ fwrite(&n, sizeof(n), 1, file);
299
+ }
300
+
301
+
302
+ /*******************************************************************/
303
+ /* */
304
+ /* MakeCompactTransducer::store */
305
+ /* */
306
+ /*******************************************************************/
307
+
308
+ void MakeCompactTransducer::store( FILE *file )
309
+
310
+ {
311
+ fputc('c',file);
312
+ alphabet.store(file);
313
+ fwrite(&number_of_nodes, sizeof(number_of_nodes), 1, file);
314
+ fwrite(&number_of_arcs, sizeof(number_of_arcs), 1, file);
315
+ store_finalp(file);
316
+ store_first_arcs(file);
317
+ store_labels(file);
318
+ store_target_nodes(file);
319
+ if (ferror(file))
320
+ throw "Error encountered while writing transducer to file\n";
321
+ }
322
+
323
+ }