ruby-sfst 0.4.3 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +1 -0
  3. data/COPYING +280 -0
  4. data/Gemfile +3 -0
  5. data/Gemfile.lock +54 -0
  6. data/README.md +1 -1
  7. data/Rakefile +9 -18
  8. data/bin/console +7 -0
  9. data/bin/setup +6 -0
  10. data/ext/sfst/alphabet.cc +879 -0
  11. data/ext/sfst/alphabet.h +302 -0
  12. data/ext/sfst/basic.cc +85 -0
  13. data/ext/{sfst_machine → sfst}/basic.h +7 -4
  14. data/ext/sfst/compact.cc +629 -0
  15. data/ext/sfst/compact.h +100 -0
  16. data/ext/sfst/determinise.cc +279 -0
  17. data/ext/{sfst_machine → sfst}/extconf.rb +2 -1
  18. data/ext/sfst/fst.cc +1150 -0
  19. data/ext/sfst/fst.h +374 -0
  20. data/ext/sfst/hopcroft.cc +681 -0
  21. data/ext/sfst/interface.cc +1921 -0
  22. data/ext/sfst/interface.h +171 -0
  23. data/ext/sfst/make-compact.cc +323 -0
  24. data/ext/{sfst_machine → sfst}/make-compact.h +15 -13
  25. data/ext/sfst/mem.h +80 -0
  26. data/ext/sfst/operators.cc +1273 -0
  27. data/ext/{sfst_machine → sfst}/sfst_machine.cc +89 -78
  28. data/ext/sfst/sgi.h +72 -0
  29. data/ext/sfst/utf8.cc +149 -0
  30. data/ext/{sfst_machine → sfst}/utf8.h +7 -4
  31. data/lib/sfst.rb +2 -1
  32. data/lib/sfst/version.rb +1 -1
  33. data/ruby-sfst.gemspec +23 -23
  34. metadata +107 -35
  35. data/ext/sfst_machine/alphabet.cc +0 -812
  36. data/ext/sfst_machine/alphabet.h +0 -273
  37. data/ext/sfst_machine/basic.cc +0 -84
  38. data/ext/sfst_machine/compact.cc +0 -616
  39. data/ext/sfst_machine/compact.h +0 -98
  40. data/ext/sfst_machine/determinise.cc +0 -303
  41. data/ext/sfst_machine/fst.cc +0 -1000
  42. data/ext/sfst_machine/fst.h +0 -369
  43. data/ext/sfst_machine/interface.cc +0 -1842
  44. data/ext/sfst_machine/interface.h +0 -93
  45. data/ext/sfst_machine/make-compact.cc +0 -327
  46. data/ext/sfst_machine/mem.h +0 -74
  47. data/ext/sfst_machine/operators.cc +0 -1131
  48. data/ext/sfst_machine/sgi.h +0 -44
  49. data/ext/sfst_machine/utf8.cc +0 -146
  50. data/test/test_sfst.fst +0 -3
  51. data/test/test_sfst.rb +0 -114
@@ -0,0 +1,171 @@
1
+ /*******************************************************************/
2
+ /* */
3
+ /* FILE interface.h */
4
+ /* MODULE interface */
5
+ /* PROGRAM SFST */
6
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
7
+ /* */
8
+ /*******************************************************************/
9
+
10
+ #ifndef _INTERFACE_H_
11
+ #define _INTERFACE_H_
12
+
13
+ #include "utf8.h"
14
+ #include "fst.h"
15
+
16
+ using std::set;
17
+ using std::cerr;
18
+
19
+ namespace SFST {
20
+
21
// Error reporting entry points; they are only declared here.
void error( const char *message );
void error2( const char *message, char *input );

// Selector passed to Interface::restriction() and Interface::make_rule().
typedef enum {
  twol_left,
  twol_right,
  twol_both
} Twol_Type;

// Selector passed to Interface::replace() and Interface::replace_in_context().
typedef enum {
  repl_left,
  repl_right,
  repl_up,
  my_repl_down,
  repl_down
} Repl_Type;
27
+
28
+ typedef struct range_t {
29
+ Character character;
30
+ struct range_t *next;
31
+ } Range;
32
+
33
+ typedef struct ranges_t {
34
+ Range *range;
35
+ struct ranges_t *next;
36
+ } Ranges;
37
+
38
+
39
+ typedef struct contexts_t {
40
+ Transducer *left, *right;
41
+ struct contexts_t *next;
42
+ } Contexts;
43
+
44
+
45
+ /***************** class Interface *******************************/
46
+
47
+ class Interface {
48
+
49
+ private:
50
+ struct ltstr {
51
+ bool operator()(const char* s1, const char* s2) const
52
+ { return strcmp(s1, s2) < 0; }
53
+ };
54
+
55
+ typedef set<char*, ltstr> RVarSet;
56
+
57
+ typedef map<char*, Transducer*, ltstr> VarMap;
58
+ typedef map<char*, Range*, ltstr> SVarMap;
59
+
60
+ Range *copy_values( const Range *r );
61
+ Transducer *one_label_transducer( Label l );
62
+ void add_pi_transitions( Transducer *t, Node *node, Alphabet &alph );
63
+ Transducer *pi_machine( Alphabet &alph );
64
+ Transducer *empty_string_transducer( void );
65
+ Transducer *cp( Range *lower_range, Range *upper_range );
66
+ Transducer *anti_cp( Range *lower_range, Range *upper_range );
67
+ Transducer *twol_right_rule( Transducer *lc, Range *lower_range,
68
+ Range *upper_range, Transducer *rc);
69
+ Transducer *twol_left_rule( Transducer *lc, Range *lower_range,
70
+ Range *upper_range, Transducer *rc );
71
+ Transducer *restriction_transducer( Transducer *l1, Transducer *l2,
72
+ Character marker );
73
+ Transducer *marker_transducer( Transducer *t, Contexts *c,
74
+ Character &marker );
75
+ Transducer *center_transducer( Transducer *t, Transducer *pi,
76
+ Transducer *mt );
77
+ Transducer *context_transducer( Transducer *t, Transducer *pi,
78
+ Transducer *mt, Contexts *c );
79
+ Transducer *insert_boundary_transducer( Character leftm, Character rightm,
80
+ Alphabet &alph );
81
+ Transducer *remove_boundary_transducer( Character leftm, Character rightm,
82
+ Alphabet &alph );
83
+ Transducer *constrain_boundary_transducer( Character leftm, Character rm,
84
+ Alphabet &alph );
85
+ Transducer *extended_left_transducer( Transducer *t, Character m1,
86
+ Character m2, Alphabet& );
87
+ Transducer *left_context( Transducer *t, Character m1, Character m2, Alphabet& );
88
+ Transducer *right_context( Transducer *t, Character m1, Character m2, Alphabet& );
89
+ Transducer *make_optional( Transducer *t, Repl_Type type );
90
+ Transducer *replace_transducer( Transducer *ct, Character lm,
91
+ Character rm, Repl_Type type );
92
+
93
+ Transducer *result_transducer( Transducer *l1, Transducer *l2,
94
+ Twol_Type type, Character marker );
95
+
96
+ VarMap VM;
97
+ SVarMap SVM;
98
+ RVarSet RS;
99
+ RVarSet RSS;
100
+
101
+ public:
102
+ bool Verbose;
103
+ bool Alphabet_Defined;
104
+ bool LexiconComments;
105
+ Alphabet TheAlphabet;
106
+
107
+ Interface( bool utf8=false, bool verbose=false ) :
108
+ Verbose(verbose), Alphabet_Defined(false), LexiconComments(false)
109
+ {
110
+ TheAlphabet.utf8 = utf8;
111
+ }
112
+
113
+ void allow_lexicon_comments() { LexiconComments = true; }
114
+
115
+ Transducer *new_transducer( Range*, Range* );
116
+ Transducer *read_words( char *filename );
117
+ Transducer *read_transducer( char *filename );
118
+ Transducer *var_value( char *name );
119
+ Transducer *rvar_value( char *name );
120
+ Range *svar_value( char *name );
121
+ Range *complement_range( Range* );
122
+ Range *rsvar_value( char *name );
123
+ Character character_code( unsigned int uc );
124
+ Character symbol_code( char *s );
125
+
126
+ bool in_range( unsigned int c, Range *r );
127
+ Transducer *make_transducer( Range *r1, Range *r2 );
128
+
129
+ Range *add_value( Character, Range*);
130
+
131
+ Range *add_var_values( char *name, Range*);
132
+ Range *add_values( unsigned int, unsigned int, Range*);
133
+ Range *append_values( Range *r2, Range *r );
134
+ void add_alphabet( Transducer* );
135
+
136
+ // These functions delete their argument automata
137
+
138
+ void def_alphabet( Transducer *a );
139
+ bool def_var( char *name, Transducer *a );
140
+ bool def_rvar( char *name, Transducer *a );
141
+ bool def_svar( char *name, Range *r );
142
+ Transducer *explode( Transducer *a );
143
+ Transducer *catenate( Transducer *a1, Transducer *a2 );
144
+ Transducer *disjunction( Transducer *a1, Transducer *a2 );
145
+ Transducer *conjunction( Transducer *a1, Transducer *a2 );
146
+ Transducer *subtraction( Transducer *a1, Transducer *a2 );
147
+ Transducer *composition( Transducer *a1, Transducer *a2 );
148
+ Transducer *restriction( Transducer *a, Twol_Type type, Contexts *c, int );
149
+ Transducer *replace( Transducer *a, Repl_Type type, bool optional );
150
+ Transducer *replace_in_context( Transducer *a, Repl_Type type,
151
+ Contexts *c, bool optional );
152
+ Transducer *negation( Transducer *a );
153
+ Transducer *upper_level( Transducer *a );
154
+ Transducer *lower_level( Transducer *a );
155
+ Transducer *minimise( Transducer *a );
156
+ Transducer *switch_levels( Transducer *a );
157
+ Transducer *repetition( Transducer *a );
158
+ Transducer *repetition2( Transducer *a );
159
+ Transducer *optional( Transducer *a );
160
+ Transducer *make_rule( Transducer *lc, Range *r1, Twol_Type type,
161
+ Range *r2, Transducer *rc );
162
+ Transducer *freely_insert( Transducer *a, Character lc, Character uc );
163
+ Transducer *make_mapping( Ranges*, Ranges* );
164
+ Ranges *add_range( Range*, Ranges* );
165
+ Contexts *make_context( Transducer *l, Transducer *r );
166
+ Contexts *add_context( Contexts *nc, Contexts *c );
167
+ Transducer *result( Transducer*, bool );
168
+ void write_to_file( Transducer*, char *filename);
169
+ };
170
+ }
171
+ #endif
@@ -0,0 +1,323 @@
1
+ /*******************************************************************/
2
+ /* */
3
+ /* FILE make-compact.cc */
4
+ /* MODULE make-compact */
5
+ /* PROGRAM SFST */
6
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
7
+ /* */
8
+ /* PURPOSE Code needed for generating compact automata */
9
+ /* */
10
+ /*******************************************************************/
11
+
12
+ #include <math.h>
13
+
14
+ #include "make-compact.h"
15
+
16
+ namespace SFST {
17
+
18
+ class ARC {
19
+ public:
20
+ int cv;
21
+ Label label;
22
+ unsigned int target_node;
23
+
24
+ bool operator< ( const ARC a ) const {
25
+ return cv < a.cv;
26
+ };
27
+ };
28
+
29
+ typedef map<Label, size_t, Label::label_cmp> LabelNumber;
30
+
31
+
32
+ /*******************************************************************/
33
+ /* */
34
+ /* MakeCompactTransducer::sort */
35
+ /* */
36
+ /*******************************************************************/
37
+
38
+ void MakeCompactTransducer::sort( Level level )
39
+
40
+ {
41
+ for( unsigned int n=0; n<number_of_nodes; n++) {
42
+ unsigned int from=first_arc[n];
43
+ unsigned int to=first_arc[n+1];
44
+ int l=to-from;
45
+
46
+ // copy the arcs to a temporary table
47
+ ARC *arc=new ARC[l];
48
+ for( unsigned int i=from; i<to; i++) {
49
+ arc[i-from].cv = (int)label[i].get_char(level);
50
+ // make sure that epsilon arcs are stored at the beginning
51
+ // even if epsilon is not 0
52
+ if (arc[i-from].cv == (int)Label::epsilon)
53
+ arc[i-from].cv = -1;
54
+ arc[i-from].label = label[i];
55
+ arc[i-from].target_node = target_node[i];
56
+ }
57
+
58
+ // sort the table
59
+ std::sort( arc, arc+l );
60
+
61
+ // copy the arcs back to the original table
62
+ for( unsigned int i=from; i<to; i++) {
63
+ label[i] = arc[i-from].label;
64
+ target_node[i] = arc[i-from].target_node;
65
+ }
66
+
67
+ delete[] arc;
68
+ }
69
+ }
70
+
71
+
72
+ /*******************************************************************/
73
+ /* */
74
+ /* MakeCompactTransducer::count_arcs */
75
+ /* */
76
+ /*******************************************************************/
77
+
78
+ void MakeCompactTransducer::count_arcs( Node *node, VType vmark )
79
+ {
80
+ if (!node->was_visited( vmark )) {
81
+ unsigned n = (unsigned)node->index;
82
+ finalp[n] = node->is_final();
83
+ first_arc[n] = 0;
84
+ Arcs *arcs=node->arcs();
85
+ for( ArcsIter p(arcs); p; p++ ) {
86
+ Arc *arc=p;
87
+ first_arc[n]++;
88
+ count_arcs(arc->target_node(), vmark);
89
+ }
90
+ }
91
+ }
92
+
93
+
94
+ /*******************************************************************/
95
+ /* */
96
+ /* MakeCompactTransducer::store_arcs */
97
+ /* */
98
+ /*******************************************************************/
99
+
100
+ void MakeCompactTransducer::store_arcs( Node *node, VType vmark )
101
+ {
102
+ if (!node->was_visited( vmark )) {
103
+ unsigned int n=first_arc[node->index];
104
+ Arcs *arcs=node->arcs();
105
+ for( ArcsIter p(arcs); p; p++ ) {
106
+ Arc *arc=p;
107
+ label[n] = arc->label();
108
+ target_node[n++] = (unsigned)arc->target_node()->index;
109
+ store_arcs(arc->target_node(), vmark);
110
+ }
111
+ }
112
+ }
113
+
114
+
115
+ /*******************************************************************/
116
+ /* */
117
+ /* MakeCompactTransducer::MakeCompactTransducer */
118
+ /* */
119
+ /*******************************************************************/
120
+
121
+ MakeCompactTransducer::MakeCompactTransducer( Transducer &a, Level l )
122
+
123
+ {
124
+ if (a.is_infinitely_ambiguous()) {
125
+ std::cerr << "Error: resulting transducer contains an infinite loop!\n";
126
+ exit(1);
127
+ }
128
+
129
+ number_of_nodes = (unsigned)a.nodeindexing().first;
130
+ alphabet.copy(a.alphabet);
131
+
132
+ // memory allocation
133
+ finalp = new char[number_of_nodes];
134
+ first_arc = new unsigned int[number_of_nodes+1];
135
+
136
+ // count the number of outgoing arcs for each node
137
+ // and store them in first_arc[]
138
+ a.incr_vmark();
139
+ count_arcs( a.root_node(), a.vmark );
140
+ for( int n=number_of_nodes; n>0; n-- )
141
+ first_arc[n] = first_arc[n-1];
142
+ first_arc[0] = 0;
143
+ for( unsigned int n=0; n<number_of_nodes; n++ )
144
+ first_arc[n+1] += first_arc[n];
145
+ number_of_arcs = first_arc[number_of_nodes];
146
+
147
+ // memory allocation
148
+ label = new Label[number_of_arcs];
149
+ target_node = new unsigned int[number_of_arcs];
150
+
151
+ // store the arcs
152
+ a.incr_vmark();
153
+ store_arcs( a.root_node(), a.vmark );
154
+
155
+ // sort the arcs
156
+ sort( l );
157
+ }
158
+
159
+
160
+ /*******************************************************************/
161
+ /* */
162
+ /* MakeCompactTransducer::store_finalp */
163
+ /* */
164
+ /*******************************************************************/
165
+
166
+ void MakeCompactTransducer::store_finalp( FILE *file )
167
+
168
+ {
169
+ int k=0;
170
+ unsigned char n=0;
171
+
172
+ for( size_t i=0; i<number_of_nodes; i++ ) {
173
+ n = (unsigned char)(n << 1);
174
+ if (finalp[i])
175
+ n |= 1;
176
+ if (++k == 8) {
177
+ fputc(n, file);
178
+ n = 0;
179
+ k = 0;
180
+ }
181
+ }
182
+ if (k > 0) {
183
+ n = (unsigned char)(n << (8-k));
184
+ fputc(n, file);
185
+ }
186
+ }
187
+
188
+
189
+ /*******************************************************************/
190
+ /* */
191
+ /* MakeCompactTransducer::store_first_arcs */
192
+ /* */
193
+ /* The data is encoded with the minimal number of bits needed. */
194
+ /* */
195
+ /*******************************************************************/
196
+
197
+ void MakeCompactTransducer::store_first_arcs( FILE *file )
198
+
199
+ {
200
+ int k=0;
201
+ unsigned int n=0;
202
+ // compute number of bits required for storing each item
203
+ int bits=(int)ceil(log(number_of_arcs+1)/log(2));
204
+
205
+ for( size_t i=0; i<=number_of_nodes; i++ ) {
206
+ unsigned int m=first_arc[i];
207
+ m <<= (sizeof(n)*8) - bits;
208
+ m >>= k;
209
+ n = n | m;
210
+ k += bits;
211
+ if (k >= (int)sizeof(n)*8) {
212
+ fwrite(&n, sizeof(n), 1, file);
213
+ k -= (int)sizeof(n) * 8;
214
+ n = first_arc[i];
215
+ if (k == 0)
216
+ n = 0;
217
+ else
218
+ n = first_arc[i] << (sizeof(n) * 8 - k);
219
+ }
220
+ }
221
+ if (k > 0)
222
+ fwrite(&n, sizeof(n), 1, file);
223
+ }
224
+
225
+
226
+ /*******************************************************************/
227
+ /* */
228
+ /* MakeCompactTransducer::store_target_nodes */
229
+ /* */
230
+ /*******************************************************************/
231
+
232
+ void MakeCompactTransducer::store_target_nodes( FILE *file )
233
+
234
+ {
235
+ int k=0;
236
+ unsigned int n=0;
237
+ int bits=(int)ceil(log(number_of_nodes)/log(2));
238
+
239
+ for( size_t i=0; i<number_of_arcs; i++ ) {
240
+ unsigned int m=target_node[i];
241
+ m <<= (sizeof(n)*8) - bits;
242
+ m >>= k;
243
+ n = n | m;
244
+ k += bits;
245
+ if (k >= (int)sizeof(n)*8) {
246
+ fwrite(&n, sizeof(n), 1, file);
247
+ k -= (int)sizeof(n)*8;
248
+ if (k == 0)
249
+ n = 0;
250
+ else
251
+ n = target_node[i] << (sizeof(n) * 8 - k);
252
+ }
253
+ }
254
+ if (k > 0)
255
+ fwrite(&n, sizeof(n), 1, file);
256
+ }
257
+
258
+
259
+ /*******************************************************************/
260
+ /* */
261
+ /* MakeCompactTransducer::store_labels */
262
+ /* */
263
+ /*******************************************************************/
264
+
265
+ void MakeCompactTransducer::store_labels( FILE *file )
266
+
267
+ {
268
+ size_t N=0;
269
+ LabelNumber LNum;
270
+ for( Alphabet::const_iterator it=alphabet.begin();
271
+ it != alphabet.end(); it++ )
272
+ {
273
+ Label l=*it;
274
+ LNum[l] = N++;
275
+ }
276
+
277
+ int k=0;
278
+ unsigned int n=0;
279
+ int bits=(int)ceil(log((double)alphabet.size())/log(2));
280
+
281
+ for( size_t i=0; i<number_of_arcs; i++ ) {
282
+ unsigned int l = (unsigned)LNum[label[i]];
283
+ unsigned int m=l;
284
+ m <<= (sizeof(n)*8) - bits;
285
+ m >>= k;
286
+ n = n | m;
287
+ k += bits;
288
+ if (k >= (int)sizeof(n)*8) {
289
+ fwrite(&n, sizeof(n), 1, file);
290
+ k -= (int)sizeof(n)*8;
291
+ if (k == 0)
292
+ n = 0;
293
+ else
294
+ n = l << (sizeof(n) * 8 - k);
295
+ }
296
+ }
297
+ if (k > 0)
298
+ fwrite(&n, sizeof(n), 1, file);
299
+ }
300
+
301
+
302
+ /*******************************************************************/
303
+ /* */
304
+ /* MakeCompactTransducer::store */
305
+ /* */
306
+ /*******************************************************************/
307
+
308
+ void MakeCompactTransducer::store( FILE *file )
309
+
310
+ {
311
+ fputc('c',file);
312
+ alphabet.store(file);
313
+ fwrite(&number_of_nodes, sizeof(number_of_nodes), 1, file);
314
+ fwrite(&number_of_arcs, sizeof(number_of_arcs), 1, file);
315
+ store_finalp(file);
316
+ store_first_arcs(file);
317
+ store_labels(file);
318
+ store_target_nodes(file);
319
+ if (ferror(file))
320
+ throw "Error encountered while writing transducer to file\n";
321
+ }
322
+
323
+ }