ruby-sfst 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,411 @@
1
#include "ruby.h"
#include "compact.h"
#include "fst.h"
#include "interface.h"
#include "make-compact.h"

#include <stdio.h>
6
+
7
+ /*:enddoc:*/
8
+ extern Transducer *Result;
9
+ extern FILE *yyin;
10
+ int yyparse(void);
11
+
12
+ VALUE mSFST = Qnil;
13
+ VALUE mCompactTransducer = Qnil;
14
+ VALUE mRegularTransducer = Qnil;
15
+
16
+ static VALUE compile(char *from_filename, char *to_filename, bool compact) // :nodoc:
17
+ {
18
+ FILE *in_file, *out_file;
19
+
20
+ in_file = fopen(from_filename, "rb");
21
+
22
+ if (!in_file) {
23
+ rb_raise(rb_eRuntimeError, "Unable to open grammar file %s", from_filename);
24
+ }
25
+
26
+ FileName = from_filename;
27
+ Result = NULL;
28
+ TheAlphabet.utf8 = UTF8;
29
+ yyin = in_file;
30
+
31
+ try {
32
+ yyparse();
33
+ Result->alphabet.utf8 = UTF8;
34
+
35
+ fclose(in_file);
36
+
37
+ if (!(out_file = fopen(to_filename, "wb"))) {
38
+ rb_raise(rb_eRuntimeError, "Unable to open output file %s", to_filename);
39
+ }
40
+
41
+ if (compact) {
42
+ MakeCompactTransducer ca(*Result);
43
+ delete Result;
44
+ ca.store(out_file);
45
+ } else
46
+ Result->store(out_file);
47
+
48
+ fclose(out_file);
49
+ }
50
+ catch(const char* p) {
51
+ rb_raise(rb_eRuntimeError, p);
52
+ }
53
+
54
+ return Qnil;
55
+ }
56
+
57
+ static VALUE compile_regular(VALUE obj, VALUE from_filename, VALUE to_filename)
58
+ {
59
+ return compile(RSTRING(from_filename)->ptr, RSTRING(to_filename)->ptr, false);
60
+ }
61
+
62
+ static VALUE compile_compact(VALUE obj, VALUE from_filename, VALUE to_filename)
63
+ {
64
+ return compile(RSTRING(from_filename)->ptr, RSTRING(to_filename)->ptr, true);
65
+ }
66
+
67
+ static void compact_transducer_free(CompactTransducer *t)
68
+ {
69
+ if (t)
70
+ delete t;
71
+ }
72
+
73
+ static void compact_transducer_mark(CompactTransducer *t)
74
+ {
75
+ }
76
+
77
+ static VALUE compact_transducer_alloc(VALUE klass)
78
+ {
79
+ CompactTransducer *t = NULL;
80
+
81
+ return Data_Wrap_Struct(klass, compact_transducer_mark, compact_transducer_free, t);
82
+ }
83
+
84
+ static VALUE compact_transducer_init(VALUE obj, VALUE filename)
85
+ {
86
+ FILE *file;
87
+ CompactTransducer *t;
88
+
89
+ file = fopen(RSTRING(filename)->ptr, "rb");
90
+
91
+ if (!file) {
92
+ rb_raise(rb_eRuntimeError, "Unable to open SFST file %s", RSTRING(filename)->ptr);
93
+ }
94
+
95
+ try {
96
+ t = new CompactTransducer(file);
97
+ fclose(file);
98
+ }
99
+ catch (const char *p) {
100
+ rb_raise(rb_eRuntimeError, "Unable to open SFST file %s: %s", RSTRING(filename)->ptr, p);
101
+ }
102
+
103
+ DATA_PTR(obj) = t;
104
+ return Qnil;
105
+ }
106
+
107
+ static VALUE compact_transducer_analyze(VALUE self, VALUE string)
108
+ {
109
+ VALUE accepted = Qfalse;
110
+ CompactTransducer *t;
111
+
112
+ Check_Type(string, T_STRING);
113
+
114
+ Data_Get_Struct(self, CompactTransducer, t);
115
+
116
+ std::vector<CAnalysis> analyses;
117
+ t->analyze_string(RSTRING(string)->ptr, analyses);
118
+
119
+ for (size_t k = 0; k < analyses.size(); k++) {
120
+ accepted = Qtrue;
121
+
122
+ if (rb_block_given_p()) {
123
+ rb_yield(rb_str_new2(t->print_analysis(analyses[k])));
124
+ } else
125
+ // We might as well return early if there is no block.
126
+ break;
127
+ }
128
+
129
+ return accepted;
130
+ }
131
+
132
+ static void regular_transducer_free(Transducer *t)
133
+ {
134
+ if (t)
135
+ delete t;
136
+ }
137
+
138
+ static void regular_transducer_mark(Transducer *t)
139
+ {
140
+ }
141
+
142
+ static VALUE regular_transducer_alloc(VALUE klass)
143
+ {
144
+ Transducer *t = NULL;
145
+
146
+ return Data_Wrap_Struct(klass, regular_transducer_mark, regular_transducer_free, t);
147
+ }
148
+
149
+ static VALUE regular_transducer_init(VALUE obj, VALUE filename)
150
+ {
151
+ FILE *file;
152
+ Transducer *t;
153
+
154
+ file = fopen(RSTRING(filename)->ptr, "rb");
155
+
156
+ if (!file) {
157
+ rb_raise(rb_eRuntimeError, "Unable to open SFST file %s", RSTRING(filename)->ptr);
158
+ }
159
+
160
+ try {
161
+ t = new Transducer(file);
162
+ fclose(file);
163
+ }
164
+ catch (const char *p) {
165
+ rb_raise(rb_eRuntimeError, "Unable to open SFST file %s: %s", RSTRING(filename)->ptr, p);
166
+ }
167
+
168
+ DATA_PTR(obj) = t;
169
+ return Qnil;
170
+ }
171
+
172
using std::vector;

/* Selects which side(s) of each arc label the language generator reports. */
enum {
  BOTH  = 0,
  LOWER = 1,
  UPPER = 2
};
175
+
176
+ static VALUE _alphabet_to_rb_str(Alphabet *a, Character c)
177
+ {
178
+ const char *s = a->code2symbol(c);
179
+
180
+ if (s)
181
+ return rb_str_new2(s);
182
+
183
+ unsigned int uc = c;
184
+ char buf[32];
185
+
186
+ if (uc >= 32 && uc < 256) {
187
+ buf[0] = (char)c;
188
+ buf[1] = '\0';
189
+ } else {
190
+ sprintf(buf, "\\%u", uc);
191
+ }
192
+ return rb_str_new2(buf);
193
+ }
194
+
195
+ static VALUE rb_ary_push_pair(VALUE ary, VALUE a, VALUE b)
196
+ {
197
+ VALUE pair = rb_ary_new();
198
+ rb_ary_push(pair, a);
199
+ rb_ary_push(pair, b);
200
+ rb_ary_push(ary, pair);
201
+ return ary;
202
+ }
203
+
204
+ static void _regular_transducer_generate(Transducer *t, Node *node,
205
+ Node2Int &visitations, VALUE a, int mode, bool epsilons)
206
+ {
207
+ if (node->is_final())
208
+ rb_yield(a);
209
+
210
+ visitations[node]++;
211
+
212
+ vector<Arc*> arc;
213
+ for (ArcsIter p(node->arcs()); p; p++) {
214
+ Arc *a = p;
215
+ Node *n = a->target_node();
216
+ size_t i;
217
+ for (i = 0; i < arc.size(); i++)
218
+ if (visitations[n] < visitations[arc[i]->target_node()])
219
+ break;
220
+ arc.push_back(NULL);
221
+ for (size_t k = arc.size() - 1; k > i; k--)
222
+ arc[k] = arc[k - 1];
223
+ arc[i] = a;
224
+ }
225
+
226
+ for (size_t i = 0; i < arc.size(); i++) {
227
+ Label l = arc[i]->label();
228
+ VALUE lower, upper;
229
+
230
+ Character lc = l.lower_char();
231
+ if ((mode == BOTH || mode == LOWER) && (epsilons || lc != Label::epsilon)) {
232
+ lower = _alphabet_to_rb_str(&(t->alphabet), lc);
233
+ } else
234
+ lower = Qnil;
235
+
236
+ Character uc = l.upper_char();
237
+ if ((mode == BOTH || mode == UPPER) && (epsilons || uc != Label::epsilon)) {
238
+ upper = _alphabet_to_rb_str(&(t->alphabet), uc);
239
+ } else
240
+ upper = Qnil;
241
+
242
+ switch (mode) {
243
+ case BOTH:
244
+ rb_ary_push_pair(a, lower, upper);
245
+ break;
246
+
247
+ case UPPER:
248
+ rb_ary_push(a, upper);
249
+ break;
250
+
251
+ case LOWER:
252
+ rb_ary_push(a, lower);
253
+ break;
254
+ }
255
+
256
+ _regular_transducer_generate(t, arc[i]->target_node(), visitations, a, mode, epsilons);
257
+
258
+ rb_ary_pop(a);
259
+ }
260
+ }
261
+
262
+ static VALUE regular_transducer_generate_language(VALUE self, VALUE levels_arg, VALUE mode_arg)
263
+ {
264
+ Transducer *t;
265
+ Data_Get_Struct(self, Transducer, t);
266
+
267
+ static ID id_upper = rb_intern("upper");
268
+ static ID id_lower = rb_intern("lower");
269
+ static ID id_both = rb_intern("both");
270
+ static ID id_noepsilons = rb_intern("noepsilons");
271
+ static ID id_all = rb_intern("all");
272
+
273
+ int levels;
274
+ bool epsilons;
275
+
276
+ Check_Type(levels_arg, T_SYMBOL);
277
+
278
+ if (SYM2ID(levels_arg) == id_upper)
279
+ levels = UPPER;
280
+ else if (SYM2ID(levels_arg) == id_lower)
281
+ levels = LOWER;
282
+ else if (SYM2ID(levels_arg) == id_both)
283
+ levels = BOTH;
284
+ else
285
+ rb_raise(rb_eRuntimeError, "invalid levels");
286
+
287
+ Check_Type(mode_arg, T_SYMBOL);
288
+
289
+ if (SYM2ID(mode_arg) == id_noepsilons)
290
+ epsilons = false;
291
+ else if (SYM2ID(mode_arg) == id_all)
292
+ epsilons = true;
293
+ else
294
+ rb_raise(rb_eRuntimeError, "invalid mode");
295
+
296
+ if (!rb_block_given_p())
297
+ rb_raise(rb_eRuntimeError, "block expected");
298
+
299
+ Node2Int visitations;
300
+ Transducer *a2;
301
+ switch (levels) {
302
+ case UPPER:
303
+ a2 = &(t->upper_level().minimise());
304
+ break;
305
+ case LOWER:
306
+ a2 = &(t->lower_level().minimise());
307
+ break;
308
+ default:
309
+ a2 = t;
310
+ break;
311
+ }
312
+ _regular_transducer_generate(a2, a2->root_node(), visitations, rb_ary_new(),
313
+ levels, epsilons);
314
+
315
+ return Qnil;
316
+ }
317
+
318
+ static bool _regular_transducer_yield(Transducer *t, Node *node, VALUE result_array)
319
+ {
320
+ int accepted = 0;
321
+
322
+ if (node->was_visited(t->vmark)) {
323
+ if (node->forward() != NULL) { // cycle detected
324
+ // FIXME: How is this best handled?
325
+ //cerr << "Warning: cyclic analyses (cycle aborted)\n";
326
+ }
327
+ node->set_forward(node); // used like a flag for loop detection
328
+ }
329
+
330
+ if (node->is_final()) {
331
+ if (rb_block_given_p())
332
+ rb_yield(result_array);
333
+
334
+ accepted = 1;
335
+ }
336
+
337
+ for (ArcsIter i(node->arcs()); i; i++) {
338
+ Arc *arc = i;
339
+ Label l = arc->label();
340
+
341
+ rb_ary_push(result_array, rb_str_new2(t->alphabet.write_label(l)));
342
+
343
+ accepted |= _regular_transducer_yield(t, arc->target_node(), result_array);
344
+
345
+ rb_ary_pop(result_array);
346
+ }
347
+
348
+ node->set_forward(NULL);
349
+
350
+ return accepted == 1 ? true : false;
351
+ }
352
+
353
+ static VALUE _regular_transducer_analyze_or_generate(Transducer *t, VALUE string, bool generate)
354
+ {
355
+ Transducer *a2, *a3;
356
+ Transducer a1(RSTRING(string)->ptr, &(t->alphabet), false);
357
+ if (generate) {
358
+ a2 = &(a1 || *t);
359
+ a3 = &(a2->upper_level());
360
+ } else {
361
+ a2 = &(*t || a1);
362
+ a3 = &(a2->lower_level());
363
+ }
364
+ delete a2;
365
+ a2 = &a3->minimise();
366
+ delete a3;
367
+
368
+ a2->alphabet.copy(t->alphabet);
369
+ t->incr_vmark();
370
+ bool accepted = _regular_transducer_yield(a2, a2->root_node(), rb_ary_new());
371
+ delete a2;
372
+
373
+ return accepted ? Qtrue : Qfalse;
374
+ }
375
+
376
+ static VALUE regular_transducer_generate(VALUE self, VALUE string)
377
+ {
378
+ Transducer *t;
379
+ Check_Type(string, T_STRING);
380
+ Data_Get_Struct(self, Transducer, t);
381
+ return _regular_transducer_analyze_or_generate(t, string, true);
382
+ }
383
+
384
+ static VALUE regular_transducer_analyze(VALUE self, VALUE string)
385
+ {
386
+ Transducer *t;
387
+ Check_Type(string, T_STRING);
388
+ Data_Get_Struct(self, Transducer, t);
389
+ return _regular_transducer_analyze_or_generate(t, string, false);
390
+ }
391
+
392
+ extern "C"
393
+
394
+ void Init_sfst_machine(void)
395
+ {
396
+ mSFST = rb_define_module("SFST");
397
+ rb_define_module_function(mSFST, "_compile_regular", (VALUE (*)(...))compile_regular, 2);
398
+ rb_define_module_function(mSFST, "_compile_compact", (VALUE (*)(...))compile_compact, 2);
399
+
400
+ mCompactTransducer = rb_define_class_under(mSFST, "CompactTransducerMachine", rb_cObject);
401
+ rb_define_alloc_func(mCompactTransducer, compact_transducer_alloc);
402
+ rb_define_method(mCompactTransducer, "initialize", (VALUE (*)(...))compact_transducer_init, 1);
403
+ rb_define_method(mCompactTransducer, "_analyze", (VALUE (*)(...))compact_transducer_analyze, 1);
404
+
405
+ mRegularTransducer = rb_define_class_under(mSFST, "RegularTransducerMachine", rb_cObject);
406
+ rb_define_alloc_func(mRegularTransducer, regular_transducer_alloc);
407
+ rb_define_method(mRegularTransducer, "initialize", (VALUE (*)(...))regular_transducer_init, 1);
408
+ rb_define_method(mRegularTransducer, "_generate_language", (VALUE (*)(...))regular_transducer_generate_language, 2);
409
+ rb_define_method(mRegularTransducer, "_analyze", (VALUE (*)(...))regular_transducer_analyze, 1);
410
+ rb_define_method(mRegularTransducer, "_generate", (VALUE (*)(...))regular_transducer_generate, 1);
411
+ }