ruby-sfst 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +1 -0
- data/Manifest +31 -0
- data/README.rdoc +25 -0
- data/Rakefile +22 -0
- data/ext/sfst_machine/alphabet.C +807 -0
- data/ext/sfst_machine/alphabet.h +281 -0
- data/ext/sfst_machine/basic.C +84 -0
- data/ext/sfst_machine/basic.h +24 -0
- data/ext/sfst_machine/compact.C +616 -0
- data/ext/sfst_machine/compact.h +98 -0
- data/ext/sfst_machine/determinise.C +304 -0
- data/ext/sfst_machine/extconf.rb +4 -0
- data/ext/sfst_machine/fst-compiler.C +2375 -0
- data/ext/sfst_machine/fst-compiler.h +113 -0
- data/ext/sfst_machine/fst-compiler.yy +213 -0
- data/ext/sfst_machine/fst.C +966 -0
- data/ext/sfst_machine/fst.h +365 -0
- data/ext/sfst_machine/interface.C +1838 -0
- data/ext/sfst_machine/interface.h +94 -0
- data/ext/sfst_machine/make-compact.C +328 -0
- data/ext/sfst_machine/make-compact.h +34 -0
- data/ext/sfst_machine/mem.h +74 -0
- data/ext/sfst_machine/operators.C +1131 -0
- data/ext/sfst_machine/sfst_machine.cc +411 -0
- data/ext/sfst_machine/utf8-scanner.C +2197 -0
- data/ext/sfst_machine/utf8-scanner.ll +179 -0
- data/ext/sfst_machine/utf8.C +146 -0
- data/ext/sfst_machine/utf8.h +19 -0
- data/lib/sfst.rb +99 -0
- data/ruby-sfst.gemspec +34 -0
- data/test/test_sfst.fst +3 -0
- data/test/test_sfst.rb +119 -0
- metadata +100 -0
@@ -0,0 +1,411 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include "compact.h"
|
3
|
+
#include "fst.h"
|
4
|
+
#include "interface.h"
|
5
|
+
#include "make-compact.h"
|
6
|
+
|
7
|
+
/*:enddoc:*/
|
8
|
+
extern Transducer *Result;
|
9
|
+
extern FILE *yyin;
|
10
|
+
int yyparse(void);
|
11
|
+
|
12
|
+
VALUE mSFST = Qnil;
|
13
|
+
VALUE mCompactTransducer = Qnil;
|
14
|
+
VALUE mRegularTransducer = Qnil;
|
15
|
+
|
16
|
+
/*
 * Compiles the SFST grammar stored in from_filename and writes the resulting
 * transducer to to_filename, in compact form when compact is true.
 *
 * Returns nil. Raises RuntimeError when the grammar file or the output file
 * cannot be opened, when parsing leaves no result behind, or when the SFST
 * code throws a message (messages longer than the local buffer are truncated).
 *
 * rb_raise() leaves this function through a non-local jump, so failures that
 * happen inside the try block are only recorded there. The files and the
 * parse result are released first and the Ruby exception is raised last.
 */
static VALUE compile(char *from_filename, char *to_filename, bool compact) // :nodoc:
{
  enum { COMPILE_OK, COMPILE_NO_RESULT, COMPILE_NO_OUTPUT, COMPILE_SFST_ERROR } outcome = COMPILE_OK;
  char message[1024];
  FILE *out_file = NULL;
  FILE *in_file = fopen(from_filename, "rb");

  if (!in_file) {
    rb_raise(rb_eRuntimeError, "Unable to open grammar file %s", from_filename);
  }

  FileName = from_filename;
  Result = NULL;
  TheAlphabet.utf8 = UTF8;
  yyin = in_file;

  try {
    yyparse();

    // The grammar has been read completely; close it before the output file
    // is opened, exactly as before.
    fclose(in_file);
    in_file = NULL;

    if (!Result) {
      // The parser produced nothing; dereferencing Result would crash.
      outcome = COMPILE_NO_RESULT;
    } else if (!(out_file = fopen(to_filename, "wb"))) {
      outcome = COMPILE_NO_OUTPUT;
    } else {
      Result->alphabet.utf8 = UTF8;

      if (compact) {
        MakeCompactTransducer ca(*Result);
        delete Result;
        Result = NULL;
        ca.store(out_file);
      } else {
        Result->store(out_file);
      }
    }
  }
  catch (const char *p) {
    // Copy the text so it can be reported after cleanup, and never use it as
    // a format string.
    snprintf(message, sizeof message, "%s", p);
    outcome = COMPILE_SFST_ERROR;
  }

  // Cleanup shared by every path out of the try block.
  if (in_file)
    fclose(in_file);
  if (out_file)
    fclose(out_file);
  delete Result;
  Result = NULL;

  switch (outcome) {
    case COMPILE_NO_RESULT:
      rb_raise(rb_eRuntimeError, "Unable to compile grammar file %s", from_filename);
      break;

    case COMPILE_NO_OUTPUT:
      rb_raise(rb_eRuntimeError, "Unable to open output file %s", to_filename);
      break;

    case COMPILE_SFST_ERROR:
      rb_raise(rb_eRuntimeError, "%s", message);
      break;

    default:
      break;
  }

  return Qnil;
}
|
56
|
+
|
57
|
+
/*
 * SFST._compile_regular(from_filename, to_filename)
 *
 * Compiles a grammar file into a regular (non-compact) transducer file by
 * delegating to compile(). Both arguments are converted with
 * StringValueCStr(), so a non-string argument raises TypeError and a string
 * containing a NUL byte raises ArgumentError instead of being read through an
 * unchecked RSTRING() access.
 */
static VALUE compile_regular(VALUE obj, VALUE from_filename, VALUE to_filename)
{
  char *source_path = StringValueCStr(from_filename);
  char *target_path = StringValueCStr(to_filename);

  return compile(source_path, target_path, false);
}
|
61
|
+
|
62
|
+
/*
 * SFST._compile_compact(from_filename, to_filename)
 *
 * Compiles a grammar file into a compact transducer file by delegating to
 * compile(). Both arguments are converted with StringValueCStr(), so a
 * non-string argument raises TypeError and a string containing a NUL byte
 * raises ArgumentError instead of being read through an unchecked RSTRING()
 * access.
 */
static VALUE compile_compact(VALUE obj, VALUE from_filename, VALUE to_filename)
{
  char *source_path = StringValueCStr(from_filename);
  char *target_path = StringValueCStr(to_filename);

  return compile(source_path, target_path, true);
}
|
66
|
+
|
67
|
+
/* Free callback registered by compact_transducer_alloc(): destroys the
 * wrapped CompactTransducer, if there is one. */
static void compact_transducer_free(CompactTransducer *t)
{
  // delete on a null pointer does nothing, so no explicit guard is needed
  delete t;
}
|
72
|
+
|
73
|
+
/* Mark callback registered by compact_transducer_alloc(); it marks nothing. */
static void compact_transducer_mark(CompactTransducer *t)
{
  (void)t;
}
|
76
|
+
|
77
|
+
/* Allocator for CompactTransducerMachine: wraps a null pointer that
 * compact_transducer_init() later replaces with a loaded transducer. */
static VALUE compact_transducer_alloc(VALUE klass)
{
  return Data_Wrap_Struct(klass, compact_transducer_mark, compact_transducer_free, NULL);
}
|
83
|
+
|
84
|
+
/*
 * CompactTransducerMachine#initialize(filename)
 *
 * Loads a compact transducer from filename and attaches it to obj, replacing
 * (and destroying) any transducer attached by an earlier call.
 *
 * Returns nil. Raises TypeError/ArgumentError when filename is not usable as
 * a C string, and RuntimeError when the file cannot be opened or the SFST
 * loader throws a message.
 */
static VALUE compact_transducer_init(VALUE obj, VALUE filename)
{
  char *path = StringValueCStr(filename);
  FILE *file = fopen(path, "rb");

  if (!file) {
    rb_raise(rb_eRuntimeError, "Unable to open SFST file %s", path);
  }

  CompactTransducer *t = NULL;
  char message[1024];
  message[0] = '\0';

  try {
    t = new CompactTransducer(file);
  }
  catch (const char *p) {
    // Only record the text here; the Ruby exception is raised once the
    // handler has been left and the file has been closed.
    snprintf(message, sizeof message, "%s", p);
  }

  // Close the file whether or not loading succeeded.
  fclose(file);

  if (!t) {
    rb_raise(rb_eRuntimeError, "Unable to open SFST file %s: %s", path, message);
  }

  // Do not leak a transducer left over from a previous initialize call.
  delete static_cast<CompactTransducer *>(DATA_PTR(obj));
  DATA_PTR(obj) = t;
  return Qnil;
}
|
106
|
+
|
107
|
+
/*
 * CompactTransducerMachine#_analyze(string)
 *
 * Analyses string with the wrapped compact transducer. With a block, every
 * analysis is yielded as a Ruby string. Returns true when at least one
 * analysis exists and false otherwise.
 *
 * Raises TypeError when string is not a String, ArgumentError when it
 * contains a NUL byte, and RuntimeError when the object has no transducer
 * attached.
 */
static VALUE compact_transducer_analyze(VALUE self, VALUE string)
{
  CompactTransducer *t;

  Check_Type(string, T_STRING);

  Data_Get_Struct(self, CompactTransducer, t);

  if (!t) {
    // Allocated but never initialised: there is nothing to analyse with.
    rb_raise(rb_eRuntimeError, "transducer not initialized");
  }

  char *input = StringValueCStr(string);
  const bool block_given = rb_block_given_p() ? true : false;
  VALUE results = block_given ? rb_ary_new() : Qnil;
  long count = 0;

  {
    // The vector lives only in this scope, so it has been destroyed before
    // any block code runs; a break or exception in the block can no longer
    // jump over its destructor.
    std::vector<CAnalysis> analyses;
    t->analyze_string(input, analyses);
    count = (long)analyses.size();

    if (block_given) {
      for (size_t k = 0; k < analyses.size(); k++)
        rb_ary_push(results, rb_str_new2(t->print_analysis(analyses[k])));
    }
  }

  if (block_given) {
    for (long k = 0; k < count; k++)
      rb_yield(rb_ary_entry(results, k));
  }

  return count > 0 ? Qtrue : Qfalse;
}
|
131
|
+
|
132
|
+
/* Free callback registered by regular_transducer_alloc(): destroys the
 * wrapped Transducer, if there is one. */
static void regular_transducer_free(Transducer *t)
{
  // delete on a null pointer does nothing, so no explicit guard is needed
  delete t;
}
|
137
|
+
|
138
|
+
/* Mark callback registered by regular_transducer_alloc(); it marks nothing. */
static void regular_transducer_mark(Transducer *t)
{
  (void)t;
}
|
141
|
+
|
142
|
+
/* Allocator for RegularTransducerMachine: wraps a null pointer that
 * regular_transducer_init() later replaces with a loaded transducer. */
static VALUE regular_transducer_alloc(VALUE klass)
{
  return Data_Wrap_Struct(klass, regular_transducer_mark, regular_transducer_free, NULL);
}
|
148
|
+
|
149
|
+
/*
 * RegularTransducerMachine#initialize(filename)
 *
 * Loads a transducer from filename and attaches it to obj, replacing (and
 * destroying) any transducer attached by an earlier call.
 *
 * Returns nil. Raises TypeError/ArgumentError when filename is not usable as
 * a C string, and RuntimeError when the file cannot be opened or the SFST
 * loader throws a message.
 */
static VALUE regular_transducer_init(VALUE obj, VALUE filename)
{
  char *path = StringValueCStr(filename);
  FILE *file = fopen(path, "rb");

  if (!file) {
    rb_raise(rb_eRuntimeError, "Unable to open SFST file %s", path);
  }

  Transducer *t = NULL;
  char message[1024];
  message[0] = '\0';

  try {
    t = new Transducer(file);
  }
  catch (const char *p) {
    // Only record the text here; the Ruby exception is raised once the
    // handler has been left and the file has been closed.
    snprintf(message, sizeof message, "%s", p);
  }

  // Close the file whether or not loading succeeded.
  fclose(file);

  if (!t) {
    rb_raise(rb_eRuntimeError, "Unable to open SFST file %s: %s", path, message);
  }

  // Do not leak a transducer left over from a previous initialize call.
  delete static_cast<Transducer *>(DATA_PTR(obj));
  DATA_PTR(obj) = t;
  return Qnil;
}
|
171
|
+
|
172
|
+
using std::vector;

/* Selects which side(s) of an arc label the language generator reports. */
enum {
  BOTH = 0,
  LOWER = 1,
  UPPER = 2
};
|
175
|
+
|
176
|
+
/*
 * Converts the character code c into a Ruby string.
 *
 * The alphabet's symbol for c is used when code2symbol() returns one.
 * Otherwise codes 32..255 become a one-character string and every other
 * code is rendered as a backslash followed by its decimal value.
 */
static VALUE _alphabet_to_rb_str(Alphabet *a, Character c)
{
  const char *symbol = a->code2symbol(c);

  if (symbol != NULL)
    return rb_str_new2(symbol);

  char text[32];
  const unsigned int code = c;

  if (code < 32 || code >= 256) {
    sprintf(text, "\\%u", code);
  } else {
    text[0] = (char)c;
    text[1] = '\0';
  }

  return rb_str_new2(text);
}
|
194
|
+
|
195
|
+
/* Appends the two-element array [a, b] to ary and returns ary. */
static VALUE rb_ary_push_pair(VALUE ary, VALUE a, VALUE b)
{
  rb_ary_push(ary, rb_assoc_new(a, b));
  return ary;
}
|
203
|
+
|
204
|
+
/*
 * Depth-first walk over the transducer starting at node, yielding the shared
 * array a to the Ruby block every time a final node is entered.
 *
 * Before descending, the outgoing arcs are ordered so that targets with fewer
 * recorded visits come first (ties keep their original order). For each arc
 * one element is pushed onto a, the target is visited recursively, and the
 * element is popped again. What is pushed depends on mode: a [lower, upper]
 * pair for BOTH, or the single upper/lower string. A side that is not
 * selected, or that is epsilon while epsilons is false, is represented by nil.
 *
 * visitations only influences the arc order; the recursion has no
 * termination check of its own.
 */
static void _regular_transducer_generate(Transducer *t, Node *node,
    Node2Int &visitations, VALUE a, int mode, bool epsilons)
{
  if (node->is_final())
    rb_yield(a);

  visitations[node]++;

  // Stable insertion of every arc by ascending visit count of its target.
  vector<Arc*> ordered;
  for (ArcsIter p(node->arcs()); p; p++) {
    Arc *candidate = p;
    Node *target = candidate->target_node();

    size_t slot = 0;
    while (slot < ordered.size() &&
           !(visitations[target] < visitations[ordered[slot]->target_node()]))
      slot++;

    ordered.insert(ordered.begin() + slot, candidate);
  }

  for (size_t i = 0; i < ordered.size(); i++) {
    Arc *current = ordered[i];
    Label l = current->label();

    Character lc = l.lower_char();
    VALUE lower = Qnil;
    if ((mode == BOTH || mode == LOWER) && (epsilons || lc != Label::epsilon))
      lower = _alphabet_to_rb_str(&(t->alphabet), lc);

    Character uc = l.upper_char();
    VALUE upper = Qnil;
    if ((mode == BOTH || mode == UPPER) && (epsilons || uc != Label::epsilon))
      upper = _alphabet_to_rb_str(&(t->alphabet), uc);

    if (mode == BOTH)
      rb_ary_push_pair(a, lower, upper);
    else if (mode == UPPER)
      rb_ary_push(a, upper);
    else if (mode == LOWER)
      rb_ary_push(a, lower);

    _regular_transducer_generate(t, current->target_node(), visitations, a, mode, epsilons);

    rb_ary_pop(a);
  }
}
|
261
|
+
|
262
|
+
/*
 * RegularTransducerMachine#_generate_language(levels, mode) { |path| ... }
 *
 * Walks the transducer and yields one array per path that reaches a final
 * node (see _regular_transducer_generate() for the element layout).
 *
 * levels: :upper, :lower or :both. For :upper and :lower the walk runs on a
 *         minimised projection of the transducer, for :both on the
 *         transducer itself.
 * mode:   :noepsilons reports epsilon sides as nil, :all reports them as
 *         symbols.
 *
 * Returns nil. Raises TypeError when an argument is not a Symbol and
 * RuntimeError for an unknown symbol, a missing block, or an object that has
 * no transducer attached.
 */
static VALUE regular_transducer_generate_language(VALUE self, VALUE levels_arg, VALUE mode_arg)
{
  Transducer *t;
  Data_Get_Struct(self, Transducer, t);

  static ID id_upper = rb_intern("upper");
  static ID id_lower = rb_intern("lower");
  static ID id_both = rb_intern("both");
  static ID id_noepsilons = rb_intern("noepsilons");
  static ID id_all = rb_intern("all");

  int levels;
  bool epsilons;

  Check_Type(levels_arg, T_SYMBOL);

  if (SYM2ID(levels_arg) == id_upper)
    levels = UPPER;
  else if (SYM2ID(levels_arg) == id_lower)
    levels = LOWER;
  else if (SYM2ID(levels_arg) == id_both)
    levels = BOTH;
  else
    rb_raise(rb_eRuntimeError, "invalid levels");

  Check_Type(mode_arg, T_SYMBOL);

  if (SYM2ID(mode_arg) == id_noepsilons)
    epsilons = false;
  else if (SYM2ID(mode_arg) == id_all)
    epsilons = true;
  else
    rb_raise(rb_eRuntimeError, "invalid mode");

  if (!rb_block_given_p())
    rb_raise(rb_eRuntimeError, "block expected");

  if (!t) {
    // Allocated but never initialised: there is nothing to walk.
    rb_raise(rb_eRuntimeError, "transducer not initialized");
  }

  // The projection and its minimised form are separate objects that this
  // function has to release, the same way
  // _regular_transducer_analyze_or_generate() releases its intermediates.
  Transducer *a2 = t;
  if (levels == UPPER || levels == LOWER) {
    Transducer *projection = (levels == UPPER) ? &(t->upper_level())
                                               : &(t->lower_level());
    a2 = &(projection->minimise());
    delete projection;
  }

  {
    Node2Int visitations;
    _regular_transducer_generate(a2, a2->root_node(), visitations, rb_ary_new(),
                                 levels, epsilons);
  }

  // NOTE: a break or exception inside the block leaves through a non-local
  // jump and still skips this cleanup.
  if (a2 != t)
    delete a2;

  return Qnil;
}
|
317
|
+
|
318
|
+
/*
 * Depth-first walk starting at node that yields result_array to the Ruby
 * block (when one was given) each time a final node is entered. While an arc
 * is being followed, its label text is the last element of result_array.
 *
 * Returns true when a final node was reached from node.
 *
 * A node's forward pointer is set to itself while a walk that re-entered it
 * is in progress and reset to NULL on the way out. Entering an
 * already-visited node whose flag is still set means the path has looped
 * back onto itself; that branch is abandoned so cyclic transducers do not
 * recurse without bound.
 */
static bool _regular_transducer_yield(Transducer *t, Node *node, VALUE result_array)
{
  bool accepted = false;

  if (node->was_visited(t->vmark)) {
    if (node->forward() != NULL) {
      // Cycle detected: stop here. The invocation that set the flag resets
      // it when it returns.
      return false;
    }
    node->set_forward(node); // used like a flag for loop detection
  }

  if (node->is_final()) {
    if (rb_block_given_p())
      rb_yield(result_array);

    accepted = true;
  }

  for (ArcsIter i(node->arcs()); i; i++) {
    Arc *arc = i;
    Label l = arc->label();

    rb_ary_push(result_array, rb_str_new2(t->alphabet.write_label(l)));

    if (_regular_transducer_yield(t, arc->target_node(), result_array))
      accepted = true;

    rb_ary_pop(result_array);
  }

  node->set_forward(NULL);

  return accepted;
}
|
352
|
+
|
353
|
+
/*
 * Shared implementation of #_analyze and #_generate.
 *
 * Builds a transducer for string over t's alphabet and composes it with t
 * (string first when generate is true, t first otherwise). The upper level
 * (generate) or lower level (analyze) of that composition is then minimised,
 * and every path of the result is yielded through
 * _regular_transducer_yield().
 *
 * Returns true when at least one path reached a final node, false otherwise.
 * Raises ArgumentError when string contains a NUL byte.
 */
static VALUE _regular_transducer_analyze_or_generate(Transducer *t, VALUE string, bool generate)
{
  char *input = StringValueCStr(string);
  Transducer *result;

  {
    // Keep the stack-allocated input transducer and the intermediates in
    // their own scope so they are gone before any Ruby block code runs.
    Transducer a1(input, &(t->alphabet), false);
    Transducer *composed = generate ? &(a1 || *t) : &(*t || a1);
    Transducer *projected = generate ? &(composed->upper_level())
                                     : &(composed->lower_level());
    delete composed;
    result = &(projected->minimise());
    delete projected;
  }

  result->alphabet.copy(t->alphabet);

  // The traversal below reads result's vmark, so that is the counter that
  // has to be advanced; advancing t's has no effect on this walk.
  result->incr_vmark();

  bool accepted = _regular_transducer_yield(result, result->root_node(), rb_ary_new());
  delete result;

  return accepted ? Qtrue : Qfalse;
}
|
375
|
+
|
376
|
+
/*
 * RegularTransducerMachine#_generate(string)
 *
 * Runs the wrapped transducer in generation direction on string; see
 * _regular_transducer_analyze_or_generate(). Raises TypeError when string is
 * not a String and RuntimeError when the object has no transducer attached.
 */
static VALUE regular_transducer_generate(VALUE self, VALUE string)
{
  Transducer *t;

  Check_Type(string, T_STRING);
  Data_Get_Struct(self, Transducer, t);

  if (!t) {
    // Allocated but never initialised: passing NULL on would crash.
    rb_raise(rb_eRuntimeError, "transducer not initialized");
  }

  return _regular_transducer_analyze_or_generate(t, string, true);
}
|
383
|
+
|
384
|
+
/*
 * RegularTransducerMachine#_analyze(string)
 *
 * Runs the wrapped transducer in analysis direction on string; see
 * _regular_transducer_analyze_or_generate(). Raises TypeError when string is
 * not a String and RuntimeError when the object has no transducer attached.
 */
static VALUE regular_transducer_analyze(VALUE self, VALUE string)
{
  Transducer *t;

  Check_Type(string, T_STRING);
  Data_Get_Struct(self, Transducer, t);

  if (!t) {
    // Allocated but never initialised: passing NULL on would crash.
    rb_raise(rb_eRuntimeError, "transducer not initialized");
  }

  return _regular_transducer_analyze_or_generate(t, string, false);
}
|
391
|
+
|
392
|
+
extern "C"
|
393
|
+
|
394
|
+
void Init_sfst_machine(void)
|
395
|
+
{
|
396
|
+
mSFST = rb_define_module("SFST");
|
397
|
+
rb_define_module_function(mSFST, "_compile_regular", (VALUE (*)(...))compile_regular, 2);
|
398
|
+
rb_define_module_function(mSFST, "_compile_compact", (VALUE (*)(...))compile_compact, 2);
|
399
|
+
|
400
|
+
mCompactTransducer = rb_define_class_under(mSFST, "CompactTransducerMachine", rb_cObject);
|
401
|
+
rb_define_alloc_func(mCompactTransducer, compact_transducer_alloc);
|
402
|
+
rb_define_method(mCompactTransducer, "initialize", (VALUE (*)(...))compact_transducer_init, 1);
|
403
|
+
rb_define_method(mCompactTransducer, "_analyze", (VALUE (*)(...))compact_transducer_analyze, 1);
|
404
|
+
|
405
|
+
mRegularTransducer = rb_define_class_under(mSFST, "RegularTransducerMachine", rb_cObject);
|
406
|
+
rb_define_alloc_func(mRegularTransducer, regular_transducer_alloc);
|
407
|
+
rb_define_method(mRegularTransducer, "initialize", (VALUE (*)(...))regular_transducer_init, 1);
|
408
|
+
rb_define_method(mRegularTransducer, "_generate_language", (VALUE (*)(...))regular_transducer_generate_language, 2);
|
409
|
+
rb_define_method(mRegularTransducer, "_analyze", (VALUE (*)(...))regular_transducer_analyze, 1);
|
410
|
+
rb_define_method(mRegularTransducer, "_generate", (VALUE (*)(...))regular_transducer_generate, 1);
|
411
|
+
}
|