ruby-sfst 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +1 -0
- data/Manifest +31 -0
- data/README.rdoc +25 -0
- data/Rakefile +22 -0
- data/ext/sfst_machine/alphabet.C +807 -0
- data/ext/sfst_machine/alphabet.h +281 -0
- data/ext/sfst_machine/basic.C +84 -0
- data/ext/sfst_machine/basic.h +24 -0
- data/ext/sfst_machine/compact.C +616 -0
- data/ext/sfst_machine/compact.h +98 -0
- data/ext/sfst_machine/determinise.C +304 -0
- data/ext/sfst_machine/extconf.rb +4 -0
- data/ext/sfst_machine/fst-compiler.C +2375 -0
- data/ext/sfst_machine/fst-compiler.h +113 -0
- data/ext/sfst_machine/fst-compiler.yy +213 -0
- data/ext/sfst_machine/fst.C +966 -0
- data/ext/sfst_machine/fst.h +365 -0
- data/ext/sfst_machine/interface.C +1838 -0
- data/ext/sfst_machine/interface.h +94 -0
- data/ext/sfst_machine/make-compact.C +328 -0
- data/ext/sfst_machine/make-compact.h +34 -0
- data/ext/sfst_machine/mem.h +74 -0
- data/ext/sfst_machine/operators.C +1131 -0
- data/ext/sfst_machine/sfst_machine.cc +411 -0
- data/ext/sfst_machine/utf8-scanner.C +2197 -0
- data/ext/sfst_machine/utf8-scanner.ll +179 -0
- data/ext/sfst_machine/utf8.C +146 -0
- data/ext/sfst_machine/utf8.h +19 -0
- data/lib/sfst.rb +99 -0
- data/ruby-sfst.gemspec +34 -0
- data/test/test_sfst.fst +3 -0
- data/test/test_sfst.rb +119 -0
- metadata +100 -0
@@ -0,0 +1,411 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include "compact.h"
|
3
|
+
#include "fst.h"
|
4
|
+
#include "interface.h"
|
5
|
+
#include "make-compact.h"
|
6
|
+
|
7
|
+
/*:enddoc:*/
|
8
|
+
extern Transducer *Result;
|
9
|
+
extern FILE *yyin;
|
10
|
+
int yyparse(void);
|
11
|
+
|
12
|
+
VALUE mSFST = Qnil;
|
13
|
+
VALUE mCompactTransducer = Qnil;
|
14
|
+
VALUE mRegularTransducer = Qnil;
|
15
|
+
|
16
|
+
/*
 * Parses the grammar in from_filename with the generated SFST parser and
 * stores the resulting transducer in to_filename, in compact format when
 * `compact` is true.
 *
 * Returns Qnil. Raises RuntimeError if either file cannot be opened, if the
 * parser leaves no result, or if the SFST code throws a `const char *`.
 *
 * rb_raise() leaves the function without unwinding the C++ stack, so every
 * failure inside the try block is only recorded in `message`; the exception
 * is raised at the very end, after all files are closed and the transducer
 * has been released.
 */
static VALUE compile(char *from_filename, char *to_filename, bool compact) // :nodoc:
{
  FILE *in_file = fopen(from_filename, "rb");
  FILE *out_file = NULL;
  char message[1024];
  bool failed = false;

  if (!in_file) {
    rb_raise(rb_eRuntimeError, "Unable to open grammar file %s", from_filename);
  }

  // Parser state shared with the generated scanner/parser.
  FileName = from_filename;
  Result = NULL;
  TheAlphabet.utf8 = UTF8;
  yyin = in_file;

  try {
    yyparse();

    fclose(in_file);
    in_file = NULL;

    if (!Result) {
      // The parser finished without producing a transducer.
      failed = true;
      snprintf(message, sizeof message, "Unable to compile grammar file %s", from_filename);
    } else if (!(out_file = fopen(to_filename, "wb"))) {
      failed = true;
      snprintf(message, sizeof message, "Unable to open output file %s", to_filename);
    } else {
      Result->alphabet.utf8 = UTF8;

      if (compact) {
        MakeCompactTransducer ca(*Result);
        delete Result;
        Result = NULL;
        ca.store(out_file);
      } else {
        Result->store(out_file);
      }
    }
  }
  catch (const char *p) {
    // Copy the text: it is passed as data, never as a format string.
    failed = true;
    snprintf(message, sizeof message, "%s", p);
  }

  if (in_file)
    fclose(in_file);
  if (out_file)
    fclose(out_file);

  delete Result;
  Result = NULL;

  if (failed)
    rb_raise(rb_eRuntimeError, "%s", message);

  return Qnil;
}
|
56
|
+
|
57
|
+
/*
 * SFST._compile_regular(from_filename, to_filename)
 *
 * Compiles a grammar file into a regular (non-compact) transducer file.
 * Both arguments are converted with StringValueCStr, so a non-String
 * argument raises TypeError instead of being read as a raw RString.
 */
static VALUE compile_regular(VALUE obj, VALUE from_filename, VALUE to_filename)
{
  char *source = StringValueCStr(from_filename);
  char *target = StringValueCStr(to_filename);

  return compile(source, target, false);
}
|
61
|
+
|
62
|
+
/*
 * SFST._compile_compact(from_filename, to_filename)
 *
 * Compiles a grammar file into a compact transducer file.
 * Both arguments are converted with StringValueCStr, so a non-String
 * argument raises TypeError instead of being read as a raw RString.
 */
static VALUE compile_compact(VALUE obj, VALUE from_filename, VALUE to_filename)
{
  char *source = StringValueCStr(from_filename);
  char *target = StringValueCStr(to_filename);

  return compile(source, target, true);
}
|
66
|
+
|
67
|
+
/* Free callback for wrapped CompactTransducer objects. */
static void compact_transducer_free(CompactTransducer *t)
{
  // Deleting a null pointer is a no-op, so no guard is required.
  delete t;
}
|
72
|
+
|
73
|
+
static void compact_transducer_mark(CompactTransducer *t)
|
74
|
+
{
|
75
|
+
}
|
76
|
+
|
77
|
+
/*
 * Allocator for CompactTransducerMachine: wraps a null pointer that
 * compact_transducer_init later replaces with the loaded transducer.
 */
static VALUE compact_transducer_alloc(VALUE klass)
{
  CompactTransducer *unloaded = NULL;
  VALUE wrapper = Data_Wrap_Struct(klass, compact_transducer_mark, compact_transducer_free, unloaded);

  return wrapper;
}
|
83
|
+
|
84
|
+
/*
 * CompactTransducerMachine#initialize(filename)
 *
 * Loads a compact transducer from the given file and stores it in the
 * wrapped data pointer. Returns Qnil.
 *
 * Raises TypeError if filename is not string-like, and RuntimeError if the
 * file cannot be opened or the CompactTransducer constructor throws a
 * `const char *`.
 *
 * The exception is raised only after the catch handler has been left and the
 * file closed, because rb_raise() does not unwind the C++ stack.
 */
static VALUE compact_transducer_init(VALUE obj, VALUE filename)
{
  char *path = StringValueCStr(filename);
  FILE *file = fopen(path, "rb");

  if (!file) {
    rb_raise(rb_eRuntimeError, "Unable to open SFST file %s", path);
  }

  CompactTransducer *t = NULL;
  char reason[512];
  reason[0] = '\0';

  try {
    t = new CompactTransducer(file);
  }
  catch (const char *p) {
    snprintf(reason, sizeof reason, "%s", p);
    t = NULL;
  }

  fclose(file);

  if (!t) {
    rb_raise(rb_eRuntimeError, "Unable to open SFST file %s: %s", path, reason);
  }

  // Release a transducer left over from an earlier initialize call.
  delete static_cast<CompactTransducer *>(DATA_PTR(obj));
  DATA_PTR(obj) = t;
  return Qnil;
}
|
106
|
+
|
107
|
+
/*
 * CompactTransducerMachine#_analyze(string) { |analysis| ... }
 *
 * Runs the compact transducer on `string`. Returns Qtrue if at least one
 * analysis was found, Qfalse otherwise. When a block is given, each analysis
 * is yielded as a String.
 *
 * Raises TypeError if `string` is not a String and RuntimeError if the
 * object was never initialized.
 *
 * All analyses are copied into a Ruby array before the first yield, so the
 * std::vector is already destroyed if the block exits non-locally (break or
 * exception), which would otherwise skip its destructor.
 */
static VALUE compact_transducer_analyze(VALUE self, VALUE string)
{
  CompactTransducer *t;

  Check_Type(string, T_STRING);

  Data_Get_Struct(self, CompactTransducer, t);
  if (!t) {
    rb_raise(rb_eRuntimeError, "transducer is not initialized");
  }

  char *input = StringValueCStr(string);
  const bool wants_results = rb_block_given_p() != 0;
  VALUE results = wants_results ? rb_ary_new() : Qnil;
  long count = 0;
  bool accepted = false;

  {
    std::vector<CAnalysis> analyses;
    t->analyze_string(input, analyses);

    accepted = !analyses.empty();

    // Without a block only the accepted flag matters.
    if (wants_results) {
      for (size_t k = 0; k < analyses.size(); k++) {
        rb_ary_push(results, rb_str_new2(t->print_analysis(analyses[k])));
        count++;
      }
    }
  }

  for (long i = 0; i < count; i++) {
    rb_yield(rb_ary_entry(results, i));
  }

  return accepted ? Qtrue : Qfalse;
}
|
131
|
+
|
132
|
+
/* Free callback for wrapped Transducer objects. */
static void regular_transducer_free(Transducer *t)
{
  // Deleting a null pointer is a no-op, so no guard is required.
  delete t;
}
|
137
|
+
|
138
|
+
static void regular_transducer_mark(Transducer *t)
|
139
|
+
{
|
140
|
+
}
|
141
|
+
|
142
|
+
/*
 * Allocator for RegularTransducerMachine: wraps a null pointer that
 * regular_transducer_init later replaces with the loaded transducer.
 */
static VALUE regular_transducer_alloc(VALUE klass)
{
  Transducer *unloaded = NULL;
  VALUE wrapper = Data_Wrap_Struct(klass, regular_transducer_mark, regular_transducer_free, unloaded);

  return wrapper;
}
|
148
|
+
|
149
|
+
/*
 * RegularTransducerMachine#initialize(filename)
 *
 * Loads a transducer from the given file and stores it in the wrapped data
 * pointer. Returns Qnil.
 *
 * Raises TypeError if filename is not string-like, and RuntimeError if the
 * file cannot be opened or the Transducer constructor throws a
 * `const char *`.
 *
 * The exception is raised only after the catch handler has been left and the
 * file closed, because rb_raise() does not unwind the C++ stack.
 */
static VALUE regular_transducer_init(VALUE obj, VALUE filename)
{
  char *path = StringValueCStr(filename);
  FILE *file = fopen(path, "rb");

  if (!file) {
    rb_raise(rb_eRuntimeError, "Unable to open SFST file %s", path);
  }

  Transducer *t = NULL;
  char reason[512];
  reason[0] = '\0';

  try {
    t = new Transducer(file);
  }
  catch (const char *p) {
    snprintf(reason, sizeof reason, "%s", p);
    t = NULL;
  }

  fclose(file);

  if (!t) {
    rb_raise(rb_eRuntimeError, "Unable to open SFST file %s: %s", path, reason);
  }

  // Release a transducer left over from an earlier initialize call.
  delete static_cast<Transducer *>(DATA_PTR(obj));
  DATA_PTR(obj) = t;
  return Qnil;
}
|
171
|
+
|
172
|
+
using std::vector;
|
173
|
+
|
174
|
+
// Selects which side(s) of each arc label the language generator emits:
// BOTH pushes [lower, upper] pairs, LOWER/UPPER push a single symbol.
enum { BOTH, LOWER, UPPER };
|
175
|
+
|
176
|
+
/*
 * Converts character code `c` to a Ruby String.
 *
 * If the alphabet has a symbol for the code, that symbol is returned.
 * Otherwise codes in [32, 256) become a one-byte string and every other
 * code is written as a backslash followed by its decimal value.
 */
static VALUE _alphabet_to_rb_str(Alphabet *a, Character c)
{
  if (const char *symbol = a->code2symbol(c))
    return rb_str_new2(symbol);

  char text[32];
  const unsigned int code = c;
  const bool single_byte = (code >= 32 && code < 256);

  if (single_byte) {
    text[0] = (char)c;
    text[1] = '\0';
  } else {
    sprintf(text, "\\%u", code);
  }

  return rb_str_new2(text);
}
|
194
|
+
|
195
|
+
/* Appends the two-element array [a, b] to `ary` and returns `ary`. */
static VALUE rb_ary_push_pair(VALUE ary, VALUE a, VALUE b)
{
  rb_ary_push(ary, rb_assoc_new(a, b));
  return ary;
}
|
203
|
+
|
204
|
+
/*
 * Depth-first walk from `node` that yields the shared path array `a` every
 * time a final node is entered.
 *
 * For each outgoing arc the selected label side(s) are pushed onto `a`
 * before descending and popped afterwards, so the block always receives the
 * same Array object holding the current path. Arcs are followed in ascending
 * order of how often their target has been entered so far (ties keep arc
 * order). A side not selected by `mode`, or an epsilon when `epsilons` is
 * false, is represented by nil.
 *
 * The walk has no termination check of its own for cyclic transducers.
 */
static void _regular_transducer_generate(Transducer *t, Node *node,
  Node2Int &visitations, VALUE a, int mode, bool epsilons)
{
  if (node->is_final())
    rb_yield(a);

  visitations[node]++;

  // Stable insertion sort of the outgoing arcs by target visit count.
  vector<Arc*> ordered;
  for (ArcsIter p(node->arcs()); p; p++) {
    Arc *candidate = p;
    Node *target = candidate->target_node();
    size_t pos = 0;

    while (pos < ordered.size() &&
           !(visitations[target] < visitations[ordered[pos]->target_node()]))
      pos++;

    ordered.insert(ordered.begin() + pos, candidate);
  }

  const bool want_lower = (mode == BOTH || mode == LOWER);
  const bool want_upper = (mode == BOTH || mode == UPPER);

  for (size_t idx = 0; idx < ordered.size(); idx++) {
    Arc *current = ordered[idx];
    Label l = current->label();

    VALUE lower = Qnil;
    Character lc = l.lower_char();
    if (want_lower && (epsilons || lc != Label::epsilon))
      lower = _alphabet_to_rb_str(&(t->alphabet), lc);

    VALUE upper = Qnil;
    Character uc = l.upper_char();
    if (want_upper && (epsilons || uc != Label::epsilon))
      upper = _alphabet_to_rb_str(&(t->alphabet), uc);

    if (mode == BOTH)
      rb_ary_push_pair(a, lower, upper);
    else if (mode == UPPER)
      rb_ary_push(a, upper);
    else if (mode == LOWER)
      rb_ary_push(a, lower);

    _regular_transducer_generate(t, current->target_node(), visitations, a, mode, epsilons);

    rb_ary_pop(a);
  }
}
|
261
|
+
|
262
|
+
/*
 * RegularTransducerMachine#_generate_language(levels, mode) { |path| ... }
 *
 * Enumerates the paths of the transducer and yields each one to the block.
 *
 *   levels  :upper, :lower or :both - which side(s) of the labels to emit.
 *   mode    :noepsilons or :all - whether epsilon symbols are emitted.
 *
 * Returns Qnil. Raises TypeError if an argument is not a Symbol and
 * RuntimeError for an unknown symbol, a missing block, or an uninitialized
 * object.
 *
 * For :upper and :lower a minimised projection is built; both the
 * intermediate projection and the minimised copy are deleted here once
 * generation finishes normally. If the block exits non-locally the
 * minimised copy is still not released.
 */
static VALUE regular_transducer_generate_language(VALUE self, VALUE levels_arg, VALUE mode_arg)
{
  Transducer *t;
  Data_Get_Struct(self, Transducer, t);

  if (!t)
    rb_raise(rb_eRuntimeError, "transducer is not initialized");

  static ID id_upper = rb_intern("upper");
  static ID id_lower = rb_intern("lower");
  static ID id_both = rb_intern("both");
  static ID id_noepsilons = rb_intern("noepsilons");
  static ID id_all = rb_intern("all");

  int levels = BOTH;
  bool epsilons = false;

  Check_Type(levels_arg, T_SYMBOL);

  if (SYM2ID(levels_arg) == id_upper)
    levels = UPPER;
  else if (SYM2ID(levels_arg) == id_lower)
    levels = LOWER;
  else if (SYM2ID(levels_arg) == id_both)
    levels = BOTH;
  else
    rb_raise(rb_eRuntimeError, "invalid levels");

  Check_Type(mode_arg, T_SYMBOL);

  if (SYM2ID(mode_arg) == id_noepsilons)
    epsilons = false;
  else if (SYM2ID(mode_arg) == id_all)
    epsilons = true;
  else
    rb_raise(rb_eRuntimeError, "invalid mode");

  if (!rb_block_given_p())
    rb_raise(rb_eRuntimeError, "block expected");

  // :both walks the wrapped transducer itself; the other levels walk a
  // minimised projection that this function owns.
  Transducer *a2 = t;
  if (levels == UPPER || levels == LOWER) {
    Transducer *projection = (levels == UPPER) ? &(t->upper_level())
                                               : &(t->lower_level());
    a2 = &(projection->minimise());
    delete projection;
  }

  {
    Node2Int visitations;
    _regular_transducer_generate(a2, a2->root_node(), visitations, rb_ary_new(),
                                 levels, epsilons);
  }

  if (a2 != t)
    delete a2;

  return Qnil;
}
|
317
|
+
|
318
|
+
/*
 * Depth-first walk from `node` that yields `result_array` (when a block is
 * given) each time a final node is reached. Returns true if any final node
 * was reached from `node`.
 *
 * The label of every arc is pushed onto `result_array` before descending and
 * popped afterwards, so the block always receives the same Array object
 * holding the current path.
 *
 * A node's forward pointer is used as an "on the current path" flag. When an
 * already flagged node is reached again the path is cyclic; that branch is
 * abandoned instead of being followed without bound.
 */
static bool _regular_transducer_yield(Transducer *t, Node *node, VALUE result_array)
{
  if (node->was_visited(t->vmark)) {
    if (node->forward() != NULL) {
      // Cycle detected: stop here rather than recurse forever.
      return false;
    }
    node->set_forward(node); // used like a flag for loop detection
  }

  bool accepted = false;

  if (node->is_final()) {
    if (rb_block_given_p())
      rb_yield(result_array);

    accepted = true;
  }

  for (ArcsIter i(node->arcs()); i; i++) {
    Arc *arc = i;
    Label l = arc->label();

    rb_ary_push(result_array, rb_str_new2(t->alphabet.write_label(l)));

    if (_regular_transducer_yield(t, arc->target_node(), result_array))
      accepted = true;

    rb_ary_pop(result_array);
  }

  node->set_forward(NULL);

  return accepted;
}
|
352
|
+
|
353
|
+
/*
 * Shared implementation of _analyze and _generate.
 *
 * Builds a transducer for `string`, composes it with `t` (string || t when
 * `generate` is true, t || string otherwise), projects the composition to
 * its upper level (generate) or lower level (analyze), minimises the result
 * and yields every path of it via _regular_transducer_yield.
 *
 * Returns Qtrue if at least one path was found, Qfalse otherwise. Raises
 * RuntimeError if `t` is NULL, i.e. the object was never initialized.
 *
 * The string transducer lives in its own scope so that it is destroyed
 * before any block is called; a non-local exit from the block would skip the
 * destructor of a stack object. The visit mark is advanced on the minimised
 * transducer because that is the one whose vmark the traversal reads.
 */
static VALUE _regular_transducer_analyze_or_generate(Transducer *t, VALUE string, bool generate)
{
  if (!t)
    rb_raise(rb_eRuntimeError, "transducer is not initialized");

  char *input = StringValueCStr(string);
  Transducer *minimal = NULL;

  {
    Transducer a1(input, &(t->alphabet), false);
    Transducer *composed;
    Transducer *projected;

    if (generate) {
      composed = &(a1 || *t);
      projected = &(composed->upper_level());
    } else {
      composed = &(*t || a1);
      projected = &(composed->lower_level());
    }
    delete composed;

    minimal = &projected->minimise();
    delete projected;
  }

  minimal->alphabet.copy(t->alphabet);
  minimal->incr_vmark();
  bool accepted = _regular_transducer_yield(minimal, minimal->root_node(), rb_ary_new());
  delete minimal;

  return accepted ? Qtrue : Qfalse;
}
|
375
|
+
|
376
|
+
/*
 * RegularTransducerMachine#_generate(string)
 *
 * Type-checks `string`, unwraps the transducer and delegates to the shared
 * helper in generate mode.
 */
static VALUE regular_transducer_generate(VALUE self, VALUE string)
{
  Transducer *machine;
  const bool generate = true;

  Check_Type(string, T_STRING);
  Data_Get_Struct(self, Transducer, machine);

  return _regular_transducer_analyze_or_generate(machine, string, generate);
}
|
383
|
+
|
384
|
+
/*
 * RegularTransducerMachine#_analyze(string)
 *
 * Type-checks `string`, unwraps the transducer and delegates to the shared
 * helper in analyze mode.
 */
static VALUE regular_transducer_analyze(VALUE self, VALUE string)
{
  Transducer *machine;
  const bool generate = false;

  Check_Type(string, T_STRING);
  Data_Get_Struct(self, Transducer, machine);

  return _regular_transducer_analyze_or_generate(machine, string, generate);
}
|
391
|
+
|
392
|
+
extern "C"
|
393
|
+
|
394
|
+
void Init_sfst_machine(void)
|
395
|
+
{
|
396
|
+
mSFST = rb_define_module("SFST");
|
397
|
+
rb_define_module_function(mSFST, "_compile_regular", (VALUE (*)(...))compile_regular, 2);
|
398
|
+
rb_define_module_function(mSFST, "_compile_compact", (VALUE (*)(...))compile_compact, 2);
|
399
|
+
|
400
|
+
mCompactTransducer = rb_define_class_under(mSFST, "CompactTransducerMachine", rb_cObject);
|
401
|
+
rb_define_alloc_func(mCompactTransducer, compact_transducer_alloc);
|
402
|
+
rb_define_method(mCompactTransducer, "initialize", (VALUE (*)(...))compact_transducer_init, 1);
|
403
|
+
rb_define_method(mCompactTransducer, "_analyze", (VALUE (*)(...))compact_transducer_analyze, 1);
|
404
|
+
|
405
|
+
mRegularTransducer = rb_define_class_under(mSFST, "RegularTransducerMachine", rb_cObject);
|
406
|
+
rb_define_alloc_func(mRegularTransducer, regular_transducer_alloc);
|
407
|
+
rb_define_method(mRegularTransducer, "initialize", (VALUE (*)(...))regular_transducer_init, 1);
|
408
|
+
rb_define_method(mRegularTransducer, "_generate_language", (VALUE (*)(...))regular_transducer_generate_language, 2);
|
409
|
+
rb_define_method(mRegularTransducer, "_analyze", (VALUE (*)(...))regular_transducer_analyze, 1);
|
410
|
+
rb_define_method(mRegularTransducer, "_generate", (VALUE (*)(...))regular_transducer_generate, 1);
|
411
|
+
}
|