ruby-sfst 0.4.3 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +1 -0
  3. data/COPYING +280 -0
  4. data/Gemfile +3 -0
  5. data/Gemfile.lock +54 -0
  6. data/README.md +1 -1
  7. data/Rakefile +9 -18
  8. data/bin/console +7 -0
  9. data/bin/setup +6 -0
  10. data/ext/sfst/alphabet.cc +879 -0
  11. data/ext/sfst/alphabet.h +302 -0
  12. data/ext/sfst/basic.cc +85 -0
  13. data/ext/{sfst_machine → sfst}/basic.h +7 -4
  14. data/ext/sfst/compact.cc +629 -0
  15. data/ext/sfst/compact.h +100 -0
  16. data/ext/sfst/determinise.cc +279 -0
  17. data/ext/{sfst_machine → sfst}/extconf.rb +2 -1
  18. data/ext/sfst/fst.cc +1150 -0
  19. data/ext/sfst/fst.h +374 -0
  20. data/ext/sfst/hopcroft.cc +681 -0
  21. data/ext/sfst/interface.cc +1921 -0
  22. data/ext/sfst/interface.h +171 -0
  23. data/ext/sfst/make-compact.cc +323 -0
  24. data/ext/{sfst_machine → sfst}/make-compact.h +15 -13
  25. data/ext/sfst/mem.h +80 -0
  26. data/ext/sfst/operators.cc +1273 -0
  27. data/ext/{sfst_machine → sfst}/sfst_machine.cc +89 -78
  28. data/ext/sfst/sgi.h +72 -0
  29. data/ext/sfst/utf8.cc +149 -0
  30. data/ext/{sfst_machine → sfst}/utf8.h +7 -4
  31. data/lib/sfst.rb +2 -1
  32. data/lib/sfst/version.rb +1 -1
  33. data/ruby-sfst.gemspec +23 -23
  34. metadata +107 -35
  35. data/ext/sfst_machine/alphabet.cc +0 -812
  36. data/ext/sfst_machine/alphabet.h +0 -273
  37. data/ext/sfst_machine/basic.cc +0 -84
  38. data/ext/sfst_machine/compact.cc +0 -616
  39. data/ext/sfst_machine/compact.h +0 -98
  40. data/ext/sfst_machine/determinise.cc +0 -303
  41. data/ext/sfst_machine/fst.cc +0 -1000
  42. data/ext/sfst_machine/fst.h +0 -369
  43. data/ext/sfst_machine/interface.cc +0 -1842
  44. data/ext/sfst_machine/interface.h +0 -93
  45. data/ext/sfst_machine/make-compact.cc +0 -327
  46. data/ext/sfst_machine/mem.h +0 -74
  47. data/ext/sfst_machine/operators.cc +0 -1131
  48. data/ext/sfst_machine/sgi.h +0 -44
  49. data/ext/sfst_machine/utf8.cc +0 -146
  50. data/test/test_sfst.fst +0 -3
  51. data/test/test_sfst.rb +0 -114
@@ -14,19 +14,19 @@ VALUE mSFST = Qnil;
14
14
  VALUE mCompactTransducer = Qnil;
15
15
  VALUE mRegularTransducer = Qnil;
16
16
 
17
- static void compact_transducer_free(CompactTransducer *t)
17
+ static void compact_transducer_free(SFST::CompactTransducer *t)
18
18
  {
19
19
  if (t)
20
20
  delete t;
21
21
  }
22
22
 
23
- static void compact_transducer_mark(CompactTransducer *t)
23
+ static void compact_transducer_mark(SFST::CompactTransducer *t)
24
24
  {
25
25
  }
26
26
 
27
27
  static VALUE compact_transducer_alloc(VALUE klass)
28
28
  {
29
- CompactTransducer *t = NULL;
29
+ SFST::CompactTransducer *t = NULL;
30
30
 
31
31
  return Data_Wrap_Struct(klass, compact_transducer_mark, compact_transducer_free, t);
32
32
  }
@@ -34,7 +34,7 @@ static VALUE compact_transducer_alloc(VALUE klass)
34
34
  static VALUE compact_transducer_init(VALUE obj, VALUE filename)
35
35
  {
36
36
  FILE *file;
37
- CompactTransducer *t;
37
+ SFST::CompactTransducer *t;
38
38
 
39
39
  file = fopen(RSTRING_PTR(filename), "rb");
40
40
 
@@ -43,7 +43,7 @@ static VALUE compact_transducer_init(VALUE obj, VALUE filename)
43
43
  }
44
44
 
45
45
  try {
46
- t = new CompactTransducer(file);
46
+ t = new SFST::CompactTransducer(file);
47
47
  fclose(file);
48
48
  }
49
49
  catch (const char *p) {
@@ -57,13 +57,13 @@ static VALUE compact_transducer_init(VALUE obj, VALUE filename)
57
57
  static VALUE compact_transducer_analyze(VALUE self, VALUE string)
58
58
  {
59
59
  VALUE accepted = Qfalse;
60
- CompactTransducer *t;
60
+ SFST::CompactTransducer *t;
61
61
 
62
62
  Check_Type(string, T_STRING);
63
63
 
64
- Data_Get_Struct(self, CompactTransducer, t);
64
+ Data_Get_Struct(self, SFST::CompactTransducer, t);
65
65
 
66
- std::vector<CAnalysis> analyses;
66
+ std::vector<SFST::CAnalysis> analyses;
67
67
  t->analyze_string(RSTRING_PTR(string), analyses);
68
68
 
69
69
  for (size_t k = 0; k < analyses.size(); k++) {
@@ -79,19 +79,19 @@ static VALUE compact_transducer_analyze(VALUE self, VALUE string)
79
79
  return accepted;
80
80
  }
81
81
 
82
- static void regular_transducer_free(Transducer *t)
82
+ static void regular_transducer_free(SFST::Transducer *t)
83
83
  {
84
84
  if (t)
85
85
  delete t;
86
86
  }
87
87
 
88
- static void regular_transducer_mark(Transducer *t)
88
+ static void regular_transducer_mark(SFST::Transducer *t)
89
89
  {
90
90
  }
91
91
 
92
92
  static VALUE regular_transducer_alloc(VALUE klass)
93
93
  {
94
- Transducer *t = NULL;
94
+ SFST::Transducer *t = NULL;
95
95
 
96
96
  return Data_Wrap_Struct(klass, regular_transducer_mark, regular_transducer_free, t);
97
97
  }
@@ -99,7 +99,7 @@ static VALUE regular_transducer_alloc(VALUE klass)
99
99
  static VALUE regular_transducer_init(VALUE obj, VALUE filename)
100
100
  {
101
101
  FILE *file;
102
- Transducer *t;
102
+ SFST::Transducer *t;
103
103
 
104
104
  file = fopen(RSTRING_PTR(filename), "rb");
105
105
 
@@ -108,7 +108,7 @@ static VALUE regular_transducer_init(VALUE obj, VALUE filename)
108
108
  }
109
109
 
110
110
  try {
111
- t = new Transducer(file);
111
+ t = new SFST::Transducer(file);
112
112
  fclose(file);
113
113
  }
114
114
  catch (const char *p) {
@@ -123,7 +123,7 @@ using std::vector;
123
123
 
124
124
  enum { BOTH, LOWER, UPPER };
125
125
 
126
- static VALUE _alphabet_to_rb_str(Alphabet *a, Character c)
126
+ static VALUE _alphabet_to_rb_str(SFST::Alphabet *a, SFST::Character c)
127
127
  {
128
128
  const char *s = a->code2symbol(c);
129
129
 
@@ -151,68 +151,79 @@ static VALUE rb_ary_push_pair(VALUE ary, VALUE a, VALUE b)
151
151
  return ary;
152
152
  }
153
153
 
154
- static void _regular_transducer_generate(Transducer *t, Node *node,
155
- Node2Int &visitations, VALUE a, int mode, bool epsilons)
156
- {
157
- if (node->is_final())
158
- rb_yield(a);
159
-
160
- visitations[node]++;
161
-
162
- vector<Arc*> arc;
163
- for (ArcsIter p(node->arcs()); p; p++) {
164
- Arc *a = p;
165
- Node *n = a->target_node();
166
- size_t i;
167
- for (i = 0; i < arc.size(); i++)
168
- if (visitations[n] < visitations[arc[i]->target_node()])
169
- break;
170
- arc.push_back(NULL);
171
- for (size_t k = arc.size() - 1; k > i; k--)
172
- arc[k] = arc[k - 1];
173
- arc[i] = a;
174
- }
154
+ class Gen {
155
+ public:
175
156
 
176
- for (size_t i = 0; i < arc.size(); i++) {
177
- Label l = arc[i]->label();
178
- VALUE lower, upper;
157
+ SFST::Node *node;
158
+ SFST::Index previous;
159
+ SFST::Label label;
179
160
 
180
- Character lc = l.lower_char();
181
- if ((mode == BOTH || mode == LOWER) && (epsilons || lc != Label::epsilon)) {
182
- lower = _alphabet_to_rb_str(&(t->alphabet), lc);
183
- } else
184
- lower = Qnil;
161
+ Gen(SFST::Node *n, SFST::Label l = SFST::Label::epsilon, SFST::Index p = SFST::undef):
162
+ node(n), previous(p), label(l) {}
185
163
 
186
- Character uc = l.upper_char();
187
- if ((mode == BOTH || mode == UPPER) && (epsilons || uc != Label::epsilon)) {
188
- upper = _alphabet_to_rb_str(&(t->alphabet), uc);
189
- } else
190
- upper = Qnil;
164
+ void print(vector<Gen> &paths, VALUE a, int levels, bool epsilons, SFST::Transducer *t) {
165
+ if (previous != SFST::undef) {
166
+ paths[previous].print(paths, a, levels, epsilons, t);
167
+
168
+ SFST::Label l = label;
169
+
170
+ VALUE lower, upper;
171
+
172
+ SFST::Character lc = l.lower_char();
173
+ if ((levels == BOTH || levels == LOWER) && (epsilons || lc != SFST::Label::epsilon)) {
174
+ lower = _alphabet_to_rb_str(&(t->alphabet), lc);
175
+ } else
176
+ lower = Qnil;
177
+
178
+ SFST::Character uc = l.upper_char();
179
+ if ((levels == BOTH || levels == UPPER) && (epsilons || uc != SFST::Label::epsilon)) {
180
+ upper = _alphabet_to_rb_str(&(t->alphabet), uc);
181
+ } else
182
+ upper = Qnil;
191
183
 
192
- switch (mode) {
193
- case BOTH:
194
- rb_ary_push_pair(a, lower, upper);
195
- break;
184
+ switch (levels) {
185
+ case BOTH:
186
+ rb_ary_push_pair(a, lower, upper);
187
+ break;
196
188
 
197
- case UPPER:
198
- rb_ary_push(a, upper);
199
- break;
189
+ case UPPER:
190
+ rb_ary_push(a, upper);
191
+ break;
200
192
 
201
- case LOWER:
202
- rb_ary_push(a, lower);
203
- break;
193
+ case LOWER:
194
+ rb_ary_push(a, lower);
195
+ break;
196
+ }
204
197
  }
198
+ }
199
+ };
200
+
201
+ static void _generate(SFST::Transducer *t, int levels, bool epsilons)
202
+ {
203
+ vector<Gen> paths;
204
+ paths.push_back(Gen(t->root_node()));
205
+
206
+ for (size_t i = 0; i < paths.size(); i++) {
207
+ Gen &gen = paths[i];
208
+ SFST::Node *node = gen.node;
205
209
 
206
- _regular_transducer_generate(t, arc[i]->target_node(), visitations, a, mode, epsilons);
210
+ if (node->is_final()) {
211
+ VALUE a = rb_ary_new();
212
+ gen.print(paths, a, levels, epsilons, t);
213
+ rb_yield(a);
214
+ }
207
215
 
208
- rb_ary_pop(a);
216
+ for (SFST::ArcsIter p(node->arcs()); p; p++) {
217
+ SFST::Arc *arc = p;
218
+ paths.push_back(Gen(arc->target_node(), arc->label(), (SFST::Index)i));
219
+ }
209
220
  }
210
221
  }
211
222
 
212
223
  static VALUE regular_transducer_generate_language(VALUE self, VALUE levels_arg, VALUE mode_arg)
213
224
  {
214
- Transducer *t;
215
- Data_Get_Struct(self, Transducer, t);
225
+ SFST::Transducer *t;
226
+ Data_Get_Struct(self, SFST::Transducer, t);
216
227
 
217
228
  static ID id_upper = rb_intern("upper");
218
229
  static ID id_lower = rb_intern("lower");
@@ -246,8 +257,8 @@ static VALUE regular_transducer_generate_language(VALUE self, VALUE levels_arg,
246
257
  if (!rb_block_given_p())
247
258
  rb_raise(rb_eRuntimeError, "block expected");
248
259
 
249
- Node2Int visitations;
250
- Transducer *a2;
260
+ SFST::Transducer *a2;
261
+
251
262
  switch (levels) {
252
263
  case UPPER:
253
264
  a2 = &(t->upper_level().minimise());
@@ -259,13 +270,13 @@ static VALUE regular_transducer_generate_language(VALUE self, VALUE levels_arg,
259
270
  a2 = t;
260
271
  break;
261
272
  }
262
- _regular_transducer_generate(a2, a2->root_node(), visitations, rb_ary_new(),
263
- levels, epsilons);
273
+
274
+ _generate(a2, levels, epsilons);
264
275
 
265
276
  return Qnil;
266
277
  }
267
278
 
268
- static bool _regular_transducer_yield(Transducer *t, Node *node, VALUE result_array)
279
+ static bool _regular_transducer_yield(SFST::Transducer *t, SFST::Node *node, VALUE result_array)
269
280
  {
270
281
  int accepted = 0;
271
282
 
@@ -284,9 +295,9 @@ static bool _regular_transducer_yield(Transducer *t, Node *node, VALUE result_ar
284
295
  accepted = 1;
285
296
  }
286
297
 
287
- for (ArcsIter i(node->arcs()); i; i++) {
288
- Arc *arc = i;
289
- Label l = arc->label();
298
+ for (SFST::ArcsIter i(node->arcs()); i; i++) {
299
+ SFST::Arc *arc = i;
300
+ SFST::Label l = arc->label();
290
301
 
291
302
  rb_ary_push(result_array, rb_str_new2(t->alphabet.write_label(l)));
292
303
 
@@ -300,10 +311,10 @@ static bool _regular_transducer_yield(Transducer *t, Node *node, VALUE result_ar
300
311
  return accepted == 1 ? true : false;
301
312
  }
302
313
 
303
- static VALUE _regular_transducer_analyze_or_generate(Transducer *t, VALUE string, bool generate)
314
+ static VALUE _regular_transducer_analyze_or_generate(SFST::Transducer *t, VALUE string, bool generate)
304
315
  {
305
- Transducer *a2, *a3;
306
- Transducer a1(RSTRING_PTR(string), &(t->alphabet), false);
316
+ SFST::Transducer *a2, *a3;
317
+ SFST::Transducer a1(RSTRING_PTR(string), &(t->alphabet), false);
307
318
  if (generate) {
308
319
  a2 = &(a1 || *t);
309
320
  a3 = &(a2->upper_level());
@@ -324,23 +335,23 @@ static VALUE _regular_transducer_analyze_or_generate(Transducer *t, VALUE string
324
335
 
325
336
  static VALUE regular_transducer_generate(VALUE self, VALUE string)
326
337
  {
327
- Transducer *t;
338
+ SFST::Transducer *t;
328
339
  Check_Type(string, T_STRING);
329
- Data_Get_Struct(self, Transducer, t);
340
+ Data_Get_Struct(self, SFST::Transducer, t);
330
341
  return _regular_transducer_analyze_or_generate(t, string, true);
331
342
  }
332
343
 
333
344
  static VALUE regular_transducer_analyze(VALUE self, VALUE string)
334
345
  {
335
- Transducer *t;
346
+ SFST::Transducer *t;
336
347
  Check_Type(string, T_STRING);
337
- Data_Get_Struct(self, Transducer, t);
348
+ Data_Get_Struct(self, SFST::Transducer, t);
338
349
  return _regular_transducer_analyze_or_generate(t, string, false);
339
350
  }
340
351
 
341
352
  extern "C"
342
353
 
343
- void Init_sfst_machine(void)
354
+ void Init_sfst(void)
344
355
  {
345
356
  mSFST = rb_define_module("SFST");
346
357
 
@@ -0,0 +1,72 @@
1
+ /*******************************************************************/
2
+ /* */
3
+ /* File: sgi.h */
4
+ /* Author: Helmut Schmid */
5
+ /* Purpose: Portable access to hash_map, hash_set and hash */
6
+ /* Created: Thu Sep 11 15:58:25 2008 */
7
+ /* Modified: Fri Sep 12 08:17:03 2008 (schmid) */
8
+ /* Modified: Wed May 26 12:54:00 2010 (hfst) */
9
+ /*******************************************************************/
10
+
11
+ #ifndef _SGI_INCLUDED
12
+ #define _SGI_INCLUDED
13
+
14
+ #if HAVE_CONFIG_H
15
+ # include <config.h>
16
+ #endif
17
+
18
+ #if HAVE_BACKWARD_HASH_MAP
19
+ # include <backward/hash_map>
20
+ #elif HAVE_EXT_HASH_MAP
21
+ # include <ext/hash_map>
22
+ #elif HAVE_HASH_MAP
23
+ # include <hash_map>
24
+ #elif SGIext
25
+ # include <ext/hash_map>
26
+ #elif SGI__gnu_cxx
27
+ # include <ext/hash_map>
28
+ #else
29
+ # warning "unknown hash_map"
30
+ # include <hash_map>
31
+ #endif
32
+ #if HAVE_BACKWARD_HASH_SET
33
+ # include <backward/hash_set>
34
+ #elif HAVE_EXT_HASH_SET
35
+ # include <ext/hash_set>
36
+ #elif HAVE_HASH_SET
37
+ # include <hash_set>
38
+ #elif SGIext
39
+ # include <ext/hash_set>
40
+ #elif SGI__gnu_cxx
41
+ # include <ext/hash_set>
42
+ #else
43
+ # warning "missing hash_set"
44
+ # include <hash_set>
45
+ #endif
46
+
47
+ // Hfst addition
48
+ namespace SFST // imports hash_map, hash_set and hash into SFST from whichever namespace is selected below
49
+ {
50
+ // from <http://gcc.gnu.org/onlinedocs/libstdc++/manual/backwards.html>
51
+ #ifdef __GNUC__ // GNU compilers: the source namespace depends on the GCC version
52
+ # if __GNUC__ < 3 // GCC before 3.0: the names are taken from the global namespace
53
+ using ::hash_map;
54
+ using ::hash_set;
55
+ using ::hash;
56
+ # elif __GNUC__ == 3 && __GNUC_MINOR__ == 0 // GCC 3.0 exactly: the names are taken from std
57
+ using std::hash_map;
58
+ using std::hash_set;
59
+ using std::hash;
60
+ # else // every later GCC: the names are taken from __gnu_cxx
61
+ using __gnu_cxx::hash_map;
62
+ using __gnu_cxx::hash_set;
63
+ using __gnu_cxx::hash;
64
+ # endif
65
+ #else // compilers that do not define __GNUC__: the names are taken from std
66
+ using std::hash_map;
67
+ using std::hash_set;
68
+ using std::hash;
69
+ #endif
70
+ }
71
+
72
+ #endif
@@ -0,0 +1,149 @@
1
+
2
+ /*******************************************************************/
3
+ /* */
4
+ /* File: utf8.cc */
5
+ /* Author: Helmut Schmid */
6
+ /* Purpose: Conversion between code points and UTF-8 byte sequences */
7
+ /* Created: Mon Sep 5 17:49:16 2005 */
8
+ /* Modified: Wed Sep 29 15:08:34 2010 (schmid) */
9
+ /* */
10
+ /*******************************************************************/
11
+
12
+ #include "string.h"
13
+ #include "utf8.h"
14
+
15
namespace SFST {

  // Masks selecting the payload bits carried by a UTF-8 byte.
  const unsigned char get3LSbits=7;    // 00000111: payload of a 4-byte lead
  const unsigned char get4LSbits=15;   // 00001111: payload of a 3-byte lead
  const unsigned char get5LSbits=31;   // 00011111: payload of a 2-byte lead
  const unsigned char get6LSbits=63;   // 00111111: payload of a continuation byte

  // Marker bits identifying the role of a UTF-8 byte.
  const unsigned char set1MSbits=128;  // 10xxxxxx: continuation byte
  const unsigned char set2MSbits=192;  // 110xxxxx: lead of a 2-byte sequence
  const unsigned char set3MSbits=224;  // 1110xxxx: lead of a 3-byte sequence
  const unsigned char set4MSbits=240;  // 11110xxx: lead of a 4-byte sequence
  const unsigned char set5MSbits=248;  // 11111xxx: never a valid UTF-8 byte


  /*******************************************************************/
  /*                                                                 */
  /*  int2utf8                                                       */
  /*                                                                 */
  /*  Encodes the code point sym as a NUL-terminated UTF-8 string.   */
  /*                                                                 */
  /*  Returns a pointer to a function-local static buffer, so the    */
  /*  result is overwritten by the next call and must be copied if   */
  /*  it has to survive. Returns NULL if sym needs more than 21      */
  /*  bits (sym >= 2097152) and cannot be stored in 4 bytes.         */
  /*                                                                 */
  /*******************************************************************/

  char *int2utf8( unsigned int sym )

  {
    static unsigned char ch[5];

    if (sym < 128) {
      // 1-byte UTF8 symbol, 7 bits
      ch[0] = static_cast<unsigned char>(sym);
      ch[1] = 0;
    }

    else if (sym < 2048) {
      // 2-byte UTF8 symbol, 5+6 bits
      ch[0] = static_cast<unsigned char>((sym >> 6) | set2MSbits);
      ch[1] = static_cast<unsigned char>((sym & get6LSbits) | set1MSbits);
      ch[2] = 0;
    }

    else if (sym < 65536) {
      // 3-byte UTF8 symbol, 4+6+6 bits
      ch[0] = static_cast<unsigned char>((sym >> 12) | set3MSbits);
      ch[1] = static_cast<unsigned char>(((sym >> 6) & get6LSbits) | set1MSbits);
      ch[2] = static_cast<unsigned char>((sym & get6LSbits) | set1MSbits);
      ch[3] = 0;
    }

    else if (sym < 2097152) {
      // 4-byte UTF8 symbol, 3+6+6+6 bits
      ch[0] = static_cast<unsigned char>((sym >> 18) | set4MSbits);
      ch[1] = static_cast<unsigned char>(((sym >> 12) & get6LSbits) | set1MSbits);
      ch[2] = static_cast<unsigned char>(((sym >> 6) & get6LSbits) | set1MSbits);
      ch[3] = static_cast<unsigned char>((sym & get6LSbits) | set1MSbits);
      ch[4] = 0;
    }

    else
      return NULL;

    return reinterpret_cast<char*>(ch);
  }


  /*******************************************************************/
  /*                                                                 */
  /*  utf8toint (cursor version)                                     */
  /*                                                                 */
  /*  Decodes the UTF-8 sequence starting at *s and returns its      */
  /*  code point. On success *s is advanced past the decoded         */
  /*  sequence.                                                      */
  /*                                                                 */
  /*  Returns 0 on error. If the lead byte is invalid, *s is left    */
  /*  unchanged; if a continuation byte is invalid or missing, *s    */
  /*  is left pointing at the offending byte.                        */
  /*                                                                 */
  /*  A NUL byte at *s is decoded like any other 1-byte symbol: 0    */
  /*  is returned and *s is advanced past it, so callers should      */
  /*  test for the end of the string before calling.                 */
  /*                                                                 */
  /*******************************************************************/

  unsigned int utf8toint( char **s )

  {
    int bytes_to_come;
    unsigned int result=0;
    unsigned char c=static_cast<unsigned char>(**s);

    if (c >= set5MSbits)              // 11111xxx
      return 0; // error: such bytes never occur in UTF-8

    else if (c >= set4MSbits) {       // 11110xxx
      // start of a four-byte symbol
      bytes_to_come = 3;
      result = c & get3LSbits;
    }

    else if (c >= set3MSbits) {       // 1110xxxx
      // start of a three-byte symbol
      bytes_to_come = 2;
      result = c & get4LSbits;
    }

    else if (c >= set2MSbits) {       // 110xxxxx
      // start of a two-byte symbol
      bytes_to_come = 1;
      result = c & get5LSbits;
    }

    else if (c < set1MSbits) {        // 0xxxxxxx
      // one-byte symbol
      bytes_to_come = 0;
      result = c;
    }

    else
      return 0; // error: a continuation byte cannot start a symbol

    while (bytes_to_come > 0) {
      bytes_to_come--;
      (*s)++;
      c = static_cast<unsigned char>(**s);
      if (c < set2MSbits && c >= set1MSbits)   // 10xxxxxx
	result = (result << 6) | (c & get6LSbits);
      else
	return 0; // error: includes a string that ends mid-sequence
    }

    (*s)++;
    return result;
  }


  /*******************************************************************/
  /*                                                                 */
  /*  utf8toint (whole-string version)                               */
  /*                                                                 */
  /*  Returns the code point of s if s consists of exactly one       */
  /*  well-formed UTF-8 symbol, and 0 otherwise (empty string,       */
  /*  malformed sequence, or more than one symbol).                  */
  /*                                                                 */
  /*******************************************************************/

  unsigned int utf8toint( char *s )

  {
    // The cursor version steps over a leading NUL, so decoding an empty
    // string would leave s one past the terminator and the check below
    // would read outside the string.
    if (*s == 0)
      return 0;

    unsigned int result = utf8toint( &s );
    if (*s == 0) // all bytes converted?
      return result;
    return 0;
  }

}