ruby-sfst 0.4.3 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +1 -0
  3. data/COPYING +280 -0
  4. data/Gemfile +3 -0
  5. data/Gemfile.lock +54 -0
  6. data/README.md +1 -1
  7. data/Rakefile +9 -18
  8. data/bin/console +7 -0
  9. data/bin/setup +6 -0
  10. data/ext/sfst/alphabet.cc +879 -0
  11. data/ext/sfst/alphabet.h +302 -0
  12. data/ext/sfst/basic.cc +85 -0
  13. data/ext/{sfst_machine → sfst}/basic.h +7 -4
  14. data/ext/sfst/compact.cc +629 -0
  15. data/ext/sfst/compact.h +100 -0
  16. data/ext/sfst/determinise.cc +279 -0
  17. data/ext/{sfst_machine → sfst}/extconf.rb +2 -1
  18. data/ext/sfst/fst.cc +1150 -0
  19. data/ext/sfst/fst.h +374 -0
  20. data/ext/sfst/hopcroft.cc +681 -0
  21. data/ext/sfst/interface.cc +1921 -0
  22. data/ext/sfst/interface.h +171 -0
  23. data/ext/sfst/make-compact.cc +323 -0
  24. data/ext/{sfst_machine → sfst}/make-compact.h +15 -13
  25. data/ext/sfst/mem.h +80 -0
  26. data/ext/sfst/operators.cc +1273 -0
  27. data/ext/{sfst_machine → sfst}/sfst_machine.cc +89 -78
  28. data/ext/sfst/sgi.h +72 -0
  29. data/ext/sfst/utf8.cc +149 -0
  30. data/ext/{sfst_machine → sfst}/utf8.h +7 -4
  31. data/lib/sfst.rb +2 -1
  32. data/lib/sfst/version.rb +1 -1
  33. data/ruby-sfst.gemspec +23 -23
  34. metadata +107 -35
  35. data/ext/sfst_machine/alphabet.cc +0 -812
  36. data/ext/sfst_machine/alphabet.h +0 -273
  37. data/ext/sfst_machine/basic.cc +0 -84
  38. data/ext/sfst_machine/compact.cc +0 -616
  39. data/ext/sfst_machine/compact.h +0 -98
  40. data/ext/sfst_machine/determinise.cc +0 -303
  41. data/ext/sfst_machine/fst.cc +0 -1000
  42. data/ext/sfst_machine/fst.h +0 -369
  43. data/ext/sfst_machine/interface.cc +0 -1842
  44. data/ext/sfst_machine/interface.h +0 -93
  45. data/ext/sfst_machine/make-compact.cc +0 -327
  46. data/ext/sfst_machine/mem.h +0 -74
  47. data/ext/sfst_machine/operators.cc +0 -1131
  48. data/ext/sfst_machine/sgi.h +0 -44
  49. data/ext/sfst_machine/utf8.cc +0 -146
  50. data/test/test_sfst.fst +0 -3
  51. data/test/test_sfst.rb +0 -114
@@ -14,19 +14,19 @@ VALUE mSFST = Qnil;
14
14
  VALUE mCompactTransducer = Qnil;
15
15
  VALUE mRegularTransducer = Qnil;
16
16
 
17
- static void compact_transducer_free(CompactTransducer *t)
17
+ static void compact_transducer_free(SFST::CompactTransducer *t)
18
18
  {
19
19
  if (t)
20
20
  delete t;
21
21
  }
22
22
 
23
- static void compact_transducer_mark(CompactTransducer *t)
23
+ static void compact_transducer_mark(SFST::CompactTransducer *t)
24
24
  {
25
25
  }
26
26
 
27
27
  static VALUE compact_transducer_alloc(VALUE klass)
28
28
  {
29
- CompactTransducer *t = NULL;
29
+ SFST::CompactTransducer *t = NULL;
30
30
 
31
31
  return Data_Wrap_Struct(klass, compact_transducer_mark, compact_transducer_free, t);
32
32
  }
@@ -34,7 +34,7 @@ static VALUE compact_transducer_alloc(VALUE klass)
34
34
  static VALUE compact_transducer_init(VALUE obj, VALUE filename)
35
35
  {
36
36
  FILE *file;
37
- CompactTransducer *t;
37
+ SFST::CompactTransducer *t;
38
38
 
39
39
  file = fopen(RSTRING_PTR(filename), "rb");
40
40
 
@@ -43,7 +43,7 @@ static VALUE compact_transducer_init(VALUE obj, VALUE filename)
43
43
  }
44
44
 
45
45
  try {
46
- t = new CompactTransducer(file);
46
+ t = new SFST::CompactTransducer(file);
47
47
  fclose(file);
48
48
  }
49
49
  catch (const char *p) {
@@ -57,13 +57,13 @@ static VALUE compact_transducer_init(VALUE obj, VALUE filename)
57
57
  static VALUE compact_transducer_analyze(VALUE self, VALUE string)
58
58
  {
59
59
  VALUE accepted = Qfalse;
60
- CompactTransducer *t;
60
+ SFST::CompactTransducer *t;
61
61
 
62
62
  Check_Type(string, T_STRING);
63
63
 
64
- Data_Get_Struct(self, CompactTransducer, t);
64
+ Data_Get_Struct(self, SFST::CompactTransducer, t);
65
65
 
66
- std::vector<CAnalysis> analyses;
66
+ std::vector<SFST::CAnalysis> analyses;
67
67
  t->analyze_string(RSTRING_PTR(string), analyses);
68
68
 
69
69
  for (size_t k = 0; k < analyses.size(); k++) {
@@ -79,19 +79,19 @@ static VALUE compact_transducer_analyze(VALUE self, VALUE string)
79
79
  return accepted;
80
80
  }
81
81
 
82
- static void regular_transducer_free(Transducer *t)
82
+ static void regular_transducer_free(SFST::Transducer *t)
83
83
  {
84
84
  if (t)
85
85
  delete t;
86
86
  }
87
87
 
88
- static void regular_transducer_mark(Transducer *t)
88
+ static void regular_transducer_mark(SFST::Transducer *t)
89
89
  {
90
90
  }
91
91
 
92
92
  static VALUE regular_transducer_alloc(VALUE klass)
93
93
  {
94
- Transducer *t = NULL;
94
+ SFST::Transducer *t = NULL;
95
95
 
96
96
  return Data_Wrap_Struct(klass, regular_transducer_mark, regular_transducer_free, t);
97
97
  }
@@ -99,7 +99,7 @@ static VALUE regular_transducer_alloc(VALUE klass)
99
99
  static VALUE regular_transducer_init(VALUE obj, VALUE filename)
100
100
  {
101
101
  FILE *file;
102
- Transducer *t;
102
+ SFST::Transducer *t;
103
103
 
104
104
  file = fopen(RSTRING_PTR(filename), "rb");
105
105
 
@@ -108,7 +108,7 @@ static VALUE regular_transducer_init(VALUE obj, VALUE filename)
108
108
  }
109
109
 
110
110
  try {
111
- t = new Transducer(file);
111
+ t = new SFST::Transducer(file);
112
112
  fclose(file);
113
113
  }
114
114
  catch (const char *p) {
@@ -123,7 +123,7 @@ using std::vector;
123
123
 
124
124
  enum { BOTH, LOWER, UPPER };
125
125
 
126
- static VALUE _alphabet_to_rb_str(Alphabet *a, Character c)
126
+ static VALUE _alphabet_to_rb_str(SFST::Alphabet *a, SFST::Character c)
127
127
  {
128
128
  const char *s = a->code2symbol(c);
129
129
 
@@ -151,68 +151,79 @@ static VALUE rb_ary_push_pair(VALUE ary, VALUE a, VALUE b)
151
151
  return ary;
152
152
  }
153
153
 
154
- static void _regular_transducer_generate(Transducer *t, Node *node,
155
- Node2Int &visitations, VALUE a, int mode, bool epsilons)
156
- {
157
- if (node->is_final())
158
- rb_yield(a);
159
-
160
- visitations[node]++;
161
-
162
- vector<Arc*> arc;
163
- for (ArcsIter p(node->arcs()); p; p++) {
164
- Arc *a = p;
165
- Node *n = a->target_node();
166
- size_t i;
167
- for (i = 0; i < arc.size(); i++)
168
- if (visitations[n] < visitations[arc[i]->target_node()])
169
- break;
170
- arc.push_back(NULL);
171
- for (size_t k = arc.size() - 1; k > i; k--)
172
- arc[k] = arc[k - 1];
173
- arc[i] = a;
174
- }
154
+ class Gen {
155
+ public:
175
156
 
176
- for (size_t i = 0; i < arc.size(); i++) {
177
- Label l = arc[i]->label();
178
- VALUE lower, upper;
157
+ SFST::Node *node;
158
+ SFST::Index previous;
159
+ SFST::Label label;
179
160
 
180
- Character lc = l.lower_char();
181
- if ((mode == BOTH || mode == LOWER) && (epsilons || lc != Label::epsilon)) {
182
- lower = _alphabet_to_rb_str(&(t->alphabet), lc);
183
- } else
184
- lower = Qnil;
161
+ Gen(SFST::Node *n, SFST::Label l = SFST::Label::epsilon, SFST::Index p = SFST::undef):
162
+ node(n), previous(p), label(l) {}
185
163
 
186
- Character uc = l.upper_char();
187
- if ((mode == BOTH || mode == UPPER) && (epsilons || uc != Label::epsilon)) {
188
- upper = _alphabet_to_rb_str(&(t->alphabet), uc);
189
- } else
190
- upper = Qnil;
164
+ void print(vector<Gen> &paths, VALUE a, int levels, bool epsilons, SFST::Transducer *t) {
165
+ if (previous != SFST::undef) {
166
+ paths[previous].print(paths, a, levels, epsilons, t);
167
+
168
+ SFST::Label l = label;
169
+
170
+ VALUE lower, upper;
171
+
172
+ SFST::Character lc = l.lower_char();
173
+ if ((levels == BOTH || levels == LOWER) && (epsilons || lc != SFST::Label::epsilon)) {
174
+ lower = _alphabet_to_rb_str(&(t->alphabet), lc);
175
+ } else
176
+ lower = Qnil;
177
+
178
+ SFST::Character uc = l.upper_char();
179
+ if ((levels == BOTH || levels == UPPER) && (epsilons || uc != SFST::Label::epsilon)) {
180
+ upper = _alphabet_to_rb_str(&(t->alphabet), uc);
181
+ } else
182
+ upper = Qnil;
191
183
 
192
- switch (mode) {
193
- case BOTH:
194
- rb_ary_push_pair(a, lower, upper);
195
- break;
184
+ switch (levels) {
185
+ case BOTH:
186
+ rb_ary_push_pair(a, lower, upper);
187
+ break;
196
188
 
197
- case UPPER:
198
- rb_ary_push(a, upper);
199
- break;
189
+ case UPPER:
190
+ rb_ary_push(a, upper);
191
+ break;
200
192
 
201
- case LOWER:
202
- rb_ary_push(a, lower);
203
- break;
193
+ case LOWER:
194
+ rb_ary_push(a, lower);
195
+ break;
196
+ }
204
197
  }
198
+ }
199
+ };
200
+
201
+ static void _generate(SFST::Transducer *t, int levels, bool epsilons)
202
+ {
203
+ vector<Gen> paths;
204
+ paths.push_back(Gen(t->root_node()));
205
+
206
+ for (size_t i = 0; i < paths.size(); i++) {
207
+ Gen &gen = paths[i];
208
+ SFST::Node *node = gen.node;
205
209
 
206
- _regular_transducer_generate(t, arc[i]->target_node(), visitations, a, mode, epsilons);
210
+ if (node->is_final()) {
211
+ VALUE a = rb_ary_new();
212
+ gen.print(paths, a, levels, epsilons, t);
213
+ rb_yield(a);
214
+ }
207
215
 
208
- rb_ary_pop(a);
216
+ for (SFST::ArcsIter p(node->arcs()); p; p++) {
217
+ SFST::Arc *arc = p;
218
+ paths.push_back(Gen(arc->target_node(), arc->label(), (SFST::Index)i));
219
+ }
209
220
  }
210
221
  }
211
222
 
212
223
  static VALUE regular_transducer_generate_language(VALUE self, VALUE levels_arg, VALUE mode_arg)
213
224
  {
214
- Transducer *t;
215
- Data_Get_Struct(self, Transducer, t);
225
+ SFST::Transducer *t;
226
+ Data_Get_Struct(self, SFST::Transducer, t);
216
227
 
217
228
  static ID id_upper = rb_intern("upper");
218
229
  static ID id_lower = rb_intern("lower");
@@ -246,8 +257,8 @@ static VALUE regular_transducer_generate_language(VALUE self, VALUE levels_arg,
246
257
  if (!rb_block_given_p())
247
258
  rb_raise(rb_eRuntimeError, "block expected");
248
259
 
249
- Node2Int visitations;
250
- Transducer *a2;
260
+ SFST::Transducer *a2;
261
+
251
262
  switch (levels) {
252
263
  case UPPER:
253
264
  a2 = &(t->upper_level().minimise());
@@ -259,13 +270,13 @@ static VALUE regular_transducer_generate_language(VALUE self, VALUE levels_arg,
259
270
  a2 = t;
260
271
  break;
261
272
  }
262
- _regular_transducer_generate(a2, a2->root_node(), visitations, rb_ary_new(),
263
- levels, epsilons);
273
+
274
+ _generate(a2, levels, epsilons);
264
275
 
265
276
  return Qnil;
266
277
  }
267
278
 
268
- static bool _regular_transducer_yield(Transducer *t, Node *node, VALUE result_array)
279
+ static bool _regular_transducer_yield(SFST::Transducer *t, SFST::Node *node, VALUE result_array)
269
280
  {
270
281
  int accepted = 0;
271
282
 
@@ -284,9 +295,9 @@ static bool _regular_transducer_yield(Transducer *t, Node *node, VALUE result_ar
284
295
  accepted = 1;
285
296
  }
286
297
 
287
- for (ArcsIter i(node->arcs()); i; i++) {
288
- Arc *arc = i;
289
- Label l = arc->label();
298
+ for (SFST::ArcsIter i(node->arcs()); i; i++) {
299
+ SFST::Arc *arc = i;
300
+ SFST::Label l = arc->label();
290
301
 
291
302
  rb_ary_push(result_array, rb_str_new2(t->alphabet.write_label(l)));
292
303
 
@@ -300,10 +311,10 @@ static bool _regular_transducer_yield(Transducer *t, Node *node, VALUE result_ar
300
311
  return accepted == 1 ? true : false;
301
312
  }
302
313
 
303
- static VALUE _regular_transducer_analyze_or_generate(Transducer *t, VALUE string, bool generate)
314
+ static VALUE _regular_transducer_analyze_or_generate(SFST::Transducer *t, VALUE string, bool generate)
304
315
  {
305
- Transducer *a2, *a3;
306
- Transducer a1(RSTRING_PTR(string), &(t->alphabet), false);
316
+ SFST::Transducer *a2, *a3;
317
+ SFST::Transducer a1(RSTRING_PTR(string), &(t->alphabet), false);
307
318
  if (generate) {
308
319
  a2 = &(a1 || *t);
309
320
  a3 = &(a2->upper_level());
@@ -324,23 +335,23 @@ static VALUE _regular_transducer_analyze_or_generate(Transducer *t, VALUE string
324
335
 
325
336
  static VALUE regular_transducer_generate(VALUE self, VALUE string)
326
337
  {
327
- Transducer *t;
338
+ SFST::Transducer *t;
328
339
  Check_Type(string, T_STRING);
329
- Data_Get_Struct(self, Transducer, t);
340
+ Data_Get_Struct(self, SFST::Transducer, t);
330
341
  return _regular_transducer_analyze_or_generate(t, string, true);
331
342
  }
332
343
 
333
344
  static VALUE regular_transducer_analyze(VALUE self, VALUE string)
334
345
  {
335
- Transducer *t;
346
+ SFST::Transducer *t;
336
347
  Check_Type(string, T_STRING);
337
- Data_Get_Struct(self, Transducer, t);
348
+ Data_Get_Struct(self, SFST::Transducer, t);
338
349
  return _regular_transducer_analyze_or_generate(t, string, false);
339
350
  }
340
351
 
341
352
  extern "C"
342
353
 
343
- void Init_sfst_machine(void)
354
+ void Init_sfst(void)
344
355
  {
345
356
  mSFST = rb_define_module("SFST");
346
357
 
@@ -0,0 +1,72 @@
1
+ /*******************************************************************/
2
+ /* */
3
+ /* File: sgi.h */
4
+ /* Author: Helmut Schmid */
5
+ /* Purpose: */
6
+ /* Created: Thu Sep 11 15:58:25 2008 */
7
+ /* Modified: Fri Sep 12 08:17:03 2008 (schmid) */
8
+ /* Modified: Wed May 26 12:54:00 2010 (hfst) */
9
+ /*******************************************************************/
10
+
11
+ #ifndef _SGI_INCLUDED
12
+ #define _SGI_INCLUDED
13
+
14
+ #if HAVE_CONFIG_H
15
+ # include <config.h>
16
+ #endif
17
+
18
+ #if HAVE_BACKWARD_HASH_MAP
19
+ # include <backward/hash_map>
20
+ #elif HAVE_EXT_HASH_MAP
21
+ # include <ext/hash_map>
22
+ #elif HAVE_HASH_MAP
23
+ # include <hash_map>
24
+ #elif SGIext
25
+ # include <ext/hash_map>
26
+ #elif SGI__gnu_cxx
27
+ # include <ext/hash_map>
28
+ #else
29
+ # warning "unknown hash_map"
30
+ # include <hash_map>
31
+ #endif
32
+ #if HAVE_BACKWARD_HASH_SET
33
+ # include <backward/hash_set>
34
+ #elif HAVE_EXT_HASH_SET
35
+ # include <ext/hash_set>
36
+ #elif HAVE_HASH_SET
37
+ # include <hash_set>
38
+ #elif SGIext
39
+ # include <ext/hash_set>
40
+ #elif SGI__gnu_cxx
41
+ # include <ext/hash_set>
42
+ #else
43
+ # warning "missing hash_set"
44
+ # include <hash_set>
45
+ #endif
46
+
47
+ // Hfst addition
48
+ namespace SFST
49
+ {
50
+ // from <http://gcc.gnu.org/onlinedocs/libstdc++/manual/backwards.html>
51
+ #ifdef __GNUC__
52
+ # if __GNUC__ < 3
53
+ using ::hash_map;
54
+ using ::hash_set;
55
+ using ::hash;
56
+ # elif __GNUC__ == 3 && __GNUC_MINOR__ == 0
57
+ using std::hash_map;
58
+ using std::hash_set;
59
+ using std::hash;
60
+ # else
61
+ using __gnu_cxx::hash_map;
62
+ using __gnu_cxx::hash_set;
63
+ using __gnu_cxx::hash;
64
+ # endif
65
+ #else
66
+ using std::hash_map;
67
+ using std::hash_set;
68
+ using std::hash;
69
+ #endif
70
+ }
71
+
72
+ #endif
@@ -0,0 +1,149 @@
1
+
2
+ /*******************************************************************/
3
+ /* */
4
+ /* File: utf8.C */
5
+ /* Author: Helmut Schmid */
6
+ /* Purpose: */
7
+ /* Created: Mon Sep 5 17:49:16 2005 */
8
+ /* Modified: Wed Sep 29 15:08:34 2010 (schmid) */
9
+ /* */
10
+ /*******************************************************************/
11
+
12
+ #include "string.h"
13
+ #include "utf8.h"
14
+
15
+ namespace SFST {
16
+
17
+ const unsigned char get3LSbits=7;
18
+ const unsigned char get4LSbits=15;
19
+ const unsigned char get5LSbits=31;
20
+ const unsigned char get6LSbits=63;
21
+
22
+ const unsigned char set1MSbits=128;
23
+ const unsigned char set2MSbits=192;
24
+ const unsigned char set3MSbits=224;
25
+ const unsigned char set4MSbits=240;
26
+
27
+
28
+
29
+ /*******************************************************************/
30
+ /* */
31
+ /* int2utf8 */
32
+ /* */
33
+ /*******************************************************************/
34
+
35
/**
 * Encode one code point as a NUL-terminated UTF-8 byte sequence.
 *
 * @param sym  Code point to encode. Every value below 0x200000 (21 bits,
 *             the capacity of a 4-byte sequence) is accepted.
 * @return Pointer to a function-local static buffer holding the 1-4 byte
 *         encoding followed by a terminating 0, or NULL when sym needs
 *         more than 21 bits.
 *
 * The returned buffer is overwritten by the next call, so the result must
 * be copied if it has to outlive that call.
 */
char *int2utf8( unsigned int sym )
{
  static unsigned char ch[5];

  // Every byte after the first has the form 10xxxxxx and carries 6 bits.
  const unsigned int cont_tag  = 0x80;
  const unsigned int cont_bits = 0x3F;

  if (sym < 0x80) {
    // 0xxxxxxx : 7 bits
    ch[0] = static_cast<unsigned char>(sym);
    ch[1] = 0;
  }
  else if (sym < 0x800) {
    // 110xxxxx 10xxxxxx : 5+6 bits
    ch[0] = static_cast<unsigned char>(0xC0 | (sym >> 6));
    ch[1] = static_cast<unsigned char>(cont_tag | (sym & cont_bits));
    ch[2] = 0;
  }
  else if (sym < 0x10000) {
    // 1110xxxx 10xxxxxx 10xxxxxx : 4+6+6 bits
    ch[0] = static_cast<unsigned char>(0xE0 | (sym >> 12));
    ch[1] = static_cast<unsigned char>(cont_tag | ((sym >> 6) & cont_bits));
    ch[2] = static_cast<unsigned char>(cont_tag | (sym & cont_bits));
    ch[3] = 0;
  }
  else if (sym < 0x200000) {
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx : 3+6+6+6 bits
    ch[0] = static_cast<unsigned char>(0xF0 | (sym >> 18));
    ch[1] = static_cast<unsigned char>(cont_tag | ((sym >> 12) & cont_bits));
    ch[2] = static_cast<unsigned char>(cont_tag | ((sym >> 6) & cont_bits));
    ch[3] = static_cast<unsigned char>(cont_tag | (sym & cont_bits));
    ch[4] = 0;
  }
  else
    return NULL;  // does not fit into four UTF-8 bytes

  return reinterpret_cast<char*>(ch);
}
75
+
76
+
77
+ /*******************************************************************/
78
+ /* */
79
+ /* utf8toint */
80
+ /* */
81
+ /*******************************************************************/
82
+
83
/**
 * Decode the UTF-8 sequence at *s and advance *s past it.
 *
 * @param s  In/out cursor into a NUL-terminated byte string.
 * @return The decoded code point, or 0 on error or at the end of the string.
 *
 * Cursor position after the call:
 *   - success:                just behind the decoded sequence;
 *   - end of string (NUL):    unchanged, so the terminator is never skipped;
 *   - invalid lead byte:      unchanged;
 *   - bad continuation byte:  on the offending byte.
 *
 * Lead bytes 0xF8-0xFF are rejected: they do not start any sequence of at
 * most four bytes.
 */
unsigned int utf8toint( char **s )
{
  unsigned char c = static_cast<unsigned char>(**s);
  unsigned int result;
  int bytes_to_come;

  if (c < 0x80) {            // 0xxxxxxx : one-byte symbol
    if (c == 0)
      return 0;              // end of string, stay on the terminator
    bytes_to_come = 0;
    result = c;
  }
  else if (c < 0xC0)         // 10xxxxxx : continuation byte without a lead
    return 0;
  else if (c < 0xE0) {       // 110xxxxx : start of a two-byte symbol
    bytes_to_come = 1;
    result = c & 0x1F;
  }
  else if (c < 0xF0) {       // 1110xxxx : start of a three-byte symbol
    bytes_to_come = 2;
    result = c & 0x0F;
  }
  else if (c < 0xF8) {       // 11110xxx : start of a four-byte symbol
    bytes_to_come = 3;
    result = c & 0x07;
  }
  else                       // 11111xxx : not a valid lead byte
    return 0;

  for (; bytes_to_come > 0; bytes_to_come--) {
    (*s)++;
    c = static_cast<unsigned char>(**s);
    // A NUL terminator fails this check too, so a truncated sequence
    // never makes the cursor run off the end of the string.
    if ((c & 0xC0) != 0x80)  // every further byte must be 10xxxxxx
      return 0;
    result = (result << 6) | (c & 0x3F);
  }

  (*s)++;
  return result;
}
132
+
133
+
134
+ /*******************************************************************/
135
+ /* */
136
+ /* utf8toint */
137
+ /* */
138
+ /*******************************************************************/
139
+
140
+ unsigned int utf8toint( char *s )
141
+
142
+ {
143
+ unsigned int result = utf8toint( &s );
144
+ if (*s == 0) // all bytes converted?
145
+ return result;
146
+ return 0;
147
+ }
148
+
149
+ }