ruby-sfst 0.4.3 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +1 -0
  3. data/COPYING +280 -0
  4. data/Gemfile +3 -0
  5. data/Gemfile.lock +54 -0
  6. data/README.md +1 -1
  7. data/Rakefile +9 -18
  8. data/bin/console +7 -0
  9. data/bin/setup +6 -0
  10. data/ext/sfst/alphabet.cc +879 -0
  11. data/ext/sfst/alphabet.h +302 -0
  12. data/ext/sfst/basic.cc +85 -0
  13. data/ext/{sfst_machine → sfst}/basic.h +7 -4
  14. data/ext/sfst/compact.cc +629 -0
  15. data/ext/sfst/compact.h +100 -0
  16. data/ext/sfst/determinise.cc +279 -0
  17. data/ext/{sfst_machine → sfst}/extconf.rb +2 -1
  18. data/ext/sfst/fst.cc +1150 -0
  19. data/ext/sfst/fst.h +374 -0
  20. data/ext/sfst/hopcroft.cc +681 -0
  21. data/ext/sfst/interface.cc +1921 -0
  22. data/ext/sfst/interface.h +171 -0
  23. data/ext/sfst/make-compact.cc +323 -0
  24. data/ext/{sfst_machine → sfst}/make-compact.h +15 -13
  25. data/ext/sfst/mem.h +80 -0
  26. data/ext/sfst/operators.cc +1273 -0
  27. data/ext/{sfst_machine → sfst}/sfst_machine.cc +89 -78
  28. data/ext/sfst/sgi.h +72 -0
  29. data/ext/sfst/utf8.cc +149 -0
  30. data/ext/{sfst_machine → sfst}/utf8.h +7 -4
  31. data/lib/sfst.rb +2 -1
  32. data/lib/sfst/version.rb +1 -1
  33. data/ruby-sfst.gemspec +23 -23
  34. metadata +107 -35
  35. data/ext/sfst_machine/alphabet.cc +0 -812
  36. data/ext/sfst_machine/alphabet.h +0 -273
  37. data/ext/sfst_machine/basic.cc +0 -84
  38. data/ext/sfst_machine/compact.cc +0 -616
  39. data/ext/sfst_machine/compact.h +0 -98
  40. data/ext/sfst_machine/determinise.cc +0 -303
  41. data/ext/sfst_machine/fst.cc +0 -1000
  42. data/ext/sfst_machine/fst.h +0 -369
  43. data/ext/sfst_machine/interface.cc +0 -1842
  44. data/ext/sfst_machine/interface.h +0 -93
  45. data/ext/sfst_machine/make-compact.cc +0 -327
  46. data/ext/sfst_machine/mem.h +0 -74
  47. data/ext/sfst_machine/operators.cc +0 -1131
  48. data/ext/sfst_machine/sgi.h +0 -44
  49. data/ext/sfst_machine/utf8.cc +0 -146
  50. data/test/test_sfst.fst +0 -3
  51. data/test/test_sfst.rb +0 -114
@@ -13,22 +13,24 @@
13
13
  #include "fst.h"
14
14
  #include "compact.h"
15
15
 
16
+ namespace SFST {
16
17
 
17
- class MakeCompactTransducer : CompactTransducer {
18
+ class MakeCompactTransducer : CompactTransducer {
18
19
 
19
- private:
20
- void count_arcs(Node *node, NodeNumbering &index, long vmark);
21
- void store_arcs(Node *node, NodeNumbering &index, long vmark);
22
- void store_finalp( FILE *file );
23
- void store_first_arcs( FILE *file );
24
- void store_target_nodes( FILE *file );
25
- void store_labels( FILE *file );
20
+ private:
21
+ void count_arcs(Node *node, VType vmark);
22
+ void store_arcs(Node *node, VType vmark);
23
+ void store_finalp( FILE *file );
24
+ void store_first_arcs( FILE *file );
25
+ void store_target_nodes( FILE *file );
26
+ void store_labels( FILE *file );
26
27
 
27
- public:
28
- MakeCompactTransducer( Transducer &a, Level sort=upper );
28
+ public:
29
+ MakeCompactTransducer( Transducer &a, Level sort=upper );
29
30
 
30
- void sort( Level );
31
- void store( FILE *file );
32
- };
31
+ void sort( Level );
32
+ void store( FILE *file );
33
+ };
33
34
 
35
+ }
34
36
  #endif
@@ -0,0 +1,80 @@
1
+ /*******************************************************************/
2
+ /* */
3
+ /* FILE mem.h */
4
+ /* MODULE mem */
5
+ /* PROGRAM SFST */
6
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
7
+ /* */
8
+ /* PURPOSE memory management functions */
9
+ /* */
10
+ /*******************************************************************/
11
+
12
+ #ifndef _MEM_H_
13
+ #define _MEM_H_
14
+
15
+ #include <stdlib.h>
16
+ #include <assert.h>
17
+
18
+ namespace SFST {
19
+
#define MEMBUFFER_SIZE 100000


/*****************  class Mem  *************************************/

// Simple arena allocator.
// Memory is handed out from fixed-size buffers (MEMBUFFER_SIZE bytes each)
// which are kept in a singly linked list.  Individual blocks are never
// released; all buffers are freed together by clear() or the destructor.
//
// NOTE(review): the class owns malloc'ed buffers but has the implicit copy
// constructor/assignment; copying a Mem object would lead to a double free.
// NOTE(review): blocks are aligned to multiples of 4 bytes only; confirm
// that this is sufficient for the objects stored on the target platform.
class Mem {

 private:

  struct MemBuffer {
    char buffer[MEMBUFFER_SIZE];
    struct MemBuffer *next;
  };

  MemBuffer *first_buffer;  // head of the buffer list = buffer currently in use
  long pos;                 // offset of the first unused byte in first_buffer

  // Allocates a new buffer, puts it at the head of the list and makes it
  // the current buffer.  Throws a C string if malloc fails.
  void add_buffer() {
    MemBuffer *mb=(MemBuffer*)malloc(sizeof(MemBuffer));
    if (mb == NULL)
      throw "Allocation of memory failed in Mem::add_buffer!";
    mb->next = first_buffer;
    first_buffer = mb;
    pos = 0;
  }

 public:
  Mem() { first_buffer = NULL; add_buffer(); }
  ~Mem() { clear(); }

  // Frees all buffers.  The next call of alloc() starts a new buffer.
  void clear() {
    while (first_buffer) {
      MemBuffer *next = first_buffer->next;
      free(first_buffer);
      first_buffer = next;
    }
    pos = 0;
  }

  // Returns a block of at least n bytes taken from the current buffer.
  // A new buffer is started if the current one has not enough room left.
  // Throws a C string if n exceeds MEMBUFFER_SIZE or if malloc fails.
  void *alloc( size_t n ) {

    // Reject oversized requests first: rounding a huge n up could wrap
    // around to 0, and a request which can never be satisfied should not
    // consume a fresh buffer.
    if (n > (size_t)MEMBUFFER_SIZE)
      throw "Allocation of memory block larger than MEMBUFFER_SIZE attempted!";

    /* do memory alignment to multiples of 4 */
    if (n % 4)
      n += 4 - (n % 4);

    // MEMBUFFER_SIZE is a multiple of 4, so the rounded n still fits
    // into an empty buffer.
    if (first_buffer == NULL || (size_t)pos + n > (size_t)MEMBUFFER_SIZE)
      add_buffer();

    void *result = (void*)(first_buffer->buffer + pos);
    pos += (long)n;
    return result;
  }

  //class MemError {};

};
78
+
79
+ }
80
+ #endif
@@ -0,0 +1,1273 @@
1
+
2
+ /*******************************************************************/
3
+ /* */
4
+ /* FILE operators.C */
5
+ /* MODULE operators */
6
+ /* PROGRAM SFST */
7
+ /* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
8
+ /* */
9
+ /*******************************************************************/
10
+
11
+
12
+ #include "fst.h"
13
+
14
+ using std::pair;
15
+ using std::cerr;
16
+
17
+ namespace SFST {
18
+
19
+ typedef map<Character, vector<Arc*> > Sym2Arcs;
20
+
21
+ // special data structures for the optimization of composition
22
+ // All transitions outgoing from the same node and having the same
23
+ // symbol on the upper (or lower) layer are stored in a hash table
24
+ // for quick retrieval
25
+
26
+ /***************** class FromTo *********************************/
27
+
28
+ class FromTo {
29
+ public:
30
+ Index from, to; // start and end of a range of transitions
31
+ Index size() { return to - from; }
32
+ };
33
+
34
+ /***************** class NodeSym ********************************/
35
+
36
+ class NodeSym {
37
+ // pair consisting of a node and a symbol
38
+ public:
39
+ Index nodeID;
40
+ Character symbol;
41
+ NodeSym( Index n, Character s ) { nodeID = n; symbol = s; }
42
+ };
43
+
44
+
45
+ /***************** class CharNode2Trans **************************/
46
+
47
+ class CharNode2Trans {
48
+
49
+ struct hashf {
50
+ size_t operator()(const NodeSym &ns) const {
51
+ return ns.nodeID ^ ns.symbol;
52
+ }
53
+ };
54
+
55
+ struct equalf {
56
+ int operator()(const NodeSym &ns1, const NodeSym &ns2) const {
57
+ return (ns1.nodeID == ns2.nodeID && ns1.symbol == ns2.symbol);
58
+ }
59
+ };
60
+
61
+ typedef hash_map<NodeSym, FromTo, hashf, equalf > NodeSym2Range;
62
+
63
+ // data structure for storing an index from node + symbol to a list
64
+ // of transitions with that symbol on the upper/lower layer
65
+ Transducer &transducer;
66
+ vector<Index> node_size;
67
+ vector<Arc*> cs_transitions; // transitions for a certain character + state
68
+ NodeSym2Range trange;
69
+
70
+ public:
71
+
72
+ CharNode2Trans(Transducer &t);
73
+ size_t hash_transitions( Node *node, bool upper );
74
+
75
+ class iterator {
76
+ CharNode2Trans &c2t;
77
+ Index current, end;
78
+ public:
79
+ iterator( CharNode2Trans &table, Index nodeID, Character symbol )
80
+ : c2t(table)
81
+ {
82
+ FromTo range=c2t.trange[NodeSym(nodeID, symbol)];
83
+ current = range.from;
84
+ end = range.to;
85
+ }
86
+ void operator++( int ) { current++; }
87
+ Arc *operator*() { return c2t.cs_transitions[current]; }
88
+ bool finished() { return current == end; }
89
+ Index size() { return end-current; };
90
+ };
91
+ };
92
+
93
+
94
+ static void compose_nodes( Node*, Node*, Node*, Transducer*, PairMapping&,
95
+ CharNode2Trans&, CharNode2Trans& );
96
+
97
+
98
+ /*******************************************************************/
99
+ /* */
100
+ /* CharNode2Trans::CharNode2Trans */
101
+ /* */
102
+ /*******************************************************************/
103
+
104
+ CharNode2Trans::CharNode2Trans(Transducer &t): transducer(t)
105
+
106
+ {
107
+ pair<Index,Index> p = transducer.nodeindexing();
108
+ Index node_count = p.first;
109
+ Index transition_count = p.second;
110
+ node_size.resize(node_count, undef);
111
+ cs_transitions.reserve(transition_count);
112
+ }
113
+
114
+
115
+ /*******************************************************************/
116
+ /* */
117
+ /* CharNode2Trans::hash_transitions */
118
+ /* */
119
+ /*******************************************************************/
120
+
121
+ size_t CharNode2Trans::hash_transitions( Node *node, bool upper )
122
+
123
+ {
124
+ size_t n = node_size[node->index];
125
+ if (n != undef)
126
+ return n;
127
+
128
+ Sym2Arcs sym2arcs;
129
+
130
+ for( ArcsIter p(node->arcs()); p; p++ ) {
131
+ Arc *arc=p;
132
+ if (upper)
133
+ sym2arcs[arc->label().upper_char()].push_back(arc);
134
+ else
135
+ sym2arcs[arc->label().lower_char()].push_back(arc);
136
+ }
137
+
138
+ for( Sym2Arcs::iterator it=sym2arcs.begin(); it!=sym2arcs.end(); it++ ) {
139
+ Character sym = it->first;
140
+ vector<Arc*> &arc = it->second;
141
+ FromTo range;
142
+ range.from = (Index)cs_transitions.size();
143
+ for( size_t i=0; i<arc.size(); i++ )
144
+ cs_transitions.push_back( arc[i] );
145
+ range.to = (Index)cs_transitions.size();
146
+ trange[NodeSym(node->index, sym)] = range;
147
+ }
148
+ n = sym2arcs.size();
149
+ node_size[node->index] = (Index)n;
150
+ return n;
151
+ }
152
+
153
+
154
+ /*******************************************************************/
155
+ /* */
156
+ /* check_cyclicity */
157
+ /* */
158
+ /*******************************************************************/
159
+
160
+ static bool check_cyclicity( Node *node, NodeHashSet &visited,
161
+ const Alphabet &alphabet)
162
+ {
163
+
164
+ if (!visited.insert(node).second)
165
+ return true; // node was visited before
166
+
167
+ for( ArcsIter p(node->arcs()); p; p++ ) {
168
+ Arc *arc=p;
169
+ if (arc->label().upper_is_epsilon())
170
+ if (check_cyclicity(arc->target_node(), visited, alphabet)) {
171
+ cerr << alphabet.write_label(arc->label()) << "\n";
172
+ return true;
173
+ }
174
+ }
175
+ visited.erase(node);
176
+ return false;
177
+ }
178
+
179
+
180
+ /*******************************************************************/
181
+ /* */
182
+ /* Transducer::infinitely_ambiguous_node */
183
+ /* */
184
+ /*******************************************************************/
185
+
186
+ bool Transducer::infinitely_ambiguous_node( Node *node )
187
+
188
+ {
189
+ if (!node->was_visited( vmark )) {
190
+ NodeHashSet visited;
191
+ if (check_cyclicity(node, visited, alphabet))
192
+ return true;
193
+
194
+ // iterate over all outgoing arcs
195
+ for( ArcsIter p(node->arcs()); p; p++ ) {
196
+ Arc *arc=p;
197
+ if (infinitely_ambiguous_node( arc->target_node() ))
198
+ return true;
199
+ }
200
+ }
201
+ return false;
202
+ }
203
+
204
+
205
+ /*******************************************************************/
206
+ /* */
207
+ /* Transducer::is_infinitely_ambiguous */
208
+ /* */
209
+ /*******************************************************************/
210
+
211
+ bool Transducer::is_infinitely_ambiguous()
212
+
213
+ {
214
+ incr_vmark();
215
+ return infinitely_ambiguous_node(root_node());
216
+ }
217
+
218
+
219
+ /*******************************************************************/
220
+ /* */
221
+ /* Transducer::is_cyclic_node */
222
+ /* */
223
+ /*******************************************************************/
224
+
225
+ bool Transducer::is_cyclic_node( Node *node, NodeHashSet &previous )
226
+
227
+ {
228
+ if (!node->was_visited( vmark )) {
229
+ NodeHashSet visited;
230
+
231
+ NodeHashSet::iterator it=previous.insert(node).first;
232
+
233
+ // iterate over all outgoing arcs
234
+ for( ArcsIter p(node->arcs()); p; p++ ) {
235
+ Arc *arc=p;
236
+ if (previous.find(arc->target_node()) != previous.end() ||
237
+ is_cyclic_node( arc->target_node(), previous ))
238
+ return true;
239
+ }
240
+
241
+ previous.erase(it);
242
+ }
243
+ return false;
244
+ }
245
+
246
+
247
+ /*******************************************************************/
248
+ /* */
249
+ /* Transducer::is_cyclic */
250
+ /* */
251
+ /*******************************************************************/
252
+
253
+ bool Transducer::is_cyclic()
254
+
255
+ {
256
+ incr_vmark();
257
+ NodeHashSet previous;
258
+ return is_cyclic_node(root_node(), previous);
259
+ }
260
+
261
+
262
+
263
+ /*******************************************************************/
264
+ /* */
265
+ /* Transducer::is_automaton_node */
266
+ /* */
267
+ /*******************************************************************/
268
+
269
+ bool Transducer::is_automaton_node( Node *node )
270
+
271
+ {
272
+ if (!node->was_visited( vmark )) {
273
+ // iterate over all outgoing arcs
274
+ for( ArcsIter p(node->arcs()); p; p++ ) {
275
+ Arc *arc=p;
276
+ Label l=arc->label();
277
+ if (l.upper_char() != l.lower_char())
278
+ return false;
279
+ if (!is_automaton_node( arc->target_node()))
280
+ return false;
281
+ }
282
+ }
283
+ return true;
284
+ }
285
+
286
+
287
+ /*******************************************************************/
288
+ /* */
289
+ /* Transducer::is_automaton */
290
+ /* */
291
+ /*******************************************************************/
292
+
293
+ bool Transducer::is_automaton()
294
+
295
+ {
296
+ incr_vmark();
297
+ return is_automaton_node(root_node());
298
+ }
299
+
300
+
301
+ /*******************************************************************/
302
+ /* */
303
+ /* Transducer::is_empty */
304
+ /* */
305
+ /*******************************************************************/
306
+
307
+ bool Transducer::is_empty()
308
+
309
+ {
310
+ if (!minimised) {
311
+ Transducer *tmp=&minimise();
312
+ bool result=tmp->is_empty();
313
+ delete tmp;
314
+ return result;
315
+ }
316
+ if (root_node()->is_final())
317
+ return false;
318
+ return root_node()->arcs()->is_empty();
319
+ }
320
+
321
+
322
+ /*******************************************************************/
323
+ /* */
324
+ /* Transducer::generates_empty_string */
325
+ /* */
326
+ /*******************************************************************/
327
+
328
+ bool Transducer::generates_empty_string()
329
+
330
+ {
331
+ if (!minimised) {
332
+ Transducer *tmp=&minimise();
333
+ bool result=tmp->root_node()->is_final();
334
+ delete tmp;
335
+ return result;
336
+ }
337
+ return root_node()->is_final();
338
+ }
339
+
340
+
341
+ /*******************************************************************/
342
+ /* */
343
+ /* Transducer::reverse_node */
344
+ /* */
345
+ /*******************************************************************/
346
+
347
+ void Transducer::reverse_node( Node *node, Transducer *na )
348
+
349
+ {
350
+ if (!node->was_visited( vmark )) {
351
+
352
+ // create a new node
353
+ node->set_forward( na->new_node() );
354
+
355
+ if (node->is_final())
356
+ // add epsilon transition from new root to this node
357
+ na->root_node()->add_arc( Label(), node->forward(), na );
358
+
359
+ // iterate over all outgoing arcs
360
+ for( ArcsIter p(node->arcs()); p; p++ ) {
361
+ Arc *arc=p;
362
+
363
+ // reverse the subgraph headed by the target node
364
+ reverse_node( arc->target_node(), na );
365
+ Node *n = arc->target_node()->forward();
366
+
367
+ // create the reverse arc
368
+ n->add_arc( arc->label(), node->forward(), na );
369
+ }
370
+ }
371
+ }
372
+
373
+
374
+ /*******************************************************************/
375
+ /* */
376
+ /* Transducer::reverse */
377
+ /* */
378
+ /*******************************************************************/
379
+
380
+ Transducer &Transducer::reverse( bool copy_alphabet )
381
+
382
+ {
383
+ Transducer *na = new Transducer();
384
+ if (copy_alphabet)
385
+ na->alphabet.copy(alphabet);
386
+
387
+ incr_vmark();
388
+ reverse_node(root_node(), na);
389
+ root_node()->forward()->set_final(1);
390
+ return *na;
391
+ }
392
+
393
+
394
+ /*******************************************************************/
395
+ /* */
396
+ /* Transducer::recode_label */
397
+ /* */
398
+ /*******************************************************************/
399
+
400
+ Label Transducer::recode_label( Label l, bool lswitch, bool recode,
401
+ Alphabet &al )
402
+ {
403
+ if (lswitch)
404
+ l = Label(l.upper_char(), l.lower_char());
405
+
406
+ if (recode) {
407
+ Character lc = al.add_symbol(alphabet.code2symbol(l.lower_char()));
408
+ Character uc = al.add_symbol(alphabet.code2symbol(l.upper_char()));
409
+ l = Label(lc, uc);
410
+ al.insert(l);
411
+ }
412
+
413
+ return l;
414
+ }
415
+
416
+
417
+ /*******************************************************************/
418
+ /* */
419
+ /* Transducer::copy_nodes */
420
+ /* */
421
+ /*******************************************************************/
422
+
423
+ Node *Transducer::copy_nodes( Node *node, Transducer *a,
424
+ bool lswitch, bool recode )
425
+ {
426
+ if (!node->was_visited(vmark)) {
427
+
428
+ node->set_forward(a->new_node());
429
+
430
+ // define final nodes
431
+ if (node->is_final())
432
+ node->forward()->set_final(1);
433
+
434
+ // iterate over all outgoing arcs of node
435
+ for( ArcsIter p(node->arcs()); p; p++ ) {
436
+ Arc *arc=p;
437
+ Node *tn = copy_nodes( arc->target_node(), a, lswitch, recode );
438
+
439
+ // Add a link to the new node
440
+ Label l=recode_label(arc->label(), lswitch, recode, a->alphabet);
441
+ node->forward()->add_arc( l, tn, a );
442
+ }
443
+ }
444
+
445
+ return node->forward();
446
+ }
447
+
448
+
449
+ /*******************************************************************/
450
+ /* */
451
+ /* Transducer::copy */
452
+ /* */
453
+ /*******************************************************************/
454
+
455
+ Transducer &Transducer::copy( bool lswitch, const Alphabet *al )
456
+
457
+ {
458
+ bool recode = false;
459
+ Transducer *na = new Transducer();
460
+ if (al == NULL)
461
+ al = &alphabet;
462
+ else
463
+ recode = true;
464
+
465
+ na->alphabet.utf8 = al->utf8;
466
+ if (lswitch) {
467
+ na->alphabet.insert_symbols(*al);
468
+ for( Alphabet::iterator it=al->begin(); it!=al->end(); it++ ) {
469
+ Character lc=it->lower_char();
470
+ Character uc=it->upper_char();
471
+ na->alphabet.insert(Label(uc,lc));
472
+ }
473
+ }
474
+ else
475
+ na->alphabet.copy(*al);
476
+
477
+ na->deterministic = deterministic;
478
+ na->minimised = minimised;
479
+ na->root_node()->set_final(root_node()->is_final());
480
+ incr_vmark();
481
+
482
+ root_node()->set_forward(na->root_node());
483
+ root_node()->was_visited(vmark);
484
+
485
+ for( ArcsIter p(root_node()->arcs()); p; p++ ) {
486
+ Arc *arc=p;
487
+ Node *target_node=copy_nodes(arc->target_node(), na, lswitch, recode);
488
+ Label l = recode_label(arc->label(), lswitch, recode, na->alphabet);
489
+ na->root_node()->add_arc( l, target_node, na);
490
+ }
491
+
492
+ return *na;
493
+ }
494
+
495
+
496
+ /*******************************************************************/
497
+ /* */
498
+ /* Transducer::operator | */
499
+ /* */
500
+ /*******************************************************************/
501
+
502
+ Transducer &Transducer::operator|( Transducer &a )
503
+
504
+ {
505
+ Transducer *na = new Transducer();
506
+ na->alphabet.copy(alphabet);
507
+ na->alphabet.copy(a.alphabet);
508
+
509
+ incr_vmark();
510
+ na->root_node()->add_arc( Label(), copy_nodes(root_node(), na), na);
511
+ a.incr_vmark();
512
+ na->root_node()->add_arc( Label(), a.copy_nodes(a.root_node(), na), na);
513
+
514
+ return *na;
515
+ }
516
+
517
+
518
+ /*******************************************************************/
519
+ /* */
520
+ /* Transducer::rec_cat_nodes */
521
+ /* */
522
+ /*******************************************************************/
523
+
524
+ void Transducer::rec_cat_nodes( Node *node, Node *node2 )
525
+
526
+ {
527
+ if (!node->was_visited( vmark )) {
528
+
529
+ // iterate over all outgoing arcs of node
530
+ for( ArcsIter p(node->arcs()); p; p++ ) {
531
+ Arc *arc=p;
532
+ rec_cat_nodes( arc->target_node(), node2 );
533
+ }
534
+
535
+ if (node->is_final()) {
536
+ // link this node to node2
537
+ node->set_final(0);
538
+ node->add_arc( Label(), node2, this );
539
+ }
540
+ }
541
+ }
542
+
543
+
544
+ /*******************************************************************/
545
+ /* */
546
+ /* Transducer::operator+ */
547
+ /* */
548
+ /*******************************************************************/
549
+
550
+ Transducer &Transducer::operator+( Transducer &a )
551
+
552
+ {
553
+ Transducer *na = new Transducer();
554
+ na->alphabet.copy(alphabet);
555
+ na->alphabet.copy(a.alphabet);
556
+
557
+ // copy Transducer1 to the new Transducer
558
+ incr_vmark();
559
+ Node *node=copy_nodes(root_node(), na);
560
+ na->root_node()->add_arc( Label(), node, na);
561
+
562
+ // copy Transducer2 to the new Transducer
563
+ a.incr_vmark();
564
+ node=a.copy_nodes(a.root_node(), na);
565
+
566
+ // catenate the two automata
567
+ na->incr_vmark();
568
+ na->rec_cat_nodes(na->root_node(), node);
569
+
570
+ return *na;
571
+ }
572
+
573
+
574
+ /*******************************************************************/
575
+ /* */
576
+ /* Transducer::kleene_star */
577
+ /* (HFST addition: now works for cyclic transducers as well) */
578
+ /* */
579
+ /*******************************************************************/
580
+
581
+ Transducer &Transducer::kleene_star()
582
+
583
+ {
584
+ Transducer *na = &copy();
585
+ na->alphabet.copy(alphabet);
586
+
587
+ // HFST addition
588
+ Transducer eps;
589
+ eps.root_node()->set_final(1);
590
+ Transducer *tmp = &(eps + *na);
591
+ delete na;
592
+ na = tmp;
593
+
594
+ // link back to the start node
595
+ na->incr_vmark();
596
+ na->rec_cat_nodes(na->root_node(), na->root_node());
597
+
598
+ na->root_node()->set_final(1); // root node is already final
599
+ na->deterministic = na->minimised = false;
600
+
601
+ return *na;
602
+ }
603
+
604
+
605
+ /*******************************************************************/
606
+ /* */
607
+ /* Transducer::negate_nodes */
608
+ /* */
609
+ /*******************************************************************/
610
+
611
+ void Transducer::negate_nodes( Node *node, Node *accept )
612
+
613
+ {
614
+ if (!node->was_visited(vmark)) {
615
+ node->set_final( !node->is_final() );
616
+
617
+ for( ArcsIter p(node->arcs()); p; p++ ) {
618
+ Arc *arc=p;
619
+ negate_nodes( arc->target_node(), accept );
620
+ }
621
+
622
+ for( Alphabet::iterator it=alphabet.begin(); it!=alphabet.end(); it++)
623
+ if (!node->target_node(*it))
624
+ node->add_arc( *it, accept, this );
625
+ }
626
+ }
627
+
628
+
629
+ /*******************************************************************/
630
+ /* */
631
+ /* Transducer::operator! */
632
+ /* */
633
+ /*******************************************************************/
634
+
635
+ Transducer &Transducer::operator!()
636
+
637
+ {
638
+ Transducer *na;
639
+
640
+ if (alphabet.size() == 0) {
641
+ // throw "Negation of Transducer with undefined alphabet attempted!";
642
+ fprintf(stderr, "Warning: undefined alphabet\n");
643
+ na = new Transducer();
644
+ return *na;
645
+ }
646
+
647
+ if (minimised)
648
+ na = &copy();
649
+ else
650
+ na = &minimise();
651
+ na->alphabet.copy(alphabet);
652
+
653
+ Node *accept_node=na->new_node();
654
+ accept_node->set_final(1);
655
+ for( Alphabet::iterator it=alphabet.begin(); it!=alphabet.end(); it++)
656
+ accept_node->add_arc( *it, accept_node, na );
657
+
658
+ na->incr_vmark();
659
+ na->negate_nodes( na->root_node(), accept_node );
660
+ na->minimised = na->deterministic = false;
661
+
662
+ return *na;
663
+ }
664
+
665
+
666
+ /*******************************************************************/
667
+ /* */
668
+ /* conjoin_nodes */
669
+ /* */
670
+ /*******************************************************************/
671
+
672
+ static void conjoin_nodes( Node *n1, Node *n2, Node *node,
673
+ Transducer *a, PairMapping &map )
674
+
675
+ {
676
+ // if both input nodes are final, so is the new one
677
+ if (n1->is_final() && n2->is_final())
678
+ node->set_final(1);
679
+
680
+ // iterate over all outgoing arcs of the first node
681
+ for( ArcsIter i(n1->arcs()); i; i++ ) {
682
+ Arc *arc=i;
683
+ Label l=arc->label();
684
+ Node *t1 = arc->target_node();
685
+ Node *t2 = n2->target_node(l);
686
+
687
+ // Does the second node have an outgoing arc with the same label?
688
+ if (t2) {
689
+ // Check whether this node pair has been encountered before
690
+ PairMapping::iterator it=map.find(t1, t2);
691
+
692
+ if (it == map.end()) {
693
+ // new node pair
694
+ // create a new node in the conjunction Transducer
695
+ Node *target_node = a->new_node();
696
+ // map the target node pair to the new node
697
+ map[pair<Node*,Node*>(t1,t2)] = target_node;
698
+ // add an arc to the new node
699
+ node->add_arc( l, target_node, a );
700
+ // recursion
701
+ conjoin_nodes( t1, t2, target_node, a, map );
702
+ }
703
+ else {
704
+ // add an arc to the already existing target node
705
+ node->add_arc( l, it->second, a );
706
+ }
707
+ }
708
+ }
709
+ }
710
+
711
+
712
+ /*******************************************************************/
713
+ /* */
714
+ /* Transducer::operator & */
715
+ /* */
716
+ /*******************************************************************/
717
+
718
+ Transducer &Transducer::operator&( Transducer &a )
719
+
720
+ {
721
+ Transducer *tmp1=NULL;
722
+ Transducer *tmp2=NULL;
723
+ Node *r1, *r2;
724
+
725
+ if (deterministic)
726
+ r1 = root_node();
727
+ else {
728
+ tmp1 = &determinise();
729
+ r1 = tmp1->root_node();
730
+ }
731
+
732
+ if (a.deterministic)
733
+ r2 = a.root_node();
734
+ else {
735
+ tmp2 = &a.determinise();
736
+ r2 = tmp2->root_node();
737
+ }
738
+
739
+ PairMapping map;
740
+
741
+ Transducer *na = new Transducer();
742
+ na->alphabet.copy(alphabet);
743
+ na->alphabet.copy(a.alphabet);
744
+
745
+ // map the two root nodes to the new root node
746
+ map[pair<Node*,Node*>(r1, r2)] = na->root_node();
747
+
748
+ // recursively conjoin the two automata
749
+ conjoin_nodes( r1, r2, na->root_node(), na, map);
750
+
751
+ na->deterministic = 1;
752
+ delete tmp1;
753
+ delete tmp2;
754
+
755
+ return *na;
756
+ }
757
+
758
+
759
+ /*******************************************************************/
760
+ /* */
761
+ /* add_transition */
762
+ /* */
763
+ /*******************************************************************/
764
+
765
+ static void add_transition( Label l, Node *n1, Node *n2, Node *node,
766
+ Transducer *a, PairMapping &map,
767
+ CharNode2Trans &cn2trans1,
768
+ CharNode2Trans &cn2trans2 )
769
+
770
+ {
771
+ // fprintf(stderr,"transition from %u to %u with label %s\n",
772
+ // n1->index, n2->index, a->alphabet.write_label(l));
773
+
774
+ // Check whether this node pair has been encountered before
775
+ PairMapping::iterator it=map.find(n1, n2);
776
+
777
+ if (it != map.end()) {
778
+ // add an arc to the already existing target node
779
+ node->add_arc( l, it->second, a );
780
+ return;
781
+ }
782
+
783
+ // create a new node in the composed Transducer
784
+ Node *target_node = a->new_node();
785
+
786
+ // map the target node pair to the new node
787
+ map[pair<Node*,Node*>(n1,n2)] = target_node;
788
+
789
+ // add an arc to the new node
790
+ node->add_arc( l, target_node, a );
791
+
792
+ // recursion
793
+ compose_nodes( n1, n2, target_node, a, map, cn2trans1, cn2trans2 );
794
+ }
795
+
796
+
797
+ /*******************************************************************/
798
+ /* */
799
+ /* compose_nodes */
800
+ /* */
801
+ /*******************************************************************/
802
+
803
+ static void compose_nodes( Node *n1, Node *n2, Node *node, Transducer *a,
804
+ PairMapping &map, CharNode2Trans &cn2trans1,
805
+ CharNode2Trans &cn2trans2 )
806
+ {
807
+ // fprintf(stderr,"A%u || B%u\n",n1->index,n2->index);
808
+
809
+ // index upper character of first transducer
810
+ size_t size1 = cn2trans1.hash_transitions( n1, true );
811
+ // index lower character of second transducer
812
+ size_t size2 = cn2trans2.hash_transitions( n2, false );
813
+
814
+ // use the hashing of the transducer whose node is larger
815
+ bool hash2 = (size1 <= size2);
816
+
817
+ // if both input nodes are final, so is the new one
818
+ if (n1->is_final() && n2->is_final())
819
+ node->set_final(1);
820
+
821
+ if (hash2) {
822
+ // iterate over all outgoing arcs of the first node
823
+ for( ArcsIter i(n1->arcs()); i; i++ ) {
824
+ Arc *arc1=i;
825
+ Node *t1 = arc1->target_node();
826
+ Label l1=arc1->label();
827
+ Character uc1=l1.upper_char();
828
+ Character lc1=l1.lower_char();
829
+
830
+ if (uc1 == Label::epsilon)
831
+ add_transition( l1, t1, n2, node, a, map, cn2trans1, cn2trans2 );
832
+
833
+ else {
834
+ // iterate over the matching outgoing arcs of the second node
835
+ for( CharNode2Trans::iterator it(cn2trans2, n2->index, uc1 );
836
+ !it.finished(); it++ )
837
+ {
838
+ Arc *arc2 = *it;
839
+ Node *t2 = arc2->target_node();
840
+ Label l2=arc2->label();
841
+ assert(uc1 == l2.lower_char());
842
+ Character uc2=l2.upper_char();
843
+
844
+ add_transition( Label(lc1,uc2), t1, t2, node, a, map,
845
+ cn2trans1, cn2trans2 );
846
+ }
847
+ }
848
+ }
849
+
850
+ // epsilon input characters of the second Transducer
851
+ for( CharNode2Trans::iterator it(cn2trans2, n2->index, Label::epsilon );
852
+ !it.finished(); it++ )
853
+ {
854
+ Arc *arc2 = *it;
855
+ Node *t2 = arc2->target_node();
856
+ Label l=arc2->label();
857
+ assert(l.lower_char() == Label::epsilon);
858
+ add_transition( l, n1, t2, node, a, map, cn2trans1, cn2trans2 );
859
+ }
860
+ }
861
+
862
+ else { /* !hash2 */
863
+ // iterate over all outgoing arcs of the second node
864
+ for( ArcsIter i(n2->arcs()); i; i++ ) {
865
+ Arc *arc2=i;
866
+ Node *t2 = arc2->target_node();
867
+ Label l2=arc2->label();
868
+ Character uc2=l2.upper_char();
869
+ Character lc2=l2.lower_char();
870
+
871
+ if (lc2 == Label::epsilon)
872
+ add_transition( l2, n1, t2, node, a, map, cn2trans1, cn2trans2 );
873
+
874
+ else {
875
+ // iterate over the matching outgoing arcs of the first node
876
+ for( CharNode2Trans::iterator it(cn2trans1, n1->index, lc2 );
877
+ !it.finished(); it++ )
878
+ {
879
+ Arc *arc1 = *it;
880
+ Node *t1 = arc1->target_node();
881
+ Label l1=arc1->label();
882
+ assert(l1.upper_char() == lc2);
883
+ Character lc1=l1.lower_char();
884
+
885
+ add_transition( Label(lc1,uc2), t1, t2, node, a, map,
886
+ cn2trans1, cn2trans2 );
887
+ }
888
+ }
889
+ }
890
+
891
+ // epsilon output characters of the first Transducer
892
+ for( CharNode2Trans::iterator it(cn2trans1, n1->index, Label::epsilon );
893
+ !it.finished(); it++ )
894
+ {
895
+ Arc *arc1 = *it;
896
+ Node *t1 = arc1->target_node();
897
+ Label l=arc1->label();
898
+ assert(l.upper_char() == Label::epsilon);
899
+ add_transition( l, t1, n2, node, a, map, cn2trans1, cn2trans2 );
900
+ }
901
+ }
902
+ }
903
+
904
+
905
+ /*******************************************************************/
906
+ /* */
907
+ /* Transducer::operator || */
908
+ /* */
909
+ /*******************************************************************/
910
+
911
+ Transducer &Transducer::operator||( Transducer &a )
912
+
913
+ {
914
+ PairMapping map;
915
+
916
+ Transducer *na = new Transducer();
917
+ na->alphabet.compose(alphabet, a.alphabet);
918
+
919
+ // map the two root nodes to the new root node
920
+ map[pair<Node*,Node*>(root_node(), a.root_node())] = na->root_node();
921
+
922
+ // recursively compose the two automata
923
+ CharNode2Trans cn2trans1(*this);
924
+ CharNode2Trans cn2trans2(a);
925
+ compose_nodes( root_node(), a.root_node(), na->root_node(),
926
+ na, map, cn2trans1, cn2trans2 );
927
+
928
+ return *na;
929
+ }
930
+
931
+
932
+
933
+ /*******************************************************************/
934
+ /* */
935
+ /* Transducer::operator/ */
936
+ /* */
937
+ /*******************************************************************/
938
+
939
+ Transducer &Transducer::operator/( Transducer &a )
940
+
941
+ {
942
+ complete_alphabet();
943
+ a.alphabet.copy(alphabet);
944
+ // a-b = a & !b = a & !(a & b)
945
+ Transducer *a1 = &(*this & a);
946
+ Transducer *a2 = &(!*a1);
947
+ delete a1;
948
+ a1 = &(*this & *a2);
949
+ delete a2;
950
+ return *a1;
951
+ }
952
+
953
+
954
+ /*******************************************************************/
955
+ /* */
956
+ /* Transducer::compare_nodes */
957
+ /* */
958
+ /*******************************************************************/
959
+
960
+ bool Transducer::compare_nodes( Node *node, Node *node2, Transducer &a2 )
961
+
962
+ {
963
+ if (node->was_visited( vmark )) {
964
+ if (node2->was_visited( a2.vmark ))
965
+ return (node->forward() == node2 && node2->forward() == node);
966
+ else
967
+ return false;
968
+ }
969
+ else if (node2->was_visited( a2.vmark ))
970
+ return false;
971
+
972
+ node->set_forward( node2 );
973
+ node2->set_forward( node );
974
+
975
+ if (node->is_final() != node2->is_final())
976
+ return false;
977
+
978
+ // iterate over all outgoing arcs
979
+ for( ArcsIter p(node->arcs()); p; p++ ) {
980
+ Arc *arc=p;
981
+ Node *t2=node2->target_node(arc->label());
982
+
983
+ if (t2 == NULL)
984
+ return false;
985
+ else if (!compare_nodes(arc->target_node(), t2, a2))
986
+ return false;
987
+ }
988
+ for( ArcsIter p(node2->arcs()); p; p++ ) {
989
+ Arc *arc=p;
990
+ if (node->target_node(arc->label()) == NULL)
991
+ return false;
992
+ }
993
+
994
+ return true;
995
+ }
996
+
997
+
998
+ /*******************************************************************/
999
+ /* */
1000
+ /* Transducer::operator == */
1001
+ /* */
1002
+ /*******************************************************************/
1003
+
1004
+ bool Transducer::operator==( Transducer &a )
1005
+
1006
+ {
1007
+ Transducer *p1 = (minimised)? this: &minimise();
1008
+ Transducer *p2 = (a.minimised)? &a: &a.minimise();
1009
+
1010
+ p1->incr_vmark();
1011
+ p2->incr_vmark();
1012
+ bool result = p1->compare_nodes(p1->root_node(), p2->root_node(), *p2 );
1013
+
1014
+ if (p1 != this) delete p1;
1015
+ if (p2 != &a) delete p2;
1016
+
1017
+ return result;
1018
+ }
1019
+
1020
+
1021
+
1022
+ /*******************************************************************/
1023
+ /* */
1024
+ /* Transducer::map_nodes */
1025
+ /* */
1026
+ /*******************************************************************/
1027
+
1028
+ void Transducer::map_nodes( Node *node, Node *node2, Transducer *a, Level level)
1029
+
1030
+ {
1031
+ if (!node->was_visited(vmark)) {
1032
+
1033
+ node->set_forward(node2);
1034
+
1035
+ // define final nodes
1036
+ if (node->is_final())
1037
+ node2->set_final(1);
1038
+
1039
+ // iterate over all outgoing arcs of node
1040
+ for( ArcsIter p(node->arcs()); p; p++ ) {
1041
+ Arc *arc=p;
1042
+ Label l(arc->label().get_char(level));
1043
+ Node *t2=NULL, *t=arc->target_node();
1044
+
1045
+ if (t->check_visited(vmark))
1046
+ t2 = t->forward();
1047
+ else
1048
+ t2 = a->new_node(); // create a new node
1049
+
1050
+ node2->add_arc(l, t2, a); // add a link to the node
1051
+
1052
+ map_nodes( t, t2, a, level );
1053
+ }
1054
+ }
1055
+ }
1056
+
1057
+
1058
+ /*******************************************************************/
1059
+ /* */
1060
+ /* Transducer::level */
1061
+ /* */
1062
+ /*******************************************************************/
1063
+
1064
+ Transducer &Transducer::level( Level level )
1065
+
1066
+ {
1067
+ Transducer *na = new Transducer();
1068
+
1069
+ for( Alphabet::iterator it=alphabet.begin(); it!=alphabet.end(); it++ ) {
1070
+ Character c = it->get_char(level);
1071
+ if (alphabet.code2symbol(c) != NULL)
1072
+ na->alphabet.add_symbol( alphabet.code2symbol(c), c );
1073
+ na->alphabet.insert(Label(c));
1074
+ }
1075
+
1076
+ incr_vmark();
1077
+ map_nodes(root_node(), na->root_node(), na, level );
1078
+
1079
+ return *na;
1080
+ }
1081
+
1082
+
1083
+ /*******************************************************************/
1084
+ /* */
1085
+ /* Transducer::freely_insert_at_node */
1086
+ /* */
1087
+ /*******************************************************************/
1088
+
1089
+ void Transducer::freely_insert_at_node( Node *node, Label l )
1090
+
1091
+ {
1092
+ if (!node->was_visited(vmark)) {
1093
+ node->add_arc(l, node, this); // add a recursive link labelled with l
1094
+
1095
+ // iterate over all outgoing arcs of node
1096
+ for( ArcsIter p(node->arcs()); p; p++ ) {
1097
+ Arc *arc=p;
1098
+ freely_insert_at_node(arc->target_node(), l );
1099
+ }
1100
+ }
1101
+ }
1102
+
1103
+
1104
+ /*******************************************************************/
1105
+ /* */
1106
+ /* Transducer::freely_insert */
1107
+ /* */
1108
+ /*******************************************************************/
1109
+
1110
+ Transducer &Transducer::freely_insert( Label l )
1111
+
1112
+ {
1113
+ Transducer *na = &copy();
1114
+
1115
+ na->incr_vmark();
1116
+ na->freely_insert_at_node(na->root_node(), l );
1117
+
1118
+ return *na;
1119
+ }
1120
+
1121
+
1122
+ /*******************************************************************/
1123
+ /* */
1124
+ /* Transducer::splice_arc */
1125
+ /* */
1126
+ /*******************************************************************/
1127
+
1128
+ void Transducer::splice_arc( Node *node, Node *node2, Node *next_node,
1129
+ Transducer *a )
1130
+ {
1131
+ if (node->is_final()) {
1132
+ // link final node to the next node
1133
+ node2->add_arc( Label(), next_node, a );
1134
+ return;
1135
+ }
1136
+
1137
+ // iterate over the outgoing arcs
1138
+ for( ArcsIter p(node->arcs()); p; p++ ) {
1139
+ Arc *arc=p;
1140
+ Node *tn=a->new_node();
1141
+
1142
+ node2->add_arc( arc->label(), tn, a );
1143
+ splice_arc( arc->target_node(), tn, next_node, a );
1144
+ }
1145
+ }
1146
+
1147
+
1148
+ /*******************************************************************/
1149
+ /* */
1150
+ /* Transducer::splice_nodes */
1151
+ /* */
1152
+ /*******************************************************************/
1153
+
1154
+ void Transducer::splice_nodes(Node *node, Node *node2, Label sl,
1155
+ Transducer *sa, Transducer *a)
1156
+ {
1157
+ if (!node->was_visited(vmark)) {
1158
+
1159
+ node->set_forward(node2);
1160
+
1161
+ // define final nodes
1162
+ if (node->is_final())
1163
+ node2->set_final(1);
1164
+
1165
+ // iterate over all outgoing arcs of node
1166
+ for( ArcsIter p(node->arcs()); p; p++ ) {
1167
+ Arc *arc=p;
1168
+ Node *t2=NULL, *t=arc->target_node();
1169
+
1170
+ if (t->check_visited(vmark))
1171
+ t2 = t->forward();
1172
+ else
1173
+ t2 = a->new_node(); // create a new node
1174
+
1175
+ if (arc->label() == sl)
1176
+ // insert the transducer
1177
+ splice_arc(sa->root_node(), node2, t2, a);
1178
+ else
1179
+ // add a link to the node
1180
+ node2->add_arc(arc->label(), t2, a);
1181
+
1182
+ splice_nodes( t, t2, sl, sa, a );
1183
+ }
1184
+ }
1185
+ }
1186
+
1187
+
1188
+ /*******************************************************************/
1189
+ /* */
1190
+ /* Transducer::splice */
1191
+ /* */
1192
+ /*******************************************************************/
1193
+
1194
+ Transducer &Transducer::splice( Label sl, Transducer *sa )
1195
+
1196
+ {
1197
+ Alphabet::iterator it;
1198
+
1199
+ Transducer *na = new Transducer();
1200
+
1201
+ for( it=alphabet.begin(); it!=alphabet.end(); it++ ) {
1202
+ Label l = *it;
1203
+ if (l != sl)
1204
+ na->alphabet.insert(l);
1205
+ }
1206
+ for( it=sa->alphabet.begin(); it!=sa->alphabet.end(); it++ )
1207
+ na->alphabet.insert(*it);
1208
+
1209
+ incr_vmark();
1210
+ splice_nodes(root_node(), na->root_node(), sl, sa, na );
1211
+
1212
+ return *na;
1213
+ }
1214
+
1215
+
1216
+ /*******************************************************************/
1217
+ /* */
1218
+ /* Transducer::replace_char */
1219
+ /* */
1220
+ /*******************************************************************/
1221
+
1222
+ Transducer &Transducer::replace_char( Character c, Character nc )
1223
+
1224
+ {
1225
+ Alphabet::iterator it;
1226
+
1227
+ Transducer *na = new Transducer();
1228
+
1229
+ for( it=alphabet.begin(); it!=alphabet.end(); it++ ) {
1230
+ Label l = *it;
1231
+ na->alphabet.insert(l.replace_char(c,nc));
1232
+ }
1233
+
1234
+ incr_vmark();
1235
+ replace_char2(root_node(), na->root_node(), c, nc, na );
1236
+
1237
+ return *na;
1238
+ }
1239
+
1240
+
1241
+ /*******************************************************************/
1242
+ /* */
1243
+ /* Transducer::replace_char2 */
1244
+ /* */
1245
+ /*******************************************************************/
1246
+
1247
+ void Transducer::replace_char2(Node *node, Node *node2, Character c,
1248
+ Character nc, Transducer *a)
1249
+ {
1250
+ if (!node->was_visited(vmark)) {
1251
+
1252
+ node->set_forward(node2);
1253
+
1254
+ // define final nodes
1255
+ if (node->is_final())
1256
+ node2->set_final(1);
1257
+
1258
+ // iterate over all outgoing arcs of node
1259
+ for( ArcsIter p(node->arcs()); p; p++ ) {
1260
+ Arc *arc=p;
1261
+ Node *t2=NULL, *t=arc->target_node();
1262
+
1263
+ if (t->check_visited(vmark))
1264
+ t2 = t->forward();
1265
+ else
1266
+ t2 = a->new_node(); // create a new node
1267
+
1268
+ node2->add_arc(arc->label().replace_char(c, nc), t2, a);
1269
+ replace_char2( t, t2, c, nc, a );
1270
+ }
1271
+ }
1272
+ }
1273
+ }