ruby-sfst 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +1 -0
- data/Manifest +1 -0
- data/README.rdoc +2 -0
- data/Rakefile +1 -1
- data/ext/sfst_machine/alphabet.C +18 -14
- data/ext/sfst_machine/alphabet.h +10 -19
- data/ext/sfst_machine/compact.C +2 -2
- data/ext/sfst_machine/determinise.C +0 -1
- data/ext/sfst_machine/extconf.rb +1 -1
- data/ext/sfst_machine/fst-compiler.C +127 -4
- data/ext/sfst_machine/fst-compiler.h +2 -2
- data/ext/sfst_machine/fst-compiler.yy +3 -3
- data/ext/sfst_machine/fst.C +54 -20
- data/ext/sfst_machine/fst.h +15 -11
- data/ext/sfst_machine/interface.C +62 -58
- data/ext/sfst_machine/interface.h +0 -1
- data/ext/sfst_machine/make-compact.C +0 -1
- data/ext/sfst_machine/sfst_machine.cc +0 -1
- data/ext/sfst_machine/sgi.h +44 -0
- data/ext/sfst_machine/utf8-scanner.C +73 -91
- data/ext/sfst_machine/utf8-scanner.ll +24 -28
- data/ruby-sfst.gemspec +8 -6
- metadata +5 -4
@@ -16,8 +16,8 @@ using std::ofstream;
|
|
16
16
|
#include <set>
|
17
17
|
using std::set;
|
18
18
|
|
19
|
-
|
20
|
-
|
19
|
+
#include "sgi.h"
|
20
|
+
|
21
21
|
using std::cerr;
|
22
22
|
using std::cout;
|
23
23
|
using std::vector;
|
@@ -199,10 +199,9 @@ static bool in_range( unsigned int c, Range *r )
|
|
199
199
|
static void free_values( Range *r )
|
200
200
|
|
201
201
|
{
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
delete tmp;
|
202
|
+
if (r) {
|
203
|
+
free_values(r->next);
|
204
|
+
delete r;
|
206
205
|
}
|
207
206
|
}
|
208
207
|
|
@@ -216,11 +215,9 @@ static void free_values( Range *r )
|
|
216
215
|
static void free_values( Ranges *r )
|
217
216
|
|
218
217
|
{
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
free_values(tmp->range);
|
223
|
-
delete tmp;
|
218
|
+
if (r) {
|
219
|
+
free_values(r->next);
|
220
|
+
delete r;
|
224
221
|
}
|
225
222
|
}
|
226
223
|
|
@@ -234,10 +231,9 @@ static void free_values( Ranges *r )
|
|
234
231
|
static void free_contexts( Contexts *c )
|
235
232
|
|
236
233
|
{
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
delete tmp;
|
234
|
+
if (c) {
|
235
|
+
free_contexts(c->next);
|
236
|
+
delete c;
|
241
237
|
}
|
242
238
|
}
|
243
239
|
|
@@ -269,6 +265,8 @@ Range *complement_range( Range *r )
|
|
269
265
|
vector<Character> sym;
|
270
266
|
for( Range *p=r; p; p=p->next)
|
271
267
|
sym.push_back( p->character );
|
268
|
+
free_values( r );
|
269
|
+
|
272
270
|
TheAlphabet.complement(sym);
|
273
271
|
if (sym.size() == 0)
|
274
272
|
error("Empty character range!");
|
@@ -393,9 +391,10 @@ Transducer *read_words( char *filename )
|
|
393
391
|
ifstream is(filename);
|
394
392
|
if (!is.is_open()) {
|
395
393
|
static char message[1000];
|
396
|
-
sprintf(message,"Error: Cannot open file \"%s\"!",filename);
|
394
|
+
sprintf(message,"Error: Cannot open file \"%s\"!", filename);
|
397
395
|
throw message;
|
398
396
|
}
|
397
|
+
free( filename );
|
399
398
|
Transducer *t = new Transducer(is, &TheAlphabet, Verbose);
|
400
399
|
is.close();
|
401
400
|
TheAlphabet.insert_symbols(t->alphabet);
|
@@ -430,6 +429,7 @@ Transducer *read_transducer( char *filename )
|
|
430
429
|
filename);
|
431
430
|
throw message;
|
432
431
|
}
|
432
|
+
free( filename );
|
433
433
|
Transducer *nt = &t.copy(false, &TheAlphabet);
|
434
434
|
TheAlphabet.insert_symbols(nt->alphabet);
|
435
435
|
if (Verbose)
|
@@ -1405,8 +1405,8 @@ Transducer *restriction( Transducer *t, Twol_Type type, Contexts *c,
|
|
1405
1405
|
/* */
|
1406
1406
|
/*******************************************************************/
|
1407
1407
|
|
1408
|
-
Transducer *constrain_boundary_transducer( Character leftm,
|
1409
|
-
|
1408
|
+
static Transducer *constrain_boundary_transducer( Character leftm,
|
1409
|
+
Character rightm )
|
1410
1410
|
{
|
1411
1411
|
// create the transducer (.|<L>|<R>)*
|
1412
1412
|
|
@@ -1436,17 +1436,17 @@ Transducer *constrain_boundary_transducer( Character leftm, Character rightm )
|
|
1436
1436
|
|
1437
1437
|
/*******************************************************************/
|
1438
1438
|
/* */
|
1439
|
-
/*
|
1439
|
+
/* extended_left_transducer */
|
1440
1440
|
/* */
|
1441
1441
|
/*******************************************************************/
|
1442
1442
|
|
1443
|
-
Transducer *
|
1444
|
-
|
1443
|
+
static Transducer *extended_left_transducer( Transducer *t,
|
1444
|
+
Character m1, Character m2 )
|
1445
1445
|
{
|
1446
1446
|
if (t == NULL) // empty context
|
1447
1447
|
return pi_machine(TheAlphabet);
|
1448
1448
|
|
1449
|
-
// Extended context transducer
|
1449
|
+
// Extended left context transducer
|
1450
1450
|
|
1451
1451
|
// <R> >> (<L> >> $T$)
|
1452
1452
|
Transducer *tmp=&t->freely_insert( Label(m1) );
|
@@ -1475,17 +1475,17 @@ Transducer *extended_context( Transducer *t, Character m1, Character m2 )
|
|
1475
1475
|
|
1476
1476
|
/*******************************************************************/
|
1477
1477
|
/* */
|
1478
|
-
/*
|
1478
|
+
/* left_context */
|
1479
1479
|
/* */
|
1480
1480
|
/*******************************************************************/
|
1481
1481
|
|
1482
|
-
Transducer *
|
1482
|
+
static Transducer *left_context( Transducer *t, Character m1, Character m2 )
|
1483
1483
|
|
1484
1484
|
{
|
1485
|
-
//
|
1486
|
-
Transducer *ct =
|
1485
|
+
// .* (<R> >> (<L> >> $T$)) || !(.*<L>)
|
1486
|
+
Transducer *ct = extended_left_transducer(t, m1, m2);
|
1487
1487
|
|
1488
|
-
// <R
|
1488
|
+
// <R>* <L> .*
|
1489
1489
|
Transducer *mt = one_label_transducer(Label(m1));
|
1490
1490
|
mt->root_node()->add_arc(Label(m2), mt->root_node(), mt );
|
1491
1491
|
add_pi_transitions(mt, mt->root_node()->target_node(Label(m1)),TheAlphabet);
|
@@ -1496,6 +1496,17 @@ Transducer *replace_context( Transducer *t, Character m1, Character m2 )
|
|
1496
1496
|
mt->alphabet.copy(TheAlphabet);
|
1497
1497
|
Transducer *no_mt = &!*mt;
|
1498
1498
|
|
1499
|
+
{
|
1500
|
+
static int print=1;
|
1501
|
+
if (print) {
|
1502
|
+
print = 0;
|
1503
|
+
Transducer *temp = &(ct->copy());
|
1504
|
+
temp = &(no_ct->copy());
|
1505
|
+
temp = &(mt->copy());
|
1506
|
+
temp = &(no_mt->copy());
|
1507
|
+
}
|
1508
|
+
}
|
1509
|
+
|
1499
1510
|
Transducer *t1 = &(*no_ct + *mt);
|
1500
1511
|
delete no_ct;
|
1501
1512
|
delete mt;
|
@@ -1596,8 +1607,8 @@ Transducer *replace( Transducer *ct, Repl_Type type, bool optional )
|
|
1596
1607
|
/* */
|
1597
1608
|
/*******************************************************************/
|
1598
1609
|
|
1599
|
-
Transducer *replace_transducer( Transducer *ct, Character lm,
|
1600
|
-
|
1610
|
+
static Transducer *replace_transducer( Transducer *ct, Character lm,
|
1611
|
+
Character rm, Repl_Type type )
|
1601
1612
|
{
|
1602
1613
|
// insert boundary markers into the center transducer
|
1603
1614
|
|
@@ -1634,6 +1645,9 @@ Transducer *replace_transducer( Transducer *ct, Character lm, Character rm,
|
|
1634
1645
|
Transducer *replace_in_context( Transducer *t, Repl_Type type, Contexts *c,
|
1635
1646
|
bool optional )
|
1636
1647
|
{
|
1648
|
+
// The implementation of the replace operators is based on
|
1649
|
+
// "The Replace Operator" by Lauri Karttunen
|
1650
|
+
|
1637
1651
|
if (!Alphabet_Defined)
|
1638
1652
|
error("The replace operators require the definition of an alphabet");
|
1639
1653
|
|
@@ -1645,7 +1659,7 @@ Transducer *replace_in_context( Transducer *t, Repl_Type type, Contexts *c,
|
|
1645
1659
|
Character rightm = TheAlphabet.new_marker();
|
1646
1660
|
|
1647
1661
|
/////////////////////////////////////////////////////////////
|
1648
|
-
// Create the insert
|
1662
|
+
// Create the insert boundaries transducer (.|<>:<L>|<>:<R>)*
|
1649
1663
|
/////////////////////////////////////////////////////////////
|
1650
1664
|
|
1651
1665
|
Transducer *ibt=pi_machine(TheAlphabet);
|
@@ -1654,7 +1668,7 @@ Transducer *replace_in_context( Transducer *t, Repl_Type type, Contexts *c,
|
|
1654
1668
|
root->add_arc( Label(Label::epsilon, rightm),root, ibt);
|
1655
1669
|
|
1656
1670
|
/////////////////////////////////////////////////////////////
|
1657
|
-
// Create the remove
|
1671
|
+
// Create the remove boundaries transducer (.|<L>:<>|<R>:<>)*
|
1658
1672
|
/////////////////////////////////////////////////////////////
|
1659
1673
|
|
1660
1674
|
Transducer *rbt=pi_machine(TheAlphabet);
|
@@ -1667,7 +1681,7 @@ Transducer *replace_in_context( Transducer *t, Repl_Type type, Contexts *c,
|
|
1667
1681
|
TheAlphabet.insert(Label(rightm));
|
1668
1682
|
|
1669
1683
|
/////////////////////////////////////////////////////////////
|
1670
|
-
// Create the constrain
|
1684
|
+
// Create the constrain boundaries transducer !(.*<L><R>.*)
|
1671
1685
|
/////////////////////////////////////////////////////////////
|
1672
1686
|
|
1673
1687
|
Transducer *cbt=constrain_boundary_transducer(leftm, rightm);
|
@@ -1677,12 +1691,12 @@ Transducer *replace_in_context( Transducer *t, Repl_Type type, Contexts *c,
|
|
1677
1691
|
/////////////////////////////////////////////////////////////
|
1678
1692
|
|
1679
1693
|
// left context transducer: .* (<R> >> (<L> >> $T$)) || !(.*<L>)
|
1680
|
-
Transducer *lct =
|
1694
|
+
Transducer *lct = left_context(c->left, leftm, rightm);
|
1681
1695
|
|
1682
1696
|
// right context transducer: (<R> >> (<L> >> $T$)) .* || !(<R>.*)
|
1683
1697
|
Transducer *tmp = &c->right->reverse();
|
1684
1698
|
delete c->right;
|
1685
|
-
Transducer *t2 =
|
1699
|
+
Transducer *t2 = left_context(tmp, rightm, leftm);
|
1686
1700
|
Transducer *rct = &t2->reverse();
|
1687
1701
|
delete t2;
|
1688
1702
|
|
@@ -1700,23 +1714,30 @@ Transducer *replace_in_context( Transducer *t, Repl_Type type, Contexts *c,
|
|
1700
1714
|
// build the conditional replacement transducer
|
1701
1715
|
/////////////////////////////////////////////////////////////
|
1702
1716
|
|
1717
|
+
tmp = &(ibt->copy());
|
1718
|
+
tmp = &(cbt->copy());
|
1719
|
+
tmp = &(lct->copy());
|
1720
|
+
tmp = &(rct->copy());
|
1721
|
+
tmp = &(rt->copy());
|
1722
|
+
tmp = &(rbt->copy());
|
1723
|
+
|
1703
1724
|
tmp = ibt;
|
1704
1725
|
tmp = &(*ibt || *cbt);
|
1705
1726
|
delete(ibt);
|
1706
1727
|
delete(cbt);
|
1707
1728
|
|
1708
|
-
if (type == repl_up || type == repl_right) {
|
1709
|
-
t2 = &(*tmp || *rct);
|
1710
|
-
delete tmp;
|
1711
|
-
delete rct;
|
1712
|
-
tmp = t2;
|
1713
|
-
}
|
1714
1729
|
if (type == repl_up || type == repl_left) {
|
1715
1730
|
t2 = &(*tmp || *lct);
|
1716
1731
|
delete tmp;
|
1717
1732
|
delete lct;
|
1718
1733
|
tmp = t2;
|
1719
1734
|
}
|
1735
|
+
if (type == repl_up || type == repl_right) {
|
1736
|
+
t2 = &(*tmp || *rct);
|
1737
|
+
delete tmp;
|
1738
|
+
delete rct;
|
1739
|
+
tmp = t2;
|
1740
|
+
}
|
1720
1741
|
|
1721
1742
|
t2 = &(*tmp || *rt);
|
1722
1743
|
delete tmp;
|
@@ -1766,24 +1787,6 @@ void add_alphabet( Transducer *t )
|
|
1766
1787
|
}
|
1767
1788
|
|
1768
1789
|
|
1769
|
-
/*******************************************************************/
|
1770
|
-
/* */
|
1771
|
-
/* store_transducer */
|
1772
|
-
/* */
|
1773
|
-
/*******************************************************************/
|
1774
|
-
|
1775
|
-
void store_transducer( Transducer *t, char *filename )
|
1776
|
-
|
1777
|
-
{
|
1778
|
-
if (filename == NULL)
|
1779
|
-
cout << *t;
|
1780
|
-
else {
|
1781
|
-
ofstream os(filename);
|
1782
|
-
os << *t;
|
1783
|
-
os.close();
|
1784
|
-
}
|
1785
|
-
}
|
1786
|
-
|
1787
1790
|
/*******************************************************************/
|
1788
1791
|
/* */
|
1789
1792
|
/* write_to_file */
|
@@ -1798,6 +1801,7 @@ void write_to_file( Transducer *t, char *filename)
|
|
1798
1801
|
fprintf(stderr,"\nError: Cannot open output file \"%s\"\n\n", filename);
|
1799
1802
|
exit(1);
|
1800
1803
|
}
|
1804
|
+
free( filename );
|
1801
1805
|
|
1802
1806
|
t = explode(t);
|
1803
1807
|
add_alphabet(t);
|
@@ -56,7 +56,6 @@ Range *add_var_values( char *name, Range*);
|
|
56
56
|
Range *add_values( unsigned int, unsigned int, Range*);
|
57
57
|
Range *append_values( Range *r2, Range *r );
|
58
58
|
void add_alphabet( Transducer* );
|
59
|
-
void store_transducer( Transducer *a, char *filename );
|
60
59
|
|
61
60
|
// These functions delete their argument automata
|
62
61
|
|
@@ -366,7 +366,6 @@ static VALUE _regular_transducer_analyze_or_generate(Transducer *t, VALUE string
|
|
366
366
|
delete a3;
|
367
367
|
|
368
368
|
a2->alphabet.copy(t->alphabet);
|
369
|
-
t->incr_vmark();
|
370
369
|
bool accepted = _regular_transducer_yield(a2, a2->root_node(), rb_ary_new());
|
371
370
|
delete a2;
|
372
371
|
|
@@ -0,0 +1,44 @@
|
|
1
|
+
|
2
|
+
/*******************************************************************/
|
3
|
+
/* */
|
4
|
+
/* File: sgi.h */
|
5
|
+
/* Author: Helmut Schmid */
|
6
|
+
/* Purpose: */
|
7
|
+
/* Created: Thu Sep 11 15:58:25 2008 */
|
8
|
+
/* Modified: Fri Sep 12 08:17:03 2008 (schmid) */
|
9
|
+
/* */
|
10
|
+
/*******************************************************************/
|
11
|
+
|
12
|
+
#ifndef _SGI_INCLUDED
|
13
|
+
#define _SGI_INCLUDED
|
14
|
+
|
15
|
+
|
16
|
+
#ifdef SGIext
|
17
|
+
|
18
|
+
#include <ext/hash_map>
|
19
|
+
#include <ext/hash_set>
|
20
|
+
using std::hash_map;
|
21
|
+
using std::hash_set;
|
22
|
+
using std::hash;
|
23
|
+
|
24
|
+
#else
|
25
|
+
|
26
|
+
#ifdef SGI__gnu_cxx
|
27
|
+
|
28
|
+
#include <ext/hash_map>
|
29
|
+
#include <ext/hash_set>
|
30
|
+
|
31
|
+
#else
|
32
|
+
|
33
|
+
#include <backward/hash_map>
|
34
|
+
#include <backward/hash_set>
|
35
|
+
|
36
|
+
#endif
|
37
|
+
|
38
|
+
using __gnu_cxx::hash_map;
|
39
|
+
using __gnu_cxx::hash_set;
|
40
|
+
using __gnu_cxx::hash;
|
41
|
+
|
42
|
+
#endif
|
43
|
+
|
44
|
+
#endif
|