ruby-sfst 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +1 -0
- data/Manifest +1 -0
- data/README.rdoc +2 -0
- data/Rakefile +1 -1
- data/ext/sfst_machine/alphabet.C +18 -14
- data/ext/sfst_machine/alphabet.h +10 -19
- data/ext/sfst_machine/compact.C +2 -2
- data/ext/sfst_machine/determinise.C +0 -1
- data/ext/sfst_machine/extconf.rb +1 -1
- data/ext/sfst_machine/fst-compiler.C +127 -4
- data/ext/sfst_machine/fst-compiler.h +2 -2
- data/ext/sfst_machine/fst-compiler.yy +3 -3
- data/ext/sfst_machine/fst.C +54 -20
- data/ext/sfst_machine/fst.h +15 -11
- data/ext/sfst_machine/interface.C +62 -58
- data/ext/sfst_machine/interface.h +0 -1
- data/ext/sfst_machine/make-compact.C +0 -1
- data/ext/sfst_machine/sfst_machine.cc +0 -1
- data/ext/sfst_machine/sgi.h +44 -0
- data/ext/sfst_machine/utf8-scanner.C +73 -91
- data/ext/sfst_machine/utf8-scanner.ll +24 -28
- data/ruby-sfst.gemspec +8 -6
- metadata +5 -4
@@ -16,8 +16,8 @@ using std::ofstream;
|
|
16
16
|
#include <set>
|
17
17
|
using std::set;
|
18
18
|
|
19
|
-
|
20
|
-
|
19
|
+
#include "sgi.h"
|
20
|
+
|
21
21
|
using std::cerr;
|
22
22
|
using std::cout;
|
23
23
|
using std::vector;
|
@@ -199,10 +199,9 @@ static bool in_range( unsigned int c, Range *r )
|
|
199
199
|
static void free_values( Range *r )
|
200
200
|
|
201
201
|
{
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
delete tmp;
|
202
|
+
if (r) {
|
203
|
+
free_values(r->next);
|
204
|
+
delete r;
|
206
205
|
}
|
207
206
|
}
|
208
207
|
|
@@ -216,11 +215,9 @@ static void free_values( Range *r )
|
|
216
215
|
static void free_values( Ranges *r )
|
217
216
|
|
218
217
|
{
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
free_values(tmp->range);
|
223
|
-
delete tmp;
|
218
|
+
if (r) {
|
219
|
+
free_values(r->next);
|
220
|
+
delete r;
|
224
221
|
}
|
225
222
|
}
|
226
223
|
|
@@ -234,10 +231,9 @@ static void free_values( Ranges *r )
|
|
234
231
|
static void free_contexts( Contexts *c )
|
235
232
|
|
236
233
|
{
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
delete tmp;
|
234
|
+
if (c) {
|
235
|
+
free_contexts(c->next);
|
236
|
+
delete c;
|
241
237
|
}
|
242
238
|
}
|
243
239
|
|
@@ -269,6 +265,8 @@ Range *complement_range( Range *r )
|
|
269
265
|
vector<Character> sym;
|
270
266
|
for( Range *p=r; p; p=p->next)
|
271
267
|
sym.push_back( p->character );
|
268
|
+
free_values( r );
|
269
|
+
|
272
270
|
TheAlphabet.complement(sym);
|
273
271
|
if (sym.size() == 0)
|
274
272
|
error("Empty character range!");
|
@@ -393,9 +391,10 @@ Transducer *read_words( char *filename )
|
|
393
391
|
ifstream is(filename);
|
394
392
|
if (!is.is_open()) {
|
395
393
|
static char message[1000];
|
396
|
-
sprintf(message,"Error: Cannot open file \"%s\"!",filename);
|
394
|
+
sprintf(message,"Error: Cannot open file \"%s\"!", filename);
|
397
395
|
throw message;
|
398
396
|
}
|
397
|
+
free( filename );
|
399
398
|
Transducer *t = new Transducer(is, &TheAlphabet, Verbose);
|
400
399
|
is.close();
|
401
400
|
TheAlphabet.insert_symbols(t->alphabet);
|
@@ -430,6 +429,7 @@ Transducer *read_transducer( char *filename )
|
|
430
429
|
filename);
|
431
430
|
throw message;
|
432
431
|
}
|
432
|
+
free( filename );
|
433
433
|
Transducer *nt = &t.copy(false, &TheAlphabet);
|
434
434
|
TheAlphabet.insert_symbols(nt->alphabet);
|
435
435
|
if (Verbose)
|
@@ -1405,8 +1405,8 @@ Transducer *restriction( Transducer *t, Twol_Type type, Contexts *c,
|
|
1405
1405
|
/* */
|
1406
1406
|
/*******************************************************************/
|
1407
1407
|
|
1408
|
-
Transducer *constrain_boundary_transducer( Character leftm,
|
1409
|
-
|
1408
|
+
static Transducer *constrain_boundary_transducer( Character leftm,
|
1409
|
+
Character rightm )
|
1410
1410
|
{
|
1411
1411
|
// create the transducer (.|<L>|<R>)*
|
1412
1412
|
|
@@ -1436,17 +1436,17 @@ Transducer *constrain_boundary_transducer( Character leftm, Character rightm )
|
|
1436
1436
|
|
1437
1437
|
/*******************************************************************/
|
1438
1438
|
/* */
|
1439
|
-
/*
|
1439
|
+
/* extended_left_transducer */
|
1440
1440
|
/* */
|
1441
1441
|
/*******************************************************************/
|
1442
1442
|
|
1443
|
-
Transducer *
|
1444
|
-
|
1443
|
+
static Transducer *extended_left_transducer( Transducer *t,
|
1444
|
+
Character m1, Character m2 )
|
1445
1445
|
{
|
1446
1446
|
if (t == NULL) // empty context
|
1447
1447
|
return pi_machine(TheAlphabet);
|
1448
1448
|
|
1449
|
-
// Extended context transducer
|
1449
|
+
// Extended left context transducer
|
1450
1450
|
|
1451
1451
|
// <R> >> (<L> >> $T$)
|
1452
1452
|
Transducer *tmp=&t->freely_insert( Label(m1) );
|
@@ -1475,17 +1475,17 @@ Transducer *extended_context( Transducer *t, Character m1, Character m2 )
|
|
1475
1475
|
|
1476
1476
|
/*******************************************************************/
|
1477
1477
|
/* */
|
1478
|
-
/*
|
1478
|
+
/* left_context */
|
1479
1479
|
/* */
|
1480
1480
|
/*******************************************************************/
|
1481
1481
|
|
1482
|
-
Transducer *
|
1482
|
+
static Transducer *left_context( Transducer *t, Character m1, Character m2 )
|
1483
1483
|
|
1484
1484
|
{
|
1485
|
-
//
|
1486
|
-
Transducer *ct =
|
1485
|
+
// .* (<R> >> (<L> >> $T$)) || !(.*<L>)
|
1486
|
+
Transducer *ct = extended_left_transducer(t, m1, m2);
|
1487
1487
|
|
1488
|
-
// <R
|
1488
|
+
// <R>* <L> .*
|
1489
1489
|
Transducer *mt = one_label_transducer(Label(m1));
|
1490
1490
|
mt->root_node()->add_arc(Label(m2), mt->root_node(), mt );
|
1491
1491
|
add_pi_transitions(mt, mt->root_node()->target_node(Label(m1)),TheAlphabet);
|
@@ -1496,6 +1496,17 @@ Transducer *replace_context( Transducer *t, Character m1, Character m2 )
|
|
1496
1496
|
mt->alphabet.copy(TheAlphabet);
|
1497
1497
|
Transducer *no_mt = &!*mt;
|
1498
1498
|
|
1499
|
+
{
|
1500
|
+
static int print=1;
|
1501
|
+
if (print) {
|
1502
|
+
print = 0;
|
1503
|
+
Transducer *temp = &(ct->copy());
|
1504
|
+
temp = &(no_ct->copy());
|
1505
|
+
temp = &(mt->copy());
|
1506
|
+
temp = &(no_mt->copy());
|
1507
|
+
}
|
1508
|
+
}
|
1509
|
+
|
1499
1510
|
Transducer *t1 = &(*no_ct + *mt);
|
1500
1511
|
delete no_ct;
|
1501
1512
|
delete mt;
|
@@ -1596,8 +1607,8 @@ Transducer *replace( Transducer *ct, Repl_Type type, bool optional )
|
|
1596
1607
|
/* */
|
1597
1608
|
/*******************************************************************/
|
1598
1609
|
|
1599
|
-
Transducer *replace_transducer( Transducer *ct, Character lm,
|
1600
|
-
|
1610
|
+
static Transducer *replace_transducer( Transducer *ct, Character lm,
|
1611
|
+
Character rm, Repl_Type type )
|
1601
1612
|
{
|
1602
1613
|
// insert boundary markers into the center transducer
|
1603
1614
|
|
@@ -1634,6 +1645,9 @@ Transducer *replace_transducer( Transducer *ct, Character lm, Character rm,
|
|
1634
1645
|
Transducer *replace_in_context( Transducer *t, Repl_Type type, Contexts *c,
|
1635
1646
|
bool optional )
|
1636
1647
|
{
|
1648
|
+
// The implementation of the replace operators is based on
|
1649
|
+
// "The Replace Operator" by Lauri Karttunen
|
1650
|
+
|
1637
1651
|
if (!Alphabet_Defined)
|
1638
1652
|
error("The replace operators require the definition of an alphabet");
|
1639
1653
|
|
@@ -1645,7 +1659,7 @@ Transducer *replace_in_context( Transducer *t, Repl_Type type, Contexts *c,
|
|
1645
1659
|
Character rightm = TheAlphabet.new_marker();
|
1646
1660
|
|
1647
1661
|
/////////////////////////////////////////////////////////////
|
1648
|
-
// Create the insert
|
1662
|
+
// Create the insert boundaries transducer (.|<>:<L>|<>:<R>)*
|
1649
1663
|
/////////////////////////////////////////////////////////////
|
1650
1664
|
|
1651
1665
|
Transducer *ibt=pi_machine(TheAlphabet);
|
@@ -1654,7 +1668,7 @@ Transducer *replace_in_context( Transducer *t, Repl_Type type, Contexts *c,
|
|
1654
1668
|
root->add_arc( Label(Label::epsilon, rightm),root, ibt);
|
1655
1669
|
|
1656
1670
|
/////////////////////////////////////////////////////////////
|
1657
|
-
// Create the remove
|
1671
|
+
// Create the remove boundaries transducer (.|<L>:<>|<R>:<>)*
|
1658
1672
|
/////////////////////////////////////////////////////////////
|
1659
1673
|
|
1660
1674
|
Transducer *rbt=pi_machine(TheAlphabet);
|
@@ -1667,7 +1681,7 @@ Transducer *replace_in_context( Transducer *t, Repl_Type type, Contexts *c,
|
|
1667
1681
|
TheAlphabet.insert(Label(rightm));
|
1668
1682
|
|
1669
1683
|
/////////////////////////////////////////////////////////////
|
1670
|
-
// Create the constrain
|
1684
|
+
// Create the constrain boundaries transducer !(.*<L><R>.*)
|
1671
1685
|
/////////////////////////////////////////////////////////////
|
1672
1686
|
|
1673
1687
|
Transducer *cbt=constrain_boundary_transducer(leftm, rightm);
|
@@ -1677,12 +1691,12 @@ Transducer *replace_in_context( Transducer *t, Repl_Type type, Contexts *c,
|
|
1677
1691
|
/////////////////////////////////////////////////////////////
|
1678
1692
|
|
1679
1693
|
// left context transducer: .* (<R> >> (<L> >> $T$)) || !(.*<L>)
|
1680
|
-
Transducer *lct =
|
1694
|
+
Transducer *lct = left_context(c->left, leftm, rightm);
|
1681
1695
|
|
1682
1696
|
// right context transducer: (<R> >> (<L> >> $T$)) .* || !(<R>.*)
|
1683
1697
|
Transducer *tmp = &c->right->reverse();
|
1684
1698
|
delete c->right;
|
1685
|
-
Transducer *t2 =
|
1699
|
+
Transducer *t2 = left_context(tmp, rightm, leftm);
|
1686
1700
|
Transducer *rct = &t2->reverse();
|
1687
1701
|
delete t2;
|
1688
1702
|
|
@@ -1700,23 +1714,30 @@ Transducer *replace_in_context( Transducer *t, Repl_Type type, Contexts *c,
|
|
1700
1714
|
// build the conditional replacement transducer
|
1701
1715
|
/////////////////////////////////////////////////////////////
|
1702
1716
|
|
1717
|
+
tmp = &(ibt->copy());
|
1718
|
+
tmp = &(cbt->copy());
|
1719
|
+
tmp = &(lct->copy());
|
1720
|
+
tmp = &(rct->copy());
|
1721
|
+
tmp = &(rt->copy());
|
1722
|
+
tmp = &(rbt->copy());
|
1723
|
+
|
1703
1724
|
tmp = ibt;
|
1704
1725
|
tmp = &(*ibt || *cbt);
|
1705
1726
|
delete(ibt);
|
1706
1727
|
delete(cbt);
|
1707
1728
|
|
1708
|
-
if (type == repl_up || type == repl_right) {
|
1709
|
-
t2 = &(*tmp || *rct);
|
1710
|
-
delete tmp;
|
1711
|
-
delete rct;
|
1712
|
-
tmp = t2;
|
1713
|
-
}
|
1714
1729
|
if (type == repl_up || type == repl_left) {
|
1715
1730
|
t2 = &(*tmp || *lct);
|
1716
1731
|
delete tmp;
|
1717
1732
|
delete lct;
|
1718
1733
|
tmp = t2;
|
1719
1734
|
}
|
1735
|
+
if (type == repl_up || type == repl_right) {
|
1736
|
+
t2 = &(*tmp || *rct);
|
1737
|
+
delete tmp;
|
1738
|
+
delete rct;
|
1739
|
+
tmp = t2;
|
1740
|
+
}
|
1720
1741
|
|
1721
1742
|
t2 = &(*tmp || *rt);
|
1722
1743
|
delete tmp;
|
@@ -1766,24 +1787,6 @@ void add_alphabet( Transducer *t )
|
|
1766
1787
|
}
|
1767
1788
|
|
1768
1789
|
|
1769
|
-
/*******************************************************************/
|
1770
|
-
/* */
|
1771
|
-
/* store_transducer */
|
1772
|
-
/* */
|
1773
|
-
/*******************************************************************/
|
1774
|
-
|
1775
|
-
void store_transducer( Transducer *t, char *filename )
|
1776
|
-
|
1777
|
-
{
|
1778
|
-
if (filename == NULL)
|
1779
|
-
cout << *t;
|
1780
|
-
else {
|
1781
|
-
ofstream os(filename);
|
1782
|
-
os << *t;
|
1783
|
-
os.close();
|
1784
|
-
}
|
1785
|
-
}
|
1786
|
-
|
1787
1790
|
/*******************************************************************/
|
1788
1791
|
/* */
|
1789
1792
|
/* write_to_file */
|
@@ -1798,6 +1801,7 @@ void write_to_file( Transducer *t, char *filename)
|
|
1798
1801
|
fprintf(stderr,"\nError: Cannot open output file \"%s\"\n\n", filename);
|
1799
1802
|
exit(1);
|
1800
1803
|
}
|
1804
|
+
free( filename );
|
1801
1805
|
|
1802
1806
|
t = explode(t);
|
1803
1807
|
add_alphabet(t);
|
@@ -56,7 +56,6 @@ Range *add_var_values( char *name, Range*);
|
|
56
56
|
Range *add_values( unsigned int, unsigned int, Range*);
|
57
57
|
Range *append_values( Range *r2, Range *r );
|
58
58
|
void add_alphabet( Transducer* );
|
59
|
-
void store_transducer( Transducer *a, char *filename );
|
60
59
|
|
61
60
|
// These functions delete their argument automata
|
62
61
|
|
@@ -366,7 +366,6 @@ static VALUE _regular_transducer_analyze_or_generate(Transducer *t, VALUE string
|
|
366
366
|
delete a3;
|
367
367
|
|
368
368
|
a2->alphabet.copy(t->alphabet);
|
369
|
-
t->incr_vmark();
|
370
369
|
bool accepted = _regular_transducer_yield(a2, a2->root_node(), rb_ary_new());
|
371
370
|
delete a2;
|
372
371
|
|
@@ -0,0 +1,44 @@
|
|
1
|
+
|
2
|
+
/*******************************************************************/
|
3
|
+
/* */
|
4
|
+
/* File: sgi.h */
|
5
|
+
/* Author: Helmut Schmid */
|
6
|
+
/* Purpose: */
|
7
|
+
/* Created: Thu Sep 11 15:58:25 2008 */
|
8
|
+
/* Modified: Fri Sep 12 08:17:03 2008 (schmid) */
|
9
|
+
/* */
|
10
|
+
/*******************************************************************/
|
11
|
+
|
12
|
+
#ifndef _SGI_INCLUDED
|
13
|
+
#define _SGI_INCLUDED
|
14
|
+
|
15
|
+
|
16
|
+
#ifdef SGIext
|
17
|
+
|
18
|
+
#include <ext/hash_map>
|
19
|
+
#include <ext/hash_set>
|
20
|
+
using std::hash_map;
|
21
|
+
using std::hash_set;
|
22
|
+
using std::hash;
|
23
|
+
|
24
|
+
#else
|
25
|
+
|
26
|
+
#ifdef SGI__gnu_cxx
|
27
|
+
|
28
|
+
#include <ext/hash_map>
|
29
|
+
#include <ext/hash_set>
|
30
|
+
|
31
|
+
#else
|
32
|
+
|
33
|
+
#include <backward/hash_map>
|
34
|
+
#include <backward/hash_set>
|
35
|
+
|
36
|
+
#endif
|
37
|
+
|
38
|
+
using __gnu_cxx::hash_map;
|
39
|
+
using __gnu_cxx::hash_set;
|
40
|
+
using __gnu_cxx::hash;
|
41
|
+
|
42
|
+
#endif
|
43
|
+
|
44
|
+
#endif
|