ruby-sfst 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +1 -0
- data/Manifest +1 -0
- data/README.rdoc +2 -0
- data/Rakefile +1 -1
- data/ext/sfst_machine/alphabet.C +18 -14
- data/ext/sfst_machine/alphabet.h +10 -19
- data/ext/sfst_machine/compact.C +2 -2
- data/ext/sfst_machine/determinise.C +0 -1
- data/ext/sfst_machine/extconf.rb +1 -1
- data/ext/sfst_machine/fst-compiler.C +127 -4
- data/ext/sfst_machine/fst-compiler.h +2 -2
- data/ext/sfst_machine/fst-compiler.yy +3 -3
- data/ext/sfst_machine/fst.C +54 -20
- data/ext/sfst_machine/fst.h +15 -11
- data/ext/sfst_machine/interface.C +62 -58
- data/ext/sfst_machine/interface.h +0 -1
- data/ext/sfst_machine/make-compact.C +0 -1
- data/ext/sfst_machine/sfst_machine.cc +0 -1
- data/ext/sfst_machine/sgi.h +44 -0
- data/ext/sfst_machine/utf8-scanner.C +73 -91
- data/ext/sfst_machine/utf8-scanner.ll +24 -28
- data/ruby-sfst.gemspec +8 -6
- metadata +5 -4
data/CHANGELOG
CHANGED
data/Manifest
CHANGED
data/README.rdoc
CHANGED
data/Rakefile
CHANGED
@@ -4,7 +4,7 @@ require 'rake'
|
|
4
4
|
begin
|
5
5
|
require 'echoe'
|
6
6
|
|
7
|
-
Echoe.new('ruby-sfst', '0.1.1') do |p|
|
7
|
+
Echoe.new('ruby-sfst', '0.2.0') do |p|
|
8
8
|
p.summary = "A wrapper for the Stuttgart Finite State Transducer Tools (SFST)."
|
9
9
|
p.author = 'Marius L. Jøhndal'
|
10
10
|
p.email = "mariuslj (at) ifi [dot] uio (dot) no"
|
data/ext/sfst_machine/alphabet.C
CHANGED
@@ -10,14 +10,14 @@
|
|
10
10
|
/* */
|
11
11
|
/*******************************************************************/
|
12
12
|
|
13
|
-
#include <
|
13
|
+
#include <climits>
|
14
|
+
#include <cstring>
|
15
|
+
|
14
16
|
#include "utf8.h"
|
15
17
|
#include "alphabet.h"
|
16
18
|
|
17
19
|
using std::vector;
|
18
20
|
using std::ostream;
|
19
|
-
using __gnu_cxx::hash_map;
|
20
|
-
using __gnu_cxx::hash_set;
|
21
21
|
|
22
22
|
const int BUFFER_SIZE=100000;
|
23
23
|
|
@@ -425,13 +425,13 @@ ostream &operator<<( ostream &s, const Alphabet &a )
|
|
425
425
|
/* Alphabet::next_mcsym */
|
426
426
|
/* */
|
427
427
|
/* recognizes multi-character symbols which are enclosed with */
|
428
|
-
/* angle brackets <...>. If the
|
429
|
-
/*
|
430
|
-
/*
|
428
|
+
/* angle brackets <...>. If the argument flag insert is true, */
|
429
|
+
/* the multi-character symbol must be already in the lexicon in */
|
430
|
+
/* order to be recognized. */
|
431
431
|
/* */
|
432
432
|
/*******************************************************************/
|
433
433
|
|
434
|
-
int Alphabet::next_mcsym( char* &string, int extended )
|
434
|
+
int Alphabet::next_mcsym( char* &string, bool insert )
|
435
435
|
|
436
436
|
{
|
437
437
|
char *start=string;
|
@@ -446,7 +446,7 @@ int Alphabet::next_mcsym( char* &string, int extended )
|
|
446
446
|
*end = 0;
|
447
447
|
|
448
448
|
int c;
|
449
|
-
if (extended)
|
449
|
+
if (insert)
|
450
450
|
c = add_symbol( start );
|
451
451
|
else
|
452
452
|
c = symbol2code(start);
|
@@ -473,13 +473,13 @@ int Alphabet::next_mcsym( char* &string, int extended )
|
|
473
473
|
/* */
|
474
474
|
/*******************************************************************/
|
475
475
|
|
476
|
-
int Alphabet::next_code( char* &string, int extended )
|
476
|
+
int Alphabet::next_code( char* &string, bool extended, bool insert )
|
477
477
|
|
478
478
|
{
|
479
479
|
if (*string == 0)
|
480
480
|
return EOF; // finished
|
481
481
|
|
482
|
-
int c = next_mcsym(string, extended);
|
482
|
+
int c = next_mcsym(string, insert);
|
483
483
|
if (c != EOF)
|
484
484
|
return c;
|
485
485
|
|
@@ -506,7 +506,7 @@ int Alphabet::next_code( char* &string, int extended )
|
|
506
506
|
/* */
|
507
507
|
/*******************************************************************/
|
508
508
|
|
509
|
-
Label Alphabet::next_label( char* &string, int extended )
|
509
|
+
Label Alphabet::next_label( char* &string, bool extended )
|
510
510
|
|
511
511
|
{
|
512
512
|
// read first character
|
@@ -517,7 +517,7 @@ Label Alphabet::next_label( char* &string, int extended )
|
|
517
517
|
Character lc=(Character)c;
|
518
518
|
if (!extended || *string != ':') { // single character?
|
519
519
|
if (lc == Label::epsilon)
|
520
|
-
return next_label(string); // ignore epsilon
|
520
|
+
return next_label(string, extended); // ignore epsilon
|
521
521
|
return Label(lc);
|
522
522
|
}
|
523
523
|
|
@@ -532,7 +532,7 @@ Label Alphabet::next_label( char* &string, int extended )
|
|
532
532
|
|
533
533
|
Label l(lc, (Character)c);
|
534
534
|
if (l.is_epsilon())
|
535
|
-
return next_label(string); // ignore epsilon transitions
|
535
|
+
return next_label(string, extended); // ignore epsilon transitions
|
536
536
|
return l;
|
537
537
|
}
|
538
538
|
|
@@ -782,8 +782,12 @@ char *Alphabet::print_analysis( Analysis &ana, bool both_layers )
|
|
782
782
|
const char *s;
|
783
783
|
|
784
784
|
// either print the analysis symbol or the whole label
|
785
|
-
if (both_layers)
|
785
|
+
if (both_layers) {
|
786
786
|
s = write_label(l);
|
787
|
+
// quote colons
|
788
|
+
if (strcmp(s,":") == 0)
|
789
|
+
ch.push_back('\\');
|
790
|
+
}
|
787
791
|
else if (l.lower_char() != Label::epsilon)
|
788
792
|
s = write_char(l.lower_char());
|
789
793
|
else
|
data/ext/sfst_machine/alphabet.h
CHANGED
@@ -13,7 +13,6 @@
|
|
13
13
|
#define _ALPHABET_H_
|
14
14
|
|
15
15
|
#include <stdio.h>
|
16
|
-
#include <string.h>
|
17
16
|
|
18
17
|
#include "basic.h"
|
19
18
|
|
@@ -22,6 +21,10 @@
|
|
22
21
|
|
23
22
|
#include <iostream>
|
24
23
|
|
24
|
+
#include <cstring>
|
25
|
+
|
26
|
+
#include "sgi.h"
|
27
|
+
|
25
28
|
#ifndef CODE_DATA_TYPE
|
26
29
|
typedef unsigned short Character; // data type of the symbol codes
|
27
30
|
#else
|
@@ -32,18 +35,6 @@ typedef unsigned CODE_DATA_TYPE Character;
|
|
32
35
|
// on the analysis level (lower) or the surface level (upper)
|
33
36
|
typedef enum {upper, lower} Level;
|
34
37
|
|
35
|
-
#ifdef SGIext
|
36
|
-
|
37
|
-
#include <ext/hash_set>
|
38
|
-
#include <ext/hash_map>
|
39
|
-
|
40
|
-
#else
|
41
|
-
|
42
|
-
#include <hash_set>
|
43
|
-
#include <hash_map>
|
44
|
-
|
45
|
-
#endif
|
46
|
-
|
47
38
|
extern char EpsilonString[]; // holds the symbol representing the empty string
|
48
39
|
// which is usually "<>"
|
49
40
|
|
@@ -157,10 +148,10 @@ class Alphabet {
|
|
157
148
|
typedef std::set<Label, Label::label_cmp> LabelSet;
|
158
149
|
|
159
150
|
// hash table used to map the symbols to their codes
|
160
|
-
typedef
|
151
|
+
typedef hash_map<const char*, Character, hash<const char*>,eqstr> SymbolMap;
|
161
152
|
|
162
153
|
// hash table used to map the codes back to the symbols
|
163
|
-
typedef
|
154
|
+
typedef hash_map<Character, char*> CharMap;
|
164
155
|
|
165
156
|
private:
|
166
157
|
SymbolMap sm; // maps symbols to codes
|
@@ -249,17 +240,17 @@ class Alphabet {
|
|
249
240
|
const char *write_label( Label l, bool with_brackets=true ) const;
|
250
241
|
|
251
242
|
// scan the next multi-character symbol in the argument string
|
252
|
-
int next_mcsym( char*&,
|
243
|
+
int next_mcsym( char*&, bool insert=true );
|
253
244
|
|
254
245
|
// scan the next symbol in the argument string
|
255
|
-
int next_code( char*&,
|
246
|
+
int next_code( char*&, bool extended=true, bool insert=true );
|
256
247
|
|
257
248
|
// convert a character string into a symbol or label sequence
|
258
249
|
void string2symseq( char*, std::vector<Character>& );
|
259
250
|
void string2labelseq( char*, std::vector<Label>& );
|
260
251
|
|
261
252
|
// scan the next label in the argument string
|
262
|
-
Label next_label( char*&,
|
253
|
+
Label next_label( char*&, bool extended=true );
|
263
254
|
|
264
255
|
// store the alphabet in the argument file (in binary form)
|
265
256
|
void store( FILE* ) const;
|
@@ -276,7 +267,7 @@ class Alphabet {
|
|
276
267
|
};
|
277
268
|
|
278
269
|
// write the alphabet to the output stream (in readable form)
|
279
|
-
std::ostream &operator<<(std::ostream&, Alphabet&);
|
270
|
+
std::ostream &operator<<(std::ostream&, const Alphabet&);
|
280
271
|
|
281
272
|
|
282
273
|
#endif
|
data/ext/sfst_machine/compact.C
CHANGED
@@ -383,7 +383,7 @@ void CompactTransducer::longest_match2(unsigned int n, char *string, int l,
|
|
383
383
|
|
384
384
|
// follow the non-epsilon transitions
|
385
385
|
char *end=string;
|
386
|
-
int c=alphabet.next_code(end, false);
|
386
|
+
int c=alphabet.next_code(end, false, false);
|
387
387
|
l += end-string;
|
388
388
|
if (c != EOF) {
|
389
389
|
// find the set of arcs with matching upper character in the sort list
|
@@ -430,7 +430,7 @@ const char *CompactTransducer::longest_match( char* &string )
|
|
430
430
|
|
431
431
|
// no match? return the next character
|
432
432
|
if (ba.size() == 0) {
|
433
|
-
int c=alphabet.next_code(string, false);
|
433
|
+
int c=alphabet.next_code(string, false, false);
|
434
434
|
return alphabet.code2symbol(c);
|
435
435
|
}
|
436
436
|
|
data/ext/sfst_machine/extconf.rb
CHANGED
@@ -72,7 +72,7 @@
|
|
72
72
|
PRINT = 261,
|
73
73
|
POS = 262,
|
74
74
|
INSERT = 263,
|
75
|
-
|
75
|
+
SWITCH = 264,
|
76
76
|
ARROW = 265,
|
77
77
|
REPLACE = 266,
|
78
78
|
SYMBOL = 267,
|
@@ -94,7 +94,7 @@
|
|
94
94
|
#define PRINT 261
|
95
95
|
#define POS 262
|
96
96
|
#define INSERT 263
|
97
|
-
#define
|
97
|
+
#define SWITCH 264
|
98
98
|
#define ARROW 265
|
99
99
|
#define REPLACE 266
|
100
100
|
#define SYMBOL 267
|
@@ -532,7 +532,7 @@ static const yytype_uint8 yyrline[] =
|
|
532
532
|
static const char *const yytname[] =
|
533
533
|
{
|
534
534
|
"$end", "error", "$undefined", "NEWLINE", "ALPHA", "COMPOSE", "PRINT",
|
535
|
-
"POS", "INSERT", "
|
535
|
+
"POS", "INSERT", "SWITCH", "ARROW", "REPLACE", "SYMBOL", "VAR", "SVAR",
|
536
536
|
"RVAR", "RSVAR", "STRING", "STRING2", "UTF8CHAR", "CHARACTER", "'|'",
|
537
537
|
"'-'", "'&'", "SEQ", "'!'", "'^'", "'_'", "'*'", "'+'", "'='", "'?'",
|
538
538
|
"'('", "')'", "'{'", "'}'", "':'", "'['", "']'", "'.'", "','", "$accept",
|
@@ -2020,7 +2020,7 @@ yyreduce:
|
|
2020
2020
|
|
2021
2021
|
case 71:
|
2022
2022
|
#line 165 "fst-compiler.yy"
|
2023
|
-
{ (yyval.longchar)=utf8toint((yyvsp[(1) - (1)].value)); ;}
|
2023
|
+
{ (yyval.longchar)=utf8toint((yyvsp[(1) - (1)].value)); free((yyvsp[(1) - (1)].value)); ;}
|
2024
2024
|
break;
|
2025
2025
|
|
2026
2026
|
case 72:
|
@@ -2358,6 +2358,8 @@ yyreturn:
|
|
2358
2358
|
|
2359
2359
|
|
2360
2360
|
extern FILE *yyin;
|
2361
|
+
static int Compact=0;
|
2362
|
+
static int LowMem=0;
|
2361
2363
|
|
2362
2364
|
/*******************************************************************/
|
2363
2365
|
/* */
|
@@ -2373,3 +2375,124 @@ void yyerror(char *text)
|
|
2373
2375
|
exit(1);
|
2374
2376
|
}
|
2375
2377
|
|
2378
|
+
|
2379
|
+
/*******************************************************************/
|
2380
|
+
/* */
|
2381
|
+
/* warn */
|
2382
|
+
/* */
|
2383
|
+
/*******************************************************************/
|
2384
|
+
|
2385
|
+
void warn(char *text)
|
2386
|
+
|
2387
|
+
{
|
2388
|
+
cerr << "\n" << FileName << ":" << yylineno << ": warning: " << text << "!\n";
|
2389
|
+
}
|
2390
|
+
|
2391
|
+
|
2392
|
+
/*******************************************************************/
|
2393
|
+
/* */
|
2394
|
+
/* warn2 */
|
2395
|
+
/* */
|
2396
|
+
/*******************************************************************/
|
2397
|
+
|
2398
|
+
void warn2(char *text, char *text2)
|
2399
|
+
|
2400
|
+
{
|
2401
|
+
cerr << "\n" << FileName << ":" << yylineno << ": warning: " << text << ": ";
|
2402
|
+
cerr << text2 << "\n";
|
2403
|
+
}
|
2404
|
+
|
2405
|
+
|
2406
|
+
/*******************************************************************/
|
2407
|
+
/* */
|
2408
|
+
/* get_flags */
|
2409
|
+
/* */
|
2410
|
+
/*******************************************************************/
|
2411
|
+
|
2412
|
+
void get_flags( int *argc, char **argv )
|
2413
|
+
|
2414
|
+
{
|
2415
|
+
for( int i=1; i<*argc; i++ ) {
|
2416
|
+
if (strcmp(argv[i],"-c") == 0) {
|
2417
|
+
Compact = 1;
|
2418
|
+
argv[i] = NULL;
|
2419
|
+
}
|
2420
|
+
else if (strcmp(argv[i],"-l") == 0) {
|
2421
|
+
LowMem = 1;
|
2422
|
+
argv[i] = NULL;
|
2423
|
+
}
|
2424
|
+
else if (strcmp(argv[i],"-q") == 0) {
|
2425
|
+
Verbose = 0;
|
2426
|
+
argv[i] = NULL;
|
2427
|
+
}
|
2428
|
+
else if (strcmp(argv[i],"-s") == 0) {
|
2429
|
+
Switch = 1;
|
2430
|
+
argv[i] = NULL;
|
2431
|
+
}
|
2432
|
+
}
|
2433
|
+
// remove flags from the argument list
|
2434
|
+
int k;
|
2435
|
+
for( int i=k=1; i<*argc; i++)
|
2436
|
+
if (argv[i] != NULL)
|
2437
|
+
argv[k++] = argv[i];
|
2438
|
+
*argc = k;
|
2439
|
+
}
|
2440
|
+
|
2441
|
+
|
2442
|
+
/*******************************************************************/
|
2443
|
+
/* */
|
2444
|
+
/* main */
|
2445
|
+
/* */
|
2446
|
+
/*******************************************************************/
|
2447
|
+
|
2448
|
+
int main( int argc, char *argv[] )
|
2449
|
+
|
2450
|
+
{
|
2451
|
+
FILE *file;
|
2452
|
+
|
2453
|
+
get_flags(&argc, argv);
|
2454
|
+
if (argc < 3) {
|
2455
|
+
fprintf(stderr,"\nUsage: %s [options] infile outfile\n", argv[0]);
|
2456
|
+
fprintf(stderr,"\nOPTIONS:\n");
|
2457
|
+
fprintf(stderr,"-c\tStore the transducer in fst-infl2 format.\n");
|
2458
|
+
fprintf(stderr,"-l\tStore the transducer in fst-infl3 format.\n");
|
2459
|
+
fprintf(stderr,"-s\tSwitch the upper and lower levels producing a transducer for generation rather than recognition.\n");
|
2460
|
+
fprintf(stderr,"-q\tquiet mode\n\n");
|
2461
|
+
exit(1);
|
2462
|
+
}
|
2463
|
+
if ((file = fopen(argv[1],"rt")) == NULL) {
|
2464
|
+
fprintf(stderr,"\nError: Cannot open grammar file \"%s\"\n\n", argv[1]);
|
2465
|
+
exit(1);
|
2466
|
+
}
|
2467
|
+
FileName = argv[1];
|
2468
|
+
Result = NULL;
|
2469
|
+
TheAlphabet.utf8 = UTF8;
|
2470
|
+
yyin = file;
|
2471
|
+
try {
|
2472
|
+
yyparse();
|
2473
|
+
Result->alphabet.utf8 = UTF8;
|
2474
|
+
if (Verbose)
|
2475
|
+
cerr << "\n";
|
2476
|
+
if (Result->is_empty())
|
2477
|
+
warn("resulting transducer is empty");
|
2478
|
+
if ((file = fopen(argv[2],"wb")) == NULL) {
|
2479
|
+
fprintf(stderr,"\nError: Cannot open output file %s\n\n", argv[2]);
|
2480
|
+
exit(1);
|
2481
|
+
}
|
2482
|
+
if (Compact) {
|
2483
|
+
MakeCompactTransducer ca(*Result);
|
2484
|
+
delete Result;
|
2485
|
+
ca.store(file);
|
2486
|
+
}
|
2487
|
+
else if (LowMem)
|
2488
|
+
Result->store_lowmem(file);
|
2489
|
+
else
|
2490
|
+
Result->store(file);
|
2491
|
+
fclose(file);
|
2492
|
+
}
|
2493
|
+
catch(const char* p) {
|
2494
|
+
cerr << "\n" << p << "\n\n";
|
2495
|
+
exit(1);
|
2496
|
+
}
|
2497
|
+
}
|
2498
|
+
|
@@ -45,7 +45,7 @@
|
|
45
45
|
PRINT = 261,
|
46
46
|
POS = 262,
|
47
47
|
INSERT = 263,
|
48
|
-
|
48
|
+
SWITCH = 264,
|
49
49
|
ARROW = 265,
|
50
50
|
REPLACE = 266,
|
51
51
|
SYMBOL = 267,
|
@@ -67,7 +67,7 @@
|
|
67
67
|
#define PRINT 261
|
68
68
|
#define POS 262
|
69
69
|
#define INSERT 263
|
70
|
-
#define
|
70
|
+
#define SWITCH 264
|
71
71
|
#define ARROW 265
|
72
72
|
#define REPLACE 266
|
73
73
|
#define SYMBOL 267
|
@@ -43,7 +43,7 @@ Transducer *Result;
|
|
43
43
|
Contexts *contexts;
|
44
44
|
}
|
45
45
|
|
46
|
-
%token <number> NEWLINE ALPHA COMPOSE PRINT POS INSERT
|
46
|
+
%token <number> NEWLINE ALPHA COMPOSE PRINT POS INSERT SWITCH
|
47
47
|
%token <type> ARROW
|
48
48
|
%token <rtype> REPLACE
|
49
49
|
%token <name> SYMBOL VAR SVAR RVAR RSVAR
|
@@ -111,7 +111,7 @@ RE: RE ARROW CONTEXTS2 { $$ = restriction($1,$2,$3,0); }
|
|
111
111
|
| RE '?' { $$ = optional($1); }
|
112
112
|
| RE RE %prec SEQ { $$ = catenate($1, $2); }
|
113
113
|
| '!' RE { $$ = negation($2); }
|
114
|
-
|
|
114
|
+
| SWITCH RE { $$ = switch_levels($2); }
|
115
115
|
| '^' RE { $$ = upper_level($2); }
|
116
116
|
| '_' RE { $$ = lower_level($2); }
|
117
117
|
| RE '&' RE { $$ = conjunction($1, $3); }
|
@@ -162,7 +162,7 @@ VALUE: LCHAR '-' LCHAR { $$=add_values($1,$3,NULL); }
|
|
162
162
|
;
|
163
163
|
|
164
164
|
LCHAR: CHARACTER { $$=$1; }
|
165
|
-
| UTF8CHAR { $$=utf8toint($1); }
|
165
|
+
| UTF8CHAR { $$=utf8toint($1); free($1); }
|
166
166
|
| SCHAR { $$=$1; }
|
167
167
|
;
|
168
168
|
|
data/ext/sfst_machine/fst.C
CHANGED
@@ -124,6 +124,27 @@ void Node::init()
|
|
124
124
|
}
|
125
125
|
|
126
126
|
|
127
|
+
/*******************************************************************/
|
128
|
+
/* */
|
129
|
+
/* Node::clear_visited */
|
130
|
+
/* */
|
131
|
+
/*******************************************************************/
|
132
|
+
|
133
|
+
void Node::clear_visited( NodeHashSet &nodeset )
|
134
|
+
|
135
|
+
{
|
136
|
+
if (nodeset.find( this ) == nodeset.end()) {
|
137
|
+
visited = 0;
|
138
|
+
nodeset.insert( this );
|
139
|
+
fprintf(stderr," %lu", nodeset.size());
|
140
|
+
for( ArcsIter p(arcs()); p; p++ ) {
|
141
|
+
Arc *arc=p;
|
142
|
+
arc->target_node()->clear_visited( nodeset );
|
143
|
+
}
|
144
|
+
}
|
145
|
+
}
|
146
|
+
|
147
|
+
|
127
148
|
/*******************************************************************/
|
128
149
|
/* */
|
129
150
|
/* NodeNumbering::number_node */
|
@@ -196,14 +217,16 @@ Arc *Transducer::new_arc( Label l, Node *target )
|
|
196
217
|
/* */
|
197
218
|
/*******************************************************************/
|
198
219
|
|
199
|
-
void Transducer::add_string( char *s, bool extended )
|
220
|
+
void Transducer::add_string( char *s, bool extended, Alphabet *a )
|
200
221
|
|
201
222
|
{
|
223
|
+
if (a == NULL)
|
224
|
+
a = &alphabet;
|
225
|
+
|
202
226
|
Node *node=root_node();
|
203
|
-
|
204
227
|
Label l;
|
205
|
-
while (!(l =
|
206
|
-
|
228
|
+
while (!(l = a->next_label(s, extended)).is_epsilon()) {
|
229
|
+
a->insert(l);
|
207
230
|
Arcs *arcs=node->arcs();
|
208
231
|
node = arcs->target_node( l );
|
209
232
|
if (node == NULL) {
|
@@ -271,7 +294,7 @@ Transducer::Transducer( istream &is, const Alphabet *a, bool verbose )
|
|
271
294
|
break;
|
272
295
|
buffer[l+1] = 0;
|
273
296
|
|
274
|
-
add_string(buffer,extended);
|
297
|
+
add_string(buffer, extended);
|
275
298
|
}
|
276
299
|
if (verbose && n >= 10000)
|
277
300
|
cerr << "\n";
|
@@ -515,7 +538,13 @@ int Transducer::print_strings( FILE *file, bool with_brackets )
|
|
515
538
|
bool Transducer::analyze_string( char *string, FILE *file, bool with_brackets )
|
516
539
|
|
517
540
|
{
|
518
|
-
|
541
|
+
vector<Character> input;
|
542
|
+
alphabet.string2symseq( string, input );
|
543
|
+
vector<Label> labels;
|
544
|
+
for( size_t i=0; i<input.size(); i++ )
|
545
|
+
labels.push_back(Label(input[i]));
|
546
|
+
|
547
|
+
Transducer a1(labels);
|
519
548
|
Transducer *a2=&(*this || a1);
|
520
549
|
Transducer *a3=&(a2->lower_level());
|
521
550
|
delete a2;
|
@@ -598,14 +627,15 @@ static void print_node( ostream &s, Node *node, NodeNumbering &index,
|
|
598
627
|
{
|
599
628
|
if (!node->was_visited( vmark )) {
|
600
629
|
Arcs *arcs=node->arcs();
|
601
|
-
if (node->is_final())
|
602
|
-
s << "final\t" << index[node] << "\n";
|
603
630
|
for( ArcsIter p(arcs); p; p++ ) {
|
604
631
|
Arc *arc=p;
|
605
|
-
s << index[node] << "\t";
|
606
|
-
s << abc.
|
607
|
-
s <<
|
632
|
+
s << index[node] << "\t" << index[arc->target_node()];
|
633
|
+
s << "\t" << abc.write_char(arc->label().lower_char());
|
634
|
+
s << "\t" << abc.write_char(arc->label().upper_char());
|
635
|
+
s << "\n";
|
608
636
|
}
|
637
|
+
if (node->is_final())
|
638
|
+
s << index[node] << "\n";
|
609
639
|
for( ArcsIter p(arcs); p; p++ ) {
|
610
640
|
Arc *arc=p;
|
611
641
|
print_node( s, arc->target_node(), index, vmark, abc );
|
@@ -928,18 +958,22 @@ void Transducer::read_transducer_text( FILE *file )
|
|
928
958
|
for( size_t line=0; fgets(buffer, 10000, file ); line++ ) {
|
929
959
|
char *p = buffer;
|
930
960
|
char *s = next_string(p, line);
|
931
|
-
|
932
|
-
|
933
|
-
|
934
|
-
}
|
961
|
+
Node *node = create_node( nodes, s, line );
|
962
|
+
if (p == NULL)
|
963
|
+
node->set_final(true);
|
935
964
|
else {
|
936
|
-
Node *node = create_node( nodes, s, line );
|
937
|
-
s = next_string(p, line);
|
938
|
-
Label l = alphabet.next_label( s, 2 );
|
939
|
-
if (*s != 0 || l == Label::epsilon)
|
940
|
-
error_message( line );
|
941
965
|
s = next_string(p, line);
|
942
966
|
Node *target = create_node( nodes, s, line );
|
967
|
+
|
968
|
+
s = next_string(p, line);
|
969
|
+
Character lc = alphabet.add_symbol(s);
|
970
|
+
s = next_string(p, line);
|
971
|
+
Character uc = alphabet.add_symbol(s);
|
972
|
+
Label l(lc,uc);
|
973
|
+
if (l == Label::epsilon)
|
974
|
+
error_message( line );
|
975
|
+
|
976
|
+
alphabet.insert(l);
|
943
977
|
node->add_arc( l, target, this );
|
944
978
|
}
|
945
979
|
}
|
data/ext/sfst_machine/fst.h
CHANGED
@@ -25,7 +25,7 @@
|
|
25
25
|
|
26
26
|
#include "mem.h"
|
27
27
|
|
28
|
-
typedef unsigned
|
28
|
+
typedef unsigned short VType;
|
29
29
|
|
30
30
|
extern int Quiet;
|
31
31
|
|
@@ -41,8 +41,7 @@ struct hashf {
|
|
41
41
|
struct equalf {
|
42
42
|
int operator()(const Node *n1, const Node *n2) const { return n1==n2; }
|
43
43
|
};
|
44
|
-
typedef
|
45
|
-
|
44
|
+
typedef hash_set<Node*, hashf, equalf> NodeHashSet;
|
46
45
|
|
47
46
|
|
48
47
|
/***************** class Arc *************************************/
|
@@ -154,6 +153,7 @@ class Node {
|
|
154
153
|
Arcs *arcs( void ) { return &arcsp; };
|
155
154
|
const Arcs *arcs( void ) const { return &arcsp; };
|
156
155
|
Node *forward( void ) { return forwardp; };
|
156
|
+
void clear_visited( NodeHashSet &nodeset );
|
157
157
|
bool was_visited( VType vmark ) {
|
158
158
|
if (visited == vmark)
|
159
159
|
return true;
|
@@ -179,7 +179,7 @@ class Node2Int {
|
|
179
179
|
return (n1 == n2);
|
180
180
|
}
|
181
181
|
};
|
182
|
-
typedef
|
182
|
+
typedef hash_map<Node*, int, hashf, equalf> NL;
|
183
183
|
|
184
184
|
private:
|
185
185
|
int current_number;
|
@@ -231,7 +231,7 @@ class PairMapping {
|
|
231
231
|
return (p1.first==p2.first && p1.second == p2.second);
|
232
232
|
}
|
233
233
|
};
|
234
|
-
typedef
|
234
|
+
typedef hash_map<NodePair, Node*, hashf, equalf> PairMap;
|
235
235
|
PairMap pm;
|
236
236
|
|
237
237
|
public:
|
@@ -256,8 +256,16 @@ class Transducer {
|
|
256
256
|
Mem mem;
|
257
257
|
|
258
258
|
typedef std::set<Label, Label::label_cmp> LabelSet;
|
259
|
-
typedef
|
259
|
+
typedef hash_map<Character, char*> SymbolMap;
|
260
260
|
|
261
|
+
void incr_vmark( void ) {
|
262
|
+
if (++vmark == 0) {
|
263
|
+
NodeHashSet nodes;
|
264
|
+
root.clear_visited( nodes );
|
265
|
+
fprintf(stderr,"clearing flags\n");
|
266
|
+
vmark = 1;
|
267
|
+
}
|
268
|
+
};
|
261
269
|
void reverse_node( Node *old_node, Transducer *new_node );
|
262
270
|
Label recode_label( Label, bool lswitch, bool recode, Alphabet& );
|
263
271
|
Node *copy_nodes( Node *n, Transducer *a,
|
@@ -287,10 +295,6 @@ class Transducer {
|
|
287
295
|
|
288
296
|
public:
|
289
297
|
VType vmark;
|
290
|
-
void incr_vmark( void ) {
|
291
|
-
if (++vmark == 0)
|
292
|
-
throw "Overflow of generation counter!";
|
293
|
-
};
|
294
298
|
Alphabet alphabet; // The set of all labels, i.e. character pairs
|
295
299
|
|
296
300
|
Transducer( void ) : root(), mem()
|
@@ -308,7 +312,7 @@ class Transducer {
|
|
308
312
|
const Node *root_node( void ) const { return &root; }; // returns the root node
|
309
313
|
Node *new_node( void ); // memory alocation for a new node
|
310
314
|
Arc *new_arc( Label l, Node *target ); // memory alocation for a new arc
|
311
|
-
void add_string( char *s, bool extended=false );
|
315
|
+
void add_string( char *s, bool extended=false, Alphabet *a=NULL );
|
312
316
|
void complete_alphabet( void );
|
313
317
|
void minimise_alphabet( void );
|
314
318
|
void prune( void ); // remove unnecessary arcs
|