ruby-sfst 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +1 -0
- data/Manifest +1 -0
- data/README.rdoc +2 -0
- data/Rakefile +1 -1
- data/ext/sfst_machine/alphabet.C +18 -14
- data/ext/sfst_machine/alphabet.h +10 -19
- data/ext/sfst_machine/compact.C +2 -2
- data/ext/sfst_machine/determinise.C +0 -1
- data/ext/sfst_machine/extconf.rb +1 -1
- data/ext/sfst_machine/fst-compiler.C +127 -4
- data/ext/sfst_machine/fst-compiler.h +2 -2
- data/ext/sfst_machine/fst-compiler.yy +3 -3
- data/ext/sfst_machine/fst.C +54 -20
- data/ext/sfst_machine/fst.h +15 -11
- data/ext/sfst_machine/interface.C +62 -58
- data/ext/sfst_machine/interface.h +0 -1
- data/ext/sfst_machine/make-compact.C +0 -1
- data/ext/sfst_machine/sfst_machine.cc +0 -1
- data/ext/sfst_machine/sgi.h +44 -0
- data/ext/sfst_machine/utf8-scanner.C +73 -91
- data/ext/sfst_machine/utf8-scanner.ll +24 -28
- data/ruby-sfst.gemspec +8 -6
- metadata +5 -4
data/CHANGELOG
CHANGED
data/Manifest
CHANGED
data/README.rdoc
CHANGED
data/Rakefile
CHANGED
@@ -4,7 +4,7 @@ require 'rake'
|
|
4
4
|
begin
|
5
5
|
require 'echoe'
|
6
6
|
|
7
|
-
Echoe.new('ruby-sfst', '0.
|
7
|
+
Echoe.new('ruby-sfst', '0.2.0') do |p|
|
8
8
|
p.summary = "A wrapper for the Stuttgart Finite State Transducer Tools (SFST)."
|
9
9
|
p.author = 'Marius L. Jøhndal'
|
10
10
|
p.email = "mariuslj (at) ifi [dot] uio (dot) no"
|
data/ext/sfst_machine/alphabet.C
CHANGED
@@ -10,14 +10,14 @@
|
|
10
10
|
/* */
|
11
11
|
/*******************************************************************/
|
12
12
|
|
13
|
-
#include <
|
13
|
+
#include <climits>
|
14
|
+
#include <cstring>
|
15
|
+
|
14
16
|
#include "utf8.h"
|
15
17
|
#include "alphabet.h"
|
16
18
|
|
17
19
|
using std::vector;
|
18
20
|
using std::ostream;
|
19
|
-
using __gnu_cxx::hash_map;
|
20
|
-
using __gnu_cxx::hash_set;
|
21
21
|
|
22
22
|
const int BUFFER_SIZE=100000;
|
23
23
|
|
@@ -425,13 +425,13 @@ ostream &operator<<( ostream &s, const Alphabet &a )
|
|
425
425
|
/* Alphabet::next_mcsym */
|
426
426
|
/* */
|
427
427
|
/* recognizes multi-character symbols which are enclosed with */
|
428
|
-
/* angle brackets <...>. If the
|
429
|
-
/*
|
430
|
-
/*
|
428
|
+
/* angle brackets <...>. If the argument flag insert is true, */
|
429
|
+
/* the multi-character symbol must be already in the lexicon in */
|
430
|
+
/* order to be recognized. */
|
431
431
|
/* */
|
432
432
|
/*******************************************************************/
|
433
433
|
|
434
|
-
int Alphabet::next_mcsym( char* &string,
|
434
|
+
int Alphabet::next_mcsym( char* &string, bool insert )
|
435
435
|
|
436
436
|
{
|
437
437
|
char *start=string;
|
@@ -446,7 +446,7 @@ int Alphabet::next_mcsym( char* &string, int extended )
|
|
446
446
|
*end = 0;
|
447
447
|
|
448
448
|
int c;
|
449
|
-
if (
|
449
|
+
if (insert)
|
450
450
|
c = add_symbol( start );
|
451
451
|
else
|
452
452
|
c = symbol2code(start);
|
@@ -473,13 +473,13 @@ int Alphabet::next_mcsym( char* &string, int extended )
|
|
473
473
|
/* */
|
474
474
|
/*******************************************************************/
|
475
475
|
|
476
|
-
int Alphabet::next_code( char* &string,
|
476
|
+
int Alphabet::next_code( char* &string, bool extended, bool insert )
|
477
477
|
|
478
478
|
{
|
479
479
|
if (*string == 0)
|
480
480
|
return EOF; // finished
|
481
481
|
|
482
|
-
int c = next_mcsym(string,
|
482
|
+
int c = next_mcsym(string, insert);
|
483
483
|
if (c != EOF)
|
484
484
|
return c;
|
485
485
|
|
@@ -506,7 +506,7 @@ int Alphabet::next_code( char* &string, int extended )
|
|
506
506
|
/* */
|
507
507
|
/*******************************************************************/
|
508
508
|
|
509
|
-
Label Alphabet::next_label( char* &string,
|
509
|
+
Label Alphabet::next_label( char* &string, bool extended )
|
510
510
|
|
511
511
|
{
|
512
512
|
// read first character
|
@@ -517,7 +517,7 @@ Label Alphabet::next_label( char* &string, int extended )
|
|
517
517
|
Character lc=(Character)c;
|
518
518
|
if (!extended || *string != ':') { // single character?
|
519
519
|
if (lc == Label::epsilon)
|
520
|
-
return next_label(string); // ignore epsilon
|
520
|
+
return next_label(string, extended); // ignore epsilon
|
521
521
|
return Label(lc);
|
522
522
|
}
|
523
523
|
|
@@ -532,7 +532,7 @@ Label Alphabet::next_label( char* &string, int extended )
|
|
532
532
|
|
533
533
|
Label l(lc, (Character)c);
|
534
534
|
if (l.is_epsilon())
|
535
|
-
return next_label(string); // ignore epsilon transitions
|
535
|
+
return next_label(string, extended); // ignore epsilon transitions
|
536
536
|
return l;
|
537
537
|
}
|
538
538
|
|
@@ -782,8 +782,12 @@ char *Alphabet::print_analysis( Analysis &ana, bool both_layers )
|
|
782
782
|
const char *s;
|
783
783
|
|
784
784
|
// either print the analysis symbol or the whole label
|
785
|
-
if (both_layers)
|
785
|
+
if (both_layers) {
|
786
786
|
s = write_label(l);
|
787
|
+
// quote colons
|
788
|
+
if (strcmp(s,":") == 0)
|
789
|
+
ch.push_back('\\');
|
790
|
+
}
|
787
791
|
else if (l.lower_char() != Label::epsilon)
|
788
792
|
s = write_char(l.lower_char());
|
789
793
|
else
|
data/ext/sfst_machine/alphabet.h
CHANGED
@@ -13,7 +13,6 @@
|
|
13
13
|
#define _ALPHABET_H_
|
14
14
|
|
15
15
|
#include <stdio.h>
|
16
|
-
#include <string.h>
|
17
16
|
|
18
17
|
#include "basic.h"
|
19
18
|
|
@@ -22,6 +21,10 @@
|
|
22
21
|
|
23
22
|
#include <iostream>
|
24
23
|
|
24
|
+
#include <cstring>
|
25
|
+
|
26
|
+
#include "sgi.h"
|
27
|
+
|
25
28
|
#ifndef CODE_DATA_TYPE
|
26
29
|
typedef unsigned short Character; // data type of the symbol codes
|
27
30
|
#else
|
@@ -32,18 +35,6 @@ typedef unsigned CODE_DATA_TYPE Character;
|
|
32
35
|
// on the analysis level (lower) or the surface level (upper)
|
33
36
|
typedef enum {upper, lower} Level;
|
34
37
|
|
35
|
-
#ifdef SGIext
|
36
|
-
|
37
|
-
#include <ext/hash_set>
|
38
|
-
#include <ext/hash_map>
|
39
|
-
|
40
|
-
#else
|
41
|
-
|
42
|
-
#include <hash_set>
|
43
|
-
#include <hash_map>
|
44
|
-
|
45
|
-
#endif
|
46
|
-
|
47
38
|
extern char EpsilonString[]; // holds the symbol representing the empty string
|
48
39
|
// which is usually "<>"
|
49
40
|
|
@@ -157,10 +148,10 @@ class Alphabet {
|
|
157
148
|
typedef std::set<Label, Label::label_cmp> LabelSet;
|
158
149
|
|
159
150
|
// hash table used to map the symbols to their codes
|
160
|
-
typedef
|
151
|
+
typedef hash_map<const char*, Character, hash<const char*>,eqstr> SymbolMap;
|
161
152
|
|
162
153
|
// hash table used to map the codes back to the symbols
|
163
|
-
typedef
|
154
|
+
typedef hash_map<Character, char*> CharMap;
|
164
155
|
|
165
156
|
private:
|
166
157
|
SymbolMap sm; // maps symbols to codes
|
@@ -249,17 +240,17 @@ class Alphabet {
|
|
249
240
|
const char *write_label( Label l, bool with_brackets=true ) const;
|
250
241
|
|
251
242
|
// scan the next multi-character symbol in the argument string
|
252
|
-
int next_mcsym( char*&,
|
243
|
+
int next_mcsym( char*&, bool insert=true );
|
253
244
|
|
254
245
|
// scan the next symbol in the argument string
|
255
|
-
int next_code( char*&,
|
246
|
+
int next_code( char*&, bool extended=true, bool insert=true );
|
256
247
|
|
257
248
|
// convert a character string into a symbol or label sequence
|
258
249
|
void string2symseq( char*, std::vector<Character>& );
|
259
250
|
void string2labelseq( char*, std::vector<Label>& );
|
260
251
|
|
261
252
|
// scan the next label in the argument string
|
262
|
-
Label next_label( char*&,
|
253
|
+
Label next_label( char*&, bool extended=true );
|
263
254
|
|
264
255
|
// store the alphabet in the argument file (in binary form)
|
265
256
|
void store( FILE* ) const;
|
@@ -276,7 +267,7 @@ class Alphabet {
|
|
276
267
|
};
|
277
268
|
|
278
269
|
// write the alphabet to the output stream (in readable form)
|
279
|
-
std::ostream &operator<<(std::ostream&, Alphabet&);
|
270
|
+
std::ostream &operator<<(std::ostream&, const Alphabet&);
|
280
271
|
|
281
272
|
|
282
273
|
#endif
|
data/ext/sfst_machine/compact.C
CHANGED
@@ -383,7 +383,7 @@ void CompactTransducer::longest_match2(unsigned int n, char *string, int l,
|
|
383
383
|
|
384
384
|
// follow the non-epsilon transitions
|
385
385
|
char *end=string;
|
386
|
-
int c=alphabet.next_code(end, false);
|
386
|
+
int c=alphabet.next_code(end, false, false);
|
387
387
|
l += end-string;
|
388
388
|
if (c != EOF) {
|
389
389
|
// find the set of arcs with matching upper character in the sort list
|
@@ -430,7 +430,7 @@ const char *CompactTransducer::longest_match( char* &string )
|
|
430
430
|
|
431
431
|
// no match? return the next character
|
432
432
|
if (ba.size() == 0) {
|
433
|
-
int c=alphabet.next_code(string, false);
|
433
|
+
int c=alphabet.next_code(string, false, false);
|
434
434
|
return alphabet.code2symbol(c);
|
435
435
|
}
|
436
436
|
|
data/ext/sfst_machine/extconf.rb
CHANGED
@@ -72,7 +72,7 @@
|
|
72
72
|
PRINT = 261,
|
73
73
|
POS = 262,
|
74
74
|
INSERT = 263,
|
75
|
-
|
75
|
+
SWITCH = 264,
|
76
76
|
ARROW = 265,
|
77
77
|
REPLACE = 266,
|
78
78
|
SYMBOL = 267,
|
@@ -94,7 +94,7 @@
|
|
94
94
|
#define PRINT 261
|
95
95
|
#define POS 262
|
96
96
|
#define INSERT 263
|
97
|
-
#define
|
97
|
+
#define SWITCH 264
|
98
98
|
#define ARROW 265
|
99
99
|
#define REPLACE 266
|
100
100
|
#define SYMBOL 267
|
@@ -532,7 +532,7 @@ static const yytype_uint8 yyrline[] =
|
|
532
532
|
static const char *const yytname[] =
|
533
533
|
{
|
534
534
|
"$end", "error", "$undefined", "NEWLINE", "ALPHA", "COMPOSE", "PRINT",
|
535
|
-
"POS", "INSERT", "
|
535
|
+
"POS", "INSERT", "SWITCH", "ARROW", "REPLACE", "SYMBOL", "VAR", "SVAR",
|
536
536
|
"RVAR", "RSVAR", "STRING", "STRING2", "UTF8CHAR", "CHARACTER", "'|'",
|
537
537
|
"'-'", "'&'", "SEQ", "'!'", "'^'", "'_'", "'*'", "'+'", "'='", "'?'",
|
538
538
|
"'('", "')'", "'{'", "'}'", "':'", "'['", "']'", "'.'", "','", "$accept",
|
@@ -2020,7 +2020,7 @@ yyreduce:
|
|
2020
2020
|
|
2021
2021
|
case 71:
|
2022
2022
|
#line 165 "fst-compiler.yy"
|
2023
|
-
{ (yyval.longchar)=utf8toint((yyvsp[(1) - (1)].value)); ;}
|
2023
|
+
{ (yyval.longchar)=utf8toint((yyvsp[(1) - (1)].value)); free((yyvsp[(1) - (1)].value)); ;}
|
2024
2024
|
break;
|
2025
2025
|
|
2026
2026
|
case 72:
|
@@ -2358,6 +2358,8 @@ yyreturn:
|
|
2358
2358
|
|
2359
2359
|
|
2360
2360
|
extern FILE *yyin;
|
2361
|
+
static int Compact=0;
|
2362
|
+
static int LowMem=0;
|
2361
2363
|
|
2362
2364
|
/*******************************************************************/
|
2363
2365
|
/* */
|
@@ -2373,3 +2375,124 @@ void yyerror(char *text)
|
|
2373
2375
|
exit(1);
|
2374
2376
|
}
|
2375
2377
|
|
2378
|
+
|
2379
|
+
/*******************************************************************/
|
2380
|
+
/* */
|
2381
|
+
/* warn */
|
2382
|
+
/* */
|
2383
|
+
/*******************************************************************/
|
2384
|
+
|
2385
|
+
void warn(char *text)
|
2386
|
+
|
2387
|
+
{
|
2388
|
+
cerr << "\n" << FileName << ":" << yylineno << ": warning: " << text << "!\n";
|
2389
|
+
}
|
2390
|
+
|
2391
|
+
|
2392
|
+
/*******************************************************************/
|
2393
|
+
/* */
|
2394
|
+
/* warn2 */
|
2395
|
+
/* */
|
2396
|
+
/*******************************************************************/
|
2397
|
+
|
2398
|
+
void warn2(char *text, char *text2)
|
2399
|
+
|
2400
|
+
{
|
2401
|
+
cerr << "\n" << FileName << ":" << yylineno << ": warning: " << text << ": ";
|
2402
|
+
cerr << text2 << "\n";
|
2403
|
+
}
|
2404
|
+
|
2405
|
+
|
2406
|
+
/*******************************************************************/
|
2407
|
+
/* */
|
2408
|
+
/* get_flags */
|
2409
|
+
/* */
|
2410
|
+
/*******************************************************************/
|
2411
|
+
|
2412
|
+
void get_flags( int *argc, char **argv )
|
2413
|
+
|
2414
|
+
{
|
2415
|
+
for( int i=1; i<*argc; i++ ) {
|
2416
|
+
if (strcmp(argv[i],"-c") == 0) {
|
2417
|
+
Compact = 1;
|
2418
|
+
argv[i] = NULL;
|
2419
|
+
}
|
2420
|
+
else if (strcmp(argv[i],"-l") == 0) {
|
2421
|
+
LowMem = 1;
|
2422
|
+
argv[i] = NULL;
|
2423
|
+
}
|
2424
|
+
else if (strcmp(argv[i],"-q") == 0) {
|
2425
|
+
Verbose = 0;
|
2426
|
+
argv[i] = NULL;
|
2427
|
+
}
|
2428
|
+
else if (strcmp(argv[i],"-s") == 0) {
|
2429
|
+
Switch = 1;
|
2430
|
+
argv[i] = NULL;
|
2431
|
+
}
|
2432
|
+
}
|
2433
|
+
// remove flags from the argument list
|
2434
|
+
int k;
|
2435
|
+
for( int i=k=1; i<*argc; i++)
|
2436
|
+
if (argv[i] != NULL)
|
2437
|
+
argv[k++] = argv[i];
|
2438
|
+
*argc = k;
|
2439
|
+
}
|
2440
|
+
|
2441
|
+
|
2442
|
+
/*******************************************************************/
|
2443
|
+
/* */
|
2444
|
+
/* main */
|
2445
|
+
/* */
|
2446
|
+
/*******************************************************************/
|
2447
|
+
|
2448
|
+
int main( int argc, char *argv[] )
|
2449
|
+
|
2450
|
+
{
|
2451
|
+
FILE *file;
|
2452
|
+
|
2453
|
+
get_flags(&argc, argv);
|
2454
|
+
if (argc < 3) {
|
2455
|
+
fprintf(stderr,"\nUsage: %s [options] infile outfile\n", argv[0]);
|
2456
|
+
fprintf(stderr,"\nOPTIONS:\n");
|
2457
|
+
fprintf(stderr,"-c\tStore the transducer in fst-infl2 format.\n");
|
2458
|
+
fprintf(stderr,"-l\tStore the transducer in fst-infl3 format.\n");
|
2459
|
+
fprintf(stderr,"-s\tSwitch the upper and lower levels producing a transducer for generation rather than recognition.\n");
|
2460
|
+
fprintf(stderr,"-q\tquiet mode\n\n");
|
2461
|
+
exit(1);
|
2462
|
+
}
|
2463
|
+
if ((file = fopen(argv[1],"rt")) == NULL) {
|
2464
|
+
fprintf(stderr,"\nError: Cannot open grammar file \"%s\"\n\n", argv[1]);
|
2465
|
+
exit(1);
|
2466
|
+
}
|
2467
|
+
FileName = argv[1];
|
2468
|
+
Result = NULL;
|
2469
|
+
TheAlphabet.utf8 = UTF8;
|
2470
|
+
yyin = file;
|
2471
|
+
try {
|
2472
|
+
yyparse();
|
2473
|
+
Result->alphabet.utf8 = UTF8;
|
2474
|
+
if (Verbose)
|
2475
|
+
cerr << "\n";
|
2476
|
+
if (Result->is_empty())
|
2477
|
+
warn("resulting transducer is empty");
|
2478
|
+
if ((file = fopen(argv[2],"wb")) == NULL) {
|
2479
|
+
fprintf(stderr,"\nError: Cannot open output file %s\n\n", argv[2]);
|
2480
|
+
exit(1);
|
2481
|
+
}
|
2482
|
+
if (Compact) {
|
2483
|
+
MakeCompactTransducer ca(*Result);
|
2484
|
+
delete Result;
|
2485
|
+
ca.store(file);
|
2486
|
+
}
|
2487
|
+
else if (LowMem)
|
2488
|
+
Result->store_lowmem(file);
|
2489
|
+
else
|
2490
|
+
Result->store(file);
|
2491
|
+
fclose(file);
|
2492
|
+
}
|
2493
|
+
catch(const char* p) {
|
2494
|
+
cerr << "\n" << p << "\n\n";
|
2495
|
+
exit(1);
|
2496
|
+
}
|
2497
|
+
}
|
2498
|
+
|
@@ -45,7 +45,7 @@
|
|
45
45
|
PRINT = 261,
|
46
46
|
POS = 262,
|
47
47
|
INSERT = 263,
|
48
|
-
|
48
|
+
SWITCH = 264,
|
49
49
|
ARROW = 265,
|
50
50
|
REPLACE = 266,
|
51
51
|
SYMBOL = 267,
|
@@ -67,7 +67,7 @@
|
|
67
67
|
#define PRINT 261
|
68
68
|
#define POS 262
|
69
69
|
#define INSERT 263
|
70
|
-
#define
|
70
|
+
#define SWITCH 264
|
71
71
|
#define ARROW 265
|
72
72
|
#define REPLACE 266
|
73
73
|
#define SYMBOL 267
|
@@ -43,7 +43,7 @@ Transducer *Result;
|
|
43
43
|
Contexts *contexts;
|
44
44
|
}
|
45
45
|
|
46
|
-
%token <number> NEWLINE ALPHA COMPOSE PRINT POS INSERT
|
46
|
+
%token <number> NEWLINE ALPHA COMPOSE PRINT POS INSERT SWITCH
|
47
47
|
%token <type> ARROW
|
48
48
|
%token <rtype> REPLACE
|
49
49
|
%token <name> SYMBOL VAR SVAR RVAR RSVAR
|
@@ -111,7 +111,7 @@ RE: RE ARROW CONTEXTS2 { $$ = restriction($1,$2,$3,0); }
|
|
111
111
|
| RE '?' { $$ = optional($1); }
|
112
112
|
| RE RE %prec SEQ { $$ = catenate($1, $2); }
|
113
113
|
| '!' RE { $$ = negation($2); }
|
114
|
-
|
|
114
|
+
| SWITCH RE { $$ = switch_levels($2); }
|
115
115
|
| '^' RE { $$ = upper_level($2); }
|
116
116
|
| '_' RE { $$ = lower_level($2); }
|
117
117
|
| RE '&' RE { $$ = conjunction($1, $3); }
|
@@ -162,7 +162,7 @@ VALUE: LCHAR '-' LCHAR { $$=add_values($1,$3,NULL); }
|
|
162
162
|
;
|
163
163
|
|
164
164
|
LCHAR: CHARACTER { $$=$1; }
|
165
|
-
| UTF8CHAR { $$=utf8toint($1); }
|
165
|
+
| UTF8CHAR { $$=utf8toint($1); free($1); }
|
166
166
|
| SCHAR { $$=$1; }
|
167
167
|
;
|
168
168
|
|
data/ext/sfst_machine/fst.C
CHANGED
@@ -124,6 +124,27 @@ void Node::init()
|
|
124
124
|
}
|
125
125
|
|
126
126
|
|
127
|
+
/*******************************************************************/
|
128
|
+
/* */
|
129
|
+
/* Node::clear_visited */
|
130
|
+
/* */
|
131
|
+
/*******************************************************************/
|
132
|
+
|
133
|
+
// Resets the visited mark of this node and of every node reachable from it
// through its arcs.
//
// nodeset collects the nodes that have already been reset; a node found in
// it is skipped, which also keeps the traversal from looping on cycles.
// After each newly reset node the current size of nodeset is written to
// stderr.
//
// The traversal uses an explicit work list instead of recursion so that its
// depth is not limited by the call stack. The sequence of numbers printed is
// unchanged because it only depends on how many nodes have been inserted.
void Node::clear_visited( NodeHashSet &nodeset )

{
  std::vector<Node*> pending(1, this);

  while (!pending.empty()) {
    Node *node = pending.back();
    pending.pop_back();

    if (nodeset.find( node ) != nodeset.end())
      continue;  // already reset

    node->visited = 0;
    nodeset.insert( node );
    // size() returns a size_t; cast it so that it matches the %lu conversion
    fprintf(stderr," %lu", (unsigned long)nodeset.size());

    for( ArcsIter p(node->arcs()); p; p++ ) {
      Arc *arc=p;
      pending.push_back( arc->target_node() );
    }
  }
}
|
146
|
+
|
147
|
+
|
127
148
|
/*******************************************************************/
|
128
149
|
/* */
|
129
150
|
/* NodeNumbering::number_node */
|
@@ -196,14 +217,16 @@ Arc *Transducer::new_arc( Label l, Node *target )
|
|
196
217
|
/* */
|
197
218
|
/*******************************************************************/
|
198
219
|
|
199
|
-
void Transducer::add_string( char *s, bool extended )
|
220
|
+
void Transducer::add_string( char *s, bool extended, Alphabet *a )
|
200
221
|
|
201
222
|
{
|
223
|
+
if (a == NULL)
|
224
|
+
a = &alphabet;
|
225
|
+
|
202
226
|
Node *node=root_node();
|
203
|
-
|
204
227
|
Label l;
|
205
|
-
while (!(l =
|
206
|
-
|
228
|
+
while (!(l = a->next_label(s, extended)).is_epsilon()) {
|
229
|
+
a->insert(l);
|
207
230
|
Arcs *arcs=node->arcs();
|
208
231
|
node = arcs->target_node( l );
|
209
232
|
if (node == NULL) {
|
@@ -271,7 +294,7 @@ Transducer::Transducer( istream &is, const Alphabet *a, bool verbose )
|
|
271
294
|
break;
|
272
295
|
buffer[l+1] = 0;
|
273
296
|
|
274
|
-
add_string(buffer,extended);
|
297
|
+
add_string(buffer, extended);
|
275
298
|
}
|
276
299
|
if (verbose && n >= 10000)
|
277
300
|
cerr << "\n";
|
@@ -515,7 +538,13 @@ int Transducer::print_strings( FILE *file, bool with_brackets )
|
|
515
538
|
bool Transducer::analyze_string( char *string, FILE *file, bool with_brackets )
|
516
539
|
|
517
540
|
{
|
518
|
-
|
541
|
+
vector<Character> input;
|
542
|
+
alphabet.string2symseq( string, input );
|
543
|
+
vector<Label> labels;
|
544
|
+
for( size_t i=0; i<input.size(); i++ )
|
545
|
+
labels.push_back(Label(input[i]));
|
546
|
+
|
547
|
+
Transducer a1(labels);
|
519
548
|
Transducer *a2=&(*this || a1);
|
520
549
|
Transducer *a3=&(a2->lower_level());
|
521
550
|
delete a2;
|
@@ -598,14 +627,15 @@ static void print_node( ostream &s, Node *node, NodeNumbering &index,
|
|
598
627
|
{
|
599
628
|
if (!node->was_visited( vmark )) {
|
600
629
|
Arcs *arcs=node->arcs();
|
601
|
-
if (node->is_final())
|
602
|
-
s << "final\t" << index[node] << "\n";
|
603
630
|
for( ArcsIter p(arcs); p; p++ ) {
|
604
631
|
Arc *arc=p;
|
605
|
-
s << index[node] << "\t";
|
606
|
-
s << abc.
|
607
|
-
s <<
|
632
|
+
s << index[node] << "\t" << index[arc->target_node()];
|
633
|
+
s << "\t" << abc.write_char(arc->label().lower_char());
|
634
|
+
s << "\t" << abc.write_char(arc->label().upper_char());
|
635
|
+
s << "\n";
|
608
636
|
}
|
637
|
+
if (node->is_final())
|
638
|
+
s << index[node] << "\n";
|
609
639
|
for( ArcsIter p(arcs); p; p++ ) {
|
610
640
|
Arc *arc=p;
|
611
641
|
print_node( s, arc->target_node(), index, vmark, abc );
|
@@ -928,18 +958,22 @@ void Transducer::read_transducer_text( FILE *file )
|
|
928
958
|
for( size_t line=0; fgets(buffer, 10000, file ); line++ ) {
|
929
959
|
char *p = buffer;
|
930
960
|
char *s = next_string(p, line);
|
931
|
-
|
932
|
-
|
933
|
-
|
934
|
-
}
|
961
|
+
Node *node = create_node( nodes, s, line );
|
962
|
+
if (p == NULL)
|
963
|
+
node->set_final(true);
|
935
964
|
else {
|
936
|
-
Node *node = create_node( nodes, s, line );
|
937
|
-
s = next_string(p, line);
|
938
|
-
Label l = alphabet.next_label( s, 2 );
|
939
|
-
if (*s != 0 || l == Label::epsilon)
|
940
|
-
error_message( line );
|
941
965
|
s = next_string(p, line);
|
942
966
|
Node *target = create_node( nodes, s, line );
|
967
|
+
|
968
|
+
s = next_string(p, line);
|
969
|
+
Character lc = alphabet.add_symbol(s);
|
970
|
+
s = next_string(p, line);
|
971
|
+
Character uc = alphabet.add_symbol(s);
|
972
|
+
Label l(lc,uc);
|
973
|
+
if (l == Label::epsilon)
|
974
|
+
error_message( line );
|
975
|
+
|
976
|
+
alphabet.insert(l);
|
943
977
|
node->add_arc( l, target, this );
|
944
978
|
}
|
945
979
|
}
|
data/ext/sfst_machine/fst.h
CHANGED
@@ -25,7 +25,7 @@
|
|
25
25
|
|
26
26
|
#include "mem.h"
|
27
27
|
|
28
|
-
typedef unsigned
|
28
|
+
typedef unsigned short VType;
|
29
29
|
|
30
30
|
extern int Quiet;
|
31
31
|
|
@@ -41,8 +41,7 @@ struct hashf {
|
|
41
41
|
struct equalf {
|
42
42
|
int operator()(const Node *n1, const Node *n2) const { return n1==n2; }
|
43
43
|
};
|
44
|
-
typedef
|
45
|
-
|
44
|
+
typedef hash_set<Node*, hashf, equalf> NodeHashSet;
|
46
45
|
|
47
46
|
|
48
47
|
/***************** class Arc *************************************/
|
@@ -154,6 +153,7 @@ class Node {
|
|
154
153
|
Arcs *arcs( void ) { return &arcsp; };
|
155
154
|
const Arcs *arcs( void ) const { return &arcsp; };
|
156
155
|
Node *forward( void ) { return forwardp; };
|
156
|
+
void clear_visited( NodeHashSet &nodeset );
|
157
157
|
bool was_visited( VType vmark ) {
|
158
158
|
if (visited == vmark)
|
159
159
|
return true;
|
@@ -179,7 +179,7 @@ class Node2Int {
|
|
179
179
|
return (n1 == n2);
|
180
180
|
}
|
181
181
|
};
|
182
|
-
typedef
|
182
|
+
typedef hash_map<Node*, int, hashf, equalf> NL;
|
183
183
|
|
184
184
|
private:
|
185
185
|
int current_number;
|
@@ -231,7 +231,7 @@ class PairMapping {
|
|
231
231
|
return (p1.first==p2.first && p1.second == p2.second);
|
232
232
|
}
|
233
233
|
};
|
234
|
-
typedef
|
234
|
+
typedef hash_map<NodePair, Node*, hashf, equalf> PairMap;
|
235
235
|
PairMap pm;
|
236
236
|
|
237
237
|
public:
|
@@ -256,8 +256,16 @@ class Transducer {
|
|
256
256
|
Mem mem;
|
257
257
|
|
258
258
|
typedef std::set<Label, Label::label_cmp> LabelSet;
|
259
|
-
typedef
|
259
|
+
typedef hash_map<Character, char*> SymbolMap;
|
260
260
|
|
261
|
+
void incr_vmark( void ) {
|
262
|
+
if (++vmark == 0) {
|
263
|
+
NodeHashSet nodes;
|
264
|
+
root.clear_visited( nodes );
|
265
|
+
fprintf(stderr,"clearing flags\n");
|
266
|
+
vmark = 1;
|
267
|
+
}
|
268
|
+
};
|
261
269
|
void reverse_node( Node *old_node, Transducer *new_node );
|
262
270
|
Label recode_label( Label, bool lswitch, bool recode, Alphabet& );
|
263
271
|
Node *copy_nodes( Node *n, Transducer *a,
|
@@ -287,10 +295,6 @@ class Transducer {
|
|
287
295
|
|
288
296
|
public:
|
289
297
|
VType vmark;
|
290
|
-
void incr_vmark( void ) {
|
291
|
-
if (++vmark == 0)
|
292
|
-
throw "Overflow of generation counter!";
|
293
|
-
};
|
294
298
|
Alphabet alphabet; // The set of all labels, i.e. character pairs
|
295
299
|
|
296
300
|
Transducer( void ) : root(), mem()
|
@@ -308,7 +312,7 @@ class Transducer {
|
|
308
312
|
const Node *root_node( void ) const { return &root; }; // returns the root node
|
309
313
|
Node *new_node( void ); // memory allocation for a new node
|
310
314
|
Arc *new_arc( Label l, Node *target ); // memory allocation for a new arc
|
311
|
-
void add_string( char *s, bool extended=false );
|
315
|
+
void add_string( char *s, bool extended=false, Alphabet *a=NULL );
|
312
316
|
void complete_alphabet( void );
|
313
317
|
void minimise_alphabet( void );
|
314
318
|
void prune( void ); // remove unnecessary arcs
|