ruby-sfst 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +1 -0
- data/Manifest +31 -0
- data/README.rdoc +25 -0
- data/Rakefile +22 -0
- data/ext/sfst_machine/alphabet.C +807 -0
- data/ext/sfst_machine/alphabet.h +281 -0
- data/ext/sfst_machine/basic.C +84 -0
- data/ext/sfst_machine/basic.h +24 -0
- data/ext/sfst_machine/compact.C +616 -0
- data/ext/sfst_machine/compact.h +98 -0
- data/ext/sfst_machine/determinise.C +304 -0
- data/ext/sfst_machine/extconf.rb +4 -0
- data/ext/sfst_machine/fst-compiler.C +2375 -0
- data/ext/sfst_machine/fst-compiler.h +113 -0
- data/ext/sfst_machine/fst-compiler.yy +213 -0
- data/ext/sfst_machine/fst.C +966 -0
- data/ext/sfst_machine/fst.h +365 -0
- data/ext/sfst_machine/interface.C +1838 -0
- data/ext/sfst_machine/interface.h +94 -0
- data/ext/sfst_machine/make-compact.C +328 -0
- data/ext/sfst_machine/make-compact.h +34 -0
- data/ext/sfst_machine/mem.h +74 -0
- data/ext/sfst_machine/operators.C +1131 -0
- data/ext/sfst_machine/sfst_machine.cc +411 -0
- data/ext/sfst_machine/utf8-scanner.C +2197 -0
- data/ext/sfst_machine/utf8-scanner.ll +179 -0
- data/ext/sfst_machine/utf8.C +146 -0
- data/ext/sfst_machine/utf8.h +19 -0
- data/lib/sfst.rb +99 -0
- data/ruby-sfst.gemspec +34 -0
- data/test/test_sfst.fst +3 -0
- data/test/test_sfst.rb +119 -0
- metadata +100 -0
@@ -0,0 +1,113 @@
|
|
1
|
+
/* A Bison parser, made by GNU Bison 2.3. */
|
2
|
+
|
3
|
+
/* Skeleton interface for Bison's Yacc-like parsers in C
|
4
|
+
|
5
|
+
Copyright (C) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004, 2005, 2006
|
6
|
+
Free Software Foundation, Inc.
|
7
|
+
|
8
|
+
This program is free software; you can redistribute it and/or modify
|
9
|
+
it under the terms of the GNU General Public License as published by
|
10
|
+
the Free Software Foundation; either version 2, or (at your option)
|
11
|
+
any later version.
|
12
|
+
|
13
|
+
This program is distributed in the hope that it will be useful,
|
14
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
16
|
+
GNU General Public License for more details.
|
17
|
+
|
18
|
+
You should have received a copy of the GNU General Public License
|
19
|
+
along with this program; if not, write to the Free Software
|
20
|
+
Foundation, Inc., 51 Franklin Street, Fifth Floor,
|
21
|
+
Boston, MA 02110-1301, USA. */
|
22
|
+
|
23
|
+
/* As a special exception, you may create a larger work that contains
|
24
|
+
part or all of the Bison parser skeleton and distribute that work
|
25
|
+
under terms of your choice, so long as that work isn't itself a
|
26
|
+
parser generator using the skeleton or a modified version thereof
|
27
|
+
as a parser skeleton. Alternatively, if you modify or redistribute
|
28
|
+
the parser skeleton itself, you may (at your option) remove this
|
29
|
+
special exception, which will cause the skeleton and the resulting
|
30
|
+
Bison output files to be licensed under the GNU General Public
|
31
|
+
License without this special exception.
|
32
|
+
|
33
|
+
This special exception was added by the Free Software Foundation in
|
34
|
+
version 2.2 of Bison. */
|
35
|
+
|
36
|
+
/* Tokens. */
|
37
|
+
#ifndef YYTOKENTYPE
# define YYTOKENTYPE
/* Numeric codes of the grammar's terminals.  They are declared as an enum so
   that GDB and other debuggers can display them symbolically.  This header
   is Bison output; the grouping below follows the value-type tags given on
   the %token lines of fst-compiler.yy. */
enum yytokentype {
  /* <number> */
  NEWLINE   = 258,
  ALPHA     = 259,
  COMPOSE   = 260,
  PRINT     = 261,
  POS       = 262,
  INSERT    = 263,
  REV       = 264,
  /* <type> */
  ARROW     = 265,
  /* <rtype> */
  REPLACE   = 266,
  /* <name> */
  SYMBOL    = 267,
  VAR       = 268,
  SVAR      = 269,
  RVAR      = 270,
  RSVAR     = 271,
  /* <value> */
  STRING    = 272,
  STRING2   = 273,
  UTF8CHAR  = 274,
  /* <uchar> */
  CHARACTER = 275,
  /* no value type; appears in the grammar only in %left and %prec */
  SEQ       = 276
};
#endif
|
63
|
+
/* Tokens. */
|
64
|
+
/* Macro form of the token codes; the values are the same as the
   enumerators of enum yytokentype. */
#define NEWLINE    258
#define ALPHA      259
#define COMPOSE    260
#define PRINT      261
#define POS        262
#define INSERT     263
#define REV        264
#define ARROW      265
#define REPLACE    266
#define SYMBOL     267
#define VAR        268
#define SVAR       269
#define RVAR       270
#define RSVAR      271
#define STRING     272
#define STRING2    273
#define UTF8CHAR   274
#define CHARACTER  275
#define SEQ        276
|
83
|
+
|
84
|
+
|
85
|
+
|
86
|
+
|
87
|
+
#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
|
88
|
+
typedef union YYSTYPE
|
89
|
+
#line 31 "fst-compiler.yy"
|
90
|
+
{
|
91
|
+
int number;
|
92
|
+
Twol_Type type;
|
93
|
+
Repl_Type rtype;
|
94
|
+
char *name;
|
95
|
+
char *value;
|
96
|
+
unsigned char uchar;
|
97
|
+
unsigned int longchar;
|
98
|
+
Character character;
|
99
|
+
Transducer *expression;
|
100
|
+
Range *range;
|
101
|
+
Ranges *ranges;
|
102
|
+
Contexts *contexts;
|
103
|
+
}
|
104
|
+
/* Line 1489 of yacc.c. */
|
105
|
+
#line 106 "fst-compiler.H"
|
106
|
+
YYSTYPE;
|
107
|
+
# define yystype YYSTYPE /* obsolescent; will be withdrawn */
|
108
|
+
# define YYSTYPE_IS_DECLARED 1
|
109
|
+
# define YYSTYPE_IS_TRIVIAL 1
|
110
|
+
#endif
|
111
|
+
|
112
|
+
extern YYSTYPE yylval;
|
113
|
+
|
@@ -0,0 +1,213 @@
|
|
1
|
+
%{
|
2
|
+
/*******************************************************************/
|
3
|
+
/* */
|
4
|
+
/* FILE fst-compiler.yy */
|
5
|
+
/* MODULE fst-compiler */
|
6
|
+
/* PROGRAM SFST */
|
7
|
+
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
|
8
|
+
/* */
|
9
|
+
/*******************************************************************/
|
10
|
+
|
11
|
+
#include <stdio.h>
|
12
|
+
|
13
|
+
#include "make-compact.h"
|
14
|
+
#include "interface.h"
|
15
|
+
|
16
|
+
using std::cerr;
|
17
|
+
|
18
|
+
extern int yylineno;
|
19
|
+
extern char *yytext;
|
20
|
+
|
21
|
+
void yyerror(char *text);
|
22
|
+
void warn(char *text);
|
23
|
+
void warn2(char *text, char *text2);
|
24
|
+
int yylex( void );
|
25
|
+
int yyparse( void );
|
26
|
+
|
27
|
+
static int Switch=0;
|
28
|
+
Transducer *Result;
|
29
|
+
%}
|
30
|
+
|
31
|
+
/* Semantic value of every grammar symbol; the trailing comments list the
   symbols that are tagged with each member below. */
%union {
  int            number;      /* NEWLINE ALPHA COMPOSE PRINT POS INSERT REV */
  Twol_Type      type;        /* ARROW */
  Repl_Type      rtype;       /* REPLACE */
  char          *name;        /* SYMBOL VAR SVAR RVAR RSVAR */
  char          *value;       /* STRING STRING2 UTF8CHAR */
  unsigned char  uchar;       /* CHARACTER, SCHAR */
  unsigned int   longchar;    /* LCHAR */
  Character      character;   /* CODE */
  Transducer    *expression;  /* RE */
  Range         *range;       /* RANGE VALUE VALUES */
  Ranges        *ranges;      /* RANGES */
  Contexts      *contexts;    /* CONTEXT CONTEXT2 CONTEXTS CONTEXTS2 */
}
|
45
|
+
|
46
|
+
/* Terminals, grouped by the %union member that carries their value. */
%token <number>     NEWLINE ALPHA COMPOSE PRINT POS INSERT REV
%token <type>       ARROW
%token <rtype>      REPLACE
%token <name>       SYMBOL VAR SVAR RVAR RSVAR
%token <value>      STRING STRING2 UTF8CHAR
%token <uchar>      CHARACTER

/* Nonterminals that carry a semantic value. */
%type  <uchar>      SCHAR
%type  <longchar>   LCHAR
%type  <character>  CODE
%type  <expression> RE
%type  <range>      RANGE VALUE VALUES
%type  <ranges>     RANGES
%type  <contexts>   CONTEXT CONTEXT2 CONTEXTS CONTEXTS2

/* Operator precedence, all left-associative.  Bison assigns increasing
   precedence to later declarations, so PRINT/INSERT bind loosest and the
   postfix '*' and '+' bind tightest.  SEQ is used in this grammar only
   through "%prec SEQ" on the RE RE rule. */
%left PRINT INSERT
%left ARROW REPLACE
%left COMPOSE
%left '|'
%left '-'
%left '&'
%left SEQ
%left '!' '^' '_'
%left '*' '+'
|
70
|
+
%%
|
71
|
+
|
72
|
+
/* Start symbol: any number of assignments, one final expression, then
   optional newlines.  The final expression is passed through result()
   together with Switch and stored in the global Result. */
ALL:      ASSIGNMENTS RE NEWLINES
            { Result = result($2, Switch); }
        ;
|
74
|
+
|
75
|
+
/* A possibly empty sequence of ASSIGNMENT statements and NEWLINE tokens.
   No semantic value is produced. */
ASSIGNMENTS:
          ASSIGNMENTS ASSIGNMENT    {}
        | ASSIGNMENTS NEWLINE       {}
        | /* nothing */             {}
        ;
|
79
|
+
|
80
|
+
/* One statement: a variable definition, a write of an expression to a file,
   or an alphabet definition.  For the four definition forms a warning is
   issued through warn2() when the def_* helper returns a true value. */
ASSIGNMENT:
          VAR '=' RE
            { if (def_var($1,$3)) warn2("assignment of empty transducer to",$1); }
        | RVAR '=' RE
            { if (def_rvar($1,$3)) warn2("assignment of empty transducer to",$1); }
        | SVAR '=' VALUES
            { if (def_svar($1,$3)) warn2("assignment of empty symbol range to",$1); }
        | RSVAR '=' VALUES
            { if (def_svar($1,$3)) warn2("assignment of empty symbol range to",$1); }
        | RE PRINT STRING
            { write_to_file($1, $3); }
        | ALPHA RE
            { def_alphabet($2); }
        ;
|
87
|
+
|
88
|
+
/* RE: transducer-valued expressions.  Each action builds its value by
   calling a helper that is declared outside this file.  The order of the
   alternatives is significant for Bison and must not be changed. */
RE:
        /* restriction and replacement operators */
          RE ARROW CONTEXTS2                { $$ = restriction($1, $2, $3, 0); }
        | RE '^' ARROW CONTEXTS2            { $$ = restriction($1, $3, $4, 1); }
        | RE '_' ARROW CONTEXTS2            { $$ = restriction($1, $3, $4, -1); }
        | RE REPLACE CONTEXT2               { $$ = replace_in_context(minimise(explode($1)), $2, $3, false); }
        | RE REPLACE '?' CONTEXT2           { $$ = replace_in_context(minimise(explode($1)), $2, $4, true); }
        | RE REPLACE '(' ')'                { $$ = replace(minimise(explode($1)), $2, false); }
        | RE REPLACE '?' '(' ')'            { $$ = replace(minimise(explode($1)), $2, true); }
        /* rules with optional left and right expressions */
        | RE RANGE ARROW RANGE RE           { $$ = make_rule($1, $2, $3, $4, $5); }
        | RE RANGE ARROW RANGE              { $$ = make_rule($1, $2, $3, $4, NULL); }
        | RANGE ARROW RANGE RE              { $$ = make_rule(NULL, $1, $2, $3, $4); }
        | RANGE ARROW RANGE                 { $$ = make_rule(NULL, $1, $2, $3, NULL); }
        | RE COMPOSE RE                     { $$ = composition($1, $3); }
        /* mappings between range lists */
        | '{' RANGES '}' ':' '{' RANGES '}' { $$ = make_mapping($2, $6); }
        | RANGE ':' '{' RANGES '}'          { $$ = make_mapping(add_range($1, NULL), $4); }
        | '{' RANGES '}' ':' RANGE          { $$ = make_mapping($2, add_range($5, NULL)); }
        | RE INSERT CODE ':' CODE           { $$ = freely_insert($1, $3, $5); }
        | RE INSERT CODE                    { $$ = freely_insert($1, $3, $3); }
        /* atoms */
        | RANGE ':' RANGE                   { $$ = new_transducer($1, $3); }
        | RANGE                             { $$ = new_transducer($1, $1); }
        | VAR                               { $$ = var_value($1); }
        | RVAR                              { $$ = rvar_value($1); }
        /* postfix operators and juxtaposition */
        | RE '*'                            { $$ = repetition($1); }
        | RE '+'                            { $$ = repetition2($1); }
        | RE '?'                            { $$ = optional($1); }
        | RE RE %prec SEQ                   { $$ = catenate($1, $2); }
        /* prefix operators */
        | '!' RE                            { $$ = negation($2); }
        | REV RE                            { $$ = switch_levels($2); }
        | '^' RE                            { $$ = upper_level($2); }
        | '_' RE                            { $$ = lower_level($2); }
        /* binary operators */
        | RE '&' RE                         { $$ = conjunction($1, $3); }
        | RE '-' RE                         { $$ = subtraction($1, $3); }
        | RE '|' RE                         { $$ = disjunction($1, $3); }
        /* grouping and external input */
        | '(' RE ')'                        { $$ = $2; }
        | STRING                            { $$ = read_words($1); }
        | STRING2                           { $$ = read_transducer($1); }
        ;
|
124
|
+
|
125
|
+
/* A possibly empty list of RANGE items.  The list is assembled with
   add_range() from the last item towards the first; the empty list is
   NULL. */
RANGES:   RANGE RANGES      { $$ = add_range($1, $2); }
        | /* empty */       { $$ = NULL; }
        ;
|
128
|
+
|
129
|
+
/* A single range: a bracketed value list, its complement, a bracketed
   RSVAR reference, the '.' token (represented as NULL), or one CODE. */
RANGE:    '[' VALUES ']'        { $$ = $2; }
        | '[' '^' VALUES ']'    { $$ = complement_range($3); }
        | '[' RSVAR ']'         { $$ = rsvar_value($2); }
        | '.'                   { $$ = NULL; }
        | CODE                  { $$ = add_value($1, NULL); }
        ;
|
135
|
+
|
136
|
+
/* A context list, optionally wrapped in parentheses. */
CONTEXTS2:
          CONTEXTS              { $$ = $1; }
        | '(' CONTEXTS ')'      { $$ = $2; }
        ;
|
139
|
+
|
140
|
+
/* One or more CONTEXT items separated by commas; add_context() joins the
   first item with the remainder of the list. */
CONTEXTS: CONTEXT ',' CONTEXTS  { $$ = add_context($1, $3); }
        | CONTEXT               { $$ = $1; }
        ;
|
143
|
+
|
144
|
+
/* A single context, optionally wrapped in parentheses. */
CONTEXT2: CONTEXT               { $$ = $1; }
        | '(' CONTEXT ')'       { $$ = $2; }
        ;
|
147
|
+
|
148
|
+
/* A context is written around the POS token: an expression on both sides,
   or on one side only.  A missing side is passed to make_context() as
   NULL. */
CONTEXT:  RE POS RE     { $$ = make_context($1, $3); }
        | POS RE        { $$ = make_context(NULL, $2); }
        | RE POS        { $$ = make_context($1, NULL); }
        ;
|
152
|
+
|
153
|
+
/* One or more VALUE items; append_values() combines the first item with
   the rest of the list. */
VALUES:   VALUE VALUES  { $$ = append_values($1, $2); }
        | VALUE         { $$ = $1; }
        ;
|
156
|
+
|
157
|
+
/* One element of a value list: a character interval "a-b", an SVAR
   reference, or a single character/code. */
VALUE:    LCHAR '-' LCHAR   { $$ = add_values($1, $3, NULL); }
        | SVAR              { $$ = svar_value($1); }
        | LCHAR             { $$ = add_value(character_code($1), NULL); }
        | CODE              { $$ = add_value($1, NULL); }
        | SCHAR             { $$ = add_value($1, NULL); }
        ;
|
163
|
+
|
164
|
+
/* A character held in the <longchar> (unsigned int) member: a CHARACTER
   token, a UTF8CHAR converted with utf8toint(), or an SCHAR. */
LCHAR:    CHARACTER     { $$ = $1; }
        | UTF8CHAR      { $$ = utf8toint($1); }
        | SCHAR         { $$ = $1; }
        ;
|
168
|
+
|
169
|
+
/* A symbol code of type Character: character_code() is applied to a
   CHARACTER token, symbol_code() to UTF8CHAR and SYMBOL tokens. */
CODE:     CHARACTER     { $$ = character_code($1); }
        | UTF8CHAR      { $$ = symbol_code($1); }
        | SYMBOL        { $$ = symbol_code($1); }
        ;
|
173
|
+
|
174
|
+
/* Punctuation tokens accepted as ordinary characters wherever a VALUE or
   LCHAR is expected.  Each one yields character_code() of the same
   character, stored in the <uchar> member. */
SCHAR:    '.'   { $$ = character_code('.'); }
        | '!'   { $$ = character_code('!'); }
        | '?'   { $$ = character_code('?'); }
        | '{'   { $$ = character_code('{'); }
        | '}'   { $$ = character_code('}'); }
        | ')'   { $$ = character_code(')'); }
        | '('   { $$ = character_code('('); }
        | '&'   { $$ = character_code('&'); }
        | '|'   { $$ = character_code('|'); }
        | '*'   { $$ = character_code('*'); }
        | '+'   { $$ = character_code('+'); }
        | ':'   { $$ = character_code(':'); }
        | ','   { $$ = character_code(','); }
        | '='   { $$ = character_code('='); }
        | '_'   { $$ = character_code('_'); }
        | '^'   { $$ = character_code('^'); }
        | '-'   { $$ = character_code('-'); }
        ;
|
192
|
+
|
193
|
+
/* Zero or more NEWLINE tokens following the final expression; no semantic
   value is produced.  The rule is left-recursive so that each NEWLINE is
   reduced as soon as it has been shifted and the parser stack stays at a
   constant depth.  With right recursion every trailing NEWLINE would stay
   on the stack until end of input, and a long run of blank lines could
   exhaust it. */
NEWLINES: NEWLINES NEWLINE      {}
        | /* nothing */         {}
        ;
|
196
|
+
|
197
|
+
%%
|
198
|
+
|
199
|
+
extern FILE *yyin;
|
200
|
+
|
201
|
+
/*******************************************************************/
|
202
|
+
/* */
|
203
|
+
/* yyerror */
|
204
|
+
/* */
|
205
|
+
/*******************************************************************/
|
206
|
+
|
207
|
+
void yyerror(char *text)
|
208
|
+
|
209
|
+
{
|
210
|
+
cerr << "\n" << FileName << ":" << yylineno << ": " << text << " at: ";
|
211
|
+
cerr << yytext << "\naborted.\n";
|
212
|
+
exit(1);
|
213
|
+
}
|