ruby-sfst 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +1 -0
- data/Manifest +31 -0
- data/README.rdoc +25 -0
- data/Rakefile +22 -0
- data/ext/sfst_machine/alphabet.C +807 -0
- data/ext/sfst_machine/alphabet.h +281 -0
- data/ext/sfst_machine/basic.C +84 -0
- data/ext/sfst_machine/basic.h +24 -0
- data/ext/sfst_machine/compact.C +616 -0
- data/ext/sfst_machine/compact.h +98 -0
- data/ext/sfst_machine/determinise.C +304 -0
- data/ext/sfst_machine/extconf.rb +4 -0
- data/ext/sfst_machine/fst-compiler.C +2375 -0
- data/ext/sfst_machine/fst-compiler.h +113 -0
- data/ext/sfst_machine/fst-compiler.yy +213 -0
- data/ext/sfst_machine/fst.C +966 -0
- data/ext/sfst_machine/fst.h +365 -0
- data/ext/sfst_machine/interface.C +1838 -0
- data/ext/sfst_machine/interface.h +94 -0
- data/ext/sfst_machine/make-compact.C +328 -0
- data/ext/sfst_machine/make-compact.h +34 -0
- data/ext/sfst_machine/mem.h +74 -0
- data/ext/sfst_machine/operators.C +1131 -0
- data/ext/sfst_machine/sfst_machine.cc +411 -0
- data/ext/sfst_machine/utf8-scanner.C +2197 -0
- data/ext/sfst_machine/utf8-scanner.ll +179 -0
- data/ext/sfst_machine/utf8.C +146 -0
- data/ext/sfst_machine/utf8.h +19 -0
- data/lib/sfst.rb +99 -0
- data/ruby-sfst.gemspec +34 -0
- data/test/test_sfst.fst +3 -0
- data/test/test_sfst.rb +119 -0
- metadata +100 -0
@@ -0,0 +1,179 @@
|
|
1
|
+
%option 8Bit batch yylineno noyywrap
|
2
|
+
|
3
|
+
/* the "incl" state is used to pick up the name of an include file */
|
4
|
+
%x incl
|
5
|
+
|
6
|
+
%{
|
7
|
+
/*******************************************************************/
|
8
|
+
/* */
|
9
|
+
/* FILE scanner.ll */
|
10
|
+
/* MODULE scanner */
|
11
|
+
/* PROGRAM SFST */
|
12
|
+
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
|
13
|
+
/* */
|
14
|
+
/*******************************************************************/
|
15
|
+
|
16
|
+
#include <string.h>
|
17
|
+
|
18
|
+
#include "interface.h"
|
19
|
+
#include "fst-compiler.h"
|
20
|
+
|
21
|
+
#define MAX_INCLUDE_DEPTH 10
|
22
|
+
|
23
|
+
int Include_Stack_Ptr = 0;
|
24
|
+
YY_BUFFER_STATE Include_Stack[MAX_INCLUDE_DEPTH];
|
25
|
+
char *Name_Stack[MAX_INCLUDE_DEPTH];
|
26
|
+
int Lineno_Stack[MAX_INCLUDE_DEPTH];
|
27
|
+
|
28
|
+
char *FileName;
|
29
|
+
|
30
|
+
bool UTF8=true;
|
31
|
+
|
32
|
+
static char *unquote(char *string, bool del_quote=true) {
|
33
|
+
char *s=string, *result=string;
|
34
|
+
if (del_quote)
|
35
|
+
string++;
|
36
|
+
|
37
|
+
while (*string) {
|
38
|
+
if (*string == '\\')
|
39
|
+
string++;
|
40
|
+
*(s++) = *(string++);
|
41
|
+
}
|
42
|
+
|
43
|
+
if (del_quote)
|
44
|
+
s--;
|
45
|
+
*s = '\0';
|
46
|
+
|
47
|
+
return fst_strdup(result);
|
48
|
+
}
|
49
|
+
|
50
|
+
static void print_lineno() {
|
51
|
+
if (!Verbose)
|
52
|
+
return;
|
53
|
+
fputc('\r',stderr);
|
54
|
+
for( int i=0; i<Include_Stack_Ptr; i++ )
|
55
|
+
fputs(" ", stderr);
|
56
|
+
fprintf(stderr,"%s: %d", FileName, yylineno);
|
57
|
+
}
|
58
|
+
|
59
|
+
extern void yyerror(char *text);
|
60
|
+
|
61
|
+
%}
|
62
|
+
|
63
|
+
CC [\x80-\xbf]
|
64
|
+
C1 [A-Za-z0-9._/\-]
|
65
|
+
C2 [A-Za-z0-9._/\-&()+,=?\^|~]
|
66
|
+
C3 [A-Za-z0-9._/\-&()+,=?\^|~#<>]
|
67
|
+
C4 [A-Za-z0-9._/\-&()+,=?\^|~$<>]
|
68
|
+
C5 [\!-;\?-\[\]-\~=]
|
69
|
+
FN [A-Za-z0-9._/\-*+]
|
70
|
+
|
71
|
+
%%
|
72
|
+
|
73
|
+
#include BEGIN(incl);
|
74
|
+
<incl>[ \t]* /* eat the whitespace */
|
75
|
+
<incl>{FN}+ { error2("Missing quotes",yytext); }
|
76
|
+
<incl>\"{FN}+\"  { /* got the include file name */
                   FILE *file;
                   char *name=unquote(yytext);
                   if ( Include_Stack_Ptr >= MAX_INCLUDE_DEPTH )
                     {
                       fprintf( stderr, "Includes nested too deeply" );
                       exit( 1 );
                     }
                   if (Verbose) fputc('\n', stderr);
                   file = fopen( name, "rt" );
                   if (!file)
                     error2("Can't open include file",name);
                   else
                     {
                       /* remember name, line number and buffer of the
                          including file so that <<EOF>> can restore them */
                       Name_Stack[Include_Stack_Ptr] = FileName;
                       FileName = name;
                       Lineno_Stack[Include_Stack_Ptr] = yylineno;
                       yylineno = 1;
                       Include_Stack[Include_Stack_Ptr++]=YY_CURRENT_BUFFER;
                       /* yyin has to point at the new file BEFORE the
                          buffer is created: the buffer records the FILE*
                          it is created with, and yy_switch_to_buffer()
                          reloads yyin from that record.  Creating the
                          buffer from the old yyin made a saved buffer
                          restore the wrong file after a nested include. */
                       yyin = file;
                       yy_switch_to_buffer(
                            yy_create_buffer(yyin, YY_BUF_SIZE));
                       print_lineno();
                       BEGIN(INITIAL);
                     }
                 }
<<EOF>>          {
                   if (Verbose)
                     fputc('\n', stderr);
                   if ( Include_Stack_Ptr <= 0 )
                     {
                       /* end of the top-level file; keep the stack
                          pointer at 0 so that a later scan in the same
                          process never indexes the stacks at -1 */
                       Include_Stack_Ptr = 0;
                       yyterminate();
                     }
                   else
                     {
                       /* end of an included file: close it, drop its
                          name and buffer, and resume the including file */
                       --Include_Stack_Ptr;
                       fclose(yyin);
                       free(FileName);
                       FileName = Name_Stack[Include_Stack_Ptr];
                       yylineno = Lineno_Stack[Include_Stack_Ptr];
                       yy_delete_buffer( YY_CURRENT_BUFFER );
                       yy_switch_to_buffer(Include_Stack[Include_Stack_Ptr]);
                     }
                 }
|
116
|
+
|
117
|
+
|
118
|
+
^[ \t]*\%.*\r?\n { print_lineno(); /* ignore comments */ }
|
119
|
+
|
120
|
+
\%.*\\[ \t]*\r?\n { print_lineno(); /* ignore comments */ }
|
121
|
+
|
122
|
+
\%.* { /* ignore comments */ }
|
123
|
+
|
124
|
+
|
125
|
+
^[ \t]*ALPHABET[ \t]*= { return ALPHA; }
|
126
|
+
|
127
|
+
\|\| { return COMPOSE; }
|
128
|
+
"<=>" { yylval.type = twol_both; return ARROW; }
|
129
|
+
"=>" { yylval.type = twol_right;return ARROW; }
|
130
|
+
"<=" { yylval.type = twol_left; return ARROW; }
|
131
|
+
"^->" { yylval.rtype = repl_up; return REPLACE; }
|
132
|
+
"_->" { yylval.rtype = repl_down; return REPLACE; }
|
133
|
+
"/->" { yylval.rtype = repl_right;return REPLACE; }
|
134
|
+
"\\->" { yylval.rtype = repl_left; return REPLACE; }
|
135
|
+
">>" { return PRINT; }
|
136
|
+
"<<" { return INSERT; }
|
137
|
+
"__" { return POS; }
|
138
|
+
"^_" { return REV; }
|
139
|
+
|
140
|
+
[.,{}\[\]()&!?|*+:=_\^\-] { return yytext[0]; }
|
141
|
+
|
142
|
+
\$=({C3}|(\\.))+\$ { yylval.name = fst_strdup(yytext); return RVAR; }
|
143
|
+
|
144
|
+
\$({C3}|(\\.))+\$ { yylval.name = fst_strdup(yytext); return VAR; }
|
145
|
+
|
146
|
+
#=({C4}|(\\.))+# { yylval.name = fst_strdup(yytext); return RSVAR; }
|
147
|
+
|
148
|
+
#({C4}|(\\.))+# { yylval.name = fst_strdup(yytext); return SVAR; }
|
149
|
+
|
150
|
+
\<({C5}|\\.)*\> { yylval.name = unquote(yytext,false); return SYMBOL; }
|
151
|
+
|
152
|
+
\"<{FN}+>\" {
|
153
|
+
yylval.value = unquote(yytext)+1;
|
154
|
+
yylval.value[strlen(yylval.value)-1] = 0;
|
155
|
+
return STRING2;
|
156
|
+
}
|
157
|
+
|
158
|
+
\"{FN}+\" {
|
159
|
+
yylval.value = unquote(yytext);
|
160
|
+
return STRING;
|
161
|
+
}
|
162
|
+
|
163
|
+
[ \t] { /* ignored */ }
|
164
|
+
\\[ \t]*([ \t]\%.*)?\r?\n { print_lineno(); /* ignored */ }
|
165
|
+
\r?\n { print_lineno(); return NEWLINE; }
|
166
|
+
|
167
|
+
\\[0-9]+         { /* character given by its decimal code, e.g. \228 */
                   long l=atol(yytext+1);
                   /* 1114111 = 0x10FFFF is the last Unicode code point;
                      the former bound 1114112 let one invalid value pass */
                   if (l <= 1114111) { yylval.uchar=l; return CHARACTER; }
                   yyerror("invalid expression");
                 }
|
171
|
+
|
172
|
+
|
173
|
+
\\. { yylval.value=fst_strdup(yytext+1); return UTF8CHAR; }
|
174
|
+
[\x00-\x7f] { yylval.value=fst_strdup(yytext); return UTF8CHAR; }
|
175
|
+
[\xc0-\xdf]{CC} { yylval.value=fst_strdup(yytext); return UTF8CHAR; }
|
176
|
+
[\xe0-\xef]{CC}{2} { yylval.value=fst_strdup(yytext); return UTF8CHAR; }
|
177
|
+
[\xf0-\xff]{CC}{3} { yylval.value=fst_strdup(yytext); return UTF8CHAR; }
|
178
|
+
|
179
|
+
%%
|
@@ -0,0 +1,146 @@
|
|
1
|
+
|
2
|
+
/*******************************************************************/
|
3
|
+
/* */
|
4
|
+
/* File: utf8.C */
|
5
|
+
/* Author: Helmut Schmid */
|
6
|
+
/* Purpose: */
|
7
|
+
/* Created: Mon Sep 5 17:49:16 2005 */
|
8
|
+
/* Modified: Mon Mar 3 11:00:53 2008 (schmid) */
|
9
|
+
/* */
|
10
|
+
/*******************************************************************/
|
11
|
+
|
12
|
+
#include "string.h"
|
13
|
+
|
14
|
+
#include "utf8.h"
|
15
|
+
|
16
|
+
const unsigned char get3LSbits=7;
|
17
|
+
const unsigned char get4LSbits=15;
|
18
|
+
const unsigned char get5LSbits=31;
|
19
|
+
const unsigned char get6LSbits=63;
|
20
|
+
|
21
|
+
const unsigned char set1MSbits=128;
|
22
|
+
const unsigned char set2MSbits=192;
|
23
|
+
const unsigned char set3MSbits=224;
|
24
|
+
const unsigned char set4MSbits=240;
|
25
|
+
|
26
|
+
|
27
|
+
|
28
|
+
/*******************************************************************/
|
29
|
+
/* */
|
30
|
+
/* int2utf8 */
|
31
|
+
/* */
|
32
|
+
/*******************************************************************/
|
33
|
+
|
34
|
+
char *int2utf8( unsigned int sym )

{
  /* Encodes the character code sym as a NUL-terminated UTF-8 string.
     The result lives in a static buffer that every call overwrites.
     Codes that need more than 21 bits yield NULL. */
  static unsigned char ch[5];
  enum { TAIL_BITS = 0x3F,    /* payload of a continuation byte */
         TAIL_TAG  = 0x80 };  /* 10xxxxxx */
  unsigned int lead_tag;      /* marker bits of the first byte  */
  int tail_count;             /* number of continuation bytes   */

  if (sym < 128) {            /* 7 bits:        0xxxxxxx */
    lead_tag = 0;
    tail_count = 0;
  }
  else if (sym < 2048) {      /* 5+6 bits:      110xxxxx */
    lead_tag = 0xC0;
    tail_count = 1;
  }
  else if (sym < 65536) {     /* 4+6+6 bits:    1110xxxx */
    lead_tag = 0xE0;
    tail_count = 2;
  }
  else if (sym < 2097152) {   /* 3+6+6+6 bits:  11110xxx */
    lead_tag = 0xF0;
    tail_count = 3;
  }
  else
    return NULL;

  /* fill in the bytes back to front, six payload bits at a time */
  ch[tail_count + 1] = 0;
  for( int pos = tail_count; pos > 0; pos-- ) {
    ch[pos] = (unsigned char)((sym & TAIL_BITS) | TAIL_TAG);
    sym >>= 6;
  }
  ch[0] = (unsigned char)(sym | lead_tag);

  return (char*)ch;
}
|
74
|
+
|
75
|
+
|
76
|
+
/*******************************************************************/
|
77
|
+
/* */
|
78
|
+
/* utf8toint */
|
79
|
+
/* */
|
80
|
+
/*******************************************************************/
|
81
|
+
|
82
|
+
unsigned int utf8toint( char **s )

{
  /* Decodes the UTF-8 sequence that starts at *s and returns its
     character code.

     On success *s is advanced to the byte after the sequence.
     On error 0 is returned: for an invalid first byte *s is left
     unchanged, for an invalid continuation byte *s is left pointing at
     the offending byte.

     Note that 0 is also the regular result for a NUL byte, in which
     case *s is advanced past that NUL. */
  enum { TAIL_TAG    = 0x80,    /* 10xxxxxx  continuation byte         */
         LEAD2_TAG   = 0xC0,    /* 110xxxxx  start of a 2-byte symbol  */
         LEAD3_TAG   = 0xE0,    /* 1110xxxx  start of a 3-byte symbol  */
         LEAD4_TAG   = 0xF0,    /* 11110xxx  start of a 4-byte symbol  */
         BAD_LEAD    = 0xF8 };  /* 11111xxx  never valid in UTF-8      */
  int bytes_to_come;
  unsigned int result=0;
  unsigned char c=(unsigned char)**s;

  if (c >= BAD_LEAD)            // 11111xxx
    return 0; // error: such a byte cannot start a sequence

  else if (c >= LEAD4_TAG) {    // 11110xxx
    // start of a four-byte symbol
    bytes_to_come = 3;
    result = c & 0x07;
  }

  else if (c >= LEAD3_TAG) {    // 1110xxxx
    // start of a three-byte symbol
    bytes_to_come = 2;
    result = c & 0x0F;
  }

  else if (c >= LEAD2_TAG) {    // 110xxxxx
    // start of a two-byte symbol
    bytes_to_come = 1;
    result = c & 0x1F;
  }

  else if (c < TAIL_TAG) {      // 0xxxxxxx
    // one-byte symbol
    bytes_to_come = 0;
    result = c;
  }

  else
    return 0; // error: continuation byte without a start byte

  while (bytes_to_come > 0) {
    bytes_to_come--;
    (*s)++;
    c = (unsigned char)**s;
    if (c < LEAD2_TAG && c >= TAIL_TAG)   // 10xxxxxx
      result = (result << 6) | (c & 0x3F);
    else
      return 0; // error: sequence is truncated or malformed
  }

  (*s)++;
  return result;
}
|
131
|
+
|
132
|
+
|
133
|
+
/*******************************************************************/
|
134
|
+
/* */
|
135
|
+
/* utf8toint */
|
136
|
+
/* */
|
137
|
+
/*******************************************************************/
|
138
|
+
|
139
|
+
unsigned int utf8toint( char *s )
|
140
|
+
|
141
|
+
{
|
142
|
+
unsigned int result = utf8toint( &s );
|
143
|
+
if (*s == 0) // all bytes converted?
|
144
|
+
return result;
|
145
|
+
return 0;
|
146
|
+
}
|
@@ -0,0 +1,19 @@
|
|
1
|
+
|
2
|
+
/*******************************************************************/
|
3
|
+
/* */
|
4
|
+
/* File: utf8.h */
|
5
|
+
/* Author: Helmut Schmid */
|
6
|
+
/* Purpose: */
|
7
|
+
/* Created: Mon Sep 5 17:49:16 2005 */
|
8
|
+
/* Modified: Mon Apr 7 08:26:39 2008 (schmid) */
|
9
|
+
/* */
|
10
|
+
/*******************************************************************/
|
11
|
+
|
12
|
+
#ifndef _UTF8_H_
|
13
|
+
#define _UTF8_H_
|
14
|
+
|
15
|
+
unsigned int utf8toint( char *s );
|
16
|
+
unsigned int utf8toint( char **s );
|
17
|
+
char *int2utf8( unsigned int );
|
18
|
+
|
19
|
+
#endif
|
data/lib/sfst.rb
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# sfst.rb - SFST interface
|
4
|
+
#
|
5
|
+
# Written by Marius L. Jøhndal, 2008.
|
6
|
+
#
|
7
|
+
require 'sfst_machine'
|
8
|
+
|
9
|
+
module SFST
|
10
|
+
# Compiles the SFST transducer +source+ and stores the result as +machine+.
#
# ==== Options
# compact:: If truthy, a compact transducer is compiled; otherwise a
#           regular one.
def self.compile(source, machine, options = {})
  if options[:compact]
    _compile_compact(source, machine)
  else
    _compile_regular(source, machine)
  end
end
|
21
|
+
|
22
|
+
# A regular (i.e. non-compact) transducer. All work is delegated to a
# RegularTransducerMachine created from the given file.
class RegularTransducer
  def initialize(file)
    @fst = RegularTransducerMachine.new(file)
  end

  # Analyses +string+ and returns an array with one entry per analysis
  # yielded by the machine. The array is empty if nothing is yielded.
  #
  # ==== Options
  # * +symbol_sequence+ - Return each analysis as an array of its elements
  #   instead of one joined string. Elements of the form <tt><name></tt>
  #   are converted to the Ruby symbol <tt>:name</tt>; all other elements
  #   are left as they are.
  def analyze(string, options = {})
    analyses = []
    @fst._analyze(string) do |elements|
      analyses <<
        if options[:symbol_sequence]
          elements.map do |element|
            bracketed = element.match(/^<(.*)>$/)
            bracketed ? bracketed[1].to_sym : element
          end
        else
          elements.join
        end
    end
    analyses
  end

  # Checks if +string+ is accepted for analysis. The answer is whatever
  # the machine returns when asked to analyse without a block.
  def accepted_analysis?(string)
    @fst._analyze(string)
  end

  # Generates from +string+ and returns an array with one joined string
  # per result yielded by the machine. The array is empty if nothing is
  # yielded.
  def generate(string)
    generated = []
    @fst._generate(string) do |elements|
      generated << elements.join
    end
    generated
  end

  # Checks if +string+ is accepted for generating. The answer is whatever
  # the machine returns when asked to generate without a block.
  def accepted_generating?(string)
    @fst._generate(string)
  end

  # Generates the upper level, the lower level or both, passing the given
  # block on to the machine. This only works with non-compact transducers.
  #
  # ==== Options
  # * +levels+ - <tt>:upper</tt>, <tt>:lower</tt> or <tt>:both</tt>.
  #   Default is <tt>:both</tt>.
  # * +epsilons+ - if truthy, epsilons are produced. Default is +false+.
  def generate_language(options = {}, &block)
    levels = options[:levels] || :both
    epsilon_mode = options[:epsilons] ? :all : :noepsilons
    @fst._generate_language(levels, epsilon_mode, &block)
  end

  alias_method :analyse, :analyze
end
|
77
|
+
|
78
|
+
# A compact transducer. The class inherits from CompactTransducerMachine,
# but every method below works on a separate machine instance that is
# created from the given file and kept in <tt>@fst</tt>.
class CompactTransducer < CompactTransducerMachine
  def initialize(file)
    @fst = CompactTransducerMachine.new(file)
  end

  # Checks if +string+ is accepted for analysis. The answer is whatever
  # the machine returns when asked to analyse without a block.
  def accepted_analysis?(string)
    @fst._analyze(string)
  end

  # Analyses +form+ and returns an array holding every analysis yielded
  # by the machine, unmodified. The array is empty if nothing is yielded.
  def analyze(form)
    analyses = []
    @fst._analyze(form) do |analysis|
      analyses << analysis
    end
    analyses
  end

  alias_method :analyse, :analyze
end
|
99
|
+
end
|
data/ruby-sfst.gemspec
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# Gem specification for ruby-sfst.
Gem::Specification.new do |s|
  s.name = %q{ruby-sfst}
  s.version = "0.1.0"

  # ">= 1.2" rather than "= 1.2": an exact pin rejects every RubyGems
  # release other than 1.2 itself.
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Marius L. J\303\270hndal"]
  s.date = %q{2008-08-04}
  s.description = %q{A wrapper for the Stuttgart Finite State Transducer Tools (SFST).}
  s.email = %q{mariuslj (at) ifi [dot] uio (dot) no}
  s.extensions = ["ext/sfst_machine/extconf.rb"]
  s.extra_rdoc_files = ["README.rdoc", "lib/sfst.rb"]
  s.files = ["README.rdoc", "Rakefile", "Manifest", "test/test_sfst.rb", "test/test_sfst.fst", "CHANGELOG", "ext/sfst_machine/fst-compiler.h", "ext/sfst_machine/utf8.C", "ext/sfst_machine/operators.C", "ext/sfst_machine/utf8-scanner.ll", "ext/sfst_machine/determinise.C", "ext/sfst_machine/interface.C", "ext/sfst_machine/compact.h", "ext/sfst_machine/basic.h", "ext/sfst_machine/fst.h", "ext/sfst_machine/make-compact.h", "ext/sfst_machine/fst-compiler.yy", "ext/sfst_machine/mem.h", "ext/sfst_machine/compact.C", "ext/sfst_machine/basic.C", "ext/sfst_machine/interface.h", "ext/sfst_machine/sfst_machine.cc", "ext/sfst_machine/extconf.rb", "ext/sfst_machine/alphabet.C", "ext/sfst_machine/fst.C", "ext/sfst_machine/alphabet.h", "ext/sfst_machine/make-compact.C", "ext/sfst_machine/fst-compiler.C", "ext/sfst_machine/utf8.h", "ext/sfst_machine/utf8-scanner.C", "lib/sfst.rb", "ruby-sfst.gemspec"]
  s.has_rdoc = true
  s.homepage = %q{http://github.com/mlj/ruby-sfst}
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Ruby-sfst", "--main", "README.rdoc"]
  s.require_paths = ["lib", "ext"]
  s.rubyforge_project = %q{sfst}
  s.rubygems_version = %q{1.2.0}
  s.summary = %q{A wrapper for the Stuttgart Finite State Transducer Tools (SFST).}
  s.test_files = ["test/test_sfst.rb"]

  # echoe is declared as a development dependency where the running
  # RubyGems supports that, and as a plain dependency otherwise.
  if s.respond_to? :specification_version then
    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
    s.specification_version = 2

    if current_version >= 3 then
      s.add_development_dependency(%q<echoe>, [">= 0"])
    else
      s.add_dependency(%q<echoe>, [">= 0"])
    end
  else
    s.add_dependency(%q<echoe>, [">= 0"])
  end
end
|
data/test/test_sfst.fst
ADDED
data/test/test_sfst.rb
ADDED
@@ -0,0 +1,119 @@
|
|
1
|
+
require 'sfst'
|
2
|
+
require 'test/unit'
|
3
|
+
|
4
|
+
TEST_DIRECTORY = File.expand_path(File.dirname(__FILE__))
|
5
|
+
TEST_SCRIPT_FILE = File.join(TEST_DIRECTORY, 'test_sfst.fst')
|
6
|
+
TEST_COMPILED_FILE = File.join(TEST_DIRECTORY, 'test_sfst.a')
|
7
|
+
TEST_COMPILED_COMPACT_FILE = File.join(TEST_DIRECTORY, 'test_sfst_compact.a')
|
8
|
+
TEST_COMPILED_REGULAR_FILE = File.join(TEST_DIRECTORY, 'test_sfst_regular.a')
|
9
|
+
|
10
|
+
# Smoke tests for SFST.compile: each variant only has to finish without
# raising.
class SFSTTestCase < Test::Unit::TestCase
  # Regular compilation, once by default and once requested explicitly.
  def test_sfst_compile_regular
    SFST.compile(TEST_SCRIPT_FILE, TEST_COMPILED_FILE)
    SFST.compile(TEST_SCRIPT_FILE, TEST_COMPILED_FILE, { :compact => false })
  end

  # Compact compilation.
  def test_sfst_compile_compact
    SFST.compile(TEST_SCRIPT_FILE, TEST_COMPILED_FILE, { :compact => true })
  end
end
|
20
|
+
|
21
|
+
# Tests for SFST::RegularTransducer. The test script is compiled afresh
# before every test, and a new transducer is loaded for each check that
# used its own instance.
class RegularTransducerTestCase < Test::Unit::TestCase
  def setup
    SFST.compile(TEST_SCRIPT_FILE, TEST_COMPILED_REGULAR_FILE, { :compact => false })
  end

  def test_analyze_acceptance
    machine = load_machine
    assert_equal true, machine.accepted_analysis?('foo')
    assert_equal false, machine.accepted_analysis?('fox')
  end

  def test_analyze
    assert_equal ['bar', 'baz'], load_machine.analyse('foo').sort

    assert_equal [], load_machine.analyse('fox').sort
  end

  def test_analyze_symbol_sequence
    expected = [['b', 'a', 'r'], ['b', 'a', 'z']]
    assert_equal expected, load_machine.analyse('foo', :symbol_sequence => true).sort

    assert_equal [], load_machine.analyse('fox', :symbol_sequence => true).sort
  end

  def test_generate_acceptance
    machine = load_machine
    assert_equal true, machine.accepted_generating?('bar')
    assert_equal true, machine.accepted_generating?('baz')
    assert_equal false, machine.accepted_generating?('bax')
  end

  def test_generate
    assert_equal ['foo'], load_machine.generate('bar').sort

    assert_equal ['foo'], load_machine.generate('baz').sort

    assert_equal [], load_machine.generate('bax').sort
  end

  def test_generate_language_default
    collected = []
    load_machine.generate_language do |pairs|
      collected << pairs.collect { |pair| pair.join(':') }.join
    end
    assert_equal ['b:fa:or:o', 'b:fa:oz:o'], collected.sort
  end

  def test_generate_language_both
    collected = []
    load_machine.generate_language(:levels => :both) do |pairs|
      collected << pairs.collect { |pair| pair.join(':') }.join
    end
    assert_equal ['b:fa:or:o', 'b:fa:oz:o'], collected.sort
  end

  def test_generate_language_upper
    collected = []
    load_machine.generate_language(:levels => :upper) do |symbols|
      collected << symbols.join
    end
    assert_equal ['foo'], collected.sort
  end

  def test_generate_language_lower
    collected = []
    load_machine.generate_language(:levels => :lower) do |symbols|
      collected << symbols.join
    end
    assert_equal ['bar', 'baz'], collected.sort
  end

  private

  # Loads a new transducer from the file written by +setup+.
  def load_machine
    SFST::RegularTransducer.new(TEST_COMPILED_REGULAR_FILE)
  end
end
|
102
|
+
|
103
|
+
# Tests for SFST::CompactTransducer. The test script is compiled in
# compact form before every test.
class CompactTransducerTestCase < Test::Unit::TestCase
  def setup
    SFST.compile(TEST_SCRIPT_FILE, TEST_COMPILED_COMPACT_FILE, { :compact => true })
  end

  def test_analyze_acceptance
    machine = load_machine
    assert_equal true, machine.accepted_analysis?('foo')
    assert_equal false, machine.accepted_analysis?('fox')
  end

  def test_analyze
    machine = load_machine
    assert_equal ['bar', 'baz'], machine.analyse('foo').sort
    assert_equal [], machine.analyse('fox').sort
  end

  private

  # Loads a new transducer from the file written by +setup+.
  def load_machine
    SFST::CompactTransducer.new(TEST_COMPILED_COMPACT_FILE)
  end
end
|