ruby-sfst 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +1 -0
- data/Manifest +31 -0
- data/README.rdoc +25 -0
- data/Rakefile +22 -0
- data/ext/sfst_machine/alphabet.C +807 -0
- data/ext/sfst_machine/alphabet.h +281 -0
- data/ext/sfst_machine/basic.C +84 -0
- data/ext/sfst_machine/basic.h +24 -0
- data/ext/sfst_machine/compact.C +616 -0
- data/ext/sfst_machine/compact.h +98 -0
- data/ext/sfst_machine/determinise.C +304 -0
- data/ext/sfst_machine/extconf.rb +4 -0
- data/ext/sfst_machine/fst-compiler.C +2375 -0
- data/ext/sfst_machine/fst-compiler.h +113 -0
- data/ext/sfst_machine/fst-compiler.yy +213 -0
- data/ext/sfst_machine/fst.C +966 -0
- data/ext/sfst_machine/fst.h +365 -0
- data/ext/sfst_machine/interface.C +1838 -0
- data/ext/sfst_machine/interface.h +94 -0
- data/ext/sfst_machine/make-compact.C +328 -0
- data/ext/sfst_machine/make-compact.h +34 -0
- data/ext/sfst_machine/mem.h +74 -0
- data/ext/sfst_machine/operators.C +1131 -0
- data/ext/sfst_machine/sfst_machine.cc +411 -0
- data/ext/sfst_machine/utf8-scanner.C +2197 -0
- data/ext/sfst_machine/utf8-scanner.ll +179 -0
- data/ext/sfst_machine/utf8.C +146 -0
- data/ext/sfst_machine/utf8.h +19 -0
- data/lib/sfst.rb +99 -0
- data/ruby-sfst.gemspec +34 -0
- data/test/test_sfst.fst +3 -0
- data/test/test_sfst.rb +119 -0
- metadata +100 -0
@@ -0,0 +1,179 @@
|
|
1
|
+
%option 8Bit batch yylineno noyywrap
|
2
|
+
|
3
|
+
/* the "incl" state is used to pick up the name of an include file */
|
4
|
+
%x incl
|
5
|
+
|
6
|
+
%{
|
7
|
+
/*******************************************************************/
|
8
|
+
/* */
|
9
|
+
/* FILE scanner.ll */
|
10
|
+
/* MODULE scanner */
|
11
|
+
/* PROGRAM SFST */
|
12
|
+
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
|
13
|
+
/* */
|
14
|
+
/*******************************************************************/
|
15
|
+
|
16
|
+
#include <string.h>
|
17
|
+
|
18
|
+
#include "interface.h"
|
19
|
+
#include "fst-compiler.h"
|
20
|
+
|
21
|
+
#define MAX_INCLUDE_DEPTH 10
|
22
|
+
|
23
|
+
int Include_Stack_Ptr = 0;
|
24
|
+
YY_BUFFER_STATE Include_Stack[MAX_INCLUDE_DEPTH];
|
25
|
+
char *Name_Stack[MAX_INCLUDE_DEPTH];
|
26
|
+
int Lineno_Stack[MAX_INCLUDE_DEPTH];
|
27
|
+
|
28
|
+
char *FileName;
|
29
|
+
|
30
|
+
bool UTF8=true;
|
31
|
+
|
32
|
+
static char *unquote(char *string, bool del_quote=true) {
|
33
|
+
char *s=string, *result=string;
|
34
|
+
if (del_quote)
|
35
|
+
string++;
|
36
|
+
|
37
|
+
while (*string) {
|
38
|
+
if (*string == '\\')
|
39
|
+
string++;
|
40
|
+
*(s++) = *(string++);
|
41
|
+
}
|
42
|
+
|
43
|
+
if (del_quote)
|
44
|
+
s--;
|
45
|
+
*s = '\0';
|
46
|
+
|
47
|
+
return fst_strdup(result);
|
48
|
+
}
|
49
|
+
|
50
|
+
static void print_lineno() {
|
51
|
+
if (!Verbose)
|
52
|
+
return;
|
53
|
+
fputc('\r',stderr);
|
54
|
+
for( int i=0; i<Include_Stack_Ptr; i++ )
|
55
|
+
fputs(" ", stderr);
|
56
|
+
fprintf(stderr,"%s: %d", FileName, yylineno);
|
57
|
+
}
|
58
|
+
|
59
|
+
extern void yyerror(char *text);
|
60
|
+
|
61
|
+
%}
|
62
|
+
|
63
|
+
CC [\x80-\xbf]
|
64
|
+
C1 [A-Za-z0-9._/\-]
|
65
|
+
C2 [A-Za-z0-9._/\-&()+,=?\^|~]
|
66
|
+
C3 [A-Za-z0-9._/\-&()+,=?\^|~#<>]
|
67
|
+
C4 [A-Za-z0-9._/\-&()+,=?\^|~$<>]
|
68
|
+
C5 [\!-;\?-\[\]-\~=]
|
69
|
+
FN [A-Za-z0-9._/\-*+]
|
70
|
+
|
71
|
+
%%
|
72
|
+
|
73
|
+
#include            { /* the file name follows; it is read in the "incl" state */
                      BEGIN(incl);
                    }
<incl>[ \t]*        /* eat the whitespace */
<incl>{FN}+         { error2("Missing quotes",yytext); }
<incl>\"{FN}+\"     { /* got the include file name */
                      FILE *file;
                      char *name=unquote(yytext);
                      if ( Include_Stack_Ptr >= MAX_INCLUDE_DEPTH )
                        {
                          fprintf( stderr, "Includes nested too deeply" );
                          exit( 1 );
                        }
                      if (Verbose) fputc('\n', stderr);
                      file = fopen( name, "rt" );
                      if (!file)
                        error2("Can't open include file",name);
                      else
                        {
                          /* save file name, line number and buffer of the
                             including file so that <<EOF>> can resume it */
                          Name_Stack[Include_Stack_Ptr] = FileName;
                          FileName = name;
                          Lineno_Stack[Include_Stack_Ptr] = yylineno;
                          yylineno = 1;
                          Include_Stack[Include_Stack_Ptr++]=YY_CURRENT_BUFFER;
                          /* yyin has to refer to the new file BEFORE the
                             buffer is created: the buffer records the FILE*
                             it is created with, and yy_switch_to_buffer()
                             reloads yyin from that record. Creating it with
                             the old yyin made a nested include restore the
                             wrong file on return. */
                          yyin = file;
                          yy_switch_to_buffer(
                            yy_create_buffer(yyin, YY_BUF_SIZE));
                          print_lineno();
                          BEGIN(INITIAL);
                        }
                    }
<<EOF>>             {
                      if (Verbose)
                        fputc('\n', stderr);
                      if ( Include_Stack_Ptr <= 0 )
                        {
                          /* end of the top-level input; leave the stack
                             pointer at 0 (not -1) so that a later scan in
                             the same process starts with valid indices */
                          Include_Stack_Ptr = 0;
                          yyterminate();
                        }
                      else
                        {
                          /* end of an include file: pop the saved state */
                          FILE *finished = yyin;
                          Include_Stack_Ptr--;
                          free(FileName);
                          FileName = Name_Stack[Include_Stack_Ptr];
                          yylineno = Lineno_Stack[Include_Stack_Ptr];
                          yy_delete_buffer( YY_CURRENT_BUFFER );
                          yy_switch_to_buffer(Include_Stack[Include_Stack_Ptr]);
                          /* the include file was opened by the rule above
                             and is not needed any more */
                          fclose(finished);
                        }
                    }
|
116
|
+
|
117
|
+
|
118
|
+
^[ \t]*\%.*\r?\n { print_lineno(); /* ignore comments */ }
|
119
|
+
|
120
|
+
\%.*\\[ \t]*\r?\n { print_lineno(); /* ignore comments */ }
|
121
|
+
|
122
|
+
\%.* { /* ignore comments */ }
|
123
|
+
|
124
|
+
|
125
|
+
^[ \t]*ALPHABET[ \t]*= { return ALPHA; }
|
126
|
+
|
127
|
+
\|\| { return COMPOSE; }
|
128
|
+
"<=>" { yylval.type = twol_both; return ARROW; }
|
129
|
+
"=>" { yylval.type = twol_right;return ARROW; }
|
130
|
+
"<=" { yylval.type = twol_left; return ARROW; }
|
131
|
+
"^->" { yylval.rtype = repl_up; return REPLACE; }
|
132
|
+
"_->" { yylval.rtype = repl_down; return REPLACE; }
|
133
|
+
"/->" { yylval.rtype = repl_right;return REPLACE; }
|
134
|
+
"\\->" { yylval.rtype = repl_left; return REPLACE; }
|
135
|
+
">>" { return PRINT; }
|
136
|
+
"<<" { return INSERT; }
|
137
|
+
"__" { return POS; }
|
138
|
+
"^_" { return REV; }
|
139
|
+
|
140
|
+
[.,{}\[\]()&!?|*+:=_\^\-] { return yytext[0]; }
|
141
|
+
|
142
|
+
\$=({C3}|(\\.))+\$ { yylval.name = fst_strdup(yytext); return RVAR; }
|
143
|
+
|
144
|
+
\$({C3}|(\\.))+\$ { yylval.name = fst_strdup(yytext); return VAR; }
|
145
|
+
|
146
|
+
#=({C4}|(\\.))+# { yylval.name = fst_strdup(yytext); return RSVAR; }
|
147
|
+
|
148
|
+
#({C4}|(\\.))+# { yylval.name = fst_strdup(yytext); return SVAR; }
|
149
|
+
|
150
|
+
\<({C5}|\\.)*\> { yylval.name = unquote(yytext,false); return SYMBOL; }
|
151
|
+
|
152
|
+
\"<{FN}+>\"         { /* quoted file name in angle brackets: strip both */
                      char *name = unquote(yytext);  /* now "<" FN+ ">" */
                      size_t len = strlen(name);     /* >= 3, see pattern */
                      /* Move the name to the start of the allocation
                         instead of returning name+1, so that the pointer
                         handed to the parser is the one obtained from
                         fst_strdup() and remains valid for free(). */
                      memmove(name, name+1, len-2);
                      name[len-2] = 0;
                      yylval.value = name;
                      return STRING2;
                    }
|
157
|
+
|
158
|
+
\"{FN}+\" {
|
159
|
+
yylval.value = unquote(yytext);
|
160
|
+
return STRING;
|
161
|
+
}
|
162
|
+
|
163
|
+
[ \t] { /* ignored */ }
|
164
|
+
\\[ \t]*([ \t]\%.*)?\r?\n { print_lineno(); /* ignored */ }
|
165
|
+
\r?\n { print_lineno(); return NEWLINE; }
|
166
|
+
|
167
|
+
\\[0-9]+            { /* character given by its decimal code */
                      /* strtol() returns LONG_MAX when the digit string
                         overflows, so such input fails the range check
                         below (atol() has no defined result there) */
                      long l=strtol(yytext+1, NULL, 10);
                      /* 0x10FFFF (1114111) is the largest Unicode code
                         point; 1114112 was accepted by mistake */
                      if (l <= 0x10FFFF) { yylval.uchar=l; return CHARACTER; }
                      yyerror("invalid expression");
                    }
|
171
|
+
|
172
|
+
|
173
|
+
\\. { yylval.value=fst_strdup(yytext+1); return UTF8CHAR; }
|
174
|
+
[\x00-\x7f] { yylval.value=fst_strdup(yytext); return UTF8CHAR; }
|
175
|
+
[\xc0-\xdf]{CC} { yylval.value=fst_strdup(yytext); return UTF8CHAR; }
|
176
|
+
[\xe0-\xef]{CC}{2} { yylval.value=fst_strdup(yytext); return UTF8CHAR; }
|
177
|
+
[\xf0-\xff]{CC}{3} { yylval.value=fst_strdup(yytext); return UTF8CHAR; }
|
178
|
+
|
179
|
+
%%
|
@@ -0,0 +1,146 @@
|
|
1
|
+
|
2
|
+
/*******************************************************************/
|
3
|
+
/* */
|
4
|
+
/* File: utf8.C */
|
5
|
+
/* Author: Helmut Schmid */
|
6
|
+
/* Purpose: */
|
7
|
+
/* Created: Mon Sep 5 17:49:16 2005 */
|
8
|
+
/* Modified: Mon Mar 3 11:00:53 2008 (schmid) */
|
9
|
+
/* */
|
10
|
+
/*******************************************************************/
|
11
|
+
|
12
|
+
#include "string.h"
|
13
|
+
|
14
|
+
#include "utf8.h"
|
15
|
+
|
16
|
+
const unsigned char get3LSbits=7;
|
17
|
+
const unsigned char get4LSbits=15;
|
18
|
+
const unsigned char get5LSbits=31;
|
19
|
+
const unsigned char get6LSbits=63;
|
20
|
+
|
21
|
+
const unsigned char set1MSbits=128;
|
22
|
+
const unsigned char set2MSbits=192;
|
23
|
+
const unsigned char set3MSbits=224;
|
24
|
+
const unsigned char set4MSbits=240;
|
25
|
+
|
26
|
+
|
27
|
+
|
28
|
+
/*******************************************************************/
|
29
|
+
/* */
|
30
|
+
/* int2utf8 */
|
31
|
+
/* */
|
32
|
+
/*******************************************************************/
|
33
|
+
|
34
|
+
/*
 * Encodes the value `sym` as a NUL-terminated UTF-8 byte sequence of one
 * to four bytes.
 *
 * Returns a pointer to a static buffer which is overwritten by the next
 * call, or NULL if `sym` does not fit into 21 bits (sym >= 2097152).
 */
char *int2utf8( unsigned int sym )

{
  static unsigned char ch[5];
  unsigned char lead;   /* marker bits of the first byte */
  int len;              /* number of bytes in the encoding */

  if (sym < 0x80)          { len = 1; lead = 0x00; }   /* 7 bits       */
  else if (sym < 0x800)    { len = 2; lead = 0xC0; }   /* 5+6 bits     */
  else if (sym < 0x10000)  { len = 3; lead = 0xE0; }   /* 4+6+6 bits   */
  else if (sym < 0x200000) { len = 4; lead = 0xF0; }   /* 3+6+6+6 bits */
  else
    return NULL;

  ch[len] = 0;

  /* fill the continuation bytes from the end, six payload bits each */
  for (int i = len - 1; i > 0; i--) {
    ch[i] = (unsigned char)((sym & 0x3F) | 0x80);
    sym >>= 6;
  }

  /* the remaining high bits go into the first byte */
  ch[0] = (unsigned char)(sym | lead);

  return (char*)ch;
}
|
74
|
+
|
75
|
+
|
76
|
+
/*******************************************************************/
|
77
|
+
/* */
|
78
|
+
/* utf8toint */
|
79
|
+
/* */
|
80
|
+
/*******************************************************************/
|
81
|
+
|
82
|
+
/*
 * Decodes the UTF-8 symbol starting at *s and returns its value.
 *
 * On success *s is advanced to the first byte after the symbol. This also
 * holds for a NUL byte, which decodes to 0 like any other one-byte symbol,
 * so callers have to test for the end of the string before calling.
 *
 * Returns 0 on malformed input:
 *  - the first byte is a continuation byte (10xxxxxx) or 11111xxx;
 *    *s is left unchanged
 *  - a byte that should be a continuation byte is not one; *s is left
 *    pointing at that byte
 */
unsigned int utf8toint( char **s )

{
  int bytes_to_come;
  unsigned int result;
  unsigned char c=(unsigned char)**s;

  if (c < 0x80) {          // 0xxxxxxx
    // one-byte symbol
    bytes_to_come = 0;
    result = c;
  }

  else if (c < 0xC0)       // 10xxxxxx
    return 0; // a continuation byte cannot start a symbol

  else if (c < 0xE0) {     // 110xxxxx
    // start of a two-byte symbol
    bytes_to_come = 1;
    result = c & 0x1F;
  }

  else if (c < 0xF0) {     // 1110xxxx
    // start of a three-byte symbol
    bytes_to_come = 2;
    result = c & 0x0F;
  }

  else if (c < 0xF8) {     // 11110xxx
    // start of a four-byte symbol
    bytes_to_come = 3;
    result = c & 0x07;
  }

  else                     // 11111xxx
    return 0; // not a valid first byte; used to be decoded like 11110xxx

  while (bytes_to_come > 0) {
    bytes_to_come--;
    (*s)++;
    c = (unsigned char)**s;
    if ((c & 0xC0) != 0x80)   // anything but 10xxxxxx
      return 0;
    result = (result << 6) | (c & 0x3F);
  }

  (*s)++;
  return result;
}
|
131
|
+
|
132
|
+
|
133
|
+
/*******************************************************************/
|
134
|
+
/* */
|
135
|
+
/* utf8toint */
|
136
|
+
/* */
|
137
|
+
/*******************************************************************/
|
138
|
+
|
139
|
+
unsigned int utf8toint( char *s )
|
140
|
+
|
141
|
+
{
|
142
|
+
unsigned int result = utf8toint( &s );
|
143
|
+
if (*s == 0) // all bytes converted?
|
144
|
+
return result;
|
145
|
+
return 0;
|
146
|
+
}
|
@@ -0,0 +1,19 @@
|
|
1
|
+
|
2
|
+
/*******************************************************************/
|
3
|
+
/* */
|
4
|
+
/* File: utf8.h */
|
5
|
+
/* Author: Helmut Schmid */
|
6
|
+
/* Purpose: */
|
7
|
+
/* Created: Mon Sep 5 17:49:16 2005 */
|
8
|
+
/* Modified: Mon Apr 7 08:26:39 2008 (schmid) */
|
9
|
+
/* */
|
10
|
+
/*******************************************************************/
|
11
|
+
|
12
|
+
#ifndef _UTF8_H_
|
13
|
+
#define _UTF8_H_
|
14
|
+
|
15
|
+
unsigned int utf8toint( char *s );
|
16
|
+
unsigned int utf8toint( char **s );
|
17
|
+
char *int2utf8( unsigned int );
|
18
|
+
|
19
|
+
#endif
|
data/lib/sfst.rb
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# sfst.rb - SFST interface
|
4
|
+
#
|
5
|
+
# Written by Marius L. Jøhndal, 2008.
|
6
|
+
#
|
7
|
+
require 'sfst_machine'
|
8
|
+
|
9
|
+
module SFST
  # Compiles the SFST source file +source+ and stores the resulting
  # transducer in the file +machine+.
  #
  # ==== Options
  # compact:: If set, a compact transducer is compiled, otherwise a
  #           regular one.
  def self.compile(source, machine, options = {})
    if options[:compact]
      _compile_compact(source, machine)
    else
      _compile_regular(source, machine)
    end
  end

  # Wrapper around a regular (i.e. non-compact) transducer.
  class RegularTransducer
    # Loads the compiled transducer stored in +file+.
    def initialize(file)
      @fst = RegularTransducerMachine.new(file)
    end

    # Analyses +string+ and returns an array with one entry per analysis.
    # The array is empty if the transducer yields no analysis.
    #
    # ==== Options
    # * +symbol_sequence+ - Return each analysis as an array of symbols
    #   instead of a joined string. Elements of the form +<symbol>+
    #   (multicharacter symbols) are converted to Ruby symbols.
    def analyze(string, options = {})
      analyses = []
      @fst._analyze(string) do |symbols|
        entry =
          if options[:symbol_sequence]
            symbols.map do |sym|
              bracketed = sym.match(/^<(.*)>$/)
              bracketed ? bracketed[1].to_sym : sym
            end
          else
            symbols.join
          end
        analyses << entry
      end
      analyses
    end

    alias_method :analyse, :analyze

    # Checks if +string+ is accepted for analysis. Returns whatever the
    # underlying machine returns when +_analyze+ is called without a block.
    def accepted_analysis?(string)
      @fst._analyze(string)
    end

    # Generates from +string+ and returns an array of the generated
    # strings. The array is empty if the transducer yields nothing.
    def generate(string)
      generated = []
      @fst._generate(string) { |symbols| generated << symbols.join }
      generated
    end

    # Checks if +string+ is accepted for generating. Returns whatever the
    # underlying machine returns when +_generate+ is called without a block.
    def accepted_generating?(string)
      @fst._generate(string)
    end

    # Generates upper or lower level or both, passing +block+ on to the
    # underlying machine. This only works with non-compact transducers.
    #
    # ==== Options
    # * +levels+ - <tt>:upper</tt>, <tt>:lower</tt> or <tt>:both</tt>.
    #   Default is <tt>:both</tt>.
    # * +epsilons+ - if +true+, produces epsilons. Default is +false+.
    def generate_language(options = {}, &block)
      levels = options[:levels] || :both
      epsilon_mode = options[:epsilons] ? :all : :noepsilons
      @fst._generate_language(levels, epsilon_mode, &block)
    end
  end

  # Wrapper around a compact transducer.
  #
  # NOTE(review): this class inherits from CompactTransducerMachine and, in
  # addition, keeps a separate machine in @fst; only @fst is used by the
  # methods below. Confirm whether the superclass is needed at all.
  class CompactTransducer < CompactTransducerMachine
    # Loads the compiled compact transducer stored in +file+.
    def initialize(file)
      @fst = CompactTransducerMachine.new(file)
    end

    # Analyses +form+ and returns an array containing every value yielded
    # by the underlying machine. The array is empty if nothing is yielded.
    def analyze(form)
      analyses = []
      @fst._analyze(form) { |analysis| analyses << analysis }
      analyses
    end

    alias_method :analyse, :analyze

    # Checks if +string+ is accepted for analysis. Returns whatever the
    # underlying machine returns when +_analyze+ is called without a block.
    def accepted_analysis?(string)
      @fst._analyze(string)
    end
  end
end
|
data/ruby-sfst.gemspec
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# Gem specification for ruby-sfst.
Gem::Specification.new do |s|
  s.name = %q{ruby-sfst}
  s.version = "0.1.0"

  # ">= 1.2" rather than "= 1.2": an exact pin rejects every RubyGems
  # release other than 1.2.
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Marius L. J\303\270hndal"]
  s.date = %q{2008-08-04}
  s.description = %q{A wrapper for the Stuttgart Finite State Transducer Tools (SFST).}
  s.email = %q{mariuslj (at) ifi [dot] uio (dot) no}
  s.extensions = ["ext/sfst_machine/extconf.rb"]
  s.extra_rdoc_files = ["README.rdoc", "lib/sfst.rb"]
  s.files = ["README.rdoc", "Rakefile", "Manifest", "test/test_sfst.rb", "test/test_sfst.fst", "CHANGELOG", "ext/sfst_machine/fst-compiler.h", "ext/sfst_machine/utf8.C", "ext/sfst_machine/operators.C", "ext/sfst_machine/utf8-scanner.ll", "ext/sfst_machine/determinise.C", "ext/sfst_machine/interface.C", "ext/sfst_machine/compact.h", "ext/sfst_machine/basic.h", "ext/sfst_machine/fst.h", "ext/sfst_machine/make-compact.h", "ext/sfst_machine/fst-compiler.yy", "ext/sfst_machine/mem.h", "ext/sfst_machine/compact.C", "ext/sfst_machine/basic.C", "ext/sfst_machine/interface.h", "ext/sfst_machine/sfst_machine.cc", "ext/sfst_machine/extconf.rb", "ext/sfst_machine/alphabet.C", "ext/sfst_machine/fst.C", "ext/sfst_machine/alphabet.h", "ext/sfst_machine/make-compact.C", "ext/sfst_machine/fst-compiler.C", "ext/sfst_machine/utf8.h", "ext/sfst_machine/utf8-scanner.C", "lib/sfst.rb", "ruby-sfst.gemspec"]
  s.has_rdoc = true
  s.homepage = %q{http://github.com/mlj/ruby-sfst}
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Ruby-sfst", "--main", "README.rdoc"]
  s.require_paths = ["lib", "ext"]
  s.rubyforge_project = %q{sfst}
  s.rubygems_version = %q{1.2.0}
  s.summary = %q{A wrapper for the Stuttgart Finite State Transducer Tools (SFST).}
  s.test_files = ["test/test_sfst.rb"]

  # echoe is declared as a development dependency where the running
  # RubyGems supports that, and as a plain dependency otherwise.
  if s.respond_to? :specification_version then
    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
    s.specification_version = 2

    if current_version >= 3 then
      s.add_development_dependency(%q<echoe>, [">= 0"])
    else
      s.add_dependency(%q<echoe>, [">= 0"])
    end
  else
    s.add_dependency(%q<echoe>, [">= 0"])
  end
end
|
data/test/test_sfst.fst
ADDED
data/test/test_sfst.rb
ADDED
@@ -0,0 +1,119 @@
|
|
1
|
+
require 'sfst'
|
2
|
+
require 'test/unit'
|
3
|
+
|
4
|
+
TEST_DIRECTORY = File.expand_path(File.dirname(__FILE__))
|
5
|
+
TEST_SCRIPT_FILE = File.join(TEST_DIRECTORY, 'test_sfst.fst')
|
6
|
+
TEST_COMPILED_FILE = File.join(TEST_DIRECTORY, 'test_sfst.a')
|
7
|
+
TEST_COMPILED_COMPACT_FILE = File.join(TEST_DIRECTORY, 'test_sfst_compact.a')
|
8
|
+
TEST_COMPILED_REGULAR_FILE = File.join(TEST_DIRECTORY, 'test_sfst_regular.a')
|
9
|
+
|
10
|
+
# Checks that SFST.compile runs for both transducer kinds.
class SFSTTestCase < Test::Unit::TestCase
  # Regular compilation: once without options, once with :compact => false.
  def test_sfst_compile_regular
    SFST.compile(TEST_SCRIPT_FILE, TEST_COMPILED_FILE)
    SFST.compile(TEST_SCRIPT_FILE, TEST_COMPILED_FILE, :compact => false)
  end

  # Compact compilation.
  def test_sfst_compile_compact
    SFST.compile(TEST_SCRIPT_FILE, TEST_COMPILED_FILE, :compact => true)
  end
end
|
20
|
+
|
21
|
+
# Tests for SFST::RegularTransducer, run against a freshly compiled
# regular transducer.
class RegularTransducerTestCase < Test::Unit::TestCase
  # Compiles the test script before every test.
  def setup
    SFST.compile(TEST_SCRIPT_FILE, TEST_COMPILED_REGULAR_FILE, :compact => false)
  end

  def test_analyze_acceptance
    fst = load_transducer
    assert_equal(true, fst.accepted_analysis?('foo'))
    assert_equal(false, fst.accepted_analysis?('fox'))
  end

  def test_analyze
    assert_equal(['bar', 'baz'], load_transducer.analyse('foo').sort)
    assert_equal([], load_transducer.analyse('fox').sort)
  end

  def test_analyze_symbol_sequence
    expected = [['b', 'a', 'r'], ['b', 'a', 'z']]
    assert_equal(expected, load_transducer.analyse('foo', :symbol_sequence => true).sort)
    assert_equal([], load_transducer.analyse('fox', :symbol_sequence => true).sort)
  end

  def test_generate_acceptance
    fst = load_transducer
    assert_equal(true, fst.accepted_generating?('bar'))
    assert_equal(true, fst.accepted_generating?('baz'))
    assert_equal(false, fst.accepted_generating?('bax'))
  end

  def test_generate
    assert_equal(['foo'], load_transducer.generate('bar').sort)
    assert_equal(['foo'], load_transducer.generate('baz').sort)
    assert_equal([], load_transducer.generate('bax').sort)
  end

  def test_generate_language_default
    pairs = []
    load_transducer.generate_language do |path|
      pairs << path.collect { |pair| pair.join(':') }.join
    end
    assert_equal(['b:fa:or:o', 'b:fa:oz:o'], pairs.sort)
  end

  def test_generate_language_both
    pairs = []
    load_transducer.generate_language(:levels => :both) do |path|
      pairs << path.collect { |pair| pair.join(':') }.join
    end
    assert_equal(['b:fa:or:o', 'b:fa:oz:o'], pairs.sort)
  end

  def test_generate_language_upper
    words = []
    load_transducer.generate_language(:levels => :upper) do |path|
      words << path.join
    end
    assert_equal(['foo'], words.sort)
  end

  def test_generate_language_lower
    words = []
    load_transducer.generate_language(:levels => :lower) do |path|
      words << path.join
    end
    assert_equal(['bar', 'baz'], words.sort)
  end

  private

  # Returns a new transducer loaded from the file written by #setup.
  def load_transducer
    SFST::RegularTransducer.new(TEST_COMPILED_REGULAR_FILE)
  end
end
|
102
|
+
|
103
|
+
# Tests for SFST::CompactTransducer, run against a freshly compiled
# compact transducer.
class CompactTransducerTestCase < Test::Unit::TestCase
  # Compiles the test script before every test.
  def setup
    SFST.compile(TEST_SCRIPT_FILE, TEST_COMPILED_COMPACT_FILE, :compact => true)
  end

  def test_analyze_acceptance
    compact = SFST::CompactTransducer.new(TEST_COMPILED_COMPACT_FILE)
    assert_equal(true, compact.accepted_analysis?('foo'))
    assert_equal(false, compact.accepted_analysis?('fox'))
  end

  def test_analyze
    compact = SFST::CompactTransducer.new(TEST_COMPILED_COMPACT_FILE)
    assert_equal(['bar', 'baz'], compact.analyse('foo').sort)
    assert_equal([], compact.analyse('fox').sort)
  end
end
|