ruby-sfst 0.4.3 → 0.4.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -0
- data/COPYING +280 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +54 -0
- data/README.md +1 -1
- data/Rakefile +9 -18
- data/bin/console +7 -0
- data/bin/setup +6 -0
- data/ext/sfst/alphabet.cc +879 -0
- data/ext/sfst/alphabet.h +302 -0
- data/ext/sfst/basic.cc +85 -0
- data/ext/{sfst_machine → sfst}/basic.h +7 -4
- data/ext/sfst/compact.cc +629 -0
- data/ext/sfst/compact.h +100 -0
- data/ext/sfst/determinise.cc +279 -0
- data/ext/{sfst_machine → sfst}/extconf.rb +2 -1
- data/ext/sfst/fst.cc +1150 -0
- data/ext/sfst/fst.h +374 -0
- data/ext/sfst/hopcroft.cc +681 -0
- data/ext/sfst/interface.cc +1921 -0
- data/ext/sfst/interface.h +171 -0
- data/ext/sfst/make-compact.cc +323 -0
- data/ext/{sfst_machine → sfst}/make-compact.h +15 -13
- data/ext/sfst/mem.h +80 -0
- data/ext/sfst/operators.cc +1273 -0
- data/ext/{sfst_machine → sfst}/sfst_machine.cc +89 -78
- data/ext/sfst/sgi.h +72 -0
- data/ext/sfst/utf8.cc +149 -0
- data/ext/{sfst_machine → sfst}/utf8.h +7 -4
- data/lib/sfst.rb +2 -1
- data/lib/sfst/version.rb +1 -1
- data/ruby-sfst.gemspec +23 -23
- metadata +107 -35
- data/ext/sfst_machine/alphabet.cc +0 -812
- data/ext/sfst_machine/alphabet.h +0 -273
- data/ext/sfst_machine/basic.cc +0 -84
- data/ext/sfst_machine/compact.cc +0 -616
- data/ext/sfst_machine/compact.h +0 -98
- data/ext/sfst_machine/determinise.cc +0 -303
- data/ext/sfst_machine/fst.cc +0 -1000
- data/ext/sfst_machine/fst.h +0 -369
- data/ext/sfst_machine/interface.cc +0 -1842
- data/ext/sfst_machine/interface.h +0 -93
- data/ext/sfst_machine/make-compact.cc +0 -327
- data/ext/sfst_machine/mem.h +0 -74
- data/ext/sfst_machine/operators.cc +0 -1131
- data/ext/sfst_machine/sgi.h +0 -44
- data/ext/sfst_machine/utf8.cc +0 -146
- data/test/test_sfst.fst +0 -3
- data/test/test_sfst.rb +0 -114
data/ext/sfst_machine/sgi.h
DELETED
@@ -1,44 +0,0 @@
|
|
1
|
-
|
2
|
-
/*******************************************************************/
|
3
|
-
/* */
|
4
|
-
/* File: sgi.h */
|
5
|
-
/* Author: Helmut Schmid */
|
6
|
-
/* Purpose: */
|
7
|
-
/* Created: Thu Sep 11 15:58:25 2008 */
|
8
|
-
/* Modified: Fri Sep 12 08:17:03 2008 (schmid) */
|
9
|
-
/* */
|
10
|
-
/*******************************************************************/
|
11
|
-
|
12
|
-
#ifndef _SGI_INCLUDED
|
13
|
-
#define _SGI_INCLUDED
|
14
|
-
|
15
|
-
|
16
|
-
#ifdef SGIext
|
17
|
-
|
18
|
-
#include <ext/hash_map>
|
19
|
-
#include <ext/hash_set>
|
20
|
-
using std::hash_map;
|
21
|
-
using std::hash_set;
|
22
|
-
using std::hash;
|
23
|
-
|
24
|
-
#else
|
25
|
-
|
26
|
-
#ifdef SGI__gnu_cxx
|
27
|
-
|
28
|
-
#include <ext/hash_map>
|
29
|
-
#include <ext/hash_set>
|
30
|
-
|
31
|
-
#else
|
32
|
-
|
33
|
-
#include <backward/hash_map>
|
34
|
-
#include <backward/hash_set>
|
35
|
-
|
36
|
-
#endif
|
37
|
-
|
38
|
-
using __gnu_cxx::hash_map;
|
39
|
-
using __gnu_cxx::hash_set;
|
40
|
-
using __gnu_cxx::hash;
|
41
|
-
|
42
|
-
#endif
|
43
|
-
|
44
|
-
#endif
|
data/ext/sfst_machine/utf8.cc
DELETED
@@ -1,146 +0,0 @@
|
|
1
|
-
|
2
|
-
/*******************************************************************/
|
3
|
-
/* */
|
4
|
-
/* File: utf8.C */
|
5
|
-
/* Author: Helmut Schmid */
|
6
|
-
/* Purpose: */
|
7
|
-
/* Created: Mon Sep 5 17:49:16 2005 */
|
8
|
-
/* Modified: Mon Mar 3 11:00:53 2008 (schmid) */
|
9
|
-
/* */
|
10
|
-
/*******************************************************************/
|
11
|
-
|
12
|
-
#include "string.h"
|
13
|
-
|
14
|
-
#include "utf8.h"
|
15
|
-
|
16
|
-
// Masks that keep the N least significant bits of a byte.
const unsigned char get3LSbits = 0x07;  // 00000111
const unsigned char get4LSbits = 0x0F;  // 00001111
const unsigned char get5LSbits = 0x1F;  // 00011111
const unsigned char get6LSbits = 0x3F;  // 00111111

// Values with the N most significant bits of a byte set.
const unsigned char set1MSbits = 0x80;  // 10000000
const unsigned char set2MSbits = 0xC0;  // 11000000
const unsigned char set3MSbits = 0xE0;  // 11100000
const unsigned char set4MSbits = 0xF0;  // 11110000
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
/*******************************************************************/
|
29
|
-
/* */
|
30
|
-
/* int2utf8 */
|
31
|
-
/* */
|
32
|
-
/*******************************************************************/
|
33
|
-
|
34
|
-
// Encodes the symbol code `sym` as a NUL-terminated UTF-8 byte sequence of
// one to four bytes.
//
// Returns a pointer to a function-local static buffer, so the result is
// overwritten by the next call and must be copied if it has to survive.
// Returns NULL (leaving the buffer untouched) when `sym` needs more than the
// 21 payload bits a four-byte sequence can carry.
char *int2utf8( unsigned int sym )
{
  static unsigned char buf[5];
  unsigned char *out = buf;

  if (sym >= 0x200000u)
    return NULL;

  if (sym < 0x80u) {
    // 0xxxxxxx
    *out++ = (unsigned char)sym;
  }
  else if (sym < 0x800u) {
    // 110xxxxx 10xxxxxx
    *out++ = (unsigned char)(0xC0u | (sym >> 6));
    *out++ = (unsigned char)(0x80u | (sym & 0x3Fu));
  }
  else if (sym < 0x10000u) {
    // 1110xxxx 10xxxxxx 10xxxxxx
    *out++ = (unsigned char)(0xE0u | (sym >> 12));
    *out++ = (unsigned char)(0x80u | ((sym >> 6) & 0x3Fu));
    *out++ = (unsigned char)(0x80u | (sym & 0x3Fu));
  }
  else {
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    *out++ = (unsigned char)(0xF0u | (sym >> 18));
    *out++ = (unsigned char)(0x80u | ((sym >> 12) & 0x3Fu));
    *out++ = (unsigned char)(0x80u | ((sym >> 6) & 0x3Fu));
    *out++ = (unsigned char)(0x80u | (sym & 0x3Fu));
  }

  *out = 0;
  return (char *)buf;
}
|
74
|
-
|
75
|
-
|
76
|
-
/*******************************************************************/
|
77
|
-
/* */
|
78
|
-
/* utf8toint */
|
79
|
-
/* */
|
80
|
-
/*******************************************************************/
|
81
|
-
|
82
|
-
// Decodes the UTF-8 encoded symbol that starts at *s and returns its code.
//
// On success *s is advanced past the bytes that were consumed. Note that a
// leading NUL byte is decoded as the one-byte symbol 0, so *s is then moved
// past the string terminator; callers must check for the end of the string
// before calling.
//
// Returns 0 on malformed input:
//   - if the first byte cannot start a sequence (10xxxxxx or 11111xxx),
//     *s is left unchanged;
//   - if a continuation byte is missing or invalid, *s is left pointing at
//     the offending byte.
// Overlong encodings and surrogate code points are not rejected.
unsigned int utf8toint( char **s )
{
  unsigned char lead = (unsigned char)**s;
  unsigned int code;
  int pending;  // number of continuation bytes still expected

  if (lead < 0x80) {                  // 0xxxxxxx: one-byte symbol
    code = lead;
    pending = 0;
  }
  else if ((lead & 0xE0) == 0xC0) {   // 110xxxxx: start of a two-byte symbol
    code = lead & 0x1F;
    pending = 1;
  }
  else if ((lead & 0xF0) == 0xE0) {   // 1110xxxx: start of a three-byte symbol
    code = lead & 0x0F;
    pending = 2;
  }
  else if ((lead & 0xF8) == 0xF0) {   // 11110xxx: start of a four-byte symbol
    code = lead & 0x07;
    pending = 3;
  }
  else {
    // 10xxxxxx is a stray continuation byte; 11111xxx (0xF8..0xFF) never
    // occurs in UTF-8 and must not be mistaken for a four-byte lead.
    return 0;
  }

  while (pending-- > 0) {
    unsigned char c;

    (*s)++;
    c = (unsigned char)**s;
    if ((c & 0xC0) != 0x80)           // every continuation byte is 10xxxxxx
      return 0;
    code = (code << 6) | (c & 0x3F);
  }

  (*s)++;
  return code;
}
|
131
|
-
|
132
|
-
|
133
|
-
/*******************************************************************/
|
134
|
-
/* */
|
135
|
-
/* utf8toint (variant for a string holding exactly one symbol) */
|
136
|
-
/* */
|
137
|
-
/*******************************************************************/
|
138
|
-
|
139
|
-
unsigned int utf8toint( char *s )
|
140
|
-
|
141
|
-
{
|
142
|
-
unsigned int result = utf8toint( &s );
|
143
|
-
if (*s == 0) // all bytes converted?
|
144
|
-
return result;
|
145
|
-
return 0;
|
146
|
-
}
|
data/test/test_sfst.fst
DELETED
data/test/test_sfst.rb
DELETED
@@ -1,114 +0,0 @@
|
|
1
|
-
require 'sfst'
|
2
|
-
require 'test/unit'
|
3
|
-
|
4
|
-
# Absolute path of the directory that contains this test file.
TEST_DIRECTORY = File.dirname(File.expand_path(__FILE__))

# Paths of the compiled transducer files the test cases load from that directory.
TEST_COMPILED_COMPACT_FILE = File.join(TEST_DIRECTORY, 'test_sfst_compact.a')
TEST_COMPILED_REGULAR_FILE = File.join(TEST_DIRECTORY, 'test_sfst_regular.a')
|
7
|
-
|
8
|
-
# Tests SFST::RegularTransducer using the transducer stored in
# TEST_COMPILED_REGULAR_FILE.
class RegularTransducerTestCase < Test::Unit::TestCase
  def test_analyze_acceptance
    transducer = load_transducer
    assert_equal true, transducer.accepted_analysis?('foo')
    assert_equal false, transducer.accepted_analysis?('fox')
  end

  def test_analyze
    # Each assertion runs against a freshly loaded transducer.
    assert_equal %w[bar baz], load_transducer.analyse('foo').sort
    assert_equal [], load_transducer.analyse('fox').sort
  end

  def test_analyze_symbol_sequence
    known = load_transducer.analyse('foo', :symbol_sequence => true)
    assert_equal [%w[b a r], %w[b a z]], known.sort

    unknown = load_transducer.analyse('fox', :symbol_sequence => true)
    assert_equal [], unknown.sort
  end

  def test_generate_acceptance
    transducer = load_transducer
    assert_equal true, transducer.accepted_generating?('bar')
    assert_equal true, transducer.accepted_generating?('baz')
    assert_equal false, transducer.accepted_generating?('bax')
  end

  def test_generate
    assert_equal %w[foo], load_transducer.generate('bar').sort
    assert_equal %w[foo], load_transducer.generate('baz').sort
    assert_equal [], load_transducer.generate('bax').sort
  end

  def test_generate_language_default
    paths = []
    load_transducer.generate_language { |path| paths << join_pairs(path) }
    assert_equal ['b:fa:or:o', 'b:fa:oz:o'], paths.sort
  end

  def test_generate_language_both
    paths = []
    load_transducer.generate_language(:levels => :both) { |path| paths << join_pairs(path) }
    assert_equal ['b:fa:or:o', 'b:fa:oz:o'], paths.sort
  end

  def test_generate_language_upper
    words = []
    load_transducer.generate_language(:levels => :upper) { |symbols| words << symbols.join }
    assert_equal %w[foo], words.sort
  end

  def test_generate_language_lower
    words = []
    load_transducer.generate_language(:levels => :lower) { |symbols| words << symbols.join }
    assert_equal %w[bar baz], words.sort
  end

  private

  # Loads a new transducer instance from the compiled regular test file.
  def load_transducer
    SFST::RegularTransducer.new(TEST_COMPILED_REGULAR_FILE)
  end

  # Joins every pair of a yielded path with ':' and concatenates the results.
  def join_pairs(path)
    path.map { |pair| pair.join(':') }.join
  end
end
|
85
|
-
|
86
|
-
# Tests SFST::CompactTransducer using the transducer stored in
# TEST_COMPILED_COMPACT_FILE.
class CompactTransducerTestCase < Test::Unit::TestCase
  def test_analyze_acceptance
    transducer = load_transducer
    assert_equal true, transducer.accepted_analysis?('foo')
    assert_equal false, transducer.accepted_analysis?('fox')
  end

  def test_analyze
    # Both lookups go through the same transducer instance.
    transducer = load_transducer
    assert_equal %w[bar baz], transducer.analyse('foo').sort
    assert_equal [], transducer.analyse('fox').sort
  end

  private

  # Loads a new transducer instance from the compiled compact test file.
  def load_transducer
    SFST::CompactTransducer.new(TEST_COMPILED_COMPACT_FILE)
  end
end
|
99
|
-
|
100
|
-
# Repeats the same analysis many times on a single transducer instance, once
# for the regular and once for the compact implementation.
class StressTestCase < Test::Unit::TestCase
  def test_repeated_analyses_regular
    analyse_repeatedly(SFST::RegularTransducer.new(TEST_COMPILED_REGULAR_FILE))
  end

  def test_repeated_analyses_compact
    analyse_repeatedly(SFST::CompactTransducer.new(TEST_COMPILED_COMPACT_FILE))
  end

  private

  # Analyses 'foo' 65536 times on +transducer+, checking the result each time.
  def analyse_repeatedly(transducer)
    65_536.times do
      assert_equal %w[bar baz], transducer.analyse('foo').sort
    end
  end
end
|