ruby-sfst 0.4.3 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -0
- data/COPYING +280 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +54 -0
- data/README.md +1 -1
- data/Rakefile +9 -18
- data/bin/console +7 -0
- data/bin/setup +6 -0
- data/ext/sfst/alphabet.cc +879 -0
- data/ext/sfst/alphabet.h +302 -0
- data/ext/sfst/basic.cc +85 -0
- data/ext/{sfst_machine → sfst}/basic.h +7 -4
- data/ext/sfst/compact.cc +629 -0
- data/ext/sfst/compact.h +100 -0
- data/ext/sfst/determinise.cc +279 -0
- data/ext/{sfst_machine → sfst}/extconf.rb +2 -1
- data/ext/sfst/fst.cc +1150 -0
- data/ext/sfst/fst.h +374 -0
- data/ext/sfst/hopcroft.cc +681 -0
- data/ext/sfst/interface.cc +1921 -0
- data/ext/sfst/interface.h +171 -0
- data/ext/sfst/make-compact.cc +323 -0
- data/ext/{sfst_machine → sfst}/make-compact.h +15 -13
- data/ext/sfst/mem.h +80 -0
- data/ext/sfst/operators.cc +1273 -0
- data/ext/{sfst_machine → sfst}/sfst_machine.cc +89 -78
- data/ext/sfst/sgi.h +72 -0
- data/ext/sfst/utf8.cc +149 -0
- data/ext/{sfst_machine → sfst}/utf8.h +7 -4
- data/lib/sfst.rb +2 -1
- data/lib/sfst/version.rb +1 -1
- data/ruby-sfst.gemspec +23 -23
- metadata +107 -35
- data/ext/sfst_machine/alphabet.cc +0 -812
- data/ext/sfst_machine/alphabet.h +0 -273
- data/ext/sfst_machine/basic.cc +0 -84
- data/ext/sfst_machine/compact.cc +0 -616
- data/ext/sfst_machine/compact.h +0 -98
- data/ext/sfst_machine/determinise.cc +0 -303
- data/ext/sfst_machine/fst.cc +0 -1000
- data/ext/sfst_machine/fst.h +0 -369
- data/ext/sfst_machine/interface.cc +0 -1842
- data/ext/sfst_machine/interface.h +0 -93
- data/ext/sfst_machine/make-compact.cc +0 -327
- data/ext/sfst_machine/mem.h +0 -74
- data/ext/sfst_machine/operators.cc +0 -1131
- data/ext/sfst_machine/sgi.h +0 -44
- data/ext/sfst_machine/utf8.cc +0 -146
- data/test/test_sfst.fst +0 -3
- data/test/test_sfst.rb +0 -114
data/ext/sfst_machine/sgi.h
DELETED
@@ -1,44 +0,0 @@
|
|
1
|
-
|
2
|
-
/*******************************************************************/
|
3
|
-
/* */
|
4
|
-
/* File: sgi.h */
|
5
|
-
/* Author: Helmut Schmid */
|
6
|
-
/* Purpose: */
|
7
|
-
/* Created: Thu Sep 11 15:58:25 2008 */
|
8
|
-
/* Modified: Fri Sep 12 08:17:03 2008 (schmid) */
|
9
|
-
/* */
|
10
|
-
/*******************************************************************/
|
11
|
-
|
12
|
-
#ifndef _SGI_INCLUDED
|
13
|
-
#define _SGI_INCLUDED
|
14
|
-
|
15
|
-
|
16
|
-
#ifdef SGIext
|
17
|
-
|
18
|
-
#include <ext/hash_map>
|
19
|
-
#include <ext/hash_set>
|
20
|
-
using std::hash_map;
|
21
|
-
using std::hash_set;
|
22
|
-
using std::hash;
|
23
|
-
|
24
|
-
#else
|
25
|
-
|
26
|
-
#ifdef SGI__gnu_cxx
|
27
|
-
|
28
|
-
#include <ext/hash_map>
|
29
|
-
#include <ext/hash_set>
|
30
|
-
|
31
|
-
#else
|
32
|
-
|
33
|
-
#include <backward/hash_map>
|
34
|
-
#include <backward/hash_set>
|
35
|
-
|
36
|
-
#endif
|
37
|
-
|
38
|
-
using __gnu_cxx::hash_map;
|
39
|
-
using __gnu_cxx::hash_set;
|
40
|
-
using __gnu_cxx::hash;
|
41
|
-
|
42
|
-
#endif
|
43
|
-
|
44
|
-
#endif
|
data/ext/sfst_machine/utf8.cc
DELETED
@@ -1,146 +0,0 @@
|
|
1
|
-
|
2
|
-
/*******************************************************************/
|
3
|
-
/* */
|
4
|
-
/* File: utf8.C */
|
5
|
-
/* Author: Helmut Schmid */
|
6
|
-
/* Purpose: */
|
7
|
-
/* Created: Mon Sep 5 17:49:16 2005 */
|
8
|
-
/* Modified: Mon Mar 3 11:00:53 2008 (schmid) */
|
9
|
-
/* */
|
10
|
-
/*******************************************************************/
|
11
|
-
|
12
|
-
#include "string.h"
|
13
|
-
|
14
|
-
#include "utf8.h"
|
15
|
-
|
16
|
-
/* Masks that keep only the n least significant bits of a byte. */
const unsigned char get3LSbits = 0x07;  /* 00000111 */
const unsigned char get4LSbits = 0x0F;  /* 00001111 */
const unsigned char get5LSbits = 0x1F;  /* 00011111 */
const unsigned char get6LSbits = 0x3F;  /* 00111111 */

/* Bytes with only the n most significant bits set. */
const unsigned char set1MSbits = 0x80;  /* 10000000 */
const unsigned char set2MSbits = 0xC0;  /* 11000000 */
const unsigned char set3MSbits = 0xE0;  /* 11100000 */
const unsigned char set4MSbits = 0xF0;  /* 11110000 */
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
/*******************************************************************/
|
29
|
-
/* */
|
30
|
-
/* int2utf8 */
|
31
|
-
/* */
|
32
|
-
/*******************************************************************/
|
33
|
-
|
34
|
-
char *int2utf8( unsigned int sym )
|
35
|
-
|
36
|
-
{
|
37
|
-
static unsigned char ch[5];
|
38
|
-
|
39
|
-
if (sym < 128) {
|
40
|
-
// 1-byte UTF8 symbol, 7 bits
|
41
|
-
ch[0] = sym;
|
42
|
-
ch[1] = 0;
|
43
|
-
}
|
44
|
-
|
45
|
-
else if (sym < 2048) {
|
46
|
-
// 2-byte UTF8 symbol, 5+6 bits
|
47
|
-
ch[0] = (sym >> 6) | set2MSbits;
|
48
|
-
ch[1] = (sym & get6LSbits) | set1MSbits;
|
49
|
-
ch[2] = 0;
|
50
|
-
}
|
51
|
-
|
52
|
-
else if (sym < 65536) {
|
53
|
-
// 3-byte UTF8 symbol, 4+6+6 bits
|
54
|
-
ch[0] = (sym >> 12) | set3MSbits;
|
55
|
-
ch[1] = ((sym >> 6) & get6LSbits) | set1MSbits;
|
56
|
-
ch[2] = (sym & get6LSbits) | set1MSbits;
|
57
|
-
ch[3] = 0;
|
58
|
-
}
|
59
|
-
|
60
|
-
else if (sym < 2097152) {
|
61
|
-
// 4-byte UTF8 symbol, 3+6+6+6 bits
|
62
|
-
ch[0] = (sym >> 18) | set4MSbits;
|
63
|
-
ch[1] = ((sym >> 12) & get6LSbits) | set1MSbits;
|
64
|
-
ch[2] = ((sym >> 6) & get6LSbits) | set1MSbits;
|
65
|
-
ch[3] = (sym & get6LSbits) | set1MSbits;
|
66
|
-
ch[4] = 0;
|
67
|
-
}
|
68
|
-
|
69
|
-
else
|
70
|
-
return NULL;
|
71
|
-
|
72
|
-
return (char*)ch;
|
73
|
-
}
|
74
|
-
|
75
|
-
|
76
|
-
/*******************************************************************/
|
77
|
-
/* */
|
78
|
-
/* utf8toint */
|
79
|
-
/* */
|
80
|
-
/*******************************************************************/
|
81
|
-
|
82
|
-
/*
 * utf8toint - decode the UTF-8 sequence starting at *s and step past it.
 *
 * s: in/out cursor into a NUL-terminated byte string.
 *
 * On success the decoded value is returned and *s points just behind the
 * sequence.  On malformed input 0 is returned:
 *   - if the first byte cannot start a sequence (a continuation byte
 *     10xxxxxx, or 11111xxx), *s is left unchanged;
 *   - if an expected continuation byte is missing, *s is left on the
 *     offending byte (so a sequence cut short by the terminating NUL never
 *     reads past the end of the string).
 * A NUL byte at *s is treated like any other one-byte symbol: it also yields
 * 0 and *s is advanced by one, so callers have to test for the end of the
 * string themselves before calling.
 * Overlong encodings are not detected.
 */
unsigned int utf8toint( char **s )

{
  enum {
    CONTINUATION_TAG = 0x80,   /* 10xxxxxx */
    LEAD2_TAG        = 0xC0,   /* 110xxxxx */
    LEAD3_TAG        = 0xE0,   /* 1110xxxx */
    LEAD4_TAG        = 0xF0,   /* 11110xxx */
    INVALID_TAG      = 0xF8    /* 11111xxx */
  };
  int bytes_to_come;
  unsigned int result;
  unsigned char c = (unsigned char)**s;

  if (c >= INVALID_TAG) {             // 11111xxx
    // Never a valid first byte; masking it down to three bits would
    // silently decode it as if it were 11110xxx.
    return 0;
  }
  else if (c >= LEAD4_TAG) {          // 11110xxx
    // start of a four-byte symbol
    bytes_to_come = 3;
    result = c & 0x07;
  }
  else if (c >= LEAD3_TAG) {          // 1110xxxx
    // start of a three-byte symbol
    bytes_to_come = 2;
    result = c & 0x0F;
  }
  else if (c >= LEAD2_TAG) {          // 110xxxxx
    // start of a two-byte symbol
    bytes_to_come = 1;
    result = c & 0x1F;
  }
  else if (c < CONTINUATION_TAG) {    // 0xxxxxxx
    // one-byte symbol
    bytes_to_come = 0;
    result = c;
  }
  else {                              // 10xxxxxx without a lead byte
    return 0;
  }

  while (bytes_to_come > 0) {
    bytes_to_come--;
    (*s)++;
    c = (unsigned char)**s;
    if (c < CONTINUATION_TAG || c >= LEAD2_TAG) {
      return 0;                       // not a 10xxxxxx continuation byte
    }
    result = (result << 6) | (c & 0x3F);
  }

  (*s)++;
  return result;
}
|
131
|
-
|
132
|
-
|
133
|
-
/*******************************************************************/
|
134
|
-
/* */
|
135
|
-
/* utf8toint */
|
136
|
-
/* */
|
137
|
-
/*******************************************************************/
|
138
|
-
|
139
|
-
unsigned int utf8toint( char *s )
|
140
|
-
|
141
|
-
{
|
142
|
-
unsigned int result = utf8toint( &s );
|
143
|
-
if (*s == 0) // all bytes converted?
|
144
|
-
return result;
|
145
|
-
return 0;
|
146
|
-
}
|
data/test/test_sfst.fst
DELETED
data/test/test_sfst.rb
DELETED
@@ -1,114 +0,0 @@
|
|
1
|
-
require 'sfst'
|
2
|
-
require 'test/unit'
|
3
|
-
|
4
|
-
# Absolute path of the directory containing this test file.
TEST_DIRECTORY = File.expand_path('..', __FILE__)

# Compiled transducer files used by the test cases, located next to this file.
TEST_COMPILED_COMPACT_FILE = File.join(TEST_DIRECTORY, 'test_sfst_compact.a')
TEST_COMPILED_REGULAR_FILE = File.join(TEST_DIRECTORY, 'test_sfst_regular.a')
|
7
|
-
|
8
|
-
# Tests for SFST::RegularTransducer, run against the compiled regular
# transducer file TEST_COMPILED_REGULAR_FILE.
class RegularTransducerTestCase < Test::Unit::TestCase
  def test_analyze_acceptance
    transducer = load_transducer
    assert_equal(true, transducer.accepted_analysis?('foo'))
    assert_equal(false, transducer.accepted_analysis?('fox'))
  end

  def test_analyze
    # Each expectation is checked on a freshly loaded transducer.
    assert_equal(['bar', 'baz'], load_transducer.analyse('foo').sort)
    assert_equal([], load_transducer.analyse('fox').sort)
  end

  def test_analyze_symbol_sequence
    known = load_transducer.analyse('foo', symbol_sequence: true)
    assert_equal([['b', 'a', 'r'], ['b', 'a', 'z']], known.sort)

    unknown = load_transducer.analyse('fox', symbol_sequence: true)
    assert_equal([], unknown.sort)
  end

  def test_generate_acceptance
    transducer = load_transducer
    assert_equal(true, transducer.accepted_generating?('bar'))
    assert_equal(true, transducer.accepted_generating?('baz'))
    assert_equal(false, transducer.accepted_generating?('bax'))
  end

  def test_generate
    assert_equal(['foo'], load_transducer.generate('bar').sort)
    assert_equal(['foo'], load_transducer.generate('baz').sort)
    assert_equal([], load_transducer.generate('bax').sort)
  end

  def test_generate_language_default
    rendered = []
    load_transducer.generate_language do |pairs|
      rendered << pairs.map { |pair| pair.join(':') }.join
    end
    assert_equal(['b:fa:or:o', 'b:fa:oz:o'], rendered.sort)
  end

  def test_generate_language_both
    rendered = []
    load_transducer.generate_language(levels: :both) do |pairs|
      rendered << pairs.map { |pair| pair.join(':') }.join
    end
    assert_equal(['b:fa:or:o', 'b:fa:oz:o'], rendered.sort)
  end

  def test_generate_language_upper
    words = []
    load_transducer.generate_language(levels: :upper) do |symbols|
      words << symbols.join
    end
    assert_equal(['foo'], words.sort)
  end

  def test_generate_language_lower
    words = []
    load_transducer.generate_language(levels: :lower) do |symbols|
      words << symbols.join
    end
    assert_equal(['bar', 'baz'], words.sort)
  end

  private

  # Loads a new transducer instance from the compiled regular test file.
  def load_transducer
    SFST::RegularTransducer.new(TEST_COMPILED_REGULAR_FILE)
  end
end
|
85
|
-
|
86
|
-
# Tests for SFST::CompactTransducer, run against the compiled compact
# transducer file TEST_COMPILED_COMPACT_FILE.
class CompactTransducerTestCase < Test::Unit::TestCase
  def test_analyze_acceptance
    transducer = SFST::CompactTransducer.new(TEST_COMPILED_COMPACT_FILE)
    assert_equal(true, transducer.accepted_analysis?('foo'))
    assert_equal(false, transducer.accepted_analysis?('fox'))
  end

  def test_analyze
    # Both lookups go through the same transducer instance.
    transducer = SFST::CompactTransducer.new(TEST_COMPILED_COMPACT_FILE)
    known = transducer.analyse('foo').sort
    assert_equal(['bar', 'baz'], known)
    unknown = transducer.analyse('fox').sort
    assert_equal([], unknown)
  end
end
|
99
|
-
|
100
|
-
# Repeats the same analysis 65536 times on a single transducer instance, for
# both the regular and the compact transducer.
class StressTestCase < Test::Unit::TestCase
  def test_repeated_analyses_regular
    transducer = SFST::RegularTransducer.new(TEST_COMPILED_REGULAR_FILE)
    65_536.times { assert_equal(['bar', 'baz'], transducer.analyse('foo').sort) }
  end

  def test_repeated_analyses_compact
    transducer = SFST::CompactTransducer.new(TEST_COMPILED_COMPACT_FILE)
    65_536.times { assert_equal(['bar', 'baz'], transducer.analyse('foo').sort) }
  end
end
|