mittens 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Gemfile +7 -0
- data/LICENSE.txt +30 -0
- data/README.md +62 -0
- data/Rakefile +21 -0
- data/ext/mittens/ext.c +96 -0
- data/ext/mittens/extconf.rb +12 -0
- data/lib/mittens/version.rb +3 -0
- data/lib/mittens.rb +7 -0
- data/mittens.gemspec +22 -0
- data/vendor/snowball/.gitignore +26 -0
- data/vendor/snowball/.travis.yml +112 -0
- data/vendor/snowball/AUTHORS +27 -0
- data/vendor/snowball/CONTRIBUTING.rst +216 -0
- data/vendor/snowball/COPYING +29 -0
- data/vendor/snowball/GNUmakefile +742 -0
- data/vendor/snowball/NEWS +754 -0
- data/vendor/snowball/README.rst +37 -0
- data/vendor/snowball/ada/README.md +74 -0
- data/vendor/snowball/ada/generate/generate.adb +83 -0
- data/vendor/snowball/ada/generate.gpr +21 -0
- data/vendor/snowball/ada/src/stemmer.adb +620 -0
- data/vendor/snowball/ada/src/stemmer.ads +219 -0
- data/vendor/snowball/ada/src/stemwords.adb +70 -0
- data/vendor/snowball/ada/stemmer_config.gpr +83 -0
- data/vendor/snowball/ada/stemwords.gpr +21 -0
- data/vendor/snowball/algorithms/arabic.sbl +558 -0
- data/vendor/snowball/algorithms/armenian.sbl +301 -0
- data/vendor/snowball/algorithms/basque.sbl +149 -0
- data/vendor/snowball/algorithms/catalan.sbl +202 -0
- data/vendor/snowball/algorithms/danish.sbl +93 -0
- data/vendor/snowball/algorithms/dutch.sbl +164 -0
- data/vendor/snowball/algorithms/english.sbl +229 -0
- data/vendor/snowball/algorithms/finnish.sbl +197 -0
- data/vendor/snowball/algorithms/french.sbl +254 -0
- data/vendor/snowball/algorithms/german.sbl +139 -0
- data/vendor/snowball/algorithms/german2.sbl +145 -0
- data/vendor/snowball/algorithms/greek.sbl +701 -0
- data/vendor/snowball/algorithms/hindi.sbl +323 -0
- data/vendor/snowball/algorithms/hungarian.sbl +241 -0
- data/vendor/snowball/algorithms/indonesian.sbl +192 -0
- data/vendor/snowball/algorithms/irish.sbl +149 -0
- data/vendor/snowball/algorithms/italian.sbl +202 -0
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
- data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
- data/vendor/snowball/algorithms/lovins.sbl +208 -0
- data/vendor/snowball/algorithms/nepali.sbl +92 -0
- data/vendor/snowball/algorithms/norwegian.sbl +80 -0
- data/vendor/snowball/algorithms/porter.sbl +139 -0
- data/vendor/snowball/algorithms/portuguese.sbl +218 -0
- data/vendor/snowball/algorithms/romanian.sbl +236 -0
- data/vendor/snowball/algorithms/russian.sbl +221 -0
- data/vendor/snowball/algorithms/serbian.sbl +2379 -0
- data/vendor/snowball/algorithms/spanish.sbl +230 -0
- data/vendor/snowball/algorithms/swedish.sbl +72 -0
- data/vendor/snowball/algorithms/tamil.sbl +405 -0
- data/vendor/snowball/algorithms/turkish.sbl +470 -0
- data/vendor/snowball/algorithms/yiddish.sbl +460 -0
- data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
- data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
- data/vendor/snowball/charsets/cp850.sbl +130 -0
- data/vendor/snowball/compiler/analyser.c +1547 -0
- data/vendor/snowball/compiler/driver.c +615 -0
- data/vendor/snowball/compiler/generator.c +1748 -0
- data/vendor/snowball/compiler/generator_ada.c +1702 -0
- data/vendor/snowball/compiler/generator_csharp.c +1322 -0
- data/vendor/snowball/compiler/generator_go.c +1278 -0
- data/vendor/snowball/compiler/generator_java.c +1313 -0
- data/vendor/snowball/compiler/generator_js.c +1316 -0
- data/vendor/snowball/compiler/generator_pascal.c +1387 -0
- data/vendor/snowball/compiler/generator_python.c +1337 -0
- data/vendor/snowball/compiler/generator_rust.c +1295 -0
- data/vendor/snowball/compiler/header.h +418 -0
- data/vendor/snowball/compiler/space.c +286 -0
- data/vendor/snowball/compiler/syswords.h +86 -0
- data/vendor/snowball/compiler/syswords2.h +13 -0
- data/vendor/snowball/compiler/tokeniser.c +567 -0
- data/vendor/snowball/csharp/.gitignore +8 -0
- data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
- data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
- data/vendor/snowball/csharp/Stemwords/App.config +6 -0
- data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
- data/vendor/snowball/doc/TODO +12 -0
- data/vendor/snowball/doc/libstemmer_c_README +148 -0
- data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
- data/vendor/snowball/doc/libstemmer_java_README +67 -0
- data/vendor/snowball/doc/libstemmer_js_README +48 -0
- data/vendor/snowball/doc/libstemmer_python_README +113 -0
- data/vendor/snowball/examples/stemwords.c +204 -0
- data/vendor/snowball/go/README.md +55 -0
- data/vendor/snowball/go/among.go +16 -0
- data/vendor/snowball/go/env.go +403 -0
- data/vendor/snowball/go/stemwords/generate.go +68 -0
- data/vendor/snowball/go/stemwords/main.go +68 -0
- data/vendor/snowball/go/util.go +34 -0
- data/vendor/snowball/iconv.py +50 -0
- data/vendor/snowball/include/libstemmer.h +78 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
- data/vendor/snowball/javascript/base-stemmer.js +294 -0
- data/vendor/snowball/javascript/stemwords.js +106 -0
- data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
- data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
- data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
- data/vendor/snowball/libstemmer/modules.txt +63 -0
- data/vendor/snowball/libstemmer/test.c +34 -0
- data/vendor/snowball/pascal/.gitignore +4 -0
- data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
- data/vendor/snowball/pascal/generate.pl +23 -0
- data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
- data/vendor/snowball/python/MANIFEST.in +7 -0
- data/vendor/snowball/python/create_init.py +54 -0
- data/vendor/snowball/python/setup.cfg +6 -0
- data/vendor/snowball/python/setup.py +81 -0
- data/vendor/snowball/python/snowballstemmer/among.py +13 -0
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
- data/vendor/snowball/python/stemwords.py +101 -0
- data/vendor/snowball/python/testapp.py +28 -0
- data/vendor/snowball/runtime/api.c +58 -0
- data/vendor/snowball/runtime/api.h +32 -0
- data/vendor/snowball/runtime/header.h +61 -0
- data/vendor/snowball/runtime/utilities.c +513 -0
- data/vendor/snowball/rust/Cargo.toml +7 -0
- data/vendor/snowball/rust/build.rs +55 -0
- data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
- data/vendor/snowball/rust/src/main.rs +102 -0
- data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
- data/vendor/snowball/rust/src/snowball/among.rs +6 -0
- data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
- data/vendor/snowball/tests/stemtest.c +95 -0
- metadata +178 -0
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
#!/usr/bin/env perl
|
|
2
|
+
use strict;
|
|
3
|
+
use 5.006;
|
|
4
|
+
use warnings;
|
|
5
|
+
|
|
6
|
+
# Name this script was invoked as; only used in the usage message.
my $progname = $0;

# Four positional arguments are mandatory and a fifth (an encoding name) is
# optional.  Any other argument count is a usage error.
my $argc = scalar @ARGV;
unless ($argc == 4 || $argc == 5) {
    print "Usage: $progname <outfile> <C source directory> <modules description file> <source list file> [<enc>]\n";
    exit 1;
}

my ($outname, $c_src_dir, $descfile, $srclistfile) = splice(@ARGV, 0, 4);

# Optional encoding restriction.  When present, addalgenc() drops every other
# encoding and the name is appended (after an underscore) to the libstemmer
# file names written by printsrclist().
my $enc_only = shift(@ARGV);
my $extn = defined $enc_only ? '_' . $enc_only : '';

# Alias name => algorithm name.
my %aliases = ();
# Algorithm name => 1.
my %algorithms = ();
# Algorithm name => reference to a hash keyed by encoding name.
my %algorithm_encs = ();

# Encoding name => 1, for every encoding recorded by addalgenc().
my %encs = ();
|
|
29
|
+
|
|
30
|
+
# Record that algorithm $alg is available in encoding $enc.
#
# If the script was given an encoding restriction ($enc_only), the pair is
# ignored unless the encoding name, lower-cased and with underscores removed,
# equals that restriction.  Otherwise the encoding is added to the
# algorithm's entry in %algorithm_encs and to the %encs set.
sub addalgenc($$) {
    my ($algorithm, $encoding) = @_;

    if (defined $enc_only) {
        (my $folded = lc $encoding) =~ s/_//g;
        return if $folded ne $enc_only;
    }

    # Autovivification creates the per-algorithm hash on first use.
    $algorithm_encs{$algorithm}{$encoding} = 1;

    $encs{$encoding} = 1;
}
|
|
52
|
+
|
|
53
|
+
# Parse the modules description file ($descfile) and fill in %algorithms,
# %aliases, %algorithm_encs and %encs.
#
# Lines whose first non-blank character is '#', and blank lines, are skipped.
# On every other line the first three whitespace-separated fields are used:
# algorithm name, comma-separated encodings, comma-separated aliases.  Any
# further fields are ignored.
#
# Dies if the description file cannot be opened.
sub readinput()
{
    # Three-argument open keeps the file name from being parsed as a mode or
    # pipe.  Failing loudly matters here: an unreadable file would otherwise
    # silently produce an empty module list.
    open(DESCFILE, '<', $descfile)
        or die "Can't open modules description file `$descfile': $!\n";

    my $line;
    while ($line = <DESCFILE>)
    {
        next if $line =~ m/^\s*#/;
        next if $line =~ m/^\s*$/;
        # split(' ', ...) discards leading whitespace, so an indented entry
        # does not yield an empty first field.
        my ($alg,$encstr,$aliases) = split(' ', $line);
        my $enc;
        my $alias;

        $algorithms{$alg} = 1;
        foreach $alias (split(/,/, $aliases)) {
            foreach $enc (split(/,/, $encstr)) {
                $aliases{$alias} = $alg;
                addalgenc($alg, $enc);
            }
        }
    }

    close(DESCFILE);
}
|
|
75
|
+
|
|
76
|
+
# Write the generated C header to $outname.
#
# The file consists of, in order: a banner comment naming the included
# algorithms (wrapped at 77 columns), one #include per algorithm/encoding
# pair, the stemmer_encoding_t enum, the encodings[] lookup table, the
# modules[] table (one row per alias/encoding pair) and the NULL-terminated
# algorithm_names[] array.
#
# Dies if the output file cannot be opened or closed.
sub printoutput()
{
    open (OUT, ">$outname") or die "Can't open output file `$outname': $!\n";

    my @langs = sort keys(%algorithms);
    my @enc_names = sort keys %encs;

    print OUT <<EOS;
/* $outname: List of stemming modules.
*
* This file is generated by mkmodules.pl from a list of module names.
* Do not edit manually.
*
EOS

    # Comma-separated algorithm list, starting a new comment line whenever
    # the next name would run past column 77.
    my $intro = " * Modules included by this file are: ";
    print OUT $intro;
    my $column = length($intro);
    for my $idx (0 .. $#langs) {
        my $name = $langs[$idx];
        if ($idx > 0) {
            if ($column + 2 + length($name) > 77) {
                print OUT ",\n * ";
                $column = 3;
            } else {
                print OUT ', ';
                $column += 2;
            }
        }
        print OUT $name;
        $column += length($name);
    }
    print OUT "\n */\n\n";

    # One generated header per algorithm/encoding pair.
    for my $name (@langs) {
        my $enc_set = $algorithm_encs{$name};
        for my $encoding (sort keys (%$enc_set)) {
            print OUT "#include \"../$c_src_dir/stem_${encoding}_${name}.h\"\n";
        }
    }

    # Encoding enum: ENC_UNKNOWN followed by one constant per encoding.
    print OUT "\ntypedef enum {\nENC_UNKNOWN=0,\n";
    print OUT join(",\n", map { " ENC_$_" } @enc_names);
    print OUT <<EOS;

} stemmer_encoding_t;

struct stemmer_encoding {
const char * name;
stemmer_encoding_t enc;
};
static const struct stemmer_encoding encodings[] = {
EOS

    print OUT " {\"$_\", ENC_$_},\n" for @enc_names;

    print OUT <<EOS;
{0,ENC_UNKNOWN}
};

struct stemmer_modules {
const char * name;
stemmer_encoding_t enc;
struct SN_env * (*create)(void);
void (*close)(struct SN_env *);
int (*stem)(struct SN_env *);
};
static const struct stemmer_modules modules[] = {
EOS

    # Every alias gets a row for each encoding of the algorithm it names.
    for my $alias (sort keys %aliases) {
        my $target = $aliases{$alias};
        my $enc_set = $algorithm_encs{$target};
        for my $encoding (sort keys (%$enc_set)) {
            my $prefix = join('_', $target, $encoding);
            print OUT " {\"$alias\", ENC_$encoding, ${prefix}_create_env, ${prefix}_close_env, ${prefix}_stem},\n";
        }
    }

    print OUT <<EOS;
{0,ENC_UNKNOWN,0,0,0}
};
static const char * algorithm_names[] = {
EOS

    print OUT " \"$_\", \n" for @langs;

    print OUT "0\n};\n";
    close OUT or die "Can't close ${outname}: $!\n";
}
|
|
186
|
+
|
|
187
|
+
# Write the makefile fragment $srclistfile.
#
# The file consists of a banner comment naming the included algorithms
# (wrapped at 77 columns), followed by two make variables:
#   snowball_sources - the generated stem_<enc>_<alg>.c files plus the
#                      runtime and libstemmer C sources;
#   snowball_headers - the generated stem_<enc>_<alg>.h files plus the
#                      public, module-list and runtime headers.
# The generated file names use a fixed "src_c/" prefix.
#
# Dies if the output file cannot be opened or closed.
sub printsrclist()
{
    open (OUT, ">$srclistfile") or die "Can't open output file `$srclistfile': $!\n";

    print OUT <<EOS;
# $srclistfile: List of stemming module source files
#
# This file is generated by mkmodules.pl from a list of module names.
# Do not edit manually.
#
EOS

    my $line = "# Modules included by this file are: ";
    print OUT $line;
    my $linelen = length($line);

    my $need_sep = 0;
    my $lang;
    my $srcfile;
    my $enc;
    my @algorithms = sort keys(%algorithms);
    foreach $lang (@algorithms) {
        if ($need_sep) {
            # Start a new comment line when the next name would pass column 77.
            if (($linelen + 2 + length($lang)) > 77) {
                print OUT ",\n# ";
                $linelen = 3;
            } else {
                print OUT ', ';
                $linelen += 2;
            }
        }
        print OUT $lang;
        $linelen += length($lang);
        $need_sep = 1;
    }

    # Iterate over algorithm names, as printoutput() does for its #include
    # list.  %algorithm_encs is keyed by algorithm, so walking the alias
    # names would drop any algorithm that is not listed as its own alias.
    print OUT "\n\nsnowball_sources= \\\n";
    foreach $lang (@algorithms) {
        my $hashref = $algorithm_encs{$lang};
        foreach $enc (sort keys (%$hashref)) {
            print OUT " src_c/stem_${enc}_${lang}.c \\\n";
        }
    }

    # The fixed sources are joined with line continuations; the last entry
    # gets no trailing backslash.
    $need_sep = 0;
    for $srcfile ('runtime/api.c',
                  'runtime/utilities.c',
                  "libstemmer/libstemmer${extn}.c") {
        print OUT " \\\n" if $need_sep;
        print OUT " $srcfile";
        $need_sep = 1;
    }

    print OUT "\n\nsnowball_headers= \\\n";
    foreach $lang (@algorithms) {
        my $hashref = $algorithm_encs{$lang};
        foreach $enc (sort keys (%$hashref)) {
            print OUT " src_c/stem_${enc}_${lang}.h \\\n";
        }
    }

    $need_sep = 0;
    for $srcfile ('include/libstemmer.h',
                  "libstemmer/modules${extn}.h",
                  'runtime/api.h',
                  'runtime/header.h') {
        print OUT " \\\n" if $need_sep;
        print OUT " $srcfile";
        $need_sep = 1;
    }

    print OUT "\n\n";
    close OUT or die "Can't close ${srclistfile}: $!\n";
}
|
|
264
|
+
|
|
265
|
+
# Driver: parse the description file, then write the C header and the
# makefile source list from the tables it filled in.
readinput();
|
|
266
|
+
printoutput();
|
|
267
|
+
printsrclist();
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# This file contains a list of stemmers to include in the distribution.
|
|
2
|
+
# The format is one entry per line, made of space-separated items - on each line:
|
|
3
|
+
# First item is name of stemmer.
|
|
4
|
+
# Second item is comma separated list of character sets.
|
|
5
|
+
# Third item is comma separated list of names to refer to the stemmer by.
|
|
6
|
+
#
|
|
7
|
+
# Lines starting with a #, or blank lines, are ignored.
|
|
8
|
+
|
|
9
|
+
# List all the main algorithms for each language, in UTF-8, and also with
|
|
10
|
+
# the most commonly used encoding.
|
|
11
|
+
|
|
12
|
+
arabic UTF_8 arabic,ar,ara
|
|
13
|
+
armenian UTF_8 armenian,hy,hye,arm
|
|
14
|
+
basque UTF_8,ISO_8859_1 basque,eu,eus,baq
|
|
15
|
+
catalan UTF_8,ISO_8859_1 catalan,ca,cat
|
|
16
|
+
danish UTF_8,ISO_8859_1 danish,da,dan
|
|
17
|
+
dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld
|
|
18
|
+
english UTF_8,ISO_8859_1 english,en,eng
|
|
19
|
+
finnish UTF_8,ISO_8859_1 finnish,fi,fin
|
|
20
|
+
french UTF_8,ISO_8859_1 french,fr,fre,fra
|
|
21
|
+
german UTF_8,ISO_8859_1 german,de,ger,deu
|
|
22
|
+
greek UTF_8 greek,el,gre,ell
|
|
23
|
+
hindi UTF_8 hindi,hi,hin
|
|
24
|
+
hungarian UTF_8,ISO_8859_2 hungarian,hu,hun
|
|
25
|
+
indonesian UTF_8,ISO_8859_1 indonesian,id,ind
|
|
26
|
+
irish UTF_8,ISO_8859_1 irish,ga,gle
|
|
27
|
+
italian UTF_8,ISO_8859_1 italian,it,ita
|
|
28
|
+
lithuanian UTF_8 lithuanian,lt,lit
|
|
29
|
+
nepali UTF_8 nepali,ne,nep
|
|
30
|
+
norwegian UTF_8,ISO_8859_1 norwegian,no,nor
|
|
31
|
+
portuguese UTF_8,ISO_8859_1 portuguese,pt,por
|
|
32
|
+
romanian UTF_8,ISO_8859_2 romanian,ro,rum,ron
|
|
33
|
+
russian UTF_8,KOI8_R russian,ru,rus
|
|
34
|
+
serbian UTF_8 serbian,sr,srp
|
|
35
|
+
spanish UTF_8,ISO_8859_1 spanish,es,esl,spa
|
|
36
|
+
swedish UTF_8,ISO_8859_1 swedish,sv,swe
|
|
37
|
+
tamil UTF_8 tamil,ta,tam
|
|
38
|
+
turkish UTF_8 turkish,tr,tur
|
|
39
|
+
yiddish UTF_8 yiddish,yi,yid
|
|
40
|
+
|
|
41
|
+
# Also include the traditional porter algorithm for english.
|
|
42
|
+
# The porter algorithm is included in the libstemmer distribution to assist
|
|
43
|
+
# with backwards compatibility, but for new systems the english algorithm
|
|
44
|
+
# should be used in preference.
|
|
45
|
+
porter UTF_8,ISO_8859_1 porter english
|
|
46
|
+
|
|
47
|
+
# Some other stemmers in the snowball project are not included in the standard
|
|
48
|
+
# distribution. To compile a libstemmer with them in, add them to this list,
|
|
49
|
+
# and regenerate the distribution. (You will need a full source checkout for
|
|
50
|
+
# this.) They are included in the snowball website as curiosities, but are not
|
|
51
|
+
# intended for general use, and use of them is not fully supported. These
|
|
52
|
+
# algorithms are:
|
|
53
|
+
#
|
|
54
|
+
# german2 - This is a slight modification of the german stemmer.
|
|
55
|
+
#german2 UTF_8,ISO_8859_1 german2 german
|
|
56
|
+
#
|
|
57
|
+
# kraaij_pohlmann - This is a different dutch stemmer.
|
|
58
|
+
#kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann dutch
|
|
59
|
+
#
|
|
60
|
+
# lovins - This is an english stemmer, but fairly outdated, and
|
|
61
|
+
# only really applicable to a restricted type of input text
|
|
62
|
+
# (keywords in academic publications).
|
|
63
|
+
#lovins UTF_8,ISO_8859_1 lovins english
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
|
|
2
|
+
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "libstemmer.h"
|
|
3
|
+
|
|
4
|
+
/* test code */
|
|
5
|
+
/*
 * Report a test failure and terminate.
 *
 * Writes `err` followed by a newline to standard output, then exits the
 * process with status 1.  Never returns.
 */
void error(const char * err) {
    puts(err);
    exit(1);
}
|
|
9
|
+
|
|
10
|
+
/*
 * Stem `word` with stemmer `s` and print "word -> stem".
 *
 * Fails the test (via error()) if the stemmer returns a null result or if
 * sb_stemmer_length() disagrees with the length of the returned string.
 */
static void check_stem(struct sb_stemmer * s, const char * word) {
    const char * stemmed = sb_stemmer_stem(s, word, (int) strlen(word));
    if (stemmed == NULL) error("TEST FAIL: stemming returned no result");
    printf("%s -> %s\n", word, stemmed);
    if ((size_t) sb_stemmer_length(s) != strlen(stemmed))
        error("TEST FAIL: length not correct");
}

/*
 * Smoke test for the libstemmer API.
 *
 * Checks that the stemmer list is non-empty, that an unrecognised language
 * name is rejected, that "english" and its alias "en" are accepted, and that
 * both test words can be stemmed with a consistent reported length.
 * Returns 0 on success; any failure exits with status 1 through error().
 */
int main(void) {
    struct sb_stemmer * s;
    const char ** list = sb_stemmer_list();
    if (*list == NULL) error("TEST FAIL: empty list of stemmers");

    /* NOTE(review): sb_stemmer_new() is called with a single argument here;
     * libstemmer.h is not visible from this file, so confirm the call
     * matches the prototype it declares. */
    s = sb_stemmer_new("e");
    if (s != NULL) error("TEST FAIL: non zero return for unrecognised language");
    s = sb_stemmer_new("english");
    if (s == NULL) error("TEST FAIL: zero return for recognised language");
    sb_stemmer_delete(s);
    s = sb_stemmer_new("en");
    if (s == NULL) error("TEST FAIL: zero return for recognised language");

    /* Each word is stemmed before it is printed; reusing the previous
     * result would report the wrong stem for the second word. */
    check_stem(s, "recognised");
    check_stem(s, "recognized");

    sb_stemmer_delete(s);
    printf("Success\n");
    return 0;
}
|