mittens 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +3 -3
- data/lib/mittens/version.rb +1 -1
- data/vendor/snowball/.github/workflows/ci.yml +216 -0
- data/vendor/snowball/CONTRIBUTING.rst +111 -62
- data/vendor/snowball/GNUmakefile +194 -136
- data/vendor/snowball/NEWS +798 -3
- data/vendor/snowball/README.rst +50 -1
- data/vendor/snowball/ada/src/stemmer.adb +25 -13
- data/vendor/snowball/ada/src/stemmer.ads +9 -9
- data/vendor/snowball/ada/stemmer_config.gpr +7 -7
- data/vendor/snowball/algorithms/basque.sbl +4 -19
- data/vendor/snowball/algorithms/catalan.sbl +2 -9
- data/vendor/snowball/algorithms/danish.sbl +1 -1
- data/vendor/snowball/algorithms/dutch.sbl +284 -122
- data/vendor/snowball/algorithms/dutch_porter.sbl +178 -0
- data/vendor/snowball/algorithms/english.sbl +52 -37
- data/vendor/snowball/algorithms/esperanto.sbl +157 -0
- data/vendor/snowball/algorithms/estonian.sbl +269 -0
- data/vendor/snowball/algorithms/finnish.sbl +2 -3
- data/vendor/snowball/algorithms/french.sbl +42 -16
- data/vendor/snowball/algorithms/german.sbl +35 -14
- data/vendor/snowball/algorithms/greek.sbl +76 -76
- data/vendor/snowball/algorithms/hungarian.sbl +8 -6
- data/vendor/snowball/algorithms/indonesian.sbl +14 -8
- data/vendor/snowball/algorithms/italian.sbl +11 -21
- data/vendor/snowball/algorithms/lithuanian.sbl +36 -37
- data/vendor/snowball/algorithms/lovins.sbl +0 -1
- data/vendor/snowball/algorithms/nepali.sbl +138 -37
- data/vendor/snowball/algorithms/norwegian.sbl +19 -5
- data/vendor/snowball/algorithms/porter.sbl +2 -2
- data/vendor/snowball/algorithms/portuguese.sbl +9 -13
- data/vendor/snowball/algorithms/romanian.sbl +17 -4
- data/vendor/snowball/algorithms/serbian.sbl +467 -468
- data/vendor/snowball/algorithms/spanish.sbl +5 -7
- data/vendor/snowball/algorithms/swedish.sbl +60 -6
- data/vendor/snowball/algorithms/tamil.sbl +207 -176
- data/vendor/snowball/algorithms/turkish.sbl +461 -445
- data/vendor/snowball/algorithms/yiddish.sbl +36 -38
- data/vendor/snowball/compiler/analyser.c +445 -192
- data/vendor/snowball/compiler/driver.c +109 -101
- data/vendor/snowball/compiler/generator.c +853 -464
- data/vendor/snowball/compiler/generator_ada.c +404 -366
- data/vendor/snowball/compiler/generator_csharp.c +297 -260
- data/vendor/snowball/compiler/generator_go.c +323 -254
- data/vendor/snowball/compiler/generator_java.c +326 -252
- data/vendor/snowball/compiler/generator_js.c +362 -252
- data/vendor/snowball/compiler/generator_pascal.c +349 -197
- data/vendor/snowball/compiler/generator_python.c +257 -240
- data/vendor/snowball/compiler/generator_rust.c +423 -251
- data/vendor/snowball/compiler/header.h +117 -71
- data/vendor/snowball/compiler/space.c +137 -68
- data/vendor/snowball/compiler/syswords.h +2 -2
- data/vendor/snowball/compiler/tokeniser.c +125 -107
- data/vendor/snowball/csharp/Snowball/Among.cs +14 -14
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +7 -7
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +57 -37
- data/vendor/snowball/csharp/Stemwords/App.config +2 -2
- data/vendor/snowball/csharp/Stemwords/Program.cs +16 -12
- data/vendor/snowball/doc/libstemmer_c_README +7 -4
- data/vendor/snowball/doc/libstemmer_csharp_README +4 -1
- data/vendor/snowball/doc/libstemmer_java_README +12 -1
- data/vendor/snowball/doc/libstemmer_js_README +6 -4
- data/vendor/snowball/doc/libstemmer_python_README +9 -4
- data/vendor/snowball/examples/stemwords.c +12 -12
- data/vendor/snowball/go/env.go +107 -31
- data/vendor/snowball/go/util.go +0 -4
- data/vendor/snowball/include/libstemmer.h +4 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +32 -15
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +347 -261
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +3 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +52 -37
- data/vendor/snowball/javascript/base-stemmer.js +186 -2
- data/vendor/snowball/javascript/stemwords.js +3 -6
- data/vendor/snowball/libstemmer/libstemmer_c.in +1 -1
- data/vendor/snowball/libstemmer/mkalgorithms.pl +6 -6
- data/vendor/snowball/libstemmer/mkmodules.pl +2 -2
- data/vendor/snowball/libstemmer/modules.txt +13 -10
- data/vendor/snowball/libstemmer/test.c +1 -1
- data/vendor/snowball/pascal/SnowballProgram.pas +84 -2
- data/vendor/snowball/pascal/generate.pl +13 -13
- data/vendor/snowball/python/create_init.py +4 -1
- data/vendor/snowball/python/setup.cfg +0 -3
- data/vendor/snowball/python/setup.py +8 -3
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +20 -54
- data/vendor/snowball/python/stemwords.py +8 -12
- data/vendor/snowball/runtime/api.c +10 -5
- data/vendor/snowball/runtime/header.h +10 -9
- data/vendor/snowball/runtime/utilities.c +9 -9
- data/vendor/snowball/rust/build.rs +1 -1
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +83 -5
- data/vendor/snowball/tests/stemtest.c +7 -4
- metadata +7 -7
- data/vendor/snowball/.travis.yml +0 -112
- data/vendor/snowball/algorithms/german2.sbl +0 -145
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +0 -240
- data/vendor/snowball/compiler/syswords2.h +0 -13
@@ -1,7 +1,6 @@
|
|
1
1
|
|
2
2
|
package org.tartarus.snowball;
|
3
3
|
|
4
|
-
import java.lang.reflect.Method;
|
5
4
|
import java.io.BufferedReader;
|
6
5
|
import java.io.BufferedWriter;
|
7
6
|
import java.io.FileInputStream;
|
@@ -13,6 +12,7 @@ import java.io.OutputStreamWriter;
|
|
13
12
|
import java.io.Reader;
|
14
13
|
import java.io.Writer;
|
15
14
|
import java.nio.charset.StandardCharsets;
|
15
|
+
import java.util.Arrays;
|
16
16
|
|
17
17
|
public class TestApp {
|
18
18
|
private static void usage()
|
@@ -20,56 +20,71 @@ public class TestApp {
|
|
20
20
|
System.err.println("Usage: TestApp <algorithm> [<input file>] [-o <output file>]");
|
21
21
|
}
|
22
22
|
|
23
|
-
|
24
|
-
|
23
|
+
private static SnowballStemmer getStemmer(String lang) {
|
24
|
+
try {
|
25
|
+
String c = "org.tartarus.snowball.ext." + lang + "Stemmer";
|
26
|
+
return (SnowballStemmer) Class.forName(c).getDeclaredConstructor().newInstance();
|
27
|
+
} catch (ReflectiveOperationException e) {
|
28
|
+
return null;
|
29
|
+
}
|
30
|
+
}
|
31
|
+
|
32
|
+
public static void main(String[] args) throws Throwable {
|
33
|
+
if (args.length < 1) {
|
25
34
|
usage();
|
26
35
|
return;
|
27
36
|
}
|
28
37
|
|
29
|
-
|
30
|
-
|
31
|
-
|
38
|
+
SnowballStemmer stemmer = getStemmer(args[0]);
|
39
|
+
if (stemmer == null) {
|
40
|
+
System.err.println("Stemmer " + args[0] + " not found");
|
41
|
+
return;
|
42
|
+
}
|
32
43
|
|
33
|
-
|
44
|
+
int arg = 1;
|
34
45
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
46
|
+
InputStream instream;
|
47
|
+
if (args.length > arg && !args[arg].equals("-o")) {
|
48
|
+
instream = new FileInputStream(args[arg++]);
|
49
|
+
} else {
|
50
|
+
instream = System.in;
|
51
|
+
}
|
41
52
|
|
42
53
|
OutputStream outstream;
|
43
|
-
|
54
|
+
if (args.length > arg) {
|
44
55
|
if (args.length != arg + 2 || !args[arg].equals("-o")) {
|
45
56
|
usage();
|
46
57
|
return;
|
47
58
|
}
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
59
|
+
outstream = new FileOutputStream(args[arg + 1]);
|
60
|
+
} else {
|
61
|
+
outstream = System.out;
|
62
|
+
}
|
52
63
|
|
53
|
-
|
54
|
-
|
64
|
+
Reader reader = new InputStreamReader(instream, StandardCharsets.UTF_8);
|
65
|
+
reader = new BufferedReader(reader);
|
55
66
|
|
56
|
-
|
57
|
-
|
67
|
+
Writer output = new OutputStreamWriter(outstream, StandardCharsets.UTF_8);
|
68
|
+
output = new BufferedWriter(output);
|
58
69
|
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
70
|
+
char[] input = new char[8];
|
71
|
+
int length = 0;
|
72
|
+
int character;
|
73
|
+
while ((character = reader.read()) != -1) {
|
74
|
+
char ch = (char) character;
|
75
|
+
if (Character.isWhitespace(ch)) {
|
76
|
+
stemmer.setCurrent(input, length);
|
77
|
+
stemmer.stem();
|
78
|
+
output.write(stemmer.getCurrentBuffer(), 0, stemmer.getCurrentBufferLength());
|
79
|
+
output.write('\n');
|
80
|
+
length = 0;
|
81
|
+
} else {
|
82
|
+
if (length == input.length) {
|
83
|
+
input = Arrays.copyOf(input, length + 1);
|
84
|
+
}
|
85
|
+
input[length++] = ch < 127 ? Character.toLowerCase(ch) : ch;
|
86
|
+
}
|
87
|
+
}
|
88
|
+
output.close();
|
74
89
|
}
|
75
90
|
}
|
@@ -1,5 +1,18 @@
|
|
1
|
+
// @ts-check
|
2
|
+
|
1
3
|
/**@constructor*/
|
2
|
-
BaseStemmer = function() {
|
4
|
+
const BaseStemmer = function() {
|
5
|
+
/** @protected */
|
6
|
+
this.current = '';
|
7
|
+
this.cursor = 0;
|
8
|
+
this.limit = 0;
|
9
|
+
this.limit_backward = 0;
|
10
|
+
this.bra = 0;
|
11
|
+
this.ket = 0;
|
12
|
+
|
13
|
+
/**
|
14
|
+
* @param {string} value
|
15
|
+
*/
|
3
16
|
this.setCurrent = function(value) {
|
4
17
|
this.current = value;
|
5
18
|
this.cursor = 0;
|
@@ -9,11 +22,18 @@ BaseStemmer = function() {
|
|
9
22
|
this.ket = this.limit;
|
10
23
|
};
|
11
24
|
|
25
|
+
/**
|
26
|
+
* @return {string}
|
27
|
+
*/
|
12
28
|
this.getCurrent = function() {
|
13
29
|
return this.current;
|
14
30
|
};
|
15
31
|
|
32
|
+
/**
|
33
|
+
* @param {BaseStemmer} other
|
34
|
+
*/
|
16
35
|
this.copy_from = function(other) {
|
36
|
+
/** @protected */
|
17
37
|
this.current = other.current;
|
18
38
|
this.cursor = other.cursor;
|
19
39
|
this.limit = other.limit;
|
@@ -22,7 +42,14 @@ BaseStemmer = function() {
|
|
22
42
|
this.ket = other.ket;
|
23
43
|
};
|
24
44
|
|
45
|
+
/**
|
46
|
+
* @param {number[]} s
|
47
|
+
* @param {number} min
|
48
|
+
* @param {number} max
|
49
|
+
* @return {boolean}
|
50
|
+
*/
|
25
51
|
this.in_grouping = function(s, min, max) {
|
52
|
+
/** @protected */
|
26
53
|
if (this.cursor >= this.limit) return false;
|
27
54
|
var ch = this.current.charCodeAt(this.cursor);
|
28
55
|
if (ch > max || ch < min) return false;
|
@@ -32,7 +59,34 @@ BaseStemmer = function() {
|
|
32
59
|
return true;
|
33
60
|
};
|
34
61
|
|
62
|
+
/**
|
63
|
+
* @param {number[]} s
|
64
|
+
* @param {number} min
|
65
|
+
* @param {number} max
|
66
|
+
* @return {boolean}
|
67
|
+
*/
|
68
|
+
this.go_in_grouping = function(s, min, max) {
|
69
|
+
/** @protected */
|
70
|
+
while (this.cursor < this.limit) {
|
71
|
+
var ch = this.current.charCodeAt(this.cursor);
|
72
|
+
if (ch > max || ch < min)
|
73
|
+
return true;
|
74
|
+
ch -= min;
|
75
|
+
if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) == 0)
|
76
|
+
return true;
|
77
|
+
this.cursor++;
|
78
|
+
}
|
79
|
+
return false;
|
80
|
+
};
|
81
|
+
|
82
|
+
/**
|
83
|
+
* @param {number[]} s
|
84
|
+
* @param {number} min
|
85
|
+
* @param {number} max
|
86
|
+
* @return {boolean}
|
87
|
+
*/
|
35
88
|
this.in_grouping_b = function(s, min, max) {
|
89
|
+
/** @protected */
|
36
90
|
if (this.cursor <= this.limit_backward) return false;
|
37
91
|
var ch = this.current.charCodeAt(this.cursor - 1);
|
38
92
|
if (ch > max || ch < min) return false;
|
@@ -42,7 +96,32 @@ BaseStemmer = function() {
|
|
42
96
|
return true;
|
43
97
|
};
|
44
98
|
|
99
|
+
/**
|
100
|
+
* @param {number[]} s
|
101
|
+
* @param {number} min
|
102
|
+
* @param {number} max
|
103
|
+
* @return {boolean}
|
104
|
+
*/
|
105
|
+
this.go_in_grouping_b = function(s, min, max) {
|
106
|
+
/** @protected */
|
107
|
+
while (this.cursor > this.limit_backward) {
|
108
|
+
var ch = this.current.charCodeAt(this.cursor - 1);
|
109
|
+
if (ch > max || ch < min) return true;
|
110
|
+
ch -= min;
|
111
|
+
if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) == 0) return true;
|
112
|
+
this.cursor--;
|
113
|
+
}
|
114
|
+
return false;
|
115
|
+
};
|
116
|
+
|
117
|
+
/**
|
118
|
+
* @param {number[]} s
|
119
|
+
* @param {number} min
|
120
|
+
* @param {number} max
|
121
|
+
* @return {boolean}
|
122
|
+
*/
|
45
123
|
this.out_grouping = function(s, min, max) {
|
124
|
+
/** @protected */
|
46
125
|
if (this.cursor >= this.limit) return false;
|
47
126
|
var ch = this.current.charCodeAt(this.cursor);
|
48
127
|
if (ch > max || ch < min) {
|
@@ -57,7 +136,35 @@ BaseStemmer = function() {
|
|
57
136
|
return false;
|
58
137
|
};
|
59
138
|
|
139
|
+
/**
|
140
|
+
* @param {number[]} s
|
141
|
+
* @param {number} min
|
142
|
+
* @param {number} max
|
143
|
+
* @return {boolean}
|
144
|
+
*/
|
145
|
+
this.go_out_grouping = function(s, min, max) {
|
146
|
+
/** @protected */
|
147
|
+
while (this.cursor < this.limit) {
|
148
|
+
var ch = this.current.charCodeAt(this.cursor);
|
149
|
+
if (ch <= max && ch >= min) {
|
150
|
+
ch -= min;
|
151
|
+
if ((s[ch >>> 3] & (0X1 << (ch & 0x7))) != 0) {
|
152
|
+
return true;
|
153
|
+
}
|
154
|
+
}
|
155
|
+
this.cursor++;
|
156
|
+
}
|
157
|
+
return false;
|
158
|
+
};
|
159
|
+
|
160
|
+
/**
|
161
|
+
* @param {number[]} s
|
162
|
+
* @param {number} min
|
163
|
+
* @param {number} max
|
164
|
+
* @return {boolean}
|
165
|
+
*/
|
60
166
|
this.out_grouping_b = function(s, min, max) {
|
167
|
+
/** @protected */
|
61
168
|
if (this.cursor <= this.limit_backward) return false;
|
62
169
|
var ch = this.current.charCodeAt(this.cursor - 1);
|
63
170
|
if (ch > max || ch < min) {
|
@@ -72,8 +179,34 @@ BaseStemmer = function() {
|
|
72
179
|
return false;
|
73
180
|
};
|
74
181
|
|
182
|
+
/**
|
183
|
+
* @param {number[]} s
|
184
|
+
* @param {number} min
|
185
|
+
* @param {number} max
|
186
|
+
* @return {boolean}
|
187
|
+
*/
|
188
|
+
this.go_out_grouping_b = function(s, min, max) {
|
189
|
+
/** @protected */
|
190
|
+
while (this.cursor > this.limit_backward) {
|
191
|
+
var ch = this.current.charCodeAt(this.cursor - 1);
|
192
|
+
if (ch <= max && ch >= min) {
|
193
|
+
ch -= min;
|
194
|
+
if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) != 0) {
|
195
|
+
return true;
|
196
|
+
}
|
197
|
+
}
|
198
|
+
this.cursor--;
|
199
|
+
}
|
200
|
+
return false;
|
201
|
+
};
|
202
|
+
|
203
|
+
/**
|
204
|
+
* @param {string} s
|
205
|
+
* @return {boolean}
|
206
|
+
*/
|
75
207
|
this.eq_s = function(s)
|
76
208
|
{
|
209
|
+
/** @protected */
|
77
210
|
if (this.limit - this.cursor < s.length) return false;
|
78
211
|
if (this.current.slice(this.cursor, this.cursor + s.length) != s)
|
79
212
|
{
|
@@ -83,8 +216,13 @@ BaseStemmer = function() {
|
|
83
216
|
return true;
|
84
217
|
};
|
85
218
|
|
219
|
+
/**
|
220
|
+
* @param {string} s
|
221
|
+
* @return {boolean}
|
222
|
+
*/
|
86
223
|
this.eq_s_b = function(s)
|
87
224
|
{
|
225
|
+
/** @protected */
|
88
226
|
if (this.cursor - this.limit_backward < s.length) return false;
|
89
227
|
if (this.current.slice(this.cursor - s.length, this.cursor) != s)
|
90
228
|
{
|
@@ -94,8 +232,13 @@ BaseStemmer = function() {
|
|
94
232
|
return true;
|
95
233
|
};
|
96
234
|
|
97
|
-
/**
|
235
|
+
/**
|
236
|
+
* @param {Among[]} v
|
237
|
+
* @return {number}
|
238
|
+
*/
|
239
|
+
this.find_among = function(v)
|
98
240
|
{
|
241
|
+
/** @protected */
|
99
242
|
var i = 0;
|
100
243
|
var j = v.length;
|
101
244
|
|
@@ -165,8 +308,13 @@ BaseStemmer = function() {
|
|
165
308
|
};
|
166
309
|
|
167
310
|
// find_among_b is for backwards processing. Same comments apply
|
311
|
+
/**
|
312
|
+
* @param {Among[]} v
|
313
|
+
* @return {number}
|
314
|
+
*/
|
168
315
|
this.find_among_b = function(v)
|
169
316
|
{
|
317
|
+
/** @protected */
|
170
318
|
var i = 0;
|
171
319
|
var j = v.length
|
172
320
|
|
@@ -232,8 +380,15 @@ BaseStemmer = function() {
|
|
232
380
|
/* to replace chars between c_bra and c_ket in this.current by the
|
233
381
|
* chars in s.
|
234
382
|
*/
|
383
|
+
/**
|
384
|
+
* @param {number} c_bra
|
385
|
+
* @param {number} c_ket
|
386
|
+
* @param {string} s
|
387
|
+
* @return {number}
|
388
|
+
*/
|
235
389
|
this.replace_s = function(c_bra, c_ket, s)
|
236
390
|
{
|
391
|
+
/** @protected */
|
237
392
|
var adjustment = s.length - (c_ket - c_bra);
|
238
393
|
this.current = this.current.slice(0, c_bra) + s + this.current.slice(c_ket);
|
239
394
|
this.limit += adjustment;
|
@@ -242,8 +397,12 @@ BaseStemmer = function() {
|
|
242
397
|
return adjustment;
|
243
398
|
};
|
244
399
|
|
400
|
+
/**
|
401
|
+
* @return {boolean}
|
402
|
+
*/
|
245
403
|
this.slice_check = function()
|
246
404
|
{
|
405
|
+
/** @protected */
|
247
406
|
if (this.bra < 0 ||
|
248
407
|
this.bra > this.ket ||
|
249
408
|
this.ket > this.limit ||
|
@@ -254,8 +413,13 @@ BaseStemmer = function() {
|
|
254
413
|
return true;
|
255
414
|
};
|
256
415
|
|
416
|
+
/**
|
417
|
+
* @param {string} s
|
418
|
+
* @return {boolean}
|
419
|
+
*/
|
257
420
|
this.slice_from = function(s)
|
258
421
|
{
|
422
|
+
/** @protected */
|
259
423
|
var result = false;
|
260
424
|
if (this.slice_check())
|
261
425
|
{
|
@@ -265,20 +429,34 @@ BaseStemmer = function() {
|
|
265
429
|
return result;
|
266
430
|
};
|
267
431
|
|
432
|
+
/**
|
433
|
+
* @return {boolean}
|
434
|
+
*/
|
268
435
|
this.slice_del = function()
|
269
436
|
{
|
437
|
+
/** @protected */
|
270
438
|
return this.slice_from("");
|
271
439
|
};
|
272
440
|
|
441
|
+
/**
|
442
|
+
* @param {number} c_bra
|
443
|
+
* @param {number} c_ket
|
444
|
+
* @param {string} s
|
445
|
+
*/
|
273
446
|
this.insert = function(c_bra, c_ket, s)
|
274
447
|
{
|
448
|
+
/** @protected */
|
275
449
|
var adjustment = this.replace_s(c_bra, c_ket, s);
|
276
450
|
if (c_bra <= this.bra) this.bra += adjustment;
|
277
451
|
if (c_bra <= this.ket) this.ket += adjustment;
|
278
452
|
};
|
279
453
|
|
454
|
+
/**
|
455
|
+
* @return {string}
|
456
|
+
*/
|
280
457
|
this.slice_to = function()
|
281
458
|
{
|
459
|
+
/** @protected */
|
282
460
|
var result = '';
|
283
461
|
if (this.slice_check())
|
284
462
|
{
|
@@ -287,8 +465,14 @@ BaseStemmer = function() {
|
|
287
465
|
return result;
|
288
466
|
};
|
289
467
|
|
468
|
+
/**
|
469
|
+
* @return {string}
|
470
|
+
*/
|
290
471
|
this.assign_to = function()
|
291
472
|
{
|
473
|
+
/** @protected */
|
292
474
|
return this.current.slice(0, this.limit);
|
293
475
|
};
|
294
476
|
};
|
477
|
+
|
478
|
+
if (typeof module === 'object' && module.exports) module.exports = BaseStemmer;
|
@@ -1,5 +1,3 @@
|
|
1
|
-
const stemmer = require('base-stemmer.js');
|
2
|
-
|
3
1
|
const fs = require('fs');
|
4
2
|
const readline = require('readline');
|
5
3
|
|
@@ -81,7 +79,7 @@ else
|
|
81
79
|
function stemming (lang, input, output, encoding) {
|
82
80
|
const lines = readline.createInterface({
|
83
81
|
input: fs.createReadStream(input, encoding),
|
84
|
-
|
82
|
+
terminal: false
|
85
83
|
});
|
86
84
|
var out = fs.createWriteStream(output, encoding);
|
87
85
|
var stemmer = create(lang);
|
@@ -93,10 +91,9 @@ function stemming (lang, input, output, encoding) {
|
|
93
91
|
function create (name) {
|
94
92
|
var lc_name = name.toLowerCase();
|
95
93
|
if (!lc_name.match('\\W') && lc_name != 'base') {
|
96
|
-
var algo = lc_name.substr(0, 1).toUpperCase() + lc_name.substr(1);
|
97
94
|
try {
|
98
|
-
const
|
99
|
-
return
|
95
|
+
const Stemmer = require(lc_name + '-stemmer.js');
|
96
|
+
return new Stemmer();
|
100
97
|
} catch (error) {
|
101
98
|
}
|
102
99
|
}
|
@@ -45,7 +45,7 @@ sb_stemmer_new(const char * algorithm, const char * charenc)
|
|
45
45
|
if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break;
|
46
46
|
}
|
47
47
|
if (module->name == NULL) return NULL;
|
48
|
-
|
48
|
+
|
49
49
|
stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
|
50
50
|
if (stemmer == NULL) return NULL;
|
51
51
|
|
@@ -73,16 +73,16 @@ EOS
|
|
73
73
|
my @algorithms = sort keys(%algorithms);
|
74
74
|
print OUT "\nlibstemmer_algorithms =";
|
75
75
|
foreach $lang (@algorithms) {
|
76
|
-
|
76
|
+
print OUT "\\\n ", $lang;
|
77
77
|
}
|
78
78
|
print OUT "\n";
|
79
79
|
|
80
80
|
for my $enc (qw(ISO_8859_1 ISO_8859_2 KOI8_R)) {
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
81
|
+
print OUT "\n${enc}_algorithms =";
|
82
|
+
foreach $lang (@algorithms) {
|
83
|
+
print OUT "\\\n ", $lang if exists $algorithm_encs{$lang}->{$enc};
|
84
|
+
}
|
85
|
+
print OUT "\n";
|
86
86
|
}
|
87
87
|
}
|
88
88
|
|
@@ -35,7 +35,7 @@ sub addalgenc($$) {
|
|
35
35
|
my $norm_enc = lc $enc;
|
36
36
|
$norm_enc =~ s/_//g;
|
37
37
|
if ($norm_enc ne $enc_only) {
|
38
|
-
|
38
|
+
return;
|
39
39
|
}
|
40
40
|
}
|
41
41
|
|
@@ -146,7 +146,7 @@ EOS
|
|
146
146
|
|
147
147
|
struct stemmer_modules {
|
148
148
|
const char * name;
|
149
|
-
stemmer_encoding_t enc;
|
149
|
+
stemmer_encoding_t enc;
|
150
150
|
struct SN_env * (*create)(void);
|
151
151
|
void (*close)(struct SN_env *);
|
152
152
|
int (*stem)(struct SN_env *);
|
@@ -14,8 +14,10 @@ armenian UTF_8 armenian,hy,hye,arm
|
|
14
14
|
basque UTF_8,ISO_8859_1 basque,eu,eus,baq
|
15
15
|
catalan UTF_8,ISO_8859_1 catalan,ca,cat
|
16
16
|
danish UTF_8,ISO_8859_1 danish,da,dan
|
17
|
-
dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld
|
17
|
+
dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld,kraaij_pohlmann
|
18
18
|
english UTF_8,ISO_8859_1 english,en,eng
|
19
|
+
esperanto UTF_8 esperanto,eo,epo
|
20
|
+
estonian UTF_8 estonian,et,est
|
19
21
|
finnish UTF_8,ISO_8859_1 finnish,fi,fin
|
20
22
|
french UTF_8,ISO_8859_1 french,fr,fre,fra
|
21
23
|
german UTF_8,ISO_8859_1 german,de,ger,deu
|
@@ -29,7 +31,7 @@ lithuanian UTF_8 lithuanian,lt,lit
|
|
29
31
|
nepali UTF_8 nepali,ne,nep
|
30
32
|
norwegian UTF_8,ISO_8859_1 norwegian,no,nor
|
31
33
|
portuguese UTF_8,ISO_8859_1 portuguese,pt,por
|
32
|
-
romanian UTF_8
|
34
|
+
romanian UTF_8 romanian,ro,rum,ron
|
33
35
|
russian UTF_8,KOI8_R russian,ru,rus
|
34
36
|
serbian UTF_8 serbian,sr,srp
|
35
37
|
spanish UTF_8,ISO_8859_1 spanish,es,esl,spa
|
@@ -42,7 +44,14 @@ yiddish UTF_8 yiddish,yi,yid
|
|
42
44
|
# The porter algorithm is included in the libstemmer distribution to assist
|
43
45
|
# with backwards compatibility, but for new systems the english algorithm
|
44
46
|
# should be used in preference.
|
45
|
-
porter UTF_8,ISO_8859_1 porter
|
47
|
+
porter UTF_8,ISO_8859_1 porter english
|
48
|
+
|
49
|
+
# This is Martin Porter's Dutch stemmer. It was the default Dutch stemming
|
50
|
+
# in Snowball 2.2.0 and earlier, but after user feedback and careful evaluation
|
51
|
+
# we concluded that the Kraaij-Pohlmann Dutch stemmer was a better default.
|
52
|
+
# We still provide this to help people who have a lot of existing data indexed
|
53
|
+
# using it.
|
54
|
+
dutch_porter UTF_8,ISO_8859_1 dutch_porter dutch
|
46
55
|
|
47
56
|
# Some other stemmers in the snowball project are not included in the standard
|
48
57
|
# distribution. To compile a libstemmer with them in, add them to this list,
|
@@ -51,13 +60,7 @@ porter UTF_8,ISO_8859_1 porter english
|
|
51
60
|
# intended for general use, and use of them is not fully supported. These
|
52
61
|
# algorithms are:
|
53
62
|
#
|
54
|
-
# german2 - This is a slight modification of the german stemmer.
|
55
|
-
#german2 UTF_8,ISO_8859_1 german2 german
|
56
|
-
#
|
57
|
-
# kraaij_pohlmann - This is a different dutch stemmer.
|
58
|
-
#kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann dutch
|
59
|
-
#
|
60
63
|
# lovins - This is an english stemmer, but fairly outdated, and
|
61
64
|
# only really applicable to a restricted type of input text
|
62
65
|
# (keywords in academic publications).
|
63
|
-
#lovins UTF_8,ISO_8859_1 lovins
|
66
|
+
#lovins UTF_8,ISO_8859_1 lovins english
|
@@ -13,7 +13,7 @@ int main () {
|
|
13
13
|
struct sb_stemmer * s;
|
14
14
|
const char ** list = sb_stemmer_list();
|
15
15
|
if (*list == 0) error("TEST FAIL: empty list of stemmers");
|
16
|
-
|
16
|
+
|
17
17
|
s = sb_stemmer_new("e");
|
18
18
|
if (s != 0) error("TEST FAIL: non zero return for unrecognised language");
|
19
19
|
s = sb_stemmer_new("english");
|