mittens 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Gemfile +7 -0
- data/LICENSE.txt +30 -0
- data/README.md +62 -0
- data/Rakefile +21 -0
- data/ext/mittens/ext.c +96 -0
- data/ext/mittens/extconf.rb +12 -0
- data/lib/mittens/version.rb +3 -0
- data/lib/mittens.rb +7 -0
- data/mittens.gemspec +22 -0
- data/vendor/snowball/.gitignore +26 -0
- data/vendor/snowball/.travis.yml +112 -0
- data/vendor/snowball/AUTHORS +27 -0
- data/vendor/snowball/CONTRIBUTING.rst +216 -0
- data/vendor/snowball/COPYING +29 -0
- data/vendor/snowball/GNUmakefile +742 -0
- data/vendor/snowball/NEWS +754 -0
- data/vendor/snowball/README.rst +37 -0
- data/vendor/snowball/ada/README.md +74 -0
- data/vendor/snowball/ada/generate/generate.adb +83 -0
- data/vendor/snowball/ada/generate.gpr +21 -0
- data/vendor/snowball/ada/src/stemmer.adb +620 -0
- data/vendor/snowball/ada/src/stemmer.ads +219 -0
- data/vendor/snowball/ada/src/stemwords.adb +70 -0
- data/vendor/snowball/ada/stemmer_config.gpr +83 -0
- data/vendor/snowball/ada/stemwords.gpr +21 -0
- data/vendor/snowball/algorithms/arabic.sbl +558 -0
- data/vendor/snowball/algorithms/armenian.sbl +301 -0
- data/vendor/snowball/algorithms/basque.sbl +149 -0
- data/vendor/snowball/algorithms/catalan.sbl +202 -0
- data/vendor/snowball/algorithms/danish.sbl +93 -0
- data/vendor/snowball/algorithms/dutch.sbl +164 -0
- data/vendor/snowball/algorithms/english.sbl +229 -0
- data/vendor/snowball/algorithms/finnish.sbl +197 -0
- data/vendor/snowball/algorithms/french.sbl +254 -0
- data/vendor/snowball/algorithms/german.sbl +139 -0
- data/vendor/snowball/algorithms/german2.sbl +145 -0
- data/vendor/snowball/algorithms/greek.sbl +701 -0
- data/vendor/snowball/algorithms/hindi.sbl +323 -0
- data/vendor/snowball/algorithms/hungarian.sbl +241 -0
- data/vendor/snowball/algorithms/indonesian.sbl +192 -0
- data/vendor/snowball/algorithms/irish.sbl +149 -0
- data/vendor/snowball/algorithms/italian.sbl +202 -0
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
- data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
- data/vendor/snowball/algorithms/lovins.sbl +208 -0
- data/vendor/snowball/algorithms/nepali.sbl +92 -0
- data/vendor/snowball/algorithms/norwegian.sbl +80 -0
- data/vendor/snowball/algorithms/porter.sbl +139 -0
- data/vendor/snowball/algorithms/portuguese.sbl +218 -0
- data/vendor/snowball/algorithms/romanian.sbl +236 -0
- data/vendor/snowball/algorithms/russian.sbl +221 -0
- data/vendor/snowball/algorithms/serbian.sbl +2379 -0
- data/vendor/snowball/algorithms/spanish.sbl +230 -0
- data/vendor/snowball/algorithms/swedish.sbl +72 -0
- data/vendor/snowball/algorithms/tamil.sbl +405 -0
- data/vendor/snowball/algorithms/turkish.sbl +470 -0
- data/vendor/snowball/algorithms/yiddish.sbl +460 -0
- data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
- data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
- data/vendor/snowball/charsets/cp850.sbl +130 -0
- data/vendor/snowball/compiler/analyser.c +1547 -0
- data/vendor/snowball/compiler/driver.c +615 -0
- data/vendor/snowball/compiler/generator.c +1748 -0
- data/vendor/snowball/compiler/generator_ada.c +1702 -0
- data/vendor/snowball/compiler/generator_csharp.c +1322 -0
- data/vendor/snowball/compiler/generator_go.c +1278 -0
- data/vendor/snowball/compiler/generator_java.c +1313 -0
- data/vendor/snowball/compiler/generator_js.c +1316 -0
- data/vendor/snowball/compiler/generator_pascal.c +1387 -0
- data/vendor/snowball/compiler/generator_python.c +1337 -0
- data/vendor/snowball/compiler/generator_rust.c +1295 -0
- data/vendor/snowball/compiler/header.h +418 -0
- data/vendor/snowball/compiler/space.c +286 -0
- data/vendor/snowball/compiler/syswords.h +86 -0
- data/vendor/snowball/compiler/syswords2.h +13 -0
- data/vendor/snowball/compiler/tokeniser.c +567 -0
- data/vendor/snowball/csharp/.gitignore +8 -0
- data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
- data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
- data/vendor/snowball/csharp/Stemwords/App.config +6 -0
- data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
- data/vendor/snowball/doc/TODO +12 -0
- data/vendor/snowball/doc/libstemmer_c_README +148 -0
- data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
- data/vendor/snowball/doc/libstemmer_java_README +67 -0
- data/vendor/snowball/doc/libstemmer_js_README +48 -0
- data/vendor/snowball/doc/libstemmer_python_README +113 -0
- data/vendor/snowball/examples/stemwords.c +204 -0
- data/vendor/snowball/go/README.md +55 -0
- data/vendor/snowball/go/among.go +16 -0
- data/vendor/snowball/go/env.go +403 -0
- data/vendor/snowball/go/stemwords/generate.go +68 -0
- data/vendor/snowball/go/stemwords/main.go +68 -0
- data/vendor/snowball/go/util.go +34 -0
- data/vendor/snowball/iconv.py +50 -0
- data/vendor/snowball/include/libstemmer.h +78 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
- data/vendor/snowball/javascript/base-stemmer.js +294 -0
- data/vendor/snowball/javascript/stemwords.js +106 -0
- data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
- data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
- data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
- data/vendor/snowball/libstemmer/modules.txt +63 -0
- data/vendor/snowball/libstemmer/test.c +34 -0
- data/vendor/snowball/pascal/.gitignore +4 -0
- data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
- data/vendor/snowball/pascal/generate.pl +23 -0
- data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
- data/vendor/snowball/python/MANIFEST.in +7 -0
- data/vendor/snowball/python/create_init.py +54 -0
- data/vendor/snowball/python/setup.cfg +6 -0
- data/vendor/snowball/python/setup.py +81 -0
- data/vendor/snowball/python/snowballstemmer/among.py +13 -0
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
- data/vendor/snowball/python/stemwords.py +101 -0
- data/vendor/snowball/python/testapp.py +28 -0
- data/vendor/snowball/runtime/api.c +58 -0
- data/vendor/snowball/runtime/api.h +32 -0
- data/vendor/snowball/runtime/header.h +61 -0
- data/vendor/snowball/runtime/utilities.c +513 -0
- data/vendor/snowball/rust/Cargo.toml +7 -0
- data/vendor/snowball/rust/build.rs +55 -0
- data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
- data/vendor/snowball/rust/src/main.rs +102 -0
- data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
- data/vendor/snowball/rust/src/snowball/among.rs +6 -0
- data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
- data/vendor/snowball/tests/stemtest.c +95 -0
- metadata +178 -0
@@ -0,0 +1,75 @@
|
|
1
|
+
|
2
|
+
package org.tartarus.snowball;
|
3
|
+
|
4
|
+
import java.lang.reflect.Method;
|
5
|
+
import java.io.BufferedReader;
|
6
|
+
import java.io.BufferedWriter;
|
7
|
+
import java.io.FileInputStream;
|
8
|
+
import java.io.FileOutputStream;
|
9
|
+
import java.io.InputStream;
|
10
|
+
import java.io.InputStreamReader;
|
11
|
+
import java.io.OutputStream;
|
12
|
+
import java.io.OutputStreamWriter;
|
13
|
+
import java.io.Reader;
|
14
|
+
import java.io.Writer;
|
15
|
+
import java.nio.charset.StandardCharsets;
|
16
|
+
|
17
|
+
public class TestApp {
|
18
|
+
private static void usage()
|
19
|
+
{
|
20
|
+
System.err.println("Usage: TestApp <algorithm> [<input file>] [-o <output file>]");
|
21
|
+
}
|
22
|
+
|
23
|
+
public static void main(String [] args) throws Throwable {
|
24
|
+
if (args.length < 2) {
|
25
|
+
usage();
|
26
|
+
return;
|
27
|
+
}
|
28
|
+
|
29
|
+
Class stemClass = Class.forName("org.tartarus.snowball.ext." +
|
30
|
+
args[0] + "Stemmer");
|
31
|
+
SnowballStemmer stemmer = (SnowballStemmer) stemClass.newInstance();
|
32
|
+
|
33
|
+
int arg = 1;
|
34
|
+
|
35
|
+
InputStream instream;
|
36
|
+
if (args.length > arg && !args[arg].equals("-o")) {
|
37
|
+
instream = new FileInputStream(args[arg++]);
|
38
|
+
} else {
|
39
|
+
instream = System.in;
|
40
|
+
}
|
41
|
+
|
42
|
+
OutputStream outstream;
|
43
|
+
if (args.length > arg) {
|
44
|
+
if (args.length != arg + 2 || !args[arg].equals("-o")) {
|
45
|
+
usage();
|
46
|
+
return;
|
47
|
+
}
|
48
|
+
outstream = new FileOutputStream(args[arg + 1]);
|
49
|
+
} else {
|
50
|
+
outstream = System.out;
|
51
|
+
}
|
52
|
+
|
53
|
+
Reader reader = new InputStreamReader(instream, StandardCharsets.UTF_8);
|
54
|
+
reader = new BufferedReader(reader);
|
55
|
+
|
56
|
+
Writer output = new OutputStreamWriter(outstream, StandardCharsets.UTF_8);
|
57
|
+
output = new BufferedWriter(output);
|
58
|
+
|
59
|
+
StringBuffer input = new StringBuffer();
|
60
|
+
int character;
|
61
|
+
while ((character = reader.read()) != -1) {
|
62
|
+
char ch = (char) character;
|
63
|
+
if (Character.isWhitespace(ch)) {
|
64
|
+
stemmer.setCurrent(input.toString());
|
65
|
+
stemmer.stem();
|
66
|
+
output.write(stemmer.getCurrent());
|
67
|
+
output.write('\n');
|
68
|
+
input.delete(0, input.length());
|
69
|
+
} else {
|
70
|
+
input.append(ch < 127 ? Character.toLowerCase(ch) : ch);
|
71
|
+
}
|
72
|
+
}
|
73
|
+
output.flush();
|
74
|
+
}
|
75
|
+
}
|
@@ -0,0 +1,294 @@
|
|
1
|
+
/**@constructor*/
|
2
|
+
BaseStemmer = function() {
|
3
|
+
this.setCurrent = function(value) {
|
4
|
+
this.current = value;
|
5
|
+
this.cursor = 0;
|
6
|
+
this.limit = this.current.length;
|
7
|
+
this.limit_backward = 0;
|
8
|
+
this.bra = this.cursor;
|
9
|
+
this.ket = this.limit;
|
10
|
+
};
|
11
|
+
|
12
|
+
this.getCurrent = function() {
|
13
|
+
return this.current;
|
14
|
+
};
|
15
|
+
|
16
|
+
this.copy_from = function(other) {
|
17
|
+
this.current = other.current;
|
18
|
+
this.cursor = other.cursor;
|
19
|
+
this.limit = other.limit;
|
20
|
+
this.limit_backward = other.limit_backward;
|
21
|
+
this.bra = other.bra;
|
22
|
+
this.ket = other.ket;
|
23
|
+
};
|
24
|
+
|
25
|
+
this.in_grouping = function(s, min, max) {
|
26
|
+
if (this.cursor >= this.limit) return false;
|
27
|
+
var ch = this.current.charCodeAt(this.cursor);
|
28
|
+
if (ch > max || ch < min) return false;
|
29
|
+
ch -= min;
|
30
|
+
if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) == 0) return false;
|
31
|
+
this.cursor++;
|
32
|
+
return true;
|
33
|
+
};
|
34
|
+
|
35
|
+
this.in_grouping_b = function(s, min, max) {
|
36
|
+
if (this.cursor <= this.limit_backward) return false;
|
37
|
+
var ch = this.current.charCodeAt(this.cursor - 1);
|
38
|
+
if (ch > max || ch < min) return false;
|
39
|
+
ch -= min;
|
40
|
+
if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) == 0) return false;
|
41
|
+
this.cursor--;
|
42
|
+
return true;
|
43
|
+
};
|
44
|
+
|
45
|
+
this.out_grouping = function(s, min, max) {
|
46
|
+
if (this.cursor >= this.limit) return false;
|
47
|
+
var ch = this.current.charCodeAt(this.cursor);
|
48
|
+
if (ch > max || ch < min) {
|
49
|
+
this.cursor++;
|
50
|
+
return true;
|
51
|
+
}
|
52
|
+
ch -= min;
|
53
|
+
if ((s[ch >>> 3] & (0X1 << (ch & 0x7))) == 0) {
|
54
|
+
this.cursor++;
|
55
|
+
return true;
|
56
|
+
}
|
57
|
+
return false;
|
58
|
+
};
|
59
|
+
|
60
|
+
this.out_grouping_b = function(s, min, max) {
|
61
|
+
if (this.cursor <= this.limit_backward) return false;
|
62
|
+
var ch = this.current.charCodeAt(this.cursor - 1);
|
63
|
+
if (ch > max || ch < min) {
|
64
|
+
this.cursor--;
|
65
|
+
return true;
|
66
|
+
}
|
67
|
+
ch -= min;
|
68
|
+
if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) == 0) {
|
69
|
+
this.cursor--;
|
70
|
+
return true;
|
71
|
+
}
|
72
|
+
return false;
|
73
|
+
};
|
74
|
+
|
75
|
+
this.eq_s = function(s)
|
76
|
+
{
|
77
|
+
if (this.limit - this.cursor < s.length) return false;
|
78
|
+
if (this.current.slice(this.cursor, this.cursor + s.length) != s)
|
79
|
+
{
|
80
|
+
return false;
|
81
|
+
}
|
82
|
+
this.cursor += s.length;
|
83
|
+
return true;
|
84
|
+
};
|
85
|
+
|
86
|
+
this.eq_s_b = function(s)
|
87
|
+
{
|
88
|
+
if (this.cursor - this.limit_backward < s.length) return false;
|
89
|
+
if (this.current.slice(this.cursor - s.length, this.cursor) != s)
|
90
|
+
{
|
91
|
+
return false;
|
92
|
+
}
|
93
|
+
this.cursor -= s.length;
|
94
|
+
return true;
|
95
|
+
};
|
96
|
+
|
97
|
+
/** @return {number} */ this.find_among = function(v)
|
98
|
+
{
|
99
|
+
var i = 0;
|
100
|
+
var j = v.length;
|
101
|
+
|
102
|
+
var c = this.cursor;
|
103
|
+
var l = this.limit;
|
104
|
+
|
105
|
+
var common_i = 0;
|
106
|
+
var common_j = 0;
|
107
|
+
|
108
|
+
var first_key_inspected = false;
|
109
|
+
|
110
|
+
while (true)
|
111
|
+
{
|
112
|
+
var k = i + ((j - i) >>> 1);
|
113
|
+
var diff = 0;
|
114
|
+
var common = common_i < common_j ? common_i : common_j; // smaller
|
115
|
+
// w[0]: string, w[1]: substring_i, w[2]: result, w[3]: function (optional)
|
116
|
+
var w = v[k];
|
117
|
+
var i2;
|
118
|
+
for (i2 = common; i2 < w[0].length; i2++)
|
119
|
+
{
|
120
|
+
if (c + common == l)
|
121
|
+
{
|
122
|
+
diff = -1;
|
123
|
+
break;
|
124
|
+
}
|
125
|
+
diff = this.current.charCodeAt(c + common) - w[0].charCodeAt(i2);
|
126
|
+
if (diff != 0) break;
|
127
|
+
common++;
|
128
|
+
}
|
129
|
+
if (diff < 0)
|
130
|
+
{
|
131
|
+
j = k;
|
132
|
+
common_j = common;
|
133
|
+
}
|
134
|
+
else
|
135
|
+
{
|
136
|
+
i = k;
|
137
|
+
common_i = common;
|
138
|
+
}
|
139
|
+
if (j - i <= 1)
|
140
|
+
{
|
141
|
+
if (i > 0) break; // v->s has been inspected
|
142
|
+
if (j == i) break; // only one item in v
|
143
|
+
|
144
|
+
// - but now we need to go round once more to get
|
145
|
+
// v->s inspected. This looks messy, but is actually
|
146
|
+
// the optimal approach.
|
147
|
+
|
148
|
+
if (first_key_inspected) break;
|
149
|
+
first_key_inspected = true;
|
150
|
+
}
|
151
|
+
}
|
152
|
+
do {
|
153
|
+
var w = v[i];
|
154
|
+
if (common_i >= w[0].length)
|
155
|
+
{
|
156
|
+
this.cursor = c + w[0].length;
|
157
|
+
if (w.length < 4) return w[2];
|
158
|
+
var res = w[3](this);
|
159
|
+
this.cursor = c + w[0].length;
|
160
|
+
if (res) return w[2];
|
161
|
+
}
|
162
|
+
i = w[1];
|
163
|
+
} while (i >= 0);
|
164
|
+
return 0;
|
165
|
+
};
|
166
|
+
|
167
|
+
// find_among_b is for backwards processing. Same comments apply
|
168
|
+
this.find_among_b = function(v)
|
169
|
+
{
|
170
|
+
var i = 0;
|
171
|
+
var j = v.length
|
172
|
+
|
173
|
+
var c = this.cursor;
|
174
|
+
var lb = this.limit_backward;
|
175
|
+
|
176
|
+
var common_i = 0;
|
177
|
+
var common_j = 0;
|
178
|
+
|
179
|
+
var first_key_inspected = false;
|
180
|
+
|
181
|
+
while (true)
|
182
|
+
{
|
183
|
+
var k = i + ((j - i) >> 1);
|
184
|
+
var diff = 0;
|
185
|
+
var common = common_i < common_j ? common_i : common_j;
|
186
|
+
var w = v[k];
|
187
|
+
var i2;
|
188
|
+
for (i2 = w[0].length - 1 - common; i2 >= 0; i2--)
|
189
|
+
{
|
190
|
+
if (c - common == lb)
|
191
|
+
{
|
192
|
+
diff = -1;
|
193
|
+
break;
|
194
|
+
}
|
195
|
+
diff = this.current.charCodeAt(c - 1 - common) - w[0].charCodeAt(i2);
|
196
|
+
if (diff != 0) break;
|
197
|
+
common++;
|
198
|
+
}
|
199
|
+
if (diff < 0)
|
200
|
+
{
|
201
|
+
j = k;
|
202
|
+
common_j = common;
|
203
|
+
}
|
204
|
+
else
|
205
|
+
{
|
206
|
+
i = k;
|
207
|
+
common_i = common;
|
208
|
+
}
|
209
|
+
if (j - i <= 1)
|
210
|
+
{
|
211
|
+
if (i > 0) break;
|
212
|
+
if (j == i) break;
|
213
|
+
if (first_key_inspected) break;
|
214
|
+
first_key_inspected = true;
|
215
|
+
}
|
216
|
+
}
|
217
|
+
do {
|
218
|
+
var w = v[i];
|
219
|
+
if (common_i >= w[0].length)
|
220
|
+
{
|
221
|
+
this.cursor = c - w[0].length;
|
222
|
+
if (w.length < 4) return w[2];
|
223
|
+
var res = w[3](this);
|
224
|
+
this.cursor = c - w[0].length;
|
225
|
+
if (res) return w[2];
|
226
|
+
}
|
227
|
+
i = w[1];
|
228
|
+
} while (i >= 0);
|
229
|
+
return 0;
|
230
|
+
};
|
231
|
+
|
232
|
+
/* to replace chars between c_bra and c_ket in this.current by the
|
233
|
+
* chars in s.
|
234
|
+
*/
|
235
|
+
this.replace_s = function(c_bra, c_ket, s)
|
236
|
+
{
|
237
|
+
var adjustment = s.length - (c_ket - c_bra);
|
238
|
+
this.current = this.current.slice(0, c_bra) + s + this.current.slice(c_ket);
|
239
|
+
this.limit += adjustment;
|
240
|
+
if (this.cursor >= c_ket) this.cursor += adjustment;
|
241
|
+
else if (this.cursor > c_bra) this.cursor = c_bra;
|
242
|
+
return adjustment;
|
243
|
+
};
|
244
|
+
|
245
|
+
this.slice_check = function()
|
246
|
+
{
|
247
|
+
if (this.bra < 0 ||
|
248
|
+
this.bra > this.ket ||
|
249
|
+
this.ket > this.limit ||
|
250
|
+
this.limit > this.current.length)
|
251
|
+
{
|
252
|
+
return false;
|
253
|
+
}
|
254
|
+
return true;
|
255
|
+
};
|
256
|
+
|
257
|
+
this.slice_from = function(s)
|
258
|
+
{
|
259
|
+
var result = false;
|
260
|
+
if (this.slice_check())
|
261
|
+
{
|
262
|
+
this.replace_s(this.bra, this.ket, s);
|
263
|
+
result = true;
|
264
|
+
}
|
265
|
+
return result;
|
266
|
+
};
|
267
|
+
|
268
|
+
this.slice_del = function()
|
269
|
+
{
|
270
|
+
return this.slice_from("");
|
271
|
+
};
|
272
|
+
|
273
|
+
this.insert = function(c_bra, c_ket, s)
|
274
|
+
{
|
275
|
+
var adjustment = this.replace_s(c_bra, c_ket, s);
|
276
|
+
if (c_bra <= this.bra) this.bra += adjustment;
|
277
|
+
if (c_bra <= this.ket) this.ket += adjustment;
|
278
|
+
};
|
279
|
+
|
280
|
+
this.slice_to = function()
|
281
|
+
{
|
282
|
+
var result = '';
|
283
|
+
if (this.slice_check())
|
284
|
+
{
|
285
|
+
result = this.current.slice(this.bra, this.ket);
|
286
|
+
}
|
287
|
+
return result;
|
288
|
+
};
|
289
|
+
|
290
|
+
this.assign_to = function()
|
291
|
+
{
|
292
|
+
return this.current.slice(0, this.limit);
|
293
|
+
};
|
294
|
+
};
|
@@ -0,0 +1,106 @@
|
|
1
|
+
const stemmer = require('base-stemmer.js');
|
2
|
+
|
3
|
+
const fs = require('fs');
|
4
|
+
const readline = require('readline');
|
5
|
+
|
6
|
+
/** Print the command-line help text, one console.log call per entry. */
function usage() {
    var help = [
        "usage: stemwords.js [-l <language>] -i <input file> -o <output file> [-c <character encoding>] [-h]\n",
        "The input file consists of a list of words to be stemmed, one per",
        "line. Words should be in lower case.\n",
        "If -c is given, the argument is the character encoding of the input",
        "and output files. If it is omitted, the UTF-8 encoding is used.\n",
        "The output file consists of the stemmed words, one per line.\n",
        "-h displays this help"
    ];
    for (var n = 0; n < help.length; n++) {
        console.log(help[n]);
    }
}
|
15
|
+
|
16
|
+
// Command-line handling.  process.argv is consumed from the front (the
// leading entries match no option and are simply skipped); each option
// that takes a value consumes the following entry.
if (process.argv.length < 5)
{
    usage();
}
else
{
    var input = '';
    var output = '';
    var encoding = 'utf8';
    var language = 'English';
    var show_help = false;
    while (process.argv.length > 0)
    {
        var arg = process.argv.shift();
        if (arg === "-h")
        {
            show_help = true;
            // Discard the remaining arguments so the loop ends.
            process.argv.length = 0;
        }
        else if (arg === "-l" || arg === "-i" || arg === "-o" || arg === "-c")
        {
            if (process.argv.length == 0)
            {
                // Option given without its value.
                show_help = true;
            }
            else
            {
                var value = process.argv.shift();
                if (arg === "-l") language = value;
                else if (arg === "-i") input = value;
                else if (arg === "-o") output = value;
                else encoding = value;
            }
        }
    }
    if (!show_help && input != '' && output != '')
    {
        stemming(language, input, output, encoding);
    }
    else
    {
        usage();
    }
}
|
79
|
+
|
80
|
+
/**
 * Stem every line of a file and write the results, one per line.
 *
 * @param {string} lang      stemming language name, passed to create()
 * @param {string} input     path of the file to read
 * @param {string} output    path of the file to write
 * @param {string} encoding  character encoding used for both files
 */
function stemming (lang, input, output, encoding) {
    const lines = readline.createInterface({
        input: fs.createReadStream(input, encoding),
        terminal: false
    });
    var out = fs.createWriteStream(output, encoding);
    // Named so that it does not shadow the module-level `stemmer` binding.
    var wordStemmer = create(lang);
    lines.on('line', (original) => {
        out.write(wordStemmer.stemWord(original) + '\n');
    });
    // End the output stream once the input is exhausted so the file is
    // flushed and its descriptor released instead of being left open.
    lines.on('close', () => {
        out.end();
    });
}
|
92
|
+
|
93
|
+
/**
 * Build a stemmer object for the named language.
 *
 * The lower-cased name must contain only word characters and must not be
 * 'base'.  The module "<name>-stemmer.js" is loaded and an instance of
 * "<Name>Stemmer" is returned.  On any failure the name is reported as
 * unknown, the help text is printed and the process exits with status 1.
 */
function create (name) {
    var lc_name = name.toLowerCase();
    var acceptable = !/\W/.test(lc_name) && lc_name != 'base';
    if (acceptable) {
        // Constructor name: first letter upper-cased, e.g. "EnglishStemmer".
        var algo = lc_name.charAt(0).toUpperCase() + lc_name.slice(1);
        try {
            require(lc_name + '-stemmer.js');
            return Function('return new ' + algo + 'Stemmer()')();
        } catch (error) {
            // Deliberately ignored: any failure to load or construct the
            // stemmer is reported below as an unknown language.
        }
    }
    console.log('Unknown stemming language: ' + name + '\n');
    usage();
    process.exit(1);
}
|
@@ -0,0 +1,96 @@
|
|
1
|
+
|
2
|
+
#include <stdlib.h>
|
3
|
+
#include <string.h>
|
4
|
+
#include "../include/libstemmer.h"
|
5
|
+
#include "../runtime/api.h"
|
6
|
+
#include "@MODULES_H@"
|
7
|
+
|
8
|
+
/* A stemmer instance: the three entry points copied from the selected
 * module, plus the environment they operate on. */
struct sb_stemmer {
    struct SN_env * (*create)(void);    /* allocates an environment */
    void (*close)(struct SN_env *);     /* releases an environment */
    int (*stem)(struct SN_env *);       /* stems the environment's word */

    struct SN_env * env;                /* environment obtained from create() */
};
|
15
|
+
|
16
|
+
extern const char **
|
17
|
+
sb_stemmer_list(void)
|
18
|
+
{
|
19
|
+
return algorithm_names;
|
20
|
+
}
|
21
|
+
|
22
|
+
static stemmer_encoding_t
|
23
|
+
sb_getenc(const char * charenc)
|
24
|
+
{
|
25
|
+
const struct stemmer_encoding * encoding;
|
26
|
+
if (charenc == NULL) return ENC_UTF_8;
|
27
|
+
for (encoding = encodings; encoding->name != 0; encoding++) {
|
28
|
+
if (strcmp(encoding->name, charenc) == 0) break;
|
29
|
+
}
|
30
|
+
if (encoding->name == NULL) return ENC_UNKNOWN;
|
31
|
+
return encoding->enc;
|
32
|
+
}
|
33
|
+
|
34
|
+
extern struct sb_stemmer *
|
35
|
+
sb_stemmer_new(const char * algorithm, const char * charenc)
|
36
|
+
{
|
37
|
+
stemmer_encoding_t enc;
|
38
|
+
const struct stemmer_modules * module;
|
39
|
+
struct sb_stemmer * stemmer;
|
40
|
+
|
41
|
+
enc = sb_getenc(charenc);
|
42
|
+
if (enc == ENC_UNKNOWN) return NULL;
|
43
|
+
|
44
|
+
for (module = modules; module->name != 0; module++) {
|
45
|
+
if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break;
|
46
|
+
}
|
47
|
+
if (module->name == NULL) return NULL;
|
48
|
+
|
49
|
+
stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
|
50
|
+
if (stemmer == NULL) return NULL;
|
51
|
+
|
52
|
+
stemmer->create = module->create;
|
53
|
+
stemmer->close = module->close;
|
54
|
+
stemmer->stem = module->stem;
|
55
|
+
|
56
|
+
stemmer->env = stemmer->create();
|
57
|
+
if (stemmer->env == NULL)
|
58
|
+
{
|
59
|
+
sb_stemmer_delete(stemmer);
|
60
|
+
return NULL;
|
61
|
+
}
|
62
|
+
|
63
|
+
return stemmer;
|
64
|
+
}
|
65
|
+
|
66
|
+
void
|
67
|
+
sb_stemmer_delete(struct sb_stemmer * stemmer)
|
68
|
+
{
|
69
|
+
if (stemmer == 0) return;
|
70
|
+
if (stemmer->close) {
|
71
|
+
stemmer->close(stemmer->env);
|
72
|
+
stemmer->close = 0;
|
73
|
+
}
|
74
|
+
free(stemmer);
|
75
|
+
}
|
76
|
+
|
77
|
+
const sb_symbol *
|
78
|
+
sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size)
|
79
|
+
{
|
80
|
+
int ret;
|
81
|
+
if (SN_set_current(stemmer->env, size, (const symbol *)(word)))
|
82
|
+
{
|
83
|
+
stemmer->env->l = 0;
|
84
|
+
return NULL;
|
85
|
+
}
|
86
|
+
ret = stemmer->stem(stemmer->env);
|
87
|
+
if (ret < 0) return NULL;
|
88
|
+
stemmer->env->p[stemmer->env->l] = 0;
|
89
|
+
return (const sb_symbol *)(stemmer->env->p);
|
90
|
+
}
|
91
|
+
|
92
|
+
int
|
93
|
+
sb_stemmer_length(struct sb_stemmer * stemmer)
|
94
|
+
{
|
95
|
+
return stemmer->env->l;
|
96
|
+
}
|
@@ -0,0 +1,90 @@
|
|
1
|
+
#!/usr/bin/env perl
|
2
|
+
use strict;
|
3
|
+
use 5.006;
|
4
|
+
use warnings;
|
5
|
+
|
6
|
+
# Name this script was invoked as, for the usage message.
my $progname = $0;

# Exactly two arguments are required.
unless (@ARGV == 2) {
    print "Usage: $progname <outfile> <modules description file>\n";
    exit 1;
}

# Take both arguments off @ARGV: output path, then description file.
my ($outname, $descfile) = splice(@ARGV, 0, 2);

my %aliases = ();           # alias name => algorithm name
my %algorithms = ();        # algorithm name => 1
my %algorithm_encs = ();    # algorithm name => { encoding => 1 }

my %encs = ();              # encoding => 1
|
21
|
+
|
22
|
+
# addalgenc($alg, $enc)
#
# Record that algorithm $alg is available in encoding $enc, and that
# encoding $enc is in use.
sub addalgenc($$) {
    my ($alg, $enc) = @_;

    # The inner hash for $alg is created on first use.
    $algorithm_encs{$alg}{$enc} = 1;

    $encs{$enc} = 1;
}
|
36
|
+
|
37
|
+
# readinput()
#
# Read the modules description file named by $descfile.  Blank lines and
# lines whose first non-blank character is '#' are skipped.  Every other
# line is expected to hold three whitespace-separated fields:
#
#   <algorithm> <comma-separated encodings> <comma-separated aliases>
#
# Fills %algorithms and %aliases, and records each algorithm/encoding pair
# through addalgenc().  Dies if the file cannot be opened.
sub readinput()
{
    open(my $desc, '<', $descfile)
        or die "Can't open modules description file `$descfile': $!\n";

    while (my $line = <$desc>)
    {
        next if $line =~ m/^\s*#/;
        next if $line =~ m/^\s*$/;

        # split ' ' ignores leading whitespace, so an indented line does
        # not produce an empty algorithm name.
        my ($alg, $encstr, $aliases) = split(' ', $line);

        $algorithms{$alg} = 1;

        unless (defined $encstr && defined $aliases) {
            warn "$descfile:$.: expected <algorithm> <encodings> <aliases>\n";
            next;
        }

        foreach my $alias (split(/,/, $aliases)) {
            $aliases{$alias} = $alg;
        }
        # Encodings are registered independently of how many aliases exist.
        foreach my $enc (split(/,/, $encstr)) {
            addalgenc($alg, $enc);
        }
    }

    close($desc);
}
|
58
|
+
|
59
|
+
# printoutput()
#
# Write the file named by $outname: a comment header, then a
# "libstemmer_algorithms" list of every algorithm in sorted order, then one
# "<ENC>_algorithms" list for each of ISO_8859_1, ISO_8859_2 and KOI8_R
# holding the algorithms recorded for that encoding in %algorithm_encs.
# Dies if the file cannot be opened or closed.
sub printoutput()
{
    # Three-argument open keeps the file name out of the mode string.
    open(my $out, '>', $outname)
        or die "Can't open output file `$outname': $!\n";

    print $out <<EOS;
# $outname: Lists of stemming modules.
#
# This file is generated by mkalgorithms.pl from a list of module names.
# Do not edit manually.
EOS

    my @algorithms = sort keys(%algorithms);

    # Each entry goes on its own line, after a backslash ending the
    # previous one.
    print $out "\nlibstemmer_algorithms =";
    foreach my $lang (@algorithms) {
        print $out "\\\n ", $lang;
    }
    print $out "\n";

    for my $enc (qw(ISO_8859_1 ISO_8859_2 KOI8_R)) {
        print $out "\n${enc}_algorithms =";
        foreach my $lang (@algorithms) {
            print $out "\\\n ", $lang if exists $algorithm_encs{$lang}->{$enc};
        }
        print $out "\n";
    }

    # Buffered write errors only surface at close, so check it.
    close($out) or die "Can't close output file `$outname': $!\n";
}
|
88
|
+
|
89
|
+
# Parse the description file, then generate the output file.
readinput();
printoutput();
|