mittens 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE.txt +30 -0
  5. data/README.md +62 -0
  6. data/Rakefile +21 -0
  7. data/ext/mittens/ext.c +96 -0
  8. data/ext/mittens/extconf.rb +12 -0
  9. data/lib/mittens/version.rb +3 -0
  10. data/lib/mittens.rb +7 -0
  11. data/mittens.gemspec +22 -0
  12. data/vendor/snowball/.gitignore +26 -0
  13. data/vendor/snowball/.travis.yml +112 -0
  14. data/vendor/snowball/AUTHORS +27 -0
  15. data/vendor/snowball/CONTRIBUTING.rst +216 -0
  16. data/vendor/snowball/COPYING +29 -0
  17. data/vendor/snowball/GNUmakefile +742 -0
  18. data/vendor/snowball/NEWS +754 -0
  19. data/vendor/snowball/README.rst +37 -0
  20. data/vendor/snowball/ada/README.md +74 -0
  21. data/vendor/snowball/ada/generate/generate.adb +83 -0
  22. data/vendor/snowball/ada/generate.gpr +21 -0
  23. data/vendor/snowball/ada/src/stemmer.adb +620 -0
  24. data/vendor/snowball/ada/src/stemmer.ads +219 -0
  25. data/vendor/snowball/ada/src/stemwords.adb +70 -0
  26. data/vendor/snowball/ada/stemmer_config.gpr +83 -0
  27. data/vendor/snowball/ada/stemwords.gpr +21 -0
  28. data/vendor/snowball/algorithms/arabic.sbl +558 -0
  29. data/vendor/snowball/algorithms/armenian.sbl +301 -0
  30. data/vendor/snowball/algorithms/basque.sbl +149 -0
  31. data/vendor/snowball/algorithms/catalan.sbl +202 -0
  32. data/vendor/snowball/algorithms/danish.sbl +93 -0
  33. data/vendor/snowball/algorithms/dutch.sbl +164 -0
  34. data/vendor/snowball/algorithms/english.sbl +229 -0
  35. data/vendor/snowball/algorithms/finnish.sbl +197 -0
  36. data/vendor/snowball/algorithms/french.sbl +254 -0
  37. data/vendor/snowball/algorithms/german.sbl +139 -0
  38. data/vendor/snowball/algorithms/german2.sbl +145 -0
  39. data/vendor/snowball/algorithms/greek.sbl +701 -0
  40. data/vendor/snowball/algorithms/hindi.sbl +323 -0
  41. data/vendor/snowball/algorithms/hungarian.sbl +241 -0
  42. data/vendor/snowball/algorithms/indonesian.sbl +192 -0
  43. data/vendor/snowball/algorithms/irish.sbl +149 -0
  44. data/vendor/snowball/algorithms/italian.sbl +202 -0
  45. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
  46. data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
  47. data/vendor/snowball/algorithms/lovins.sbl +208 -0
  48. data/vendor/snowball/algorithms/nepali.sbl +92 -0
  49. data/vendor/snowball/algorithms/norwegian.sbl +80 -0
  50. data/vendor/snowball/algorithms/porter.sbl +139 -0
  51. data/vendor/snowball/algorithms/portuguese.sbl +218 -0
  52. data/vendor/snowball/algorithms/romanian.sbl +236 -0
  53. data/vendor/snowball/algorithms/russian.sbl +221 -0
  54. data/vendor/snowball/algorithms/serbian.sbl +2379 -0
  55. data/vendor/snowball/algorithms/spanish.sbl +230 -0
  56. data/vendor/snowball/algorithms/swedish.sbl +72 -0
  57. data/vendor/snowball/algorithms/tamil.sbl +405 -0
  58. data/vendor/snowball/algorithms/turkish.sbl +470 -0
  59. data/vendor/snowball/algorithms/yiddish.sbl +460 -0
  60. data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
  61. data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
  62. data/vendor/snowball/charsets/cp850.sbl +130 -0
  63. data/vendor/snowball/compiler/analyser.c +1547 -0
  64. data/vendor/snowball/compiler/driver.c +615 -0
  65. data/vendor/snowball/compiler/generator.c +1748 -0
  66. data/vendor/snowball/compiler/generator_ada.c +1702 -0
  67. data/vendor/snowball/compiler/generator_csharp.c +1322 -0
  68. data/vendor/snowball/compiler/generator_go.c +1278 -0
  69. data/vendor/snowball/compiler/generator_java.c +1313 -0
  70. data/vendor/snowball/compiler/generator_js.c +1316 -0
  71. data/vendor/snowball/compiler/generator_pascal.c +1387 -0
  72. data/vendor/snowball/compiler/generator_python.c +1337 -0
  73. data/vendor/snowball/compiler/generator_rust.c +1295 -0
  74. data/vendor/snowball/compiler/header.h +418 -0
  75. data/vendor/snowball/compiler/space.c +286 -0
  76. data/vendor/snowball/compiler/syswords.h +86 -0
  77. data/vendor/snowball/compiler/syswords2.h +13 -0
  78. data/vendor/snowball/compiler/tokeniser.c +567 -0
  79. data/vendor/snowball/csharp/.gitignore +8 -0
  80. data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
  81. data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
  82. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
  83. data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
  84. data/vendor/snowball/csharp/Stemwords/App.config +6 -0
  85. data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
  86. data/vendor/snowball/doc/TODO +12 -0
  87. data/vendor/snowball/doc/libstemmer_c_README +148 -0
  88. data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
  89. data/vendor/snowball/doc/libstemmer_java_README +67 -0
  90. data/vendor/snowball/doc/libstemmer_js_README +48 -0
  91. data/vendor/snowball/doc/libstemmer_python_README +113 -0
  92. data/vendor/snowball/examples/stemwords.c +204 -0
  93. data/vendor/snowball/go/README.md +55 -0
  94. data/vendor/snowball/go/among.go +16 -0
  95. data/vendor/snowball/go/env.go +403 -0
  96. data/vendor/snowball/go/stemwords/generate.go +68 -0
  97. data/vendor/snowball/go/stemwords/main.go +68 -0
  98. data/vendor/snowball/go/util.go +34 -0
  99. data/vendor/snowball/iconv.py +50 -0
  100. data/vendor/snowball/include/libstemmer.h +78 -0
  101. data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
  102. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
  103. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
  104. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
  105. data/vendor/snowball/javascript/base-stemmer.js +294 -0
  106. data/vendor/snowball/javascript/stemwords.js +106 -0
  107. data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
  108. data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
  109. data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
  110. data/vendor/snowball/libstemmer/modules.txt +63 -0
  111. data/vendor/snowball/libstemmer/test.c +34 -0
  112. data/vendor/snowball/pascal/.gitignore +4 -0
  113. data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
  114. data/vendor/snowball/pascal/generate.pl +23 -0
  115. data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
  116. data/vendor/snowball/python/MANIFEST.in +7 -0
  117. data/vendor/snowball/python/create_init.py +54 -0
  118. data/vendor/snowball/python/setup.cfg +6 -0
  119. data/vendor/snowball/python/setup.py +81 -0
  120. data/vendor/snowball/python/snowballstemmer/among.py +13 -0
  121. data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
  122. data/vendor/snowball/python/stemwords.py +101 -0
  123. data/vendor/snowball/python/testapp.py +28 -0
  124. data/vendor/snowball/runtime/api.c +58 -0
  125. data/vendor/snowball/runtime/api.h +32 -0
  126. data/vendor/snowball/runtime/header.h +61 -0
  127. data/vendor/snowball/runtime/utilities.c +513 -0
  128. data/vendor/snowball/rust/Cargo.toml +7 -0
  129. data/vendor/snowball/rust/build.rs +55 -0
  130. data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
  131. data/vendor/snowball/rust/src/main.rs +102 -0
  132. data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
  133. data/vendor/snowball/rust/src/snowball/among.rs +6 -0
  134. data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
  135. data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
  136. data/vendor/snowball/tests/stemtest.c +95 -0
  137. metadata +178 -0
@@ -0,0 +1,75 @@
1
+
2
+ package org.tartarus.snowball;
3
+
4
+ import java.lang.reflect.Method;
5
+ import java.io.BufferedReader;
6
+ import java.io.BufferedWriter;
7
+ import java.io.FileInputStream;
8
+ import java.io.FileOutputStream;
9
+ import java.io.InputStream;
10
+ import java.io.InputStreamReader;
11
+ import java.io.OutputStream;
12
+ import java.io.OutputStreamWriter;
13
+ import java.io.Reader;
14
+ import java.io.Writer;
15
+ import java.nio.charset.StandardCharsets;
16
+
17
+ public class TestApp {
18
+ private static void usage()
19
+ {
20
+ System.err.println("Usage: TestApp <algorithm> [<input file>] [-o <output file>]");
21
+ }
22
+
23
+ public static void main(String [] args) throws Throwable {
24
+ if (args.length < 2) {
25
+ usage();
26
+ return;
27
+ }
28
+
29
+ Class stemClass = Class.forName("org.tartarus.snowball.ext." +
30
+ args[0] + "Stemmer");
31
+ SnowballStemmer stemmer = (SnowballStemmer) stemClass.newInstance();
32
+
33
+ int arg = 1;
34
+
35
+ InputStream instream;
36
+ if (args.length > arg && !args[arg].equals("-o")) {
37
+ instream = new FileInputStream(args[arg++]);
38
+ } else {
39
+ instream = System.in;
40
+ }
41
+
42
+ OutputStream outstream;
43
+ if (args.length > arg) {
44
+ if (args.length != arg + 2 || !args[arg].equals("-o")) {
45
+ usage();
46
+ return;
47
+ }
48
+ outstream = new FileOutputStream(args[arg + 1]);
49
+ } else {
50
+ outstream = System.out;
51
+ }
52
+
53
+ Reader reader = new InputStreamReader(instream, StandardCharsets.UTF_8);
54
+ reader = new BufferedReader(reader);
55
+
56
+ Writer output = new OutputStreamWriter(outstream, StandardCharsets.UTF_8);
57
+ output = new BufferedWriter(output);
58
+
59
+ StringBuffer input = new StringBuffer();
60
+ int character;
61
+ while ((character = reader.read()) != -1) {
62
+ char ch = (char) character;
63
+ if (Character.isWhitespace(ch)) {
64
+ stemmer.setCurrent(input.toString());
65
+ stemmer.stem();
66
+ output.write(stemmer.getCurrent());
67
+ output.write('\n');
68
+ input.delete(0, input.length());
69
+ } else {
70
+ input.append(ch < 127 ? Character.toLowerCase(ch) : ch);
71
+ }
72
+ }
73
+ output.flush();
74
+ }
75
+ }
@@ -0,0 +1,294 @@
1
+ /**@constructor*/
2
+ BaseStemmer = function() {
3
+ this.setCurrent = function(value) {
4
+ this.current = value;
5
+ this.cursor = 0;
6
+ this.limit = this.current.length;
7
+ this.limit_backward = 0;
8
+ this.bra = this.cursor;
9
+ this.ket = this.limit;
10
+ };
11
+
12
+ this.getCurrent = function() {
13
+ return this.current;
14
+ };
15
+
16
+ this.copy_from = function(other) {
17
+ this.current = other.current;
18
+ this.cursor = other.cursor;
19
+ this.limit = other.limit;
20
+ this.limit_backward = other.limit_backward;
21
+ this.bra = other.bra;
22
+ this.ket = other.ket;
23
+ };
24
+
25
+ this.in_grouping = function(s, min, max) {
26
+ if (this.cursor >= this.limit) return false;
27
+ var ch = this.current.charCodeAt(this.cursor);
28
+ if (ch > max || ch < min) return false;
29
+ ch -= min;
30
+ if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) == 0) return false;
31
+ this.cursor++;
32
+ return true;
33
+ };
34
+
35
+ this.in_grouping_b = function(s, min, max) {
36
+ if (this.cursor <= this.limit_backward) return false;
37
+ var ch = this.current.charCodeAt(this.cursor - 1);
38
+ if (ch > max || ch < min) return false;
39
+ ch -= min;
40
+ if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) == 0) return false;
41
+ this.cursor--;
42
+ return true;
43
+ };
44
+
45
+ this.out_grouping = function(s, min, max) {
46
+ if (this.cursor >= this.limit) return false;
47
+ var ch = this.current.charCodeAt(this.cursor);
48
+ if (ch > max || ch < min) {
49
+ this.cursor++;
50
+ return true;
51
+ }
52
+ ch -= min;
53
+ if ((s[ch >>> 3] & (0X1 << (ch & 0x7))) == 0) {
54
+ this.cursor++;
55
+ return true;
56
+ }
57
+ return false;
58
+ };
59
+
60
+ this.out_grouping_b = function(s, min, max) {
61
+ if (this.cursor <= this.limit_backward) return false;
62
+ var ch = this.current.charCodeAt(this.cursor - 1);
63
+ if (ch > max || ch < min) {
64
+ this.cursor--;
65
+ return true;
66
+ }
67
+ ch -= min;
68
+ if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) == 0) {
69
+ this.cursor--;
70
+ return true;
71
+ }
72
+ return false;
73
+ };
74
+
75
+ this.eq_s = function(s)
76
+ {
77
+ if (this.limit - this.cursor < s.length) return false;
78
+ if (this.current.slice(this.cursor, this.cursor + s.length) != s)
79
+ {
80
+ return false;
81
+ }
82
+ this.cursor += s.length;
83
+ return true;
84
+ };
85
+
86
+ this.eq_s_b = function(s)
87
+ {
88
+ if (this.cursor - this.limit_backward < s.length) return false;
89
+ if (this.current.slice(this.cursor - s.length, this.cursor) != s)
90
+ {
91
+ return false;
92
+ }
93
+ this.cursor -= s.length;
94
+ return true;
95
+ };
96
+
97
+ /** @return {number} */ this.find_among = function(v)
98
+ {
99
+ var i = 0;
100
+ var j = v.length;
101
+
102
+ var c = this.cursor;
103
+ var l = this.limit;
104
+
105
+ var common_i = 0;
106
+ var common_j = 0;
107
+
108
+ var first_key_inspected = false;
109
+
110
+ while (true)
111
+ {
112
+ var k = i + ((j - i) >>> 1);
113
+ var diff = 0;
114
+ var common = common_i < common_j ? common_i : common_j; // smaller
115
+ // w[0]: string, w[1]: substring_i, w[2]: result, w[3]: function (optional)
116
+ var w = v[k];
117
+ var i2;
118
+ for (i2 = common; i2 < w[0].length; i2++)
119
+ {
120
+ if (c + common == l)
121
+ {
122
+ diff = -1;
123
+ break;
124
+ }
125
+ diff = this.current.charCodeAt(c + common) - w[0].charCodeAt(i2);
126
+ if (diff != 0) break;
127
+ common++;
128
+ }
129
+ if (diff < 0)
130
+ {
131
+ j = k;
132
+ common_j = common;
133
+ }
134
+ else
135
+ {
136
+ i = k;
137
+ common_i = common;
138
+ }
139
+ if (j - i <= 1)
140
+ {
141
+ if (i > 0) break; // v->s has been inspected
142
+ if (j == i) break; // only one item in v
143
+
144
+ // - but now we need to go round once more to get
145
+ // v->s inspected. This looks messy, but is actually
146
+ // the optimal approach.
147
+
148
+ if (first_key_inspected) break;
149
+ first_key_inspected = true;
150
+ }
151
+ }
152
+ do {
153
+ var w = v[i];
154
+ if (common_i >= w[0].length)
155
+ {
156
+ this.cursor = c + w[0].length;
157
+ if (w.length < 4) return w[2];
158
+ var res = w[3](this);
159
+ this.cursor = c + w[0].length;
160
+ if (res) return w[2];
161
+ }
162
+ i = w[1];
163
+ } while (i >= 0);
164
+ return 0;
165
+ };
166
+
167
+ // find_among_b is for backwards processing. Same comments apply
168
+ this.find_among_b = function(v)
169
+ {
170
+ var i = 0;
171
+ var j = v.length
172
+
173
+ var c = this.cursor;
174
+ var lb = this.limit_backward;
175
+
176
+ var common_i = 0;
177
+ var common_j = 0;
178
+
179
+ var first_key_inspected = false;
180
+
181
+ while (true)
182
+ {
183
+ var k = i + ((j - i) >> 1);
184
+ var diff = 0;
185
+ var common = common_i < common_j ? common_i : common_j;
186
+ var w = v[k];
187
+ var i2;
188
+ for (i2 = w[0].length - 1 - common; i2 >= 0; i2--)
189
+ {
190
+ if (c - common == lb)
191
+ {
192
+ diff = -1;
193
+ break;
194
+ }
195
+ diff = this.current.charCodeAt(c - 1 - common) - w[0].charCodeAt(i2);
196
+ if (diff != 0) break;
197
+ common++;
198
+ }
199
+ if (diff < 0)
200
+ {
201
+ j = k;
202
+ common_j = common;
203
+ }
204
+ else
205
+ {
206
+ i = k;
207
+ common_i = common;
208
+ }
209
+ if (j - i <= 1)
210
+ {
211
+ if (i > 0) break;
212
+ if (j == i) break;
213
+ if (first_key_inspected) break;
214
+ first_key_inspected = true;
215
+ }
216
+ }
217
+ do {
218
+ var w = v[i];
219
+ if (common_i >= w[0].length)
220
+ {
221
+ this.cursor = c - w[0].length;
222
+ if (w.length < 4) return w[2];
223
+ var res = w[3](this);
224
+ this.cursor = c - w[0].length;
225
+ if (res) return w[2];
226
+ }
227
+ i = w[1];
228
+ } while (i >= 0);
229
+ return 0;
230
+ };
231
+
232
+ /* to replace chars between c_bra and c_ket in this.current by the
233
+ * chars in s.
234
+ */
235
+ this.replace_s = function(c_bra, c_ket, s)
236
+ {
237
+ var adjustment = s.length - (c_ket - c_bra);
238
+ this.current = this.current.slice(0, c_bra) + s + this.current.slice(c_ket);
239
+ this.limit += adjustment;
240
+ if (this.cursor >= c_ket) this.cursor += adjustment;
241
+ else if (this.cursor > c_bra) this.cursor = c_bra;
242
+ return adjustment;
243
+ };
244
+
245
+ this.slice_check = function()
246
+ {
247
+ if (this.bra < 0 ||
248
+ this.bra > this.ket ||
249
+ this.ket > this.limit ||
250
+ this.limit > this.current.length)
251
+ {
252
+ return false;
253
+ }
254
+ return true;
255
+ };
256
+
257
+ this.slice_from = function(s)
258
+ {
259
+ var result = false;
260
+ if (this.slice_check())
261
+ {
262
+ this.replace_s(this.bra, this.ket, s);
263
+ result = true;
264
+ }
265
+ return result;
266
+ };
267
+
268
+ this.slice_del = function()
269
+ {
270
+ return this.slice_from("");
271
+ };
272
+
273
+ this.insert = function(c_bra, c_ket, s)
274
+ {
275
+ var adjustment = this.replace_s(c_bra, c_ket, s);
276
+ if (c_bra <= this.bra) this.bra += adjustment;
277
+ if (c_bra <= this.ket) this.ket += adjustment;
278
+ };
279
+
280
+ this.slice_to = function()
281
+ {
282
+ var result = '';
283
+ if (this.slice_check())
284
+ {
285
+ result = this.current.slice(this.bra, this.ket);
286
+ }
287
+ return result;
288
+ };
289
+
290
+ this.assign_to = function()
291
+ {
292
+ return this.current.slice(0, this.limit);
293
+ };
294
+ };
@@ -0,0 +1,106 @@
1
+ const stemmer = require('base-stemmer.js');
2
+
3
+ const fs = require('fs');
4
+ const readline = require('readline');
5
+
6
// Print the command-line help text to standard output.
function usage() {
    var help = [
        "usage: stemwords.js [-l <language>] -i <input file> -o <output file> [-c <character encoding>] [-h]\n",
        "The input file consists of a list of words to be stemmed, one per",
        "line. Words should be in lower case.\n",
        "If -c is given, the argument is the character encoding of the input",
        "and output files. If it is omitted, the UTF-8 encoding is used.\n",
        "The output file consists of the stemmed words, one per line.\n",
        "-h displays this help"
    ];
    help.forEach(function (text) {
        console.log(text);
    });
}
15
+
16
// Command-line entry point. process.argv is consumed destructively, one
// element at a time; elements that are not recognised flags (including the
// interpreter and script paths at the front) are ignored.
if (process.argv.length >= 5)
{
    // Current value of every flag that takes an argument, keyed by flag.
    var settings = {
        '-l': 'English',
        '-i': '',
        '-o': '',
        '-c': 'utf8'
    };
    var show_help = false;
    while (process.argv.length > 0)
    {
        var arg = process.argv.shift();
        if (arg === "-h")
        {
            // Help requested: stop looking at the remaining arguments.
            show_help = true;
            process.argv.length = 0;
        }
        else if (Object.prototype.hasOwnProperty.call(settings, arg))
        {
            if (process.argv.length == 0)
            {
                // Flag given without its value.
                show_help = true;
            }
            else
            {
                settings[arg] = process.argv.shift();
            }
        }
    }
    if (show_help || settings['-i'] == '' || settings['-o'] == '')
    {
        usage();
    }
    else
    {
        stemming(settings['-l'], settings['-i'], settings['-o'], settings['-c']);
    }
}
else
{
    usage();
}
79
+
80
/**
 * Stem every line of a file and write the results to another file.
 *
 * @param {string} lang      stemming language name, passed to create()
 * @param {string} input     path of the file holding one word per line
 * @param {string} output    path of the file that receives one stem per line
 * @param {string} encoding  character encoding used for both files
 */
function stemming (lang, input, output, encoding) {
    // Resolve the stemmer before touching any file: create() terminates the
    // process for an unknown language, and the output file must not be
    // created or truncated in that case.
    var engine = create(lang);
    const lines = readline.createInterface({
        input: fs.createReadStream(input, encoding),
        terminal: false
    });
    var out = fs.createWriteStream(output, encoding);
    lines.on('line', (original) => {
        out.write(engine.stemWord(original) + '\n');
    });
    // End the output stream once the input is exhausted so the file is
    // flushed and its descriptor released.
    lines.on('close', () => {
        out.end();
    });
}
92
+
93
// Build a stemmer object for the named language.
//
// The name is lower-cased and must consist of word characters only and must
// not be 'base'. The module '<name>-stemmer.js' is then loaded and an
// instance of '<Name>Stemmer' is constructed. Any failure while loading or
// constructing falls through to the error path, which prints a message and
// the usage text and exits with status 1.
function create (name) {
    var lc_name = name.toLowerCase();
    var acceptable = !lc_name.match('\\W') && lc_name != 'base';
    if (acceptable) {
        var algo = lc_name.charAt(0).toUpperCase() + lc_name.slice(1);
        try {
            require(lc_name + '-stemmer.js');
            // NOTE(review): this evaluates generated code; it relies on the
            // \W check above to keep `algo` a plain identifier.
            return Function('return new ' + algo + 'Stemmer()')();
        } catch (error) {
            // Deliberately ignored: reported as an unknown language below.
        }
    }
    console.log('Unknown stemming language: ' + name + '\n');
    usage();
    process.exit(1);
}
@@ -0,0 +1,96 @@
1
+
2
+ #include <stdlib.h>
3
+ #include <string.h>
4
+ #include "../include/libstemmer.h"
5
+ #include "../runtime/api.h"
6
+ #include "@MODULES_H@"
7
+
8
/* A stemmer instance: the three entry points copied from the selected
 * module, plus the environment they operate on.
 */
struct sb_stemmer {
    struct SN_env * (*create)(void);   /* allocates an environment */
    void (*close)(struct SN_env *);    /* releases an environment */
    int (*stem)(struct SN_env *);      /* stems the word held in an environment */

    struct SN_env * env;               /* environment obtained from create() */
};
15
+
16
+ extern const char **
17
+ sb_stemmer_list(void)
18
+ {
19
+ return algorithm_names;
20
+ }
21
+
22
+ static stemmer_encoding_t
23
+ sb_getenc(const char * charenc)
24
+ {
25
+ const struct stemmer_encoding * encoding;
26
+ if (charenc == NULL) return ENC_UTF_8;
27
+ for (encoding = encodings; encoding->name != 0; encoding++) {
28
+ if (strcmp(encoding->name, charenc) == 0) break;
29
+ }
30
+ if (encoding->name == NULL) return ENC_UNKNOWN;
31
+ return encoding->enc;
32
+ }
33
+
34
+ extern struct sb_stemmer *
35
+ sb_stemmer_new(const char * algorithm, const char * charenc)
36
+ {
37
+ stemmer_encoding_t enc;
38
+ const struct stemmer_modules * module;
39
+ struct sb_stemmer * stemmer;
40
+
41
+ enc = sb_getenc(charenc);
42
+ if (enc == ENC_UNKNOWN) return NULL;
43
+
44
+ for (module = modules; module->name != 0; module++) {
45
+ if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break;
46
+ }
47
+ if (module->name == NULL) return NULL;
48
+
49
+ stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
50
+ if (stemmer == NULL) return NULL;
51
+
52
+ stemmer->create = module->create;
53
+ stemmer->close = module->close;
54
+ stemmer->stem = module->stem;
55
+
56
+ stemmer->env = stemmer->create();
57
+ if (stemmer->env == NULL)
58
+ {
59
+ sb_stemmer_delete(stemmer);
60
+ return NULL;
61
+ }
62
+
63
+ return stemmer;
64
+ }
65
+
66
+ void
67
+ sb_stemmer_delete(struct sb_stemmer * stemmer)
68
+ {
69
+ if (stemmer == 0) return;
70
+ if (stemmer->close) {
71
+ stemmer->close(stemmer->env);
72
+ stemmer->close = 0;
73
+ }
74
+ free(stemmer);
75
+ }
76
+
77
+ const sb_symbol *
78
+ sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size)
79
+ {
80
+ int ret;
81
+ if (SN_set_current(stemmer->env, size, (const symbol *)(word)))
82
+ {
83
+ stemmer->env->l = 0;
84
+ return NULL;
85
+ }
86
+ ret = stemmer->stem(stemmer->env);
87
+ if (ret < 0) return NULL;
88
+ stemmer->env->p[stemmer->env->l] = 0;
89
+ return (const sb_symbol *)(stemmer->env->p);
90
+ }
91
+
92
+ int
93
+ sb_stemmer_length(struct sb_stemmer * stemmer)
94
+ {
95
+ return stemmer->env->l;
96
+ }
@@ -0,0 +1,90 @@
1
+ #!/usr/bin/env perl
2
+ use strict;
3
+ use 5.006;
4
+ use warnings;
5
+
6
my $progname = $0;

# Exactly two arguments are required: the file to write and the modules
# description file to read.
unless (@ARGV == 2) {
    print "Usage: $progname <outfile> <modules description file>\n";
    exit 1;
}

my $outname  = shift(@ARGV);
my $descfile = shift(@ARGV);

# Tables filled in by readinput():
my %aliases;          # alias name     => algorithm name
my %algorithms;       # algorithm name => 1
my %algorithm_encs;   # algorithm name => { encoding name => 1 }

my %encs;             # encoding name  => 1
21
+
22
# Record that algorithm $alg is available in encoding $enc, and that $enc
# is an encoding in use.
sub addalgenc($$) {
    my ($alg, $enc) = @_;

    # Start an empty encoding set the first time an algorithm is seen.
    $algorithm_encs{$alg} = {} unless defined $algorithm_encs{$alg};
    $algorithm_encs{$alg}->{$enc} = 1;

    $encs{$enc} = 1;
}
36
+
37
# Read the modules description file named by $descfile and fill in
# %algorithms, %aliases, %algorithm_encs and %encs.
#
# Lines that are blank or start with '#' (after optional whitespace) are
# skipped. Every other line holds three whitespace-separated fields:
# the algorithm name, a comma-separated list of encodings, and a
# comma-separated list of aliases.
#
# Dies if the description file cannot be opened.
sub readinput()
{
    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode, and a failure is reported instead of silently
    # producing an empty module list.
    open(my $desc_fh, '<', $descfile)
        or die "Can't open modules description file `$descfile': $!\n";

    while (my $line = <$desc_fh>)
    {
        next if $line =~ m/^\s*#/;
        next if $line =~ m/^\s*$/;
        # split ' ' discards leading whitespace, so an indented line does
        # not produce an empty first field.
        my ($alg,$encstr,$aliases) = split(' ', $line);

        $algorithms{$alg} = 1;
        foreach my $alias (split(/,/, $aliases)) {
            foreach my $enc (split(/,/, $encstr)) {
                $aliases{$alias} = $alg;
                addalgenc($alg, $enc);
            }
        }
    }

    close($desc_fh);
}
58
+
59
# Write the generated lists to the file named by $outname:
# a header comment, then 'libstemmer_algorithms' listing every algorithm in
# sorted order, then one '<ENC>_algorithms' list for each of ISO_8859_1,
# ISO_8859_2 and KOI8_R holding the algorithms recorded for that encoding
# in %algorithm_encs.
#
# Dies if the output file cannot be opened or cannot be written completely.
sub printoutput()
{
    # Three-argument open keeps the mode separate from the file name.
    open(my $out, '>', $outname) or die "Can't open output file `$outname': $!\n";

    print $out <<EOS;
# $outname: Lists of stemming modules.
#
# This file is generated by mkalgorithms.pl from a list of module names.
# Do not edit manually.
EOS

    my @algorithms = sort keys(%algorithms);
    print $out "\nlibstemmer_algorithms =";
    foreach my $lang (@algorithms) {
        print $out "\\\n ", $lang;
    }
    print $out "\n";

    for my $enc (qw(ISO_8859_1 ISO_8859_2 KOI8_R)) {
        print $out "\n${enc}_algorithms =";
        foreach my $lang (@algorithms) {
            print $out "\\\n ", $lang if exists $algorithm_encs{$lang}->{$enc};
        }
        print $out "\n";
    }

    # Buffered write errors only surface when the handle is closed.
    close($out) or die "Can't write output file `$outname': $!\n";
}
88
+
89
# Main sequence: load the module descriptions, then write the lists.
readinput();

printoutput();