mittens 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (137) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE.txt +30 -0
  5. data/README.md +62 -0
  6. data/Rakefile +21 -0
  7. data/ext/mittens/ext.c +96 -0
  8. data/ext/mittens/extconf.rb +12 -0
  9. data/lib/mittens/version.rb +3 -0
  10. data/lib/mittens.rb +7 -0
  11. data/mittens.gemspec +22 -0
  12. data/vendor/snowball/.gitignore +26 -0
  13. data/vendor/snowball/.travis.yml +112 -0
  14. data/vendor/snowball/AUTHORS +27 -0
  15. data/vendor/snowball/CONTRIBUTING.rst +216 -0
  16. data/vendor/snowball/COPYING +29 -0
  17. data/vendor/snowball/GNUmakefile +742 -0
  18. data/vendor/snowball/NEWS +754 -0
  19. data/vendor/snowball/README.rst +37 -0
  20. data/vendor/snowball/ada/README.md +74 -0
  21. data/vendor/snowball/ada/generate/generate.adb +83 -0
  22. data/vendor/snowball/ada/generate.gpr +21 -0
  23. data/vendor/snowball/ada/src/stemmer.adb +620 -0
  24. data/vendor/snowball/ada/src/stemmer.ads +219 -0
  25. data/vendor/snowball/ada/src/stemwords.adb +70 -0
  26. data/vendor/snowball/ada/stemmer_config.gpr +83 -0
  27. data/vendor/snowball/ada/stemwords.gpr +21 -0
  28. data/vendor/snowball/algorithms/arabic.sbl +558 -0
  29. data/vendor/snowball/algorithms/armenian.sbl +301 -0
  30. data/vendor/snowball/algorithms/basque.sbl +149 -0
  31. data/vendor/snowball/algorithms/catalan.sbl +202 -0
  32. data/vendor/snowball/algorithms/danish.sbl +93 -0
  33. data/vendor/snowball/algorithms/dutch.sbl +164 -0
  34. data/vendor/snowball/algorithms/english.sbl +229 -0
  35. data/vendor/snowball/algorithms/finnish.sbl +197 -0
  36. data/vendor/snowball/algorithms/french.sbl +254 -0
  37. data/vendor/snowball/algorithms/german.sbl +139 -0
  38. data/vendor/snowball/algorithms/german2.sbl +145 -0
  39. data/vendor/snowball/algorithms/greek.sbl +701 -0
  40. data/vendor/snowball/algorithms/hindi.sbl +323 -0
  41. data/vendor/snowball/algorithms/hungarian.sbl +241 -0
  42. data/vendor/snowball/algorithms/indonesian.sbl +192 -0
  43. data/vendor/snowball/algorithms/irish.sbl +149 -0
  44. data/vendor/snowball/algorithms/italian.sbl +202 -0
  45. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
  46. data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
  47. data/vendor/snowball/algorithms/lovins.sbl +208 -0
  48. data/vendor/snowball/algorithms/nepali.sbl +92 -0
  49. data/vendor/snowball/algorithms/norwegian.sbl +80 -0
  50. data/vendor/snowball/algorithms/porter.sbl +139 -0
  51. data/vendor/snowball/algorithms/portuguese.sbl +218 -0
  52. data/vendor/snowball/algorithms/romanian.sbl +236 -0
  53. data/vendor/snowball/algorithms/russian.sbl +221 -0
  54. data/vendor/snowball/algorithms/serbian.sbl +2379 -0
  55. data/vendor/snowball/algorithms/spanish.sbl +230 -0
  56. data/vendor/snowball/algorithms/swedish.sbl +72 -0
  57. data/vendor/snowball/algorithms/tamil.sbl +405 -0
  58. data/vendor/snowball/algorithms/turkish.sbl +470 -0
  59. data/vendor/snowball/algorithms/yiddish.sbl +460 -0
  60. data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
  61. data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
  62. data/vendor/snowball/charsets/cp850.sbl +130 -0
  63. data/vendor/snowball/compiler/analyser.c +1547 -0
  64. data/vendor/snowball/compiler/driver.c +615 -0
  65. data/vendor/snowball/compiler/generator.c +1748 -0
  66. data/vendor/snowball/compiler/generator_ada.c +1702 -0
  67. data/vendor/snowball/compiler/generator_csharp.c +1322 -0
  68. data/vendor/snowball/compiler/generator_go.c +1278 -0
  69. data/vendor/snowball/compiler/generator_java.c +1313 -0
  70. data/vendor/snowball/compiler/generator_js.c +1316 -0
  71. data/vendor/snowball/compiler/generator_pascal.c +1387 -0
  72. data/vendor/snowball/compiler/generator_python.c +1337 -0
  73. data/vendor/snowball/compiler/generator_rust.c +1295 -0
  74. data/vendor/snowball/compiler/header.h +418 -0
  75. data/vendor/snowball/compiler/space.c +286 -0
  76. data/vendor/snowball/compiler/syswords.h +86 -0
  77. data/vendor/snowball/compiler/syswords2.h +13 -0
  78. data/vendor/snowball/compiler/tokeniser.c +567 -0
  79. data/vendor/snowball/csharp/.gitignore +8 -0
  80. data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
  81. data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
  82. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
  83. data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
  84. data/vendor/snowball/csharp/Stemwords/App.config +6 -0
  85. data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
  86. data/vendor/snowball/doc/TODO +12 -0
  87. data/vendor/snowball/doc/libstemmer_c_README +148 -0
  88. data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
  89. data/vendor/snowball/doc/libstemmer_java_README +67 -0
  90. data/vendor/snowball/doc/libstemmer_js_README +48 -0
  91. data/vendor/snowball/doc/libstemmer_python_README +113 -0
  92. data/vendor/snowball/examples/stemwords.c +204 -0
  93. data/vendor/snowball/go/README.md +55 -0
  94. data/vendor/snowball/go/among.go +16 -0
  95. data/vendor/snowball/go/env.go +403 -0
  96. data/vendor/snowball/go/stemwords/generate.go +68 -0
  97. data/vendor/snowball/go/stemwords/main.go +68 -0
  98. data/vendor/snowball/go/util.go +34 -0
  99. data/vendor/snowball/iconv.py +50 -0
  100. data/vendor/snowball/include/libstemmer.h +78 -0
  101. data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
  102. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
  103. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
  104. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
  105. data/vendor/snowball/javascript/base-stemmer.js +294 -0
  106. data/vendor/snowball/javascript/stemwords.js +106 -0
  107. data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
  108. data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
  109. data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
  110. data/vendor/snowball/libstemmer/modules.txt +63 -0
  111. data/vendor/snowball/libstemmer/test.c +34 -0
  112. data/vendor/snowball/pascal/.gitignore +4 -0
  113. data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
  114. data/vendor/snowball/pascal/generate.pl +23 -0
  115. data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
  116. data/vendor/snowball/python/MANIFEST.in +7 -0
  117. data/vendor/snowball/python/create_init.py +54 -0
  118. data/vendor/snowball/python/setup.cfg +6 -0
  119. data/vendor/snowball/python/setup.py +81 -0
  120. data/vendor/snowball/python/snowballstemmer/among.py +13 -0
  121. data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
  122. data/vendor/snowball/python/stemwords.py +101 -0
  123. data/vendor/snowball/python/testapp.py +28 -0
  124. data/vendor/snowball/runtime/api.c +58 -0
  125. data/vendor/snowball/runtime/api.h +32 -0
  126. data/vendor/snowball/runtime/header.h +61 -0
  127. data/vendor/snowball/runtime/utilities.c +513 -0
  128. data/vendor/snowball/rust/Cargo.toml +7 -0
  129. data/vendor/snowball/rust/build.rs +55 -0
  130. data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
  131. data/vendor/snowball/rust/src/main.rs +102 -0
  132. data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
  133. data/vendor/snowball/rust/src/snowball/among.rs +6 -0
  134. data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
  135. data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
  136. data/vendor/snowball/tests/stemtest.c +95 -0
  137. metadata +178 -0
@@ -0,0 +1,75 @@
1
+
2
+ package org.tartarus.snowball;
3
+
4
+ import java.lang.reflect.Method;
5
+ import java.io.BufferedReader;
6
+ import java.io.BufferedWriter;
7
+ import java.io.FileInputStream;
8
+ import java.io.FileOutputStream;
9
+ import java.io.InputStream;
10
+ import java.io.InputStreamReader;
11
+ import java.io.OutputStream;
12
+ import java.io.OutputStreamWriter;
13
+ import java.io.Reader;
14
+ import java.io.Writer;
15
+ import java.nio.charset.StandardCharsets;
16
+
17
+ public class TestApp {
18
+ private static void usage()
19
+ {
20
+ System.err.println("Usage: TestApp <algorithm> [<input file>] [-o <output file>]");
21
+ }
22
+
23
+ public static void main(String [] args) throws Throwable {
24
+ if (args.length < 2) {
25
+ usage();
26
+ return;
27
+ }
28
+
29
+ Class stemClass = Class.forName("org.tartarus.snowball.ext." +
30
+ args[0] + "Stemmer");
31
+ SnowballStemmer stemmer = (SnowballStemmer) stemClass.newInstance();
32
+
33
+ int arg = 1;
34
+
35
+ InputStream instream;
36
+ if (args.length > arg && !args[arg].equals("-o")) {
37
+ instream = new FileInputStream(args[arg++]);
38
+ } else {
39
+ instream = System.in;
40
+ }
41
+
42
+ OutputStream outstream;
43
+ if (args.length > arg) {
44
+ if (args.length != arg + 2 || !args[arg].equals("-o")) {
45
+ usage();
46
+ return;
47
+ }
48
+ outstream = new FileOutputStream(args[arg + 1]);
49
+ } else {
50
+ outstream = System.out;
51
+ }
52
+
53
+ Reader reader = new InputStreamReader(instream, StandardCharsets.UTF_8);
54
+ reader = new BufferedReader(reader);
55
+
56
+ Writer output = new OutputStreamWriter(outstream, StandardCharsets.UTF_8);
57
+ output = new BufferedWriter(output);
58
+
59
+ StringBuffer input = new StringBuffer();
60
+ int character;
61
+ while ((character = reader.read()) != -1) {
62
+ char ch = (char) character;
63
+ if (Character.isWhitespace(ch)) {
64
+ stemmer.setCurrent(input.toString());
65
+ stemmer.stem();
66
+ output.write(stemmer.getCurrent());
67
+ output.write('\n');
68
+ input.delete(0, input.length());
69
+ } else {
70
+ input.append(ch < 127 ? Character.toLowerCase(ch) : ch);
71
+ }
72
+ }
73
+ output.flush();
74
+ }
75
+ }
@@ -0,0 +1,294 @@
1
+ /**@constructor*/
2
+ BaseStemmer = function() {
3
+ this.setCurrent = function(value) {
4
+ this.current = value;
5
+ this.cursor = 0;
6
+ this.limit = this.current.length;
7
+ this.limit_backward = 0;
8
+ this.bra = this.cursor;
9
+ this.ket = this.limit;
10
+ };
11
+
12
+ this.getCurrent = function() {
13
+ return this.current;
14
+ };
15
+
16
+ this.copy_from = function(other) {
17
+ this.current = other.current;
18
+ this.cursor = other.cursor;
19
+ this.limit = other.limit;
20
+ this.limit_backward = other.limit_backward;
21
+ this.bra = other.bra;
22
+ this.ket = other.ket;
23
+ };
24
+
25
+ this.in_grouping = function(s, min, max) {
26
+ if (this.cursor >= this.limit) return false;
27
+ var ch = this.current.charCodeAt(this.cursor);
28
+ if (ch > max || ch < min) return false;
29
+ ch -= min;
30
+ if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) == 0) return false;
31
+ this.cursor++;
32
+ return true;
33
+ };
34
+
35
+ this.in_grouping_b = function(s, min, max) {
36
+ if (this.cursor <= this.limit_backward) return false;
37
+ var ch = this.current.charCodeAt(this.cursor - 1);
38
+ if (ch > max || ch < min) return false;
39
+ ch -= min;
40
+ if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) == 0) return false;
41
+ this.cursor--;
42
+ return true;
43
+ };
44
+
45
+ this.out_grouping = function(s, min, max) {
46
+ if (this.cursor >= this.limit) return false;
47
+ var ch = this.current.charCodeAt(this.cursor);
48
+ if (ch > max || ch < min) {
49
+ this.cursor++;
50
+ return true;
51
+ }
52
+ ch -= min;
53
+ if ((s[ch >>> 3] & (0X1 << (ch & 0x7))) == 0) {
54
+ this.cursor++;
55
+ return true;
56
+ }
57
+ return false;
58
+ };
59
+
60
+ this.out_grouping_b = function(s, min, max) {
61
+ if (this.cursor <= this.limit_backward) return false;
62
+ var ch = this.current.charCodeAt(this.cursor - 1);
63
+ if (ch > max || ch < min) {
64
+ this.cursor--;
65
+ return true;
66
+ }
67
+ ch -= min;
68
+ if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) == 0) {
69
+ this.cursor--;
70
+ return true;
71
+ }
72
+ return false;
73
+ };
74
+
75
+ this.eq_s = function(s)
76
+ {
77
+ if (this.limit - this.cursor < s.length) return false;
78
+ if (this.current.slice(this.cursor, this.cursor + s.length) != s)
79
+ {
80
+ return false;
81
+ }
82
+ this.cursor += s.length;
83
+ return true;
84
+ };
85
+
86
+ this.eq_s_b = function(s)
87
+ {
88
+ if (this.cursor - this.limit_backward < s.length) return false;
89
+ if (this.current.slice(this.cursor - s.length, this.cursor) != s)
90
+ {
91
+ return false;
92
+ }
93
+ this.cursor -= s.length;
94
+ return true;
95
+ };
96
+
97
+ /** @return {number} */ this.find_among = function(v)
98
+ {
99
+ var i = 0;
100
+ var j = v.length;
101
+
102
+ var c = this.cursor;
103
+ var l = this.limit;
104
+
105
+ var common_i = 0;
106
+ var common_j = 0;
107
+
108
+ var first_key_inspected = false;
109
+
110
+ while (true)
111
+ {
112
+ var k = i + ((j - i) >>> 1);
113
+ var diff = 0;
114
+ var common = common_i < common_j ? common_i : common_j; // smaller
115
+ // w[0]: string, w[1]: substring_i, w[2]: result, w[3]: function (optional)
116
+ var w = v[k];
117
+ var i2;
118
+ for (i2 = common; i2 < w[0].length; i2++)
119
+ {
120
+ if (c + common == l)
121
+ {
122
+ diff = -1;
123
+ break;
124
+ }
125
+ diff = this.current.charCodeAt(c + common) - w[0].charCodeAt(i2);
126
+ if (diff != 0) break;
127
+ common++;
128
+ }
129
+ if (diff < 0)
130
+ {
131
+ j = k;
132
+ common_j = common;
133
+ }
134
+ else
135
+ {
136
+ i = k;
137
+ common_i = common;
138
+ }
139
+ if (j - i <= 1)
140
+ {
141
+ if (i > 0) break; // v->s has been inspected
142
+ if (j == i) break; // only one item in v
143
+
144
+ // - but now we need to go round once more to get
145
+ // v->s inspected. This looks messy, but is actually
146
+ // the optimal approach.
147
+
148
+ if (first_key_inspected) break;
149
+ first_key_inspected = true;
150
+ }
151
+ }
152
+ do {
153
+ var w = v[i];
154
+ if (common_i >= w[0].length)
155
+ {
156
+ this.cursor = c + w[0].length;
157
+ if (w.length < 4) return w[2];
158
+ var res = w[3](this);
159
+ this.cursor = c + w[0].length;
160
+ if (res) return w[2];
161
+ }
162
+ i = w[1];
163
+ } while (i >= 0);
164
+ return 0;
165
+ };
166
+
167
+ // find_among_b is for backwards processing. Same comments apply
168
+ this.find_among_b = function(v)
169
+ {
170
+ var i = 0;
171
+ var j = v.length
172
+
173
+ var c = this.cursor;
174
+ var lb = this.limit_backward;
175
+
176
+ var common_i = 0;
177
+ var common_j = 0;
178
+
179
+ var first_key_inspected = false;
180
+
181
+ while (true)
182
+ {
183
+ var k = i + ((j - i) >> 1);
184
+ var diff = 0;
185
+ var common = common_i < common_j ? common_i : common_j;
186
+ var w = v[k];
187
+ var i2;
188
+ for (i2 = w[0].length - 1 - common; i2 >= 0; i2--)
189
+ {
190
+ if (c - common == lb)
191
+ {
192
+ diff = -1;
193
+ break;
194
+ }
195
+ diff = this.current.charCodeAt(c - 1 - common) - w[0].charCodeAt(i2);
196
+ if (diff != 0) break;
197
+ common++;
198
+ }
199
+ if (diff < 0)
200
+ {
201
+ j = k;
202
+ common_j = common;
203
+ }
204
+ else
205
+ {
206
+ i = k;
207
+ common_i = common;
208
+ }
209
+ if (j - i <= 1)
210
+ {
211
+ if (i > 0) break;
212
+ if (j == i) break;
213
+ if (first_key_inspected) break;
214
+ first_key_inspected = true;
215
+ }
216
+ }
217
+ do {
218
+ var w = v[i];
219
+ if (common_i >= w[0].length)
220
+ {
221
+ this.cursor = c - w[0].length;
222
+ if (w.length < 4) return w[2];
223
+ var res = w[3](this);
224
+ this.cursor = c - w[0].length;
225
+ if (res) return w[2];
226
+ }
227
+ i = w[1];
228
+ } while (i >= 0);
229
+ return 0;
230
+ };
231
+
232
+ /* to replace chars between c_bra and c_ket in this.current by the
233
+ * chars in s.
234
+ */
235
+ this.replace_s = function(c_bra, c_ket, s)
236
+ {
237
+ var adjustment = s.length - (c_ket - c_bra);
238
+ this.current = this.current.slice(0, c_bra) + s + this.current.slice(c_ket);
239
+ this.limit += adjustment;
240
+ if (this.cursor >= c_ket) this.cursor += adjustment;
241
+ else if (this.cursor > c_bra) this.cursor = c_bra;
242
+ return adjustment;
243
+ };
244
+
245
+ this.slice_check = function()
246
+ {
247
+ if (this.bra < 0 ||
248
+ this.bra > this.ket ||
249
+ this.ket > this.limit ||
250
+ this.limit > this.current.length)
251
+ {
252
+ return false;
253
+ }
254
+ return true;
255
+ };
256
+
257
+ this.slice_from = function(s)
258
+ {
259
+ var result = false;
260
+ if (this.slice_check())
261
+ {
262
+ this.replace_s(this.bra, this.ket, s);
263
+ result = true;
264
+ }
265
+ return result;
266
+ };
267
+
268
+ this.slice_del = function()
269
+ {
270
+ return this.slice_from("");
271
+ };
272
+
273
+ this.insert = function(c_bra, c_ket, s)
274
+ {
275
+ var adjustment = this.replace_s(c_bra, c_ket, s);
276
+ if (c_bra <= this.bra) this.bra += adjustment;
277
+ if (c_bra <= this.ket) this.ket += adjustment;
278
+ };
279
+
280
+ this.slice_to = function()
281
+ {
282
+ var result = '';
283
+ if (this.slice_check())
284
+ {
285
+ result = this.current.slice(this.bra, this.ket);
286
+ }
287
+ return result;
288
+ };
289
+
290
+ this.assign_to = function()
291
+ {
292
+ return this.current.slice(0, this.limit);
293
+ };
294
+ };
@@ -0,0 +1,106 @@
1
+ const stemmer = require('base-stemmer.js');
2
+
3
+ const fs = require('fs');
4
+ const readline = require('readline');
5
+
6
+ function usage() {
7
+ console.log("usage: stemwords.js [-l <language>] -i <input file> -o <output file> [-c <character encoding>] [-h]\n");
8
+ console.log("The input file consists of a list of words to be stemmed, one per");
9
+ console.log("line. Words should be in lower case.\n");
10
+ console.log("If -c is given, the argument is the character encoding of the input");
11
+ console.log("and output files. If it is omitted, the UTF-8 encoding is used.\n");
12
+ console.log("The output file consists of the stemmed words, one per line.\n");
13
+ console.log("-h displays this help");
14
+ }
15
+
16
+ if (process.argv.length < 5)
17
+ {
18
+ usage();
19
+ }
20
+ else
21
+ {
22
+ var input = '';
23
+ var output = '';
24
+ var encoding = 'utf8';
25
+ var language = 'English';
26
+ var show_help = false;
27
+ while (process.argv.length > 0)
28
+ {
29
+ var arg = process.argv.shift();
30
+ switch (arg)
31
+ {
32
+ case "-h":
33
+ show_help = true;
34
+ process.argv.length = 0;
35
+ break;
36
+ case "-l":
37
+ if (process.argv.length == 0)
38
+ {
39
+ show_help = true;
40
+ break;
41
+ }
42
+ language = process.argv.shift();
43
+ break;
44
+ case "-i":
45
+ if (process.argv.length == 0)
46
+ {
47
+ show_help = true;
48
+ break;
49
+ }
50
+ input = process.argv.shift();
51
+ break;
52
+ case "-o":
53
+ if (process.argv.length == 0)
54
+ {
55
+ show_help = true;
56
+ break;
57
+ }
58
+ output = process.argv.shift();
59
+ break;
60
+ case "-c":
61
+ if (process.argv.length == 0)
62
+ {
63
+ show_help = true;
64
+ break;
65
+ }
66
+ encoding = process.argv.shift();
67
+ break;
68
+ }
69
+ }
70
+ if (show_help || input == '' || output == '')
71
+ {
72
+ usage();
73
+ }
74
+ else
75
+ {
76
+ stemming(language, input, output, encoding);
77
+ }
78
+ }
79
+
80
+ // function stemming (lang : string, input : string, output : string, encoding : string) {
81
/**
 * Stem every line of the input file and write the results, one per line,
 * to the output file.
 *
 * @param {string} lang     language name handed to create()
 * @param {string} input    path of the file holding one word per line
 * @param {string} output   path of the file to write the stemmed words to
 * @param {string} encoding character encoding used for both files
 */
function stemming (lang, input, output, encoding) {
    // Resolve the stemmer before touching any file: create() terminates the
    // process for an unknown language.
    var wordStemmer = create(lang);
    const lines = readline.createInterface({
        input: fs.createReadStream(input, encoding),
        terminal: false
    });
    var out = fs.createWriteStream(output, encoding);
    lines.on('line', (original) => {
        out.write(wordStemmer.stemWord(original) + '\n');
    });
    // Finish the output stream once the input is exhausted so buffered data
    // is flushed and the file descriptor is released.
    lines.on('close', () => {
        out.end();
    });
}
92
+
93
/**
 * Load the stemmer module for a language and return a new stemmer instance.
 *
 * The name is lower-cased and must consist of word characters only and must
 * not be 'base'. The module `<name>-stemmer.js` is loaded and an instance of
 * the global constructor `<Name>Stemmer` is returned. If that fails, an
 * error message and the usage text are printed and the process exits with
 * status 1.
 *
 * @param {string} name language name, e.g. 'English'
 * @return {Object} a new stemmer instance
 */
function create (name) {
    var lc_name = name.toLowerCase();
    // The name is spliced into a module path and into generated code below,
    // so anything but plain word characters is rejected up front.
    if (!/\W/.test(lc_name) && lc_name != 'base') {
        var algo = lc_name.charAt(0).toUpperCase() + lc_name.slice(1);
        try {
            // Loaded for its side effect of defining the <Algo>Stemmer global.
            require(lc_name + '-stemmer.js');
            return Function('return new ' + algo + 'Stemmer()')();
        } catch (error) {
            // A missing module just means the language is unknown; any other
            // failure is a real problem worth showing before giving up.
            if (!error || error.code !== 'MODULE_NOT_FOUND') {
                console.error(error);
            }
        }
    }
    console.log('Unknown stemming language: ' + name + '\n');
    usage();
    process.exit(1);
}
@@ -0,0 +1,96 @@
1
+
2
+ #include <stdlib.h>
3
+ #include <string.h>
4
+ #include "../include/libstemmer.h"
5
+ #include "../runtime/api.h"
6
+ #include "@MODULES_H@"
7
+
8
+ struct sb_stemmer {
9
+ struct SN_env * (*create)(void);
10
+ void (*close)(struct SN_env *);
11
+ int (*stem)(struct SN_env *);
12
+
13
+ struct SN_env * env;
14
+ };
15
+
16
+ extern const char **
17
+ sb_stemmer_list(void)
18
+ {
19
+ return algorithm_names;
20
+ }
21
+
22
+ static stemmer_encoding_t
23
+ sb_getenc(const char * charenc)
24
+ {
25
+ const struct stemmer_encoding * encoding;
26
+ if (charenc == NULL) return ENC_UTF_8;
27
+ for (encoding = encodings; encoding->name != 0; encoding++) {
28
+ if (strcmp(encoding->name, charenc) == 0) break;
29
+ }
30
+ if (encoding->name == NULL) return ENC_UNKNOWN;
31
+ return encoding->enc;
32
+ }
33
+
34
+ extern struct sb_stemmer *
35
+ sb_stemmer_new(const char * algorithm, const char * charenc)
36
+ {
37
+ stemmer_encoding_t enc;
38
+ const struct stemmer_modules * module;
39
+ struct sb_stemmer * stemmer;
40
+
41
+ enc = sb_getenc(charenc);
42
+ if (enc == ENC_UNKNOWN) return NULL;
43
+
44
+ for (module = modules; module->name != 0; module++) {
45
+ if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break;
46
+ }
47
+ if (module->name == NULL) return NULL;
48
+
49
+ stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
50
+ if (stemmer == NULL) return NULL;
51
+
52
+ stemmer->create = module->create;
53
+ stemmer->close = module->close;
54
+ stemmer->stem = module->stem;
55
+
56
+ stemmer->env = stemmer->create();
57
+ if (stemmer->env == NULL)
58
+ {
59
+ sb_stemmer_delete(stemmer);
60
+ return NULL;
61
+ }
62
+
63
+ return stemmer;
64
+ }
65
+
66
+ void
67
+ sb_stemmer_delete(struct sb_stemmer * stemmer)
68
+ {
69
+ if (stemmer == 0) return;
70
+ if (stemmer->close) {
71
+ stemmer->close(stemmer->env);
72
+ stemmer->close = 0;
73
+ }
74
+ free(stemmer);
75
+ }
76
+
77
+ const sb_symbol *
78
+ sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size)
79
+ {
80
+ int ret;
81
+ if (SN_set_current(stemmer->env, size, (const symbol *)(word)))
82
+ {
83
+ stemmer->env->l = 0;
84
+ return NULL;
85
+ }
86
+ ret = stemmer->stem(stemmer->env);
87
+ if (ret < 0) return NULL;
88
+ stemmer->env->p[stemmer->env->l] = 0;
89
+ return (const sb_symbol *)(stemmer->env->p);
90
+ }
91
+
92
+ int
93
+ sb_stemmer_length(struct sb_stemmer * stemmer)
94
+ {
95
+ return stemmer->env->l;
96
+ }
@@ -0,0 +1,90 @@
1
+ #!/usr/bin/env perl
2
+ use strict;
3
+ use 5.006;
4
+ use warnings;
5
+
6
+ my $progname = $0;
7
+
8
+ if (scalar @ARGV != 2) {
9
+ print "Usage: $progname <outfile> <modules description file>\n";
10
+ exit 1;
11
+ }
12
+
13
+ my $outname = shift(@ARGV);
14
+ my $descfile = shift(@ARGV);
15
+
16
+ my %aliases = ();
17
+ my %algorithms = ();
18
+ my %algorithm_encs = ();
19
+
20
+ my %encs = ();
21
+
22
# Record that algorithm $alg is available in encoding $enc.
#
# Updates %algorithm_encs (algorithm => { encoding => 1 }) and %encs (the set
# of every encoding seen so far).
sub addalgenc {
    my ($alg, $enc) = @_;

    # Autovivification creates the per-algorithm hash on first use.
    $algorithm_encs{$alg}{$enc} = 1;

    $encs{$enc} = 1;
}
36
+
37
# Parse the modules description file named by $descfile.
#
# Blank lines and lines whose first non-blank character is '#' are skipped.
# Every other line holds up to three whitespace-separated columns: the
# algorithm name, a comma-separated list of encodings and a comma-separated
# list of aliases. Fills %algorithms and %aliases, and registers each
# encoding through addalgenc(). Dies if the file cannot be opened.
sub readinput
{
    open(my $desc_fh, '<', $descfile)
        or die "Can't open modules description file `$descfile': $!\n";

    while (my $line = <$desc_fh>)
    {
        next if $line =~ m/^\s*#/;
        next if $line =~ m/^\s*$/;

        # split ' ' ignores leading whitespace, so an indented line does not
        # yield an empty algorithm name.
        my ($alg, $encstr, $aliases) = split(' ', $line);

        # A line without an aliases column contributes no aliases.
        $aliases = '' unless defined $aliases;

        $algorithms{$alg} = 1;
        foreach my $alias (split(/,/, $aliases)) {
            foreach my $enc (split(/,/, $encstr)) {
                $aliases{$alias} = $alg;
                addalgenc($alg, $enc);
            }
        }
    }

    close($desc_fh);
}
58
+
59
# Write the generated lists to the file named by $outname.
#
# Emits a header comment, then "libstemmer_algorithms" listing every
# algorithm in sorted order, then one "<ENC>_algorithms" list for each of
# ISO_8859_1, ISO_8859_2 and KOI8_R holding the algorithms registered for
# that encoding. Dies if the file cannot be opened or closed.
sub printoutput
{
    open(my $out_fh, '>', $outname)
        or die "Can't open output file `$outname': $!\n";

    print {$out_fh} <<EOS;
# $outname: Lists of stemming modules.
#
# This file is generated by mkalgorithms.pl from a list of module names.
# Do not edit manually.
EOS

    my @algorithms = sort keys(%algorithms);

    # Each name goes on its own backslash-continued line.
    print {$out_fh} "\nlibstemmer_algorithms =";
    foreach my $lang (@algorithms) {
        print {$out_fh} "\\\n ", $lang;
    }
    print {$out_fh} "\n";

    for my $enc (qw(ISO_8859_1 ISO_8859_2 KOI8_R)) {
        print {$out_fh} "\n${enc}_algorithms =";
        foreach my $lang (@algorithms) {
            print {$out_fh} "\\\n ", $lang if exists $algorithm_encs{$lang}->{$enc};
        }
        print {$out_fh} "\n";
    }

    # Buffered write errors only surface when the handle is closed.
    close($out_fh) or die "Can't close output file `$outname': $!\n";
}
88
+
89
+ readinput();
90
+ printoutput();