mittens 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE.txt +30 -0
  5. data/README.md +62 -0
  6. data/Rakefile +21 -0
  7. data/ext/mittens/ext.c +96 -0
  8. data/ext/mittens/extconf.rb +12 -0
  9. data/lib/mittens/version.rb +3 -0
  10. data/lib/mittens.rb +7 -0
  11. data/mittens.gemspec +22 -0
  12. data/vendor/snowball/.gitignore +26 -0
  13. data/vendor/snowball/.travis.yml +112 -0
  14. data/vendor/snowball/AUTHORS +27 -0
  15. data/vendor/snowball/CONTRIBUTING.rst +216 -0
  16. data/vendor/snowball/COPYING +29 -0
  17. data/vendor/snowball/GNUmakefile +742 -0
  18. data/vendor/snowball/NEWS +754 -0
  19. data/vendor/snowball/README.rst +37 -0
  20. data/vendor/snowball/ada/README.md +74 -0
  21. data/vendor/snowball/ada/generate/generate.adb +83 -0
  22. data/vendor/snowball/ada/generate.gpr +21 -0
  23. data/vendor/snowball/ada/src/stemmer.adb +620 -0
  24. data/vendor/snowball/ada/src/stemmer.ads +219 -0
  25. data/vendor/snowball/ada/src/stemwords.adb +70 -0
  26. data/vendor/snowball/ada/stemmer_config.gpr +83 -0
  27. data/vendor/snowball/ada/stemwords.gpr +21 -0
  28. data/vendor/snowball/algorithms/arabic.sbl +558 -0
  29. data/vendor/snowball/algorithms/armenian.sbl +301 -0
  30. data/vendor/snowball/algorithms/basque.sbl +149 -0
  31. data/vendor/snowball/algorithms/catalan.sbl +202 -0
  32. data/vendor/snowball/algorithms/danish.sbl +93 -0
  33. data/vendor/snowball/algorithms/dutch.sbl +164 -0
  34. data/vendor/snowball/algorithms/english.sbl +229 -0
  35. data/vendor/snowball/algorithms/finnish.sbl +197 -0
  36. data/vendor/snowball/algorithms/french.sbl +254 -0
  37. data/vendor/snowball/algorithms/german.sbl +139 -0
  38. data/vendor/snowball/algorithms/german2.sbl +145 -0
  39. data/vendor/snowball/algorithms/greek.sbl +701 -0
  40. data/vendor/snowball/algorithms/hindi.sbl +323 -0
  41. data/vendor/snowball/algorithms/hungarian.sbl +241 -0
  42. data/vendor/snowball/algorithms/indonesian.sbl +192 -0
  43. data/vendor/snowball/algorithms/irish.sbl +149 -0
  44. data/vendor/snowball/algorithms/italian.sbl +202 -0
  45. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
  46. data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
  47. data/vendor/snowball/algorithms/lovins.sbl +208 -0
  48. data/vendor/snowball/algorithms/nepali.sbl +92 -0
  49. data/vendor/snowball/algorithms/norwegian.sbl +80 -0
  50. data/vendor/snowball/algorithms/porter.sbl +139 -0
  51. data/vendor/snowball/algorithms/portuguese.sbl +218 -0
  52. data/vendor/snowball/algorithms/romanian.sbl +236 -0
  53. data/vendor/snowball/algorithms/russian.sbl +221 -0
  54. data/vendor/snowball/algorithms/serbian.sbl +2379 -0
  55. data/vendor/snowball/algorithms/spanish.sbl +230 -0
  56. data/vendor/snowball/algorithms/swedish.sbl +72 -0
  57. data/vendor/snowball/algorithms/tamil.sbl +405 -0
  58. data/vendor/snowball/algorithms/turkish.sbl +470 -0
  59. data/vendor/snowball/algorithms/yiddish.sbl +460 -0
  60. data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
  61. data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
  62. data/vendor/snowball/charsets/cp850.sbl +130 -0
  63. data/vendor/snowball/compiler/analyser.c +1547 -0
  64. data/vendor/snowball/compiler/driver.c +615 -0
  65. data/vendor/snowball/compiler/generator.c +1748 -0
  66. data/vendor/snowball/compiler/generator_ada.c +1702 -0
  67. data/vendor/snowball/compiler/generator_csharp.c +1322 -0
  68. data/vendor/snowball/compiler/generator_go.c +1278 -0
  69. data/vendor/snowball/compiler/generator_java.c +1313 -0
  70. data/vendor/snowball/compiler/generator_js.c +1316 -0
  71. data/vendor/snowball/compiler/generator_pascal.c +1387 -0
  72. data/vendor/snowball/compiler/generator_python.c +1337 -0
  73. data/vendor/snowball/compiler/generator_rust.c +1295 -0
  74. data/vendor/snowball/compiler/header.h +418 -0
  75. data/vendor/snowball/compiler/space.c +286 -0
  76. data/vendor/snowball/compiler/syswords.h +86 -0
  77. data/vendor/snowball/compiler/syswords2.h +13 -0
  78. data/vendor/snowball/compiler/tokeniser.c +567 -0
  79. data/vendor/snowball/csharp/.gitignore +8 -0
  80. data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
  81. data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
  82. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
  83. data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
  84. data/vendor/snowball/csharp/Stemwords/App.config +6 -0
  85. data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
  86. data/vendor/snowball/doc/TODO +12 -0
  87. data/vendor/snowball/doc/libstemmer_c_README +148 -0
  88. data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
  89. data/vendor/snowball/doc/libstemmer_java_README +67 -0
  90. data/vendor/snowball/doc/libstemmer_js_README +48 -0
  91. data/vendor/snowball/doc/libstemmer_python_README +113 -0
  92. data/vendor/snowball/examples/stemwords.c +204 -0
  93. data/vendor/snowball/go/README.md +55 -0
  94. data/vendor/snowball/go/among.go +16 -0
  95. data/vendor/snowball/go/env.go +403 -0
  96. data/vendor/snowball/go/stemwords/generate.go +68 -0
  97. data/vendor/snowball/go/stemwords/main.go +68 -0
  98. data/vendor/snowball/go/util.go +34 -0
  99. data/vendor/snowball/iconv.py +50 -0
  100. data/vendor/snowball/include/libstemmer.h +78 -0
  101. data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
  102. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
  103. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
  104. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
  105. data/vendor/snowball/javascript/base-stemmer.js +294 -0
  106. data/vendor/snowball/javascript/stemwords.js +106 -0
  107. data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
  108. data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
  109. data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
  110. data/vendor/snowball/libstemmer/modules.txt +63 -0
  111. data/vendor/snowball/libstemmer/test.c +34 -0
  112. data/vendor/snowball/pascal/.gitignore +4 -0
  113. data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
  114. data/vendor/snowball/pascal/generate.pl +23 -0
  115. data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
  116. data/vendor/snowball/python/MANIFEST.in +7 -0
  117. data/vendor/snowball/python/create_init.py +54 -0
  118. data/vendor/snowball/python/setup.cfg +6 -0
  119. data/vendor/snowball/python/setup.py +81 -0
  120. data/vendor/snowball/python/snowballstemmer/among.py +13 -0
  121. data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
  122. data/vendor/snowball/python/stemwords.py +101 -0
  123. data/vendor/snowball/python/testapp.py +28 -0
  124. data/vendor/snowball/runtime/api.c +58 -0
  125. data/vendor/snowball/runtime/api.h +32 -0
  126. data/vendor/snowball/runtime/header.h +61 -0
  127. data/vendor/snowball/runtime/utilities.c +513 -0
  128. data/vendor/snowball/rust/Cargo.toml +7 -0
  129. data/vendor/snowball/rust/build.rs +55 -0
  130. data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
  131. data/vendor/snowball/rust/src/main.rs +102 -0
  132. data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
  133. data/vendor/snowball/rust/src/snowball/among.rs +6 -0
  134. data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
  135. data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
  136. data/vendor/snowball/tests/stemtest.c +95 -0
  137. metadata +178 -0
@@ -0,0 +1,513 @@
1
+
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+ #include <string.h>
5
+
6
+ #include "header.h"
7
+
8
+ #define CREATE_SIZE 1
9
+
10
+ extern symbol * create_s(void) {
11
+ symbol * p;
12
+ void * mem = malloc(HEAD + (CREATE_SIZE + 1) * sizeof(symbol));
13
+ if (mem == NULL) return NULL;
14
+ p = (symbol *) (HEAD + (char *) mem);
15
+ CAPACITY(p) = CREATE_SIZE;
16
+ SET_SIZE(p, 0);
17
+ return p;
18
+ }
19
+
20
+ extern void lose_s(symbol * p) {
21
+ if (p == NULL) return;
22
+ free((char *) p - HEAD);
23
+ }
24
+
25
+ /*
26
+ new_p = skip_utf8(p, c, l, n); skips n characters forwards from p + c.
27
+ new_p is the new position, or -1 on failure.
28
+
29
+ -- used to implement hop and next in the utf8 case.
30
+ */
31
+
32
+ extern int skip_utf8(const symbol * p, int c, int limit, int n) {
33
+ int b;
34
+ if (n < 0) return -1;
35
+ for (; n > 0; n--) {
36
+ if (c >= limit) return -1;
37
+ b = p[c++];
38
+ if (b >= 0xC0) { /* 1100 0000 */
39
+ while (c < limit) {
40
+ b = p[c];
41
+ if (b >= 0xC0 || b < 0x80) break;
42
+ /* break unless b is 10------ */
43
+ c++;
44
+ }
45
+ }
46
+ }
47
+ return c;
48
+ }
49
+
50
+ /*
51
+ new_p = skip_b_utf8(p, c, lb, n); skips n characters backwards from p + c - 1
52
+ new_p is the new position, or -1 on failure.
53
+
54
+ -- used to implement hop and next in the utf8 case.
55
+ */
56
+
57
+ extern int skip_b_utf8(const symbol * p, int c, int limit, int n) {
58
+ int b;
59
+ if (n < 0) return -1;
60
+ for (; n > 0; n--) {
61
+ if (c <= limit) return -1;
62
+ b = p[--c];
63
+ if (b >= 0x80) { /* 1000 0000 */
64
+ while (c > limit) {
65
+ b = p[c];
66
+ if (b >= 0xC0) break; /* 1100 0000 */
67
+ c--;
68
+ }
69
+ }
70
+ }
71
+ return c;
72
+ }
73
+
74
+ /* Code for character groupings: utf8 cases */
75
+
76
+ static int get_utf8(const symbol * p, int c, int l, int * slot) {
77
+ int b0, b1, b2;
78
+ if (c >= l) return 0;
79
+ b0 = p[c++];
80
+ if (b0 < 0xC0 || c == l) { /* 1100 0000 */
81
+ *slot = b0;
82
+ return 1;
83
+ }
84
+ b1 = p[c++] & 0x3F;
85
+ if (b0 < 0xE0 || c == l) { /* 1110 0000 */
86
+ *slot = (b0 & 0x1F) << 6 | b1;
87
+ return 2;
88
+ }
89
+ b2 = p[c++] & 0x3F;
90
+ if (b0 < 0xF0 || c == l) { /* 1111 0000 */
91
+ *slot = (b0 & 0xF) << 12 | b1 << 6 | b2;
92
+ return 3;
93
+ }
94
+ *slot = (b0 & 0x7) << 18 | b1 << 12 | b2 << 6 | (p[c] & 0x3F);
95
+ return 4;
96
+ }
97
+
98
+ static int get_b_utf8(const symbol * p, int c, int lb, int * slot) {
99
+ int a, b;
100
+ if (c <= lb) return 0;
101
+ b = p[--c];
102
+ if (b < 0x80 || c == lb) { /* 1000 0000 */
103
+ *slot = b;
104
+ return 1;
105
+ }
106
+ a = b & 0x3F;
107
+ b = p[--c];
108
+ if (b >= 0xC0 || c == lb) { /* 1100 0000 */
109
+ *slot = (b & 0x1F) << 6 | a;
110
+ return 2;
111
+ }
112
+ a |= (b & 0x3F) << 6;
113
+ b = p[--c];
114
+ if (b >= 0xE0 || c == lb) { /* 1110 0000 */
115
+ *slot = (b & 0xF) << 12 | a;
116
+ return 3;
117
+ }
118
+ *slot = (p[--c] & 0x7) << 18 | (b & 0x3F) << 12 | a;
119
+ return 4;
120
+ }
121
+
122
+ extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
123
+ do {
124
+ int ch;
125
+ int w = get_utf8(z->p, z->c, z->l, & ch);
126
+ if (!w) return -1;
127
+ if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
128
+ return w;
129
+ z->c += w;
130
+ } while (repeat);
131
+ return 0;
132
+ }
133
+
134
+ extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
135
+ do {
136
+ int ch;
137
+ int w = get_b_utf8(z->p, z->c, z->lb, & ch);
138
+ if (!w) return -1;
139
+ if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
140
+ return w;
141
+ z->c -= w;
142
+ } while (repeat);
143
+ return 0;
144
+ }
145
+
146
+ extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
147
+ do {
148
+ int ch;
149
+ int w = get_utf8(z->p, z->c, z->l, & ch);
150
+ if (!w) return -1;
151
+ if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
152
+ return w;
153
+ z->c += w;
154
+ } while (repeat);
155
+ return 0;
156
+ }
157
+
158
+ extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
159
+ do {
160
+ int ch;
161
+ int w = get_b_utf8(z->p, z->c, z->lb, & ch);
162
+ if (!w) return -1;
163
+ if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
164
+ return w;
165
+ z->c -= w;
166
+ } while (repeat);
167
+ return 0;
168
+ }
169
+
170
+ /* Code for character groupings: non-utf8 cases */
171
+
172
+ extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
173
+ do {
174
+ int ch;
175
+ if (z->c >= z->l) return -1;
176
+ ch = z->p[z->c];
177
+ if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
178
+ return 1;
179
+ z->c++;
180
+ } while (repeat);
181
+ return 0;
182
+ }
183
+
184
+ extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
185
+ do {
186
+ int ch;
187
+ if (z->c <= z->lb) return -1;
188
+ ch = z->p[z->c - 1];
189
+ if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
190
+ return 1;
191
+ z->c--;
192
+ } while (repeat);
193
+ return 0;
194
+ }
195
+
196
+ extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
197
+ do {
198
+ int ch;
199
+ if (z->c >= z->l) return -1;
200
+ ch = z->p[z->c];
201
+ if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
202
+ return 1;
203
+ z->c++;
204
+ } while (repeat);
205
+ return 0;
206
+ }
207
+
208
+ extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
209
+ do {
210
+ int ch;
211
+ if (z->c <= z->lb) return -1;
212
+ ch = z->p[z->c - 1];
213
+ if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
214
+ return 1;
215
+ z->c--;
216
+ } while (repeat);
217
+ return 0;
218
+ }
219
+
220
+ extern int eq_s(struct SN_env * z, int s_size, const symbol * s) {
221
+ if (z->l - z->c < s_size || memcmp(z->p + z->c, s, s_size * sizeof(symbol)) != 0) return 0;
222
+ z->c += s_size; return 1;
223
+ }
224
+
225
+ extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s) {
226
+ if (z->c - z->lb < s_size || memcmp(z->p + z->c - s_size, s, s_size * sizeof(symbol)) != 0) return 0;
227
+ z->c -= s_size; return 1;
228
+ }
229
+
230
+ extern int eq_v(struct SN_env * z, const symbol * p) {
231
+ return eq_s(z, SIZE(p), p);
232
+ }
233
+
234
+ extern int eq_v_b(struct SN_env * z, const symbol * p) {
235
+ return eq_s_b(z, SIZE(p), p);
236
+ }
237
+
238
+ extern int find_among(struct SN_env * z, const struct among * v, int v_size) {
239
+
240
+ int i = 0;
241
+ int j = v_size;
242
+
243
+ int c = z->c; int l = z->l;
244
+ const symbol * q = z->p + c;
245
+
246
+ const struct among * w;
247
+
248
+ int common_i = 0;
249
+ int common_j = 0;
250
+
251
+ int first_key_inspected = 0;
252
+
253
+ while (1) {
254
+ int k = i + ((j - i) >> 1);
255
+ int diff = 0;
256
+ int common = common_i < common_j ? common_i : common_j; /* smaller */
257
+ w = v + k;
258
+ {
259
+ int i2; for (i2 = common; i2 < w->s_size; i2++) {
260
+ if (c + common == l) { diff = -1; break; }
261
+ diff = q[common] - w->s[i2];
262
+ if (diff != 0) break;
263
+ common++;
264
+ }
265
+ }
266
+ if (diff < 0) {
267
+ j = k;
268
+ common_j = common;
269
+ } else {
270
+ i = k;
271
+ common_i = common;
272
+ }
273
+ if (j - i <= 1) {
274
+ if (i > 0) break; /* v->s has been inspected */
275
+ if (j == i) break; /* only one item in v */
276
+
277
+ /* - but now we need to go round once more to get
278
+ v->s inspected. This looks messy, but is actually
279
+ the optimal approach. */
280
+
281
+ if (first_key_inspected) break;
282
+ first_key_inspected = 1;
283
+ }
284
+ }
285
+ while (1) {
286
+ w = v + i;
287
+ if (common_i >= w->s_size) {
288
+ z->c = c + w->s_size;
289
+ if (w->function == 0) return w->result;
290
+ {
291
+ int res = w->function(z);
292
+ z->c = c + w->s_size;
293
+ if (res) return w->result;
294
+ }
295
+ }
296
+ i = w->substring_i;
297
+ if (i < 0) return 0;
298
+ }
299
+ }
300
+
301
+ /* find_among_b is for backwards processing. Same comments apply */
302
+
303
+ extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) {
304
+
305
+ int i = 0;
306
+ int j = v_size;
307
+
308
+ int c = z->c; int lb = z->lb;
309
+ const symbol * q = z->p + c - 1;
310
+
311
+ const struct among * w;
312
+
313
+ int common_i = 0;
314
+ int common_j = 0;
315
+
316
+ int first_key_inspected = 0;
317
+
318
+ while (1) {
319
+ int k = i + ((j - i) >> 1);
320
+ int diff = 0;
321
+ int common = common_i < common_j ? common_i : common_j;
322
+ w = v + k;
323
+ {
324
+ int i2; for (i2 = w->s_size - 1 - common; i2 >= 0; i2--) {
325
+ if (c - common == lb) { diff = -1; break; }
326
+ diff = q[- common] - w->s[i2];
327
+ if (diff != 0) break;
328
+ common++;
329
+ }
330
+ }
331
+ if (diff < 0) { j = k; common_j = common; }
332
+ else { i = k; common_i = common; }
333
+ if (j - i <= 1) {
334
+ if (i > 0) break;
335
+ if (j == i) break;
336
+ if (first_key_inspected) break;
337
+ first_key_inspected = 1;
338
+ }
339
+ }
340
+ while (1) {
341
+ w = v + i;
342
+ if (common_i >= w->s_size) {
343
+ z->c = c - w->s_size;
344
+ if (w->function == 0) return w->result;
345
+ {
346
+ int res = w->function(z);
347
+ z->c = c - w->s_size;
348
+ if (res) return w->result;
349
+ }
350
+ }
351
+ i = w->substring_i;
352
+ if (i < 0) return 0;
353
+ }
354
+ }
355
+
356
+
357
+ /* Increase the size of the buffer pointed to by p to at least n symbols.
358
+ * If insufficient memory, returns NULL and frees the old buffer.
359
+ */
360
+ static symbol * increase_size(symbol * p, int n) {
361
+ symbol * q;
362
+ int new_size = n + 20;
363
+ void * mem = realloc((char *) p - HEAD,
364
+ HEAD + (new_size + 1) * sizeof(symbol));
365
+ if (mem == NULL) {
366
+ lose_s(p);
367
+ return NULL;
368
+ }
369
+ q = (symbol *) (HEAD + (char *)mem);
370
+ CAPACITY(q) = new_size;
371
+ return q;
372
+ }
373
+
374
+ /* to replace symbols between c_bra and c_ket in z->p by the
375
+ s_size symbols at s.
376
+ Returns 0 on success, -1 on error.
377
+ Also, frees z->p (and sets it to NULL) on error.
378
+ */
379
+ extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjptr)
380
+ {
381
+ int adjustment;
382
+ int len;
383
+ if (z->p == NULL) {
384
+ z->p = create_s();
385
+ if (z->p == NULL) return -1;
386
+ }
387
+ adjustment = s_size - (c_ket - c_bra);
388
+ len = SIZE(z->p);
389
+ if (adjustment != 0) {
390
+ if (adjustment + len > CAPACITY(z->p)) {
391
+ z->p = increase_size(z->p, adjustment + len);
392
+ if (z->p == NULL) return -1;
393
+ }
394
+ memmove(z->p + c_ket + adjustment,
395
+ z->p + c_ket,
396
+ (len - c_ket) * sizeof(symbol));
397
+ SET_SIZE(z->p, adjustment + len);
398
+ z->l += adjustment;
399
+ if (z->c >= c_ket)
400
+ z->c += adjustment;
401
+ else if (z->c > c_bra)
402
+ z->c = c_bra;
403
+ }
404
+ if (s_size) memmove(z->p + c_bra, s, s_size * sizeof(symbol));
405
+ if (adjptr != NULL)
406
+ *adjptr = adjustment;
407
+ return 0;
408
+ }
409
+
410
+ static int slice_check(struct SN_env * z) {
411
+
412
+ if (z->bra < 0 ||
413
+ z->bra > z->ket ||
414
+ z->ket > z->l ||
415
+ z->p == NULL ||
416
+ z->l > SIZE(z->p)) /* this line could be removed */
417
+ {
418
+ #if 0
419
+ fprintf(stderr, "faulty slice operation:\n");
420
+ debug(z, -1, 0);
421
+ #endif
422
+ return -1;
423
+ }
424
+ return 0;
425
+ }
426
+
427
+ extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s) {
428
+ if (slice_check(z)) return -1;
429
+ return replace_s(z, z->bra, z->ket, s_size, s, NULL);
430
+ }
431
+
432
+ extern int slice_from_v(struct SN_env * z, const symbol * p) {
433
+ return slice_from_s(z, SIZE(p), p);
434
+ }
435
+
436
+ extern int slice_del(struct SN_env * z) {
437
+ return slice_from_s(z, 0, 0);
438
+ }
439
+
440
+ extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s) {
441
+ int adjustment;
442
+ if (replace_s(z, bra, ket, s_size, s, &adjustment))
443
+ return -1;
444
+ if (bra <= z->bra) z->bra += adjustment;
445
+ if (bra <= z->ket) z->ket += adjustment;
446
+ return 0;
447
+ }
448
+
449
+ extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p) {
450
+ return insert_s(z, bra, ket, SIZE(p), p);
451
+ }
452
+
453
+ extern symbol * slice_to(struct SN_env * z, symbol * p) {
454
+ if (slice_check(z)) {
455
+ lose_s(p);
456
+ return NULL;
457
+ }
458
+ {
459
+ int len = z->ket - z->bra;
460
+ if (CAPACITY(p) < len) {
461
+ p = increase_size(p, len);
462
+ if (p == NULL)
463
+ return NULL;
464
+ }
465
+ memmove(p, z->p + z->bra, len * sizeof(symbol));
466
+ SET_SIZE(p, len);
467
+ }
468
+ return p;
469
+ }
470
+
471
+ extern symbol * assign_to(struct SN_env * z, symbol * p) {
472
+ int len = z->l;
473
+ if (CAPACITY(p) < len) {
474
+ p = increase_size(p, len);
475
+ if (p == NULL)
476
+ return NULL;
477
+ }
478
+ memmove(p, z->p, len * sizeof(symbol));
479
+ SET_SIZE(p, len);
480
+ return p;
481
+ }
482
+
483
+ extern int len_utf8(const symbol * p) {
484
+ int size = SIZE(p);
485
+ int len = 0;
486
+ while (size--) {
487
+ symbol b = *p++;
488
+ if (b >= 0xC0 || b < 0x80) ++len;
489
+ }
490
+ return len;
491
+ }
492
+
493
+ #if 0
494
+ extern void debug(struct SN_env * z, int number, int line_count) {
495
+ int i;
496
+ int limit = SIZE(z->p);
497
+ /*if (number >= 0) printf("%3d (line %4d): '", number, line_count);*/
498
+ if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit);
499
+ for (i = 0; i <= limit; i++) {
500
+ if (z->lb == i) printf("{");
501
+ if (z->bra == i) printf("[");
502
+ if (z->c == i) printf("|");
503
+ if (z->ket == i) printf("]");
504
+ if (z->l == i) printf("}");
505
+ if (i < limit)
506
+ { int ch = z->p[i];
507
+ if (ch == 0) ch = '#';
508
+ printf("%c", ch);
509
+ }
510
+ }
511
+ printf("'\n");
512
+ }
513
+ #endif
@@ -0,0 +1,7 @@
1
+ [package]
2
+ name = "testapp"
3
+ version = "0.1.0"
4
+ authors = ["Jakob Demler <jdemler@curry-software.com>"]
5
+ build = "build.rs"
6
+
7
+ [dependencies]
@@ -0,0 +1,55 @@
1
+ use std::env;
2
+ use std::fs;
3
+ use std::fs::{OpenOptions};
4
+ use std::io::Write;
5
+ use std::path::Path;
6
+
7
+
8
// This build script makes the code independent from the algorithms declared
// in the makefile.
// We check which stemmers were generated and then produce the corresponding
// includes for src/snowball/algorithms/mod.rs and a closure for src/main.rs to
// match strings to stemmers.
//
// Only files named `<lang>_stemmer.rs` are treated as stemmers. Anything else
// found in the algorithms directory (mod.rs, editor backups, stray files) is
// skipped instead of aborting the build or producing a bogus module name.
//
// Panics if OUT_DIR or CARGO_MANIFEST_DIR is unset, or if any file system
// operation fails.
fn main() {
    const SUFFIX: &str = "_stemmer";

    let out_dir_var = env::var("OUT_DIR").unwrap();
    let out_dir = Path::new(&out_dir_var);
    let lang_match_path = out_dir.join("lang_matches.rs");
    let lang_include_path = out_dir.join("lang_include.rs");
    let mut lang_match_file = OpenOptions::new()
        .write(true)
        .create(true)
        .truncate(true)
        .open(&lang_match_path)
        .unwrap();
    let mut lang_include_file = OpenOptions::new()
        .write(true)
        .create(true)
        .truncate(true)
        .open(&lang_include_path)
        .unwrap();

    let src_dir = Path::new(&env::var("CARGO_MANIFEST_DIR").unwrap()).join("src");
    let algo_dir = src_dir.join("snowball/algorithms");

    // Opening of the generated closure; one match arm per stemmer follows.
    lang_match_file
        .write_all(b"\nmove |lang:String|{\nmatch lang.as_str() {")
        .unwrap();

    for file in fs::read_dir(&algo_dir).unwrap() {
        let file = file.unwrap();
        let path = file.path();

        // A module `<lang>_stemmer` can only be loaded from `<lang>_stemmer.rs`.
        let is_rust_source = path.extension().and_then(|ext| ext.to_str()) == Some("rs");
        if !path.is_file() || !is_rust_source {
            continue;
        }
        let filestem = match path.file_stem().and_then(|stem| stem.to_str()) {
            Some(stem) => stem,
            None => continue,
        };
        // Skips mod.rs and anything else that is not a stemmer; also rules out
        // an empty language name.
        if !filestem.ends_with(SUFFIX) || filestem.len() == SUFFIX.len() {
            continue;
        }
        let langname = &filestem[..filestem.len() - SUFFIX.len()];

        // The stemmer sources have to sit next to the generated include file.
        fs::copy(&path, out_dir.join(file.file_name())).unwrap();

        writeln!(
            &mut lang_match_file,
            "\"{}\" => Stemmer {{ stemmer: snowball::algorithms::{}_stemmer::stem}},",
            langname, langname
        )
        .unwrap();

        writeln!(&mut lang_include_file, "pub mod {}_stemmer;", langname).unwrap();
    }

    // Fallback arm and closing braces of the generated closure.
    lang_match_file
        .write_all(b"\nx => panic!(\"Unknown algorithm '{}'\", x)\n}\n}\n")
        .unwrap();
}
@@ -0,0 +1,30 @@
1
+ Applying this patch restores compatibility with Rust < 1.27 (but causes newer
2
+ versions to report "warning: trait objects without an explicit `dyn` are
3
+ deprecated").
4
+
5
+ diff --git a/rust/src/main.rs b/rust/src/main.rs
6
+ index 064325a9..bf752795 100644
7
+ --- a/rust/src/main.rs
8
+ +++ b/rust/src/main.rs
9
+ @@ -56,9 +56,9 @@ fn main() {
10
+
11
+
12
+ let mut output = if let Some(output_file) = output_arg {
13
+ - Box::new(File::create(Path::new(&output_file)).unwrap()) as Box<dyn Write>
14
+ + Box::new(File::create(Path::new(&output_file)).unwrap()) as Box<Write>
15
+ } else {
16
+ - Box::new(std::io::stdout()) as Box<dyn Write>
17
+ + Box::new(std::io::stdout()) as Box<Write>
18
+ };
19
+
20
+ if let Some(input_file) = input_arg {
21
+ diff --git a/rust/src/snowball/among.rs b/rust/src/snowball/among.rs
22
+ index 57fc8bae..70631933 100644
23
+ --- a/rust/src/snowball/among.rs
24
+ +++ b/rust/src/snowball/among.rs
25
+ @@ -3,4 +3,4 @@ use snowball::SnowballEnv;
26
+ pub struct Among<T: 'static>(pub &'static str,
27
+ pub i32,
28
+ pub i32,
29
+ - pub Option<&'static (dyn Fn(&mut SnowballEnv, &mut T) -> bool + Sync)>);
30
+ + pub Option<&'static (Fn(&mut SnowballEnv, &mut T) -> bool + Sync)>);
@@ -0,0 +1,102 @@
1
+ use std::fs::File;
2
+ use std::io::{BufRead, BufReader, Write};
3
+ use std::path::Path;
4
+ use std::env;
5
+ use std::borrow::Cow;
6
+
7
+ pub mod snowball;
8
+
9
+ use snowball::SnowballEnv;
10
+
11
+
12
/// Prints the command-line synopsis to stdout, using `name` as the program
/// name.
fn usage(name: &str) {
    println!(
        "{} -l <language> [-i <input file>] [-o <output file>]\n\
         The input file consists of a list of words to be stemmed, one per\n\
         line. Words should be in lower case, but (for English) A-Z letters\n\
         are mapped to their a-z equivalents anyway. If omitted, stdin is\n\
         used.",
        name
    );
}
19
+
20
+ fn main() {
21
+ let args: Vec<String> = env::args().collect();
22
+ if args.len() < 3 {
23
+ usage(&args[0]);
24
+ } else {
25
+ let mut language = None;
26
+ let mut input_arg = None;
27
+ let mut output_arg = None;
28
+ let mut i = 1;
29
+ while i < args.len() {
30
+ match args[i].as_str() {
31
+ "-l" => {
32
+ language = Some(args[i+1].clone());
33
+ i += 2;
34
+ },
35
+ "-i" => {
36
+ input_arg = Some(args[i+1].clone());
37
+ i += 2;
38
+ },
39
+ "-o" => {
40
+ output_arg = Some(args[i+1].clone());
41
+ i += 2;
42
+ },
43
+ x => {
44
+ println!("Unrecognized option '{}'", x);
45
+ usage(&args[0]);
46
+ return
47
+ }
48
+ }
49
+ }
50
+ if language.is_none() {
51
+ println!("Please specify a language!");
52
+ usage(&args[0]);
53
+ return;
54
+ }
55
+ let stemmer = Stemmer::create(language.unwrap());
56
+
57
+
58
+ let mut output = if let Some(output_file) = output_arg {
59
+ Box::new(File::create(Path::new(&output_file)).unwrap()) as Box<dyn Write>
60
+ } else {
61
+ Box::new(std::io::stdout()) as Box<dyn Write>
62
+ };
63
+
64
+ if let Some(input_file) = input_arg {
65
+ for line in BufReader::new(File::open(Path::new(&input_file)).unwrap()).lines() {
66
+ writeln!(&mut output, "{}", stemmer.stem(&line.unwrap())).unwrap();
67
+ }
68
+ } else {
69
+ let stdin = std::io::stdin();
70
+ for line in stdin.lock().lines() {
71
+ writeln!(&mut output, "{}", stemmer.stem(&line.unwrap())).unwrap();
72
+ }
73
+ }
74
+ }
75
+ }
76
+
77
+
78
+ /// Wraps a usable interface around the actual stemmer implementation
79
+ pub struct Stemmer {
80
+ stemmer: fn(&mut SnowballEnv) -> bool,
81
+ }
82
+
83
+ impl Stemmer {
84
+ /// Create a new stemmer from an algorithm
85
+ pub fn create(lang: String) -> Self {
86
+ // Have a look at ../build.rs
87
+ // There we generate a file that is rust code for a closure that returns a stemmer.
88
+ // We match against all the algorithms in src/snowball/algoritms/ folder.
89
+ // Alas, this cannot be included as a match statement or function because of Rust's
90
+ // hygenic macros.
91
+ let match_language = include!(concat!(env!("OUT_DIR"), "/lang_matches.rs"));
92
+ match_language(lang)
93
+ }
94
+
95
+ /// Stem a single word
96
+ /// Please note, that the input is expected to be all lowercase (if that is applicable).
97
+ pub fn stem<'a>(&self, input: &'a str) -> Cow<'a, str> {
98
+ let mut env = SnowballEnv::create(input);
99
+ (self.stemmer)(&mut env);
100
+ env.get_current()
101
+ }
102
+ }
@@ -0,0 +1,2 @@
1
+ // The `pub mod <lang>_stemmer;` declarations are generated by build.rs into OUT_DIR/lang_include.rs.
2
+ include!(concat!(env!("OUT_DIR"), "/lang_include.rs"));