mittens 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +3 -3
  4. data/lib/mittens/version.rb +1 -1
  5. data/vendor/snowball/.github/workflows/ci.yml +216 -0
  6. data/vendor/snowball/CONTRIBUTING.rst +111 -62
  7. data/vendor/snowball/GNUmakefile +194 -136
  8. data/vendor/snowball/NEWS +798 -3
  9. data/vendor/snowball/README.rst +50 -1
  10. data/vendor/snowball/ada/src/stemmer.adb +25 -13
  11. data/vendor/snowball/ada/src/stemmer.ads +9 -9
  12. data/vendor/snowball/ada/stemmer_config.gpr +7 -7
  13. data/vendor/snowball/algorithms/basque.sbl +4 -19
  14. data/vendor/snowball/algorithms/catalan.sbl +2 -9
  15. data/vendor/snowball/algorithms/danish.sbl +1 -1
  16. data/vendor/snowball/algorithms/dutch.sbl +284 -122
  17. data/vendor/snowball/algorithms/dutch_porter.sbl +178 -0
  18. data/vendor/snowball/algorithms/english.sbl +52 -37
  19. data/vendor/snowball/algorithms/esperanto.sbl +157 -0
  20. data/vendor/snowball/algorithms/estonian.sbl +269 -0
  21. data/vendor/snowball/algorithms/finnish.sbl +2 -3
  22. data/vendor/snowball/algorithms/french.sbl +42 -16
  23. data/vendor/snowball/algorithms/german.sbl +35 -14
  24. data/vendor/snowball/algorithms/greek.sbl +76 -76
  25. data/vendor/snowball/algorithms/hungarian.sbl +8 -6
  26. data/vendor/snowball/algorithms/indonesian.sbl +14 -8
  27. data/vendor/snowball/algorithms/italian.sbl +11 -21
  28. data/vendor/snowball/algorithms/lithuanian.sbl +36 -37
  29. data/vendor/snowball/algorithms/lovins.sbl +0 -1
  30. data/vendor/snowball/algorithms/nepali.sbl +138 -37
  31. data/vendor/snowball/algorithms/norwegian.sbl +19 -5
  32. data/vendor/snowball/algorithms/porter.sbl +2 -2
  33. data/vendor/snowball/algorithms/portuguese.sbl +9 -13
  34. data/vendor/snowball/algorithms/romanian.sbl +17 -4
  35. data/vendor/snowball/algorithms/serbian.sbl +467 -468
  36. data/vendor/snowball/algorithms/spanish.sbl +5 -7
  37. data/vendor/snowball/algorithms/swedish.sbl +60 -6
  38. data/vendor/snowball/algorithms/tamil.sbl +207 -176
  39. data/vendor/snowball/algorithms/turkish.sbl +461 -445
  40. data/vendor/snowball/algorithms/yiddish.sbl +36 -38
  41. data/vendor/snowball/compiler/analyser.c +445 -192
  42. data/vendor/snowball/compiler/driver.c +109 -101
  43. data/vendor/snowball/compiler/generator.c +853 -464
  44. data/vendor/snowball/compiler/generator_ada.c +404 -366
  45. data/vendor/snowball/compiler/generator_csharp.c +297 -260
  46. data/vendor/snowball/compiler/generator_go.c +323 -254
  47. data/vendor/snowball/compiler/generator_java.c +326 -252
  48. data/vendor/snowball/compiler/generator_js.c +362 -252
  49. data/vendor/snowball/compiler/generator_pascal.c +349 -197
  50. data/vendor/snowball/compiler/generator_python.c +257 -240
  51. data/vendor/snowball/compiler/generator_rust.c +423 -251
  52. data/vendor/snowball/compiler/header.h +117 -71
  53. data/vendor/snowball/compiler/space.c +137 -68
  54. data/vendor/snowball/compiler/syswords.h +2 -2
  55. data/vendor/snowball/compiler/tokeniser.c +125 -107
  56. data/vendor/snowball/csharp/Snowball/Among.cs +14 -14
  57. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +7 -7
  58. data/vendor/snowball/csharp/Snowball/Stemmer.cs +57 -37
  59. data/vendor/snowball/csharp/Stemwords/App.config +2 -2
  60. data/vendor/snowball/csharp/Stemwords/Program.cs +16 -12
  61. data/vendor/snowball/doc/libstemmer_c_README +7 -4
  62. data/vendor/snowball/doc/libstemmer_csharp_README +4 -1
  63. data/vendor/snowball/doc/libstemmer_java_README +12 -1
  64. data/vendor/snowball/doc/libstemmer_js_README +6 -4
  65. data/vendor/snowball/doc/libstemmer_python_README +9 -4
  66. data/vendor/snowball/examples/stemwords.c +12 -12
  67. data/vendor/snowball/go/env.go +107 -31
  68. data/vendor/snowball/go/util.go +0 -4
  69. data/vendor/snowball/include/libstemmer.h +4 -0
  70. data/vendor/snowball/java/org/tartarus/snowball/Among.java +32 -15
  71. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +347 -261
  72. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +3 -0
  73. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +52 -37
  74. data/vendor/snowball/javascript/base-stemmer.js +186 -2
  75. data/vendor/snowball/javascript/stemwords.js +3 -6
  76. data/vendor/snowball/libstemmer/libstemmer_c.in +1 -1
  77. data/vendor/snowball/libstemmer/mkalgorithms.pl +6 -6
  78. data/vendor/snowball/libstemmer/mkmodules.pl +2 -2
  79. data/vendor/snowball/libstemmer/modules.txt +13 -10
  80. data/vendor/snowball/libstemmer/test.c +1 -1
  81. data/vendor/snowball/pascal/SnowballProgram.pas +84 -2
  82. data/vendor/snowball/pascal/generate.pl +13 -13
  83. data/vendor/snowball/python/create_init.py +4 -1
  84. data/vendor/snowball/python/setup.cfg +0 -3
  85. data/vendor/snowball/python/setup.py +8 -3
  86. data/vendor/snowball/python/snowballstemmer/basestemmer.py +20 -54
  87. data/vendor/snowball/python/stemwords.py +8 -12
  88. data/vendor/snowball/runtime/api.c +10 -5
  89. data/vendor/snowball/runtime/header.h +10 -9
  90. data/vendor/snowball/runtime/utilities.c +9 -9
  91. data/vendor/snowball/rust/build.rs +1 -1
  92. data/vendor/snowball/rust/src/snowball/snowball_env.rs +83 -5
  93. data/vendor/snowball/tests/stemtest.c +7 -4
  94. metadata +7 -7
  95. data/vendor/snowball/.travis.yml +0 -112
  96. data/vendor/snowball/algorithms/german2.sbl +0 -145
  97. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +0 -240
  98. data/vendor/snowball/compiler/syswords2.h +0 -13
@@ -1,6 +1,9 @@
1
1
 
2
2
  package org.tartarus.snowball;
3
3
 
4
+ /**
5
+ * Parent class of all snowball stemmers, which must implement <code>stem</code>
6
+ */
4
7
  public abstract class SnowballStemmer extends SnowballProgram {
5
8
  public abstract boolean stem();
6
9
 
@@ -1,7 +1,6 @@
1
1
 
2
2
  package org.tartarus.snowball;
3
3
 
4
- import java.lang.reflect.Method;
5
4
  import java.io.BufferedReader;
6
5
  import java.io.BufferedWriter;
7
6
  import java.io.FileInputStream;
@@ -13,6 +12,7 @@ import java.io.OutputStreamWriter;
13
12
  import java.io.Reader;
14
13
  import java.io.Writer;
15
14
  import java.nio.charset.StandardCharsets;
15
+ import java.util.Arrays;
16
16
 
17
17
  public class TestApp {
18
18
  private static void usage()
@@ -20,56 +20,71 @@ public class TestApp {
20
20
  System.err.println("Usage: TestApp <algorithm> [<input file>] [-o <output file>]");
21
21
  }
22
22
 
23
- public static void main(String [] args) throws Throwable {
24
- if (args.length < 2) {
23
+ private static SnowballStemmer getStemmer(String lang) {
24
+ try {
25
+ String c = "org.tartarus.snowball.ext." + lang + "Stemmer";
26
+ return (SnowballStemmer) Class.forName(c).getDeclaredConstructor().newInstance();
27
+ } catch (ReflectiveOperationException e) {
28
+ return null;
29
+ }
30
+ }
31
+
32
+ public static void main(String[] args) throws Throwable {
33
+ if (args.length < 1) {
25
34
  usage();
26
35
  return;
27
36
  }
28
37
 
29
- Class stemClass = Class.forName("org.tartarus.snowball.ext." +
30
- args[0] + "Stemmer");
31
- SnowballStemmer stemmer = (SnowballStemmer) stemClass.newInstance();
38
+ SnowballStemmer stemmer = getStemmer(args[0]);
39
+ if (stemmer == null) {
40
+ System.err.println("Stemmer " + args[0] + " not found");
41
+ return;
42
+ }
32
43
 
33
- int arg = 1;
44
+ int arg = 1;
34
45
 
35
- InputStream instream;
36
- if (args.length > arg && !args[arg].equals("-o")) {
37
- instream = new FileInputStream(args[arg++]);
38
- } else {
39
- instream = System.in;
40
- }
46
+ InputStream instream;
47
+ if (args.length > arg && !args[arg].equals("-o")) {
48
+ instream = new FileInputStream(args[arg++]);
49
+ } else {
50
+ instream = System.in;
51
+ }
41
52
 
42
53
  OutputStream outstream;
43
- if (args.length > arg) {
54
+ if (args.length > arg) {
44
55
  if (args.length != arg + 2 || !args[arg].equals("-o")) {
45
56
  usage();
46
57
  return;
47
58
  }
48
- outstream = new FileOutputStream(args[arg + 1]);
49
- } else {
50
- outstream = System.out;
51
- }
59
+ outstream = new FileOutputStream(args[arg + 1]);
60
+ } else {
61
+ outstream = System.out;
62
+ }
52
63
 
53
- Reader reader = new InputStreamReader(instream, StandardCharsets.UTF_8);
54
- reader = new BufferedReader(reader);
64
+ Reader reader = new InputStreamReader(instream, StandardCharsets.UTF_8);
65
+ reader = new BufferedReader(reader);
55
66
 
56
- Writer output = new OutputStreamWriter(outstream, StandardCharsets.UTF_8);
57
- output = new BufferedWriter(output);
67
+ Writer output = new OutputStreamWriter(outstream, StandardCharsets.UTF_8);
68
+ output = new BufferedWriter(output);
58
69
 
59
- StringBuffer input = new StringBuffer();
60
- int character;
61
- while ((character = reader.read()) != -1) {
62
- char ch = (char) character;
63
- if (Character.isWhitespace(ch)) {
64
- stemmer.setCurrent(input.toString());
65
- stemmer.stem();
66
- output.write(stemmer.getCurrent());
67
- output.write('\n');
68
- input.delete(0, input.length());
69
- } else {
70
- input.append(ch < 127 ? Character.toLowerCase(ch) : ch);
71
- }
72
- }
73
- output.flush();
70
+ char[] input = new char[8];
71
+ int length = 0;
72
+ int character;
73
+ while ((character = reader.read()) != -1) {
74
+ char ch = (char) character;
75
+ if (Character.isWhitespace(ch)) {
76
+ stemmer.setCurrent(input, length);
77
+ stemmer.stem();
78
+ output.write(stemmer.getCurrentBuffer(), 0, stemmer.getCurrentBufferLength());
79
+ output.write('\n');
80
+ length = 0;
81
+ } else {
82
+ if (length == input.length) {
83
+ input = Arrays.copyOf(input, length + 1);
84
+ }
85
+ input[length++] = ch < 127 ? Character.toLowerCase(ch) : ch;
86
+ }
87
+ }
88
+ output.close();
74
89
  }
75
90
  }
@@ -1,5 +1,18 @@
1
+ // @ts-check
2
+
1
3
  /**@constructor*/
2
- BaseStemmer = function() {
4
+ const BaseStemmer = function() {
5
+ /** @protected */
6
+ this.current = '';
7
+ this.cursor = 0;
8
+ this.limit = 0;
9
+ this.limit_backward = 0;
10
+ this.bra = 0;
11
+ this.ket = 0;
12
+
13
+ /**
14
+ * @param {string} value
15
+ */
3
16
  this.setCurrent = function(value) {
4
17
  this.current = value;
5
18
  this.cursor = 0;
@@ -9,11 +22,18 @@ BaseStemmer = function() {
9
22
  this.ket = this.limit;
10
23
  };
11
24
 
25
+ /**
26
+ * @return {string}
27
+ */
12
28
  this.getCurrent = function() {
13
29
  return this.current;
14
30
  };
15
31
 
32
+ /**
33
+ * @param {BaseStemmer} other
34
+ */
16
35
  this.copy_from = function(other) {
36
+ /** @protected */
17
37
  this.current = other.current;
18
38
  this.cursor = other.cursor;
19
39
  this.limit = other.limit;
@@ -22,7 +42,14 @@ BaseStemmer = function() {
22
42
  this.ket = other.ket;
23
43
  };
24
44
 
45
+ /**
46
+ * @param {number[]} s
47
+ * @param {number} min
48
+ * @param {number} max
49
+ * @return {boolean}
50
+ */
25
51
  this.in_grouping = function(s, min, max) {
52
+ /** @protected */
26
53
  if (this.cursor >= this.limit) return false;
27
54
  var ch = this.current.charCodeAt(this.cursor);
28
55
  if (ch > max || ch < min) return false;
@@ -32,7 +59,34 @@ BaseStemmer = function() {
32
59
  return true;
33
60
  };
34
61
 
62
+ /**
63
+ * @param {number[]} s
64
+ * @param {number} min
65
+ * @param {number} max
66
+ * @return {boolean}
67
+ */
68
+ this.go_in_grouping = function(s, min, max) {
69
+ /** @protected */
70
+ while (this.cursor < this.limit) {
71
+ var ch = this.current.charCodeAt(this.cursor);
72
+ if (ch > max || ch < min)
73
+ return true;
74
+ ch -= min;
75
+ if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) == 0)
76
+ return true;
77
+ this.cursor++;
78
+ }
79
+ return false;
80
+ };
81
+
82
+ /**
83
+ * @param {number[]} s
84
+ * @param {number} min
85
+ * @param {number} max
86
+ * @return {boolean}
87
+ */
35
88
  this.in_grouping_b = function(s, min, max) {
89
+ /** @protected */
36
90
  if (this.cursor <= this.limit_backward) return false;
37
91
  var ch = this.current.charCodeAt(this.cursor - 1);
38
92
  if (ch > max || ch < min) return false;
@@ -42,7 +96,32 @@ BaseStemmer = function() {
42
96
  return true;
43
97
  };
44
98
 
99
+ /**
100
+ * @param {number[]} s
101
+ * @param {number} min
102
+ * @param {number} max
103
+ * @return {boolean}
104
+ */
105
+ this.go_in_grouping_b = function(s, min, max) {
106
+ /** @protected */
107
+ while (this.cursor > this.limit_backward) {
108
+ var ch = this.current.charCodeAt(this.cursor - 1);
109
+ if (ch > max || ch < min) return true;
110
+ ch -= min;
111
+ if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) == 0) return true;
112
+ this.cursor--;
113
+ }
114
+ return false;
115
+ };
116
+
117
+ /**
118
+ * @param {number[]} s
119
+ * @param {number} min
120
+ * @param {number} max
121
+ * @return {boolean}
122
+ */
45
123
  this.out_grouping = function(s, min, max) {
124
+ /** @protected */
46
125
  if (this.cursor >= this.limit) return false;
47
126
  var ch = this.current.charCodeAt(this.cursor);
48
127
  if (ch > max || ch < min) {
@@ -57,7 +136,35 @@ BaseStemmer = function() {
57
136
  return false;
58
137
  };
59
138
 
139
+ /**
140
+ * @param {number[]} s
141
+ * @param {number} min
142
+ * @param {number} max
143
+ * @return {boolean}
144
+ */
145
+ this.go_out_grouping = function(s, min, max) {
146
+ /** @protected */
147
+ while (this.cursor < this.limit) {
148
+ var ch = this.current.charCodeAt(this.cursor);
149
+ if (ch <= max && ch >= min) {
150
+ ch -= min;
151
+ if ((s[ch >>> 3] & (0X1 << (ch & 0x7))) != 0) {
152
+ return true;
153
+ }
154
+ }
155
+ this.cursor++;
156
+ }
157
+ return false;
158
+ };
159
+
160
+ /**
161
+ * @param {number[]} s
162
+ * @param {number} min
163
+ * @param {number} max
164
+ * @return {boolean}
165
+ */
60
166
  this.out_grouping_b = function(s, min, max) {
167
+ /** @protected */
61
168
  if (this.cursor <= this.limit_backward) return false;
62
169
  var ch = this.current.charCodeAt(this.cursor - 1);
63
170
  if (ch > max || ch < min) {
@@ -72,8 +179,34 @@ BaseStemmer = function() {
72
179
  return false;
73
180
  };
74
181
 
182
+ /**
183
+ * @param {number[]} s
184
+ * @param {number} min
185
+ * @param {number} max
186
+ * @return {boolean}
187
+ */
188
+ this.go_out_grouping_b = function(s, min, max) {
189
+ /** @protected */
190
+ while (this.cursor > this.limit_backward) {
191
+ var ch = this.current.charCodeAt(this.cursor - 1);
192
+ if (ch <= max && ch >= min) {
193
+ ch -= min;
194
+ if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) != 0) {
195
+ return true;
196
+ }
197
+ }
198
+ this.cursor--;
199
+ }
200
+ return false;
201
+ };
202
+
203
+ /**
204
+ * @param {string} s
205
+ * @return {boolean}
206
+ */
75
207
  this.eq_s = function(s)
76
208
  {
209
+ /** @protected */
77
210
  if (this.limit - this.cursor < s.length) return false;
78
211
  if (this.current.slice(this.cursor, this.cursor + s.length) != s)
79
212
  {
@@ -83,8 +216,13 @@ BaseStemmer = function() {
83
216
  return true;
84
217
  };
85
218
 
219
+ /**
220
+ * @param {string} s
221
+ * @return {boolean}
222
+ */
86
223
  this.eq_s_b = function(s)
87
224
  {
225
+ /** @protected */
88
226
  if (this.cursor - this.limit_backward < s.length) return false;
89
227
  if (this.current.slice(this.cursor - s.length, this.cursor) != s)
90
228
  {
@@ -94,8 +232,13 @@ BaseStemmer = function() {
94
232
  return true;
95
233
  };
96
234
 
97
- /** @return {number} */ this.find_among = function(v)
235
+ /**
236
+ * @param {Among[]} v
237
+ * @return {number}
238
+ */
239
+ this.find_among = function(v)
98
240
  {
241
+ /** @protected */
99
242
  var i = 0;
100
243
  var j = v.length;
101
244
 
@@ -165,8 +308,13 @@ BaseStemmer = function() {
165
308
  };
166
309
 
167
310
  // find_among_b is for backwards processing. Same comments apply
311
+ /**
312
+ * @param {Among[]} v
313
+ * @return {number}
314
+ */
168
315
  this.find_among_b = function(v)
169
316
  {
317
+ /** @protected */
170
318
  var i = 0;
171
319
  var j = v.length
172
320
 
@@ -232,8 +380,15 @@ BaseStemmer = function() {
232
380
  /* to replace chars between c_bra and c_ket in this.current by the
233
381
  * chars in s.
234
382
  */
383
+ /**
384
+ * @param {number} c_bra
385
+ * @param {number} c_ket
386
+ * @param {string} s
387
+ * @return {number}
388
+ */
235
389
  this.replace_s = function(c_bra, c_ket, s)
236
390
  {
391
+ /** @protected */
237
392
  var adjustment = s.length - (c_ket - c_bra);
238
393
  this.current = this.current.slice(0, c_bra) + s + this.current.slice(c_ket);
239
394
  this.limit += adjustment;
@@ -242,8 +397,12 @@ BaseStemmer = function() {
242
397
  return adjustment;
243
398
  };
244
399
 
400
+ /**
401
+ * @return {boolean}
402
+ */
245
403
  this.slice_check = function()
246
404
  {
405
+ /** @protected */
247
406
  if (this.bra < 0 ||
248
407
  this.bra > this.ket ||
249
408
  this.ket > this.limit ||
@@ -254,8 +413,13 @@ BaseStemmer = function() {
254
413
  return true;
255
414
  };
256
415
 
416
+ /**
417
+ * @param {number} c_bra
418
+ * @return {boolean}
419
+ */
257
420
  this.slice_from = function(s)
258
421
  {
422
+ /** @protected */
259
423
  var result = false;
260
424
  if (this.slice_check())
261
425
  {
@@ -265,20 +429,34 @@ BaseStemmer = function() {
265
429
  return result;
266
430
  };
267
431
 
432
+ /**
433
+ * @return {boolean}
434
+ */
268
435
  this.slice_del = function()
269
436
  {
437
+ /** @protected */
270
438
  return this.slice_from("");
271
439
  };
272
440
 
441
+ /**
442
+ * @param {number} c_bra
443
+ * @param {number} c_ket
444
+ * @param {string} s
445
+ */
273
446
  this.insert = function(c_bra, c_ket, s)
274
447
  {
448
+ /** @protected */
275
449
  var adjustment = this.replace_s(c_bra, c_ket, s);
276
450
  if (c_bra <= this.bra) this.bra += adjustment;
277
451
  if (c_bra <= this.ket) this.ket += adjustment;
278
452
  };
279
453
 
454
+ /**
455
+ * @return {string}
456
+ */
280
457
  this.slice_to = function()
281
458
  {
459
+ /** @protected */
282
460
  var result = '';
283
461
  if (this.slice_check())
284
462
  {
@@ -287,8 +465,14 @@ BaseStemmer = function() {
287
465
  return result;
288
466
  };
289
467
 
468
+ /**
469
+ * @return {string}
470
+ */
290
471
  this.assign_to = function()
291
472
  {
473
+ /** @protected */
292
474
  return this.current.slice(0, this.limit);
293
475
  };
294
476
  };
477
+
478
+ if (typeof module === 'object' && module.exports) module.exports = BaseStemmer;
@@ -1,5 +1,3 @@
1
- const stemmer = require('base-stemmer.js');
2
-
3
1
  const fs = require('fs');
4
2
  const readline = require('readline');
5
3
 
@@ -81,7 +79,7 @@ else
81
79
  function stemming (lang, input, output, encoding) {
82
80
  const lines = readline.createInterface({
83
81
  input: fs.createReadStream(input, encoding),
84
- terminal: false
82
+ terminal: false
85
83
  });
86
84
  var out = fs.createWriteStream(output, encoding);
87
85
  var stemmer = create(lang);
@@ -93,10 +91,9 @@ function stemming (lang, input, output, encoding) {
93
91
  function create (name) {
94
92
  var lc_name = name.toLowerCase();
95
93
  if (!lc_name.match('\\W') && lc_name != 'base') {
96
- var algo = lc_name.substr(0, 1).toUpperCase() + lc_name.substr(1);
97
94
  try {
98
- const stemmer = require(lc_name + '-stemmer.js');
99
- return Function('return new ' + algo + 'Stemmer()')();
95
+ const Stemmer = require(lc_name + '-stemmer.js');
96
+ return new Stemmer();
100
97
  } catch (error) {
101
98
  }
102
99
  }
@@ -45,7 +45,7 @@ sb_stemmer_new(const char * algorithm, const char * charenc)
45
45
  if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break;
46
46
  }
47
47
  if (module->name == NULL) return NULL;
48
-
48
+
49
49
  stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
50
50
  if (stemmer == NULL) return NULL;
51
51
 
@@ -73,16 +73,16 @@ EOS
73
73
  my @algorithms = sort keys(%algorithms);
74
74
  print OUT "\nlibstemmer_algorithms =";
75
75
  foreach $lang (@algorithms) {
76
- print OUT "\\\n ", $lang;
76
+ print OUT "\\\n ", $lang;
77
77
  }
78
78
  print OUT "\n";
79
79
 
80
80
  for my $enc (qw(ISO_8859_1 ISO_8859_2 KOI8_R)) {
81
- print OUT "\n${enc}_algorithms =";
82
- foreach $lang (@algorithms) {
83
- print OUT "\\\n ", $lang if exists $algorithm_encs{$lang}->{$enc};
84
- }
85
- print OUT "\n";
81
+ print OUT "\n${enc}_algorithms =";
82
+ foreach $lang (@algorithms) {
83
+ print OUT "\\\n ", $lang if exists $algorithm_encs{$lang}->{$enc};
84
+ }
85
+ print OUT "\n";
86
86
  }
87
87
  }
88
88
 
@@ -35,7 +35,7 @@ sub addalgenc($$) {
35
35
  my $norm_enc = lc $enc;
36
36
  $norm_enc =~ s/_//g;
37
37
  if ($norm_enc ne $enc_only) {
38
- return;
38
+ return;
39
39
  }
40
40
  }
41
41
 
@@ -146,7 +146,7 @@ EOS
146
146
 
147
147
  struct stemmer_modules {
148
148
  const char * name;
149
- stemmer_encoding_t enc;
149
+ stemmer_encoding_t enc;
150
150
  struct SN_env * (*create)(void);
151
151
  void (*close)(struct SN_env *);
152
152
  int (*stem)(struct SN_env *);
@@ -14,8 +14,10 @@ armenian UTF_8 armenian,hy,hye,arm
14
14
  basque UTF_8,ISO_8859_1 basque,eu,eus,baq
15
15
  catalan UTF_8,ISO_8859_1 catalan,ca,cat
16
16
  danish UTF_8,ISO_8859_1 danish,da,dan
17
- dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld
17
+ dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld,kraaij_pohlmann
18
18
  english UTF_8,ISO_8859_1 english,en,eng
19
+ esperanto UTF_8 esperanto,eo,epo
20
+ estonian UTF_8 estonian,et,est
19
21
  finnish UTF_8,ISO_8859_1 finnish,fi,fin
20
22
  french UTF_8,ISO_8859_1 french,fr,fre,fra
21
23
  german UTF_8,ISO_8859_1 german,de,ger,deu
@@ -29,7 +31,7 @@ lithuanian UTF_8 lithuanian,lt,lit
29
31
  nepali UTF_8 nepali,ne,nep
30
32
  norwegian UTF_8,ISO_8859_1 norwegian,no,nor
31
33
  portuguese UTF_8,ISO_8859_1 portuguese,pt,por
32
- romanian UTF_8,ISO_8859_2 romanian,ro,rum,ron
34
+ romanian UTF_8 romanian,ro,rum,ron
33
35
  russian UTF_8,KOI8_R russian,ru,rus
34
36
  serbian UTF_8 serbian,sr,srp
35
37
  spanish UTF_8,ISO_8859_1 spanish,es,esl,spa
@@ -42,7 +44,14 @@ yiddish UTF_8 yiddish,yi,yid
42
44
  # The porter algorithm is included in the libstemmer distribution to assist
43
45
  # with backwards compatibility, but for new systems the english algorithm
44
46
  # should be used in preference.
45
- porter UTF_8,ISO_8859_1 porter english
47
+ porter UTF_8,ISO_8859_1 porter english
48
+
49
+ # This is Martin Porter's Dutch stemmer. It was the default Dutch stemming
50
+ # in Snowball 2.2.0 and earlier, but after user feedback and careful evaluation
51
+ # we concluded that the Kraaij-Pohlmann Dutch stemmer was a better default.
52
+ # We still provide this to help people who have a lot of existing data indexed
53
+ # using it.
54
+ dutch_porter UTF_8,ISO_8859_1 dutch_porter dutch
46
55
 
47
56
  # Some other stemmers in the snowball project are not included in the standard
48
57
  # distribution. To compile a libstemmer with them in, add them to this list,
@@ -51,13 +60,7 @@ porter UTF_8,ISO_8859_1 porter english
51
60
  # intended for general use, and use of them is is not fully supported. These
52
61
  # algorithms are:
53
62
  #
54
- # german2 - This is a slight modification of the german stemmer.
55
- #german2 UTF_8,ISO_8859_1 german2 german
56
- #
57
- # kraaij_pohlmann - This is a different dutch stemmer.
58
- #kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann dutch
59
- #
60
63
  # lovins - This is an english stemmer, but fairly outdated, and
61
64
  # only really applicable to a restricted type of input text
62
65
  # (keywords in academic publications).
63
- #lovins UTF_8,ISO_8859_1 lovins english
66
+ #lovins UTF_8,ISO_8859_1 lovins english
@@ -13,7 +13,7 @@ int main () {
13
13
  struct sb_stemmer * s;
14
14
  const char ** list = sb_stemmer_list();
15
15
  if (*list == 0) error("TEST FAIL: empty list of stemmers");
16
-
16
+
17
17
  s = sb_stemmer_new("e");
18
18
  if (s != 0) error("TEST FAIL: non zero return for unrecognised language");
19
19
  s = sb_stemmer_new("english");