mittens 0.1.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/LICENSE.txt +1 -1
  4. data/README.md +4 -4
  5. data/lib/mittens/version.rb +1 -1
  6. data/mittens.gemspec +1 -1
  7. data/vendor/snowball/.github/workflows/ci.yml +216 -0
  8. data/vendor/snowball/CONTRIBUTING.rst +111 -62
  9. data/vendor/snowball/GNUmakefile +194 -136
  10. data/vendor/snowball/NEWS +798 -3
  11. data/vendor/snowball/README.rst +50 -1
  12. data/vendor/snowball/ada/src/stemmer.adb +25 -13
  13. data/vendor/snowball/ada/src/stemmer.ads +9 -9
  14. data/vendor/snowball/ada/stemmer_config.gpr +7 -7
  15. data/vendor/snowball/algorithms/basque.sbl +4 -19
  16. data/vendor/snowball/algorithms/catalan.sbl +2 -9
  17. data/vendor/snowball/algorithms/danish.sbl +1 -1
  18. data/vendor/snowball/algorithms/dutch.sbl +284 -122
  19. data/vendor/snowball/algorithms/dutch_porter.sbl +178 -0
  20. data/vendor/snowball/algorithms/english.sbl +52 -37
  21. data/vendor/snowball/algorithms/esperanto.sbl +157 -0
  22. data/vendor/snowball/algorithms/estonian.sbl +269 -0
  23. data/vendor/snowball/algorithms/finnish.sbl +2 -3
  24. data/vendor/snowball/algorithms/french.sbl +42 -16
  25. data/vendor/snowball/algorithms/german.sbl +35 -14
  26. data/vendor/snowball/algorithms/greek.sbl +76 -76
  27. data/vendor/snowball/algorithms/hungarian.sbl +8 -6
  28. data/vendor/snowball/algorithms/indonesian.sbl +14 -8
  29. data/vendor/snowball/algorithms/italian.sbl +11 -21
  30. data/vendor/snowball/algorithms/lithuanian.sbl +36 -37
  31. data/vendor/snowball/algorithms/lovins.sbl +0 -1
  32. data/vendor/snowball/algorithms/nepali.sbl +138 -37
  33. data/vendor/snowball/algorithms/norwegian.sbl +19 -5
  34. data/vendor/snowball/algorithms/porter.sbl +2 -2
  35. data/vendor/snowball/algorithms/portuguese.sbl +9 -13
  36. data/vendor/snowball/algorithms/romanian.sbl +17 -4
  37. data/vendor/snowball/algorithms/serbian.sbl +467 -468
  38. data/vendor/snowball/algorithms/spanish.sbl +5 -7
  39. data/vendor/snowball/algorithms/swedish.sbl +60 -6
  40. data/vendor/snowball/algorithms/tamil.sbl +207 -176
  41. data/vendor/snowball/algorithms/turkish.sbl +461 -445
  42. data/vendor/snowball/algorithms/yiddish.sbl +36 -38
  43. data/vendor/snowball/compiler/analyser.c +445 -192
  44. data/vendor/snowball/compiler/driver.c +109 -101
  45. data/vendor/snowball/compiler/generator.c +853 -464
  46. data/vendor/snowball/compiler/generator_ada.c +404 -366
  47. data/vendor/snowball/compiler/generator_csharp.c +297 -260
  48. data/vendor/snowball/compiler/generator_go.c +323 -254
  49. data/vendor/snowball/compiler/generator_java.c +326 -252
  50. data/vendor/snowball/compiler/generator_js.c +362 -252
  51. data/vendor/snowball/compiler/generator_pascal.c +349 -197
  52. data/vendor/snowball/compiler/generator_python.c +257 -240
  53. data/vendor/snowball/compiler/generator_rust.c +423 -251
  54. data/vendor/snowball/compiler/header.h +117 -71
  55. data/vendor/snowball/compiler/space.c +137 -68
  56. data/vendor/snowball/compiler/syswords.h +2 -2
  57. data/vendor/snowball/compiler/tokeniser.c +125 -107
  58. data/vendor/snowball/csharp/Snowball/Among.cs +14 -14
  59. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +7 -7
  60. data/vendor/snowball/csharp/Snowball/Stemmer.cs +57 -37
  61. data/vendor/snowball/csharp/Stemwords/App.config +2 -2
  62. data/vendor/snowball/csharp/Stemwords/Program.cs +16 -12
  63. data/vendor/snowball/doc/libstemmer_c_README +7 -4
  64. data/vendor/snowball/doc/libstemmer_csharp_README +4 -1
  65. data/vendor/snowball/doc/libstemmer_java_README +12 -1
  66. data/vendor/snowball/doc/libstemmer_js_README +6 -4
  67. data/vendor/snowball/doc/libstemmer_python_README +9 -4
  68. data/vendor/snowball/examples/stemwords.c +12 -12
  69. data/vendor/snowball/go/env.go +107 -31
  70. data/vendor/snowball/go/util.go +0 -4
  71. data/vendor/snowball/include/libstemmer.h +4 -0
  72. data/vendor/snowball/java/org/tartarus/snowball/Among.java +32 -15
  73. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +347 -261
  74. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +3 -0
  75. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +52 -37
  76. data/vendor/snowball/javascript/base-stemmer.js +186 -2
  77. data/vendor/snowball/javascript/stemwords.js +3 -6
  78. data/vendor/snowball/libstemmer/libstemmer_c.in +1 -1
  79. data/vendor/snowball/libstemmer/mkalgorithms.pl +6 -6
  80. data/vendor/snowball/libstemmer/mkmodules.pl +2 -2
  81. data/vendor/snowball/libstemmer/modules.txt +13 -10
  82. data/vendor/snowball/libstemmer/test.c +1 -1
  83. data/vendor/snowball/pascal/SnowballProgram.pas +84 -2
  84. data/vendor/snowball/pascal/generate.pl +13 -13
  85. data/vendor/snowball/python/create_init.py +4 -1
  86. data/vendor/snowball/python/setup.cfg +0 -3
  87. data/vendor/snowball/python/setup.py +8 -3
  88. data/vendor/snowball/python/snowballstemmer/basestemmer.py +20 -54
  89. data/vendor/snowball/python/stemwords.py +8 -12
  90. data/vendor/snowball/runtime/api.c +10 -5
  91. data/vendor/snowball/runtime/header.h +10 -9
  92. data/vendor/snowball/runtime/utilities.c +9 -9
  93. data/vendor/snowball/rust/build.rs +1 -1
  94. data/vendor/snowball/rust/src/snowball/snowball_env.rs +83 -5
  95. data/vendor/snowball/tests/stemtest.c +7 -4
  96. metadata +8 -12
  97. data/vendor/snowball/.travis.yml +0 -112
  98. data/vendor/snowball/algorithms/german2.sbl +0 -145
  99. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +0 -240
  100. data/vendor/snowball/compiler/syswords2.h +0 -13
@@ -43,14 +43,13 @@ func (env *Env) SetCurrent(s string) {
43
43
 
44
44
  func (env *Env) ReplaceS(bra, ket int, s string) int32 {
45
45
  adjustment := int32(len(s)) - (int32(ket) - int32(bra))
46
- result, _ := splitAt(env.current, bra)
46
+ result := env.current[:bra]
47
+ result += s
47
48
  rsplit := ket
48
49
  if ket < bra {
49
50
  rsplit = bra
50
51
  }
51
- _, rhs := splitAt(env.current, rsplit)
52
- result += s
53
- result += rhs
52
+ result += env.current[rsplit:]
54
53
 
55
54
  newLim := int32(env.Limit) + adjustment
56
55
  env.Limit = int(newLim)
@@ -159,16 +158,34 @@ func (env *Env) InGrouping(chars []byte, min, max int32) bool {
159
158
  }
160
159
 
161
160
  r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
162
- if r != utf8.RuneError {
163
- if r > max || r < min {
161
+ if r == utf8.RuneError {
162
+ return false
163
+ }
164
+ if r > max || r < min {
165
+ return false
166
+ }
167
+ r -= min
168
+ if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
169
+ return false
170
+ }
171
+ env.NextChar()
172
+ return true
173
+ }
174
+
175
+ func (env *Env) GoInGrouping(chars []byte, min, max int32) bool {
176
+ for env.Cursor < env.Limit {
177
+ r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
178
+ if r == utf8.RuneError {
164
179
  return false
165
180
  }
181
+ if r > max || r < min {
182
+ return true
183
+ }
166
184
  r -= min
167
185
  if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
168
- return false
186
+ return true
169
187
  }
170
188
  env.NextChar()
171
- return true
172
189
  }
173
190
  return false
174
191
  }
@@ -177,19 +194,41 @@ func (env *Env) InGroupingB(chars []byte, min, max int32) bool {
177
194
  if env.Cursor <= env.LimitBackward {
178
195
  return false
179
196
  }
197
+ c := env.Cursor
180
198
  env.PrevChar()
181
199
  r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
182
- if r != utf8.RuneError {
183
- env.NextChar()
184
- if r > max || r < min {
200
+ if r == utf8.RuneError {
201
+ return false
202
+ }
203
+ if r > max || r < min {
204
+ env.Cursor = c
205
+ return false
206
+ }
207
+ r -= min
208
+ if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
209
+ env.Cursor = c
210
+ return false
211
+ }
212
+ return true
213
+ }
214
+
215
+ func (env *Env) GoInGroupingB(chars []byte, min, max int32) bool {
216
+ for env.Cursor > env.LimitBackward {
217
+ c := env.Cursor
218
+ env.PrevChar()
219
+ r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
220
+ if r == utf8.RuneError {
185
221
  return false
186
222
  }
223
+ if r > max || r < min {
224
+ env.Cursor = c
225
+ return true
226
+ }
187
227
  r -= min
188
228
  if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
189
- return false
229
+ env.Cursor = c
230
+ return true
190
231
  }
191
- env.PrevChar()
192
- return true
193
232
  }
194
233
  return false
195
234
  }
@@ -199,16 +238,34 @@ func (env *Env) OutGrouping(chars []byte, min, max int32) bool {
199
238
  return false
200
239
  }
201
240
  r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
202
- if r != utf8.RuneError {
203
- if r > max || r < min {
204
- env.NextChar()
205
- return true
241
+ if r == utf8.RuneError {
242
+ return false
243
+ }
244
+ if r > max || r < min {
245
+ env.NextChar()
246
+ return true
247
+ }
248
+ r -= min
249
+ if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
250
+ env.NextChar()
251
+ return true
252
+ }
253
+ return false
254
+ }
255
+
256
+ func (env *Env) GoOutGrouping(chars []byte, min, max int32) bool {
257
+ for env.Cursor < env.Limit {
258
+ r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
259
+ if r == utf8.RuneError {
260
+ return false
206
261
  }
207
- r -= min
208
- if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
209
- env.NextChar()
210
- return true
262
+ if r <= max && r >= min {
263
+ r -= min
264
+ if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) != 0 {
265
+ return true
266
+ }
211
267
  }
268
+ env.NextChar()
212
269
  }
213
270
  return false
214
271
  }
@@ -217,18 +274,37 @@ func (env *Env) OutGroupingB(chars []byte, min, max int32) bool {
217
274
  if env.Cursor <= env.LimitBackward {
218
275
  return false
219
276
  }
277
+ c := env.Cursor
220
278
  env.PrevChar()
221
279
  r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
222
- if r != utf8.RuneError {
223
- env.NextChar()
224
- if r > max || r < min {
225
- env.PrevChar()
226
- return true
280
+ if r == utf8.RuneError {
281
+ return false
282
+ }
283
+ if r > max || r < min {
284
+ return true
285
+ }
286
+ r -= min
287
+ if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
288
+ return true
289
+ }
290
+ env.Cursor = c
291
+ return false
292
+ }
293
+
294
+ func (env *Env) GoOutGroupingB(chars []byte, min, max int32) bool {
295
+ for env.Cursor > env.LimitBackward {
296
+ c := env.Cursor
297
+ env.PrevChar()
298
+ r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
299
+ if r == utf8.RuneError {
300
+ return false
227
301
  }
228
- r -= min
229
- if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
230
- env.PrevChar()
231
- return true
302
+ if r <= max && r >= min {
303
+ r -= min
304
+ if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) != 0 {
305
+ env.Cursor = c
306
+ return true
307
+ }
232
308
  }
233
309
  }
234
310
  return false
@@ -8,10 +8,6 @@ import (
8
8
  const MaxInt = math.MaxInt32
9
9
  const MinInt = math.MinInt32
10
10
 
11
- func splitAt(str string, mid int) (string, string) {
12
- return str[:mid], str[mid:]
13
- }
14
-
15
11
  func min(a, b int) int {
16
12
  if a < b {
17
13
  return a
@@ -56,6 +56,10 @@ struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc)
56
56
  void sb_stemmer_delete(struct sb_stemmer * stemmer);
57
57
 
58
58
  /** Stem a word.
59
+ *
60
+ * The stemming algorithms generally expect the input text to use composed
61
+ * accents (Unicode NFC or NFKC) and to have been folded to lower case
62
+ * already.
59
63
  *
60
64
  * The return value is owned by the stemmer - it must not be freed or
61
65
  * modified, and it will become invalid when the stemmer is called again,
@@ -1,29 +1,46 @@
1
1
  package org.tartarus.snowball;
2
2
 
3
- import java.lang.reflect.Method;
3
+ import java.lang.invoke.MethodHandle;
4
+ import java.lang.invoke.MethodHandles;
5
+ import java.lang.invoke.MethodType;
6
+ import java.util.Locale;
4
7
 
8
+ /**
9
+ * Internal class used by Snowball stemmers
10
+ */
5
11
  public class Among {
6
12
  public Among (String s, int substring_i, int result) {
7
13
  this.s = s.toCharArray();
8
14
  this.substring_i = substring_i;
9
- this.result = result;
10
- this.method = null;
15
+ this.result = result;
16
+ this.method = null;
11
17
  }
12
18
 
13
19
  public Among (String s, int substring_i, int result, String methodname,
14
- Class<? extends SnowballProgram> programclass) {
20
+ MethodHandles.Lookup methodobject) {
15
21
  this.s = s.toCharArray();
16
22
  this.substring_i = substring_i;
17
- this.result = result;
18
- try {
19
- this.method = programclass.getDeclaredMethod(methodname);
20
- } catch (NoSuchMethodException e) {
21
- throw new RuntimeException(e);
22
- }
23
+ this.result = result;
24
+ final Class<? extends SnowballProgram> clazz = methodobject.lookupClass().asSubclass(SnowballProgram.class);
25
+ if (methodname.length() > 0) {
26
+ try {
27
+ this.method = methodobject.findVirtual(clazz, methodname, MethodType.methodType(boolean.class))
28
+ .asType(MethodType.methodType(boolean.class, SnowballProgram.class));
29
+ } catch (NoSuchMethodException | IllegalAccessException e) {
30
+ throw new RuntimeException(String.format(Locale.ENGLISH,
31
+ "Snowball program '%s' is broken, cannot access method: boolean %s()",
32
+ clazz.getSimpleName(), methodname
33
+ ), e);
34
+ }
35
+ } else {
36
+ this.method = null;
37
+ }
23
38
  }
24
39
 
25
- public final char[] s; /* search string */
26
- public final int substring_i; /* index to longest matching substring */
27
- public final int result; /* result of the lookup */
28
- public final Method method; /* method to use if substring matches */
29
- };
40
+ final char[] s; /* search string */
41
+ final int substring_i; /* index to longest matching substring */
42
+ final int result; /* result of the lookup */
43
+
44
+ // Make sure this is not accessible outside package for Java security reasons!
45
+ final MethodHandle method; /* method to use if substring matches */
46
+ }