mittens 0.1.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/LICENSE.txt +1 -1
- data/README.md +4 -4
- data/lib/mittens/version.rb +1 -1
- data/mittens.gemspec +1 -1
- data/vendor/snowball/.github/workflows/ci.yml +216 -0
- data/vendor/snowball/CONTRIBUTING.rst +111 -62
- data/vendor/snowball/GNUmakefile +194 -136
- data/vendor/snowball/NEWS +798 -3
- data/vendor/snowball/README.rst +50 -1
- data/vendor/snowball/ada/src/stemmer.adb +25 -13
- data/vendor/snowball/ada/src/stemmer.ads +9 -9
- data/vendor/snowball/ada/stemmer_config.gpr +7 -7
- data/vendor/snowball/algorithms/basque.sbl +4 -19
- data/vendor/snowball/algorithms/catalan.sbl +2 -9
- data/vendor/snowball/algorithms/danish.sbl +1 -1
- data/vendor/snowball/algorithms/dutch.sbl +284 -122
- data/vendor/snowball/algorithms/dutch_porter.sbl +178 -0
- data/vendor/snowball/algorithms/english.sbl +52 -37
- data/vendor/snowball/algorithms/esperanto.sbl +157 -0
- data/vendor/snowball/algorithms/estonian.sbl +269 -0
- data/vendor/snowball/algorithms/finnish.sbl +2 -3
- data/vendor/snowball/algorithms/french.sbl +42 -16
- data/vendor/snowball/algorithms/german.sbl +35 -14
- data/vendor/snowball/algorithms/greek.sbl +76 -76
- data/vendor/snowball/algorithms/hungarian.sbl +8 -6
- data/vendor/snowball/algorithms/indonesian.sbl +14 -8
- data/vendor/snowball/algorithms/italian.sbl +11 -21
- data/vendor/snowball/algorithms/lithuanian.sbl +36 -37
- data/vendor/snowball/algorithms/lovins.sbl +0 -1
- data/vendor/snowball/algorithms/nepali.sbl +138 -37
- data/vendor/snowball/algorithms/norwegian.sbl +19 -5
- data/vendor/snowball/algorithms/porter.sbl +2 -2
- data/vendor/snowball/algorithms/portuguese.sbl +9 -13
- data/vendor/snowball/algorithms/romanian.sbl +17 -4
- data/vendor/snowball/algorithms/serbian.sbl +467 -468
- data/vendor/snowball/algorithms/spanish.sbl +5 -7
- data/vendor/snowball/algorithms/swedish.sbl +60 -6
- data/vendor/snowball/algorithms/tamil.sbl +207 -176
- data/vendor/snowball/algorithms/turkish.sbl +461 -445
- data/vendor/snowball/algorithms/yiddish.sbl +36 -38
- data/vendor/snowball/compiler/analyser.c +445 -192
- data/vendor/snowball/compiler/driver.c +109 -101
- data/vendor/snowball/compiler/generator.c +853 -464
- data/vendor/snowball/compiler/generator_ada.c +404 -366
- data/vendor/snowball/compiler/generator_csharp.c +297 -260
- data/vendor/snowball/compiler/generator_go.c +323 -254
- data/vendor/snowball/compiler/generator_java.c +326 -252
- data/vendor/snowball/compiler/generator_js.c +362 -252
- data/vendor/snowball/compiler/generator_pascal.c +349 -197
- data/vendor/snowball/compiler/generator_python.c +257 -240
- data/vendor/snowball/compiler/generator_rust.c +423 -251
- data/vendor/snowball/compiler/header.h +117 -71
- data/vendor/snowball/compiler/space.c +137 -68
- data/vendor/snowball/compiler/syswords.h +2 -2
- data/vendor/snowball/compiler/tokeniser.c +125 -107
- data/vendor/snowball/csharp/Snowball/Among.cs +14 -14
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +7 -7
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +57 -37
- data/vendor/snowball/csharp/Stemwords/App.config +2 -2
- data/vendor/snowball/csharp/Stemwords/Program.cs +16 -12
- data/vendor/snowball/doc/libstemmer_c_README +7 -4
- data/vendor/snowball/doc/libstemmer_csharp_README +4 -1
- data/vendor/snowball/doc/libstemmer_java_README +12 -1
- data/vendor/snowball/doc/libstemmer_js_README +6 -4
- data/vendor/snowball/doc/libstemmer_python_README +9 -4
- data/vendor/snowball/examples/stemwords.c +12 -12
- data/vendor/snowball/go/env.go +107 -31
- data/vendor/snowball/go/util.go +0 -4
- data/vendor/snowball/include/libstemmer.h +4 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +32 -15
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +347 -261
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +3 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +52 -37
- data/vendor/snowball/javascript/base-stemmer.js +186 -2
- data/vendor/snowball/javascript/stemwords.js +3 -6
- data/vendor/snowball/libstemmer/libstemmer_c.in +1 -1
- data/vendor/snowball/libstemmer/mkalgorithms.pl +6 -6
- data/vendor/snowball/libstemmer/mkmodules.pl +2 -2
- data/vendor/snowball/libstemmer/modules.txt +13 -10
- data/vendor/snowball/libstemmer/test.c +1 -1
- data/vendor/snowball/pascal/SnowballProgram.pas +84 -2
- data/vendor/snowball/pascal/generate.pl +13 -13
- data/vendor/snowball/python/create_init.py +4 -1
- data/vendor/snowball/python/setup.cfg +0 -3
- data/vendor/snowball/python/setup.py +8 -3
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +20 -54
- data/vendor/snowball/python/stemwords.py +8 -12
- data/vendor/snowball/runtime/api.c +10 -5
- data/vendor/snowball/runtime/header.h +10 -9
- data/vendor/snowball/runtime/utilities.c +9 -9
- data/vendor/snowball/rust/build.rs +1 -1
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +83 -5
- data/vendor/snowball/tests/stemtest.c +7 -4
- metadata +8 -12
- data/vendor/snowball/.travis.yml +0 -112
- data/vendor/snowball/algorithms/german2.sbl +0 -145
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +0 -240
- data/vendor/snowball/compiler/syswords2.h +0 -13
data/vendor/snowball/go/env.go
CHANGED
@@ -43,14 +43,13 @@ func (env *Env) SetCurrent(s string) {
|
|
43
43
|
|
44
44
|
func (env *Env) ReplaceS(bra, ket int, s string) int32 {
|
45
45
|
adjustment := int32(len(s)) - (int32(ket) - int32(bra))
|
46
|
-
result
|
46
|
+
result := env.current[:bra]
|
47
|
+
result += s
|
47
48
|
rsplit := ket
|
48
49
|
if ket < bra {
|
49
50
|
rsplit = bra
|
50
51
|
}
|
51
|
-
|
52
|
-
result += s
|
53
|
-
result += rhs
|
52
|
+
result += env.current[rsplit:]
|
54
53
|
|
55
54
|
newLim := int32(env.Limit) + adjustment
|
56
55
|
env.Limit = int(newLim)
|
@@ -159,16 +158,34 @@ func (env *Env) InGrouping(chars []byte, min, max int32) bool {
|
|
159
158
|
}
|
160
159
|
|
161
160
|
r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
|
162
|
-
if r
|
163
|
-
|
161
|
+
if r == utf8.RuneError {
|
162
|
+
return false
|
163
|
+
}
|
164
|
+
if r > max || r < min {
|
165
|
+
return false
|
166
|
+
}
|
167
|
+
r -= min
|
168
|
+
if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
|
169
|
+
return false
|
170
|
+
}
|
171
|
+
env.NextChar()
|
172
|
+
return true
|
173
|
+
}
|
174
|
+
|
175
|
+
func (env *Env) GoInGrouping(chars []byte, min, max int32) bool {
|
176
|
+
for env.Cursor < env.Limit {
|
177
|
+
r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
|
178
|
+
if r == utf8.RuneError {
|
164
179
|
return false
|
165
180
|
}
|
181
|
+
if r > max || r < min {
|
182
|
+
return true
|
183
|
+
}
|
166
184
|
r -= min
|
167
185
|
if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
|
168
|
-
return
|
186
|
+
return true
|
169
187
|
}
|
170
188
|
env.NextChar()
|
171
|
-
return true
|
172
189
|
}
|
173
190
|
return false
|
174
191
|
}
|
@@ -177,19 +194,41 @@ func (env *Env) InGroupingB(chars []byte, min, max int32) bool {
|
|
177
194
|
if env.Cursor <= env.LimitBackward {
|
178
195
|
return false
|
179
196
|
}
|
197
|
+
c := env.Cursor
|
180
198
|
env.PrevChar()
|
181
199
|
r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
|
182
|
-
if r
|
183
|
-
|
184
|
-
|
200
|
+
if r == utf8.RuneError {
|
201
|
+
return false
|
202
|
+
}
|
203
|
+
if r > max || r < min {
|
204
|
+
env.Cursor = c
|
205
|
+
return false
|
206
|
+
}
|
207
|
+
r -= min
|
208
|
+
if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
|
209
|
+
env.Cursor = c
|
210
|
+
return false
|
211
|
+
}
|
212
|
+
return true
|
213
|
+
}
|
214
|
+
|
215
|
+
func (env *Env) GoInGroupingB(chars []byte, min, max int32) bool {
|
216
|
+
for env.Cursor > env.LimitBackward {
|
217
|
+
c := env.Cursor
|
218
|
+
env.PrevChar()
|
219
|
+
r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
|
220
|
+
if r == utf8.RuneError {
|
185
221
|
return false
|
186
222
|
}
|
223
|
+
if r > max || r < min {
|
224
|
+
env.Cursor = c
|
225
|
+
return true
|
226
|
+
}
|
187
227
|
r -= min
|
188
228
|
if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
|
189
|
-
|
229
|
+
env.Cursor = c
|
230
|
+
return true
|
190
231
|
}
|
191
|
-
env.PrevChar()
|
192
|
-
return true
|
193
232
|
}
|
194
233
|
return false
|
195
234
|
}
|
@@ -199,16 +238,34 @@ func (env *Env) OutGrouping(chars []byte, min, max int32) bool {
|
|
199
238
|
return false
|
200
239
|
}
|
201
240
|
r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
|
202
|
-
if r
|
203
|
-
|
204
|
-
|
205
|
-
|
241
|
+
if r == utf8.RuneError {
|
242
|
+
return false
|
243
|
+
}
|
244
|
+
if r > max || r < min {
|
245
|
+
env.NextChar()
|
246
|
+
return true
|
247
|
+
}
|
248
|
+
r -= min
|
249
|
+
if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
|
250
|
+
env.NextChar()
|
251
|
+
return true
|
252
|
+
}
|
253
|
+
return false
|
254
|
+
}
|
255
|
+
|
256
|
+
func (env *Env) GoOutGrouping(chars []byte, min, max int32) bool {
|
257
|
+
for env.Cursor < env.Limit {
|
258
|
+
r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
|
259
|
+
if r == utf8.RuneError {
|
260
|
+
return false
|
206
261
|
}
|
207
|
-
r
|
208
|
-
|
209
|
-
|
210
|
-
|
262
|
+
if r <= max && r >= min {
|
263
|
+
r -= min
|
264
|
+
if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) != 0 {
|
265
|
+
return true
|
266
|
+
}
|
211
267
|
}
|
268
|
+
env.NextChar()
|
212
269
|
}
|
213
270
|
return false
|
214
271
|
}
|
@@ -217,18 +274,37 @@ func (env *Env) OutGroupingB(chars []byte, min, max int32) bool {
|
|
217
274
|
if env.Cursor <= env.LimitBackward {
|
218
275
|
return false
|
219
276
|
}
|
277
|
+
c := env.Cursor
|
220
278
|
env.PrevChar()
|
221
279
|
r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
|
222
|
-
if r
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
280
|
+
if r == utf8.RuneError {
|
281
|
+
return false
|
282
|
+
}
|
283
|
+
if r > max || r < min {
|
284
|
+
return true
|
285
|
+
}
|
286
|
+
r -= min
|
287
|
+
if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
|
288
|
+
return true
|
289
|
+
}
|
290
|
+
env.Cursor = c
|
291
|
+
return false
|
292
|
+
}
|
293
|
+
|
294
|
+
func (env *Env) GoOutGroupingB(chars []byte, min, max int32) bool {
|
295
|
+
for env.Cursor > env.LimitBackward {
|
296
|
+
c := env.Cursor
|
297
|
+
env.PrevChar()
|
298
|
+
r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
|
299
|
+
if r == utf8.RuneError {
|
300
|
+
return false
|
227
301
|
}
|
228
|
-
r
|
229
|
-
|
230
|
-
|
231
|
-
|
302
|
+
if r <= max && r >= min {
|
303
|
+
r -= min
|
304
|
+
if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) != 0 {
|
305
|
+
env.Cursor = c
|
306
|
+
return true
|
307
|
+
}
|
232
308
|
}
|
233
309
|
}
|
234
310
|
return false
|
data/vendor/snowball/go/util.go
CHANGED
data/vendor/snowball/include/libstemmer.h
CHANGED
@@ -56,6 +56,10 @@ struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc)
|
|
56
56
|
void sb_stemmer_delete(struct sb_stemmer * stemmer);
|
57
57
|
|
58
58
|
/** Stem a word.
|
59
|
+
*
|
60
|
+
* The stemming algorithms generally expect the input text to use composed
|
61
|
+
* accents (Unicode NFC or NFKC) and to have been folded to lower case
|
62
|
+
* already.
|
59
63
|
*
|
60
64
|
* The return value is owned by the stemmer - it must not be freed or
|
61
65
|
* modified, and it will become invalid when the stemmer is called again,
|
data/vendor/snowball/java/org/tartarus/snowball/Among.java
CHANGED
@@ -1,29 +1,46 @@
|
|
1
1
|
package org.tartarus.snowball;
|
2
2
|
|
3
|
-
import java.lang.
|
3
|
+
import java.lang.invoke.MethodHandle;
|
4
|
+
import java.lang.invoke.MethodHandles;
|
5
|
+
import java.lang.invoke.MethodType;
|
6
|
+
import java.util.Locale;
|
4
7
|
|
8
|
+
/**
|
9
|
+
* Internal class used by Snowball stemmers
|
10
|
+
*/
|
5
11
|
public class Among {
|
6
12
|
public Among (String s, int substring_i, int result) {
|
7
13
|
this.s = s.toCharArray();
|
8
14
|
this.substring_i = substring_i;
|
9
|
-
|
10
|
-
|
15
|
+
this.result = result;
|
16
|
+
this.method = null;
|
11
17
|
}
|
12
18
|
|
13
19
|
public Among (String s, int substring_i, int result, String methodname,
|
14
|
-
|
20
|
+
MethodHandles.Lookup methodobject) {
|
15
21
|
this.s = s.toCharArray();
|
16
22
|
this.substring_i = substring_i;
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
+
this.result = result;
|
24
|
+
final Class<? extends SnowballProgram> clazz = methodobject.lookupClass().asSubclass(SnowballProgram.class);
|
25
|
+
if (methodname.length() > 0) {
|
26
|
+
try {
|
27
|
+
this.method = methodobject.findVirtual(clazz, methodname, MethodType.methodType(boolean.class))
|
28
|
+
.asType(MethodType.methodType(boolean.class, SnowballProgram.class));
|
29
|
+
} catch (NoSuchMethodException | IllegalAccessException e) {
|
30
|
+
throw new RuntimeException(String.format(Locale.ENGLISH,
|
31
|
+
"Snowball program '%s' is broken, cannot access method: boolean %s()",
|
32
|
+
clazz.getSimpleName(), methodname
|
33
|
+
), e);
|
34
|
+
}
|
35
|
+
} else {
|
36
|
+
this.method = null;
|
37
|
+
}
|
23
38
|
}
|
24
39
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
40
|
+
final char[] s; /* search string */
|
41
|
+
final int substring_i; /* index to longest matching substring */
|
42
|
+
final int result; /* result of the lookup */
|
43
|
+
|
44
|
+
// Make sure this is not accessible outside package for Java security reasons!
|
45
|
+
final MethodHandle method; /* method to use if substring matches */
|
46
|
+
}
|