mittens 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +3 -3
- data/lib/mittens/version.rb +1 -1
- data/vendor/snowball/.github/workflows/ci.yml +216 -0
- data/vendor/snowball/CONTRIBUTING.rst +111 -62
- data/vendor/snowball/GNUmakefile +194 -136
- data/vendor/snowball/NEWS +798 -3
- data/vendor/snowball/README.rst +50 -1
- data/vendor/snowball/ada/src/stemmer.adb +25 -13
- data/vendor/snowball/ada/src/stemmer.ads +9 -9
- data/vendor/snowball/ada/stemmer_config.gpr +7 -7
- data/vendor/snowball/algorithms/basque.sbl +4 -19
- data/vendor/snowball/algorithms/catalan.sbl +2 -9
- data/vendor/snowball/algorithms/danish.sbl +1 -1
- data/vendor/snowball/algorithms/dutch.sbl +284 -122
- data/vendor/snowball/algorithms/dutch_porter.sbl +178 -0
- data/vendor/snowball/algorithms/english.sbl +52 -37
- data/vendor/snowball/algorithms/esperanto.sbl +157 -0
- data/vendor/snowball/algorithms/estonian.sbl +269 -0
- data/vendor/snowball/algorithms/finnish.sbl +2 -3
- data/vendor/snowball/algorithms/french.sbl +42 -16
- data/vendor/snowball/algorithms/german.sbl +35 -14
- data/vendor/snowball/algorithms/greek.sbl +76 -76
- data/vendor/snowball/algorithms/hungarian.sbl +8 -6
- data/vendor/snowball/algorithms/indonesian.sbl +14 -8
- data/vendor/snowball/algorithms/italian.sbl +11 -21
- data/vendor/snowball/algorithms/lithuanian.sbl +36 -37
- data/vendor/snowball/algorithms/lovins.sbl +0 -1
- data/vendor/snowball/algorithms/nepali.sbl +138 -37
- data/vendor/snowball/algorithms/norwegian.sbl +19 -5
- data/vendor/snowball/algorithms/porter.sbl +2 -2
- data/vendor/snowball/algorithms/portuguese.sbl +9 -13
- data/vendor/snowball/algorithms/romanian.sbl +17 -4
- data/vendor/snowball/algorithms/serbian.sbl +467 -468
- data/vendor/snowball/algorithms/spanish.sbl +5 -7
- data/vendor/snowball/algorithms/swedish.sbl +60 -6
- data/vendor/snowball/algorithms/tamil.sbl +207 -176
- data/vendor/snowball/algorithms/turkish.sbl +461 -445
- data/vendor/snowball/algorithms/yiddish.sbl +36 -38
- data/vendor/snowball/compiler/analyser.c +445 -192
- data/vendor/snowball/compiler/driver.c +109 -101
- data/vendor/snowball/compiler/generator.c +853 -464
- data/vendor/snowball/compiler/generator_ada.c +404 -366
- data/vendor/snowball/compiler/generator_csharp.c +297 -260
- data/vendor/snowball/compiler/generator_go.c +323 -254
- data/vendor/snowball/compiler/generator_java.c +326 -252
- data/vendor/snowball/compiler/generator_js.c +362 -252
- data/vendor/snowball/compiler/generator_pascal.c +349 -197
- data/vendor/snowball/compiler/generator_python.c +257 -240
- data/vendor/snowball/compiler/generator_rust.c +423 -251
- data/vendor/snowball/compiler/header.h +117 -71
- data/vendor/snowball/compiler/space.c +137 -68
- data/vendor/snowball/compiler/syswords.h +2 -2
- data/vendor/snowball/compiler/tokeniser.c +125 -107
- data/vendor/snowball/csharp/Snowball/Among.cs +14 -14
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +7 -7
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +57 -37
- data/vendor/snowball/csharp/Stemwords/App.config +2 -2
- data/vendor/snowball/csharp/Stemwords/Program.cs +16 -12
- data/vendor/snowball/doc/libstemmer_c_README +7 -4
- data/vendor/snowball/doc/libstemmer_csharp_README +4 -1
- data/vendor/snowball/doc/libstemmer_java_README +12 -1
- data/vendor/snowball/doc/libstemmer_js_README +6 -4
- data/vendor/snowball/doc/libstemmer_python_README +9 -4
- data/vendor/snowball/examples/stemwords.c +12 -12
- data/vendor/snowball/go/env.go +107 -31
- data/vendor/snowball/go/util.go +0 -4
- data/vendor/snowball/include/libstemmer.h +4 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +32 -15
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +347 -261
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +3 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +52 -37
- data/vendor/snowball/javascript/base-stemmer.js +186 -2
- data/vendor/snowball/javascript/stemwords.js +3 -6
- data/vendor/snowball/libstemmer/libstemmer_c.in +1 -1
- data/vendor/snowball/libstemmer/mkalgorithms.pl +6 -6
- data/vendor/snowball/libstemmer/mkmodules.pl +2 -2
- data/vendor/snowball/libstemmer/modules.txt +13 -10
- data/vendor/snowball/libstemmer/test.c +1 -1
- data/vendor/snowball/pascal/SnowballProgram.pas +84 -2
- data/vendor/snowball/pascal/generate.pl +13 -13
- data/vendor/snowball/python/create_init.py +4 -1
- data/vendor/snowball/python/setup.cfg +0 -3
- data/vendor/snowball/python/setup.py +8 -3
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +20 -54
- data/vendor/snowball/python/stemwords.py +8 -12
- data/vendor/snowball/runtime/api.c +10 -5
- data/vendor/snowball/runtime/header.h +10 -9
- data/vendor/snowball/runtime/utilities.c +9 -9
- data/vendor/snowball/rust/build.rs +1 -1
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +83 -5
- data/vendor/snowball/tests/stemtest.c +7 -4
- metadata +7 -7
- data/vendor/snowball/.travis.yml +0 -112
- data/vendor/snowball/algorithms/german2.sbl +0 -145
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +0 -240
- data/vendor/snowball/compiler/syswords2.h +0 -13
@@ -3,8 +3,6 @@
|
|
3
3
|
*
|
4
4
|
* Author: Assaf Urieli
|
5
5
|
* Emails: assaf.urieli at gmail.com
|
6
|
-
* Version: 0.1 (15.05.2020)
|
7
|
-
*
|
8
6
|
********************************************* */
|
9
7
|
|
10
8
|
routines (
|
@@ -103,9 +101,9 @@ define mark_regions as (
|
|
103
101
|
(
|
104
102
|
try (
|
105
103
|
// Replace past participle ge- at start of word
|
106
|
-
// Unless word starts with gelt- or gebn-
|
104
|
+
// Unless word starts with gelt- or gebn- or the whole word is ge
|
107
105
|
['{Giml}{Ayen}']
|
108
|
-
not ('{Lamed}{Tes}' or '{Beys}{Nun}') <- 'GE'
|
106
|
+
not ('{Lamed}{Tes}' or '{Beys}{Nun}' or atlimit) <- 'GE'
|
109
107
|
)
|
110
108
|
|
111
109
|
try (
|
@@ -149,7 +147,7 @@ define mark_regions as (
|
|
149
147
|
// Either 3 consonants or the first non-vowel after a vowel
|
150
148
|
(
|
151
149
|
not (consonant consonant consonant setmark p1)
|
152
|
-
|
150
|
+
gopast vowel goto non-vowel setmark p1
|
153
151
|
)
|
154
152
|
try($p1 < x $p1 = x) // at least 3 past the prefix
|
155
153
|
)
|
@@ -178,7 +176,7 @@ backwardmode (
|
|
178
176
|
|
179
177
|
// Plural/adjective endings: -enem, -ener, -ene, -ens
|
180
178
|
'{Ayen}{Nun}{Ayen}' '{Ayen}{Nun}{Ayen}{Mem}' '{Ayen}{Nun}{Ayen}{Reysh}' '{Ayen}{Nun}{Samekh}'
|
181
|
-
(R1 delete
|
179
|
+
(R1 delete
|
182
180
|
[substring] among (
|
183
181
|
// -gegangen => -gey
|
184
182
|
'{Giml}{Alef}{Nun}{Giml}' (<- '{Giml}{TsveyYudn}')
|
@@ -237,7 +235,7 @@ backwardmode (
|
|
237
235
|
|
238
236
|
// Verb/past participle ending: -t
|
239
237
|
'{Tes}'
|
240
|
-
( R1 delete )
|
238
|
+
( R1 delete )
|
241
239
|
|
242
240
|
// As well as noun/adjectives ending in -tn, -te, -ter, -ts so that the "-t" doesn't differentiate
|
243
241
|
// Similarly for past participles: -tns, -tene, -tenem, -tener
|
@@ -271,95 +269,95 @@ backwardmode (
|
|
271
269
|
(<- '{Shin}{Reysh}{TsveyYudn}{Beys}' )
|
272
270
|
|
273
271
|
// -gemiten => -mayd
|
274
|
-
'GE{Mem}{Yud}{Tes}{Nun}'
|
272
|
+
'GE{Mem}{Yud}{Tes}{Nun}'
|
275
273
|
(<- '{Mem}{TsveyYudn}{Dalet}')
|
276
274
|
|
277
275
|
// -gebiten => -bayt
|
278
|
-
'GE{Beys}{Yud}{Tes}{Nun}'
|
276
|
+
'GE{Beys}{Yud}{Tes}{Nun}'
|
279
277
|
(<- '{Beys}{TsveyYudn}{Tes}')
|
280
278
|
|
281
279
|
// -gebisen => -bays
|
282
|
-
'GE{Beys}{Yud}{Samekh}{Nun}'
|
280
|
+
'GE{Beys}{Yud}{Samekh}{Nun}'
|
283
281
|
( <- '{Beys}{TsveyYudn}{Samekh}')
|
284
282
|
|
285
283
|
// -gevizen => -vayz
|
286
|
-
'{TsveyVovn}{Yud}{Zayen}{Nun}'
|
284
|
+
'{TsveyVovn}{Yud}{Zayen}{Nun}'
|
287
285
|
( <- '{TsveyVovn}{TsveyYudn}{Zayen}')
|
288
286
|
|
289
287
|
// -getriben => -trayb
|
290
|
-
'{Tes}{Reysh}{Yud}{Beys}{Nun}'
|
288
|
+
'{Tes}{Reysh}{Yud}{Beys}{Nun}'
|
291
289
|
( <- '{Tes}{Reysh}{TsveyYudn}{Beys}')
|
292
290
|
|
293
291
|
// -geliten => -layt
|
294
|
-
'GE{Lamed}{Yud}{Tes}{Nun}'
|
292
|
+
'GE{Lamed}{Yud}{Tes}{Nun}'
|
295
293
|
( <- '{Lamed}{TsveyYudn}{Tes}')
|
296
294
|
|
297
295
|
// -gekliben => -klayb
|
298
|
-
'{Kuf}{Lamed}{Yud}{Beys}{Nun}'
|
296
|
+
'{Kuf}{Lamed}{Yud}{Beys}{Nun}'
|
299
297
|
( <- '{Kuf}{Lamed}{TsveyYudn}{Beys}')
|
300
298
|
|
301
299
|
// -geriben => -rayb
|
302
|
-
'{Reysh}{Yud}{Beys}{Nun}'
|
300
|
+
'{Reysh}{Yud}{Beys}{Nun}'
|
303
301
|
( <- '{Reysh}{TsveyYudn}{Beys}')
|
304
302
|
|
305
303
|
// -gerisen => -rays
|
306
|
-
'GE{Reysh}{Yud}{Samekh}{Nun}'
|
304
|
+
'GE{Reysh}{Yud}{Samekh}{Nun}'
|
307
305
|
( <- '{Reysh}{TsveyYudn}{Samekh}')
|
308
306
|
|
309
307
|
// -geshvigen => -shvayg
|
310
|
-
'{Shin}{TsveyVovn}{Yud}{Giml}{Nun}'
|
308
|
+
'{Shin}{TsveyVovn}{Yud}{Giml}{Nun}'
|
311
309
|
( <- '{Shin}{TsveyVovn}{TsveyYudn}{Giml}')
|
312
310
|
|
313
311
|
// -geshmisen => -shmays
|
314
|
-
'{Shin}{Mem}{Yud}{Samekh}{Nun}'
|
312
|
+
'{Shin}{Mem}{Yud}{Samekh}{Nun}'
|
315
313
|
( <- '{Shin}{Mem}{TsveyYudn}{Samekh}')
|
316
314
|
|
317
315
|
// -geshniten => -shnayd
|
318
|
-
'{Shin}{Nun}{Yud}{Tes}{Nun}'
|
316
|
+
'{Shin}{Nun}{Yud}{Tes}{Nun}'
|
319
317
|
( <- '{Shin}{Nun}{TsveyYudn}{Dalet}')
|
320
318
|
|
321
319
|
// -gebunden => -bind
|
322
|
-
'{Beys}{Vov}{Nun}{Dalet}{Nun}'
|
320
|
+
'{Beys}{Vov}{Nun}{Dalet}{Nun}'
|
323
321
|
( <- '{Beys}{Yud}{Nun}{Dalet}')
|
324
322
|
|
325
323
|
// -gevuntshn => -vintsh
|
326
|
-
'{TsveyVovn}{Vov}{Tes}{Shin}{Nun}'
|
324
|
+
'{TsveyVovn}{Vov}{Tes}{Shin}{Nun}'
|
327
325
|
( <- '{TsveyVovn}{Yud}{Tes}{Shin}')
|
328
326
|
|
329
327
|
// -gezungen => -zing
|
330
|
-
'{Zayen}{Vov}{Nun}{Giml}{Nun}'
|
328
|
+
'{Zayen}{Vov}{Nun}{Giml}{Nun}'
|
331
329
|
( <- '{Zayen}{Yud}{Nun}{Giml}')
|
332
330
|
|
333
331
|
// -getrunken => -trink
|
334
|
-
'{Tes}{Reysh}{Vov}{Nun}{Kuf}{Nun}'
|
332
|
+
'{Tes}{Reysh}{Vov}{Nun}{Kuf}{Nun}'
|
335
333
|
( <- '{Tes}{Reysh}{Yud}{Nun}{Kuf}')
|
336
334
|
|
337
335
|
// -getsvungen => -tsving
|
338
|
-
'{Tsadek}{TsveyVovn}{Vov}{Nun}{Giml}{Nun}'
|
336
|
+
'{Tsadek}{TsveyVovn}{Vov}{Nun}{Giml}{Nun}'
|
339
337
|
( <- '{Tsadek}{TsveyVovn}{Yud}{Nun}{Giml}')
|
340
338
|
|
341
339
|
// -geshlungen => -shling
|
342
|
-
'{Shin}{Lamed}{Vov}{Nun}{Giml}{Nun}'
|
340
|
+
'{Shin}{Lamed}{Vov}{Nun}{Giml}{Nun}'
|
343
341
|
( <- '{Shin}{Lamed}{Yud}{Nun}{Giml}')
|
344
342
|
|
345
343
|
// -geboygen => -beyg
|
346
|
-
'{Beys}{VovYud}{Giml}{Nun}'
|
344
|
+
'{Beys}{VovYud}{Giml}{Nun}'
|
347
345
|
( <- '{Beys}{TsveyYudn}{Giml}')
|
348
346
|
|
349
347
|
// -gehoyben => -heyb
|
350
|
-
'{Hey}{VovYud}{Beys}{Nun}'
|
348
|
+
'{Hey}{VovYud}{Beys}{Nun}'
|
351
349
|
( <- '{Hey}{TsveyYudn}{Beys}')
|
352
350
|
|
353
351
|
// -farloyren => -farlir
|
354
|
-
'{Fey}{Alef}{Reysh}{Lamed}{VovYud}{Reysh}{Nun}'
|
352
|
+
'{Fey}{Alef}{Reysh}{Lamed}{VovYud}{Reysh}{Nun}'
|
355
353
|
( <- '{Fey}{Alef}{Reysh}{Lamed}{Yud}{Reysh}')
|
356
354
|
|
357
355
|
// -shtanen => -shtey
|
358
|
-
'{Shin}{Tes}{Alef}{Nun}{Ayen}{Nun}'
|
356
|
+
'{Shin}{Tes}{Alef}{Nun}{Ayen}{Nun}'
|
359
357
|
( <- '{Shin}{Tes}{TsveyYudn}')
|
360
358
|
|
361
359
|
// -geshvoyrn => -shver
|
362
|
-
'{Shin}{TsveyVovn}{VovYud}{Reysh}{Nun}'
|
360
|
+
'{Shin}{TsveyVovn}{VovYud}{Reysh}{Nun}'
|
363
361
|
( <- '{Shin}{TsveyVovn}{Ayen}{Reysh}')
|
364
362
|
|
365
363
|
// -(ge)brakht (shortened to -brakht after prefixes) => -breng
|
@@ -379,7 +377,7 @@ backwardmode (
|
|
379
377
|
// Plural ending: -im
|
380
378
|
'{Yud}{Mem}'
|
381
379
|
( R1 delete )
|
382
|
-
|
380
|
+
|
383
381
|
// Plural ending: -os (Hebraic), replace with -h
|
384
382
|
'{Vov}{Sof}'
|
385
383
|
( R1 <- '{Hey}' )
|
@@ -387,7 +385,7 @@ backwardmode (
|
|
387
385
|
// Diminutive endings: -elekh, -ele, -lekh, -eles, -elen
|
388
386
|
'{Ayen}{Lamed}{Ayen}{Khof}' '{Ayen}{Lamed}{Ayen}' '{Lamed}{Ayen}{Khof}' '{Ayen}{Lamed}{Ayen}{Samekh}' '{Ayen}{Lamed}{Ayen}{Nun}'
|
389
387
|
( R1 delete )
|
390
|
-
|
388
|
+
|
391
389
|
// Noun ending: -ist
|
392
390
|
'{Yud}{Samekh}{Tes}'
|
393
391
|
(
|
@@ -400,18 +398,18 @@ backwardmode (
|
|
400
398
|
// Noun ending: -istn
|
401
399
|
'{Yud}{Samekh}{Tes}{Nun}'
|
402
400
|
( R1 delete )
|
403
|
-
|
401
|
+
|
404
402
|
// Verb ending: -stu
|
405
403
|
'{Samekh}{Tes}{Vov}'
|
406
404
|
( R1 delete )
|
407
405
|
|
408
|
-
// Superlative ending: -ster, -ste, -stn
|
409
|
-
'{Samekh}{Tes}{Ayen}{Reysh}' '{Samekh}{Tes}{Ayen}' '{Samekh}{Tes}{Nun}'
|
406
|
+
// Superlative ending: -ster, -ste, -stn
|
407
|
+
'{Samekh}{Tes}{Ayen}{Reysh}' '{Samekh}{Tes}{Ayen}' '{Samekh}{Tes}{Nun}'
|
410
408
|
( R1 delete )
|
411
|
-
|
409
|
+
|
412
410
|
// Ambiguous verb ending: -st
|
413
411
|
'{Samekh}{Tes}'
|
414
|
-
( R1 delete )
|
412
|
+
( R1 delete )
|
415
413
|
)
|
416
414
|
)
|
417
415
|
|
@@ -436,7 +434,7 @@ backwardmode (
|
|
436
434
|
// Exceptions to above: -blik, -glik
|
437
435
|
'{Beys}{Lamed}{Yud}{Kuf}' '{Giml}{Lamed}{Yud}{Kuf}'
|
438
436
|
( true )
|
439
|
-
|
437
|
+
|
440
438
|
// Present participle endings: -ndik
|
441
439
|
'{Nun}{Dalet}{Yud}{Kuf}'
|
442
440
|
( R1 delete )
|