mittens 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +3 -3
- data/lib/mittens/version.rb +1 -1
- data/vendor/snowball/.github/workflows/ci.yml +216 -0
- data/vendor/snowball/CONTRIBUTING.rst +111 -62
- data/vendor/snowball/GNUmakefile +194 -136
- data/vendor/snowball/NEWS +798 -3
- data/vendor/snowball/README.rst +50 -1
- data/vendor/snowball/ada/src/stemmer.adb +25 -13
- data/vendor/snowball/ada/src/stemmer.ads +9 -9
- data/vendor/snowball/ada/stemmer_config.gpr +7 -7
- data/vendor/snowball/algorithms/basque.sbl +4 -19
- data/vendor/snowball/algorithms/catalan.sbl +2 -9
- data/vendor/snowball/algorithms/danish.sbl +1 -1
- data/vendor/snowball/algorithms/dutch.sbl +284 -122
- data/vendor/snowball/algorithms/dutch_porter.sbl +178 -0
- data/vendor/snowball/algorithms/english.sbl +52 -37
- data/vendor/snowball/algorithms/esperanto.sbl +157 -0
- data/vendor/snowball/algorithms/estonian.sbl +269 -0
- data/vendor/snowball/algorithms/finnish.sbl +2 -3
- data/vendor/snowball/algorithms/french.sbl +42 -16
- data/vendor/snowball/algorithms/german.sbl +35 -14
- data/vendor/snowball/algorithms/greek.sbl +76 -76
- data/vendor/snowball/algorithms/hungarian.sbl +8 -6
- data/vendor/snowball/algorithms/indonesian.sbl +14 -8
- data/vendor/snowball/algorithms/italian.sbl +11 -21
- data/vendor/snowball/algorithms/lithuanian.sbl +36 -37
- data/vendor/snowball/algorithms/lovins.sbl +0 -1
- data/vendor/snowball/algorithms/nepali.sbl +138 -37
- data/vendor/snowball/algorithms/norwegian.sbl +19 -5
- data/vendor/snowball/algorithms/porter.sbl +2 -2
- data/vendor/snowball/algorithms/portuguese.sbl +9 -13
- data/vendor/snowball/algorithms/romanian.sbl +17 -4
- data/vendor/snowball/algorithms/serbian.sbl +467 -468
- data/vendor/snowball/algorithms/spanish.sbl +5 -7
- data/vendor/snowball/algorithms/swedish.sbl +60 -6
- data/vendor/snowball/algorithms/tamil.sbl +207 -176
- data/vendor/snowball/algorithms/turkish.sbl +461 -445
- data/vendor/snowball/algorithms/yiddish.sbl +36 -38
- data/vendor/snowball/compiler/analyser.c +445 -192
- data/vendor/snowball/compiler/driver.c +109 -101
- data/vendor/snowball/compiler/generator.c +853 -464
- data/vendor/snowball/compiler/generator_ada.c +404 -366
- data/vendor/snowball/compiler/generator_csharp.c +297 -260
- data/vendor/snowball/compiler/generator_go.c +323 -254
- data/vendor/snowball/compiler/generator_java.c +326 -252
- data/vendor/snowball/compiler/generator_js.c +362 -252
- data/vendor/snowball/compiler/generator_pascal.c +349 -197
- data/vendor/snowball/compiler/generator_python.c +257 -240
- data/vendor/snowball/compiler/generator_rust.c +423 -251
- data/vendor/snowball/compiler/header.h +117 -71
- data/vendor/snowball/compiler/space.c +137 -68
- data/vendor/snowball/compiler/syswords.h +2 -2
- data/vendor/snowball/compiler/tokeniser.c +125 -107
- data/vendor/snowball/csharp/Snowball/Among.cs +14 -14
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +7 -7
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +57 -37
- data/vendor/snowball/csharp/Stemwords/App.config +2 -2
- data/vendor/snowball/csharp/Stemwords/Program.cs +16 -12
- data/vendor/snowball/doc/libstemmer_c_README +7 -4
- data/vendor/snowball/doc/libstemmer_csharp_README +4 -1
- data/vendor/snowball/doc/libstemmer_java_README +12 -1
- data/vendor/snowball/doc/libstemmer_js_README +6 -4
- data/vendor/snowball/doc/libstemmer_python_README +9 -4
- data/vendor/snowball/examples/stemwords.c +12 -12
- data/vendor/snowball/go/env.go +107 -31
- data/vendor/snowball/go/util.go +0 -4
- data/vendor/snowball/include/libstemmer.h +4 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +32 -15
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +347 -261
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +3 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +52 -37
- data/vendor/snowball/javascript/base-stemmer.js +186 -2
- data/vendor/snowball/javascript/stemwords.js +3 -6
- data/vendor/snowball/libstemmer/libstemmer_c.in +1 -1
- data/vendor/snowball/libstemmer/mkalgorithms.pl +6 -6
- data/vendor/snowball/libstemmer/mkmodules.pl +2 -2
- data/vendor/snowball/libstemmer/modules.txt +13 -10
- data/vendor/snowball/libstemmer/test.c +1 -1
- data/vendor/snowball/pascal/SnowballProgram.pas +84 -2
- data/vendor/snowball/pascal/generate.pl +13 -13
- data/vendor/snowball/python/create_init.py +4 -1
- data/vendor/snowball/python/setup.cfg +0 -3
- data/vendor/snowball/python/setup.py +8 -3
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +20 -54
- data/vendor/snowball/python/stemwords.py +8 -12
- data/vendor/snowball/runtime/api.c +10 -5
- data/vendor/snowball/runtime/header.h +10 -9
- data/vendor/snowball/runtime/utilities.c +9 -9
- data/vendor/snowball/rust/build.rs +1 -1
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +83 -5
- data/vendor/snowball/tests/stemtest.c +7 -4
- metadata +7 -7
- data/vendor/snowball/.travis.yml +0 -112
- data/vendor/snowball/algorithms/german2.sbl +0 -145
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +0 -240
- data/vendor/snowball/compiler/syswords2.h +0 -13
@@ -62,9 +62,9 @@ backwardmode (
|
|
62
62
|
// prefix not in {ke, peng, per}
|
63
63
|
define SUFFIX_KAN_OK as (
|
64
64
|
// On page 29, the example "kompas Q.31" says "Both Nazief and Porter
|
65
|
-
// stemmer converted the word peledakan (blast, explotion) to
|
66
|
-
// blast, to explode)". However, the algorithm as described
|
67
|
-
// behave in this way - grammatically the prefix pe- occurs as a
|
65
|
+
// stemmer converted the word peledakan (blast, explotion [sic]) to
|
66
|
+
// ledak (to blast, to explode)". However, the algorithm as described
|
67
|
+
// doesn't behave in this way - grammatically the prefix pe- occurs as a
|
68
68
|
// variation of both the first-order derivational prefix peng- and the
|
69
69
|
// second-order derivational prefix per-, but table 2.5 doesn't include
|
70
70
|
// "pe", only table 2.6 does, so "peledakan" is handled (incorrectly)
|
@@ -104,13 +104,19 @@ backwardmode (
|
|
104
104
|
//
|
105
105
|
// Elsewhere the paper defines V... as meaning "the stem starts with
|
106
106
|
// a vowel" and K... as meaning "the stem starts with a consonant".
|
107
|
+
// The meaning of | isn't actually defined, but clearly means
|
108
|
+
// alternation.
|
107
109
|
//
|
108
|
-
//
|
110
|
+
// However nowhere is the precedence of | vs ... defined, and there
|
111
|
+
// isn't a standard precedence we could reasonably assume. In other
|
112
|
+
// places where the paper says X|Y... it seems the | binds more
|
109
113
|
// tightly, so it's (V|K)...cᵢcⱼ not V|(K...cᵢcⱼ). That seems a bit
|
110
114
|
// odd as the first letter must be either a vowel or a consonant, so
|
111
|
-
// that really just means "ends cᵢcⱼ"
|
112
|
-
//
|
113
|
-
//
|
115
|
+
// that really just means "ends cᵢcⱼ" (and has at least one letter
|
116
|
+
// before cᵢ but we only call SUFFIX_I_OK if $measure > 2 which
|
117
|
+
// ensures that part). However, nowhere does the paper use or define
|
118
|
+
// a notation such as ...X, which may explain this seemingly redundant
|
119
|
+
// way of specifying this.
|
114
120
|
//
|
115
121
|
// The conditions elsewhere on prefix removal (e.g. V...) are clearly
|
116
122
|
// on the stem left after the prefix is removed. None of the other
|
@@ -118,7 +124,7 @@ backwardmode (
|
|
118
124
|
// consistency with the prefix rules we might expect that the cᵢcⱼ
|
119
125
|
// test is on what's left *after* removing the "i" suffix.
|
120
126
|
//
|
121
|
-
//
|
127
|
+
// Studying Indonesian wordlists and discussion with a native
|
122
128
|
// speaker leads us to conclude that the purpose of this check is to
|
123
129
|
// protect words of foreign origin (e.g. "televisi", "organisasi",
|
124
130
|
// "komunikasi") from stemming, and the common feature of these is
|
@@ -1,6 +1,5 @@
|
|
1
1
|
|
2
2
|
routines (
|
3
|
-
exceptions
|
4
3
|
prelude postlude mark_regions
|
5
4
|
RV R1 R2
|
6
5
|
attached_pronoun
|
@@ -59,6 +58,8 @@ define mark_regions as (
|
|
59
58
|
do (
|
60
59
|
( v (non-v gopast v) or (v gopast non-v) )
|
61
60
|
or
|
61
|
+
'divan' // Otherwise "divano" stems to "div" and collides with "diva".
|
62
|
+
or
|
62
63
|
( non-v (non-v gopast v) or (v next) )
|
63
64
|
setmark pV
|
64
65
|
)
|
@@ -108,7 +109,7 @@ backwardmode (
|
|
108
109
|
'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti'
|
109
110
|
'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 'ose' 'mente'
|
110
111
|
'atrice' 'atrici'
|
111
|
-
'ante' 'anti'
|
112
|
+
'ante' 'anti'
|
112
113
|
( R2 delete )
|
113
114
|
'azione' 'azioni' 'atore' 'atori'
|
114
115
|
( R2 delete
|
@@ -179,24 +180,13 @@ backwardmode (
|
|
179
180
|
)
|
180
181
|
)
|
181
182
|
|
182
|
-
define exceptions as (
|
183
|
-
['divano' atlimit ] <- 'divan' // Otherwise "divano" stems to "div" and collides with "diva"
|
184
|
-
)
|
185
|
-
|
186
183
|
define stem as (
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
do postlude
|
196
|
-
)
|
184
|
+
do prelude
|
185
|
+
do mark_regions
|
186
|
+
backwards (
|
187
|
+
do attached_pronoun
|
188
|
+
do (standard_suffix or verb_suffix)
|
189
|
+
do vowel_suffix
|
190
|
+
)
|
191
|
+
do postlude
|
197
192
|
)
|
198
|
-
|
199
|
-
/*
|
200
|
-
Note 1: additions of 15 Jun 2005
|
201
|
-
*/
|
202
|
-
|
@@ -5,10 +5,10 @@ stringescapes { }
|
|
5
5
|
|
6
6
|
/* Special characters in Unicode Latin Extended-A */
|
7
7
|
// ' nosine
|
8
|
-
stringdef
|
9
|
-
stringdef
|
10
|
-
stringdef
|
11
|
-
stringdef
|
8
|
+
stringdef ak '{U+0105}' // ą a + ogonek
|
9
|
+
stringdef ek '{U+0119}' // ę e + ogonek
|
10
|
+
stringdef ik '{U+012F}' // į i + ogonek
|
11
|
+
stringdef uk '{U+0173}' // ų u + ogonek
|
12
12
|
|
13
13
|
// . taskas
|
14
14
|
stringdef e. '{U+0117}' // ė e + dot
|
@@ -16,10 +16,10 @@ stringdef e. '{U+0117}' // ė e + dot
|
|
16
16
|
// - ilgoji
|
17
17
|
stringdef u- '{U+016B}' // ū u + macron
|
18
18
|
|
19
|
-
//
|
20
|
-
stringdef
|
21
|
-
stringdef
|
22
|
-
stringdef
|
19
|
+
// v varnele
|
20
|
+
stringdef cv '{U+010D}' // č c + caron (haček)
|
21
|
+
stringdef sv '{U+0161}' // š s + caron (haček)
|
22
|
+
stringdef zv '{U+017E}' // ž z + caron (haček)
|
23
23
|
|
24
24
|
// [C](VC)^m[V|C]
|
25
25
|
// definitions of variables for
|
@@ -31,31 +31,30 @@ integers ( p1 )
|
|
31
31
|
groupings ( v )
|
32
32
|
|
33
33
|
// v - all lithuanian vowels
|
34
|
-
define v 'aeiyou{
|
34
|
+
define v 'aeiyou{ak}{ek}{ik}{uk}{e.}{u-}'
|
35
35
|
|
36
36
|
// all lithuanian stemmer routines: 4 steps
|
37
37
|
routines (
|
38
|
-
step2
|
38
|
+
step2 step1 fix_chdz fix_gd fix_conflicts
|
39
39
|
)
|
40
40
|
|
41
41
|
backwardmode (
|
42
42
|
|
43
|
-
define R1 as $p1 <= cursor
|
44
43
|
define step1 as (
|
45
|
-
setlimit tomark p1 for ([substring])
|
44
|
+
setlimit tomark p1 for ([substring]) among (
|
46
45
|
// Daiktavardžiai (Nouns)
|
47
46
|
// I linksniuotė (declension I)
|
48
47
|
'as' 'ias' 'is' 'ys' // vyras, kelias, brolis, gaidys
|
49
48
|
'o' 'io' // vyro, kelio
|
50
49
|
'ui' 'iui' // vyrui, keliui
|
51
|
-
'{
|
50
|
+
'{ak}' 'i{ak}' '{ik}' // vyrą, kelią, brolį
|
52
51
|
'u' 'iu' // vyru, keliu
|
53
52
|
'e' 'yje' // vyre, kelyje
|
54
53
|
'y' 'au' 'i' // kely, brolau, broli,
|
55
54
|
'an' // nusižengiman
|
56
55
|
|
57
56
|
'ai' 'iai' // vyrai, keliai
|
58
|
-
'{
|
57
|
+
'{uk}' 'i{uk}' // vyrų, kelių
|
59
58
|
'ams' 'am' // vyrams, vyram
|
60
59
|
'iams' 'iam' // broliams, broliam
|
61
60
|
'us' 'ius' // vyrus, brolius
|
@@ -65,9 +64,9 @@ backwardmode (
|
|
65
64
|
'ysna' // žutysna
|
66
65
|
|
67
66
|
'asis' 'aisi' // sukimasis, sukimaisi
|
68
|
-
'osi' '{
|
67
|
+
'osi' '{uk}si' // sukimosi, sukimųsi
|
69
68
|
'uisi' // sukimuisi
|
70
|
-
'{
|
69
|
+
'{ak}si' // sukimąsi
|
71
70
|
'usi' // sukimusi
|
72
71
|
'esi' // sukimesi
|
73
72
|
|
@@ -89,7 +88,7 @@ backwardmode (
|
|
89
88
|
'{e.}' // gervė
|
90
89
|
'{e.}s' // gervės
|
91
90
|
'ei' // gervei
|
92
|
-
'{
|
91
|
+
'{ek}' // gervę
|
93
92
|
'{e.}j' '{e.}je' // gervėj, gervėje
|
94
93
|
'{e.}ms' // gervėms
|
95
94
|
'es' // gerves
|
@@ -114,14 +113,14 @@ backwardmode (
|
|
114
113
|
// V linksniuote (declension V)
|
115
114
|
'ies' 'ens' 'enio' 'ers' // avies, vandens, sesers
|
116
115
|
'eniui' 'eriai' // vandeniui, eriai
|
117
|
-
'en{
|
116
|
+
'en{ik}' 'er{ik}' // vandenį, seserį
|
118
117
|
'imi' 'eniu' 'erimi' 'eria' // avimi, vandeniu, seserimi, seseria
|
119
118
|
'enyje' 'eryje' // vandenyje, seseryje
|
120
119
|
'ie' 'enie' 'erie' // avie, vandenie, seserie
|
121
120
|
|
122
121
|
'enys' 'erys' // vandenys, seserys
|
123
|
-
// 'en{
|
124
|
-
'er{
|
122
|
+
// 'en{uk}' konfliktas su 'žandenų' 'antenų'
|
123
|
+
'er{uk}' // seserų
|
125
124
|
'ims' 'enims' 'erims' // avims, vandenims, seserims
|
126
125
|
'enis' // vandenis
|
127
126
|
'imis' // žebenkštimis
|
@@ -184,11 +183,11 @@ backwardmode (
|
|
184
183
|
'sit' 'site' // gersit, gersite
|
185
184
|
|
186
185
|
// tariamoji nuosaka (subjunctive mood)
|
187
|
-
'{
|
186
|
+
'{cv}iau' '{cv}iausi' // dirbčiau
|
188
187
|
'tum' 'tumei' // dirbtum, dirbtumei
|
189
188
|
'tumeis' 'tumeisi' // mokytumeis, mokytumeisi
|
190
|
-
// 't{
|
191
|
-
't{
|
189
|
+
// 't{uk}' nes blogai batutų -> batų
|
190
|
+
't{uk}si' // mokytųsi
|
192
191
|
// 'tume' konfliktas su 'šventume'
|
193
192
|
'tum{e.}m' // dirbtumėm
|
194
193
|
'tum{e.}me' // dirbtumėme
|
@@ -219,8 +218,8 @@ backwardmode (
|
|
219
218
|
// 'tis' konfliktas, nes rytme-tis -> rytme
|
220
219
|
|
221
220
|
// dalyviai (participles)
|
222
|
-
'{
|
223
|
-
't{
|
221
|
+
'{ak}s' 'i{ak}s' '{ik}s' // dirbąs, žaidžiąs, gulįs
|
222
|
+
't{uk}s' // suktųs -> suk
|
224
223
|
'sim{e.}s' // suksimės
|
225
224
|
'sit{e.}s' // suksitės
|
226
225
|
'kite' // supkite
|
@@ -236,7 +235,7 @@ backwardmode (
|
|
236
235
|
// budvardziu priesagos (Adjective suffixes)
|
237
236
|
// 'in' // konfliktas su 'augintinis' ir 'akiniais' // lauk-in-is
|
238
237
|
'ing' // tvark-ing-as
|
239
|
-
'i{
|
238
|
+
'i{sv}k' // lenk-išk-as
|
240
239
|
'{e.}t' // dem-ėt-as
|
241
240
|
'ot' // garban-ot-as
|
242
241
|
'uot' 'iuot' // lang-uot-as, akin-iuot-as
|
@@ -247,15 +246,15 @@ backwardmode (
|
|
247
246
|
'iul' // maž-ul-is
|
248
247
|
'{e.}l' // maž-ėl-is
|
249
248
|
'yl' // maž-yl-is
|
250
|
-
'u{
|
249
|
+
'u{cv}iuk' // maž-učiuk-as
|
251
250
|
'uliuk' // maž-uliuk-as
|
252
251
|
'ut{e.}ait' // maž-utėlait-is
|
253
252
|
'ok' // did-ok-as
|
254
253
|
'iok' // višč-iok-as
|
255
|
-
'sv' '{
|
254
|
+
'sv' '{sv}v' 'zgan' // sal-sv-as, pilk-šv-as, bal-zgan-as
|
256
255
|
'op' 'iop' // dvej-op-as, viener-iop-as
|
257
256
|
'ain' // apval-ain-as
|
258
|
-
'yk{
|
257
|
+
'yk{sv}t' 'yk{sv}{cv}' // ten-ykšt-is, vakar-ykšč-ias
|
259
258
|
|
260
259
|
// laisniai
|
261
260
|
'esn' // did-esn-is
|
@@ -266,17 +265,17 @@ backwardmode (
|
|
266
265
|
'ias' // žaliasis
|
267
266
|
'oj' 'ioj' // gerojo, žaliojo
|
268
267
|
'aj' 'iaj' // gerajam, žaliajam
|
269
|
-
'{
|
268
|
+
'{ak}j' 'i{ak}j' // gerąjį, žaliąjį
|
270
269
|
'uoj' 'iuoj' // geruoju, žaliuoju
|
271
270
|
'iej' // gerieji
|
272
|
-
'{
|
271
|
+
'{uk}j' 'i{uk}j' // gerųjų, žaliųjų
|
273
272
|
'ies' // geriesiems
|
274
273
|
'uos' 'iuos' // geruosius, žaliuosius
|
275
274
|
'ais' 'iais' // geraisiais, žaliaisiais
|
276
275
|
|
277
276
|
// moteriska gimine (Female gender)
|
278
277
|
'os' 'ios' // gerosios, žaliosios
|
279
|
-
'{
|
278
|
+
'{ak}s' 'i{ak}s' // gerąsios, žaliąsias
|
280
279
|
|
281
280
|
// būtasis dažninis laikas (frequentative past tense)
|
282
281
|
'dav' // ei-dav-o
|
@@ -285,9 +284,9 @@ backwardmode (
|
|
285
284
|
'ant' 'iant'
|
286
285
|
'int' // tur-int-is
|
287
286
|
'{e.}j' // tur-ėj-o
|
288
|
-
'{
|
289
|
-
'{e.}j{
|
290
|
-
'{
|
287
|
+
'{ek}' //
|
288
|
+
'{e.}j{ek}'
|
289
|
+
'{ek}s' // dirb-ęs-is
|
291
290
|
|
292
291
|
'siant' // dirb-siant
|
293
292
|
|
@@ -336,8 +335,8 @@ backwardmode (
|
|
336
335
|
|
337
336
|
define fix_chdz as (
|
338
337
|
[substring] among (
|
339
|
-
'{
|
340
|
-
'd{
|
338
|
+
'{cv}' (<-'t')
|
339
|
+
'd{zv}' (<-'d')
|
341
340
|
)
|
342
341
|
)
|
343
342
|
|
@@ -6,10 +6,9 @@
|
|
6
6
|
*/
|
7
7
|
|
8
8
|
routines (
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
remove_category_3
|
9
|
+
remove_category_1
|
10
|
+
remove_category_2
|
11
|
+
remove_category_3
|
13
12
|
)
|
14
13
|
|
15
14
|
stringescapes {}
|
@@ -49,44 +48,146 @@ stringdef dvsau '{U+094C}' // DEVANAGARI_VOWEL_SIGN_AU
|
|
49
48
|
stringdef dsv '{U+094D}' // DEVANAGARI_SIGN_VIRAMA
|
50
49
|
|
51
50
|
externals ( stem )
|
52
|
-
backwardmode (
|
53
|
-
define remove_category_1 as(
|
54
|
-
[substring] among (
|
55
|
-
'{dlm}{dvsaa}{dlr}{dsv}{dlpha}{dlta}' '{dld}{dsv}{dlv}{dvsaa}{dlr}{dvsaa}' '{dls}{dsc}{dlg}{dvsai}' '{dls}{dsa}{dlg}'
|
56
|
-
'{dls}{dsc}{dlg}' '{dll}{dvsaa}{dli}' '{dll}{dvsaa}{dlii}' '{dlpa}{dlc}{dvsi}'
|
57
|
-
'{dll}{dvse}' '{dlr}{dlta}' '{dlm}{dvsai}' '{dlm}{dvsaa}'
|
58
|
-
(delete)
|
59
|
-
'{dlka}{dvso}' '{dlka}{dvsaa}' '{dlka}{dvsi}' '{dlka}{dvsii}' '{dlka}{dvsai}'(('{dle}' or '{dvse}' ()) or delete)
|
60
|
-
)
|
61
|
-
)
|
62
51
|
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
52
|
+
backwardmode (
|
53
|
+
define remove_category_1 as(
|
54
|
+
[substring] among (
|
55
|
+
'{dlm}{dvsaa}{dlr}{dsv}{dlpha}{dlta}'
|
56
|
+
'{dld}{dsv}{dlv}{dvsaa}{dlr}{dvsaa}'
|
57
|
+
'{dls}{dsc}{dlg}{dvsai}'
|
58
|
+
'{dls}{dsa}{dlg}'
|
59
|
+
'{dls}{dsc}{dlg}'
|
60
|
+
'{dll}{dvsaa}{dli}'
|
61
|
+
'{dll}{dvsaa}{dlii}'
|
62
|
+
'{dlpa}{dlc}{dvsi}'
|
63
|
+
'{dll}{dvse}'
|
64
|
+
'{dlr}{dlta}'
|
65
|
+
'{dlm}{dvsai}'
|
66
|
+
'{dlm}{dvsaa}'
|
67
|
+
(delete)
|
68
|
+
'{dlka}{dvso}'
|
69
|
+
'{dlka}{dvsaa}'
|
70
|
+
'{dlka}{dvsi}'
|
71
|
+
'{dlka}{dvsii}'
|
72
|
+
'{dlka}{dvsai}'
|
73
|
+
('{dle}' or '{dvse}' or delete)
|
74
|
+
)
|
75
|
+
)
|
68
76
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
77
|
+
define remove_category_2 as (
|
78
|
+
[substring] among(
|
79
|
+
'{dsc}' '{dsa}'
|
80
|
+
('{dly}{dvsau}' or '{dlc}{dvsau}' or '{dln}{dvsau}' or '{dltha}{dvse}' delete)
|
81
|
+
'{dvsai}'
|
82
|
+
('{dlta}{dsv}{dlr}' delete)
|
83
|
+
)
|
84
|
+
)
|
75
85
|
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
86
|
+
define remove_category_3 as(
|
87
|
+
[substring] among(
|
88
|
+
'{dltha}{dvsi}{dli}{dls}{dsv}'
|
89
|
+
'{dlh}{dvsu}{dln}{dvse}{dlc}'
|
90
|
+
'{dlh}{dvsu}{dln}{dsv}{dlc}'
|
91
|
+
'{dln}{dvse}{dlc}{dls}{dsv}'
|
92
|
+
'{dln}{dvse}{dlc}{dln}{dsv}'
|
93
|
+
'{dli}{dle}{dlka}{dvsii}'
|
94
|
+
'{dli}{dle}{dlka}{dvsaa}'
|
95
|
+
'{dli}{dle}{dlka}{dvso}'
|
96
|
+
'{dvsi}{dle}{dlka}{dvsii}'
|
97
|
+
'{dvsi}{dle}{dlka}{dvsaa}'
|
98
|
+
'{dvsi}{dle}{dlka}{dvso}'
|
99
|
+
'{dli}{dlc}{dln}{dsv}'
|
100
|
+
'{dvsi}{dlc}{dln}{dsv}'
|
101
|
+
'{dli}{dlc}{dls}{dsv}'
|
102
|
+
'{dvsi}{dlc}{dls}{dsv}'
|
103
|
+
'{dle}{dlc}{dln}{dsv}'
|
104
|
+
'{dvse}{dlc}{dln}{dsv}'
|
105
|
+
'{dle}{dlc}{dls}{dsv}'
|
106
|
+
'{dvse}{dlc}{dls}{dsv}'
|
107
|
+
'{dlc}{dvsi}{dln}{dsv}'
|
108
|
+
'{dlc}{dvse}{dls}{dsv}'
|
109
|
+
'{dlc}{dsv}{dly}{dvsau}'
|
110
|
+
'{dltha}{dvsi}{dln}{dsv}'
|
111
|
+
'{dltha}{dvsi}{dly}{dvso}'
|
112
|
+
'{dltha}{dvsi}{dly}{dvsau}'
|
113
|
+
'{dltha}{dvsi}{dls}{dsv}'
|
114
|
+
'{dltha}{dsv}{dly}{dvso}'
|
115
|
+
'{dltha}{dsv}{dly}{dvsau}'
|
116
|
+
'{dld}{dvsi}{dly}{dvso}'
|
117
|
+
'{dld}{dvse}{dlkha}{dvsi}'
|
118
|
+
'{dld}{dvse}{dlkha}{dvsii}'
|
119
|
+
'{dll}{dvsaa}{dln}{dsv}'
|
120
|
+
'{dlm}{dvsaa}{dltha}{dvsi}'
|
121
|
+
'{dln}{dvse}{dlka}{dvsai}'
|
122
|
+
'{dln}{dvse}{dlka}{dvsaa}'
|
123
|
+
'{dln}{dvse}{dlka}{dvso}'
|
124
|
+
'{dln}{dvse}{dlc}{dvsau}'
|
125
|
+
'{dlh}{dvso}{dls}{dsv}'
|
126
|
+
'{dli}{dln}{dsv}{dlc}'
|
127
|
+
'{dvsi}{dln}{dsv}{dlc}'
|
128
|
+
'{dln}{dvse}{dlc}{dvsu}'
|
129
|
+
'{dli}{dlc}{dvsau}'
|
130
|
+
'{dvsi}{dlc}{dvsau}'
|
131
|
+
'{dli}{dls}{dsv}'
|
132
|
+
'{dvsi}{dls}{dsv}'
|
133
|
+
'{dvsi}{dly}{dvso}'
|
134
|
+
'{dli}{dly}{dvso}'
|
135
|
+
'{dle}{dlka}{dvsaa}'
|
136
|
+
'{dvse}{dlka}{dvsaa}'
|
137
|
+
'{dle}{dlka}{dvsii}'
|
138
|
+
'{dvse}{dlka}{dvsii}'
|
139
|
+
'{dle}{dlka}{dvsai}'
|
140
|
+
'{dvse}{dlka}{dvsai}'
|
141
|
+
'{dle}{dlka}{dvso}'
|
142
|
+
'{dvse}{dlka}{dvso}'
|
143
|
+
'{dle}{dlc}{dvsu}'
|
144
|
+
'{dvse}{dlc}{dvsu}'
|
145
|
+
'{dle}{dlc}{dvsau}'
|
146
|
+
'{dvse}{dlc}{dvsau}'
|
147
|
+
'{dlc}{dln}{dsv}'
|
148
|
+
'{dlc}{dls}{dsv}'
|
149
|
+
'{dltha}{dvsi}{dle}'
|
150
|
+
'{dlpa}{dlr}{dsv}'
|
151
|
+
'{dlb}{dly}{dvso}'
|
152
|
+
'{dlh}{dlr}{dvsu}'
|
153
|
+
'{dlh}{dlr}{dvsuu}'
|
154
|
+
'{dvsi}{dld}{dvsaa}'
|
155
|
+
'{dli}{dld}{dvsaa}'
|
156
|
+
'{dvsi}{dld}{dvso}'
|
157
|
+
'{dli}{dld}{dvso}'
|
158
|
+
'{dvsi}{dld}{dvsai}'
|
159
|
+
'{dli}{dld}{dvsai}'
|
160
|
+
'{dln}{dvse}{dlc}'
|
161
|
+
'{dli}{dlc}'
|
162
|
+
'{dvsi}{dlc}'
|
163
|
+
'{dle}{dlc}'
|
164
|
+
'{dvse}{dlc}'
|
165
|
+
'{dlc}{dvsu}'
|
166
|
+
'{dlc}{dvse}'
|
167
|
+
'{dlc}{dvsau}'
|
168
|
+
'{dltha}{dvsii}'
|
169
|
+
'{dltha}{dvse}'
|
170
|
+
'{dld}{dvsaa}'
|
171
|
+
'{dld}{dvsii}'
|
172
|
+
'{dld}{dvsai}'
|
173
|
+
'{dld}{dvso}'
|
174
|
+
'{dln}{dvsu}'
|
175
|
+
'{dln}{dvse}'
|
176
|
+
'{dly}{dvso}'
|
177
|
+
'{dly}{dvsau}'
|
178
|
+
'{dlc}'
|
179
|
+
(delete)
|
180
|
+
)
|
181
|
+
)
|
82
182
|
|
83
183
|
)
|
84
184
|
|
85
185
|
define stem as (
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
186
|
+
backwards (
|
187
|
+
do remove_category_1
|
188
|
+
repeat (
|
189
|
+
do remove_category_2
|
190
|
+
remove_category_3
|
191
|
+
)
|
192
|
+
)
|
92
193
|
)
|
@@ -17,18 +17,22 @@ stringescapes {}
|
|
17
17
|
|
18
18
|
stringdef ae '{U+00E6}'
|
19
19
|
stringdef ao '{U+00E5}'
|
20
|
+
stringdef e^ '{U+00EA}' // e-circumflex
|
21
|
+
stringdef o` '{U+00F2}' // o-grave
|
22
|
+
stringdef o' '{U+00F3}' // o-acute
|
23
|
+
stringdef o^ '{U+00F4}' // o-circumflex
|
20
24
|
stringdef o/ '{U+00F8}'
|
21
25
|
|
22
|
-
define v '
|
26
|
+
define v 'ae{e^}io{o`}{o'}{o^}uy{ae}{ao}{o/}'
|
23
27
|
|
24
|
-
define s_ending '
|
28
|
+
define s_ending 'bcdfghjlmnoptvyz'
|
25
29
|
|
26
30
|
define mark_regions as (
|
27
31
|
|
28
32
|
$p1 = limit
|
29
33
|
|
30
34
|
test ( hop 3 setmark x )
|
31
|
-
|
35
|
+
gopast v gopast non-v setmark p1
|
32
36
|
try ( $p1 < x $p1 = x )
|
33
37
|
)
|
34
38
|
|
@@ -40,10 +44,20 @@ backwardmode (
|
|
40
44
|
|
41
45
|
'a' 'e' 'ede' 'ande' 'ende' 'ane' 'ene' 'hetene' 'en' 'heten' 'ar'
|
42
46
|
'er' 'heter' 'as' 'es' 'edes' 'endes' 'enes' 'hetenes' 'ens'
|
43
|
-
'hetens' '
|
47
|
+
'hetens' 'ets' 'et' 'het' 'ast'
|
44
48
|
(delete)
|
49
|
+
'ers'
|
50
|
+
(
|
51
|
+
among (
|
52
|
+
'amm' 'ast' 'ind' 'kap' 'kk' 'lt' 'nk' 'omm' 'pp' 'v'
|
53
|
+
'{o/}st'
|
54
|
+
()
|
55
|
+
'giv' 'hav' 'skap' ''
|
56
|
+
(delete)
|
57
|
+
)
|
58
|
+
)
|
45
59
|
's'
|
46
|
-
(s_ending or ('k' non-v) delete)
|
60
|
+
(s_ending or ('r' not 'e') or ('k' non-v) delete)
|
47
61
|
'erte' 'ert'
|
48
62
|
(<-'er')
|
49
63
|
)
|
@@ -38,11 +38,11 @@ backwardmode (
|
|
38
38
|
test gopast v delete
|
39
39
|
test substring among(
|
40
40
|
'at' 'bl' 'iz'
|
41
|
-
(
|
41
|
+
(insert 'e')
|
42
42
|
'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt'
|
43
43
|
// ignoring double c, h, j, k, q, v, w, and x
|
44
44
|
([next] delete)
|
45
|
-
'' (atmark p1 test shortv
|
45
|
+
'' (atmark p1 test shortv insert 'e')
|
46
46
|
)
|
47
47
|
)
|
48
48
|
)
|
@@ -25,7 +25,7 @@ stringdef i' '{U+00ED}' // i-acute
|
|
25
25
|
stringdef o^ '{U+00F4}' // o-circumflex
|
26
26
|
stringdef o' '{U+00F3}' // o-acute
|
27
27
|
stringdef u' '{U+00FA}' // u-acute
|
28
|
-
stringdef
|
28
|
+
stringdef cc '{U+00E7}' // c-cedilla
|
29
29
|
|
30
30
|
stringdef a~ '{U+00E3}' // a-tilde
|
31
31
|
stringdef o~ '{U+00F5}' // o-tilde
|
@@ -38,7 +38,7 @@ define prelude as repeat (
|
|
38
38
|
'{a~}' (<- 'a~')
|
39
39
|
'{o~}' (<- 'o~')
|
40
40
|
'' (next)
|
41
|
-
)
|
41
|
+
)
|
42
42
|
)
|
43
43
|
|
44
44
|
define mark_regions as (
|
@@ -64,7 +64,7 @@ define postlude as repeat (
|
|
64
64
|
'a~' (<- '{a~}')
|
65
65
|
'o~' (<- '{o~}')
|
66
66
|
'' (next)
|
67
|
-
)
|
67
|
+
)
|
68
68
|
)
|
69
69
|
|
70
70
|
backwardmode (
|
@@ -86,9 +86,9 @@ backwardmode (
|
|
86
86
|
'amento' 'amentos'
|
87
87
|
'imento' 'imentos'
|
88
88
|
|
89
|
-
'adora' 'ador' 'a{
|
90
|
-
'adoras' 'adores' 'a{
|
91
|
-
'ante' 'antes' '{a^}ncia'
|
89
|
+
'adora' 'ador' 'a{cc}a~o'
|
90
|
+
'adoras' 'adores' 'a{cc}o~es' // no -ic test
|
91
|
+
'ante' 'antes' '{a^}ncia'
|
92
92
|
(
|
93
93
|
R2 delete
|
94
94
|
)
|
@@ -97,7 +97,7 @@ backwardmode (
|
|
97
97
|
(
|
98
98
|
R2 <- 'log'
|
99
99
|
)
|
100
|
-
'u{
|
100
|
+
'u{cc}a~o' 'u{cc}o~es'
|
101
101
|
(
|
102
102
|
R2 <- 'u'
|
103
103
|
)
|
@@ -122,7 +122,7 @@ backwardmode (
|
|
122
122
|
R2 delete
|
123
123
|
try (
|
124
124
|
[substring] among(
|
125
|
-
'ante'
|
125
|
+
'ante'
|
126
126
|
'avel'
|
127
127
|
'{i'}vel' (R2 delete)
|
128
128
|
)
|
@@ -193,7 +193,7 @@ backwardmode (
|
|
193
193
|
'e' '{e'}' '{e^}'
|
194
194
|
( RV delete [('u'] test 'g') or
|
195
195
|
('i'] test 'c') RV delete )
|
196
|
-
'{
|
196
|
+
'{cc}' (<-'c')
|
197
197
|
)
|
198
198
|
)
|
199
199
|
)
|
@@ -212,7 +212,3 @@ define stem as (
|
|
212
212
|
)
|
213
213
|
do postlude
|
214
214
|
)
|
215
|
-
|
216
|
-
/*
|
217
|
-
Note 1: additions of 15 Jun 2005
|
218
|
-
*/
|