mittens 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +3 -3
- data/lib/mittens/version.rb +1 -1
- data/vendor/snowball/.github/workflows/ci.yml +216 -0
- data/vendor/snowball/CONTRIBUTING.rst +111 -62
- data/vendor/snowball/GNUmakefile +194 -136
- data/vendor/snowball/NEWS +798 -3
- data/vendor/snowball/README.rst +50 -1
- data/vendor/snowball/ada/src/stemmer.adb +25 -13
- data/vendor/snowball/ada/src/stemmer.ads +9 -9
- data/vendor/snowball/ada/stemmer_config.gpr +7 -7
- data/vendor/snowball/algorithms/basque.sbl +4 -19
- data/vendor/snowball/algorithms/catalan.sbl +2 -9
- data/vendor/snowball/algorithms/danish.sbl +1 -1
- data/vendor/snowball/algorithms/dutch.sbl +284 -122
- data/vendor/snowball/algorithms/dutch_porter.sbl +178 -0
- data/vendor/snowball/algorithms/english.sbl +52 -37
- data/vendor/snowball/algorithms/esperanto.sbl +157 -0
- data/vendor/snowball/algorithms/estonian.sbl +269 -0
- data/vendor/snowball/algorithms/finnish.sbl +2 -3
- data/vendor/snowball/algorithms/french.sbl +42 -16
- data/vendor/snowball/algorithms/german.sbl +35 -14
- data/vendor/snowball/algorithms/greek.sbl +76 -76
- data/vendor/snowball/algorithms/hungarian.sbl +8 -6
- data/vendor/snowball/algorithms/indonesian.sbl +14 -8
- data/vendor/snowball/algorithms/italian.sbl +11 -21
- data/vendor/snowball/algorithms/lithuanian.sbl +36 -37
- data/vendor/snowball/algorithms/lovins.sbl +0 -1
- data/vendor/snowball/algorithms/nepali.sbl +138 -37
- data/vendor/snowball/algorithms/norwegian.sbl +19 -5
- data/vendor/snowball/algorithms/porter.sbl +2 -2
- data/vendor/snowball/algorithms/portuguese.sbl +9 -13
- data/vendor/snowball/algorithms/romanian.sbl +17 -4
- data/vendor/snowball/algorithms/serbian.sbl +467 -468
- data/vendor/snowball/algorithms/spanish.sbl +5 -7
- data/vendor/snowball/algorithms/swedish.sbl +60 -6
- data/vendor/snowball/algorithms/tamil.sbl +207 -176
- data/vendor/snowball/algorithms/turkish.sbl +461 -445
- data/vendor/snowball/algorithms/yiddish.sbl +36 -38
- data/vendor/snowball/compiler/analyser.c +445 -192
- data/vendor/snowball/compiler/driver.c +109 -101
- data/vendor/snowball/compiler/generator.c +853 -464
- data/vendor/snowball/compiler/generator_ada.c +404 -366
- data/vendor/snowball/compiler/generator_csharp.c +297 -260
- data/vendor/snowball/compiler/generator_go.c +323 -254
- data/vendor/snowball/compiler/generator_java.c +326 -252
- data/vendor/snowball/compiler/generator_js.c +362 -252
- data/vendor/snowball/compiler/generator_pascal.c +349 -197
- data/vendor/snowball/compiler/generator_python.c +257 -240
- data/vendor/snowball/compiler/generator_rust.c +423 -251
- data/vendor/snowball/compiler/header.h +117 -71
- data/vendor/snowball/compiler/space.c +137 -68
- data/vendor/snowball/compiler/syswords.h +2 -2
- data/vendor/snowball/compiler/tokeniser.c +125 -107
- data/vendor/snowball/csharp/Snowball/Among.cs +14 -14
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +7 -7
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +57 -37
- data/vendor/snowball/csharp/Stemwords/App.config +2 -2
- data/vendor/snowball/csharp/Stemwords/Program.cs +16 -12
- data/vendor/snowball/doc/libstemmer_c_README +7 -4
- data/vendor/snowball/doc/libstemmer_csharp_README +4 -1
- data/vendor/snowball/doc/libstemmer_java_README +12 -1
- data/vendor/snowball/doc/libstemmer_js_README +6 -4
- data/vendor/snowball/doc/libstemmer_python_README +9 -4
- data/vendor/snowball/examples/stemwords.c +12 -12
- data/vendor/snowball/go/env.go +107 -31
- data/vendor/snowball/go/util.go +0 -4
- data/vendor/snowball/include/libstemmer.h +4 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +32 -15
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +347 -261
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +3 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +52 -37
- data/vendor/snowball/javascript/base-stemmer.js +186 -2
- data/vendor/snowball/javascript/stemwords.js +3 -6
- data/vendor/snowball/libstemmer/libstemmer_c.in +1 -1
- data/vendor/snowball/libstemmer/mkalgorithms.pl +6 -6
- data/vendor/snowball/libstemmer/mkmodules.pl +2 -2
- data/vendor/snowball/libstemmer/modules.txt +13 -10
- data/vendor/snowball/libstemmer/test.c +1 -1
- data/vendor/snowball/pascal/SnowballProgram.pas +84 -2
- data/vendor/snowball/pascal/generate.pl +13 -13
- data/vendor/snowball/python/create_init.py +4 -1
- data/vendor/snowball/python/setup.cfg +0 -3
- data/vendor/snowball/python/setup.py +8 -3
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +20 -54
- data/vendor/snowball/python/stemwords.py +8 -12
- data/vendor/snowball/runtime/api.c +10 -5
- data/vendor/snowball/runtime/header.h +10 -9
- data/vendor/snowball/runtime/utilities.c +9 -9
- data/vendor/snowball/rust/build.rs +1 -1
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +83 -5
- data/vendor/snowball/tests/stemtest.c +7 -4
- metadata +7 -7
- data/vendor/snowball/.travis.yml +0 -112
- data/vendor/snowball/algorithms/german2.sbl +0 -145
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +0 -240
- data/vendor/snowball/compiler/syswords2.h +0 -13
@@ -8,15 +8,15 @@ routines (
|
|
8
8
|
R1 R2
|
9
9
|
Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5
|
10
10
|
exception1
|
11
|
-
exception2
|
12
11
|
)
|
13
12
|
|
14
13
|
externals ( stem )
|
15
14
|
|
16
|
-
groupings ( v v_WXY valid_LI )
|
15
|
+
groupings ( aeo v v_WXY valid_LI )
|
17
16
|
|
18
17
|
stringescapes {}
|
19
18
|
|
19
|
+
define aeo 'aeo'
|
20
20
|
define v 'aeiouy'
|
21
21
|
define v_WXY v + 'wxY'
|
22
22
|
|
@@ -34,9 +34,14 @@ define mark_regions as (
|
|
34
34
|
$p2 = limit
|
35
35
|
do(
|
36
36
|
among (
|
37
|
-
'gener'
|
38
|
-
'commun' //
|
39
|
-
'arsen' //
|
37
|
+
'gener' // generate/general/generic/generous
|
38
|
+
'commun' // communication/communism/community
|
39
|
+
'arsen' // arsenic/arsenal
|
40
|
+
'past' // past/paste
|
41
|
+
'univers' // universe/universal/university
|
42
|
+
'later' // lateral/later
|
43
|
+
'emerg' // emerge/emergency
|
44
|
+
'organ' // organ/organic/organize
|
40
45
|
// ... extensions possible here ...
|
41
46
|
) or (gopast v gopast non-v)
|
42
47
|
setmark p1
|
@@ -50,6 +55,8 @@ backwardmode (
|
|
50
55
|
( non-v_WXY v non-v )
|
51
56
|
or
|
52
57
|
( non-v v atlimit )
|
58
|
+
or
|
59
|
+
( 'past' ) // pasted/pasting
|
53
60
|
)
|
54
61
|
|
55
62
|
define R1 as $p1 <= cursor
|
@@ -74,19 +81,44 @@ backwardmode (
|
|
74
81
|
define Step_1b as (
|
75
82
|
[substring] among (
|
76
83
|
'eed' 'eedly'
|
77
|
-
(R1 <-'ee')
|
78
|
-
'ed' 'edly' 'ing' 'ingly'
|
79
84
|
(
|
80
|
-
|
81
|
-
|
85
|
+
do (
|
86
|
+
among (
|
87
|
+
'proc' 'exc' 'succ'
|
88
|
+
(atlimit)
|
89
|
+
) or (
|
90
|
+
R1 <-'ee'
|
91
|
+
)
|
92
|
+
)
|
93
|
+
)
|
94
|
+
'ed' 'edly' 'ingly'
|
95
|
+
(false) // Handled below.
|
96
|
+
'ing'
|
97
|
+
( // Handle exceptional cases here, rest handled below.
|
98
|
+
among (
|
99
|
+
// dying->die, lying->lie, tying->tie, vying->vie
|
100
|
+
'y'
|
101
|
+
(test(non-v atlimit) ] <-'ie')
|
102
|
+
// Leave inning, outing, etc. alone.
|
103
|
+
'inn' 'out' 'cann' 'herr' 'earr' 'even'
|
104
|
+
(atlimit)
|
105
|
+
)
|
106
|
+
)
|
107
|
+
'' ()
|
108
|
+
) or (
|
109
|
+
// Handle 'ed' 'edly' 'ing' 'ingly'
|
110
|
+
test gopast v delete
|
111
|
+
[] test (
|
112
|
+
substring among(
|
82
113
|
'at' 'bl' 'iz'
|
83
|
-
(
|
114
|
+
(fail(<- 'e'))
|
84
115
|
'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt'
|
85
116
|
// ignoring double c, h, j, k, q, v, w, and x
|
86
|
-
(
|
87
|
-
'' (atmark p1 test shortv
|
117
|
+
(not (aeo atlimit))
|
118
|
+
'' (fail(atmark p1 test shortv <- 'e'))
|
88
119
|
)
|
89
120
|
)
|
121
|
+
[next] delete
|
90
122
|
)
|
91
123
|
)
|
92
124
|
|
@@ -116,6 +148,7 @@ backwardmode (
|
|
116
148
|
(<-'ive')
|
117
149
|
'biliti' 'bli'
|
118
150
|
(<-'ble')
|
151
|
+
'ogist' (<-'og')
|
119
152
|
'ogi' ('l' <-'og')
|
120
153
|
'fulli' (<-'ful')
|
121
154
|
'lessli' (<-'less')
|
@@ -133,7 +166,7 @@ backwardmode (
|
|
133
166
|
'ful' 'ness'
|
134
167
|
(delete)
|
135
168
|
'ative'
|
136
|
-
(R2 delete)
|
169
|
+
(R2 delete)
|
137
170
|
)
|
138
171
|
)
|
139
172
|
|
@@ -152,17 +185,6 @@ backwardmode (
|
|
152
185
|
'l' (R2 'l' delete)
|
153
186
|
)
|
154
187
|
)
|
155
|
-
|
156
|
-
define exception2 as (
|
157
|
-
|
158
|
-
[substring] atlimit among(
|
159
|
-
'inning' 'outing' 'canning' 'herring' 'earring'
|
160
|
-
'proceed' 'exceed' 'succeed'
|
161
|
-
|
162
|
-
// ... extensions possible here ...
|
163
|
-
|
164
|
-
)
|
165
|
-
)
|
166
188
|
)
|
167
189
|
|
168
190
|
define exception1 as (
|
@@ -171,11 +193,7 @@ define exception1 as (
|
|
171
193
|
|
172
194
|
/* special changes: */
|
173
195
|
|
174
|
-
'skis' (<-'ski')
|
175
196
|
'skies' (<-'sky')
|
176
|
-
'dying' (<-'die')
|
177
|
-
'lying' (<-'lie')
|
178
|
-
'tying' (<-'tie')
|
179
197
|
|
180
198
|
/* special -LY cases */
|
181
199
|
|
@@ -212,17 +230,14 @@ define stem as (
|
|
212
230
|
|
213
231
|
do Step_1a
|
214
232
|
|
215
|
-
|
216
|
-
|
217
|
-
do Step_1b
|
218
|
-
do Step_1c
|
233
|
+
do Step_1b
|
234
|
+
do Step_1c
|
219
235
|
|
220
|
-
|
221
|
-
|
222
|
-
|
236
|
+
do Step_2
|
237
|
+
do Step_3
|
238
|
+
do Step_4
|
223
239
|
|
224
|
-
|
225
|
-
)
|
240
|
+
do Step_5
|
226
241
|
)
|
227
242
|
do postlude
|
228
243
|
)
|
@@ -0,0 +1,157 @@
|
|
1
|
+
booleans ( foreign )
|
2
|
+
|
3
|
+
routines (
|
4
|
+
canonical_form
|
5
|
+
correlative
|
6
|
+
final_apostrophe
|
7
|
+
initial_apostrophe
|
8
|
+
long_word
|
9
|
+
merged_numeral
|
10
|
+
not_after_letter
|
11
|
+
pronoun
|
12
|
+
standard_suffix
|
13
|
+
ujn_suffix
|
14
|
+
uninflected
|
15
|
+
)
|
16
|
+
|
17
|
+
externals ( stem )
|
18
|
+
|
19
|
+
groupings ( vowel aou digit )
|
20
|
+
|
21
|
+
define vowel 'aeiou'
|
22
|
+
define aou 'aou'
|
23
|
+
define digit '0123456789'
|
24
|
+
|
25
|
+
stringescapes {}
|
26
|
+
|
27
|
+
stringdef c^ '{U+0109}'
|
28
|
+
stringdef g^ '{U+011D}'
|
29
|
+
stringdef h^ '{U+0125}'
|
30
|
+
stringdef j^ '{U+0135}'
|
31
|
+
stringdef s^ '{U+015D}'
|
32
|
+
stringdef u+ '{U+016D}'
|
33
|
+
|
34
|
+
stringdef a' '{U+00E1}'
|
35
|
+
stringdef e' '{U+00E9}'
|
36
|
+
stringdef i' '{U+00ED}'
|
37
|
+
stringdef o' '{U+00F3}'
|
38
|
+
stringdef u' '{U+00FA}'
|
39
|
+
|
40
|
+
define canonical_form as (
|
41
|
+
unset foreign
|
42
|
+
repeat (
|
43
|
+
[substring]
|
44
|
+
among(
|
45
|
+
'cx' (<- '{c^}')
|
46
|
+
'gx' (<- '{g^}')
|
47
|
+
'hx' (<- '{h^}')
|
48
|
+
'jx' (<- '{j^}')
|
49
|
+
'sx' (<- '{s^}')
|
50
|
+
'ux' (<- '{u+}')
|
51
|
+
'{a'}' (<- 'a' set foreign)
|
52
|
+
'{e'}' (<- 'e' set foreign)
|
53
|
+
'{i'}' (<- 'i' set foreign)
|
54
|
+
'{o'}' (<- 'o' set foreign)
|
55
|
+
'{u'}' (<- 'u' set foreign)
|
56
|
+
'q' 'w' 'x' 'y' (set foreign)
|
57
|
+
'-' (unset foreign)
|
58
|
+
'' (next)
|
59
|
+
)
|
60
|
+
)
|
61
|
+
not foreign
|
62
|
+
)
|
63
|
+
|
64
|
+
define initial_apostrophe as (
|
65
|
+
['{'}'] 'st' among('as' 'i' 'is' 'os' 'u' 'us') atlimit <- 'e'
|
66
|
+
)
|
67
|
+
|
68
|
+
backwardmode (
|
69
|
+
define pronoun as (
|
70
|
+
[try 'n']
|
71
|
+
among(
|
72
|
+
'ci' 'gi' '{g^}i' 'hi' 'ili' 'i{s^}i' 'ivi' 'li' 'mal{s^}i' 'mi' 'ni'
|
73
|
+
'oni' 'ri' 'si' '{s^}i' '{s^}li' 'vi'
|
74
|
+
)
|
75
|
+
(atlimit or '-') delete
|
76
|
+
)
|
77
|
+
|
78
|
+
define final_apostrophe as (
|
79
|
+
['{'}']
|
80
|
+
('l' atlimit <- 'a') or
|
81
|
+
('un' atlimit <- 'u') or
|
82
|
+
(
|
83
|
+
among(
|
84
|
+
'adi' 'almen' 'amb' 'ank' 'ankor' 'anstat' 'anta{u+}hier' 'apen'
|
85
|
+
'bald' '{c^}irk' 'hier' 'hodi' 'kontr' 'kvaz' 'malbald' 'malgr'
|
86
|
+
'morg' 'postmorg' 'presk' 'tut{c^}irk'
|
87
|
+
) (atlimit or '-') <- 'a{u+}'
|
88
|
+
) or
|
89
|
+
(<- 'o')
|
90
|
+
)
|
91
|
+
|
92
|
+
define ujn_suffix as (
|
93
|
+
[try 'n' try 'j'] among('aliu' 'unu') (atlimit or '-') delete
|
94
|
+
)
|
95
|
+
|
96
|
+
define uninflected as (
|
97
|
+
among(
|
98
|
+
'aha' 'amen' 'dirlididi' 'disde' 'ehe' 'ekde' 'elde' 'haha'
|
99
|
+
'haleluja' 'hola' 'hosana' 'hura' '{h^}a{h^}a' 'mal{c^}i' 'malkaj'
|
100
|
+
'malpli' 'maltra' 'maltre' 'maltro' 'minus' 'muu' 'oho' 'tamen'
|
101
|
+
'uhu'
|
102
|
+
)
|
103
|
+
(atlimit or '-')
|
104
|
+
)
|
105
|
+
|
106
|
+
define merged_numeral as (
|
107
|
+
among('du' 'tri' 'unu')
|
108
|
+
among('cent' 'dek')
|
109
|
+
)
|
110
|
+
|
111
|
+
define correlative as (
|
112
|
+
[]
|
113
|
+
// Ignore -al, -am, etc. since they can't be confused with suffixes.
|
114
|
+
test (
|
115
|
+
((try 'n'] 'e') or (try 'n' try 'j'] aou))
|
116
|
+
'i'
|
117
|
+
try among('{c^}' 'k' 'kelk' 'mult' 'nen' 'samt' 't')
|
118
|
+
(atlimit or '-')
|
119
|
+
)
|
120
|
+
delete
|
121
|
+
)
|
122
|
+
|
123
|
+
define long_word as (
|
124
|
+
loop 2 gopast vowel or (gopast '-' next) or gopast digit
|
125
|
+
)
|
126
|
+
|
127
|
+
define not_after_letter as ('-' or digit)
|
128
|
+
|
129
|
+
define standard_suffix as (
|
130
|
+
[substring try '-']
|
131
|
+
among(
|
132
|
+
'a' 'aj' 'ajn' 'an'
|
133
|
+
'e' 'en'
|
134
|
+
'i' 'as' 'is' 'os' 'u' 'us'
|
135
|
+
'o' 'oj' 'ojn' 'on'
|
136
|
+
'j' not_after_letter
|
137
|
+
'jn' not_after_letter
|
138
|
+
'n' not_after_letter
|
139
|
+
)
|
140
|
+
delete
|
141
|
+
)
|
142
|
+
)
|
143
|
+
|
144
|
+
define stem as (
|
145
|
+
test canonical_form
|
146
|
+
do initial_apostrophe
|
147
|
+
backwards (
|
148
|
+
not pronoun
|
149
|
+
do final_apostrophe
|
150
|
+
not correlative
|
151
|
+
not uninflected
|
152
|
+
not merged_numeral
|
153
|
+
not ujn_suffix
|
154
|
+
test long_word
|
155
|
+
standard_suffix
|
156
|
+
)
|
157
|
+
)
|
@@ -0,0 +1,269 @@
|
|
1
|
+
/* Estonian stemmer
|
2
|
+
|
3
|
+
Made by Linda Freienthal in January 2019.
|
4
|
+
|
5
|
+
*/
|
6
|
+
|
7
|
+
routines (
|
8
|
+
mark_regions
|
9
|
+
LONGV
|
10
|
+
special_noun_endings
|
11
|
+
case_ending
|
12
|
+
emphasis
|
13
|
+
plural_three_first_cases
|
14
|
+
undouble_kpt
|
15
|
+
i_plural
|
16
|
+
degrees
|
17
|
+
substantive
|
18
|
+
verb_exceptions
|
19
|
+
verb
|
20
|
+
nu
|
21
|
+
)
|
22
|
+
|
23
|
+
stringescapes {}
|
24
|
+
|
25
|
+
stringdef a" '{U+00E4}' //a-umlaut ä
|
26
|
+
stringdef o" '{U+00F6}' //o-umlaut ö
|
27
|
+
stringdef o~ '{U+00F5}' //o with tilde õ
|
28
|
+
stringdef u" '{U+00FC}' //u-umlaut ü
|
29
|
+
stringdef sv '{U+0161}' //s-caron š
|
30
|
+
stringdef zv '{U+017E}' //z-caron ž
|
31
|
+
|
32
|
+
externals ( stem )
|
33
|
+
integers ( p1 )
|
34
|
+
groupings ( V1 RV KI GI)
|
35
|
+
|
36
|
+
define V1 'aeiou{o~}{a"}{o"}{u"}'
|
37
|
+
define RV 'aeiuo'
|
38
|
+
define KI 'kptgbdshf{sv}z{zv}'
|
39
|
+
define GI 'cjlmnqrvwxaeiou{o~}{a"}{o"}{u"}'
|
40
|
+
|
41
|
+
define mark_regions as (
|
42
|
+
|
43
|
+
$p1 = limit
|
44
|
+
|
45
|
+
gopast V1 gopast non-V1 setmark p1
|
46
|
+
)
|
47
|
+
|
48
|
+
|
49
|
+
backwardmode (
|
50
|
+
|
51
|
+
define emphasis as (
|
52
|
+
setlimit tomark p1 for ([substring])
|
53
|
+
test hop 4 //kingi -> kingi
|
54
|
+
among(
|
55
|
+
'gi' ((GI and not LONGV) delete) //jookse-me-gi, bioloogi -> bioloogi
|
56
|
+
'ki' (KI delete) //kookki -> kook
|
57
|
+
)
|
58
|
+
|
59
|
+
)
|
60
|
+
|
61
|
+
// Signals t if a replacement was made; f otherwise.
|
62
|
+
define verb as (
|
63
|
+
setlimit tomark p1 for ([substring])
|
64
|
+
among(
|
65
|
+
'nuksin' 'nuksime' 'nuksid' 'nuksite' (delete) //seleta-nuksite
|
66
|
+
'ksin' 'ksid' 'ksime' 'ksite' (delete) //personal conditional: rõõmusta-ksin
|
67
|
+
'mata' (delete)
|
68
|
+
'takse' 'dakse' (delete) //impersonal: laul-dakse, luba-takse
|
69
|
+
'taks' 'daks' (delete) //impersonal conditional: laul-daks, saade-taks
|
70
|
+
'akse' (<-'a') //impersonal: tulla-kse, süüa-kse (-> söö), teha-kse (-> tegi), püüta-kse, leita-kse
|
71
|
+
'sime' (delete) //pl1pst: saat-sime
|
72
|
+
'site' (delete) //pl2pst: saat-site
|
73
|
+
'sin' (delete) //sg1pst: laul-sin, saat-sin
|
74
|
+
'me' (V1 delete) //pl1prs: laula-me, tule-me
|
75
|
+
'da' (V1 delete) //da-infinitive: luba-da
|
76
|
+
'n' (V1 delete) //sg1prs: kirjuta-n
|
77
|
+
'b' (V1 delete) //sg3prs: laula-b
|
78
|
+
)
|
79
|
+
)
|
80
|
+
|
81
|
+
define LONGV as
|
82
|
+
among('aa' 'ee' 'ii' 'oo' 'uu' '{a"}{a"}' '{o"}{o"}' '{u"}{u"}' '{o~}{o~}')
|
83
|
+
|
84
|
+
define i_plural as (
|
85
|
+
setlimit tomark p1 for ([substring])
|
86
|
+
among(
|
87
|
+
'i' (RV) //raama-tu-i, lapsiku-i
|
88
|
+
)
|
89
|
+
delete
|
90
|
+
)
|
91
|
+
|
92
|
+
define special_noun_endings as (
|
93
|
+
setlimit tomark p1 for ([substring])
|
94
|
+
among(
|
95
|
+
'lasse' (<- 'lase') //teadlasse -> teadlase
|
96
|
+
'last' (<- 'lase') //teadlast -> teadlase
|
97
|
+
'lane' (<- 'lase') //teadlane -> teadlase
|
98
|
+
'lasi'(<- 'lase') //teadlasi -> teadlase
|
99
|
+
'misse' (<- 'mise') //tegemisse -> tegemise
|
100
|
+
'mist' (<- 'mise') //kasutamist -> kasutamise
|
101
|
+
'mine' (<- 'mise') //tegemine -> tegemise
|
102
|
+
'misi' (<- 'mise') //kasutamisi -> kasutamise
|
103
|
+
'lisse' (<- 'lise') //rohelisse -> rohelise
|
104
|
+
'list' (<- 'lise') //tavalist -> tavalise
|
105
|
+
'line' (<- 'lise') //roheline -> rohelise
|
106
|
+
'lisi' (<- 'lise') //tavalisi -> tavalise
|
107
|
+
)
|
108
|
+
)
|
109
|
+
|
110
|
+
define case_ending as (
|
111
|
+
setlimit tomark p1 for ([substring])
|
112
|
+
among(
|
113
|
+
'sse' (RV or LONGV) //illative: saapa-sse
|
114
|
+
'st' (RV or LONGV) //elative: saapa-st and kapsas-t
|
115
|
+
'le' (RV or LONGV) //allative: raama-tu-le
|
116
|
+
'lt' (RV or LONGV) //ablative: raama-tu-lt
|
117
|
+
'ga' (RV or LONGV) //comitative: õpetaja-ga
|
118
|
+
'ks' (RV or LONGV) //translative: õpetaja-ks
|
119
|
+
'ta' (RV or LONGV) //abessive and da-infinitive: õpetaja-ta and hüpa-ta
|
120
|
+
't' (test hop 4) //partitive: raamatu-t
|
121
|
+
's' (RV or LONGV) //inessive and sg3pst: raama-tu-s and sõiti-s
|
122
|
+
'l' (RV or LONGV) //adessive: raama-tu-l and kapsa-l.
|
123
|
+
)
|
124
|
+
delete
|
125
|
+
)
|
126
|
+
|
127
|
+
|
128
|
+
define plural_three_first_cases as (
|
129
|
+
setlimit tomark p1 for ([substring])
|
130
|
+
among(
|
131
|
+
'ikkude' (<-'iku') //plural genitive: õnnelikkude -> õnneliku
|
132
|
+
'ikke' (<-'iku') //plural partitive: rahulikke -> rahuliku
|
133
|
+
'ike' (<-'iku') //plural genitive: ohtlike -> ohtliku
|
134
|
+
'sid' (not LONGV delete) //plural partitive and sg2pst and pl3pst: auto-sid and laul-sid (excludes plural nominative with words like gaasid, roosid)
|
135
|
+
// plural genitive and pl2: ministri-te, oluliste -> olulise and saada-te, laula-te;
|
136
|
+
// also torte -> tort (if not in compound word) and kokkuvõtte -> kokkuvõte and roheliste -> rohelise, tegemiste -> tegemise, teadlaste -> teadlase
|
137
|
+
'te' (
|
138
|
+
(test hop 4
|
139
|
+
among (
|
140
|
+
'mis' 'las' 'lis' (<- 'e')
|
141
|
+
't' ()
|
142
|
+
'' (delete)
|
143
|
+
)
|
144
|
+
) or <- 't'
|
145
|
+
)
|
146
|
+
'de' ((RV or LONGV) delete) //plural genitive: lauda-de
|
147
|
+
'd' ((RV or LONGV) delete) //plural nominative: voodi-d, rattai-d (rata), lapsiku-i-d
|
148
|
+
)
|
149
|
+
)
|
150
|
+
|
151
|
+
define nu as (
|
152
|
+
setlimit tomark p1 for ([substring])
|
153
|
+
among(
|
154
|
+
'nu' //haka-nu(-te-ga)
|
155
|
+
'tu' //luba-tu(-d)
|
156
|
+
'du' //laul-du(-te-st)
|
157
|
+
'va' //laul-va(-te-le)
|
158
|
+
)
|
159
|
+
delete
|
160
|
+
)
|
161
|
+
|
162
|
+
define undouble_kpt as (
|
163
|
+
// undouble '-C1C1V' where C1 is k, p or t:
|
164
|
+
// mõtte(-le) -> mõte, hakka(-n) -> haka
|
165
|
+
//
|
166
|
+
// We only undouble if the vowel is in R1 to avoid modifying short
|
167
|
+
// non-words (mostly to avoid modifying acronyms/initialisms such
|
168
|
+
// as "PPE").
|
169
|
+
V1 $(p1 <= cursor)
|
170
|
+
[substring] among(
|
171
|
+
'kk' (<- 'k')
|
172
|
+
'pp' (<- 'p')
|
173
|
+
'tt' (<- 't')
|
174
|
+
)
|
175
|
+
)
|
176
|
+
|
177
|
+
define degrees as (
|
178
|
+
setlimit tomark p1 for ([substring])
|
179
|
+
among(
|
180
|
+
'mai' (RV delete) //heleda-mai(-le)
|
181
|
+
'ma' (delete) //tuge-va-ma(-le) and ma-infinitive: sõit-ma
|
182
|
+
'm' (RV delete) //kauge-i-m, rõõmsa-m
|
183
|
+
)
|
184
|
+
)
|
185
|
+
|
186
|
+
define substantive as (
|
187
|
+
do special_noun_endings
|
188
|
+
do case_ending
|
189
|
+
do plural_three_first_cases
|
190
|
+
do degrees
|
191
|
+
do i_plural
|
192
|
+
do nu
|
193
|
+
)
|
194
|
+
)
|
195
|
+
|
196
|
+
|
197
|
+
define verb_exceptions as (
|
198
|
+
[substring] atlimit
|
199
|
+
among(
|
200
|
+
'joon' 'jood' 'joob' 'joote' 'joome' 'joovad' (<-'joo')
|
201
|
+
'j{o~}in' 'j{o~}id' 'j{o~}i' 'j{o~}ime' 'j{o~}ite' (<-'joo')
|
202
|
+
'joomata' 'juuakse' 'joodakse' 'juua' 'jooma' (<- 'joo')
|
203
|
+
'saan' 'saad' 'saab' 'saate' 'saame' 'saavad' (<-'saa')
|
204
|
+
'saaksin' 'saaksid' 'saaks' 'saaksite' 'saaksime' (<-'saa')
|
205
|
+
'sain' 'said' 'sai' 'saite' 'saime' (<-'saa')
|
206
|
+
'saamata' 'saadakse' 'saadi' 'saama' 'saada' (<-'saa')
|
207
|
+
'viin' 'viid' 'viib' 'viite' 'viime' 'viivad' (<-'viima')
|
208
|
+
'viiksin' 'viiksid' 'viiks' 'viiksite' 'viiksime' (<-'viima')
|
209
|
+
'viisin' 'viisite' 'viisime' (<-'viima')
|
210
|
+
'viimata' 'viiakse' 'viidi' 'viima' 'viia' (<-'viima')
|
211
|
+
'keen' 'keeb' 'keed' 'kees' 'keeme' 'keete' 'keevad' (<-'keesi')
|
212
|
+
'keeksin' 'keeks' 'keeksid' 'keeksime' 'keeksite' (<-'keesi')
|
213
|
+
'keemata' 'keema' 'keeta' 'keedakse' (<-'keesi')
|
214
|
+
'l{o"}{o"}n' 'l{o"}{o"}d' 'l{o"}{o"}b' 'l{o"}{o"}me' 'l{o"}{o"}te' 'l{o"}{o"}vad' (<-'l{o"}{o"}')
|
215
|
+
'l{o"}{o"}ksin' 'l{o"}{o"}ksid' 'l{o"}{o"}ks' 'l{o"}{o"}ksime' 'l{o"}{o"}ksite' (<-'l{o"}{o"}')
|
216
|
+
'l{o"}{o"}mata' 'l{u"}{u"}akse' 'l{o"}{o"}dakse' 'l{o"}{o"}di' 'l{o"}{o"}ma' 'l{u"}{u"}a' (<-'l{o"}{o"}')
|
217
|
+
// Both looma and lööma have these same past tense forms
|
218
|
+
'l{o~}in' 'l{o~}id' 'l{o~}i' 'l{o~}ime' 'l{o~}ite' (<-'l{o~}i')
|
219
|
+
'loon' 'lood' 'loob' 'loome' 'loote' 'loovad' (<-'loo')
|
220
|
+
'looksin' 'looksid' 'looks' 'looksime' 'looksite' (<-'loo')
|
221
|
+
'loomata' 'luuakse' 'loodi' 'luua' 'looma' (<-'loo')
|
222
|
+
'k{a"}in' 'k{a"}ib' 'k{a"}id' 'k{a"}is' 'k{a"}ime' 'k{a"}ite' 'k{a"}ivad' (<-'k{a"}isi')
|
223
|
+
'k{a"}iksin' 'k{a"}iks' 'k{a"}iksid' 'k{a"}iksime' 'k{a"}iksite' (<-'k{a"}isi')
|
224
|
+
'k{a"}imata' 'k{a"}iakse' 'k{a"}idi' 'k{a"}ia' 'k{a"}ima' (<-'k{a"}isi')
|
225
|
+
's{o"}{o"}n' 's{o"}{o"}b' 's{o"}{o"}d' 's{o"}{o"}me' 's{o"}{o"}te' 's{o"}{o"}vad' (<-'s{o"}{o"}')
|
226
|
+
's{o"}{o"}ksin' 's{o"}{o"}ks' 's{o"}{o"}ksid' 's{o"}{o"}ksime' 's{o"}{o"}ksite' (<-'s{o"}{o"}')
|
227
|
+
's{o~}in' 's{o~}i' 's{o~}id' 's{o~}ime' 's{o~}ite' (<-'s{o"}{o"}')
|
228
|
+
's{o"}{o"}mata' 's{u"}{u"}akse' 's{o"}{o"}dakse' 's{o"}{o"}di' 's{o"}{o"}ma' 's{u"}{u"}a' (<-'s{o"}{o"}')
|
229
|
+
'toon' 'tood' 'toob' 'toote' 'toome' 'toovad' (<-'too')
|
230
|
+
'tooksin' 'tooksid' 'tooks' 'tooksite' 'tooksime' (<-'too')
|
231
|
+
't{o~}in' 't{o~}id' 't{o~}i' 't{o~}ime' 't{o~}ite' (<-'too')
|
232
|
+
'toomata' 'tuuakse' 'toodi' 'tooma' 'tuua' (<-'too')
|
233
|
+
'v{o~}in' 'v{o~}id' 'v{o~}ib' 'v{o~}ime' 'v{o~}is' 'v{o~}ite' 'v{o~}ivad' (<-'v{o~}isi')
|
234
|
+
'v{o~}iksin' 'v{o~}iksid' 'v{o~}iks' 'v{o~}iksime' 'v{o~}iksite' (<-'v{o~}isi')
|
235
|
+
'v{o~}imata' 'v{o~}idakse' 'v{o~}idi' 'v{o~}ida' 'v{o~}ima' (<-'v{o~}isi')
|
236
|
+
'j{a"}{a"}n' 'j{a"}{a"}d' 'j{a"}{a"}b' 'j{a"}{a"}me' 'j{a"}{a"}te' 'j{a"}{a"}vad' (<-'j{a"}{a"}ma')
|
237
|
+
'j{a"}{a"}ksin' 'j{a"}{a"}ksid' 'j{a"}{a"}ks' 'j{a"}{a"}ksime' 'j{a"}{a"}ksite' (<-'j{a"}{a"}ma')
|
238
|
+
'j{a"}ime' 'j{a"}ite' 'j{a"}in' 'j{a"}id' 'j{a"}i' (<-'j{a"}{a"}ma')
|
239
|
+
'j{a"}{a"}mata' 'j{a"}{a"}dakse' 'j{a"}{a"}da' 'j{a"}{a"}ma' 'j{a"}{a"}di' (<-'j{a"}{a"}ma')
|
240
|
+
'm{u"}{u"}n' 'm{u"}{u"}d' 'm{u"}{u"}b' 'm{u"}{u"}s' 'm{u"}{u"}me' 'm{u"}{u"}te' 'm{u"}{u"}vad' (<-'m{u"}{u"}si')
|
241
|
+
'm{u"}{u"}ksin' 'm{u"}{u"}ksid' 'm{u"}{u"}ks' 'm{u"}{u"}ksime' 'm{u"}{u"}ksite' (<-'m{u"}{u"}si')
|
242
|
+
'm{u"}{u"}mata' 'm{u"}{u"}akse' 'm{u"}{u"}di' 'm{u"}{u"}a' 'm{u"}{u"}ma' (<-'m{u"}{u"}si')
|
243
|
+
'loeb' 'loen' 'loed' 'loeme' 'loete' 'loevad' (<- 'luge')
|
244
|
+
'loeks' 'loeksin' 'loeksid' 'loeksime' 'loeksite' (<- 'luge')
|
245
|
+
'p{o~}en' 'p{o~}eb' 'p{o~}ed' 'p{o~}eme' 'p{o~}ete' 'p{o~}evad' (<- 'p{o~}de')
|
246
|
+
'p{o~}eksin' 'p{o~}eks' 'p{o~}eksid' 'p{o~}eksime' 'p{o~}eksite' (<- 'p{o~}de')
|
247
|
+
'laon' 'laob' 'laod' 'laome' 'laote' 'laovad' (<- 'ladu')
|
248
|
+
'laoksin' 'laoks' 'laoksid' 'laoksime' 'laoksite' (<- 'ladu')
|
249
|
+
'teeksin' 'teeks' 'teeksid' 'teeksime' 'teeksite' (<- 'tegi')
|
250
|
+
'teen' 'teeb' 'teed' 'teeme' 'teete' 'teevad' (<- 'tegi')
|
251
|
+
'tegemata' 'tehakse' 'tehti' 'tegema' 'teha' (<-'tegi')
|
252
|
+
'n{a"}en' 'n{a"}eb' 'n{a"}ed' 'n{a"}eme' 'n{a"}ete' 'n{a"}evad' (<-'n{a"}gi')
|
253
|
+
'n{a"}eksin' 'n{a"}eks' 'n{a"}eksid' 'n{a"}eksime' 'n{a"}eksite' (<-'n{a"}gi')
|
254
|
+
'n{a"}gemata' 'n{a"}hakse' 'n{a"}hti' 'n{a"}ha' 'n{a"}gema' (<-'n{a"}gi')
|
255
|
+
)
|
256
|
+
)
|
257
|
+
|
258
|
+
|
259
|
+
define stem as (
|
260
|
+
not verb_exceptions
|
261
|
+
// p1 isn't used by verb_exceptions
|
262
|
+
do mark_regions
|
263
|
+
backwards (
|
264
|
+
do emphasis
|
265
|
+
do ( verb or substantive )
|
266
|
+
do undouble_kpt
|
267
|
+
|
268
|
+
)
|
269
|
+
)
|
@@ -44,8 +44,8 @@ define mark_regions as (
|
|
44
44
|
$p1 = limit
|
45
45
|
$p2 = limit
|
46
46
|
|
47
|
-
|
48
|
-
|
47
|
+
gopast V1 gopast non-V1 setmark p1
|
48
|
+
gopast V1 gopast non-V1 setmark p2
|
49
49
|
)
|
50
50
|
|
51
51
|
backwardmode (
|
@@ -194,4 +194,3 @@ define stem as (
|
|
194
194
|
do tidy
|
195
195
|
)
|
196
196
|
)
|
197
|
-
|