mittens 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +3 -3
- data/lib/mittens/version.rb +1 -1
- data/vendor/snowball/.github/workflows/ci.yml +216 -0
- data/vendor/snowball/CONTRIBUTING.rst +111 -62
- data/vendor/snowball/GNUmakefile +194 -136
- data/vendor/snowball/NEWS +798 -3
- data/vendor/snowball/README.rst +50 -1
- data/vendor/snowball/ada/src/stemmer.adb +25 -13
- data/vendor/snowball/ada/src/stemmer.ads +9 -9
- data/vendor/snowball/ada/stemmer_config.gpr +7 -7
- data/vendor/snowball/algorithms/basque.sbl +4 -19
- data/vendor/snowball/algorithms/catalan.sbl +2 -9
- data/vendor/snowball/algorithms/danish.sbl +1 -1
- data/vendor/snowball/algorithms/dutch.sbl +284 -122
- data/vendor/snowball/algorithms/dutch_porter.sbl +178 -0
- data/vendor/snowball/algorithms/english.sbl +52 -37
- data/vendor/snowball/algorithms/esperanto.sbl +157 -0
- data/vendor/snowball/algorithms/estonian.sbl +269 -0
- data/vendor/snowball/algorithms/finnish.sbl +2 -3
- data/vendor/snowball/algorithms/french.sbl +42 -16
- data/vendor/snowball/algorithms/german.sbl +35 -14
- data/vendor/snowball/algorithms/greek.sbl +76 -76
- data/vendor/snowball/algorithms/hungarian.sbl +8 -6
- data/vendor/snowball/algorithms/indonesian.sbl +14 -8
- data/vendor/snowball/algorithms/italian.sbl +11 -21
- data/vendor/snowball/algorithms/lithuanian.sbl +36 -37
- data/vendor/snowball/algorithms/lovins.sbl +0 -1
- data/vendor/snowball/algorithms/nepali.sbl +138 -37
- data/vendor/snowball/algorithms/norwegian.sbl +19 -5
- data/vendor/snowball/algorithms/porter.sbl +2 -2
- data/vendor/snowball/algorithms/portuguese.sbl +9 -13
- data/vendor/snowball/algorithms/romanian.sbl +17 -4
- data/vendor/snowball/algorithms/serbian.sbl +467 -468
- data/vendor/snowball/algorithms/spanish.sbl +5 -7
- data/vendor/snowball/algorithms/swedish.sbl +60 -6
- data/vendor/snowball/algorithms/tamil.sbl +207 -176
- data/vendor/snowball/algorithms/turkish.sbl +461 -445
- data/vendor/snowball/algorithms/yiddish.sbl +36 -38
- data/vendor/snowball/compiler/analyser.c +445 -192
- data/vendor/snowball/compiler/driver.c +109 -101
- data/vendor/snowball/compiler/generator.c +853 -464
- data/vendor/snowball/compiler/generator_ada.c +404 -366
- data/vendor/snowball/compiler/generator_csharp.c +297 -260
- data/vendor/snowball/compiler/generator_go.c +323 -254
- data/vendor/snowball/compiler/generator_java.c +326 -252
- data/vendor/snowball/compiler/generator_js.c +362 -252
- data/vendor/snowball/compiler/generator_pascal.c +349 -197
- data/vendor/snowball/compiler/generator_python.c +257 -240
- data/vendor/snowball/compiler/generator_rust.c +423 -251
- data/vendor/snowball/compiler/header.h +117 -71
- data/vendor/snowball/compiler/space.c +137 -68
- data/vendor/snowball/compiler/syswords.h +2 -2
- data/vendor/snowball/compiler/tokeniser.c +125 -107
- data/vendor/snowball/csharp/Snowball/Among.cs +14 -14
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +7 -7
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +57 -37
- data/vendor/snowball/csharp/Stemwords/App.config +2 -2
- data/vendor/snowball/csharp/Stemwords/Program.cs +16 -12
- data/vendor/snowball/doc/libstemmer_c_README +7 -4
- data/vendor/snowball/doc/libstemmer_csharp_README +4 -1
- data/vendor/snowball/doc/libstemmer_java_README +12 -1
- data/vendor/snowball/doc/libstemmer_js_README +6 -4
- data/vendor/snowball/doc/libstemmer_python_README +9 -4
- data/vendor/snowball/examples/stemwords.c +12 -12
- data/vendor/snowball/go/env.go +107 -31
- data/vendor/snowball/go/util.go +0 -4
- data/vendor/snowball/include/libstemmer.h +4 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +32 -15
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +347 -261
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +3 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +52 -37
- data/vendor/snowball/javascript/base-stemmer.js +186 -2
- data/vendor/snowball/javascript/stemwords.js +3 -6
- data/vendor/snowball/libstemmer/libstemmer_c.in +1 -1
- data/vendor/snowball/libstemmer/mkalgorithms.pl +6 -6
- data/vendor/snowball/libstemmer/mkmodules.pl +2 -2
- data/vendor/snowball/libstemmer/modules.txt +13 -10
- data/vendor/snowball/libstemmer/test.c +1 -1
- data/vendor/snowball/pascal/SnowballProgram.pas +84 -2
- data/vendor/snowball/pascal/generate.pl +13 -13
- data/vendor/snowball/python/create_init.py +4 -1
- data/vendor/snowball/python/setup.cfg +0 -3
- data/vendor/snowball/python/setup.py +8 -3
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +20 -54
- data/vendor/snowball/python/stemwords.py +8 -12
- data/vendor/snowball/runtime/api.c +10 -5
- data/vendor/snowball/runtime/header.h +10 -9
- data/vendor/snowball/runtime/utilities.c +9 -9
- data/vendor/snowball/rust/build.rs +1 -1
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +83 -5
- data/vendor/snowball/tests/stemtest.c +7 -4
- metadata +7 -7
- data/vendor/snowball/.travis.yml +0 -112
- data/vendor/snowball/algorithms/german2.sbl +0 -145
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +0 -240
- data/vendor/snowball/compiler/syswords2.h +0 -13
@@ -1,164 +1,326 @@
|
|
1
|
+
// Dutch stemming algorithm developed by Wessel Kraaij and Renée Pohlmann
|
2
|
+
|
3
|
+
strings ( ch )
|
4
|
+
integers ( p1 p2 )
|
5
|
+
booleans ( stemmed GE_removed )
|
6
|
+
|
1
7
|
routines (
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
8
|
+
|
9
|
+
R1 R2
|
10
|
+
C V VX
|
11
|
+
lengthen_V
|
12
|
+
Step_1 Step_2 Step_3 Step_4 Step_7
|
13
|
+
Step_6 Step_1c
|
14
|
+
Lose_prefix
|
15
|
+
Lose_infix
|
16
|
+
measure
|
9
17
|
)
|
10
18
|
|
11
19
|
externals ( stem )
|
12
20
|
|
13
|
-
|
14
|
-
|
15
|
-
integers ( p1 p2 )
|
16
|
-
|
17
|
-
groupings ( v v_I v_j )
|
21
|
+
groupings ( v v_WX A AEIOU AIOU E I O U )
|
18
22
|
|
19
23
|
stringescapes {}
|
20
24
|
|
21
25
|
/* special characters */
|
22
26
|
|
23
|
-
stringdef a
|
24
|
-
stringdef e" '{U+00EB}'
|
25
|
-
stringdef i" '{U+00EF}'
|
26
|
-
stringdef o" '{U+00F6}'
|
27
|
-
stringdef u" '{U+00FC}'
|
28
|
-
|
27
|
+
stringdef a` '{U+00E0}'
|
29
28
|
stringdef a' '{U+00E1}'
|
29
|
+
stringdef a^ '{U+00E2}'
|
30
|
+
stringdef a" '{U+00E4}'
|
31
|
+
stringdef e` '{U+00E8}'
|
30
32
|
stringdef e' '{U+00E9}'
|
33
|
+
stringdef e^ '{U+00EA}'
|
34
|
+
stringdef e" '{U+00EB}'
|
35
|
+
stringdef i` '{U+00EC}'
|
31
36
|
stringdef i' '{U+00ED}'
|
37
|
+
stringdef i^ '{U+00EE}'
|
38
|
+
stringdef i" '{U+00EF}'
|
39
|
+
stringdef o` '{U+00F2}'
|
32
40
|
stringdef o' '{U+00F3}'
|
41
|
+
stringdef o^ '{U+00F4}'
|
42
|
+
stringdef o" '{U+00F6}'
|
43
|
+
stringdef u` '{U+00F9}'
|
33
44
|
stringdef u' '{U+00FA}'
|
45
|
+
stringdef u^ '{U+00FB}'
|
46
|
+
stringdef u" '{U+00FC}'
|
34
47
|
|
35
|
-
|
36
|
-
|
37
|
-
define
|
38
|
-
define
|
39
|
-
define
|
40
|
-
|
41
|
-
define prelude as (
|
42
|
-
test repeat (
|
43
|
-
[substring] among(
|
44
|
-
'{a"}' '{a'}'
|
45
|
-
(<- 'a')
|
46
|
-
'{e"}' '{e'}'
|
47
|
-
(<- 'e')
|
48
|
-
'{i"}' '{i'}'
|
49
|
-
(<- 'i')
|
50
|
-
'{o"}' '{o'}'
|
51
|
-
(<- 'o')
|
52
|
-
'{u"}' '{u'}'
|
53
|
-
(<- 'u')
|
54
|
-
'' (next)
|
55
|
-
) //or next
|
56
|
-
)
|
57
|
-
try(['y'] <- 'Y')
|
58
|
-
repeat goto (
|
59
|
-
v [('i'] v <- 'I') or
|
60
|
-
('y'] <- 'Y')
|
61
|
-
)
|
62
|
-
)
|
63
|
-
|
64
|
-
define mark_regions as (
|
65
|
-
|
66
|
-
$p1 = limit
|
67
|
-
$p2 = limit
|
48
|
+
define A 'a{a"}{a'}{a`}{a^}'
|
49
|
+
define E 'e{e"}{e'}{e`}{e^}'
|
50
|
+
define I 'i{i"}{i'}{i`}{i^}'
|
51
|
+
define O 'o{o"}{o'}{o`}{o^}'
|
52
|
+
define U 'u{u"}{u'}{u`}{u^}'
|
68
53
|
|
69
|
-
|
70
|
-
|
71
|
-
|
54
|
+
define AIOU A + I + O + U
|
55
|
+
define AEIOU A + E + I + O + U
|
56
|
+
define v AEIOU + 'y'
|
57
|
+
define v_WX v + 'wx'
|
72
58
|
|
73
|
-
|
59
|
+
backwardmode (
|
74
60
|
|
75
|
-
define
|
61
|
+
define R1 as ($p1 <= cursor)
|
62
|
+
define R2 as ($p2 <= cursor)
|
76
63
|
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
'' (next)
|
81
|
-
) //or next
|
64
|
+
define V as test (v or 'ij')
|
65
|
+
define VX as test (next v or 'ij')
|
66
|
+
define C as test (not 'ij' non-v)
|
82
67
|
|
83
|
-
|
68
|
+
define lengthen_V as do (
|
69
|
+
non-v_WX
|
70
|
+
[substring] among (
|
71
|
+
'a' '{a"}' '{a'}' '{a`}' '{a^}'
|
72
|
+
'o' '{o"}' '{o'}' '{o`}' '{o^}'
|
73
|
+
'u' '{u"}' '{u'}' '{u`}' '{u^}'
|
74
|
+
(test (non-AEIOU or atlimit)
|
75
|
+
->ch insert ch)
|
76
|
+
'e' '{e'}' '{e`}' '{e^}'
|
77
|
+
(test (non-AEIOU or atlimit
|
78
|
+
not (AIOU or (E atlimit))
|
79
|
+
not (next AIOU non-AEIOU))
|
80
|
+
->ch insert ch)
|
81
|
+
'e{e"}'
|
82
|
+
(<-'e{e"}e')
|
83
|
+
'i{e"}'
|
84
|
+
(<-'iee')
|
85
|
+
)
|
86
|
+
)
|
84
87
|
|
85
|
-
|
88
|
+
define Step_1 as
|
89
|
+
(
|
90
|
+
[substring] among (
|
86
91
|
|
87
|
-
|
88
|
-
|
92
|
+
'{'}s' (delete)
|
93
|
+
's' (R1 not ('t' R1) C delete)
|
94
|
+
'ies' (R1 <-'ie')
|
95
|
+
'es'
|
96
|
+
((test ('ar' R1 C) delete lengthen_V) or
|
97
|
+
(test ('er' R1 C) delete) or
|
98
|
+
(R1 C <-'e'))
|
89
99
|
|
90
|
-
|
91
|
-
|
100
|
+
'{e'}s'
|
101
|
+
(R1 <-'{e'}')
|
102
|
+
'aus' (R1 V <-'au')
|
103
|
+
'en' (('hed' R1 ] <-'heid') or
|
104
|
+
('nd' delete) or
|
105
|
+
('d' R1 C ] delete) or
|
106
|
+
('i' or 'j' V delete) or
|
107
|
+
(R1 C delete lengthen_V))
|
108
|
+
'nde' (<-'nd')
|
109
|
+
)
|
92
110
|
)
|
93
111
|
|
94
|
-
define
|
95
|
-
|
96
|
-
[
|
97
|
-
|
98
|
-
|
112
|
+
define Step_2 as
|
113
|
+
(
|
114
|
+
[substring] among (
|
115
|
+
'je' (('{'}t' ] delete) or
|
116
|
+
('et' ] R1 C delete) or
|
117
|
+
('rnt' ] <-'rn') or
|
118
|
+
('t' ] R1 VX delete) or
|
119
|
+
('ink' ] <-'ing') or
|
120
|
+
('mp' ] <-'m') or
|
121
|
+
('{'}' ] R1 delete) or
|
122
|
+
(] R1 C delete))
|
123
|
+
'ge' (R1 <-'g')
|
124
|
+
'lijke'(R1 <-'lijk')
|
125
|
+
'ische'(R1 <-'isch')
|
126
|
+
'de' (R1 C delete)
|
127
|
+
'te' (R1 <-'t')
|
128
|
+
'se' (R1 <-'s')
|
129
|
+
're' (R1 <-'r')
|
130
|
+
'le' (R1 delete attach 'l' lengthen_V)
|
131
|
+
'ene' (R1 C delete attach 'en' lengthen_V)
|
132
|
+
'ieve' (R1 C <-'ief')
|
133
|
+
)
|
99
134
|
)
|
100
135
|
|
101
|
-
define
|
102
|
-
|
103
|
-
|
136
|
+
define Step_3 as
|
137
|
+
(
|
138
|
+
[substring] among (
|
139
|
+
'atie' (R1 <-'eer')
|
140
|
+
'iteit' (R1 delete lengthen_V)
|
141
|
+
'heid'
|
142
|
+
'sel'
|
143
|
+
'ster' (R1 delete)
|
144
|
+
'rder' (<-'r')
|
145
|
+
'ing'
|
146
|
+
'isme'
|
147
|
+
'erij' (// Exception added to avoid conflating
|
148
|
+
// `schilderij` (painting) and `schild` (shield).
|
149
|
+
('ild' <- 'er')
|
150
|
+
or
|
151
|
+
(R1 delete lengthen_V))
|
152
|
+
'arij' (R1 C <-'aar')
|
153
|
+
'fie' (R2 delete attach 'f' lengthen_V)
|
154
|
+
'gie' (R2 delete attach 'g' lengthen_V)
|
155
|
+
'tst' (R1 C <-'t')
|
156
|
+
'dst' (R1 C <-'d')
|
157
|
+
)
|
104
158
|
)
|
105
159
|
|
106
|
-
define
|
107
|
-
|
108
|
-
|
109
|
-
'
|
110
|
-
(
|
111
|
-
)
|
112
|
-
'
|
113
|
-
(
|
114
|
-
)
|
115
|
-
'
|
116
|
-
|
117
|
-
)
|
160
|
+
define Step_4 as
|
161
|
+
(
|
162
|
+
( [substring] among (
|
163
|
+
'ioneel' (R1 <-'ie')
|
164
|
+
'atief' (R1 <-'eer')
|
165
|
+
'baar' (R1 delete)
|
166
|
+
'naar' (R1 V <-'n')
|
167
|
+
'laar' (R1 V <-'l')
|
168
|
+
'raar' (R1 V <-'r')
|
169
|
+
'tant' (R1 <-'teer')
|
170
|
+
'lijker'
|
171
|
+
'lijkst' (R1 <-'lijk')
|
172
|
+
'achtig'
|
173
|
+
'achtiger'
|
174
|
+
'achtigst'(R1 delete)
|
175
|
+
'eriger'
|
176
|
+
'erigst'
|
177
|
+
'erig'
|
178
|
+
'end' (R1 C delete lengthen_V)
|
118
179
|
)
|
119
180
|
)
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
( R2 delete
|
130
|
-
(['ig'] R2 not 'e' delete) or undouble
|
131
|
-
)
|
132
|
-
'ig'
|
133
|
-
( R2 not 'e' delete
|
134
|
-
)
|
135
|
-
'lijk'
|
136
|
-
( R2 delete e_ending
|
137
|
-
)
|
138
|
-
'baar'
|
139
|
-
( R2 delete
|
140
|
-
)
|
141
|
-
'bar'
|
142
|
-
( R2 e_found delete
|
143
|
-
)
|
181
|
+
or
|
182
|
+
( [substring] among (
|
183
|
+
'iger'
|
184
|
+
'igst'
|
185
|
+
'ig' (R1
|
186
|
+
// Exception added to avoid conflating
|
187
|
+
// `innig` (intimate) and `in` (in).
|
188
|
+
not ('inn' atlimit)
|
189
|
+
C delete lengthen_V)
|
144
190
|
)
|
145
191
|
)
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
192
|
+
)
|
193
|
+
|
194
|
+
define Step_7 as
|
195
|
+
(
|
196
|
+
[substring] among (
|
197
|
+
'kt' (<-'k')
|
198
|
+
'ft' (<-'f')
|
199
|
+
'pt' (<-'p')
|
200
|
+
)
|
201
|
+
)
|
202
|
+
|
203
|
+
define Step_6 as
|
204
|
+
(
|
205
|
+
[substring] among (
|
206
|
+
'bb' (<-'b')
|
207
|
+
'cc' (<-'c')
|
208
|
+
'dd' (<-'d')
|
209
|
+
'ff' (<-'f')
|
210
|
+
'gg' (<-'g')
|
211
|
+
'hh' (<-'h')
|
212
|
+
'jj' (<-'j')
|
213
|
+
'kk' (<-'k')
|
214
|
+
'll' (<-'l')
|
215
|
+
'mm' (<-'m')
|
216
|
+
'nn' (// Exception added to avoid conflating
|
217
|
+
// `innen` (to collect/cash) and `in` (in).
|
218
|
+
not ('i' atlimit)
|
219
|
+
<-'n')
|
220
|
+
'pp' (<-'p')
|
221
|
+
'qq' (<-'q')
|
222
|
+
'rr' (<-'r')
|
223
|
+
'ss' (<-'s')
|
224
|
+
'tt' (<-'t')
|
225
|
+
'vv' (<-'v')
|
226
|
+
'ww' (<-'w')
|
227
|
+
'xx' (<-'x')
|
228
|
+
'zz' (<-'z')
|
229
|
+
'v' (<-'f')
|
230
|
+
'z' (<-'s')
|
231
|
+
)
|
232
|
+
)
|
233
|
+
|
234
|
+
define Step_1c as
|
235
|
+
(
|
236
|
+
[substring] R1 C among (
|
237
|
+
'd' (not ('n' R1)
|
238
|
+
// Exception added to avoid conflating
|
239
|
+
// `geïnd` (collected/cashed) and `in` (in).
|
240
|
+
// Instead we conflate `geïnd` with `innen`.
|
241
|
+
('in' atlimit <-'n') or
|
242
|
+
delete)
|
243
|
+
't' (not ('h' R1)
|
244
|
+
// Exception added to avoid conflating
|
245
|
+
// `geënt` (grafted) and `en` (and).
|
246
|
+
not ('en' atlimit)
|
247
|
+
delete
|
151
248
|
)
|
152
|
-
[next] delete
|
153
249
|
)
|
154
250
|
)
|
155
251
|
)
|
156
252
|
|
253
|
+
define Lose_prefix as (
|
254
|
+
['ge'] test hop 3 test (gopast ('ij' or v) repeat ('ij' or v) not atlimit)
|
255
|
+
// Exceptions added:
|
256
|
+
among (
|
257
|
+
// Avoid conflating `geeft` and `effen`/`effende`\`geeffende`.
|
258
|
+
'eft' (false)
|
259
|
+
|
260
|
+
// Avoid conflating `gevallen`/`geval` and `vallen`.
|
261
|
+
'val' (false)
|
262
|
+
'vali' (true)
|
263
|
+
|
264
|
+
// Avoid conflating `gevaren`/`gevaar` (danger), `gevaarten` (huge
|
265
|
+
// objects) and `varen` (to sail)
|
266
|
+
'vaa' 'vare' (false)
|
267
|
+
|
268
|
+
'' (true)
|
269
|
+
)
|
270
|
+
set GE_removed
|
271
|
+
delete
|
272
|
+
do ( [substring] among (
|
273
|
+
'{e"}' (<-'e')
|
274
|
+
'{i"}' (<-'i')
|
275
|
+
)
|
276
|
+
)
|
277
|
+
)
|
278
|
+
|
279
|
+
define Lose_infix as (
|
280
|
+
next
|
281
|
+
gopast (['ge']) test hop 3 test (gopast ('ij' or v) repeat ('ij' or v) not atlimit)
|
282
|
+
set GE_removed
|
283
|
+
delete
|
284
|
+
do ( [substring] among (
|
285
|
+
'{e"}' (<-'e')
|
286
|
+
'{i"}' (<-'i')
|
287
|
+
)
|
288
|
+
)
|
289
|
+
)
|
290
|
+
|
291
|
+
define measure as (
|
292
|
+
$p1 = limit
|
293
|
+
$p2 = limit
|
294
|
+
do(
|
295
|
+
repeat non-v atleast 1 ('ij' or v) non-v setmark p1
|
296
|
+
repeat non-v atleast 1 ('ij' or v) non-v setmark p2
|
297
|
+
)
|
298
|
+
|
299
|
+
)
|
157
300
|
define stem as (
|
158
301
|
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
302
|
+
unset stemmed
|
303
|
+
|
304
|
+
measure
|
305
|
+
|
306
|
+
backwards (
|
307
|
+
do (Step_1 set stemmed )
|
308
|
+
do (Step_2 set stemmed )
|
309
|
+
do (Step_3 set stemmed )
|
310
|
+
do (Step_4 set stemmed )
|
311
|
+
)
|
312
|
+
unset GE_removed
|
313
|
+
do (Lose_prefix and measure)
|
314
|
+
backwards (
|
315
|
+
do (GE_removed set stemmed Step_1c)
|
316
|
+
)
|
317
|
+
unset GE_removed
|
318
|
+
do (Lose_infix and measure)
|
319
|
+
backwards (
|
320
|
+
do (GE_removed set stemmed Step_1c)
|
321
|
+
)
|
322
|
+
backwards (
|
323
|
+
do (Step_7 set stemmed )
|
324
|
+
do (stemmed Step_6)
|
325
|
+
)
|
164
326
|
)
|
@@ -0,0 +1,178 @@
|
|
1
|
+
// Dutch stemming algorithm developed by Martin Porter
|
2
|
+
|
3
|
+
routines (
|
4
|
+
prelude postlude
|
5
|
+
e_ending
|
6
|
+
en_ending
|
7
|
+
mark_regions
|
8
|
+
R1 R2
|
9
|
+
undouble
|
10
|
+
standard_suffix
|
11
|
+
)
|
12
|
+
|
13
|
+
externals ( stem )
|
14
|
+
|
15
|
+
booleans ( e_found )
|
16
|
+
|
17
|
+
integers ( p1 p2 x )
|
18
|
+
|
19
|
+
groupings ( v v_I v_j )
|
20
|
+
|
21
|
+
stringescapes {}
|
22
|
+
|
23
|
+
/* special characters */
|
24
|
+
|
25
|
+
stringdef a" '{U+00E4}'
|
26
|
+
stringdef e" '{U+00EB}'
|
27
|
+
stringdef i" '{U+00EF}'
|
28
|
+
stringdef o" '{U+00F6}'
|
29
|
+
stringdef u" '{U+00FC}'
|
30
|
+
|
31
|
+
stringdef a' '{U+00E1}'
|
32
|
+
stringdef e' '{U+00E9}'
|
33
|
+
stringdef i' '{U+00ED}'
|
34
|
+
stringdef o' '{U+00F3}'
|
35
|
+
stringdef u' '{U+00FA}'
|
36
|
+
|
37
|
+
stringdef e` '{U+00E8}'
|
38
|
+
|
39
|
+
define v 'aeiouy{e`}'
|
40
|
+
define v_I v + 'I'
|
41
|
+
define v_j v + 'j'
|
42
|
+
|
43
|
+
define prelude as (
|
44
|
+
test repeat (
|
45
|
+
[substring] among(
|
46
|
+
'{a"}' '{a'}'
|
47
|
+
(<- 'a')
|
48
|
+
'{e"}' '{e'}'
|
49
|
+
(<- 'e')
|
50
|
+
'{i"}' '{i'}'
|
51
|
+
(<- 'i')
|
52
|
+
'{o"}' '{o'}'
|
53
|
+
(<- 'o')
|
54
|
+
'{u"}' '{u'}'
|
55
|
+
(<- 'u')
|
56
|
+
'' (next)
|
57
|
+
)
|
58
|
+
)
|
59
|
+
try(['y'] <- 'Y')
|
60
|
+
repeat (
|
61
|
+
gopast v
|
62
|
+
try (
|
63
|
+
// If we see `i` not followed by a vowel then we know it couldn't
|
64
|
+
// match on the next iteration so we can advance past it.
|
65
|
+
//
|
66
|
+
// However if we replace `i` with `I` we do need to check the vowel
|
67
|
+
// after the `i` in the next iteration to match the documented
|
68
|
+
// behaviour, e.g. consider input `iiiii`. This may well not make
|
69
|
+
// a difference for any actual Dutch words though.
|
70
|
+
[('i'] do(v <- 'I')) or
|
71
|
+
('y'] <- 'Y')
|
72
|
+
)
|
73
|
+
)
|
74
|
+
)
|
75
|
+
|
76
|
+
define mark_regions as (
|
77
|
+
|
78
|
+
$p1 = limit
|
79
|
+
$p2 = limit
|
80
|
+
|
81
|
+
test(hop 3 setmark x)
|
82
|
+
|
83
|
+
gopast v gopast non-v setmark p1
|
84
|
+
try($p1 < x $p1 = x) // at least 3
|
85
|
+
gopast v gopast non-v setmark p2
|
86
|
+
|
87
|
+
)
|
88
|
+
|
89
|
+
define postlude as repeat (
|
90
|
+
|
91
|
+
[substring] among(
|
92
|
+
'Y' (<- 'y')
|
93
|
+
'I' (<- 'i')
|
94
|
+
'' (next)
|
95
|
+
)
|
96
|
+
|
97
|
+
)
|
98
|
+
|
99
|
+
backwardmode (
|
100
|
+
|
101
|
+
define R1 as $p1 <= cursor
|
102
|
+
define R2 as $p2 <= cursor
|
103
|
+
|
104
|
+
define undouble as (
|
105
|
+
test among('kk' 'dd' 'tt') [next] delete
|
106
|
+
)
|
107
|
+
|
108
|
+
define e_ending as (
|
109
|
+
unset e_found
|
110
|
+
['e'] R1 test non-v delete
|
111
|
+
set e_found
|
112
|
+
undouble
|
113
|
+
)
|
114
|
+
|
115
|
+
define en_ending as (
|
116
|
+
R1 non-v and not 'gem' delete
|
117
|
+
undouble
|
118
|
+
)
|
119
|
+
|
120
|
+
define standard_suffix as (
|
121
|
+
do (
|
122
|
+
[substring] among(
|
123
|
+
'heden'
|
124
|
+
( R1 <- 'heid'
|
125
|
+
)
|
126
|
+
'en' 'ene'
|
127
|
+
( en_ending
|
128
|
+
)
|
129
|
+
's' 'se'
|
130
|
+
( R1 non-v_j delete
|
131
|
+
)
|
132
|
+
)
|
133
|
+
)
|
134
|
+
do e_ending
|
135
|
+
|
136
|
+
do ( ['heid'] R2 not 'c' delete
|
137
|
+
['en'] en_ending
|
138
|
+
)
|
139
|
+
|
140
|
+
do (
|
141
|
+
[substring] among(
|
142
|
+
'end' 'ing'
|
143
|
+
( R2 delete
|
144
|
+
(['ig'] R2 not 'e' delete) or undouble
|
145
|
+
)
|
146
|
+
'ig'
|
147
|
+
( R2 not 'e' delete
|
148
|
+
)
|
149
|
+
'lijk'
|
150
|
+
( R2 delete e_ending
|
151
|
+
)
|
152
|
+
'baar'
|
153
|
+
( R2 delete
|
154
|
+
)
|
155
|
+
'bar'
|
156
|
+
( R2 e_found delete
|
157
|
+
)
|
158
|
+
)
|
159
|
+
)
|
160
|
+
do (
|
161
|
+
non-v_I
|
162
|
+
test (
|
163
|
+
among ('aa' 'ee' 'oo' 'uu')
|
164
|
+
non-v
|
165
|
+
)
|
166
|
+
[next] delete
|
167
|
+
)
|
168
|
+
)
|
169
|
+
)
|
170
|
+
|
171
|
+
define stem as (
|
172
|
+
|
173
|
+
do prelude
|
174
|
+
do mark_regions
|
175
|
+
backwards
|
176
|
+
do standard_suffix
|
177
|
+
do postlude
|
178
|
+
)
|