mittens 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +3 -3
- data/lib/mittens/version.rb +1 -1
- data/vendor/snowball/.github/workflows/ci.yml +216 -0
- data/vendor/snowball/CONTRIBUTING.rst +111 -62
- data/vendor/snowball/GNUmakefile +194 -136
- data/vendor/snowball/NEWS +798 -3
- data/vendor/snowball/README.rst +50 -1
- data/vendor/snowball/ada/src/stemmer.adb +25 -13
- data/vendor/snowball/ada/src/stemmer.ads +9 -9
- data/vendor/snowball/ada/stemmer_config.gpr +7 -7
- data/vendor/snowball/algorithms/basque.sbl +4 -19
- data/vendor/snowball/algorithms/catalan.sbl +2 -9
- data/vendor/snowball/algorithms/danish.sbl +1 -1
- data/vendor/snowball/algorithms/dutch.sbl +284 -122
- data/vendor/snowball/algorithms/dutch_porter.sbl +178 -0
- data/vendor/snowball/algorithms/english.sbl +52 -37
- data/vendor/snowball/algorithms/esperanto.sbl +157 -0
- data/vendor/snowball/algorithms/estonian.sbl +269 -0
- data/vendor/snowball/algorithms/finnish.sbl +2 -3
- data/vendor/snowball/algorithms/french.sbl +42 -16
- data/vendor/snowball/algorithms/german.sbl +35 -14
- data/vendor/snowball/algorithms/greek.sbl +76 -76
- data/vendor/snowball/algorithms/hungarian.sbl +8 -6
- data/vendor/snowball/algorithms/indonesian.sbl +14 -8
- data/vendor/snowball/algorithms/italian.sbl +11 -21
- data/vendor/snowball/algorithms/lithuanian.sbl +36 -37
- data/vendor/snowball/algorithms/lovins.sbl +0 -1
- data/vendor/snowball/algorithms/nepali.sbl +138 -37
- data/vendor/snowball/algorithms/norwegian.sbl +19 -5
- data/vendor/snowball/algorithms/porter.sbl +2 -2
- data/vendor/snowball/algorithms/portuguese.sbl +9 -13
- data/vendor/snowball/algorithms/romanian.sbl +17 -4
- data/vendor/snowball/algorithms/serbian.sbl +467 -468
- data/vendor/snowball/algorithms/spanish.sbl +5 -7
- data/vendor/snowball/algorithms/swedish.sbl +60 -6
- data/vendor/snowball/algorithms/tamil.sbl +207 -176
- data/vendor/snowball/algorithms/turkish.sbl +461 -445
- data/vendor/snowball/algorithms/yiddish.sbl +36 -38
- data/vendor/snowball/compiler/analyser.c +445 -192
- data/vendor/snowball/compiler/driver.c +109 -101
- data/vendor/snowball/compiler/generator.c +853 -464
- data/vendor/snowball/compiler/generator_ada.c +404 -366
- data/vendor/snowball/compiler/generator_csharp.c +297 -260
- data/vendor/snowball/compiler/generator_go.c +323 -254
- data/vendor/snowball/compiler/generator_java.c +326 -252
- data/vendor/snowball/compiler/generator_js.c +362 -252
- data/vendor/snowball/compiler/generator_pascal.c +349 -197
- data/vendor/snowball/compiler/generator_python.c +257 -240
- data/vendor/snowball/compiler/generator_rust.c +423 -251
- data/vendor/snowball/compiler/header.h +117 -71
- data/vendor/snowball/compiler/space.c +137 -68
- data/vendor/snowball/compiler/syswords.h +2 -2
- data/vendor/snowball/compiler/tokeniser.c +125 -107
- data/vendor/snowball/csharp/Snowball/Among.cs +14 -14
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +7 -7
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +57 -37
- data/vendor/snowball/csharp/Stemwords/App.config +2 -2
- data/vendor/snowball/csharp/Stemwords/Program.cs +16 -12
- data/vendor/snowball/doc/libstemmer_c_README +7 -4
- data/vendor/snowball/doc/libstemmer_csharp_README +4 -1
- data/vendor/snowball/doc/libstemmer_java_README +12 -1
- data/vendor/snowball/doc/libstemmer_js_README +6 -4
- data/vendor/snowball/doc/libstemmer_python_README +9 -4
- data/vendor/snowball/examples/stemwords.c +12 -12
- data/vendor/snowball/go/env.go +107 -31
- data/vendor/snowball/go/util.go +0 -4
- data/vendor/snowball/include/libstemmer.h +4 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +32 -15
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +347 -261
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +3 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +52 -37
- data/vendor/snowball/javascript/base-stemmer.js +186 -2
- data/vendor/snowball/javascript/stemwords.js +3 -6
- data/vendor/snowball/libstemmer/libstemmer_c.in +1 -1
- data/vendor/snowball/libstemmer/mkalgorithms.pl +6 -6
- data/vendor/snowball/libstemmer/mkmodules.pl +2 -2
- data/vendor/snowball/libstemmer/modules.txt +13 -10
- data/vendor/snowball/libstemmer/test.c +1 -1
- data/vendor/snowball/pascal/SnowballProgram.pas +84 -2
- data/vendor/snowball/pascal/generate.pl +13 -13
- data/vendor/snowball/python/create_init.py +4 -1
- data/vendor/snowball/python/setup.cfg +0 -3
- data/vendor/snowball/python/setup.py +8 -3
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +20 -54
- data/vendor/snowball/python/stemwords.py +8 -12
- data/vendor/snowball/runtime/api.c +10 -5
- data/vendor/snowball/runtime/header.h +10 -9
- data/vendor/snowball/runtime/utilities.c +9 -9
- data/vendor/snowball/rust/build.rs +1 -1
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +83 -5
- data/vendor/snowball/tests/stemtest.c +7 -4
- metadata +7 -7
- data/vendor/snowball/.travis.yml +0 -112
- data/vendor/snowball/algorithms/german2.sbl +0 -145
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +0 -240
- data/vendor/snowball/compiler/syswords2.h +0 -13
@@ -1,470 +1,486 @@
|
|
1
1
|
/* Stemmer for Turkish
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
* expressions from right to left FSMs for stemming suffixes.
|
29
|
-
*/
|
2
|
+
* author: Evren (Kapusuz) Çilden
|
3
|
+
* email: evren.kapusuz at gmail.com
|
4
|
+
*
|
5
|
+
* stems nominal verb suffixes
|
6
|
+
* stems nominal inflections
|
7
|
+
* more than one syllable word check
|
8
|
+
* (y,n,s,U) context check
|
9
|
+
* vowel harmony check
|
10
|
+
* last consonant check and conversion (b, c, d, ğ to p, ç, t, k)
|
11
|
+
*
|
12
|
+
* The stemming algorithm is based on the paper "An Affix Stripping
|
13
|
+
* Morphological Analyzer for Turkish" by Gülşen Eryiğit and
|
14
|
+
* Eşref Adalı (Proceedings of the IASTED International Conference
|
15
|
+
* ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18, 2004,
|
16
|
+
* Innsbruck, Austria).
|
17
|
+
*
|
18
|
+
* Turkish is an agglutinative language and has a very rich morphological
|
19
|
+
* structure. In Turkish, you can form many different words from a single stem
|
20
|
+
* by appending a sequence of suffixes. E.g. the word "doktoruymuşsunuz" means
|
21
|
+
* "You had been the doctor of him". The stem of the word is "doktor" and it
|
22
|
+
* takes three different suffixes -sU, -ymUs, and -sUnUz. The rules about
|
23
|
+
* the append order of suffixes can be clearly described as FSMs.
|
24
|
+
* The paper referenced above defines some FSMs for right to left
|
25
|
+
* morphological analysis. I generated a method for constructing snowball
|
26
|
+
* expressions from right to left FSMs for stemming suffixes.
|
27
|
+
*/
|
30
28
|
|
31
29
|
routines (
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
30
|
+
append_U_to_stems_ending_with_d_or_g // for preventing some overstemmings
|
31
|
+
check_vowel_harmony // tests vowel harmony for suffixes
|
32
|
+
is_reserved_word // tests whether current string is a reserved word ('ad','soyad')
|
33
|
+
mark_cAsInA // nominal verb suffix
|
34
|
+
mark_DA // noun suffix
|
35
|
+
mark_DAn // noun suffix
|
36
|
+
mark_DUr // nominal verb suffix
|
37
|
+
mark_ki // noun suffix
|
38
|
+
mark_lAr // noun suffix, nominal verb suffix
|
39
|
+
mark_lArI // noun suffix
|
40
|
+
mark_nA // noun suffix
|
41
|
+
mark_ncA // noun suffix
|
42
|
+
mark_ndA // noun suffix
|
43
|
+
mark_ndAn // noun suffix
|
44
|
+
mark_nU // noun suffix
|
45
|
+
mark_nUn // noun suffix
|
46
|
+
mark_nUz // nominal verb suffix
|
47
|
+
mark_sU // noun suffix
|
48
|
+
mark_sUn // nominal verb suffix
|
49
|
+
mark_sUnUz // nominal verb suffix
|
50
|
+
mark_possessives // -(U)m,-(U)n,-(U)mUz,-(U)nUz,
|
51
|
+
mark_yA // noun suffix
|
52
|
+
mark_ylA // noun suffix
|
53
|
+
mark_yU // noun suffix
|
54
|
+
mark_yUm // nominal verb suffix
|
55
|
+
mark_yUz // nominal verb suffix
|
56
|
+
mark_yDU // nominal verb suffix
|
57
|
+
mark_yken // nominal verb suffix
|
58
|
+
mark_ymUs_ // nominal verb suffix
|
59
|
+
mark_ysA // nominal verb suffix
|
60
|
+
|
61
|
+
mark_suffix_with_optional_y_consonant
|
62
|
+
mark_suffix_with_optional_U_vowel
|
63
|
+
mark_suffix_with_optional_n_consonant
|
64
|
+
mark_suffix_with_optional_s_consonant
|
65
|
+
|
66
|
+
more_than_one_syllable_word
|
67
|
+
|
68
|
+
post_process_last_consonants
|
69
|
+
postlude
|
70
|
+
|
71
|
+
remove_proper_noun_suffix
|
72
|
+
|
73
|
+
stem_nominal_verb_suffixes
|
74
|
+
stem_noun_suffixes
|
75
|
+
stem_suffix_chain_before_ki
|
76
76
|
)
|
77
77
|
|
78
|
-
stringescapes
|
78
|
+
stringescapes { }
|
79
79
|
|
80
80
|
/* Special characters in Unicode Latin-1 and Latin Extended-A */
|
81
|
-
stringdef
|
82
|
-
stringdef g~
|
83
|
-
stringdef i'
|
84
|
-
stringdef o"
|
85
|
-
stringdef
|
86
|
-
stringdef u"
|
81
|
+
stringdef cc '{U+00E7}' // LATIN SMALL LETTER C WITH CEDILLA
|
82
|
+
stringdef g~ '{U+011F}' // LATIN SMALL LETTER G WITH BREVE
|
83
|
+
stringdef i '{U+0131}' // LATIN SMALL LETTER I WITHOUT DOT
|
84
|
+
stringdef o" '{U+00F6}' // LATIN SMALL LETTER O WITH DIAERESIS
|
85
|
+
stringdef sc '{U+015F}' // LATIN SMALL LETTER S WITH CEDILLA
|
86
|
+
stringdef u" '{U+00FC}' // LATIN SMALL LETTER U WITH DIAERESIS
|
87
87
|
|
88
|
-
booleans
|
88
|
+
booleans ( continue_stemming_noun_suffixes )
|
89
89
|
|
90
|
-
groupings
|
90
|
+
groupings ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6 )
|
91
91
|
|
92
|
-
define vowel
|
93
|
-
define U
|
92
|
+
define vowel 'ae{i}io{o"}u{u"}'
|
93
|
+
define U '{i}iu{u"}'
|
94
94
|
|
95
95
|
// the vowel grouping definitions below are used for checking vowel harmony
|
96
|
-
define vowel1
|
97
|
-
define vowel2
|
98
|
-
define vowel3
|
99
|
-
define vowel4
|
100
|
-
define vowel5
|
101
|
-
define vowel6
|
96
|
+
define vowel1 'a{i}ou' // vowels that can end with suffixes containing 'a'
|
97
|
+
define vowel2 'ei{o"}{u"}' // vowels that can end with suffixes containing 'e'
|
98
|
+
define vowel3 'a{i}' // vowels that can end with suffixes containing '{i}'
|
99
|
+
define vowel4 'ei' // vowels that can end with suffixes containing 'i'
|
100
|
+
define vowel5 'ou' // vowels that can end with suffixes containing 'o' or 'u'
|
101
|
+
define vowel6 '{o"}{u"}' // vowels that can end with suffixes containing '{o"}' or '{u"}'
|
102
102
|
|
103
|
-
externals
|
103
|
+
externals ( stem )
|
104
104
|
|
105
105
|
backwardmode (
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
)
|
106
|
+
// checks vowel harmony for possible suffixes,
|
107
|
+
// helps to detect whether the candidate for suffix applies to vowel harmony
|
108
|
+
// this rule is added to prevent over stemming
|
109
|
+
define check_vowel_harmony as (
|
110
|
+
test
|
111
|
+
(
|
112
|
+
(goto vowel) // if there is a vowel
|
113
|
+
(
|
114
|
+
('a' goto vowel1) or
|
115
|
+
('e' goto vowel2) or
|
116
|
+
('{i}' goto vowel3) or
|
117
|
+
('i' goto vowel4) or
|
118
|
+
('o' goto vowel5) or
|
119
|
+
('{o"}' goto vowel6) or
|
120
|
+
('u' goto vowel5) or
|
121
|
+
('{u"}' goto vowel6)
|
122
|
+
)
|
123
|
+
)
|
124
|
+
)
|
125
|
+
|
126
|
+
// if the last consonant before suffix is vowel and n then advance and delete
|
127
|
+
// if the last consonant before suffix is non vowel and n do nothing
|
128
|
+
// if the last consonant before suffix is not n then only delete the suffix
|
129
|
+
// assumption: slice beginning is set correctly
|
130
|
+
define mark_suffix_with_optional_n_consonant as (
|
131
|
+
('n' (test vowel))
|
132
|
+
or
|
133
|
+
((not(test 'n')) test(next vowel))
|
134
|
+
)
|
135
|
+
|
136
|
+
// if the last consonant before suffix is vowel and s then advance and delete
|
137
|
+
// if the last consonant before suffix is non vowel and s do nothing
|
138
|
+
// if the last consonant before suffix is not s then only delete the suffix
|
139
|
+
// assumption: slice beginning is set correctly
|
140
|
+
define mark_suffix_with_optional_s_consonant as (
|
141
|
+
('s' (test vowel))
|
142
|
+
or
|
143
|
+
((not(test 's')) test(next vowel))
|
144
|
+
)
|
145
|
+
|
146
|
+
// if the last consonant before suffix is vowel and y then advance and delete
|
147
|
+
// if the last consonant before suffix is non vowel and y do nothing
|
148
|
+
// if the last consonant before suffix is not y then only delete the suffix
|
149
|
+
// assumption: slice beginning is set correctly
|
150
|
+
define mark_suffix_with_optional_y_consonant as (
|
151
|
+
('y' (test vowel))
|
152
|
+
or
|
153
|
+
((not(test 'y')) test(next vowel))
|
154
|
+
)
|
155
|
+
|
156
|
+
define mark_suffix_with_optional_U_vowel as (
|
157
|
+
(U (test non-vowel))
|
158
|
+
or
|
159
|
+
((not(test U)) test(next non-vowel))
|
160
|
+
)
|
161
|
+
|
162
|
+
define mark_possessives as (
|
163
|
+
among ('m{i}z' 'miz' 'muz' 'm{u"}z'
|
164
|
+
'n{i}z' 'niz' 'nuz' 'n{u"}z' 'm' 'n')
|
165
|
+
(mark_suffix_with_optional_U_vowel)
|
166
|
+
)
|
167
|
+
|
168
|
+
define mark_sU as (
|
169
|
+
check_vowel_harmony
|
170
|
+
U
|
171
|
+
(mark_suffix_with_optional_s_consonant)
|
172
|
+
)
|
173
|
+
|
174
|
+
define mark_lArI as (
|
175
|
+
among ('leri' 'lar{i}')
|
176
|
+
)
|
177
|
+
|
178
|
+
define mark_yU as (
|
179
|
+
check_vowel_harmony
|
180
|
+
U
|
181
|
+
(mark_suffix_with_optional_y_consonant)
|
182
|
+
)
|
183
|
+
|
184
|
+
define mark_nU as (
|
185
|
+
check_vowel_harmony
|
186
|
+
among ('n{i}' 'ni' 'nu' 'n{u"}')
|
187
|
+
)
|
188
|
+
|
189
|
+
define mark_nUn as (
|
190
|
+
check_vowel_harmony
|
191
|
+
among ('{i}n' 'in' 'un' '{u"}n')
|
192
|
+
(mark_suffix_with_optional_n_consonant)
|
193
|
+
)
|
194
|
+
|
195
|
+
define mark_yA as (
|
196
|
+
check_vowel_harmony
|
197
|
+
among('a' 'e')
|
198
|
+
(mark_suffix_with_optional_y_consonant)
|
199
|
+
)
|
200
|
+
|
201
|
+
define mark_nA as (
|
202
|
+
check_vowel_harmony
|
203
|
+
among('na' 'ne')
|
204
|
+
)
|
205
|
+
|
206
|
+
define mark_DA as (
|
207
|
+
check_vowel_harmony
|
208
|
+
among('da' 'de' 'ta' 'te')
|
209
|
+
)
|
210
|
+
|
211
|
+
define mark_ndA as (
|
212
|
+
check_vowel_harmony
|
213
|
+
among('nda' 'nde')
|
214
|
+
)
|
215
|
+
|
216
|
+
define mark_DAn as (
|
217
|
+
check_vowel_harmony
|
218
|
+
among('dan' 'den' 'tan' 'ten')
|
219
|
+
)
|
220
|
+
|
221
|
+
define mark_ndAn as (
|
222
|
+
check_vowel_harmony
|
223
|
+
among('ndan' 'nden')
|
224
|
+
)
|
225
|
+
|
226
|
+
define mark_ylA as (
|
227
|
+
check_vowel_harmony
|
228
|
+
among('la' 'le')
|
229
|
+
(mark_suffix_with_optional_y_consonant)
|
230
|
+
)
|
231
|
+
|
232
|
+
define mark_ki as (
|
233
|
+
'ki'
|
234
|
+
)
|
235
|
+
|
236
|
+
define mark_ncA as (
|
237
|
+
check_vowel_harmony
|
238
|
+
among('ca' 'ce')
|
239
|
+
(mark_suffix_with_optional_n_consonant)
|
240
|
+
)
|
241
|
+
|
242
|
+
define mark_yUm as (
|
243
|
+
check_vowel_harmony
|
244
|
+
among ('{i}m' 'im' 'um' '{u"}m')
|
245
|
+
(mark_suffix_with_optional_y_consonant)
|
246
|
+
)
|
247
|
+
|
248
|
+
define mark_sUn as (
|
249
|
+
check_vowel_harmony
|
250
|
+
among ('s{i}n' 'sin' 'sun' 's{u"}n' )
|
251
|
+
)
|
252
|
+
|
253
|
+
define mark_yUz as (
|
254
|
+
check_vowel_harmony
|
255
|
+
among ('{i}z' 'iz' 'uz' '{u"}z')
|
256
|
+
(mark_suffix_with_optional_y_consonant)
|
257
|
+
)
|
258
|
+
|
259
|
+
define mark_sUnUz as (
|
260
|
+
among ('s{i}n{i}z' 'siniz' 'sunuz' 's{u"}n{u"}z')
|
261
|
+
)
|
262
|
+
|
263
|
+
define mark_lAr as (
|
264
|
+
check_vowel_harmony
|
265
|
+
among ('ler' 'lar')
|
266
|
+
)
|
267
|
+
|
268
|
+
define mark_nUz as (
|
269
|
+
check_vowel_harmony
|
270
|
+
among ('n{i}z' 'niz' 'nuz' 'n{u"}z')
|
271
|
+
)
|
272
|
+
|
273
|
+
define mark_DUr as (
|
274
|
+
check_vowel_harmony
|
275
|
+
among ('t{i}r' 'tir' 'tur' 't{u"}r' 'd{i}r' 'dir' 'dur' 'd{u"}r')
|
276
|
+
)
|
277
|
+
|
278
|
+
define mark_cAsInA as (
|
279
|
+
among ('cas{i}na' 'cesine')
|
280
|
+
)
|
281
|
+
|
282
|
+
define mark_yDU as (
|
283
|
+
check_vowel_harmony
|
284
|
+
among ('t{i}m' 'tim' 'tum' 't{u"}m' 'd{i}m' 'dim' 'dum' 'd{u"}m'
|
285
|
+
't{i}n' 'tin' 'tun' 't{u"}n' 'd{i}n' 'din' 'dun' 'd{u"}n'
|
286
|
+
't{i}k' 'tik' 'tuk' 't{u"}k' 'd{i}k' 'dik' 'duk' 'd{u"}k'
|
287
|
+
't{i}' 'ti' 'tu' 't{u"}' 'd{i}' 'di' 'du' 'd{u"}')
|
288
|
+
(mark_suffix_with_optional_y_consonant)
|
289
|
+
)
|
290
|
+
|
291
|
+
// does not fully obey vowel harmony
|
292
|
+
define mark_ysA as (
|
293
|
+
among ('sam' 'san' 'sak' 'sem' 'sen' 'sek' 'sa' 'se')
|
294
|
+
(mark_suffix_with_optional_y_consonant)
|
295
|
+
)
|
296
|
+
|
297
|
+
define mark_ymUs_ as (
|
298
|
+
check_vowel_harmony
|
299
|
+
among ('m{i}{sc}' 'mi{sc}' 'mu{sc}' 'm{u"}{sc}')
|
300
|
+
(mark_suffix_with_optional_y_consonant)
|
301
|
+
)
|
302
|
+
|
303
|
+
define mark_yken as (
|
304
|
+
'ken' (mark_suffix_with_optional_y_consonant)
|
305
|
+
)
|
306
|
+
|
307
|
+
define stem_nominal_verb_suffixes as (
|
308
|
+
[
|
309
|
+
set continue_stemming_noun_suffixes
|
310
|
+
(mark_ymUs_ or mark_yDU or mark_ysA or mark_yken)
|
311
|
+
or
|
312
|
+
(mark_cAsInA (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_)
|
313
|
+
or
|
314
|
+
(
|
315
|
+
mark_lAr ] delete try([(mark_DUr or mark_yDU or mark_ysA or mark_ymUs_))
|
316
|
+
unset continue_stemming_noun_suffixes
|
317
|
+
)
|
318
|
+
or
|
319
|
+
(mark_nUz (mark_yDU or mark_ysA))
|
320
|
+
or
|
321
|
+
((mark_sUnUz or mark_yUz or mark_sUn or mark_yUm) ] delete try([ mark_ymUs_))
|
322
|
+
or
|
323
|
+
(mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_))
|
324
|
+
]delete
|
325
|
+
)
|
326
|
+
|
327
|
+
// stems noun suffix chains ending with -ki
|
328
|
+
define stem_suffix_chain_before_ki as (
|
329
|
+
[
|
330
|
+
mark_ki
|
331
|
+
(
|
332
|
+
(mark_DA] delete try([
|
333
|
+
(mark_lAr] delete try(stem_suffix_chain_before_ki))
|
334
|
+
or
|
335
|
+
(mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
|
336
|
+
|
337
|
+
))
|
338
|
+
or
|
339
|
+
(mark_nUn] delete try([
|
340
|
+
(mark_lArI] delete)
|
341
|
+
or
|
342
|
+
([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
|
343
|
+
or
|
344
|
+
(stem_suffix_chain_before_ki)
|
345
|
+
))
|
346
|
+
or
|
347
|
+
(mark_ndA (
|
348
|
+
(mark_lArI] delete)
|
349
|
+
or
|
350
|
+
((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki)))
|
351
|
+
or
|
352
|
+
(stem_suffix_chain_before_ki)
|
353
|
+
))
|
354
|
+
)
|
355
|
+
)
|
356
|
+
|
357
|
+
define stem_noun_suffixes as (
|
358
|
+
([mark_lAr] delete try(stem_suffix_chain_before_ki))
|
359
|
+
or
|
360
|
+
([mark_ncA] delete
|
361
|
+
try(
|
362
|
+
([mark_lArI] delete)
|
363
|
+
or
|
364
|
+
([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
|
365
|
+
or
|
366
|
+
([mark_lAr] delete stem_suffix_chain_before_ki)
|
367
|
+
)
|
368
|
+
)
|
369
|
+
or
|
370
|
+
([(mark_ndA or mark_nA)
|
371
|
+
(
|
372
|
+
(mark_lArI] delete)
|
373
|
+
or
|
374
|
+
(mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
|
375
|
+
or
|
376
|
+
(stem_suffix_chain_before_ki)
|
377
|
+
)
|
378
|
+
)
|
379
|
+
or
|
380
|
+
([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI)))
|
381
|
+
or
|
382
|
+
( [mark_DAn] delete try ([
|
383
|
+
(
|
384
|
+
(mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
|
385
|
+
or
|
386
|
+
(mark_lAr] delete try(stem_suffix_chain_before_ki))
|
387
|
+
or
|
388
|
+
(stem_suffix_chain_before_ki)
|
389
|
+
))
|
390
|
+
)
|
391
|
+
or
|
392
|
+
([mark_nUn or mark_ylA] delete
|
393
|
+
try(
|
394
|
+
([mark_lAr] delete stem_suffix_chain_before_ki)
|
395
|
+
or
|
396
|
+
([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
|
397
|
+
or
|
398
|
+
stem_suffix_chain_before_ki
|
399
|
+
)
|
400
|
+
)
|
401
|
+
or
|
402
|
+
([mark_lArI] delete)
|
403
|
+
or
|
404
|
+
(stem_suffix_chain_before_ki)
|
405
|
+
or
|
406
|
+
([mark_DA or mark_yU or mark_yA] delete try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki))
|
407
|
+
or
|
408
|
+
([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
|
409
|
+
)
|
410
|
+
|
411
|
+
define post_process_last_consonants as (
|
412
|
+
[substring] among (
|
413
|
+
'b' (<- 'p')
|
414
|
+
'c' (<- '{cc}')
|
415
|
+
'd' (<- 't')
|
416
|
+
'{g~}' (<- 'k')
|
417
|
+
)
|
418
|
+
)
|
419
|
+
|
420
|
+
// after stemming if the word ends with 'd' or 'g' most probably last U is
|
421
|
+
// overstemmed like in 'kedim' -> 'ked'
|
422
|
+
// Turkish words don't usually end with 'd' or 'g'
|
423
|
+
// some very well known words are ignored (like 'ad' 'soyad')
|
424
|
+
// appends U to stems ending with d or g, decides which vowel to add
|
425
|
+
// based on the last vowel in the stem
|
426
|
+
define append_U_to_stems_ending_with_d_or_g as (
|
427
|
+
[] ('d' or 'g') goto vowel
|
428
|
+
|
429
|
+
(('a' or '{i}') <- '{i}')
|
430
|
+
or
|
431
|
+
(('e' or 'i') <- 'i')
|
432
|
+
or
|
433
|
+
(('o' or 'u') <- 'u')
|
434
|
+
or
|
435
|
+
(('{o"}' or '{u"}') <- '{u"}')
|
436
|
+
)
|
437
|
+
|
438
|
+
define is_reserved_word as (
|
439
|
+
'ad' try 'soy' atlimit
|
440
|
+
)
|
442
441
|
)
|
443
442
|
|
444
|
-
|
445
|
-
//
|
443
|
+
define remove_proper_noun_suffix as (
|
444
|
+
// Remove any leading apostrophes (e.g. from tokenisation of single-quoted
|
445
|
+
// text).
|
446
|
+
do ([goto not '{'}'] delete)
|
447
|
+
|
448
|
+
// https://en.wikipedia.org/wiki/Turkish_language says "In modern
|
449
|
+
// Turkish orthography, an apostrophe is used to separate proper names
|
450
|
+
// from any suffixes" with the example "Türkiye'dir ("it is Turkey")".
|
451
|
+
// Therefore we truncate at the first apostrophe, provided there are at least
|
452
|
+
// two characters before it (which avoids adversely affecting some foreign
|
453
|
+
// names and words such as "o'connor", "l'entrée").
|
454
|
+
do (
|
455
|
+
hop 2
|
456
|
+
goto '{'}' [ tolimit ] delete
|
457
|
+
)
|
458
|
+
)
|
459
|
+
|
460
|
+
// Test if there is more than one syllable.
|
461
|
+
// In Turkish each vowel indicates a distinct syllable.
|
446
462
|
define more_than_one_syllable_word as (
|
447
|
-
|
463
|
+
test (loop 2 gopast vowel)
|
448
464
|
)
|
449
465
|
|
450
466
|
define postlude as (
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
)
|
467
|
+
backwards (
|
468
|
+
not is_reserved_word
|
469
|
+
do append_U_to_stems_ending_with_d_or_g
|
470
|
+
do post_process_last_consonants
|
471
|
+
)
|
457
472
|
)
|
458
473
|
|
459
474
|
define stem as (
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
475
|
+
do remove_proper_noun_suffix
|
476
|
+
|
477
|
+
more_than_one_syllable_word
|
478
|
+
|
479
|
+
backwards (
|
480
|
+
do stem_nominal_verb_suffixes
|
481
|
+
continue_stemming_noun_suffixes
|
482
|
+
do stem_noun_suffixes
|
483
|
+
)
|
484
|
+
|
485
|
+
postlude
|
470
486
|
)
|