mittens 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +3 -3
  4. data/lib/mittens/version.rb +1 -1
  5. data/vendor/snowball/.github/workflows/ci.yml +216 -0
  6. data/vendor/snowball/CONTRIBUTING.rst +111 -62
  7. data/vendor/snowball/GNUmakefile +194 -136
  8. data/vendor/snowball/NEWS +798 -3
  9. data/vendor/snowball/README.rst +50 -1
  10. data/vendor/snowball/ada/src/stemmer.adb +25 -13
  11. data/vendor/snowball/ada/src/stemmer.ads +9 -9
  12. data/vendor/snowball/ada/stemmer_config.gpr +7 -7
  13. data/vendor/snowball/algorithms/basque.sbl +4 -19
  14. data/vendor/snowball/algorithms/catalan.sbl +2 -9
  15. data/vendor/snowball/algorithms/danish.sbl +1 -1
  16. data/vendor/snowball/algorithms/dutch.sbl +284 -122
  17. data/vendor/snowball/algorithms/dutch_porter.sbl +178 -0
  18. data/vendor/snowball/algorithms/english.sbl +52 -37
  19. data/vendor/snowball/algorithms/esperanto.sbl +157 -0
  20. data/vendor/snowball/algorithms/estonian.sbl +269 -0
  21. data/vendor/snowball/algorithms/finnish.sbl +2 -3
  22. data/vendor/snowball/algorithms/french.sbl +42 -16
  23. data/vendor/snowball/algorithms/german.sbl +35 -14
  24. data/vendor/snowball/algorithms/greek.sbl +76 -76
  25. data/vendor/snowball/algorithms/hungarian.sbl +8 -6
  26. data/vendor/snowball/algorithms/indonesian.sbl +14 -8
  27. data/vendor/snowball/algorithms/italian.sbl +11 -21
  28. data/vendor/snowball/algorithms/lithuanian.sbl +36 -37
  29. data/vendor/snowball/algorithms/lovins.sbl +0 -1
  30. data/vendor/snowball/algorithms/nepali.sbl +138 -37
  31. data/vendor/snowball/algorithms/norwegian.sbl +19 -5
  32. data/vendor/snowball/algorithms/porter.sbl +2 -2
  33. data/vendor/snowball/algorithms/portuguese.sbl +9 -13
  34. data/vendor/snowball/algorithms/romanian.sbl +17 -4
  35. data/vendor/snowball/algorithms/serbian.sbl +467 -468
  36. data/vendor/snowball/algorithms/spanish.sbl +5 -7
  37. data/vendor/snowball/algorithms/swedish.sbl +60 -6
  38. data/vendor/snowball/algorithms/tamil.sbl +207 -176
  39. data/vendor/snowball/algorithms/turkish.sbl +461 -445
  40. data/vendor/snowball/algorithms/yiddish.sbl +36 -38
  41. data/vendor/snowball/compiler/analyser.c +445 -192
  42. data/vendor/snowball/compiler/driver.c +109 -101
  43. data/vendor/snowball/compiler/generator.c +853 -464
  44. data/vendor/snowball/compiler/generator_ada.c +404 -366
  45. data/vendor/snowball/compiler/generator_csharp.c +297 -260
  46. data/vendor/snowball/compiler/generator_go.c +323 -254
  47. data/vendor/snowball/compiler/generator_java.c +326 -252
  48. data/vendor/snowball/compiler/generator_js.c +362 -252
  49. data/vendor/snowball/compiler/generator_pascal.c +349 -197
  50. data/vendor/snowball/compiler/generator_python.c +257 -240
  51. data/vendor/snowball/compiler/generator_rust.c +423 -251
  52. data/vendor/snowball/compiler/header.h +117 -71
  53. data/vendor/snowball/compiler/space.c +137 -68
  54. data/vendor/snowball/compiler/syswords.h +2 -2
  55. data/vendor/snowball/compiler/tokeniser.c +125 -107
  56. data/vendor/snowball/csharp/Snowball/Among.cs +14 -14
  57. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +7 -7
  58. data/vendor/snowball/csharp/Snowball/Stemmer.cs +57 -37
  59. data/vendor/snowball/csharp/Stemwords/App.config +2 -2
  60. data/vendor/snowball/csharp/Stemwords/Program.cs +16 -12
  61. data/vendor/snowball/doc/libstemmer_c_README +7 -4
  62. data/vendor/snowball/doc/libstemmer_csharp_README +4 -1
  63. data/vendor/snowball/doc/libstemmer_java_README +12 -1
  64. data/vendor/snowball/doc/libstemmer_js_README +6 -4
  65. data/vendor/snowball/doc/libstemmer_python_README +9 -4
  66. data/vendor/snowball/examples/stemwords.c +12 -12
  67. data/vendor/snowball/go/env.go +107 -31
  68. data/vendor/snowball/go/util.go +0 -4
  69. data/vendor/snowball/include/libstemmer.h +4 -0
  70. data/vendor/snowball/java/org/tartarus/snowball/Among.java +32 -15
  71. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +347 -261
  72. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +3 -0
  73. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +52 -37
  74. data/vendor/snowball/javascript/base-stemmer.js +186 -2
  75. data/vendor/snowball/javascript/stemwords.js +3 -6
  76. data/vendor/snowball/libstemmer/libstemmer_c.in +1 -1
  77. data/vendor/snowball/libstemmer/mkalgorithms.pl +6 -6
  78. data/vendor/snowball/libstemmer/mkmodules.pl +2 -2
  79. data/vendor/snowball/libstemmer/modules.txt +13 -10
  80. data/vendor/snowball/libstemmer/test.c +1 -1
  81. data/vendor/snowball/pascal/SnowballProgram.pas +84 -2
  82. data/vendor/snowball/pascal/generate.pl +13 -13
  83. data/vendor/snowball/python/create_init.py +4 -1
  84. data/vendor/snowball/python/setup.cfg +0 -3
  85. data/vendor/snowball/python/setup.py +8 -3
  86. data/vendor/snowball/python/snowballstemmer/basestemmer.py +20 -54
  87. data/vendor/snowball/python/stemwords.py +8 -12
  88. data/vendor/snowball/runtime/api.c +10 -5
  89. data/vendor/snowball/runtime/header.h +10 -9
  90. data/vendor/snowball/runtime/utilities.c +9 -9
  91. data/vendor/snowball/rust/build.rs +1 -1
  92. data/vendor/snowball/rust/src/snowball/snowball_env.rs +83 -5
  93. data/vendor/snowball/tests/stemtest.c +7 -4
  94. metadata +7 -7
  95. data/vendor/snowball/.travis.yml +0 -112
  96. data/vendor/snowball/algorithms/german2.sbl +0 -145
  97. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +0 -240
  98. data/vendor/snowball/compiler/syswords2.h +0 -13
@@ -1,470 +1,486 @@
1
1
  /* Stemmer for Turkish
2
- * author: Evren (Kapusuz) Çilden
3
- * email: evren.kapusuz at gmail.com
4
- * version: 1.0 (15.01.2007)
5
-
6
-
7
- * stems nominal verb suffixes
8
- * stems nominal inflections
9
- * more than one syllable word check
10
- * (y,n,s,U) context check
11
- * vowel harmony check
12
- * last consonant check and conversion (b, c, d, ğ to p, ç, t, k)
13
-
14
- * The stemming algorithm is based on the paper "An Affix Stripping
15
- * Morphological Analyzer for Turkish" by Gülşen Eryiğit and
16
- * Eşref Adalı (Proceedings of the IAESTED International Conference
17
- * ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18,2004,
18
- * Innsbruck, Austria
19
-
20
- * Turkish is an agglutinative language and has a very rich morphological
21
- * structure. In Turkish, you can form many different words from a single stem
22
- * by appending a sequence of suffixes. Eg. The word "doktoruymuşsunuz" means
23
- * "You had been the doctor of him". The stem of the word is "doktor" and it
24
- * takes three different suffixes -sU, -ymUs, and -sUnUz. The rules about
25
- * the append order of suffixes can be clearly described as FSMs.
26
- * The paper referenced above defines some FSMs for right to left
27
- * morphological analysis. I generated a method for constructing snowball
28
- * expressions from right to left FSMs for stemming suffixes.
29
- */
2
+ * author: Evren (Kapusuz) Çilden
3
+ * email: evren.kapusuz at gmail.com
4
+ *
5
+ * stems nominal verb suffixes
6
+ * stems nominal inflections
7
+ * more than one syllable word check
8
+ * (y,n,s,U) context check
9
+ * vowel harmony check
10
+ * last consonant check and conversion (b, c, d, ğ to p, ç, t, k)
11
+ *
12
+ * The stemming algorithm is based on the paper "An Affix Stripping
13
+ * Morphological Analyzer for Turkish" by Gülşen Eryiğit and
14
+ * Eşref Adalı (Proceedings of the IASTED International Conference
15
+ * ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18,2004,
16
+ * Innsbruck, Austria)
17
+ *
18
+ * Turkish is an agglutinative language and has a very rich morphological
19
+ * structure. In Turkish, you can form many different words from a single stem
20
+ * by appending a sequence of suffixes. Eg. The word "doktoruymuşsunuz" means
21
+ * "You had been the doctor of him". The stem of the word is "doktor" and it
22
+ * takes three different suffixes -sU, -ymUs, and -sUnUz. The rules about
23
+ * the append order of suffixes can be clearly described as FSMs.
24
+ * The paper referenced above defines some FSMs for right to left
25
+ * morphological analysis. I generated a method for constructing snowball
26
+ * expressions from right to left FSMs for stemming suffixes.
27
+ */
30
28
 
31
29
  routines (
32
- append_U_to_stems_ending_with_d_or_g // for preventing some overstemmings
33
- check_vowel_harmony // tests vowel harmony for suffixes
34
- is_reserved_word // tests whether current string is a reserved word ('ad','soyad')
35
- mark_cAsInA // nominal verb suffix
36
- mark_DA // noun suffix
37
- mark_DAn // noun suffix
38
- mark_DUr // nominal verb suffix
39
- mark_ki // noun suffix
40
- mark_lAr // noun suffix, nominal verb suffix
41
- mark_lArI // noun suffix
42
- mark_nA // noun suffix
43
- mark_ncA // noun suffix
44
- mark_ndA // noun suffix
45
- mark_ndAn // noun suffix
46
- mark_nU // noun suffix
47
- mark_nUn // noun suffix
48
- mark_nUz // nominal verb suffix
49
- mark_sU // noun suffix
50
- mark_sUn // nominal verb suffix
51
- mark_sUnUz // nominal verb suffix
52
- mark_possessives // -(U)m,-(U)n,-(U)mUz,-(U)nUz,
53
- mark_yA // noun suffix
54
- mark_ylA // noun suffix
55
- mark_yU // noun suffix
56
- mark_yUm // nominal verb suffix
57
- mark_yUz // nominal verb suffix
58
- mark_yDU // nominal verb suffix
59
- mark_yken // nominal verb suffix
60
- mark_ymUs_ // nominal verb suffix
61
- mark_ysA // nominal verb suffix
62
-
63
- mark_suffix_with_optional_y_consonant
64
- mark_suffix_with_optional_U_vowel
65
- mark_suffix_with_optional_n_consonant
66
- mark_suffix_with_optional_s_consonant
67
-
68
- more_than_one_syllable_word
69
-
70
- post_process_last_consonants
71
- postlude
72
-
73
- stem_nominal_verb_suffixes
74
- stem_noun_suffixes
75
- stem_suffix_chain_before_ki
30
+ append_U_to_stems_ending_with_d_or_g // for preventing some overstemmings
31
+ check_vowel_harmony // tests vowel harmony for suffixes
32
+ is_reserved_word // tests whether current string is a reserved word ('ad','soyad')
33
+ mark_cAsInA // nominal verb suffix
34
+ mark_DA // noun suffix
35
+ mark_DAn // noun suffix
36
+ mark_DUr // nominal verb suffix
37
+ mark_ki // noun suffix
38
+ mark_lAr // noun suffix, nominal verb suffix
39
+ mark_lArI // noun suffix
40
+ mark_nA // noun suffix
41
+ mark_ncA // noun suffix
42
+ mark_ndA // noun suffix
43
+ mark_ndAn // noun suffix
44
+ mark_nU // noun suffix
45
+ mark_nUn // noun suffix
46
+ mark_nUz // nominal verb suffix
47
+ mark_sU // noun suffix
48
+ mark_sUn // nominal verb suffix
49
+ mark_sUnUz // nominal verb suffix
50
+ mark_possessives // -(U)m,-(U)n,-(U)mUz,-(U)nUz,
51
+ mark_yA // noun suffix
52
+ mark_ylA // noun suffix
53
+ mark_yU // noun suffix
54
+ mark_yUm // nominal verb suffix
55
+ mark_yUz // nominal verb suffix
56
+ mark_yDU // nominal verb suffix
57
+ mark_yken // nominal verb suffix
58
+ mark_ymUs_ // nominal verb suffix
59
+ mark_ysA // nominal verb suffix
60
+
61
+ mark_suffix_with_optional_y_consonant
62
+ mark_suffix_with_optional_U_vowel
63
+ mark_suffix_with_optional_n_consonant
64
+ mark_suffix_with_optional_s_consonant
65
+
66
+ more_than_one_syllable_word
67
+
68
+ post_process_last_consonants
69
+ postlude
70
+
71
+ remove_proper_noun_suffix
72
+
73
+ stem_nominal_verb_suffixes
74
+ stem_noun_suffixes
75
+ stem_suffix_chain_before_ki
76
76
  )
77
77
 
78
- stringescapes { }
78
+ stringescapes { }
79
79
 
80
80
  /* Special characters in Unicode Latin-1 and Latin Extended-A */
81
- stringdef c, '{U+00E7}' // LATIN SMALL LETTER C WITH CEDILLA
82
- stringdef g~ '{U+011F}' // LATIN SMALL LETTER G WITH BREVE
83
- stringdef i' '{U+0131}' // LATIN SMALL LETTER I WITHOUT DOT
84
- stringdef o" '{U+00F6}' // LATIN SMALL LETTER O WITH DIAERESIS
85
- stringdef s, '{U+015F}' // LATIN SMALL LETTER S WITH CEDILLA
86
- stringdef u" '{U+00FC}' // LATIN SMALL LETTER U WITH DIAERESIS
81
+ stringdef cc '{U+00E7}' // LATIN SMALL LETTER C WITH CEDILLA
82
+ stringdef g~ '{U+011F}' // LATIN SMALL LETTER G WITH BREVE
83
+ stringdef i '{U+0131}' // LATIN SMALL LETTER I WITHOUT DOT
84
+ stringdef o" '{U+00F6}' // LATIN SMALL LETTER O WITH DIAERESIS
85
+ stringdef sc '{U+015F}' // LATIN SMALL LETTER S WITH CEDILLA
86
+ stringdef u" '{U+00FC}' // LATIN SMALL LETTER U WITH DIAERESIS
87
87
 
88
- booleans ( continue_stemming_noun_suffixes )
88
+ booleans ( continue_stemming_noun_suffixes )
89
89
 
90
- groupings ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6)
90
+ groupings ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6 )
91
91
 
92
- define vowel 'ae{i'}io{o"}u{u"}'
93
- define U '{i'}iu{u"}'
92
+ define vowel 'ae{i}io{o"}u{u"}'
93
+ define U '{i}iu{u"}'
94
94
 
95
95
  // the vowel grouping definitions below are used for checking vowel harmony
96
- define vowel1 'a{i'}ou' // vowels that can end with suffixes containing 'a'
97
- define vowel2 'ei{o"}{u"}' // vowels that can end with suffixes containing 'e'
98
- define vowel3 'a{i'}' // vowels that can end with suffixes containing 'i''
99
- define vowel4 'ei' // vowels that can end with suffixes containing 'i'
100
- define vowel5 'ou' // vowels that can end with suffixes containing 'o' or 'u'
101
- define vowel6 '{o"}{u"}' // vowels that can end with suffixes containing 'o"' or 'u"'
96
+ define vowel1 'a{i}ou' // vowels that can end with suffixes containing 'a'
97
+ define vowel2 'ei{o"}{u"}' // vowels that can end with suffixes containing 'e'
98
+ define vowel3 'a{i}' // vowels that can end with suffixes containing '{i}'
99
+ define vowel4 'ei' // vowels that can end with suffixes containing 'i'
100
+ define vowel5 'ou' // vowels that can end with suffixes containing 'o' or 'u'
101
+ define vowel6 '{o"}{u"}' // vowels that can end with suffixes containing '{o"}' or '{u"}'
102
102
 
103
- externals ( stem )
103
+ externals ( stem )
104
104
 
105
105
  backwardmode (
106
- // checks vowel harmony for possible suffixes,
107
- // helps to detect whether the candidate for suffix applies to vowel harmony
108
- // this rule is added to prevent over stemming
109
- define check_vowel_harmony as (
110
- test
111
- (
112
- (goto vowel) // if there is a vowel
113
- (
114
- ('a' goto vowel1) or
115
- ('e' goto vowel2) or
116
- ('{i'}' goto vowel3) or
117
- ('i' goto vowel4) or
118
- ('o' goto vowel5) or
119
- ('{o"}' goto vowel6) or
120
- ('u' goto vowel5) or
121
- ('{u"}' goto vowel6)
122
- )
123
- )
124
- )
125
-
126
- // if the last consonant before suffix is vowel and n then advance and delete
127
- // if the last consonant before suffix is non vowel and n do nothing
128
- // if the last consonant before suffix is not n then only delete the suffix
129
- // assumption: slice beginning is set correctly
130
- define mark_suffix_with_optional_n_consonant as (
131
- ('n' (test vowel))
132
- or
133
- ((not(test 'n')) test(next vowel))
134
-
135
- )
136
-
137
- // if the last consonant before suffix is vowel and s then advance and delete
138
- // if the last consonant before suffix is non vowel and s do nothing
139
- // if the last consonant before suffix is not s then only delete the suffix
140
- // assumption: slice beginning is set correctly
141
- define mark_suffix_with_optional_s_consonant as (
142
- ('s' (test vowel))
143
- or
144
- ((not(test 's')) test(next vowel))
145
- )
146
-
147
- // if the last consonant before suffix is vowel and y then advance and delete
148
- // if the last consonant before suffix is non vowel and y do nothing
149
- // if the last consonant before suffix is not y then only delete the suffix
150
- // assumption: slice beginning is set correctly
151
- define mark_suffix_with_optional_y_consonant as (
152
- ('y' (test vowel))
153
- or
154
- ((not(test 'y')) test(next vowel))
155
- )
156
-
157
- define mark_suffix_with_optional_U_vowel as (
158
- (U (test non-vowel))
159
- or
160
- ((not(test U)) test(next non-vowel))
161
-
162
- )
163
-
164
- define mark_possessives as (
165
- among ('m{i'}z' 'miz' 'muz' 'm{u"}z'
166
- 'n{i'}z' 'niz' 'nuz' 'n{u"}z' 'm' 'n')
167
- (mark_suffix_with_optional_U_vowel)
168
- )
169
-
170
- define mark_sU as (
171
- check_vowel_harmony
172
- U
173
- (mark_suffix_with_optional_s_consonant)
174
- )
175
-
176
- define mark_lArI as (
177
- among ('leri' 'lar{i'}')
178
- )
179
-
180
- define mark_yU as (
181
- check_vowel_harmony
182
- U
183
- (mark_suffix_with_optional_y_consonant)
184
- )
185
-
186
- define mark_nU as (
187
- check_vowel_harmony
188
- among ('n{i'}' 'ni' 'nu' 'n{u"}')
189
- )
190
-
191
- define mark_nUn as (
192
- check_vowel_harmony
193
- among ('{i'}n' 'in' 'un' '{u"}n')
194
- (mark_suffix_with_optional_n_consonant)
195
- )
196
-
197
- define mark_yA as (
198
- check_vowel_harmony
199
- among('a' 'e')
200
- (mark_suffix_with_optional_y_consonant)
201
- )
202
-
203
- define mark_nA as (
204
- check_vowel_harmony
205
- among('na' 'ne')
206
- )
207
-
208
- define mark_DA as (
209
- check_vowel_harmony
210
- among('da' 'de' 'ta' 'te')
211
- )
212
-
213
- define mark_ndA as (
214
- check_vowel_harmony
215
- among('nda' 'nde')
216
- )
217
-
218
- define mark_DAn as (
219
- check_vowel_harmony
220
- among('dan' 'den' 'tan' 'ten')
221
- )
222
-
223
- define mark_ndAn as (
224
- check_vowel_harmony
225
- among('ndan' 'nden')
226
- )
227
-
228
- define mark_ylA as (
229
- check_vowel_harmony
230
- among('la' 'le')
231
- (mark_suffix_with_optional_y_consonant)
232
- )
233
-
234
- define mark_ki as (
235
- 'ki'
236
- )
237
-
238
- define mark_ncA as (
239
- check_vowel_harmony
240
- among('ca' 'ce')
241
- (mark_suffix_with_optional_n_consonant)
242
- )
243
-
244
- define mark_yUm as (
245
- check_vowel_harmony
246
- among ('{i'}m' 'im' 'um' '{u"}m')
247
- (mark_suffix_with_optional_y_consonant)
248
- )
249
-
250
- define mark_sUn as (
251
- check_vowel_harmony
252
- among ('s{i'}n' 'sin' 'sun' 's{u"}n' )
253
- )
254
-
255
- define mark_yUz as (
256
- check_vowel_harmony
257
- among ('{i'}z' 'iz' 'uz' '{u"}z')
258
- (mark_suffix_with_optional_y_consonant)
259
- )
260
-
261
- define mark_sUnUz as (
262
- among ('s{i'}n{i'}z' 'siniz' 'sunuz' 's{u"}n{u"}z')
263
- )
264
-
265
- define mark_lAr as (
266
- check_vowel_harmony
267
- among ('ler' 'lar')
268
- )
269
-
270
- define mark_nUz as (
271
- check_vowel_harmony
272
- among ('n{i'}z' 'niz' 'nuz' 'n{u"}z')
273
- )
274
-
275
- define mark_DUr as (
276
- check_vowel_harmony
277
- among ('t{i'}r' 'tir' 'tur' 't{u"}r' 'd{i'}r' 'dir' 'dur' 'd{u"}r')
278
- )
279
-
280
- define mark_cAsInA as (
281
- among ('cas{i'}na' 'cesine')
282
- )
283
-
284
- define mark_yDU as (
285
- check_vowel_harmony
286
- among ('t{i'}m' 'tim' 'tum' 't{u"}m' 'd{i'}m' 'dim' 'dum' 'd{u"}m'
287
- 't{i'}n' 'tin' 'tun' 't{u"}n' 'd{i'}n' 'din' 'dun' 'd{u"}n'
288
- 't{i'}k' 'tik' 'tuk' 't{u"}k' 'd{i'}k' 'dik' 'duk' 'd{u"}k'
289
- 't{i'}' 'ti' 'tu' 't{u"}' 'd{i'}' 'di' 'du' 'd{u"}')
290
- (mark_suffix_with_optional_y_consonant)
291
- )
292
-
293
- // does not fully obey vowel harmony
294
- define mark_ysA as (
295
- among ('sam' 'san' 'sak' 'sem' 'sen' 'sek' 'sa' 'se')
296
- (mark_suffix_with_optional_y_consonant)
297
- )
298
-
299
- define mark_ymUs_ as (
300
- check_vowel_harmony
301
- among ('m{i'}{s,}' 'mi{s,}' 'mu{s,}' 'm{u"}{s,}')
302
- (mark_suffix_with_optional_y_consonant)
303
- )
304
-
305
- define mark_yken as (
306
- 'ken' (mark_suffix_with_optional_y_consonant)
307
- )
308
-
309
- define stem_nominal_verb_suffixes as (
310
- [
311
- set continue_stemming_noun_suffixes
312
- (mark_ymUs_ or mark_yDU or mark_ysA or mark_yken)
313
- or
314
- (mark_cAsInA (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_)
315
- or
316
- (
317
- mark_lAr ] delete try([(mark_DUr or mark_yDU or mark_ysA or mark_ymUs_))
318
- unset continue_stemming_noun_suffixes
319
- )
320
- or
321
- (mark_nUz (mark_yDU or mark_ysA))
322
- or
323
- ((mark_sUnUz or mark_yUz or mark_sUn or mark_yUm) ] delete try([ mark_ymUs_))
324
- or
325
- (mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_))
326
- ]delete
327
- )
328
-
329
- // stems noun suffix chains ending with -ki
330
- define stem_suffix_chain_before_ki as (
331
- [
332
- mark_ki
333
- (
334
- (mark_DA] delete try([
335
- (mark_lAr] delete try(stem_suffix_chain_before_ki))
336
- or
337
- (mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
338
-
339
- ))
340
- or
341
- (mark_nUn] delete try([
342
- (mark_lArI] delete)
343
- or
344
- ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
345
- or
346
- (stem_suffix_chain_before_ki)
347
- ))
348
- or
349
- (mark_ndA (
350
- (mark_lArI] delete)
351
- or
352
- ((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki)))
353
- or
354
- (stem_suffix_chain_before_ki)
355
- ))
356
- )
357
- )
358
-
359
- define stem_noun_suffixes as (
360
- ([mark_lAr] delete try(stem_suffix_chain_before_ki))
361
- or
362
- ([mark_ncA] delete
363
- try(
364
- ([mark_lArI] delete)
365
- or
366
- ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
367
- or
368
- ([mark_lAr] delete stem_suffix_chain_before_ki)
369
- )
370
- )
371
- or
372
- ([(mark_ndA or mark_nA)
373
- (
374
- (mark_lArI] delete)
375
- or
376
- (mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
377
- or
378
- (stem_suffix_chain_before_ki)
379
- )
380
- )
381
- or
382
- ([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI)))
383
- or
384
- ( [mark_DAn] delete try ([
385
- (
386
- (mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
387
- or
388
- (mark_lAr] delete try(stem_suffix_chain_before_ki))
389
- or
390
- (stem_suffix_chain_before_ki)
391
- ))
392
- )
393
- or
394
- ([mark_nUn or mark_ylA] delete
395
- try(
396
- ([mark_lAr] delete stem_suffix_chain_before_ki)
397
- or
398
- ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
399
- or
400
- stem_suffix_chain_before_ki
401
- )
402
- )
403
- or
404
- ([mark_lArI] delete)
405
- or
406
- (stem_suffix_chain_before_ki)
407
- or
408
- ([mark_DA or mark_yU or mark_yA] delete try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki))
409
- or
410
- ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
411
- )
412
-
413
- define post_process_last_consonants as (
414
- [substring] among (
415
- 'b' (<- 'p')
416
- 'c' (<- '{c,}')
417
- 'd' (<- 't')
418
- '{g~}' (<- 'k')
419
- )
420
- )
421
-
422
- // after stemming if the word ends with 'd' or 'g' most probably last U is overstemmed
423
- // like in 'kedim' -> 'ked'
424
- // Turkish words don't usually end with 'd' or 'g'
425
- // some very well known words are ignored (like 'ad' 'soyad'
426
- // appends U to stems ending with d or g, decides which vowel to add
427
- // based on the last vowel in the stem
428
- define append_U_to_stems_ending_with_d_or_g as (
429
- test('d' or 'g')
430
- (test((goto vowel) 'a' or '{i'}') <+ '{i'}')
431
- or
432
- (test((goto vowel) 'e' or 'i') <+ 'i')
433
- or
434
- (test((goto vowel) 'o' or 'u') <+ 'u')
435
- or
436
- (test((goto vowel) '{o"}' or '{u"}') <+ '{u"}')
437
- )
438
-
439
- define is_reserved_word as (
440
- 'ad' try 'soy' atlimit
441
- )
106
+ // checks vowel harmony for possible suffixes,
107
+ // helps to detect whether the candidate for suffix applies to vowel harmony
108
+ // this rule is added to prevent over stemming
109
+ define check_vowel_harmony as (
110
+ test
111
+ (
112
+ (goto vowel) // if there is a vowel
113
+ (
114
+ ('a' goto vowel1) or
115
+ ('e' goto vowel2) or
116
+ ('{i}' goto vowel3) or
117
+ ('i' goto vowel4) or
118
+ ('o' goto vowel5) or
119
+ ('{o"}' goto vowel6) or
120
+ ('u' goto vowel5) or
121
+ ('{u"}' goto vowel6)
122
+ )
123
+ )
124
+ )
125
+
126
+ // if the last consonant before suffix is vowel and n then advance and delete
127
+ // if the last consonant before suffix is non vowel and n do nothing
128
+ // if the last consonant before suffix is not n then only delete the suffix
129
+ // assumption: slice beginning is set correctly
130
+ define mark_suffix_with_optional_n_consonant as (
131
+ ('n' (test vowel))
132
+ or
133
+ ((not(test 'n')) test(next vowel))
134
+ )
135
+
136
+ // if the last consonant before suffix is vowel and s then advance and delete
137
+ // if the last consonant before suffix is non vowel and s do nothing
138
+ // if the last consonant before suffix is not s then only delete the suffix
139
+ // assumption: slice beginning is set correctly
140
+ define mark_suffix_with_optional_s_consonant as (
141
+ ('s' (test vowel))
142
+ or
143
+ ((not(test 's')) test(next vowel))
144
+ )
145
+
146
+ // if the last consonant before suffix is vowel and y then advance and delete
147
+ // if the last consonant before suffix is non vowel and y do nothing
148
+ // if the last consonant before suffix is not y then only delete the suffix
149
+ // assumption: slice beginning is set correctly
150
+ define mark_suffix_with_optional_y_consonant as (
151
+ ('y' (test vowel))
152
+ or
153
+ ((not(test 'y')) test(next vowel))
154
+ )
155
+
156
+ define mark_suffix_with_optional_U_vowel as (
157
+ (U (test non-vowel))
158
+ or
159
+ ((not(test U)) test(next non-vowel))
160
+ )
161
+
162
+ define mark_possessives as (
163
+ among ('m{i}z' 'miz' 'muz' 'm{u"}z'
164
+ 'n{i}z' 'niz' 'nuz' 'n{u"}z' 'm' 'n')
165
+ (mark_suffix_with_optional_U_vowel)
166
+ )
167
+
168
+ define mark_sU as (
169
+ check_vowel_harmony
170
+ U
171
+ (mark_suffix_with_optional_s_consonant)
172
+ )
173
+
174
+ define mark_lArI as (
175
+ among ('leri' 'lar{i}')
176
+ )
177
+
178
+ define mark_yU as (
179
+ check_vowel_harmony
180
+ U
181
+ (mark_suffix_with_optional_y_consonant)
182
+ )
183
+
184
+ define mark_nU as (
185
+ check_vowel_harmony
186
+ among ('n{i}' 'ni' 'nu' 'n{u"}')
187
+ )
188
+
189
+ define mark_nUn as (
190
+ check_vowel_harmony
191
+ among ('{i}n' 'in' 'un' '{u"}n')
192
+ (mark_suffix_with_optional_n_consonant)
193
+ )
194
+
195
+ define mark_yA as (
196
+ check_vowel_harmony
197
+ among('a' 'e')
198
+ (mark_suffix_with_optional_y_consonant)
199
+ )
200
+
201
+ define mark_nA as (
202
+ check_vowel_harmony
203
+ among('na' 'ne')
204
+ )
205
+
206
+ define mark_DA as (
207
+ check_vowel_harmony
208
+ among('da' 'de' 'ta' 'te')
209
+ )
210
+
211
+ define mark_ndA as (
212
+ check_vowel_harmony
213
+ among('nda' 'nde')
214
+ )
215
+
216
+ define mark_DAn as (
217
+ check_vowel_harmony
218
+ among('dan' 'den' 'tan' 'ten')
219
+ )
220
+
221
+ define mark_ndAn as (
222
+ check_vowel_harmony
223
+ among('ndan' 'nden')
224
+ )
225
+
226
+ define mark_ylA as (
227
+ check_vowel_harmony
228
+ among('la' 'le')
229
+ (mark_suffix_with_optional_y_consonant)
230
+ )
231
+
232
+ define mark_ki as (
233
+ 'ki'
234
+ )
235
+
236
+ define mark_ncA as (
237
+ check_vowel_harmony
238
+ among('ca' 'ce')
239
+ (mark_suffix_with_optional_n_consonant)
240
+ )
241
+
242
+ define mark_yUm as (
243
+ check_vowel_harmony
244
+ among ('{i}m' 'im' 'um' '{u"}m')
245
+ (mark_suffix_with_optional_y_consonant)
246
+ )
247
+
248
+ define mark_sUn as (
249
+ check_vowel_harmony
250
+ among ('s{i}n' 'sin' 'sun' 's{u"}n' )
251
+ )
252
+
253
+ define mark_yUz as (
254
+ check_vowel_harmony
255
+ among ('{i}z' 'iz' 'uz' '{u"}z')
256
+ (mark_suffix_with_optional_y_consonant)
257
+ )
258
+
259
+ define mark_sUnUz as (
260
+ among ('s{i}n{i}z' 'siniz' 'sunuz' 's{u"}n{u"}z')
261
+ )
262
+
263
+ define mark_lAr as (
264
+ check_vowel_harmony
265
+ among ('ler' 'lar')
266
+ )
267
+
268
+ define mark_nUz as (
269
+ check_vowel_harmony
270
+ among ('n{i}z' 'niz' 'nuz' 'n{u"}z')
271
+ )
272
+
273
+ define mark_DUr as (
274
+ check_vowel_harmony
275
+ among ('t{i}r' 'tir' 'tur' 't{u"}r' 'd{i}r' 'dir' 'dur' 'd{u"}r')
276
+ )
277
+
278
+ define mark_cAsInA as (
279
+ among ('cas{i}na' 'cesine')
280
+ )
281
+
282
+ define mark_yDU as (
283
+ check_vowel_harmony
284
+ among ('t{i}m' 'tim' 'tum' 't{u"}m' 'd{i}m' 'dim' 'dum' 'd{u"}m'
285
+ 't{i}n' 'tin' 'tun' 't{u"}n' 'd{i}n' 'din' 'dun' 'd{u"}n'
286
+ 't{i}k' 'tik' 'tuk' 't{u"}k' 'd{i}k' 'dik' 'duk' 'd{u"}k'
287
+ 't{i}' 'ti' 'tu' 't{u"}' 'd{i}' 'di' 'du' 'd{u"}')
288
+ (mark_suffix_with_optional_y_consonant)
289
+ )
290
+
291
+ // does not fully obey vowel harmony
292
+ define mark_ysA as (
293
+ among ('sam' 'san' 'sak' 'sem' 'sen' 'sek' 'sa' 'se')
294
+ (mark_suffix_with_optional_y_consonant)
295
+ )
296
+
297
+ define mark_ymUs_ as (
298
+ check_vowel_harmony
299
+ among ('m{i}{sc}' 'mi{sc}' 'mu{sc}' 'm{u"}{sc}')
300
+ (mark_suffix_with_optional_y_consonant)
301
+ )
302
+
303
+ define mark_yken as (
304
+ 'ken' (mark_suffix_with_optional_y_consonant)
305
+ )
306
+
307
+ define stem_nominal_verb_suffixes as (
308
+ [
309
+ set continue_stemming_noun_suffixes
310
+ (mark_ymUs_ or mark_yDU or mark_ysA or mark_yken)
311
+ or
312
+ (mark_cAsInA (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_)
313
+ or
314
+ (
315
+ mark_lAr ] delete try([(mark_DUr or mark_yDU or mark_ysA or mark_ymUs_))
316
+ unset continue_stemming_noun_suffixes
317
+ )
318
+ or
319
+ (mark_nUz (mark_yDU or mark_ysA))
320
+ or
321
+ ((mark_sUnUz or mark_yUz or mark_sUn or mark_yUm) ] delete try([ mark_ymUs_))
322
+ or
323
+ (mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_))
324
+ ]delete
325
+ )
326
+
327
+ // stems noun suffix chains ending with -ki
328
+ define stem_suffix_chain_before_ki as (
329
+ [
330
+ mark_ki
331
+ (
332
+ (mark_DA] delete try([
333
+ (mark_lAr] delete try(stem_suffix_chain_before_ki))
334
+ or
335
+ (mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
336
+
337
+ ))
338
+ or
339
+ (mark_nUn] delete try([
340
+ (mark_lArI] delete)
341
+ or
342
+ ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
343
+ or
344
+ (stem_suffix_chain_before_ki)
345
+ ))
346
+ or
347
+ (mark_ndA (
348
+ (mark_lArI] delete)
349
+ or
350
+ ((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki)))
351
+ or
352
+ (stem_suffix_chain_before_ki)
353
+ ))
354
+ )
355
+ )
356
+
357
+ define stem_noun_suffixes as (
358
+ ([mark_lAr] delete try(stem_suffix_chain_before_ki))
359
+ or
360
+ ([mark_ncA] delete
361
+ try(
362
+ ([mark_lArI] delete)
363
+ or
364
+ ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
365
+ or
366
+ ([mark_lAr] delete stem_suffix_chain_before_ki)
367
+ )
368
+ )
369
+ or
370
+ ([(mark_ndA or mark_nA)
371
+ (
372
+ (mark_lArI] delete)
373
+ or
374
+ (mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
375
+ or
376
+ (stem_suffix_chain_before_ki)
377
+ )
378
+ )
379
+ or
380
+ ([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI)))
381
+ or
382
+ ( [mark_DAn] delete try ([
383
+ (
384
+ (mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
385
+ or
386
+ (mark_lAr] delete try(stem_suffix_chain_before_ki))
387
+ or
388
+ (stem_suffix_chain_before_ki)
389
+ ))
390
+ )
391
+ or
392
+ ([mark_nUn or mark_ylA] delete
393
+ try(
394
+ ([mark_lAr] delete stem_suffix_chain_before_ki)
395
+ or
396
+ ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
397
+ or
398
+ stem_suffix_chain_before_ki
399
+ )
400
+ )
401
+ or
402
+ ([mark_lArI] delete)
403
+ or
404
+ (stem_suffix_chain_before_ki)
405
+ or
406
+ ([mark_DA or mark_yU or mark_yA] delete try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki))
407
+ or
408
+ ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
409
+ )
410
+
411
+ define post_process_last_consonants as (
412
+ [substring] among (
413
+ 'b' (<- 'p')
414
+ 'c' (<- '{cc}')
415
+ 'd' (<- 't')
416
+ '{g~}' (<- 'k')
417
+ )
418
+ )
419
+
420
+ // after stemming if the word ends with 'd' or 'g' most probably last U is
421
+ // overstemmed like in 'kedim' -> 'ked'
422
+ // Turkish words don't usually end with 'd' or 'g'
423
+ // some very well known words are ignored (like 'ad' and 'soyad')
424
+ // appends U to stems ending with d or g, decides which vowel to add
425
+ // based on the last vowel in the stem
426
+ define append_U_to_stems_ending_with_d_or_g as (
427
+ [] ('d' or 'g') goto vowel
428
+
429
+ (('a' or '{i}') <- '{i}')
430
+ or
431
+ (('e' or 'i') <- 'i')
432
+ or
433
+ (('o' or 'u') <- 'u')
434
+ or
435
+ (('{o"}' or '{u"}') <- '{u"}')
436
+ )
437
+
438
+ define is_reserved_word as (
439
+ 'ad' try 'soy' atlimit
440
+ )
442
441
  )
443
442
 
444
- // Tests if there are more than one syllables
445
- // In Turkish each vowel indicates a distinct syllable
443
+ define remove_proper_noun_suffix as (
444
+ // Remove any leading apostrophes (e.g. from tokenisation of single-quoted
445
+ // text).
446
+ do ([goto not '{'}'] delete)
447
+
448
+ // https://en.wikipedia.org/wiki/Turkish_language says "In modern
449
+ // Turkish orthography, an apostrophe is used to separate proper names
450
+ // from any suffixes" with the example "Türkiye'dir ("it is Turkey")".
451
+ // Therefore we truncate at the first apostrophe, provided there are at least
452
+ // two characters before it (which avoids adversely affecting some foreign
453
+ // names and words such as "o'connor", "l'entrée").
454
+ do (
455
+ hop 2
456
+ goto '{'}' [ tolimit ] delete
457
+ )
458
+ )
459
+
460
+ // Test if there is more than one syllable.
461
+ // In Turkish each vowel indicates a distinct syllable.
446
462
  define more_than_one_syllable_word as (
447
- test (loop 2 gopast vowel)
463
+ test (loop 2 gopast vowel)
448
464
  )
449
465
 
450
466
  define postlude as (
451
- backwards (
452
- not(is_reserved_word)
453
- do append_U_to_stems_ending_with_d_or_g
454
- do post_process_last_consonants
455
-
456
- )
467
+ backwards (
468
+ not is_reserved_word
469
+ do append_U_to_stems_ending_with_d_or_g
470
+ do post_process_last_consonants
471
+ )
457
472
  )
458
473
 
459
474
  define stem as (
460
- (more_than_one_syllable_word)
461
- (
462
- backwards (
463
- do stem_nominal_verb_suffixes
464
- continue_stemming_noun_suffixes
465
- do stem_noun_suffixes
466
- )
467
-
468
- postlude
469
- )
475
+ do remove_proper_noun_suffix
476
+
477
+ more_than_one_syllable_word
478
+
479
+ backwards (
480
+ do stem_nominal_verb_suffixes
481
+ continue_stemming_noun_suffixes
482
+ do stem_noun_suffixes
483
+ )
484
+
485
+ postlude
470
486
  )