mittens 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE.txt +30 -0
  5. data/README.md +62 -0
  6. data/Rakefile +21 -0
  7. data/ext/mittens/ext.c +96 -0
  8. data/ext/mittens/extconf.rb +12 -0
  9. data/lib/mittens/version.rb +3 -0
  10. data/lib/mittens.rb +7 -0
  11. data/mittens.gemspec +22 -0
  12. data/vendor/snowball/.gitignore +26 -0
  13. data/vendor/snowball/.travis.yml +112 -0
  14. data/vendor/snowball/AUTHORS +27 -0
  15. data/vendor/snowball/CONTRIBUTING.rst +216 -0
  16. data/vendor/snowball/COPYING +29 -0
  17. data/vendor/snowball/GNUmakefile +742 -0
  18. data/vendor/snowball/NEWS +754 -0
  19. data/vendor/snowball/README.rst +37 -0
  20. data/vendor/snowball/ada/README.md +74 -0
  21. data/vendor/snowball/ada/generate/generate.adb +83 -0
  22. data/vendor/snowball/ada/generate.gpr +21 -0
  23. data/vendor/snowball/ada/src/stemmer.adb +620 -0
  24. data/vendor/snowball/ada/src/stemmer.ads +219 -0
  25. data/vendor/snowball/ada/src/stemwords.adb +70 -0
  26. data/vendor/snowball/ada/stemmer_config.gpr +83 -0
  27. data/vendor/snowball/ada/stemwords.gpr +21 -0
  28. data/vendor/snowball/algorithms/arabic.sbl +558 -0
  29. data/vendor/snowball/algorithms/armenian.sbl +301 -0
  30. data/vendor/snowball/algorithms/basque.sbl +149 -0
  31. data/vendor/snowball/algorithms/catalan.sbl +202 -0
  32. data/vendor/snowball/algorithms/danish.sbl +93 -0
  33. data/vendor/snowball/algorithms/dutch.sbl +164 -0
  34. data/vendor/snowball/algorithms/english.sbl +229 -0
  35. data/vendor/snowball/algorithms/finnish.sbl +197 -0
  36. data/vendor/snowball/algorithms/french.sbl +254 -0
  37. data/vendor/snowball/algorithms/german.sbl +139 -0
  38. data/vendor/snowball/algorithms/german2.sbl +145 -0
  39. data/vendor/snowball/algorithms/greek.sbl +701 -0
  40. data/vendor/snowball/algorithms/hindi.sbl +323 -0
  41. data/vendor/snowball/algorithms/hungarian.sbl +241 -0
  42. data/vendor/snowball/algorithms/indonesian.sbl +192 -0
  43. data/vendor/snowball/algorithms/irish.sbl +149 -0
  44. data/vendor/snowball/algorithms/italian.sbl +202 -0
  45. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
  46. data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
  47. data/vendor/snowball/algorithms/lovins.sbl +208 -0
  48. data/vendor/snowball/algorithms/nepali.sbl +92 -0
  49. data/vendor/snowball/algorithms/norwegian.sbl +80 -0
  50. data/vendor/snowball/algorithms/porter.sbl +139 -0
  51. data/vendor/snowball/algorithms/portuguese.sbl +218 -0
  52. data/vendor/snowball/algorithms/romanian.sbl +236 -0
  53. data/vendor/snowball/algorithms/russian.sbl +221 -0
  54. data/vendor/snowball/algorithms/serbian.sbl +2379 -0
  55. data/vendor/snowball/algorithms/spanish.sbl +230 -0
  56. data/vendor/snowball/algorithms/swedish.sbl +72 -0
  57. data/vendor/snowball/algorithms/tamil.sbl +405 -0
  58. data/vendor/snowball/algorithms/turkish.sbl +470 -0
  59. data/vendor/snowball/algorithms/yiddish.sbl +460 -0
  60. data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
  61. data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
  62. data/vendor/snowball/charsets/cp850.sbl +130 -0
  63. data/vendor/snowball/compiler/analyser.c +1547 -0
  64. data/vendor/snowball/compiler/driver.c +615 -0
  65. data/vendor/snowball/compiler/generator.c +1748 -0
  66. data/vendor/snowball/compiler/generator_ada.c +1702 -0
  67. data/vendor/snowball/compiler/generator_csharp.c +1322 -0
  68. data/vendor/snowball/compiler/generator_go.c +1278 -0
  69. data/vendor/snowball/compiler/generator_java.c +1313 -0
  70. data/vendor/snowball/compiler/generator_js.c +1316 -0
  71. data/vendor/snowball/compiler/generator_pascal.c +1387 -0
  72. data/vendor/snowball/compiler/generator_python.c +1337 -0
  73. data/vendor/snowball/compiler/generator_rust.c +1295 -0
  74. data/vendor/snowball/compiler/header.h +418 -0
  75. data/vendor/snowball/compiler/space.c +286 -0
  76. data/vendor/snowball/compiler/syswords.h +86 -0
  77. data/vendor/snowball/compiler/syswords2.h +13 -0
  78. data/vendor/snowball/compiler/tokeniser.c +567 -0
  79. data/vendor/snowball/csharp/.gitignore +8 -0
  80. data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
  81. data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
  82. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
  83. data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
  84. data/vendor/snowball/csharp/Stemwords/App.config +6 -0
  85. data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
  86. data/vendor/snowball/doc/TODO +12 -0
  87. data/vendor/snowball/doc/libstemmer_c_README +148 -0
  88. data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
  89. data/vendor/snowball/doc/libstemmer_java_README +67 -0
  90. data/vendor/snowball/doc/libstemmer_js_README +48 -0
  91. data/vendor/snowball/doc/libstemmer_python_README +113 -0
  92. data/vendor/snowball/examples/stemwords.c +204 -0
  93. data/vendor/snowball/go/README.md +55 -0
  94. data/vendor/snowball/go/among.go +16 -0
  95. data/vendor/snowball/go/env.go +403 -0
  96. data/vendor/snowball/go/stemwords/generate.go +68 -0
  97. data/vendor/snowball/go/stemwords/main.go +68 -0
  98. data/vendor/snowball/go/util.go +34 -0
  99. data/vendor/snowball/iconv.py +50 -0
  100. data/vendor/snowball/include/libstemmer.h +78 -0
  101. data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
  102. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
  103. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
  104. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
  105. data/vendor/snowball/javascript/base-stemmer.js +294 -0
  106. data/vendor/snowball/javascript/stemwords.js +106 -0
  107. data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
  108. data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
  109. data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
  110. data/vendor/snowball/libstemmer/modules.txt +63 -0
  111. data/vendor/snowball/libstemmer/test.c +34 -0
  112. data/vendor/snowball/pascal/.gitignore +4 -0
  113. data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
  114. data/vendor/snowball/pascal/generate.pl +23 -0
  115. data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
  116. data/vendor/snowball/python/MANIFEST.in +7 -0
  117. data/vendor/snowball/python/create_init.py +54 -0
  118. data/vendor/snowball/python/setup.cfg +6 -0
  119. data/vendor/snowball/python/setup.py +81 -0
  120. data/vendor/snowball/python/snowballstemmer/among.py +13 -0
  121. data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
  122. data/vendor/snowball/python/stemwords.py +101 -0
  123. data/vendor/snowball/python/testapp.py +28 -0
  124. data/vendor/snowball/runtime/api.c +58 -0
  125. data/vendor/snowball/runtime/api.h +32 -0
  126. data/vendor/snowball/runtime/header.h +61 -0
  127. data/vendor/snowball/runtime/utilities.c +513 -0
  128. data/vendor/snowball/rust/Cargo.toml +7 -0
  129. data/vendor/snowball/rust/build.rs +55 -0
  130. data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
  131. data/vendor/snowball/rust/src/main.rs +102 -0
  132. data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
  133. data/vendor/snowball/rust/src/snowball/among.rs +6 -0
  134. data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
  135. data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
  136. data/vendor/snowball/tests/stemtest.c +95 -0
  137. metadata +178 -0
@@ -0,0 +1,323 @@
1
+ // An implementation of "A Lightweight Stemmer for Hindi":
2
+ // http://www.kbcs.in/downloads/papers/StmmerHindi.pdf
3
+
4
+ externals ( stem )
5
+
6
+ stringescapes {}
7
+
8
+ // The transliteration scheme used for our stringdefs matches that used in the
9
+ // paper, as documented in the appendix. It appears to match the WX notation
10
+ // (https://en.wikipedia.org/wiki/WX_notation) except that WX apparently
11
+ // uses 'z' for Anunasika whereas the paper uses Mh.
12
+ //
13
+ // We discriminate dependent vowels by adding a leading "_" to their stringdef
14
+ // names (mnemonic: the _ signifies removing the implicit a from the preceding
15
+ // character).
16
+
17
+ // Vowels and sonorants:
18
+ stringdef a '{U+0905}'
19
+ stringdef A '{U+0906}'
20
+ stringdef i '{U+0907}'
21
+ stringdef I '{U+0908}'
22
+ stringdef u '{U+0909}'
23
+ stringdef U '{U+090A}'
24
+ stringdef q '{U+090B}'
25
+ stringdef e '{U+090F}'
26
+ stringdef E '{U+0910}'
27
+ stringdef o '{U+0913}'
28
+ stringdef O '{U+0914}'
29
+
30
+ // Vowel signs:
31
+ stringdef _A '{U+093E}'
32
+ stringdef _i '{U+093F}'
33
+ stringdef _I '{U+0940}'
34
+ stringdef _u '{U+0941}'
35
+ stringdef _U '{U+0942}'
36
+ stringdef _q '{U+0943}'
37
+ stringdef _e '{U+0947}'
38
+ stringdef _E '{U+0948}'
39
+ stringdef _o '{U+094B}'
40
+ stringdef _O '{U+094C}'
41
+
42
+ // Diacritics:
43
+ stringdef M '{U+0902}'
44
+ stringdef H '{U+0903}'
45
+ stringdef Mh '{U+0901}'
46
+ stringdef Z '{U+093C}' // Nukta
47
+ stringdef virama '{U+094D}'
48
+
49
+ // Velar consonants:
50
+ stringdef k '{U+0915}'
51
+ stringdef K '{U+0916}'
52
+ stringdef g '{U+0917}'
53
+ stringdef G '{U+0918}'
54
+ stringdef f '{U+0919}'
55
+
56
+ // Palatal consonants:
57
+ stringdef c '{U+091A}'
58
+ stringdef C '{U+091B}'
59
+ stringdef j '{U+091C}'
60
+ stringdef J '{U+091D}'
61
+ stringdef F '{U+091E}'
62
+
63
+ // Retroflex consonants:
64
+ stringdef t '{U+091F}'
65
+ stringdef T '{U+0920}'
66
+ stringdef d '{U+0921}'
67
+ stringdef D '{U+0922}'
68
+ stringdef N '{U+0923}'
69
+
70
+ // Dental consonants:
71
+ stringdef w '{U+0924}'
72
+ stringdef W '{U+0925}'
73
+ stringdef x '{U+0926}'
74
+ stringdef X '{U+0927}'
75
+ stringdef n '{U+0928}'
76
+
77
+ // Labial consonants:
78
+ stringdef p '{U+092A}'
79
+ stringdef P '{U+092B}'
80
+ stringdef b '{U+092C}'
81
+ stringdef B '{U+092D}'
82
+ stringdef m '{U+092E}'
83
+
84
+ // Semi-vowels:
85
+ stringdef y '{U+092F}'
86
+ stringdef r '{U+0930}'
87
+ stringdef l '{U+0932}'
88
+ stringdef v '{U+0935}'
89
+
90
+ // Fricatives:
91
+ stringdef S '{U+0936}'
92
+ stringdef R '{U+0937}'
93
+ stringdef s '{U+0938}'
94
+ stringdef h '{U+0939}'
95
+
96
+ stringdef lY '{U+0933}'
97
+
98
+ // Precomposed characters - letters + nukta:
99
+ stringdef nZ '{U+0929}' // ≡ {n}{Z}
100
+ stringdef rZ '{U+0931}' // ≡ {r}{Z}
101
+ stringdef lYZ '{U+0934}' // ≡ {lY}{Z}
102
+ stringdef kZ '{U+0958}' // ≡ {k}{Z}
103
+ stringdef KZ '{U+0959}' // ≡ {K}{Z}
104
+ stringdef gZ '{U+095A}' // ≡ {g}{Z}
105
+ stringdef jZ '{U+095B}' // ≡ {j}{Z}
106
+ stringdef dZ '{U+095C}' // ≡ {d}{Z}
107
+ stringdef DZ '{U+095D}' // ≡ {D}{Z}
108
+ stringdef PZ '{U+095E}' // ≡ {P}{Z}
109
+ stringdef yZ '{U+095F}' // ≡ {y}{Z}
110
+
111
+ groupings ( consonant )
112
+
113
+ routines ( CONSONANT )
114
+
115
+ define consonant '{k}{K}{g}{G}{f}' + // velar consonants
116
+ '{c}{C}{j}{J}{F}' + // palatal consonants
117
+ '{t}{T}{d}{D}{N}' + // retroflex consonants
118
+ '{w}{W}{x}{X}{n}' + // dental consonants
119
+ '{p}{P}{b}{B}{m}' + // labial consonants
120
+ '{y}{r}{l}{v}' + // semi-vowels
121
+ '{S}{R}{s}{h}' + // fricatives
122
+ '{lY}' + // U+0933, declared on its own after the fricatives
123
+ '{Z}' + // Nukta
124
+ // Precomposed characters - letter and nukta:
125
+ '{nZ}{rZ}{lYZ}{kZ}{KZ}{gZ}{jZ}{dZ}{DZ}{PZ}{yZ}'
126
+
127
+ backwardmode ( define CONSONANT as ( consonant ) ) // backward-mode routine wrapping the consonant grouping, so it can be used as a condition on suffixes in stem's among
128
+
129
+ define stem as ( // Entry point: delete the longest listed suffix, always leaving at least one character.
130
+ // We assume in this implementation that the whole word doesn't count
131
+ // as a valid suffix to remove, so we remove the longest suffix from
132
+ // the list which leaves at least one character. This change affects
133
+ // 47 words out of the 65,140 in the sample vocabulary from Hindi
134
+ // Wikipedia.
135
+ //
136
+ // The trick here is we use `next` in forward mode to advance the cursor
137
+ // to the second character, then `backwards` swaps the cursor and limit.
138
+ next
139
+ backwards (
140
+ [substring] among ( // the matched suffix becomes the slice removed by `delete` below
141
+ // The list below is derived from figure 3 in the paper.
142
+ //
143
+ // We perform the stemming on the Devanagari characters rather than
144
+ // transliterating to Latin, so we have adapted the list below to
145
+ // reflect this by converting suffixes back to Devanagari as
146
+ // follows:
147
+ //
148
+ // * within the suffixes, "a" after a consonant is dropped since
149
+ // consonants have an implicit "a".
150
+ //
151
+ // * within the suffixes, a vowel other than "a" after a consonant
152
+ // is a dependent vowel (vowel sign); a vowel (including "a")
153
+ // after a non-consonant is an independent vowel.
154
+ //
155
+ // * to allow the vowel at the start of each suffix being dependent
156
+ // or independent, we include each suffix twice. For the
157
+ // dependent version, a leading "a" is dropped and we check that
158
+ // the suffix is preceded by a consonant (which will have an
159
+ // implicit "a").
160
+ //
161
+ // * we add '{a}', which is needed for the example given right at
162
+ // the end of section 5 to work (conflating BarawIya and
163
+ // BarawIyawA), and which 3.1 a.v strongly suggests should be in
164
+ // the list:
165
+ //
166
+ // Thus, the following suffix deletions (longest possible
167
+ // match) are required to reduce inflected forms of masculine
168
+ // nouns to a common stem:
169
+ // a A i [...]
170
+ //
171
+ // Adding '{a}' only affects 2 words out of the 65,140 in the
172
+ // sample vocabulary.
173
+ //
174
+ // * The transliterations of our stems would end with "a" when our
175
+ // stems end in a consonant, so we also include {virama} in the
176
+ // list of suffixes to remove (this affects 222 words from the
177
+ // sample vocabulary).
178
+ //
179
+ // We've also assumed that Mh in the suffix list always means {Mh}
180
+ // and never {M}{h}{virama}. Only one of the 65,140 words in the
181
+ // sample vocabulary stems differently due to this (and that word
182
+ // seems to be a typo).
183
+
184
+ '{virama}'
185
+
186
+ '{a}' // Suffixes written with a leading independent vowel:
187
+ '{A}'
188
+ '{i}'
189
+ '{I}'
190
+ '{u}'
191
+ '{U}'
192
+ '{e}'
193
+ '{o}'
194
+ '{e}{M}'
195
+ '{o}{M}'
196
+ '{A}{M}'
197
+ '{u}{A}{M}'
198
+ '{u}{e}{M}'
199
+ '{u}{o}{M}'
200
+ '{A}{e}{M}'
201
+ '{A}{o}{M}'
202
+ '{i}{y}{_A}{M}'
203
+ '{i}{y}{_o}{M}'
204
+ '{A}{i}{y}{_A}{M}'
205
+ '{A}{i}{y}{_o}{M}'
206
+ '{A}{Mh}'
207
+ '{i}{y}{_A}{Mh}'
208
+ '{A}{i}{y}{_A}{Mh}'
209
+ '{a}{w}{_A}{e}{M}'
210
+ '{a}{w}{_A}{o}{M}'
211
+ '{a}{n}{_A}{e}{M}'
212
+ '{a}{n}{_A}{o}{M}'
213
+ '{a}{w}{_A}'
214
+ '{a}{w}{_I}'
215
+ '{I}{M}'
216
+ '{a}{w}{_I}{M}'
217
+ '{a}{w}{_e}'
218
+ '{A}{w}{_A}'
219
+ '{A}{w}{_I}'
220
+ '{A}{w}{_I}{M}'
221
+ '{A}{w}{_e}'
222
+ '{a}{n}{_A}'
223
+ '{a}{n}{_I}'
224
+ '{a}{n}{_e}'
225
+ '{A}{n}{_A}'
226
+ '{A}{n}{_e}'
227
+ '{U}{M}{g}{_A}'
228
+ '{U}{M}{g}{_I}'
229
+ '{A}{U}{M}{g}{_A}'
230
+ '{A}{U}{M}{g}{_I}'
231
+ '{e}{M}{g}{_e}'
232
+ '{e}{M}{g}{_I}'
233
+ '{A}{e}{M}{g}{_e}'
234
+ '{A}{e}{M}{g}{_I}'
235
+ '{o}{g}{_e}'
236
+ '{o}{g}{_I}'
237
+ '{A}{o}{g}{_e}'
238
+ '{A}{o}{g}{_I}'
239
+ '{e}{g}{_A}'
240
+ '{e}{g}{_I}'
241
+ '{A}{e}{g}{_A}'
242
+ '{A}{e}{g}{_I}'
243
+ '{A}{y}{_A}'
244
+ '{A}{e}'
245
+ '{A}{I}'
246
+ '{A}{I}{M}'
247
+ '{i}{e}'
248
+ '{A}{o}'
249
+ '{A}{i}{e}'
250
+ '{a}{k}{r}'
251
+ '{A}{k}{r}'
252
+
253
+ '{_A}' // Suffixes written with a leading dependent vowel (vowel sign):
254
+ '{_i}'
255
+ '{_I}'
256
+ '{_u}'
257
+ '{_U}'
258
+ '{_e}'
259
+ '{_o}'
260
+ '{_e}{M}'
261
+ '{_o}{M}'
262
+ '{_A}{M}'
263
+ '{_u}{A}{M}'
264
+ '{_u}{e}{M}'
265
+ '{_u}{o}{M}'
266
+ '{_A}{e}{M}'
267
+ '{_A}{o}{M}'
268
+ '{_i}{y}{_A}{M}'
269
+ '{_i}{y}{_o}{M}'
270
+ '{_A}{i}{y}{_A}{M}'
271
+ '{_A}{i}{y}{_o}{M}'
272
+ '{_A}{Mh}'
273
+ '{_i}{y}{_A}{Mh}'
274
+ '{_A}{i}{y}{_A}{Mh}'
275
+ '{_I}{M}'
276
+ '{_A}{w}{_A}'
277
+ '{_A}{w}{_I}'
278
+ '{_A}{w}{_I}{M}'
279
+ '{_A}{w}{_e}'
280
+ '{_A}{n}{_A}'
281
+ '{_A}{n}{_e}'
282
+ '{_U}{M}{g}{_A}'
283
+ '{_U}{M}{g}{_I}'
284
+ '{_A}{U}{M}{g}{_A}'
285
+ '{_A}{U}{M}{g}{_I}'
286
+ '{_e}{M}{g}{_e}'
287
+ '{_e}{M}{g}{_I}'
288
+ '{_A}{e}{M}{g}{_e}'
289
+ '{_A}{e}{M}{g}{_I}'
290
+ '{_o}{g}{_e}'
291
+ '{_o}{g}{_I}'
292
+ '{_A}{o}{g}{_e}'
293
+ '{_A}{o}{g}{_I}'
294
+ '{_e}{g}{_A}'
295
+ '{_e}{g}{_I}'
296
+ '{_A}{e}{g}{_A}'
297
+ '{_A}{e}{g}{_I}'
298
+ '{_A}{y}{_A}'
299
+ '{_A}{e}'
300
+ '{_A}{I}'
301
+ '{_A}{I}{M}'
302
+ '{_i}{e}'
303
+ '{_A}{o}'
304
+ '{_A}{i}{e}'
305
+ '{_A}{k}{r}'
306
+
307
+ /* Suffixes with a leading implicit a: */
308
+ '{w}{_A}{e}{M}' CONSONANT
309
+ '{w}{_A}{o}{M}' CONSONANT
310
+ '{n}{_A}{e}{M}' CONSONANT
311
+ '{n}{_A}{o}{M}' CONSONANT
312
+ '{w}{_A}' CONSONANT
313
+ '{w}{_I}' CONSONANT
314
+ '{w}{_I}{M}' CONSONANT
315
+ '{w}{_e}' CONSONANT
316
+ '{n}{_A}' CONSONANT
317
+ '{n}{_I}' CONSONANT
318
+ '{n}{_e}' CONSONANT
319
+ '{k}{r}' CONSONANT
320
+ )
321
+ delete // remove the suffix selected by the among above
322
+ )
323
+ )
@@ -0,0 +1,241 @@
1
+ /*
2
+ Hungarian Stemmer
3
+ Removes noun inflections
4
+ */
5
+
6
+ routines (
7
+ mark_regions
8
+ R1
9
+ v_ending
10
+ case
11
+ case_special
12
+ case_other
13
+ plural
14
+ owned
15
+ sing_owner
16
+ plur_owner
17
+ instrum
18
+ factive
19
+ undouble
20
+ double
21
+ )
22
+
23
+ externals ( stem )
24
+
25
+ integers ( p1 )
26
+ groupings ( v )
27
+
28
+ stringescapes {}
29
+
30
+ /* special characters */
31
+
32
+ stringdef a' '{U+00E1}' //a-acute
33
+ stringdef e' '{U+00E9}' //e-acute
34
+ stringdef i' '{U+00ED}' //i-acute
35
+ stringdef o' '{U+00F3}' //o-acute
36
+ stringdef o" '{U+00F6}' //o-umlaut
37
+ stringdef oq '{U+0151}' //o-double acute
38
+ stringdef u' '{U+00FA}' //u-acute
39
+ stringdef u" '{U+00FC}' //u-umlaut
40
+ stringdef uq '{U+0171}' //u-double acute
41
+
42
+ define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}' // vowel grouping: the five plain vowels plus the nine accented vowels declared above
43
+
44
+ define mark_regions as ( // Sets p1, the boundary that the R1 test compares the cursor against.
45
+
46
+ $p1 = limit // default: p1 stays at the limit unless a branch below sets the mark
47
+
48
+ (v goto non-v // branch 1: word starts with a vowel; move to the first non-vowel
49
+ among('cs' 'gy' 'ly' 'ny' 'sz' 'ty' 'zs' 'dzs') or next // step over one of the listed letter groups as a unit, else over one character
50
+ setmark p1)
51
+ or
52
+
53
+ (non-v gopast v setmark p1) // branch 2: word starts with a non-vowel; p1 is set just past the first vowel
54
+ )
55
+
56
+ backwardmode (
57
+
58
+ define R1 as $p1 <= cursor // succeeds only while the cursor is at or beyond p1 (the mark set by mark_regions)
59
+
60
+ define v_ending as ( // Replaces a trailing a-acute / e-acute that lies in R1 with plain 'a' / 'e'.
61
+ [substring] R1 among(
62
+ '{a'}' (<- 'a')
63
+ '{e'}' (<- 'e')
64
+ )
65
+ )
66
+
67
+ define double as ( // Succeeds if one of the listed doubled consonants ends at the cursor; wrapped in `test`, so the cursor is left where it was.
68
+ test among('bb' 'cc' 'ccs' 'dd' 'ff' 'gg' 'ggy' 'jj' 'kk' 'll' 'lly' 'mm'
69
+ 'nn' 'nny' 'pp' 'rr' 'ss' 'ssz' 'tt' 'tty' 'vv' 'zz' 'zzs')
70
+ )
71
+
72
+ define undouble as ( // Drops one letter of a doubled consonant, e.g. 'bb' -> 'b', 'ccs' -> 'cs'.
73
+ next [hop 1] delete // step back over one character, then mark the next character back as the slice and delete it
74
+ )
75
+
76
+ define instrum as( // Removes a final 'al' / 'el' in R1, but only when `double` succeeds on what precedes it; the doubled consonant is then reduced.
77
+ [substring] R1 among(
78
+ 'al' (double)
79
+ 'el' (double)
80
+ )
81
+ delete // remove the matched 'al' / 'el'
82
+ undouble // then collapse the doubled consonant now at the end
83
+ )
84
+
85
+
86
+ define case as ( // Deletes the longest listed case suffix found in R1, then runs v_ending on what remains.
87
+ [substring] R1 among( // no entry carries an action: the among only selects the longest matching suffix
88
+ 'ban' 'ben'
89
+ 'ba' 'be'
90
+ 'ra' 're'
91
+ 'nak' 'nek'
92
+ 'val' 'vel'
93
+ 't{o'}l' 't{oq}l'
94
+ 'r{o'}l' 'r{oq}l'
95
+ 'b{o'}l' 'b{oq}l'
96
+ 'hoz' 'hez' 'h{o"}z'
97
+ 'n{a'}l' 'n{e'}l'
98
+ 'ig'
99
+ 'at' 'et' 'ot' '{o"}t'
100
+ '{e'}rt'
101
+ 'k{e'}pp' 'k{e'}ppen'
102
+ 'kor'
103
+ 'ul' '{u"}l'
104
+ 'v{a'}' 'v{e'}'
105
+ 'onk{e'}nt' 'enk{e'}nt' 'ank{e'}nt'
106
+ 'k{e'}nt'
107
+ 'en' 'on' 'an' '{o"}n'
108
+ 'n'
109
+ 't'
110
+ )
111
+ delete // remove the matched suffix
112
+ v_ending // shorten a now-final accented a / e
113
+ )
114
+
115
+ define case_special as( // Suffixes in R1 that begin with an accented vowel: the whole suffix is replaced by the plain vowel.
116
+ [substring] R1 among(
117
+ '{e'}n' (<- 'e')
118
+ '{a'}n' (<- 'a')
119
+ '{a'}nk{e'}nt' (<- 'a')
120
+ )
121
+ )
122
+
123
+ define case_other as( // Handles the '-stul' / '-stül' family in R1: plain forms are deleted, accented-vowel forms are replaced by the plain vowel.
124
+ [substring] R1 among(
125
+ 'astul' 'est{u"}l' (delete)
126
+ 'stul' 'st{u"}l' (delete)
127
+ '{a'}stul' (<- 'a')
128
+ '{e'}st{u"}l' (<- 'e')
129
+ )
130
+ )
131
+
132
+ define factive as( // Removes a final a-acute / e-acute in R1 when `double` succeeds on what precedes it, then reduces the doubled consonant.
133
+ [substring] R1 among(
134
+ '{a'}' (double)
135
+ '{e'}' (double)
136
+ )
137
+ delete // remove the matched vowel
138
+ undouble // then collapse the doubled consonant now at the end
139
+ )
140
+
141
+ define plural as ( // Strips a '-k' plural ending in R1; endings with an accented a / e keep the plain vowel, the others are deleted outright.
142
+ [substring] R1 among(
143
+ '{a'}k' (<- 'a')
144
+ '{e'}k' (<- 'e')
145
+ '{o"}k' (delete)
146
+ 'ak' (delete)
147
+ 'ok' (delete)
148
+ 'ek' (delete)
149
+ 'k' (delete)
150
+ )
151
+ )
152
+
153
+ define owned as ( // Strips the listed endings built on e-acute in R1; entries starting with an accented a / e are replaced by the plain vowel, the rest are deleted.
154
+ [substring] R1 among (
155
+ 'ok{e'}' '{o"}k{e'}' 'ak{e'}' 'ek{e'}' (delete)
156
+ '{e'}k{e'}' (<- 'e')
157
+ '{a'}k{e'}' (<- 'a')
158
+ 'k{e'}' (delete)
159
+ '{e'}{e'}i' (<- 'e')
160
+ '{a'}{e'}i' (<- 'a')
161
+ '{e'}i' (delete)
162
+ '{e'}{e'}' (<- 'e')
163
+ '{e'}' (delete)
164
+ )
165
+ )
166
+
167
+ define sing_owner as ( // Strips the listed endings in R1; entries starting with an accented a / e are replaced by the plain vowel, the rest are deleted.
168
+ [substring] R1 among(
169
+ '{u"}nk' 'unk' (delete)
170
+ '{a'}nk' (<- 'a')
171
+ '{e'}nk' (<- 'e')
172
+ 'nk' (delete)
173
+ '{a'}juk' (<- 'a')
174
+ '{e'}j{u"}k' (<- 'e')
175
+ 'juk' 'j{u"}k' (delete)
176
+ 'uk' '{u"}k' (delete)
177
+ 'em' 'om' 'am' (delete)
178
+ '{a'}m' (<- 'a')
179
+ '{e'}m' (<- 'e')
180
+ 'm' (delete)
181
+ 'od' 'ed' 'ad' '{o"}d' (delete)
182
+ '{a'}d' (<- 'a')
183
+ '{e'}d' (<- 'e')
184
+ 'd' (delete)
185
+ 'ja' 'je' (delete)
186
+ 'a' 'e' 'o' (delete)
187
+ '{a'}' (<- 'a')
188
+ '{e'}' (<- 'e')
189
+ )
190
+ )
191
+
192
+ define plur_owner as ( // Strips the listed endings containing 'i' in R1; entries starting with an accented a / e are replaced by the plain vowel, the rest are deleted.
193
+ [substring] R1 among(
194
+ 'jaim' 'jeim' (delete)
195
+ '{a'}im' (<- 'a')
196
+ '{e'}im' (<- 'e')
197
+ 'aim' 'eim' (delete)
198
+ 'im' (delete)
199
+ 'jaid' 'jeid' (delete)
200
+ '{a'}id' (<- 'a')
201
+ '{e'}id' (<- 'e')
202
+ 'aid' 'eid' (delete)
203
+ 'id' (delete)
204
+ 'jai' 'jei' (delete)
205
+ '{a'}i' (<- 'a')
206
+ '{e'}i' (<- 'e')
207
+ 'ai' 'ei' (delete)
208
+ 'i' (delete)
209
+ 'jaink' 'jeink' (delete)
210
+ 'eink' 'aink' (delete)
211
+ '{a'}ink' (<- 'a')
212
+ '{e'}ink' (<- 'e')
213
+ 'ink' // no action of its own: grouped with the (delete) that follows the strings on the next line
214
+ 'jaitok' 'jeitek' (delete)
215
+ 'aitok' 'eitek' (delete)
216
+ '{a'}itok' (<- 'a')
217
+ '{e'}itek' (<- 'e')
218
+ 'itek' (delete)
219
+ 'jeik' 'jaik' (delete)
220
+ 'aik' 'eik' (delete)
221
+ '{a'}ik' (<- 'a')
222
+ '{e'}ik' (<- 'e')
223
+ 'ik' (delete)
224
+ )
225
+ )
226
+ )
227
+
228
+ define stem as ( // Entry point: set p1 scanning forwards, then apply each suffix routine in turn in backward mode.
229
+ do mark_regions
230
+ backwards (
231
+ do instrum // every step is wrapped in `do`, so a step that fails does not stop the ones after it
232
+ do case
233
+ do case_special
234
+ do case_other
235
+ do factive
236
+ do owned
237
+ do sing_owner
238
+ do plur_owner
239
+ do plural // plural is applied last, after the owner endings
240
+ )
241
+ )