mittens 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Gemfile +7 -0
- data/LICENSE.txt +30 -0
- data/README.md +62 -0
- data/Rakefile +21 -0
- data/ext/mittens/ext.c +96 -0
- data/ext/mittens/extconf.rb +12 -0
- data/lib/mittens/version.rb +3 -0
- data/lib/mittens.rb +7 -0
- data/mittens.gemspec +22 -0
- data/vendor/snowball/.gitignore +26 -0
- data/vendor/snowball/.travis.yml +112 -0
- data/vendor/snowball/AUTHORS +27 -0
- data/vendor/snowball/CONTRIBUTING.rst +216 -0
- data/vendor/snowball/COPYING +29 -0
- data/vendor/snowball/GNUmakefile +742 -0
- data/vendor/snowball/NEWS +754 -0
- data/vendor/snowball/README.rst +37 -0
- data/vendor/snowball/ada/README.md +74 -0
- data/vendor/snowball/ada/generate/generate.adb +83 -0
- data/vendor/snowball/ada/generate.gpr +21 -0
- data/vendor/snowball/ada/src/stemmer.adb +620 -0
- data/vendor/snowball/ada/src/stemmer.ads +219 -0
- data/vendor/snowball/ada/src/stemwords.adb +70 -0
- data/vendor/snowball/ada/stemmer_config.gpr +83 -0
- data/vendor/snowball/ada/stemwords.gpr +21 -0
- data/vendor/snowball/algorithms/arabic.sbl +558 -0
- data/vendor/snowball/algorithms/armenian.sbl +301 -0
- data/vendor/snowball/algorithms/basque.sbl +149 -0
- data/vendor/snowball/algorithms/catalan.sbl +202 -0
- data/vendor/snowball/algorithms/danish.sbl +93 -0
- data/vendor/snowball/algorithms/dutch.sbl +164 -0
- data/vendor/snowball/algorithms/english.sbl +229 -0
- data/vendor/snowball/algorithms/finnish.sbl +197 -0
- data/vendor/snowball/algorithms/french.sbl +254 -0
- data/vendor/snowball/algorithms/german.sbl +139 -0
- data/vendor/snowball/algorithms/german2.sbl +145 -0
- data/vendor/snowball/algorithms/greek.sbl +701 -0
- data/vendor/snowball/algorithms/hindi.sbl +323 -0
- data/vendor/snowball/algorithms/hungarian.sbl +241 -0
- data/vendor/snowball/algorithms/indonesian.sbl +192 -0
- data/vendor/snowball/algorithms/irish.sbl +149 -0
- data/vendor/snowball/algorithms/italian.sbl +202 -0
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
- data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
- data/vendor/snowball/algorithms/lovins.sbl +208 -0
- data/vendor/snowball/algorithms/nepali.sbl +92 -0
- data/vendor/snowball/algorithms/norwegian.sbl +80 -0
- data/vendor/snowball/algorithms/porter.sbl +139 -0
- data/vendor/snowball/algorithms/portuguese.sbl +218 -0
- data/vendor/snowball/algorithms/romanian.sbl +236 -0
- data/vendor/snowball/algorithms/russian.sbl +221 -0
- data/vendor/snowball/algorithms/serbian.sbl +2379 -0
- data/vendor/snowball/algorithms/spanish.sbl +230 -0
- data/vendor/snowball/algorithms/swedish.sbl +72 -0
- data/vendor/snowball/algorithms/tamil.sbl +405 -0
- data/vendor/snowball/algorithms/turkish.sbl +470 -0
- data/vendor/snowball/algorithms/yiddish.sbl +460 -0
- data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
- data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
- data/vendor/snowball/charsets/cp850.sbl +130 -0
- data/vendor/snowball/compiler/analyser.c +1547 -0
- data/vendor/snowball/compiler/driver.c +615 -0
- data/vendor/snowball/compiler/generator.c +1748 -0
- data/vendor/snowball/compiler/generator_ada.c +1702 -0
- data/vendor/snowball/compiler/generator_csharp.c +1322 -0
- data/vendor/snowball/compiler/generator_go.c +1278 -0
- data/vendor/snowball/compiler/generator_java.c +1313 -0
- data/vendor/snowball/compiler/generator_js.c +1316 -0
- data/vendor/snowball/compiler/generator_pascal.c +1387 -0
- data/vendor/snowball/compiler/generator_python.c +1337 -0
- data/vendor/snowball/compiler/generator_rust.c +1295 -0
- data/vendor/snowball/compiler/header.h +418 -0
- data/vendor/snowball/compiler/space.c +286 -0
- data/vendor/snowball/compiler/syswords.h +86 -0
- data/vendor/snowball/compiler/syswords2.h +13 -0
- data/vendor/snowball/compiler/tokeniser.c +567 -0
- data/vendor/snowball/csharp/.gitignore +8 -0
- data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
- data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
- data/vendor/snowball/csharp/Stemwords/App.config +6 -0
- data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
- data/vendor/snowball/doc/TODO +12 -0
- data/vendor/snowball/doc/libstemmer_c_README +148 -0
- data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
- data/vendor/snowball/doc/libstemmer_java_README +67 -0
- data/vendor/snowball/doc/libstemmer_js_README +48 -0
- data/vendor/snowball/doc/libstemmer_python_README +113 -0
- data/vendor/snowball/examples/stemwords.c +204 -0
- data/vendor/snowball/go/README.md +55 -0
- data/vendor/snowball/go/among.go +16 -0
- data/vendor/snowball/go/env.go +403 -0
- data/vendor/snowball/go/stemwords/generate.go +68 -0
- data/vendor/snowball/go/stemwords/main.go +68 -0
- data/vendor/snowball/go/util.go +34 -0
- data/vendor/snowball/iconv.py +50 -0
- data/vendor/snowball/include/libstemmer.h +78 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
- data/vendor/snowball/javascript/base-stemmer.js +294 -0
- data/vendor/snowball/javascript/stemwords.js +106 -0
- data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
- data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
- data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
- data/vendor/snowball/libstemmer/modules.txt +63 -0
- data/vendor/snowball/libstemmer/test.c +34 -0
- data/vendor/snowball/pascal/.gitignore +4 -0
- data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
- data/vendor/snowball/pascal/generate.pl +23 -0
- data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
- data/vendor/snowball/python/MANIFEST.in +7 -0
- data/vendor/snowball/python/create_init.py +54 -0
- data/vendor/snowball/python/setup.cfg +6 -0
- data/vendor/snowball/python/setup.py +81 -0
- data/vendor/snowball/python/snowballstemmer/among.py +13 -0
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
- data/vendor/snowball/python/stemwords.py +101 -0
- data/vendor/snowball/python/testapp.py +28 -0
- data/vendor/snowball/runtime/api.c +58 -0
- data/vendor/snowball/runtime/api.h +32 -0
- data/vendor/snowball/runtime/header.h +61 -0
- data/vendor/snowball/runtime/utilities.c +513 -0
- data/vendor/snowball/rust/Cargo.toml +7 -0
- data/vendor/snowball/rust/build.rs +55 -0
- data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
- data/vendor/snowball/rust/src/main.rs +102 -0
- data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
- data/vendor/snowball/rust/src/snowball/among.rs +6 -0
- data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
- data/vendor/snowball/tests/stemtest.c +95 -0
- metadata +178 -0
@@ -0,0 +1,323 @@
|
|
1
|
+
// An implementation of "A Lightweight Stemmer for Hindi":
|
2
|
+
// http://www.kbcs.in/downloads/papers/StmmerHindi.pdf
|
3
|
+
|
4
|
+
externals ( stem )
|
5
|
+
|
6
|
+
stringescapes {}
|
7
|
+
|
8
|
+
// The transliteration scheme used for our stringdefs matches that used in the
|
9
|
+
// paper, as documented in the appendix. It appears to match the WX notation
|
10
|
+
// (https://en.wikipedia.org/wiki/WX_notation) except that WX apparently
|
11
|
+
// uses 'z' for Anunasika whereas the paper uses Mh.
|
12
|
+
//
|
13
|
+
// We discriminate dependent vowels by adding a leading "_" to their stringdef
|
14
|
+
// names (mnemonic: the _ signifies removing the implicit a from the preceding
|
15
|
+
// character).
|
16
|
+
|
17
|
+
// Vowels and sonorants:
|
18
|
+
stringdef a '{U+0905}'
|
19
|
+
stringdef A '{U+0906}'
|
20
|
+
stringdef i '{U+0907}'
|
21
|
+
stringdef I '{U+0908}'
|
22
|
+
stringdef u '{U+0909}'
|
23
|
+
stringdef U '{U+090A}'
|
24
|
+
stringdef q '{U+090B}'
|
25
|
+
stringdef e '{U+090F}'
|
26
|
+
stringdef E '{U+0910}'
|
27
|
+
stringdef o '{U+0913}'
|
28
|
+
stringdef O '{U+0914}'
|
29
|
+
|
30
|
+
// Vowel signs:
|
31
|
+
stringdef _A '{U+093E}'
|
32
|
+
stringdef _i '{U+093F}'
|
33
|
+
stringdef _I '{U+0940}'
|
34
|
+
stringdef _u '{U+0941}'
|
35
|
+
stringdef _U '{U+0942}'
|
36
|
+
stringdef _q '{U+0943}'
|
37
|
+
stringdef _e '{U+0947}'
|
38
|
+
stringdef _E '{U+0948}'
|
39
|
+
stringdef _o '{U+094B}'
|
40
|
+
stringdef _O '{U+094C}'
|
41
|
+
|
42
|
+
// Diacritics:
|
43
|
+
stringdef M '{U+0902}'
|
44
|
+
stringdef H '{U+0903}'
|
45
|
+
stringdef Mh '{U+0901}'
|
46
|
+
stringdef Z '{U+093C}' // Nukta
|
47
|
+
stringdef virama '{U+094D}'
|
48
|
+
|
49
|
+
// Velar consonants:
|
50
|
+
stringdef k '{U+0915}'
|
51
|
+
stringdef K '{U+0916}'
|
52
|
+
stringdef g '{U+0917}'
|
53
|
+
stringdef G '{U+0918}'
|
54
|
+
stringdef f '{U+0919}'
|
55
|
+
|
56
|
+
// Palatal consonants:
|
57
|
+
stringdef c '{U+091A}'
|
58
|
+
stringdef C '{U+091B}'
|
59
|
+
stringdef j '{U+091C}'
|
60
|
+
stringdef J '{U+091D}'
|
61
|
+
stringdef F '{U+091E}'
|
62
|
+
|
63
|
+
// Retroflex consonants:
|
64
|
+
stringdef t '{U+091F}'
|
65
|
+
stringdef T '{U+0920}'
|
66
|
+
stringdef d '{U+0921}'
|
67
|
+
stringdef D '{U+0922}'
|
68
|
+
stringdef N '{U+0923}'
|
69
|
+
|
70
|
+
// Dental consonants:
|
71
|
+
stringdef w '{U+0924}'
|
72
|
+
stringdef W '{U+0925}'
|
73
|
+
stringdef x '{U+0926}'
|
74
|
+
stringdef X '{U+0927}'
|
75
|
+
stringdef n '{U+0928}'
|
76
|
+
|
77
|
+
// Labial consonants:
|
78
|
+
stringdef p '{U+092A}'
|
79
|
+
stringdef P '{U+092B}'
|
80
|
+
stringdef b '{U+092C}'
|
81
|
+
stringdef B '{U+092D}'
|
82
|
+
stringdef m '{U+092E}'
|
83
|
+
|
84
|
+
// Semi-vowels:
|
85
|
+
stringdef y '{U+092F}'
|
86
|
+
stringdef r '{U+0930}'
|
87
|
+
stringdef l '{U+0932}'
|
88
|
+
stringdef v '{U+0935}'
|
89
|
+
|
90
|
+
// Fricatives:
|
91
|
+
stringdef S '{U+0936}'
|
92
|
+
stringdef R '{U+0937}'
|
93
|
+
stringdef s '{U+0938}'
|
94
|
+
stringdef h '{U+0939}'
|
95
|
+
|
96
|
+
stringdef lY '{U+0933}'
|
97
|
+
|
98
|
+
// Precomposed characters - letters + nukta:
|
99
|
+
stringdef nZ '{U+0929}' // ≡ {n}{Z}
|
100
|
+
stringdef rZ '{U+0931}' // ≡ {r}{Z}
|
101
|
+
stringdef lYZ '{U+0934}' // ≡ {lY}{Z}
|
102
|
+
stringdef kZ '{U+0958}' // ≡ {k}{Z}
|
103
|
+
stringdef KZ '{U+0959}' // ≡ {K}{Z}
|
104
|
+
stringdef gZ '{U+095A}' // ≡ {g}{Z}
|
105
|
+
stringdef jZ '{U+095B}' // ≡ {j}{Z}
|
106
|
+
stringdef dZ '{U+095C}' // ≡ {d}{Z}
|
107
|
+
stringdef DZ '{U+095D}' // ≡ {D}{Z}
|
108
|
+
stringdef PZ '{U+095E}' // ≡ {P}{Z}
|
109
|
+
stringdef yZ '{U+095F}' // ≡ {y}{Z}
|
110
|
+
|
111
|
+
groupings ( consonant )
|
112
|
+
|
113
|
+
routines ( CONSONANT )
|
114
|
+
|
115
|
+
define consonant '{k}{K}{g}{G}{f}' +
|
116
|
+
'{c}{C}{j}{J}{F}' +
|
117
|
+
'{t}{T}{d}{D}{N}' +
|
118
|
+
'{w}{W}{x}{X}{n}' +
|
119
|
+
'{p}{P}{b}{B}{m}' +
|
120
|
+
'{y}{r}{l}{v}' +
|
121
|
+
'{S}{R}{s}{h}' +
|
122
|
+
'{lY}' +
|
123
|
+
'{Z}' + // Nukta
|
124
|
+
// Precomposed characters - letter and nukta:
|
125
|
+
'{nZ}{rZ}{lYZ}{kZ}{KZ}{gZ}{jZ}{dZ}{DZ}{PZ}{yZ}'
|
126
|
+
|
127
|
+
backwardmode ( define CONSONANT as ( consonant ) )
|
128
|
+
|
129
|
+
define stem as (
|
130
|
+
// We assume in this implementation that the whole word doesn't count
|
131
|
+
// as a valid suffix to remove, so we remove the longest suffix from
|
132
|
+
// the list which leaves at least one character. This change affects
|
133
|
+
// 47 words out of the 65,140 in the sample vocabulary from Hindi
|
134
|
+
// Wikipedia.
|
135
|
+
//
|
136
|
+
// The trick here is we use `next` in forward mode to advance the cursor
|
137
|
+
// to the second character, then `backwards` swaps the cursor and limit.
|
138
|
+
next
|
139
|
+
backwards (
|
140
|
+
[substring] among (
|
141
|
+
// The list below is derived from figure 3 in the paper.
|
142
|
+
//
|
143
|
+
// We perform the stemming on the Devanagari characters rather than
|
144
|
+
// transliterating to Latin, so we have adapted the list below to
|
145
|
+
// reflect this by converting suffixes back to Devanagari as
|
146
|
+
// follows:
|
147
|
+
//
|
148
|
+
// * within the suffixes, "a" after a consonant is dropped since
|
149
|
+
// consonants have an implicit "a".
|
150
|
+
//
|
151
|
+
// * within the suffixes, a vowel other than "a" after a consonant
|
152
|
+
// is a dependent vowel (vowel sign); a vowel (including "a")
|
153
|
+
// after a non-consonant is an independent vowel.
|
154
|
+
//
|
155
|
+
// * to allow the vowel at the start of each suffix being dependent
|
156
|
+
// or independent, we include each suffix twice. For the
|
157
|
+
// dependent version, a leading "a" is dropped and we check that
|
158
|
+
// the suffix is preceded by a consonant (which will have an
|
159
|
+
// implicit "a").
|
160
|
+
//
|
161
|
+
// * we add '{a}', which is needed for the example given right at
|
162
|
+
// the end of section 5 to work (conflating BarawIya and
|
163
|
+
// BarawIyawA), and which 3.1 a.v strongly suggests should be in
|
164
|
+
// the list:
|
165
|
+
//
|
166
|
+
// Thus, the following suffix deletions (longest possible
|
167
|
+
// match) are required to reduce inflected forms of masculine
|
168
|
+
// nouns to a common stem:
|
169
|
+
// a A i [...]
|
170
|
+
//
|
171
|
+
// Adding '{a}' only affects 2 words out of the 65,140 in the
|
172
|
+
// sample vocabulary.
|
173
|
+
//
|
174
|
+
// * The transliterations of our stems would end with "a" when our
|
175
|
+
// stems end in a consonant, so we also include {virama} in the
|
176
|
+
// list of suffixes to remove (this affects 222 words from the
|
177
|
+
// sample vocabulary).
|
178
|
+
//
|
179
|
+
// We've also assumed that Mh in the suffix list always means {Mh}
|
180
|
+
// and never {M}{h}{virama}. Only one of the 65,140 words in the
|
181
|
+
// sample vocabulary stems differently due to this (and that word
|
182
|
+
// seems to be a typo).
|
183
|
+
|
184
|
+
'{virama}'
|
185
|
+
|
186
|
+
'{a}'
|
187
|
+
'{A}'
|
188
|
+
'{i}'
|
189
|
+
'{I}'
|
190
|
+
'{u}'
|
191
|
+
'{U}'
|
192
|
+
'{e}'
|
193
|
+
'{o}'
|
194
|
+
'{e}{M}'
|
195
|
+
'{o}{M}'
|
196
|
+
'{A}{M}'
|
197
|
+
'{u}{A}{M}'
|
198
|
+
'{u}{e}{M}'
|
199
|
+
'{u}{o}{M}'
|
200
|
+
'{A}{e}{M}'
|
201
|
+
'{A}{o}{M}'
|
202
|
+
'{i}{y}{_A}{M}'
|
203
|
+
'{i}{y}{_o}{M}'
|
204
|
+
'{A}{i}{y}{_A}{M}'
|
205
|
+
'{A}{i}{y}{_o}{M}'
|
206
|
+
'{A}{Mh}'
|
207
|
+
'{i}{y}{_A}{Mh}'
|
208
|
+
'{A}{i}{y}{_A}{Mh}'
|
209
|
+
'{a}{w}{_A}{e}{M}'
|
210
|
+
'{a}{w}{_A}{o}{M}'
|
211
|
+
'{a}{n}{_A}{e}{M}'
|
212
|
+
'{a}{n}{_A}{o}{M}'
|
213
|
+
'{a}{w}{_A}'
|
214
|
+
'{a}{w}{_I}'
|
215
|
+
'{I}{M}'
|
216
|
+
'{a}{w}{_I}{M}'
|
217
|
+
'{a}{w}{_e}'
|
218
|
+
'{A}{w}{_A}'
|
219
|
+
'{A}{w}{_I}'
|
220
|
+
'{A}{w}{_I}{M}'
|
221
|
+
'{A}{w}{_e}'
|
222
|
+
'{a}{n}{_A}'
|
223
|
+
'{a}{n}{_I}'
|
224
|
+
'{a}{n}{_e}'
|
225
|
+
'{A}{n}{_A}'
|
226
|
+
'{A}{n}{_e}'
|
227
|
+
'{U}{M}{g}{_A}'
|
228
|
+
'{U}{M}{g}{_I}'
|
229
|
+
'{A}{U}{M}{g}{_A}'
|
230
|
+
'{A}{U}{M}{g}{_I}'
|
231
|
+
'{e}{M}{g}{_e}'
|
232
|
+
'{e}{M}{g}{_I}'
|
233
|
+
'{A}{e}{M}{g}{_e}'
|
234
|
+
'{A}{e}{M}{g}{_I}'
|
235
|
+
'{o}{g}{_e}'
|
236
|
+
'{o}{g}{_I}'
|
237
|
+
'{A}{o}{g}{_e}'
|
238
|
+
'{A}{o}{g}{_I}'
|
239
|
+
'{e}{g}{_A}'
|
240
|
+
'{e}{g}{_I}'
|
241
|
+
'{A}{e}{g}{_A}'
|
242
|
+
'{A}{e}{g}{_I}'
|
243
|
+
'{A}{y}{_A}'
|
244
|
+
'{A}{e}'
|
245
|
+
'{A}{I}'
|
246
|
+
'{A}{I}{M}'
|
247
|
+
'{i}{e}'
|
248
|
+
'{A}{o}'
|
249
|
+
'{A}{i}{e}'
|
250
|
+
'{a}{k}{r}'
|
251
|
+
'{A}{k}{r}'
|
252
|
+
|
253
|
+
'{_A}'
|
254
|
+
'{_i}'
|
255
|
+
'{_I}'
|
256
|
+
'{_u}'
|
257
|
+
'{_U}'
|
258
|
+
'{_e}'
|
259
|
+
'{_o}'
|
260
|
+
'{_e}{M}'
|
261
|
+
'{_o}{M}'
|
262
|
+
'{_A}{M}'
|
263
|
+
'{_u}{A}{M}'
|
264
|
+
'{_u}{e}{M}'
|
265
|
+
'{_u}{o}{M}'
|
266
|
+
'{_A}{e}{M}'
|
267
|
+
'{_A}{o}{M}'
|
268
|
+
'{_i}{y}{_A}{M}'
|
269
|
+
'{_i}{y}{_o}{M}'
|
270
|
+
'{_A}{i}{y}{_A}{M}'
|
271
|
+
'{_A}{i}{y}{_o}{M}'
|
272
|
+
'{_A}{Mh}'
|
273
|
+
'{_i}{y}{_A}{Mh}'
|
274
|
+
'{_A}{i}{y}{_A}{Mh}'
|
275
|
+
'{_I}{M}'
|
276
|
+
'{_A}{w}{_A}'
|
277
|
+
'{_A}{w}{_I}'
|
278
|
+
'{_A}{w}{_I}{M}'
|
279
|
+
'{_A}{w}{_e}'
|
280
|
+
'{_A}{n}{_A}'
|
281
|
+
'{_A}{n}{_e}'
|
282
|
+
'{_U}{M}{g}{_A}'
|
283
|
+
'{_U}{M}{g}{_I}'
|
284
|
+
'{_A}{U}{M}{g}{_A}'
|
285
|
+
'{_A}{U}{M}{g}{_I}'
|
286
|
+
'{_e}{M}{g}{_e}'
|
287
|
+
'{_e}{M}{g}{_I}'
|
288
|
+
'{_A}{e}{M}{g}{_e}'
|
289
|
+
'{_A}{e}{M}{g}{_I}'
|
290
|
+
'{_o}{g}{_e}'
|
291
|
+
'{_o}{g}{_I}'
|
292
|
+
'{_A}{o}{g}{_e}'
|
293
|
+
'{_A}{o}{g}{_I}'
|
294
|
+
'{_e}{g}{_A}'
|
295
|
+
'{_e}{g}{_I}'
|
296
|
+
'{_A}{e}{g}{_A}'
|
297
|
+
'{_A}{e}{g}{_I}'
|
298
|
+
'{_A}{y}{_A}'
|
299
|
+
'{_A}{e}'
|
300
|
+
'{_A}{I}'
|
301
|
+
'{_A}{I}{M}'
|
302
|
+
'{_i}{e}'
|
303
|
+
'{_A}{o}'
|
304
|
+
'{_A}{i}{e}'
|
305
|
+
'{_A}{k}{r}'
|
306
|
+
|
307
|
+
/* Suffixes with a leading implicit a: */
|
308
|
+
'{w}{_A}{e}{M}' CONSONANT
|
309
|
+
'{w}{_A}{o}{M}' CONSONANT
|
310
|
+
'{n}{_A}{e}{M}' CONSONANT
|
311
|
+
'{n}{_A}{o}{M}' CONSONANT
|
312
|
+
'{w}{_A}' CONSONANT
|
313
|
+
'{w}{_I}' CONSONANT
|
314
|
+
'{w}{_I}{M}' CONSONANT
|
315
|
+
'{w}{_e}' CONSONANT
|
316
|
+
'{n}{_A}' CONSONANT
|
317
|
+
'{n}{_I}' CONSONANT
|
318
|
+
'{n}{_e}' CONSONANT
|
319
|
+
'{k}{r}' CONSONANT
|
320
|
+
)
|
321
|
+
delete
|
322
|
+
)
|
323
|
+
)
|
@@ -0,0 +1,241 @@
|
|
1
|
+
/*
|
2
|
+
Hungarian Stemmer
|
3
|
+
Removes noun inflections
|
4
|
+
*/
|
5
|
+
|
6
|
+
routines (
|
7
|
+
mark_regions
|
8
|
+
R1
|
9
|
+
v_ending
|
10
|
+
case
|
11
|
+
case_special
|
12
|
+
case_other
|
13
|
+
plural
|
14
|
+
owned
|
15
|
+
sing_owner
|
16
|
+
plur_owner
|
17
|
+
instrum
|
18
|
+
factive
|
19
|
+
undouble
|
20
|
+
double
|
21
|
+
)
|
22
|
+
|
23
|
+
externals ( stem )
|
24
|
+
|
25
|
+
integers ( p1 )
|
26
|
+
groupings ( v )
|
27
|
+
|
28
|
+
stringescapes {}
|
29
|
+
|
30
|
+
/* special characters */
|
31
|
+
|
32
|
+
stringdef a' '{U+00E1}' //a-acute
|
33
|
+
stringdef e' '{U+00E9}' //e-acute
|
34
|
+
stringdef i' '{U+00ED}' //i-acute
|
35
|
+
stringdef o' '{U+00F3}' //o-acute
|
36
|
+
stringdef o" '{U+00F6}' //o-umlaut
|
37
|
+
stringdef oq '{U+0151}' //o-double acute
|
38
|
+
stringdef u' '{U+00FA}' //u-acute
|
39
|
+
stringdef u" '{U+00FC}' //u-umlaut
|
40
|
+
stringdef uq '{U+0171}' //u-double acute
|
41
|
+
|
42
|
+
define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}'
|
43
|
+
|
44
|
+
define mark_regions as (
|
45
|
+
|
46
|
+
$p1 = limit
|
47
|
+
|
48
|
+
(v goto non-v
|
49
|
+
among('cs' 'gy' 'ly' 'ny' 'sz' 'ty' 'zs' 'dzs') or next
|
50
|
+
setmark p1)
|
51
|
+
or
|
52
|
+
|
53
|
+
(non-v gopast v setmark p1)
|
54
|
+
)
|
55
|
+
|
56
|
+
backwardmode (
|
57
|
+
|
58
|
+
define R1 as $p1 <= cursor
|
59
|
+
|
60
|
+
define v_ending as (
|
61
|
+
[substring] R1 among(
|
62
|
+
'{a'}' (<- 'a')
|
63
|
+
'{e'}' (<- 'e')
|
64
|
+
)
|
65
|
+
)
|
66
|
+
|
67
|
+
define double as (
|
68
|
+
test among('bb' 'cc' 'ccs' 'dd' 'ff' 'gg' 'ggy' 'jj' 'kk' 'll' 'lly' 'mm'
|
69
|
+
'nn' 'nny' 'pp' 'rr' 'ss' 'ssz' 'tt' 'tty' 'vv' 'zz' 'zzs')
|
70
|
+
)
|
71
|
+
|
72
|
+
define undouble as (
|
73
|
+
next [hop 1] delete
|
74
|
+
)
|
75
|
+
|
76
|
+
define instrum as(
|
77
|
+
[substring] R1 among(
|
78
|
+
'al' (double)
|
79
|
+
'el' (double)
|
80
|
+
)
|
81
|
+
delete
|
82
|
+
undouble
|
83
|
+
)
|
84
|
+
|
85
|
+
|
86
|
+
define case as (
|
87
|
+
[substring] R1 among(
|
88
|
+
'ban' 'ben'
|
89
|
+
'ba' 'be'
|
90
|
+
'ra' 're'
|
91
|
+
'nak' 'nek'
|
92
|
+
'val' 'vel'
|
93
|
+
't{o'}l' 't{oq}l'
|
94
|
+
'r{o'}l' 'r{oq}l'
|
95
|
+
'b{o'}l' 'b{oq}l'
|
96
|
+
'hoz' 'hez' 'h{o"}z'
|
97
|
+
'n{a'}l' 'n{e'}l'
|
98
|
+
'ig'
|
99
|
+
'at' 'et' 'ot' '{o"}t'
|
100
|
+
'{e'}rt'
|
101
|
+
'k{e'}pp' 'k{e'}ppen'
|
102
|
+
'kor'
|
103
|
+
'ul' '{u"}l'
|
104
|
+
'v{a'}' 'v{e'}'
|
105
|
+
'onk{e'}nt' 'enk{e'}nt' 'ank{e'}nt'
|
106
|
+
'k{e'}nt'
|
107
|
+
'en' 'on' 'an' '{o"}n'
|
108
|
+
'n'
|
109
|
+
't'
|
110
|
+
)
|
111
|
+
delete
|
112
|
+
v_ending
|
113
|
+
)
|
114
|
+
|
115
|
+
define case_special as(
|
116
|
+
[substring] R1 among(
|
117
|
+
'{e'}n' (<- 'e')
|
118
|
+
'{a'}n' (<- 'a')
|
119
|
+
'{a'}nk{e'}nt' (<- 'a')
|
120
|
+
)
|
121
|
+
)
|
122
|
+
|
123
|
+
define case_other as(
|
124
|
+
[substring] R1 among(
|
125
|
+
'astul' 'est{u"}l' (delete)
|
126
|
+
'stul' 'st{u"}l' (delete)
|
127
|
+
'{a'}stul' (<- 'a')
|
128
|
+
'{e'}st{u"}l' (<- 'e')
|
129
|
+
)
|
130
|
+
)
|
131
|
+
|
132
|
+
define factive as(
|
133
|
+
[substring] R1 among(
|
134
|
+
'{a'}' (double)
|
135
|
+
'{e'}' (double)
|
136
|
+
)
|
137
|
+
delete
|
138
|
+
undouble
|
139
|
+
)
|
140
|
+
|
141
|
+
define plural as (
|
142
|
+
[substring] R1 among(
|
143
|
+
'{a'}k' (<- 'a')
|
144
|
+
'{e'}k' (<- 'e')
|
145
|
+
'{o"}k' (delete)
|
146
|
+
'ak' (delete)
|
147
|
+
'ok' (delete)
|
148
|
+
'ek' (delete)
|
149
|
+
'k' (delete)
|
150
|
+
)
|
151
|
+
)
|
152
|
+
|
153
|
+
define owned as (
|
154
|
+
[substring] R1 among (
|
155
|
+
'ok{e'}' '{o"}k{e'}' 'ak{e'}' 'ek{e'}' (delete)
|
156
|
+
'{e'}k{e'}' (<- 'e')
|
157
|
+
'{a'}k{e'}' (<- 'a')
|
158
|
+
'k{e'}' (delete)
|
159
|
+
'{e'}{e'}i' (<- 'e')
|
160
|
+
'{a'}{e'}i' (<- 'a')
|
161
|
+
'{e'}i' (delete)
|
162
|
+
'{e'}{e'}' (<- 'e')
|
163
|
+
'{e'}' (delete)
|
164
|
+
)
|
165
|
+
)
|
166
|
+
|
167
|
+
define sing_owner as (
|
168
|
+
[substring] R1 among(
|
169
|
+
'{u"}nk' 'unk' (delete)
|
170
|
+
'{a'}nk' (<- 'a')
|
171
|
+
'{e'}nk' (<- 'e')
|
172
|
+
'nk' (delete)
|
173
|
+
'{a'}juk' (<- 'a')
|
174
|
+
'{e'}j{u"}k' (<- 'e')
|
175
|
+
'juk' 'j{u"}k' (delete)
|
176
|
+
'uk' '{u"}k' (delete)
|
177
|
+
'em' 'om' 'am' (delete)
|
178
|
+
'{a'}m' (<- 'a')
|
179
|
+
'{e'}m' (<- 'e')
|
180
|
+
'm' (delete)
|
181
|
+
'od' 'ed' 'ad' '{o"}d' (delete)
|
182
|
+
'{a'}d' (<- 'a')
|
183
|
+
'{e'}d' (<- 'e')
|
184
|
+
'd' (delete)
|
185
|
+
'ja' 'je' (delete)
|
186
|
+
'a' 'e' 'o' (delete)
|
187
|
+
'{a'}' (<- 'a')
|
188
|
+
'{e'}' (<- 'e')
|
189
|
+
)
|
190
|
+
)
|
191
|
+
|
192
|
+
define plur_owner as (
|
193
|
+
[substring] R1 among(
|
194
|
+
'jaim' 'jeim' (delete)
|
195
|
+
'{a'}im' (<- 'a')
|
196
|
+
'{e'}im' (<- 'e')
|
197
|
+
'aim' 'eim' (delete)
|
198
|
+
'im' (delete)
|
199
|
+
'jaid' 'jeid' (delete)
|
200
|
+
'{a'}id' (<- 'a')
|
201
|
+
'{e'}id' (<- 'e')
|
202
|
+
'aid' 'eid' (delete)
|
203
|
+
'id' (delete)
|
204
|
+
'jai' 'jei' (delete)
|
205
|
+
'{a'}i' (<- 'a')
|
206
|
+
'{e'}i' (<- 'e')
|
207
|
+
'ai' 'ei' (delete)
|
208
|
+
'i' (delete)
|
209
|
+
'jaink' 'jeink' (delete)
|
210
|
+
'eink' 'aink' (delete)
|
211
|
+
'{a'}ink' (<- 'a')
|
212
|
+
'{e'}ink' (<- 'e')
|
213
|
+
'ink'
|
214
|
+
'jaitok' 'jeitek' (delete)
|
215
|
+
'aitok' 'eitek' (delete)
|
216
|
+
'{a'}itok' (<- 'a')
|
217
|
+
'{e'}itek' (<- 'e')
|
218
|
+
'itek' (delete)
|
219
|
+
'jeik' 'jaik' (delete)
|
220
|
+
'aik' 'eik' (delete)
|
221
|
+
'{a'}ik' (<- 'a')
|
222
|
+
'{e'}ik' (<- 'e')
|
223
|
+
'ik' (delete)
|
224
|
+
)
|
225
|
+
)
|
226
|
+
)
|
227
|
+
|
228
|
+
define stem as (
|
229
|
+
do mark_regions
|
230
|
+
backwards (
|
231
|
+
do instrum
|
232
|
+
do case
|
233
|
+
do case_special
|
234
|
+
do case_other
|
235
|
+
do factive
|
236
|
+
do owned
|
237
|
+
do sing_owner
|
238
|
+
do plur_owner
|
239
|
+
do plural
|
240
|
+
)
|
241
|
+
)
|