mittens 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Gemfile +7 -0
- data/LICENSE.txt +30 -0
- data/README.md +62 -0
- data/Rakefile +21 -0
- data/ext/mittens/ext.c +96 -0
- data/ext/mittens/extconf.rb +12 -0
- data/lib/mittens/version.rb +3 -0
- data/lib/mittens.rb +7 -0
- data/mittens.gemspec +22 -0
- data/vendor/snowball/.gitignore +26 -0
- data/vendor/snowball/.travis.yml +112 -0
- data/vendor/snowball/AUTHORS +27 -0
- data/vendor/snowball/CONTRIBUTING.rst +216 -0
- data/vendor/snowball/COPYING +29 -0
- data/vendor/snowball/GNUmakefile +742 -0
- data/vendor/snowball/NEWS +754 -0
- data/vendor/snowball/README.rst +37 -0
- data/vendor/snowball/ada/README.md +74 -0
- data/vendor/snowball/ada/generate/generate.adb +83 -0
- data/vendor/snowball/ada/generate.gpr +21 -0
- data/vendor/snowball/ada/src/stemmer.adb +620 -0
- data/vendor/snowball/ada/src/stemmer.ads +219 -0
- data/vendor/snowball/ada/src/stemwords.adb +70 -0
- data/vendor/snowball/ada/stemmer_config.gpr +83 -0
- data/vendor/snowball/ada/stemwords.gpr +21 -0
- data/vendor/snowball/algorithms/arabic.sbl +558 -0
- data/vendor/snowball/algorithms/armenian.sbl +301 -0
- data/vendor/snowball/algorithms/basque.sbl +149 -0
- data/vendor/snowball/algorithms/catalan.sbl +202 -0
- data/vendor/snowball/algorithms/danish.sbl +93 -0
- data/vendor/snowball/algorithms/dutch.sbl +164 -0
- data/vendor/snowball/algorithms/english.sbl +229 -0
- data/vendor/snowball/algorithms/finnish.sbl +197 -0
- data/vendor/snowball/algorithms/french.sbl +254 -0
- data/vendor/snowball/algorithms/german.sbl +139 -0
- data/vendor/snowball/algorithms/german2.sbl +145 -0
- data/vendor/snowball/algorithms/greek.sbl +701 -0
- data/vendor/snowball/algorithms/hindi.sbl +323 -0
- data/vendor/snowball/algorithms/hungarian.sbl +241 -0
- data/vendor/snowball/algorithms/indonesian.sbl +192 -0
- data/vendor/snowball/algorithms/irish.sbl +149 -0
- data/vendor/snowball/algorithms/italian.sbl +202 -0
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
- data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
- data/vendor/snowball/algorithms/lovins.sbl +208 -0
- data/vendor/snowball/algorithms/nepali.sbl +92 -0
- data/vendor/snowball/algorithms/norwegian.sbl +80 -0
- data/vendor/snowball/algorithms/porter.sbl +139 -0
- data/vendor/snowball/algorithms/portuguese.sbl +218 -0
- data/vendor/snowball/algorithms/romanian.sbl +236 -0
- data/vendor/snowball/algorithms/russian.sbl +221 -0
- data/vendor/snowball/algorithms/serbian.sbl +2379 -0
- data/vendor/snowball/algorithms/spanish.sbl +230 -0
- data/vendor/snowball/algorithms/swedish.sbl +72 -0
- data/vendor/snowball/algorithms/tamil.sbl +405 -0
- data/vendor/snowball/algorithms/turkish.sbl +470 -0
- data/vendor/snowball/algorithms/yiddish.sbl +460 -0
- data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
- data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
- data/vendor/snowball/charsets/cp850.sbl +130 -0
- data/vendor/snowball/compiler/analyser.c +1547 -0
- data/vendor/snowball/compiler/driver.c +615 -0
- data/vendor/snowball/compiler/generator.c +1748 -0
- data/vendor/snowball/compiler/generator_ada.c +1702 -0
- data/vendor/snowball/compiler/generator_csharp.c +1322 -0
- data/vendor/snowball/compiler/generator_go.c +1278 -0
- data/vendor/snowball/compiler/generator_java.c +1313 -0
- data/vendor/snowball/compiler/generator_js.c +1316 -0
- data/vendor/snowball/compiler/generator_pascal.c +1387 -0
- data/vendor/snowball/compiler/generator_python.c +1337 -0
- data/vendor/snowball/compiler/generator_rust.c +1295 -0
- data/vendor/snowball/compiler/header.h +418 -0
- data/vendor/snowball/compiler/space.c +286 -0
- data/vendor/snowball/compiler/syswords.h +86 -0
- data/vendor/snowball/compiler/syswords2.h +13 -0
- data/vendor/snowball/compiler/tokeniser.c +567 -0
- data/vendor/snowball/csharp/.gitignore +8 -0
- data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
- data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
- data/vendor/snowball/csharp/Stemwords/App.config +6 -0
- data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
- data/vendor/snowball/doc/TODO +12 -0
- data/vendor/snowball/doc/libstemmer_c_README +148 -0
- data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
- data/vendor/snowball/doc/libstemmer_java_README +67 -0
- data/vendor/snowball/doc/libstemmer_js_README +48 -0
- data/vendor/snowball/doc/libstemmer_python_README +113 -0
- data/vendor/snowball/examples/stemwords.c +204 -0
- data/vendor/snowball/go/README.md +55 -0
- data/vendor/snowball/go/among.go +16 -0
- data/vendor/snowball/go/env.go +403 -0
- data/vendor/snowball/go/stemwords/generate.go +68 -0
- data/vendor/snowball/go/stemwords/main.go +68 -0
- data/vendor/snowball/go/util.go +34 -0
- data/vendor/snowball/iconv.py +50 -0
- data/vendor/snowball/include/libstemmer.h +78 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
- data/vendor/snowball/javascript/base-stemmer.js +294 -0
- data/vendor/snowball/javascript/stemwords.js +106 -0
- data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
- data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
- data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
- data/vendor/snowball/libstemmer/modules.txt +63 -0
- data/vendor/snowball/libstemmer/test.c +34 -0
- data/vendor/snowball/pascal/.gitignore +4 -0
- data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
- data/vendor/snowball/pascal/generate.pl +23 -0
- data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
- data/vendor/snowball/python/MANIFEST.in +7 -0
- data/vendor/snowball/python/create_init.py +54 -0
- data/vendor/snowball/python/setup.cfg +6 -0
- data/vendor/snowball/python/setup.py +81 -0
- data/vendor/snowball/python/snowballstemmer/among.py +13 -0
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
- data/vendor/snowball/python/stemwords.py +101 -0
- data/vendor/snowball/python/testapp.py +28 -0
- data/vendor/snowball/runtime/api.c +58 -0
- data/vendor/snowball/runtime/api.h +32 -0
- data/vendor/snowball/runtime/header.h +61 -0
- data/vendor/snowball/runtime/utilities.c +513 -0
- data/vendor/snowball/rust/Cargo.toml +7 -0
- data/vendor/snowball/rust/build.rs +55 -0
- data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
- data/vendor/snowball/rust/src/main.rs +102 -0
- data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
- data/vendor/snowball/rust/src/snowball/among.rs +6 -0
- data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
- data/vendor/snowball/tests/stemtest.c +95 -0
- metadata +178 -0
@@ -0,0 +1,230 @@
|
|
1
|
+
routines (
|
2
|
+
postlude mark_regions
|
3
|
+
RV R1 R2
|
4
|
+
attached_pronoun
|
5
|
+
standard_suffix
|
6
|
+
y_verb_suffix
|
7
|
+
verb_suffix
|
8
|
+
residual_suffix
|
9
|
+
)
|
10
|
+
|
11
|
+
externals ( stem )
|
12
|
+
|
13
|
+
integers ( pV p1 p2 )
|
14
|
+
|
15
|
+
groupings ( v )
|
16
|
+
|
17
|
+
stringescapes {}
|
18
|
+
|
19
|
+
/* special characters */
|
20
|
+
|
21
|
+
stringdef a' '{U+00E1}' // a-acute
|
22
|
+
stringdef e' '{U+00E9}' // e-acute
|
23
|
+
stringdef i' '{U+00ED}' // i-acute
|
24
|
+
stringdef o' '{U+00F3}' // o-acute
|
25
|
+
stringdef u' '{U+00FA}' // u-acute
|
26
|
+
stringdef u" '{U+00FC}' // u-diaeresis
|
27
|
+
stringdef n~ '{U+00F1}' // n-tilde
|
28
|
+
|
29
|
+
define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}'
|
30
|
+
|
31
|
+
define mark_regions as (
|
32
|
+
|
33
|
+
$pV = limit
|
34
|
+
$p1 = limit
|
35
|
+
$p2 = limit // defaults
|
36
|
+
|
37
|
+
do (
|
38
|
+
( v (non-v gopast v) or (v gopast non-v) )
|
39
|
+
or
|
40
|
+
( non-v (non-v gopast v) or (v next) )
|
41
|
+
setmark pV
|
42
|
+
)
|
43
|
+
do (
|
44
|
+
gopast v gopast non-v setmark p1
|
45
|
+
gopast v gopast non-v setmark p2
|
46
|
+
)
|
47
|
+
)
|
48
|
+
|
49
|
+
define postlude as repeat (
|
50
|
+
[substring] among(
|
51
|
+
'{a'}' (<- 'a')
|
52
|
+
'{e'}' (<- 'e')
|
53
|
+
'{i'}' (<- 'i')
|
54
|
+
'{o'}' (<- 'o')
|
55
|
+
'{u'}' (<- 'u')
|
56
|
+
// and possibly {u"}->u here, or in prelude
|
57
|
+
'' (next)
|
58
|
+
) //or next
|
59
|
+
)
|
60
|
+
|
61
|
+
backwardmode (
|
62
|
+
|
63
|
+
define RV as $pV <= cursor // succeeds only when the cursor is at or after the pV mark
|
64
|
+
define R1 as $p1 <= cursor // succeeds only when the cursor is at or after the p1 mark
|
65
|
+
define R2 as $p2 <= cursor // succeeds only when the cursor is at or after the p2 mark
|
66
|
+
|
67
|
+
define attached_pronoun as (
|
68
|
+
[substring] among(
|
69
|
+
'me' 'se' 'sela' 'selo' 'selas' 'selos' 'la' 'le' 'lo'
|
70
|
+
'las' 'les' 'los' 'nos'
|
71
|
+
)
|
72
|
+
substring RV among(
|
73
|
+
'i{e'}ndo' (] <- 'iendo')
|
74
|
+
'{a'}ndo' (] <- 'ando')
|
75
|
+
'{a'}r' (] <- 'ar')
|
76
|
+
'{e'}r' (] <- 'er')
|
77
|
+
'{i'}r' (] <- 'ir')
|
78
|
+
'ando'
|
79
|
+
'iendo'
|
80
|
+
'ar' 'er' 'ir'
|
81
|
+
(delete)
|
82
|
+
'yendo' ('u' delete)
|
83
|
+
)
|
84
|
+
)
|
85
|
+
|
86
|
+
define standard_suffix as (
|
87
|
+
[substring] among(
|
88
|
+
|
89
|
+
'anza' 'anzas'
|
90
|
+
'ico' 'ica' 'icos' 'icas'
|
91
|
+
'ismo' 'ismos'
|
92
|
+
'able' 'ables'
|
93
|
+
'ible' 'ibles'
|
94
|
+
'ista' 'istas'
|
95
|
+
'oso' 'osa' 'osos' 'osas'
|
96
|
+
'amiento' 'amientos'
|
97
|
+
'imiento' 'imientos'
|
98
|
+
(
|
99
|
+
R2 delete
|
100
|
+
)
|
101
|
+
'adora' 'ador' 'aci{o'}n'
|
102
|
+
'adoras' 'adores' 'aciones'
|
103
|
+
'ante' 'antes' 'ancia' 'ancias'// Note 1
|
104
|
+
(
|
105
|
+
R2 delete
|
106
|
+
try ( ['ic'] R2 delete )
|
107
|
+
)
|
108
|
+
'log{i'}a'
|
109
|
+
'log{i'}as'
|
110
|
+
(
|
111
|
+
R2 <- 'log'
|
112
|
+
)
|
113
|
+
'uci{o'}n' 'uciones'
|
114
|
+
(
|
115
|
+
R2 <- 'u'
|
116
|
+
)
|
117
|
+
'encia' 'encias'
|
118
|
+
(
|
119
|
+
R2 <- 'ente'
|
120
|
+
)
|
121
|
+
'amente'
|
122
|
+
(
|
123
|
+
R1 delete
|
124
|
+
try (
|
125
|
+
[substring] R2 delete among(
|
126
|
+
'iv' (['at'] R2 delete)
|
127
|
+
'os'
|
128
|
+
'ic'
|
129
|
+
'ad'
|
130
|
+
)
|
131
|
+
)
|
132
|
+
)
|
133
|
+
'mente'
|
134
|
+
(
|
135
|
+
R2 delete
|
136
|
+
try (
|
137
|
+
[substring] among(
|
138
|
+
'ante' // Note 1
|
139
|
+
'able'
|
140
|
+
'ible' (R2 delete)
|
141
|
+
)
|
142
|
+
)
|
143
|
+
)
|
144
|
+
'idad'
|
145
|
+
'idades'
|
146
|
+
(
|
147
|
+
R2 delete
|
148
|
+
try (
|
149
|
+
[substring] among(
|
150
|
+
'abil'
|
151
|
+
'ic'
|
152
|
+
'iv' (R2 delete)
|
153
|
+
)
|
154
|
+
)
|
155
|
+
)
|
156
|
+
'iva' 'ivo'
|
157
|
+
'ivas' 'ivos'
|
158
|
+
(
|
159
|
+
R2 delete
|
160
|
+
try (
|
161
|
+
['at'] R2 delete // but not a further ['ic'] R2 delete
|
162
|
+
)
|
163
|
+
)
|
164
|
+
)
|
165
|
+
)
|
166
|
+
|
167
|
+
define y_verb_suffix as (
|
168
|
+
setlimit tomark pV for ([substring]) among(
|
169
|
+
'ya' 'ye' 'yan' 'yen' 'yeron' 'yendo' 'yo' 'y{o'}'
|
170
|
+
'yas' 'yes' 'yais' 'yamos'
|
171
|
+
('u' delete)
|
172
|
+
)
|
173
|
+
)
|
174
|
+
|
175
|
+
define verb_suffix as (
|
176
|
+
setlimit tomark pV for ([substring]) among(
|
177
|
+
|
178
|
+
'en' 'es' '{e'}is' 'emos'
|
179
|
+
(try ('u' test 'g') ] delete)
|
180
|
+
|
181
|
+
'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais'
|
182
|
+
'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ar{a'}'
|
183
|
+
'ar{e'}'
|
184
|
+
'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais'
|
185
|
+
'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}'
|
186
|
+
'er{e'}'
|
187
|
+
'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais'
|
188
|
+
'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}'
|
189
|
+
'ir{e'}'
|
190
|
+
|
191
|
+
'aba' 'ada' 'ida' '{i'}a' 'ara' 'iera' 'ad' 'ed'
|
192
|
+
'id' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an'
|
193
|
+
'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado'
|
194
|
+
'ido' 'ando' 'iendo' 'i{o'}' 'ar' 'er' 'ir' 'as'
|
195
|
+
'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases'
|
196
|
+
'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais'
|
197
|
+
'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados'
|
198
|
+
'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos'
|
199
|
+
'{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos'
|
200
|
+
(delete)
|
201
|
+
)
|
202
|
+
)
|
203
|
+
|
204
|
+
define residual_suffix as (
|
205
|
+
[substring] among(
|
206
|
+
'os'
|
207
|
+
'a' 'o' '{a'}' '{i'}' '{o'}'
|
208
|
+
( RV delete )
|
209
|
+
'e' '{e'}'
|
210
|
+
( RV delete try( ['u'] test 'g' RV delete ) )
|
211
|
+
)
|
212
|
+
)
|
213
|
+
)
|
214
|
+
|
215
|
+
define stem as (
|
216
|
+
do mark_regions
|
217
|
+
backwards (
|
218
|
+
do attached_pronoun
|
219
|
+
do ( standard_suffix or
|
220
|
+
y_verb_suffix or
|
221
|
+
verb_suffix
|
222
|
+
)
|
223
|
+
do residual_suffix
|
224
|
+
)
|
225
|
+
do postlude
|
226
|
+
)
|
227
|
+
|
228
|
+
/*
|
229
|
+
Note 1: additions of 15 Jun 2005
|
230
|
+
*/
|
@@ -0,0 +1,72 @@
|
|
1
|
+
routines (
|
2
|
+
mark_regions
|
3
|
+
main_suffix
|
4
|
+
consonant_pair
|
5
|
+
other_suffix
|
6
|
+
)
|
7
|
+
|
8
|
+
externals ( stem )
|
9
|
+
|
10
|
+
integers ( p1 x )
|
11
|
+
|
12
|
+
groupings ( v s_ending )
|
13
|
+
|
14
|
+
stringescapes {}
|
15
|
+
|
16
|
+
/* special characters */
|
17
|
+
|
18
|
+
stringdef a" '{U+00E4}' // a-diaeresis
|
19
|
+
stringdef ao '{U+00E5}' // a-ring
|
20
|
+
stringdef o" '{U+00F6}' // o-diaeresis
|
21
|
+
|
22
|
+
define v 'aeiouy{a"}{ao}{o"}'
|
23
|
+
|
24
|
+
define s_ending 'bcdfghjklmnoprtvy' // letters that must precede a final 's' for main_suffix to delete it
|
25
|
+
|
26
|
+
define mark_regions as (
|
27
|
+
|
28
|
+
$p1 = limit
|
29
|
+
test ( hop 3 setmark x )
|
30
|
+
goto v gopast non-v setmark p1
|
31
|
+
try ( $p1 < x $p1 = x )
|
32
|
+
)
|
33
|
+
|
34
|
+
backwardmode (
|
35
|
+
|
36
|
+
define main_suffix as (
|
37
|
+
setlimit tomark p1 for ([substring])
|
38
|
+
among(
|
39
|
+
|
40
|
+
'a' 'arna' 'erna' 'heterna' 'orna' 'ad' 'e' 'ade' 'ande' 'arne'
|
41
|
+
'are' 'aste' 'en' 'anden' 'aren' 'heten' 'ern' 'ar' 'er' 'heter'
|
42
|
+
'or' 'as' 'arnas' 'ernas' 'ornas' 'es' 'ades' 'andes' 'ens' 'arens'
|
43
|
+
'hetens' 'erns' 'at' 'andet' 'het' 'ast'
|
44
|
+
(delete)
|
45
|
+
's'
|
46
|
+
(s_ending delete)
|
47
|
+
)
|
48
|
+
)
|
49
|
+
|
50
|
+
define consonant_pair as setlimit tomark p1 for (
|
51
|
+
among('dd' 'gd' 'nn' 'dt' 'gt' 'kt' 'tt')
|
52
|
+
and ([next] delete)
|
53
|
+
)
|
54
|
+
|
55
|
+
define other_suffix as setlimit tomark p1 for (
|
56
|
+
[substring] among(
|
57
|
+
'lig' 'ig' 'els' (delete)
|
58
|
+
'l{o"}st' (<-'l{o"}s')
|
59
|
+
'fullt' (<-'full')
|
60
|
+
)
|
61
|
+
)
|
62
|
+
)
|
63
|
+
|
64
|
+
define stem as (
|
65
|
+
|
66
|
+
do mark_regions
|
67
|
+
backwards (
|
68
|
+
do main_suffix
|
69
|
+
do consonant_pair
|
70
|
+
do other_suffix
|
71
|
+
)
|
72
|
+
)
|
@@ -0,0 +1,405 @@
|
|
1
|
+
/*
|
2
|
+
* Affix stripping stemming algorithm for Tamil
|
3
|
+
* By Damodharan Rajalingam
|
4
|
+
*/
|
5
|
+
|
6
|
+
stringescapes {}
|
7
|
+
|
8
|
+
/* Aytham */
|
9
|
+
stringdef aytham '{U+0B83}'
|
10
|
+
|
11
|
+
/* Uyir - independent vowels */
|
12
|
+
stringdef a '{U+0B85}'
|
13
|
+
stringdef aa '{U+0B86}'
|
14
|
+
stringdef i '{U+0B87}'
|
15
|
+
stringdef ii '{U+0B88}'
|
16
|
+
stringdef u '{U+0B89}'
|
17
|
+
stringdef uu '{U+0B8A}'
|
18
|
+
stringdef e '{U+0B8E}'
|
19
|
+
stringdef ee '{U+0B8F}'
|
20
|
+
stringdef ai '{U+0B90}'
|
21
|
+
stringdef o '{U+0B92}'
|
22
|
+
stringdef oo '{U+0B93}'
|
23
|
+
stringdef au '{U+0B94}'
|
24
|
+
|
25
|
+
/* Consonants */
|
26
|
+
stringdef ka '{U+0B95}'
|
27
|
+
stringdef nga '{U+0B99}'
|
28
|
+
stringdef ca '{U+0B9A}'
|
29
|
+
stringdef ja '{U+0B9C}'
|
30
|
+
stringdef nya '{U+0B9E}'
|
31
|
+
stringdef tta '{U+0B9F}'
|
32
|
+
stringdef nna '{U+0BA3}'
|
33
|
+
stringdef ta '{U+0BA4}'
|
34
|
+
stringdef tha '{U+0BA4}' // same code point as {ta}
|
35
|
+
stringdef na '{U+0BA8}'
|
36
|
+
stringdef nnna '{U+0BA9}'
|
37
|
+
stringdef pa '{U+0BAA}'
|
38
|
+
stringdef ma '{U+0BAE}'
|
39
|
+
stringdef ya '{U+0BAF}'
|
40
|
+
stringdef ra '{U+0BB0}'
|
41
|
+
stringdef rra '{U+0BB1}'
|
42
|
+
stringdef la '{U+0BB2}'
|
43
|
+
stringdef lla '{U+0BB3}'
|
44
|
+
stringdef llla '{U+0BB4}'
|
45
|
+
stringdef zha '{U+0BB4}' // same code point as {llla}
|
46
|
+
stringdef va '{U+0BB5}'
|
47
|
+
|
48
|
+
/* Vatamozi - borrowed */
|
49
|
+
stringdef sha '{U+0BB6}'
|
50
|
+
stringdef ssa '{U+0BB7}'
|
51
|
+
stringdef sa '{U+0BB8}'
|
52
|
+
stringdef ha '{U+0BB9}'
|
53
|
+
|
54
|
+
|
55
|
+
/* Dependent vowel signs (kombu etc.) */
|
56
|
+
stringdef vs_aa '{U+0BBE}'
|
57
|
+
stringdef vs_i '{U+0BBF}'
|
58
|
+
stringdef vs_ii '{U+0BC0}'
|
59
|
+
stringdef vs_u '{U+0BC1}'
|
60
|
+
stringdef vs_uu '{U+0BC2}'
|
61
|
+
stringdef vs_e '{U+0BC6}'
|
62
|
+
stringdef vs_ee '{U+0BC7}'
|
63
|
+
stringdef vs_ai '{U+0BC8}'
|
64
|
+
stringdef vs_o '{U+0BCA}'
|
65
|
+
stringdef vs_oo '{U+0BCB}'
|
66
|
+
stringdef vs_au '{U+0BCC}'
|
67
|
+
|
68
|
+
/* Pulli */
|
69
|
+
stringdef pulli '{U+0BCD}'
|
70
|
+
|
71
|
+
/* AU length mark */
|
72
|
+
stringdef au_lmark '{U+0BD7}'
|
73
|
+
|
74
|
+
|
75
|
+
routines (
|
76
|
+
remove_plural_suffix
|
77
|
+
remove_question_suffixes
|
78
|
+
remove_question_prefixes
|
79
|
+
remove_pronoun_prefixes
|
80
|
+
remove_command_suffixes
|
81
|
+
remove_um
|
82
|
+
remove_vetrumai_urupukal
|
83
|
+
fix_va_start
|
84
|
+
fix_ending
|
85
|
+
fix_endings
|
86
|
+
remove_tense_suffix
|
87
|
+
remove_tense_suffixes
|
88
|
+
remove_common_word_endings
|
89
|
+
has_min_length
|
90
|
+
)
|
91
|
+
|
92
|
+
externals ( stem )
|
93
|
+
|
94
|
+
booleans (
|
95
|
+
found_a_match
|
96
|
+
found_vetrumai_urupu
|
97
|
+
)
|
98
|
+
|
99
|
+
define has_min_length as (
|
100
|
+
$(len > 4)
|
101
|
+
)
|
102
|
+
|
103
|
+
define fix_va_start as (
|
104
|
+
(try '{va}{vs_oo}' and [ '{va}{vs_oo}' ] <- '{oo}' ) or
|
105
|
+
(try '{va}{vs_o}' and [ '{va}{vs_o}' ] <- '{o}' ) or
|
106
|
+
(try '{va}{vs_u}' and [ '{va}{vs_u}' ] <- '{u}' ) or
|
107
|
+
(try '{va}{vs_uu}' and [ '{va}{vs_uu}' ] <- '{uu}' )
|
108
|
+
)
|
109
|
+
|
110
|
+
define fix_endings as (
|
111
|
+
do repeat fix_ending
|
112
|
+
)
|
113
|
+
|
114
|
+
define remove_question_prefixes as (
|
115
|
+
[ ('{e}' ) among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') '{pulli}' ] delete
|
116
|
+
do fix_va_start
|
117
|
+
)
|
118
|
+
|
119
|
+
// Gives signal t if an ending was fixed, signal f otherwise.
|
120
|
+
define fix_ending as (
|
121
|
+
$(len > 3)
|
122
|
+
backwards (
|
123
|
+
( [among('{na}{pulli}' '{na}{pulli}{ta}' '{na}{pulli}{ta}{pulli}') ] delete )
|
124
|
+
or
|
125
|
+
( ['{ya}{pulli}' test among('{vs_ai}' '{vs_i}' '{vs_ii}') ] delete )
|
126
|
+
or
|
127
|
+
( [ '{tta}{pulli}{pa}{pulli}' or '{tta}{pulli}{ka}{pulli}' ] <- '{lla}{pulli}' )
|
128
|
+
or
|
129
|
+
( [ '{nnna}{pulli}{rra}{pulli}' ] <- '{la}{pulli}' )
|
130
|
+
or
|
131
|
+
// ( [ '{rra}{pulli}{ka}{pulli}' or '{nnna}{pulli}{nnna}{pulli}' ] <- '{la}{pulli}' )
|
132
|
+
( [ '{rra}{pulli}{ka}{pulli}' ] <- '{la}{pulli}' )
|
133
|
+
or
|
134
|
+
( [ '{tta}{pulli}{tta}{pulli}' ] <- '{tta}{vs_u}' )
|
135
|
+
or
|
136
|
+
( found_vetrumai_urupu [ '{ta}{pulli}{ta}{pulli}' (test not '{vs_ai}') ] <- '{ma}{pulli}' ) // only when found_vetrumai_urupu is set: a final {ta}{pulli}{ta}{pulli} not preceded by {vs_ai} becomes {ma}{pulli}
|
137
|
+
or
|
138
|
+
( [ '{vs_u}{ka}{pulli}' or '{vs_u}{ka}{pulli}{ka}{pulli}' ] <- '{pulli}' )
|
139
|
+
or
|
140
|
+
( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete )
|
141
|
+
or
|
142
|
+
( [ '{vs_u}{ka}{pulli}' ] <- '{pulli}' )
|
143
|
+
or
|
144
|
+
( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete )
|
145
|
+
or
|
146
|
+
( [ '{pulli}' (among('{ya}' '{ra}' '{la}' '{va}' '{zha}' '{lla}') or among('{nga}' '{nya}' '{nna}' '{na}' '{ma}' '{nnna}')) '{pulli}' ] <- '{pulli}' )
|
147
|
+
or
|
148
|
+
( [ among('{va}' '{ya}' '{va}{pulli}') ] delete )
|
149
|
+
or
|
150
|
+
( [ '{nnna}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')) ] delete )
|
151
|
+
or
|
152
|
+
( [ '{nga}{pulli}' (test not '{vs_ai}')] <- '{ma}{pulli}' )
|
153
|
+
or
|
154
|
+
( [ '{nga}{pulli}' ] delete )
|
155
|
+
or
|
156
|
+
( [ '{pulli}' (test (among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}') or '{pulli}')) ] delete )
|
157
|
+
)
|
158
|
+
)
|
159
|
+
|
160
|
+
define remove_pronoun_prefixes as (
|
161
|
+
unset found_a_match
|
162
|
+
[ among('{a}' '{i}' '{u}') among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') '{pulli}' ] delete
|
163
|
+
(set found_a_match)
|
164
|
+
do fix_va_start
|
165
|
+
)
|
166
|
+
|
167
|
+
define remove_plural_suffix as (
|
168
|
+
unset found_a_match
|
169
|
+
backwards (
|
170
|
+
( [ '{vs_u}{nga}{pulli}{ka}{lla}{pulli}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}')) ] <- '{pulli}' ) or
|
171
|
+
( [ '{rra}{pulli}{ka}{lla}{pulli}' ] <- '{la}{pulli}' ) or
|
172
|
+
( [ '{tta}{pulli}{ka}{lla}{pulli}' ] <- '{lla}{pulli}' ) or
|
173
|
+
( [ '{ka}{lla}{pulli}' ] delete )
|
174
|
+
(set found_a_match)
|
175
|
+
)
|
176
|
+
)
|
177
|
+
|
178
|
+
define remove_question_suffixes as (
|
179
|
+
has_min_length
|
180
|
+
unset found_a_match
|
181
|
+
backwards (
|
182
|
+
do (
|
183
|
+
[ among('{vs_oo}' '{vs_ee}' '{vs_aa}') ] <- '{pulli}'
|
184
|
+
(set found_a_match)
|
185
|
+
)
|
186
|
+
)
|
187
|
+
do fix_endings
|
188
|
+
)
|
189
|
+
|
190
|
+
define remove_command_suffixes as (
|
191
|
+
has_min_length
|
192
|
+
unset found_a_match
|
193
|
+
backwards (
|
194
|
+
[ among('{pa}{vs_i}' '{va}{vs_i}') ] delete
|
195
|
+
(set found_a_match)
|
196
|
+
)
|
197
|
+
)
|
198
|
+
|
199
|
+
define remove_um as (
|
200
|
+
unset found_a_match
|
201
|
+
has_min_length
|
202
|
+
backwards ( [ '{vs_u}{ma}{pulli}' ] <- '{pulli}'
|
203
|
+
(set found_a_match)
|
204
|
+
)
|
205
|
+
do fix_ending
|
206
|
+
)
|
207
|
+
|
208
|
+
define remove_common_word_endings as (
|
209
|
+
// These are not suffixes actually but are
|
210
|
+
// some words that are attached to other words
|
211
|
+
// but can be removed for stemming
|
212
|
+
unset found_a_match
|
213
|
+
has_min_length
|
214
|
+
backwards (
|
215
|
+
test ( [ '{vs_u}{tta}{nnna}{pulli}' or
|
216
|
+
'{vs_i}{la}{pulli}{la}{vs_ai}' or
|
217
|
+
'{vs_i}{tta}{ma}{pulli}' or
|
218
|
+
'{vs_i}{nnna}{pulli}{rra}{vs_i}' or
|
219
|
+
'{vs_aa}{ka}{vs_i}' or
|
220
|
+
'{vs_aa}{ka}{vs_i}{ya}' or
|
221
|
+
'{vs_e}{nnna}{pulli}{rra}{vs_u}' or
|
222
|
+
'{vs_u}{lla}{pulli}{lla}' or
|
223
|
+
'{vs_u}{tta}{vs_ai}{ya}' or
|
224
|
+
'{vs_u}{tta}{vs_ai}' or
|
225
|
+
'{vs_e}{nnna}{vs_u}{ma}{pulli}' or
|
226
|
+
('{la}{pulli}{la}' test (not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
|
227
|
+
'{vs_e}{nnna}' or
|
228
|
+
'{vs_aa}{ka}{vs_i}' ] <- '{pulli}'
|
229
|
+
(set found_a_match)
|
230
|
+
)
|
231
|
+
or
|
232
|
+
test ( [ among('{pa}{tta}{vs_u}'
|
233
|
+
'{pa}{tta}{pulli}{tta}'
|
234
|
+
'{pa}{tta}{pulli}{tta}{vs_u}'
|
235
|
+
'{pa}{tta}{pulli}{tta}{ta}{vs_u}'
|
236
|
+
'{pa}{tta}{pulli}{tta}{nna}'
|
237
|
+
'{ka}{vs_u}{ra}{vs_i}{ya}'
|
238
|
+
'{pa}{rra}{pulli}{rra}{vs_i}'
|
239
|
+
'{va}{vs_i}{tta}{vs_u}'
|
240
|
+
'{va}{vs_i}{tta}{pulli}{tta}{vs_u}'
|
241
|
+
'{pa}{tta}{vs_i}{ta}{vs_aa}{nnna}'
|
242
|
+
'{pa}{tta}{vs_i}'
|
243
|
+
'{ta}{vs_aa}{nnna}'
|
244
|
+
'{vs_e}{la}{pulli}{la}{vs_aa}{ma}{pulli}')
|
245
|
+
] delete
|
246
|
+
(set found_a_match)
|
247
|
+
)
|
248
|
+
)
|
249
|
+
do fix_endings
|
250
|
+
)
|
251
|
+
|
252
|
+
define remove_vetrumai_urupukal as (
|
253
|
+
unset found_a_match
|
254
|
+
unset found_vetrumai_urupu
|
255
|
+
has_min_length
|
256
|
+
backwards (
|
257
|
+
(
|
258
|
+
test ( ['{nnna}{vs_ai}'] delete )
|
259
|
+
or
|
260
|
+
test ([ ( '{vs_i}{nnna}{vs_ai}' or
|
261
|
+
'{vs_ai}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}'))) or
|
262
|
+
( '{vs_ai}' (test (among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}')))
|
263
|
+
] <- '{pulli}'
|
264
|
+
)
|
265
|
+
or
|
266
|
+
test ( [
|
267
|
+
'{vs_o}{tta}{vs_u}' or
|
268
|
+
'{vs_oo}{tta}{vs_u}' or
|
269
|
+
'{vs_i}{la}{pulli}' or
|
270
|
+
'{vs_i}{rra}{pulli}' or
|
271
|
+
('{vs_i}{nnna}{pulli}' (test not '{ma}')) or
|
272
|
+
'{vs_i}{nnna}{pulli}{rra}{vs_u}' or
|
273
|
+
'{vs_i}{ra}{vs_u}{na}{pulli}{ta}{vs_u}' or
|
274
|
+
'{va}{vs_i}{tta}' or
|
275
|
+
($(len >= 7) '{vs_i}{tta}{ma}{pulli}') or
|
276
|
+
'{vs_aa}{la}{pulli}' or
|
277
|
+
'{vs_u}{tta}{vs_ai}' or
|
278
|
+
'{vs_aa}{ma}{la}{pulli}' or
|
279
|
+
('{la}{pulli}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
|
280
|
+
'{vs_u}{lla}{pulli}'
|
281
|
+
] <- '{pulli}'
|
282
|
+
)
|
283
|
+
or
|
284
|
+
test ( [
|
285
|
+
'{ka}{nna}{pulli}' or
|
286
|
+
'{ma}{vs_u}{nnna}{pulli}' or
|
287
|
+
'{ma}{vs_ee}{la}{pulli}' or
|
288
|
+
'{ma}{vs_ee}{rra}{pulli}' or
|
289
|
+
'{ka}{vs_ii}{llla}{pulli}' or
|
290
|
+
'{pa}{vs_i}{nnna}{pulli}' or
|
291
|
+
('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')))
|
292
|
+
] delete
|
293
|
+
)
|
294
|
+
or
|
295
|
+
test ([ '{vs_ii}' ] <- '{vs_i}')
|
296
|
+
)
|
297
|
+
(set found_a_match)
|
298
|
+
(set found_vetrumai_urupu)
|
299
|
+
do ( [ '{vs_i}{nnna}{pulli}' ] <- '{pulli}' )
|
300
|
+
)
|
301
|
+
do fix_endings
|
302
|
+
)
|
303
|
+
|
304
|
+
define remove_tense_suffixes as (
|
305
|
+
set found_a_match
|
306
|
+
repeat ( found_a_match (do remove_tense_suffix) )
|
307
|
+
)
|
308
|
+
|
309
|
+
define remove_tense_suffix as (
|
310
|
+
unset found_a_match
|
311
|
+
has_min_length
|
312
|
+
backwards (
|
313
|
+
do (
|
314
|
+
test ( [among(
|
315
|
+
'{ka}{vs_o}{nna}{pulli}{tta}{vs_i}{ra}{pulli}'
|
316
|
+
'{pa}{tta}{vs_u}'
|
317
|
+
)] delete
|
318
|
+
(set found_a_match)
|
319
|
+
)
|
320
|
+
or
|
321
|
+
test ( [
|
322
|
+
'{ma}{vs_aa}{ra}{pulli}' or
|
323
|
+
'{ma}{vs_i}{nnna}{pulli}' or
|
324
|
+
'{nnna}{nnna}{pulli}' or
|
325
|
+
'{nnna}{vs_aa}{nnna}{pulli}' or
|
326
|
+
'{nnna}{vs_aa}{lla}{pulli}' or
|
327
|
+
'{nnna}{vs_aa}{ra}{pulli}' or
|
328
|
+
('{va}{nnna}{pulli}' test (not among('{a}' '{aa}' '{i}' '{ii}' '{u}' '{uu}' '{e}' '{ee}' '{ai}' '{o}' '{oo}' '{au}')) ) or
|
329
|
+
'{nnna}{lla}{pulli}' or
|
330
|
+
'{va}{lla}{pulli}' or
|
331
|
+
'{nnna}{ra}{pulli}' or
|
332
|
+
'{va}{ra}{pulli}' or
|
333
|
+
'{nnna}' or '{pa}' or '{ka}' or '{ta}' or '{ya}' or
|
334
|
+
'{pa}{nnna}{pulli}' or
|
335
|
+
'{pa}{lla}{pulli}' or
|
336
|
+
'{pa}{ra}{pulli}' or
|
337
|
+
('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
|
338
|
+
'{vs_i}{rra}{pulli}{rra}{vs_u}' or
|
339
|
+
'{pa}{ma}{pulli}' or
|
340
|
+
'{nnna}{ma}{pulli}' or
|
341
|
+
'{ta}{vs_u}{ma}{pulli}' or
|
342
|
+
'{rra}{vs_u}{ma}{pulli}' or
|
343
|
+
'{ka}{vs_u}{ma}{pulli}' or
|
344
|
+
'{nnna}{vs_e}{nnna}{pulli}' or
|
345
|
+
'{nnna}{vs_ai}' or
|
346
|
+
'{va}{vs_ai}'
|
347
|
+
] delete
|
348
|
+
(set found_a_match)
|
349
|
+
)
|
350
|
+
or
|
351
|
+
test ( [
|
352
|
+
('{vs_aa}{nnna}{pulli}' test (not '{ca}')) or
|
353
|
+
'{vs_aa}{lla}{pulli}' or
|
354
|
+
'{vs_aa}{ra}{pulli}' or
|
355
|
+
'{vs_ee}{nnna}{pulli}' or
|
356
|
+
'{vs_aa}' or
|
357
|
+
'{vs_aa}{ma}{pulli}' or
|
358
|
+
'{vs_e}{ma}{pulli}' or
|
359
|
+
'{vs_ee}{ma}{pulli}' or
|
360
|
+
'{vs_oo}{ma}{pulli}' or
|
361
|
+
'{ka}{vs_u}{ma}{pulli}' or
|
362
|
+
'{ta}{vs_u}{ma}{pulli}' or
|
363
|
+
'{tta}{vs_u}{ma}{pulli}' or
|
364
|
+
'{rra}{vs_u}{ma}{pulli}' or
|
365
|
+
'{vs_aa}{ya}{pulli}' or
|
366
|
+
'{nnna}{vs_e}{nnna}{pulli}' or
|
367
|
+
'{nnna}{vs_i}{ra}{pulli}' or
|
368
|
+
'{vs_ii}{ra}{pulli}' or
|
369
|
+
'{vs_ii}{ya}{ra}{pulli}'
|
370
|
+
] <- '{pulli}'
|
371
|
+
(set found_a_match)
|
372
|
+
)
|
373
|
+
or
|
374
|
+
test ( ([ '{ka}{vs_u}' or '{ta}{vs_u}' ) (test '{pulli}') ] delete
|
375
|
+
(set found_a_match)
|
376
|
+
)
|
377
|
+
)
|
378
|
+
do ([among(
|
379
|
+
'{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}'
|
380
|
+
'{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}{pulli}'
|
381
|
+
'{ka}{vs_i}{nnna}{pulli}{rra}'
|
382
|
+
'{ka}{vs_i}{nnna}{pulli}{rra}{pulli}'
|
383
|
+
'{ka}{vs_i}{rra}'
|
384
|
+
'{ka}{vs_i}{rra}{pulli}'
|
385
|
+
)] delete
|
386
|
+
(set found_a_match)
|
387
|
+
)
|
388
|
+
)
|
389
|
+
do fix_endings
|
390
|
+
)
|
391
|
+
|
392
|
+
define stem as (
|
393
|
+
unset found_vetrumai_urupu
|
394
|
+
do fix_ending
|
395
|
+
has_min_length
|
396
|
+
do remove_question_prefixes
|
397
|
+
do remove_pronoun_prefixes
|
398
|
+
do remove_question_suffixes
|
399
|
+
do remove_um
|
400
|
+
do remove_common_word_endings
|
401
|
+
do remove_vetrumai_urupukal
|
402
|
+
do remove_plural_suffix
|
403
|
+
do remove_command_suffixes
|
404
|
+
do remove_tense_suffixes
|
405
|
+
)
|