mittens 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (137) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE.txt +30 -0
  5. data/README.md +62 -0
  6. data/Rakefile +21 -0
  7. data/ext/mittens/ext.c +96 -0
  8. data/ext/mittens/extconf.rb +12 -0
  9. data/lib/mittens/version.rb +3 -0
  10. data/lib/mittens.rb +7 -0
  11. data/mittens.gemspec +22 -0
  12. data/vendor/snowball/.gitignore +26 -0
  13. data/vendor/snowball/.travis.yml +112 -0
  14. data/vendor/snowball/AUTHORS +27 -0
  15. data/vendor/snowball/CONTRIBUTING.rst +216 -0
  16. data/vendor/snowball/COPYING +29 -0
  17. data/vendor/snowball/GNUmakefile +742 -0
  18. data/vendor/snowball/NEWS +754 -0
  19. data/vendor/snowball/README.rst +37 -0
  20. data/vendor/snowball/ada/README.md +74 -0
  21. data/vendor/snowball/ada/generate/generate.adb +83 -0
  22. data/vendor/snowball/ada/generate.gpr +21 -0
  23. data/vendor/snowball/ada/src/stemmer.adb +620 -0
  24. data/vendor/snowball/ada/src/stemmer.ads +219 -0
  25. data/vendor/snowball/ada/src/stemwords.adb +70 -0
  26. data/vendor/snowball/ada/stemmer_config.gpr +83 -0
  27. data/vendor/snowball/ada/stemwords.gpr +21 -0
  28. data/vendor/snowball/algorithms/arabic.sbl +558 -0
  29. data/vendor/snowball/algorithms/armenian.sbl +301 -0
  30. data/vendor/snowball/algorithms/basque.sbl +149 -0
  31. data/vendor/snowball/algorithms/catalan.sbl +202 -0
  32. data/vendor/snowball/algorithms/danish.sbl +93 -0
  33. data/vendor/snowball/algorithms/dutch.sbl +164 -0
  34. data/vendor/snowball/algorithms/english.sbl +229 -0
  35. data/vendor/snowball/algorithms/finnish.sbl +197 -0
  36. data/vendor/snowball/algorithms/french.sbl +254 -0
  37. data/vendor/snowball/algorithms/german.sbl +139 -0
  38. data/vendor/snowball/algorithms/german2.sbl +145 -0
  39. data/vendor/snowball/algorithms/greek.sbl +701 -0
  40. data/vendor/snowball/algorithms/hindi.sbl +323 -0
  41. data/vendor/snowball/algorithms/hungarian.sbl +241 -0
  42. data/vendor/snowball/algorithms/indonesian.sbl +192 -0
  43. data/vendor/snowball/algorithms/irish.sbl +149 -0
  44. data/vendor/snowball/algorithms/italian.sbl +202 -0
  45. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
  46. data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
  47. data/vendor/snowball/algorithms/lovins.sbl +208 -0
  48. data/vendor/snowball/algorithms/nepali.sbl +92 -0
  49. data/vendor/snowball/algorithms/norwegian.sbl +80 -0
  50. data/vendor/snowball/algorithms/porter.sbl +139 -0
  51. data/vendor/snowball/algorithms/portuguese.sbl +218 -0
  52. data/vendor/snowball/algorithms/romanian.sbl +236 -0
  53. data/vendor/snowball/algorithms/russian.sbl +221 -0
  54. data/vendor/snowball/algorithms/serbian.sbl +2379 -0
  55. data/vendor/snowball/algorithms/spanish.sbl +230 -0
  56. data/vendor/snowball/algorithms/swedish.sbl +72 -0
  57. data/vendor/snowball/algorithms/tamil.sbl +405 -0
  58. data/vendor/snowball/algorithms/turkish.sbl +470 -0
  59. data/vendor/snowball/algorithms/yiddish.sbl +460 -0
  60. data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
  61. data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
  62. data/vendor/snowball/charsets/cp850.sbl +130 -0
  63. data/vendor/snowball/compiler/analyser.c +1547 -0
  64. data/vendor/snowball/compiler/driver.c +615 -0
  65. data/vendor/snowball/compiler/generator.c +1748 -0
  66. data/vendor/snowball/compiler/generator_ada.c +1702 -0
  67. data/vendor/snowball/compiler/generator_csharp.c +1322 -0
  68. data/vendor/snowball/compiler/generator_go.c +1278 -0
  69. data/vendor/snowball/compiler/generator_java.c +1313 -0
  70. data/vendor/snowball/compiler/generator_js.c +1316 -0
  71. data/vendor/snowball/compiler/generator_pascal.c +1387 -0
  72. data/vendor/snowball/compiler/generator_python.c +1337 -0
  73. data/vendor/snowball/compiler/generator_rust.c +1295 -0
  74. data/vendor/snowball/compiler/header.h +418 -0
  75. data/vendor/snowball/compiler/space.c +286 -0
  76. data/vendor/snowball/compiler/syswords.h +86 -0
  77. data/vendor/snowball/compiler/syswords2.h +13 -0
  78. data/vendor/snowball/compiler/tokeniser.c +567 -0
  79. data/vendor/snowball/csharp/.gitignore +8 -0
  80. data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
  81. data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
  82. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
  83. data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
  84. data/vendor/snowball/csharp/Stemwords/App.config +6 -0
  85. data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
  86. data/vendor/snowball/doc/TODO +12 -0
  87. data/vendor/snowball/doc/libstemmer_c_README +148 -0
  88. data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
  89. data/vendor/snowball/doc/libstemmer_java_README +67 -0
  90. data/vendor/snowball/doc/libstemmer_js_README +48 -0
  91. data/vendor/snowball/doc/libstemmer_python_README +113 -0
  92. data/vendor/snowball/examples/stemwords.c +204 -0
  93. data/vendor/snowball/go/README.md +55 -0
  94. data/vendor/snowball/go/among.go +16 -0
  95. data/vendor/snowball/go/env.go +403 -0
  96. data/vendor/snowball/go/stemwords/generate.go +68 -0
  97. data/vendor/snowball/go/stemwords/main.go +68 -0
  98. data/vendor/snowball/go/util.go +34 -0
  99. data/vendor/snowball/iconv.py +50 -0
  100. data/vendor/snowball/include/libstemmer.h +78 -0
  101. data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
  102. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
  103. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
  104. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
  105. data/vendor/snowball/javascript/base-stemmer.js +294 -0
  106. data/vendor/snowball/javascript/stemwords.js +106 -0
  107. data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
  108. data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
  109. data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
  110. data/vendor/snowball/libstemmer/modules.txt +63 -0
  111. data/vendor/snowball/libstemmer/test.c +34 -0
  112. data/vendor/snowball/pascal/.gitignore +4 -0
  113. data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
  114. data/vendor/snowball/pascal/generate.pl +23 -0
  115. data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
  116. data/vendor/snowball/python/MANIFEST.in +7 -0
  117. data/vendor/snowball/python/create_init.py +54 -0
  118. data/vendor/snowball/python/setup.cfg +6 -0
  119. data/vendor/snowball/python/setup.py +81 -0
  120. data/vendor/snowball/python/snowballstemmer/among.py +13 -0
  121. data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
  122. data/vendor/snowball/python/stemwords.py +101 -0
  123. data/vendor/snowball/python/testapp.py +28 -0
  124. data/vendor/snowball/runtime/api.c +58 -0
  125. data/vendor/snowball/runtime/api.h +32 -0
  126. data/vendor/snowball/runtime/header.h +61 -0
  127. data/vendor/snowball/runtime/utilities.c +513 -0
  128. data/vendor/snowball/rust/Cargo.toml +7 -0
  129. data/vendor/snowball/rust/build.rs +55 -0
  130. data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
  131. data/vendor/snowball/rust/src/main.rs +102 -0
  132. data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
  133. data/vendor/snowball/rust/src/snowball/among.rs +6 -0
  134. data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
  135. data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
  136. data/vendor/snowball/tests/stemtest.c +95 -0
  137. metadata +178 -0
@@ -0,0 +1,230 @@
1
+ routines (
2
+ postlude mark_regions
3
+ RV R1 R2
4
+ attached_pronoun
5
+ standard_suffix
6
+ y_verb_suffix
7
+ verb_suffix
8
+ residual_suffix
9
+ )
10
+
11
+ externals ( stem )
12
+
13
+ integers ( pV p1 p2 )
14
+
15
+ groupings ( v )
16
+
17
+ stringescapes {}
18
+
19
+ /* special characters */
20
+
21
+ stringdef a' '{U+00E1}' // a-acute
22
+ stringdef e' '{U+00E9}' // e-acute
23
+ stringdef i' '{U+00ED}' // i-acute
24
+ stringdef o' '{U+00F3}' // o-acute
25
+ stringdef u' '{U+00FA}' // u-acute
26
+ stringdef u" '{U+00FC}' // u-diaeresis
27
+ stringdef n~ '{U+00F1}' // n-tilde
28
+
29
+ define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}'
30
+
31
+ define mark_regions as (  // sets the marks pV, p1 and p2 that the RV, R1 and R2 tests compare against
32
+
33
+ $pV = limit
34
+ $p1 = limit
35
+ $p2 = limit // defaults: all three marks start at limit
36
+
37
+ do (  // locate pV; if no alternative below matches, pV keeps its default
38
+ ( v (non-v gopast v) or (v gopast non-v) )  // first letter is a vowel
39
+ or
40
+ ( non-v (non-v gopast v) or (v next) )  // first letter is not a vowel
41
+ setmark pV
42
+ )
43
+ do (  // p1 and p2; each keeps its default when its own search fails
44
+ gopast v gopast non-v setmark p1  // just past the first non-vowel that follows a vowel
45
+ gopast v gopast non-v setmark p2  // the same search, continued from p1
46
+ )
47
+ )
48
+
49
+ define postlude as repeat (  // walk the word, replacing acute-accented vowels by their plain forms
50
+ [substring] among(
51
+ '{a'}' (<- 'a')
52
+ '{e'}' (<- 'e')
53
+ '{i'}' (<- 'i')
54
+ '{o'}' (<- 'o')
55
+ '{u'}' (<- 'u')
56
+ // and possibly {u"}->u here, or in prelude
57
+ '' (next)  // empty string matches anything else: step over one character
58
+ ) //or next
59
+ )
60
+
61
+ backwardmode (
62
+
63
+ define RV as $pV <= cursor  // succeeds when the cursor is at or beyond the pV mark
64
+ define R1 as $p1 <= cursor  // succeeds when the cursor is at or beyond the p1 mark
65
+ define R2 as $p2 <= cursor  // succeeds when the cursor is at or beyond the p2 mark
66
+
67
+ define attached_pronoun as (  // removes a pronoun attached to one of the verb endings listed below
67
+ [substring] among(  // the slice is the pronoun at the end of the word
68
+ 'me' 'se' 'sela' 'selo' 'selas' 'selos' 'la' 'le' 'lo'
69
+ 'las' 'les' 'los' 'nos'
70
+ )
71
+ substring RV among(  // the verb ending before the pronoun must be in RV
72
+ 'i{e'}ndo' (] <- 'iendo')  // accented endings: ] widens the slice over the ending, so the
73
+ '{a'}ndo' (] <- 'ando')  // replacement drops the pronoun and the accent together
74
+ '{a'}r' (] <- 'ar')
75
+ '{e'}r' (] <- 'er')
76
+ '{i'}r' (] <- 'ir')
77
+ 'ando'
78
+ 'iendo'
79
+ 'ar' 'er' 'ir'
80
+ (delete)  // unaccented endings: only the pronoun slice is deleted
81
+ 'yendo' ('u' delete)  // pronoun is deleted only when 'yendo' follows a 'u'
82
+ )
83
+ )
85
+
86
+ define standard_suffix as (  // longest-match suffix table; each group of strings shares the action that follows it
87
+ [substring] among(
88
+
89
+ 'anza' 'anzas'
90
+ 'ico' 'ica' 'icos' 'icas'
91
+ 'ismo' 'ismos'
92
+ 'able' 'ables'
93
+ 'ible' 'ibles'
94
+ 'ista' 'istas'
95
+ 'oso' 'osa' 'osos' 'osas'
96
+ 'amiento' 'amientos'
97
+ 'imiento' 'imientos'
98
+ (
99
+ R2 delete  // delete the suffix if it lies in R2
100
+ )
101
+ 'adora' 'ador' 'aci{o'}n'
102
+ 'adoras' 'adores' 'aciones'
103
+ 'ante' 'antes' 'ancia' 'ancias'// Note 1
104
+ (
105
+ R2 delete
106
+ try ( ['ic'] R2 delete )  // optionally also delete a preceding 'ic' that lies in R2
107
+ )
108
+ 'log{i'}a'
109
+ 'log{i'}as'
110
+ (
111
+ R2 <- 'log'  // replace the suffix by 'log' if it lies in R2
112
+ )
113
+ 'uci{o'}n' 'uciones'
114
+ (
115
+ R2 <- 'u'  // replace the suffix by 'u' if it lies in R2
116
+ )
117
+ 'encia' 'encias'
118
+ (
119
+ R2 <- 'ente'  // replace the suffix by 'ente' if it lies in R2
120
+ )
121
+ 'amente'
122
+ (
123
+ R1 delete  // this group tests R1, unlike the R2 test used by the other groups
124
+ try (
125
+ [substring] R2 delete among(  // optionally delete a preceding iv/os/ic/ad that lies in R2
126
+ 'iv' (['at'] R2 delete)  // after 'iv', a preceding 'at' in R2 is deleted as well
127
+ 'os'
128
+ 'ic'
129
+ 'ad'
130
+ )
131
+ )
132
+ )
133
+ 'mente'
134
+ (
135
+ R2 delete
136
+ try (
137
+ [substring] among(  // optionally delete a preceding ante/able/ible that lies in R2
138
+ 'ante' // Note 1
139
+ 'able'
140
+ 'ible' (R2 delete)
141
+ )
142
+ )
143
+ )
144
+ 'idad'
145
+ 'idades'
146
+ (
147
+ R2 delete
148
+ try (
149
+ [substring] among(  // optionally delete a preceding abil/ic/iv that lies in R2
150
+ 'abil'
151
+ 'ic'
152
+ 'iv' (R2 delete)
153
+ )
154
+ )
155
+ )
156
+ 'iva' 'ivo'
157
+ 'ivas' 'ivos'
158
+ (
159
+ R2 delete
160
+ try (
161
+ ['at'] R2 delete // but not a further ['ic'] R2 delete
162
+ )
163
+ )
164
+ )
165
+ )
166
+
167
+ define y_verb_suffix as (  // verb endings beginning with 'y'
168
+ setlimit tomark pV for ([substring]) among(  // the suffix search is limited by the pV mark
169
+ 'ya' 'ye' 'yan' 'yen' 'yeron' 'yendo' 'yo' 'y{o'}'
170
+ 'yas' 'yes' 'yais' 'yamos'
171
+ ('u' delete)  // deleted only when the suffix is preceded by 'u'
172
+ )
173
+ )
174
+
175
+ define verb_suffix as (  // remaining verb endings; two groups with different actions
176
+ setlimit tomark pV for ([substring]) among(  // the suffix search is limited by the pV mark
177
+
178
+ 'en' 'es' '{e'}is' 'emos'
179
+ (try ('u' test 'g') ] delete)  // if preceded by 'gu', ] widens the slice so the 'u' is deleted too
180
+
181
+ 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais'
182
+ 'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ar{a'}'
183
+ 'ar{e'}'
184
+ 'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais'
185
+ 'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}'
186
+ 'er{e'}'
187
+ 'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais'
188
+ 'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}'
189
+ 'ir{e'}'
190
+
191
+ 'aba' 'ada' 'ida' '{i'}a' 'ara' 'iera' 'ad' 'ed'
192
+ 'id' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an'
193
+ 'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado'
194
+ 'ido' 'ando' 'iendo' 'i{o'}' 'ar' 'er' 'ir' 'as'
195
+ 'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases'
196
+ 'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais'
197
+ 'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados'
198
+ 'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos'
199
+ '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos'
200
+ (delete)  // every ending from 'ar{i'}an' onward is deleted unconditionally
201
+ )
202
+ )
203
+
204
+ define residual_suffix as (  // final clean-up of a trailing 'os' or single vowel
205
+ [substring] among(
206
+ 'os'
207
+ 'a' 'o' '{a'}' '{i'}' '{o'}'
208
+ ( RV delete )  // delete if in RV
209
+ 'e' '{e'}'
210
+ ( RV delete try( ['u'] test 'g' RV delete ) )  // delete if in RV, then optionally a 'u' preceded by 'g' that is also in RV
211
+ )
212
+ )
213
+ )
214
+
215
+ define stem as (  // external entry point of the Spanish stemmer
216
+ do mark_regions
217
+ backwards (  // the suffix routines below work from the end of the word
218
+ do attached_pronoun
219
+ do ( standard_suffix or  // only the first of these three routines that succeeds takes effect
220
+ y_verb_suffix or
221
+ verb_suffix
222
+ )
223
+ do residual_suffix
224
+ )
225
+ do postlude  // runs forwards again, after the backwards section
226
+ )
227
+
228
+ /*
229
+ Note 1: additions of 15 Jun 2005
230
+ */
@@ -0,0 +1,72 @@
1
+ routines (
2
+ mark_regions
3
+ main_suffix
4
+ consonant_pair
5
+ other_suffix
6
+ )
7
+
8
+ externals ( stem )
9
+
10
+ integers ( p1 x )
11
+
12
+ groupings ( v s_ending )
13
+
14
+ stringescapes {}
15
+
16
+ /* special characters */
17
+
18
+ stringdef a" '{U+00E4}'
19
+ stringdef ao '{U+00E5}'
20
+ stringdef o" '{U+00F6}'
21
+
22
+ define v 'aeiouy{a"}{ao}{o"}'
23
+
24
+ define s_ending 'bcdfghjklmnoprtvy'
25
+
26
+ define mark_regions as (  // sets the p1 mark that limits every suffix routine in this file
27
+
28
+ $p1 = limit  // default
29
+ test ( hop 3 setmark x )  // x = three characters in; fails (leaving p1 at limit) when hop 3 cannot be done
30
+ goto v gopast non-v setmark p1  // p1 = just past the first non-vowel found after a vowel
31
+ try ( $p1 < x $p1 = x )  // p1 is raised to x when it would otherwise be smaller
32
+ )
33
+
34
+ backwardmode (
35
+
36
+ define main_suffix as (  // removes the longest matching ending from the table below
37
+ setlimit tomark p1 for ([substring])  // the suffix search is limited by the p1 mark
38
+ among(
39
+
40
+ 'a' 'arna' 'erna' 'heterna' 'orna' 'ad' 'e' 'ade' 'ande' 'arne'
41
+ 'are' 'aste' 'en' 'anden' 'aren' 'heten' 'ern' 'ar' 'er' 'heter'
42
+ 'or' 'as' 'arnas' 'ernas' 'ornas' 'es' 'ades' 'andes' 'ens' 'arens'
43
+ 'hetens' 'erns' 'at' 'andet' 'het' 'ast'
44
+ (delete)
45
+ 's'
46
+ (s_ending delete)  // a lone 's' is deleted only after a letter of the s_ending grouping
47
+ )
48
+ )
49
+
50
+ define consonant_pair as setlimit tomark p1 for (  // search limited by the p1 mark
51
+ among('dd' 'gd' 'nn' 'dt' 'gt' 'kt' 'tt')  // the word must end in one of these pairs
52
+ and ([next] delete)  // 'and' restarts from the same position: only the final letter is deleted
53
+ )
54
+
55
+ define other_suffix as setlimit tomark p1 for (  // search limited by the p1 mark
56
+ [substring] among(
57
+ 'lig' 'ig' 'els' (delete)
58
+ 'l{o"}st' (<-'l{o"}s')  // drop the final 't'
59
+ 'fullt' (<-'full')  // drop the final 't'
60
+ )
61
+ )
62
+ )
63
+
64
+ define stem as (  // external entry point of the Swedish stemmer
65
+
66
+ do mark_regions
67
+ backwards (  // the three suffix routines work from the end of the word, in this order
68
+ do main_suffix
69
+ do consonant_pair
70
+ do other_suffix
71
+ )
72
+ )
@@ -0,0 +1,405 @@
1
+ /*
2
+ * Affix stripping stemming algorithm for Tamil
3
+ * By Damodharan Rajalingam
4
+ */
5
+
6
+ stringescapes {}
7
+
8
+ /* Aytham */
9
+ stringdef aytham '{U+0B83}'
10
+
11
+ /* Uyir - independent vowels */
12
+ stringdef a '{U+0B85}'
13
+ stringdef aa '{U+0B86}'
14
+ stringdef i '{U+0B87}'
15
+ stringdef ii '{U+0B88}'
16
+ stringdef u '{U+0B89}'
17
+ stringdef uu '{U+0B8A}'
18
+ stringdef e '{U+0B8E}'
19
+ stringdef ee '{U+0B8F}'
20
+ stringdef ai '{U+0B90}'
21
+ stringdef o '{U+0B92}'
22
+ stringdef oo '{U+0B93}'
23
+ stringdef au '{U+0B94}'
24
+
25
+ /* Consonants */
26
+ stringdef ka '{U+0B95}'
27
+ stringdef nga '{U+0B99}'
28
+ stringdef ca '{U+0B9A}'
29
+ stringdef ja '{U+0B9C}'
30
+ stringdef nya '{U+0B9E}'
31
+ stringdef tta '{U+0B9F}'
32
+ stringdef nna '{U+0BA3}'
33
+ stringdef ta '{U+0BA4}'
34
+ stringdef tha '{U+0BA4}'
35
+ stringdef na '{U+0BA8}'
36
+ stringdef nnna '{U+0BA9}'
37
+ stringdef pa '{U+0BAA}'
38
+ stringdef ma '{U+0BAE}'
39
+ stringdef ya '{U+0BAF}'
40
+ stringdef ra '{U+0BB0}'
41
+ stringdef rra '{U+0BB1}'
42
+ stringdef la '{U+0BB2}'
43
+ stringdef lla '{U+0BB3}'
44
+ stringdef llla '{U+0BB4}'
45
+ stringdef zha '{U+0BB4}'
46
+ stringdef va '{U+0BB5}'
47
+
48
+ /* Vatamozi - borrowed */
49
+ stringdef sha '{U+0BB6}'
50
+ stringdef ssa '{U+0BB7}'
51
+ stringdef sa '{U+0BB8}'
52
+ stringdef ha '{U+0BB9}'
53
+
54
+
55
+ /* Dependent vowel signs (kombu etc.) */
56
+ stringdef vs_aa '{U+0BBE}'
57
+ stringdef vs_i '{U+0BBF}'
58
+ stringdef vs_ii '{U+0BC0}'
59
+ stringdef vs_u '{U+0BC1}'
60
+ stringdef vs_uu '{U+0BC2}'
61
+ stringdef vs_e '{U+0BC6}'
62
+ stringdef vs_ee '{U+0BC7}'
63
+ stringdef vs_ai '{U+0BC8}'
64
+ stringdef vs_o '{U+0BCA}'
65
+ stringdef vs_oo '{U+0BCB}'
66
+ stringdef vs_au '{U+0BCC}'
67
+
68
+ /* Pulli */
69
+ stringdef pulli '{U+0BCD}'
70
+
71
+ /* AU length mark */
72
+ stringdef au_lmark '{U+0BD7}'
73
+
74
+
75
+ routines (
76
+ remove_plural_suffix
77
+ remove_question_suffixes
78
+ remove_question_prefixes
79
+ remove_pronoun_prefixes
80
+ remove_command_suffixes
81
+ remove_um
82
+ remove_vetrumai_urupukal
83
+ fix_va_start
84
+ fix_ending
85
+ fix_endings
86
+ remove_tense_suffix
87
+ remove_tense_suffixes
88
+ remove_common_word_endings
89
+ has_min_length
90
+ )
91
+
92
+ externals ( stem )
93
+
94
+ booleans (
95
+ found_a_match
96
+ found_vetrumai_urupu
97
+ )
98
+
99
+ define has_min_length as (  // guard used by the removal routines: succeeds only when len exceeds 4
100
+ $(len > 4)
101
+ )
102
+
103
+ define fix_va_start as (  // replaces {va} + vowel sign at the cursor by the matching independent vowel; first match wins
104
+ (try '{va}{vs_oo}' and [ '{va}{vs_oo}' ] <- '{oo}' ) or
105
+ (try '{va}{vs_o}' and [ '{va}{vs_o}' ] <- '{o}' ) or
106
+ (try '{va}{vs_u}' and [ '{va}{vs_u}' ] <- '{u}' ) or
107
+ (try '{va}{vs_uu}' and [ '{va}{vs_uu}' ] <- '{uu}' )
108
+ )
109
+
110
+ define fix_endings as (  // applies fix_ending repeatedly until it fails; 'do' makes this routine always succeed
111
+ do repeat fix_ending
112
+ )
113
+
114
+ define remove_question_prefixes as (  // deletes a prefix made of {e}, one listed consonant and a pulli
115
+ [ ('{e}' ) among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') '{pulli}' ] delete
116
+ do fix_va_start  // then repair a {va} + vowel sign start, if present
117
+ )
118
+
119
+ // Gives signal t if an ending was fixed, signal f otherwise.
120
+ define fix_ending as (  // applies the first matching repair to the end of the word; fails if none matches
121
+ $(len > 3)  // the routine fails outright unless len exceeds 3
122
+ backwards (  // alternatives are tried in order, so earlier patterns take priority
123
+ ( [among('{na}{pulli}' '{na}{pulli}{ta}' '{na}{pulli}{ta}{pulli}') ] delete )
124
+ or
125
+ ( ['{ya}{pulli}' test among('{vs_ai}' '{vs_i}' '{vs_ii}') ] delete )
126
+ or
127
+ ( [ '{tta}{pulli}{pa}{pulli}' or '{tta}{pulli}{ka}{pulli}' ] <- '{lla}{pulli}' )
128
+ or
129
+ ( [ '{nnna}{pulli}{rra}{pulli}' ] <- '{la}{pulli}' )
130
+ or
131
+ // ( [ '{rra}{pulli}{ka}{pulli}' or '{nnna}{pulli}{nnna}{pulli}' ] <- '{la}{pulli}' )
132
+ ( [ '{rra}{pulli}{ka}{pulli}' ] <- '{la}{pulli}' )
133
+ or
134
+ ( [ '{tta}{pulli}{tta}{pulli}' ] <- '{tta}{vs_u}' )
135
+ or
136
+ ( found_vetrumai_urupu [ '{ta}{pulli}{ta}{pulli}' (test not '{vs_ai}') ] <- '{ma}{pulli}' )  // only when found_vetrumai_urupu is set
137
+ or
138
+ ( [ '{vs_u}{ka}{pulli}' or '{vs_u}{ka}{pulli}{ka}{pulli}' ] <- '{pulli}' )
139
+ or
140
+ ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete )
141
+ or
142
+ ( [ '{vs_u}{ka}{pulli}' ] <- '{pulli}' )  // NOTE(review): appears unreachable; the same suffix is the first option of the earlier {vs_u}{ka}{pulli} alternative
143
+ or
144
+ ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete )
145
+ or
146
+ ( [ '{pulli}' (among('{ya}' '{ra}' '{la}' '{va}' '{zha}' '{lla}') or among('{nga}' '{nya}' '{nna}' '{na}' '{ma}' '{nnna}')) '{pulli}' ] <- '{pulli}' )
147
+ or
148
+ ( [ among('{va}' '{ya}' '{va}{pulli}') ] delete )
149
+ or
150
+ ( [ '{nnna}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')) ] delete )
151
+ or
152
+ ( [ '{nga}{pulli}' (test not '{vs_ai}')] <- '{ma}{pulli}' )
153
+ or
154
+ ( [ '{nga}{pulli}' ] delete )
155
+ or
156
+ ( [ '{pulli}' (test (among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}') or '{pulli}')) ] delete )
157
+ )
158
+ )
159
+
160
+ define remove_pronoun_prefixes as (  // deletes a prefix made of {a}/{i}/{u}, one listed consonant and a pulli
161
+ unset found_a_match
162
+ [ among('{a}' '{i}' '{u}') among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') '{pulli}' ] delete
163
+ (set found_a_match)  // reached only when the prefix was found and deleted
164
+ do fix_va_start
165
+ )
166
+
167
+ define remove_plural_suffix as (  // handles endings in {ka}{lla}{pulli}; the first matching alternative is applied
168
+ unset found_a_match
169
+ backwards (
170
+ ( [ '{vs_u}{nga}{pulli}{ka}{lla}{pulli}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}')) ] <- '{pulli}' ) or
171
+ ( [ '{rra}{pulli}{ka}{lla}{pulli}' ] <- '{la}{pulli}' ) or
172
+ ( [ '{tta}{pulli}{ka}{lla}{pulli}' ] <- '{lla}{pulli}' ) or
173
+ ( [ '{ka}{lla}{pulli}' ] delete )  // plain case: just delete the ending
174
+ (set found_a_match)  // reached only when one of the alternatives above succeeded
175
+ )
176
+ )
177
+
178
+ define remove_question_suffixes as (  // replaces a final {vs_oo}, {vs_ee} or {vs_aa} by a pulli
179
+ has_min_length  // the routine fails here when len is not greater than 4
180
+ unset found_a_match
181
+ backwards (
182
+ do (
183
+ [ among('{vs_oo}' '{vs_ee}' '{vs_aa}') ] <- '{pulli}'
184
+ (set found_a_match)
185
+ )
186
+ )
187
+ do fix_endings  // runs whether or not a suffix was replaced
188
+ )
189
+
190
+ define remove_command_suffixes as (  // deletes a final {pa}{vs_i} or {va}{vs_i}
191
+ has_min_length  // the routine fails here when len is not greater than 4
192
+ unset found_a_match
193
+ backwards (
194
+ [ among('{pa}{vs_i}' '{va}{vs_i}') ] delete
195
+ (set found_a_match)  // reached only when a suffix was deleted
196
+ )
197
+ )
198
+
199
+ define remove_um as (  // replaces a final {vs_u}{ma}{pulli} by a pulli
200
+ unset found_a_match
201
+ has_min_length  // the routine fails here when len is not greater than 4
202
+ backwards ( [ '{vs_u}{ma}{pulli}' ] <- '{pulli}'
203
+ (set found_a_match)
204
+ )
205
+ do fix_ending  // a single fix_ending pass, not the repeating fix_endings
206
+ )
207
+
208
+ define remove_common_word_endings as (
209
+ // These are not suffixes actually but are
210
+ // some words that are attached to other words
211
+ // but can be removed for stemming
212
+ unset found_a_match
213
+ has_min_length  // the routine fails here when len is not greater than 4
214
+ backwards (
215
+ test ( [ '{vs_u}{tta}{nnna}{pulli}' or  // first group: the matched ending is replaced by a pulli
216
+ '{vs_i}{la}{pulli}{la}{vs_ai}' or
217
+ '{vs_i}{tta}{ma}{pulli}' or
218
+ '{vs_i}{nnna}{pulli}{rra}{vs_i}' or
219
+ '{vs_aa}{ka}{vs_i}' or
220
+ '{vs_aa}{ka}{vs_i}{ya}' or
221
+ '{vs_e}{nnna}{pulli}{rra}{vs_u}' or
222
+ '{vs_u}{lla}{pulli}{lla}' or
223
+ '{vs_u}{tta}{vs_ai}{ya}' or
224
+ '{vs_u}{tta}{vs_ai}' or
225
+ '{vs_e}{nnna}{vs_u}{ma}{pulli}' or
226
+ ('{la}{pulli}{la}' test (not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
227
+ '{vs_e}{nnna}' or
228
+ '{vs_aa}{ka}{vs_i}' ] <- '{pulli}'  // NOTE(review): same string as an earlier option in this chain, so this one looks redundant
229
+ (set found_a_match)
230
+ )
231
+ or
232
+ test ( [ among('{pa}{tta}{vs_u}'  // second group: the matched ending is deleted
233
+ '{pa}{tta}{pulli}{tta}'
234
+ '{pa}{tta}{pulli}{tta}{vs_u}'
235
+ '{pa}{tta}{pulli}{tta}{ta}{vs_u}'
236
+ '{pa}{tta}{pulli}{tta}{nna}'
237
+ '{ka}{vs_u}{ra}{vs_i}{ya}'
238
+ '{pa}{rra}{pulli}{rra}{vs_i}'
239
+ '{va}{vs_i}{tta}{vs_u}'
240
+ '{va}{vs_i}{tta}{pulli}{tta}{vs_u}'
241
+ '{pa}{tta}{vs_i}{ta}{vs_aa}{nnna}'
242
+ '{pa}{tta}{vs_i}'
243
+ '{ta}{vs_aa}{nnna}'
244
+ '{vs_e}{la}{pulli}{la}{vs_aa}{ma}{pulli}')
245
+ ] delete
246
+ (set found_a_match)
247
+ )
248
+ )
249
+ do fix_endings  // reached only when one of the two groups matched
250
+ )
251
+
252
+ define remove_vetrumai_urupukal as (  // tries five groups of endings in order; the first group that matches is applied
252
+ unset found_a_match
253
+ unset found_vetrumai_urupu
254
+ has_min_length  // the routine fails here when len is not greater than 4
255
+ backwards (
256
+ (
257
+ test ( ['{nnna}{vs_ai}'] delete )  // group 1: delete
258
+ or
259
+ test ([ ( '{vs_i}{nnna}{vs_ai}' or  // group 2: replace by a pulli
260
+ '{vs_ai}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}'))) or
261
+ ( '{vs_ai}' (test (among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}')))
262
+ ] <- '{pulli}'
263
+ )
264
+ or
265
+ test ( [  // group 3: replace by a pulli
266
+ '{vs_o}{tta}{vs_u}' or
267
+ '{vs_oo}{tta}{vs_u}' or
268
+ '{vs_i}{la}{pulli}' or
269
+ '{vs_i}{rra}{pulli}' or
270
+ ('{vs_i}{nnna}{pulli}' (test not '{ma}')) or
271
+ '{vs_i}{nnna}{pulli}{rra}{vs_u}' or
272
+ '{vs_i}{ra}{vs_u}{na}{pulli}{ta}{vs_u}' or
273
+ '{va}{vs_i}{tta}' or
274
+ ($(len >= 7) '{vs_i}{tta}{ma}{pulli}') or
275
+ '{vs_aa}{la}{pulli}' or
276
+ '{vs_u}{tta}{vs_ai}' or
277
+ '{vs_aa}{ma}{la}{pulli}' or
278
+ ('{la}{pulli}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
279
+ '{vs_u}{lla}{pulli}'
280
+ ] <- '{pulli}'
281
+ )
282
+ or
283
+ test ( [  // group 4: delete
284
+ '{ka}{nna}{pulli}' or
285
+ '{ma}{vs_u}{nnna}{pulli}' or
286
+ '{ma}{vs_ee}{la}{pulli}' or
287
+ '{ma}{vs_ee}{rra}{pulli}' or
288
+ '{ka}{vs_ii}{llla}{pulli}' or
289
+ '{pa}{vs_i}{nnna}{pulli}' or
290
+ ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')))
291
+ ] delete
292
+ )
293
+ or
294
+ test ([ '{vs_ii}' ] <- '{vs_i}')  // group 5: shorten a final {vs_ii} to {vs_i}
295
+ )
296
+ (set found_a_match)  // both flags are set only when one of the groups above matched
297
+ (set found_vetrumai_urupu)
298
+ do ( [ '{vs_i}{nnna}{pulli}' ] <- '{pulli}' )  // optional follow-up replacement
299
+ )
300
+ do fix_endings
301
+ )
303
+
304
+ define remove_tense_suffixes as (  // calls remove_tense_suffix repeatedly while it keeps reporting a match
305
+ set found_a_match  // primed so that the first iteration runs
306
+ repeat ( found_a_match (do remove_tense_suffix) )  // remove_tense_suffix unsets the flag and sets it again on a match
307
+ )
308
+
309
+ define remove_tense_suffix as (  // one pass of tense-suffix removal; sets found_a_match when something was changed
310
+ unset found_a_match
311
+ has_min_length  // the routine fails here when len is not greater than 4
312
+ backwards (
313
+ do (  // four groups tried in order; the first one that matches is applied
314
+ test ( [among(  // group 1: delete
315
+ '{ka}{vs_o}{nna}{pulli}{tta}{vs_i}{ra}{pulli}'
316
+ '{pa}{tta}{vs_u}'
317
+ )] delete
318
+ (set found_a_match)
319
+ )
320
+ or
321
+ test ( [  // group 2: delete
322
+ '{ma}{vs_aa}{ra}{pulli}' or
323
+ '{ma}{vs_i}{nnna}{pulli}' or
324
+ '{nnna}{nnna}{pulli}' or
325
+ '{nnna}{vs_aa}{nnna}{pulli}' or
326
+ '{nnna}{vs_aa}{lla}{pulli}' or
327
+ '{nnna}{vs_aa}{ra}{pulli}' or
328
+ ('{va}{nnna}{pulli}' test (not among('{a}' '{aa}' '{i}' '{ii}' '{u}' '{uu}' '{e}' '{ee}' '{ai}' '{o}' '{oo}' '{au}')) ) or
329
+ '{nnna}{lla}{pulli}' or
330
+ '{va}{lla}{pulli}' or
331
+ '{nnna}{ra}{pulli}' or
332
+ '{va}{ra}{pulli}' or
333
+ '{nnna}' or '{pa}' or '{ka}' or '{ta}' or '{ya}' or
334
+ '{pa}{nnna}{pulli}' or
335
+ '{pa}{lla}{pulli}' or
336
+ '{pa}{ra}{pulli}' or
337
+ ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
338
+ '{vs_i}{rra}{pulli}{rra}{vs_u}' or
339
+ '{pa}{ma}{pulli}' or
340
+ '{nnna}{ma}{pulli}' or
341
+ '{ta}{vs_u}{ma}{pulli}' or
342
+ '{rra}{vs_u}{ma}{pulli}' or
343
+ '{ka}{vs_u}{ma}{pulli}' or
344
+ '{nnna}{vs_e}{nnna}{pulli}' or
345
+ '{nnna}{vs_ai}' or
346
+ '{va}{vs_ai}'
347
+ ] delete
348
+ (set found_a_match)
349
+ )
350
+ or
351
+ test ( [  // group 3: replace by a pulli
352
+ ('{vs_aa}{nnna}{pulli}' test (not '{ca}')) or
353
+ '{vs_aa}{lla}{pulli}' or
354
+ '{vs_aa}{ra}{pulli}' or
355
+ '{vs_ee}{nnna}{pulli}' or
356
+ '{vs_aa}' or
357
+ '{vs_aa}{ma}{pulli}' or
358
+ '{vs_e}{ma}{pulli}' or
359
+ '{vs_ee}{ma}{pulli}' or
360
+ '{vs_oo}{ma}{pulli}' or
361
+ '{ka}{vs_u}{ma}{pulli}' or
362
+ '{ta}{vs_u}{ma}{pulli}' or
363
+ '{tta}{vs_u}{ma}{pulli}' or
364
+ '{rra}{vs_u}{ma}{pulli}' or
365
+ '{vs_aa}{ya}{pulli}' or
366
+ '{nnna}{vs_e}{nnna}{pulli}' or
367
+ '{nnna}{vs_i}{ra}{pulli}' or
368
+ '{vs_ii}{ra}{pulli}' or
369
+ '{vs_ii}{ya}{ra}{pulli}'
370
+ ] <- '{pulli}'
371
+ (set found_a_match)
372
+ )
373
+ or
374
+ test ( ([ '{ka}{vs_u}' or '{ta}{vs_u}' ) (test '{pulli}') ] delete  // group 4: delete only when preceded by a pulli
375
+ (set found_a_match)
376
+ )
377
+ )
378
+ do ([among(  // independent second step: delete one of these endings if present
379
+ '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}'
380
+ '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}{pulli}'
381
+ '{ka}{vs_i}{nnna}{pulli}{rra}'
382
+ '{ka}{vs_i}{nnna}{pulli}{rra}{pulli}'
383
+ '{ka}{vs_i}{rra}'
384
+ '{ka}{vs_i}{rra}{pulli}'
385
+ )] delete
386
+ (set found_a_match)
387
+ )
388
+ )
389
+ do fix_endings
390
+ )
391
+
392
+ define stem as (  // external entry point of the Tamil stemmer
393
+ unset found_vetrumai_urupu
394
+ do fix_ending  // a single fix_ending pass before the length check
395
+ has_min_length  // stem fails here, skipping every step below, when len is not greater than 4
396
+ do remove_question_prefixes  // prefixes first, then suffixes, in this fixed order
397
+ do remove_pronoun_prefixes
398
+ do remove_question_suffixes
399
+ do remove_um
400
+ do remove_common_word_endings
401
+ do remove_vetrumai_urupukal
402
+ do remove_plural_suffix
403
+ do remove_command_suffixes
404
+ do remove_tense_suffixes
405
+ )