mittens 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE.txt +30 -0
  5. data/README.md +62 -0
  6. data/Rakefile +21 -0
  7. data/ext/mittens/ext.c +96 -0
  8. data/ext/mittens/extconf.rb +12 -0
  9. data/lib/mittens/version.rb +3 -0
  10. data/lib/mittens.rb +7 -0
  11. data/mittens.gemspec +22 -0
  12. data/vendor/snowball/.gitignore +26 -0
  13. data/vendor/snowball/.travis.yml +112 -0
  14. data/vendor/snowball/AUTHORS +27 -0
  15. data/vendor/snowball/CONTRIBUTING.rst +216 -0
  16. data/vendor/snowball/COPYING +29 -0
  17. data/vendor/snowball/GNUmakefile +742 -0
  18. data/vendor/snowball/NEWS +754 -0
  19. data/vendor/snowball/README.rst +37 -0
  20. data/vendor/snowball/ada/README.md +74 -0
  21. data/vendor/snowball/ada/generate/generate.adb +83 -0
  22. data/vendor/snowball/ada/generate.gpr +21 -0
  23. data/vendor/snowball/ada/src/stemmer.adb +620 -0
  24. data/vendor/snowball/ada/src/stemmer.ads +219 -0
  25. data/vendor/snowball/ada/src/stemwords.adb +70 -0
  26. data/vendor/snowball/ada/stemmer_config.gpr +83 -0
  27. data/vendor/snowball/ada/stemwords.gpr +21 -0
  28. data/vendor/snowball/algorithms/arabic.sbl +558 -0
  29. data/vendor/snowball/algorithms/armenian.sbl +301 -0
  30. data/vendor/snowball/algorithms/basque.sbl +149 -0
  31. data/vendor/snowball/algorithms/catalan.sbl +202 -0
  32. data/vendor/snowball/algorithms/danish.sbl +93 -0
  33. data/vendor/snowball/algorithms/dutch.sbl +164 -0
  34. data/vendor/snowball/algorithms/english.sbl +229 -0
  35. data/vendor/snowball/algorithms/finnish.sbl +197 -0
  36. data/vendor/snowball/algorithms/french.sbl +254 -0
  37. data/vendor/snowball/algorithms/german.sbl +139 -0
  38. data/vendor/snowball/algorithms/german2.sbl +145 -0
  39. data/vendor/snowball/algorithms/greek.sbl +701 -0
  40. data/vendor/snowball/algorithms/hindi.sbl +323 -0
  41. data/vendor/snowball/algorithms/hungarian.sbl +241 -0
  42. data/vendor/snowball/algorithms/indonesian.sbl +192 -0
  43. data/vendor/snowball/algorithms/irish.sbl +149 -0
  44. data/vendor/snowball/algorithms/italian.sbl +202 -0
  45. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
  46. data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
  47. data/vendor/snowball/algorithms/lovins.sbl +208 -0
  48. data/vendor/snowball/algorithms/nepali.sbl +92 -0
  49. data/vendor/snowball/algorithms/norwegian.sbl +80 -0
  50. data/vendor/snowball/algorithms/porter.sbl +139 -0
  51. data/vendor/snowball/algorithms/portuguese.sbl +218 -0
  52. data/vendor/snowball/algorithms/romanian.sbl +236 -0
  53. data/vendor/snowball/algorithms/russian.sbl +221 -0
  54. data/vendor/snowball/algorithms/serbian.sbl +2379 -0
  55. data/vendor/snowball/algorithms/spanish.sbl +230 -0
  56. data/vendor/snowball/algorithms/swedish.sbl +72 -0
  57. data/vendor/snowball/algorithms/tamil.sbl +405 -0
  58. data/vendor/snowball/algorithms/turkish.sbl +470 -0
  59. data/vendor/snowball/algorithms/yiddish.sbl +460 -0
  60. data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
  61. data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
  62. data/vendor/snowball/charsets/cp850.sbl +130 -0
  63. data/vendor/snowball/compiler/analyser.c +1547 -0
  64. data/vendor/snowball/compiler/driver.c +615 -0
  65. data/vendor/snowball/compiler/generator.c +1748 -0
  66. data/vendor/snowball/compiler/generator_ada.c +1702 -0
  67. data/vendor/snowball/compiler/generator_csharp.c +1322 -0
  68. data/vendor/snowball/compiler/generator_go.c +1278 -0
  69. data/vendor/snowball/compiler/generator_java.c +1313 -0
  70. data/vendor/snowball/compiler/generator_js.c +1316 -0
  71. data/vendor/snowball/compiler/generator_pascal.c +1387 -0
  72. data/vendor/snowball/compiler/generator_python.c +1337 -0
  73. data/vendor/snowball/compiler/generator_rust.c +1295 -0
  74. data/vendor/snowball/compiler/header.h +418 -0
  75. data/vendor/snowball/compiler/space.c +286 -0
  76. data/vendor/snowball/compiler/syswords.h +86 -0
  77. data/vendor/snowball/compiler/syswords2.h +13 -0
  78. data/vendor/snowball/compiler/tokeniser.c +567 -0
  79. data/vendor/snowball/csharp/.gitignore +8 -0
  80. data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
  81. data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
  82. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
  83. data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
  84. data/vendor/snowball/csharp/Stemwords/App.config +6 -0
  85. data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
  86. data/vendor/snowball/doc/TODO +12 -0
  87. data/vendor/snowball/doc/libstemmer_c_README +148 -0
  88. data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
  89. data/vendor/snowball/doc/libstemmer_java_README +67 -0
  90. data/vendor/snowball/doc/libstemmer_js_README +48 -0
  91. data/vendor/snowball/doc/libstemmer_python_README +113 -0
  92. data/vendor/snowball/examples/stemwords.c +204 -0
  93. data/vendor/snowball/go/README.md +55 -0
  94. data/vendor/snowball/go/among.go +16 -0
  95. data/vendor/snowball/go/env.go +403 -0
  96. data/vendor/snowball/go/stemwords/generate.go +68 -0
  97. data/vendor/snowball/go/stemwords/main.go +68 -0
  98. data/vendor/snowball/go/util.go +34 -0
  99. data/vendor/snowball/iconv.py +50 -0
  100. data/vendor/snowball/include/libstemmer.h +78 -0
  101. data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
  102. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
  103. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
  104. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
  105. data/vendor/snowball/javascript/base-stemmer.js +294 -0
  106. data/vendor/snowball/javascript/stemwords.js +106 -0
  107. data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
  108. data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
  109. data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
  110. data/vendor/snowball/libstemmer/modules.txt +63 -0
  111. data/vendor/snowball/libstemmer/test.c +34 -0
  112. data/vendor/snowball/pascal/.gitignore +4 -0
  113. data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
  114. data/vendor/snowball/pascal/generate.pl +23 -0
  115. data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
  116. data/vendor/snowball/python/MANIFEST.in +7 -0
  117. data/vendor/snowball/python/create_init.py +54 -0
  118. data/vendor/snowball/python/setup.cfg +6 -0
  119. data/vendor/snowball/python/setup.py +81 -0
  120. data/vendor/snowball/python/snowballstemmer/among.py +13 -0
  121. data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
  122. data/vendor/snowball/python/stemwords.py +101 -0
  123. data/vendor/snowball/python/testapp.py +28 -0
  124. data/vendor/snowball/runtime/api.c +58 -0
  125. data/vendor/snowball/runtime/api.h +32 -0
  126. data/vendor/snowball/runtime/header.h +61 -0
  127. data/vendor/snowball/runtime/utilities.c +513 -0
  128. data/vendor/snowball/rust/Cargo.toml +7 -0
  129. data/vendor/snowball/rust/build.rs +55 -0
  130. data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
  131. data/vendor/snowball/rust/src/main.rs +102 -0
  132. data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
  133. data/vendor/snowball/rust/src/snowball/among.rs +6 -0
  134. data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
  135. data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
  136. data/vendor/snowball/tests/stemtest.c +95 -0
  137. metadata +178 -0
@@ -0,0 +1,230 @@
1
+ routines (
2
+ postlude mark_regions
3
+ RV R1 R2
4
+ attached_pronoun
5
+ standard_suffix
6
+ y_verb_suffix
7
+ verb_suffix
8
+ residual_suffix
9
+ )
10
+
11
+ externals ( stem )
12
+
13
+ integers ( pV p1 p2 )
14
+
15
+ groupings ( v )
16
+
17
+ stringescapes {}
18
+
19
+ /* special characters */
20
+
21
+ stringdef a' '{U+00E1}' // a-acute
22
+ stringdef e' '{U+00E9}' // e-acute
23
+ stringdef i' '{U+00ED}' // i-acute
24
+ stringdef o' '{U+00F3}' // o-acute
25
+ stringdef u' '{U+00FA}' // u-acute
26
+ stringdef u" '{U+00FC}' // u-diaeresis
27
+ stringdef n~ '{U+00F1}' // n-tilde
28
+
29
+ define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}'
30
+
31
+ define mark_regions as ( // Sets the integer marks pV, p1 and p2 that the RV, R1 and R2 tests compare against the cursor.
32
+
33
+ $pV = limit
34
+ $p1 = limit
35
+ $p2 = limit // defaults: each mark stays at limit unless one of the do-blocks below sets it
36
+
37
+ do ( // pV: the two alternatives start with v and non-v respectively, i.e. they branch on the first letter
38
+ ( v (non-v gopast v) or (v gopast non-v) )
39
+ or
40
+ ( non-v (non-v gopast v) or (v next) )
41
+ setmark pV
42
+ )
43
+ do ( // p1 and p2: each is set after passing a vowel and then a non-vowel; the p2 search continues from p1
44
+ gopast v gopast non-v setmark p1
45
+ gopast v gopast non-v setmark p2
46
+ )
47
+ )
48
+
49
+ define postlude as repeat ( // Repeated over the word: replaces each acute-accented vowel with its unaccented letter.
50
+ [substring] among(
51
+ '{a'}' (<- 'a')
52
+ '{e'}' (<- 'e')
53
+ '{i'}' (<- 'i')
54
+ '{o'}' (<- 'o')
55
+ '{u'}' (<- 'u')
56
+ // and possibly {u"}->u here, or in prelude
57
+ '' (next) // empty string matches when none of the above does: step over one character
58
+ ) //or next
59
+ )
60
+
61
+ backwardmode (
62
+
63
+ define RV as $pV <= cursor // region test: succeeds only while pV <= cursor
64
+ define R1 as $p1 <= cursor // region test: succeeds only while p1 <= cursor
65
+ define R2 as $p2 <= cursor // region test: succeeds only while p2 <= cursor
66
+
67
+ define attached_pronoun as ( // Removes a trailing pronoun when it follows one of the endings in the second among.
68
+ [substring] among( // the slice is the longest matching pronoun from this list
69
+ 'me' 'se' 'sela' 'selo' 'selas' 'selos' 'la' 'le' 'lo'
70
+ 'las' 'les' 'los' 'nos'
71
+ )
72
+ substring RV among( // the ending before the pronoun must pass the RV test
73
+ 'i{e'}ndo' (] <- 'iendo') // accented endings: ] widens the slice over the ending, which is rewritten unaccented (pronoun dropped)
74
+ '{a'}ndo' (] <- 'ando')
75
+ '{a'}r' (] <- 'ar')
76
+ '{e'}r' (] <- 'er')
77
+ '{i'}r' (] <- 'ir')
78
+ 'ando'
79
+ 'iendo'
80
+ 'ar' 'er' 'ir'
81
+ (delete) // unaccented endings: only the pronoun slice is deleted
82
+ 'yendo' ('u' delete) // the pronoun is deleted only if 'yendo' is preceded by 'u'
83
+ )
84
+ )
85
+
86
+ define standard_suffix as ( // Finds the longest suffix listed below and runs its action; every action is guarded by an R1 or R2 test.
87
+ [substring] among(
88
+
89
+ 'anza' 'anzas'
90
+ 'ico' 'ica' 'icos' 'icas'
91
+ 'ismo' 'ismos'
92
+ 'able' 'ables'
93
+ 'ible' 'ibles'
94
+ 'ista' 'istas'
95
+ 'oso' 'osa' 'osos' 'osas'
96
+ 'amiento' 'amientos'
97
+ 'imiento' 'imientos'
98
+ (
99
+ R2 delete // plain removal when the suffix passes R2
100
+ )
101
+ 'adora' 'ador' 'aci{o'}n'
102
+ 'adoras' 'adores' 'aciones'
103
+ 'ante' 'antes' 'ancia' 'ancias'// Note 1
104
+ (
105
+ R2 delete
106
+ try ( ['ic'] R2 delete ) // optionally also remove a preceding 'ic' if it passes R2
107
+ )
108
+ 'log{i'}a'
109
+ 'log{i'}as'
110
+ (
111
+ R2 <- 'log' // replaced rather than deleted
112
+ )
113
+ 'uci{o'}n' 'uciones'
114
+ (
115
+ R2 <- 'u'
116
+ )
117
+ 'encia' 'encias'
118
+ (
119
+ R2 <- 'ente'
120
+ )
121
+ 'amente'
122
+ (
123
+ R1 delete // this is the only action guarded by R1 instead of R2
124
+ try (
125
+ [substring] R2 delete among( // a further suffix from this list is deleted if it passes R2
126
+ 'iv' (['at'] R2 delete) // after 'iv', a preceding 'at' that passes R2 is deleted as well
127
+ 'os'
128
+ 'ic'
129
+ 'ad'
130
+ )
131
+ )
132
+ )
133
+ 'mente'
134
+ (
135
+ R2 delete
136
+ try (
137
+ [substring] among(
138
+ 'ante' // Note 1
139
+ 'able'
140
+ 'ible' (R2 delete)
141
+ )
142
+ )
143
+ )
144
+ 'idad'
145
+ 'idades'
146
+ (
147
+ R2 delete
148
+ try (
149
+ [substring] among(
150
+ 'abil'
151
+ 'ic'
152
+ 'iv' (R2 delete)
153
+ )
154
+ )
155
+ )
156
+ 'iva' 'ivo'
157
+ 'ivas' 'ivos'
158
+ (
159
+ R2 delete
160
+ try (
161
+ ['at'] R2 delete // but not a further ['ic'] R2 delete
162
+ )
163
+ )
164
+ )
165
+ )
166
+
167
+ define y_verb_suffix as ( // Deletes a y-initial ending from the list below, but only when it is preceded by 'u'.
168
+ setlimit tomark pV for ([substring]) among( // the suffix search runs with the limit set to mark pV
169
+ 'ya' 'ye' 'yan' 'yen' 'yeron' 'yendo' 'yo' 'y{o'}'
170
+ 'yas' 'yes' 'yais' 'yamos'
171
+ ('u' delete) // fails, leaving the word unchanged, if the preceding letter is not 'u'
172
+ )
173
+ )
174
+
175
+ define verb_suffix as ( // Deletes the longest ending from the two groups below; the search runs with the limit set to mark pV.
176
+ setlimit tomark pV for ([substring]) among(
177
+
178
+ 'en' 'es' '{e'}is' 'emos'
179
+ (try ('u' test 'g') ] delete) // if the ending follows 'gu', the ] after the try extends the slice so the 'u' is deleted too
180
+
181
+ 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais'
182
+ 'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ar{a'}'
183
+ 'ar{e'}'
184
+ 'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais'
185
+ 'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}'
186
+ 'er{e'}'
187
+ 'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais'
188
+ 'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}'
189
+ 'ir{e'}'
190
+
191
+ 'aba' 'ada' 'ida' '{i'}a' 'ara' 'iera' 'ad' 'ed'
192
+ 'id' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an'
193
+ 'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado'
194
+ 'ido' 'ando' 'iendo' 'i{o'}' 'ar' 'er' 'ir' 'as'
195
+ 'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases'
196
+ 'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais'
197
+ 'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados'
198
+ 'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos'
199
+ '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos'
200
+ (delete) // every ending in this second group is simply deleted
201
+ )
202
+ )
203
+
204
+ define residual_suffix as ( // Removes one of the short final endings listed below when it passes the RV test.
205
+ [substring] among(
206
+ 'os'
207
+ 'a' 'o' '{a'}' '{i'}' '{o'}'
208
+ ( RV delete )
209
+ 'e' '{e'}'
210
+ ( RV delete try( ['u'] test 'g' RV delete ) ) // after removing the e, also remove a 'u' that follows 'g' if it passes RV
211
+ )
212
+ )
213
+ )
214
+
215
+ define stem as ( // External entry point: each step is wrapped in do, so a failing step does not abort the rest.
216
+ do mark_regions
217
+ backwards ( // the suffix routines below are defined in backwardmode and run from the end of the word
218
+ do attached_pronoun
219
+ do ( standard_suffix or // only the first of these three routines that succeeds is applied
220
+ y_verb_suffix or
221
+ verb_suffix
222
+ )
223
+ do residual_suffix
224
+ )
225
+ do postlude // runs forwards again after the backwards section
226
+ )
227
+
228
+ /*
229
+ Note 1: additions of 15 Jun 2005
230
+ */
@@ -0,0 +1,72 @@
1
+ routines (
2
+ mark_regions
3
+ main_suffix
4
+ consonant_pair
5
+ other_suffix
6
+ )
7
+
8
+ externals ( stem )
9
+
10
+ integers ( p1 x )
11
+
12
+ groupings ( v s_ending )
13
+
14
+ stringescapes {}
15
+
16
+ /* special characters */
17
+
18
+ stringdef a" '{U+00E4}'
19
+ stringdef ao '{U+00E5}'
20
+ stringdef o" '{U+00F6}'
21
+
22
+ define v 'aeiouy{a"}{ao}{o"}'
23
+
24
+ define s_ending 'bcdfghjklmnoprtvy'
25
+
26
+ define mark_regions as ( // Sets mark p1, which the suffix routines pass to setlimit; x serves as a minimum value for p1.
27
+
28
+ $p1 = limit
29
+ test ( hop 3 setmark x ) // x = position 3 characters in; test restores the cursor afterwards
30
+ goto v gopast non-v setmark p1 // p1 = position just past the first non-vowel found after a vowel
31
+ try ( $p1 < x $p1 = x ) // raise p1 to x when it is smaller
32
+ )
33
+
34
+ backwardmode (
35
+
36
+ define main_suffix as ( // Deletes the longest ending from the list below; a bare 's' is deleted only after a letter in s_ending.
37
+ setlimit tomark p1 for ([substring]) // the suffix search runs with the limit set to mark p1
38
+ among(
39
+
40
+ 'a' 'arna' 'erna' 'heterna' 'orna' 'ad' 'e' 'ade' 'ande' 'arne'
41
+ 'are' 'aste' 'en' 'anden' 'aren' 'heten' 'ern' 'ar' 'er' 'heter'
42
+ 'or' 'as' 'arnas' 'ernas' 'ornas' 'es' 'ades' 'andes' 'ens' 'arens'
43
+ 'hetens' 'erns' 'at' 'andet' 'het' 'ast'
44
+ (delete)
45
+ 's'
46
+ (s_ending delete) // fails, leaving the 's' in place, if the preceding letter is not in s_ending
47
+ )
48
+ )
49
+
50
+ define consonant_pair as setlimit tomark p1 for ( // Runs with the limit set to mark p1.
51
+ among('dd' 'gd' 'nn' 'dt' 'gt' 'kt' 'tt') // the word must end in one of these pairs
52
+ and ([next] delete) // 'and' retries from the same position: one character (the last letter of the pair) is deleted
53
+ )
54
+
55
+ define other_suffix as setlimit tomark p1 for ( // Runs with the limit set to mark p1; handles the five endings below.
56
+ [substring] among(
57
+ 'lig' 'ig' 'els' (delete)
58
+ 'l{o"}st' (<-'l{o"}s') // replacement drops only the final 't'
59
+ 'fullt' (<-'full') // replacement drops only the final 't'
60
+ )
61
+ )
62
+ )
63
+
64
+ define stem as ( // External entry point: each step is wrapped in do, so a failing step does not abort the rest.
65
+
66
+ do mark_regions
67
+ backwards ( // the three suffix routines are defined in backwardmode and run in this fixed order
68
+ do main_suffix
69
+ do consonant_pair
70
+ do other_suffix
71
+ )
72
+ )
@@ -0,0 +1,405 @@
1
+ /*
2
+ * Affix stripping stemming algorithm for Tamil
3
+ * By Damodharan Rajalingam
4
+ */
5
+
6
+ stringescapes {}
7
+
8
+ /* Aytham */
9
+ stringdef aytham '{U+0B83}'
10
+
11
+ /* Uyir - independent vowels */
12
+ stringdef a '{U+0B85}'
13
+ stringdef aa '{U+0B86}'
14
+ stringdef i '{U+0B87}'
15
+ stringdef ii '{U+0B88}'
16
+ stringdef u '{U+0B89}'
17
+ stringdef uu '{U+0B8A}'
18
+ stringdef e '{U+0B8E}'
19
+ stringdef ee '{U+0B8F}'
20
+ stringdef ai '{U+0B90}'
21
+ stringdef o '{U+0B92}'
22
+ stringdef oo '{U+0B93}'
23
+ stringdef au '{U+0B94}'
24
+
25
+ /* Consonants */
26
+ stringdef ka '{U+0B95}'
27
+ stringdef nga '{U+0B99}'
28
+ stringdef ca '{U+0B9A}'
29
+ stringdef ja '{U+0B9C}'
30
+ stringdef nya '{U+0B9E}'
31
+ stringdef tta '{U+0B9F}'
32
+ stringdef nna '{U+0BA3}'
33
+ stringdef ta '{U+0BA4}'
34
+ stringdef tha '{U+0BA4}'
35
+ stringdef na '{U+0BA8}'
36
+ stringdef nnna '{U+0BA9}'
37
+ stringdef pa '{U+0BAA}'
38
+ stringdef ma '{U+0BAE}'
39
+ stringdef ya '{U+0BAF}'
40
+ stringdef ra '{U+0BB0}'
41
+ stringdef rra '{U+0BB1}'
42
+ stringdef la '{U+0BB2}'
43
+ stringdef lla '{U+0BB3}'
44
+ stringdef llla '{U+0BB4}'
45
+ stringdef zha '{U+0BB4}'
46
+ stringdef va '{U+0BB5}'
47
+
48
+ /* Vatamozi - borrowed */
49
+ stringdef sha '{U+0BB6}'
50
+ stringdef ssa '{U+0BB7}'
51
+ stringdef sa '{U+0BB8}'
52
+ stringdef ha '{U+0BB9}'
53
+
54
+
55
+ /* Dependent vowel signs (kombu etc.) */
56
+ stringdef vs_aa '{U+0BBE}'
57
+ stringdef vs_i '{U+0BBF}'
58
+ stringdef vs_ii '{U+0BC0}'
59
+ stringdef vs_u '{U+0BC1}'
60
+ stringdef vs_uu '{U+0BC2}'
61
+ stringdef vs_e '{U+0BC6}'
62
+ stringdef vs_ee '{U+0BC7}'
63
+ stringdef vs_ai '{U+0BC8}'
64
+ stringdef vs_o '{U+0BCA}'
65
+ stringdef vs_oo '{U+0BCB}'
66
+ stringdef vs_au '{U+0BCC}'
67
+
68
+ /* Pulli */
69
+ stringdef pulli '{U+0BCD}'
70
+
71
+ /* AU length markk */
72
+ stringdef au_lmark '{U+0BD7}'
73
+
74
+
75
+ routines (
76
+ remove_plural_suffix
77
+ remove_question_suffixes
78
+ remove_question_prefixes
79
+ remove_pronoun_prefixes
80
+ remove_command_suffixes
81
+ remove_um
82
+ remove_vetrumai_urupukal
83
+ fix_va_start
84
+ fix_ending
85
+ fix_endings
86
+ remove_tense_suffix
87
+ remove_tense_suffixes
88
+ remove_common_word_endings
89
+ has_min_length
90
+ )
91
+
92
+ externals ( stem )
93
+
94
+ booleans (
95
+ found_a_match
96
+ found_vetrumai_urupu
97
+ )
98
+
99
+ define has_min_length as ( // Guard used by the removal routines: succeeds only when len exceeds 4.
100
+ $(len > 4)
101
+ )
102
+
103
+ define fix_va_start as ( // Rewrites va + vowel sign (oo, o, u or uu) at the cursor as the matching independent vowel.
104
+ (try '{va}{vs_oo}' and [ '{va}{vs_oo}' ] <- '{oo}' ) or // 'and' re-matches from the same position so the slice covers the pair
105
+ (try '{va}{vs_o}' and [ '{va}{vs_o}' ] <- '{o}' ) or
106
+ (try '{va}{vs_u}' and [ '{va}{vs_u}' ] <- '{u}' ) or
107
+ (try '{va}{vs_uu}' and [ '{va}{vs_uu}' ] <- '{uu}' )
108
+ )
109
+
110
+ define fix_endings as ( // Applies fix_ending over and over until it fails.
111
+ do repeat fix_ending // wrapped in do, so this routine itself does not fail
112
+ )
113
+
114
+ define remove_question_prefixes as ( // Deletes a prefix of the form e + consonant + pulli, then repairs a va-start.
115
+ [ ('{e}' ) among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') '{pulli}' ] delete
116
+ do fix_va_start // reached only if the delete above succeeded
117
+ )
118
+
119
+ // Gives signal t if an ending was fixed, signal f otherwise.
120
+ define fix_ending as ( // Applies at most one ending repair per call; fix_endings calls it repeatedly.
121
+ $(len > 3) // guard: nothing is attempted unless len exceeds 3
122
+ backwards ( // the alternatives below are tried in order from the end of the word; the first one that succeeds wins
123
+ ( [among('{na}{pulli}' '{na}{pulli}{ta}' '{na}{pulli}{ta}{pulli}') ] delete )
124
+ or
125
+ ( ['{ya}{pulli}' test among('{vs_ai}' '{vs_i}' '{vs_ii}') ] delete ) // deleted only when preceded by one of these vowel signs
126
+ or
127
+ ( [ '{tta}{pulli}{pa}{pulli}' or '{tta}{pulli}{ka}{pulli}' ] <- '{lla}{pulli}' )
128
+ or
129
+ ( [ '{nnna}{pulli}{rra}{pulli}' ] <- '{la}{pulli}' )
130
+ or
131
+ // ( [ '{rra}{pulli}{ka}{pulli}' or '{nnna}{pulli}{nnna}{pulli}' ] <- '{la}{pulli}' )
132
+ ( [ '{rra}{pulli}{ka}{pulli}' ] <- '{la}{pulli}' )
133
+ or
134
+ ( [ '{tta}{pulli}{tta}{pulli}' ] <- '{tta}{vs_u}' )
135
+ or
136
+ ( found_vetrumai_urupu [ '{ta}{pulli}{ta}{pulli}' (test not '{vs_ai}') ] <- '{ma}{pulli}' ] ) // tried only when the found_vetrumai_urupu flag is set
137
+ or
138
+ ( [ '{vs_u}{ka}{pulli}' or '{vs_u}{ka}{pulli}{ka}{pulli}' ] <- '{pulli}' )
139
+ or
140
+ ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete )
141
+ or
142
+ ( [ '{vs_u}{ka}{pulli}' ] <- '{pulli}' )
143
+ or
144
+ ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete )
145
+ or
146
+ ( [ '{pulli}' (among('{ya}' '{ra}' '{la}' '{va}' '{zha}' '{lla}') or among('{nga}' '{nya}' '{nna}' '{na}' '{ma}' '{nnna}')) '{pulli}' ] <- '{pulli}' )
147
+ or
148
+ ( [ among('{va}' '{ya}' '{va}{pulli}') ] delete )
149
+ or
150
+ ( [ '{nnna}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')) ] delete )
151
+ or
152
+ ( [ '{nga}{pulli}' (test not '{vs_ai}')] <- '{ma}{pulli}' ) // replaced unless preceded by vs_ai ...
153
+ or
154
+ ( [ '{nga}{pulli}' ] delete ) // ... otherwise the same ending is deleted
155
+ or
156
+ ( [ '{pulli}' (test (among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}') or '{pulli}')) ] delete ) // a final pulli is dropped when a vowel sign or another pulli precedes it
157
+ )
158
+ )
159
+
160
+ define remove_pronoun_prefixes as ( // Deletes a prefix of the form a/i/u + consonant + pulli, then repairs a va-start.
161
+ unset found_a_match
162
+ [ among('{a}' '{i}' '{u}') among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') '{pulli}' ] delete
163
+ (set found_a_match) // reached only if the delete above succeeded
164
+ do fix_va_start
165
+ )
166
+
167
+ define remove_plural_suffix as ( // Removes or rewrites an ending that finishes in ka + lla + pulli; longer forms are tried first.
168
+ unset found_a_match
169
+ backwards (
170
+ ( [ '{vs_u}{nga}{pulli}{ka}{lla}{pulli}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}')) ] <- '{pulli}' ) or
171
+ ( [ '{rra}{pulli}{ka}{lla}{pulli}' ] <- '{la}{pulli}' ) or
172
+ ( [ '{tta}{pulli}{ka}{lla}{pulli}' ] <- '{lla}{pulli}' ) or
173
+ ( [ '{ka}{lla}{pulli}' ] delete ) // fallback: the bare ending is deleted
174
+ (set found_a_match) // runs after whichever alternative above succeeded
175
+ )
176
+ )
177
+
178
+ define remove_question_suffixes as ( // Replaces a final vowel sign oo, ee or aa with pulli, then repairs the ending.
179
+ has_min_length // guard: the routine fails here when the word is too short
180
+ unset found_a_match
181
+ backwards (
182
+ do ( // wrapped in do, so fix_endings below runs whether or not a suffix matched
183
+ [ among('{vs_oo}' '{vs_ee}' '{vs_aa}') ] <- '{pulli}'
184
+ (set found_a_match)
185
+ )
186
+ )
187
+ do fix_endings
188
+ )
189
+
190
+ define remove_command_suffixes as ( // Deletes a final pa + vs_i or va + vs_i; no ending repair follows in this routine.
191
+ has_min_length // guard: the routine fails here when the word is too short
192
+ unset found_a_match
193
+ backwards (
194
+ [ among('{pa}{vs_i}' '{va}{vs_i}') ] delete
195
+ (set found_a_match)
196
+ )
197
+ )
198
+
199
+ define remove_um as ( // Replaces a final vs_u + ma + pulli with pulli.
200
+ unset found_a_match
201
+ has_min_length // guard: the routine fails here when the word is too short
202
+ backwards ( [ '{vs_u}{ma}{pulli}' ] <- '{pulli}'
203
+ (set found_a_match)
204
+ )
205
+ do fix_ending // note: a single fix_ending pass here, not the repeating fix_endings
206
+ )
207
+
208
+ define remove_common_word_endings as ( // Two groups of endings: the first is replaced by pulli, the second is deleted.
209
+ // These are not suffixes actually but are
210
+ // some words that are attached to other words
211
+ // but can be removed for stemming
212
+ unset found_a_match
213
+ has_min_length // guard: the routine fails here when the word is too short
214
+ backwards (
215
+ test ( [ '{vs_u}{tta}{nnna}{pulli}' or // group 1: alternatives are tried in the order written
216
+ '{vs_i}{la}{pulli}{la}{vs_ai}' or
217
+ '{vs_i}{tta}{ma}{pulli}' or
218
+ '{vs_i}{nnna}{pulli}{rra}{vs_i}' or
219
+ '{vs_aa}{ka}{vs_i}' or
220
+ '{vs_aa}{ka}{vs_i}{ya}' or
221
+ '{vs_e}{nnna}{pulli}{rra}{vs_u}' or
222
+ '{vs_u}{lla}{pulli}{lla}' or
223
+ '{vs_u}{tta}{vs_ai}{ya}' or
224
+ '{vs_u}{tta}{vs_ai}' or
225
+ '{vs_e}{nnna}{vs_u}{ma}{pulli}' or
226
+ ('{la}{pulli}{la}' test (not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
227
+ '{vs_e}{nnna}' or
228
+ '{vs_aa}{ka}{vs_i}' ] <- '{pulli}' // NOTE(review): this alternative repeats one already listed above
229
+ (set found_a_match)
230
+ )
231
+ or
232
+ test ( [ among('{pa}{tta}{vs_u}' // group 2: among picks the longest matching ending
233
+ '{pa}{tta}{pulli}{tta}'
234
+ '{pa}{tta}{pulli}{tta}{vs_u}'
235
+ '{pa}{tta}{pulli}{tta}{ta}{vs_u}'
236
+ '{pa}{tta}{pulli}{tta}{nna}'
237
+ '{ka}{vs_u}{ra}{vs_i}{ya}'
238
+ '{pa}{rra}{pulli}{rra}{vs_i}'
239
+ '{va}{vs_i}{tta}{vs_u}'
240
+ '{va}{vs_i}{tta}{pulli}{tta}{vs_u}'
241
+ '{pa}{tta}{vs_i}{ta}{vs_aa}{nnna}'
242
+ '{pa}{tta}{vs_i}'
243
+ '{ta}{vs_aa}{nnna}'
244
+ '{vs_e}{la}{pulli}{la}{vs_aa}{ma}{pulli}')
245
+ ] delete
246
+ (set found_a_match)
247
+ )
248
+ )
249
+ do fix_endings // reached only if one of the two groups matched
250
+ )
251
+
252
+ define remove_vetrumai_urupukal as ( // Removes or rewrites one ending from the groups below and sets both flags on success.
253
+ unset found_a_match
254
+ unset found_vetrumai_urupu
255
+ has_min_length // guard: the routine fails here when the word is too short
256
+ backwards (
257
+ ( // the groups are tried in order; the first one that succeeds wins
258
+ test ( ['{nnna}{vs_ai}'] delete )
259
+ or
260
+ test ([ ( '{vs_i}{nnna}{vs_ai}' or
261
+ '{vs_ai}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}'))) or
262
+ ( '{vs_ai}' (test (among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}')))
263
+ ] <- '{pulli}'
264
+ )
265
+ or
266
+ test ( [ // endings in this group are replaced by pulli
267
+ '{vs_o}{tta}{vs_u}' or
268
+ '{vs_oo}{tta}{vs_u}' or
269
+ '{vs_i}{la}{pulli}' or
270
+ '{vs_i}{rra}{pulli}' or
271
+ ('{vs_i}{nnna}{pulli}' (test not '{ma}')) or
272
+ '{vs_i}{nnna}{pulli}{rra}{vs_u}' or
273
+ '{vs_i}{ra}{vs_u}{na}{pulli}{ta}{vs_u}' or
274
+ '{va}{vs_i}{tta}' or
275
+ ($(len >= 7) '{vs_i}{tta}{ma}{pulli}') or
276
+ '{vs_aa}{la}{pulli}' or
277
+ '{vs_u}{tta}{vs_ai}' or
278
+ '{vs_aa}{ma}{la}{pulli}' or
279
+ ('{la}{pulli}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
280
+ '{vs_u}{lla}{pulli}'
281
+ ] <- '{pulli}'
282
+ )
283
+ or
284
+ test ( [ // endings in this group are deleted outright
285
+ '{ka}{nna}{pulli}' or
286
+ '{ma}{vs_u}{nnna}{pulli}' or
287
+ '{ma}{vs_ee}{la}{pulli}' or
288
+ '{ma}{vs_ee}{rra}{pulli}' or
289
+ '{ka}{vs_ii}{llla}{pulli}' or
290
+ '{pa}{vs_i}{nnna}{pulli}' or
291
+ ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')))
292
+ ] delete
293
+ )
294
+ or
295
+ test ([ '{vs_ii}' ] <- '{vs_i}') // last resort: a final vs_ii is rewritten as vs_i
296
+ )
297
+ (set found_a_match)
298
+ (set found_vetrumai_urupu) // this flag is read by one alternative in fix_ending
299
+ do ( [ '{vs_i}{nnna}{pulli}' ] <- '{pulli}' ) // optional extra rewrite after a successful match
300
+ )
301
+ do fix_endings
302
+ )
303
+
304
+ define remove_tense_suffixes as ( // Calls remove_tense_suffix repeatedly for as long as the previous call set found_a_match.
305
+ set found_a_match // primes the flag so the first iteration runs
306
+ repeat ( found_a_match (do remove_tense_suffix) ) // remove_tense_suffix unsets the flag and sets it again only on a match
307
+ )
308
+
309
+ define remove_tense_suffix as ( // One pass of suffix removal; sets found_a_match when anything was removed or rewritten.
310
+ unset found_a_match
311
+ has_min_length // guard: the routine fails here when the word is too short
312
+ backwards (
313
+ do ( // four groups tried in order; do keeps going to the second stage even if none matches
314
+ test ( [among( // group 1: deleted
315
+ '{ka}{vs_o}{nna}{pulli}{tta}{vs_i}{ra}{pulli}'
316
+ '{pa}{tta}{vs_u}'
317
+ )] delete
318
+ (set found_a_match)
319
+ )
320
+ or
321
+ test ( [ // group 2: deleted; alternatives are tried in the order written
322
+ '{ma}{vs_aa}{ra}{pulli}' or
323
+ '{ma}{vs_i}{nnna}{pulli}' or
324
+ '{nnna}{nnna}{pulli}' or
325
+ '{nnna}{vs_aa}{nnna}{pulli}' or
326
+ '{nnna}{vs_aa}{lla}{pulli}' or
327
+ '{nnna}{vs_aa}{ra}{pulli}' or
328
+ ('{va}{nnna}{pulli}' test (not among('{a}' '{aa}' '{i}' '{ii}' '{u}' '{uu}' '{e}' '{ee}' '{ai}' '{o}' '{oo}' '{au}')) ) or
329
+ '{nnna}{lla}{pulli}' or
330
+ '{va}{lla}{pulli}' or
331
+ '{nnna}{ra}{pulli}' or
332
+ '{va}{ra}{pulli}' or
333
+ '{nnna}' or '{pa}' or '{ka}' or '{ta}' or '{ya}' or
334
+ '{pa}{nnna}{pulli}' or
335
+ '{pa}{lla}{pulli}' or
336
+ '{pa}{ra}{pulli}' or
337
+ ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
338
+ '{vs_i}{rra}{pulli}{rra}{vs_u}' or
339
+ '{pa}{ma}{pulli}' or
340
+ '{nnna}{ma}{pulli}' or
341
+ '{ta}{vs_u}{ma}{pulli}' or
342
+ '{rra}{vs_u}{ma}{pulli}' or
343
+ '{ka}{vs_u}{ma}{pulli}' or
344
+ '{nnna}{vs_e}{nnna}{pulli}' or
345
+ '{nnna}{vs_ai}' or
346
+ '{va}{vs_ai}'
347
+ ] delete
348
+ (set found_a_match)
349
+ )
350
+ or
351
+ test ( [ // group 3: replaced by pulli
352
+ ('{vs_aa}{nnna}{pulli}' test (not '{ca}')) or
353
+ '{vs_aa}{lla}{pulli}' or
354
+ '{vs_aa}{ra}{pulli}' or
355
+ '{vs_ee}{nnna}{pulli}' or
356
+ '{vs_aa}' or
357
+ '{vs_aa}{ma}{pulli}' or
358
+ '{vs_e}{ma}{pulli}' or
359
+ '{vs_ee}{ma}{pulli}' or
360
+ '{vs_oo}{ma}{pulli}' or
361
+ '{ka}{vs_u}{ma}{pulli}' or
362
+ '{ta}{vs_u}{ma}{pulli}' or
363
+ '{tta}{vs_u}{ma}{pulli}' or
364
+ '{rra}{vs_u}{ma}{pulli}' or
365
+ '{vs_aa}{ya}{pulli}' or
366
+ '{nnna}{vs_e}{nnna}{pulli}' or
367
+ '{nnna}{vs_i}{ra}{pulli}' or
368
+ '{vs_ii}{ra}{pulli}' or
369
+ '{vs_ii}{ya}{ra}{pulli}'
370
+ ] <- '{pulli}'
371
+ (set found_a_match)
372
+ )
373
+ or
374
+ test ( ([ '{ka}{vs_u}' or '{ta}{vs_u}' ) (test '{pulli}') ] delete // group 4: deleted only when a pulli precedes
375
+ (set found_a_match)
376
+ )
377
+ )
378
+ do ([among( // second stage: attempted regardless of the outcome above
379
+ '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}'
380
+ '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}{pulli}'
381
+ '{ka}{vs_i}{nnna}{pulli}{rra}'
382
+ '{ka}{vs_i}{nnna}{pulli}{rra}{pulli}'
383
+ '{ka}{vs_i}{rra}'
384
+ '{ka}{vs_i}{rra}{pulli}'
385
+ )] delete
386
+ (set found_a_match)
387
+ )
388
+ )
389
+ do fix_endings
390
+ )
391
+
392
+ define stem as ( // External entry point: one initial ending repair, a length guard, then the removal routines in fixed order.
393
+ unset found_vetrumai_urupu
394
+ do fix_ending
395
+ has_min_length // not wrapped in do: if the word is too short, stem fails here and the steps below are skipped
396
+ do remove_question_prefixes
397
+ do remove_pronoun_prefixes
398
+ do remove_question_suffixes
399
+ do remove_um
400
+ do remove_common_word_endings
401
+ do remove_vetrumai_urupukal
402
+ do remove_plural_suffix
403
+ do remove_command_suffixes
404
+ do remove_tense_suffixes
405
+ )