mittens 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE.txt +30 -0
  5. data/README.md +62 -0
  6. data/Rakefile +21 -0
  7. data/ext/mittens/ext.c +96 -0
  8. data/ext/mittens/extconf.rb +12 -0
  9. data/lib/mittens/version.rb +3 -0
  10. data/lib/mittens.rb +7 -0
  11. data/mittens.gemspec +22 -0
  12. data/vendor/snowball/.gitignore +26 -0
  13. data/vendor/snowball/.travis.yml +112 -0
  14. data/vendor/snowball/AUTHORS +27 -0
  15. data/vendor/snowball/CONTRIBUTING.rst +216 -0
  16. data/vendor/snowball/COPYING +29 -0
  17. data/vendor/snowball/GNUmakefile +742 -0
  18. data/vendor/snowball/NEWS +754 -0
  19. data/vendor/snowball/README.rst +37 -0
  20. data/vendor/snowball/ada/README.md +74 -0
  21. data/vendor/snowball/ada/generate/generate.adb +83 -0
  22. data/vendor/snowball/ada/generate.gpr +21 -0
  23. data/vendor/snowball/ada/src/stemmer.adb +620 -0
  24. data/vendor/snowball/ada/src/stemmer.ads +219 -0
  25. data/vendor/snowball/ada/src/stemwords.adb +70 -0
  26. data/vendor/snowball/ada/stemmer_config.gpr +83 -0
  27. data/vendor/snowball/ada/stemwords.gpr +21 -0
  28. data/vendor/snowball/algorithms/arabic.sbl +558 -0
  29. data/vendor/snowball/algorithms/armenian.sbl +301 -0
  30. data/vendor/snowball/algorithms/basque.sbl +149 -0
  31. data/vendor/snowball/algorithms/catalan.sbl +202 -0
  32. data/vendor/snowball/algorithms/danish.sbl +93 -0
  33. data/vendor/snowball/algorithms/dutch.sbl +164 -0
  34. data/vendor/snowball/algorithms/english.sbl +229 -0
  35. data/vendor/snowball/algorithms/finnish.sbl +197 -0
  36. data/vendor/snowball/algorithms/french.sbl +254 -0
  37. data/vendor/snowball/algorithms/german.sbl +139 -0
  38. data/vendor/snowball/algorithms/german2.sbl +145 -0
  39. data/vendor/snowball/algorithms/greek.sbl +701 -0
  40. data/vendor/snowball/algorithms/hindi.sbl +323 -0
  41. data/vendor/snowball/algorithms/hungarian.sbl +241 -0
  42. data/vendor/snowball/algorithms/indonesian.sbl +192 -0
  43. data/vendor/snowball/algorithms/irish.sbl +149 -0
  44. data/vendor/snowball/algorithms/italian.sbl +202 -0
  45. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
  46. data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
  47. data/vendor/snowball/algorithms/lovins.sbl +208 -0
  48. data/vendor/snowball/algorithms/nepali.sbl +92 -0
  49. data/vendor/snowball/algorithms/norwegian.sbl +80 -0
  50. data/vendor/snowball/algorithms/porter.sbl +139 -0
  51. data/vendor/snowball/algorithms/portuguese.sbl +218 -0
  52. data/vendor/snowball/algorithms/romanian.sbl +236 -0
  53. data/vendor/snowball/algorithms/russian.sbl +221 -0
  54. data/vendor/snowball/algorithms/serbian.sbl +2379 -0
  55. data/vendor/snowball/algorithms/spanish.sbl +230 -0
  56. data/vendor/snowball/algorithms/swedish.sbl +72 -0
  57. data/vendor/snowball/algorithms/tamil.sbl +405 -0
  58. data/vendor/snowball/algorithms/turkish.sbl +470 -0
  59. data/vendor/snowball/algorithms/yiddish.sbl +460 -0
  60. data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
  61. data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
  62. data/vendor/snowball/charsets/cp850.sbl +130 -0
  63. data/vendor/snowball/compiler/analyser.c +1547 -0
  64. data/vendor/snowball/compiler/driver.c +615 -0
  65. data/vendor/snowball/compiler/generator.c +1748 -0
  66. data/vendor/snowball/compiler/generator_ada.c +1702 -0
  67. data/vendor/snowball/compiler/generator_csharp.c +1322 -0
  68. data/vendor/snowball/compiler/generator_go.c +1278 -0
  69. data/vendor/snowball/compiler/generator_java.c +1313 -0
  70. data/vendor/snowball/compiler/generator_js.c +1316 -0
  71. data/vendor/snowball/compiler/generator_pascal.c +1387 -0
  72. data/vendor/snowball/compiler/generator_python.c +1337 -0
  73. data/vendor/snowball/compiler/generator_rust.c +1295 -0
  74. data/vendor/snowball/compiler/header.h +418 -0
  75. data/vendor/snowball/compiler/space.c +286 -0
  76. data/vendor/snowball/compiler/syswords.h +86 -0
  77. data/vendor/snowball/compiler/syswords2.h +13 -0
  78. data/vendor/snowball/compiler/tokeniser.c +567 -0
  79. data/vendor/snowball/csharp/.gitignore +8 -0
  80. data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
  81. data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
  82. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
  83. data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
  84. data/vendor/snowball/csharp/Stemwords/App.config +6 -0
  85. data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
  86. data/vendor/snowball/doc/TODO +12 -0
  87. data/vendor/snowball/doc/libstemmer_c_README +148 -0
  88. data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
  89. data/vendor/snowball/doc/libstemmer_java_README +67 -0
  90. data/vendor/snowball/doc/libstemmer_js_README +48 -0
  91. data/vendor/snowball/doc/libstemmer_python_README +113 -0
  92. data/vendor/snowball/examples/stemwords.c +204 -0
  93. data/vendor/snowball/go/README.md +55 -0
  94. data/vendor/snowball/go/among.go +16 -0
  95. data/vendor/snowball/go/env.go +403 -0
  96. data/vendor/snowball/go/stemwords/generate.go +68 -0
  97. data/vendor/snowball/go/stemwords/main.go +68 -0
  98. data/vendor/snowball/go/util.go +34 -0
  99. data/vendor/snowball/iconv.py +50 -0
  100. data/vendor/snowball/include/libstemmer.h +78 -0
  101. data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
  102. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
  103. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
  104. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
  105. data/vendor/snowball/javascript/base-stemmer.js +294 -0
  106. data/vendor/snowball/javascript/stemwords.js +106 -0
  107. data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
  108. data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
  109. data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
  110. data/vendor/snowball/libstemmer/modules.txt +63 -0
  111. data/vendor/snowball/libstemmer/test.c +34 -0
  112. data/vendor/snowball/pascal/.gitignore +4 -0
  113. data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
  114. data/vendor/snowball/pascal/generate.pl +23 -0
  115. data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
  116. data/vendor/snowball/python/MANIFEST.in +7 -0
  117. data/vendor/snowball/python/create_init.py +54 -0
  118. data/vendor/snowball/python/setup.cfg +6 -0
  119. data/vendor/snowball/python/setup.py +81 -0
  120. data/vendor/snowball/python/snowballstemmer/among.py +13 -0
  121. data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
  122. data/vendor/snowball/python/stemwords.py +101 -0
  123. data/vendor/snowball/python/testapp.py +28 -0
  124. data/vendor/snowball/runtime/api.c +58 -0
  125. data/vendor/snowball/runtime/api.h +32 -0
  126. data/vendor/snowball/runtime/header.h +61 -0
  127. data/vendor/snowball/runtime/utilities.c +513 -0
  128. data/vendor/snowball/rust/Cargo.toml +7 -0
  129. data/vendor/snowball/rust/build.rs +55 -0
  130. data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
  131. data/vendor/snowball/rust/src/main.rs +102 -0
  132. data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
  133. data/vendor/snowball/rust/src/snowball/among.rs +6 -0
  134. data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
  135. data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
  136. data/vendor/snowball/tests/stemtest.c +95 -0
  137. metadata +178 -0
@@ -0,0 +1,218 @@
1
+ routines (
2
+ prelude postlude mark_regions
3
+ RV R1 R2
4
+ standard_suffix
5
+ verb_suffix
6
+ residual_suffix
7
+ residual_form
8
+ )
9
+
10
+ externals ( stem )
11
+
12
+ integers ( pV p1 p2 )
13
+
14
+ groupings ( v )
15
+
16
+ stringescapes {}
17
+
18
+ /* special characters */
19
+
20
+ stringdef a' '{U+00E1}' // a-acute
21
+ stringdef a^ '{U+00E2}' // a-circumflex e.g. 'bota^nico'
22
+ stringdef e' '{U+00E9}' // e-acute
23
+ stringdef e^ '{U+00EA}' // e-circumflex
24
+ stringdef i' '{U+00ED}' // i-acute
25
+ stringdef o^ '{U+00F4}' // o-circumflex
26
+ stringdef o' '{U+00F3}' // o-acute
27
+ stringdef u' '{U+00FA}' // u-acute
28
+ stringdef c, '{U+00E7}' // c-cedilla
29
+
30
+ stringdef a~ '{U+00E3}' // a-tilde
31
+ stringdef o~ '{U+00F5}' // o-tilde
32
+
33
+
34
+ define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}'
35
+
36
+ define prelude as repeat ( // walk the word, rewriting a-tilde / o-tilde as the two-character forms 'a~' / 'o~'
37
+ [substring] among(
38
+ '{a~}' (<- 'a~')
39
+ '{o~}' (<- 'o~')
40
+ '' (next) // anything else: step over one character
41
+ ) //or next
42
+ )
43
+
44
+ define mark_regions as ( // compute the marks pV, p1 and p2 that the RV, R1 and R2 tests compare against
45
+
46
+ $pV = limit
47
+ $p1 = limit
48
+ $p2 = limit // defaults
49
+
50
+ do ( // if neither alternative below succeeds, pV keeps its default
51
+ ( v (non-v gopast v) or (v gopast non-v) ) // first letter a vowel: if a non-vowel follows go past the next vowel, otherwise go past the next non-vowel
52
+ or
53
+ ( non-v (non-v gopast v) or (v next) ) // first letter a non-vowel: if another non-vowel follows go past the next vowel, otherwise step one character past the vowel
54
+ setmark pV // pV = position reached
55
+ )
56
+ do (
57
+ gopast v gopast non-v setmark p1 // p1 = position after the first non-vowel that follows a vowel
58
+ gopast v gopast non-v setmark p2 // p2 = the same step repeated from p1
59
+ )
60
+ )
61
+
62
+ define postlude as repeat ( // walk the word, turning the two-character forms 'a~' / 'o~' back into a-tilde / o-tilde
63
+ [substring] among(
64
+ 'a~' (<- '{a~}')
65
+ 'o~' (<- '{o~}')
66
+ '' (next) // anything else: step over one character
67
+ ) //or next
68
+ )
69
+
70
+ backwardmode (
71
+
72
+ define RV as $pV <= cursor // succeeds when the cursor is at or beyond mark pV
73
+ define R1 as $p1 <= cursor // succeeds when the cursor is at or beyond mark p1
74
+ define R2 as $p2 <= cursor // succeeds when the cursor is at or beyond mark p2
75
+
76
+ define standard_suffix as ( // match one of the listed endings and run the action attached to its group
77
+ [substring] among(
78
+
79
+ 'eza' 'ezas'
80
+ 'ico' 'ica' 'icos' 'icas'
81
+ 'ismo' 'ismos'
82
+ '{a'}vel'
83
+ '{i'}vel'
84
+ 'ista' 'istas'
85
+ 'oso' 'osa' 'osos' 'osas'
86
+ 'amento' 'amentos'
87
+ 'imento' 'imentos'
88
+
89
+ 'adora' 'ador' 'a{c,}a~o'
90
+ 'adoras' 'adores' 'a{c,}o~es' // no -ic test
91
+ 'ante' 'antes' '{a^}ncia' // Note 1
92
+ (
93
+ R2 delete // delete the ending, but only if it lies in R2
94
+ )
95
+ 'logia'
96
+ 'logias'
97
+ (
98
+ R2 <- 'log' // in R2: replace the ending with 'log'
99
+ )
100
+ 'u{c,}a~o' 'u{c,}o~es'
101
+ (
102
+ R2 <- 'u' // in R2: replace the ending with 'u'
103
+ )
104
+ '{e^}ncia' '{e^}ncias'
105
+ (
106
+ R2 <- 'ente' // in R2: replace the ending with 'ente'
107
+ )
108
+ 'amente'
109
+ (
110
+ R1 delete // this ending is tested against R1, not R2
111
+ try ( // optional follow-up; failure here does not undo the deletion above
112
+ [substring] R2 delete among( // a preceding iv / os / ic / ad is deleted if it lies in R2
113
+ 'iv' (['at'] R2 delete) // after 'iv', a further preceding 'at' in R2 is deleted too
114
+ 'os'
115
+ 'ic'
116
+ 'ad'
117
+ )
118
+ )
119
+ )
120
+ 'mente'
121
+ (
122
+ R2 delete
123
+ try (
124
+ [substring] among(
125
+ 'ante' // Note 1
126
+ 'avel' // NOTE(review): unaccented here, unlike '{a'}vel' in the outer list - confirm this is intended
127
+ '{i'}vel' (R2 delete)
128
+ )
129
+ )
130
+ )
131
+ 'idade'
132
+ 'idades'
133
+ (
134
+ R2 delete
135
+ try (
136
+ [substring] among(
137
+ 'abil'
138
+ 'ic'
139
+ 'iv' (R2 delete)
140
+ )
141
+ )
142
+ )
143
+ 'iva' 'ivo'
144
+ 'ivas' 'ivos'
145
+ (
146
+ R2 delete
147
+ try (
148
+ ['at'] R2 delete // but not a further ['ic'] R2 delete
149
+ )
150
+ )
151
+ 'ira' 'iras'
152
+ (
153
+ RV 'e' // -eira -eiras usually non-verbal
154
+ <- 'ir' // the matched 'ira' / 'iras' (not the preceding 'e') is replaced by 'ir'
155
+ )
156
+ )
157
+ )
158
+
159
+ define verb_suffix as setlimit tomark pV for ( // the limit is moved to pV while matching, so the ending must lie after pV
160
+ [substring] among(
161
+ 'ada' 'ida' 'ia' 'aria' 'eria' 'iria' 'ar{a'}' 'ara' 'er{a'}'
162
+ 'era' 'ir{a'}' 'ava' 'asse' 'esse' 'isse' 'aste' 'este' 'iste'
163
+ 'ei' 'arei' 'erei' 'irei' 'am' 'iam' 'ariam' 'eriam' 'iriam'
164
+ 'aram' 'eram' 'iram' 'avam' 'em' 'arem' 'erem' 'irem' 'assem'
165
+ 'essem' 'issem' 'ado' 'ido' 'ando' 'endo' 'indo' 'ara~o'
166
+ 'era~o' 'ira~o' 'ar' 'er' 'ir' 'as' 'adas' 'idas' 'ias'
167
+ 'arias' 'erias' 'irias' 'ar{a'}s' 'aras' 'er{a'}s' 'eras'
168
+ 'ir{a'}s' 'avas' 'es' 'ardes' 'erdes' 'irdes' 'ares' 'eres'
169
+ 'ires' 'asses' 'esses' 'isses' 'astes' 'estes' 'istes' 'is'
170
+ 'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis'
171
+ '{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis'
172
+ '{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos'
173
+ '{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos'
174
+ 'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos'
175
+ 'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos'
176
+ '{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou'
177
+
178
+ 'ira' 'iras'
179
+ (delete) // single action shared by every ending in the list: delete it
180
+ )
181
+ )
182
+
183
+ define residual_suffix as ( // strip one of a few short final endings
184
+ [substring] among(
185
+ 'os'
186
+ 'a' 'i' 'o' '{a'}' '{i'}' '{o'}'
187
+ ( RV delete ) // deleted only when the ending lies in RV
188
+ )
189
+ )
190
+
191
+ define residual_form as ( // tidy a final e / e-acute / e-circumflex or a final c-cedilla
192
+ [substring] among(
193
+ 'e' '{e'}' '{e^}'
194
+ ( RV delete [('u'] test 'g') or // delete the ending if in RV, then mark a preceding 'u' that itself follows 'g' ...
195
+ ('i'] test 'c') RV delete ) // ... or a preceding 'i' that follows 'c'; that letter is deleted too if in RV
196
+ '{c,}' (<-'c') // c-cedilla is replaced by plain 'c'
197
+ )
198
+ )
199
+ )
200
+
201
+ define stem as ( // external entry point: runs the routines of this file in order
202
+ do prelude
203
+ do mark_regions
204
+ backwards ( // the suffix routines run from the end of the word
205
+ do (
206
+ ( ( standard_suffix or verb_suffix ) // verb_suffix is tried only if standard_suffix fails
207
+ and do ( ['i'] test 'c' RV delete ) // afterwards, a final 'i' preceded by 'c' is deleted if in RV
208
+ )
209
+ or residual_suffix // tried only when neither suffix routine succeeded
210
+ )
211
+ do residual_form
212
+ )
213
+ do postlude
214
+ )
215
+
216
+ /*
217
+ Note 1: additions of 15 Jun 2005
218
+ */
@@ -0,0 +1,236 @@
1
+
2
+ routines (
3
+ prelude postlude mark_regions
4
+ RV R1 R2
5
+ step_0
6
+ standard_suffix combo_suffix
7
+ verb_suffix
8
+ vowel_suffix
9
+ )
10
+
11
+ externals ( stem )
12
+
13
+ integers ( pV p1 p2 )
14
+
15
+ groupings ( v )
16
+
17
+ booleans ( standard_suffix_removed )
18
+
19
+ stringescapes {}
20
+
21
+ /* special characters */
22
+
23
+ stringdef a^ '{U+00E2}' // a circumflex
24
+ stringdef i^ '{U+00EE}' // i circumflex
25
+ stringdef a+ '{U+0103}' // a breve
26
+ stringdef s, '{U+015F}' // s cedilla
27
+ stringdef t, '{U+0163}' // t cedilla
28
+
29
+ define v 'aeiou{a^}{i^}{a+}'
30
+
31
+ define prelude as ( // upper-case every 'u' or 'i' that stands between two vowels
32
+ repeat goto (
33
+ v [ ('u' ] v <- 'U') or // vowel, 'u', vowel: the 'u' becomes 'U'
34
+ ('i' ] v <- 'I') // vowel, 'i', vowel: the 'i' becomes 'I'
35
+ )
36
+ )
37
+
38
+ define mark_regions as ( // compute the marks pV, p1 and p2 that the RV, R1 and R2 tests compare against
39
+
40
+ $pV = limit
41
+ $p1 = limit
42
+ $p2 = limit // defaults
43
+
44
+ do ( // if neither alternative below succeeds, pV keeps its default
45
+ ( v (non-v gopast v) or (v gopast non-v) ) // first letter a vowel: if a non-vowel follows go past the next vowel, otherwise go past the next non-vowel
46
+ or
47
+ ( non-v (non-v gopast v) or (v next) ) // first letter a non-vowel: if another non-vowel follows go past the next vowel, otherwise step one character past the vowel
48
+ setmark pV // pV = position reached
49
+ )
50
+ do (
51
+ gopast v gopast non-v setmark p1 // p1 = position after the first non-vowel that follows a vowel
52
+ gopast v gopast non-v setmark p2 // p2 = the same step repeated from p1
53
+ )
54
+ )
55
+
56
+ define postlude as repeat ( // walk the word, lower-casing any 'I' and 'U'
57
+
58
+ [substring] among(
59
+ 'I' (<- 'i')
60
+ 'U' (<- 'u')
61
+ '' (next) // anything else: step over one character
62
+ )
63
+
64
+ )
65
+
66
+ backwardmode (
67
+
68
+ define RV as $pV <= cursor // succeeds when the cursor is at or beyond mark pV
69
+ define R1 as $p1 <= cursor // succeeds when the cursor is at or beyond mark p1
70
+ define R2 as $p2 <= cursor // succeeds when the cursor is at or beyond mark p2
71
+
72
+ define step_0 as ( // first suffix step: delete or shorten one of the listed endings
73
+ [substring] R1 among( // the matched ending must lie in R1
74
+ 'ul' 'ului'
75
+ ( delete )
76
+ 'aua'
77
+ ( <-'a' )
78
+ 'ea' 'ele' 'elor'
79
+ ( <-'e' )
80
+ 'ii' 'iua' 'iei' 'iile' 'iilor' 'ilor'
81
+ ( <-'i')
82
+ 'ile'
83
+ ( not 'ab' <- 'i' ) // 'ile' is left untouched when it is preceded by 'ab'
84
+ 'atei'
85
+ ( <- 'at' )
86
+ 'a{t,}ie' 'a{t,}ia'
87
+ ( <- 'a{t,}i' )
88
+ )
89
+ )
90
+
91
+ define combo_suffix as test ( // shorten a compound ending; wrapped in 'test', so the cursor is restored afterwards
92
+ [substring] R1 ( // the matched ending must lie in R1
93
+ among(
94
+ /* 'IST'. alternative: include the following
95
+ 'alism' 'alisme'
96
+ 'alist' 'alista' 'aliste' 'alisti' 'alist{a+}' 'ali{s,}ti' (
97
+ <- 'al'
98
+ )
99
+ */
100
+ 'abilitate' 'abilitati' 'abilit{a+}i' 'abilit{a+}{t,}i' (
101
+ <- 'abil'
102
+ )
103
+ 'ibilitate' (
104
+ <- 'ibil'
105
+ )
106
+ 'ivitate' 'ivitati' 'ivit{a+}i' 'ivit{a+}{t,}i' (
107
+ <- 'iv'
108
+ )
109
+ 'icitate' 'icitati' 'icit{a+}i' 'icit{a+}{t,}i'
110
+ 'icator' 'icatori'
111
+ 'iciv' 'iciva' 'icive' 'icivi' 'iciv{a+}'
112
+ 'ical' 'icala' 'icale' 'icali' 'ical{a+}' (
113
+ <- 'ic'
114
+ )
115
+ 'ativ' 'ativa' 'ative' 'ativi' 'ativ{a+}' 'a{t,}iune'
116
+ 'atoare' 'ator' 'atori'
117
+ '{a+}toare' '{a+}tor' '{a+}tori' (
118
+ <- 'at'
119
+ )
120
+ 'itiv' 'itiva' 'itive' 'itivi' 'itiv{a+}' 'i{t,}iune'
121
+ 'itoare' 'itor' 'itori' (
122
+ <- 'it'
123
+ )
124
+ )
125
+ set standard_suffix_removed // flag that a replacement was made
126
+ )
127
+ )
128
+
129
+ define standard_suffix as ( // shorten compound endings, then remove one standard ending in R2
130
+ unset standard_suffix_removed // start with the flag cleared
131
+ repeat combo_suffix // apply combo_suffix until it no longer succeeds
132
+ [substring] R2 ( // the matched ending must lie in R2
133
+ among(
134
+
135
+ // past participle is treated here, rather than
136
+ // as a verb ending:
137
+ 'at' 'ata' 'at{a+}' 'ati' 'ate'
138
+ 'ut' 'uta' 'ut{a+}' 'uti' 'ute'
139
+ 'it' 'ita' 'it{a+}' 'iti' 'ite'
140
+
141
+ 'ic' 'ica' 'ice' 'ici' 'ic{a+}'
142
+ 'abil' 'abila' 'abile' 'abili' 'abil{a+}'
143
+ 'ibil' 'ibila' 'ibile' 'ibili' 'ibil{a+}'
144
+ 'oasa' 'oas{a+}' 'oase' 'os' 'osi' 'o{s,}i'
145
+ 'ant' 'anta' 'ante' 'anti' 'ant{a+}'
146
+ 'ator' 'atori'
147
+ 'itate' 'itati' 'it{a+}i' 'it{a+}{t,}i'
148
+ 'iv' 'iva' 'ive' 'ivi' 'iv{a+}' (
149
+ delete
150
+ )
151
+ 'iune' 'iuni' (
152
+ '{t,}'] <- 't' // requires a preceding t-cedilla; the slice is extended over it and replaced by plain 't'
153
+ )
154
+ 'ism' 'isme'
155
+ 'ist' 'ista' 'iste' 'isti' 'ist{a+}' 'i{s,}ti' (
156
+ <- 'ist'
157
+ /* 'IST'. alternative: remove with <- '' */
158
+ )
159
+ )
160
+ set standard_suffix_removed // flag read by stem to decide whether verb_suffix is tried
161
+ )
162
+ )
163
+
164
+ define verb_suffix as setlimit tomark pV for ( // the limit is moved to pV while matching, so the ending must lie after pV
165
+ [substring] among(
166
+ // 'long' infinitive:
167
+ 'are' 'ere' 'ire' '{a^}re'
168
+
169
+ // gerund:
170
+ 'ind' '{a^}nd'
171
+ 'indu' '{a^}ndu'
172
+
173
+ 'eze'
174
+ 'easc{a+}'
175
+ // present:
176
+ 'ez' 'ezi' 'eaz{a+}' 'esc' 'e{s,}ti'
177
+ 'e{s,}te'
178
+ '{a+}sc' '{a+}{s,}ti'
179
+ '{a+}{s,}te'
180
+
181
+ // imperfect:
182
+ 'am' 'ai' 'au'
183
+ 'eam' 'eai' 'ea' 'ea{t,}i' 'eau'
184
+ 'iam' 'iai' 'ia' 'ia{t,}i' 'iau'
185
+
186
+ // past: // (not 'ii')
187
+ 'ui'
188
+ 'a{s,}i' 'ar{a+}m' 'ar{a+}{t,}i' 'ar{a+}'
189
+ 'u{s,}i' 'ur{a+}m' 'ur{a+}{t,}i' 'ur{a+}'
190
+ 'i{s,}i' 'ir{a+}m' 'ir{a+}{t,}i' 'ir{a+}'
191
+ '{a^}i' '{a^}{s,}i' '{a^}r{a+}m' '{a^}r{a+}{t,}i' '{a^}r{a+}'
192
+
193
+ // pluperfect:
194
+ 'asem' 'ase{s,}i' 'ase' 'aser{a+}m' 'aser{a+}{t,}i' 'aser{a+}'
195
+ 'isem' 'ise{s,}i' 'ise' 'iser{a+}m' 'iser{a+}{t,}i' 'iser{a+}'
196
+ '{a^}sem' '{a^}se{s,}i' '{a^}se' '{a^}ser{a+}m' '{a^}ser{a+}{t,}i'
197
+ '{a^}ser{a+}'
198
+ 'usem' 'use{s,}i' 'use' 'user{a+}m' 'user{a+}{t,}i' 'user{a+}'
199
+
200
+ ( non-v or 'u' delete ) // endings above are deleted only when preceded by a non-vowel or by 'u'
201
+
202
+ // present:
203
+ '{a+}m' 'a{t,}i'
204
+ 'em' 'e{t,}i'
205
+ 'im' 'i{t,}i'
206
+ '{a^}m' '{a^}{t,}i'
207
+
208
+ // past:
209
+ 'se{s,}i' 'ser{a+}m' 'ser{a+}{t,}i' 'ser{a+}'
210
+ 'sei' 'se'
211
+
212
+ // pluperfect:
213
+ 'sesem' 'sese{s,}i' 'sese' 'seser{a+}m' 'seser{a+}{t,}i' 'seser{a+}'
214
+ (delete) // endings in this second group are deleted unconditionally
215
+ )
216
+ )
217
+
218
+ define vowel_suffix as ( // delete a short final vowel ending
219
+ [substring] RV among ( // the matched ending must lie in RV
220
+ 'a' 'e' 'i' 'ie' '{a+}' ( delete )
221
+ )
222
+ )
223
+ )
224
+
225
+ define stem as ( // external entry point: runs the routines of this file in order
226
+ do prelude
227
+ do mark_regions
228
+ backwards ( // the suffix routines run from the end of the word
229
+ do step_0
230
+ do standard_suffix
231
+ do ( standard_suffix_removed or verb_suffix ) // verb_suffix is tried only when the flag is not set
232
+ do vowel_suffix
233
+ )
234
+ do postlude
235
+ )
236
+
@@ -0,0 +1,221 @@
1
+ stringescapes {}
2
+
3
+ /* the 33 Cyrillic letters represented in ASCII characters following the
4
+ * conventions of the standard Library of Congress transliteration: */
5
+
6
+ stringdef a '{U+0430}'
7
+ stringdef b '{U+0431}'
8
+ stringdef v '{U+0432}'
9
+ stringdef g '{U+0433}'
10
+ stringdef d '{U+0434}'
11
+ stringdef e '{U+0435}'
12
+ stringdef e" '{U+0451}'
13
+ stringdef zh '{U+0436}'
14
+ stringdef z '{U+0437}'
15
+ stringdef i '{U+0438}'
16
+ stringdef i` '{U+0439}'
17
+ stringdef k '{U+043A}'
18
+ stringdef l '{U+043B}'
19
+ stringdef m '{U+043C}'
20
+ stringdef n '{U+043D}'
21
+ stringdef o '{U+043E}'
22
+ stringdef p '{U+043F}'
23
+ stringdef r '{U+0440}'
24
+ stringdef s '{U+0441}'
25
+ stringdef t '{U+0442}'
26
+ stringdef u '{U+0443}'
27
+ stringdef f '{U+0444}'
28
+ stringdef kh '{U+0445}'
29
+ stringdef ts '{U+0446}'
30
+ stringdef ch '{U+0447}'
31
+ stringdef sh '{U+0448}'
32
+ stringdef shch '{U+0449}'
33
+ stringdef " '{U+044A}'
34
+ stringdef y '{U+044B}'
35
+ stringdef ' '{U+044C}'
36
+ stringdef e` '{U+044D}'
37
+ stringdef iu '{U+044E}'
38
+ stringdef ia '{U+044F}'
39
+
40
+ routines ( mark_regions R2
41
+ perfective_gerund
42
+ adjective
43
+ adjectival
44
+ reflexive
45
+ verb
46
+ noun
47
+ derivational
48
+ tidy_up
49
+ )
50
+
51
+ externals ( stem )
52
+
53
+ integers ( pV p2 )
54
+
55
+ groupings ( v )
56
+
57
+ define v '{a}{e}{i}{o}{u}{y}{e`}{iu}{ia}'
58
+
59
+ define mark_regions as ( // compute the marks pV and p2
60
+
61
+ $pV = limit
62
+ $p2 = limit
63
+ do ( // if a step below fails, the marks not yet set keep the default 'limit'
64
+ gopast v setmark pV gopast non-v // pV = position after the first vowel; then continue past the next non-vowel
65
+ gopast v gopast non-v setmark p2 // p2 = position after the next vowel-then-non-vowel step
66
+ )
67
+ )
68
+
69
+ backwardmode (
70
+
71
+ define R2 as $p2 <= cursor // succeeds when the cursor is at or beyond mark p2
72
+
73
+ define perfective_gerund as ( // delete one of the listed endings (two groups with different conditions)
74
+ [substring] among (
75
+ '{v}'
76
+ '{v}{sh}{i}'
77
+ '{v}{sh}{i}{s}{'}'
78
+ ('{a}' or '{ia}' delete) // first group: deleted only when preceded by {a} or {ia}
79
+ '{i}{v}'
80
+ '{i}{v}{sh}{i}'
81
+ '{i}{v}{sh}{i}{s}{'}'
82
+ '{y}{v}'
83
+ '{y}{v}{sh}{i}'
84
+ '{y}{v}{sh}{i}{s}{'}'
85
+ (delete) // second group: deleted unconditionally
86
+ )
87
+ )
88
+
89
+ define adjective as ( // delete one of the listed endings
90
+ [substring] among (
91
+ '{e}{e}' '{i}{e}' '{y}{e}' '{o}{e}' '{i}{m}{i}' '{y}{m}{i}'
92
+ '{e}{i`}' '{i}{i`}' '{y}{i`}' '{o}{i`}' '{e}{m}' '{i}{m}'
93
+ '{y}{m}' '{o}{m}' '{e}{g}{o}' '{o}{g}{o}' '{e}{m}{u}'
94
+ '{o}{m}{u}' '{i}{kh}' '{y}{kh}' '{u}{iu}' '{iu}{iu}' '{a}{ia}'
95
+ '{ia}{ia}'
96
+ // and -
97
+ '{o}{iu}' // - which is somewhat archaic
98
+ '{e}{iu}' // - soft form of {o}{iu}
99
+ (delete)
100
+ )
101
+ )
102
+
103
+ define adjectival as ( // an adjective ending, optionally followed by removal of a participle ending
104
+ adjective // must succeed, otherwise adjectival fails
105
+
106
+ /* of the participle forms, em, vsh, ivsh, yvsh are readily removable.
107
+ nn, {iu}shch, shch, u{iu}shch can be removed, with a small proportion of
108
+ errors. Removing im, uem, enn creates too many errors.
109
+ */
110
+
111
+ try ( // optional: failure here leaves the adjective removal in place
112
+ [substring] among (
113
+ '{e}{m}' // present passive participle
114
+ '{n}{n}' // adjective from past passive participle
115
+ '{v}{sh}' // past active participle
116
+ '{iu}{shch}' '{shch}' // present active participle
117
+ ('{a}' or '{ia}' delete) // first group: deleted only when preceded by {a} or {ia}
118
+
119
+ //but not '{i}{m}' '{u}{e}{m}' // present passive participle
120
+ //or '{e}{n}{n}' // adjective from past passive participle
121
+
122
+ '{i}{v}{sh}' '{y}{v}{sh}'// past active participle
123
+ '{u}{iu}{shch}' // present active participle
124
+ (delete) // second group: deleted unconditionally
125
+ )
126
+ )
127
+
128
+ )
129
+
130
+ define reflexive as ( // delete a final {s}{ia} or {s}{'}
131
+ [substring] among (
132
+ '{s}{ia}'
133
+ '{s}{'}'
134
+ (delete)
135
+ )
136
+ )
137
+
138
+ define verb as ( // delete one of the listed endings (two groups with different conditions)
139
+ [substring] among (
140
+ '{l}{a}' '{n}{a}' '{e}{t}{e}' '{i`}{t}{e}' '{l}{i}' '{i`}'
141
+ '{l}' '{e}{m}' '{n}' '{l}{o}' '{n}{o}' '{e}{t}' '{iu}{t}'
142
+ '{n}{y}' '{t}{'}' '{e}{sh}{'}'
143
+
144
+ '{n}{n}{o}'
145
+ ('{a}' or '{ia}' delete) // first group: deleted only when preceded by {a} or {ia}
146
+
147
+ '{i}{l}{a}' '{y}{l}{a}' '{e}{n}{a}' '{e}{i`}{t}{e}'
148
+ '{u}{i`}{t}{e}' '{i}{t}{e}' '{i}{l}{i}' '{y}{l}{i}' '{e}{i`}'
149
+ '{u}{i`}' '{i}{l}' '{y}{l}' '{i}{m}' '{y}{m}' '{e}{n}'
150
+ '{i}{l}{o}' '{y}{l}{o}' '{e}{n}{o}' '{ia}{t}' '{u}{e}{t}'
151
+ '{u}{iu}{t}' '{i}{t}' '{y}{t}' '{e}{n}{y}' '{i}{t}{'}'
152
+ '{y}{t}{'}' '{i}{sh}{'}' '{u}{iu}' '{iu}'
153
+ (delete) // second group: deleted unconditionally
154
+ /* note the short passive participle tests:
155
+ '{n}{a}' '{n}' '{n}{o}' '{n}{y}'
156
+ '{e}{n}{a}' '{e}{n}' '{e}{n}{o}' '{e}{n}{y}'
157
+ */
158
+ )
159
+ )
160
+
161
+ define noun as ( // delete one of the listed endings
162
+ [substring] among (
163
+ '{a}' '{e}{v}' '{o}{v}' '{i}{e}' '{'}{e}' '{e}'
164
+ '{i}{ia}{m}{i}' '{ia}{m}{i}' '{a}{m}{i}' '{e}{i}' '{i}{i}'
165
+ '{i}' '{i}{e}{i`}' '{e}{i`}' '{o}{i`}' '{i}{i`}' '{i`}'
166
+ '{i}{ia}{m}' '{ia}{m}' '{i}{e}{m}' '{e}{m}' '{a}{m}' '{o}{m}'
167
+ '{o}' '{u}' '{a}{kh}' '{i}{ia}{kh}' '{ia}{kh}' '{y}' '{'}'
168
+ '{i}{iu}' '{'}{iu}' '{iu}' '{i}{ia}' '{'}{ia}' '{ia}'
169
+ (delete)
170
+ /* the small class of neuter forms '{e}{n}{i}' '{e}{n}{e}{m}'
171
+ '{e}{n}{a}' '{e}{n}' '{e}{n}{a}{m}' '{e}{n}{a}{m}{i}' '{e}{n}{a}{kh}'
172
+ omitted - they only occur on 12 words.
173
+ */
174
+ )
175
+ )
176
+
177
+ define derivational as ( // delete a final {o}{s}{t} or {o}{s}{t}{'}
178
+ [substring] R2 among ( // the matched ending must lie in R2
179
+ '{o}{s}{t}'
180
+ '{o}{s}{t}{'}'
181
+ (delete)
182
+ )
183
+ )
184
+
185
+ define tidy_up as ( // final clean-up of the word ending
186
+ [substring] among (
187
+
188
+ '{e}{i`}{sh}'
189
+ '{e}{i`}{sh}{e}' // superlative forms
190
+ (delete
191
+ ['{n}'] '{n}' delete // then a final {n} is deleted when another {n} precedes it
192
+ )
193
+ '{n}'
194
+ ('{n}' delete) // e.g. -nno endings
195
+ '{'}'
196
+ (delete) // with some slight false conflations
197
+ )
198
+ )
199
+ )
200
+
201
+ define stem as ( // external entry point: runs the routines of this file in order
202
+
203
+ // Normalise {e"} to {e}. The documentation has long suggested the user
204
+ // should do this before calling the stemmer - we now do it for them.
205
+ do repeat ( goto (['{e"}']) <- '{e}' )
206
+
207
+ do mark_regions
208
+ backwards setlimit tomark pV for ( // work from the end of the word, with the limit moved to pV
209
+ do (
210
+ perfective_gerund or // if this succeeds, the alternative below is skipped
211
+ ( try reflexive
212
+ adjectival or verb or noun // the first of the three that succeeds is used
213
+ )
214
+ )
215
+ try([ '{i}' ] delete)
216
+ // because noun ending -i{iu} is being treated as verb ending -{iu}
217
+
218
+ do derivational
219
+ do tidy_up
220
+ )
221
+ )