mittens 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Gemfile +7 -0
- data/LICENSE.txt +30 -0
- data/README.md +62 -0
- data/Rakefile +21 -0
- data/ext/mittens/ext.c +96 -0
- data/ext/mittens/extconf.rb +12 -0
- data/lib/mittens/version.rb +3 -0
- data/lib/mittens.rb +7 -0
- data/mittens.gemspec +22 -0
- data/vendor/snowball/.gitignore +26 -0
- data/vendor/snowball/.travis.yml +112 -0
- data/vendor/snowball/AUTHORS +27 -0
- data/vendor/snowball/CONTRIBUTING.rst +216 -0
- data/vendor/snowball/COPYING +29 -0
- data/vendor/snowball/GNUmakefile +742 -0
- data/vendor/snowball/NEWS +754 -0
- data/vendor/snowball/README.rst +37 -0
- data/vendor/snowball/ada/README.md +74 -0
- data/vendor/snowball/ada/generate/generate.adb +83 -0
- data/vendor/snowball/ada/generate.gpr +21 -0
- data/vendor/snowball/ada/src/stemmer.adb +620 -0
- data/vendor/snowball/ada/src/stemmer.ads +219 -0
- data/vendor/snowball/ada/src/stemwords.adb +70 -0
- data/vendor/snowball/ada/stemmer_config.gpr +83 -0
- data/vendor/snowball/ada/stemwords.gpr +21 -0
- data/vendor/snowball/algorithms/arabic.sbl +558 -0
- data/vendor/snowball/algorithms/armenian.sbl +301 -0
- data/vendor/snowball/algorithms/basque.sbl +149 -0
- data/vendor/snowball/algorithms/catalan.sbl +202 -0
- data/vendor/snowball/algorithms/danish.sbl +93 -0
- data/vendor/snowball/algorithms/dutch.sbl +164 -0
- data/vendor/snowball/algorithms/english.sbl +229 -0
- data/vendor/snowball/algorithms/finnish.sbl +197 -0
- data/vendor/snowball/algorithms/french.sbl +254 -0
- data/vendor/snowball/algorithms/german.sbl +139 -0
- data/vendor/snowball/algorithms/german2.sbl +145 -0
- data/vendor/snowball/algorithms/greek.sbl +701 -0
- data/vendor/snowball/algorithms/hindi.sbl +323 -0
- data/vendor/snowball/algorithms/hungarian.sbl +241 -0
- data/vendor/snowball/algorithms/indonesian.sbl +192 -0
- data/vendor/snowball/algorithms/irish.sbl +149 -0
- data/vendor/snowball/algorithms/italian.sbl +202 -0
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
- data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
- data/vendor/snowball/algorithms/lovins.sbl +208 -0
- data/vendor/snowball/algorithms/nepali.sbl +92 -0
- data/vendor/snowball/algorithms/norwegian.sbl +80 -0
- data/vendor/snowball/algorithms/porter.sbl +139 -0
- data/vendor/snowball/algorithms/portuguese.sbl +218 -0
- data/vendor/snowball/algorithms/romanian.sbl +236 -0
- data/vendor/snowball/algorithms/russian.sbl +221 -0
- data/vendor/snowball/algorithms/serbian.sbl +2379 -0
- data/vendor/snowball/algorithms/spanish.sbl +230 -0
- data/vendor/snowball/algorithms/swedish.sbl +72 -0
- data/vendor/snowball/algorithms/tamil.sbl +405 -0
- data/vendor/snowball/algorithms/turkish.sbl +470 -0
- data/vendor/snowball/algorithms/yiddish.sbl +460 -0
- data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
- data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
- data/vendor/snowball/charsets/cp850.sbl +130 -0
- data/vendor/snowball/compiler/analyser.c +1547 -0
- data/vendor/snowball/compiler/driver.c +615 -0
- data/vendor/snowball/compiler/generator.c +1748 -0
- data/vendor/snowball/compiler/generator_ada.c +1702 -0
- data/vendor/snowball/compiler/generator_csharp.c +1322 -0
- data/vendor/snowball/compiler/generator_go.c +1278 -0
- data/vendor/snowball/compiler/generator_java.c +1313 -0
- data/vendor/snowball/compiler/generator_js.c +1316 -0
- data/vendor/snowball/compiler/generator_pascal.c +1387 -0
- data/vendor/snowball/compiler/generator_python.c +1337 -0
- data/vendor/snowball/compiler/generator_rust.c +1295 -0
- data/vendor/snowball/compiler/header.h +418 -0
- data/vendor/snowball/compiler/space.c +286 -0
- data/vendor/snowball/compiler/syswords.h +86 -0
- data/vendor/snowball/compiler/syswords2.h +13 -0
- data/vendor/snowball/compiler/tokeniser.c +567 -0
- data/vendor/snowball/csharp/.gitignore +8 -0
- data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
- data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
- data/vendor/snowball/csharp/Stemwords/App.config +6 -0
- data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
- data/vendor/snowball/doc/TODO +12 -0
- data/vendor/snowball/doc/libstemmer_c_README +148 -0
- data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
- data/vendor/snowball/doc/libstemmer_java_README +67 -0
- data/vendor/snowball/doc/libstemmer_js_README +48 -0
- data/vendor/snowball/doc/libstemmer_python_README +113 -0
- data/vendor/snowball/examples/stemwords.c +204 -0
- data/vendor/snowball/go/README.md +55 -0
- data/vendor/snowball/go/among.go +16 -0
- data/vendor/snowball/go/env.go +403 -0
- data/vendor/snowball/go/stemwords/generate.go +68 -0
- data/vendor/snowball/go/stemwords/main.go +68 -0
- data/vendor/snowball/go/util.go +34 -0
- data/vendor/snowball/iconv.py +50 -0
- data/vendor/snowball/include/libstemmer.h +78 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
- data/vendor/snowball/javascript/base-stemmer.js +294 -0
- data/vendor/snowball/javascript/stemwords.js +106 -0
- data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
- data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
- data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
- data/vendor/snowball/libstemmer/modules.txt +63 -0
- data/vendor/snowball/libstemmer/test.c +34 -0
- data/vendor/snowball/pascal/.gitignore +4 -0
- data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
- data/vendor/snowball/pascal/generate.pl +23 -0
- data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
- data/vendor/snowball/python/MANIFEST.in +7 -0
- data/vendor/snowball/python/create_init.py +54 -0
- data/vendor/snowball/python/setup.cfg +6 -0
- data/vendor/snowball/python/setup.py +81 -0
- data/vendor/snowball/python/snowballstemmer/among.py +13 -0
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
- data/vendor/snowball/python/stemwords.py +101 -0
- data/vendor/snowball/python/testapp.py +28 -0
- data/vendor/snowball/runtime/api.c +58 -0
- data/vendor/snowball/runtime/api.h +32 -0
- data/vendor/snowball/runtime/header.h +61 -0
- data/vendor/snowball/runtime/utilities.c +513 -0
- data/vendor/snowball/rust/Cargo.toml +7 -0
- data/vendor/snowball/rust/build.rs +55 -0
- data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
- data/vendor/snowball/rust/src/main.rs +102 -0
- data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
- data/vendor/snowball/rust/src/snowball/among.rs +6 -0
- data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
- data/vendor/snowball/tests/stemtest.c +95 -0
- metadata +178 -0
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
routines (
|
|
2
|
+
prelude postlude mark_regions
|
|
3
|
+
RV R1 R2
|
|
4
|
+
standard_suffix
|
|
5
|
+
verb_suffix
|
|
6
|
+
residual_suffix
|
|
7
|
+
residual_form
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
externals ( stem )
|
|
11
|
+
|
|
12
|
+
integers ( pV p1 p2 )
|
|
13
|
+
|
|
14
|
+
groupings ( v )
|
|
15
|
+
|
|
16
|
+
stringescapes {}
|
|
17
|
+
|
|
18
|
+
/* special characters */
|
|
19
|
+
|
|
20
|
+
stringdef a' '{U+00E1}' // a-acute
|
|
21
|
+
stringdef a^ '{U+00E2}' // a-circumflex, e.g. 'bota^nico'
|
|
22
|
+
stringdef e' '{U+00E9}' // e-acute
|
|
23
|
+
stringdef e^ '{U+00EA}' // e-circumflex
|
|
24
|
+
stringdef i' '{U+00ED}' // i-acute
|
|
25
|
+
stringdef o^ '{U+00F4}' // o-circumflex
|
|
26
|
+
stringdef o' '{U+00F3}' // o-acute
|
|
27
|
+
stringdef u' '{U+00FA}' // u-acute
|
|
28
|
+
stringdef c, '{U+00E7}' // c-cedilla
|
|
29
|
+
|
|
30
|
+
stringdef a~ '{U+00E3}' // a-tilde
|
|
31
|
+
stringdef o~ '{U+00F5}' // o-tilde
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}' // vowel grouping: plain vowels plus the acute and circumflex forms defined above
|
|
35
|
+
|
|
36
|
+
define prelude as repeat ( // keep applying the among below until it fails
|
|
37
|
+
[substring] among( // longest-match lookup at the cursor
|
|
38
|
+
'{a~}' (<- 'a~') // a-tilde is rewritten as the two characters 'a' '~'
|
|
39
|
+
'{o~}' (<- 'o~') // o-tilde is rewritten as the two characters 'o' '~'
|
|
40
|
+
'' (next) // anything else: advance one character
|
|
41
|
+
) //or next
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
define mark_regions as (
|
|
45
|
+
|
|
46
|
+
$pV = limit
|
|
47
|
+
$p1 = limit
|
|
48
|
+
$p2 = limit // defaults
|
|
49
|
+
|
|
50
|
+
do (
|
|
51
|
+
( v (non-v gopast v) or (v gopast non-v) )
|
|
52
|
+
or
|
|
53
|
+
( non-v (non-v gopast v) or (v next) )
|
|
54
|
+
setmark pV
|
|
55
|
+
)
|
|
56
|
+
do (
|
|
57
|
+
gopast v gopast non-v setmark p1
|
|
58
|
+
gopast v gopast non-v setmark p2
|
|
59
|
+
)
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
define postlude as repeat ( // keep applying the among below until it fails
|
|
63
|
+
[substring] among( // longest-match lookup at the cursor
|
|
64
|
+
'a~' (<- '{a~}') // the two characters 'a' '~' are turned back into a-tilde
|
|
65
|
+
'o~' (<- '{o~}') // the two characters 'o' '~' are turned back into o-tilde
|
|
66
|
+
'' (next) // anything else: advance one character
|
|
67
|
+
) //or next
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
backwardmode (
|
|
71
|
+
|
|
72
|
+
define RV as $pV <= cursor // succeeds only when mark pV is at or before the cursor
|
|
73
|
+
define R1 as $p1 <= cursor // succeeds only when mark p1 is at or before the cursor
|
|
74
|
+
define R2 as $p2 <= cursor // succeeds only when mark p2 is at or before the cursor
|
|
75
|
+
|
|
76
|
+
define standard_suffix as (
|
|
77
|
+
[substring] among(
|
|
78
|
+
|
|
79
|
+
'eza' 'ezas'
|
|
80
|
+
'ico' 'ica' 'icos' 'icas'
|
|
81
|
+
'ismo' 'ismos'
|
|
82
|
+
'{a'}vel'
|
|
83
|
+
'{i'}vel'
|
|
84
|
+
'ista' 'istas'
|
|
85
|
+
'oso' 'osa' 'osos' 'osas'
|
|
86
|
+
'amento' 'amentos'
|
|
87
|
+
'imento' 'imentos'
|
|
88
|
+
|
|
89
|
+
'adora' 'ador' 'a{c,}a~o'
|
|
90
|
+
'adoras' 'adores' 'a{c,}o~es' // no -ic test
|
|
91
|
+
'ante' 'antes' '{a^}ncia' // Note 1
|
|
92
|
+
(
|
|
93
|
+
R2 delete
|
|
94
|
+
)
|
|
95
|
+
'logia'
|
|
96
|
+
'logias'
|
|
97
|
+
(
|
|
98
|
+
R2 <- 'log'
|
|
99
|
+
)
|
|
100
|
+
'u{c,}a~o' 'u{c,}o~es'
|
|
101
|
+
(
|
|
102
|
+
R2 <- 'u'
|
|
103
|
+
)
|
|
104
|
+
'{e^}ncia' '{e^}ncias'
|
|
105
|
+
(
|
|
106
|
+
R2 <- 'ente'
|
|
107
|
+
)
|
|
108
|
+
'amente'
|
|
109
|
+
(
|
|
110
|
+
R1 delete
|
|
111
|
+
try (
|
|
112
|
+
[substring] R2 delete among(
|
|
113
|
+
'iv' (['at'] R2 delete)
|
|
114
|
+
'os'
|
|
115
|
+
'ic'
|
|
116
|
+
'ad'
|
|
117
|
+
)
|
|
118
|
+
)
|
|
119
|
+
)
|
|
120
|
+
'mente'
|
|
121
|
+
(
|
|
122
|
+
R2 delete
|
|
123
|
+
try (
|
|
124
|
+
[substring] among(
|
|
125
|
+
'ante' // Note 1
|
|
126
|
+
'avel'
|
|
127
|
+
'{i'}vel' (R2 delete)
|
|
128
|
+
)
|
|
129
|
+
)
|
|
130
|
+
)
|
|
131
|
+
'idade'
|
|
132
|
+
'idades'
|
|
133
|
+
(
|
|
134
|
+
R2 delete
|
|
135
|
+
try (
|
|
136
|
+
[substring] among(
|
|
137
|
+
'abil'
|
|
138
|
+
'ic'
|
|
139
|
+
'iv' (R2 delete)
|
|
140
|
+
)
|
|
141
|
+
)
|
|
142
|
+
)
|
|
143
|
+
'iva' 'ivo'
|
|
144
|
+
'ivas' 'ivos'
|
|
145
|
+
(
|
|
146
|
+
R2 delete
|
|
147
|
+
try (
|
|
148
|
+
['at'] R2 delete // but not a further ['ic'] R2 delete
|
|
149
|
+
)
|
|
150
|
+
)
|
|
151
|
+
'ira' 'iras'
|
|
152
|
+
(
|
|
153
|
+
RV 'e' // -eira -eiras usually non-verbal
|
|
154
|
+
<- 'ir'
|
|
155
|
+
)
|
|
156
|
+
)
|
|
157
|
+
)
|
|
158
|
+
|
|
159
|
+
define verb_suffix as setlimit tomark pV for (
|
|
160
|
+
[substring] among(
|
|
161
|
+
'ada' 'ida' 'ia' 'aria' 'eria' 'iria' 'ar{a'}' 'ara' 'er{a'}'
|
|
162
|
+
'era' 'ir{a'}' 'ava' 'asse' 'esse' 'isse' 'aste' 'este' 'iste'
|
|
163
|
+
'ei' 'arei' 'erei' 'irei' 'am' 'iam' 'ariam' 'eriam' 'iriam'
|
|
164
|
+
'aram' 'eram' 'iram' 'avam' 'em' 'arem' 'erem' 'irem' 'assem'
|
|
165
|
+
'essem' 'issem' 'ado' 'ido' 'ando' 'endo' 'indo' 'ara~o'
|
|
166
|
+
'era~o' 'ira~o' 'ar' 'er' 'ir' 'as' 'adas' 'idas' 'ias'
|
|
167
|
+
'arias' 'erias' 'irias' 'ar{a'}s' 'aras' 'er{a'}s' 'eras'
|
|
168
|
+
'ir{a'}s' 'avas' 'es' 'ardes' 'erdes' 'irdes' 'ares' 'eres'
|
|
169
|
+
'ires' 'asses' 'esses' 'isses' 'astes' 'estes' 'istes' 'is'
|
|
170
|
+
'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis'
|
|
171
|
+
'{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis'
|
|
172
|
+
'{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos'
|
|
173
|
+
'{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos'
|
|
174
|
+
'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos'
|
|
175
|
+
'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos'
|
|
176
|
+
'{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou'
|
|
177
|
+
|
|
178
|
+
'ira' 'iras'
|
|
179
|
+
(delete)
|
|
180
|
+
)
|
|
181
|
+
)
|
|
182
|
+
|
|
183
|
+
define residual_suffix as (
|
|
184
|
+
[substring] among(
|
|
185
|
+
'os'
|
|
186
|
+
'a' 'i' 'o' '{a'}' '{i'}' '{o'}'
|
|
187
|
+
( RV delete )
|
|
188
|
+
)
|
|
189
|
+
)
|
|
190
|
+
|
|
191
|
+
define residual_form as (
|
|
192
|
+
[substring] among(
|
|
193
|
+
'e' '{e'}' '{e^}'
|
|
194
|
+
( RV delete [('u'] test 'g') or
|
|
195
|
+
('i'] test 'c') RV delete )
|
|
196
|
+
'{c,}' (<-'c')
|
|
197
|
+
)
|
|
198
|
+
)
|
|
199
|
+
)
|
|
200
|
+
|
|
201
|
+
define stem as (
|
|
202
|
+
do prelude
|
|
203
|
+
do mark_regions
|
|
204
|
+
backwards (
|
|
205
|
+
do (
|
|
206
|
+
( ( standard_suffix or verb_suffix )
|
|
207
|
+
and do ( ['i'] test 'c' RV delete )
|
|
208
|
+
)
|
|
209
|
+
or residual_suffix
|
|
210
|
+
)
|
|
211
|
+
do residual_form
|
|
212
|
+
)
|
|
213
|
+
do postlude
|
|
214
|
+
)
|
|
215
|
+
|
|
216
|
+
/*
|
|
217
|
+
Note 1: additions of 15 Jun 2005
|
|
218
|
+
*/
|
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
|
|
2
|
+
routines (
|
|
3
|
+
prelude postlude mark_regions
|
|
4
|
+
RV R1 R2
|
|
5
|
+
step_0
|
|
6
|
+
standard_suffix combo_suffix
|
|
7
|
+
verb_suffix
|
|
8
|
+
vowel_suffix
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
externals ( stem )
|
|
12
|
+
|
|
13
|
+
integers ( pV p1 p2 )
|
|
14
|
+
|
|
15
|
+
groupings ( v )
|
|
16
|
+
|
|
17
|
+
booleans ( standard_suffix_removed )
|
|
18
|
+
|
|
19
|
+
stringescapes {}
|
|
20
|
+
|
|
21
|
+
/* special characters */
|
|
22
|
+
|
|
23
|
+
stringdef a^ '{U+00E2}' // a circumflex
|
|
24
|
+
stringdef i^ '{U+00EE}' // i circumflex
|
|
25
|
+
stringdef a+ '{U+0103}' // a breve
|
|
26
|
+
stringdef s, '{U+015F}' // s cedilla
|
|
27
|
+
stringdef t, '{U+0163}' // t cedilla
|
|
28
|
+
|
|
29
|
+
define v 'aeiou{a^}{i^}{a+}' // vowel grouping: plain vowels plus a-circumflex, i-circumflex and a-breve
|
|
30
|
+
|
|
31
|
+
define prelude as (
|
|
32
|
+
repeat goto (
|
|
33
|
+
v [ ('u' ] v <- 'U') or
|
|
34
|
+
('i' ] v <- 'I')
|
|
35
|
+
)
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
define mark_regions as (
|
|
39
|
+
|
|
40
|
+
$pV = limit
|
|
41
|
+
$p1 = limit
|
|
42
|
+
$p2 = limit // defaults
|
|
43
|
+
|
|
44
|
+
do (
|
|
45
|
+
( v (non-v gopast v) or (v gopast non-v) )
|
|
46
|
+
or
|
|
47
|
+
( non-v (non-v gopast v) or (v next) )
|
|
48
|
+
setmark pV
|
|
49
|
+
)
|
|
50
|
+
do (
|
|
51
|
+
gopast v gopast non-v setmark p1
|
|
52
|
+
gopast v gopast non-v setmark p2
|
|
53
|
+
)
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
define postlude as repeat (
|
|
57
|
+
|
|
58
|
+
[substring] among(
|
|
59
|
+
'I' (<- 'i')
|
|
60
|
+
'U' (<- 'u')
|
|
61
|
+
'' (next)
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
backwardmode (
|
|
67
|
+
|
|
68
|
+
define RV as $pV <= cursor // succeeds only when mark pV is at or before the cursor
|
|
69
|
+
define R1 as $p1 <= cursor // succeeds only when mark p1 is at or before the cursor
|
|
70
|
+
define R2 as $p2 <= cursor // succeeds only when mark p2 is at or before the cursor
|
|
71
|
+
|
|
72
|
+
define step_0 as (
|
|
73
|
+
[substring] R1 among(
|
|
74
|
+
'ul' 'ului'
|
|
75
|
+
( delete )
|
|
76
|
+
'aua'
|
|
77
|
+
( <-'a' )
|
|
78
|
+
'ea' 'ele' 'elor'
|
|
79
|
+
( <-'e' )
|
|
80
|
+
'ii' 'iua' 'iei' 'iile' 'iilor' 'ilor'
|
|
81
|
+
( <-'i')
|
|
82
|
+
'ile'
|
|
83
|
+
( not 'ab' <- 'i' )
|
|
84
|
+
'atei'
|
|
85
|
+
( <- 'at' )
|
|
86
|
+
'a{t,}ie' 'a{t,}ia'
|
|
87
|
+
( <- 'a{t,}i' )
|
|
88
|
+
)
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
define combo_suffix as test (
|
|
92
|
+
[substring] R1 (
|
|
93
|
+
among(
|
|
94
|
+
/* 'IST'. alternative: include the following
|
|
95
|
+
'alism' 'alisme'
|
|
96
|
+
'alist' 'alista' 'aliste' 'alisti' 'alist{a+}' 'ali{s,}ti' (
|
|
97
|
+
<- 'al'
|
|
98
|
+
)
|
|
99
|
+
*/
|
|
100
|
+
'abilitate' 'abilitati' 'abilit{a+}i' 'abilit{a+}{t,}i' (
|
|
101
|
+
<- 'abil'
|
|
102
|
+
)
|
|
103
|
+
'ibilitate' (
|
|
104
|
+
<- 'ibil'
|
|
105
|
+
)
|
|
106
|
+
'ivitate' 'ivitati' 'ivit{a+}i' 'ivit{a+}{t,}i' (
|
|
107
|
+
<- 'iv'
|
|
108
|
+
)
|
|
109
|
+
'icitate' 'icitati' 'icit{a+}i' 'icit{a+}{t,}i'
|
|
110
|
+
'icator' 'icatori'
|
|
111
|
+
'iciv' 'iciva' 'icive' 'icivi' 'iciv{a+}'
|
|
112
|
+
'ical' 'icala' 'icale' 'icali' 'ical{a+}' (
|
|
113
|
+
<- 'ic'
|
|
114
|
+
)
|
|
115
|
+
'ativ' 'ativa' 'ative' 'ativi' 'ativ{a+}' 'a{t,}iune'
|
|
116
|
+
'atoare' 'ator' 'atori'
|
|
117
|
+
'{a+}toare' '{a+}tor' '{a+}tori' (
|
|
118
|
+
<- 'at'
|
|
119
|
+
)
|
|
120
|
+
'itiv' 'itiva' 'itive' 'itivi' 'itiv{a+}' 'i{t,}iune'
|
|
121
|
+
'itoare' 'itor' 'itori' (
|
|
122
|
+
<- 'it'
|
|
123
|
+
)
|
|
124
|
+
)
|
|
125
|
+
set standard_suffix_removed
|
|
126
|
+
)
|
|
127
|
+
)
|
|
128
|
+
|
|
129
|
+
define standard_suffix as (
|
|
130
|
+
unset standard_suffix_removed
|
|
131
|
+
repeat combo_suffix
|
|
132
|
+
[substring] R2 (
|
|
133
|
+
among(
|
|
134
|
+
|
|
135
|
+
// past participle is treated here, rather than
|
|
136
|
+
// as a verb ending:
|
|
137
|
+
'at' 'ata' 'at{a+}' 'ati' 'ate'
|
|
138
|
+
'ut' 'uta' 'ut{a+}' 'uti' 'ute'
|
|
139
|
+
'it' 'ita' 'it{a+}' 'iti' 'ite'
|
|
140
|
+
|
|
141
|
+
'ic' 'ica' 'ice' 'ici' 'ic{a+}'
|
|
142
|
+
'abil' 'abila' 'abile' 'abili' 'abil{a+}'
|
|
143
|
+
'ibil' 'ibila' 'ibile' 'ibili' 'ibil{a+}'
|
|
144
|
+
'oasa' 'oas{a+}' 'oase' 'os' 'osi' 'o{s,}i'
|
|
145
|
+
'ant' 'anta' 'ante' 'anti' 'ant{a+}'
|
|
146
|
+
'ator' 'atori'
|
|
147
|
+
'itate' 'itati' 'it{a+}i' 'it{a+}{t,}i'
|
|
148
|
+
'iv' 'iva' 'ive' 'ivi' 'iv{a+}' (
|
|
149
|
+
delete
|
|
150
|
+
)
|
|
151
|
+
'iune' 'iuni' (
|
|
152
|
+
'{t,}'] <- 't'
|
|
153
|
+
)
|
|
154
|
+
'ism' 'isme'
|
|
155
|
+
'ist' 'ista' 'iste' 'isti' 'ist{a+}' 'i{s,}ti' (
|
|
156
|
+
<- 'ist'
|
|
157
|
+
/* 'IST'. alternative: remove with <- '' */
|
|
158
|
+
)
|
|
159
|
+
)
|
|
160
|
+
set standard_suffix_removed
|
|
161
|
+
)
|
|
162
|
+
)
|
|
163
|
+
|
|
164
|
+
define verb_suffix as setlimit tomark pV for (
|
|
165
|
+
[substring] among(
|
|
166
|
+
// 'long' infinitive:
|
|
167
|
+
'are' 'ere' 'ire' '{a^}re'
|
|
168
|
+
|
|
169
|
+
// gerund:
|
|
170
|
+
'ind' '{a^}nd'
|
|
171
|
+
'indu' '{a^}ndu'
|
|
172
|
+
|
|
173
|
+
'eze'
|
|
174
|
+
'easc{a+}'
|
|
175
|
+
// present:
|
|
176
|
+
'ez' 'ezi' 'eaz{a+}' 'esc' 'e{s,}ti'
|
|
177
|
+
'e{s,}te'
|
|
178
|
+
'{a+}sc' '{a+}{s,}ti'
|
|
179
|
+
'{a+}{s,}te'
|
|
180
|
+
|
|
181
|
+
// imperfect:
|
|
182
|
+
'am' 'ai' 'au'
|
|
183
|
+
'eam' 'eai' 'ea' 'ea{t,}i' 'eau'
|
|
184
|
+
'iam' 'iai' 'ia' 'ia{t,}i' 'iau'
|
|
185
|
+
|
|
186
|
+
// past: // (not 'ii')
|
|
187
|
+
'ui'
|
|
188
|
+
'a{s,}i' 'ar{a+}m' 'ar{a+}{t,}i' 'ar{a+}'
|
|
189
|
+
'u{s,}i' 'ur{a+}m' 'ur{a+}{t,}i' 'ur{a+}'
|
|
190
|
+
'i{s,}i' 'ir{a+}m' 'ir{a+}{t,}i' 'ir{a+}'
|
|
191
|
+
'{a^}i' '{a^}{s,}i' '{a^}r{a+}m' '{a^}r{a+}{t,}i' '{a^}r{a+}'
|
|
192
|
+
|
|
193
|
+
// pluperfect:
|
|
194
|
+
'asem' 'ase{s,}i' 'ase' 'aser{a+}m' 'aser{a+}{t,}i' 'aser{a+}'
|
|
195
|
+
'isem' 'ise{s,}i' 'ise' 'iser{a+}m' 'iser{a+}{t,}i' 'iser{a+}'
|
|
196
|
+
'{a^}sem' '{a^}se{s,}i' '{a^}se' '{a^}ser{a+}m' '{a^}ser{a+}{t,}i'
|
|
197
|
+
'{a^}ser{a+}'
|
|
198
|
+
'usem' 'use{s,}i' 'use' 'user{a+}m' 'user{a+}{t,}i' 'user{a+}'
|
|
199
|
+
|
|
200
|
+
( non-v or 'u' delete )
|
|
201
|
+
|
|
202
|
+
// present:
|
|
203
|
+
'{a+}m' 'a{t,}i'
|
|
204
|
+
'em' 'e{t,}i'
|
|
205
|
+
'im' 'i{t,}i'
|
|
206
|
+
'{a^}m' '{a^}{t,}i'
|
|
207
|
+
|
|
208
|
+
// past:
|
|
209
|
+
'se{s,}i' 'ser{a+}m' 'ser{a+}{t,}i' 'ser{a+}'
|
|
210
|
+
'sei' 'se'
|
|
211
|
+
|
|
212
|
+
// pluperfect:
|
|
213
|
+
'sesem' 'sese{s,}i' 'sese' 'seser{a+}m' 'seser{a+}{t,}i' 'seser{a+}'
|
|
214
|
+
(delete)
|
|
215
|
+
)
|
|
216
|
+
)
|
|
217
|
+
|
|
218
|
+
define vowel_suffix as (
|
|
219
|
+
[substring] RV among (
|
|
220
|
+
'a' 'e' 'i' 'ie' '{a+}' ( delete )
|
|
221
|
+
)
|
|
222
|
+
)
|
|
223
|
+
)
|
|
224
|
+
|
|
225
|
+
define stem as (
|
|
226
|
+
do prelude
|
|
227
|
+
do mark_regions
|
|
228
|
+
backwards (
|
|
229
|
+
do step_0
|
|
230
|
+
do standard_suffix
|
|
231
|
+
do ( standard_suffix_removed or verb_suffix )
|
|
232
|
+
do vowel_suffix
|
|
233
|
+
)
|
|
234
|
+
do postlude
|
|
235
|
+
)
|
|
236
|
+
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
stringescapes {}
|
|
2
|
+
|
|
3
|
+
/* the 33 Cyrillic letters represented in ASCII characters following the
|
|
4
|
+
* conventions of the standard Library of Congress transliteration: */
|
|
5
|
+
|
|
6
|
+
stringdef a '{U+0430}'
|
|
7
|
+
stringdef b '{U+0431}'
|
|
8
|
+
stringdef v '{U+0432}'
|
|
9
|
+
stringdef g '{U+0433}'
|
|
10
|
+
stringdef d '{U+0434}'
|
|
11
|
+
stringdef e '{U+0435}'
|
|
12
|
+
stringdef e" '{U+0451}'
|
|
13
|
+
stringdef zh '{U+0436}'
|
|
14
|
+
stringdef z '{U+0437}'
|
|
15
|
+
stringdef i '{U+0438}'
|
|
16
|
+
stringdef i` '{U+0439}'
|
|
17
|
+
stringdef k '{U+043A}'
|
|
18
|
+
stringdef l '{U+043B}'
|
|
19
|
+
stringdef m '{U+043C}'
|
|
20
|
+
stringdef n '{U+043D}'
|
|
21
|
+
stringdef o '{U+043E}'
|
|
22
|
+
stringdef p '{U+043F}'
|
|
23
|
+
stringdef r '{U+0440}'
|
|
24
|
+
stringdef s '{U+0441}'
|
|
25
|
+
stringdef t '{U+0442}'
|
|
26
|
+
stringdef u '{U+0443}'
|
|
27
|
+
stringdef f '{U+0444}'
|
|
28
|
+
stringdef kh '{U+0445}'
|
|
29
|
+
stringdef ts '{U+0446}'
|
|
30
|
+
stringdef ch '{U+0447}'
|
|
31
|
+
stringdef sh '{U+0448}'
|
|
32
|
+
stringdef shch '{U+0449}'
|
|
33
|
+
stringdef " '{U+044A}'
|
|
34
|
+
stringdef y '{U+044B}'
|
|
35
|
+
stringdef ' '{U+044C}'
|
|
36
|
+
stringdef e` '{U+044D}'
|
|
37
|
+
stringdef iu '{U+044E}'
|
|
38
|
+
stringdef ia '{U+044F}'
|
|
39
|
+
|
|
40
|
+
routines ( mark_regions R2
|
|
41
|
+
perfective_gerund
|
|
42
|
+
adjective
|
|
43
|
+
adjectival
|
|
44
|
+
reflexive
|
|
45
|
+
verb
|
|
46
|
+
noun
|
|
47
|
+
derivational
|
|
48
|
+
tidy_up
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
externals ( stem )
|
|
52
|
+
|
|
53
|
+
integers ( pV p2 )
|
|
54
|
+
|
|
55
|
+
groupings ( v )
|
|
56
|
+
|
|
57
|
+
define v '{a}{e}{i}{o}{u}{y}{e`}{iu}{ia}' // vowel grouping: the nine Cyrillic letters named here, via the stringdefs above
|
|
58
|
+
|
|
59
|
+
define mark_regions as (
|
|
60
|
+
|
|
61
|
+
$pV = limit
|
|
62
|
+
$p2 = limit
|
|
63
|
+
do (
|
|
64
|
+
gopast v setmark pV gopast non-v
|
|
65
|
+
gopast v gopast non-v setmark p2
|
|
66
|
+
)
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
backwardmode (
|
|
70
|
+
|
|
71
|
+
define R2 as $p2 <= cursor // succeeds only when mark p2 is at or before the cursor
|
|
72
|
+
|
|
73
|
+
define perfective_gerund as (
|
|
74
|
+
[substring] among (
|
|
75
|
+
'{v}'
|
|
76
|
+
'{v}{sh}{i}'
|
|
77
|
+
'{v}{sh}{i}{s}{'}'
|
|
78
|
+
('{a}' or '{ia}' delete)
|
|
79
|
+
'{i}{v}'
|
|
80
|
+
'{i}{v}{sh}{i}'
|
|
81
|
+
'{i}{v}{sh}{i}{s}{'}'
|
|
82
|
+
'{y}{v}'
|
|
83
|
+
'{y}{v}{sh}{i}'
|
|
84
|
+
'{y}{v}{sh}{i}{s}{'}'
|
|
85
|
+
(delete)
|
|
86
|
+
)
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
define adjective as (
|
|
90
|
+
[substring] among (
|
|
91
|
+
'{e}{e}' '{i}{e}' '{y}{e}' '{o}{e}' '{i}{m}{i}' '{y}{m}{i}'
|
|
92
|
+
'{e}{i`}' '{i}{i`}' '{y}{i`}' '{o}{i`}' '{e}{m}' '{i}{m}'
|
|
93
|
+
'{y}{m}' '{o}{m}' '{e}{g}{o}' '{o}{g}{o}' '{e}{m}{u}'
|
|
94
|
+
'{o}{m}{u}' '{i}{kh}' '{y}{kh}' '{u}{iu}' '{iu}{iu}' '{a}{ia}'
|
|
95
|
+
'{ia}{ia}'
|
|
96
|
+
// and -
|
|
97
|
+
'{o}{iu}' // - which is somewhat archaic
|
|
98
|
+
'{e}{iu}' // - soft form of {o}{iu}
|
|
99
|
+
(delete)
|
|
100
|
+
)
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
define adjectival as (
|
|
104
|
+
adjective
|
|
105
|
+
|
|
106
|
+
/* of the participle forms, em, vsh, ivsh, yvsh are readily removable.
|
|
107
|
+
nn, {iu}shch, shch, u{iu}shch can be removed, with a small proportion of
|
|
108
|
+
errors. Removing im, uem, enn creates too many errors.
|
|
109
|
+
*/
|
|
110
|
+
|
|
111
|
+
try (
|
|
112
|
+
[substring] among (
|
|
113
|
+
'{e}{m}' // present passive participle
|
|
114
|
+
'{n}{n}' // adjective from past passive participle
|
|
115
|
+
'{v}{sh}' // past active participle
|
|
116
|
+
'{iu}{shch}' '{shch}' // present active participle
|
|
117
|
+
('{a}' or '{ia}' delete)
|
|
118
|
+
|
|
119
|
+
//but not '{i}{m}' '{u}{e}{m}' // present passive participle
|
|
120
|
+
//or '{e}{n}{n}' // adjective from past passive participle
|
|
121
|
+
|
|
122
|
+
'{i}{v}{sh}' '{y}{v}{sh}'// past active participle
|
|
123
|
+
'{u}{iu}{shch}' // present active participle
|
|
124
|
+
(delete)
|
|
125
|
+
)
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
define reflexive as (
|
|
131
|
+
[substring] among (
|
|
132
|
+
'{s}{ia}'
|
|
133
|
+
'{s}{'}'
|
|
134
|
+
(delete)
|
|
135
|
+
)
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
define verb as (
|
|
139
|
+
[substring] among (
|
|
140
|
+
'{l}{a}' '{n}{a}' '{e}{t}{e}' '{i`}{t}{e}' '{l}{i}' '{i`}'
|
|
141
|
+
'{l}' '{e}{m}' '{n}' '{l}{o}' '{n}{o}' '{e}{t}' '{iu}{t}'
|
|
142
|
+
'{n}{y}' '{t}{'}' '{e}{sh}{'}'
|
|
143
|
+
|
|
144
|
+
'{n}{n}{o}'
|
|
145
|
+
('{a}' or '{ia}' delete)
|
|
146
|
+
|
|
147
|
+
'{i}{l}{a}' '{y}{l}{a}' '{e}{n}{a}' '{e}{i`}{t}{e}'
|
|
148
|
+
'{u}{i`}{t}{e}' '{i}{t}{e}' '{i}{l}{i}' '{y}{l}{i}' '{e}{i`}'
|
|
149
|
+
'{u}{i`}' '{i}{l}' '{y}{l}' '{i}{m}' '{y}{m}' '{e}{n}'
|
|
150
|
+
'{i}{l}{o}' '{y}{l}{o}' '{e}{n}{o}' '{ia}{t}' '{u}{e}{t}'
|
|
151
|
+
'{u}{iu}{t}' '{i}{t}' '{y}{t}' '{e}{n}{y}' '{i}{t}{'}'
|
|
152
|
+
'{y}{t}{'}' '{i}{sh}{'}' '{u}{iu}' '{iu}'
|
|
153
|
+
(delete)
|
|
154
|
+
/* note the short passive participle tests:
|
|
155
|
+
'{n}{a}' '{n}' '{n}{o}' '{n}{y}'
|
|
156
|
+
'{e}{n}{a}' '{e}{n}' '{e}{n}{o}' '{e}{n}{y}'
|
|
157
|
+
*/
|
|
158
|
+
)
|
|
159
|
+
)
|
|
160
|
+
|
|
161
|
+
define noun as (
|
|
162
|
+
[substring] among (
|
|
163
|
+
'{a}' '{e}{v}' '{o}{v}' '{i}{e}' '{'}{e}' '{e}'
|
|
164
|
+
'{i}{ia}{m}{i}' '{ia}{m}{i}' '{a}{m}{i}' '{e}{i}' '{i}{i}'
|
|
165
|
+
'{i}' '{i}{e}{i`}' '{e}{i`}' '{o}{i`}' '{i}{i`}' '{i`}'
|
|
166
|
+
'{i}{ia}{m}' '{ia}{m}' '{i}{e}{m}' '{e}{m}' '{a}{m}' '{o}{m}'
|
|
167
|
+
'{o}' '{u}' '{a}{kh}' '{i}{ia}{kh}' '{ia}{kh}' '{y}' '{'}'
|
|
168
|
+
'{i}{iu}' '{'}{iu}' '{iu}' '{i}{ia}' '{'}{ia}' '{ia}'
|
|
169
|
+
(delete)
|
|
170
|
+
/* the small class of neuter forms '{e}{n}{i}' '{e}{n}{e}{m}'
|
|
171
|
+
'{e}{n}{a}' '{e}{n}' '{e}{n}{a}{m}' '{e}{n}{a}{m}{i}' '{e}{n}{a}{kh}'
|
|
172
|
+
omitted - they only occur on 12 words.
|
|
173
|
+
*/
|
|
174
|
+
)
|
|
175
|
+
)
|
|
176
|
+
|
|
177
|
+
define derivational as (
|
|
178
|
+
[substring] R2 among (
|
|
179
|
+
'{o}{s}{t}'
|
|
180
|
+
'{o}{s}{t}{'}'
|
|
181
|
+
(delete)
|
|
182
|
+
)
|
|
183
|
+
)
|
|
184
|
+
|
|
185
|
+
define tidy_up as (
|
|
186
|
+
[substring] among (
|
|
187
|
+
|
|
188
|
+
'{e}{i`}{sh}'
|
|
189
|
+
'{e}{i`}{sh}{e}' // superlative forms
|
|
190
|
+
(delete
|
|
191
|
+
['{n}'] '{n}' delete
|
|
192
|
+
)
|
|
193
|
+
'{n}'
|
|
194
|
+
('{n}' delete) // e.g. -nno endings
|
|
195
|
+
'{'}'
|
|
196
|
+
(delete) // with some slight false conflations
|
|
197
|
+
)
|
|
198
|
+
)
|
|
199
|
+
)
|
|
200
|
+
|
|
201
|
+
define stem as (
|
|
202
|
+
|
|
203
|
+
// Normalise {e"} to {e}. The documentation has long suggested the user
|
|
204
|
+
// should do this before calling the stemmer - we now do it for them.
|
|
205
|
+
do repeat ( goto (['{e"}']) <- '{e}' )
|
|
206
|
+
|
|
207
|
+
do mark_regions
|
|
208
|
+
backwards setlimit tomark pV for (
|
|
209
|
+
do (
|
|
210
|
+
perfective_gerund or
|
|
211
|
+
( try reflexive
|
|
212
|
+
adjectival or verb or noun
|
|
213
|
+
)
|
|
214
|
+
)
|
|
215
|
+
try([ '{i}' ] delete)
|
|
216
|
+
// because noun ending -i{iu} is being treated as verb ending -{iu}
|
|
217
|
+
|
|
218
|
+
do derivational
|
|
219
|
+
do tidy_up
|
|
220
|
+
)
|
|
221
|
+
)
|