mittens 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Gemfile +7 -0
- data/LICENSE.txt +30 -0
- data/README.md +62 -0
- data/Rakefile +21 -0
- data/ext/mittens/ext.c +96 -0
- data/ext/mittens/extconf.rb +12 -0
- data/lib/mittens/version.rb +3 -0
- data/lib/mittens.rb +7 -0
- data/mittens.gemspec +22 -0
- data/vendor/snowball/.gitignore +26 -0
- data/vendor/snowball/.travis.yml +112 -0
- data/vendor/snowball/AUTHORS +27 -0
- data/vendor/snowball/CONTRIBUTING.rst +216 -0
- data/vendor/snowball/COPYING +29 -0
- data/vendor/snowball/GNUmakefile +742 -0
- data/vendor/snowball/NEWS +754 -0
- data/vendor/snowball/README.rst +37 -0
- data/vendor/snowball/ada/README.md +74 -0
- data/vendor/snowball/ada/generate/generate.adb +83 -0
- data/vendor/snowball/ada/generate.gpr +21 -0
- data/vendor/snowball/ada/src/stemmer.adb +620 -0
- data/vendor/snowball/ada/src/stemmer.ads +219 -0
- data/vendor/snowball/ada/src/stemwords.adb +70 -0
- data/vendor/snowball/ada/stemmer_config.gpr +83 -0
- data/vendor/snowball/ada/stemwords.gpr +21 -0
- data/vendor/snowball/algorithms/arabic.sbl +558 -0
- data/vendor/snowball/algorithms/armenian.sbl +301 -0
- data/vendor/snowball/algorithms/basque.sbl +149 -0
- data/vendor/snowball/algorithms/catalan.sbl +202 -0
- data/vendor/snowball/algorithms/danish.sbl +93 -0
- data/vendor/snowball/algorithms/dutch.sbl +164 -0
- data/vendor/snowball/algorithms/english.sbl +229 -0
- data/vendor/snowball/algorithms/finnish.sbl +197 -0
- data/vendor/snowball/algorithms/french.sbl +254 -0
- data/vendor/snowball/algorithms/german.sbl +139 -0
- data/vendor/snowball/algorithms/german2.sbl +145 -0
- data/vendor/snowball/algorithms/greek.sbl +701 -0
- data/vendor/snowball/algorithms/hindi.sbl +323 -0
- data/vendor/snowball/algorithms/hungarian.sbl +241 -0
- data/vendor/snowball/algorithms/indonesian.sbl +192 -0
- data/vendor/snowball/algorithms/irish.sbl +149 -0
- data/vendor/snowball/algorithms/italian.sbl +202 -0
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
- data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
- data/vendor/snowball/algorithms/lovins.sbl +208 -0
- data/vendor/snowball/algorithms/nepali.sbl +92 -0
- data/vendor/snowball/algorithms/norwegian.sbl +80 -0
- data/vendor/snowball/algorithms/porter.sbl +139 -0
- data/vendor/snowball/algorithms/portuguese.sbl +218 -0
- data/vendor/snowball/algorithms/romanian.sbl +236 -0
- data/vendor/snowball/algorithms/russian.sbl +221 -0
- data/vendor/snowball/algorithms/serbian.sbl +2379 -0
- data/vendor/snowball/algorithms/spanish.sbl +230 -0
- data/vendor/snowball/algorithms/swedish.sbl +72 -0
- data/vendor/snowball/algorithms/tamil.sbl +405 -0
- data/vendor/snowball/algorithms/turkish.sbl +470 -0
- data/vendor/snowball/algorithms/yiddish.sbl +460 -0
- data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
- data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
- data/vendor/snowball/charsets/cp850.sbl +130 -0
- data/vendor/snowball/compiler/analyser.c +1547 -0
- data/vendor/snowball/compiler/driver.c +615 -0
- data/vendor/snowball/compiler/generator.c +1748 -0
- data/vendor/snowball/compiler/generator_ada.c +1702 -0
- data/vendor/snowball/compiler/generator_csharp.c +1322 -0
- data/vendor/snowball/compiler/generator_go.c +1278 -0
- data/vendor/snowball/compiler/generator_java.c +1313 -0
- data/vendor/snowball/compiler/generator_js.c +1316 -0
- data/vendor/snowball/compiler/generator_pascal.c +1387 -0
- data/vendor/snowball/compiler/generator_python.c +1337 -0
- data/vendor/snowball/compiler/generator_rust.c +1295 -0
- data/vendor/snowball/compiler/header.h +418 -0
- data/vendor/snowball/compiler/space.c +286 -0
- data/vendor/snowball/compiler/syswords.h +86 -0
- data/vendor/snowball/compiler/syswords2.h +13 -0
- data/vendor/snowball/compiler/tokeniser.c +567 -0
- data/vendor/snowball/csharp/.gitignore +8 -0
- data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
- data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
- data/vendor/snowball/csharp/Stemwords/App.config +6 -0
- data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
- data/vendor/snowball/doc/TODO +12 -0
- data/vendor/snowball/doc/libstemmer_c_README +148 -0
- data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
- data/vendor/snowball/doc/libstemmer_java_README +67 -0
- data/vendor/snowball/doc/libstemmer_js_README +48 -0
- data/vendor/snowball/doc/libstemmer_python_README +113 -0
- data/vendor/snowball/examples/stemwords.c +204 -0
- data/vendor/snowball/go/README.md +55 -0
- data/vendor/snowball/go/among.go +16 -0
- data/vendor/snowball/go/env.go +403 -0
- data/vendor/snowball/go/stemwords/generate.go +68 -0
- data/vendor/snowball/go/stemwords/main.go +68 -0
- data/vendor/snowball/go/util.go +34 -0
- data/vendor/snowball/iconv.py +50 -0
- data/vendor/snowball/include/libstemmer.h +78 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
- data/vendor/snowball/javascript/base-stemmer.js +294 -0
- data/vendor/snowball/javascript/stemwords.js +106 -0
- data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
- data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
- data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
- data/vendor/snowball/libstemmer/modules.txt +63 -0
- data/vendor/snowball/libstemmer/test.c +34 -0
- data/vendor/snowball/pascal/.gitignore +4 -0
- data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
- data/vendor/snowball/pascal/generate.pl +23 -0
- data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
- data/vendor/snowball/python/MANIFEST.in +7 -0
- data/vendor/snowball/python/create_init.py +54 -0
- data/vendor/snowball/python/setup.cfg +6 -0
- data/vendor/snowball/python/setup.py +81 -0
- data/vendor/snowball/python/snowballstemmer/among.py +13 -0
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
- data/vendor/snowball/python/stemwords.py +101 -0
- data/vendor/snowball/python/testapp.py +28 -0
- data/vendor/snowball/runtime/api.c +58 -0
- data/vendor/snowball/runtime/api.h +32 -0
- data/vendor/snowball/runtime/header.h +61 -0
- data/vendor/snowball/runtime/utilities.c +513 -0
- data/vendor/snowball/rust/Cargo.toml +7 -0
- data/vendor/snowball/rust/build.rs +55 -0
- data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
- data/vendor/snowball/rust/src/main.rs +102 -0
- data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
- data/vendor/snowball/rust/src/snowball/among.rs +6 -0
- data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
- data/vendor/snowball/tests/stemtest.c +95 -0
- metadata +178 -0
@@ -0,0 +1,208 @@
|
|
1
|
+
|
2
|
+
stringescapes {}
|
3
|
+
|
4
|
+
routines (
|
5
|
+
A B C D E F G H I J K L M N O P Q R S T U V W X Y Z AA BB CC
|
6
|
+
|
7
|
+
endings
|
8
|
+
|
9
|
+
undouble respell
|
10
|
+
)
|
11
|
+
|
12
|
+
externals ( stem )
|
13
|
+
|
14
|
+
backwardmode (
|
15
|
+
|
16
|
+
/* Lovins' conditions A, B ... CC, as given in her Appendix B, where
|
17
|
+
a test for a two letter prefix ('test hop 2') is implicitly
|
18
|
+
assumed. Note that 'e' next 'u' corresponds to her u*e because
|
19
|
+
Snowball is scanning backwards. */
|
20
|
+
|
21
|
+
// Context conditions attached to the entries of 'endings'. Each one runs
// in backward mode with the cursor at the end of the candidate stem.
// 'hop N' fails when fewer than N characters remain, so it acts as a
// minimum stem length; a string test looks at the letters that end the stem.

// A-D: minimum stem length only (2, 3, 4 and 5 characters).
define A  as ( hop 2 )
define B  as ( hop 3 )
define C  as ( hop 4 )
define D  as ( hop 5 )

// E: length >= 2, stem does not end in 'e'.
define E  as ( test hop 2 not 'e' )
// F: length >= 3, stem does not end in 'e'.
define F  as ( test hop 3 not 'e' )
// G: length >= 3, stem ends in 'f'.
define G  as ( test hop 3 'f' )
// H: length >= 2, stem ends in 't' or 'll'.
define H  as ( test hop 2 't' or 'll' )
// I: length >= 2, stem ends in neither 'o' nor 'e'.
define I  as ( test hop 2 not 'o' not 'e' )
// J: length >= 2, stem ends in neither 'a' nor 'e'.
define J  as ( test hop 2 not 'a' not 'e' )
// K: length >= 3, stem ends in 'l', 'i', or 'u' + any letter + 'e'.
define K  as ( test hop 3 'l' or 'i' or ('e' next 'u') )
// L: length >= 2, stem does not end in 'u', 'x', or an 's' that is not
//    preceded by 'o'.
define L  as ( test hop 2 not 'u' not 'x' not ('s' not 'o') )
// M: length >= 2, stem does not end in 'a', 'c', 'e' or 'm'.
define M  as ( test hop 2 not 'a' not 'c' not 'e' not 'm' )
// N: length >= 3; if the third letter from the end is 's' the stem must
//    have at least 4 letters.
define N  as ( test hop 3 ( hop 2 not 's' or hop 2 ) )
// O: length >= 2, stem ends in 'l' or 'i'.
define O  as ( test hop 2 'l' or 'i' )
// P: length >= 2, stem does not end in 'c'.
define P  as ( test hop 2 not 'c' )
// Q: length >= 3, stem ends in neither 'l' nor 'n'.
define Q  as ( test hop 2 test hop 3 not 'l' not 'n' )
// R: length >= 2, stem ends in 'n' or 'r'.
define R  as ( test hop 2 'n' or 'r' )
// S: length >= 2, stem ends in 'dr', or in 't' not preceded by 't'.
define S  as ( test hop 2 'dr' or ('t' not 't') )
// T: length >= 2, stem ends in 's', or in 't' not preceded by 'o'.
define T  as ( test hop 2 's' or ('t' not 'o') )
// U: length >= 2, stem ends in 'l', 'm', 'n' or 'r'.
define U  as ( test hop 2 'l' or 'm' or 'n' or 'r' )
// V: length >= 2, stem ends in 'c'.
define V  as ( test hop 2 'c' )
// W: length >= 2, stem ends in neither 's' nor 'u'.
define W  as ( test hop 2 not 's' not 'u' )
// X: length >= 2, stem ends in 'l', 'i', or 'u' + any letter + 'e'.
define X  as ( test hop 2 'l' or 'i' or ('e' next 'u') )
// Y: length >= 2, stem ends in 'in'.
define Y  as ( test hop 2 'in' )
// Z: length >= 2, stem does not end in 'f'.
define Z  as ( test hop 2 not 'f' )
// AA: length >= 2, stem ends in one of the listed letter groups.
define AA as ( test hop 2 among ( 'd' 'f' 'ph' 'th' 'l' 'er' 'or'
                                  'es' 't' ) )
// BB: length >= 3, stem ends in neither 'met' nor 'ryst'.
define BB as ( test hop 3 not 'met' not 'ryst' )
// CC: length >= 2, stem ends in 'l'.
define CC as ( test hop 2 'l' )
|
51
|
+
|
52
|
+
|
53
|
+
/* The system of endings, as given in Appendix A. */
|
54
|
+
|
55
|
+
// Strip the longest listed ending whose context condition holds.
// Each string is followed by the routine (A ... CC) that must succeed for
// that ending to be accepted; the single action at the bottom, 'delete',
// removes the bracketed ending. Rows are grouped by ending length,
// longest first (11 letters down to 1).
define endings as (
    [substring] among(
        // 11 letters
        'alistically' B   'arizability' A   'izationally' B

        // 10 letters
        'antialness' A   'arisations' A   'arizations' A   'entialness' A

        // 9 letters
        'allically' C   'antaneous' A   'antiality' A   'arisation' A
        'arization' A   'ationally' B   'ativeness' A   'eableness' E
        'entations' A   'entiality' A   'entialize' A   'entiation' A
        'ionalness' A   'istically' A   'itousness' A   'izability' A
        'izational' A

        // 8 letters
        'ableness' A   'arizable' A   'entation' A   'entially' A
        'eousness' A   'ibleness' A   'icalness' A   'ionalism' A
        'ionality' A   'ionalize' A   'iousness' A   'izations' A
        'lessness' A

        // 7 letters
        'ability' A   'aically' A   'alistic' B   'alities' A
        'ariness' E   'aristic' A   'arizing' A   'ateness' A
        'atingly' A   'ational' B   'atively' A   'ativism' A
        'elihood' E   'encible' A   'entally' A   'entials' A
        'entiate' A   'entness' A   'fulness' A   'ibility' A
        'icalism' A   'icalist' A   'icality' A   'icalize' A
        'ication' G   'icianry' A   'ination' A   'ingness' A
        'ionally' A   'isation' A   'ishness' A   'istical' A
        'iteness' A   'iveness' A   'ivistic' A   'ivities' A
        'ization' F   'izement' A   'oidally' A   'ousness' A

        // 6 letters
        'aceous' A   'acious' B   'action' G   'alness' A
        'ancial' A   'ancies' A   'ancing' B   'ariser' A
        'arized' A   'arizer' A   'atable' A   'ations' B
        'atives' A   'eature' Z   'efully' A   'encies' A
        'encing' A   'ential' A   'enting' C   'entist' A
        'eously' A   'ialist' A   'iality' A   'ialize' A
        'ically' A   'icance' A   'icians' A   'icists' A
        'ifully' A   'ionals' A   'ionate' D   'ioning' A
        'ionist' A   'iously' A   'istics' A   'izable' E
        'lessly' A   'nesses' A   'oidism' A

        // 5 letters
        'acies' A   'acity' A   'aging' B   'aical' A
        'alist' A   'alism' B   'ality' A   'alize' A
        'allic' BB  'anced' B   'ances' B   'antic' C
        'arial' A   'aries' A   'arily' A   'arity' B
        'arize' A   'aroid' A   'ately' A   'ating' I
        'ation' B   'ative' A   'ators' A   'atory' A
        'ature' E   'early' Y   'ehood' A   'eless' A
        'elity' A   'ement' A   'enced' A   'ences' A
        'eness' E   'ening' E   'ental' A   'ented' C
        'ently' A   'fully' A   'ially' A   'icant' A
        'ician' A   'icide' A   'icism' A   'icist' A
        'icity' A   'idine' I   'iedly' A   'ihood' A
        'inate' A   'iness' A   'ingly' B   'inism' J
        'inity' CC  'ional' A   'ioned' A   'ished' A
        'istic' A   'ities' A   'itous' A   'ively' A
        'ivity' A   'izers' F   'izing' F   'oidal' A
        'oides' A   'otide' A   'ously' A

        // 4 letters
        'able' A   'ably' A   'ages' B   'ally' B
        'ance' B   'ancy' B   'ants' B   'aric' A
        'arly' K   'ated' I   'ates' A   'atic' B
        'ator' A   'ealy' Y   'edly' E   'eful' A
        'eity' A   'ence' A   'ency' A   'ened' E
        'enly' E   'eous' A   'hood' A   'ials' A
        'ians' A   'ible' A   'ibly' A   'ical' A
        'ides' L   'iers' A   'iful' A   'ines' M
        'ings' N   'ions' B   'ious' A   'isms' B
        'ists' A   'itic' H   'ized' F   'izer' F
        'less' A   'lily' A   'ness' A   'ogen' A
        'ward' A   'wise' A   'ying' B   'yish' A

        // 3 letters
        'acy' A   'age' B   'aic' A   'als' BB
        'ant' B   'ars' O   'ary' F   'ata' A
        'ate' A   'eal' Y   'ear' Y   'ely' E
        'ene' E   'ent' C   'ery' E   'ese' A
        'ful' A   'ial' A   'ian' A   'ics' A
        'ide' L   'ied' A   'ier' A   'ies' P
        'ily' A   'ine' M   'ing' N   'ion' Q
        'ish' C   'ism' B   'ist' A   'ite' AA
        'ity' A   'ium' A   'ive' A   'ize' F
        'oid' A   'one' R   'ous' A

        // 2 letters ({'} is the apostrophe under 'stringescapes {}')
        'ae' A   'al' BB  'ar' X   'as' B
        'ed' E   'en' F   'es' E   'ia' A
        'ic' A   'is' A   'ly' B   'on' S
        'or' T   'um' U   'us' V   'yl' R
        '{'}s' A   's{'}' A

        // 1 letter
        'a' A   'e' A   'i' A   'o' A
        's' W   'y' B

            (delete)
    )
)
|
148
|
+
|
149
|
+
/* Undoubling is rule 1 of appendix C. */
|
150
|
+
|
151
|
+
// If the word now ends in one of the listed doubled consonants, remove
// its final letter. 'test' restores the cursor after the lookup, so
// '[next]' brackets exactly the last character before 'delete'.
define undouble as (
    test substring among (
        'bb' 'dd' 'gg' 'll' 'mm'
        'nn' 'pp' 'rr' 'ss' 'tt'
    )
    [next] delete
)
|
156
|
+
|
157
|
+
/* The other appendix C rules can be done together. */
|
158
|
+
|
159
|
+
// Respelling rules: the longest listed stem ending is bracketed and
// replaced by the string on its right. Where an action starts with
// 'not ...', the replacement only happens when the letter just before the
// matched ending is none of those named.
define respell as (
    [substring] among (
        'iev'    ( <-'ief' )
        'uct'    ( <-'uc' )
        'umpt'   ( <-'um' )
        'rpt'    ( <-'rb' )
        'urs'    ( <-'ur' )
        'istr'   ( <-'ister' )
        'metr'   ( <-'meter' )
        'olv'    ( <-'olut' )
        'ul'     ( not 'a' not 'i' not 'o' <-'l' )
        'bex'    ( <-'bic' )
        'dex'    ( <-'dic' )
        'pex'    ( <-'pic' )
        'tex'    ( <-'tic' )
        'ax'     ( <-'ac' )
        'ex'     ( <-'ec' )
        'ix'     ( <-'ic' )
        'lux'    ( <-'luc' )
        'uad'    ( <-'uas' )
        'vad'    ( <-'vas' )
        'cid'    ( <-'cis' )
        'lid'    ( <-'lis' )
        'erid'   ( <-'eris' )
        'pand'   ( <-'pans' )
        'end'    ( not 's' <-'ens' )
        'ond'    ( <-'ons' )
        'lud'    ( <-'lus' )
        'rud'    ( <-'rus' )
        'her'    ( not 'p' not 't' <-'hes' )
        'mit'    ( <-'mis' )
        'ent'    ( not 'm' <-'ens' )
          /* 'ent' was 'end' in the 1968 paper - a typo. */
        'ert'    ( <-'ers' )
        'et'     ( not 'n' <-'es' )
        'yt'     ( <-'ys' )
        'yz'     ( <-'ys' )
    )
)
|
198
|
+
)
|
199
|
+
|
200
|
+
// External entry point. Works from the end of the word: strip an ending,
// undouble a final consonant pair, then respell. Each step is wrapped in
// 'do', so a step that fails leaves the word as it was and the next step
// still runs.
define stem as (
    backwards (
        do endings
        do undouble
        do respell
    )
)
|
208
|
+
|
@@ -0,0 +1,92 @@
|
|
1
|
+
/*
|
2
|
+
* Authors:
|
3
|
+
* - Ingroj Shrestha <ing.stha@gmail.com>, Nepali NLP Group
|
4
|
+
* - Oleg Bartunov <obartunov@gmail.com>, Postgres Professional Ltd.
|
5
|
+
* - Shreeya Singh Dhakal, Nepali NLP Group
|
6
|
+
*/
|
7
|
+
|
8
|
+
routines (
|
9
|
+
remove_category_1
|
10
|
+
check_category_2
|
11
|
+
remove_category_2
|
12
|
+
remove_category_3
|
13
|
+
)
|
14
|
+
|
15
|
+
stringescapes {}
|
16
|
+
|
17
|
+
// Short names for the Devanagari code points used by the suffix tables.
// Prefixes: ds* = sign, dl* = letter, dvs* = vowel sign.

// signs
stringdef dsc    '{U+0901}'   // DEVANAGARI SIGN CANDRABINDU
stringdef dsa    '{U+0902}'   // DEVANAGARI SIGN ANUSVARA

// independent vowels
stringdef dli    '{U+0907}'   // DEVANAGARI LETTER I
stringdef dlii   '{U+0908}'   // DEVANAGARI LETTER II
stringdef dle    '{U+090F}'   // DEVANAGARI LETTER E

// consonants
stringdef dlka   '{U+0915}'   // DEVANAGARI LETTER KA
stringdef dlkha  '{U+0916}'   // DEVANAGARI LETTER KHA
stringdef dlg    '{U+0917}'   // DEVANAGARI LETTER GA
stringdef dlc    '{U+091B}'   // DEVANAGARI LETTER CHA
stringdef dlta   '{U+0924}'   // DEVANAGARI LETTER TA
stringdef dltha  '{U+0925}'   // DEVANAGARI LETTER THA
stringdef dld    '{U+0926}'   // DEVANAGARI LETTER DA
stringdef dln    '{U+0928}'   // DEVANAGARI LETTER NA
stringdef dlpa   '{U+092A}'   // DEVANAGARI LETTER PA
stringdef dlpha  '{U+092B}'   // DEVANAGARI LETTER PHA
stringdef dlb    '{U+092D}'   // DEVANAGARI LETTER BHA
stringdef dlm    '{U+092E}'   // DEVANAGARI LETTER MA
stringdef dly    '{U+092F}'   // DEVANAGARI LETTER YA
stringdef dlr    '{U+0930}'   // DEVANAGARI LETTER RA
stringdef dll    '{U+0932}'   // DEVANAGARI LETTER LA
stringdef dlv    '{U+0935}'   // DEVANAGARI LETTER VA
stringdef dls    '{U+0938}'   // DEVANAGARI LETTER SA
stringdef dlh    '{U+0939}'   // DEVANAGARI LETTER HA

// dependent vowel signs
stringdef dvsaa  '{U+093E}'   // DEVANAGARI VOWEL SIGN AA
stringdef dvsi   '{U+093F}'   // DEVANAGARI VOWEL SIGN I
stringdef dvsii  '{U+0940}'   // DEVANAGARI VOWEL SIGN II
stringdef dvsu   '{U+0941}'   // DEVANAGARI VOWEL SIGN U
stringdef dvsuu  '{U+0942}'   // DEVANAGARI VOWEL SIGN UU
stringdef dvse   '{U+0947}'   // DEVANAGARI VOWEL SIGN E
stringdef dvsai  '{U+0948}'   // DEVANAGARI VOWEL SIGN AI
stringdef dvso   '{U+094B}'   // DEVANAGARI VOWEL SIGN O
stringdef dvsau  '{U+094C}'   // DEVANAGARI VOWEL SIGN AU

// virama
stringdef dsv    '{U+094D}'   // DEVANAGARI SIGN VIRAMA
|
50
|
+
|
51
|
+
externals ( stem )
|
52
|
+
backwardmode (
|
53
|
+
// Category 1: match the longest listed suffix at the end of the word.
//  - Suffixes in the first group are always deleted.
//  - The KA + vowel-sign suffixes are left in place when the character
//    just before them is LETTER E or VOWEL SIGN E (the empty '()' action);
//    otherwise they are deleted.
define remove_category_1 as(
    [substring] among (
        '{dlm}{dvsaa}{dlr}{dsv}{dlpha}{dlta}'
        '{dld}{dsv}{dlv}{dvsaa}{dlr}{dvsaa}'
        '{dls}{dsc}{dlg}{dvsai}'
        '{dls}{dsa}{dlg}'
        '{dls}{dsc}{dlg}'
        '{dll}{dvsaa}{dli}'
        '{dll}{dvsaa}{dlii}'
        '{dlpa}{dlc}{dvsi}'
        '{dll}{dvse}'
        '{dlr}{dlta}'
        '{dlm}{dvsai}'
        '{dlm}{dvsaa}'
            (delete)

        '{dlka}{dvso}'
        '{dlka}{dvsaa}'
        '{dlka}{dvsi}'
        '{dlka}{dvsii}'
        '{dlka}{dvsai}'
            (('{dle}' or '{dvse}' ()) or delete)
    )
)
|
62
|
+
|
63
|
+
// Succeeds when the text before the cursor ends in CANDRABINDU, ANUSVARA
// or VOWEL SIGN AI, bracketing the matched sign. Nothing is modified here.
define check_category_2 as(
    [substring] among(
        '{dsc}'
        '{dsa}'
        '{dvsai}'
    )
)
|
68
|
+
|
69
|
+
// Category 2: delete a trailing sign, but only in a specific context.
//  - CANDRABINDU / ANUSVARA is deleted when preceded by YA+AU, CHA+AU,
//    NA+AU or THA+E.
//  - VOWEL SIGN AI is deleted when preceded by TA + VIRAMA + RA.
// If the required context is absent the routine fails and nothing changes.
define remove_category_2 as (
    [substring] among(
        '{dsc}' '{dsa}'
            ('{dly}{dvsau}' or '{dlc}{dvsau}' or '{dln}{dvsau}' or '{dltha}{dvse}' delete)

        '{dvsai}'
            ('{dlta}{dsv}{dlr}' delete)
    )
)
|
75
|
+
|
76
|
+
// Category 3: delete the longest listed suffix found at the end of the
// word. All entries share the single 'delete' action at the bottom. The
// strings are kept in their original order, several per line.
define remove_category_3 as(
    [substring] among(
        '{dltha}{dvsi}{dli}{dls}{dsv}' '{dlh}{dvsu}{dln}{dvse}{dlc}' '{dlh}{dvsu}{dln}{dsv}{dlc}'
        '{dln}{dvse}{dlc}{dls}{dsv}' '{dln}{dvse}{dlc}{dln}{dsv}'

        '{dli}{dle}{dlka}{dvsii}' '{dli}{dle}{dlka}{dvsaa}' '{dli}{dle}{dlka}{dvso}'
        '{dvsi}{dle}{dlka}{dvsii}' '{dvsi}{dle}{dlka}{dvsaa}' '{dvsi}{dle}{dlka}{dvso}'
        '{dli}{dlc}{dln}{dsv}' '{dvsi}{dlc}{dln}{dsv}' '{dli}{dlc}{dls}{dsv}' '{dvsi}{dlc}{dls}{dsv}'
        '{dle}{dlc}{dln}{dsv}' '{dvse}{dlc}{dln}{dsv}' '{dle}{dlc}{dls}{dsv}' '{dvse}{dlc}{dls}{dsv}'
        '{dlc}{dvsi}{dln}{dsv}' '{dlc}{dvse}{dls}{dsv}' '{dlc}{dsv}{dly}{dvsau}'
        '{dltha}{dvsi}{dln}{dsv}' '{dltha}{dvsi}{dly}{dvso}' '{dltha}{dvsi}{dly}{dvsau}'
        '{dltha}{dvsi}{dls}{dsv}' '{dltha}{dsv}{dly}{dvso}' '{dltha}{dsv}{dly}{dvsau}'
        '{dld}{dvsi}{dly}{dvso}' '{dld}{dvse}{dlkha}{dvsi}' '{dld}{dvse}{dlkha}{dvsii}'
        '{dll}{dvsaa}{dln}{dsv}' '{dlm}{dvsaa}{dltha}{dvsi}'
        '{dln}{dvse}{dlka}{dvsai}' '{dln}{dvse}{dlka}{dvsaa}' '{dln}{dvse}{dlka}{dvso}'
        '{dln}{dvse}{dlc}{dvsau}' '{dlh}{dvso}{dls}{dsv}'
        '{dli}{dln}{dsv}{dlc}' '{dvsi}{dln}{dsv}{dlc}' '{dln}{dvse}{dlc}{dvsu}'

        '{dli}{dlc}{dvsau}' '{dvsi}{dlc}{dvsau}' '{dli}{dls}{dsv}' '{dvsi}{dls}{dsv}'
        '{dvsi}{dly}{dvso}' '{dli}{dly}{dvso}'
        '{dle}{dlka}{dvsaa}' '{dvse}{dlka}{dvsaa}' '{dle}{dlka}{dvsii}' '{dvse}{dlka}{dvsii}'
        '{dle}{dlka}{dvsai}' '{dvse}{dlka}{dvsai}' '{dle}{dlka}{dvso}' '{dvse}{dlka}{dvso}'
        '{dle}{dlc}{dvsu}' '{dvse}{dlc}{dvsu}' '{dle}{dlc}{dvsau}' '{dvse}{dlc}{dvsau}'
        '{dlc}{dln}{dsv}' '{dlc}{dls}{dsv}' '{dltha}{dvsi}{dle}' '{dlpa}{dlr}{dsv}'
        '{dlb}{dly}{dvso}' '{dlh}{dlr}{dvsu}' '{dlh}{dlr}{dvsuu}'
        '{dvsi}{dld}{dvsaa}' '{dli}{dld}{dvsaa}' '{dvsi}{dld}{dvso}' '{dli}{dld}{dvso}'
        '{dvsi}{dld}{dvsai}' '{dli}{dld}{dvsai}' '{dln}{dvse}{dlc}'

        '{dli}{dlc}' '{dvsi}{dlc}' '{dle}{dlc}' '{dvse}{dlc}'
        '{dlc}{dvsu}' '{dlc}{dvse}' '{dlc}{dvsau}'
        '{dltha}{dvsii}' '{dltha}{dvse}'
        '{dld}{dvsaa}' '{dld}{dvsii}' '{dld}{dvsai}' '{dld}{dvso}'
        '{dln}{dvsu}' '{dln}{dvse}'
        '{dly}{dvso}' '{dly}{dvsau}'

        '{dlc}'
            (delete)
    )
)
|
82
|
+
|
83
|
+
)
|
84
|
+
|
85
|
+
// External entry point. Working backwards from the end of the word:
//   1. try one category-1 removal;
//   2. then repeatedly: optionally remove a category-2 sign (only when
//      check_category_2 and remove_category_2 both succeed from the same
//      position), followed by a category-3 removal. The loop stops the
//      first time remove_category_3 fails.
// Both steps are wrapped in 'do', so a failing step leaves the word alone.
define stem as (
    backwards (
        do remove_category_1
        do (
            repeat (
                do (check_category_2 and remove_category_2)
                remove_category_3
            )
        )
    )
)
|
@@ -0,0 +1,80 @@
|
|
1
|
+
routines (
|
2
|
+
mark_regions
|
3
|
+
main_suffix
|
4
|
+
consonant_pair
|
5
|
+
other_suffix
|
6
|
+
)
|
7
|
+
|
8
|
+
externals ( stem )
|
9
|
+
|
10
|
+
integers ( p1 x )
|
11
|
+
|
12
|
+
groupings ( v s_ending )
|
13
|
+
|
14
|
+
stringescapes {}
|
15
|
+
|
16
|
+
/* special characters */
|
17
|
+
|
18
|
+
// Named escapes for the three non-ASCII letters used in the grouping 'v'.
stringdef ae   '{U+00E6}'   // LATIN SMALL LETTER AE
stringdef ao   '{U+00E5}'   // LATIN SMALL LETTER A WITH RING ABOVE
stringdef o/   '{U+00F8}'   // LATIN SMALL LETTER O WITH STROKE
|
21
|
+
|
22
|
+
// Vowel grouping: a e i o u y plus the letters named by the {ae}, {ao}
// and {o/} string definitions.
define v 'aeiouy{ae}{ao}{o/}'
|
23
|
+
|
24
|
+
// Letters that, when they directly precede a final 's', let main_suffix
// delete that 's'.
define s_ending 'bcdfghjlmnoprtvyz'
|
25
|
+
|
26
|
+
// Compute p1, the start of the region in which suffixes may be removed.
//  - p1 defaults to the end of the word (limit).
//  - x is the position three characters in; if the word is shorter than
//    that, 'hop 3' fails and the routine stops with p1 still at limit.
//  - p1 becomes the position just after the first non-vowel that follows
//    a vowel.
//  - p1 is then raised to x if it fell before x.
define mark_regions as (
    $p1 = limit

    test ( hop 3 setmark x )

    goto v
    gopast non-v
    setmark p1

    try ( $p1 < x $p1 = x )
)
|
34
|
+
|
35
|
+
backwardmode (
|
36
|
+
|
37
|
+
// Remove or rewrite the main inflectional suffix. The longest listed
// suffix is looked up with the limit temporarily set to p1, so only
// suffixes lying entirely after p1 match.
//  - first group: deleted unconditionally;
//  - 's': deleted when preceded by an s_ending letter, or by 'k' which is
//    itself preceded by a non-vowel;
//  - 'erte' / 'ert': replaced by 'er'.
define main_suffix as (
    setlimit tomark p1 for ([substring])
    among(
        'a'      'e'      'ede'    'ande'   'ende'   'ane'    'ene'
        'hetene' 'en'     'heten'  'ar'     'er'     'heter'  'as'
        'es'     'edes'   'endes'  'enes'   'hetenes' 'ens'   'hetens'
        'ers'    'ets'    'et'     'het'    'ast'
            (delete)

        's'
            (s_ending or ('k' non-v) delete)

        'erte' 'ert'
            (<-'er')
    )
)
|
51
|
+
|
52
|
+
// If the word ends in 'dt' or 'vt' (matched with the limit set to p1),
// delete its final character. The lookup sits inside 'test', so the
// cursor returns to the end of the word afterwards; the '[' placed by
// '[substring]' is still in force, and 'next]' closes the slice one
// character further back.
define consonant_pair as (
    test (
        setlimit tomark p1 for ([substring])
        among( 'dt' 'vt' )
    )
    next] delete
)
|
61
|
+
|
62
|
+
// Delete the longest listed derivational suffix, provided it lies
// entirely after p1 (the limit is set to p1 during the lookup).
define other_suffix as (
    setlimit tomark p1 for ([substring])
    among(
        'leg'  'eleg'
        'ig'   'eig'
        'lig'  'elig'
        'els'
        'lov'  'elov'  'slov'  'hetslov'
            (delete)
    )
)
|
70
|
+
)
|
71
|
+
|
72
|
+
// External entry point: compute p1 going forwards, then apply the three
// suffix routines from the end of the word. Every step is wrapped in
// 'do', so a failing step is skipped without undoing earlier ones.
define stem as (
    do mark_regions

    backwards (
        do main_suffix
        do consonant_pair
        do other_suffix
    )
)
|
@@ -0,0 +1,139 @@
|
|
1
|
+
integers ( p1 p2 )
|
2
|
+
booleans ( Y_found )
|
3
|
+
|
4
|
+
routines (
|
5
|
+
shortv
|
6
|
+
R1 R2
|
7
|
+
Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5a Step_5b
|
8
|
+
)
|
9
|
+
|
10
|
+
externals ( stem )
|
11
|
+
|
12
|
+
groupings ( v v_WXY )
|
13
|
+
|
14
|
+
// Vowel grouping. A lower-case 'y' counts as a vowel; 'stem' rewrites
// some 'y's to upper-case 'Y', which is not in this grouping.
define v 'aeiouy'
|
15
|
+
// The vowels plus 'w', 'x' and 'Y'; used by shortv.
define v_WXY v + 'wxY'
|
16
|
+
|
17
|
+
backwardmode (
|
18
|
+
|
19
|
+
// Scanning backwards from the cursor: a character outside v_WXY, then a
// vowel, then a non-vowel. In reading order that is non-vowel, vowel,
// non-vowel, where the last letter is not 'w', 'x' or 'Y'.
define shortv as ( non-v_WXY v non-v )
|
20
|
+
|
21
|
+
// Succeeds when the cursor is at or after the mark p1.
define R1 as $p1 <= cursor
|
22
|
+
// Succeeds when the cursor is at or after the mark p2.
define R2 as $p2 <= cursor
|
23
|
+
|
24
|
+
// Step 1a: plural endings. The longest match decides the action:
// 'sses' -> 'ss', 'ies' -> 'i', 'ss' is left unchanged, a lone 's' is
// deleted.
define Step_1a as (
    [substring] among (
        'sses'  ( <-'ss' )
        'ies'   ( <-'i' )
        'ss'    ( )
        's'     ( delete )
    )
)
|
32
|
+
|
33
|
+
// Step 1b: 'eed', 'ed' and 'ing'.
//  - 'eed' becomes 'ee' when the suffix starts at or after p1 (R1).
//  - 'ed' / 'ing' are deleted when a vowel occurs somewhere before them;
//    the new ending is then tidied:
//      'at' 'bl' 'iz'       -> insert 'e'
//      a listed double      -> drop its last letter
//      otherwise ('')       -> insert 'e' when the cursor is at p1 and
//                              the word ends in a shortv pattern
define Step_1b as (
    [substring] among (
        'eed'
            ( R1 <-'ee' )

        'ed' 'ing'
            (
                test gopast v delete
                test substring among(
                    'at' 'bl' 'iz'
                        ( <+ 'e' )

                    'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt'
                    // ignoring double c, h, j, k, q, v, w, and x
                        ( [next] delete )

                    ''
                        ( atmark p1 test shortv <+ 'e' )
                )
            )
    )
)
|
50
|
+
|
51
|
+
// Step 1c: a final 'y' or 'Y' becomes 'i' when a vowel occurs somewhere
// earlier in the word.
define Step_1c as (
    [ 'y' or 'Y' ]
    gopast v
    <-'i'
)
|
56
|
+
|
57
|
+
// Step 2: map double suffixes to single ones. The longest listed suffix
// is bracketed first; R1 must then hold (the suffix starts at or after
// p1) before the replacement is made.
define Step_2 as (
    [substring] R1 among (
        'tional'                    ( <-'tion' )
        'enci'                      ( <-'ence' )
        'anci'                      ( <-'ance' )
        'abli'                      ( <-'able' )
        'entli'                     ( <-'ent' )
        'eli'                       ( <-'e' )
        'izer' 'ization'            ( <-'ize' )
        'ational' 'ation' 'ator'    ( <-'ate' )
        'alli'                      ( <-'al' )
        'alism' 'aliti'             ( <-'al' )
        'fulness'                   ( <-'ful' )
        'ousli' 'ousness'           ( <-'ous' )
        'iveness' 'iviti'           ( <-'ive' )
        'biliti'                    ( <-'ble' )
    )
)
|
80
|
+
|
81
|
+
// Step 3: further suffix simplification, again only when the matched
// suffix starts at or after p1 (R1).
define Step_3 as (
    [substring] R1 among (
        'alize'                   ( <-'al' )
        'icate' 'iciti' 'ical'    ( <-'ic' )
        'ative' 'ful' 'ness'      ( delete )
    )
)
|
90
|
+
|
91
|
+
// Step 4: delete a residual suffix when it starts at or after p2 (R2).
// 'ion' is deleted only when the letter before it is 's' or 't'.
define Step_4 as (
    [substring] R2 among (
        'al'   'ance' 'ence' 'er'  'ic'  'able'
        'ible' 'ant'  'ement' 'ment' 'ent' 'ou'
        'ism'  'ate'  'iti'  'ous' 'ive' 'ize'
            ( delete )

        'ion'
            ( 's' or 't' delete )
    )
)
|
99
|
+
|
100
|
+
// Step 5a: delete a final 'e' when it lies at or after p2 (R2), or when
// it lies at or after p1 (R1) and is not preceded by a shortv pattern.
define Step_5a as (
    [ 'e' ]
    R2 or ( R1 not shortv )
    delete
)
|
105
|
+
|
106
|
+
// Step 5b: delete a final 'l' when it lies at or after p2 (R2) and the
// letter before it is also 'l'.
define Step_5b as (
    [ 'l' ]
    R2 'l'
    delete
)
|
111
|
+
)
|
112
|
+
|
113
|
+
// External entry point.
//  1. Rewrite 'y' to 'Y' when it is the first letter or follows a vowel,
//     recording in Y_found that this happened. 'Y' is outside the vowel
//     grouping v.
//  2. Set p1 and p2: both default to the end of the word; p1 is the
//     position after the first non-vowel that follows a vowel, and p2 is
//     the same again, starting from p1.
//  3. Run steps 1a-5b backwards from the end of the word; each is wrapped
//     in 'do', so a failing step is simply skipped.
//  4. If any 'Y' was introduced, turn every 'Y' back into 'y'.
define stem as (

    unset Y_found
    do ( ['y'] <-'Y' set Y_found )
    do repeat ( goto (v ['y']) <-'Y' set Y_found )

    $p1 = limit
    $p2 = limit
    do (
        gopast v  gopast non-v  setmark p1
        gopast v  gopast non-v  setmark p2
    )

    backwards (
        do Step_1a
        do Step_1b
        do Step_1c
        do Step_2
        do Step_3
        do Step_4
        do Step_5a
        do Step_5b
    )

    do ( Y_found repeat ( goto (['Y']) <-'y' ) )

)
|