mittens 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Gemfile +7 -0
- data/LICENSE.txt +30 -0
- data/README.md +62 -0
- data/Rakefile +21 -0
- data/ext/mittens/ext.c +96 -0
- data/ext/mittens/extconf.rb +12 -0
- data/lib/mittens/version.rb +3 -0
- data/lib/mittens.rb +7 -0
- data/mittens.gemspec +22 -0
- data/vendor/snowball/.gitignore +26 -0
- data/vendor/snowball/.travis.yml +112 -0
- data/vendor/snowball/AUTHORS +27 -0
- data/vendor/snowball/CONTRIBUTING.rst +216 -0
- data/vendor/snowball/COPYING +29 -0
- data/vendor/snowball/GNUmakefile +742 -0
- data/vendor/snowball/NEWS +754 -0
- data/vendor/snowball/README.rst +37 -0
- data/vendor/snowball/ada/README.md +74 -0
- data/vendor/snowball/ada/generate/generate.adb +83 -0
- data/vendor/snowball/ada/generate.gpr +21 -0
- data/vendor/snowball/ada/src/stemmer.adb +620 -0
- data/vendor/snowball/ada/src/stemmer.ads +219 -0
- data/vendor/snowball/ada/src/stemwords.adb +70 -0
- data/vendor/snowball/ada/stemmer_config.gpr +83 -0
- data/vendor/snowball/ada/stemwords.gpr +21 -0
- data/vendor/snowball/algorithms/arabic.sbl +558 -0
- data/vendor/snowball/algorithms/armenian.sbl +301 -0
- data/vendor/snowball/algorithms/basque.sbl +149 -0
- data/vendor/snowball/algorithms/catalan.sbl +202 -0
- data/vendor/snowball/algorithms/danish.sbl +93 -0
- data/vendor/snowball/algorithms/dutch.sbl +164 -0
- data/vendor/snowball/algorithms/english.sbl +229 -0
- data/vendor/snowball/algorithms/finnish.sbl +197 -0
- data/vendor/snowball/algorithms/french.sbl +254 -0
- data/vendor/snowball/algorithms/german.sbl +139 -0
- data/vendor/snowball/algorithms/german2.sbl +145 -0
- data/vendor/snowball/algorithms/greek.sbl +701 -0
- data/vendor/snowball/algorithms/hindi.sbl +323 -0
- data/vendor/snowball/algorithms/hungarian.sbl +241 -0
- data/vendor/snowball/algorithms/indonesian.sbl +192 -0
- data/vendor/snowball/algorithms/irish.sbl +149 -0
- data/vendor/snowball/algorithms/italian.sbl +202 -0
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
- data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
- data/vendor/snowball/algorithms/lovins.sbl +208 -0
- data/vendor/snowball/algorithms/nepali.sbl +92 -0
- data/vendor/snowball/algorithms/norwegian.sbl +80 -0
- data/vendor/snowball/algorithms/porter.sbl +139 -0
- data/vendor/snowball/algorithms/portuguese.sbl +218 -0
- data/vendor/snowball/algorithms/romanian.sbl +236 -0
- data/vendor/snowball/algorithms/russian.sbl +221 -0
- data/vendor/snowball/algorithms/serbian.sbl +2379 -0
- data/vendor/snowball/algorithms/spanish.sbl +230 -0
- data/vendor/snowball/algorithms/swedish.sbl +72 -0
- data/vendor/snowball/algorithms/tamil.sbl +405 -0
- data/vendor/snowball/algorithms/turkish.sbl +470 -0
- data/vendor/snowball/algorithms/yiddish.sbl +460 -0
- data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
- data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
- data/vendor/snowball/charsets/cp850.sbl +130 -0
- data/vendor/snowball/compiler/analyser.c +1547 -0
- data/vendor/snowball/compiler/driver.c +615 -0
- data/vendor/snowball/compiler/generator.c +1748 -0
- data/vendor/snowball/compiler/generator_ada.c +1702 -0
- data/vendor/snowball/compiler/generator_csharp.c +1322 -0
- data/vendor/snowball/compiler/generator_go.c +1278 -0
- data/vendor/snowball/compiler/generator_java.c +1313 -0
- data/vendor/snowball/compiler/generator_js.c +1316 -0
- data/vendor/snowball/compiler/generator_pascal.c +1387 -0
- data/vendor/snowball/compiler/generator_python.c +1337 -0
- data/vendor/snowball/compiler/generator_rust.c +1295 -0
- data/vendor/snowball/compiler/header.h +418 -0
- data/vendor/snowball/compiler/space.c +286 -0
- data/vendor/snowball/compiler/syswords.h +86 -0
- data/vendor/snowball/compiler/syswords2.h +13 -0
- data/vendor/snowball/compiler/tokeniser.c +567 -0
- data/vendor/snowball/csharp/.gitignore +8 -0
- data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
- data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
- data/vendor/snowball/csharp/Stemwords/App.config +6 -0
- data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
- data/vendor/snowball/doc/TODO +12 -0
- data/vendor/snowball/doc/libstemmer_c_README +148 -0
- data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
- data/vendor/snowball/doc/libstemmer_java_README +67 -0
- data/vendor/snowball/doc/libstemmer_js_README +48 -0
- data/vendor/snowball/doc/libstemmer_python_README +113 -0
- data/vendor/snowball/examples/stemwords.c +204 -0
- data/vendor/snowball/go/README.md +55 -0
- data/vendor/snowball/go/among.go +16 -0
- data/vendor/snowball/go/env.go +403 -0
- data/vendor/snowball/go/stemwords/generate.go +68 -0
- data/vendor/snowball/go/stemwords/main.go +68 -0
- data/vendor/snowball/go/util.go +34 -0
- data/vendor/snowball/iconv.py +50 -0
- data/vendor/snowball/include/libstemmer.h +78 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
- data/vendor/snowball/javascript/base-stemmer.js +294 -0
- data/vendor/snowball/javascript/stemwords.js +106 -0
- data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
- data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
- data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
- data/vendor/snowball/libstemmer/modules.txt +63 -0
- data/vendor/snowball/libstemmer/test.c +34 -0
- data/vendor/snowball/pascal/.gitignore +4 -0
- data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
- data/vendor/snowball/pascal/generate.pl +23 -0
- data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
- data/vendor/snowball/python/MANIFEST.in +7 -0
- data/vendor/snowball/python/create_init.py +54 -0
- data/vendor/snowball/python/setup.cfg +6 -0
- data/vendor/snowball/python/setup.py +81 -0
- data/vendor/snowball/python/snowballstemmer/among.py +13 -0
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
- data/vendor/snowball/python/stemwords.py +101 -0
- data/vendor/snowball/python/testapp.py +28 -0
- data/vendor/snowball/runtime/api.c +58 -0
- data/vendor/snowball/runtime/api.h +32 -0
- data/vendor/snowball/runtime/header.h +61 -0
- data/vendor/snowball/runtime/utilities.c +513 -0
- data/vendor/snowball/rust/Cargo.toml +7 -0
- data/vendor/snowball/rust/build.rs +55 -0
- data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
- data/vendor/snowball/rust/src/main.rs +102 -0
- data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
- data/vendor/snowball/rust/src/snowball/among.rs +6 -0
- data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
- data/vendor/snowball/tests/stemtest.c +95 -0
- metadata +178 -0
@@ -0,0 +1,558 @@
|
|
1
|
+
/*
|
2
|
+
* Authors:
|
3
|
+
* - Assem Chelli, < assem [dot] ch [at] gmail >
|
4
|
+
* - Abdelkrim Aries <ab [underscore] aries [at] esi [dot] dz>
|
5
|
+
*
|
6
|
+
*/
|
7
|
+
|
8
|
+
// '{' and '}' delimit stringdef names and U+XXXX code points inside string literals.
stringescapes { }

/* Arabic letters, given as Unicode code points */

// Hamza and the letters that carry it
stringdef o     '{U+0621}'  // hamza
stringdef ao    '{U+0623}'  // alef with hamza above
stringdef ao_   '{U+0625}'  // alef with hamza below
stringdef a~    '{U+0622}'  // alef madda
stringdef wo    '{U+0624}'  // waw with hamza above
stringdef yo    '{U+0626}'  // yeh with hamza above

// Basic letters
stringdef a     '{U+0627}'  // alef
stringdef a_    '{U+0649}'  // alef maksura
stringdef b     '{U+0628}'  // beh
stringdef t_    '{U+0629}'  // teh marbuta
stringdef t     '{U+062A}'  // teh
stringdef th    '{U+062B}'  // theh
stringdef j     '{U+062C}'  // jeem
stringdef h     '{U+062D}'  // hah
stringdef x     '{U+062E}'  // khah
stringdef d     '{U+062F}'  // dal
stringdef dz    '{U+0630}'  // thal
stringdef r     '{U+0631}'  // reh
stringdef z     '{U+0632}'  // zain
stringdef s     '{U+0633}'  // seen
stringdef sh    '{U+0634}'  // sheen
stringdef c     '{U+0635}'  // sad
stringdef dh    '{U+0636}'  // dad
stringdef tt    '{U+0637}'  // tah
stringdef zh    '{U+0638}'  // zah
stringdef i     '{U+0639}'  // ain
stringdef gh    '{U+063A}'  // ghain
stringdef f     '{U+0641}'  // feh
stringdef q     '{U+0642}'  // qaf
stringdef k     '{U+0643}'  // kaf
stringdef l     '{U+0644}'  // lam
stringdef m     '{U+0645}'  // meem
stringdef n     '{U+0646}'  // noon
stringdef e     '{U+0647}'  // heh
stringdef w     '{U+0648}'  // waw
stringdef y     '{U+064A}'  // yeh
|
50
|
+
|
51
|
+
// Vocalization marks (diacritics)
stringdef aan   '{U+064B}'  // fathatan
stringdef uun   '{U+064C}'  // dammatan
stringdef iin   '{U+064D}'  // kasratan
stringdef aa    '{U+064E}'  // fatha
stringdef uu    '{U+064F}'  // damma
stringdef ii    '{U+0650}'  // kasra
stringdef oo    '{U+0652}'  // sukun
stringdef ~     '{U+0651}'  // shadda

// Hindu-Arabic numerals: name N stands for the digit at U+0660 + N
stringdef 0     '{U+0660}'
stringdef 1     '{U+0661}'
stringdef 2     '{U+0662}'
stringdef 3     '{U+0663}'
stringdef 4     '{U+0664}'
stringdef 5     '{U+0665}'
stringdef 6     '{U+0666}'
stringdef 7     '{U+0667}'
stringdef 8     '{U+0668}'
stringdef 9     '{U+0669}'

// Kasheeda (also called tatweel), the elongation character
stringdef _     '{U+0640}'
|
76
|
+
|
77
|
+
// Shaped forms (code points U+FE80 .. U+FEF4).
// Naming: base letter name followed by a running number.
// Normalize_pre maps every one of these back to its base letter.

// hamza
stringdef o1    '{U+FE80}'

// alef with hamza above
stringdef ao1   '{U+FE83}'
stringdef ao2   '{U+FE84}'

// alef with hamza below
stringdef ao_1  '{U+FE87}'
stringdef ao_2  '{U+FE88}'

// yeh with hamza
stringdef yo1   '{U+FE8B}'
stringdef yo2   '{U+FE8C}'
stringdef yo3   '{U+FE89}'
stringdef yo4   '{U+FE8A}'

// alef madda
stringdef a~1   '{U+FE81}'
stringdef a~2   '{U+FE82}'

// waw with hamza
stringdef wo1   '{U+FE85}'
stringdef wo2   '{U+FE86}'

// alef
stringdef a1    '{U+FE8D}'
stringdef a2    '{U+FE8E}'

// beh
stringdef b1    '{U+FE8F}'
stringdef b2    '{U+FE90}'
stringdef b3    '{U+FE91}'
stringdef b4    '{U+FE92}'

// teh marbuta
stringdef t_1   '{U+FE93}'
stringdef t_2   '{U+FE94}'

// teh
stringdef t1    '{U+FE97}'
stringdef t2    '{U+FE98}'
stringdef t3    '{U+FE95}'
stringdef t4    '{U+FE96}'

// theh
stringdef th1   '{U+FE9B}'
stringdef th2   '{U+FE9C}'
stringdef th3   '{U+FE9A}'
stringdef th4   '{U+FE99}'

// jeem
stringdef j1    '{U+FE9F}'
stringdef j2    '{U+FEA0}'
stringdef j3    '{U+FE9D}'
stringdef j4    '{U+FE9E}'

// hah
stringdef h1    '{U+FEA3}'
stringdef h2    '{U+FEA4}'
stringdef h3    '{U+FEA1}'
stringdef h4    '{U+FEA2}'

// khah
stringdef x1    '{U+FEA7}'
stringdef x2    '{U+FEA8}'
stringdef x3    '{U+FEA5}'
stringdef x4    '{U+FEA6}'

// dal
stringdef d1    '{U+FEA9}'
stringdef d2    '{U+FEAA}'

// thal
stringdef dz1   '{U+FEAB}'
stringdef dz2   '{U+FEAC}'

// reh
stringdef r1    '{U+FEAD}'
stringdef r2    '{U+FEAE}'

// zain
stringdef z1    '{U+FEAF}'
stringdef z2    '{U+FEB0}'

// seen
stringdef s1    '{U+FEB3}'
stringdef s2    '{U+FEB4}'
stringdef s3    '{U+FEB1}'
stringdef s4    '{U+FEB2}'

// sheen
stringdef sh1   '{U+FEB7}'
stringdef sh2   '{U+FEB8}'
stringdef sh3   '{U+FEB5}'
stringdef sh4   '{U+FEB6}'

// sad
stringdef c1    '{U+FEBB}'
stringdef c2    '{U+FEBC}'
stringdef c3    '{U+FEB9}'
stringdef c4    '{U+FEBA}'

// dad
stringdef dh1   '{U+FEBF}'
stringdef dh2   '{U+FEC0}'
stringdef dh3   '{U+FEBD}'
stringdef dh4   '{U+FEBE}'

// tah
stringdef tt1   '{U+FEC3}'
stringdef tt2   '{U+FEC4}'
stringdef tt3   '{U+FEC1}'
stringdef tt4   '{U+FEC2}'

// zah
stringdef zh1   '{U+FEC7}'
stringdef zh2   '{U+FEC8}'
stringdef zh3   '{U+FEC5}'
stringdef zh4   '{U+FEC6}'

// ain
stringdef i1    '{U+FECB}'
stringdef i2    '{U+FECC}'
stringdef i3    '{U+FEC9}'
stringdef i4    '{U+FECA}'

// ghain
stringdef gh1   '{U+FECF}'
stringdef gh2   '{U+FED0}'
stringdef gh3   '{U+FECD}'
stringdef gh4   '{U+FECE}'

// feh
stringdef f1    '{U+FED3}'
stringdef f2    '{U+FED4}'
stringdef f3    '{U+FED1}'
stringdef f4    '{U+FED2}'

// qaf
stringdef q1    '{U+FED7}'
stringdef q2    '{U+FED8}'
stringdef q3    '{U+FED5}'
stringdef q4    '{U+FED6}'

// kaf
stringdef k1    '{U+FEDB}'
stringdef k2    '{U+FEDC}'
stringdef k3    '{U+FED9}'
stringdef k4    '{U+FEDA}'

// lam
stringdef l1    '{U+FEDF}'
stringdef l2    '{U+FEE0}'
stringdef l3    '{U+FEDD}'
stringdef l4    '{U+FEDE}'

// meem
stringdef m1    '{U+FEE3}'
stringdef m2    '{U+FEE4}'
stringdef m3    '{U+FEE1}'
stringdef m4    '{U+FEE2}'

// noon
stringdef n1    '{U+FEE7}'
stringdef n2    '{U+FEE8}'
stringdef n3    '{U+FEE5}'
stringdef n4    '{U+FEE6}'

// heh
stringdef e1    '{U+FEEB}'
stringdef e2    '{U+FEEC}'
stringdef e3    '{U+FEE9}'
stringdef e4    '{U+FEEA}'

// waw
stringdef w1    '{U+FEED}'
stringdef w2    '{U+FEEE}'

// alef maksura
stringdef a_1   '{U+FEEF}'
stringdef a_2   '{U+FEF0}'

// yeh
stringdef y1    '{U+FEF3}'
stringdef y2    '{U+FEF4}'
stringdef y3    '{U+FEF1}'
stringdef y4    '{U+FEF2}'
|
195
|
+
|
196
|
+
// Lam-alef ligatures; Normalize_pre expands each one to lam followed by the alef variant.

// lam + alef
stringdef la    '{U+FEFB}'
stringdef la2   '{U+FEFC}'

// lam + alef with hamza above
stringdef lao   '{U+FEF7}'
stringdef lao2  '{U+FEF8}'

// lam + alef with hamza below
stringdef lao_  '{U+FEF9}'
stringdef lao_2 '{U+FEFA}'

// lam + alef with madda above
stringdef la~   '{U+FEF5}'
stringdef la~2  '{U+FEF6}'
|
205
|
+
|
206
|
+
|
207
|
+
// Word-class flags. stem sets is_noun and is_verb and clears is_defined
// before Checks1 and Prefix_Step4_Verb refine them.
booleans ( is_noun is_verb is_defined )

routines (
    // prefix removal
    Prefix_Step1
    Prefix_Step2
    Prefix_Step3a_Noun
    Prefix_Step3b_Noun
    Prefix_Step3_Verb
    Prefix_Step4_Verb

    // suffix removal (defined in backward mode)
    Suffix_All_alef_maqsura
    Suffix_Noun_Step1a
    Suffix_Noun_Step1b
    Suffix_Noun_Step2a
    Suffix_Noun_Step2b
    Suffix_Noun_Step2c1
    Suffix_Noun_Step2c2
    Suffix_Noun_Step3
    Suffix_Verb_Step1
    Suffix_Verb_Step2a
    Suffix_Verb_Step2b
    Suffix_Verb_Step2c

    // normalization
    Normalize_post
    Normalize_pre

    // word-class guess
    Checks1
)

// The only routine exported to callers.
externals ( stem )

// No character groupings are used by this algorithm.
groupings ( )
|
243
|
+
|
244
|
+
|
245
|
+
// Normalizations
|
246
|
+
define Normalize_pre as (
    // Walk the word from the cursor: apply the table entry that matches at
    // the current position, otherwise advance one character. The loop ends
    // when neither is possible.
    do repeat (
        (
            [substring] among (

                // vocalization marks are removed
                '{aan}' '{uun}' '{iin}'
                '{aa}' '{uu}' '{ii}'
                '{oo}' '{~}'                        ( delete )

                // kasheeda is removed
                '{_}'                               ( delete )

                // Hindu-Arabic numerals become ASCII digits
                '{0}'                               ( <- '0' )
                '{1}'                               ( <- '1' )
                '{2}'                               ( <- '2' )
                '{3}'                               ( <- '3' )
                '{4}'                               ( <- '4' )
                '{5}'                               ( <- '5' )
                '{6}'                               ( <- '6' )
                '{7}'                               ( <- '7' )
                '{8}'                               ( <- '8' )
                '{9}'                               ( <- '9' )

                // shaped forms collapse to their base letter
                '{o1}'                              ( <- '{o}' )    // hamza
                '{ao1}' '{ao2}'                     ( <- '{ao}' )   // alef, hamza above
                '{ao_1}' '{ao_2}'                   ( <- '{ao_}' )  // alef, hamza below
                '{yo1}' '{yo2}' '{yo3}' '{yo4}'     ( <- '{yo}' )   // yeh hamza
                '{a~1}' '{a~2}'                     ( <- '{a~}' )   // alef madda
                '{wo1}' '{wo2}'                     ( <- '{wo}' )   // waw hamza
                '{a1}' '{a2}'                       ( <- '{a}' )    // alef
                '{b1}' '{b2}' '{b3}' '{b4}'         ( <- '{b}' )    // beh
                '{t_1}' '{t_2}'                     ( <- '{t_}' )   // teh marbuta
                '{t1}' '{t2}' '{t3}' '{t4}'         ( <- '{t}' )    // teh
                '{th1}' '{th2}' '{th3}' '{th4}'     ( <- '{th}' )   // theh
                '{j1}' '{j2}' '{j3}' '{j4}'         ( <- '{j}' )    // jeem
                '{h1}' '{h2}' '{h3}' '{h4}'         ( <- '{h}' )    // hah
                '{x1}' '{x2}' '{x3}' '{x4}'         ( <- '{x}' )    // khah
                '{d1}' '{d2}'                       ( <- '{d}' )    // dal
                '{dz1}' '{dz2}'                     ( <- '{dz}' )   // thal
                '{r1}' '{r2}'                       ( <- '{r}' )    // reh
                '{z1}' '{z2}'                       ( <- '{z}' )    // zain
                '{s1}' '{s2}' '{s3}' '{s4}'         ( <- '{s}' )    // seen
                '{sh1}' '{sh2}' '{sh3}' '{sh4}'     ( <- '{sh}' )   // sheen
                '{c1}' '{c2}' '{c3}' '{c4}'         ( <- '{c}' )    // sad
                '{dh1}' '{dh2}' '{dh3}' '{dh4}'     ( <- '{dh}' )   // dad
                '{tt1}' '{tt2}' '{tt3}' '{tt4}'     ( <- '{tt}' )   // tah
                '{zh1}' '{zh2}' '{zh3}' '{zh4}'     ( <- '{zh}' )   // zah
                '{i1}' '{i2}' '{i3}' '{i4}'         ( <- '{i}' )    // ain
                '{gh1}' '{gh2}' '{gh3}' '{gh4}'     ( <- '{gh}' )   // ghain
                '{f1}' '{f2}' '{f3}' '{f4}'         ( <- '{f}' )    // feh
                '{q1}' '{q2}' '{q3}' '{q4}'         ( <- '{q}' )    // qaf
                '{k1}' '{k2}' '{k3}' '{k4}'         ( <- '{k}' )    // kaf
                '{l1}' '{l2}' '{l3}' '{l4}'         ( <- '{l}' )    // lam
                '{m1}' '{m2}' '{m3}' '{m4}'         ( <- '{m}' )    // meem
                '{n1}' '{n2}' '{n3}' '{n4}'         ( <- '{n}' )    // noon
                '{e1}' '{e2}' '{e3}' '{e4}'         ( <- '{e}' )    // heh
                '{w1}' '{w2}'                       ( <- '{w}' )    // waw
                '{a_1}' '{a_2}'                     ( <- '{a_}' )   // alef maksura
                '{y1}' '{y2}' '{y3}' '{y4}'         ( <- '{y}' )    // yeh

                // lam-alef ligatures expand to two letters
                '{la}' '{la2}'                      ( <- '{l}{a}' )
                '{lao}' '{lao2}'                    ( <- '{l}{ao}' )
                '{lao_}' '{lao_2}'                  ( <- '{l}{ao_}' )
                '{la~}' '{la~2}'                    ( <- '{l}{a~}' )
            )
        )
        or
        next
    )
)
|
315
|
+
|
316
|
+
define Normalize_post as (

    // Pass 1: a hamza-carrying letter at the very end of the word is
    // replaced by a bare hamza.
    do (
        backwards (
            [substring] among (
                '{ao}' '{ao_}' '{a~}' '{wo}' '{yo}'     ( <- '{o}' )
            )
        )
    )

    // Pass 2: every remaining hamza-carrying letter is reduced to its
    // carrier letter; positions with no match are skipped one character
    // at a time.
    do repeat (
        (
            [substring] among (
                '{ao}' '{ao_}' '{a~}'   ( <- '{a}' )
                '{wo}'                  ( <- '{w}' )
                '{yo}'                  ( <- '{y}' )
            )
        )
        or
        next
    )
)
|
342
|
+
|
343
|
+
// Checks
|
344
|
+
define Checks1 as (
    // A leading bal-/kal-/ll-/al- on a long enough word marks it as a
    // defined noun: is_noun and is_defined are set, is_verb is cleared.
    // Nothing is removed from the word here.
    [substring] among (
        '{k}{a}{l}' '{b}{a}{l}'
            ( $(len >= 5) set is_noun unset is_verb set is_defined )

        '{a}{l}' '{l}{l}'
            ( $(len >= 4) set is_noun unset is_verb set is_defined )
    )
)
|
350
|
+
|
351
|
+
|
352
|
+
//prefixes
|
353
|
+
define Prefix_Step1 as (
    // Words of at least four characters that start with alef-hamza-above
    // followed by another alef/hamza letter: the two letters are replaced
    // by a single one.
    [substring] among (
        '{ao}{ao}' '{ao}{wo}'   ( $(len >= 4) <- '{ao}' )
        '{ao}{a~}'              ( $(len >= 4) <- '{a~}' )
        '{ao}{a}'               ( $(len >= 4) <- '{a}' )
        '{ao}{ao_}'             ( $(len >= 4) <- '{ao_}' )

        // disabled (rare case): '{ao}' ( $(len > 3) delete )
    )
)
|
363
|
+
|
364
|
+
define Prefix_Step2 as (
    // Drop a leading feh or waw from a word of at least four characters,
    // unless the character right after it is alef.
    [substring] among (
        '{w}' '{f}'     ( $(len >= 4) not '{a}' delete )
    )
)
|
369
|
+
|
370
|
+
define Prefix_Step3a_Noun as (
    // The word is a noun and defined: remove the article-bearing prefix,
    // subject to a minimum word length.
    [substring] among (
        '{k}{a}{l}' '{b}{a}{l}'     ( $(len >= 6) delete )
        '{a}{l}' '{l}{l}'           ( $(len >= 5) delete )
    )
)
|
376
|
+
|
377
|
+
define Prefix_Step3b_Noun as (
    // The word is probably a noun and defined.
    [substring] among (
        // exception: beh-alef is not a valid verb prefix, so simply
        // succeed without changing the word
        '{b}{a}'    ( )

        '{b}'       ( $(len >= 4) delete )

        // disabled (causes confusion): '{k}' '{l}' ( $(len > 3) delete )

        // a doubled letter collapses to a single one
        '{b}{b}'    ( $(len >= 4) <- '{b}' )
        '{k}{k}'    ( $(len >= 4) <- '{k}' )
    )
)
|
387
|
+
|
388
|
+
define Prefix_Step3_Verb as (
    // Strip a leading seen that precedes yeh, teh, noon or alef-hamza in
    // a word of at least five characters; the second letter is kept.
    [substring] among (
        // disabled (causes confusion): '{s}' ( $(len > 4) delete )

        '{s}{y}'    ( $(len >= 5) <- '{y}' )
        '{s}{t}'    ( $(len >= 5) <- '{t}' )
        '{s}{n}'    ( $(len >= 5) <- '{n}' )
        '{s}{ao}'   ( $(len >= 5) <- '{ao}' )
    )
)
|
397
|
+
|
398
|
+
define Prefix_Step4_Verb as (
    // yeh/noon/teh + seen + teh at the start of a word of at least five
    // characters: flag the word as a verb (and not a noun) and rewrite the
    // prefix to alef + seen + teh.
    [substring] among (
        '{y}{s}{t}' '{n}{s}{t}' '{t}{s}{t}'
            ( $(len >= 5) set is_verb unset is_noun <- '{a}{s}{t}' )
    )
)
|
403
|
+
|
404
|
+
// suffixes
|
405
|
+
backwardmode (

    // Every routine in this section matches at the end of the word.
    // Each rule only fires when the word has at least the stated number
    // of characters.

    define Suffix_Noun_Step1a as (
        [substring] among (
            '{y}' '{k}' '{e}'
                ( $(len >= 4) delete )
            '{n}{a}' '{k}{m}' '{e}{a}' '{e}{n}' '{e}{m}'
                ( $(len >= 5) delete )
            '{k}{m}{a}' '{e}{m}{a}'
                ( $(len >= 6) delete )
        )
    )

    define Suffix_Noun_Step1b as (
        [substring] among (
            '{n}'   ( $(len >= 6) delete )
        )
    )

    define Suffix_Noun_Step2a as (
        [substring] among (
            '{a}' '{y}' '{w}'   ( $(len >= 5) delete )
        )
    )

    define Suffix_Noun_Step2b as (
        [substring] among (
            '{a}{t}'    ( $(len >= 5) delete )
        )
    )

    define Suffix_Noun_Step2c1 as (
        [substring] among (
            '{t}'   ( $(len >= 4) delete )
        )
    )

    // feminine teh marbuta
    define Suffix_Noun_Step2c2 as (
        [substring] among (
            '{t_}'  ( $(len >= 4) delete )
        )
    )

    // ya' nisbiya
    define Suffix_Noun_Step3 as (
        [substring] among (
            '{y}'   ( $(len >= 3) delete )
        )
    )

    define Suffix_Verb_Step1 as (
        [substring] among (
            '{e}' '{k}'
                ( $(len >= 4) delete )
            '{n}{y}' '{n}{a}' '{e}{a}' '{e}{m}' '{e}{n}' '{k}{m}' '{k}{n}'
                ( $(len >= 5) delete )
            '{e}{m}{a}' '{k}{m}{a}' '{k}{m}{w}'
                ( $(len >= 6) delete )
        )
    )

    define Suffix_Verb_Step2a as (
        [substring] among (
            '{t}' '{a}' '{n}' '{y}'
                ( $(len >= 4) delete )
            // past
            '{n}{a}' '{t}{a}' '{t}{n}'
                ( $(len >= 5) delete )
            // present
            '{a}{n}' '{w}{n}' '{y}{n}'
                ( $(len >= 6) delete )
            '{t}{m}{a}'
                ( $(len >= 6) delete )
        )
    )

    define Suffix_Verb_Step2b as (
        [substring] among (
            '{w}{a}' '{t}{m}'   ( $(len >= 5) delete )
        )
    )

    define Suffix_Verb_Step2c as (
        [substring] among (
            '{w}'       ( $(len >= 4) delete )
            '{t}{m}{w}' ( $(len >= 6) delete )
        )
    )

    define Suffix_All_alef_maqsura as (
        [substring] among (
            // treated as a spelling error: final alef maksura becomes yeh
            '{a_}'  ( <- '{y}' )

            // disabled alternatives:
            //   '{a_}' ( delete )      -- if noun > 3
            //   '{a_}' ( <- '{a}' )    -- if verb
        )
    )
)
|
487
|
+
|
488
|
+
define stem as (

    // Start undecided: the word may be a noun or a verb, and is not yet
    // known to be defined.
    set is_noun
    set is_verb
    unset is_defined

    // Guess word class and definiteness from the leading letters.
    do Checks1

    // Pre-stemming normalization.
    do Normalize_pre

    // ---- Suffixes (processed from the end of the word) ----
    backwards (
        do (
            // verb suffixes
            (
                is_verb
                (
                    (
                        (atleast 1 Suffix_Verb_Step1)
                        (Suffix_Verb_Step2a or Suffix_Verb_Step2c or next)
                    )
                    or Suffix_Verb_Step2b
                    or Suffix_Verb_Step2a
                )
            )

            // noun suffixes
            or (
                is_noun
                (
                    try (
                        Suffix_Noun_Step2c2
                        or (
                            not is_defined Suffix_Noun_Step1a
                            (
                                Suffix_Noun_Step2a
                                or Suffix_Noun_Step2b
                                or Suffix_Noun_Step2c1
                                or next
                            )
                        )
                        or (
                            Suffix_Noun_Step1b
                            (
                                Suffix_Noun_Step2a
                                or Suffix_Noun_Step2b
                                or Suffix_Noun_Step2c1
                            )
                        )
                        or (not is_defined Suffix_Noun_Step2a)
                        or Suffix_Noun_Step2b
                    )
                    Suffix_Noun_Step3
                )
            )

            // final alef maksura, tried when neither branch above succeeded
            or Suffix_All_alef_maqsura
        )
    )

    // ---- Prefixes ----
    do (
        try Prefix_Step1
        try Prefix_Step2
        (
            Prefix_Step3a_Noun
            or (is_noun Prefix_Step3b_Noun)
            or (is_verb try Prefix_Step3_Verb Prefix_Step4_Verb)
        )
    )

    // Post-stemming normalization.
    do Normalize_post
)
|