mittens 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Gemfile +7 -0
- data/LICENSE.txt +30 -0
- data/README.md +62 -0
- data/Rakefile +21 -0
- data/ext/mittens/ext.c +96 -0
- data/ext/mittens/extconf.rb +12 -0
- data/lib/mittens/version.rb +3 -0
- data/lib/mittens.rb +7 -0
- data/mittens.gemspec +22 -0
- data/vendor/snowball/.gitignore +26 -0
- data/vendor/snowball/.travis.yml +112 -0
- data/vendor/snowball/AUTHORS +27 -0
- data/vendor/snowball/CONTRIBUTING.rst +216 -0
- data/vendor/snowball/COPYING +29 -0
- data/vendor/snowball/GNUmakefile +742 -0
- data/vendor/snowball/NEWS +754 -0
- data/vendor/snowball/README.rst +37 -0
- data/vendor/snowball/ada/README.md +74 -0
- data/vendor/snowball/ada/generate/generate.adb +83 -0
- data/vendor/snowball/ada/generate.gpr +21 -0
- data/vendor/snowball/ada/src/stemmer.adb +620 -0
- data/vendor/snowball/ada/src/stemmer.ads +219 -0
- data/vendor/snowball/ada/src/stemwords.adb +70 -0
- data/vendor/snowball/ada/stemmer_config.gpr +83 -0
- data/vendor/snowball/ada/stemwords.gpr +21 -0
- data/vendor/snowball/algorithms/arabic.sbl +558 -0
- data/vendor/snowball/algorithms/armenian.sbl +301 -0
- data/vendor/snowball/algorithms/basque.sbl +149 -0
- data/vendor/snowball/algorithms/catalan.sbl +202 -0
- data/vendor/snowball/algorithms/danish.sbl +93 -0
- data/vendor/snowball/algorithms/dutch.sbl +164 -0
- data/vendor/snowball/algorithms/english.sbl +229 -0
- data/vendor/snowball/algorithms/finnish.sbl +197 -0
- data/vendor/snowball/algorithms/french.sbl +254 -0
- data/vendor/snowball/algorithms/german.sbl +139 -0
- data/vendor/snowball/algorithms/german2.sbl +145 -0
- data/vendor/snowball/algorithms/greek.sbl +701 -0
- data/vendor/snowball/algorithms/hindi.sbl +323 -0
- data/vendor/snowball/algorithms/hungarian.sbl +241 -0
- data/vendor/snowball/algorithms/indonesian.sbl +192 -0
- data/vendor/snowball/algorithms/irish.sbl +149 -0
- data/vendor/snowball/algorithms/italian.sbl +202 -0
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
- data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
- data/vendor/snowball/algorithms/lovins.sbl +208 -0
- data/vendor/snowball/algorithms/nepali.sbl +92 -0
- data/vendor/snowball/algorithms/norwegian.sbl +80 -0
- data/vendor/snowball/algorithms/porter.sbl +139 -0
- data/vendor/snowball/algorithms/portuguese.sbl +218 -0
- data/vendor/snowball/algorithms/romanian.sbl +236 -0
- data/vendor/snowball/algorithms/russian.sbl +221 -0
- data/vendor/snowball/algorithms/serbian.sbl +2379 -0
- data/vendor/snowball/algorithms/spanish.sbl +230 -0
- data/vendor/snowball/algorithms/swedish.sbl +72 -0
- data/vendor/snowball/algorithms/tamil.sbl +405 -0
- data/vendor/snowball/algorithms/turkish.sbl +470 -0
- data/vendor/snowball/algorithms/yiddish.sbl +460 -0
- data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
- data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
- data/vendor/snowball/charsets/cp850.sbl +130 -0
- data/vendor/snowball/compiler/analyser.c +1547 -0
- data/vendor/snowball/compiler/driver.c +615 -0
- data/vendor/snowball/compiler/generator.c +1748 -0
- data/vendor/snowball/compiler/generator_ada.c +1702 -0
- data/vendor/snowball/compiler/generator_csharp.c +1322 -0
- data/vendor/snowball/compiler/generator_go.c +1278 -0
- data/vendor/snowball/compiler/generator_java.c +1313 -0
- data/vendor/snowball/compiler/generator_js.c +1316 -0
- data/vendor/snowball/compiler/generator_pascal.c +1387 -0
- data/vendor/snowball/compiler/generator_python.c +1337 -0
- data/vendor/snowball/compiler/generator_rust.c +1295 -0
- data/vendor/snowball/compiler/header.h +418 -0
- data/vendor/snowball/compiler/space.c +286 -0
- data/vendor/snowball/compiler/syswords.h +86 -0
- data/vendor/snowball/compiler/syswords2.h +13 -0
- data/vendor/snowball/compiler/tokeniser.c +567 -0
- data/vendor/snowball/csharp/.gitignore +8 -0
- data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
- data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
- data/vendor/snowball/csharp/Stemwords/App.config +6 -0
- data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
- data/vendor/snowball/doc/TODO +12 -0
- data/vendor/snowball/doc/libstemmer_c_README +148 -0
- data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
- data/vendor/snowball/doc/libstemmer_java_README +67 -0
- data/vendor/snowball/doc/libstemmer_js_README +48 -0
- data/vendor/snowball/doc/libstemmer_python_README +113 -0
- data/vendor/snowball/examples/stemwords.c +204 -0
- data/vendor/snowball/go/README.md +55 -0
- data/vendor/snowball/go/among.go +16 -0
- data/vendor/snowball/go/env.go +403 -0
- data/vendor/snowball/go/stemwords/generate.go +68 -0
- data/vendor/snowball/go/stemwords/main.go +68 -0
- data/vendor/snowball/go/util.go +34 -0
- data/vendor/snowball/iconv.py +50 -0
- data/vendor/snowball/include/libstemmer.h +78 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
- data/vendor/snowball/javascript/base-stemmer.js +294 -0
- data/vendor/snowball/javascript/stemwords.js +106 -0
- data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
- data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
- data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
- data/vendor/snowball/libstemmer/modules.txt +63 -0
- data/vendor/snowball/libstemmer/test.c +34 -0
- data/vendor/snowball/pascal/.gitignore +4 -0
- data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
- data/vendor/snowball/pascal/generate.pl +23 -0
- data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
- data/vendor/snowball/python/MANIFEST.in +7 -0
- data/vendor/snowball/python/create_init.py +54 -0
- data/vendor/snowball/python/setup.cfg +6 -0
- data/vendor/snowball/python/setup.py +81 -0
- data/vendor/snowball/python/snowballstemmer/among.py +13 -0
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
- data/vendor/snowball/python/stemwords.py +101 -0
- data/vendor/snowball/python/testapp.py +28 -0
- data/vendor/snowball/runtime/api.c +58 -0
- data/vendor/snowball/runtime/api.h +32 -0
- data/vendor/snowball/runtime/header.h +61 -0
- data/vendor/snowball/runtime/utilities.c +513 -0
- data/vendor/snowball/rust/Cargo.toml +7 -0
- data/vendor/snowball/rust/build.rs +55 -0
- data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
- data/vendor/snowball/rust/src/main.rs +102 -0
- data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
- data/vendor/snowball/rust/src/snowball/among.rs +6 -0
- data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
- data/vendor/snowball/tests/stemtest.c +95 -0
- metadata +178 -0
|
@@ -0,0 +1,558 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Authors:
|
|
3
|
+
* - Assem Chelli, < assem [dot] ch [at] gmail >
|
|
4
|
+
* - Abdelkrim Aries <ab [underscore] aries [at] esi [dot] dz>
|
|
5
|
+
*
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
stringescapes { }
|
|
9
|
+
|
|
10
|
+
/* the Arabic letters in Unicode */
|
|
11
|
+
// Hamza
|
|
12
|
+
stringdef o '{U+0621}' // Hamza
|
|
13
|
+
stringdef ao '{U+0623}' // Hamza above Alef
|
|
14
|
+
stringdef ao_ '{U+0625}' // Hamza below Alef
|
|
15
|
+
stringdef a~ '{U+0622}' // Alef madda
|
|
16
|
+
stringdef wo '{U+0624}' // Hamza above waw
|
|
17
|
+
stringdef yo '{U+0626}' // Hamza above yeh
|
|
18
|
+
|
|
19
|
+
// Letters
|
|
20
|
+
stringdef a '{U+0627}' // Alef
|
|
21
|
+
stringdef a_ '{U+0649}' // Alef Maksura
|
|
22
|
+
stringdef b '{U+0628}' // Beh
|
|
23
|
+
stringdef t_ '{U+0629}' // Teh_Marbuta
|
|
24
|
+
stringdef t '{U+062A}' // Teh
|
|
25
|
+
stringdef th '{U+062B}' // Theh
|
|
26
|
+
stringdef j '{U+062C}' // Jeem
|
|
27
|
+
stringdef h '{U+062D}' // Hah
|
|
28
|
+
stringdef x '{U+062E}' // Khah
|
|
29
|
+
stringdef d '{U+062F}' // Dal
|
|
30
|
+
stringdef dz '{U+0630}' // Thal
|
|
31
|
+
stringdef r '{U+0631}' // Reh
|
|
32
|
+
stringdef z '{U+0632}' // Zain
|
|
33
|
+
stringdef s '{U+0633}' // Seen
|
|
34
|
+
stringdef sh '{U+0634}' // Sheen
|
|
35
|
+
stringdef c '{U+0635}' // Sad
|
|
36
|
+
stringdef dh '{U+0636}' // Dad
|
|
37
|
+
stringdef tt '{U+0637}' // Tah
|
|
38
|
+
stringdef zh '{U+0638}' // Zah
|
|
39
|
+
stringdef i '{U+0639}' // Ain
|
|
40
|
+
stringdef gh '{U+063A}' // Ghain
|
|
41
|
+
stringdef f '{U+0641}' // Feh
|
|
42
|
+
stringdef q '{U+0642}' // Qaf
|
|
43
|
+
stringdef k '{U+0643}' // Kaf
|
|
44
|
+
stringdef l '{U+0644}' // Lam
|
|
45
|
+
stringdef m '{U+0645}' // Meem
|
|
46
|
+
stringdef n '{U+0646}' // Noon
|
|
47
|
+
stringdef e '{U+0647}' // Heh
|
|
48
|
+
stringdef w '{U+0648}' // Waw
|
|
49
|
+
stringdef y '{U+064A}' // Yeh
|
|
50
|
+
|
|
51
|
+
// Diacritics
|
|
52
|
+
stringdef aan '{U+064B}' // FatHatan
|
|
53
|
+
stringdef uun '{U+064C}' // Dammatan
|
|
54
|
+
stringdef iin '{U+064D}' // Kasratan
|
|
55
|
+
stringdef aa '{U+064E}' // FatHa
|
|
56
|
+
stringdef uu '{U+064F}' // Damma
|
|
57
|
+
stringdef ii '{U+0650}' // Kasra
|
|
58
|
+
stringdef oo '{U+0652}' // Sukun
|
|
59
|
+
stringdef ~ '{U+0651}' // Shadda
|
|
60
|
+
|
|
61
|
+
// Arabic-Indic digits (U+0660 to U+0669)
|
|
62
|
+
stringdef 0 '{U+0660}'
|
|
63
|
+
stringdef 1 '{U+0661}'
|
|
64
|
+
stringdef 2 '{U+0662}'
|
|
65
|
+
stringdef 3 '{U+0663}'
|
|
66
|
+
stringdef 4 '{U+0664}'
|
|
67
|
+
stringdef 5 '{U+0665}'
|
|
68
|
+
stringdef 6 '{U+0666}'
|
|
69
|
+
stringdef 7 '{U+0667}'
|
|
70
|
+
stringdef 8 '{U+0668}'
|
|
71
|
+
stringdef 9 '{U+0669}'
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
// Kasheeda
|
|
75
|
+
stringdef _ '{U+0640}' // Kasheeda, Tatweel
|
|
76
|
+
|
|
77
|
+
// Shaped forms
|
|
78
|
+
stringdef o1 '{U+FE80}' // HAMZA
|
|
79
|
+
stringdef ao1 '{U+FE83}' // ALEF_HAMZA_ABOVE
|
|
80
|
+
stringdef ao2 '{U+FE84}' // ALEF_HAMZA_ABOVE
|
|
81
|
+
stringdef ao_1 '{U+FE87}' // ALEF_HAMZA_BELOW
|
|
82
|
+
stringdef ao_2 '{U+FE88}' // ALEF_HAMZA_BELOW
|
|
83
|
+
stringdef yo1 '{U+FE8B}' // YEH_HAMZA
|
|
84
|
+
stringdef yo2 '{U+FE8C}' // YEH_HAMZA
|
|
85
|
+
stringdef yo3 '{U+FE89}' // YEH_HAMZA
|
|
86
|
+
stringdef yo4 '{U+FE8A}' // YEH_HAMZA
|
|
87
|
+
stringdef a~1 '{U+FE81}' // ALEF_MADDA
|
|
88
|
+
stringdef a~2 '{U+FE82}' // ALEF_MADDA
|
|
89
|
+
stringdef wo1 '{U+FE85}' // WAW_HAMZA
|
|
90
|
+
stringdef wo2 '{U+FE86}' // WAW_HAMZA
|
|
91
|
+
stringdef a1 '{U+FE8D}' // ALEF
|
|
92
|
+
stringdef a2 '{U+FE8E}' // ALEF
|
|
93
|
+
stringdef b1 '{U+FE8F}' // BEH
|
|
94
|
+
stringdef b2 '{U+FE90}' // BEH
|
|
95
|
+
stringdef b3 '{U+FE91}' // BEH
|
|
96
|
+
stringdef b4 '{U+FE92}' // BEH
|
|
97
|
+
stringdef t_1 '{U+FE93}' // TEH_MARBUTA
|
|
98
|
+
stringdef t_2 '{U+FE94}' // TEH_MARBUTA
|
|
99
|
+
stringdef t1 '{U+FE97}' // TEH
|
|
100
|
+
stringdef t2 '{U+FE98}' // TEH
|
|
101
|
+
stringdef t3 '{U+FE95}' // TEH
|
|
102
|
+
stringdef t4 '{U+FE96}' // TEH
|
|
103
|
+
stringdef th1 '{U+FE9B}' // THEH
|
|
104
|
+
stringdef th2 '{U+FE9C}' // THEH
|
|
105
|
+
stringdef th3 '{U+FE9A}' // THEH
|
|
106
|
+
stringdef th4 '{U+FE99}' // THEH
|
|
107
|
+
stringdef j1 '{U+FE9F}' // JEEM
|
|
108
|
+
stringdef j2 '{U+FEA0}' // JEEM
|
|
109
|
+
stringdef j3 '{U+FE9D}' // JEEM
|
|
110
|
+
stringdef j4 '{U+FE9E}' // JEEM
|
|
111
|
+
stringdef h1 '{U+FEA3}' // HAH
|
|
112
|
+
stringdef h2 '{U+FEA4}' // HAH
|
|
113
|
+
stringdef h3 '{U+FEA1}' // HAH
|
|
114
|
+
stringdef h4 '{U+FEA2}' // HAH
|
|
115
|
+
stringdef x1 '{U+FEA7}' // KHAH
|
|
116
|
+
stringdef x2 '{U+FEA8}' // KHAH
|
|
117
|
+
stringdef x3 '{U+FEA5}' // KHAH
|
|
118
|
+
stringdef x4 '{U+FEA6}' // KHAH
|
|
119
|
+
stringdef d1 '{U+FEA9}' // DAL
|
|
120
|
+
stringdef d2 '{U+FEAA}' // DAL
|
|
121
|
+
stringdef dz1 '{U+FEAB}' // THAL
|
|
122
|
+
stringdef dz2 '{U+FEAC}' // THAL
|
|
123
|
+
stringdef r1 '{U+FEAD}' // REH
|
|
124
|
+
stringdef r2 '{U+FEAE}' // REH
|
|
125
|
+
stringdef z1 '{U+FEAF}' // ZAIN
|
|
126
|
+
stringdef z2 '{U+FEB0}' // ZAIN
|
|
127
|
+
stringdef s1 '{U+FEB3}' // SEEN
|
|
128
|
+
stringdef s2 '{U+FEB4}' // SEEN
|
|
129
|
+
stringdef s3 '{U+FEB1}' // SEEN
|
|
130
|
+
stringdef s4 '{U+FEB2}' // SEEN
|
|
131
|
+
stringdef sh1 '{U+FEB7}' // SHEEN
|
|
132
|
+
stringdef sh2 '{U+FEB8}' // SHEEN
|
|
133
|
+
stringdef sh3 '{U+FEB5}' // SHEEN
|
|
134
|
+
stringdef sh4 '{U+FEB6}' // SHEEN
|
|
135
|
+
stringdef c1 '{U+FEBB}' // SAD
|
|
136
|
+
stringdef c2 '{U+FEBC}' // SAD
|
|
137
|
+
stringdef c3 '{U+FEB9}' // SAD
|
|
138
|
+
stringdef c4 '{U+FEBA}' // SAD
|
|
139
|
+
stringdef dh1 '{U+FEBF}' // DAD
|
|
140
|
+
stringdef dh2 '{U+FEC0}' // DAD
|
|
141
|
+
stringdef dh3 '{U+FEBD}' // DAD
|
|
142
|
+
stringdef dh4 '{U+FEBE}' // DAD
|
|
143
|
+
stringdef tt1 '{U+FEC3}' // TAH
|
|
144
|
+
stringdef tt2 '{U+FEC4}' // TAH
|
|
145
|
+
stringdef tt3 '{U+FEC1}' // TAH
|
|
146
|
+
stringdef tt4 '{U+FEC2}' // TAH
|
|
147
|
+
stringdef zh1 '{U+FEC7}' // ZAH
|
|
148
|
+
stringdef zh2 '{U+FEC8}' // ZAH
|
|
149
|
+
stringdef zh3 '{U+FEC5}' // ZAH
|
|
150
|
+
stringdef zh4 '{U+FEC6}' // ZAH
|
|
151
|
+
stringdef i1 '{U+FECB}' // AIN
|
|
152
|
+
stringdef i2 '{U+FECC}' // AIN
|
|
153
|
+
stringdef i3 '{U+FEC9}' // AIN
|
|
154
|
+
stringdef i4 '{U+FECA}' // AIN
|
|
155
|
+
stringdef gh1 '{U+FECF}' // GHAIN
|
|
156
|
+
stringdef gh2 '{U+FED0}' // GHAIN
|
|
157
|
+
stringdef gh3 '{U+FECD}' // GHAIN
|
|
158
|
+
stringdef gh4 '{U+FECE}' // GHAIN
|
|
159
|
+
stringdef f1 '{U+FED3}' // FEH
|
|
160
|
+
stringdef f2 '{U+FED4}' // FEH
|
|
161
|
+
stringdef f3 '{U+FED1}' // FEH
|
|
162
|
+
stringdef f4 '{U+FED2}' // FEH
|
|
163
|
+
stringdef q1 '{U+FED7}' // QAF
|
|
164
|
+
stringdef q2 '{U+FED8}' // QAF
|
|
165
|
+
stringdef q3 '{U+FED5}' // QAF
|
|
166
|
+
stringdef q4 '{U+FED6}' // QAF
|
|
167
|
+
stringdef k1 '{U+FEDB}' // KAF
|
|
168
|
+
stringdef k2 '{U+FEDC}' // KAF
|
|
169
|
+
stringdef k3 '{U+FED9}' // KAF
|
|
170
|
+
stringdef k4 '{U+FEDA}' // KAF
|
|
171
|
+
stringdef l1 '{U+FEDF}' // LAM
|
|
172
|
+
stringdef l2 '{U+FEE0}' // LAM
|
|
173
|
+
stringdef l3 '{U+FEDD}' // LAM
|
|
174
|
+
stringdef l4 '{U+FEDE}' // LAM
|
|
175
|
+
stringdef m1 '{U+FEE3}' // MEEM
|
|
176
|
+
stringdef m2 '{U+FEE4}' // MEEM
|
|
177
|
+
stringdef m3 '{U+FEE1}' // MEEM
|
|
178
|
+
stringdef m4 '{U+FEE2}' // MEEM
|
|
179
|
+
stringdef n1 '{U+FEE7}' // NOON
|
|
180
|
+
stringdef n2 '{U+FEE8}' // NOON
|
|
181
|
+
stringdef n3 '{U+FEE5}' // NOON
|
|
182
|
+
stringdef n4 '{U+FEE6}' // NOON
|
|
183
|
+
stringdef e1 '{U+FEEB}' // HEH
|
|
184
|
+
stringdef e2 '{U+FEEC}' // HEH
|
|
185
|
+
stringdef e3 '{U+FEE9}' // HEH
|
|
186
|
+
stringdef e4 '{U+FEEA}' // HEH
|
|
187
|
+
stringdef w1 '{U+FEED}' // WAW
|
|
188
|
+
stringdef w2 '{U+FEEE}' // WAW
|
|
189
|
+
stringdef a_1 '{U+FEEF}' // ALEF_MAKSURA
|
|
190
|
+
stringdef a_2 '{U+FEF0}' // ALEF_MAKSURA
|
|
191
|
+
stringdef y1 '{U+FEF3}' // YEH
|
|
192
|
+
stringdef y2 '{U+FEF4}' // YEH
|
|
193
|
+
stringdef y3 '{U+FEF1}' // YEH
|
|
194
|
+
stringdef y4 '{U+FEF2}' // YEH
|
|
195
|
+
|
|
196
|
+
// Ligatures Lam-Alef
|
|
197
|
+
stringdef la '{U+FEFB}' // LAM_ALEF
|
|
198
|
+
stringdef la2 '{U+FEFC}' // LAM_ALEF
|
|
199
|
+
stringdef lao '{U+FEF7}' // LAM_ALEF_HAMZA_ABOVE
|
|
200
|
+
stringdef lao2 '{U+FEF8}' // LAM_ALEF_HAMZA_ABOVE
|
|
201
|
+
stringdef lao_ '{U+FEF9}' // LAM_ALEF_HAMZA_BELOW
|
|
202
|
+
stringdef lao_2 '{U+FEFA}' // LAM_ALEF_HAMZA_BELOW
|
|
203
|
+
stringdef la~ '{U+FEF5}' // LAM_ALEF_MADDA_ABOVE
|
|
204
|
+
stringdef la~2 '{U+FEF6}' // LAM_ALEF_MADDA_ABOVE
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
booleans (
|
|
208
|
+
is_noun
|
|
209
|
+
is_verb
|
|
210
|
+
is_defined
|
|
211
|
+
)
|
|
212
|
+
|
|
213
|
+
routines (
|
|
214
|
+
Prefix_Step1
|
|
215
|
+
Prefix_Step2
|
|
216
|
+
Prefix_Step3a_Noun
|
|
217
|
+
Prefix_Step3b_Noun
|
|
218
|
+
Prefix_Step3_Verb
|
|
219
|
+
Prefix_Step4_Verb
|
|
220
|
+
|
|
221
|
+
Suffix_All_alef_maqsura
|
|
222
|
+
Suffix_Noun_Step1a
|
|
223
|
+
Suffix_Noun_Step1b
|
|
224
|
+
Suffix_Noun_Step2a
|
|
225
|
+
Suffix_Noun_Step2b
|
|
226
|
+
Suffix_Noun_Step2c1
|
|
227
|
+
Suffix_Noun_Step2c2
|
|
228
|
+
Suffix_Noun_Step3
|
|
229
|
+
Suffix_Verb_Step1
|
|
230
|
+
Suffix_Verb_Step2a
|
|
231
|
+
Suffix_Verb_Step2b
|
|
232
|
+
Suffix_Verb_Step2c
|
|
233
|
+
|
|
234
|
+
Normalize_post
|
|
235
|
+
Normalize_pre
|
|
236
|
+
|
|
237
|
+
Checks1
|
|
238
|
+
)
|
|
239
|
+
|
|
240
|
+
externals ( stem )
|
|
241
|
+
|
|
242
|
+
groupings ( )
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
// Normalizations
|
|
246
|
+
define Normalize_pre as (
|
|
247
|
+
do repeat (
|
|
248
|
+
(
|
|
249
|
+
[substring] among (
|
|
250
|
+
'{aan}' '{uun}' '{iin}' '{aa}' '{uu}' '{ii}' '{oo}' '{~}'( delete ) // strip vocalization
|
|
251
|
+
'{_}' ( delete ) // strip kasheeda
|
|
252
|
+
|
|
253
|
+
// Arabic-Indic digits -> ASCII digits
|
|
254
|
+
'{0}' ( <- '0')
|
|
255
|
+
'{1}' ( <- '1')
|
|
256
|
+
'{2}' ( <- '2')
|
|
257
|
+
'{3}' ( <- '3')
|
|
258
|
+
'{4}' ( <- '4')
|
|
259
|
+
'{5}' ( <- '5')
|
|
260
|
+
'{6}' ( <- '6')
|
|
261
|
+
'{7}' ( <- '7')
|
|
262
|
+
'{8}' ( <- '8')
|
|
263
|
+
'{9}' ( <- '9')
|
|
264
|
+
|
|
265
|
+
// Shaped forms
|
|
266
|
+
'{o1}' ( <- '{o}' ) // HAMZA
|
|
267
|
+
'{ao1}' '{ao2}' ( <- '{ao}' ) // ALEF_HAMZA_ABOVE
|
|
268
|
+
'{ao_1}' '{ao_2}' ( <- '{ao_}' ) // ALEF_HAMZA_BELOW
|
|
269
|
+
'{yo1}' '{yo2}' '{yo3}' '{yo4}' ( <- '{yo}' ) // YEH_HAMZA
|
|
270
|
+
'{a~1}' '{a~2}'( <- '{a~}' ) // ALEF_MADDA
|
|
271
|
+
'{wo1}' '{wo2}'( <- '{wo}' ) // WAW_HAMZA
|
|
272
|
+
'{a1}' '{a2}' ( <- '{a}' ) // ALEF
|
|
273
|
+
'{b1}' '{b2}' '{b3}' '{b4}' ( <- '{b}' ) // BEH
|
|
274
|
+
'{t_1}' '{t_2}' ( <- '{t_}' ) // TEH_MARBUTA
|
|
275
|
+
'{t1}' '{t2}' '{t3}' '{t4}' ( <- '{t}' ) // TEH
|
|
276
|
+
'{th1}' '{th2}' '{th3}' '{th4}' ( <- '{th}' ) // THEH
|
|
277
|
+
'{j1}' '{j2}' '{j3}' '{j4}'( <- '{j}' ) // JEEM
|
|
278
|
+
'{h1}' '{h2}' '{h3}' '{h4}' ( <- '{h}' ) // HAH
|
|
279
|
+
'{x1}' '{x2}' '{x3}' '{x4}'( <- '{x}' ) // KHAH
|
|
280
|
+
'{d1}' '{d2}' ( <- '{d}' ) // DAL
|
|
281
|
+
'{dz1}''{dz2}' ( <- '{dz}' ) // THAL
|
|
282
|
+
'{r1}' '{r2}'( <- '{r}' ) // REH
|
|
283
|
+
'{z1}' '{z2}' ( <- '{z}' ) // ZAIN
|
|
284
|
+
'{s1}' '{s2}' '{s3}' '{s4}'( <- '{s}' ) // SEEN
|
|
285
|
+
'{sh1}' '{sh2}' '{sh3}' '{sh4}' ( <- '{sh}' ) // SHEEN
|
|
286
|
+
'{c1}' '{c2}' '{c3}' '{c4}'( <- '{c}' ) // SAD
|
|
287
|
+
'{dh1}' '{dh2}' '{dh3}' '{dh4}'( <- '{dh}' ) // DAD
|
|
288
|
+
'{tt1}' '{tt2}' '{tt3}' '{tt4}' ( <- '{tt}' ) // TAH
|
|
289
|
+
'{zh1}' '{zh2}' '{zh3}' '{zh4}'( <- '{zh}' ) // ZAH
|
|
290
|
+
'{i1}' '{i2}' '{i3}' '{i4}'( <- '{i}' ) // AIN
|
|
291
|
+
'{gh1}' '{gh2}' '{gh3}' '{gh4}'( <- '{gh}' ) // GHAIN
|
|
292
|
+
'{f1}' '{f2}' '{f3}' '{f4}' ( <- '{f}' ) // FEH
|
|
293
|
+
'{q1}' '{q2}' '{q3}' '{q4}' ( <- '{q}' ) // QAF
|
|
294
|
+
'{k1}' '{k2}' '{k3}' '{k4}'( <- '{k}' ) // KAF
|
|
295
|
+
'{l1}' '{l2}' '{l3}' '{l4}'( <- '{l}' ) // LAM
|
|
296
|
+
'{m1}' '{m2}' '{m3}' '{m4}' ( <- '{m}' ) // MEEM
|
|
297
|
+
'{n1}' '{n2}' '{n3}' '{n4}'( <- '{n}' ) // NOON
|
|
298
|
+
'{e1}' '{e2}' '{e3}' '{e4}' ( <- '{e}' ) // HEH
|
|
299
|
+
'{w1}' '{w2}' ( <- '{w}' ) // WAW
|
|
300
|
+
'{a_1}' '{a_2}' ( <- '{a_}' ) // ALEF_MAKSURA
|
|
301
|
+
'{y1}' '{y2}' '{y3}' '{y4}' ( <- '{y}' ) // YEH
|
|
302
|
+
|
|
303
|
+
// Ligatures Lam-Alef
|
|
304
|
+
'{la}' '{la2}' (<- '{l}{a}')
|
|
305
|
+
'{lao}' '{lao2}' (<- '{l}{ao}')
|
|
306
|
+
'{lao_}' '{lao_2}' (<- '{l}{ao_}')
|
|
307
|
+
'{la~}' '{la~2}' (<- '{l}{a~}')
|
|
308
|
+
|
|
309
|
+
)
|
|
310
|
+
)
|
|
311
|
+
or
|
|
312
|
+
next
|
|
313
|
+
)
|
|
314
|
+
)
|
|
315
|
+
|
|
316
|
+
define Normalize_post as (
|
|
317
|
+
|
|
318
|
+
do (
|
|
319
|
+
// normalize last hamza
|
|
320
|
+
backwards (
|
|
321
|
+
[substring] among (
|
|
322
|
+
'{ao}''{ao_}' '{a~}' ( <- '{o}')
|
|
323
|
+
'{wo}' ( <- '{o}')
|
|
324
|
+
'{yo}' ( <- '{o}')
|
|
325
|
+
)
|
|
326
|
+
)
|
|
327
|
+
)
|
|
328
|
+
|
|
329
|
+
do repeat (
|
|
330
|
+
(
|
|
331
|
+
// normalize the other hamzas
|
|
332
|
+
[substring] among (
|
|
333
|
+
'{ao}''{ao_}' '{a~}' ( <- '{a}')
|
|
334
|
+
'{wo}' ( <- '{w}')
|
|
335
|
+
'{yo}' ( <- '{y}')
|
|
336
|
+
)
|
|
337
|
+
)
|
|
338
|
+
or
|
|
339
|
+
next
|
|
340
|
+
)
|
|
341
|
+
)
|
|
342
|
+
|
|
343
|
+
// Checks
|
|
344
|
+
define Checks1 as (
|
|
345
|
+
[substring] among (
|
|
346
|
+
'{b}{a}{l}' '{k}{a}{l}' ($(len > 4) set is_noun unset is_verb set is_defined)
|
|
347
|
+
'{l}{l}' '{a}{l}' ($(len > 3) set is_noun unset is_verb set is_defined)
|
|
348
|
+
)
|
|
349
|
+
)
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
//prefixes
|
|
353
|
+
define Prefix_Step1 as (
|
|
354
|
+
[substring] among (
|
|
355
|
+
'{ao}{ao}' ($(len > 3) <- '{ao}' )
|
|
356
|
+
'{ao}{a~}' ($(len > 3) <- '{a~}' )
|
|
357
|
+
'{ao}{wo}' ($(len > 3) <- '{ao}' )
|
|
358
|
+
'{ao}{a}' ($(len > 3) <- '{a}' )
|
|
359
|
+
'{ao}{ao_}' ($(len > 3) <- '{ao_}' )
|
|
360
|
+
// '{ao}' ($(len > 3) delete) //rare case
|
|
361
|
+
)
|
|
362
|
+
)
|
|
363
|
+
|
|
364
|
+
define Prefix_Step2 as (
|
|
365
|
+
[substring] among (
|
|
366
|
+
'{f}' '{w}' ($(len > 3) not '{a}' delete)
|
|
367
|
+
)
|
|
368
|
+
)
|
|
369
|
+
|
|
370
|
+
define Prefix_Step3a_Noun as ( // it is noun and defined
|
|
371
|
+
[substring] among (
|
|
372
|
+
'{b}{a}{l}' '{k}{a}{l}' ($(len > 5) delete)
|
|
373
|
+
'{l}{l}' '{a}{l}' ($(len > 4) delete)
|
|
374
|
+
)
|
|
375
|
+
)
|
|
376
|
+
|
|
377
|
+
define Prefix_Step3b_Noun as ( // probably noun and defined
|
|
378
|
+
[substring] among (
|
|
379
|
+
'{b}{a}' ( ) // exception - not a valid verb prefix so can just succeed here
|
|
380
|
+
'{b}' ($(len > 3) delete)
|
|
381
|
+
// '{k}' '{l}' ($(len > 3) delete) // BUG: causes confusion (rule disabled)
|
|
382
|
+
'{b}{b}' ($(len > 3) <- '{b}' )
|
|
383
|
+
'{k}{k}' ($(len > 3) <- '{k}' )
|
|
384
|
+
)
|
|
385
|
+
|
|
386
|
+
)
|
|
387
|
+
|
|
388
|
+
define Prefix_Step3_Verb as (
|
|
389
|
+
[substring] among (
|
|
390
|
+
//'{s}' ($(len > 4) delete)// BUG: causes confusion (rule disabled)
|
|
391
|
+
'{s}{y}' ($(len > 4) <- '{y}' )
|
|
392
|
+
'{s}{t}' ($(len > 4) <- '{t}')
|
|
393
|
+
'{s}{n}' ($(len > 4) <- '{n}')
|
|
394
|
+
'{s}{ao}' ($(len > 4) <- '{ao}')
|
|
395
|
+
)
|
|
396
|
+
)
|
|
397
|
+
|
|
398
|
+
define Prefix_Step4_Verb as (
|
|
399
|
+
[substring] among (
|
|
400
|
+
'{y}{s}{t}' '{n}{s}{t}' '{t}{s}{t}' ($(len > 4) set is_verb unset is_noun <- '{a}{s}{t}' )
|
|
401
|
+
)
|
|
402
|
+
)
|
|
403
|
+
|
|
404
|
+
// suffixes
|
|
405
|
+
backwardmode (
|
|
406
|
+
|
|
407
|
+
define Suffix_Noun_Step1a as (
|
|
408
|
+
[substring] among (
|
|
409
|
+
'{y}' '{k}' '{e}' ($(len >= 4) delete)
|
|
410
|
+
'{n}{a}' '{k}{m}' '{e}{a}' '{e}{n}' '{e}{m}' ($(len >= 5) delete)
|
|
411
|
+
'{k}{m}{a}' '{e}{m}{a}' ($(len >= 6) delete)
|
|
412
|
+
)
|
|
413
|
+
)
|
|
414
|
+
define Suffix_Noun_Step1b as (
|
|
415
|
+
[substring] among (
|
|
416
|
+
'{n}' ($(len > 5) delete)
|
|
417
|
+
)
|
|
418
|
+
)
|
|
419
|
+
|
|
420
|
+
define Suffix_Noun_Step2a as (
|
|
421
|
+
[substring] among (
|
|
422
|
+
'{a}' '{y}' '{w}' ($(len > 4) delete)
|
|
423
|
+
)
|
|
424
|
+
)
|
|
425
|
+
|
|
426
|
+
define Suffix_Noun_Step2b as (
|
|
427
|
+
[substring] among (
|
|
428
|
+
'{a}{t}' ($(len >= 5) delete)
|
|
429
|
+
)
|
|
430
|
+
)
|
|
431
|
+
|
|
432
|
+
define Suffix_Noun_Step2c1 as (
|
|
433
|
+
[substring] among (
|
|
434
|
+
'{t}' ($(len >= 4) delete)
|
|
435
|
+
)
|
|
436
|
+
)
|
|
437
|
+
define Suffix_Noun_Step2c2 as ( // feminine t_
|
|
438
|
+
[substring] among (
|
|
439
|
+
'{t_}' ($(len >= 4) delete)
|
|
440
|
+
)
|
|
441
|
+
)
|
|
442
|
+
define Suffix_Noun_Step3 as ( // ya' nisbiya
|
|
443
|
+
[substring] among (
|
|
444
|
+
'{y}' ($(len >= 3) delete)
|
|
445
|
+
)
|
|
446
|
+
)
|
|
447
|
+
|
|
448
|
+
define Suffix_Verb_Step1 as (
|
|
449
|
+
[substring] among (
|
|
450
|
+
'{e}' '{k}' ($(len >= 4) delete)
|
|
451
|
+
'{n}{y}' '{n}{a}' '{e}{a}' '{e}{m}' '{e}{n}' '{k}{m}' '{k}{n}' ($(len >= 5) delete)
|
|
452
|
+
'{e}{m}{a}' '{k}{m}{a}' '{k}{m}{w}'($(len >= 6) delete)
|
|
453
|
+
)
|
|
454
|
+
)
|
|
455
|
+
define Suffix_Verb_Step2a as (
|
|
456
|
+
[substring] among (
|
|
457
|
+
'{t}' ($(len >= 4) delete)
|
|
458
|
+
'{a}' '{n}' '{y}' ($(len >= 4) delete)
|
|
459
|
+
'{n}{a}' '{t}{a}' '{t}{n}' ($(len >= 5) delete)// past
|
|
460
|
+
'{a}{n}' '{w}{n}' '{y}{n}' ($(len > 5) delete) // present
|
|
461
|
+
'{t}{m}{a}' ($(len >= 6) delete)
|
|
462
|
+
)
|
|
463
|
+
)
|
|
464
|
+
|
|
465
|
+
define Suffix_Verb_Step2b as (
|
|
466
|
+
[substring] among (
|
|
467
|
+
'{w}{a}' '{t}{m}' ($(len >= 5) delete)
|
|
468
|
+
)
|
|
469
|
+
)
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
define Suffix_Verb_Step2c as (
|
|
473
|
+
[substring] among (
|
|
474
|
+
'{w}' ($(len >= 4) delete)
|
|
475
|
+
'{t}{m}{w}' ($(len >= 6) delete)
|
|
476
|
+
)
|
|
477
|
+
)
|
|
478
|
+
|
|
479
|
+
define Suffix_All_alef_maqsura as (
|
|
480
|
+
[substring] among (
|
|
481
|
+
'{a_}' ( <- '{y}' ) // spell error
|
|
482
|
+
// '{a_}' ( delete ) // if noun > 3
|
|
483
|
+
// '{a_}' ( <- '{a}') // if verb
|
|
484
|
+
)
|
|
485
|
+
)
|
|
486
|
+
)
|
|
487
|
+
|
|
488
|
+
define stem as (
|
|
489
|
+
// set initial values
|
|
490
|
+
set is_noun
|
|
491
|
+
set is_verb
|
|
492
|
+
unset is_defined
|
|
493
|
+
|
|
494
|
+
// guess type and properties
|
|
495
|
+
do Checks1
|
|
496
|
+
|
|
497
|
+
// normalization pre-stemming
|
|
498
|
+
do Normalize_pre
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
backwards (
|
|
502
|
+
|
|
503
|
+
do (
|
|
504
|
+
//Suffixes for verbs
|
|
505
|
+
(
|
|
506
|
+
is_verb
|
|
507
|
+
(
|
|
508
|
+
(
|
|
509
|
+
(atleast 1 Suffix_Verb_Step1)
|
|
510
|
+
( Suffix_Verb_Step2a or Suffix_Verb_Step2c or next)
|
|
511
|
+
)
|
|
512
|
+
or Suffix_Verb_Step2b
|
|
513
|
+
or Suffix_Verb_Step2a
|
|
514
|
+
)
|
|
515
|
+
)
|
|
516
|
+
//Suffixes for nouns
|
|
517
|
+
or (
|
|
518
|
+
is_noun
|
|
519
|
+
(
|
|
520
|
+
|
|
521
|
+
try (
|
|
522
|
+
Suffix_Noun_Step2c2
|
|
523
|
+
or (not is_defined Suffix_Noun_Step1a (
|
|
524
|
+
Suffix_Noun_Step2a
|
|
525
|
+
or Suffix_Noun_Step2b
|
|
526
|
+
or Suffix_Noun_Step2c1
|
|
527
|
+
or next))
|
|
528
|
+
or (Suffix_Noun_Step1b (
|
|
529
|
+
Suffix_Noun_Step2a
|
|
530
|
+
or Suffix_Noun_Step2b
|
|
531
|
+
or Suffix_Noun_Step2c1))
|
|
532
|
+
or (not is_defined Suffix_Noun_Step2a)
|
|
533
|
+
or (Suffix_Noun_Step2b)
|
|
534
|
+
)
|
|
535
|
+
Suffix_Noun_Step3
|
|
536
|
+
)
|
|
537
|
+
|
|
538
|
+
)
|
|
539
|
+
|
|
540
|
+
// Suffixes for alef maqsura
|
|
541
|
+
or Suffix_All_alef_maqsura
|
|
542
|
+
)
|
|
543
|
+
)
|
|
544
|
+
|
|
545
|
+
//Prefixes
|
|
546
|
+
do (
|
|
547
|
+
try Prefix_Step1
|
|
548
|
+
try Prefix_Step2
|
|
549
|
+
( Prefix_Step3a_Noun
|
|
550
|
+
or (is_noun Prefix_Step3b_Noun)
|
|
551
|
+
or (is_verb try Prefix_Step3_Verb Prefix_Step4_Verb)
|
|
552
|
+
)
|
|
553
|
+
)
|
|
554
|
+
|
|
555
|
+
// normalization post-stemming
|
|
556
|
+
do Normalize_post
|
|
557
|
+
|
|
558
|
+
)
|