mittens 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (137) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE.txt +30 -0
  5. data/README.md +62 -0
  6. data/Rakefile +21 -0
  7. data/ext/mittens/ext.c +96 -0
  8. data/ext/mittens/extconf.rb +12 -0
  9. data/lib/mittens/version.rb +3 -0
  10. data/lib/mittens.rb +7 -0
  11. data/mittens.gemspec +22 -0
  12. data/vendor/snowball/.gitignore +26 -0
  13. data/vendor/snowball/.travis.yml +112 -0
  14. data/vendor/snowball/AUTHORS +27 -0
  15. data/vendor/snowball/CONTRIBUTING.rst +216 -0
  16. data/vendor/snowball/COPYING +29 -0
  17. data/vendor/snowball/GNUmakefile +742 -0
  18. data/vendor/snowball/NEWS +754 -0
  19. data/vendor/snowball/README.rst +37 -0
  20. data/vendor/snowball/ada/README.md +74 -0
  21. data/vendor/snowball/ada/generate/generate.adb +83 -0
  22. data/vendor/snowball/ada/generate.gpr +21 -0
  23. data/vendor/snowball/ada/src/stemmer.adb +620 -0
  24. data/vendor/snowball/ada/src/stemmer.ads +219 -0
  25. data/vendor/snowball/ada/src/stemwords.adb +70 -0
  26. data/vendor/snowball/ada/stemmer_config.gpr +83 -0
  27. data/vendor/snowball/ada/stemwords.gpr +21 -0
  28. data/vendor/snowball/algorithms/arabic.sbl +558 -0
  29. data/vendor/snowball/algorithms/armenian.sbl +301 -0
  30. data/vendor/snowball/algorithms/basque.sbl +149 -0
  31. data/vendor/snowball/algorithms/catalan.sbl +202 -0
  32. data/vendor/snowball/algorithms/danish.sbl +93 -0
  33. data/vendor/snowball/algorithms/dutch.sbl +164 -0
  34. data/vendor/snowball/algorithms/english.sbl +229 -0
  35. data/vendor/snowball/algorithms/finnish.sbl +197 -0
  36. data/vendor/snowball/algorithms/french.sbl +254 -0
  37. data/vendor/snowball/algorithms/german.sbl +139 -0
  38. data/vendor/snowball/algorithms/german2.sbl +145 -0
  39. data/vendor/snowball/algorithms/greek.sbl +701 -0
  40. data/vendor/snowball/algorithms/hindi.sbl +323 -0
  41. data/vendor/snowball/algorithms/hungarian.sbl +241 -0
  42. data/vendor/snowball/algorithms/indonesian.sbl +192 -0
  43. data/vendor/snowball/algorithms/irish.sbl +149 -0
  44. data/vendor/snowball/algorithms/italian.sbl +202 -0
  45. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
  46. data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
  47. data/vendor/snowball/algorithms/lovins.sbl +208 -0
  48. data/vendor/snowball/algorithms/nepali.sbl +92 -0
  49. data/vendor/snowball/algorithms/norwegian.sbl +80 -0
  50. data/vendor/snowball/algorithms/porter.sbl +139 -0
  51. data/vendor/snowball/algorithms/portuguese.sbl +218 -0
  52. data/vendor/snowball/algorithms/romanian.sbl +236 -0
  53. data/vendor/snowball/algorithms/russian.sbl +221 -0
  54. data/vendor/snowball/algorithms/serbian.sbl +2379 -0
  55. data/vendor/snowball/algorithms/spanish.sbl +230 -0
  56. data/vendor/snowball/algorithms/swedish.sbl +72 -0
  57. data/vendor/snowball/algorithms/tamil.sbl +405 -0
  58. data/vendor/snowball/algorithms/turkish.sbl +470 -0
  59. data/vendor/snowball/algorithms/yiddish.sbl +460 -0
  60. data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
  61. data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
  62. data/vendor/snowball/charsets/cp850.sbl +130 -0
  63. data/vendor/snowball/compiler/analyser.c +1547 -0
  64. data/vendor/snowball/compiler/driver.c +615 -0
  65. data/vendor/snowball/compiler/generator.c +1748 -0
  66. data/vendor/snowball/compiler/generator_ada.c +1702 -0
  67. data/vendor/snowball/compiler/generator_csharp.c +1322 -0
  68. data/vendor/snowball/compiler/generator_go.c +1278 -0
  69. data/vendor/snowball/compiler/generator_java.c +1313 -0
  70. data/vendor/snowball/compiler/generator_js.c +1316 -0
  71. data/vendor/snowball/compiler/generator_pascal.c +1387 -0
  72. data/vendor/snowball/compiler/generator_python.c +1337 -0
  73. data/vendor/snowball/compiler/generator_rust.c +1295 -0
  74. data/vendor/snowball/compiler/header.h +418 -0
  75. data/vendor/snowball/compiler/space.c +286 -0
  76. data/vendor/snowball/compiler/syswords.h +86 -0
  77. data/vendor/snowball/compiler/syswords2.h +13 -0
  78. data/vendor/snowball/compiler/tokeniser.c +567 -0
  79. data/vendor/snowball/csharp/.gitignore +8 -0
  80. data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
  81. data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
  82. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
  83. data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
  84. data/vendor/snowball/csharp/Stemwords/App.config +6 -0
  85. data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
  86. data/vendor/snowball/doc/TODO +12 -0
  87. data/vendor/snowball/doc/libstemmer_c_README +148 -0
  88. data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
  89. data/vendor/snowball/doc/libstemmer_java_README +67 -0
  90. data/vendor/snowball/doc/libstemmer_js_README +48 -0
  91. data/vendor/snowball/doc/libstemmer_python_README +113 -0
  92. data/vendor/snowball/examples/stemwords.c +204 -0
  93. data/vendor/snowball/go/README.md +55 -0
  94. data/vendor/snowball/go/among.go +16 -0
  95. data/vendor/snowball/go/env.go +403 -0
  96. data/vendor/snowball/go/stemwords/generate.go +68 -0
  97. data/vendor/snowball/go/stemwords/main.go +68 -0
  98. data/vendor/snowball/go/util.go +34 -0
  99. data/vendor/snowball/iconv.py +50 -0
  100. data/vendor/snowball/include/libstemmer.h +78 -0
  101. data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
  102. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
  103. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
  104. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
  105. data/vendor/snowball/javascript/base-stemmer.js +294 -0
  106. data/vendor/snowball/javascript/stemwords.js +106 -0
  107. data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
  108. data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
  109. data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
  110. data/vendor/snowball/libstemmer/modules.txt +63 -0
  111. data/vendor/snowball/libstemmer/test.c +34 -0
  112. data/vendor/snowball/pascal/.gitignore +4 -0
  113. data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
  114. data/vendor/snowball/pascal/generate.pl +23 -0
  115. data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
  116. data/vendor/snowball/python/MANIFEST.in +7 -0
  117. data/vendor/snowball/python/create_init.py +54 -0
  118. data/vendor/snowball/python/setup.cfg +6 -0
  119. data/vendor/snowball/python/setup.py +81 -0
  120. data/vendor/snowball/python/snowballstemmer/among.py +13 -0
  121. data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
  122. data/vendor/snowball/python/stemwords.py +101 -0
  123. data/vendor/snowball/python/testapp.py +28 -0
  124. data/vendor/snowball/runtime/api.c +58 -0
  125. data/vendor/snowball/runtime/api.h +32 -0
  126. data/vendor/snowball/runtime/header.h +61 -0
  127. data/vendor/snowball/runtime/utilities.c +513 -0
  128. data/vendor/snowball/rust/Cargo.toml +7 -0
  129. data/vendor/snowball/rust/build.rs +55 -0
  130. data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
  131. data/vendor/snowball/rust/src/main.rs +102 -0
  132. data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
  133. data/vendor/snowball/rust/src/snowball/among.rs +6 -0
  134. data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
  135. data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
  136. data/vendor/snowball/tests/stemtest.c +95 -0
  137. metadata +178 -0
@@ -0,0 +1,558 @@
1
+ /*
2
+ * Authors:
3
+ * - Assem Chelli, < assem [dot] ch [at] gmail >
4
+ * - Abdelkrim Aries <ab [underscore] aries [at] esi [dot] dz>
5
+ *
6
+ */
7
+
8
+ stringescapes { }
9
+
10
+ /* The Arabic letters in Unicode. Each name defined below can be written as '{name}' inside a string literal. */
11
+ // Hamza and letters carrying a hamza or madda
12
+ stringdef o '{U+0621}' // Hamza
13
+ stringdef ao '{U+0623}' // Hamza above Alef
14
+ stringdef ao_ '{U+0625}' // Hamza below Alef
15
+ stringdef a~ '{U+0622}' // Alef madda
16
+ stringdef wo '{U+0624}' // Hamza above waw
17
+ stringdef yo '{U+0626}' // Hamza above yeh
18
+
19
+ // Letters
20
+ stringdef a '{U+0627}' // Alef
21
+ stringdef a_ '{U+0649}' // Alef Maksura
22
+ stringdef b '{U+0628}' // Beh
23
+ stringdef t_ '{U+0629}' // Teh_Marbuta
24
+ stringdef t '{U+062A}' // Teh
25
+ stringdef th '{U+062B}' // Theh
26
+ stringdef j '{U+062C}' // Jeem
27
+ stringdef h '{U+062D}' // Hah
28
+ stringdef x '{U+062E}' // Khah
29
+ stringdef d '{U+062F}' // Dal
30
+ stringdef dz '{U+0630}' // Thal
31
+ stringdef r '{U+0631}' // Reh
32
+ stringdef z '{U+0632}' // Zain
33
+ stringdef s '{U+0633}' // Seen
34
+ stringdef sh '{U+0634}' // Sheen
35
+ stringdef c '{U+0635}' // Sad
36
+ stringdef dh '{U+0636}' // Dad
37
+ stringdef tt '{U+0637}' // Tah
38
+ stringdef zh '{U+0638}' // Zah
39
+ stringdef i '{U+0639}' // Ain
40
+ stringdef gh '{U+063A}' // Ghain
41
+ stringdef f '{U+0641}' // Feh
42
+ stringdef q '{U+0642}' // Qaf
43
+ stringdef k '{U+0643}' // Kaf
44
+ stringdef l '{U+0644}' // Lam
45
+ stringdef m '{U+0645}' // Meem
46
+ stringdef n '{U+0646}' // Noon
47
+ stringdef e '{U+0647}' // Heh
48
+ stringdef w '{U+0648}' // Waw
49
+ stringdef y '{U+064A}' // Yeh
50
+
51
+ // Diacritics (all of these are deleted by Normalize_pre)
52
+ stringdef aan '{U+064B}' // FatHatan
53
+ stringdef uun '{U+064C}' // Dammatan
54
+ stringdef iin '{U+064D}' // Kasratan
55
+ stringdef aa '{U+064E}' // FatHa
56
+ stringdef uu '{U+064F}' // Damma
57
+ stringdef ii '{U+0650}' // Kasra
58
+ stringdef oo '{U+0652}' // Sukun
59
+ stringdef ~ '{U+0651}' // Shadda
60
+
61
+ // Arabic-Indic digits U+0660..U+0669 (Normalize_pre rewrites them as ASCII '0'..'9')
62
+ stringdef 0 '{U+0660}' // Arabic-Indic digit 0
63
+ stringdef 1 '{U+0661}' // Arabic-Indic digit 1
64
+ stringdef 2 '{U+0662}' // Arabic-Indic digit 2
65
+ stringdef 3 '{U+0663}' // Arabic-Indic digit 3
66
+ stringdef 4 '{U+0664}' // Arabic-Indic digit 4
67
+ stringdef 5 '{U+0665}' // Arabic-Indic digit 5
68
+ stringdef 6 '{U+0666}' // Arabic-Indic digit 6
69
+ stringdef 7 '{U+0667}' // Arabic-Indic digit 7
70
+ stringdef 8 '{U+0668}' // Arabic-Indic digit 8
71
+ stringdef 9 '{U+0669}' // Arabic-Indic digit 9
72
+
73
+
74
+ // Kasheeda (deleted by Normalize_pre)
75
+ stringdef _ '{U+0640}' // Kasheeda, Tatweel
76
+
77
+ // Shaped (presentation) forms; Normalize_pre replaces each one with the base letter named in its comment
78
+ stringdef o1 '{U+FE80}' // HAMZA
79
+ stringdef ao1 '{U+FE83}' // ALEF_HAMZA_ABOVE
80
+ stringdef ao2 '{U+FE84}' // ALEF_HAMZA_ABOVE
81
+ stringdef ao_1 '{U+FE87}' // ALEF_HAMZA_BELOW
82
+ stringdef ao_2 '{U+FE88}' // ALEF_HAMZA_BELOW
83
+ stringdef yo1 '{U+FE8B}' // YEH_HAMZA
84
+ stringdef yo2 '{U+FE8C}' // YEH_HAMZA
85
+ stringdef yo3 '{U+FE89}' // YEH_HAMZA
86
+ stringdef yo4 '{U+FE8A}' // YEH_HAMZA
87
+ stringdef a~1 '{U+FE81}' // ALEF_MADDA
88
+ stringdef a~2 '{U+FE82}' // ALEF_MADDA
89
+ stringdef wo1 '{U+FE85}' // WAW_HAMZA
90
+ stringdef wo2 '{U+FE86}' // WAW_HAMZA
91
+ stringdef a1 '{U+FE8D}' // ALEF
92
+ stringdef a2 '{U+FE8E}' // ALEF
93
+ stringdef b1 '{U+FE8F}' // BEH
94
+ stringdef b2 '{U+FE90}' // BEH
95
+ stringdef b3 '{U+FE91}' // BEH
96
+ stringdef b4 '{U+FE92}' // BEH
97
+ stringdef t_1 '{U+FE93}' // TEH_MARBUTA
98
+ stringdef t_2 '{U+FE94}' // TEH_MARBUTA
99
+ stringdef t1 '{U+FE97}' // TEH
100
+ stringdef t2 '{U+FE98}' // TEH
101
+ stringdef t3 '{U+FE95}' // TEH
102
+ stringdef t4 '{U+FE96}' // TEH
103
+ stringdef th1 '{U+FE9B}' // THEH
104
+ stringdef th2 '{U+FE9C}' // THEH
105
+ stringdef th3 '{U+FE9A}' // THEH
106
+ stringdef th4 '{U+FE99}' // THEH
107
+ stringdef j1 '{U+FE9F}' // JEEM
108
+ stringdef j2 '{U+FEA0}' // JEEM
109
+ stringdef j3 '{U+FE9D}' // JEEM
110
+ stringdef j4 '{U+FE9E}' // JEEM
111
+ stringdef h1 '{U+FEA3}' // HAH
112
+ stringdef h2 '{U+FEA4}' // HAH
113
+ stringdef h3 '{U+FEA1}' // HAH
114
+ stringdef h4 '{U+FEA2}' // HAH
115
+ stringdef x1 '{U+FEA7}' // KHAH
116
+ stringdef x2 '{U+FEA8}' // KHAH
117
+ stringdef x3 '{U+FEA5}' // KHAH
118
+ stringdef x4 '{U+FEA6}' // KHAH
119
+ stringdef d1 '{U+FEA9}' // DAL
120
+ stringdef d2 '{U+FEAA}' // DAL
121
+ stringdef dz1 '{U+FEAB}' // THAL
122
+ stringdef dz2 '{U+FEAC}' // THAL
123
+ stringdef r1 '{U+FEAD}' // REH
124
+ stringdef r2 '{U+FEAE}' // REH
125
+ stringdef z1 '{U+FEAF}' // ZAIN
126
+ stringdef z2 '{U+FEB0}' // ZAIN
127
+ stringdef s1 '{U+FEB3}' // SEEN
128
+ stringdef s2 '{U+FEB4}' // SEEN
129
+ stringdef s3 '{U+FEB1}' // SEEN
130
+ stringdef s4 '{U+FEB2}' // SEEN
131
+ stringdef sh1 '{U+FEB7}' // SHEEN
132
+ stringdef sh2 '{U+FEB8}' // SHEEN
133
+ stringdef sh3 '{U+FEB5}' // SHEEN
134
+ stringdef sh4 '{U+FEB6}' // SHEEN
135
+ stringdef c1 '{U+FEBB}' // SAD
136
+ stringdef c2 '{U+FEBC}' // SAD
137
+ stringdef c3 '{U+FEB9}' // SAD
138
+ stringdef c4 '{U+FEBA}' // SAD
139
+ stringdef dh1 '{U+FEBF}' // DAD
140
+ stringdef dh2 '{U+FEC0}' // DAD
141
+ stringdef dh3 '{U+FEBD}' // DAD
142
+ stringdef dh4 '{U+FEBE}' // DAD
143
+ stringdef tt1 '{U+FEC3}' // TAH
144
+ stringdef tt2 '{U+FEC4}' // TAH
145
+ stringdef tt3 '{U+FEC1}' // TAH
146
+ stringdef tt4 '{U+FEC2}' // TAH
147
+ stringdef zh1 '{U+FEC7}' // ZAH
148
+ stringdef zh2 '{U+FEC8}' // ZAH
149
+ stringdef zh3 '{U+FEC5}' // ZAH
150
+ stringdef zh4 '{U+FEC6}' // ZAH
151
+ stringdef i1 '{U+FECB}' // AIN
152
+ stringdef i2 '{U+FECC}' // AIN
153
+ stringdef i3 '{U+FEC9}' // AIN
154
+ stringdef i4 '{U+FECA}' // AIN
155
+ stringdef gh1 '{U+FECF}' // GHAIN
156
+ stringdef gh2 '{U+FED0}' // GHAIN
157
+ stringdef gh3 '{U+FECD}' // GHAIN
158
+ stringdef gh4 '{U+FECE}' // GHAIN
159
+ stringdef f1 '{U+FED3}' // FEH
160
+ stringdef f2 '{U+FED4}' // FEH
161
+ stringdef f3 '{U+FED1}' // FEH
162
+ stringdef f4 '{U+FED2}' // FEH
163
+ stringdef q1 '{U+FED7}' // QAF
164
+ stringdef q2 '{U+FED8}' // QAF
165
+ stringdef q3 '{U+FED5}' // QAF
166
+ stringdef q4 '{U+FED6}' // QAF
167
+ stringdef k1 '{U+FEDB}' // KAF
168
+ stringdef k2 '{U+FEDC}' // KAF
169
+ stringdef k3 '{U+FED9}' // KAF
170
+ stringdef k4 '{U+FEDA}' // KAF
171
+ stringdef l1 '{U+FEDF}' // LAM
172
+ stringdef l2 '{U+FEE0}' // LAM
173
+ stringdef l3 '{U+FEDD}' // LAM
174
+ stringdef l4 '{U+FEDE}' // LAM
175
+ stringdef m1 '{U+FEE3}' // MEEM
176
+ stringdef m2 '{U+FEE4}' // MEEM
177
+ stringdef m3 '{U+FEE1}' // MEEM
178
+ stringdef m4 '{U+FEE2}' // MEEM
179
+ stringdef n1 '{U+FEE7}' // NOON
180
+ stringdef n2 '{U+FEE8}' // NOON
181
+ stringdef n3 '{U+FEE5}' // NOON
182
+ stringdef n4 '{U+FEE6}' // NOON
183
+ stringdef e1 '{U+FEEB}' // HEH
184
+ stringdef e2 '{U+FEEC}' // HEH
185
+ stringdef e3 '{U+FEE9}' // HEH
186
+ stringdef e4 '{U+FEEA}' // HEH
187
+ stringdef w1 '{U+FEED}' // WAW
188
+ stringdef w2 '{U+FEEE}' // WAW
189
+ stringdef a_1 '{U+FEEF}' // ALEF_MAKSURA
190
+ stringdef a_2 '{U+FEF0}' // ALEF_MAKSURA
191
+ stringdef y1 '{U+FEF3}' // YEH
192
+ stringdef y2 '{U+FEF4}' // YEH
193
+ stringdef y3 '{U+FEF1}' // YEH
194
+ stringdef y4 '{U+FEF2}' // YEH
195
+
196
+ // Lam-Alef ligatures; Normalize_pre expands each one into lam followed by the matching alef variant
197
+ stringdef la '{U+FEFB}' // LAM_ALEF
198
+ stringdef la2 '{U+FEFC}' // LAM_ALEF
199
+ stringdef lao '{U+FEF7}' // LAM_ALEF_HAMZA_ABOVE
200
+ stringdef lao2 '{U+FEF8}' // LAM_ALEF_HAMZA_ABOVE
201
+ stringdef lao_ '{U+FEF9}' // LAM_ALEF_HAMZA_BELOW
202
+ stringdef lao_2 '{U+FEFA}' // LAM_ALEF_HAMZA_BELOW
203
+ stringdef la~ '{U+FEF5}' // LAM_ALEF_MADDA_ABOVE
204
+ stringdef la~2 '{U+FEF6}' // LAM_ALEF_MADDA_ABOVE
205
+
206
+
207
+ booleans ( // flags shared by the routines below; all three are initialised at the start of stem
208
+ is_noun // set at the start of stem and by Checks1; unset by Prefix_Step4_Verb
209
+ is_verb // set at the start of stem and by Prefix_Step4_Verb; unset by Checks1
210
+ is_defined // unset at the start of stem; set by Checks1 when one of its prefixes matches
211
+ )
212
+
213
+ routines ( // every routine defined in this file except the external entry point
214
+ Prefix_Step1
215
+ Prefix_Step2
216
+ Prefix_Step3a_Noun
217
+ Prefix_Step3b_Noun
218
+ Prefix_Step3_Verb
219
+ Prefix_Step4_Verb
220
+
221
+ Suffix_All_alef_maqsura
222
+ Suffix_Noun_Step1a
223
+ Suffix_Noun_Step1b
224
+ Suffix_Noun_Step2a
225
+ Suffix_Noun_Step2b
226
+ Suffix_Noun_Step2c1
227
+ Suffix_Noun_Step2c2
228
+ Suffix_Noun_Step3
229
+ Suffix_Verb_Step1
230
+ Suffix_Verb_Step2a
231
+ Suffix_Verb_Step2b
232
+ Suffix_Verb_Step2c
233
+
234
+ Normalize_post
235
+ Normalize_pre
236
+
237
+ Checks1
238
+ )
239
+
240
+ externals ( stem ) // stem is the only routine declared external
241
+
242
+ groupings ( ) // no character groupings are declared
243
+
244
+
245
+ // Normalizations
246
+ define Normalize_pre as ( // pre-stemming cleanup: drop marks, fold digits, shaped forms and ligatures
247
+ do repeat ( // walk the word; each pass either rewrites a match at the cursor or steps forward
248
+ (
249
+ [substring] among (
250
+ '{aan}' '{uun}' '{iin}' '{aa}' '{uu}' '{ii}' '{oo}' '{~}'( delete ) // strip vocalization
251
+ '{_}' ( delete ) // strip kasheeda
252
+
253
+ // Arabic-Indic digits are rewritten as ASCII digits
254
+ '{0}' ( <- '0')
255
+ '{1}' ( <- '1')
256
+ '{2}' ( <- '2')
257
+ '{3}' ( <- '3')
258
+ '{4}' ( <- '4')
259
+ '{5}' ( <- '5')
260
+ '{6}' ( <- '6')
261
+ '{7}' ( <- '7')
262
+ '{8}' ( <- '8')
263
+ '{9}' ( <- '9')
264
+
265
+ // Shaped forms are replaced by their base letter
266
+ '{o1}' ( <- '{o}' ) // HAMZA
267
+ '{ao1}' '{ao2}' ( <- '{ao}' ) // ALEF_HAMZA_ABOVE
268
+ '{ao_1}' '{ao_2}' ( <- '{ao_}' ) // ALEF_HAMZA_BELOW
269
+ '{yo1}' '{yo2}' '{yo3}' '{yo4}' ( <- '{yo}' ) // YEH_HAMZA
270
+ '{a~1}' '{a~2}'( <- '{a~}' ) // ALEF_MADDA
271
+ '{wo1}' '{wo2}'( <- '{wo}' ) // WAW_HAMZA
272
+ '{a1}' '{a2}' ( <- '{a}' ) // ALEF
273
+ '{b1}' '{b2}' '{b3}' '{b4}' ( <- '{b}' ) // BEH
274
+ '{t_1}' '{t_2}' ( <- '{t_}' ) // TEH_MARBUTA
275
+ '{t1}' '{t2}' '{t3}' '{t4}' ( <- '{t}' ) // TEH
276
+ '{th1}' '{th2}' '{th3}' '{th4}' ( <- '{th}' ) // THEH
277
+ '{j1}' '{j2}' '{j3}' '{j4}'( <- '{j}' ) // JEEM
278
+ '{h1}' '{h2}' '{h3}' '{h4}' ( <- '{h}' ) // HAH
279
+ '{x1}' '{x2}' '{x3}' '{x4}'( <- '{x}' ) // KHAH
280
+ '{d1}' '{d2}' ( <- '{d}' ) // DAL
281
+ '{dz1}''{dz2}' ( <- '{dz}' ) // THAL
282
+ '{r1}' '{r2}'( <- '{r}' ) // REH
283
+ '{z1}' '{z2}' ( <- '{z}' ) // ZAIN
284
+ '{s1}' '{s2}' '{s3}' '{s4}'( <- '{s}' ) // SEEN
285
+ '{sh1}' '{sh2}' '{sh3}' '{sh4}' ( <- '{sh}' ) // SHEEN
286
+ '{c1}' '{c2}' '{c3}' '{c4}'( <- '{c}' ) // SAD
287
+ '{dh1}' '{dh2}' '{dh3}' '{dh4}'( <- '{dh}' ) // DAD
288
+ '{tt1}' '{tt2}' '{tt3}' '{tt4}' ( <- '{tt}' ) // TAH
289
+ '{zh1}' '{zh2}' '{zh3}' '{zh4}'( <- '{zh}' ) // ZAH
290
+ '{i1}' '{i2}' '{i3}' '{i4}'( <- '{i}' ) // AIN
291
+ '{gh1}' '{gh2}' '{gh3}' '{gh4}'( <- '{gh}' ) // GHAIN
292
+ '{f1}' '{f2}' '{f3}' '{f4}' ( <- '{f}' ) // FEH
293
+ '{q1}' '{q2}' '{q3}' '{q4}' ( <- '{q}' ) // QAF
294
+ '{k1}' '{k2}' '{k3}' '{k4}'( <- '{k}' ) // KAF
295
+ '{l1}' '{l2}' '{l3}' '{l4}'( <- '{l}' ) // LAM
296
+ '{m1}' '{m2}' '{m3}' '{m4}' ( <- '{m}' ) // MEEM
297
+ '{n1}' '{n2}' '{n3}' '{n4}'( <- '{n}' ) // NOON
298
+ '{e1}' '{e2}' '{e3}' '{e4}' ( <- '{e}' ) // HEH
299
+ '{w1}' '{w2}' ( <- '{w}' ) // WAW
300
+ '{a_1}' '{a_2}' ( <- '{a_}' ) // ALEF_MAKSURA
301
+ '{y1}' '{y2}' '{y3}' '{y4}' ( <- '{y}' ) // YEH
302
+
303
+ // Lam-Alef ligatures are expanded into two letters
304
+ '{la}' '{la2}' (<- '{l}{a}')
305
+ '{lao}' '{lao2}' (<- '{l}{ao}')
306
+ '{lao_}' '{lao_2}' (<- '{l}{ao_}')
307
+ '{la~}' '{la~2}' (<- '{l}{a~}')
308
+
309
+ )
310
+ )
311
+ or
312
+ next // nothing in the table matched at the cursor: advance one character
313
+ )
314
+ )
315
+
316
+ define Normalize_post as ( // post-stemming hamza normalization, in two passes
317
+
318
+ do (
319
+ // normalize last hamza: a hamza-carrying letter at the end of the word becomes a bare hamza
320
+ backwards ( // match from the end of the word
321
+ [substring] among (
322
+ '{ao}''{ao_}' '{a~}' ( <- '{o}')
323
+ '{wo}' ( <- '{o}')
324
+ '{yo}' ( <- '{o}')
325
+ )
326
+ )
327
+ )
328
+
329
+ do repeat ( // walk the word; each pass either rewrites a match at the cursor or steps forward
330
+ (
331
+ // normalize the other hamzas: each remaining hamza-carrying letter becomes its plain carrier letter
332
+ [substring] among (
333
+ '{ao}''{ao_}' '{a~}' ( <- '{a}')
334
+ '{wo}' ( <- '{w}')
335
+ '{yo}' ( <- '{y}')
336
+ )
337
+ )
338
+ or
339
+ next // nothing matched at the cursor: advance one character
340
+ )
341
+ )
342
+
343
+ // Checks
344
+ define Checks1 as ( // sets the flags from the word's leading letters; the word itself is not modified
345
+ [substring] among (
346
+ '{b}{a}{l}' '{k}{a}{l}' ($(len > 4) set is_noun unset is_verb set is_defined) // three-letter prefix, word longer than 4
347
+ '{l}{l}' '{a}{l}' ($(len > 3) set is_noun unset is_verb set is_defined) // two-letter prefix, word longer than 3
348
+ )
349
+ )
350
+
351
+
352
+ //prefixes
353
+ define Prefix_Step1 as ( // collapses a leading alef-with-hamza-above pair into a single letter (word longer than 3)
354
+ [substring] among (
355
+ '{ao}{ao}' ($(len > 3) <- '{ao}' )
356
+ '{ao}{a~}' ($(len > 3) <- '{a~}' )
357
+ '{ao}{wo}' ($(len > 3) <- '{ao}' ) // unlike the other entries, this one keeps the first letter of the pair
358
+ '{ao}{a}' ($(len > 3) <- '{a}' )
359
+ '{ao}{ao_}' ($(len > 3) <- '{ao_}' )
360
+ // '{ao}' ($(len > 3) delete) //rare case
361
+ )
362
+ )
363
+
364
+ define Prefix_Step2 as ( // removes a leading feh or waw
365
+ [substring] among (
366
+ '{f}' '{w}' ($(len > 3) not '{a}' delete) // only when the word is longer than 3 and the next letter is not alef
367
+ )
368
+ )
369
+
370
+ define Prefix_Step3a_Noun as ( // it is noun and defined: removes the same prefixes that Checks1 tests for
371
+ [substring] among (
372
+ '{b}{a}{l}' '{k}{a}{l}' ($(len > 5) delete) // three-letter prefix, word longer than 5
373
+ '{l}{l}' '{a}{l}' ($(len > 4) delete) // two-letter prefix, word longer than 4
374
+ )
375
+ )
376
+
377
+ define Prefix_Step3b_Noun as ( // probably noun and defined
378
+ [substring] among (
379
+ '{b}{a}' ( ) // exception - not a valid verb prefix so can just succeed here
380
+ '{b}' ($(len > 3) delete) // remove a single leading beh when the word is longer than 3
381
+ // '{k}' '{l}' ($(len > 3) delete) // BUG: causes confusion
382
+ '{b}{b}' ($(len > 3) <- '{b}' ) // doubled beh: keep one
383
+ '{k}{k}' ($(len > 3) <- '{k}' ) // doubled kaf: keep one
384
+ )
385
+
386
+ )
387
+
388
+ define Prefix_Step3_Verb as ( // removes a leading seen that is followed by yeh, teh, noon or alef-with-hamza-above
389
+ [substring] among (
390
+ //'{s}' ($(len > 4) delete)// BUG: causes confusion
391
+ '{s}{y}' ($(len > 4) <- '{y}' ) // each entry keeps the second letter and requires a word longer than 4
392
+ '{s}{t}' ($(len > 4) <- '{t}')
393
+ '{s}{n}' ($(len > 4) <- '{n}')
394
+ '{s}{ao}' ($(len > 4) <- '{ao}')
395
+ )
396
+ )
397
+
398
+ define Prefix_Step4_Verb as ( // rewrites a leading yeh/noon/teh + seen + teh as alef + seen + teh
399
+ [substring] among (
400
+ '{y}{s}{t}' '{n}{s}{t}' '{t}{s}{t}' ($(len > 4) set is_verb unset is_noun <- '{a}{s}{t}' ) // also marks the word as verb, not noun
401
+ )
402
+ )
403
+
404
+ // suffixes
405
+ backwardmode (
406
+
407
+ define Suffix_Noun_Step1a as ( // deletes one listed ending; the minimum word length grows with the ending's length
408
+ [substring] among (
409
+ '{y}' '{k}' '{e}' ($(len >= 4) delete) // one-letter ending
410
+ '{n}{a}' '{k}{m}' '{e}{a}' '{e}{n}' '{e}{m}' ($(len >= 5) delete) // two-letter ending
411
+ '{k}{m}{a}' '{e}{m}{a}' ($(len >= 6) delete) // three-letter ending
412
+ )
413
+ )
414
+ define Suffix_Noun_Step1b as ( // deletes a final noon
415
+ [substring] among (
416
+ '{n}' ($(len > 5) delete) // only when the word is longer than 5
417
+ )
418
+ )
419
+
420
+ define Suffix_Noun_Step2a as ( // deletes a final alef, yeh or waw
421
+ [substring] among (
422
+ '{a}' '{y}' '{w}' ($(len > 4) delete) // only when the word is longer than 4
423
+ )
424
+ )
425
+
426
+ define Suffix_Noun_Step2b as ( // deletes the ending alef + teh
427
+ [substring] among (
428
+ '{a}{t}' ($(len >= 5) delete) // only when the word has at least 5 characters
429
+ )
430
+ )
431
+
432
+ define Suffix_Noun_Step2c1 as ( // deletes a final teh
433
+ [substring] among (
434
+ '{t}' ($(len >= 4) delete) // only when the word has at least 4 characters
435
+ )
436
+ )
437
+ define Suffix_Noun_Step2c2 as ( // feminine t_: deletes a final teh marbuta
438
+ [substring] among (
439
+ '{t_}' ($(len >= 4) delete) // only when the word has at least 4 characters
440
+ )
441
+ )
442
+ define Suffix_Noun_Step3 as ( // ya' nisbiya: deletes a final yeh
443
+ [substring] among (
444
+ '{y}' ($(len >= 3) delete) // only when the word has at least 3 characters
445
+ )
446
+ )
447
+
448
+ define Suffix_Verb_Step1 as ( // deletes one listed ending; the minimum word length grows with the ending's length
449
+ [substring] among (
450
+ '{e}' '{k}' ($(len >= 4) delete) // one-letter ending
451
+ '{n}{y}' '{n}{a}' '{e}{a}' '{e}{m}' '{e}{n}' '{k}{m}' '{k}{n}' ($(len >= 5) delete) // two-letter ending
452
+ '{e}{m}{a}' '{k}{m}{a}' '{k}{m}{w}'($(len >= 6) delete) // three-letter ending
453
+ )
454
+ )
455
+ define Suffix_Verb_Step2a as ( // deletes one listed ending, subject to the length test on its line
456
+ [substring] among (
457
+ '{t}' ($(len >= 4) delete)
458
+ '{a}' '{n}' '{y}' ($(len >= 4) delete)
459
+ '{n}{a}' '{t}{a}' '{t}{n}' ($(len >= 5) delete)// past
460
+ '{a}{n}' '{w}{n}' '{y}{n}' ($(len > 5) delete) // present (note: strict > 5, unlike the >= tests on the other lines)
461
+ '{t}{m}{a}' ($(len >= 6) delete)
462
+ )
463
+ )
464
+
465
+ define Suffix_Verb_Step2b as ( // deletes the ending waw + alef or teh + meem
466
+ [substring] among (
467
+ '{w}{a}' '{t}{m}' ($(len >= 5) delete) // only when the word has at least 5 characters
468
+ )
469
+ )
470
+
471
+
472
+ define Suffix_Verb_Step2c as ( // deletes a final waw, or the ending teh + meem + waw
473
+ [substring] among (
474
+ '{w}' ($(len >= 4) delete) // one-letter ending: word has at least 4 characters
475
+ '{t}{m}{w}' ($(len >= 6) delete) // three-letter ending: word has at least 6 characters
476
+ )
477
+ )
478
+
479
+ define Suffix_All_alef_maqsura as ( // replaces a final alef maksura with yeh; no length test is applied
480
+ [substring] among (
481
+ '{a_}' ( <- '{y}' ) // spelling error
482
+ // '{a_}' ( delete ) // if noun > 3
483
+ // '{a_}' ( <- '{a}') // if verb
484
+ )
485
+ )
486
+ )
487
+
488
+ define stem as ( // entry point: flag setup, classification, normalization, suffix removal, prefix removal, post-normalization
489
+ // set initial values: the word starts as a possible noun and a possible verb, not yet marked defined
490
+ set is_noun
491
+ set is_verb
492
+ unset is_defined
493
+
494
+ // guess type and properties (runs before Normalize_pre, so it tests the un-normalized input)
495
+ do Checks1
496
+
497
+ // normalization pre-stemming
498
+ do Normalize_pre
499
+
500
+
501
+ backwards ( // suffix routines match from the end of the word
502
+
503
+ do ( // exactly one of the three alternatives below (verb, noun, alef maqsura) is taken, tried in that order
504
+ //Suffixes for verbs
505
+ (
506
+ is_verb // skipped when Checks1 has cleared is_verb
507
+ (
508
+ (
509
+ (atleast 1 Suffix_Verb_Step1) // Step1 must apply at least once, and is repeated while it keeps matching
510
+ ( Suffix_Verb_Step2a or Suffix_Verb_Step2c or next) // optional follow-up; 'next' lets this group succeed without a match
511
+ )
512
+ or Suffix_Verb_Step2b
513
+ or Suffix_Verb_Step2a
514
+ )
515
+ )
516
+ //Suffixes for nouns
517
+ or (
518
+ is_noun
519
+ (
520
+
521
+ try ( // optional: the first alternative that succeeds is used
522
+ Suffix_Noun_Step2c2
523
+ or (not is_defined Suffix_Noun_Step1a ( // Step1a is only tried when is_defined is not set
524
+ Suffix_Noun_Step2a
525
+ or Suffix_Noun_Step2b
526
+ or Suffix_Noun_Step2c1
527
+ or next))
528
+ or (Suffix_Noun_Step1b ( // unlike the Step1a group, there is no 'or next' here: a Step2 routine must follow
529
+ Suffix_Noun_Step2a
530
+ or Suffix_Noun_Step2b
531
+ or Suffix_Noun_Step2c1))
532
+ or (not is_defined Suffix_Noun_Step2a)
533
+ or (Suffix_Noun_Step2b)
534
+ )
535
+ Suffix_Noun_Step3 // required: if this fails the noun alternative fails and the alef maqsura alternative is tried
536
+ )
537
+
538
+ )
539
+
540
+ // Suffixes for alef maqsura
541
+ or Suffix_All_alef_maqsura
542
+ )
543
+ )
544
+
545
+ //Prefixes
546
+ do (
547
+ try Prefix_Step1 // optional
548
+ try Prefix_Step2 // optional
549
+ ( Prefix_Step3a_Noun // tried regardless of the flags
550
+ or (is_noun Prefix_Step3b_Noun)
551
+ or (is_verb try Prefix_Step3_Verb Prefix_Step4_Verb) // Step3_Verb is optional, Step4_Verb is required
552
+ )
553
+ )
554
+
555
+ // normalization post-stemming
556
+ do Normalize_post
557
+
558
+ )