mittens 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE.txt +30 -0
  5. data/README.md +62 -0
  6. data/Rakefile +21 -0
  7. data/ext/mittens/ext.c +96 -0
  8. data/ext/mittens/extconf.rb +12 -0
  9. data/lib/mittens/version.rb +3 -0
  10. data/lib/mittens.rb +7 -0
  11. data/mittens.gemspec +22 -0
  12. data/vendor/snowball/.gitignore +26 -0
  13. data/vendor/snowball/.travis.yml +112 -0
  14. data/vendor/snowball/AUTHORS +27 -0
  15. data/vendor/snowball/CONTRIBUTING.rst +216 -0
  16. data/vendor/snowball/COPYING +29 -0
  17. data/vendor/snowball/GNUmakefile +742 -0
  18. data/vendor/snowball/NEWS +754 -0
  19. data/vendor/snowball/README.rst +37 -0
  20. data/vendor/snowball/ada/README.md +74 -0
  21. data/vendor/snowball/ada/generate/generate.adb +83 -0
  22. data/vendor/snowball/ada/generate.gpr +21 -0
  23. data/vendor/snowball/ada/src/stemmer.adb +620 -0
  24. data/vendor/snowball/ada/src/stemmer.ads +219 -0
  25. data/vendor/snowball/ada/src/stemwords.adb +70 -0
  26. data/vendor/snowball/ada/stemmer_config.gpr +83 -0
  27. data/vendor/snowball/ada/stemwords.gpr +21 -0
  28. data/vendor/snowball/algorithms/arabic.sbl +558 -0
  29. data/vendor/snowball/algorithms/armenian.sbl +301 -0
  30. data/vendor/snowball/algorithms/basque.sbl +149 -0
  31. data/vendor/snowball/algorithms/catalan.sbl +202 -0
  32. data/vendor/snowball/algorithms/danish.sbl +93 -0
  33. data/vendor/snowball/algorithms/dutch.sbl +164 -0
  34. data/vendor/snowball/algorithms/english.sbl +229 -0
  35. data/vendor/snowball/algorithms/finnish.sbl +197 -0
  36. data/vendor/snowball/algorithms/french.sbl +254 -0
  37. data/vendor/snowball/algorithms/german.sbl +139 -0
  38. data/vendor/snowball/algorithms/german2.sbl +145 -0
  39. data/vendor/snowball/algorithms/greek.sbl +701 -0
  40. data/vendor/snowball/algorithms/hindi.sbl +323 -0
  41. data/vendor/snowball/algorithms/hungarian.sbl +241 -0
  42. data/vendor/snowball/algorithms/indonesian.sbl +192 -0
  43. data/vendor/snowball/algorithms/irish.sbl +149 -0
  44. data/vendor/snowball/algorithms/italian.sbl +202 -0
  45. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
  46. data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
  47. data/vendor/snowball/algorithms/lovins.sbl +208 -0
  48. data/vendor/snowball/algorithms/nepali.sbl +92 -0
  49. data/vendor/snowball/algorithms/norwegian.sbl +80 -0
  50. data/vendor/snowball/algorithms/porter.sbl +139 -0
  51. data/vendor/snowball/algorithms/portuguese.sbl +218 -0
  52. data/vendor/snowball/algorithms/romanian.sbl +236 -0
  53. data/vendor/snowball/algorithms/russian.sbl +221 -0
  54. data/vendor/snowball/algorithms/serbian.sbl +2379 -0
  55. data/vendor/snowball/algorithms/spanish.sbl +230 -0
  56. data/vendor/snowball/algorithms/swedish.sbl +72 -0
  57. data/vendor/snowball/algorithms/tamil.sbl +405 -0
  58. data/vendor/snowball/algorithms/turkish.sbl +470 -0
  59. data/vendor/snowball/algorithms/yiddish.sbl +460 -0
  60. data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
  61. data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
  62. data/vendor/snowball/charsets/cp850.sbl +130 -0
  63. data/vendor/snowball/compiler/analyser.c +1547 -0
  64. data/vendor/snowball/compiler/driver.c +615 -0
  65. data/vendor/snowball/compiler/generator.c +1748 -0
  66. data/vendor/snowball/compiler/generator_ada.c +1702 -0
  67. data/vendor/snowball/compiler/generator_csharp.c +1322 -0
  68. data/vendor/snowball/compiler/generator_go.c +1278 -0
  69. data/vendor/snowball/compiler/generator_java.c +1313 -0
  70. data/vendor/snowball/compiler/generator_js.c +1316 -0
  71. data/vendor/snowball/compiler/generator_pascal.c +1387 -0
  72. data/vendor/snowball/compiler/generator_python.c +1337 -0
  73. data/vendor/snowball/compiler/generator_rust.c +1295 -0
  74. data/vendor/snowball/compiler/header.h +418 -0
  75. data/vendor/snowball/compiler/space.c +286 -0
  76. data/vendor/snowball/compiler/syswords.h +86 -0
  77. data/vendor/snowball/compiler/syswords2.h +13 -0
  78. data/vendor/snowball/compiler/tokeniser.c +567 -0
  79. data/vendor/snowball/csharp/.gitignore +8 -0
  80. data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
  81. data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
  82. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
  83. data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
  84. data/vendor/snowball/csharp/Stemwords/App.config +6 -0
  85. data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
  86. data/vendor/snowball/doc/TODO +12 -0
  87. data/vendor/snowball/doc/libstemmer_c_README +148 -0
  88. data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
  89. data/vendor/snowball/doc/libstemmer_java_README +67 -0
  90. data/vendor/snowball/doc/libstemmer_js_README +48 -0
  91. data/vendor/snowball/doc/libstemmer_python_README +113 -0
  92. data/vendor/snowball/examples/stemwords.c +204 -0
  93. data/vendor/snowball/go/README.md +55 -0
  94. data/vendor/snowball/go/among.go +16 -0
  95. data/vendor/snowball/go/env.go +403 -0
  96. data/vendor/snowball/go/stemwords/generate.go +68 -0
  97. data/vendor/snowball/go/stemwords/main.go +68 -0
  98. data/vendor/snowball/go/util.go +34 -0
  99. data/vendor/snowball/iconv.py +50 -0
  100. data/vendor/snowball/include/libstemmer.h +78 -0
  101. data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
  102. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
  103. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
  104. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
  105. data/vendor/snowball/javascript/base-stemmer.js +294 -0
  106. data/vendor/snowball/javascript/stemwords.js +106 -0
  107. data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
  108. data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
  109. data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
  110. data/vendor/snowball/libstemmer/modules.txt +63 -0
  111. data/vendor/snowball/libstemmer/test.c +34 -0
  112. data/vendor/snowball/pascal/.gitignore +4 -0
  113. data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
  114. data/vendor/snowball/pascal/generate.pl +23 -0
  115. data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
  116. data/vendor/snowball/python/MANIFEST.in +7 -0
  117. data/vendor/snowball/python/create_init.py +54 -0
  118. data/vendor/snowball/python/setup.cfg +6 -0
  119. data/vendor/snowball/python/setup.py +81 -0
  120. data/vendor/snowball/python/snowballstemmer/among.py +13 -0
  121. data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
  122. data/vendor/snowball/python/stemwords.py +101 -0
  123. data/vendor/snowball/python/testapp.py +28 -0
  124. data/vendor/snowball/runtime/api.c +58 -0
  125. data/vendor/snowball/runtime/api.h +32 -0
  126. data/vendor/snowball/runtime/header.h +61 -0
  127. data/vendor/snowball/runtime/utilities.c +513 -0
  128. data/vendor/snowball/rust/Cargo.toml +7 -0
  129. data/vendor/snowball/rust/build.rs +55 -0
  130. data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
  131. data/vendor/snowball/rust/src/main.rs +102 -0
  132. data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
  133. data/vendor/snowball/rust/src/snowball/among.rs +6 -0
  134. data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
  135. data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
  136. data/vendor/snowball/tests/stemtest.c +95 -0
  137. metadata +178 -0
@@ -0,0 +1,558 @@
1
+ /*
2
+ * Authors:
3
+ * - Assem Chelli, < assem [dot] ch [at] gmail >
4
+ * - Abdelkrim Aries <ab [underscore] aries [at] esi [dot] dz>
5
+ *
6
+ */
7
+
8
+ stringescapes { }
9
+
10
+ /* the Arabic letters in Unicode */
11
+ // Hamza
12
+ stringdef o '{U+0621}' // Hamza
13
+ stringdef ao '{U+0623}' // Hamza above Alef
14
+ stringdef ao_ '{U+0625}' // Hamza below Alef
15
+ stringdef a~ '{U+0622}' // Alef madda
16
+ stringdef wo '{U+0624}' // Hamza above waw
17
+ stringdef yo '{U+0626}' // Hamza above yeh
18
+
19
+ // Letters
20
+ stringdef a '{U+0627}' // Alef
21
+ stringdef a_ '{U+0649}' // Alef Maksura
22
+ stringdef b '{U+0628}' // Beh
23
+ stringdef t_ '{U+0629}' // Teh_Marbuta
24
+ stringdef t '{U+062A}' // Teh
25
+ stringdef th '{U+062B}' // Theh
26
+ stringdef j '{U+062C}' // Jeem
27
+ stringdef h '{U+062D}' // Hah
28
+ stringdef x '{U+062E}' // Khah
29
+ stringdef d '{U+062F}' // Dal
30
+ stringdef dz '{U+0630}' // Thal
31
+ stringdef r '{U+0631}' // Reh
32
+ stringdef z '{U+0632}' // Zain
33
+ stringdef s '{U+0633}' // Seen
34
+ stringdef sh '{U+0634}' // Sheen
35
+ stringdef c '{U+0635}' // Sad
36
+ stringdef dh '{U+0636}' // Dad
37
+ stringdef tt '{U+0637}' // Tah
38
+ stringdef zh '{U+0638}' // Zah
39
+ stringdef i '{U+0639}' // Ain
40
+ stringdef gh '{U+063A}' // Ghain
41
+ stringdef f '{U+0641}' // Feh
42
+ stringdef q '{U+0642}' // Qaf
43
+ stringdef k '{U+0643}' // Kaf
44
+ stringdef l '{U+0644}' // Lam
45
+ stringdef m '{U+0645}' // Meem
46
+ stringdef n '{U+0646}' // Noon
47
+ stringdef e '{U+0647}' // Heh
48
+ stringdef w '{U+0648}' // Waw
49
+ stringdef y '{U+064A}' // Yeh
50
+
51
+ // Diacritics
52
+ stringdef aan '{U+064B}' // FatHatan
53
+ stringdef uun '{U+064C}' // Dammatan
54
+ stringdef iin '{U+064D}' // Kasratan
55
+ stringdef aa '{U+064E}' // FatHa
56
+ stringdef uu '{U+064F}' // Damma
57
+ stringdef ii '{U+0650}' // Kasra
58
+ stringdef oo '{U+0652}' // Sukun
59
+ stringdef ~ '{U+0651}' // Shadda
60
+
61
+ // Arabic-Indic digits (U+0660..U+0669)
62
+ stringdef 0 '{U+0660}'
63
+ stringdef 1 '{U+0661}'
64
+ stringdef 2 '{U+0662}'
65
+ stringdef 3 '{U+0663}'
66
+ stringdef 4 '{U+0664}'
67
+ stringdef 5 '{U+0665}'
68
+ stringdef 6 '{U+0666}'
69
+ stringdef 7 '{U+0667}'
70
+ stringdef 8 '{U+0668}'
71
+ stringdef 9 '{U+0669}'
72
+
73
+
74
+ // Kasheeda
75
+ stringdef _ '{U+0640}' // Kasheeda, Tatweel
76
+
77
+ // Shaped forms
78
+ stringdef o1 '{U+FE80}' // HAMZA
79
+ stringdef ao1 '{U+FE83}' // ALEF_HAMZA_ABOVE
80
+ stringdef ao2 '{U+FE84}' // ALEF_HAMZA_ABOVE
81
+ stringdef ao_1 '{U+FE87}' // ALEF_HAMZA_BELOW
82
+ stringdef ao_2 '{U+FE88}' // ALEF_HAMZA_BELOW
83
+ stringdef yo1 '{U+FE8B}' // YEH_HAMZA
84
+ stringdef yo2 '{U+FE8C}' // YEH_HAMZA
85
+ stringdef yo3 '{U+FE89}' // YEH_HAMZA
86
+ stringdef yo4 '{U+FE8A}' // YEH_HAMZA
87
+ stringdef a~1 '{U+FE81}' // ALEF_MADDA
88
+ stringdef a~2 '{U+FE82}' // ALEF_MADDA
89
+ stringdef wo1 '{U+FE85}' // WAW_HAMZA
90
+ stringdef wo2 '{U+FE86}' // WAW_HAMZA
91
+ stringdef a1 '{U+FE8D}' // ALEF
92
+ stringdef a2 '{U+FE8E}' // ALEF
93
+ stringdef b1 '{U+FE8F}' // BEH
94
+ stringdef b2 '{U+FE90}' // BEH
95
+ stringdef b3 '{U+FE91}' // BEH
96
+ stringdef b4 '{U+FE92}' // BEH
97
+ stringdef t_1 '{U+FE93}' // TEH_MARBUTA
98
+ stringdef t_2 '{U+FE94}' // TEH_MARBUTA
99
+ stringdef t1 '{U+FE97}' // TEH
100
+ stringdef t2 '{U+FE98}' // TEH
101
+ stringdef t3 '{U+FE95}' // TEH
102
+ stringdef t4 '{U+FE96}' // TEH
103
+ stringdef th1 '{U+FE9B}' // THEH
104
+ stringdef th2 '{U+FE9C}' // THEH
105
+ stringdef th3 '{U+FE9A}' // THEH
106
+ stringdef th4 '{U+FE99}' // THEH
107
+ stringdef j1 '{U+FE9F}' // JEEM
108
+ stringdef j2 '{U+FEA0}' // JEEM
109
+ stringdef j3 '{U+FE9D}' // JEEM
110
+ stringdef j4 '{U+FE9E}' // JEEM
111
+ stringdef h1 '{U+FEA3}' // HAH
112
+ stringdef h2 '{U+FEA4}' // HAH
113
+ stringdef h3 '{U+FEA1}' // HAH
114
+ stringdef h4 '{U+FEA2}' // HAH
115
+ stringdef x1 '{U+FEA7}' // KHAH
116
+ stringdef x2 '{U+FEA8}' // KHAH
117
+ stringdef x3 '{U+FEA5}' // KHAH
118
+ stringdef x4 '{U+FEA6}' // KHAH
119
+ stringdef d1 '{U+FEA9}' // DAL
120
+ stringdef d2 '{U+FEAA}' // DAL
121
+ stringdef dz1 '{U+FEAB}' // THAL
122
+ stringdef dz2 '{U+FEAC}' // THAL
123
+ stringdef r1 '{U+FEAD}' // REH
124
+ stringdef r2 '{U+FEAE}' // REH
125
+ stringdef z1 '{U+FEAF}' // ZAIN
126
+ stringdef z2 '{U+FEB0}' // ZAIN
127
+ stringdef s1 '{U+FEB3}' // SEEN
128
+ stringdef s2 '{U+FEB4}' // SEEN
129
+ stringdef s3 '{U+FEB1}' // SEEN
130
+ stringdef s4 '{U+FEB2}' // SEEN
131
+ stringdef sh1 '{U+FEB7}' // SHEEN
132
+ stringdef sh2 '{U+FEB8}' // SHEEN
133
+ stringdef sh3 '{U+FEB5}' // SHEEN
134
+ stringdef sh4 '{U+FEB6}' // SHEEN
135
+ stringdef c1 '{U+FEBB}' // SAD
136
+ stringdef c2 '{U+FEBC}' // SAD
137
+ stringdef c3 '{U+FEB9}' // SAD
138
+ stringdef c4 '{U+FEBA}' // SAD
139
+ stringdef dh1 '{U+FEBF}' // DAD
140
+ stringdef dh2 '{U+FEC0}' // DAD
141
+ stringdef dh3 '{U+FEBD}' // DAD
142
+ stringdef dh4 '{U+FEBE}' // DAD
143
+ stringdef tt1 '{U+FEC3}' // TAH
144
+ stringdef tt2 '{U+FEC4}' // TAH
145
+ stringdef tt3 '{U+FEC1}' // TAH
146
+ stringdef tt4 '{U+FEC2}' // TAH
147
+ stringdef zh1 '{U+FEC7}' // ZAH
148
+ stringdef zh2 '{U+FEC8}' // ZAH
149
+ stringdef zh3 '{U+FEC5}' // ZAH
150
+ stringdef zh4 '{U+FEC6}' // ZAH
151
+ stringdef i1 '{U+FECB}' // AIN
152
+ stringdef i2 '{U+FECC}' // AIN
153
+ stringdef i3 '{U+FEC9}' // AIN
154
+ stringdef i4 '{U+FECA}' // AIN
155
+ stringdef gh1 '{U+FECF}' // GHAIN
156
+ stringdef gh2 '{U+FED0}' // GHAIN
157
+ stringdef gh3 '{U+FECD}' // GHAIN
158
+ stringdef gh4 '{U+FECE}' // GHAIN
159
+ stringdef f1 '{U+FED3}' // FEH
160
+ stringdef f2 '{U+FED4}' // FEH
161
+ stringdef f3 '{U+FED1}' // FEH
162
+ stringdef f4 '{U+FED2}' // FEH
163
+ stringdef q1 '{U+FED7}' // QAF
164
+ stringdef q2 '{U+FED8}' // QAF
165
+ stringdef q3 '{U+FED5}' // QAF
166
+ stringdef q4 '{U+FED6}' // QAF
167
+ stringdef k1 '{U+FEDB}' // KAF
168
+ stringdef k2 '{U+FEDC}' // KAF
169
+ stringdef k3 '{U+FED9}' // KAF
170
+ stringdef k4 '{U+FEDA}' // KAF
171
+ stringdef l1 '{U+FEDF}' // LAM
172
+ stringdef l2 '{U+FEE0}' // LAM
173
+ stringdef l3 '{U+FEDD}' // LAM
174
+ stringdef l4 '{U+FEDE}' // LAM
175
+ stringdef m1 '{U+FEE3}' // MEEM
176
+ stringdef m2 '{U+FEE4}' // MEEM
177
+ stringdef m3 '{U+FEE1}' // MEEM
178
+ stringdef m4 '{U+FEE2}' // MEEM
179
+ stringdef n1 '{U+FEE7}' // NOON
180
+ stringdef n2 '{U+FEE8}' // NOON
181
+ stringdef n3 '{U+FEE5}' // NOON
182
+ stringdef n4 '{U+FEE6}' // NOON
183
+ stringdef e1 '{U+FEEB}' // HEH
184
+ stringdef e2 '{U+FEEC}' // HEH
185
+ stringdef e3 '{U+FEE9}' // HEH
186
+ stringdef e4 '{U+FEEA}' // HEH
187
+ stringdef w1 '{U+FEED}' // WAW
188
+ stringdef w2 '{U+FEEE}' // WAW
189
+ stringdef a_1 '{U+FEEF}' // ALEF_MAKSURA
190
+ stringdef a_2 '{U+FEF0}' // ALEF_MAKSURA
191
+ stringdef y1 '{U+FEF3}' // YEH
192
+ stringdef y2 '{U+FEF4}' // YEH
193
+ stringdef y3 '{U+FEF1}' // YEH
194
+ stringdef y4 '{U+FEF2}' // YEH
195
+
196
+ // Ligatures Lam-Alef
197
+ stringdef la '{U+FEFB}' // LAM_ALEF
198
+ stringdef la2 '{U+FEFC}' // LAM_ALEF
199
+ stringdef lao '{U+FEF7}' // LAM_ALEF_HAMZA_ABOVE
200
+ stringdef lao2 '{U+FEF8}' // LAM_ALEF_HAMZA_ABOVE
201
+ stringdef lao_ '{U+FEF9}' // LAM_ALEF_HAMZA_BELOW
202
+ stringdef lao_2 '{U+FEFA}' // LAM_ALEF_HAMZA_BELOW
203
+ stringdef la~ '{U+FEF5}' // LAM_ALEF_MADDA_ABOVE
204
+ stringdef la~2 '{U+FEF6}' // LAM_ALEF_MADDA_ABOVE
205
+
206
+
207
+ booleans ( // flags shared between Checks1, the prefix steps and stem
208
+ is_noun // set at the start of stem; unset by Prefix_Step4_Verb
209
+ is_verb // set at the start of stem; unset by Checks1
210
+ is_defined // unset at the start of stem; set by Checks1 on a '{b}{a}{l}' / '{k}{a}{l}' / '{l}{l}' / '{a}{l}' prefix
211
+ )
212
+
213
+ routines ( // forward declarations of every internal routine defined below
214
+ Prefix_Step1 // prefix routines (run in forward mode)
215
+ Prefix_Step2
216
+ Prefix_Step3a_Noun
217
+ Prefix_Step3b_Noun
218
+ Prefix_Step3_Verb
219
+ Prefix_Step4_Verb
220
+
221
+ Suffix_All_alef_maqsura // suffix routines (defined inside backwardmode)
222
+ Suffix_Noun_Step1a
223
+ Suffix_Noun_Step1b
224
+ Suffix_Noun_Step2a
225
+ Suffix_Noun_Step2b
226
+ Suffix_Noun_Step2c1
227
+ Suffix_Noun_Step2c2
228
+ Suffix_Noun_Step3
229
+ Suffix_Verb_Step1
230
+ Suffix_Verb_Step2a
231
+ Suffix_Verb_Step2b
232
+ Suffix_Verb_Step2c
233
+
234
+ Normalize_post // normalization routines
235
+ Normalize_pre
236
+
237
+ Checks1 // sets the word-class flags from the leading letters
238
+ )
239
+
240
+ externals ( stem ) // stem is the only routine declared as external
241
+
242
+ groupings ( ) // no character groupings are declared
243
+
244
+
245
+ // Normalizations
246
+ define Normalize_pre as ( // pre-stemming normalization: scans the word, rewriting or deleting matched characters
247
+ do repeat ( // at each position: apply a matching among entry, otherwise step one character (next); stops when neither succeeds
248
+ (
249
+ [substring] among (
250
+ '{aan}' '{uun}' '{iin}' '{aa}' '{uu}' '{ii}' '{oo}' '{~}'( delete ) // strip vocalization (diacritics)
251
+ '{_}' ( delete ) // strip kasheeda (tatweel)
252
+
253
+ // Arabic-Indic digits (U+0660..U+0669) -> ASCII digits
254
+ '{0}' ( <- '0')
255
+ '{1}' ( <- '1')
256
+ '{2}' ( <- '2')
257
+ '{3}' ( <- '3')
258
+ '{4}' ( <- '4')
259
+ '{5}' ( <- '5')
260
+ '{6}' ( <- '6')
261
+ '{7}' ( <- '7')
262
+ '{8}' ( <- '8')
263
+ '{9}' ( <- '9')
264
+
265
+ // Shaped forms (U+FE80..U+FEF4 stringdefs) -> the corresponding base letter
266
+ '{o1}' ( <- '{o}' ) // HAMZA
267
+ '{ao1}' '{ao2}' ( <- '{ao}' ) // ALEF_HAMZA_ABOVE
268
+ '{ao_1}' '{ao_2}' ( <- '{ao_}' ) // ALEF_HAMZA_BELOW
269
+ '{yo1}' '{yo2}' '{yo3}' '{yo4}' ( <- '{yo}' ) // YEH_HAMZA
270
+ '{a~1}' '{a~2}'( <- '{a~}' ) // ALEF_MADDA
271
+ '{wo1}' '{wo2}'( <- '{wo}' ) // WAW_HAMZA
272
+ '{a1}' '{a2}' ( <- '{a}' ) // ALEF
273
+ '{b1}' '{b2}' '{b3}' '{b4}' ( <- '{b}' ) // BEH
274
+ '{t_1}' '{t_2}' ( <- '{t_}' ) // TEH_MARBUTA
275
+ '{t1}' '{t2}' '{t3}' '{t4}' ( <- '{t}' ) // TEH
276
+ '{th1}' '{th2}' '{th3}' '{th4}' ( <- '{th}' ) // THEH
277
+ '{j1}' '{j2}' '{j3}' '{j4}'( <- '{j}' ) // JEEM
278
+ '{h1}' '{h2}' '{h3}' '{h4}' ( <- '{h}' ) // HAH
279
+ '{x1}' '{x2}' '{x3}' '{x4}'( <- '{x}' ) // KHAH
280
+ '{d1}' '{d2}' ( <- '{d}' ) // DAL
281
+ '{dz1}''{dz2}' ( <- '{dz}' ) // THAL
282
+ '{r1}' '{r2}'( <- '{r}' ) // REH
283
+ '{z1}' '{z2}' ( <- '{z}' ) // ZAIN
284
+ '{s1}' '{s2}' '{s3}' '{s4}'( <- '{s}' ) // SEEN
285
+ '{sh1}' '{sh2}' '{sh3}' '{sh4}' ( <- '{sh}' ) // SHEEN
286
+ '{c1}' '{c2}' '{c3}' '{c4}'( <- '{c}' ) // SAD
287
+ '{dh1}' '{dh2}' '{dh3}' '{dh4}'( <- '{dh}' ) // DAD
288
+ '{tt1}' '{tt2}' '{tt3}' '{tt4}' ( <- '{tt}' ) // TAH
289
+ '{zh1}' '{zh2}' '{zh3}' '{zh4}'( <- '{zh}' ) // ZAH
290
+ '{i1}' '{i2}' '{i3}' '{i4}'( <- '{i}' ) // AIN
291
+ '{gh1}' '{gh2}' '{gh3}' '{gh4}'( <- '{gh}' ) // GHAIN
292
+ '{f1}' '{f2}' '{f3}' '{f4}' ( <- '{f}' ) // FEH
293
+ '{q1}' '{q2}' '{q3}' '{q4}' ( <- '{q}' ) // QAF
294
+ '{k1}' '{k2}' '{k3}' '{k4}'( <- '{k}' ) // KAF
295
+ '{l1}' '{l2}' '{l3}' '{l4}'( <- '{l}' ) // LAM
296
+ '{m1}' '{m2}' '{m3}' '{m4}' ( <- '{m}' ) // MEEM
297
+ '{n1}' '{n2}' '{n3}' '{n4}'( <- '{n}' ) // NOON
298
+ '{e1}' '{e2}' '{e3}' '{e4}' ( <- '{e}' ) // HEH
299
+ '{w1}' '{w2}' ( <- '{w}' ) // WAW
300
+ '{a_1}' '{a_2}' ( <- '{a_}' ) // ALEF_MAKSURA
301
+ '{y1}' '{y2}' '{y3}' '{y4}' ( <- '{y}' ) // YEH
302
+
303
+ // Lam-Alef ligatures -> the two separate letters (lam followed by the alef variant)
304
+ '{la}' '{la2}' (<- '{l}{a}')
305
+ '{lao}' '{lao2}' (<- '{l}{ao}')
306
+ '{lao_}' '{lao_2}' (<- '{l}{ao_}')
307
+ '{la~}' '{la~2}' (<- '{l}{a~}')
308
+
309
+ )
310
+ )
311
+ or
312
+ next // nothing to normalize here: move on to the next character
313
+ )
314
+ )
315
+
316
+ define Normalize_post as ( // post-stemming normalization of hamza-carrying letters
317
+
318
+ do (
319
+ // normalize last hamza: a hamza carrier at the very end of the word becomes a bare hamza '{o}'
320
+ backwards ( // match from the end of the word
321
+ [substring] among (
322
+ '{ao}''{ao_}' '{a~}' ( <- '{o}')
323
+ '{wo}' ( <- '{o}')
324
+ '{yo}' ( <- '{o}')
325
+ )
326
+ )
327
+ )
328
+
329
+ do repeat ( // forward scan: rewrite a match, otherwise step one character (next)
330
+ (
331
+ // normalize the other hamzas: each carrier is replaced by its plain letter
332
+ [substring] among (
333
+ '{ao}''{ao_}' '{a~}' ( <- '{a}') // alef variants -> alef
334
+ '{wo}' ( <- '{w}') // hamza above waw -> waw
335
+ '{yo}' ( <- '{y}') // hamza above yeh -> yeh
336
+ )
337
+ )
338
+ or
339
+ next
340
+ )
341
+ )
342
+
343
+ // Checks
344
+ define Checks1 as ( // inspects the leading letters and only sets flags; the word itself is not modified
345
+ [substring] among (
346
+ '{b}{a}{l}' '{k}{a}{l}' ($(len > 4) set is_noun unset is_verb set is_defined) // three-letter prefix, word longer than 4 characters
347
+ '{l}{l}' '{a}{l}' ($(len > 3) set is_noun unset is_verb set is_defined) // two-letter prefix, word longer than 3 characters
348
+ )
349
+ )
350
+
351
+
352
+ //prefixes
353
+ define Prefix_Step1 as ( // collapses a leading two-letter sequence starting with '{ao}' to one letter, for words longer than 3 characters
354
+ [substring] among (
355
+ '{ao}{ao}' ($(len > 3) <- '{ao}' )
356
+ '{ao}{a~}' ($(len > 3) <- '{a~}' )
357
+ '{ao}{wo}' ($(len > 3) <- '{ao}' ) // NOTE(review): keeps the first letter, while the other entries keep the second - confirm intended
358
+ '{ao}{a}' ($(len > 3) <- '{a}' )
359
+ '{ao}{ao_}' ($(len > 3) <- '{ao_}' )
360
+ // '{ao}' ($(len > 3) delete) //rare case
361
+ )
362
+ )
363
+
364
+ define Prefix_Step2 as ( // removes a leading '{f}' or '{w}'
365
+ [substring] among (
366
+ '{f}' '{w}' ($(len > 3) not '{a}' delete) // only when the word is longer than 3 characters and the next letter is not '{a}'
367
+ )
368
+ )
369
+
370
+ define Prefix_Step3a_Noun as ( // it is noun and defined: deletes the same prefixes that Checks1 tests for
371
+ [substring] among (
372
+ '{b}{a}{l}' '{k}{a}{l}' ($(len > 5) delete) // three-letter prefix, word longer than 5 characters
373
+ '{l}{l}' '{a}{l}' ($(len > 4) delete) // two-letter prefix, word longer than 4 characters
374
+ )
375
+ )
376
+
377
+ define Prefix_Step3b_Noun as ( // probably noun and defined; stem calls this only while is_noun is set
378
+ [substring] among (
379
+ '{b}{a}' ( ) // exception - not a valid verb prefix so can just succeed here (word is left unchanged)
380
+ '{b}' ($(len > 3) delete) // drop a single leading '{b}'
381
+ // '{k}' '{l}' ($(len > 3) delete) // BUG: causes confusion
382
+ '{b}{b}' ($(len > 3) <- '{b}' ) // doubled '{b}' -> single '{b}'
383
+ '{k}{k}' ($(len > 3) <- '{k}' ) // doubled '{k}' -> single '{k}'
384
+ )
385
+
386
+ )
387
+
388
+ define Prefix_Step3_Verb as ( // drops a leading '{s}' before '{y}', '{t}', '{n}' or '{ao}', for words longer than 4 characters
389
+ [substring] among (
390
+ //'{s}' ($(len > 4) delete)// BUG: causes confusion
391
+ '{s}{y}' ($(len > 4) <- '{y}' )
392
+ '{s}{t}' ($(len > 4) <- '{t}')
393
+ '{s}{n}' ($(len > 4) <- '{n}')
394
+ '{s}{ao}' ($(len > 4) <- '{ao}')
395
+ )
396
+ )
397
+
398
+ define Prefix_Step4_Verb as ( // rewrites a leading '{y}{s}{t}' / '{n}{s}{t}' / '{t}{s}{t}' to '{a}{s}{t}'
399
+ [substring] among (
400
+ '{y}{s}{t}' '{n}{s}{t}' '{t}{s}{t}' ($(len > 4) set is_verb unset is_noun <- '{a}{s}{t}' ) // word longer than 4 characters; also marks the word as verb-only
401
+ )
402
+ )
403
+
404
+ // suffixes
405
+ backwardmode ( // every routine in this section matches backwards from the cursor; stem calls them inside 'backwards'
406
+
407
+ define Suffix_Noun_Step1a as ( // deletes one trailing suffix; the required word length grows with the suffix length
408
+ [substring] among (
409
+ '{y}' '{k}' '{e}' ($(len >= 4) delete)
410
+ '{n}{a}' '{k}{m}' '{e}{a}' '{e}{n}' '{e}{m}' ($(len >= 5) delete)
411
+ '{k}{m}{a}' '{e}{m}{a}' ($(len >= 6) delete)
412
+ )
413
+ )
414
+ define Suffix_Noun_Step1b as ( // deletes a trailing '{n}' from words longer than 5 characters
415
+ [substring] among (
416
+ '{n}' ($(len > 5) delete)
417
+ )
418
+ )
419
+
420
+ define Suffix_Noun_Step2a as ( // deletes a trailing '{a}', '{y}' or '{w}' from words longer than 4 characters
421
+ [substring] among (
422
+ '{a}' '{y}' '{w}' ($(len > 4) delete)
423
+ )
424
+ )
425
+
426
+ define Suffix_Noun_Step2b as ( // deletes a trailing '{a}{t}' from words of at least 5 characters
427
+ [substring] among (
428
+ '{a}{t}' ($(len >= 5) delete)
429
+ )
430
+ )
431
+
432
+ define Suffix_Noun_Step2c1 as ( // deletes a trailing '{t}' from words of at least 4 characters
433
+ [substring] among (
434
+ '{t}' ($(len >= 4) delete)
435
+ )
436
+ )
437
+ define Suffix_Noun_Step2c2 as ( // feminine t_: deletes a trailing '{t_}' from words of at least 4 characters
438
+ [substring] among (
439
+ '{t_}' ($(len >= 4) delete)
440
+ )
441
+ )
442
+ define Suffix_Noun_Step3 as ( // ya' nisbiya: deletes a trailing '{y}' from words of at least 3 characters
443
+ [substring] among (
444
+ '{y}' ($(len >= 3) delete)
445
+ )
446
+ )
447
+
448
+ define Suffix_Verb_Step1 as ( // deletes one trailing suffix; stem applies this with 'atleast 1'
449
+ [substring] among (
450
+ '{e}' '{k}' ($(len >= 4) delete)
451
+ '{n}{y}' '{n}{a}' '{e}{a}' '{e}{m}' '{e}{n}' '{k}{m}' '{k}{n}' ($(len >= 5) delete)
452
+ '{e}{m}{a}' '{k}{m}{a}' '{k}{m}{w}'($(len >= 6) delete)
453
+ )
454
+ )
455
+ define Suffix_Verb_Step2a as ( // deletes one trailing suffix
456
+ [substring] among (
457
+ '{t}' ($(len >= 4) delete)
458
+ '{a}' '{n}' '{y}' ($(len >= 4) delete)
459
+ '{n}{a}' '{t}{a}' '{t}{n}' ($(len >= 5) delete)// past
460
+ '{a}{n}' '{w}{n}' '{y}{n}' ($(len > 5) delete) // present
461
+ '{t}{m}{a}' ($(len >= 6) delete)
462
+ )
463
+ )
464
+
465
+ define Suffix_Verb_Step2b as ( // deletes a trailing '{w}{a}' or '{t}{m}' from words of at least 5 characters
466
+ [substring] among (
467
+ '{w}{a}' '{t}{m}' ($(len >= 5) delete)
468
+ )
469
+ )
470
+
471
+
472
+ define Suffix_Verb_Step2c as ( // deletes a trailing '{w}' or '{t}{m}{w}'
473
+ [substring] among (
474
+ '{w}' ($(len >= 4) delete)
475
+ '{t}{m}{w}' ($(len >= 6) delete)
476
+ )
477
+ )
478
+
479
+ define Suffix_All_alef_maqsura as ( // rewrites a trailing '{a_}' as '{y}'; no length condition
480
+ [substring] among (
481
+ '{a_}' ( <- '{y}' ) // spelling error
482
+ // '{a_}' ( delete ) // if noun > 3
483
+ // '{a_}' ( <- '{a}') // if verb
484
+ )
485
+ )
486
+ )
487
+
488
+ define stem as ( // external entry point: flags, pre-normalization, suffix removal, prefix removal, post-normalization
489
+ // set initial values: the word starts as both a possible noun and a possible verb
490
+ set is_noun
491
+ set is_verb
492
+ unset is_defined
493
+
494
+ // guess type and properties
495
+ do Checks1 // NOTE(review): runs before Normalize_pre, so it sees un-normalized text - confirm this order is intended
496
+
497
+ // normalization pre-stemming
498
+ do Normalize_pre
499
+
500
+
501
+ backwards ( // suffix removal works from the end of the word
502
+
503
+ do (
504
+ //Suffixes for verbs (tried first, only while is_verb is still set)
505
+ (
506
+ is_verb
507
+ (
508
+ (
509
+ (atleast 1 Suffix_Verb_Step1) // one or more Step1 suffixes ...
510
+ ( Suffix_Verb_Step2a or Suffix_Verb_Step2c or next) // ... then Step2a, else Step2c, else just step one character
511
+ )
512
+ or Suffix_Verb_Step2b
513
+ or Suffix_Verb_Step2a
514
+ )
515
+ )
516
+ //Suffixes for nouns (tried when the verb branch fails)
517
+ or (
518
+ is_noun
519
+ (
520
+
521
+ try ( // optional: the first alternative that succeeds is used
522
+ Suffix_Noun_Step2c2
523
+ or (not is_defined Suffix_Noun_Step1a ( // Step1a only when is_defined is not set
524
+ Suffix_Noun_Step2a
525
+ or Suffix_Noun_Step2b
526
+ or Suffix_Noun_Step2c1
527
+ or next))
528
+ or (Suffix_Noun_Step1b ( // no 'or next' fallback here, unlike the Step1a alternative above
529
+ Suffix_Noun_Step2a
530
+ or Suffix_Noun_Step2b
531
+ or Suffix_Noun_Step2c1))
532
+ or (not is_defined Suffix_Noun_Step2a)
533
+ or (Suffix_Noun_Step2b)
534
+ )
535
+ Suffix_Noun_Step3 // must succeed for the noun branch as a whole to succeed
536
+ )
537
+
538
+ )
539
+
540
+ // Suffixes for alef maqsura (tried when both branches above fail)
541
+ or Suffix_All_alef_maqsura
542
+ )
543
+ )
544
+
545
+ //Prefixes
546
+ do (
547
+ try Prefix_Step1 // both of these steps are optional
548
+ try Prefix_Step2
549
+ ( Prefix_Step3a_Noun // then the first of these three alternatives that succeeds
550
+ or (is_noun Prefix_Step3b_Noun)
551
+ or (is_verb try Prefix_Step3_Verb Prefix_Step4_Verb)
552
+ )
553
+ )
554
+
555
+ // normalization post-stemming
556
+ do Normalize_post
557
+
558
+ )