phoonnx 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phoonnx/__init__.py +0 -0
- phoonnx/config.py +490 -0
- phoonnx/locale/ca/phonetic_spellings.txt +2 -0
- phoonnx/locale/en/phonetic_spellings.txt +1 -0
- phoonnx/locale/gl/phonetic_spellings.txt +2 -0
- phoonnx/locale/pt/phonetic_spellings.txt +2 -0
- phoonnx/phoneme_ids.py +453 -0
- phoonnx/phonemizers/__init__.py +45 -0
- phoonnx/phonemizers/ar.py +42 -0
- phoonnx/phonemizers/base.py +216 -0
- phoonnx/phonemizers/en.py +250 -0
- phoonnx/phonemizers/fa.py +46 -0
- phoonnx/phonemizers/gl.py +142 -0
- phoonnx/phonemizers/he.py +67 -0
- phoonnx/phonemizers/ja.py +119 -0
- phoonnx/phonemizers/ko.py +97 -0
- phoonnx/phonemizers/mul.py +606 -0
- phoonnx/phonemizers/vi.py +44 -0
- phoonnx/phonemizers/zh.py +308 -0
- phoonnx/thirdparty/__init__.py +0 -0
- phoonnx/thirdparty/arpa2ipa.py +249 -0
- phoonnx/thirdparty/cotovia/cotovia_aarch64 +0 -0
- phoonnx/thirdparty/cotovia/cotovia_x86_64 +0 -0
- phoonnx/thirdparty/hangul2ipa.py +783 -0
- phoonnx/thirdparty/ko_tables/aspiration.csv +20 -0
- phoonnx/thirdparty/ko_tables/assimilation.csv +31 -0
- phoonnx/thirdparty/ko_tables/double_coda.csv +17 -0
- phoonnx/thirdparty/ko_tables/hanja.tsv +8525 -0
- phoonnx/thirdparty/ko_tables/ipa.csv +22 -0
- phoonnx/thirdparty/ko_tables/neutralization.csv +11 -0
- phoonnx/thirdparty/ko_tables/tensification.csv +56 -0
- phoonnx/thirdparty/ko_tables/yale.csv +22 -0
- phoonnx/thirdparty/kog2p/__init__.py +385 -0
- phoonnx/thirdparty/kog2p/rulebook.txt +212 -0
- phoonnx/thirdparty/mantoq/__init__.py +67 -0
- phoonnx/thirdparty/mantoq/buck/__init__.py +0 -0
- phoonnx/thirdparty/mantoq/buck/phonetise_buckwalter.py +569 -0
- phoonnx/thirdparty/mantoq/buck/symbols.py +64 -0
- phoonnx/thirdparty/mantoq/buck/tokenization.py +105 -0
- phoonnx/thirdparty/mantoq/num2words.py +37 -0
- phoonnx/thirdparty/mantoq/pyarabic/__init__.py +12 -0
- phoonnx/thirdparty/mantoq/pyarabic/arabrepr.py +64 -0
- phoonnx/thirdparty/mantoq/pyarabic/araby.py +1647 -0
- phoonnx/thirdparty/mantoq/pyarabic/named_const.py +227 -0
- phoonnx/thirdparty/mantoq/pyarabic/normalize.py +161 -0
- phoonnx/thirdparty/mantoq/pyarabic/number.py +826 -0
- phoonnx/thirdparty/mantoq/pyarabic/number_const.py +1704 -0
- phoonnx/thirdparty/mantoq/pyarabic/stack.py +52 -0
- phoonnx/thirdparty/mantoq/pyarabic/trans.py +517 -0
- phoonnx/thirdparty/mantoq/unicode_symbol2label.py +4173 -0
- phoonnx/thirdparty/tashkeel/LICENSE +22 -0
- phoonnx/thirdparty/tashkeel/SOURCE +1 -0
- phoonnx/thirdparty/tashkeel/__init__.py +212 -0
- phoonnx/thirdparty/tashkeel/hint_id_map.json +18 -0
- phoonnx/thirdparty/tashkeel/input_id_map.json +56 -0
- phoonnx/thirdparty/tashkeel/model.onnx +0 -0
- phoonnx/thirdparty/tashkeel/target_id_map.json +17 -0
- phoonnx/thirdparty/zh_num.py +238 -0
- phoonnx/util.py +705 -0
- phoonnx/version.py +6 -0
- phoonnx/voice.py +521 -0
- phoonnx-0.0.0.dist-info/METADATA +255 -0
- phoonnx-0.0.0.dist-info/RECORD +86 -0
- phoonnx-0.0.0.dist-info/WHEEL +5 -0
- phoonnx-0.0.0.dist-info/top_level.txt +2 -0
- phoonnx_train/__main__.py +151 -0
- phoonnx_train/export_onnx.py +109 -0
- phoonnx_train/norm_audio/__init__.py +92 -0
- phoonnx_train/norm_audio/trim.py +54 -0
- phoonnx_train/norm_audio/vad.py +54 -0
- phoonnx_train/preprocess.py +420 -0
- phoonnx_train/vits/__init__.py +0 -0
- phoonnx_train/vits/attentions.py +427 -0
- phoonnx_train/vits/commons.py +147 -0
- phoonnx_train/vits/config.py +330 -0
- phoonnx_train/vits/dataset.py +214 -0
- phoonnx_train/vits/lightning.py +352 -0
- phoonnx_train/vits/losses.py +58 -0
- phoonnx_train/vits/mel_processing.py +139 -0
- phoonnx_train/vits/models.py +732 -0
- phoonnx_train/vits/modules.py +527 -0
- phoonnx_train/vits/monotonic_align/__init__.py +20 -0
- phoonnx_train/vits/monotonic_align/setup.py +13 -0
- phoonnx_train/vits/transforms.py +212 -0
- phoonnx_train/vits/utils.py +16 -0
- phoonnx_train/vits/wavfile.py +860 -0
@@ -0,0 +1,1647 @@
+#!/usr/bin/python
+# -*- coding=utf-8 -*-
+"""
+Arabic module
+
+Features:
+=========
+- Arabic letters classification
+- Text tokenization
+- Strip Harakat (all, except Shadda, tatweel, last_haraka)
+- Sperate and join Letters and Harakat
+- Reduce tashkeel
+- Mesure tashkeel similarity (Harakats, fully or partially vocalized, similarity with a template)
+- Letters normalization (Ligatures and Hamza)
+
+@author: Taha Zerrouki
+@contact: taha dot zerrouki at gmail dot com
+@copyright: Arabtechies, Arabeyes, Taha Zerrouki
+@license: GPL
+@date:2010/03/01
+@version: 0.1
+"""
+import re
+import unicodedata
+
+from . import stack
+
+COMMA = "\u060c"
+SEMICOLON = "\u061b"
+QUESTION = "\u061f"
+HAMZA = "\u0621"
+ALEF_MADDA = "\u0622"
+ALEF_HAMZA_ABOVE = "\u0623"
+WAW_HAMZA = "\u0624"
+ALEF_HAMZA_BELOW = "\u0625"
+YEH_HAMZA = "\u0626"
+ALEF = "\u0627"
+BEH = "\u0628"
+TEH_MARBUTA = "\u0629"
+TEH = "\u062a"
+THEH = "\u062b"
+JEEM = "\u062c"
+HAH = "\u062d"
+KHAH = "\u062e"
+DAL = "\u062f"
+THAL = "\u0630"
+REH = "\u0631"
+ZAIN = "\u0632"
+SEEN = "\u0633"
+SHEEN = "\u0634"
+SAD = "\u0635"
+DAD = "\u0636"
+TAH = "\u0637"
+ZAH = "\u0638"
+AIN = "\u0639"
+GHAIN = "\u063a"
+TATWEEL = "\u0640"
+FEH = "\u0641"
+QAF = "\u0642"
+KAF = "\u0643"
+LAM = "\u0644"
+MEEM = "\u0645"
+NOON = "\u0646"
+HEH = "\u0647"
+WAW = "\u0648"
+ALEF_MAKSURA = "\u0649"
+YEH = "\u064a"
+MADDA_ABOVE = "\u0653"
+HAMZA_ABOVE = "\u0654"
+HAMZA_BELOW = "\u0655"
+ZERO = "\u0660"
+ONE = "\u0661"
+TWO = "\u0662"
+THREE = "\u0663"
+FOUR = "\u0664"
+FIVE = "\u0665"
+SIX = "\u0666"
+SEVEN = "\u0667"
+EIGHT = "\u0668"
+NINE = "\u0669"
+ZERO_W = "\u0030"
+ONE_W = "\u0031"
+TWO_W = "\u0032"
+THREE_W = "\u0033"
+FOUR_W = "\u0034"
+FIVE_W = "\u0035"
+SIX_W = "\u0036"
+SEVEN_W = "\u0037"
+EIGHT_W = "\u0038"
+NINE_W = "\u0039"
+ZERO_P = "\u06f0"
+ONE_P = "\u06f1"
+TWO_P = "\u06f2"
+THREE_P = "\u06f3"
+FOUR_P = "\u06f4"
+FIVE_P = "\u06f5"
+SIX_P = "\u06f6"
+SEVEN_P = "\u06f7"
+EIGHT_P = "\u06f8"
+NINE_P = "\u06f9"
+PERCENT = "\u066a"
+DECIMAL = "\u066b"
+THOUSANDS = "\u066c"
+STAR = "\u066d"
+MINI_ALEF = "\u0670"
+ALEF_WASLA = "\u0671"
+FULL_STOP = "\u06d4"
+BYTE_ORDER_MARK = "\ufeff"
+
+# Diacritics
+FATHATAN = "\u064b"
+DAMMATAN = "\u064c"
+KASRATAN = "\u064d"
+FATHA = "\u064e"
+DAMMA = "\u064f"
+KASRA = "\u0650"
+SHADDA = "\u0651"
+SUKUN = "\u0652"
+
+# Small Letters
+SMALL_ALEF = "\u0670"
+SMALL_WAW = "\u06e5"
+SMALL_YEH = "\u06e6"
+# Ligatures
+LAM_ALEF = "\ufefb"
+LAM_ALEF_HAMZA_ABOVE = "\ufef7"
+LAM_ALEF_HAMZA_BELOW = "\ufef9"
+LAM_ALEF_MADDA_ABOVE = "\ufef5"
+SIMPLE_LAM_ALEF = "\u0644\u0627"
+SIMPLE_LAM_ALEF_HAMZA_ABOVE = "\u0644\u0623"
+SIMPLE_LAM_ALEF_HAMZA_BELOW = "\u0644\u0625"
+SIMPLE_LAM_ALEF_MADDA_ABOVE = "\u0644\u0622"
+# groups
+LETTERS = "".join(
+    [
+        ALEF,
+        BEH,
+        TEH,
+        TEH_MARBUTA,
+        THEH,
+        JEEM,
+        HAH,
+        KHAH,
+        DAL,
+        THAL,
+        REH,
+        ZAIN,
+        SEEN,
+        SHEEN,
+        SAD,
+        DAD,
+        TAH,
+        ZAH,
+        AIN,
+        GHAIN,
+        FEH,
+        QAF,
+        KAF,
+        LAM,
+        MEEM,
+        NOON,
+        HEH,
+        WAW,
+        ALEF_MAKSURA,
+        YEH,
+        HAMZA,
+        ALEF_MADDA,
+        ALEF_HAMZA_ABOVE,
+        WAW_HAMZA,
+        ALEF_HAMZA_BELOW,
+        YEH_HAMZA,
+    ]
+)
+
+NUMBERS_EAST = (ZERO, ONE, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE)
+NUMBERS_WEST = (
+    ZERO_W,
+    ONE_W,
+    TWO_W,
+    THREE_W,
+    FOUR_W,
+    FIVE_W,
+    SIX_W,
+    SEVEN_W,
+    EIGHT_W,
+    NINE_W,
+)
+NUMBERS_PERS = (
+    ZERO_P,
+    ONE_P,
+    TWO_P,
+    THREE_P,
+    FOUR_P,
+    FIVE_P,
+    SIX_P,
+    SEVEN_P,
+    EIGHT_P,
+    NINE_P,
+)
+
+
+TASHKEEL = (FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SUKUN, SHADDA)
+HARAKAT = (FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SUKUN)
+
+SHORTHARAKAT = (FATHA, DAMMA, KASRA, SUKUN)
+
+TANWIN = (FATHATAN, DAMMATAN, KASRATAN)
+
+NOT_DEF_HARAKA = TATWEEL
+
+LIGUATURES = (
+    LAM_ALEF,
+    LAM_ALEF_HAMZA_ABOVE,
+    LAM_ALEF_HAMZA_BELOW,
+    LAM_ALEF_MADDA_ABOVE,
+)
+
+HAMZAT = (
+    HAMZA,
+    WAW_HAMZA,
+    YEH_HAMZA,
+    HAMZA_ABOVE,
+    HAMZA_BELOW,
+    ALEF_HAMZA_BELOW,
+    ALEF_HAMZA_ABOVE,
+)
+
+ALEFAT = (
+    ALEF,
+    ALEF_MADDA,
+    ALEF_HAMZA_ABOVE,
+    ALEF_HAMZA_BELOW,
+    ALEF_WASLA,
+    ALEF_MAKSURA,
+    SMALL_ALEF,
+)
+
+WEAK = (ALEF, WAW, YEH, ALEF_MAKSURA)
+YEHLIKE = (YEH, YEH_HAMZA, ALEF_MAKSURA, SMALL_YEH)
+
+WAWLIKE = (WAW, WAW_HAMZA, SMALL_WAW)
+TEHLIKE = (TEH, TEH_MARBUTA)
+
+SMALL = (SMALL_ALEF, SMALL_WAW, SMALL_YEH)
+
+MOON = (
+    HAMZA,
+    ALEF_MADDA,
+    ALEF_HAMZA_ABOVE,
+    ALEF_HAMZA_BELOW,
+    ALEF,
+    BEH,
+    JEEM,
+    HAH,
+    KHAH,
+    AIN,
+    GHAIN,
+    FEH,
+    QAF,
+    KAF,
+    MEEM,
+    HEH,
+    WAW,
+    YEH,
+)
+
+SUN = (
+    TEH,
+    THEH,
+    DAL,
+    THAL,
+    REH,
+    ZAIN,
+    SEEN,
+    SHEEN,
+    SAD,
+    DAD,
+    TAH,
+    ZAH,
+    LAM,
+    NOON,
+)
+
+ALPHABETIC_ORDER = {
+    ALEF: 1,
+    BEH: 2,
+    TEH: 3,
+    TEH_MARBUTA: 3,
+    THEH: 4,
+    JEEM: 5,
+    HAH: 6,
+    KHAH: 7,
+    DAL: 8,
+    THAL: 9,
+    REH: 10,
+    ZAIN: 11,
+    SEEN: 12,
+    SHEEN: 13,
+    SAD: 14,
+    DAD: 15,
+    TAH: 16,
+    ZAH: 17,
+    AIN: 18,
+    GHAIN: 19,
+    FEH: 20,
+    QAF: 21,
+    KAF: 22,
+    LAM: 23,
+    MEEM: 24,
+    NOON: 25,
+    HEH: 26,
+    WAW: 27,
+    YEH: 28,
+    HAMZA: 29,
+    ALEF_MADDA: 29,
+    ALEF_HAMZA_ABOVE: 29,
+    WAW_HAMZA: 29,
+    ALEF_HAMZA_BELOW: 29,
+    YEH_HAMZA: 29,
+}
+
+NAMES = {
+    ALEF: "ألف",
+    BEH: "باء",
+    TEH: "تاء",
+    TEH_MARBUTA: "تاء مربوطة",
+    THEH: "ثاء",
+    JEEM: "جيم",
+    HAH: "حاء",
+    KHAH: "خاء",
+    DAL: "دال",
+    THAL: "ذال",
+    REH: "راء",
+    ZAIN: "زاي",
+    SEEN: "سين",
+    SHEEN: "شين",
+    SAD: "صاد",
+    DAD: "ضاد",
+    TAH: "طاء",
+    ZAH: "ظاء",
+    AIN: "عين",
+    GHAIN: "غين",
+    FEH: "فاء",
+    QAF: "قاف",
+    KAF: "كاف",
+    LAM: "لام",
+    MEEM: "ميم",
+    NOON: "نون",
+    HEH: "هاء",
+    WAW: "واو",
+    YEH: "ياء",
+    HAMZA: "همزة",
+    TATWEEL: "تطويل",
+    ALEF_MADDA: "ألف ممدودة",
+    ALEF_MAKSURA: "ألف مقصورة",
+    ALEF_HAMZA_ABOVE: "همزة على الألف",
+    WAW_HAMZA: "همزة على الواو",
+    ALEF_HAMZA_BELOW: "همزة تحت الألف",
+    YEH_HAMZA: "همزة على الياء",
+    FATHATAN: "فتحتان",
+    DAMMATAN: "ضمتان",
+    KASRATAN: "كسرتان",
+    FATHA: "فتحة",
+    DAMMA: "ضمة",
+    KASRA: "كسرة",
+    SHADDA: "شدة",
+    SUKUN: "سكون",
+}
+HAMZAT_STRING = "".join(HAMZAT)
+HARAKAT_STRING = "".join(HARAKAT)
+TASHKEEL_STRING = "".join(TASHKEEL)
+# regular expretion
+
+HARAKAT_PATTERN = re.compile("[" + "".join(HARAKAT) + "]", re.UNICODE)
+# ~ """ pattern to strip Harakat"""
+LASTHARAKA_PATTERN = re.compile(
+    "[%s]$|[%s]" % ("".join(HARAKAT), "".join(TANWIN)), re.UNICODE
+)
+# ~ """ Pattern to strip only the last haraka """
+SHORTHARAKAT_PATTERN = re.compile("[" + "".join(SHORTHARAKAT) + "]", re.UNICODE)
+# ~ Pattern to lookup Short Harakat(Fatha, Damma, Kasra, sukun, tanwin),
+# but not shadda
+TASHKEEL_PATTERN = re.compile("[" + "".join(TASHKEEL) + "]", re.UNICODE)
+# ~ """ Harakat and shadda pattern """
+HAMZAT_PATTERN = re.compile("[" + "".join(HAMZAT) + "]", re.UNICODE)
+# ~ """ all hamzat pattern"""
+ALEFAT_PATTERN = re.compile("[" + "".join(ALEFAT) + "]", re.UNICODE)
+# ~ """ all alef like letters """
+LIGUATURES_PATTERN = re.compile("[" + "".join(LIGUATURES) + "]", re.UNICODE)
+# ~ """ all liguatures pattern """
+TOKEN_PATTERN = re.compile(r"([^\w\u0670\u064b-\u0652']+)", re.UNICODE)
+# ~ """ pattern to tokenize a text"""
+TOKEN_PATTERN_SPLIT = re.compile(r"([\w\u0670\u064b-\u0652']+)", re.UNICODE)
+# ~ """ pattern to tokenize a text with positions"""
+TOKEN_REPLACE = re.compile("\t|\r|\f|\v| ")
+
+# Arabic string
+ARABIC_STRING = re.compile(
+    r"([^\u0600-\u0652%s%s%s\s\d])"
+    % (LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_MADDA_ABOVE),
+    re.UNICODE,
+)
+# Arabic range
+ARABIC_RANGE = re.compile(
+    "([^\u0600-\u06ff\ufb50-\ufdff\ufe70-\ufeff\u0750-\u077f])", re.UNICODE
+)
+
+# Space fixes
+FIX_SPACES_PAT = re.compile(r"\s*([?؟!.,،:]+(?:\s+[?؟!.,،:]+)*)\s*", re.UNICODE)
+
+DIACRITICS = [
+    chr(x) for x in range(0x0600, 0x06FF) if unicodedata.category(chr(x)) == "Mn"
+]
+# ~ \u0610 ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM
+# ~ \u0611 ARABIC SIGN ALAYHE ASSALLAM
+# ~ \u0612 ARABIC SIGN RAHMATULLAH ALAYHE
+# ~ \u0613 ARABIC SIGN RADI ALLAHOU ANHU
+# ~ \u0614 ARABIC SIGN TAKHALLUS
+# ~ \u0615 ARABIC SMALL HIGH TAH
+# ~ \u0616 ARABIC SMALL HIGH LIGATURE ALEF WITH LAM WITH YEH
+# ~ \u0617 ARABIC SMALL HIGH ZAIN
+# ~ \u0618 ARABIC SMALL FATHA
+# ~ \u0619 ARABIC SMALL DAMMA
+# ~ \u061a ARABIC SMALL KASRA
+# ~ \u064b ARABIC FATHATAN
+# ~ \u064c ARABIC DAMMATAN
+# ~ \u064d ARABIC KASRATAN
+# ~ \u064e ARABIC FATHA
+# ~ \u064f ARABIC DAMMA
+# ~ \u0650 ARABIC KASRA
+# ~ \u0651 ARABIC SHADDA
+# ~ \u0652 ARABIC SUKUN
+# ~ \u0653 ARABIC MADDAH ABOVE
+# ~ \u0654 ARABIC HAMZA ABOVE
+# ~ \u0655 ARABIC HAMZA BELOW
+# ~ \u0656 ARABIC SUBSCRIPT ALEF
+# ~ \u0657 ARABIC INVERTED DAMMA
+# ~ \u0658 ARABIC MARK NOON GHUNNA
+# ~ \u0659 ARABIC ZWARAKAY
+# ~ \u065a ARABIC VOWEL SIGN SMALL V ABOVE
+# ~ \u065b ARABIC VOWEL SIGN INVERTED SMALL V ABOVE
+# ~ \u065c ARABIC VOWEL SIGN DOT BELOW
+# ~ \u065d ARABIC REVERSED DAMMA
+# ~ \u065e ARABIC FATHA WITH TWO DOTS
+# ~ \u0670 ARABIC LETTER SUPERSCRIPT ALEF
+# ~ \u06d6 ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA
+# ~ \u06d7 ARABIC SMALL HIGH LIGATURE QAF WITH LAM WITH ALEF MAKSURA
+# ~ \u06d8 ARABIC SMALL HIGH MEEM INITIAL FORM
+# ~ \u06d9 ARABIC SMALL HIGH LAM ALEF
+# ~ \u06da ARABIC SMALL HIGH JEEM
+# ~ \u06db ARABIC SMALL HIGH THREE DOTS
+# ~ \u06dc ARABIC SMALL HIGH SEEN
+# ~ \u06df ARABIC SMALL HIGH ROUNDED ZERO
+# ~ \u06e0 ARABIC SMALL HIGH UPRIGHT RECTANGULAR ZERO
+# ~ \u06e1 ARABIC SMALL HIGH DOTLESS HEAD OF KHAH
+# ~ \u06e2 ARABIC SMALL HIGH MEEM ISOLATED FORM
+# ~ \u06e3 ARABIC SMALL LOW SEEN
+# ~ \u06e4 ARABIC SMALL HIGH MADDA
+# ~ \u06e7 ARABIC SMALL HIGH YEH
+# ~ \u06e8 ARABIC SMALL HIGH NOON
+# ~ \u06ea ARABIC EMPTY CENTRE LOW STOP
+# ~ \u06eb ARABIC EMPTY CENTRE HIGH STOP
+# ~ \u06ec ARABIC ROUNDED HIGH STOP WITH FILLED CENTRE
+# ~ \u06ed ARABIC SMALL LOW MEEM
+
+
+################################################
+# { is letter functions
+################################################
+def is_sukun(archar):
+    """Checks if the given ``archar``Sukun Mark."""
+    return archar == SUKUN
+
+
+def is_shadda(archar):
+    """Checks if the given ``archar`` is Shadda Mark."""
+    return archar == SHADDA
+
+
+def is_tatweel(archar):
+    """Checks if the given ``archar`` Tatweel letter modifier."""
+    return archar == TATWEEL
+
+
+def is_tanwin(archar):
+    """Checks if the given ``archar`` Tanwin Marks"""
+    return archar in TANWIN
+
+
+def is_tashkeel(archar):
+    """Checks if the given ``archar`` Arabic Tashkeel Marks (
+    - FATHA, DAMMA, KASRA, SUKUN,
+    - SHADDA,
+    - FATHATAN, DAMMATAN, KASRATAn)."""
+    return archar in TASHKEEL
+
+
+def is_haraka(archar):
+    """Checks if the given ``archar`` Arabic Harakat Marks (FATHA, DAMMA, KASRA, SUKUN, TANWIN)."""
+    return archar in HARAKAT
+
+
+def is_shortharaka(archar):
+    """Checks if the given ``archar`` short Harakat Marks (FATHA, DAMMA, KASRA, SUKUN)."""
+    return archar in SHORTHARAKAT
+
+
+def is_ligature(archar):
+    """Checks for Arabic Ligatures like LamAlef.
+    (LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_ABOVE)
+    @param archar: arabic unicode char
+    @type archar: unicode
+    @return:
+    @rtype:Boolean
+    """
+    return archar in LIGUATURES
+
+
+def is_hamza(archar):
+    """Checks for Arabic Hamza forms.
+    HAMZAT are (HAMZA, WAW_HAMZA, YEH_HAMZA, HAMZA_ABOVE, HAMZA_BELOW,
+    ALEF_HAMZA_BELOW, ALEF_HAMZA_ABOVE)
+    @param archar: arabic unicode char
+    @type archar: unicode
+    @return:
+    @rtype:Boolean
+    """
+    return archar in HAMZAT
+
+
+def is_alef(archar):
+    """Checks for Arabic Alef forms.
+    ALEFAT = (ALEF, ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW, ALEF_WASLA, ALEF_MAKSURA)
+    @param archar: arabic unicode char
+    @type archar: unicode
+    @return:
+    @rtype:Boolean
+    """
+    return archar in ALEFAT
+
+
+def is_yehlike(archar):
+    """Checks for Arabic Yeh forms.
+    Yeh forms : YEH, YEH_HAMZA, SMALL_YEH, ALEF_MAKSURA
+    @param archar: arabic unicode char
+    @type archar: unicode
+    @return:
+    @rtype:Boolean
+    """
+    return archar in YEHLIKE
+
+
+def is_wawlike(archar):
+    """Checks for Arabic Waw like forms.
+    Waw forms : WAW, WAW_HAMZA, SMALL_WAW
+    @param archar: arabic unicode char
+    @type archar: unicode
+    @return:
+    @rtype:Boolean
+    """
+    return archar in WAWLIKE
+
+
+def is_teh(archar):
+    """Checks for Arabic Teh forms.
+    Teh forms : TEH, TEH_MARBUTA
+    @param archar: arabic unicode char
+    @type archar: unicode
+    @return:
+    @rtype:Boolean
+    """
+    return archar in TEHLIKE
+
+
+def is_small(archar):
+    """Checks for Arabic Small letters.
+    SMALL Letters : SMALL ALEF, SMALL WAW, SMALL YEH
+    @param archar: arabic unicode char
+    @type archar: unicode
+    @return:
+    @rtype:Boolean
+    """
+    return archar in SMALL
+
+
+def is_weak(archar):
+    """Checks for Arabic Weak letters.
+    Weak Letters : ALEF, WAW, YEH, ALEF_MAKSURA
+    @param archar: arabic unicode char
+    @type archar: unicode
+    @return:
+    @rtype:Boolean
+    """
+    return archar in WEAK
+
+
+def is_moon(archar):
+    """Checks for Arabic Moon letters.
+    Moon Letters :
+    @param archar: arabic unicode char
+    @type archar: unicode
+    @return:
+    @rtype:Boolean
+    """
+    return archar in MOON
+
+
+def is_sun(archar):
+    """Checks for Arabic Sun letters.
+    Moon Letters :
+    @param archar: arabic unicode char
+    @type archar: unicode
+    @return:
+    @rtype:Boolean
+    """
+    return archar in SUN
+
+
+#####################################
+# { general letter functions
+#####################################
+def order(archar):
+    """return Arabic letter order between 1 and 29.
+    Alef order is 1, Yeh is 28, Hamza is 29.
+    Teh Marbuta has the same ordre with Teh, 3.
+    @param archar: arabic unicode char
+    @type archar: unicode
+    @return: arabic order.
+    @rtype: integer
+    """
+    return ALPHABETIC_ORDER.get(archar, 0)
+
+
+def name(archar, default_name=""):
+    """return Arabic letter name in arabic. Alef order is 1, Yeh is 28,
+    Hamza is 29. Teh Marbuta has the same ordre with Teh, 3.
+    @param archar: arabic unicode char
+    @type archar: unicode
+    @param default_name: arabic unicode char
+    @type default_name: unicode
+    @return: arabic name.
+    @rtype: unicode
+    """
+    return NAMES.get(archar, default_name)
+
+
+def arabicrange():
+    """return a list of arabic characteres .
+    Return a list of characteres between \u060c to \u0652
+    @return: list of arabic characteres.
+    @rtype: unicode
+    """
+    mylist = []
+    for i in range(0x0600, 0x00653):
+        try:
+            mylist.append(chr(i))
+        except NameError:
+            # python 3 compatible
+            mylist.append(chr(i))
+        except ValueError:
+            pass
+    return mylist
+
+
+#####################################
+# { Has letter functions
+#####################################
+def has_shadda(word):
+    """Checks if the arabic word contains shadda.
+    @param word: arabic unicode char
+    @type word: unicode
+    @return: if shadda exists
+    @rtype:Boolean
+    """
+    if re.search(SHADDA, word):
+        return True
+    return False
+
+
+#####################################
+# { word and text functions
+#####################################
+def is_vocalized(word):
+    """Checks if the arabic word is vocalized.
+    the word musn't have any spaces and pounctuations.
+    @param word: arabic unicode char
+    @type word: unicode
+    @return: if the word is vocalized
+    @rtype:Boolean
+    """
+    if word.isalpha():
+        return False
+    for char in word:
+        if is_tashkeel(char):
+            break
+    else:
+        return False
+    return True
+
+
+def is_vocalizedtext(text):
+    """Checks if the arabic text is vocalized.
+    The text can contain many words and spaces
+    @param text: arabic unicode char
+    @type text: unicode
+    @return: if the word is vocalized
+    @rtype:Boolean
+    """
+    return bool(re.search(HARAKAT_PATTERN, text))
+
+
+def is_arabicstring(text):
+    """Checks for an Arabic standard Unicode block characters
+    An arabic string can contain spaces, digits and pounctuation.
+    but only arabic standard characters, not extended arabic
+    @param text: input text
+    @type text: unicode
+    @return: True if all charaters are in Arabic block
+    @rtype: Boolean
+    """
+    if ARABIC_STRING.search(text):
+        return False
+    return True
+
+
+def is_arabicrange(text):
+    """Checks for an Arabic Unicode block characters
+    @param text: input text
+    @type text: unicode
+    @return: True if all charaters are in Arabic block
+    @rtype: Boolean
+    """
+    if ARABIC_RANGE.search(text):
+        return False
+    return True
+
+
+def is_arabicword(word):
+    """Checks for an valid Arabic word.
+    An Arabic word not contains spaces, digits and pounctuation
+    avoid some spelling error, TEH_MARBUTA must be at the end.
+    @param word: input word
+    @type word: unicode
+    @return: True if all charaters are in Arabic block
+    @rtype: Boolean
+    """
+    if len(word) == 0:
+        return False
+    elif re.search(
+        "([^\u0600-\u0652%s%s%s])"
+        % (LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_MADDA_ABOVE),
+        word,
+    ):
+        return False
+    elif is_haraka(word[0]) or word[0] in (WAW_HAMZA, YEH_HAMZA):
+        return False
+    # if Teh Marbuta or Alef_Maksura not in the end
+    elif re.match("^(.)*[%s](.)+$" % ALEF_MAKSURA, word):
+        return False
+    elif re.match(
+        "^(.)*[%s]([^%s%s%s])(.)+$" % (TEH_MARBUTA, DAMMA, KASRA, FATHA), word
+    ):
+        return False
+    elif re.search(SHADDA + SHADDA, word):
+        return False
+    else:
+        return True
+
+
+#####################################
+# {Char functions
+#####################################
+def first_char(word):
+    """
+    Return the first char
+    @param word: given word
+    @type word: unicode
+    @return: the first char
+    @rtype: unicode char
+    """
+    return word[0]
+
+
+def second_char(word):
+    """
+    Return the second char
+    @param word: given word
+    @type word: unicode
+    @return: the first char
+    @rtype: unicode char
+    """
+    return word[1:2]
+
+
+def last_char(word):
+    """
+    Return the last letter
+    example: zerrouki; 'i' is the last.
+    @param word: given word
+    @type word: unicode
+    @return: the last letter
+    @rtype: unicode char
+    """
+    return word[-1:]
+
+
+def secondlast_char(word):
+    """
+    Return the second last letter example: zerrouki; 'k' is the second last.
+    @param word: given word
+    @type word: unicode
+    @return: the second last letter
+    @rtype: unicode char
+    """
+    return word[-2:-1]
+
+
+#####################################
+# {Strip functions
+#####################################
+def strip_harakat(text):
+    """Strip Harakat from arabic word except Shadda.
+    The striped marks are :
+        - FATHA, DAMMA, KASRA
+        - SUKUN
+        - FATHATAN, DAMMATAN, KASRATAN,
+
+    Example:
+        >>> text = u"الْعَرَبِيّةُ"
+        >>> strip_harakat(text)
+        >>> العربيّة
+
+    @param text: arabic text.
+    @type text: unicode.
+    @return: return a striped text.
+    @rtype: unicode.
+    """
+    # if text:
+    #     return re.sub(HARAKAT_PATTERN, u'', text)
+    # return text
+    if not text:
+        return text
+    elif is_vocalized(text):
+        for char in HARAKAT:
+            text = text.replace(char, "")
+    return text
+
+
+def strip_lastharaka(text):
+    """Strip the last Haraka from arabic word except Shadda.
+    The striped marks are :
+        - FATHA, DAMMA, KASRA
+        - SUKUN
+        - FATHATAN, DAMMATAN, KASRATAN
+
+    Example:
+        >>> text = u"الْعَرَبِيّةُ"
+        >>> strip_lastharaka(text)
+        الْعَرَبِيّة
+
+    @param text: arabic text.
+    @type text: unicode.
+    @return: return a striped text.
+    @rtype: unicode.
+    """
+    if text:
+        if is_vocalized(text):
+            return re.sub(LASTHARAKA_PATTERN, "", text)
+    return text
+
+
+def strip_tashkeel(text):
+    """Strip vowels from a text, include Shadda.
+    The striped marks are :
+        - FATHA, DAMMA, KASRA
+        - SUKUN
+        - SHADDA
+        - FATHATAN, DAMMATAN, KASRATAN,, , .
+
+    Example:
+        >>> text = u"الْعَرَبِيّةُ"
+        >>> strip_tashkeel(text)
+        العربية
+
+    @param text: arabic text.
+    @type text: unicode.
+    @return: return a striped text.
+    @rtype: unicode.
+    """
+    if not text:
+        return text
+    elif is_vocalized(text):
+        for char in TASHKEEL:
+            text = text.replace(char, "")
+    return text
+
+
+def strip_small(text):
+    """Strip small_letters from a text
+    The striped marks are :
+        - Small Alef الألف الخنجرية, .
+        -Small WAW
+        -Small Yeh
+    Example:
+        >>> text = u"الرحمن\u0670"
+        >>> strip_small(text)
+        الرحمن
+
+    @param text: arabic text.
+    @type text: unicode.
+    @return: return a striped text.
+    @rtype: unicode.
+    """
+    if not text:
+        return text
+    for char in SMALL:
+        text = text.replace(char, "")
+    return text
+
+
+def strip_tatweel(text):
+    """
+    Strip tatweel from a text and return a result text.
+
+    Example:
+        >>> text = u"العـــــربية"
+        >>> strip_tatweel(text)
+        العربية
+
+    @param text: arabic text.
+    @type text: unicode.
+    @return: return a striped text.
+    @rtype: unicode.
+
+    """
+    return text.replace(TATWEEL, "")
+
+
+def strip_shadda(text):
+    """
+    Strip Shadda from a text and return a result text.
+
+    Example:
+        >>> text = u"الشّمسيّة"
+        >>> strip_shadda(text)
+        الشمسية
+
+    @param text: arabic text.
+    @type text: unicode.
+    @return: return a striped text.
+    @rtype: unicode.
+    """
+    return text.replace(SHADDA, "")
+
+
+def strip_diacritics(text):
+    """Strip arabic diacritics from a text
+    The striped marks are :
+        - Small Alef الألف الخنجرية, .
+        - Harakat + Shadda
+        - Quranic marks
+        - Extended arabic diacritics
+    Example:
+        >>> text = u"الرحمن\u0670"
+        >>> strip_small(text)
+        الرحمن
+
+    @param text: arabic text.
+    @type text: unicode.
+    @return: return a striped text.
+    @rtype: unicode.
+    """
+    if not text:
+        return text
+    for char in DIACRITICS:
+        text = text.replace(char, "")
+    return text
+
+
+def normalize_ligature(text):
+    """Normalize Lam Alef ligatures into two letters (LAM and ALEF),
+    and Tand return a result text.
+    Some systems present lamAlef ligature as a single letter,
+    this function convert it into two letters,
+    The converted letters into LAM and ALEF are :
+        - LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_ABOVE
+
+    Example:
+        >>> text = u"لانها لالء الاسلام"
+        >>> normalize_ligature(text)
+        لانها لالئ الاسلام
+
+    @param text: arabic text.
+    @type text: unicode.
+    @return: return a converted text.
+    @rtype: unicode.
+    """
+    if text:
+        return LIGUATURES_PATTERN.sub("%s%s" % (LAM, ALEF), text)
+    return text
+
+
+def normalize_hamza(word, method="uniform"):
+    """Standardize the Hamzat into one form of hamza,
+    replace Madda by hamza and alef.
+    Replace the LamAlefs by simplified letters.
+
+
+    Example:
+        >>> import pyarabic.araby as araby
+        >>> text1 = u"جاء سؤال الأئمة عن الإسلام آجلا"
+        >>> araby.normalize_hamza(text1)
+        'جاء سءال الءءمة عن الءسلام ءءجلا'
+        >>> araby.normalize_hamza(text1, method="tasheel")
+        'جاء سوال الايمة عن الاسلام ا
+    @param word: arabic text.
+    @type word: unicode.
+    @param method: how to convert hamzat (uniform, tasheel).
+    @type method: unicode.
+    @return: return a converted text.
+    @rtype: unicode.
+    """
+    if method == "tasheel" or method == "تسهيل":
+        # Alefat to Alef
+        word = word.replace(ALEF_MADDA, ALEF)
+        word = word.replace(ALEF_HAMZA_ABOVE, ALEF)
+        word = word.replace(ALEF_HAMZA_BELOW, ALEF)
+        word = word.replace(HAMZA_ABOVE, ALEF)
+        word = word.replace(HAMZA_BELOW, ALEF)
+        # on Waw
+        word = word.replace(WAW_HAMZA, WAW)
+        # on Yeh
+        word = word.replace(YEH_HAMZA, YEH)
+    else:
+        if word.startswith(ALEF_MADDA):
+            if (
+                len(word) >= 3
+                and (word[1] not in HARAKAT)
+                and (word[2] == SHADDA or len(word) == 3)
+            ):
+                word = HAMZA + ALEF + word[1:]
+            else:
+                word = HAMZA + HAMZA + word[1:]
+        # convert all Hamza from into one form
+        word = word.replace(ALEF_MADDA, HAMZA + HAMZA)
+        word = HAMZAT_PATTERN.sub(HAMZA, word)
+    return word
+
+
+def normalize_teh(text):
+    """
+    converts TEH_MARBUTA to HEH
+    Example:
+        >>> text = 'محبة'
+        >>> normalize_teh(text)
+        'محبه'
+
+    """
+    return re.sub("[" + "".join(TEH_MARBUTA) + "]", HEH, text)
+
+
+def normalize_alef(text):
+    """
+    converts all alefs to ALEF_MAMDODA with the exception of Alef maksura
+
+    """
+    # a small alef before Alef Maksura is ommited
+    text = text.replace(SMALL_ALEF + ALEF_MAKSURA, ALEF_MAKSURA)
+    text = text.replace(ALEF_MAKSURA + SMALL_ALEF, ALEF_MAKSURA)
+    return re.sub(ALEFAT_PATTERN, ALEF, text)
+
+
+def separate(word, extract_shadda=False):
+    """
+    separate the letters from the vowels, in arabic word,
+    if a letter hasn't a haraka, the not definited haraka is attributed.
+    return (letters, vowels)
+
+    Example:
+        >>> araby.separate(text)
+        (u'\u0627\u0644\u0639\u0631\u0628\u064a\u0629',
+        u'\u064e\u0652\u064e\u064e\u064e\u064e\u064f')
+        >>> letters, marks =araby.separate(text)
+        >>> print letters.encode('utf8')
+        العربية
+        >>> print marks.encode('utf8')
+        >>> for m in marks:
+        ... print araby.name(m)
+        فتحة
+        سكون
+        فتحة
+        فتحة
+        فتحة
+        فتحة
+        ضمة
+
+    @param word: the input word
+    @type word: unicode
+    @param extract_shadda: extract shadda as seperate text
+    @type extract_shadda: Boolean
+    @return: (letters, vowels)
+    @rtype:couple of unicode
+    """
+    stack1 = stack.Stack(word)
+    # the word is inversed in the stack
+    stack1.items.reverse()
+    letters = stack.Stack()
+    marks = stack.Stack()
+    vowels = HARAKAT
+    last1 = stack1.pop()
+    # if the last element must be a letter,
+    # the arabic word can't starts with a haraka
+    # in th stack the word is inversed
+    while last1 in vowels:
+        last1 = stack1.pop()
+    while last1 != None:
+        if last1 in vowels:
+            # we can't have two harakats beside.
+            # the shadda is considered as a letter
+            marks.pop()
+            marks.push(last1)
+        elif last1 == SHADDA:
+            # is the element is a Shadda,
+            # the previous letter must have a sukun as mark,
+            # and the shadda take the indefinate mark
+            marks.pop()
+            marks.push(SUKUN)
+            marks.push(NOT_DEF_HARAKA)
+            letters.push(SHADDA)
+        else:
+            marks.push(NOT_DEF_HARAKA)
+            letters.push(last1)
+        last1 = stack1.pop()
+    if extract_shadda:
+        # the shadda is considered as letter
+        wordletters = "".join(letters.items)
+        # print wordletters.encode('utf8')
+        shaddaplaces = re.sub("[^%s]" % SHADDA, TATWEEL, wordletters)
+        shaddaplaces = re.sub("%s%s" % (TATWEEL, SHADDA), SHADDA, shaddaplaces)
+        # print wordletters.encode('utf8')
+        wordletters = strip_shadda(wordletters)
+        # print wordletters.encode('utf8')
+        return (wordletters, "".join(marks.items), shaddaplaces)
+    else:
+        return ("".join(letters.items), "".join(marks.items))
+
+
+def joint(letters, marks):
+    """joint the letters with the marks
+    the length ot letters and marks must be equal
+    return word
+
+    Example:
+        >>> letters = u"العربية"
+        >>> marks = u'\u064e\u0652\u064e\u064e\u064e\u064e\u064f'
+        >>> word = araby.joint(letters, marks)
+        >>> print word.encode('utf8')
+        اَلْعَرَبَيَةُ
+
+    @param letters: the word letters
+    @type letters: unicode
+    @param marks: the word marks
+    @type marks: unicode
+    @return: word
+    @rtype: unicode
+    """
+    # The length ot letters and marks must be equal
+    if len(letters) != len(marks):
+        return ""
+    stack_letter = stack.Stack(letters)
+    stack_letter.items.reverse()
+    stack_mark = stack.Stack(marks)
+    stack_mark.items.reverse()
+
+    word_stack = stack.Stack()
+    last_letter = stack_letter.pop()
+    last_mark = stack_mark.pop()
+    vowels = HARAKAT
+    while last_letter != None and last_mark != None:
+        if last_letter == SHADDA:
+            top = word_stack.pop()
+            if top not in vowels:
+                word_stack.push(top)
+            word_stack.push(last_letter)
+            if last_mark != NOT_DEF_HARAKA:
+                word_stack.push(last_mark)
+        else:
+            word_stack.push(last_letter)
+            if last_mark != NOT_DEF_HARAKA:
+                word_stack.push(last_mark)
+
+        last_letter = stack_letter.pop()
+        last_mark = stack_mark.pop()
+
+    if not (stack_letter.is_empty() and stack_mark.is_empty()):
+        return False
+    else:
+        return "".join(word_stack.items)
+
+
+def vocalizedlike(word1, word2):
+    """
+    if the two words has the same letters and the same harakats, this fuction return True.
+    The two words can be full vocalized, or partial vocalized
+
+    Example:
+        >>> word1 = u"ضَربٌ"
+        >>> word2 = u"ضَرْبٌ"
+        >>> araby.vocalizedlike(word1, word2)
+        True
+
+    @param word1: first word
+    @type word1: unicode
+    @param word2: second word
+    @type word2: unicode
+    @return: if two words have similar vocalization
+    @rtype: Boolean
+    """
+    if vocalized_similarity(word1, word2) < 0:
+        return False
+    else:
+        return True
+
+
+# -------------------------
+# Function def vaznlike(word1, wazn):
+# -------------------------
+def waznlike(word1, wazn, extract_root=False):
+    """If the word1 is like a wazn (pattern), and can return root
+    the letters must be equal,
+    the wazn has FEH, AIN, LAM letters.
+    this are as generic letters.
+    The two words can be full vocalized, or partial vocalized
+
+    Example:
+        >>> word1 = u"ضارب"
+        >>> wazn = u"فَاعِل"
+        >>> araby.waznlike(word1, wazn)
+        True
+
+    @param word1: input word
+    @type word1: unicode
+    @param wazn: given word template وزن
+    @type wazn: unicode
+    @param extract_root: return root if True
+    @type extract_root: boolean
+    @return: if two words have similar vocalization
+    @rtype: Boolean
+    """
+    stack1 = stack.Stack(word1)
+    stack2 = stack.Stack(wazn)
+    root = stack.Stack()
+    last1 = stack1.pop()
+    last2 = stack2.pop()
+    vowels = HARAKAT
+    while last1 != None and last2 != None:
+        if last1 == last2 and last2 not in (FEH, AIN, LAM):
+            last1 = stack1.pop()
+            last2 = stack2.pop()
+        elif last1 not in vowels and last2 in (FEH, AIN, LAM):
+            root.push(last1)
+            # ~ print "t"
+            last1 = stack1.pop()
+            last2 = stack2.pop()
+        elif last1 in vowels and last2 not in vowels:
+            last1 = stack1.pop()
+        elif last1 not in vowels and last2 in vowels:
+            last2 = stack2.pop()
+        else:
+            break
+    # reverse the root letters
+    root.items.reverse()
+    # ~ print " the root is ", root.items#"".join(root.items)
+    if not (stack1.is_empty() and stack2.is_empty()):
+        return False
+    # if one letter is remind after pop in one stack
+    elif last1 != None or last2 != None:
+        return False
+    else:
+        if extract_root:
+            return "".join(root.items)
+        else:
+            return True
+
+
+def shaddalike(partial, fully):
+    """
+    If the two words has the same letters and the same harakats, this fuction return True.
+    The first word is partially vocalized, the second is fully
+    if the partially contians a shadda, it must be at the same place in the fully
+
+    Example:
+        >>> word1 = u"ردّ"
+        >>> word2=u"ردَّ"
+        >>> araby.shaddalike(word1, word2)
+        True
+
+    @param partial: the partially vocalized word
+    @type partial: unicode
+    @param fully: the fully vocalized word
+    @type fully: unicode
+    @return: if contains shadda
+    @rtype: Boolean
+    """
+    # المدخل ليس به شدة، لا داعي للبحث
+    if not has_shadda(partial):
+        return True
+    # المدخل به شدة، والنتيجة ليس بها شدة، خاطئ
+    elif not has_shadda(fully) and has_shadda(partial):
+        return False
+
+    # المدخل والمخرج بهما شدة، نتأكد من موقعهما
+    partial = strip_harakat(partial)
+    fully = strip_harakat(fully)
+    pstack = stack.Stack(partial)
+    vstack = stack.Stack(fully)
+    plast = pstack.pop()
+    vlast = vstack.pop()
+    # if debug: print "+0", Pstack, Vstack
+    while plast != None and vlast != None:
+        if plast == vlast:
+            plast = pstack.pop()
+            vlast = vstack.pop()
+        elif plast == SHADDA and vlast != SHADDA:
+            # if debug: print "+2", Pstack.items, Plast, Vstack.items, Vlast
+            break
+        elif plast != SHADDA and vlast == SHADDA:
+            # if debug: print "+2", Pstack.items, Plast, Vstack.items, Vlast
+            vlast = vstack.pop()
+        else:
+            # if debug: print "+2", Pstack.items, Plast, Vstack.items, Vlast
+            break
+    if not (pstack.is_empty() and vstack.is_empty()):
+        return False
+    else:
+        return True
+
+
+def reduce_tashkeel(text):
+    """Reduce the Tashkeel, by deleting evident cases.
+
+    Exmaple:
+        >>> word = u"يُتَسََلَّمْنَ"
+        >>> reduced = araby.reduce_tashkeel(word)
+        >>> print reduced.encode('utf8')
+        يُتسلّمن
+
+    @param text: the input text fully vocalized.
+    @type text: unicode.
+    @return : partially vocalized text.
+    @rtype: unicode.
+
+    """
+    patterns = [
+        # delete all fathat, except on waw and yeh
+        "(?<!(%s|%s))(%s|%s)"
+        % (WAW, YEH, SUKUN, FATHA),  # delete damma if followed by waw.
+        "%s(?=%s)" % (DAMMA, WAW),  # delete kasra if followed by yeh.
+        "%s(?=%s)"
+        % (KASRA, YEH),  # delete fatha if followed by alef to reduce yeh maftouha
+        # and waw maftouha before alef.
+        "%s(?=%s)"
+        % (
+            FATHA,
+            ALEF,
+        ),  # delete fatha from yeh and waw if they are in the word begining.
+        r"(?<=\s(%s|%s))%s"
+        % (WAW, YEH, FATHA),  # delete kasra if preceded by Hamza below alef.
+        "(?<=%s)%s" % (ALEF_HAMZA_BELOW, KASRA),
+    ]
+    reduced = text
+    for pat in patterns:
+        reduced = re.sub(pat, "", reduced)
+    return reduced
+
+
+def vocalized_similarity(word1, word2):
+    """if the two words has the same letters and the same harakats, this function return True.
+    The two words can be full vocalized, or partial vocalized
+
+    Example:
+        >>> word1 = u"ضَربٌ"
+        >>> word2 = u"ضَرْبٌ"
+        >>> araby.vocalizedlike(word1, word2)
+        True
+        >>> word1 = u"ضَربٌ"
+        >>> word2 = u"ضَرْبٍ"
+        >>> araby.vocalized_similarity(word1, word2)
+        -1
+
+    @param word1: first word
+    @type word1: unicode
+    @param word2: second word
+    @type word2: unicode
+    @return: return if words are similar, else return negative number of errors
+    @rtype: Boolean / int
+    """
+    stack1 = stack.Stack(word1)
+    stack2 = stack.Stack(word2)
+    last1 = stack1.pop()
+    last2 = stack2.pop()
+    err_count = 0
+    vowels = HARAKAT
+    while last1 != None and last2 != None:
+        if last1 == last2:
+            last1 = stack1.pop()
+            last2 = stack2.pop()
+        elif last1 in vowels and last2 not in vowels:
+            last1 = stack1.pop()
+        elif last1 not in vowels and last2 in vowels:
+            last2 = stack2.pop()
+        else:
+            # break
+            if last1 == SHADDA:
+                last1 = stack1.pop()
+            elif last2 == SHADDA:
+                last2 = stack2.pop()
+            else:
+                last1 = stack1.pop()
+                last2 = stack2.pop()
+                err_count += 1
+    if err_count > 0:
+        return -err_count
+    else:
+        return True
+
+
+def sentence_tokenize(text):
+    """
+    Tokenize text into sentences.
+
+    Example:
+        >>> text = u"العربية لغة جميلة. والبلاد بعيدة، والشوق زائد"
+        >>> tokens = araby.sentence_tokenize(text)
+        >>> print(tokens)
+        ['العربية لغة جميلة.', 'والبلاد بعيدة،', 'والشوق زائد']
+
+    @param text: the input text.
+    @type text: unicode.
+    @return: list of sentences.
+    @rtype: list.
+    """
+    text = re.sub("([.,:;،؟?\n])+([\n\t\r ])+", r"\1<SPLIT>", text, re.UNICODE)
+    sentences = re.split("<SPLIT>", text)
+    return sentences
+
+
+def tokenize(text="", conditions=[], morphs=[]):
+    """
+    Tokenize text into words.
+
+    Example:
+        >>> text = u"العربية لغة جميلة."
+        >>> tokens = araby.tokenize(text)
+        >>> print u"\\n".join(tokens)
+        العربية
+        لغة
+        جميلة
+        .
+
+    Example 2 (To remove tashkeel and filter out non-Arabic words:):
+        >>> text = u"ِاسمٌ الكلبِ في اللغةِ الإنجليزية Dog واسمُ الحمارِ Donky"
+        >>> tokenize(text, conditions=is_arabicrange, morphs=strip_tashkeel)
+        ['اسم', 'الكلب', 'في', 'اللغة', 'الإنجليزية', 'واسم', 'الحمار']
+
+    Example 3 (This structure will enable us to create functions on the fly and pass them:):
+        >>> text = u"طلع البدر علينا من ثنيات الوداع"
+        >>>tokenize(text, conditions=lambda x: x.startswith(u'ال'))
+        ['البدر', 'الوداع']
+
+    @param text: the input text.
+    @type text: unicode.
+    @param conditions: a list of conditions to be applied on tokens, like avoiding non arabic letters.
+    @type conditions: one or list of conditions .
+    @param morphs: a list of morphological change functions to be applied on tokens, like striping tashkeel or normalizing tokens.
+    @type morphs: one or list of morphological functions .
+    @return: list of words.
+    @rtype: list.
+    """
+    if text:
+        # to be tolerant and allow for a single condition and/or morph to be passed
+        # without having to enclose it in a list
+        if type(conditions) is not list:
+            conditions = [conditions]
+        if type(morphs) is not list:
+            morphs = [morphs]
+
+        tokens = TOKEN_PATTERN.split(text)
+        tokens = [
+            TOKEN_REPLACE.sub("", tok) for tok in tokens if TOKEN_REPLACE.sub("", tok)
+        ]
+
+        if conditions:
+            tokens = [tok for tok in tokens if all([cond(tok) for cond in conditions])]
+        if morphs:
+
+            def morph(tok):
+                for m in morphs:
+                    tok = m(tok)
+                return tok
+
+            tokens = [morph(tok) for tok in tokens]
+        return tokens
+    else:
+        return []
+
+
+def tokenize_with_location(text: str) -> list:
+    """
+    Tokenize text into words with their positions.
+
+    Example:
+        >>> text = "حدثنا ابن أبي عامر، قال: رايت مناما"
+        >>> tokens = araby.tokenize_with_location(text)
+        >>> print u"\\n".join(tokens)
+        [{'token': 'حدثنا', 'start': 0, 'end': 5},
+        {'token': 'ابن', 'start': 6, 'end': 9},
+        {'token': 'أبي', 'start': 10, 'end': 13},
+        {'token': 'عامر', 'start': 14, 'end': 18},
+        {'token': 'قال', 'start': 20, 'end': 23},
+        {'token': 'رايت', 'start': 25, 'end': 29},
+        {'token': 'مناما','start': 30, 'end': 35}
+        ]
+
+
+    @param text: the input text.
+    @type text: unicode.
+    @return: list of dict of (tokens, starts, ends).
+    @rtype: list of dict.
+    """
+    tokens = []
+    for match in TOKEN_PATTERN_SPLIT.finditer(text):
+        tokens.append(
+            {
+                "token": text[match.start() : match.end()],
+                "start": match.start(),
+                "end": match.end(),
+            }
+        )
+
+    return tokens
+
+
+def fix_spaces(text):
+    """ """
+    text = FIX_SPACES_PAT.sub(lambda x: "{} ".format(x.group(1).replace(" ", "")), text)
+    return text.strip()
+
+
+def autocorrect(text):
+    """
+    Correct most common errors on word
+    like repetetion of harakats,or tanwin befor alef
+    @param text: input text
+    @type text: unicode
+    @return: corrected text
+    @rtype: unicode
+    """
+    ## HARAKAT
+    text = re.sub(r"(?<=[\s\d])([%s])+" % (TASHKEEL_STRING), "", text, re.UNICODE)
+    text = re.sub("^([%s])+" % (TASHKEEL_STRING), "", text, re.UNICODE)
+    # tanwin on alef
+    text = re.sub(ALEF + FATHATAN, FATHATAN + ALEF, text, re.UNICODE)
+
+    # SUKUN misplaced on alef /alef maksura and TEH merbuta
+    text = re.sub(
+        "(?<=[%s%s%s])([%s])+" % (ALEF, ALEF_MAKSURA, TEH_MARBUTA, SUKUN),
+        "",
+        text,
+        re.UNICODE,
+    )
+
+    # Hakara before Shadda
+    text = re.sub("([%s])+(?=[%s])" % (HARAKAT_STRING, SHADDA), "", text, re.UNICODE)
+
+    # repeated harahat
+    text = re.sub(
+        "(?<=[%s])([%s])+" % (HARAKAT_STRING, HARAKAT_STRING), "", text, re.UNICODE
+    )
+
+    ## Letters
+    return text
+
+
+def spellit(word, lang="ar"):
+    """
+    write the word in full letter' names
+    """
+
+    names = []
+    if lang == "unicode":
+        for c in word:
+            names.append(unicodedata.name(c, c))
+    else:
+        for c in word:
+            names.append(name(c, c))
+    return ", ".join(names)
+
+
+if __name__ == "__main__":
+    # ~WORDS = [u'الْدَرَاجَةُ', u'الدّرّاجة',
+    # ~u'سّلّامْ', ]
+    # ~for wrd in WORDS:
+    # ~l, m, s = separate(wrd, True)
+    # ~l = joint(l, s)
+    # ~print u'\t'.join([wrd, l, m, s]).encode('utf8')
+    # ~newword = joint(l, m)
+    # ~assert (newword != wrd)
+    print("like: ", vocalizedlike("مُتَوَهِّمًا", "متوهمًا"))
+    print("sim: ", vocalized_similarity("ثمّ", "ثُمَّ"))
+    print("like: ", vocalizedlike("ثمّ", "ثُمَّ"))
+    print("sim: ", vocalized_similarity("ثم", "ثُمَّ"))
+    print("like: ", vocalizedlike("ثم", "ثُمَّ"))
+    print("sim: ", vocalized_similarity("مُتَوَهِّمًا", "متوهمًا"))
+    print("sim: ", vocalized_similarity("مُتَوَهِّمًا", "متوهمًا"))
+    text1 = "العربية: لغة جميلة."
+    wordlist = ["العربية", ":", "لغة", "جميلة", "."]
+    wl = tokenize(text1)
+
+    print(" use tokenize")
+    print(wl)
+    # ~ print((repr(wl)).decode('unicode-escape'))
+    # ~ print((repr(wordlist)).decode('unicode-escape'))
+    # ~ TOKEN_PATTERN2 = re.compile(u"[^\w\u064b-\u0652']+", re.UNICODE)
+    # ~ words = TOKEN_PATTERN2.split(text1)
+    # ~ print(" first")
+    # ~ print((repr(words)).decode('unicode-escape'))
+    # ~ TOKEN_PATTERN3 = re.compile(u"([^\w\u064b-\u0652']+)", re.UNICODE)
+    # ~ words = TOKEN_PATTERN3.split(text1)
+    # ~ print(" modified")
+    # ~ print (repr(words)).decode('unicode-escape')
+
+    # ~ TOKEN_PATTERN4 = re.compile(u"([^\w\u064b-\u0652']+)", re.UNICODE)
+    # ~ words = TOKEN_PATTERN4.split(text1)
+    # ~ print(" modified without r-prefix")
+    # ~ print((repr(words)).decode('unicode-escape'))
+
+    # ~ text = u"ِاسمٌ الكلبِ في اللغةِ الإنجليزية Dog واسمُ الحمارِ Donky"
+    # ~ words = tokenize(text, conditions=is_arabicrange, morphs=strip_tashkeel)
+    # ~ print((repr(words)).decode('unicode-escape'))

+    # ~ #>> ['اسم', 'الكلب', 'في', 'اللغة', 'الإنجليزية', 'واسم', 'الحمار']
+
+    # ~ text = u"طلع البدر علينا من ثنيات الوداع"
+    # ~ words = tokenize(text, conditions=lambda x: x.startswith(u'ال'))
+    # ~ # >> ['البدر', 'الوداع']
+    # ~ print((repr(words)).decode('unicode-escape'))