phoonnx 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. phoonnx/__init__.py +0 -0
  2. phoonnx/config.py +490 -0
  3. phoonnx/locale/ca/phonetic_spellings.txt +2 -0
  4. phoonnx/locale/en/phonetic_spellings.txt +1 -0
  5. phoonnx/locale/gl/phonetic_spellings.txt +2 -0
  6. phoonnx/locale/pt/phonetic_spellings.txt +2 -0
  7. phoonnx/phoneme_ids.py +453 -0
  8. phoonnx/phonemizers/__init__.py +45 -0
  9. phoonnx/phonemizers/ar.py +42 -0
  10. phoonnx/phonemizers/base.py +216 -0
  11. phoonnx/phonemizers/en.py +250 -0
  12. phoonnx/phonemizers/fa.py +46 -0
  13. phoonnx/phonemizers/gl.py +142 -0
  14. phoonnx/phonemizers/he.py +67 -0
  15. phoonnx/phonemizers/ja.py +119 -0
  16. phoonnx/phonemizers/ko.py +97 -0
  17. phoonnx/phonemizers/mul.py +606 -0
  18. phoonnx/phonemizers/vi.py +44 -0
  19. phoonnx/phonemizers/zh.py +308 -0
  20. phoonnx/thirdparty/__init__.py +0 -0
  21. phoonnx/thirdparty/arpa2ipa.py +249 -0
  22. phoonnx/thirdparty/cotovia/cotovia_aarch64 +0 -0
  23. phoonnx/thirdparty/cotovia/cotovia_x86_64 +0 -0
  24. phoonnx/thirdparty/hangul2ipa.py +783 -0
  25. phoonnx/thirdparty/ko_tables/aspiration.csv +20 -0
  26. phoonnx/thirdparty/ko_tables/assimilation.csv +31 -0
  27. phoonnx/thirdparty/ko_tables/double_coda.csv +17 -0
  28. phoonnx/thirdparty/ko_tables/hanja.tsv +8525 -0
  29. phoonnx/thirdparty/ko_tables/ipa.csv +22 -0
  30. phoonnx/thirdparty/ko_tables/neutralization.csv +11 -0
  31. phoonnx/thirdparty/ko_tables/tensification.csv +56 -0
  32. phoonnx/thirdparty/ko_tables/yale.csv +22 -0
  33. phoonnx/thirdparty/kog2p/__init__.py +385 -0
  34. phoonnx/thirdparty/kog2p/rulebook.txt +212 -0
  35. phoonnx/thirdparty/mantoq/__init__.py +67 -0
  36. phoonnx/thirdparty/mantoq/buck/__init__.py +0 -0
  37. phoonnx/thirdparty/mantoq/buck/phonetise_buckwalter.py +569 -0
  38. phoonnx/thirdparty/mantoq/buck/symbols.py +64 -0
  39. phoonnx/thirdparty/mantoq/buck/tokenization.py +105 -0
  40. phoonnx/thirdparty/mantoq/num2words.py +37 -0
  41. phoonnx/thirdparty/mantoq/pyarabic/__init__.py +12 -0
  42. phoonnx/thirdparty/mantoq/pyarabic/arabrepr.py +64 -0
  43. phoonnx/thirdparty/mantoq/pyarabic/araby.py +1647 -0
  44. phoonnx/thirdparty/mantoq/pyarabic/named_const.py +227 -0
  45. phoonnx/thirdparty/mantoq/pyarabic/normalize.py +161 -0
  46. phoonnx/thirdparty/mantoq/pyarabic/number.py +826 -0
  47. phoonnx/thirdparty/mantoq/pyarabic/number_const.py +1704 -0
  48. phoonnx/thirdparty/mantoq/pyarabic/stack.py +52 -0
  49. phoonnx/thirdparty/mantoq/pyarabic/trans.py +517 -0
  50. phoonnx/thirdparty/mantoq/unicode_symbol2label.py +4173 -0
  51. phoonnx/thirdparty/tashkeel/LICENSE +22 -0
  52. phoonnx/thirdparty/tashkeel/SOURCE +1 -0
  53. phoonnx/thirdparty/tashkeel/__init__.py +212 -0
  54. phoonnx/thirdparty/tashkeel/hint_id_map.json +18 -0
  55. phoonnx/thirdparty/tashkeel/input_id_map.json +56 -0
  56. phoonnx/thirdparty/tashkeel/model.onnx +0 -0
  57. phoonnx/thirdparty/tashkeel/target_id_map.json +17 -0
  58. phoonnx/thirdparty/zh_num.py +238 -0
  59. phoonnx/util.py +705 -0
  60. phoonnx/version.py +6 -0
  61. phoonnx/voice.py +521 -0
  62. phoonnx-0.0.0.dist-info/METADATA +255 -0
  63. phoonnx-0.0.0.dist-info/RECORD +86 -0
  64. phoonnx-0.0.0.dist-info/WHEEL +5 -0
  65. phoonnx-0.0.0.dist-info/top_level.txt +2 -0
  66. phoonnx_train/__main__.py +151 -0
  67. phoonnx_train/export_onnx.py +109 -0
  68. phoonnx_train/norm_audio/__init__.py +92 -0
  69. phoonnx_train/norm_audio/trim.py +54 -0
  70. phoonnx_train/norm_audio/vad.py +54 -0
  71. phoonnx_train/preprocess.py +420 -0
  72. phoonnx_train/vits/__init__.py +0 -0
  73. phoonnx_train/vits/attentions.py +427 -0
  74. phoonnx_train/vits/commons.py +147 -0
  75. phoonnx_train/vits/config.py +330 -0
  76. phoonnx_train/vits/dataset.py +214 -0
  77. phoonnx_train/vits/lightning.py +352 -0
  78. phoonnx_train/vits/losses.py +58 -0
  79. phoonnx_train/vits/mel_processing.py +139 -0
  80. phoonnx_train/vits/models.py +732 -0
  81. phoonnx_train/vits/modules.py +527 -0
  82. phoonnx_train/vits/monotonic_align/__init__.py +20 -0
  83. phoonnx_train/vits/monotonic_align/setup.py +13 -0
  84. phoonnx_train/vits/transforms.py +212 -0
  85. phoonnx_train/vits/utils.py +16 -0
  86. phoonnx_train/vits/wavfile.py +860 -0
@@ -0,0 +1,1647 @@
1
+ #!/usr/bin/python
2
+ # -*- coding=utf-8 -*-
3
+ """
4
+ Arabic module
5
+
6
+ Features:
7
+ =========
8
+ - Arabic letters classification
9
+ - Text tokenization
10
+ - Strip Harakat (all, except Shadda, tatweel, last_haraka)
11
+ - Separate and join Letters and Harakat
12
+ - Reduce tashkeel
13
+ - Measure tashkeel similarity (Harakats, fully or partially vocalized, similarity with a template)
14
+ - Letters normalization (Ligatures and Hamza)
15
+
16
+ @author: Taha Zerrouki
17
+ @contact: taha dot zerrouki at gmail dot com
18
+ @copyright: Arabtechies, Arabeyes, Taha Zerrouki
19
+ @license: GPL
20
+ @date:2010/03/01
21
+ @version: 0.1
22
+ """
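A minimal usage sketch of the features listed above, in the same doctest style as the function docstrings below; the package-qualified import path is an assumption inferred from the file listing at the top of this diff:

    >>> from phoonnx.thirdparty.mantoq.pyarabic import araby
    >>> araby.strip_tashkeel("الْعَرَبِيّةُ")   # drops harakat and shadda
    'العربية'
    >>> araby.strip_harakat("الْعَرَبِيّةُ")    # keeps shadda
    'العربيّة'
    >>> araby.tokenize("العربية لغة جميلة.")
    ['العربية', 'لغة', 'جميلة', '.']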
23
+ import re
24
+ import unicodedata
25
+
26
+ from . import stack
27
+
28
+ COMMA = "\u060c"
29
+ SEMICOLON = "\u061b"
30
+ QUESTION = "\u061f"
31
+ HAMZA = "\u0621"
32
+ ALEF_MADDA = "\u0622"
33
+ ALEF_HAMZA_ABOVE = "\u0623"
34
+ WAW_HAMZA = "\u0624"
35
+ ALEF_HAMZA_BELOW = "\u0625"
36
+ YEH_HAMZA = "\u0626"
37
+ ALEF = "\u0627"
38
+ BEH = "\u0628"
39
+ TEH_MARBUTA = "\u0629"
40
+ TEH = "\u062a"
41
+ THEH = "\u062b"
42
+ JEEM = "\u062c"
43
+ HAH = "\u062d"
44
+ KHAH = "\u062e"
45
+ DAL = "\u062f"
46
+ THAL = "\u0630"
47
+ REH = "\u0631"
48
+ ZAIN = "\u0632"
49
+ SEEN = "\u0633"
50
+ SHEEN = "\u0634"
51
+ SAD = "\u0635"
52
+ DAD = "\u0636"
53
+ TAH = "\u0637"
54
+ ZAH = "\u0638"
55
+ AIN = "\u0639"
56
+ GHAIN = "\u063a"
57
+ TATWEEL = "\u0640"
58
+ FEH = "\u0641"
59
+ QAF = "\u0642"
60
+ KAF = "\u0643"
61
+ LAM = "\u0644"
62
+ MEEM = "\u0645"
63
+ NOON = "\u0646"
64
+ HEH = "\u0647"
65
+ WAW = "\u0648"
66
+ ALEF_MAKSURA = "\u0649"
67
+ YEH = "\u064a"
68
+ MADDA_ABOVE = "\u0653"
69
+ HAMZA_ABOVE = "\u0654"
70
+ HAMZA_BELOW = "\u0655"
71
+ ZERO = "\u0660"
72
+ ONE = "\u0661"
73
+ TWO = "\u0662"
74
+ THREE = "\u0663"
75
+ FOUR = "\u0664"
76
+ FIVE = "\u0665"
77
+ SIX = "\u0666"
78
+ SEVEN = "\u0667"
79
+ EIGHT = "\u0668"
80
+ NINE = "\u0669"
81
+ ZERO_W = "\u0030"
82
+ ONE_W = "\u0031"
83
+ TWO_W = "\u0032"
84
+ THREE_W = "\u0033"
85
+ FOUR_W = "\u0034"
86
+ FIVE_W = "\u0035"
87
+ SIX_W = "\u0036"
88
+ SEVEN_W = "\u0037"
89
+ EIGHT_W = "\u0038"
90
+ NINE_W = "\u0039"
91
+ ZERO_P = "\u06f0"
92
+ ONE_P = "\u06f1"
93
+ TWO_P = "\u06f2"
94
+ THREE_P = "\u06f3"
95
+ FOUR_P = "\u06f4"
96
+ FIVE_P = "\u06f5"
97
+ SIX_P = "\u06f6"
98
+ SEVEN_P = "\u06f7"
99
+ EIGHT_P = "\u06f8"
100
+ NINE_P = "\u06f9"
101
+ PERCENT = "\u066a"
102
+ DECIMAL = "\u066b"
103
+ THOUSANDS = "\u066c"
104
+ STAR = "\u066d"
105
+ MINI_ALEF = "\u0670"
106
+ ALEF_WASLA = "\u0671"
107
+ FULL_STOP = "\u06d4"
108
+ BYTE_ORDER_MARK = "\ufeff"
109
+
110
+ # Diacritics
111
+ FATHATAN = "\u064b"
112
+ DAMMATAN = "\u064c"
113
+ KASRATAN = "\u064d"
114
+ FATHA = "\u064e"
115
+ DAMMA = "\u064f"
116
+ KASRA = "\u0650"
117
+ SHADDA = "\u0651"
118
+ SUKUN = "\u0652"
119
+
120
+ # Small Letters
121
+ SMALL_ALEF = "\u0670"
122
+ SMALL_WAW = "\u06e5"
123
+ SMALL_YEH = "\u06e6"
124
+ # Ligatures
125
+ LAM_ALEF = "\ufefb"
126
+ LAM_ALEF_HAMZA_ABOVE = "\ufef7"
127
+ LAM_ALEF_HAMZA_BELOW = "\ufef9"
128
+ LAM_ALEF_MADDA_ABOVE = "\ufef5"
129
+ SIMPLE_LAM_ALEF = "\u0644\u0627"
130
+ SIMPLE_LAM_ALEF_HAMZA_ABOVE = "\u0644\u0623"
131
+ SIMPLE_LAM_ALEF_HAMZA_BELOW = "\u0644\u0625"
132
+ SIMPLE_LAM_ALEF_MADDA_ABOVE = "\u0644\u0622"
133
+ # groups
134
+ LETTERS = "".join(
135
+ [
136
+ ALEF,
137
+ BEH,
138
+ TEH,
139
+ TEH_MARBUTA,
140
+ THEH,
141
+ JEEM,
142
+ HAH,
143
+ KHAH,
144
+ DAL,
145
+ THAL,
146
+ REH,
147
+ ZAIN,
148
+ SEEN,
149
+ SHEEN,
150
+ SAD,
151
+ DAD,
152
+ TAH,
153
+ ZAH,
154
+ AIN,
155
+ GHAIN,
156
+ FEH,
157
+ QAF,
158
+ KAF,
159
+ LAM,
160
+ MEEM,
161
+ NOON,
162
+ HEH,
163
+ WAW,
164
+ ALEF_MAKSURA,
165
+ YEH,
166
+ HAMZA,
167
+ ALEF_MADDA,
168
+ ALEF_HAMZA_ABOVE,
169
+ WAW_HAMZA,
170
+ ALEF_HAMZA_BELOW,
171
+ YEH_HAMZA,
172
+ ]
173
+ )
174
+
175
+ NUMBERS_EAST = (ZERO, ONE, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE)
176
+ NUMBERS_WEST = (
177
+ ZERO_W,
178
+ ONE_W,
179
+ TWO_W,
180
+ THREE_W,
181
+ FOUR_W,
182
+ FIVE_W,
183
+ SIX_W,
184
+ SEVEN_W,
185
+ EIGHT_W,
186
+ NINE_W,
187
+ )
188
+ NUMBERS_PERS = (
189
+ ZERO_P,
190
+ ONE_P,
191
+ TWO_P,
192
+ THREE_P,
193
+ FOUR_P,
194
+ FIVE_P,
195
+ SIX_P,
196
+ SEVEN_P,
197
+ EIGHT_P,
198
+ NINE_P,
199
+ )
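The three digit tuples above are index-aligned (ZERO through NINE in each script), so a mapping between scripts can be built directly from them. A small sketch, illustrative only, since no such helper is defined in this file:

    >>> east_to_west = dict(zip(NUMBERS_EAST, NUMBERS_WEST))
    >>> "".join(east_to_west.get(ch, ch) for ch in "\u0661\u0662\u0663")
    '123'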
200
+
201
+
202
+ TASHKEEL = (FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SUKUN, SHADDA)
203
+ HARAKAT = (FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SUKUN)
204
+
205
+ SHORTHARAKAT = (FATHA, DAMMA, KASRA, SUKUN)
206
+
207
+ TANWIN = (FATHATAN, DAMMATAN, KASRATAN)
208
+
209
+ NOT_DEF_HARAKA = TATWEEL
210
+
211
+ LIGUATURES = (
212
+ LAM_ALEF,
213
+ LAM_ALEF_HAMZA_ABOVE,
214
+ LAM_ALEF_HAMZA_BELOW,
215
+ LAM_ALEF_MADDA_ABOVE,
216
+ )
217
+
218
+ HAMZAT = (
219
+ HAMZA,
220
+ WAW_HAMZA,
221
+ YEH_HAMZA,
222
+ HAMZA_ABOVE,
223
+ HAMZA_BELOW,
224
+ ALEF_HAMZA_BELOW,
225
+ ALEF_HAMZA_ABOVE,
226
+ )
227
+
228
+ ALEFAT = (
229
+ ALEF,
230
+ ALEF_MADDA,
231
+ ALEF_HAMZA_ABOVE,
232
+ ALEF_HAMZA_BELOW,
233
+ ALEF_WASLA,
234
+ ALEF_MAKSURA,
235
+ SMALL_ALEF,
236
+ )
237
+
238
+ WEAK = (ALEF, WAW, YEH, ALEF_MAKSURA)
239
+ YEHLIKE = (YEH, YEH_HAMZA, ALEF_MAKSURA, SMALL_YEH)
240
+
241
+ WAWLIKE = (WAW, WAW_HAMZA, SMALL_WAW)
242
+ TEHLIKE = (TEH, TEH_MARBUTA)
243
+
244
+ SMALL = (SMALL_ALEF, SMALL_WAW, SMALL_YEH)
245
+
246
+ MOON = (
247
+ HAMZA,
248
+ ALEF_MADDA,
249
+ ALEF_HAMZA_ABOVE,
250
+ ALEF_HAMZA_BELOW,
251
+ ALEF,
252
+ BEH,
253
+ JEEM,
254
+ HAH,
255
+ KHAH,
256
+ AIN,
257
+ GHAIN,
258
+ FEH,
259
+ QAF,
260
+ KAF,
261
+ MEEM,
262
+ HEH,
263
+ WAW,
264
+ YEH,
265
+ )
266
+
267
+ SUN = (
268
+ TEH,
269
+ THEH,
270
+ DAL,
271
+ THAL,
272
+ REH,
273
+ ZAIN,
274
+ SEEN,
275
+ SHEEN,
276
+ SAD,
277
+ DAD,
278
+ TAH,
279
+ ZAH,
280
+ LAM,
281
+ NOON,
282
+ )
283
+
284
+ ALPHABETIC_ORDER = {
285
+ ALEF: 1,
286
+ BEH: 2,
287
+ TEH: 3,
288
+ TEH_MARBUTA: 3,
289
+ THEH: 4,
290
+ JEEM: 5,
291
+ HAH: 6,
292
+ KHAH: 7,
293
+ DAL: 8,
294
+ THAL: 9,
295
+ REH: 10,
296
+ ZAIN: 11,
297
+ SEEN: 12,
298
+ SHEEN: 13,
299
+ SAD: 14,
300
+ DAD: 15,
301
+ TAH: 16,
302
+ ZAH: 17,
303
+ AIN: 18,
304
+ GHAIN: 19,
305
+ FEH: 20,
306
+ QAF: 21,
307
+ KAF: 22,
308
+ LAM: 23,
309
+ MEEM: 24,
310
+ NOON: 25,
311
+ HEH: 26,
312
+ WAW: 27,
313
+ YEH: 28,
314
+ HAMZA: 29,
315
+ ALEF_MADDA: 29,
316
+ ALEF_HAMZA_ABOVE: 29,
317
+ WAW_HAMZA: 29,
318
+ ALEF_HAMZA_BELOW: 29,
319
+ YEH_HAMZA: 29,
320
+ }
321
+
322
+ NAMES = {
323
+ ALEF: "ألف",
324
+ BEH: "باء",
325
+ TEH: "تاء",
326
+ TEH_MARBUTA: "تاء مربوطة",
327
+ THEH: "ثاء",
328
+ JEEM: "جيم",
329
+ HAH: "حاء",
330
+ KHAH: "خاء",
331
+ DAL: "دال",
332
+ THAL: "ذال",
333
+ REH: "راء",
334
+ ZAIN: "زاي",
335
+ SEEN: "سين",
336
+ SHEEN: "شين",
337
+ SAD: "صاد",
338
+ DAD: "ضاد",
339
+ TAH: "طاء",
340
+ ZAH: "ظاء",
341
+ AIN: "عين",
342
+ GHAIN: "غين",
343
+ FEH: "فاء",
344
+ QAF: "قاف",
345
+ KAF: "كاف",
346
+ LAM: "لام",
347
+ MEEM: "ميم",
348
+ NOON: "نون",
349
+ HEH: "هاء",
350
+ WAW: "واو",
351
+ YEH: "ياء",
352
+ HAMZA: "همزة",
353
+ TATWEEL: "تطويل",
354
+ ALEF_MADDA: "ألف ممدودة",
355
+ ALEF_MAKSURA: "ألف مقصورة",
356
+ ALEF_HAMZA_ABOVE: "همزة على الألف",
357
+ WAW_HAMZA: "همزة على الواو",
358
+ ALEF_HAMZA_BELOW: "همزة تحت الألف",
359
+ YEH_HAMZA: "همزة على الياء",
360
+ FATHATAN: "فتحتان",
361
+ DAMMATAN: "ضمتان",
362
+ KASRATAN: "كسرتان",
363
+ FATHA: "فتحة",
364
+ DAMMA: "ضمة",
365
+ KASRA: "كسرة",
366
+ SHADDA: "شدة",
367
+ SUKUN: "سكون",
368
+ }
369
+ HAMZAT_STRING = "".join(HAMZAT)
370
+ HARAKAT_STRING = "".join(HARAKAT)
371
+ TASHKEEL_STRING = "".join(TASHKEEL)
372
+ # regular expressions
373
+
374
+ HARAKAT_PATTERN = re.compile("[" + "".join(HARAKAT) + "]", re.UNICODE)
375
+ # ~ """ pattern to strip Harakat"""
376
+ LASTHARAKA_PATTERN = re.compile(
377
+ "[%s]$|[%s]" % ("".join(HARAKAT), "".join(TANWIN)), re.UNICODE
378
+ )
379
+ # ~ """ Pattern to strip only the last haraka """
380
+ SHORTHARAKAT_PATTERN = re.compile("[" + "".join(SHORTHARAKAT) + "]", re.UNICODE)
381
+ # ~ Pattern to lookup Short Harakat(Fatha, Damma, Kasra, sukun, tanwin),
382
+ # but not shadda
383
+ TASHKEEL_PATTERN = re.compile("[" + "".join(TASHKEEL) + "]", re.UNICODE)
384
+ # ~ """ Harakat and shadda pattern """
385
+ HAMZAT_PATTERN = re.compile("[" + "".join(HAMZAT) + "]", re.UNICODE)
386
+ # ~ """ all hamzat pattern"""
387
+ ALEFAT_PATTERN = re.compile("[" + "".join(ALEFAT) + "]", re.UNICODE)
388
+ # ~ """ all alef like letters """
389
+ LIGUATURES_PATTERN = re.compile("[" + "".join(LIGUATURES) + "]", re.UNICODE)
390
+ # ~ """ all liguatures pattern """
391
+ TOKEN_PATTERN = re.compile(r"([^\w\u0670\u064b-\u0652']+)", re.UNICODE)
392
+ # ~ """ pattern to tokenize a text"""
393
+ TOKEN_PATTERN_SPLIT = re.compile(r"([\w\u0670\u064b-\u0652']+)", re.UNICODE)
394
+ # ~ """ pattern to tokenize a text with positions"""
395
+ TOKEN_REPLACE = re.compile("\t|\r|\f|\v| ")
396
+
397
+ # Arabic string
398
+ ARABIC_STRING = re.compile(
399
+ r"([^\u0600-\u0652%s%s%s\s\d])"
400
+ % (LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_MADDA_ABOVE),
401
+ re.UNICODE,
402
+ )
403
+ # Arabic range
404
+ ARABIC_RANGE = re.compile(
405
+ "([^\u0600-\u06ff\ufb50-\ufdff\ufe70-\ufeff\u0750-\u077f])", re.UNICODE
406
+ )
407
+
408
+ # Space fixes
409
+ FIX_SPACES_PAT = re.compile(r"\s*([?؟!.,،:]+(?:\s+[?؟!.,،:]+)*)\s*", re.UNICODE)
410
+
411
+ DIACRITICS = [
412
+ chr(x) for x in range(0x0600, 0x06FF) if unicodedata.category(chr(x)) == "Mn"
413
+ ]
414
+ # ~ \u0610 ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM
415
+ # ~ \u0611 ARABIC SIGN ALAYHE ASSALLAM
416
+ # ~ \u0612 ARABIC SIGN RAHMATULLAH ALAYHE
417
+ # ~ \u0613 ARABIC SIGN RADI ALLAHOU ANHU
418
+ # ~ \u0614 ARABIC SIGN TAKHALLUS
419
+ # ~ \u0615 ARABIC SMALL HIGH TAH
420
+ # ~ \u0616 ARABIC SMALL HIGH LIGATURE ALEF WITH LAM WITH YEH
421
+ # ~ \u0617 ARABIC SMALL HIGH ZAIN
422
+ # ~ \u0618 ARABIC SMALL FATHA
423
+ # ~ \u0619 ARABIC SMALL DAMMA
424
+ # ~ \u061a ARABIC SMALL KASRA
425
+ # ~ \u064b ARABIC FATHATAN
426
+ # ~ \u064c ARABIC DAMMATAN
427
+ # ~ \u064d ARABIC KASRATAN
428
+ # ~ \u064e ARABIC FATHA
429
+ # ~ \u064f ARABIC DAMMA
430
+ # ~ \u0650 ARABIC KASRA
431
+ # ~ \u0651 ARABIC SHADDA
432
+ # ~ \u0652 ARABIC SUKUN
433
+ # ~ \u0653 ARABIC MADDAH ABOVE
434
+ # ~ \u0654 ARABIC HAMZA ABOVE
435
+ # ~ \u0655 ARABIC HAMZA BELOW
436
+ # ~ \u0656 ARABIC SUBSCRIPT ALEF
437
+ # ~ \u0657 ARABIC INVERTED DAMMA
438
+ # ~ \u0658 ARABIC MARK NOON GHUNNA
439
+ # ~ \u0659 ARABIC ZWARAKAY
440
+ # ~ \u065a ARABIC VOWEL SIGN SMALL V ABOVE
441
+ # ~ \u065b ARABIC VOWEL SIGN INVERTED SMALL V ABOVE
442
+ # ~ \u065c ARABIC VOWEL SIGN DOT BELOW
443
+ # ~ \u065d ARABIC REVERSED DAMMA
444
+ # ~ \u065e ARABIC FATHA WITH TWO DOTS
445
+ # ~ \u0670 ARABIC LETTER SUPERSCRIPT ALEF
446
+ # ~ \u06d6 ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA
447
+ # ~ \u06d7 ARABIC SMALL HIGH LIGATURE QAF WITH LAM WITH ALEF MAKSURA
448
+ # ~ \u06d8 ARABIC SMALL HIGH MEEM INITIAL FORM
449
+ # ~ \u06d9 ARABIC SMALL HIGH LAM ALEF
450
+ # ~ \u06da ARABIC SMALL HIGH JEEM
451
+ # ~ \u06db ARABIC SMALL HIGH THREE DOTS
452
+ # ~ \u06dc ARABIC SMALL HIGH SEEN
453
+ # ~ \u06df ARABIC SMALL HIGH ROUNDED ZERO
454
+ # ~ \u06e0 ARABIC SMALL HIGH UPRIGHT RECTANGULAR ZERO
455
+ # ~ \u06e1 ARABIC SMALL HIGH DOTLESS HEAD OF KHAH
456
+ # ~ \u06e2 ARABIC SMALL HIGH MEEM ISOLATED FORM
457
+ # ~ \u06e3 ARABIC SMALL LOW SEEN
458
+ # ~ \u06e4 ARABIC SMALL HIGH MADDA
459
+ # ~ \u06e7 ARABIC SMALL HIGH YEH
460
+ # ~ \u06e8 ARABIC SMALL HIGH NOON
461
+ # ~ \u06ea ARABIC EMPTY CENTRE LOW STOP
462
+ # ~ \u06eb ARABIC EMPTY CENTRE HIGH STOP
463
+ # ~ \u06ec ARABIC ROUNDED HIGH STOP WITH FILLED CENTRE
464
+ # ~ \u06ed ARABIC SMALL LOW MEEM
465
+
466
+
467
+ ################################################
468
+ # { is letter functions
469
+ ################################################
470
+ def is_sukun(archar):
471
+ """Checks if the given ``archar`` is the Sukun mark."""
472
+ return archar == SUKUN
473
+
474
+
475
+ def is_shadda(archar):
476
+ """Checks if the given ``archar`` is Shadda Mark."""
477
+ return archar == SHADDA
478
+
479
+
480
+ def is_tatweel(archar):
481
+ """Checks if the given ``archar`` is the Tatweel letter modifier."""
482
+ return archar == TATWEEL
483
+
484
+
485
+ def is_tanwin(archar):
486
+ """Checks if the given ``archar`` is one of the Tanwin marks."""
487
+ return archar in TANWIN
488
+
489
+
490
+ def is_tashkeel(archar):
491
+ """Checks if the given ``archar`` is one of the Arabic Tashkeel marks (
492
+ - FATHA, DAMMA, KASRA, SUKUN,
493
+ - SHADDA,
494
+ - FATHATAN, DAMMATAN, KASRATAN)."""
495
+ return archar in TASHKEEL
496
+
497
+
498
+ def is_haraka(archar):
499
+ """Checks if the given ``archar`` is one of the Arabic Harakat marks (FATHA, DAMMA, KASRA, SUKUN, TANWIN)."""
500
+ return archar in HARAKAT
501
+
502
+
503
+ def is_shortharaka(archar):
504
+ """Checks if the given ``archar`` is one of the short Harakat marks (FATHA, DAMMA, KASRA, SUKUN)."""
505
+ return archar in SHORTHARAKAT
506
+
507
+
508
+ def is_ligature(archar):
509
+ """Checks for Arabic Ligatures like LamAlef.
510
+ (LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_ABOVE)
511
+ @param archar: arabic unicode char
512
+ @type archar: unicode
513
+ @return:
514
+ @rtype:Boolean
515
+ """
516
+ return archar in LIGUATURES
517
+
518
+
519
+ def is_hamza(archar):
520
+ """Checks for Arabic Hamza forms.
521
+ HAMZAT are (HAMZA, WAW_HAMZA, YEH_HAMZA, HAMZA_ABOVE, HAMZA_BELOW,
522
+ ALEF_HAMZA_BELOW, ALEF_HAMZA_ABOVE)
523
+ @param archar: arabic unicode char
524
+ @type archar: unicode
525
+ @return:
526
+ @rtype:Boolean
527
+ """
528
+ return archar in HAMZAT
529
+
530
+
531
+ def is_alef(archar):
532
+ """Checks for Arabic Alef forms.
533
+ ALEFAT = (ALEF, ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW, ALEF_WASLA, ALEF_MAKSURA)
534
+ @param archar: arabic unicode char
535
+ @type archar: unicode
536
+ @return:
537
+ @rtype:Boolean
538
+ """
539
+ return archar in ALEFAT
540
+
541
+
542
+ def is_yehlike(archar):
543
+ """Checks for Arabic Yeh forms.
544
+ Yeh forms : YEH, YEH_HAMZA, SMALL_YEH, ALEF_MAKSURA
545
+ @param archar: arabic unicode char
546
+ @type archar: unicode
547
+ @return:
548
+ @rtype:Boolean
549
+ """
550
+ return archar in YEHLIKE
551
+
552
+
553
+ def is_wawlike(archar):
554
+ """Checks for Arabic Waw like forms.
555
+ Waw forms : WAW, WAW_HAMZA, SMALL_WAW
556
+ @param archar: arabic unicode char
557
+ @type archar: unicode
558
+ @return:
559
+ @rtype:Boolean
560
+ """
561
+ return archar in WAWLIKE
562
+
563
+
564
+ def is_teh(archar):
565
+ """Checks for Arabic Teh forms.
566
+ Teh forms : TEH, TEH_MARBUTA
567
+ @param archar: arabic unicode char
568
+ @type archar: unicode
569
+ @return:
570
+ @rtype:Boolean
571
+ """
572
+ return archar in TEHLIKE
573
+
574
+
575
+ def is_small(archar):
576
+ """Checks for Arabic Small letters.
577
+ SMALL Letters : SMALL ALEF, SMALL WAW, SMALL YEH
578
+ @param archar: arabic unicode char
579
+ @type archar: unicode
580
+ @return:
581
+ @rtype:Boolean
582
+ """
583
+ return archar in SMALL
584
+
585
+
586
+ def is_weak(archar):
587
+ """Checks for Arabic Weak letters.
588
+ Weak Letters : ALEF, WAW, YEH, ALEF_MAKSURA
589
+ @param archar: arabic unicode char
590
+ @type archar: unicode
591
+ @return:
592
+ @rtype:Boolean
593
+ """
594
+ return archar in WEAK
595
+
596
+
597
+ def is_moon(archar):
598
+ """Checks for Arabic Moon letters.
599
+ Moon Letters :
600
+ @param archar: arabic unicode char
601
+ @type archar: unicode
602
+ @return:
603
+ @rtype:Boolean
604
+ """
605
+ return archar in MOON
606
+
607
+
608
+ def is_sun(archar):
609
+ """Checks for Arabic Sun letters.
610
+ Moon Letters :
611
+ @param archar: arabic unicode char
612
+ @type archar: unicode
613
+ @return:
614
+ @rtype:Boolean
615
+ """
616
+ return archar in SUN
617
+
618
+
619
+ #####################################
620
+ # { general letter functions
621
+ #####################################
622
+ def order(archar):
623
+ """return Arabic letter order between 1 and 29.
624
+ Alef order is 1, Yeh is 28, Hamza is 29.
625
+ Teh Marbuta has the same order as Teh, 3.
626
+ @param archar: arabic unicode char
627
+ @type archar: unicode
628
+ @return: arabic order.
629
+ @rtype: integer
630
+ """
631
+ return ALPHABETIC_ORDER.get(archar, 0)
632
+
633
+
634
+ def name(archar, default_name=""):
635
+ """return Arabic letter name in arabic. Alef order is 1, Yeh is 28,
636
+ Hamza is 29. Teh Marbuta has the same order as Teh, 3.
637
+ @param archar: arabic unicode char
638
+ @type archar: unicode
639
+ @param default_name: arabic unicode char
640
+ @type default_name: unicode
641
+ @return: arabic name.
642
+ @rtype: unicode
643
+ """
644
+ return NAMES.get(archar, default_name)
645
+
646
+
647
+ def arabicrange():
648
+ """return a list of arabic characters.
649
+ Return the characters from \u0600 to \u0652.
650
+ @return: list of arabic characters.
651
+ @rtype: unicode
652
+ """
653
+ mylist = []
654
+ for i in range(0x0600, 0x00653):
655
+ try:
656
+ mylist.append(chr(i))
657
+ except NameError:
658
+ # python 3 compatible
659
+ mylist.append(chr(i))
660
+ except ValueError:
661
+ pass
662
+ return mylist
663
+
664
+
665
+ #####################################
666
+ # { Has letter functions
667
+ #####################################
668
+ def has_shadda(word):
669
+ """Checks if the arabic word contains shadda.
670
+ @param word: arabic unicode char
671
+ @type word: unicode
672
+ @return: if shadda exists
673
+ @rtype:Boolean
674
+ """
675
+ if re.search(SHADDA, word):
676
+ return True
677
+ return False
678
+
679
+
680
+ #####################################
681
+ # { word and text functions
682
+ #####################################
683
+ def is_vocalized(word):
684
+ """Checks if the arabic word is vocalized.
685
+ the word mustn't contain any spaces or punctuation.
686
+ @param word: arabic unicode char
687
+ @type word: unicode
688
+ @return: if the word is vocalized
689
+ @rtype:Boolean
690
+ """
691
+ if word.isalpha():
692
+ return False
693
+ for char in word:
694
+ if is_tashkeel(char):
695
+ break
696
+ else:
697
+ return False
698
+ return True
699
+
700
+
701
+ def is_vocalizedtext(text):
702
+ """Checks if the arabic text is vocalized.
703
+ The text can contain many words and spaces
704
+ @param text: arabic unicode char
705
+ @type text: unicode
706
+ @return: if the word is vocalized
707
+ @rtype:Boolean
708
+ """
709
+ return bool(re.search(HARAKAT_PATTERN, text))
710
+
711
+
712
+ def is_arabicstring(text):
713
+ """Checks for Arabic standard Unicode block characters.
714
+ An arabic string can contain spaces, digits and punctuation,
715
+ but only standard arabic characters, not extended arabic.
716
+ @param text: input text
717
+ @type text: unicode
718
+ @return: True if all characters are in the Arabic block
719
+ @rtype: Boolean
720
+ """
721
+ if ARABIC_STRING.search(text):
722
+ return False
723
+ return True
724
+
725
+
726
+ def is_arabicrange(text):
727
+ """Checks for Arabic Unicode block characters.
728
+ @param text: input text
729
+ @type text: unicode
730
+ @return: True if all characters are in the Arabic block
731
+ @rtype: Boolean
732
+ """
733
+ if ARABIC_RANGE.search(text):
734
+ return False
735
+ return True
736
+
737
+
738
+ def is_arabicword(word):
739
+ """Checks for a valid Arabic word.
740
+ An Arabic word must not contain spaces, digits or punctuation;
741
+ to avoid some spelling errors, TEH_MARBUTA must be at the end.
742
+ @param word: input word
743
+ @type word: unicode
744
+ @return: True if all characters are in the Arabic block
745
+ @rtype: Boolean
746
+ """
747
+ if len(word) == 0:
748
+ return False
749
+ elif re.search(
750
+ "([^\u0600-\u0652%s%s%s])"
751
+ % (LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_MADDA_ABOVE),
752
+ word,
753
+ ):
754
+ return False
755
+ elif is_haraka(word[0]) or word[0] in (WAW_HAMZA, YEH_HAMZA):
756
+ return False
757
+ # reject if Alef Maksura or Teh Marbuta appears before the end
758
+ elif re.match("^(.)*[%s](.)+$" % ALEF_MAKSURA, word):
759
+ return False
760
+ elif re.match(
761
+ "^(.)*[%s]([^%s%s%s])(.)+$" % (TEH_MARBUTA, DAMMA, KASRA, FATHA), word
762
+ ):
763
+ return False
764
+ elif re.search(SHADDA + SHADDA, word):
765
+ return False
766
+ else:
767
+ return True
768
+
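A short doctest-style sketch of the validation rules above; the expected values follow directly from the checks in the function body:

    >>> is_arabicword("العربية")
    True
    >>> is_arabicword("Dog")   # Latin letters are outside the Arabic block
    False
    >>> is_arabicword("")      # the empty string is rejected
    False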
769
+
770
+ #####################################
771
+ # {Char functions
772
+ #####################################
773
+ def first_char(word):
774
+ """
775
+ Return the first char
776
+ @param word: given word
777
+ @type word: unicode
778
+ @return: the first char
779
+ @rtype: unicode char
780
+ """
781
+ return word[0]
782
+
783
+
784
+ def second_char(word):
785
+ """
786
+ Return the second char
787
+ @param word: given word
788
+ @type word: unicode
789
+ @return: the first char
790
+ @rtype: unicode char
791
+ """
792
+ return word[1:2]
793
+
794
+
795
+ def last_char(word):
796
+ """
797
+ Return the last letter
798
+ example: zerrouki; 'i' is the last.
799
+ @param word: given word
800
+ @type word: unicode
801
+ @return: the last letter
802
+ @rtype: unicode char
803
+ """
804
+ return word[-1:]
805
+
806
+
807
+ def secondlast_char(word):
808
+ """
809
+ Return the second last letter example: zerrouki; 'k' is the second last.
810
+ @param word: given word
811
+ @type word: unicode
812
+ @return: the second last letter
813
+ @rtype: unicode char
814
+ """
815
+ return word[-2:-1]
816
+
817
+
818
+ #####################################
819
+ # {Strip functions
820
+ #####################################
821
+ def strip_harakat(text):
822
+ """Strip Harakat from arabic word except Shadda.
823
+ The stripped marks are:
824
+ - FATHA, DAMMA, KASRA
825
+ - SUKUN
826
+ - FATHATAN, DAMMATAN, KASRATAN,
827
+
828
+ Example:
829
+ >>> text = u"الْعَرَبِيّةُ"
830
+ >>> strip_harakat(text)
831
+ >>> العربيّة
832
+
833
+ @param text: arabic text.
834
+ @type text: unicode.
835
+ @return: return the stripped text.
836
+ @rtype: unicode.
837
+ """
838
+ # if text:
839
+ # return re.sub(HARAKAT_PATTERN, u'', text)
840
+ # return text
841
+ if not text:
842
+ return text
843
+ elif is_vocalized(text):
844
+ for char in HARAKAT:
845
+ text = text.replace(char, "")
846
+ return text
847
+
848
+
849
+ def strip_lastharaka(text):
850
+ """Strip the last Haraka from arabic word except Shadda.
851
+ The stripped marks are:
852
+ - FATHA, DAMMA, KASRA
853
+ - SUKUN
854
+ - FATHATAN, DAMMATAN, KASRATAN
855
+
856
+ Example:
857
+ >>> text = u"الْعَرَبِيّةُ"
858
+ >>> strip_lastharaka(text)
859
+ الْعَرَبِيّة
860
+
861
+ @param text: arabic text.
862
+ @type text: unicode.
863
+ @return: return the stripped text.
864
+ @rtype: unicode.
865
+ """
866
+ if text:
867
+ if is_vocalized(text):
868
+ return re.sub(LASTHARAKA_PATTERN, "", text)
869
+ return text
870
+
871
+
872
+ def strip_tashkeel(text):
873
+ """Strip vowels from a text, include Shadda.
874
+ The stripped marks are:
875
+ - FATHA, DAMMA, KASRA
876
+ - SUKUN
877
+ - SHADDA
878
+ - FATHATAN, DAMMATAN, KASRATAN.
879
+
880
+ Example:
881
+ >>> text = u"الْعَرَبِيّةُ"
882
+ >>> strip_tashkeel(text)
883
+ العربية
884
+
885
+ @param text: arabic text.
886
+ @type text: unicode.
887
+ @return: return the stripped text.
888
+ @rtype: unicode.
889
+ """
890
+ if not text:
891
+ return text
892
+ elif is_vocalized(text):
893
+ for char in TASHKEEL:
894
+ text = text.replace(char, "")
895
+ return text
896
+
897
+
898
+ def strip_small(text):
899
+ """Strip small_letters from a text
900
+ The stripped marks are:
901
+ - Small Alef (الألف الخنجرية, the dagger alef).
902
+ - Small Waw
903
+ - Small Yeh
904
+ Example:
905
+ >>> text = u"الرحمن\u0670"
906
+ >>> strip_small(text)
907
+ الرحمن
908
+
909
+ @param text: arabic text.
910
+ @type text: unicode.
911
+ @return: return the stripped text.
912
+ @rtype: unicode.
913
+ """
914
+ if not text:
915
+ return text
916
+ for char in SMALL:
917
+ text = text.replace(char, "")
918
+ return text
919
+
920
+
921
+ def strip_tatweel(text):
922
+ """
923
+ Strip tatweel from a text and return a result text.
924
+
925
+ Example:
926
+ >>> text = u"العـــــربية"
927
+ >>> strip_tatweel(text)
928
+ العربية
929
+
930
+ @param text: arabic text.
931
+ @type text: unicode.
932
+ @return: return the stripped text.
933
+ @rtype: unicode.
934
+
935
+ """
936
+ return text.replace(TATWEEL, "")
937
+
938
+
939
+ def strip_shadda(text):
940
+ """
941
+ Strip Shadda from a text and return a result text.
942
+
943
+ Example:
944
+ >>> text = u"الشّمسيّة"
945
+ >>> strip_shadda(text)
946
+ الشمسية
947
+
948
+ @param text: arabic text.
949
+ @type text: unicode.
950
+ @return: return the stripped text.
951
+ @rtype: unicode.
952
+ """
953
+ return text.replace(SHADDA, "")
954
+
955
+
956
+ def strip_diacritics(text):
957
+ """Strip arabic diacritics from a text
958
+ The stripped marks are:
959
+ - Small Alef (الألف الخنجرية, the dagger alef).
960
+ - Harakat + Shadda
961
+ - Quranic marks
962
+ - Extended arabic diacritics
963
+ Example:
964
+ >>> text = u"الرحمن\u0670"
965
+ >>> strip_diacritics(text)
966
+ الرحمن
967
+
968
+ @param text: arabic text.
969
+ @type text: unicode.
970
+ @return: return the stripped text.
971
+ @rtype: unicode.
972
+ """
973
+ if not text:
974
+ return text
975
+ for char in DIACRITICS:
976
+ text = text.replace(char, "")
977
+ return text
978
+
979
+
980
+ def normalize_ligature(text):
981
+ """Normalize Lam Alef ligatures into two letters (LAM and ALEF),
982
+ and return a result text.
983
+ Some systems present lamAlef ligature as a single letter,
984
+ this function convert it into two letters,
985
+ The converted letters into LAM and ALEF are :
986
+ - LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_ABOVE
987
+
988
+ Example:
989
+ >>> text = u"لانها لالء الاسلام"
990
+ >>> normalize_ligature(text)
991
+ لانها لالئ الاسلام
992
+
993
+ @param text: arabic text.
994
+ @type text: unicode.
995
+ @return: return a converted text.
996
+ @rtype: unicode.
997
+ """
998
+ if text:
999
+ return LIGUATURES_PATTERN.sub("%s%s" % (LAM, ALEF), text)
1000
+ return text
1001
+
1002
+
1003
+ def normalize_hamza(word, method="uniform"):
1004
+ """Standardize the Hamzat into one form of hamza,
1005
+ replace Madda by hamza and alef.
1006
+ Replace the LamAlefs by simplified letters.
1007
+
1008
+
1009
+ Example:
1010
+ >>> import pyarabic.araby as araby
1011
+ >>> text1 = u"جاء سؤال الأئمة عن الإسلام آجلا"
1012
+ >>> araby.normalize_hamza(text1)
1013
+ 'جاء سءال الءءمة عن الءسلام ءءجلا'
1014
+ >>> araby.normalize_hamza(text1, method="tasheel")
1015
+ 'جاء سوال الايمة عن الاسلام اجلا'
1016
+ @param word: arabic text.
1017
+ @type word: unicode.
1018
+ @param method: how to convert hamzat (uniform, tasheel).
1019
+ @type method: unicode.
1020
+ @return: return a converted text.
1021
+ @rtype: unicode.
1022
+ """
1023
+ if method == "tasheel" or method == "تسهيل":
1024
+ # Alefat to Alef
1025
+ word = word.replace(ALEF_MADDA, ALEF)
1026
+ word = word.replace(ALEF_HAMZA_ABOVE, ALEF)
1027
+ word = word.replace(ALEF_HAMZA_BELOW, ALEF)
1028
+ word = word.replace(HAMZA_ABOVE, ALEF)
1029
+ word = word.replace(HAMZA_BELOW, ALEF)
1030
+ # on Waw
1031
+ word = word.replace(WAW_HAMZA, WAW)
1032
+ # on Yeh
1033
+ word = word.replace(YEH_HAMZA, YEH)
1034
+ else:
1035
+ if word.startswith(ALEF_MADDA):
1036
+ if (
1037
+ len(word) >= 3
1038
+ and (word[1] not in HARAKAT)
1039
+ and (word[2] == SHADDA or len(word) == 3)
1040
+ ):
1041
+ word = HAMZA + ALEF + word[1:]
1042
+ else:
1043
+ word = HAMZA + HAMZA + word[1:]
1044
+ # convert all remaining Hamza forms into one form
1045
+ word = word.replace(ALEF_MADDA, HAMZA + HAMZA)
1046
+ word = HAMZAT_PATTERN.sub(HAMZA, word)
1047
+ return word
1048
+
1049
+
1050
+ def normalize_teh(text):
1051
+ """
1052
+ converts TEH_MARBUTA to HEH
1053
+ Example:
1054
+ >>> text = 'محبة'
1055
+ >>> normalize_teh(text)
1056
+ 'محبه'
1057
+
1058
+ """
1059
+ return re.sub("[" + "".join(TEH_MARBUTA) + "]", HEH, text)
1060
+
1061
+
1062
+ def normalize_alef(text):
1063
+ """
1064
+ converts the various alef forms to the plain ALEF
1065
+
1066
+ """
1067
+ # a small alef adjacent to Alef Maksura is omitted
1068
+ text = text.replace(SMALL_ALEF + ALEF_MAKSURA, ALEF_MAKSURA)
1069
+ text = text.replace(ALEF_MAKSURA + SMALL_ALEF, ALEF_MAKSURA)
1070
+ return re.sub(ALEFAT_PATTERN, ALEF, text)
1071
+
1072
+
1073
+ def separate(word, extract_shadda=False):
1074
+ """
1075
+ separate the letters from the vowels in an arabic word;
1076
+ if a letter has no haraka, the undefined haraka (NOT_DEF_HARAKA) is assigned.
1077
+ return (letters, vowels)
1078
+
1079
+ Example:
1080
+ >>> araby.separate(text)
1081
+ (u'\u0627\u0644\u0639\u0631\u0628\u064a\u0629',
1082
+ u'\u064e\u0652\u064e\u064e\u064e\u064e\u064f')
1083
+ >>> letters, marks =araby.separate(text)
1084
+ >>> print letters.encode('utf8')
1085
+ العربية
1086
+ >>> print marks.encode('utf8')
1087
+ >>> for m in marks:
1088
+ ... print araby.name(m)
1089
+ فتحة
1090
+ سكون
1091
+ فتحة
1092
+ فتحة
1093
+ فتحة
1094
+ فتحة
1095
+ ضمة
1096
+
1097
+ @param word: the input word
1098
+ @type word: unicode
1099
+ @param extract_shadda: extract shadda as separate text
1100
+ @type extract_shadda: Boolean
1101
+ @return: (letters, vowels)
1102
+ @rtype:couple of unicode
1103
+ """
1104
+ stack1 = stack.Stack(word)
1105
+ # the word is inversed in the stack
1106
+ stack1.items.reverse()
1107
+ letters = stack.Stack()
1108
+ marks = stack.Stack()
1109
+ vowels = HARAKAT
1110
+ last1 = stack1.pop()
1111
+ # the last element must be a letter,
1112
+ # an arabic word can't start with a haraka
1113
+ # in the stack the word is reversed
1114
+ while last1 in vowels:
1115
+ last1 = stack1.pop()
1116
+ while last1 != None:
1117
+ if last1 in vowels:
1118
+ # we can't have two harakats side by side.
1119
+ # the shadda is considered as a letter
1120
+ marks.pop()
1121
+ marks.push(last1)
1122
+ elif last1 == SHADDA:
1123
+ # if the element is a Shadda,
1124
+ # the previous letter must have a sukun as mark,
1125
+ # and the shadda takes the indefinite mark
1126
+ marks.pop()
1127
+ marks.push(SUKUN)
1128
+ marks.push(NOT_DEF_HARAKA)
1129
+ letters.push(SHADDA)
1130
+ else:
1131
+ marks.push(NOT_DEF_HARAKA)
1132
+ letters.push(last1)
1133
+ last1 = stack1.pop()
1134
+ if extract_shadda:
1135
+ # the shadda is considered as letter
1136
+ wordletters = "".join(letters.items)
1137
+ # print wordletters.encode('utf8')
1138
+ shaddaplaces = re.sub("[^%s]" % SHADDA, TATWEEL, wordletters)
1139
+ shaddaplaces = re.sub("%s%s" % (TATWEEL, SHADDA), SHADDA, shaddaplaces)
1140
+ # print wordletters.encode('utf8')
1141
+ wordletters = strip_shadda(wordletters)
1142
+ # print wordletters.encode('utf8')
1143
+ return (wordletters, "".join(marks.items), shaddaplaces)
1144
+ else:
1145
+ return ("".join(letters.items), "".join(marks.items))
1146
+
1147
+
1148
+ def joint(letters, marks):
1149
+ """join the letters with the marks;
1150
+ the lengths of letters and marks must be equal;
1151
+ return word
1152
+
1153
+ Example:
1154
+ >>> letters = u"العربية"
1155
+ >>> marks = u'\u064e\u0652\u064e\u064e\u064e\u064e\u064f'
1156
+ >>> word = araby.joint(letters, marks)
1157
+ >>> print word.encode('utf8')
1158
+ اَلْعَرَبَيَةُ
1159
+
1160
+ @param letters: the word letters
1161
+ @type letters: unicode
1162
+ @param marks: the word marks
1163
+ @type marks: unicode
1164
+ @return: word
1165
+ @rtype: unicode
1166
+ """
1167
+ # The lengths of letters and marks must be equal
1168
+ if len(letters) != len(marks):
1169
+ return ""
1170
+ stack_letter = stack.Stack(letters)
1171
+ stack_letter.items.reverse()
1172
+ stack_mark = stack.Stack(marks)
1173
+ stack_mark.items.reverse()
1174
+
1175
+ word_stack = stack.Stack()
1176
+ last_letter = stack_letter.pop()
1177
+ last_mark = stack_mark.pop()
1178
+ vowels = HARAKAT
1179
+ while last_letter != None and last_mark != None:
1180
+ if last_letter == SHADDA:
1181
+ top = word_stack.pop()
1182
+ if top not in vowels:
1183
+ word_stack.push(top)
1184
+ word_stack.push(last_letter)
1185
+ if last_mark != NOT_DEF_HARAKA:
1186
+ word_stack.push(last_mark)
1187
+ else:
1188
+ word_stack.push(last_letter)
1189
+ if last_mark != NOT_DEF_HARAKA:
1190
+ word_stack.push(last_mark)
1191
+
1192
+ last_letter = stack_letter.pop()
1193
+ last_mark = stack_mark.pop()
1194
+
1195
+ if not (stack_letter.is_empty() and stack_mark.is_empty()):
1196
+ return False
1197
+ else:
1198
+ return "".join(word_stack.items)
1199
+
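A round-trip sketch tying separate() and joint() together; the fully vocalized word is the one shown in the joint() docstring above, so every letter carries a mark and the word reconstructs exactly:

    >>> letters, marks = separate("اَلْعَرَبَيَةُ")
    >>> letters
    'العربية'
    >>> joint(letters, marks) == "اَلْعَرَبَيَةُ"
    True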
1200
+
1201
+ def vocalizedlike(word1, word2):
1202
+ """
1203
+ If the two words have the same letters and the same harakats, this function returns True.
1204
+ The two words can be full vocalized, or partial vocalized
1205
+
1206
+ Example:
1207
+ >>> word1 = u"ضَربٌ"
1208
+ >>> word2 = u"ضَرْبٌ"
1209
+ >>> araby.vocalizedlike(word1, word2)
1210
+ True
1211
+
1212
+ @param word1: first word
1213
+ @type word1: unicode
1214
+ @param word2: second word
1215
+ @type word2: unicode
1216
+ @return: if two words have similar vocalization
1217
+ @rtype: Boolean
1218
+ """
1219
+ if vocalized_similarity(word1, word2) < 0:
1220
+ return False
1221
+ else:
1222
+ return True
1223
+
1224
+
1225
+ # -------------------------
1226
+ # Function def vaznlike(word1, wazn):
1227
+ # -------------------------
1228
+ def waznlike(word1, wazn, extract_root=False):
1229
+ """Checks if word1 matches a wazn (pattern); can also return the root.
1230
+ the letters must be equal,
1231
+ the wazn uses the letters FEH, AIN, LAM
1232
+ as generic (placeholder) letters.
1233
+ The two words can be full vocalized, or partial vocalized
1234
+
1235
+ Example:
1236
+ >>> word1 = u"ضارب"
1237
+ >>> wazn = u"فَاعِل"
1238
+ >>> araby.waznlike(word1, wazn)
1239
+ True
1240
+
1241
+ @param word1: input word
1242
+ @type word1: unicode
1243
+ @param wazn: given word template وزن
1244
+ @type wazn: unicode
1245
+ @param extract_root: return root if True
1246
+ @type extract_root: boolean
1247
+ @return: if two words have similar vocalization
1248
+ @rtype: Boolean
1249
+ """
1250
+ stack1 = stack.Stack(word1)
1251
+ stack2 = stack.Stack(wazn)
1252
+ root = stack.Stack()
1253
+ last1 = stack1.pop()
1254
+ last2 = stack2.pop()
1255
+ vowels = HARAKAT
1256
+ while last1 != None and last2 != None:
1257
+ if last1 == last2 and last2 not in (FEH, AIN, LAM):
1258
+ last1 = stack1.pop()
1259
+ last2 = stack2.pop()
1260
+ elif last1 not in vowels and last2 in (FEH, AIN, LAM):
1261
+ root.push(last1)
1262
+ # ~ print "t"
1263
+ last1 = stack1.pop()
1264
+ last2 = stack2.pop()
1265
+ elif last1 in vowels and last2 not in vowels:
1266
+ last1 = stack1.pop()
1267
+ elif last1 not in vowels and last2 in vowels:
1268
+ last2 = stack2.pop()
1269
+ else:
1270
+ break
1271
+ # reverse the root letters
1272
+ root.items.reverse()
1273
+ # ~ print " the root is ", root.items#"".join(root.items)
1274
+ if not (stack1.is_empty() and stack2.is_empty()):
1275
+ return False
1276
+ # if one letter remains in one stack after popping
1277
+ elif last1 != None or last2 != None:
1278
+ return False
1279
+ else:
1280
+ if extract_root:
1281
+ return "".join(root.items)
1282
+ else:
1283
+ return True
1284
+
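Extending the docstring example above with the extract_root option: a hedged sketch, assuming the bundled stack.Stack pops from the end and returns None when empty, as the rest of this module relies on:

    >>> waznlike("ضارب", "فَاعِل")
    True
    >>> waznlike("ضارب", "فَاعِل", extract_root=True)
    'ضرب'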
1285
+
1286
+ def shaddalike(partial, fully):
1287
+ """
1288
+ If the two words have the same letters and the same harakats, this function returns True.
1289
+ The first word is partially vocalized, the second is fully vocalized;
1290
+ if the partial one contains a shadda, it must be at the same place in the fully vocalized one.
1291
+
1292
+ Example:
1293
+ >>> word1 = u"ردّ"
1294
+ >>> word2=u"ردَّ"
1295
+ >>> araby.shaddalike(word1, word2)
1296
+ True
1297
+
1298
+ @param partial: the partially vocalized word
1299
+ @type partial: unicode
1300
+ @param fully: the fully vocalized word
1301
+ @type fully: unicode
1302
+ @return: if contains shadda
1303
+ @rtype: Boolean
1304
+ """
1305
+ # the input has no shadda, no need to check
1306
+ if not has_shadda(partial):
1307
+ return True
1308
+ # the input has a shadda but the result does not: mismatch
1309
+ elif not has_shadda(fully) and has_shadda(partial):
1310
+ return False
1311
+
1312
+ # both the input and the output have a shadda, check that their positions match
1313
+ partial = strip_harakat(partial)
1314
+ fully = strip_harakat(fully)
1315
+ pstack = stack.Stack(partial)
1316
+ vstack = stack.Stack(fully)
1317
+ plast = pstack.pop()
1318
+ vlast = vstack.pop()
1319
+ # if debug: print "+0", Pstack, Vstack
1320
+ while plast != None and vlast != None:
1321
+ if plast == vlast:
1322
+ plast = pstack.pop()
1323
+ vlast = vstack.pop()
1324
+ elif plast == SHADDA and vlast != SHADDA:
1325
+ # if debug: print "+2", Pstack.items, Plast, Vstack.items, Vlast
1326
+ break
1327
+ elif plast != SHADDA and vlast == SHADDA:
1328
+ # if debug: print "+2", Pstack.items, Plast, Vstack.items, Vlast
1329
+ vlast = vstack.pop()
1330
+ else:
1331
+ # if debug: print "+2", Pstack.items, Plast, Vstack.items, Vlast
1332
+ break
1333
+ if not (pstack.is_empty() and vstack.is_empty()):
1334
+ return False
1335
+ else:
1336
+ return True
1337
+
1338
+
1339
+ def reduce_tashkeel(text):
1340
+ """Reduce the Tashkeel, by deleting evident cases.
1341
+
1342
+ Exmaple:
1343
+ >>> word = u"يُتَسََلَّمْنَ"
1344
+ >>> reduced = araby.reduce_tashkeel(word)
1345
+ >>> print reduced.encode('utf8')
1346
+ يُتسلّمن
1347
+
1348
+ @param text: the input text fully vocalized.
1349
+ @type text: unicode.
1350
+ @return : partially vocalized text.
1351
+ @rtype: unicode.
1352
+
1353
+ """
1354
+ patterns = [
1355
+ # delete all fathat, except on waw and yeh
1356
+ "(?<!(%s|%s))(%s|%s)"
1357
+ % (WAW, YEH, SUKUN, FATHA), # delete damma if followed by waw.
1358
+ "%s(?=%s)" % (DAMMA, WAW), # delete kasra if followed by yeh.
1359
+ "%s(?=%s)"
1360
+ % (KASRA, YEH), # delete fatha if followed by alef to reduce yeh maftouha
1361
+ # and waw maftouha before alef.
1362
+ "%s(?=%s)"
1363
+ % (
1364
+ FATHA,
1365
+ ALEF,
1366
+ ), # delete fatha from yeh and waw if they are in the word begining.
1367
+ r"(?<=\s(%s|%s))%s"
1368
+ % (WAW, YEH, FATHA), # delete kasra if preceded by Hamza below alef.
1369
+ "(?<=%s)%s" % (ALEF_HAMZA_BELOW, KASRA),
1370
+ ]
1371
+ reduced = text
1372
+ for pat in patterns:
1373
+ reduced = re.sub(pat, "", reduced)
1374
+ return reduced
1375
+
1376
+
1377
+ def vocalized_similarity(word1, word2):
1378
+ """If the two words have the same letters and the same harakats, this function returns True.
1379
+ The two words can be full vocalized, or partial vocalized
1380
+
1381
+ Example:
1382
+ >>> word1 = u"ضَربٌ"
1383
+ >>> word2 = u"ضَرْبٌ"
1384
+ >>> araby.vocalizedlike(word1, word2)
1385
+ True
1386
+ >>> word1 = u"ضَربٌ"
1387
+ >>> word2 = u"ضَرْبٍ"
1388
+ >>> araby.vocalized_similarity(word1, word2)
1389
+ -1
1390
+
1391
+ @param word1: first word
1392
+ @type word1: unicode
1393
+ @param word2: second word
1394
+ @type word2: unicode
1395
+ @return: True if the words are similar, else the negative of the number of mismatches
1396
+ @rtype: Boolean / int
1397
+ """
1398
+ stack1 = stack.Stack(word1)
1399
+ stack2 = stack.Stack(word2)
1400
+ last1 = stack1.pop()
1401
+ last2 = stack2.pop()
1402
+ err_count = 0
1403
+ vowels = HARAKAT
1404
+ while last1 != None and last2 != None:
1405
+ if last1 == last2:
1406
+ last1 = stack1.pop()
1407
+ last2 = stack2.pop()
1408
+ elif last1 in vowels and last2 not in vowels:
1409
+ last1 = stack1.pop()
1410
+ elif last1 not in vowels and last2 in vowels:
1411
+ last2 = stack2.pop()
1412
+ else:
1413
+ # break
1414
+ if last1 == SHADDA:
1415
+ last1 = stack1.pop()
1416
+ elif last2 == SHADDA:
1417
+ last2 = stack2.pop()
1418
+ else:
1419
+ last1 = stack1.pop()
1420
+ last2 = stack2.pop()
1421
+ err_count += 1
1422
+ if err_count > 0:
1423
+ return -err_count
1424
+ else:
1425
+ return True
1426
+
1427
+
1428
+ def sentence_tokenize(text):
1429
+ """
1430
+ Tokenize text into sentences.
1431
+
1432
+ Example:
1433
+ >>> text = u"العربية لغة جميلة. والبلاد بعيدة، والشوق زائد"
1434
+ >>> tokens = araby.sentence_tokenize(text)
1435
+ >>> print(tokens)
1436
+ ‎‎['العربية لغة جميلة.', 'والبلاد بعيدة،', 'والشوق زائد']
1437
+
1438
+ @param text: the input text.
1439
+ @type text: unicode.
1440
+ @return: list of sentences.
1441
+ @rtype: list.
1442
+ """
1443
+ text = re.sub("([.,:;،؟?\n])+([\n\t\r ])+", r"\1<SPLIT>", text, flags=re.UNICODE)
1444
+ sentences = re.split("<SPLIT>", text)
1445
+ return sentences
1446
+
1447
+
1448
+ def tokenize(text="", conditions=[], morphs=[]):
1449
+ """
1450
+ Tokenize text into words.
1451
+
1452
+ Example:
1453
+ >>> text = u"العربية لغة جميلة."
1454
+ >>> tokens = araby.tokenize(text)
1455
+ >>> print u"\\n".join(tokens)
1456
+ ‎العربية
1457
+ ‎لغة
1458
+ ‎جميلة
1459
+ .
1460
+
1461
+ Example 2 (To remove tashkeel and filter out non-Arabic words:):
1462
+ >>> text = u"ِاسمٌ الكلبِ في اللغةِ الإنجليزية Dog واسمُ الحمارِ Donky"
1463
+ >>> tokenize(text, conditions=is_arabicrange, morphs=strip_tashkeel)
1464
+ ['اسم', 'الكلب', 'في', 'اللغة', 'الإنجليزية', 'واسم', 'الحمار']
1465
+
1466
+ Example 3 (This structure will enable us to create functions on the fly and pass them:):
1467
+ >>> text = u"طلع البدر علينا من ثنيات الوداع"
1468
+ >>>tokenize(text, conditions=lambda x: x.startswith(u'ال'))
1469
+ ['البدر', 'الوداع']
1470
+
1471
+ @param text: the input text.
1472
+ @type text: unicode.
1473
+ @param conditions: a list of conditions to be applied on tokens, like avoiding non arabic letters.
1474
+ @type conditions: one or list of conditions .
1475
+ @param morphs: a list of morphological change functions to be applied to tokens, like stripping tashkeel or normalizing tokens.
1476
+ @type morphs: one or list of morphological functions .
1477
+ @return: list of words.
1478
+ @rtype: list.
1479
+ """
1480
+ if text:
1481
+ # to be tolerant and allow for a single condition and/or morph to be passed
1482
+ # without having to enclose it in a list
1483
+ if type(conditions) is not list:
1484
+ conditions = [conditions]
1485
+ if type(morphs) is not list:
1486
+ morphs = [morphs]
1487
+
1488
+ tokens = TOKEN_PATTERN.split(text)
1489
+ tokens = [
1490
+ TOKEN_REPLACE.sub("", tok) for tok in tokens if TOKEN_REPLACE.sub("", tok)
1491
+ ]
1492
+
1493
+ if conditions:
1494
+ tokens = [tok for tok in tokens if all([cond(tok) for cond in conditions])]
1495
+ if morphs:
1496
+
1497
+ def morph(tok):
1498
+ for m in morphs:
1499
+ tok = m(tok)
1500
+ return tok
1501
+
1502
+ tokens = [morph(tok) for tok in tokens]
1503
+ return tokens
1504
+ else:
1505
+ return []
1506
+
1507
+
1508
+ def tokenize_with_location(text: str) -> list:
1509
+ """
1510
+ Tokenize text into words with their positions.
1511
+
1512
+ Example:
1513
+ >>> text = "حدثنا ابن أبي عامر، قال: رايت مناما"
1514
+ >>> tokens = araby.tokenize_with_location(text)
1515
+ >>> print u"\\n".join(tokens)
1516
+ [{'token': 'حدثنا', 'start': 0, 'end': 5},
1517
+ {'token': 'ابن', 'start': 6, 'end': 9},
1518
+ {'token': 'أبي', 'start': 10, 'end': 13},
1519
+ {'token': 'عامر', 'start': 14, 'end': 18},
1520
+ {'token': 'قال', 'start': 20, 'end': 23},
1521
+ {'token': 'رايت', 'start': 25, 'end': 29},
1522
+ {'token': 'مناما','start': 30, 'end': 35}
1523
+ ]
1524
+
1525
+
1526
+ @param text: the input text.
1527
+ @type text: unicode.
1528
+ @return: list of dict of (tokens, starts, ends).
1529
+ @rtype: list of dict.
1530
+ """
1531
+ tokens = []
1532
+ for match in TOKEN_PATTERN_SPLIT.finditer(text):
1533
+ tokens.append(
1534
+ {
1535
+ "token": text[match.start() : match.end()],
1536
+ "start": match.start(),
1537
+ "end": match.end(),
1538
+ }
1539
+ )
1540
+
1541
+ return tokens
1542
+
1543
+
1544
+ def fix_spaces(text):
1545
+ """Remove spaces before punctuation marks and leave a single space after them."""
1546
+ text = FIX_SPACES_PAT.sub(lambda x: "{} ".format(x.group(1).replace(" ", "")), text)
1547
+ return text.strip()
1548
+
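Since fix_spaces() has no docstring example, a small sketch of its effect, derived from FIX_SPACES_PAT above: spaces before a punctuation run are removed and a single space is left after it:

    >>> fix_spaces("مرحبا ، كيف الحال ؟")
    'مرحبا، كيف الحال؟'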
1549
+
1550
+ def autocorrect(text):
1551
+ """
1552
+ Correct the most common errors in a word,
1553
+ like repetition of harakats, or tanwin before alef.
1554
+ @param text: input text
1555
+ @type text: unicode
1556
+ @return: corrected text
1557
+ @rtype: unicode
1558
+ """
1559
+ ## HARAKAT
1560
+ text = re.sub(r"(?<=[\s\d])([%s])+" % (TASHKEEL_STRING), "", text, flags=re.UNICODE)
1561
+ text = re.sub("^([%s])+" % (TASHKEEL_STRING), "", text, flags=re.UNICODE)
1562
+ # tanwin on alef
1563
+ text = re.sub(ALEF + FATHATAN, FATHATAN + ALEF, text, flags=re.UNICODE)
1564
+
1565
+ # SUKUN misplaced on alef /alef maksura and TEH merbuta
1566
+ text = re.sub(
1567
+ "(?<=[%s%s%s])([%s])+" % (ALEF, ALEF_MAKSURA, TEH_MARBUTA, SUKUN),
1568
+ "",
1569
+ text,
1570
+ flags=re.UNICODE,
1571
+ )
1572
+
1573
+ # Haraka before Shadda
1574
+ text = re.sub("([%s])+(?=[%s])" % (HARAKAT_STRING, SHADDA), "", text, flags=re.UNICODE)
1575
+
1576
+ # repeated harakat
1577
+ text = re.sub(
1578
+ "(?<=[%s])([%s])+" % (HARAKAT_STRING, HARAKAT_STRING), "", text, flags=re.UNICODE
1579
+ )
1580
+
1581
+ ## Letters
1582
+ return text
1583
+
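One concrete case of the corrections above, the tanwin-after-alef fix; the word is built from the module's own constants to keep the codepoint order unambiguous (the other substitutions leave this input unchanged):

    >>> word = "كتاب" + ALEF + FATHATAN      # tanwin typed after the alef
    >>> autocorrect(word) == "كتاب" + FATHATAN + ALEF
    True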
1584
+
1585
+ def spellit(word, lang="ar"):
1586
+ """
1587
+ spell the word out using its full letter names
1588
+ """
1589
+
1590
+ names = []
1591
+ if lang == "unicode":
1592
+ for c in word:
1593
+ names.append(unicodedata.name(c, c))
1594
+ else:
1595
+ for c in word:
1596
+ names.append(name(c, c))
1597
+ return ", ".join(names)
1598
+
1599
+
1600
+ if __name__ == "__main__":
1601
+ # ~WORDS = [u'الْدَرَاجَةُ', u'الدّرّاجة',
1602
+ # ~u'سّلّامْ', ]
1603
+ # ~for wrd in WORDS:
1604
+ # ~l, m, s = separate(wrd, True)
1605
+ # ~l = joint(l, s)
1606
+ # ~print u'\t'.join([wrd, l, m, s]).encode('utf8')
1607
+ # ~newword = joint(l, m)
1608
+ # ~assert (newword != wrd)
1609
+ print("like: ", vocalizedlike("مُتَوَهِّمًا", "متوهمًا"))
1610
+ print("sim: ", vocalized_similarity("ثمّ", "ثُمَّ"))
1611
+ print("like: ", vocalizedlike("ثمّ", "ثُمَّ"))
1612
+ print("sim: ", vocalized_similarity("ثم", "ثُمَّ"))
1613
+ print("like: ", vocalizedlike("ثم", "ثُمَّ"))
1614
+ print("sim: ", vocalized_similarity("مُتَوَهِّمًا", "متوهمًا"))
1615
+ print("sim: ", vocalized_similarity("مُتَوَهِّمًا", "متوهمًا"))
1616
+ text1 = "العربية: لغة جميلة."
1617
+ wordlist = ["العربية", ":", "لغة", "جميلة", "."]
1618
+ wl = tokenize(text1)
1619
+
1620
+ print(" use tokenize")
1621
+ print(wl)
1622
+ # ~ print((repr(wl)).decode('unicode-escape'))
1623
+ # ~ print((repr(wordlist)).decode('unicode-escape'))
1624
+ # ~ TOKEN_PATTERN2 = re.compile(u"[^\w\u064b-\u0652']+", re.UNICODE)
1625
+ # ~ words = TOKEN_PATTERN2.split(text1)
1626
+ # ~ print(" first")
1627
+ # ~ print((repr(words)).decode('unicode-escape'))
1628
+ # ~ TOKEN_PATTERN3 = re.compile(u"([^\w\u064b-\u0652']+)", re.UNICODE)
1629
+ # ~ words = TOKEN_PATTERN3.split(text1)
1630
+ # ~ print(" modified")
1631
+ # ~ print (repr(words)).decode('unicode-escape')
1632
+
1633
+ # ~ TOKEN_PATTERN4 = re.compile(u"([^\w\u064b-\u0652']+)", re.UNICODE)
1634
+ # ~ words = TOKEN_PATTERN4.split(text1)
1635
+ # ~ print(" modified without r-prefix")
1636
+ # ~ print((repr(words)).decode('unicode-escape'))
1637
+
1638
+ # ~ text = u"ِاسمٌ الكلبِ في اللغةِ الإنجليزية Dog واسمُ الحمارِ Donky"
1639
+ # ~ words = tokenize(text, conditions=is_arabicrange, morphs=strip_tashkeel)
1640
+ # ~ print((repr(words)).decode('unicode-escape'))
1641
+
1642
+ # ~ #>> ['اسم', 'الكلب', 'في', 'اللغة', 'الإنجليزية', 'واسم', 'الحمار']
1643
+
1644
+ # ~ text = u"طلع البدر علينا من ثنيات الوداع"
1645
+ # ~ words = tokenize(text, conditions=lambda x: x.startswith(u'ال'))
1646
+ # ~ # >> ['البدر', 'الوداع']
1647
+ # ~ print((repr(words)).decode('unicode-escape'))