phoonnx 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. phoonnx/__init__.py +0 -0
  2. phoonnx/config.py +490 -0
  3. phoonnx/locale/ca/phonetic_spellings.txt +2 -0
  4. phoonnx/locale/en/phonetic_spellings.txt +1 -0
  5. phoonnx/locale/gl/phonetic_spellings.txt +2 -0
  6. phoonnx/locale/pt/phonetic_spellings.txt +2 -0
  7. phoonnx/phoneme_ids.py +453 -0
  8. phoonnx/phonemizers/__init__.py +45 -0
  9. phoonnx/phonemizers/ar.py +42 -0
  10. phoonnx/phonemizers/base.py +216 -0
  11. phoonnx/phonemizers/en.py +250 -0
  12. phoonnx/phonemizers/fa.py +46 -0
  13. phoonnx/phonemizers/gl.py +142 -0
  14. phoonnx/phonemizers/he.py +67 -0
  15. phoonnx/phonemizers/ja.py +119 -0
  16. phoonnx/phonemizers/ko.py +97 -0
  17. phoonnx/phonemizers/mul.py +606 -0
  18. phoonnx/phonemizers/vi.py +44 -0
  19. phoonnx/phonemizers/zh.py +308 -0
  20. phoonnx/thirdparty/__init__.py +0 -0
  21. phoonnx/thirdparty/arpa2ipa.py +249 -0
  22. phoonnx/thirdparty/cotovia/cotovia_aarch64 +0 -0
  23. phoonnx/thirdparty/cotovia/cotovia_x86_64 +0 -0
  24. phoonnx/thirdparty/hangul2ipa.py +783 -0
  25. phoonnx/thirdparty/ko_tables/aspiration.csv +20 -0
  26. phoonnx/thirdparty/ko_tables/assimilation.csv +31 -0
  27. phoonnx/thirdparty/ko_tables/double_coda.csv +17 -0
  28. phoonnx/thirdparty/ko_tables/hanja.tsv +8525 -0
  29. phoonnx/thirdparty/ko_tables/ipa.csv +22 -0
  30. phoonnx/thirdparty/ko_tables/neutralization.csv +11 -0
  31. phoonnx/thirdparty/ko_tables/tensification.csv +56 -0
  32. phoonnx/thirdparty/ko_tables/yale.csv +22 -0
  33. phoonnx/thirdparty/kog2p/__init__.py +385 -0
  34. phoonnx/thirdparty/kog2p/rulebook.txt +212 -0
  35. phoonnx/thirdparty/mantoq/__init__.py +67 -0
  36. phoonnx/thirdparty/mantoq/buck/__init__.py +0 -0
  37. phoonnx/thirdparty/mantoq/buck/phonetise_buckwalter.py +569 -0
  38. phoonnx/thirdparty/mantoq/buck/symbols.py +64 -0
  39. phoonnx/thirdparty/mantoq/buck/tokenization.py +105 -0
  40. phoonnx/thirdparty/mantoq/num2words.py +37 -0
  41. phoonnx/thirdparty/mantoq/pyarabic/__init__.py +12 -0
  42. phoonnx/thirdparty/mantoq/pyarabic/arabrepr.py +64 -0
  43. phoonnx/thirdparty/mantoq/pyarabic/araby.py +1647 -0
  44. phoonnx/thirdparty/mantoq/pyarabic/named_const.py +227 -0
  45. phoonnx/thirdparty/mantoq/pyarabic/normalize.py +161 -0
  46. phoonnx/thirdparty/mantoq/pyarabic/number.py +826 -0
  47. phoonnx/thirdparty/mantoq/pyarabic/number_const.py +1704 -0
  48. phoonnx/thirdparty/mantoq/pyarabic/stack.py +52 -0
  49. phoonnx/thirdparty/mantoq/pyarabic/trans.py +517 -0
  50. phoonnx/thirdparty/mantoq/unicode_symbol2label.py +4173 -0
  51. phoonnx/thirdparty/tashkeel/LICENSE +22 -0
  52. phoonnx/thirdparty/tashkeel/SOURCE +1 -0
  53. phoonnx/thirdparty/tashkeel/__init__.py +212 -0
  54. phoonnx/thirdparty/tashkeel/hint_id_map.json +18 -0
  55. phoonnx/thirdparty/tashkeel/input_id_map.json +56 -0
  56. phoonnx/thirdparty/tashkeel/model.onnx +0 -0
  57. phoonnx/thirdparty/tashkeel/target_id_map.json +17 -0
  58. phoonnx/thirdparty/zh_num.py +238 -0
  59. phoonnx/util.py +705 -0
  60. phoonnx/version.py +6 -0
  61. phoonnx/voice.py +521 -0
  62. phoonnx-0.0.0.dist-info/METADATA +255 -0
  63. phoonnx-0.0.0.dist-info/RECORD +86 -0
  64. phoonnx-0.0.0.dist-info/WHEEL +5 -0
  65. phoonnx-0.0.0.dist-info/top_level.txt +2 -0
  66. phoonnx_train/__main__.py +151 -0
  67. phoonnx_train/export_onnx.py +109 -0
  68. phoonnx_train/norm_audio/__init__.py +92 -0
  69. phoonnx_train/norm_audio/trim.py +54 -0
  70. phoonnx_train/norm_audio/vad.py +54 -0
  71. phoonnx_train/preprocess.py +420 -0
  72. phoonnx_train/vits/__init__.py +0 -0
  73. phoonnx_train/vits/attentions.py +427 -0
  74. phoonnx_train/vits/commons.py +147 -0
  75. phoonnx_train/vits/config.py +330 -0
  76. phoonnx_train/vits/dataset.py +214 -0
  77. phoonnx_train/vits/lightning.py +352 -0
  78. phoonnx_train/vits/losses.py +58 -0
  79. phoonnx_train/vits/mel_processing.py +139 -0
  80. phoonnx_train/vits/models.py +732 -0
  81. phoonnx_train/vits/modules.py +527 -0
  82. phoonnx_train/vits/monotonic_align/__init__.py +20 -0
  83. phoonnx_train/vits/monotonic_align/setup.py +13 -0
  84. phoonnx_train/vits/transforms.py +212 -0
  85. phoonnx_train/vits/utils.py +16 -0
  86. phoonnx_train/vits/wavfile.py +860 -0
@@ -0,0 +1,826 @@
1
+ #!/usr/bin/python
2
+ # -*- coding=utf-8 -*-
3
+ """
4
+ Arabic numbers routines
5
+ @author: Taha Zerrouki
6
+ @contact: taha dot zerrouki at gmail dot com
7
+ @copyright: Arabtechies, Arabeyes, Taha Zerrouki
8
+ @license: GPL
9
+ @date:2017/02/14
10
+ @version: 0.3
11
+ # ArNumbers is imported from
12
+ license: LGPL <http://www.gnu.org/licenses/lgpl.txt>
13
+ link http://www.ar-php.org
14
+ category Text
15
+ author Khaled Al-Shamaa <khaled.alshamaa@gmail.com>
16
+ copyright 2009 Khaled Al-Shamaa
17
+ """
18
+ import math
19
+ import sys
20
+
21
+ from . import arabrepr, araby
22
+ from . import named_const as nmconst
23
+ from . import number_const as nbconst
24
+
25
+
26
class ArNumbers(object):
    """
    Spell numbers out as Arabic words.

    The word tables are read from ``nbconst.INDIVIDUALS`` (words for units,
    tens and hundreds) and ``nbconst.COMPLICATIONS`` (words for each group of
    three digits above the first) when an instance is created.
    """

    # Word table; rebound to nbconst.INDIVIDUALS for each instance in __init__.
    _individual = {}
    # Gender of the counted object: 1 for masculine, 2 for feminine.
    _feminine = 1
    # Grammar position of the counted object: 1 if Marfoua,
    # 2 if Mansoub or Majrour.
    _format = 1

    def __init__(self):
        """Load the word tables."""
        self._individual = nbconst.INDIVIDUALS
        self.complications = nbconst.COMPLICATIONS

    def set_feminine(self, value):
        """
        Set feminine flag of the counted object
        @param value: Counted object feminine (1 for masculine & 2 for feminine)
        @type value: integer
        @return: True if success, or False if fail
        @rtype: boolean
        """
        if value in (1, 2):
            self._feminine = value
            return True
        return False

    def set_format(self, value):
        """
        Set the grammar position flag of the counted object
        @param value: Grammar position of counted object
            (1 if Marfoua & 2 if Mansoub or Majrour)
        @type value: integer
        @return: True if success, or False if fail
        @rtype: boolean
        """
        if value in (1, 2):
            self._format = value
            return True
        return False

    def get_feminine(self):
        """
        Get the feminine flag of counted object
        @return: current setting of counted object feminine flag
        @rtype: integer
        """
        return self._feminine

    def get_format(self):
        """
        Get the grammar position flag of counted object
        @return: current setting of counted object grammar position flag
        @rtype: integer
        """
        return self._format

    def int2str(self, number, output_charset=None, main=None):
        """
        Spell a number in Arabic idiom.

        The text before the first "." is spelled as an integer; if a second
        part exists it is spelled as an integer too and appended after the
        word for "point". Any further "."-separated parts are ignored.

        @param number: The number you want to spell in Arabic idiom
        @type number: integer, float or string
        @param output_charset: (optional) Output charset
            [utf-8|windows-1256|iso-8859-6]; when None and main is given,
            main.getOutputCharset() is used
        @type output_charset: string
        @param main: Main Ar-PHP object to access charset converter options
        @type main: object
        @return: The Arabic idiom that spells inserted number
        @rtype: string
        """
        # str() lets callers pass int/float directly; previously only
        # strings worked because .split was called on the raw argument.
        parts = str(number).split(".")
        string = self._int2str(parts[0])
        if len(parts) > 1:
            string += " فاصلة " + self._int2str(parts[1])
        if main:
            if output_charset is None:
                output_charset = main.getOutputCharset()
            string = main.coreConvert(string, "utf-8", output_charset)
        return string

    def _int2str(self, number_str):
        """
        Spell integer number in Arabic idiom
        @param number_str: The number you want to spell in Arabic idiom
        @type number_str: string of digits (or integer)
        @return: The Arabic idiom that spells inserted number; the word for
            zero when the input is not a valid positive integer
        @rtype: string
        """
        try:
            number = int(number_str)
        except ValueError:
            number = 0
        if number <= 0:
            return "صفر"

        # Split the canonical decimal digits (no sign, padding or
        # separators) into groups of three, least significant group first.
        digits = str(number)
        blocks = []
        while len(digits) > 3:
            blocks.append(digits[-3:])
            digits = digits[:-3]
        blocks.append(digits)

        items = []
        # Walk from the most significant group down to the units group.
        for i in range(len(blocks) - 1, -1, -1):
            number = int(blocks[i])
            text = self._written_block(number)
            if not text:
                # an all-zero group contributes no words
                continue
            if i != 0:
                if number == 1:
                    # e.g. "one thousand": the group word alone
                    text = self.complications[i][4]
                elif number == 2:
                    # dual form of the group word, chosen by grammar position
                    text = self.complications[i][self._format]
                elif 2 < number < 11:
                    text += " " + self.complications[i][3]
                else:
                    text += " " + self.complications[i][4]
            items.append(text)
        return " و ".join(items)

    def _written_block(self, number):
        """
        Spell sub block number of three digits max in Arabic idiom
        @param number: Sub block number of three digits max you want to
            spell in Arabic idiom
        @type number: integer
        @return: The Arabic idiom that spells inserted sub block
        @rtype: String
        """
        items = []
        number = int(number)
        if number > 99:
            hundred = number // 100 * 100
            number = number % 100
            if hundred == 200:
                # 200 is a dual and depends on the grammar position
                items.append(self._individual[hundred][self._format])
            else:
                items.append(self._individual[hundred])
        if number == 2 or number == 12:
            # dual-based forms depend on both gender and grammar position
            items.append(self._individual[number][self._feminine][self._format])
        elif number < 20:
            items.append(self._individual[number][self._feminine])
        else:
            ones = number % 10
            tens = number // 10 * 10
            if ones == 2:
                items.append(self._individual[ones][self._feminine][self._format])
            elif ones > 0:
                items.append(self._individual[ones][self._feminine])
            items.append(self._individual[tens][self._format])

        # NOTE(review): presumably the table maps 0 to an empty word, which
        # is dropped here - confirm against nbconst.INDIVIDUALS.
        if "" in items:
            items.remove("")
        return " و ".join(items)
198
+
199
+
200
def text2number(text):
    """
    Convert Arabic number words into their numeric value,
    for example تسعة وعشرون => 29.

    Example:
        >>> text2number(u"خمسمئة وثلاث وعشرون")
        523

    @param text: input text
    @type text: unicode
    @return: number extracted from text
    @rtype: integer
    """
    attached_prefixes = ("و", "ف", "ل", "ب", "ك")
    # grand total of the groups already closed by a multiple of 1000
    total = 0
    # running sum of the current (not yet multiplied) group
    partial = 0
    for token in araby.strip_tashkeel(text).split(" "):
        # drop one attached prefix letter, then one conjunction waw
        if token and token != "واحد" and token[0] in attached_prefixes:
            token = token[1:]
        if token != "واحد" and token.startswith("و"):
            token = token[1:]

        if token not in nbconst.NUMBER_WORDS:
            continue
        value = nbconst.NUMBER_WORDS[token]
        if value % 1000:
            partial += value
        else:
            # a multiple of 1000 multiplies the pending group; a bare
            # word such as "thousand" counts as one of them
            total += (partial or 1) * value
            partial = 0
    # whatever is left in the last group is added as is
    return total + partial
240
+
241
+
242
def number2text(anumber):
    """
    Convert number to arabic words, for example convert 25 --> خمسة و عشرون

    Example:
        >>> number2text(523)
        خمسمئة وثلاث وعشرون

    @param anumber: input number
    @type anumber: int, float or a string holding a number
    @return: number words; the word for zero when the input is neither a
        number nor a string that parses as one
    @rtype: unicode
    """
    if isinstance(anumber, (int, float)):
        # numeric input is spelled from its decimal text form
        anumber = str(anumber)
    elif isinstance(anumber, str):
        # only validate here: the original text is what gets spelled
        try:
            float(anumber)
        except ValueError:
            return "صفر"
    else:
        # unsupported type (the old check referenced the Python 2 name
        # `unicode` here and raised NameError instead)
        return "صفر"
    return ArNumbers().int2str(anumber)
272
+
273
+
274
def vocalize_number(wordlist, syn_tags=""):
    """Vocalize a number words clause

    Works in three steps: a shortcut for a single-word clause, a first pass
    that collects grammatical-case tags from the words, and a second pass
    that picks the vocalized form of each word from
    nbconst.VOCALIZED_NUMBER_WORDS according to those tags.

    @param wordlist: words to vocalize
    @type wordlist: unicode list
    @param syn_tags: tags about the clause
    @type syn_tags: unicode
    @return: the vocalized wordlist.
    @rtype: unicode list
    """
    newlist = []
    prefix = ""
    nextword = ""
    # we can pass tags to this number word
    tags = syn_tags
    if len(wordlist) == 1:
        word = wordlist[0]
        word_nm = araby.strip_tashkeel(word)
        key = word_nm
        voc = word
        # the first word can have prefixes
        # NOTE(review): `not wordlist` is always False inside this branch
        # (the list has exactly one element), so this first condition never
        # holds and only the waw-stripping `elif` below can change `key`.
        if (
            word_nm
            and not wordlist
            and word_nm != "واحد"
            and word[0] in ("و", "ف", "ل", "ب", "ك")
        ):
            if word_nm[0] in ("ل", "ب", "ك"):
                tags += "مجرور"
            key = word[1:]
        elif word_nm != "واحد" and word_nm.startswith("و"):
            key = word_nm[1:]
        # some words are skipped because they are ambiguous with fraction
        # names, e.g. "a fifth" and "five" share the same unvocalized spelling
        if key in nbconst.NUMBER_WORDS and key not in (
            "عشر",
            "خمس",
            "سبع",
            "تسع",
            "خمسا",
            "سبعا",
            "تسعا",
            "عشرا",
            "ألفين",
            "عشرة",
            "صفر",
            "ألف",
        ):
            # prefix is still the empty string here
            voc = prefix + nbconst.VOCALIZED_NUMBER_WORDS[key]["i"]
        return [
            voc,
        ]
    # first pass: collect case tags from the spelling of the words
    for i, word in enumerate(wordlist):
        # save the original word with possible harakat if exist
        # ~ word = wordlist[i]
        word_nm = araby.strip_tashkeel(word)
        key = word_nm
        # the first word can have prefixes
        if (
            i == 0
            and word_nm
            and word_nm != "واحد"
            and word[0] in ("و", "ف", "ل", "ب", "ك")
        ):
            if word_nm[0] in ("ل", "ب", "ك"):
                tags += "مجرور"
            key = word[1:]
        elif word_nm != "واحد" and word_nm.startswith("و"):
            key = word_nm[1:]
        if key in nbconst.NUMBER_WORDS:
            if word_nm.endswith("ين"):
                tags += "مجهول"  # either genitive (majrour) or accusative (mansoub)
            elif word_nm.endswith("ان") or word_nm.endswith("ون"):
                tags += "مرفوع"
    # second pass: choose the vocalized form of every word
    pre_key = ""
    for i, word in enumerate(wordlist):
        # ~ word = wordlist[i]
        if i + 1 < len(wordlist):
            nextword = wordlist[i + 1]
        else:
            nextword = ""
        key = word
        # the first word can have prefixes
        if word and word != "واحد" and word[0] in ("و", "ف", "ل", "ب", "ك"):
            key = word[1:]
            prefix = word[0]
            # re-attach the prefix letter with its own short vowel
            if prefix in ("و", "ف", "ك"):
                prefix += "َ"
            elif prefix in ("ل", "ب"):
                prefix += "ِ"
        else:
            prefix = ""
        if key in nbconst.VOCALIZED_NUMBER_WORDS:
            voc = ""
            # an "s" value of "*" selects the "i" form regardless of tags
            if nbconst.VOCALIZED_NUMBER_WORDS[key]["s"] == "*":
                voc = prefix + nbconst.VOCALIZED_NUMBER_WORDS[key]["i"]

            # fixed accusative ending inside a compound numeral
            elif nextword == "عشر" or nextword == "عشرة":
                voc = prefix + nbconst.VOCALIZED_NUMBER_WORDS[key]["n"]
            # fixed accusative ending inside a compound numeral
            elif key == "عشر" and pre_key in nbconst.NUMBER_TEN_MASCULIN_UNITS:
                voc = "عَشَرَ"
            elif key == "عشرة" and pre_key in nbconst.NUMBER_TEN_FEMININ_UNITS:
                voc = "عَشْرَةَ"
            elif "مرفوع" in tags:
                # the "2"-suffixed form is used when the next word starts
                # with waw
                if nextword.startswith("و"):
                    voc = prefix + nbconst.VOCALIZED_NUMBER_WORDS[key]["r2"]
                else:
                    voc = prefix + nbconst.VOCALIZED_NUMBER_WORDS[key]["r"]
            elif "مجهول" in tags:
                voc = prefix + nbconst.VOCALIZED_NUMBER_WORDS[key]["i"]

            elif "مجرور" in tags:
                if nextword.startswith("و"):
                    voc = prefix + nbconst.VOCALIZED_NUMBER_WORDS[key]["j2"]
                else:
                    voc = prefix + nbconst.VOCALIZED_NUMBER_WORDS[key]["j"]
            # accusative (mansoub)
            elif "منصوب" in tags:
                if nextword.startswith("و"):
                    voc = prefix + nbconst.VOCALIZED_NUMBER_WORDS[key]["n2"]
                else:
                    voc = prefix + nbconst.VOCALIZED_NUMBER_WORDS[key]["n"]
            else:
                voc = prefix + nbconst.VOCALIZED_NUMBER_WORDS[key]["i"]
            newlist.append(voc)
        else:
            # unknown word: keep it, with the vocalized prefix if any
            newlist.append(prefix + key)
        # remembered for the compound-numeral checks on the next word
        pre_key = key
    return newlist
405
+
406
+
407
def is_unit(word):
    """
    Tell whether the given word is a known unit word.
    @param word: given word to be tested
    @type word: unicode
    @return: True if word is found in nbconst.UNIT_WORDS, else False.
    @rtype: Boolean
    """
    known_units = nbconst.UNIT_WORDS
    return word in known_units
416
+
417
+
418
def vocalize_unit(numeric, unit):
    """Vocalize the unit word that follows a number.
    @param numeric: given number
    @type numeric: integer
    @param unit: unit to vocalize
    @type unit: unicode
    @return: the vocalized unit, or the word unchanged if it is not a
        unit word.
    @rtype: unicode
    """
    bare_unit = araby.strip_tashkeel(unit)
    # The given word is not a unit
    if not is_unit(bare_unit):
        return unit

    # Numbers up to two need an adjective after the unit;
    # that case is not handled here.
    if 0 <= numeric <= 2:
        return unit

    remainder = numeric % 100
    if remainder == 0 or numeric % 1000 == 0:
        # round hundreds, thousands, millions... take a singular genitive
        # counted noun
        label, form = "SingleMajrour", "a"
    elif remainder <= 10:
        # a small number takes a plural counted noun
        label, form = "Plural", "p"
    elif remainder < 100:
        label, form = "SingleMansoub", "n"
    else:
        label, form = "", "i"

    vocalized = nbconst.UNIT_WORDS[bare_unit][form]
    if vocalized:
        return vocalized
    # an empty table entry is reported with the case label
    return "Error" + label
463
+
464
+
465
def get_previous_tag(word):
    """Get the case tag implied by the word that precedes a number clause.
    @param word: given word
    @type word: unicode
    @return: word tag, or an empty string when the word is in none of the
        lists
    @rtype: unicode
    """
    bare = araby.strip_tashkeel(word)
    # looked up lazily and in this order; the first list that contains
    # the word decides the tag
    lookup_order = (
        ("NOUN_NASEB_LIST", "منصوب"),
        ("JAR_LIST", "مجرور"),
        ("RAFE3_LIST", "مرفوع"),
    )
    for list_name, tag in lookup_order:
        if bare in getattr(nmconst, list_name):
            return tag
    return ""
482
+
483
+
484
def extract_number_phrases(text):
    """
    Extract number words in a text.

    Example:
        >>> extract_number_phrases(u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا")
        خمسمئة وثلاثة وعشرين
        ثلاثة عشر

    @param text: input text
    @type text: unicode
    @return: number phrases extracted from text
    @rtype: list of unicode
    """
    tokens = araby.tokenize(text)  # text.split(' ')
    count = len(tokens)
    return [
        " ".join(tokens[span[0] : span[1] + 1])
        for span in detect_number_phrases_position(tokens)
        if len(span) >= 2 and span[0] <= count and span[1] <= count
    ]
508
+
509
+
510
def extract_number_context(
    text,
):
    """
    Extract number words in a text within context.

    Example:
        >>> extract_number_context(u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا")
        ‎وجدت، خمسمئة وثلاثة وعشرين، دينارا
        ‎فاشتريت، ثلاثة عشر ، دفتر

    @param text: input text
    @type text: unicode
    @return: (previous word, number phrase, next word) for every number
        phrase; a missing neighbour is an empty string
    @rtype: list of tuple
    """
    tokens = araby.tokenize(text)
    count = len(tokens)
    contexts = []
    for span in detect_number_phrases_position(tokens):
        if len(span) < 2:
            continue
        first, last = span[0], span[1]
        if first > count or last > count:
            continue
        before = tokens[first - 1] if first - 1 >= 0 else ""
        after = tokens[last + 1] if last + 1 < count else ""
        contexts.append((before, " ".join(tokens[first : last + 1]), after))
    return contexts
545
+
546
+
547
def detect_number_phrases_position(wordlist):
    """
    Detect number words in a text and return positions of each phrase.
    @param wordlist: wordlist
    @type wordlist: unicode list
    @return: list of numbers clause positions [(start,end),(start2,end2),]
        where both indexes are inclusive
    @rtype: list of tuple
    """
    phrases = []
    # index of the first word of the phrase being built; -1 when no
    # phrase is open
    startnumber = -1
    endnumber = -1
    for i, word in enumerate(wordlist):
        if i + 1 < len(wordlist):
            nextword = araby.strip_tashkeel(wordlist[i + 1])
        else:
            nextword = None
        # compare on the word without harakat
        word_nm = araby.strip_tashkeel(word)
        key = word_nm
        if word_nm != "واحد":
            # Only the first word of a phrase can carry any of the attached
            # prefixes. "No phrase open" is startnumber < 0: the previous
            # `not startnumber` test was true only when a phrase had
            # started at index 0, because -1 is truthy.
            if word_nm and startnumber < 0 and word_nm[0] in ("و", "ف", "ل", "ب", "ك"):
                key = word_nm[1:]
            # inside a phrase only the conjunction waw is stripped
            elif word_nm.startswith("و"):
                key = word_nm[1:]
        if key in nbconst.NUMBER_WORDS or key.isnumeric():
            # these words count as number words only as the first part of
            # a compound followed by "عشر"/"عشرة"
            if key not in (
                "أحد",
                "إحدى",
                "اثنا",
                "اثني",
                "اثنتي",
                "اثنتا",
            ) or nextword in ("عشر", "عشرة"):
                if startnumber < 0:
                    startnumber = i
                endnumber = i
        elif startnumber >= 0:
            # a non-number word closes the phrase that was open
            phrases.append((startnumber, endnumber))
            startnumber = -1
    # add the final phrase if the text ends inside one
    if startnumber >= 0:
        phrases.append((startnumber, endnumber))

    return phrases
602
+
603
+
604
def detect_numbers(wordlist):
    """
    Detect number words in a text and return a taglist as BIO.

    Every word gets "DB" (first word of a number phrase), "DI" (a later
    word of the same phrase) or "O" (not part of a number phrase).
    @param wordlist: wordlist
    @type wordlist: unicode list
    @return: list of tags BIO
    @rtype: list of unicode
    """
    attached_prefixes = ("و", "ف", "ل", "ب", "ك")
    # count as number words only when followed by "عشر"/"عشرة"
    compound_heads = ("أحد", "إحدى", "اثنا", "اثني", "اثنتي", "اثنتا")
    inside = False
    tags = []
    total = len(wordlist)

    for idx, word in enumerate(wordlist):
        following = None
        if idx + 1 < total:
            following = araby.strip_tashkeel(wordlist[idx + 1])
        # compare on the word without harakat
        bare = araby.strip_tashkeel(word)
        key = bare
        if bare != "واحد":
            # only the first word of a phrase can carry any prefix letter
            if bare and not inside and bare[0] in attached_prefixes:
                key = bare[1:]
            elif bare.startswith("و"):
                key = bare[1:]

        looks_numeric = key in nbconst.NUMBER_WORDS or key.isnumeric()
        if looks_numeric and (
            key not in compound_heads or following in ("عشر", "عشرة")
        ):
            tags.append("DI" if inside else "DB")
            inside = True
        else:
            inside = False
            tags.append("O")
    return tags
656
+
657
+
658
def detect_number_words(text):
    """
    Detect number words in a text and print a report for every phrase
    whose vocalization similarity is negative.

    Example:
        >>> detect_number_words(u"وجدت خمسمئة وثلاثة وعشرين دينارا")
        خمسمئة وثلاثة وعشرين

    @param text: input text
    @type text: unicode
    @return: nothing; the report is printed
    @rtype: None
    """
    for context in extract_number_context(text):
        if len(context) < 3:
            continue
        previous, phrase, nextword = context[0], context[1], context[2]
        numeric = text2number(phrase)
        tags = get_previous_tag(previous)
        bare_words = araby.strip_tashkeel(phrase).split(" ")
        vocalized = vocalize_number(bare_words, tags)
        # vocalization similarity of the phrase and of the unit word
        sim = araby.vocalized_similarity(phrase, vocalized)
        voc_unit = vocalize_unit(numeric, nextword)
        sim_unit = araby.vocalized_similarity(voc_unit, nextword)

        if sim >= 0:
            continue
        first_line = [str(sim), " ".join(phrase), " ".join(vocalized)]
        print("\t".join(first_line))
        print(str(numeric), " ".join([previous, phrase, nextword]))
        print("\t".join([nextword, voc_unit, str(sim_unit)]))
697
+
698
+
699
def pre_tashkeel_number(wordlist):
    """
    Vocalize the number clauses in a text.

    Words tagged as part of a number phrase by detect_numbers are collected
    into a chunk; when the phrase ends, the chunk is vocalized with the tag
    of the last non-number word seen before it. Other words are copied
    unchanged.
    @param wordlist: input text
    @type wordlist: unicode list
    @return: wordlist with vocalized number clause
    @rtype: list
    """
    taglist = detect_numbers(wordlist)
    # last word seen that is not part of a number phrase
    previous = ""
    vocalized_list = []
    chunk = []
    for word, tag in zip(wordlist, taglist):
        if tag in ("DB", "DI"):
            chunk.append(word)
            continue
        if chunk:
            # the phrase just ended: vocalize it using the word before it
            vocalized_list.extend(
                vocalize_number(chunk, get_previous_tag(previous))
            )
            chunk = []
        vocalized_list.append(word)
        previous = word
    if chunk:
        # The text ends inside a number phrase. Its tag must come from the
        # word preceding this phrase; the old code reused whatever tag had
        # been computed for an earlier phrase (or none at all).
        vocalized_list.extend(vocalize_number(chunk, get_previous_tag(previous)))
    return vocalized_list
729
+
730
+
731
def number2ordinal(anumber, feminin=False):
    """
    Convert number to arabic words in ordinal form,
    for example convert 25 --> الخامس والعشرون

    Example:
        >>> number2ordinal(523)
        الخمسمئة والثالث والعشرون

    @param anumber: input number
    @type anumber: int or a string holding an integer
    @param feminin: use the feminine ordinal words
    @type feminin: boolean
    @return: number words
    @rtype: unicode
    """
    # Normalise the input to its text form and keep the integer value.
    if isinstance(anumber, int) and not isinstance(anumber, bool):
        value = anumber
        anumber = str(anumber)
    elif isinstance(anumber, str):
        try:
            value = int(anumber)
        except ValueError:
            return "صفر"
    else:
        # unsupported type (the old check referenced the Python 2 name
        # `unicode` here and raised NameError instead)
        return "الصفر"

    # "first" has its own words; this now also applies to an int argument,
    # whose value was previously never recorded.
    if value == 1:
        return "الأولى" if feminin else "الأول"

    arbn_str = ArNumbers().int2str(anumber)
    # normalize waw: attach the conjunction to the following word
    arbn_str = arbn_str.replace("و ", "و")
    tokens = arbn_str.split(" ")

    if feminin:
        ordinal_words = nbconst.UNITS_ORDINAL_WORDS_FEMININ
    else:
        ordinal_words = nbconst.UNITS_ORDINAL_WORDS

    new_list = []
    if tokens:
        # first token: swap a unit word for its ordinal and add the
        # definite article
        tok = tokens[0]
        if tok in ordinal_words:
            tok = ordinal_words.get(tok, tok)
        new_list.append("ال" + tok)

    for tok in tokens[1:]:
        # strip the attached waw before the lookup, then put it back
        # together with the definite article
        if tok.startswith(araby.WAW):
            tok = tok[1:]
        if tok in ordinal_words:
            tok = ordinal_words.get(tok, tok)
        new_list.append("وال" + tok)

    # Adjust a trailing "one": at the end of the phrase the plain form is
    # used instead of the compound-only form. The old test compared the
    # slice new_list[-1:] (a list) with a string and so never matched; the
    # last token is compared here, with or without its attached waw.
    trailing_forms = {"الحادي": "الواحد", "الحادية": "الواحدة"}
    if new_list:
        last = new_list[-1]
        conjunction = ""
        if last.startswith(araby.WAW):
            conjunction, last = last[0], last[1:]
        if last in trailing_forms:
            new_list[-1] = conjunction + trailing_forms[last]
    return " ".join(new_list)
804
+
805
+
806
if __name__ == "__main__":
    # Smoke run: push a few sample sentences through every detection and
    # vocalization helper. Results are computed but not printed.
    # import number as ArabicNumberToLetters
    TEXTS = [
        "مليونان وألفان وإثنا عشر",
        "جاء مليونان وألفان وإثنا عشر",
        "وجدت خمسمئة وثلاث وعشرون دينارا",
        "خمسمئة وثلاث وعشرون دينارا",
        "وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا",
        "لم أجد شيئا",
        "وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا",
        "من ثلاثمئة وخمسين بلدا ",
        "من ثلاثمئة وخمسين بلدا ",
        "من أربعمئة وخمسين بلدا ",
        "السلام عليكم 2014",
    ]
    for txt in TEXTS:
        word_list = araby.tokenize(txt)
        # phrase boundaries as (start, end) index pairs
        positions_phrases = detect_number_phrases_position(word_list)
        # the same phrases as text
        nb_phrases = extract_number_phrases(txt)
        # one BIO tag per token
        tag_list = detect_numbers(word_list)
        # the sentence with its number phrases vocalized
        vocalized_words = pre_tashkeel_number(word_list)
        tashkeel = " ".join(vocalized_words)