phoonnx 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phoonnx/__init__.py +0 -0
- phoonnx/config.py +490 -0
- phoonnx/locale/ca/phonetic_spellings.txt +2 -0
- phoonnx/locale/en/phonetic_spellings.txt +1 -0
- phoonnx/locale/gl/phonetic_spellings.txt +2 -0
- phoonnx/locale/pt/phonetic_spellings.txt +2 -0
- phoonnx/phoneme_ids.py +453 -0
- phoonnx/phonemizers/__init__.py +45 -0
- phoonnx/phonemizers/ar.py +42 -0
- phoonnx/phonemizers/base.py +216 -0
- phoonnx/phonemizers/en.py +250 -0
- phoonnx/phonemizers/fa.py +46 -0
- phoonnx/phonemizers/gl.py +142 -0
- phoonnx/phonemizers/he.py +67 -0
- phoonnx/phonemizers/ja.py +119 -0
- phoonnx/phonemizers/ko.py +97 -0
- phoonnx/phonemizers/mul.py +606 -0
- phoonnx/phonemizers/vi.py +44 -0
- phoonnx/phonemizers/zh.py +308 -0
- phoonnx/thirdparty/__init__.py +0 -0
- phoonnx/thirdparty/arpa2ipa.py +249 -0
- phoonnx/thirdparty/cotovia/cotovia_aarch64 +0 -0
- phoonnx/thirdparty/cotovia/cotovia_x86_64 +0 -0
- phoonnx/thirdparty/hangul2ipa.py +783 -0
- phoonnx/thirdparty/ko_tables/aspiration.csv +20 -0
- phoonnx/thirdparty/ko_tables/assimilation.csv +31 -0
- phoonnx/thirdparty/ko_tables/double_coda.csv +17 -0
- phoonnx/thirdparty/ko_tables/hanja.tsv +8525 -0
- phoonnx/thirdparty/ko_tables/ipa.csv +22 -0
- phoonnx/thirdparty/ko_tables/neutralization.csv +11 -0
- phoonnx/thirdparty/ko_tables/tensification.csv +56 -0
- phoonnx/thirdparty/ko_tables/yale.csv +22 -0
- phoonnx/thirdparty/kog2p/__init__.py +385 -0
- phoonnx/thirdparty/kog2p/rulebook.txt +212 -0
- phoonnx/thirdparty/mantoq/__init__.py +67 -0
- phoonnx/thirdparty/mantoq/buck/__init__.py +0 -0
- phoonnx/thirdparty/mantoq/buck/phonetise_buckwalter.py +569 -0
- phoonnx/thirdparty/mantoq/buck/symbols.py +64 -0
- phoonnx/thirdparty/mantoq/buck/tokenization.py +105 -0
- phoonnx/thirdparty/mantoq/num2words.py +37 -0
- phoonnx/thirdparty/mantoq/pyarabic/__init__.py +12 -0
- phoonnx/thirdparty/mantoq/pyarabic/arabrepr.py +64 -0
- phoonnx/thirdparty/mantoq/pyarabic/araby.py +1647 -0
- phoonnx/thirdparty/mantoq/pyarabic/named_const.py +227 -0
- phoonnx/thirdparty/mantoq/pyarabic/normalize.py +161 -0
- phoonnx/thirdparty/mantoq/pyarabic/number.py +826 -0
- phoonnx/thirdparty/mantoq/pyarabic/number_const.py +1704 -0
- phoonnx/thirdparty/mantoq/pyarabic/stack.py +52 -0
- phoonnx/thirdparty/mantoq/pyarabic/trans.py +517 -0
- phoonnx/thirdparty/mantoq/unicode_symbol2label.py +4173 -0
- phoonnx/thirdparty/tashkeel/LICENSE +22 -0
- phoonnx/thirdparty/tashkeel/SOURCE +1 -0
- phoonnx/thirdparty/tashkeel/__init__.py +212 -0
- phoonnx/thirdparty/tashkeel/hint_id_map.json +18 -0
- phoonnx/thirdparty/tashkeel/input_id_map.json +56 -0
- phoonnx/thirdparty/tashkeel/model.onnx +0 -0
- phoonnx/thirdparty/tashkeel/target_id_map.json +17 -0
- phoonnx/thirdparty/zh_num.py +238 -0
- phoonnx/util.py +705 -0
- phoonnx/version.py +6 -0
- phoonnx/voice.py +521 -0
- phoonnx-0.0.0.dist-info/METADATA +255 -0
- phoonnx-0.0.0.dist-info/RECORD +86 -0
- phoonnx-0.0.0.dist-info/WHEEL +5 -0
- phoonnx-0.0.0.dist-info/top_level.txt +2 -0
- phoonnx_train/__main__.py +151 -0
- phoonnx_train/export_onnx.py +109 -0
- phoonnx_train/norm_audio/__init__.py +92 -0
- phoonnx_train/norm_audio/trim.py +54 -0
- phoonnx_train/norm_audio/vad.py +54 -0
- phoonnx_train/preprocess.py +420 -0
- phoonnx_train/vits/__init__.py +0 -0
- phoonnx_train/vits/attentions.py +427 -0
- phoonnx_train/vits/commons.py +147 -0
- phoonnx_train/vits/config.py +330 -0
- phoonnx_train/vits/dataset.py +214 -0
- phoonnx_train/vits/lightning.py +352 -0
- phoonnx_train/vits/losses.py +58 -0
- phoonnx_train/vits/mel_processing.py +139 -0
- phoonnx_train/vits/models.py +732 -0
- phoonnx_train/vits/modules.py +527 -0
- phoonnx_train/vits/monotonic_align/__init__.py +20 -0
- phoonnx_train/vits/monotonic_align/setup.py +13 -0
- phoonnx_train/vits/transforms.py +212 -0
- phoonnx_train/vits/utils.py +16 -0
- phoonnx_train/vits/wavfile.py +860 -0
@@ -0,0 +1,826 @@
|
|
1
|
+
#!/usr/bin/python
|
2
|
+
# -*- coding=utf-8 -*-
|
3
|
+
"""
|
4
|
+
Arabic numbers routines
|
5
|
+
@author: Taha Zerrouki
|
6
|
+
@contact: taha dot zerrouki at gmail dot com
|
7
|
+
@copyright: Arabtechies, Arabeyes, Taha Zerrouki
|
8
|
+
@license: GPL
|
9
|
+
@date:2017/02/14
|
10
|
+
@version: 0.3
|
11
|
+
# ArNumbers is imported from
|
12
|
+
license: LGPL <http://www.gnu.org/licenses/lgpl.txt>
|
13
|
+
link http://www.ar-php.org
|
14
|
+
category Text
|
15
|
+
author Khaled Al-Shamaa <khaled.alshamaa@gmail.com>
|
16
|
+
copyright 2009 Khaled Al-Shamaa
|
17
|
+
"""
|
18
|
+
import math
|
19
|
+
import sys
|
20
|
+
|
21
|
+
from . import arabrepr, araby
|
22
|
+
from . import named_const as nmconst
|
23
|
+
from . import number_const as nbconst
|
24
|
+
|
25
|
+
|
26
|
+
class ArNumbers(object):
    """
    Spell numbers out in Arabic words.

    The word tables are read from ``nbconst.INDIVIDUALS`` and
    ``nbconst.COMPLICATIONS`` when an instance is created.  Two integer
    flags select which table entries are used:

    - ``_feminine``: 1 for a masculine counted object, 2 for a feminine one.
    - ``_format``: 1 if Marfoua, 2 if Mansoub or Majrour.
    """

    # Class-level defaults; ``__init__`` replaces ``_individual`` per instance.
    _individual = {}
    _feminine = 1
    _format = 1

    def __init__(self):
        """Load the number word tables from the constants module."""
        self._individual = nbconst.INDIVIDUALS
        self.complications = nbconst.COMPLICATIONS

    def set_feminine(self, value):
        """
        Set feminine flag of the counted object
        @param value: Counted object gender (1 for masculine & 2 for feminine)
        @type value: integer
        @return: True if the value was accepted, False otherwise
        @rtype: boolean
        """
        if value in (1, 2):
            self._feminine = value
            return True
        return False

    def set_format(self, value):
        """
        Set the grammar position flag of the counted object
        @param value: Grammar position of counted object
            (1 if Marfoua & 2 if Mansoub or Majrour)
        @type value: integer
        @return: True if the value was accepted, False otherwise
        @rtype: boolean
        """
        if value in (1, 2):
            self._format = value
            return True
        return False

    def get_feminine(self):
        """
        Get the feminine flag of counted object
        @return: current setting of the counted object feminine flag
        @rtype: integer
        """
        return self._feminine

    def get_format(self):
        """
        Get the grammar position flag of counted object
        @return: current setting of the counted object grammar position flag
        @rtype: integer
        """
        return self._format

    def int2str(self, number, output_charset=None, main=None):
        """
        Spell a number in Arabic idiom

        The number is split on "."; the integer part and, when present, the
        decimal part are spelled separately and joined with " فاصلة ".

        @param number: The number you want to spell in Arabic idiom; an int,
            a float or a numeric string
        @type number: integer, float or string
        @param output_charset: (optional) Output charset
            [utf-8|windows-1256|iso-8859-6]; default value is None
            (use the output charset set on ``main``)
        @type output_charset: string
        @param main: Main Ar-PHP object to access charset converter options
        @type main: object
        @return: The Arabic idiom that spells inserted number
        @rtype: string
        """
        # Coerce to text first: the documented input is an integer, but the
        # splitting below needs a string.
        temp = str(number).split(".")
        string = self._int2str(temp[0])
        if len(temp) > 1:
            dec = self._int2str(temp[1])
            string += " فاصلة " + dec
        if main:
            if output_charset is None:
                output_charset = main.getOutputCharset()
            string = main.coreConvert(string, "utf-8", output_charset)
        return string

    def _int2str(self, number_str):
        """
        Spell an integer, given as a digit string, in Arabic idiom

        @param number_str: digits of the number to spell
        @type number_str: string
        @return: The Arabic idiom that spells inserted number; "صفر" when the
            text is not a valid integer or the value is not positive
        @rtype: string
        """
        try:
            number = int(number_str)
        except ValueError:
            number = 0
        if number <= 0:
            return "صفر"

        # Cut the digits into groups of three, least significant group first.
        blocks = []
        while len(number_str) > 3:
            blocks.append(number_str[-3:])
            number_str = number_str[:-3]
        blocks.append(number_str)

        items = []
        # Walk from the most significant group down to the units group.
        for i in range(len(blocks) - 1, -1, -1):
            number = int(blocks[i])
            text = self._written_block(number)
            if text:
                if i != 0:
                    # Groups above the units take a scale word from
                    # ``complications[i]``; which entry depends on the count.
                    if number == 1:
                        text = self.complications[i][4]
                    elif number == 2:
                        text = self.complications[i][self._format]
                    elif 2 < number < 11:
                        text += " " + self.complications[i][3]
                    else:
                        text += " " + self.complications[i][4]
                items.append(text)
        return " و ".join(items)

    def _written_block(self, number):
        """
        Spell sub block number of three digits max in Arabic idiom
        @param number: Sub block number of three digits max to spell
        @type number: integer
        @return: The Arabic idiom that spells inserted sub block
        @rtype: string
        """
        items = []
        number = int(number)
        if number > 99:
            hundred = (number // 100) * 100
            number %= 100
            if hundred == 200:
                # The entry for 200 is indexed by grammatical position.
                items.append(self._individual[hundred][self._format])
            else:
                items.append(self._individual[hundred])

        if number in (2, 12):
            # 2 and 12 are indexed by gender and by grammatical position.
            items.append(self._individual[number][self._feminine][self._format])
        elif number < 20:
            items.append(self._individual[number][self._feminine])
        else:
            ones = number % 10
            tens = number - ones
            if ones == 2:
                items.append(self._individual[ones][self._feminine][self._format])
            elif ones > 0:
                items.append(self._individual[ones][self._feminine])
            items.append(self._individual[tens][self._format])

        # Drop one empty entry (e.g. the units slot of a round hundred).
        if "" in items:
            items.remove("")
        return " و ".join(items)
|
198
|
+
|
199
|
+
|
200
|
+
def text2number(text):
    """
    Convert Arabic number words into their numeric value,
    for example تسعة وعشرون => 29.

    Example:
        >>> text2number(u"خمسمئة وثلاث وعشرون")
        523

    Words that are not found in ``nbconst.NUMBER_WORDS`` are ignored.

    @param text: input text
    @type text: unicode
    @return: number extracted from text
    @rtype: integer
    """
    clitics = ("و", "ف", "ل", "ب", "ك")
    grand_total = 0
    # running sum of the current group (values below the next multiplier)
    group_sum = 0
    for token in araby.strip_tashkeel(text).split(" "):
        # strip one leading prefix letter, then one conjunction waw
        if token and token != "واحد" and token[0] in clitics:
            token = token[1:]
        if token != "واحد" and token.startswith("و"):
            token = token[1:]

        if token not in nbconst.NUMBER_WORDS:
            continue
        value = nbconst.NUMBER_WORDS[token]
        if value % 1000 != 0:
            group_sum += nbconst.NUMBER_WORDS[token]
            continue
        # a multiple of 1000 scales the group collected so far;
        # a bare multiplier counts once
        if group_sum == 0:
            group_sum = 1
        grand_total += group_sum * value
        group_sum = 0
    # whatever is left in the last group is added as is
    return grand_total + group_sum
|
240
|
+
|
241
|
+
|
242
|
+
def number2text(anumber):
    """
    Convert number to arabic words, for example convert 25 --> خمسة و عشرون

    Example:
        >>> number2text(523)
        خمسمئة وثلاث وعشرون

    @param anumber: input number; an int, a float or a numeric string
    @type anumber: int, float or unicode
    @return: number words, or "صفر" when the input is not a valid number
    @rtype: unicode
    """
    # bool is a subclass of int but is not a number to spell out
    if isinstance(anumber, bool):
        return "صفر"
    if isinstance(anumber, (int, float)):
        anumber = str(anumber)
    elif isinstance(anumber, str):
        # a string must parse as a number, otherwise it is spelled as zero
        try:
            float(anumber)
        except ValueError:
            return "صفر"
    else:
        # any other type is not a valid number
        return "صفر"
    return ArNumbers().int2str(anumber)
|
272
|
+
|
273
|
+
|
274
|
+
def vocalize_number(wordlist, syn_tags=""):
    """Vocalize a number words clause

    Each word is looked up in ``nbconst.VOCALIZED_NUMBER_WORDS`` and replaced
    by one of its stored forms; the form is chosen from the tags passed in
    plus tags derived from the words themselves.  Words that are not in the
    table are passed through (with their prefix letter re-attached).

    @param wordlist: words to vocalize
    @type wordlist: unicode list
    @param syn_tags: tags about the clause
    @type syn_tags: unicode
    @return: the vocalized wordlist.
    @rtype: list of unicode
    """
    newlist = []
    prefix = ""
    nextword = ""
    # we can pass tags to this number word
    tags = syn_tags
    # --- Special case: a clause made of a single word --------------------
    if len(wordlist) == 1:
        word = wordlist[0]
        word_nm = araby.strip_tashkeel(word)
        key = word_nm
        voc = word
        # the first word can have prefixes
        # NOTE(review): ``not wordlist`` is always False here because the list
        # has exactly one element, so this branch never runs and only the
        # ``elif`` below can strip a leading waw -- confirm the intent.
        if (
            word_nm
            and not wordlist
            and word_nm != "واحد"
            and word[0] in ("و", "ف", "ل", "ب", "ك")
        ):
            if word_nm[0] in ("ل", "ب", "ك"):
                tags += "مجرور"
            key = word[1:]
        elif word_nm != "واحد" and word_nm.startswith("و"):
            key = word_nm[1:]
        # Some words are left untouched because they are ambiguous with
        # fraction names, e.g. "a fifth" versus "five".
        if key in nbconst.NUMBER_WORDS and key not in (
            "عشر",
            "خمس",
            "سبع",
            "تسع",
            "خمسا",
            "سبعا",
            "تسعا",
            "عشرا",
            "ألفين",
            "عشرة",
            "صفر",
            "ألف",
        ):
            # ``prefix`` is still the empty string at this point.
            voc = prefix + nbconst.VOCALIZED_NUMBER_WORDS[key]["i"]
        return [
            voc,
        ]
    # --- First pass: collect grammatical tags from the words --------------
    for i, word in enumerate(wordlist):
        # save the original word with possible harakat if exist
        # ~ word = wordlist[i]
        word_nm = araby.strip_tashkeel(word)
        key = word_nm
        # the first word can have prefixes
        if (
            i == 0
            and word_nm
            and word_nm != "واحد"
            and word[0] in ("و", "ف", "ل", "ب", "ك")
        ):
            # a preposition prefix marks the clause as genitive
            if word_nm[0] in ("ل", "ب", "ك"):
                tags += "مجرور"
            key = word[1:]
        elif word_nm != "واحد" and word_nm.startswith("و"):
            key = word_nm[1:]
        if key in nbconst.NUMBER_WORDS:
            if word_nm.endswith("ين"):
                tags += "مجهول"  # either genitive or accusative
            elif word_nm.endswith("ان") or word_nm.endswith("ون"):
                tags += "مرفوع"
    # --- Second pass: pick the vocalized form of every word ---------------
    pre_key = ""
    for i, word in enumerate(wordlist):
        # ~ word = wordlist[i]
        if i + 1 < len(wordlist):
            nextword = wordlist[i + 1]
        else:
            nextword = ""
        # the word is used as passed in (tashkeel is not stripped here)
        key = word
        # the first word can have prefixes
        if word and word != "واحد" and word[0] in ("و", "ف", "ل", "ب", "ك"):
            key = word[1:]
            prefix = word[0]
            # re-attach the prefix letter with its own short vowel
            if prefix in ("و", "ف", "ك"):
                prefix += "َ"
            elif prefix in ("ل", "ب"):
                prefix += "ِ"
        else:
            prefix = ""
        if key in nbconst.VOCALIZED_NUMBER_WORDS:
            voc = ""
            # entries flagged "*" under "s" always use their "i" form
            if nbconst.VOCALIZED_NUMBER_WORDS[key]["s"] == "*":
                voc = prefix + nbconst.VOCALIZED_NUMBER_WORDS[key]["i"]

            # fixed on the accusative in a compound numeral
            elif nextword == "عشر" or nextword == "عشرة":
                voc = prefix + nbconst.VOCALIZED_NUMBER_WORDS[key]["n"]
            # fixed on the accusative in a compound numeral
            elif key == "عشر" and pre_key in nbconst.NUMBER_TEN_MASCULIN_UNITS:
                voc = "عَشَرَ"
            elif key == "عشرة" and pre_key in nbconst.NUMBER_TEN_FEMININ_UNITS:
                voc = "عَشْرَةَ"
            elif "مرفوع" in tags:
                # the "2" variants are used when the next word starts with waw
                if nextword.startswith("و"):
                    voc = prefix + nbconst.VOCALIZED_NUMBER_WORDS[key]["r2"]
                else:
                    voc = prefix + nbconst.VOCALIZED_NUMBER_WORDS[key]["r"]
            elif "مجهول" in tags:
                voc = prefix + nbconst.VOCALIZED_NUMBER_WORDS[key]["i"]

            elif "مجرور" in tags:
                if nextword.startswith("و"):
                    voc = prefix + nbconst.VOCALIZED_NUMBER_WORDS[key]["j2"]
                else:
                    voc = prefix + nbconst.VOCALIZED_NUMBER_WORDS[key]["j"]
            # accusative
            elif "منصوب" in tags:
                if nextword.startswith("و"):
                    voc = prefix + nbconst.VOCALIZED_NUMBER_WORDS[key]["n2"]
                else:
                    voc = prefix + nbconst.VOCALIZED_NUMBER_WORDS[key]["n"]
            else:
                voc = prefix + nbconst.VOCALIZED_NUMBER_WORDS[key]["i"]
            newlist.append(voc)
        else:
            # unknown word: keep it, with the (vocalized) prefix re-attached
            newlist.append(prefix + key)
        # remembered for the compound-numeral checks on the next word
        pre_key = key
    return newlist
|
405
|
+
|
406
|
+
|
407
|
+
def is_unit(word):
    """
    Tell whether the given word is a known unit word.

    @param word: given word to be tested
    @type word: unicode
    @return: True if the word is a key of ``nbconst.UNIT_WORDS``, else False.
    @rtype: Boolean
    """
    known_units = nbconst.UNIT_WORDS
    return word in known_units
|
416
|
+
|
417
|
+
|
418
|
+
def vocalize_unit(numeric, unit):
    """Vocalize the unit word that follows a number

    @param numeric: given number
    @type numeric: integer
    @param unit: unit to vocalize
    @type unit: unicode
    @return: the vocalized unit, or the word unchanged if it is not a unit
        word or the number is between 0 and 2; "Error" followed by the
        selected tag when the table holds no form for the case.
    @rtype: unicode
    """
    bare_unit = araby.strip_tashkeel(unit)
    # The given word is not a unit
    if not is_unit(bare_unit):
        return unit

    # Numbers one and two need an adjective placed after the unit;
    # that case is not implemented, so the unit is returned as is.
    if 0 <= numeric <= 2:
        return unit

    if numeric % 100 == 0 or numeric % 1000 == 0:
        # Round hundreds, thousands, millions... take a singular genitive
        # counted noun.
        label, form = "SingleMajrour", "a"
    elif numeric % 100 <= 10:
        # Small numbers take a plural counted noun.
        label, form = "Plural", "p"
    elif numeric % 100 < 100:
        label, form = "SingleMansoub", "n"
    else:
        label, form = "", "i"

    spelled = nbconst.UNIT_WORDS[bare_unit][form]
    return spelled if spelled else "Error" + label
|
463
|
+
|
464
|
+
|
465
|
+
def get_previous_tag(word):
    """Get the tag implied by the word that precedes a number

    The word is stripped of its tashkeel and looked up, in this order, in
    ``nmconst.NOUN_NASEB_LIST``, ``nmconst.JAR_LIST`` and
    ``nmconst.RAFE3_LIST``.

    @param word: given word
    @type word: unicode
    @return: word tag, or an empty string when the word is in none of the lists
    @rtype: unicode
    """
    bare = araby.strip_tashkeel(word)
    if bare in nmconst.NOUN_NASEB_LIST:
        return "منصوب"
    if bare in nmconst.JAR_LIST:
        return "مجرور"
    if bare in nmconst.RAFE3_LIST:
        return "مرفوع"
    return ""
|
482
|
+
|
483
|
+
|
484
|
+
def extract_number_phrases(text):
    """
    Extract the number phrases found in a text.

    Example:
        >>> extract_number_phrases(u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا")
        خمسمئة وثلاثة وعشرين
        ثلاثة عشر

    @param text: input text
    @type text: unicode
    @return: number phrases extracted from text, one string per phrase
    @rtype: list of unicode
    """
    tokens = araby.tokenize(text)
    count = len(tokens)
    return [
        " ".join(tokens[span[0] : span[1] + 1])
        for span in detect_number_phrases_position(tokens)
        if len(span) >= 2 and span[0] <= count and span[1] <= count
    ]
|
508
|
+
|
509
|
+
|
510
|
+
def extract_number_context(
    text,
):
    """
    Extract the number phrases of a text together with their neighbours.

    Example:
        >>> extract_number_context(u"وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا")
        وجدت، خمسمئة وثلاثة وعشرين، دينارا
        فاشتريت، ثلاثة عشر ، دفتر

    @param text: input text
    @type text: unicode
    @return: one (previous word, number phrase, next word) tuple per phrase;
        a missing neighbour is an empty string
    @rtype: list of tuple
    """
    tokens = araby.tokenize(text)
    count = len(tokens)
    contexts = []
    for span in detect_number_phrases_position(tokens):
        if len(span) < 2:
            continue
        first, last = span[0], span[1]
        if first > count or last > count:
            continue
        before = tokens[first - 1] if first - 1 >= 0 else ""
        after = tokens[last + 1] if last + 1 < count else ""
        contexts.append((before, " ".join(tokens[first : last + 1]), after))
    return contexts
|
545
|
+
|
546
|
+
|
547
|
+
def detect_number_phrases_position(wordlist):
    """
    Detect number words in a text and return positions of each phrase.

    A word counts as a number word when, after prefix stripping, it is in
    ``nbconst.NUMBER_WORDS`` or is numeric.  The heads of the compound
    numerals eleven and twelve only count when the next word is عشر or عشرة.

    @param wordlist: wordlist
    @type wordlist: unicode list
    @return: list of numbers clause positions [(start,end),(start2,end2),]
        where both indexes are inclusive
    @rtype: list of tuple
    """
    prefixes = ("و", "ف", "ل", "ب", "ك")
    compound_heads = ("أحد", "إحدى", "اثنا", "اثني", "اثنتي", "اثنتا")
    phrases = []
    # -1 means "no phrase is currently open"
    startnumber = -1
    endnumber = -1
    for i, word in enumerate(wordlist):
        if i + 1 < len(wordlist):
            nextword = araby.strip_tashkeel(wordlist[i + 1])
        else:
            nextword = None
        # compare without harakat
        word_nm = araby.strip_tashkeel(word)
        key = word_nm
        # Only the first word of a phrase can carry any of the prefixes, so
        # test that no phrase is open (startnumber is -1, which is truthy,
        # hence the explicit comparison); later words may only carry a waw.
        if (
            word_nm
            and startnumber < 0
            and word_nm != "واحد"
            and word_nm[0] in prefixes
        ):
            key = word_nm[1:]
        elif word_nm != "واحد" and word_nm.startswith("و"):
            key = word_nm[1:]

        is_number = (key in nbconst.NUMBER_WORDS or key.isnumeric()) and (
            key not in compound_heads or nextword in ("عشر", "عشرة")
        )
        if is_number:
            if startnumber < 0:
                startnumber = i
            endnumber = i
        else:
            # Any non-number word closes the open phrase, as detect_numbers
            # does when it emits an "O" tag.
            if startnumber >= 0:
                phrases.append((startnumber, endnumber))
            startnumber = -1
    # add the final phrase
    if startnumber >= 0:
        phrases.append((startnumber, endnumber))

    return phrases
|
602
|
+
|
603
|
+
|
604
|
+
def detect_numbers(wordlist):
    """
    Tag every word of a word list as part of a number phrase or not.

    The tags are "DB" for the first word of a number phrase, "DI" for the
    following words of the same phrase and "O" for any other word.

    @param wordlist: wordlist
    @type wordlist: unicode list
    @return: list of tags BIO
    @rtype: list of unicode
    """
    clitics = ("و", "ف", "ل", "ب", "ك")
    compound_heads = ("أحد", "إحدى", "اثنا", "اثني", "اثنتي", "اثنتا")
    last = len(wordlist) - 1
    inside = False
    labels = []

    for idx, token in enumerate(wordlist):
        following = araby.strip_tashkeel(wordlist[idx + 1]) if idx < last else None
        # compare without harakat
        bare = araby.strip_tashkeel(token)
        stem = bare
        if bare != "واحد":
            # the first word of a phrase can have prefixes,
            # the following ones only a waw
            if bare and not inside and bare[0] in clitics:
                stem = bare[1:]
            elif bare.startswith("و"):
                stem = bare[1:]

        counted = (stem in nbconst.NUMBER_WORDS or stem.isnumeric()) and (
            stem not in compound_heads or following in ("عشر", "عشرة")
        )
        if not counted:
            inside = False
            labels.append("O")
        elif inside:
            labels.append("DI")
        else:
            labels.append("DB")
            inside = True
    return labels
|
656
|
+
|
657
|
+
|
658
|
+
def detect_number_words(text):
    """
    Detect number phrases in a text and report badly vocalized ones.

    For every number phrase the function computes its numeric value, a
    vocalized form of the phrase and of the word that follows it, and the
    similarity between the given and the computed vocalization.  Phrases
    with a negative similarity are printed for inspection.

    Example:
        >>> detect_number_words(u"وجدت خمسمئة وثلاثة وعشرين دينارا")
        خمسمئة وثلاثة وعشرين

    @param text: input text
    @type text: unicode
    @return: nothing; the diagnostics are printed
    @rtype: None
    """
    for ph_con in extract_number_context(text):
        if len(ph_con) < 3:
            continue
        previous = ph_con[0]
        phrase = ph_con[1]
        nextword = ph_con[2]
        numeric = text2number(phrase)
        tags = get_previous_tag(previous)
        wordlist = araby.strip_tashkeel(phrase).split(" ")
        vocalized = vocalize_number(wordlist, tags)
        # compute vocalization similarity
        # NOTE(review): ``vocalized`` is a list of words while ``phrase`` is a
        # string -- confirm that vocalized_similarity accepts this pairing.
        sim = araby.vocalized_similarity(phrase, vocalized)
        voc_unit = vocalize_unit(numeric, nextword)
        sim_unit = araby.vocalized_similarity(voc_unit, nextword)

        if sim < 0:
            # ``phrase`` is already a string: print it as is instead of
            # joining its characters with spaces.
            print("\t".join([str(sim), phrase, " ".join(vocalized)]))
            print(str(numeric), " ".join([previous, phrase, nextword]))
            print("\t".join([nextword, voc_unit, str(sim_unit)]))
|
697
|
+
|
698
|
+
|
699
|
+
def pre_tashkeel_number(wordlist):
    """
    Vocalize the number clauses of a tokenized text.

    Consecutive number words (tags "DB"/"DI" from detect_numbers) are
    collected into a chunk and vocalized together, using the tag of the last
    non-number word seen before the chunk.  All other words are copied
    unchanged.

    @param wordlist: input words
    @type wordlist: unicode list
    @return: wordlist with vocalized number clause
    @rtype: list
    """
    taglist = detect_numbers(wordlist)
    # last non-number word seen so far
    previous = ""
    vocalized_list = []
    chunk = []
    for word, tag in zip(wordlist, taglist):
        if tag in ("DB", "DI"):
            chunk.append(word)
            continue
        if chunk:
            # the chunk just ended: vocalize it with the tag of the word
            # that came before it
            previous_tag = get_previous_tag(previous)
            vocalized_list.extend(vocalize_number(chunk, previous_tag))
            chunk = []
        vocalized_list.append(word)
        previous = word
    if chunk:
        # A chunk that ends the text must use the tag of the word before it
        # as well, not a tag left over from an earlier chunk.
        previous_tag = get_previous_tag(previous)
        vocalized_list.extend(vocalize_number(chunk, previous_tag))
    return vocalized_list
|
729
|
+
|
730
|
+
|
731
|
+
def number2ordinal(anumber, feminin=False):
    """
    Convert number to arabic words in ordinal form,
    for example convert 25 --> الخامس والعشرون

    Example:
        >>> number2ordinal(523)
        الخمسمئة والثالث والعشرون

    @param anumber: input number; an int or a string holding an integer
    @type anumber: int or unicode
    @param feminin: use the feminine ordinal words
    @type feminin: boolean
    @return: number words; "صفر" for a string that is not an integer and
        "الصفر" for any other unsupported type
    @rtype: unicode
    """
    # bool is a subclass of int but is not a number to spell out
    if isinstance(anumber, bool):
        return "الصفر"
    if isinstance(anumber, int):
        value = anumber
        anumber = str(anumber)
    elif isinstance(anumber, str):
        # a string must hold a valid integer
        try:
            value = int(anumber)
        except ValueError:
            return "صفر"
    else:
        # the given number is not valid
        return "الصفر"

    # "first" has its own words
    if value == 1:
        return "الأولى" if feminin else "الأول"

    arbn_str = ArNumbers().int2str(anumber)
    # substitute numeral words to ordinal words
    # normalize waw: glue the conjunction to the word that follows it
    arbn_str = arbn_str.replace("و ", "و")
    tokens = arbn_str.split(" ")
    if feminin:
        ordinal_words = nbconst.UNITS_ORDINAL_WORDS_FEMININ
    else:
        ordinal_words = nbconst.UNITS_ORDINAL_WORDS

    new_list = []
    if tokens:
        # the first token: if it's a unit, change it to its ordinal,
        # then add the definite article
        tok = tokens[0]
        tok = ordinal_words.get(tok, tok)
        new_list.append("ال" + tok)

    for tok in tokens[1:]:
        # strip the leading waw before the lookup
        if tok.startswith(araby.WAW):
            tok = tok[1:]
        tok = ordinal_words.get(tok, tok)
        # put back the conjunction together with the definite article
        new_list.append("وال" + tok)

    # adjust the word for "one" at the end: compare the last word itself
    # (a one-element slice never equals a string)
    # NOTE(review): words added by the loop above start with "وال", so a
    # trailing "والحادي" is not covered here -- confirm the intent.
    if new_list and new_list[-1] == "الحادي":
        new_list[-1] = "الواحد"
    elif new_list and new_list[-1] == "الحادية":
        new_list[-1] = "الواحدة"
    return " ".join(new_list)
|
804
|
+
|
805
|
+
|
806
|
+
if __name__ == "__main__":
    # Smoke run: push a few sample sentences through the detection and
    # vocalization helpers.  The results are computed but not printed.
    TEXTS = [
        "مليونان وألفان وإثنا عشر",
        "جاء مليونان وألفان وإثنا عشر",
        "وجدت خمسمئة وثلاث وعشرون دينارا",
        "خمسمئة وثلاث وعشرون دينارا",
        "وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا",
        "لم أجد شيئا",
        "وجدت خمسمئة وثلاثة وعشرين دينارا فاشتريت ثلاثة عشر دفترا",
        "من ثلاثمئة وخمسين بلدا ",
        "من ثلاثمئة وخمسين بلدا ",
        "من أربعمئة وخمسين بلدا ",
        "السلام عليكم 2014",
    ]
    for txt in TEXTS:
        word_list = araby.tokenize(txt)
        # positions of the number phrases, then the phrases themselves
        positions_phrases = detect_number_phrases_position(word_list)
        nb_phrases = extract_number_phrases(txt)
        # BIO tags, then the text with its number phrases vocalized
        tag_list = detect_numbers(word_list)
        vocalized_words = pre_tashkeel_number(word_list)
        tashkeel = " ".join(vocalized_words)
|