phoonnx 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phoonnx/__init__.py +0 -0
- phoonnx/config.py +490 -0
- phoonnx/locale/ca/phonetic_spellings.txt +2 -0
- phoonnx/locale/en/phonetic_spellings.txt +1 -0
- phoonnx/locale/gl/phonetic_spellings.txt +2 -0
- phoonnx/locale/pt/phonetic_spellings.txt +2 -0
- phoonnx/phoneme_ids.py +453 -0
- phoonnx/phonemizers/__init__.py +45 -0
- phoonnx/phonemizers/ar.py +42 -0
- phoonnx/phonemizers/base.py +216 -0
- phoonnx/phonemizers/en.py +250 -0
- phoonnx/phonemizers/fa.py +46 -0
- phoonnx/phonemizers/gl.py +142 -0
- phoonnx/phonemizers/he.py +67 -0
- phoonnx/phonemizers/ja.py +119 -0
- phoonnx/phonemizers/ko.py +97 -0
- phoonnx/phonemizers/mul.py +606 -0
- phoonnx/phonemizers/vi.py +44 -0
- phoonnx/phonemizers/zh.py +308 -0
- phoonnx/thirdparty/__init__.py +0 -0
- phoonnx/thirdparty/arpa2ipa.py +249 -0
- phoonnx/thirdparty/cotovia/cotovia_aarch64 +0 -0
- phoonnx/thirdparty/cotovia/cotovia_x86_64 +0 -0
- phoonnx/thirdparty/hangul2ipa.py +783 -0
- phoonnx/thirdparty/ko_tables/aspiration.csv +20 -0
- phoonnx/thirdparty/ko_tables/assimilation.csv +31 -0
- phoonnx/thirdparty/ko_tables/double_coda.csv +17 -0
- phoonnx/thirdparty/ko_tables/hanja.tsv +8525 -0
- phoonnx/thirdparty/ko_tables/ipa.csv +22 -0
- phoonnx/thirdparty/ko_tables/neutralization.csv +11 -0
- phoonnx/thirdparty/ko_tables/tensification.csv +56 -0
- phoonnx/thirdparty/ko_tables/yale.csv +22 -0
- phoonnx/thirdparty/kog2p/__init__.py +385 -0
- phoonnx/thirdparty/kog2p/rulebook.txt +212 -0
- phoonnx/thirdparty/mantoq/__init__.py +67 -0
- phoonnx/thirdparty/mantoq/buck/__init__.py +0 -0
- phoonnx/thirdparty/mantoq/buck/phonetise_buckwalter.py +569 -0
- phoonnx/thirdparty/mantoq/buck/symbols.py +64 -0
- phoonnx/thirdparty/mantoq/buck/tokenization.py +105 -0
- phoonnx/thirdparty/mantoq/num2words.py +37 -0
- phoonnx/thirdparty/mantoq/pyarabic/__init__.py +12 -0
- phoonnx/thirdparty/mantoq/pyarabic/arabrepr.py +64 -0
- phoonnx/thirdparty/mantoq/pyarabic/araby.py +1647 -0
- phoonnx/thirdparty/mantoq/pyarabic/named_const.py +227 -0
- phoonnx/thirdparty/mantoq/pyarabic/normalize.py +161 -0
- phoonnx/thirdparty/mantoq/pyarabic/number.py +826 -0
- phoonnx/thirdparty/mantoq/pyarabic/number_const.py +1704 -0
- phoonnx/thirdparty/mantoq/pyarabic/stack.py +52 -0
- phoonnx/thirdparty/mantoq/pyarabic/trans.py +517 -0
- phoonnx/thirdparty/mantoq/unicode_symbol2label.py +4173 -0
- phoonnx/thirdparty/tashkeel/LICENSE +22 -0
- phoonnx/thirdparty/tashkeel/SOURCE +1 -0
- phoonnx/thirdparty/tashkeel/__init__.py +212 -0
- phoonnx/thirdparty/tashkeel/hint_id_map.json +18 -0
- phoonnx/thirdparty/tashkeel/input_id_map.json +56 -0
- phoonnx/thirdparty/tashkeel/model.onnx +0 -0
- phoonnx/thirdparty/tashkeel/target_id_map.json +17 -0
- phoonnx/thirdparty/zh_num.py +238 -0
- phoonnx/util.py +705 -0
- phoonnx/version.py +6 -0
- phoonnx/voice.py +521 -0
- phoonnx-0.0.0.dist-info/METADATA +255 -0
- phoonnx-0.0.0.dist-info/RECORD +86 -0
- phoonnx-0.0.0.dist-info/WHEEL +5 -0
- phoonnx-0.0.0.dist-info/top_level.txt +2 -0
- phoonnx_train/__main__.py +151 -0
- phoonnx_train/export_onnx.py +109 -0
- phoonnx_train/norm_audio/__init__.py +92 -0
- phoonnx_train/norm_audio/trim.py +54 -0
- phoonnx_train/norm_audio/vad.py +54 -0
- phoonnx_train/preprocess.py +420 -0
- phoonnx_train/vits/__init__.py +0 -0
- phoonnx_train/vits/attentions.py +427 -0
- phoonnx_train/vits/commons.py +147 -0
- phoonnx_train/vits/config.py +330 -0
- phoonnx_train/vits/dataset.py +214 -0
- phoonnx_train/vits/lightning.py +352 -0
- phoonnx_train/vits/losses.py +58 -0
- phoonnx_train/vits/mel_processing.py +139 -0
- phoonnx_train/vits/models.py +732 -0
- phoonnx_train/vits/modules.py +527 -0
- phoonnx_train/vits/monotonic_align/__init__.py +20 -0
- phoonnx_train/vits/monotonic_align/setup.py +13 -0
- phoonnx_train/vits/transforms.py +212 -0
- phoonnx_train/vits/utils.py +16 -0
- phoonnx_train/vits/wavfile.py +860 -0
@@ -0,0 +1,52 @@
|
|
1
|
+
#!/usr/bin/python
|
2
|
+
# -*- coding=utf-8 -*-
|
3
|
+
"""
|
4
|
+
Stack module
|
5
|
+
@author: Taha Zerrouki
|
6
|
+
@contact: taha dot zerrouki at gmail dot com
|
7
|
+
@copyright: Arabtechies, Arabeyes, Taha Zerrouki
|
8
|
+
@license: GPL
|
9
|
+
@date:2010/03/01
|
10
|
+
@version: 0.1
|
11
|
+
"""
|
12
|
+
|
13
|
+
|
14
|
+
class Stack(object):
    """
    A minimal LIFO stack backed by a plain Python list.
    """

    def __init__(self, text=""):
        """
        Create a stack, optionally seeded with the items of *text*
        (bottom of the stack first).

        @param text: iterable whose items become the initial content
        @type text: iterable
        """
        self.items = list(text)

    def push(self, item):
        """
        Push an item onto the top of the stack.

        @param item: pushed item
        @type item: mixed
        @return: None
        @rtype: None
        """
        self.items.append(item)

    def pop(self):
        """
        Pop an item from the top of the stack.

        @return: popped item, or None when the stack is empty
        @rtype: mixed
        """
        # Guard clause instead of if/else: empty stack yields None.
        if self.is_empty():
            return None
        return self.items.pop()

    def is_empty(self):
        """
        Test if the stack is empty.

        @return: True or False
        @rtype: boolean
        """
        # Idiomatic truthiness test (was `self.items == []`).
        return not self.items
|
@@ -0,0 +1,517 @@
|
|
1
|
+
#!/usr/bin/python
|
2
|
+
# -*- coding=utf-8 -*-
|
3
|
+
"""
|
4
|
+
Arabic Transliteration routins
|
5
|
+
@author: Taha Zerrouki
|
6
|
+
@contact: taha dot zerrouki at gmail dot com
|
7
|
+
@license: GPL
|
8
|
+
@date:2018/08/146
|
9
|
+
@version: 0.1
|
10
|
+
"""
|
11
|
+
import re
|
12
|
+
|
13
|
+
from . import araby as ar
|
14
|
+
|
15
|
+
# Tim Buckwalter transliteration letter -> Arabic character
# (character constants come from the sibling `araby` module).
t2a_table = {
    "A": ar.ALEF,
    "b": ar.BEH,
    "t": ar.TEH,
    "p": ar.TEH_MARBUTA,
    "v": ar.THEH,
    "j": ar.JEEM,
    "H": ar.HAH,
    "x": ar.KHAH,
    "d": ar.DAL,
    "*": ar.THAL,
    "r": ar.REH,
    "z": ar.ZAIN,
    "s": ar.SEEN,
    "$": ar.SHEEN,
    "S": ar.SAD,
    "D": ar.DAD,
    "T": ar.TAH,
    "Z": ar.ZAH,
    "E": ar.AIN,
    "g": ar.GHAIN,
    "f": ar.FEH,
    "q": ar.QAF,
    "k": ar.KAF,
    "l": ar.LAM,
    "m": ar.MEEM,
    "n": ar.NOON,
    "h": ar.HEH,
    "w": ar.WAW,
    "y": ar.YEH,
    "Y": ar.ALEF_MAKSURA,
    "'": ar.HAMZA,
    "&": ar.WAW_HAMZA,
    ">": ar.ALEF_HAMZA_ABOVE,
    "<": ar.ALEF_HAMZA_BELOW,
    "|": ar.ALEF_MADDA,
    "}": ar.YEH_HAMZA,
    "_": ar.TATWEEL,
    "a": ar.FATHA,
    "F": ar.FATHATAN,
    "i": ar.KASRA,
    "K": ar.KASRATAN,
    "u": ar.DAMMA,
    "N": ar.DAMMATAN,
    "~": ar.SHADDA,
    "o": ar.SUKUN,
    "`": ar.MINI_ALEF,
    # ~ '{': ar.ALEF_WASLA
    # "{" (alef wasla) is folded into plain ALEF; the original
    # ALEF_WASLA mapping is left commented out above.
    "{": ar.ALEF,
}
65
|
+
|
66
|
+
|
67
|
+
# Conversion table from the Tim Buckwalter representation
# into SAMPA phoneme notation (used by tim2sampa()).
t2sampa_table = {
    "A": "a:",  # ALEF,
    "b": "b",  # BEH
    "t": "t",  # TEH
    "p": "h",  # TEH_MARBUTA,
    "v": "T",  # THEH
    "j": "g",  # JEEM,
    "H": "x",  # HAH,
    "x": "X",  # KHAH,
    "d": "d",  # DAL,
    "*": "D",  # THAL,
    "r": "r",  # REH,
    "z": "z",  # ZAIN,
    "s": "s",  # SEEN,
    "$": "S",  # SHEEN,
    "S": "s'",  # SAD,
    "D": "d'",  # DAD,
    "T": "t'",  # TAH,
    "Z": "D'",  # ZAH,
    "E": "?'",  # AIN,
    "g": "G",  # GHAIN,
    "f": "f",  # FEH,
    "q": "q",  # QAF,
    "k": "k",  # KAF,
    "l": "l",  # LAM,
    "m": "m",  # MEEM,
    "n": "n",  # NOON,
    "h": "h",  # HEH,
    "w": "w",  # WAW,
    "y": "j",  # YEH,
    "Y": ":",  # ALEF_MAKSURA,
    "'": "?",  # HAMZA,
    "&": "?",  # WAW_HAMZA,
    ">": "?",  # ALEF_HAMZA_ABOVE,
    "<": "?",  # ALEF_HAMZA_BELOW,
    "|": "?a:",  # ALEF_MADDA,
    "}": "?",  # YEH_HAMZA,
    "_": "",  # '',#TATWEEL,
    "a": "a",  # FATHA,
    "F": "an",  # FATHATAN,
    "i": "i",  # KASRA,
    "K": "in",  # KASRATAN,
    "u": "u",  # DAMMA,
    "N": "un",  # DAMMATAN,
    # NOTE(review): maps "~" to the Arabic shadda character rather than a
    # SAMPA symbol; tim2sampa() removes "~" via convertShadda() before
    # translating, so this entry looks unreachable on that path — confirm.
    "~": ar.SHADDA,
    "o": "",  # SUKUN,
    "`": "a:",  # MINI_ALEF,
    "{": "",  # ALEF_WASLA,
}
118
|
+
|
119
|
+
|
120
|
+
# Arabic character -> plain-ASCII romanization (no special symbols),
# used by utf82latin().  Tatweel, shadda and sukun map to empty strings
# (i.e. they are dropped from the output).
a2en_table = {
    "ء": "2",
    "آ": "A",
    "أ": "A",
    "ؤ": "2",
    "إ": "2",
    "ئ": "2",
    "ا": "A",
    "ب": "b",
    "ة": "t",
    "ت": "t",
    "ث": "th",
    "ج": "j",
    "ح": "H",
    "خ": "kh",
    "د": "d",
    "ذ": "dh",
    "ر": "r",
    "ز": "z",
    "س": "s",
    "ش": "sh",
    "ص": "S",
    "ض": "D",
    "ط": "T",
    "ظ": "zh",
    "ع": "E",
    "غ": "g",
    "ـ": "",
    "ف": "f",
    "ق": "q",
    "ك": "k",
    "ل": "l",
    "م": "m",
    "ن": "n",
    "ه": "h",
    "و": "w",
    "ى": "a",
    "ي": "y",
    "ً": "an",
    "ٌ": "un",
    "ٍ": "in",
    "َ": "a",
    "ُ": "u",
    "ِ": "i",
    "ّ": "",
    "ْ": "",
    "ٰ": "a",
}
168
|
+
|
169
|
+
# Inverse table: Arabic character -> Buckwalter letter.  Both "A" and "{"
# map to ar.ALEF in t2a_table, so the inversion keeps whichever came last;
# the explicit assignment below forces ALEF back to the canonical "A".
a2t_table = {v: k for k, v in t2a_table.items()}
# correct case
a2t_table[ar.ALEF] = "A"

# Tashkeel (diacritic) <-> compact-code translation tables used by
# encode_tashkeel()/decode_tashkeel():
#   T2D/D2T: marks <-> decimal digit string
#   T2A/A2T: marks <-> ASCII letter string
T2D_TRANS = str.maketrans(ar.NOT_DEF_HARAKA + ar.TASHKEEL_STRING, "012345678")
T2A_TRANS = str.maketrans(ar.NOT_DEF_HARAKA + ar.TASHKEEL_STRING, "0AUIauio3")
D2T_TRANS = str.maketrans("012345678", ar.NOT_DEF_HARAKA + ar.TASHKEEL_STRING)
A2T_TRANS = str.maketrans("0AUIauio3", ar.NOT_DEF_HARAKA + ar.TASHKEEL_STRING)
# -------------- Digits Trans -------------- #
# Pairwise digit translation tables between the western (E/W naming from
# the araby constants), eastern and persian numeral systems.
E2W_TRANS = str.maketrans("".join(ar.NUMBERS_EAST), "".join(ar.NUMBERS_WEST))
E2P_TRANS = str.maketrans("".join(ar.NUMBERS_EAST), "".join(ar.NUMBERS_PERS))
W2E_TRANS = str.maketrans("".join(ar.NUMBERS_WEST), "".join(ar.NUMBERS_EAST))
W2P_TRANS = str.maketrans("".join(ar.NUMBERS_WEST), "".join(ar.NUMBERS_PERS))
P2E_TRANS = str.maketrans("".join(ar.NUMBERS_PERS), "".join(ar.NUMBERS_EAST))
P2W_TRANS = str.maketrans("".join(ar.NUMBERS_PERS), "".join(ar.NUMBERS_WEST))
184
|
+
|
185
|
+
|
186
|
+
def translate(word, table):
    """Translate *word* according to *table* (a ``str.maketrans`` mapping).

    @param word: text to translate
    @type word: unicode
    @param table: translation table as produced by ``str.maketrans``
    @return: translated text
    @rtype: unicode
    """
    return word.translate(table)
|
189
|
+
|
190
|
+
|
191
|
+
def tim2utf8(s):
    """Convert a Buckwalter-transliterated string to Arabic (UTF-8) text.

    Characters without a mapping in ``t2a_table`` pass through unchanged.
    """
    return "".join(t2a_table.get(ch, ch) for ch in s)
|
197
|
+
|
198
|
+
|
199
|
+
def utf82tim(s):
    """Convert an Arabic string to its Tim Buckwalter transliteration.

    Characters without a mapping in ``a2t_table`` pass through unchanged.
    """
    return "".join(a2t_table.get(ch, ch) for ch in s)
|
205
|
+
|
206
|
+
|
207
|
+
def convertShadda(word, shadda="~"):
    """Expand each shadda mark into a doubling of the preceding character.

    Leading shadda marks (which have no preceding character to double)
    are dropped.  Fixes two defects of the original: an IndexError on an
    empty *word*, and an infinite loop when more than one shadda started
    the word (only the first was stripped, leaving a shadda at index 0
    that the replace loop could never consume).

    @param word: Buckwalter-transliterated word
    @type word: unicode
    @param shadda: the character used as the shadda mark
    @return: word with shaddas expanded to doubled characters
    @rtype: unicode
    """
    # Drop every leading shadda; also makes the empty string safe.
    while word and word[0] == shadda:
        word = word[1:]
    while shadda in word:
        i = word.index(shadda)
        if i - 1 >= 0:
            # Replace the first (and only the first) shadda occurrence
            # with a copy of the character before it.
            word = word.replace(shadda, word[i - 1], 1)
        else:
            # Defensive: unreachable after the leading strip above.
            word = word[1:]
    return word
|
218
|
+
|
219
|
+
|
220
|
+
def tim2sampa(s):
    """Convert a Buckwalter string to SAMPA phoneme codes.

    The input is assumed to be fully vocalized.  Shadda is expanded to a
    doubled letter *before* the per-character translation so that
    phonemes spelled with two characters (e.g. "s'") are doubled whole.
    """
    expanded = convertShadda(s)
    result = "".join(t2sampa_table.get(ch, ch) for ch in expanded)
    # waw after damma and yeh after kasra mark long vowels
    result = re.sub("(?<=u)w", ":", result)
    result = re.sub("(?<=i)j", ":", result)
    return result
|
238
|
+
|
239
|
+
|
240
|
+
def utf82latin(s):
    """Romanize an Arabic string into plain ASCII letters.

    Characters without a mapping in ``a2en_table`` pass through unchanged.
    """
    return "".join(a2en_table.get(ch, ch) for ch in s)
|
246
|
+
|
247
|
+
|
248
|
+
def convert(text, code_from, code_to):
    """Convert *text* between transliteration systems.

    Supported sources: ``utf``/``utf8``/``arabic`` and
    ``tim``/``buckwalter``.  Supported targets: ``tim``/``buckwalter``,
    ``sampa``, ``latin``/``ascii`` (from Arabic) and
    ``utf``/``utf8``/``arabic``, ``sampa`` (from Buckwalter).
    Any unrecognized combination returns *text* unchanged.

    @param text: text to convert
    @type text: unicode
    @param code_from: source encoding name (case-insensitive)
    @param code_to: target encoding name (case-insensitive)
    @return: converted text
    @rtype: unicode
    """
    code1 = code_from.lower()
    code2 = code_to.lower()
    if code1 in ("utf", "utf8", "arabic"):
        if code2 in ("tim", "buckwalter"):
            return utf82tim(text)
        elif code2 == "sampa":
            return tim2sampa(utf82tim(text))
        elif code2 in ("latin", "ascii"):
            return utf82latin(text)
        else:
            return text

    if code1 in ("tim", "buckwalter"):
        if code2 in ("utf", "utf8", "arabic"):
            return tim2utf8(text)
        elif code2 == "sampa":
            return tim2sampa(text)
        else:
            return text

    # Fix: an unrecognized source previously fell off the end and
    # returned None; return the text unchanged for consistency with the
    # unknown-target branches above.
    return text
|
272
|
+
|
273
|
+
|
274
|
+
def segment_language(text):
    """Split *text* into a list of ("arabic"|"latin", chunk) pairs.

    Whitespace, digits and common punctuation stick to whichever run is
    currently open; when a latin letter follows an Arabic run, trailing
    neutral characters are moved into the new latin chunk.

    Fixes: the regex character classes were written in non-raw strings
    with invalid escape sequences ("\\?", "\\!", "\\(") which raise
    SyntaxWarning on modern Pythons; patterns are now raw strings and
    compiled once before the loop.

    @param text: mixed Arabic/latin text
    @type text: unicode
    @return: list of (language, chunk) pairs; the empty string is
        returned as-is (historical quirk kept for compatibility)
    @rtype: list of tuples, or unicode for empty input
    """
    if not text:
        return text
    arabic_re = re.compile(r"[\u0600-\u06ff]")
    # characters that belong to whichever run is currently open
    neutral_re = re.compile(r"[\s\d?, :!()]")
    resultlist = []
    arabic = bool(arabic_re.search(text[0]))
    actual_text = ""
    for k in text:
        if arabic_re.search(k):
            if arabic:
                actual_text += k
            else:
                resultlist.append(("latin", actual_text))
                arabic = True
                actual_text = k
        elif neutral_re.search(k):
            actual_text += k
        else:
            if arabic:
                # Peel trailing neutral characters off the Arabic run so
                # they start the upcoming latin chunk instead.
                i = len(actual_text)
                temp_text = ""
                while not arabic_re.search(actual_text[i : i + 1]):
                    i -= 1
                temp_text = actual_text[i + 1 :]
                actual_text = actual_text[: i + 1]
                resultlist.append(("arabic", actual_text))
                arabic = False
                actual_text = temp_text + k
            else:
                actual_text += k
    # flush the final run
    resultlist.append(("arabic" if arabic else "latin", actual_text))
    return resultlist
|
315
|
+
|
316
|
+
|
317
|
+
def delimite_language(text, language="arabic", start="<", end=">"):
    """Wrap every chunk detected as *language* in *start*/*end* markers.

    Chunks come from segment_language(); delimited and plain chunks are
    re-joined with single spaces.
    """
    decorated = []
    for lang, chunk in segment_language(text):
        if lang == language:
            decorated.append("%s%s%s" % (start, chunk, end))
        else:
            decorated.append(chunk)
    return " ".join(decorated)
|
326
|
+
|
327
|
+
|
328
|
+
def normalize_digits(text, source="all", out="west"):
    """
    Normalize digits between the following writing systems:

    - west: Western Arabic numerals (0123456789)
    - east: Eastern Arabic (Hindu-Arabic) numerals (٠١٢٣٤٥٦٧٨٩)
    - persian: Persian/Urdu numerals (۰۱۲۳۴۵۶۷۸۹)

    With ``source="all"`` every digit found in the text is converted to
    the *out* system; otherwise only digits written in *source* are
    converted, leaving the rest untouched.

    Example:
        >>> text = u'۰۱۲۳۴۵۶۷۸۹ ٠١٢٣٤٥٦٧٨٩ 123456789'
        >>> normalize_digits(text, source='all', out='west')
        '0123456789 0123456789 0123456789'
        >>> normalize_digits(text, source='persian', out='west')
        >>> '0123456789 ٠١٢٣٤٥٦٧٨٩ 0123456789'

    @param text: unnormalized text.
    @type text: unicode.
    @param source: writing system of the digits to normalize
        ("all", "west", "east" or "persian"; default is all).
    @type source: string
    @param out: intended writing system (default is west).
    @return: returns a normalized text.
    @rtype: unicode.
    """
    source = source.lower()
    out = out.lower()
    assert source in ["all", "west", "east", "persian"], (
        "Invalid option for `source`: %s" % source
    )
    assert out in ["west", "east", "persian"], "Invalid option for `out`: %s" % out
    if source == out:
        return text
    conversion_tables = {
        "west": {"east": W2E_TRANS, "persian": W2P_TRANS},
        "east": {"west": E2W_TRANS, "persian": E2P_TRANS},
        "persian": {"west": P2W_TRANS, "east": P2E_TRANS},
    }
    if source != "all":
        return translate(text, conversion_tables[source][out])
    # "all": run every system except the target through its table
    for origin, targets in conversion_tables.items():
        if origin != out:
            text = translate(text, targets[out])
    return text
|
376
|
+
|
377
|
+
|
378
|
+
def encode_tashkeel(word, method="ascii"):
    """
    Encode word marks (tashkeel) into a decimal or ASCII string so they
    can be stored compactly, e.g. as an integer.

    Example:
        >>> import pyarabic.trans
        >>> word1 = u"هَارِبًا"
        >>> pyarabic.trans.encode_tashkeel(word1)
        ('هاربا', 'a0iA0')
        >>> pyarabic.trans.encode_tashkeel(word1, "decimal")
        ('هاربا', 40610)

    @param word: diacritized arabic word
    @type word: unicode
    @param method: "ascii" (default) or "decimal"; any other value
        behaves like "ascii" (preserves the original fall-through)
    @return: (letters, encoded); (word, "") when the decimal
        conversion fails
    @rtype: (unicode, unicode or int)
    """
    letters, marks = ar.separate(word)

    if method == "decimal":
        transed = translate(marks, T2D_TRANS)
        try:
            # Pack the digit string as an integer; leading zeros are
            # restored by decode_tashkeel via rjust.
            transed = int(transed)
        except ValueError:
            # Fix: was a bare `except:`; int() on a str can only raise
            # ValueError (empty string or untranslated residue chars).
            return word, ""
    else:
        # "ascii" and unknown methods share the ASCII table, matching
        # the original if/elif/else behavior.
        transed = translate(marks, T2A_TRANS)
    return letters, transed
|
419
|
+
|
420
|
+
|
421
|
+
def decode_tashkeel(word, marks, method="ascii"):
    """
    Decode *marks* (encoded by encode_tashkeel) and join them back onto
    the undiacritized *word*.

    @param word: undiacritized arabic word
    @type word: unicode
    @param marks: encoded marks
    @type marks: unicode/integer
    @param method: "ascii" (default) or "decimal"; any other value
        behaves like "ascii" (preserves the original fall-through)
    @return: diacritized word
    @rtype: unicode
    """
    # Fix: was `if type(marks) != (str)`; isinstance is the idiomatic
    # (and subclass-safe) type check.
    if not isinstance(marks, str):
        marks = str(marks)
    # Integer-encoded marks lose leading zeros; pad them back on the left.
    marks = marks.rjust(len(word), "0")
    if method == "decimal":
        transed = translate(marks, D2T_TRANS)
    else:
        # "ascii" and unknown methods both use the ASCII table,
        # mirroring encode_tashkeel.
        transed = translate(marks, A2T_TRANS)
    return ar.joint(word, transed)
|
444
|
+
|
445
|
+
|
446
|
+
if __name__ == "__main__":

    # Demo / smoke test: round-trip a fully vocalized Buckwalter text
    # through the conversion helpers and print the comparison columns.
    words = """qulo
>aEuw*u
bi
rab~i
{l
n~aAsi
maliki
{l
n~aAsi
<ila`hi
{l
n~aAsi
min
$ar~i
{lo
wasowaAsi
{lo
xan~aAsi
{l~a*iY
yuwasowisu
fiY
Suduwri
{l
n~aAsi
mina
{lo
jin~api
wa
{l
n~aAsi""".split(
        "\n"
    )
    for word in words:
        arabic = tim2utf8(word)
        timu = utf82tim(arabic)
        sampa = tim2sampa(word)
        arabic2 = convert(word, "tim", "utf")
        timu2 = convert(arabic2, "utf", "tim")  # utf82tim(arabic)
        sampa2 = convert(word, "tim", "sampa")  # tim2sampa(word)
        # NOTE(review): printing an encoded str shows the bytes repr
        # (b'...') on Python 3; looks like leftover Python 2 output style.
        print(
            "\t".join(
                [
                    word,
                    arabic,
                    timu,
                    sampa,
                    arabic2,
                    timu2,
                    sampa2,
                    str(timu == word),
                    str(arabic2 == arabic),
                    str(timu2 == timu),
                    str(sampa2 == sampa),
                ]
            ).encode("utf8")
        )
        # ~ print(u'\t'.join([word, tim2sampa(word)]).encode('utf8'))

    utf2tim_table = {v: k for k, v in t2a_table.items()}
    print(utf2tim_table)

    # NOTE(review): absolute import, while the module itself uses a
    # relative import (`from . import araby as ar`); this demo probably
    # only works when run from inside the package directory — confirm.
    from arabrepr import arepr

    # test detect language
    text = """السلام عليكم how are you, لم اسمع أخبارك منذ مدة, where are you going"""
    print(arepr(segment_language(text)))
    # NOTE(review): "\RL{" contains the invalid escape sequence "\R"
    # (SyntaxWarning on newer Pythons); kept byte-identical here since a
    # doc_update must not alter runtime strings.
    text_out = delimite_language(text, start="\RL{", end="}")
    print(text_out.encode("utf8"))
    text_out = delimite_language(text, start="<arabic>", end="</arabic>")
    print(text_out.encode("utf8"))
|