phoonnx 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. phoonnx/__init__.py +0 -0
  2. phoonnx/config.py +490 -0
  3. phoonnx/locale/ca/phonetic_spellings.txt +2 -0
  4. phoonnx/locale/en/phonetic_spellings.txt +1 -0
  5. phoonnx/locale/gl/phonetic_spellings.txt +2 -0
  6. phoonnx/locale/pt/phonetic_spellings.txt +2 -0
  7. phoonnx/phoneme_ids.py +453 -0
  8. phoonnx/phonemizers/__init__.py +45 -0
  9. phoonnx/phonemizers/ar.py +42 -0
  10. phoonnx/phonemizers/base.py +216 -0
  11. phoonnx/phonemizers/en.py +250 -0
  12. phoonnx/phonemizers/fa.py +46 -0
  13. phoonnx/phonemizers/gl.py +142 -0
  14. phoonnx/phonemizers/he.py +67 -0
  15. phoonnx/phonemizers/ja.py +119 -0
  16. phoonnx/phonemizers/ko.py +97 -0
  17. phoonnx/phonemizers/mul.py +606 -0
  18. phoonnx/phonemizers/vi.py +44 -0
  19. phoonnx/phonemizers/zh.py +308 -0
  20. phoonnx/thirdparty/__init__.py +0 -0
  21. phoonnx/thirdparty/arpa2ipa.py +249 -0
  22. phoonnx/thirdparty/cotovia/cotovia_aarch64 +0 -0
  23. phoonnx/thirdparty/cotovia/cotovia_x86_64 +0 -0
  24. phoonnx/thirdparty/hangul2ipa.py +783 -0
  25. phoonnx/thirdparty/ko_tables/aspiration.csv +20 -0
  26. phoonnx/thirdparty/ko_tables/assimilation.csv +31 -0
  27. phoonnx/thirdparty/ko_tables/double_coda.csv +17 -0
  28. phoonnx/thirdparty/ko_tables/hanja.tsv +8525 -0
  29. phoonnx/thirdparty/ko_tables/ipa.csv +22 -0
  30. phoonnx/thirdparty/ko_tables/neutralization.csv +11 -0
  31. phoonnx/thirdparty/ko_tables/tensification.csv +56 -0
  32. phoonnx/thirdparty/ko_tables/yale.csv +22 -0
  33. phoonnx/thirdparty/kog2p/__init__.py +385 -0
  34. phoonnx/thirdparty/kog2p/rulebook.txt +212 -0
  35. phoonnx/thirdparty/mantoq/__init__.py +67 -0
  36. phoonnx/thirdparty/mantoq/buck/__init__.py +0 -0
  37. phoonnx/thirdparty/mantoq/buck/phonetise_buckwalter.py +569 -0
  38. phoonnx/thirdparty/mantoq/buck/symbols.py +64 -0
  39. phoonnx/thirdparty/mantoq/buck/tokenization.py +105 -0
  40. phoonnx/thirdparty/mantoq/num2words.py +37 -0
  41. phoonnx/thirdparty/mantoq/pyarabic/__init__.py +12 -0
  42. phoonnx/thirdparty/mantoq/pyarabic/arabrepr.py +64 -0
  43. phoonnx/thirdparty/mantoq/pyarabic/araby.py +1647 -0
  44. phoonnx/thirdparty/mantoq/pyarabic/named_const.py +227 -0
  45. phoonnx/thirdparty/mantoq/pyarabic/normalize.py +161 -0
  46. phoonnx/thirdparty/mantoq/pyarabic/number.py +826 -0
  47. phoonnx/thirdparty/mantoq/pyarabic/number_const.py +1704 -0
  48. phoonnx/thirdparty/mantoq/pyarabic/stack.py +52 -0
  49. phoonnx/thirdparty/mantoq/pyarabic/trans.py +517 -0
  50. phoonnx/thirdparty/mantoq/unicode_symbol2label.py +4173 -0
  51. phoonnx/thirdparty/tashkeel/LICENSE +22 -0
  52. phoonnx/thirdparty/tashkeel/SOURCE +1 -0
  53. phoonnx/thirdparty/tashkeel/__init__.py +212 -0
  54. phoonnx/thirdparty/tashkeel/hint_id_map.json +18 -0
  55. phoonnx/thirdparty/tashkeel/input_id_map.json +56 -0
  56. phoonnx/thirdparty/tashkeel/model.onnx +0 -0
  57. phoonnx/thirdparty/tashkeel/target_id_map.json +17 -0
  58. phoonnx/thirdparty/zh_num.py +238 -0
  59. phoonnx/util.py +705 -0
  60. phoonnx/version.py +6 -0
  61. phoonnx/voice.py +521 -0
  62. phoonnx-0.0.0.dist-info/METADATA +255 -0
  63. phoonnx-0.0.0.dist-info/RECORD +86 -0
  64. phoonnx-0.0.0.dist-info/WHEEL +5 -0
  65. phoonnx-0.0.0.dist-info/top_level.txt +2 -0
  66. phoonnx_train/__main__.py +151 -0
  67. phoonnx_train/export_onnx.py +109 -0
  68. phoonnx_train/norm_audio/__init__.py +92 -0
  69. phoonnx_train/norm_audio/trim.py +54 -0
  70. phoonnx_train/norm_audio/vad.py +54 -0
  71. phoonnx_train/preprocess.py +420 -0
  72. phoonnx_train/vits/__init__.py +0 -0
  73. phoonnx_train/vits/attentions.py +427 -0
  74. phoonnx_train/vits/commons.py +147 -0
  75. phoonnx_train/vits/config.py +330 -0
  76. phoonnx_train/vits/dataset.py +214 -0
  77. phoonnx_train/vits/lightning.py +352 -0
  78. phoonnx_train/vits/losses.py +58 -0
  79. phoonnx_train/vits/mel_processing.py +139 -0
  80. phoonnx_train/vits/models.py +732 -0
  81. phoonnx_train/vits/modules.py +527 -0
  82. phoonnx_train/vits/monotonic_align/__init__.py +20 -0
  83. phoonnx_train/vits/monotonic_align/setup.py +13 -0
  84. phoonnx_train/vits/transforms.py +212 -0
  85. phoonnx_train/vits/utils.py +16 -0
  86. phoonnx_train/vits/wavfile.py +860 -0
@@ -0,0 +1,52 @@
1
+ #!/usr/bin/python
2
+ # -*- coding=utf-8 -*-
3
+ """
4
+ Stack module
5
+ @author: Taha Zerrouki
6
+ @contact: taha dot zerrouki at gmail dot com
7
+ @copyright: Arabtechies, Arabeyes, Taha Zerrouki
8
+ @license: GPL
9
+ @date:2010/03/01
10
+ @version: 0.1
11
+ """
12
+
13
+
14
class Stack(object):
    """
    Simple LIFO stack backed by a Python list.

    Optionally seeded from an iterable (typically a string), whose
    elements become the initial stack contents.
    """

    def __init__(self, text=""):
        """
        Create a stack.

        @param text: optional iterable whose items seed the stack
        @type text: iterable (default: empty string)
        """
        self.items = list(text)

    def push(self, item):
        """
        Push an item onto the stack.

        @param item: pushed item
        @type item: mixed
        @return: None
        @rtype: None
        """
        self.items.append(item)

    def pop(self):
        """
        Pop the top item from the stack.

        @return: popped item, or None if the stack is empty
        @rtype: mixed
        """
        # Guard clause instead of if/else: empty stack yields None.
        if self.is_empty():
            return None
        return self.items.pop()

    def is_empty(self):
        """
        Test if the stack is empty.

        @return: True or False
        @rtype: boolean
        """
        # Idiomatic truthiness test (empty list is falsy).
        return not self.items
@@ -0,0 +1,517 @@
1
+ #!/usr/bin/python
2
+ # -*- coding=utf-8 -*-
3
+ """
4
+ Arabic Transliteration routins
5
+ @author: Taha Zerrouki
6
+ @contact: taha dot zerrouki at gmail dot com
7
+ @license: GPL
8
+ @date:2018/08/146
9
+ @version: 0.1
10
+ """
11
+ import re
12
+
13
+ from . import araby as ar
14
+
15
# Transliteration table: (modified) Tim Buckwalter ASCII symbol -> Arabic
# character constant from the araby module.  Used by tim2utf8().
t2a_table = {
    "A": ar.ALEF,
    "b": ar.BEH,
    "t": ar.TEH,
    "p": ar.TEH_MARBUTA,
    "v": ar.THEH,
    "j": ar.JEEM,
    "H": ar.HAH,
    "x": ar.KHAH,
    "d": ar.DAL,
    "*": ar.THAL,
    "r": ar.REH,
    "z": ar.ZAIN,
    "s": ar.SEEN,
    "$": ar.SHEEN,
    "S": ar.SAD,
    "D": ar.DAD,
    "T": ar.TAH,
    "Z": ar.ZAH,
    "E": ar.AIN,
    "g": ar.GHAIN,
    "f": ar.FEH,
    "q": ar.QAF,
    "k": ar.KAF,
    "l": ar.LAM,
    "m": ar.MEEM,
    "n": ar.NOON,
    "h": ar.HEH,
    "w": ar.WAW,
    "y": ar.YEH,
    "Y": ar.ALEF_MAKSURA,
    "'": ar.HAMZA,
    "&": ar.WAW_HAMZA,
    ">": ar.ALEF_HAMZA_ABOVE,
    "<": ar.ALEF_HAMZA_BELOW,
    "|": ar.ALEF_MADDA,
    "}": ar.YEH_HAMZA,
    "_": ar.TATWEEL,
    "a": ar.FATHA,
    "F": ar.FATHATAN,
    "i": ar.KASRA,
    "K": ar.KASRATAN,
    "u": ar.DAMMA,
    "N": ar.DAMMATAN,
    "~": ar.SHADDA,
    "o": ar.SUKUN,
    "`": ar.MINI_ALEF,
    # ~ '{': ar.ALEF_WASLA
    # "{" (alef wasla) is deliberately folded into plain ALEF.
    "{": ar.ALEF,
}
65
+
66
+
67
# Conversion table from the Tim Buckwalter representation
# into SAMPA phonetic notation.  Used by tim2sampa().
t2sampa_table = {
    "A": "a:", # ALEF,
    "b": "b", # BEH
    "t": "t", # TEH
    "p": "h", # TEH_MARBUTA,
    "v": "T", # THEH
    "j": "g", # JEEM,
    "H": "x", # HAH,
    "x": "X", # KHAH,
    "d": "d", # DAL,
    "*": "D", # THAL,
    "r": "r", # REH,
    "z": "z", # ZAIN,
    "s": "s", # SEEN,
    "$": "S", # SHEEN,
    "S": "s'", # SAD,
    "D": "d'", # DAD,
    "T": "t'", # TAH,
    "Z": "D'", # ZAH,
    "E": "?'", # AIN,
    "g": "G", # GHAIN,
    "f": "f", # FEH,
    "q": "q", # QAF,
    "k": "k", # KAF,
    "l": "l", # LAM,
    "m": "m", # MEEM,
    "n": "n", # NOON,
    "h": "h", # HEH,
    "w": "w", # WAW,
    "y": "j", # YEH,
    "Y": ":", # ALEF_MAKSURA,
    "'": "?", # HAMZA,
    "&": "?", # WAW_HAMZA,
    ">": "?", # ALEF_HAMZA_ABOVE,
    "<": "?", # ALEF_HAMZA_BELOW,
    "|": "?a:", # ALEF_MADDA,
    "}": "?", # YEH_HAMZA,
    "_": "", # '',#TATWEEL,
    "a": "a", # FATHA,
    "F": "an", # FATHATAN,
    "i": "i", # KASRA,
    "K": "in", # KASRATAN,
    "u": "u", # DAMMA,
    "N": "un", # DAMMATAN,
    # NOTE(review): SHADDA maps to the Arabic mark itself, not a SAMPA
    # symbol — tim2sampa() expands shaddas via convertShadda() before the
    # lookup, so this entry is normally never reached; confirm before use.
    "~": ar.SHADDA,
    "o": "", # SUKUN,
    "`": "a:", # MINI_ALEF,
    "{": "", # ALEF_WASLA,
}
118
+
119
+
120
# Arabic character -> plain-ASCII Latin letter(s) with no special symbols.
# Used by utf82latin().
a2en_table = {
    "ء": "2",
    "آ": "A",
    "أ": "A",
    "ؤ": "2",
    "إ": "2",
    "ئ": "2",
    "ا": "A",
    "ب": "b",
    "ة": "t",
    "ت": "t",
    "ث": "th",
    "ج": "j",
    "ح": "H",
    "خ": "kh",
    "د": "d",
    "ذ": "dh",
    "ر": "r",
    "ز": "z",
    "س": "s",
    "ش": "sh",
    "ص": "S",
    "ض": "D",
    "ط": "T",
    "ظ": "zh",
    "ع": "E",
    "غ": "g",
    "ـ": "",
    "ف": "f",
    "ق": "q",
    "ك": "k",
    "ل": "l",
    "م": "m",
    "ن": "n",
    "ه": "h",
    "و": "w",
    "ى": "a",
    "ي": "y",
    "ً": "an",
    "ٌ": "un",
    "ٍ": "in",
    "َ": "a",
    "ُ": "u",
    "ِ": "i",
    "ّ": "",
    "ْ": "",
    "ٰ": "a",
}
168
+
169
# Reverse map: Arabic character -> Buckwalter symbol.
a2t_table = {v: k for k, v in t2a_table.items()}
# correct case: both "A" and "{" map to ALEF in t2a_table and dict inversion
# keeps whichever came last — force the canonical "A" for ALEF.
a2t_table[ar.ALEF] = "A"

# Tashkeel (diacritic) <-> digit / ASCII-letter encodings used by
# encode_tashkeel() and decode_tashkeel().
T2D_TRANS = str.maketrans(ar.NOT_DEF_HARAKA + ar.TASHKEEL_STRING, "012345678")
T2A_TRANS = str.maketrans(ar.NOT_DEF_HARAKA + ar.TASHKEEL_STRING, "0AUIauio3")
D2T_TRANS = str.maketrans("012345678", ar.NOT_DEF_HARAKA + ar.TASHKEEL_STRING)
A2T_TRANS = str.maketrans("0AUIauio3", ar.NOT_DEF_HARAKA + ar.TASHKEEL_STRING)
# -------------- Digits Trans -------------- #
# Digit translation tables between Western ("west"), Eastern Arabic ("east")
# and Persian numeral systems; used by normalize_digits().
E2W_TRANS = str.maketrans("".join(ar.NUMBERS_EAST), "".join(ar.NUMBERS_WEST))
E2P_TRANS = str.maketrans("".join(ar.NUMBERS_EAST), "".join(ar.NUMBERS_PERS))
W2E_TRANS = str.maketrans("".join(ar.NUMBERS_WEST), "".join(ar.NUMBERS_EAST))
W2P_TRANS = str.maketrans("".join(ar.NUMBERS_WEST), "".join(ar.NUMBERS_PERS))
P2E_TRANS = str.maketrans("".join(ar.NUMBERS_PERS), "".join(ar.NUMBERS_EAST))
P2W_TRANS = str.maketrans("".join(ar.NUMBERS_PERS), "".join(ar.NUMBERS_WEST))
184
+
185
+
186
def translate(word, table):
    """Apply a str.maketrans-style *table* to *word* and return the result."""
    translated = word.translate(table)
    return translated
189
+
190
+
191
def tim2utf8(s):
    """Convert a Buckwalter-transliterated string to Arabic (UTF-8) text."""
    # Symbols without a table entry pass through unchanged.
    return "".join(t2a_table.get(symbol, symbol) for symbol in s)
197
+
198
+
199
def utf82tim(s):
    """Convert an Arabic string to its Tim Buckwalter transliteration."""
    # Characters without a table entry pass through unchanged.
    return "".join(a2t_table.get(letter, letter) for letter in s)
205
+
206
+
207
def convertShadda(word, shadda="~"):
    """
    Expand every shadda mark into a doubling of the preceding letter.

    Example: "rab~i" -> "rabbi".

    @param word: transliterated word possibly containing shadda marks
    @type word: str
    @param shadda: the character used as the shadda mark (default "~")
    @type shadda: str
    @return: word with each shadda replaced by the letter it doubles;
        a shadda with no preceding letter is dropped
    @rtype: str
    """
    # Guard: empty input (the original indexed word[0] and raised IndexError).
    if not word:
        return word
    if word[0] == shadda:
        # a shadda cannot appear at the beginning of a word
        word = word[1:]
    while shadda in word:
        i = word.index(shadda)
        if i >= 1:
            # double the letter before the shadda (replace first match only)
            word = word.replace(shadda, word[i - 1], 1)
        else:
            # leading shadda with nothing to double: drop it
            # (the original looped forever on this case)
            word = word[1:]
    return word
218
+
219
+
220
def tim2sampa(s):
    """
    Convert a fully vocalized Buckwalter string to SAMPA phonemes.

    Shadda marks are expanded into doubled letters *before* the per-symbol
    table lookup, because some phonemes map to two output characters and a
    post-hoc doubling would corrupt them.
    """
    expanded = convertShadda(s)
    result = "".join(t2sampa_table.get(symbol, symbol) for symbol in expanded)
    # Long vowels: waw after damma and yeh after kasra lengthen the vowel.
    result = re.sub("(?<=u)w", ":", result)
    result = re.sub("(?<=i)j", ":", result)
    return result
238
+
239
+
240
def utf82latin(s):
    """Transliterate Arabic text to plain-ASCII Latin (no special symbols)."""
    # Characters without a table entry pass through unchanged.
    return "".join(a2en_table.get(letter, letter) for letter in s)
246
+
247
+
248
def convert(text, code_from, code_to):
    """
    Convert *text* between transliteration systems.

    Supported source codes: "utf"/"utf8"/"arabic" and "tim"/"buckwalter".
    Supported target codes: the above plus "sampa" and "latin"/"ascii".
    Unknown source/target combinations return the text unchanged.

    @param text: input text
    @param code_from: source notation name (case-insensitive)
    @param code_to: target notation name (case-insensitive)
    @return: converted text, or the original text when no converter applies
    """
    code1 = code_from.lower()
    code2 = code_to.lower()
    if code1 in ("utf", "utf8", "arabic"):
        if code2 in ("tim", "buckwalter"):
            return utf82tim(text)
        elif code2 == "sampa":
            return tim2sampa(utf82tim(text))
        elif code2 in ("latin", "ascii"):
            return utf82latin(text)
        else:
            return text

    if code1 in ("tim", "buckwalter"):
        if code2 in ("utf", "utf8", "arabic"):
            return tim2utf8(text)
        elif code2 == "sampa":
            return tim2sampa(text)
        else:
            return text

    # Bug fix: an unrecognized code_from previously fell through and
    # implicitly returned None; return the input unchanged instead.
    return text
272
+
273
+
274
def segment_language(text):
    """
    Split *text* into runs of Arabic and non-Arabic ("latin") characters.

    @param text: input text
    @return: list of (language, chunk) tuples where language is "arabic"
        or "latin"; empty/falsy input is returned unchanged
    """
    if not text:
        return text
    resultlist = []
    # Seed the current-language state from the first character.
    if re.search("[\u0600-\u06ff]", text[0]):
        # ~ if re.search(u"[\u0600-\u06ff]", text):
        arabic = True
    else:
        arabic = False
    actual_text = ""
    for k in text:
        if re.search("[\u0600-\u06ff]", k):
            if arabic:
                actual_text += k
            else:
                # switch latin -> arabic: flush the accumulated latin run
                resultlist.append(("latin", actual_text))
                arabic = True
                actual_text = k
        elif re.search("[\s\d\?, :\!\(\)]", k):
            # neutral characters (whitespace, digits, some punctuation)
            # stay attached to whichever run is current
            actual_text += k
        else:
            if arabic:
                # switch arabic -> latin: neutral characters trailing the
                # last Arabic letter are moved into the new latin run
                i = len(actual_text)
                temp_text = ""
                while not re.search("[\u0600-\u06ff]", actual_text[i : i + 1]):
                    i -= 1
                temp_text = actual_text[i + 1 :]
                actual_text = actual_text[: i + 1]
                resultlist.append(("arabic", actual_text))
                arabic = False
                actual_text = temp_text + k
            else:
                actual_text += k
    # Flush the final pending run.
    if arabic:
        resultlist.append(("arabic", actual_text))
    else:
        resultlist.append(("latin", actual_text))
    return resultlist
315
+
316
+
317
def delimite_language(text, language="arabic", start="<", end=">"):
    """
    Wrap every chunk of *language* in *text* between *start* and *end*.

    Chunks come from segment_language(); chunks of other languages are
    kept as-is, and all chunks are re-joined with single spaces.
    """
    decorated = [
        "%s%s%s" % (start, chunk, end) if lang == language else chunk
        for lang, chunk in segment_language(text)
    ]
    return " ".join(decorated)
326
+
327
+
328
def normalize_digits(text, source="all", out="west"):
    """
    Normalize digits between numeral writing systems.

    Systems:
        west: Western Arabic numerals (0123456789)
        east: Eastern Arabic (Hindu-Arabic) numerals (٠١٢٣٤٥٦٧٨٩)
        persian: Persian/Urdu numerals (۰۱۲۳۴۵۶۷۸۹)

    With `source="all"` every digit in the text is converted into the
    `out` system; otherwise only digits of the `source` system are
    converted and the rest are left untouched.

    Example:
        >>> text = u'۰۱۲۳۴۵۶۷۸۹ ٠١٢٣٤٥٦٧٨٩ 123456789'
        >>> normalize_digits(text, source='all', out='west')
        '0123456789 0123456789 0123456789'

    @param text: unnormalized text
    @type text: unicode
    @param source: writing system of the digits to normalize ("all",
        "west", "east" or "persian"; default "all")
    @type source: string
    @param out: target writing system ("west", "east" or "persian";
        default "west")
    @return: normalized text
    @rtype: unicode
    """
    source = source.lower()
    out = out.lower()
    assert source in ["all", "west", "east", "persian"], (
        "Invalid option for `source`: %s" % source
    )
    assert out in ["west", "east", "persian"], "Invalid option for `out`: %s" % out
    if source == out:
        return text
    conversion_tables = {
        "west": {"east": W2E_TRANS, "persian": W2P_TRANS},
        "east": {"west": E2W_TRANS, "persian": E2P_TRANS},
        "persian": {"west": P2W_TRANS, "east": P2E_TRANS},
    }
    if source != "all":
        # single-system conversion: one translation pass
        return translate(text, conversion_tables[source][out])
    # "all": convert every system except the target itself
    del conversion_tables[out]
    for table in conversion_tables.values():
        text = translate(text, table[out])
    return text
376
+
377
+
378
def encode_tashkeel(word, method="ascii"):
    """
    Separate an Arabic word into bare letters plus encoded tashkeel marks.

    The diacritic marks are encoded either as an ASCII string
    (method="ascii") or as a decimal integer (method="decimal"); any
    other method falls back to the ASCII encoding.

    Example:
        >>> import pyarabic.trans
        >>> word1 = u"هَارِبًا"
        >>> pyarabic.trans.encode_tashkeel(word1)
        ('هاربا', 'a0iA0')
        >>> pyarabic.trans.encode_tashkeel(word1, "decimal")
        ('هاربا', 40610)

    @param word: diacritized arabic word
    @type word: unicode
    @param method: "ascii" (default) or "decimal"
    @type method: str
    @return: (letters, encoded marks); on failure returns (word, "")
    @rtype: (unicode, str or int)
    """
    letters, marks = ar.separate(word)

    if method == "decimal":
        transed = translate(marks, T2D_TRANS)
        try:
            transed = int(transed)
        except ValueError:
            # narrowed from a bare except: only int() conversion can fail here
            return word, ""
    else:
        # "ascii" and any unrecognized method use the ASCII encoding
        # (the original duplicated this branch and re-tested the method)
        transed = translate(marks, T2A_TRANS)
    return letters, transed
419
+
420
+
421
def decode_tashkeel(word, marks, method="ascii"):
    """
    Re-join encoded tashkeel marks onto an undiacritized word.

    Inverse of encode_tashkeel(): decodes marks from their decimal or
    ASCII encoding and interleaves them with the word's letters.

    @param word: undiacritized arabic word
    @type word: unicode
    @param marks: encoded marks
    @type marks: unicode/integer
    @param method: "ascii" (default) or "decimal"
    @type method: str
    @return: diacritized word
    @rtype: unicode
    """
    # isinstance instead of the original `type(marks) != (str)` comparison
    if not isinstance(marks, str):
        marks = str(marks)
    # leading zeros are lost in integer form; restore them on the left
    marks = marks.rjust(len(word), "0")
    if method == "decimal":
        transed = translate(marks, D2T_TRANS)
    else:
        # "ascii" and any unrecognized method use the ASCII decoding
        transed = translate(marks, A2T_TRANS)
    return ar.joint(word, transed)
444
+
445
+
446
+ if __name__ == "__main__":
447
+
448
+ words = """qulo
449
+ >aEuw*u
450
+ bi
451
+ rab~i
452
+ {l
453
+ n~aAsi
454
+ maliki
455
+ {l
456
+ n~aAsi
457
+ <ila`hi
458
+ {l
459
+ n~aAsi
460
+ min
461
+ $ar~i
462
+ {lo
463
+ wasowaAsi
464
+ {lo
465
+ xan~aAsi
466
+ {l~a*iY
467
+ yuwasowisu
468
+ fiY
469
+ Suduwri
470
+ {l
471
+ n~aAsi
472
+ mina
473
+ {lo
474
+ jin~api
475
+ wa
476
+ {l
477
+ n~aAsi""".split(
478
+ "\n"
479
+ )
480
+ for word in words:
481
+ arabic = tim2utf8(word)
482
+ timu = utf82tim(arabic)
483
+ sampa = tim2sampa(word)
484
+ arabic2 = convert(word, "tim", "utf")
485
+ timu2 = convert(arabic2, "utf", "tim") # utf82tim(arabic)
486
+ sampa2 = convert(word, "tim", "sampa") # tim2sampa(word)
487
+ print(
488
+ "\t".join(
489
+ [
490
+ word,
491
+ arabic,
492
+ timu,
493
+ sampa,
494
+ arabic2,
495
+ timu2,
496
+ sampa2,
497
+ str(timu == word),
498
+ str(arabic2 == arabic),
499
+ str(timu2 == timu),
500
+ str(sampa2 == sampa),
501
+ ]
502
+ ).encode("utf8")
503
+ )
504
+ # ~ print(u'\t'.join([word, tim2sampa(word)]).encode('utf8'))
505
+
506
+ utf2tim_table = {v: k for k, v in t2a_table.items()}
507
+ print(utf2tim_table)
508
+
509
+ from arabrepr import arepr
510
+
511
+ # test detect language
512
+ text = """السلام عليكم how are you, لم اسمع أخبارك منذ مدة, where are you going"""
513
+ print(arepr(segment_language(text)))
514
+ text_out = delimite_language(text, start="\RL{", end="}")
515
+ print(text_out.encode("utf8"))
516
+ text_out = delimite_language(text, start="<arabic>", end="</arabic>")
517
+ print(text_out.encode("utf8"))