phoonnx 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phoonnx/__init__.py +0 -0
- phoonnx/config.py +490 -0
- phoonnx/locale/ca/phonetic_spellings.txt +2 -0
- phoonnx/locale/en/phonetic_spellings.txt +1 -0
- phoonnx/locale/gl/phonetic_spellings.txt +2 -0
- phoonnx/locale/pt/phonetic_spellings.txt +2 -0
- phoonnx/phoneme_ids.py +453 -0
- phoonnx/phonemizers/__init__.py +45 -0
- phoonnx/phonemizers/ar.py +42 -0
- phoonnx/phonemizers/base.py +216 -0
- phoonnx/phonemizers/en.py +250 -0
- phoonnx/phonemizers/fa.py +46 -0
- phoonnx/phonemizers/gl.py +142 -0
- phoonnx/phonemizers/he.py +67 -0
- phoonnx/phonemizers/ja.py +119 -0
- phoonnx/phonemizers/ko.py +97 -0
- phoonnx/phonemizers/mul.py +606 -0
- phoonnx/phonemizers/vi.py +44 -0
- phoonnx/phonemizers/zh.py +308 -0
- phoonnx/thirdparty/__init__.py +0 -0
- phoonnx/thirdparty/arpa2ipa.py +249 -0
- phoonnx/thirdparty/cotovia/cotovia_aarch64 +0 -0
- phoonnx/thirdparty/cotovia/cotovia_x86_64 +0 -0
- phoonnx/thirdparty/hangul2ipa.py +783 -0
- phoonnx/thirdparty/ko_tables/aspiration.csv +20 -0
- phoonnx/thirdparty/ko_tables/assimilation.csv +31 -0
- phoonnx/thirdparty/ko_tables/double_coda.csv +17 -0
- phoonnx/thirdparty/ko_tables/hanja.tsv +8525 -0
- phoonnx/thirdparty/ko_tables/ipa.csv +22 -0
- phoonnx/thirdparty/ko_tables/neutralization.csv +11 -0
- phoonnx/thirdparty/ko_tables/tensification.csv +56 -0
- phoonnx/thirdparty/ko_tables/yale.csv +22 -0
- phoonnx/thirdparty/kog2p/__init__.py +385 -0
- phoonnx/thirdparty/kog2p/rulebook.txt +212 -0
- phoonnx/thirdparty/mantoq/__init__.py +67 -0
- phoonnx/thirdparty/mantoq/buck/__init__.py +0 -0
- phoonnx/thirdparty/mantoq/buck/phonetise_buckwalter.py +569 -0
- phoonnx/thirdparty/mantoq/buck/symbols.py +64 -0
- phoonnx/thirdparty/mantoq/buck/tokenization.py +105 -0
- phoonnx/thirdparty/mantoq/num2words.py +37 -0
- phoonnx/thirdparty/mantoq/pyarabic/__init__.py +12 -0
- phoonnx/thirdparty/mantoq/pyarabic/arabrepr.py +64 -0
- phoonnx/thirdparty/mantoq/pyarabic/araby.py +1647 -0
- phoonnx/thirdparty/mantoq/pyarabic/named_const.py +227 -0
- phoonnx/thirdparty/mantoq/pyarabic/normalize.py +161 -0
- phoonnx/thirdparty/mantoq/pyarabic/number.py +826 -0
- phoonnx/thirdparty/mantoq/pyarabic/number_const.py +1704 -0
- phoonnx/thirdparty/mantoq/pyarabic/stack.py +52 -0
- phoonnx/thirdparty/mantoq/pyarabic/trans.py +517 -0
- phoonnx/thirdparty/mantoq/unicode_symbol2label.py +4173 -0
- phoonnx/thirdparty/tashkeel/LICENSE +22 -0
- phoonnx/thirdparty/tashkeel/SOURCE +1 -0
- phoonnx/thirdparty/tashkeel/__init__.py +212 -0
- phoonnx/thirdparty/tashkeel/hint_id_map.json +18 -0
- phoonnx/thirdparty/tashkeel/input_id_map.json +56 -0
- phoonnx/thirdparty/tashkeel/model.onnx +0 -0
- phoonnx/thirdparty/tashkeel/target_id_map.json +17 -0
- phoonnx/thirdparty/zh_num.py +238 -0
- phoonnx/util.py +705 -0
- phoonnx/version.py +6 -0
- phoonnx/voice.py +521 -0
- phoonnx-0.0.0.dist-info/METADATA +255 -0
- phoonnx-0.0.0.dist-info/RECORD +86 -0
- phoonnx-0.0.0.dist-info/WHEEL +5 -0
- phoonnx-0.0.0.dist-info/top_level.txt +2 -0
- phoonnx_train/__main__.py +151 -0
- phoonnx_train/export_onnx.py +109 -0
- phoonnx_train/norm_audio/__init__.py +92 -0
- phoonnx_train/norm_audio/trim.py +54 -0
- phoonnx_train/norm_audio/vad.py +54 -0
- phoonnx_train/preprocess.py +420 -0
- phoonnx_train/vits/__init__.py +0 -0
- phoonnx_train/vits/attentions.py +427 -0
- phoonnx_train/vits/commons.py +147 -0
- phoonnx_train/vits/config.py +330 -0
- phoonnx_train/vits/dataset.py +214 -0
- phoonnx_train/vits/lightning.py +352 -0
- phoonnx_train/vits/losses.py +58 -0
- phoonnx_train/vits/mel_processing.py +139 -0
- phoonnx_train/vits/models.py +732 -0
- phoonnx_train/vits/modules.py +527 -0
- phoonnx_train/vits/monotonic_align/__init__.py +20 -0
- phoonnx_train/vits/monotonic_align/setup.py +13 -0
- phoonnx_train/vits/transforms.py +212 -0
- phoonnx_train/vits/utils.py +16 -0
- phoonnx_train/vits/wavfile.py +860 -0
@@ -0,0 +1,105 @@
|
|
1
|
+
from .phonetise_buckwalter import (arabic_to_buckwalter, buckwalter_to_arabic,
|
2
|
+
process_utterance)
|
3
|
+
from .symbols import DOUBLING_TOKEN, EOS_TOKEN, SEPARATOR_TOKEN, symbols
|
4
|
+
|
5
|
+
# Maps every vowel phoneme variant (long/short, upper/lower case, with or
# without a trailing 0/1 digit) to its simplified form.
vowel_map = {
    # long vowels
    "aa": "aa",
    "AA": "aa",
    "uu0": "uu",
    "uu1": "uu",
    "UU0": "uu",
    "UU1": "uu",
    "ii0": "ii",
    "ii1": "ii",
    "II0": "ii",
    "II1": "ii",
    # short vowels
    "a": "a",
    "A": "a",
    "u0": "u",
    "u1": "u",
    "U0": "u",
    "U1": "u",
    "i0": "i",
    "i1": "i",
    "I0": "i",
    "I1": "i",
}

# All vowel variants, in the same order as the keys of ``vowel_map``.
vowels = list(vowel_map)
|
50
|
+
|
51
|
+
# Default lookup table: each entry of ``symbols`` mapped to its position.
phon_to_id_ = dict(zip(symbols, range(len(symbols))))
|
52
|
+
|
53
|
+
|
54
|
+
def tokens_to_ids(phonemes, phon_to_id=None):
    """Map each phoneme to its integer id.

    Args:
        phonemes: Iterable of phoneme tokens.
        phon_to_id: Optional mapping from token to id. When ``None``, the
            module-level ``phon_to_id_`` table is used.

    Returns:
        A list with one id per input token, in input order.

    Raises:
        KeyError: If a token is missing from the mapping in use.
    """
    # Only ``None`` selects the default; an explicitly passed empty mapping
    # is used as given.
    table = phon_to_id_ if phon_to_id is None else phon_to_id
    return [table[token] for token in phonemes]
|
58
|
+
|
59
|
+
|
60
|
+
def ids_to_tokens(ids):
    """Return the entry of ``symbols`` at each index in ``ids``, as a list."""
    return [symbols[index] for index in ids]
|
62
|
+
|
63
|
+
|
64
|
+
def arabic_to_phonemes(arabic):
    """Convert Arabic text to phonemes.

    The text is first passed through ``arabic_to_buckwalter`` and the result
    is handed to ``process_utterance``, whose return value is returned as is.
    """
    return process_utterance(arabic_to_buckwalter(arabic))
|
67
|
+
|
68
|
+
|
69
|
+
def buckwalter_to_phonemes(buckw):
    """Return the result of ``process_utterance`` for the Buckwalter input ``buckw``."""
    phonemes = process_utterance(buckw)
    return phonemes
|
71
|
+
|
72
|
+
|
73
|
+
def phonemes_to_tokens(phonemes: str, append_space=False):
    """Turn a whitespace-separated phoneme string into a list of tokens.

    Every ``"sil"`` substring is removed and every ``"+"`` becomes ``"_+_"``
    before the string is split on whitespace. Then, for each phoneme:

    * a two-character phoneme made of one repeated character that is not a
      vowel is emitted as that single character followed by
      ``DOUBLING_TOKEN``;
    * a phoneme that is a key of ``vowel_map`` is replaced by its mapped
      value.

    Args:
        phonemes: Phoneme string to tokenize.
        append_space: When true, ``SEPARATOR_TOKEN`` is appended just before
            the final ``EOS_TOKEN``.

    Returns:
        A new list of tokens that always ends with ``EOS_TOKEN``.
    """
    tokens = []
    # Build a fresh list rather than inserting into the list being iterated:
    # the doubling marker is emitted verbatim and is never re-examined by the
    # loop. ``vowel_map`` has one key per entry of ``vowels``, so the dict
    # gives the same membership answer with an O(1) lookup.
    for phon in phonemes.replace("sil", "").replace("+", "_+_").split():
        if len(phon) == 2 and phon[0] == phon[1] and phon not in vowel_map:
            base = phon[0]
            tokens.append(vowel_map.get(base, base))
            tokens.append(DOUBLING_TOKEN)
        else:
            tokens.append(vowel_map.get(phon, phon))

    if append_space:
        tokens.append(SEPARATOR_TOKEN)

    tokens.append(EOS_TOKEN)
    return tokens
|
88
|
+
|
89
|
+
|
90
|
+
def buckwalter_to_tokens(buckw, append_space=False):
    """Convert Buckwalter-transliterated text to a token list.

    Chains ``buckwalter_to_phonemes`` and ``phonemes_to_tokens``;
    ``append_space`` is forwarded unchanged to the latter.
    """
    return phonemes_to_tokens(
        buckwalter_to_phonemes(buckw), append_space=append_space
    )
|
94
|
+
|
95
|
+
|
96
|
+
def arabic_to_tokens(arabic, append_space=False):
    """Convert Arabic text to a token list.

    The text is transliterated with ``arabic_to_buckwalter`` and the result
    is tokenized by ``buckwalter_to_tokens``; ``append_space`` is forwarded
    unchanged.
    """
    return buckwalter_to_tokens(
        arabic_to_buckwalter(arabic), append_space=append_space
    )
|
100
|
+
|
101
|
+
|
102
|
+
def simplify_phonemes(phonemes):
    """Apply every ``vowel_map`` substitution to the string ``phonemes``.

    Replacements are plain substring replacements performed one after another
    in the iteration order of ``vowel_map``; the rewritten string is returned.
    """
    simplified = phonemes
    for variant in vowel_map:
        simplified = simplified.replace(variant, vowel_map[variant])
    return simplified
|
@@ -0,0 +1,37 @@
|
|
1
|
+
import re
|
2
|
+
from functools import partial
|
3
|
+
|
4
|
+
from phoonnx.thirdparty.mantoq.pyarabic import araby
|
5
|
+
from phoonnx.thirdparty.mantoq.pyarabic import number as arnum
|
6
|
+
from phoonnx.thirdparty.mantoq.pyarabic.trans import normalize_digits
|
7
|
+
|
8
|
+
# Matches each run of one or more consecutive digit characters.
NUM_REGEX = re.compile(r"\d+")
# Text substituted for "%" when diacritics are not requested.
PERCENT_NO_DIAC = "بالمئة"
# Text substituted for "%" when diacritics (tashkeel) are requested.
PERCENT_DIAC = "بِالْمِئَة"
|
11
|
+
|
12
|
+
|
13
|
+
def _convert_num2words(m: re.Match, *, apply_tashkeel):
    """Regex substitution callback that spells out one matched number.

    Args:
        m: Match whose full text (group 0) is the number to convert.
        apply_tashkeel: When true, the space-separated words are passed
            through ``arnum.pre_tashkeel_number`` and re-joined with spaces.

    Returns:
        The text produced by ``arnum.number2text``, optionally post-processed
        as described above.
    """
    words = arnum.number2text(m.group(0))
    if not apply_tashkeel:
        return words
    return " ".join(arnum.pre_tashkeel_number(words.split(" ")))
|
19
|
+
|
20
|
+
|
21
|
+
def num2words(text: str, handle_percent=True, apply_tashkeel: bool = True) -> str:
    """
    Replace the numbers in `text` with Arabic words.

    This is a simple conversion: it does not check whether a number is part
    of a date, a currency amount, etc.

    Args:
        text: input text that may contain numbers
        handle_percent: when true, every "%" is replaced by a space followed
            by the percent word (diacritized iff `apply_tashkeel`)
        apply_tashkeel: diacritize added words

    Returns:
        The converted text after `araby.fix_spaces` has been applied.
    """
    # Digits are normalized first so that NUM_REGEX sees them.
    converter = partial(_convert_num2words, apply_tashkeel=apply_tashkeel)
    converted = NUM_REGEX.sub(converter, normalize_digits(text))
    if not handle_percent:
        return araby.fix_spaces(converted)
    percent_word = PERCENT_DIAC if apply_tashkeel else PERCENT_NO_DIAC
    return araby.fix_spaces(converted.replace("%", f" {percent_word}"))
|
@@ -0,0 +1,64 @@
|
|
1
|
+
#!/usr/bin/python
|
2
|
+
# -*- coding: UTF-8 -*-
|
3
|
+
"""
|
4
|
+
Improve the predefined repr function to better display objects containing Unicode
|
5
|
+
Unicode representation of texts
|
6
|
+
@author: Taha Zerrouki
|
7
|
+
@contact: taha dot zerrouki at gmail dot com
|
8
|
+
@copyright: Taha Zerrouki
|
9
|
+
@license: GPL
|
10
|
+
@date:2014/03/01
|
11
|
+
@version: 0.1
|
12
|
+
"""
|
13
|
+
from __future__ import (absolute_import, division, print_function,
|
14
|
+
unicode_literals)
|
15
|
+
|
16
|
+
import sys
|
17
|
+
|
18
|
+
if sys.version_info < (3, 0):
|
19
|
+
import repr as reprlib
|
20
|
+
else:
|
21
|
+
import reprlib
|
22
|
+
|
23
|
+
|
24
|
+
class ArabicRepr(reprlib.Repr):
    """A redefinition of the repr function.

    You can use it like this.

    Example:
        >>> import pyarabic.arabrepr as arabrepr
        >>> arepr = arabrepr.ArabicRepr()
        >>> repr = arepr.repr
        >>> word = u"السلام عليكم ورحمة الله"
        >>> wordlist = word.split(" ")
        >>> print wordlist
        [u'\u0627\u0644\u0633\u0644\u0627\u0645',
        u'\u0639\u0644\u064a\u0643\u0645',
        u'\u0648\u0631\u062d\u0645\u0629',
        u'\u0627\u0644\u0644\u0647']
        >>> print repr(wordlist)
        [u'السلام', u'عليكم', u'ورحمة', u'الله']

    NOTE(review): ``reprlib.Repr`` picks its hook from the type name, so on
    Python 3 ``str`` values would be routed to ``repr_str`` rather than
    ``repr_unicode`` -- confirm whether these hooks are still reached.
    """

    def repr_unicode(self, obj, level):
        """Render ``obj`` unescaped inside ``u'...'``; ``level`` is unused."""
        rendered = "u'%s'" % obj
        return rendered

    def arepr_unicode(self, obj, level):
        """Render ``obj`` unescaped inside ``u'...'``; ``level`` is unused."""
        rendered = "u'%s'" % obj
        return rendered
|
50
|
+
|
51
|
+
|
52
|
+
def arepr(data):
    """display a dict with arabic text properly

    Takes ``repr(data)`` and inserts a newline after every ``"},"``.  On
    Python 2 the result is additionally decoded from ``unicode-escape`` and
    encoded as UTF-8; on Python 3 it is returned unchanged.
    """
    text = repr(data).replace("},", "},\n")
    if sys.version_info < (3, 0):
        return text.decode("unicode-escape").encode("utf8")
    return text
|