phoonnx 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phoonnx/__init__.py +0 -0
- phoonnx/config.py +490 -0
- phoonnx/locale/ca/phonetic_spellings.txt +2 -0
- phoonnx/locale/en/phonetic_spellings.txt +1 -0
- phoonnx/locale/gl/phonetic_spellings.txt +2 -0
- phoonnx/locale/pt/phonetic_spellings.txt +2 -0
- phoonnx/phoneme_ids.py +453 -0
- phoonnx/phonemizers/__init__.py +45 -0
- phoonnx/phonemizers/ar.py +42 -0
- phoonnx/phonemizers/base.py +216 -0
- phoonnx/phonemizers/en.py +250 -0
- phoonnx/phonemizers/fa.py +46 -0
- phoonnx/phonemizers/gl.py +142 -0
- phoonnx/phonemizers/he.py +67 -0
- phoonnx/phonemizers/ja.py +119 -0
- phoonnx/phonemizers/ko.py +97 -0
- phoonnx/phonemizers/mul.py +606 -0
- phoonnx/phonemizers/vi.py +44 -0
- phoonnx/phonemizers/zh.py +308 -0
- phoonnx/thirdparty/__init__.py +0 -0
- phoonnx/thirdparty/arpa2ipa.py +249 -0
- phoonnx/thirdparty/cotovia/cotovia_aarch64 +0 -0
- phoonnx/thirdparty/cotovia/cotovia_x86_64 +0 -0
- phoonnx/thirdparty/hangul2ipa.py +783 -0
- phoonnx/thirdparty/ko_tables/aspiration.csv +20 -0
- phoonnx/thirdparty/ko_tables/assimilation.csv +31 -0
- phoonnx/thirdparty/ko_tables/double_coda.csv +17 -0
- phoonnx/thirdparty/ko_tables/hanja.tsv +8525 -0
- phoonnx/thirdparty/ko_tables/ipa.csv +22 -0
- phoonnx/thirdparty/ko_tables/neutralization.csv +11 -0
- phoonnx/thirdparty/ko_tables/tensification.csv +56 -0
- phoonnx/thirdparty/ko_tables/yale.csv +22 -0
- phoonnx/thirdparty/kog2p/__init__.py +385 -0
- phoonnx/thirdparty/kog2p/rulebook.txt +212 -0
- phoonnx/thirdparty/mantoq/__init__.py +67 -0
- phoonnx/thirdparty/mantoq/buck/__init__.py +0 -0
- phoonnx/thirdparty/mantoq/buck/phonetise_buckwalter.py +569 -0
- phoonnx/thirdparty/mantoq/buck/symbols.py +64 -0
- phoonnx/thirdparty/mantoq/buck/tokenization.py +105 -0
- phoonnx/thirdparty/mantoq/num2words.py +37 -0
- phoonnx/thirdparty/mantoq/pyarabic/__init__.py +12 -0
- phoonnx/thirdparty/mantoq/pyarabic/arabrepr.py +64 -0
- phoonnx/thirdparty/mantoq/pyarabic/araby.py +1647 -0
- phoonnx/thirdparty/mantoq/pyarabic/named_const.py +227 -0
- phoonnx/thirdparty/mantoq/pyarabic/normalize.py +161 -0
- phoonnx/thirdparty/mantoq/pyarabic/number.py +826 -0
- phoonnx/thirdparty/mantoq/pyarabic/number_const.py +1704 -0
- phoonnx/thirdparty/mantoq/pyarabic/stack.py +52 -0
- phoonnx/thirdparty/mantoq/pyarabic/trans.py +517 -0
- phoonnx/thirdparty/mantoq/unicode_symbol2label.py +4173 -0
- phoonnx/thirdparty/tashkeel/LICENSE +22 -0
- phoonnx/thirdparty/tashkeel/SOURCE +1 -0
- phoonnx/thirdparty/tashkeel/__init__.py +212 -0
- phoonnx/thirdparty/tashkeel/hint_id_map.json +18 -0
- phoonnx/thirdparty/tashkeel/input_id_map.json +56 -0
- phoonnx/thirdparty/tashkeel/model.onnx +0 -0
- phoonnx/thirdparty/tashkeel/target_id_map.json +17 -0
- phoonnx/thirdparty/zh_num.py +238 -0
- phoonnx/util.py +705 -0
- phoonnx/version.py +6 -0
- phoonnx/voice.py +521 -0
- phoonnx-0.0.0.dist-info/METADATA +255 -0
- phoonnx-0.0.0.dist-info/RECORD +86 -0
- phoonnx-0.0.0.dist-info/WHEEL +5 -0
- phoonnx-0.0.0.dist-info/top_level.txt +2 -0
- phoonnx_train/__main__.py +151 -0
- phoonnx_train/export_onnx.py +109 -0
- phoonnx_train/norm_audio/__init__.py +92 -0
- phoonnx_train/norm_audio/trim.py +54 -0
- phoonnx_train/norm_audio/vad.py +54 -0
- phoonnx_train/preprocess.py +420 -0
- phoonnx_train/vits/__init__.py +0 -0
- phoonnx_train/vits/attentions.py +427 -0
- phoonnx_train/vits/commons.py +147 -0
- phoonnx_train/vits/config.py +330 -0
- phoonnx_train/vits/dataset.py +214 -0
- phoonnx_train/vits/lightning.py +352 -0
- phoonnx_train/vits/losses.py +58 -0
- phoonnx_train/vits/mel_processing.py +139 -0
- phoonnx_train/vits/models.py +732 -0
- phoonnx_train/vits/modules.py +527 -0
- phoonnx_train/vits/monotonic_align/__init__.py +20 -0
- phoonnx_train/vits/monotonic_align/setup.py +13 -0
- phoonnx_train/vits/transforms.py +212 -0
- phoonnx_train/vits/utils.py +16 -0
- phoonnx_train/vits/wavfile.py +860 -0
phoonnx/__init__.py
ADDED
File without changes
|
phoonnx/config.py
ADDED
@@ -0,0 +1,490 @@
|
|
1
|
+
import json
|
2
|
+
from dataclasses import dataclass, field
|
3
|
+
from enum import Enum
|
4
|
+
from typing import Any, Mapping, Optional, Sequence
|
5
|
+
from phoonnx.phoneme_ids import (load_phoneme_ids, BlankBetween,
|
6
|
+
DEFAULT_BLANK_WORD_TOKEN, DEFAULT_BLANK_TOKEN,
|
7
|
+
DEFAULT_PAD_TOKEN, DEFAULT_BOS_TOKEN, DEFAULT_EOS_TOKEN)
|
8
|
+
|
9
|
+
|
10
|
+
# Default inference settings. They are used as the VoiceConfig field defaults
# and as fallbacks when a model config carries no value in its "inference"
# section.
DEFAULT_NOISE_SCALE = 0.667  # amount of generator noise
DEFAULT_LENGTH_SCALE = 1.0  # phoneme length scale (< 1 is faster, > 1 is slower)
DEFAULT_NOISE_W_SCALE = 0.8  # amount of phoneme width noise
|
13
|
+
|
14
|
+
try:
|
15
|
+
from ovos_utils.log import LOG
|
16
|
+
except ImportError:
|
17
|
+
import logging
|
18
|
+
LOG = logging.getLogger(__name__)
|
19
|
+
|
20
|
+
|
21
|
+
class Alphabet(str, Enum):
    """Symbol sets a phonemizer can emit.

    Members are ``str`` subclasses, so they compare equal to their plain
    string value (e.g. ``Alphabet.IPA == "ipa"``).
    """

    # language independent
    UNICODE = "unicode"
    IPA = "ipa"

    # English
    ARPA = "arpa"

    # Korean
    HANGUL = "hangul"

    # Japanese scripts
    KANA = "kana"
    HIRA = "hira"

    # Japanese romanizations
    HEPBURN = "hepburn"
    KUNREI = "kunrei"
    NIHON = "nihon"

    # Chinese
    PINYIN = "pinyin"

    # Persian
    ERAAB = "eraab"

    # Galician
    COTOVIA = "cotovia"

    # Chinese
    HANZI = "hanzi"
|
35
|
+
|
36
|
+
|
37
|
+
|
38
|
+
class PhonemeType(str, Enum):
    """Identifiers of the supported phonemization strategies.

    Members are ``str`` subclasses, so they compare equal to their plain
    string value (e.g. ``PhonemeType.ESPEAK == "espeak"``).
    """

    # --- no real phonemization ---
    RAW = "raw"  # input already consists of phonemes
    UNICODE = "unicode"  # unicode codepoints
    GRAPHEMES = "graphemes"  # plain text characters

    # --- general purpose backends ---
    MISAKI = "misaki"
    ESPEAK = "espeak"
    GRUUT = "gruut"
    EPITRAN = "epitran"
    BYT5 = "byt5"
    # technically the same as byt5, but whitespace needs special handling
    CHARSIU = "charsiu"

    # --- English ---
    DEEPPHONEMIZER = "deepphonemizer"
    OPENPHONEMIZER = "openphonemizer"
    G2PEN = "g2pen"

    # --- language specific backends ---
    G2PFA = "g2pfa"
    OPENJTALK = "openjtalk"  # Japanese
    CUTLET = "cutlet"  # Japanese
    PYKAKASI = "pykakasi"  # Japanese
    COTOVIA = "cotovia"  # Galician (output is not IPA!)
    PHONIKUD = "phonikud"  # Hebrew
    MANTOQ = "mantoq"  # Arabic
    VIPHONEME = "viphoneme"  # Vietnamese
    G2PK = "g2pk"  # Korean
    KOG2PK = "kog2p"  # Korean
    G2PC = "g2pc"  # Chinese
    G2PM = "g2pm"  # Chinese
    PYPINYIN = "pypinyin"  # Chinese
    XPINYIN = "xpinyin"  # Chinese
    JIEBA = "jieba"  # Chinese (not a real phonemizer!)
|
69
|
+
|
70
|
+
|
71
|
+
@dataclass
class VoiceConfig:
    """TTS model configuration.

    Instances are normally built with :meth:`from_dict`, which recognises
    Piper, Mimic3, Coqui VITS (including the Cotovia variant) and native
    phoonnx config layouts.
    """

    num_symbols: int
    """Number of phonemes."""

    num_speakers: int
    """Number of speakers."""

    num_langs: int
    """Number of langs."""

    sample_rate: int
    """Sample rate of output audio."""

    lang_code: Optional[str]
    """Name of espeak-ng voice or alphabet. Replaced by "und" when falsy."""

    phoneme_id_map: Optional[Mapping[str, Sequence[int]]]
    """Phoneme -> [id,]. Used for phoneme-based models."""

    phoneme_type: PhonemeType
    """espeak, byt5, text, cotovia, or graphemes."""

    alphabet: Optional[Alphabet]
    """Symbol set produced by the phonemizer."""

    phonemizer_model: Optional[str]
    """for phonemizers that allow changing base model """

    speaker_id_map: Mapping[str, int] = field(default_factory=dict)
    """Speaker -> id"""

    lang_id_map: Mapping[str, int] = field(default_factory=dict)
    """lang-code -> id"""

    # Inference settings
    length_scale: float = DEFAULT_LENGTH_SCALE
    noise_scale: float = DEFAULT_NOISE_SCALE
    noise_w_scale: float = DEFAULT_NOISE_W_SCALE

    # tokenization settings
    blank_at_start: bool = True
    blank_at_end: bool = True
    include_whitespace: Optional[bool] = True
    pad_token: Optional[str] = DEFAULT_PAD_TOKEN
    # was DEFAULT_PAD_TOKEN; use the dedicated blank constant, consistent with
    # what from_dict() assigns for piper models
    blank_token: Optional[str] = DEFAULT_BLANK_TOKEN
    bos_token: Optional[str] = DEFAULT_BOS_TOKEN
    eos_token: Optional[str] = DEFAULT_EOS_TOKEN
    word_sep_token: Optional[str] = DEFAULT_BLANK_WORD_TOKEN
    blank_between: BlankBetween = BlankBetween.TOKENS_AND_WORDS

    def __post_init__(self):
        # "und" (undetermined) stands in for a missing language code
        self.lang_code = self.lang_code or "und"

    @staticmethod
    def is_mimic3(config: dict[str, Any]) -> bool:
        """Return True if ``config`` looks like a Mimic3 model config.

        A Mimic3 config has a string "phonemizer" entry holding one of the
        Mimic3 phonemizer names and a "phonemes" dict with token info.
        """
        # https://huggingface.co/mukowaty/mimic3-voices

        # mimic3 models indicate a phonemizer strategy in their config
        if ("phonemizer" not in config or
                not isinstance(config["phonemizer"], str)):
            return False

        # mimic3 models include a "phonemes" section with token info
        if "phonemes" not in config or not isinstance(config["phonemes"], dict):
            return False

        # validate phonemizer type as expected by mimic3
        phonemizer = config["phonemizer"]
        # class Phonemizer(str, Enum):
        #     SYMBOLS = "symbols"
        #     GRUUT = "gruut"
        #     ESPEAK = "espeak"
        #     EPITRAN = "epitran"
        if phonemizer not in ["symbols", "gruut", "espeak", "epitran"]:
            return False

        return True

    @staticmethod
    def is_piper(config: dict[str, Any]) -> bool:
        """Return True if ``config`` looks like a Piper model config.

        A "piper_version" key is accepted on its own. Otherwise the config
        needs a string "phoneme_type" of "text" or "espeak" plus a
        "phoneme_id_map" dict.
        """
        if "piper_version" in config:
            return True
        # piper models indicate a phonemizer strategy in their config
        if ("phoneme_type" not in config or
                not isinstance(config["phoneme_type"], str)):
            return False

        # piper models include a "phoneme_id_map" section mapping phonemes to int
        if "phoneme_id_map" not in config or not isinstance(config["phoneme_id_map"], dict):
            return False

        # validate phonemizer type as expected by piper
        phonemizer = config["phoneme_type"]
        if phonemizer not in ["text", "espeak"]:
            return False

        return True

    @staticmethod
    def is_coqui_vits(config: dict[str, Any]) -> bool:
        """Return True if ``config`` looks like a Coqui VITS grapheme config.

        Requires a "characters" dict whose "characters_class" is one of the
        known Coqui character classes.
        """
        # coqui vits grapheme models include a "characters" section with token info
        if "characters" not in config or not isinstance(config["characters"], dict):
            return False

        # double check this was trained with coqui
        if config["characters"].get("characters_class", "") not in ["TTS.tts.models.vits.VitsCharacters",
                                                                     "TTS.tts.utils.text.characters.Graphemes"]:
            return False

        return True

    @staticmethod
    def is_phoonnx(config: dict[str, Any]) -> bool:
        """Return True if ``config`` looks like a native phoonnx config.

        Requires a "lang_code" key and a string "phoneme_type" that matches a
        ``PhonemeType`` value.
        """
        # phoonnx models indicate a phonemizer strategy in their config
        if ("phoneme_type" not in config or
                not isinstance(config["phoneme_type"], str)):
            return False

        if "lang_code" not in config:
            return False

        # validate phonemizer type as expected
        # NOTE: membership is tested against a list on purpose: it relies on
        # "==" between str and the str-based enum members, not on hashing
        phonemizer = config["phoneme_type"]
        if phonemizer not in list(PhonemeType):
            return False

        return True

    @staticmethod
    def is_cotovia(config: dict[str, Any]) -> bool:
        """Return True if ``config`` is a Coqui VITS model using Cotovia.

        The config must satisfy both :meth:`is_coqui_vits` and
        :meth:`is_phoonnx` and declare ``phoneme_type`` "cotovia".
        """
        # no way to determine unless explicitly configured unfortunately
        # afaik only the sabela galician model uses this
        # will fallback to coqui "graphemes" if "cotovia" not specified,
        # this will work but will make mistakes
        if (not VoiceConfig.is_coqui_vits(config)
                or not VoiceConfig.is_phoonnx(config)):
            return False

        return config["phoneme_type"] == PhonemeType.COTOVIA

    @staticmethod
    def from_dict(config: dict[str, Any],
                  phonemes_txt: Optional[str] = None,
                  lang_code: Optional[str] = None,
                  phoneme_type_str: Optional[str] = None) -> "VoiceConfig":
        """Load configuration from a dictionary.

        Args:
            config: parsed model config (Piper, Mimic3, Coqui VITS or phoonnx
                layout). It is not modified.
            phonemes_txt: optional path to a phoneme id map, either a ".txt"
                file parsed with ``load_phoneme_ids`` or a ".json" file.
                Required for Mimic3 models.
            lang_code: optional language code override.
            phoneme_type_str: optional phoneme type override; only honoured
                when the config is not detected as Piper, Mimic3 or Coqui.

        Returns:
            The populated ``VoiceConfig``.

        Raises:
            ValueError: if a Mimic3 config is given without ``phonemes_txt``,
                or if no valid phoneme type could be determined.
        """
        # The branches below write normalized keys into the config; work on
        # copies so the caller's dict (and its "characters" section) stay intact.
        config = dict(config)

        blank_type = BlankBetween.TOKENS_AND_WORDS
        lang_code = lang_code or config.get("lang_code")
        phoneme_type_str = phoneme_type_str or config.get("phoneme_type")
        phoneme_id_map = config.get("phoneme_id_map")
        alphabet = config.get("alphabet")

        if phonemes_txt:
            if phonemes_txt.endswith(".txt"):
                # either from mimic3 models or as an override at runtime
                with open(phonemes_txt, "r", encoding="utf-8") as ids_file:
                    phoneme_id_map = load_phoneme_ids(ids_file)
            elif phonemes_txt.endswith(".json"):
                # phoneme maps contain non-ascii symbols; do not depend on
                # the platform default encoding
                with open(phonemes_txt, "r", encoding="utf-8") as ids_file:
                    phoneme_id_map = json.load(ids_file)

        # check if model was trained for PiperTTS
        if VoiceConfig.is_piper(config):
            lang_code = lang_code or (config.get("language", {}).get("code") or
                                      config.get("espeak", {}).get("voice"))
            phoneme_type_str = config.get("phoneme_type", PhonemeType.ESPEAK.value)
            if phoneme_type_str == "text":
                phoneme_type_str = PhonemeType.UNICODE.value
                alphabet = Alphabet.UNICODE
            else:
                alphabet = Alphabet.IPA

            # not configurable in piper
            config["pad"] = DEFAULT_PAD_TOKEN
            config["blank"] = DEFAULT_BLANK_TOKEN
            config["bos"] = DEFAULT_BOS_TOKEN
            config["eos"] = DEFAULT_EOS_TOKEN

        # check if model was trained for Mimic3
        elif VoiceConfig.is_mimic3(config):
            if not phonemes_txt:
                raise ValueError("mimic3 models require an external phonemes.txt file in addition to the config")
            # an explicit lang_code argument wins, as in the other branches
            lang_code = lang_code or config.get("text_language")
            phoneme_type_str = config.get("phonemizer", PhonemeType.GRUUT.value)
            # read phoneme settings
            phoneme_cfg = config.get("phonemes", {})
            blank_type = BlankBetween(phoneme_cfg.get("blank_between", "tokens_and_words"))
            # hoist the token settings to the top level, where they are read below
            config.update(phoneme_cfg)

            if phoneme_type_str == "symbols":
                # Mimic3 "symbols" models are grapheme models
                # symbol map comes from phonemes_txt
                phoneme_type_str = PhonemeType.GRAPHEMES.value
                alphabet = Alphabet.UNICODE
            else:
                alphabet = Alphabet.IPA

        # check if model was trained with Coqui
        # NOTE: cotovia is included here
        elif VoiceConfig.is_coqui_vits(config):
            if VoiceConfig.is_cotovia(config):
                phoneme_type_str = PhonemeType.COTOVIA.value
                alphabet = Alphabet.COTOVIA
            else:
                phoneme_type_str = PhonemeType.GRAPHEMES.value
                alphabet = Alphabet.UNICODE

            # NOTE: lang code usually not provided and often wrong :(
            ds = config.get("datasets", [])
            if ds and not lang_code:
                lang_code = ds[0].get("language")

            # copy: the "blank" fallback below must not leak into the caller's dict
            characters_config = dict(config.get("characters", {}))
            if config.get("add_blank", True):
                blank_type = BlankBetween.TOKENS
                characters_config["blank"] = characters_config.get("blank") or "<BLNK>"
            # hoist the token settings to the top level, where they are read
            # below; this also replaces config["characters"] with the
            # characters entry of that section, when it has one
            config.update(characters_config)
            # For Coqui VITS grapheme models, build phoneme_id_map from characters
            characters = characters_config.get("characters")
            punctuations = characters_config.get("punctuations")

            if not config.get("enable_eos_bos_chars", True):
                config["bos"] = config["eos"] = None

            # Build the vocabulary in the order
            # [PAD, PUNCTUATIONS, CHARACTERS, BLANK]
            # NOTE(review): presumably mirrors the vocabulary layout of the
            # Coqui characters class - confirm against Coqui TTS
            vocab_list = []

            if characters_config.get("pad") is not None:
                vocab_list.append(characters_config["pad"])

            # bos/eos are deliberately left out of the vocabulary:
            # no coqui model adding them has been seen so far

            if punctuations:
                vocab_list.extend(list(punctuations))
            if characters:
                vocab_list.extend(list(characters))

            if characters_config.get("blank") is not None:
                vocab_list.append(characters_config["blank"])

            # a symbol's id is its position in the vocabulary; no
            # de-duplication is done, a repeated symbol keeps its last position
            phoneme_id_map = {char: idx for idx, char in enumerate(vocab_list)}

        if phoneme_type_str is None:
            # PhonemeType(None) would raise the same exception type with a
            # far less helpful message
            raise ValueError("could not determine the phoneme type: "
                             "config has no 'phoneme_type' and none was provided")
        phoneme_type = PhonemeType(phoneme_type_str)
        LOG.debug(f"phonemizer: {phoneme_type}")
        inference = config.get("inference", {})

        # "or" fallbacks guard against keys that are present but set to None
        include_whitespace = (" " in (config.get("characters") or "")
                              or " " in (config.get("phoneme_id_map") or {}))
        return VoiceConfig(
            num_langs=config.get("num_langs", 1),
            num_symbols=config.get("num_symbols", 256),
            num_speakers=config.get("num_speakers", 1),
            sample_rate=config.get("audio", {}).get("sample_rate", 16000),
            noise_scale=inference.get("noise_scale", DEFAULT_NOISE_SCALE),
            length_scale=inference.get("length_scale", DEFAULT_LENGTH_SCALE),
            noise_w_scale=inference.get("noise_w", DEFAULT_NOISE_W_SCALE),
            lang_code=lang_code,
            alphabet=alphabet,
            phonemizer_model=config.get("phonemizer_model"),
            phoneme_id_map=phoneme_id_map,
            phoneme_type=phoneme_type,
            speaker_id_map=config.get("speaker_id_map", {}),
            # previously never read from the config, unlike speaker_id_map
            lang_id_map=config.get("lang_id_map", {}),
            blank_between=blank_type,
            include_whitespace=include_whitespace,
            blank_at_start=config.get("blank_at_start", True),
            blank_at_end=config.get("blank_at_end", True),
            pad_token=config.get("pad"),
            blank_token=config.get("blank"),
            bos_token=config.get("bos"),
            eos_token=config.get("eos"),
            word_sep_token=config.get("word_sep_token") or config.get("blank_word", " ")
        )
|
354
|
+
|
355
|
+
|
356
|
+
@dataclass
class SynthesisConfig:
    """Per-request synthesis options.

    Every field is optional; the scale fields default to ``None``.
    """

    # index of the speaker to use (multi-speaker voices only)
    speaker_id: Optional[int] = None

    # index of the language to use (multi-lang voices only)
    lang_id: Optional[int] = None

    # phoneme length scale (< 1 is faster, > 1 is slower)
    length_scale: Optional[float] = None

    # amount of generator noise to add
    noise_scale: Optional[float] = None

    # amount of phoneme width noise to add
    noise_w_scale: Optional[float] = None

    # enable/disable scaling audio samples to fit the full range
    normalize_audio: bool = True

    # multiplier for audio samples (< 1 is quieter, > 1 is louder)
    volume: float = 1.0

    # NOTE(review): presumably toggles the per-locale phonetic_spellings.txt
    # substitutions - confirm against the code that consumes this flag
    enable_phonetic_spellings: bool = True
|
382
|
+
|
383
|
+
|
384
|
+
def get_phonemizer(phoneme_type: PhonemeType,
                   alphabet: Alphabet = Alphabet.IPA,
                   model: Optional[str] = None) -> 'Phonemizer':
    """Instantiate the phonemizer that implements ``phoneme_type``.

    Args:
        phoneme_type: which phonemization strategy to build.
        alphabet: forwarded as ``alphabet=`` to the phonemizers that take one;
            ignored by the others.
        model: forwarded positionally to the phonemizers that take a base
            model; ignored by the others.

    Returns:
        A newly constructed phonemizer instance.

    Raises:
        ValueError: if ``phoneme_type`` matches no known phonemizer.
    """
    # imported lazily, exactly as the if/elif version did
    from phoonnx.phonemizers import (EpitranPhonemizer, EspeakPhonemizer, OpenPhonemizer, OpenJTaklPhonemizer,
                                     ByT5Phonemizer, CharsiuPhonemizer, DeepPhonemizer, PersianPhonemizer,
                                     G2pCPhonemizer, G2pMPhonemizer, G2PKPhonemizer, G2PEnPhonemizer,
                                     GruutPhonemizer, GraphemePhonemizer, MantoqPhonemizer, MisakiPhonemizer,
                                     KoG2PPhonemizer, PypinyinPhonemizer, PyKakasiPhonemizer, CotoviaPhonemizer,
                                     CutletPhonemizer, PhonikudPhonemizer, VIPhonemePhonemizer, XpinyinPhonemizer,
                                     UnicodeCodepointPhonemizer, JiebaPhonemizer, RawPhonemes)

    # phonemizers built without any argument
    takes_nothing = (
        (PhonemeType.ESPEAK, EspeakPhonemizer),
        (PhonemeType.GRUUT, GruutPhonemizer),
        (PhonemeType.EPITRAN, EpitranPhonemizer),
        (PhonemeType.MISAKI, MisakiPhonemizer),
        (PhonemeType.OPENPHONEMIZER, OpenPhonemizer),
        (PhonemeType.PHONIKUD, PhonikudPhonemizer),
        (PhonemeType.MANTOQ, MantoqPhonemizer),
        (PhonemeType.VIPHONEME, VIPhonemePhonemizer),
        (PhonemeType.JIEBA, JiebaPhonemizer),
        (PhonemeType.COTOVIA, CotoviaPhonemizer),
        (PhonemeType.UNICODE, UnicodeCodepointPhonemizer),
        (PhonemeType.GRAPHEMES, GraphemePhonemizer),
        (PhonemeType.RAW, RawPhonemes),
    )
    # phonemizers built from the (positional) base model
    takes_model = (
        (PhonemeType.BYT5, ByT5Phonemizer),
        (PhonemeType.CHARSIU, CharsiuPhonemizer),
        (PhonemeType.DEEPPHONEMIZER, DeepPhonemizer),
    )
    # phonemizers built with an ``alphabet=`` keyword
    takes_alphabet = (
        (PhonemeType.G2PEN, G2PEnPhonemizer),
        (PhonemeType.OPENJTALK, OpenJTaklPhonemizer),
        (PhonemeType.PYKAKASI, PyKakasiPhonemizer),
        (PhonemeType.CUTLET, CutletPhonemizer),
        (PhonemeType.G2PFA, PersianPhonemizer),
        (PhonemeType.KOG2PK, KoG2PPhonemizer),
        (PhonemeType.G2PK, G2PKPhonemizer),
        (PhonemeType.PYPINYIN, PypinyinPhonemizer),
        (PhonemeType.XPINYIN, XpinyinPhonemizer),
        (PhonemeType.G2PC, G2pCPhonemizer),
        (PhonemeType.G2PM, G2pMPhonemizer),
    )

    # Matching uses "==" rather than a dict lookup so plain strings such as
    # "espeak" keep matching their enum member.
    for kind, factory in takes_nothing:
        if phoneme_type == kind:
            return factory()
    for kind, factory in takes_model:
        if phoneme_type == kind:
            return factory(model)
    for kind, factory in takes_alphabet:
        if phoneme_type == kind:
            return factory(alphabet=alphabet)

    raise ValueError("invalid phonemizer")
|
451
|
+
|
452
|
+
|
453
|
+
|
454
|
+
if __name__ == "__main__":
    # Manual smoke test: run the format detectors and from_dict() over a set
    # of sample configs. The paths are developer-local; files that do not
    # exist on this machine are skipped instead of aborting the whole run.
    base = "/home/miro/PycharmProjects/phoonnx_tts"
    mimic3_phonemes = f"{base}/mimic3_ap/phonemes.txt"

    # (config file, optional external phoneme map) - kept as pairs so a
    # config can never be matched with the wrong phonemes file
    test_cases = [
        (f"{base}/sabela_cotovia_vits.json", None),
        (f"{base}/celtia_vits.json", None),
        (f"{base}/mimic3_gruut.json", mimic3_phonemes),
        (f"{base}/mimic3_espeak.json", mimic3_phonemes),
        (f"{base}/mimic3_epitran.json", mimic3_phonemes),
        (f"{base}/mimic3_symbols.json", mimic3_phonemes),
        (f"{base}/piper_espeak.json", None),
        (f"{base}/vits-coqui-pt-cv/config.json", None),
        (f"{base}/phonikud/model.config.json", None),
    ]
    print("Testing model config file parsing\n###############")
    for cfile, phonemes_txt in test_cases:
        print(f"\nConfig file: {cfile}")
        try:
            with open(cfile, encoding="utf-8") as f:
                config = json.load(f)
        except FileNotFoundError:
            print("Skipped: file not found")
            continue
        print("Mimic3:", VoiceConfig.is_mimic3(config))
        print("Piper:", VoiceConfig.is_piper(config))
        print("Coqui:", VoiceConfig.is_coqui_vits(config))
        print("Cotovia:", VoiceConfig.is_cotovia(config))
        print("Phoonx:", VoiceConfig.is_phoonnx(config))
        cfg = VoiceConfig.from_dict(config, phonemes_txt)
        print(cfg)
|
489
|
+
|
490
|
+
|
@@ -0,0 +1 @@
|
|
1
|
+
OpenVoiceOS: Open Voice O S
|