phoonnx 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phoonnx/__init__.py +0 -0
- phoonnx/config.py +490 -0
- phoonnx/locale/ca/phonetic_spellings.txt +2 -0
- phoonnx/locale/en/phonetic_spellings.txt +1 -0
- phoonnx/locale/gl/phonetic_spellings.txt +2 -0
- phoonnx/locale/pt/phonetic_spellings.txt +2 -0
- phoonnx/phoneme_ids.py +453 -0
- phoonnx/phonemizers/__init__.py +45 -0
- phoonnx/phonemizers/ar.py +42 -0
- phoonnx/phonemizers/base.py +216 -0
- phoonnx/phonemizers/en.py +250 -0
- phoonnx/phonemizers/fa.py +46 -0
- phoonnx/phonemizers/gl.py +142 -0
- phoonnx/phonemizers/he.py +67 -0
- phoonnx/phonemizers/ja.py +119 -0
- phoonnx/phonemizers/ko.py +97 -0
- phoonnx/phonemizers/mul.py +606 -0
- phoonnx/phonemizers/vi.py +44 -0
- phoonnx/phonemizers/zh.py +308 -0
- phoonnx/thirdparty/__init__.py +0 -0
- phoonnx/thirdparty/arpa2ipa.py +249 -0
- phoonnx/thirdparty/cotovia/cotovia_aarch64 +0 -0
- phoonnx/thirdparty/cotovia/cotovia_x86_64 +0 -0
- phoonnx/thirdparty/hangul2ipa.py +783 -0
- phoonnx/thirdparty/ko_tables/aspiration.csv +20 -0
- phoonnx/thirdparty/ko_tables/assimilation.csv +31 -0
- phoonnx/thirdparty/ko_tables/double_coda.csv +17 -0
- phoonnx/thirdparty/ko_tables/hanja.tsv +8525 -0
- phoonnx/thirdparty/ko_tables/ipa.csv +22 -0
- phoonnx/thirdparty/ko_tables/neutralization.csv +11 -0
- phoonnx/thirdparty/ko_tables/tensification.csv +56 -0
- phoonnx/thirdparty/ko_tables/yale.csv +22 -0
- phoonnx/thirdparty/kog2p/__init__.py +385 -0
- phoonnx/thirdparty/kog2p/rulebook.txt +212 -0
- phoonnx/thirdparty/mantoq/__init__.py +67 -0
- phoonnx/thirdparty/mantoq/buck/__init__.py +0 -0
- phoonnx/thirdparty/mantoq/buck/phonetise_buckwalter.py +569 -0
- phoonnx/thirdparty/mantoq/buck/symbols.py +64 -0
- phoonnx/thirdparty/mantoq/buck/tokenization.py +105 -0
- phoonnx/thirdparty/mantoq/num2words.py +37 -0
- phoonnx/thirdparty/mantoq/pyarabic/__init__.py +12 -0
- phoonnx/thirdparty/mantoq/pyarabic/arabrepr.py +64 -0
- phoonnx/thirdparty/mantoq/pyarabic/araby.py +1647 -0
- phoonnx/thirdparty/mantoq/pyarabic/named_const.py +227 -0
- phoonnx/thirdparty/mantoq/pyarabic/normalize.py +161 -0
- phoonnx/thirdparty/mantoq/pyarabic/number.py +826 -0
- phoonnx/thirdparty/mantoq/pyarabic/number_const.py +1704 -0
- phoonnx/thirdparty/mantoq/pyarabic/stack.py +52 -0
- phoonnx/thirdparty/mantoq/pyarabic/trans.py +517 -0
- phoonnx/thirdparty/mantoq/unicode_symbol2label.py +4173 -0
- phoonnx/thirdparty/tashkeel/LICENSE +22 -0
- phoonnx/thirdparty/tashkeel/SOURCE +1 -0
- phoonnx/thirdparty/tashkeel/__init__.py +212 -0
- phoonnx/thirdparty/tashkeel/hint_id_map.json +18 -0
- phoonnx/thirdparty/tashkeel/input_id_map.json +56 -0
- phoonnx/thirdparty/tashkeel/model.onnx +0 -0
- phoonnx/thirdparty/tashkeel/target_id_map.json +17 -0
- phoonnx/thirdparty/zh_num.py +238 -0
- phoonnx/util.py +705 -0
- phoonnx/version.py +6 -0
- phoonnx/voice.py +521 -0
- phoonnx-0.0.0.dist-info/METADATA +255 -0
- phoonnx-0.0.0.dist-info/RECORD +86 -0
- phoonnx-0.0.0.dist-info/WHEEL +5 -0
- phoonnx-0.0.0.dist-info/top_level.txt +2 -0
- phoonnx_train/__main__.py +151 -0
- phoonnx_train/export_onnx.py +109 -0
- phoonnx_train/norm_audio/__init__.py +92 -0
- phoonnx_train/norm_audio/trim.py +54 -0
- phoonnx_train/norm_audio/vad.py +54 -0
- phoonnx_train/preprocess.py +420 -0
- phoonnx_train/vits/__init__.py +0 -0
- phoonnx_train/vits/attentions.py +427 -0
- phoonnx_train/vits/commons.py +147 -0
- phoonnx_train/vits/config.py +330 -0
- phoonnx_train/vits/dataset.py +214 -0
- phoonnx_train/vits/lightning.py +352 -0
- phoonnx_train/vits/losses.py +58 -0
- phoonnx_train/vits/mel_processing.py +139 -0
- phoonnx_train/vits/models.py +732 -0
- phoonnx_train/vits/modules.py +527 -0
- phoonnx_train/vits/monotonic_align/__init__.py +20 -0
- phoonnx_train/vits/monotonic_align/setup.py +13 -0
- phoonnx_train/vits/transforms.py +212 -0
- phoonnx_train/vits/utils.py +16 -0
- phoonnx_train/vits/wavfile.py +860 -0
phoonnx/version.py
ADDED
phoonnx/voice.py
ADDED
@@ -0,0 +1,521 @@
|
|
1
|
+
import json
|
2
|
+
import os.path
|
3
|
+
import re
|
4
|
+
import wave
|
5
|
+
from dataclasses import dataclass, field
|
6
|
+
from pathlib import Path
|
7
|
+
from typing import Any, Iterable, Optional, Union, Dict
|
8
|
+
|
9
|
+
import numpy as np
|
10
|
+
import onnxruntime
|
11
|
+
from langcodes import closest_match
|
12
|
+
|
13
|
+
from phoonnx.config import PhonemeType, VoiceConfig, SynthesisConfig, get_phonemizer
|
14
|
+
from phoonnx.phoneme_ids import phonemes_to_ids, BlankBetween
|
15
|
+
from phoonnx.phonemizers import Phonemizer
|
16
|
+
from phoonnx.phonemizers.base import PhonemizedChunks
|
17
|
+
from phoonnx.thirdparty.tashkeel import TashkeelDiacritizer
|
18
|
+
|
19
|
+
# Matches inline "[[ ... ]]" spans (non-greedy). The capture group makes
# re.split() keep the matched spans in its output, interleaved with the
# surrounding plain text.
_PHONEME_BLOCK_PATTERN = re.compile(r"(\[\[.*?\]\])")

# Use the ovos_utils logger when that package is installed; otherwise fall
# back to a standard-library logger named after this module.
try:
    from ovos_utils.log import LOG
except ImportError:
    import logging

    LOG = logging.getLogger(__name__)
    # NOTE(review): the fallback logger is forced to DEBUG at import time,
    # overriding whatever level the host application would otherwise apply
    # to this logger -- confirm this is intended for a library module.
    LOG.setLevel("DEBUG")
|
28
|
+
|
29
|
+
|
30
|
+
@dataclass
class PhoneticSpellings:
    """Word-level replacement table applied to text before phonemization.

    ``replacements`` maps a word to the spelling that is substituted for it.
    """

    replacements: Dict[str, str] = field(default_factory=dict)

    @staticmethod
    def from_lang(lang: str, locale_path: str = f"{os.path.dirname(__file__)}/locale"):
        """
        Load the spellings file of the locale directory closest to ``lang``.

        :param lang: Language code to look up.
        :param locale_path: Directory containing one sub-directory per language.
        :return: PhoneticSpellings read from
            ``<locale_path>/<match>/phonetic_spellings.txt``.
        :raises FileNotFoundError: If no sub-directory is within a match
            distance of 10 from ``lang``, or if ``locale_path`` or the
            spellings file does not exist.
        """
        langs = os.listdir(locale_path)
        lang2, distance = closest_match(lang, langs)
        if distance <= 10:
            spellings_file = f"{locale_path}/{lang2}/phonetic_spellings.txt"
            return PhoneticSpellings.from_path(spellings_file)
        raise FileNotFoundError(f"Spellings file for '{lang}' not found")

    @staticmethod
    def from_path(spellings_file: str):
        """
        Parse a spellings file made of ``word: spelling`` lines.

        Only the first ``:`` on a line separates word from spelling, so the
        spelling itself may contain colons. Blank lines, lines without a
        ``:`` and lines with an empty word are skipped instead of aborting
        the whole load.

        :param spellings_file: Path to the UTF-8 encoded spellings file.
        :return: PhoneticSpellings holding the parsed replacements.
        """
        replacements = {}
        # Explicit UTF-8: the files hold non-ASCII words and must not depend
        # on the platform default encoding.
        with open(spellings_file, encoding="utf-8") as f:
            for line in f:
                word, separator, spelling = line.partition(":")
                word = word.strip()
                if not separator or not word:
                    # Blank / malformed line (e.g. the trailing newline).
                    continue
                replacements[word] = spelling.strip()
        return PhoneticSpellings(replacements)

    def apply(self, text: str) -> str:
        """
        Replace every whole-word, case-insensitive occurrence of each key.

        :param text: Input text.
        :return: Text with all configured replacements applied, in the
            iteration order of ``replacements``.
        """
        for word, spelling in self.replacements.items():
            # Word boundaries keep "cat" from matching inside "concatenate".
            pattern = r"\b" + re.escape(word) + r"\b"
            # A callable replacement inserts the spelling literally; passing
            # the string directly would make re.sub interpret backslashes
            # and group references inside it.
            text = re.sub(
                pattern,
                lambda _match, literal=spelling: literal,
                text,
                flags=re.IGNORECASE,
            )
        return text
|
60
|
+
|
61
|
+
|
62
|
+
@dataclass
class AudioChunk:
    """One block of synthesized audio plus the format needed to play it."""

    # Sampling rate of the chunk, in Hertz.
    sample_rate: int

    # Size of one sample, in bytes.
    sample_width: int

    # Number of audio channels.
    sample_channels: int

    # Samples as a float numpy array in [-1, 1].
    audio_float_array: np.ndarray

    # Lazily filled cache for the int16 conversion below.
    _audio_int16_array: Optional[np.ndarray] = None
    # Declared for a bytes cache; not read or written anywhere in this class.
    _audio_int16_bytes: Optional[bytes] = None
    # Scale factor (and clipping bound) used when converting to int16.
    _MAX_WAV_VALUE: float = 32767.0

    @property
    def audio_int16_array(self) -> np.ndarray:
        """
        Audio converted to int16 samples.

        The float samples are scaled by ``_MAX_WAV_VALUE``, clipped to
        ``[-_MAX_WAV_VALUE, _MAX_WAV_VALUE]`` and cast to int16. The result
        is computed on first access and reused afterwards.

        :return: Audio data as int16 numpy array.
        """
        cached = self._audio_int16_array
        if cached is not None:
            return cached

        peak = self._MAX_WAV_VALUE
        scaled = self.audio_float_array * peak
        cached = np.clip(scaled, -peak, peak).astype(np.int16)
        self._audio_int16_array = cached
        return cached

    @property
    def audio_int16_bytes(self) -> bytes:
        """
        Audio as raw 16-bit PCM.

        :return: Bytes of the signed 16-bit samples from ``audio_int16_array``.
        """
        pcm = self.audio_int16_array
        return pcm.tobytes()
|
104
|
+
|
105
|
+
|
106
|
+
@dataclass
class TTSVoice:
    """ONNX text-to-speech voice: text -> phonemes -> phoneme ids -> audio."""

    session: onnxruntime.InferenceSession

    config: VoiceConfig

    phonetic_spellings: Optional[PhoneticSpellings] = None

    phonemizer: Optional[Phonemizer] = None

    # For Arabic text only
    use_tashkeel: bool = True
    tashkeel_diacritizier: Optional[TashkeelDiacritizer] = None  # For Arabic text only
    taskeen_threshold: Optional[float] = 0.8

    def __post_init__(self):
        """Fill in spellings, phonemizer and Arabic diacritizer when not supplied."""
        # Only auto-load spellings when the caller did not pass their own;
        # this mirrors the "is None" guards used for the other helpers below.
        if self.phonetic_spellings is None:
            try:
                self.phonetic_spellings = PhoneticSpellings.from_lang(self.config.lang_code)
            except FileNotFoundError:
                # No spellings available for this language; run without them.
                pass
        if self.phonemizer is None:
            self.phonemizer = get_phonemizer(self.config.phoneme_type,
                                             self.config.alphabet,
                                             self.config.phonemizer_model)

        # compat with piper arabic models - TODO move to espeak phonemizer
        if self.config.lang_code.split("-")[0] == "ar" and self.use_tashkeel and self.tashkeel_diacritizier is None:
            self.tashkeel_diacritizier = TashkeelDiacritizer()

    @staticmethod
    def load(
            model_path: Union[str, Path],
            config_path: Optional[Union[str, Path]] = None,
            phonemes_txt: Optional[str] = None,
            phoneme_map: Optional[str] = None,
            lang_code: Optional[str] = None,
            phoneme_type_str: Optional[str] = None,
            use_cuda: bool = False
    ) -> "TTSVoice":
        """
        Load an ONNX model and config.

        :param model_path: Path to ONNX voice model.
        :param config_path: Path to JSON voice config (defaults to model_path + ".json").
        :param phonemes_txt: Forwarded to ``VoiceConfig.from_dict``.
        :param phoneme_map: Accepted for API compatibility.
            NOTE(review): currently not forwarded anywhere -- confirm whether
            ``VoiceConfig.from_dict`` should receive it.
        :param lang_code: Forwarded to ``VoiceConfig.from_dict``.
        :param phoneme_type_str: Forwarded to ``VoiceConfig.from_dict``.
        :param use_cuda: True if CUDA (GPU) should be used instead of CPU.
        :return: Voice object.
        """
        if config_path is None:
            config_path = f"{model_path}.json"
            LOG.debug("Guessing voice config path: %s", config_path)

        with open(config_path, "r", encoding="utf-8") as config_file:
            config_dict = json.load(config_file)

        providers: list[Union[str, tuple[str, dict[str, Any]]]]
        if use_cuda:
            providers = [
                (
                    "CUDAExecutionProvider",
                    {"cudnn_conv_algo_search": "HEURISTIC"},
                )
            ]
            LOG.debug("Using CUDA")
        else:
            providers = ["CPUExecutionProvider"]

        return TTSVoice(
            config=VoiceConfig.from_dict(config_dict,
                                         phonemes_txt=phonemes_txt,
                                         lang_code=lang_code,
                                         phoneme_type_str=phoneme_type_str),
            session=onnxruntime.InferenceSession(
                str(model_path),
                sess_options=onnxruntime.SessionOptions(),
                providers=providers,
            )
        )

    def phonemize(self, text: str) -> PhonemizedChunks:
        """
        Text to phonemes grouped by sentence.

        Spans written as ``[[...]]`` are treated as literal phonemes: their
        characters are appended one by one to the current sentence. All other
        text is passed to the configured phonemizer.

        :param text: Text to phonemize.
        :return: List of phonemes for each sentence.
        """
        phonemes: list[list[str]] = []

        text_parts = _PHONEME_BLOCK_PATTERN.split(text)

        for i, text_part in enumerate(text_parts):
            if text_part.startswith("[["):
                # Phonemes
                if not phonemes:
                    # Start new sentence
                    phonemes.append([])

                # Preserve a space that separated the block from the text before it.
                if (i > 0) and (text_parts[i - 1].endswith(" ")):
                    phonemes[-1].append(" ")

                phonemes[-1].extend(list(text_part[2:-2].strip()))  # Ensure characters are split

                # Preserve a space that separates the block from the text after it.
                if (i < (len(text_parts)) - 1) and (text_parts[i + 1].startswith(" ")):
                    phonemes[-1].append(" ")

                continue

            # Arabic diacritization
            if self.config.lang_code.split("-")[0] == "ar" and self.use_tashkeel:
                text_part = self.tashkeel_diacritizier(
                    text_part, taskeen_threshold=self.taskeen_threshold
                )

            # Phonemization. The result gets its own name: rebinding
            # `phonemes` here would discard everything collected so far and
            # then extend the new list with itself.
            text_part_phonemes = self.phonemizer.phonemize(
                text_part, self.config.lang_code
            )
            phonemes.extend(text_part_phonemes)

        if phonemes and (not phonemes[-1]):
            # Remove empty phonemes
            phonemes.pop()

        return phonemes

    def phonemes_to_ids(self, phonemes: list[str]) -> list[int]:
        """
        Phonemes to ids.

        :param phonemes: List of phonemes (or characters for grapheme models).
        :return: List of phoneme ids.
        :raises ValueError: If the voice config has no phoneme id map.
        """
        if self.config.phoneme_id_map is None:
            raise ValueError("self.config.phoneme_id_map is None")
        # Inside a method body this name resolves to the module-level
        # phonemes_to_ids function, not to this method.
        return phonemes_to_ids(phonemes, self.config.phoneme_id_map,
                               blank_token=self.config.blank_token,
                               bos_token=self.config.bos_token,
                               eos_token=self.config.eos_token,
                               word_sep_token=self.config.word_sep_token,
                               include_whitespace=self.config.include_whitespace,
                               blank_at_start=self.config.blank_at_start,
                               blank_at_end=self.config.blank_at_end,
                               blank_between=BlankBetween.TOKENS_AND_WORDS,
                               )

    def synthesize(
            self,
            text: str,
            syn_config: Optional[SynthesisConfig] = None,
    ) -> Iterable[AudioChunk]:
        """
        Synthesize one audio chunk per sentence from text.

        :param text: Text to synthesize.
        :param syn_config: Synthesis configuration (a default
            ``SynthesisConfig()`` is used when omitted).
        :return: Generator yielding one AudioChunk per non-empty sentence.
        """
        if syn_config is None:
            syn_config = SynthesisConfig()

        LOG.debug("text=%s", text)

        # user defined word-level replacements to force correct pronunciation
        if self.phonetic_spellings and syn_config.enable_phonetic_spellings:
            text = self.phonetic_spellings.apply(text)

        # All phonemization goes through the unified self.phonemize method
        sentence_phonemes = self.phonemize(text)
        LOG.debug("phonemes=%s", sentence_phonemes)
        all_phoneme_ids_for_synthesis = [
            self.phonemes_to_ids(phonemes) for phonemes in sentence_phonemes if phonemes
        ]

        for phoneme_ids in all_phoneme_ids_for_synthesis:
            if not phoneme_ids:
                continue

            audio = self.phoneme_ids_to_audio(phoneme_ids, syn_config)

            if syn_config.normalize_audio:
                max_val = np.max(np.abs(audio))
                if max_val < 1e-8:
                    # Prevent division by zero
                    audio = np.zeros_like(audio)
                else:
                    audio = audio / max_val

            if syn_config.volume != 1.0:
                audio = audio * syn_config.volume

            # Volume scaling may push samples outside [-1, 1]; clip before
            # handing the data to AudioChunk.
            audio = np.clip(audio, -1.0, 1.0).astype(np.float32)

            yield AudioChunk(
                sample_rate=self.config.sample_rate,
                sample_width=2,
                sample_channels=1,
                audio_float_array=audio,
            )

    def synthesize_wav(
            self,
            text: str,
            wav_file: wave.Wave_write,
            syn_config: Optional[SynthesisConfig] = None,
            set_wav_format: bool = True,
    ) -> None:
        """
        Synthesize and write WAV audio from text.

        :param text: Text to synthesize.
        :param wav_file: WAV file writer.
        :param syn_config: Synthesis configuration.
        :param set_wav_format: True if the WAV format should be set automatically.
        """

        # 16-bit samples for silence
        sentence_silence = 0.0  # Seconds of silence after each sentence
        silence_int16_bytes = bytes(
            int(self.config.sample_rate * sentence_silence * 2)
        )
        first_chunk = True
        for audio_chunk in self.synthesize(text, syn_config=syn_config):
            if first_chunk:
                if set_wav_format:
                    # Set audio format on first chunk
                    wav_file.setframerate(audio_chunk.sample_rate)
                    wav_file.setsampwidth(audio_chunk.sample_width)
                    wav_file.setnchannels(audio_chunk.sample_channels)

                first_chunk = False
            elif silence_int16_bytes:
                # Silence goes between sentences, never before the first one.
                # It is zero-length while sentence_silence is 0.0.
                wav_file.writeframes(silence_int16_bytes)

            wav_file.writeframes(audio_chunk.audio_int16_bytes)

    def phoneme_ids_to_audio(
            self, phoneme_ids: list[int], syn_config: Optional[SynthesisConfig] = None
    ) -> np.ndarray:
        """
        Synthesize raw audio from phoneme ids.

        Scale values left as None in ``syn_config`` fall back to the voice
        config. Only the inputs the loaded ONNX model declares are passed
        to it.

        :param phoneme_ids: List of phoneme ids.
        :param syn_config: Synthesis configuration.
        :return: Audio float numpy array from voice model (unnormalized, in range [-1, 1]).
        """
        if syn_config is None:
            syn_config = SynthesisConfig()

        langid = syn_config.lang_id or 0
        speaker_id = syn_config.speaker_id or 0
        length_scale = syn_config.length_scale
        noise_scale = syn_config.noise_scale
        noise_w_scale = syn_config.noise_w_scale

        expected_args = [model_input.name for model_input in self.session.get_inputs()]

        # Batch of one: shape (1, number of ids).
        phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
        args = {
            "input": phoneme_ids_array,
            "input_lengths": phoneme_ids_lengths
        }

        if length_scale is None:
            length_scale = self.config.length_scale
        if noise_scale is None:
            noise_scale = self.config.noise_scale
        if noise_w_scale is None:
            noise_w_scale = self.config.noise_w_scale
        if "scales" in expected_args:
            args["scales"] = np.array(
                [noise_scale, length_scale, noise_w_scale],
                dtype=np.float32,
            )

        args["langid"] = np.array([langid], dtype=np.int64)
        args["sid"] = np.array([speaker_id], dtype=np.int64)

        # different models can be used and args may differ
        args = {k: v for k, v in args.items() if k in expected_args}
        audio = self.session.run(
            None,
            args,
        )[0].squeeze()

        return audio
|
392
|
+
|
393
|
+
|
394
|
+
# Manual smoke test: loads several voices from hard-coded local paths and
# writes one WAV file per voice/phonemizer combination into the current
# working directory. Runs only when this module is executed as a script.
if __name__ == "__main__":
    from phoonnx.phonemizers.gl import CotoviaPhonemizer
    from phoonnx.phonemizers.he import PhonikudPhonemizer
    from phoonnx.phonemizers.mul import (EspeakPhonemizer, EpitranPhonemizer, GruutPhonemizer, ByT5Phonemizer)

    syn_config = SynthesisConfig(enable_phonetic_spellings=True)

    # test hebrew piper
    model = "/home/miro/PycharmProjects/phoonnx_tts/phonikud/model.onnx"
    config = "/home/miro/PycharmProjects/phoonnx_tts/phonikud/model.config.json"

    voice = TTSVoice.load(model_path=model, config_path=config, use_cuda=False)

    print("\n################")
    # hebrew phonemes (raw input model)
    # The sentence is phonemized up front and the resulting string is what
    # gets passed to synthesize_wav below.
    pho = PhonikudPhonemizer(diacritics=True)
    sentence = "הכוח לשנות מתחיל ברגע שבו אתה מאמין שזה אפשרי!"
    sentence = pho.phonemize_string(sentence, "he")

    print("## piper hebrew (raw)")
    print("-", voice.config.phoneme_type)
    slug = f"piper_{voice.config.phoneme_type.value}_{voice.config.lang_code}"
    with wave.open(f"{slug}.wav", "wb") as wav_file:
        voice.synthesize_wav(sentence, wav_file, syn_config)

    print("\n################")
    # Same voice, but now the plain sentence is given and the voice's own
    # phonemizer is swapped for the Phonikud one.
    sentence = "הכוח לשנות מתחיל ברגע שבו אתה מאמין שזה אפשרי!"
    voice.config.phoneme_type = PhonemeType.PHONIKUD
    voice.phonemizer = pho

    print("## piper hebrew (phonikud)")
    print("-", voice.config.phoneme_type)
    slug = f"piper_{voice.config.phoneme_type.value}_{voice.config.lang_code}"
    with wave.open(f"{slug}.wav", "wb") as wav_file:
        voice.synthesize_wav(sentence, wav_file, syn_config)

    # NOTE(review): this exit() stops the script here, so none of the tests
    # below run as the file stands.
    exit()
    # test piper
    model = "/home/miro/PycharmProjects/phoonnx_tts/miro_en-GB.onnx"
    config = "/home/miro/PycharmProjects/phoonnx_tts/piper_espeak.json"

    voice = TTSVoice.load(model_path=model, config_path=config, use_cuda=False)
    byt5_phonemizer = ByT5Phonemizer()
    gruut_phonemizer = GruutPhonemizer()
    espeak_phonemizer = EspeakPhonemizer()
    epitran_phonemizer = EpitranPhonemizer()
    # NOTE(review): cotovia_phonemizer is created but not used by any of the
    # loops below.
    cotovia_phonemizer = CotoviaPhonemizer()

    sentence = "A rainbow is a meteorological phenomenon that is caused by reflection, refraction and dispersion of light in water droplets resulting in a spectrum of light appearing in the sky. It takes the form of a multi-colored circular arc. Rainbows caused by sunlight always appear in the section of sky directly opposite the Sun."

    print("\n################")
    print("## piper")
    # Re-use one loaded voice and swap the phonemizer on it for each run.
    for phonemizer_type, phonemizer in [
        (PhonemeType.ESPEAK, espeak_phonemizer),
        (PhonemeType.BYT5, byt5_phonemizer),
        (PhonemeType.GRUUT, gruut_phonemizer),
        (PhonemeType.EPITRAN, epitran_phonemizer)
    ]:
        voice.config.phoneme_type = phonemizer_type
        voice.phonemizer = phonemizer
        print("-", phonemizer_type)

        slug = f"piper_{phonemizer_type.value}_{voice.config.lang_code}"
        with wave.open(f"{slug}.wav", "wb") as wav_file:
            voice.synthesize_wav(sentence, wav_file, syn_config)

    print("\n################")
    print("## mimic3")
    model = "/home/miro/PycharmProjects/phoonnx_tts/mimic3_ap/generator.onnx"
    config = "/home/miro/PycharmProjects/phoonnx_tts/mimic3_ap/config.json"
    phonemes_txt = "/home/miro/PycharmProjects/phoonnx_tts/mimic3_ap/phonemes.txt"
    phoneme_map = "/home/miro/PycharmProjects/phoonnx_tts/mimic3_ap/phoneme_map.txt"

    voice = TTSVoice.load(model_path=model, config_path=config,
                          phonemes_txt=phonemes_txt, phoneme_map=phoneme_map,
                          use_cuda=False)
    for phonemizer_type, phonemizer in [
        (PhonemeType.ESPEAK, espeak_phonemizer),
        (PhonemeType.BYT5, byt5_phonemizer),
        (PhonemeType.GRUUT, gruut_phonemizer),
        (PhonemeType.EPITRAN, epitran_phonemizer)
    ]:
        voice.config.phoneme_type = phonemizer_type
        voice.phonemizer = phonemizer
        print("-", phonemizer_type)
        slug = f"mimic3_{voice.config.phoneme_type.value}_{voice.config.lang_code}"
        with wave.open(f"{slug}.wav", "wb") as wav_file:
            voice.synthesize_wav(sentence, wav_file, syn_config)

    # Test grapheme model directly
    print("\n################")
    print("## coqui vits")
    model = "/home/miro/PycharmProjects/phoonnx_tts/celtia_vits/model.onnx"
    config = "/home/miro/PycharmProjects/phoonnx_tts/celtia_vits/config.json"

    sentence = "Este é un sistema de conversión de texto a voz en lingua galega baseado en redes neuronais artificiais. Ten en conta que as funcionalidades incluídas nesta páxina ofrécense unicamente con fins de demostración. Se tes algún comentario, suxestión ou detectas algún problema durante a demostración, ponte en contacto connosco."

    voice = TTSVoice.load(model_path=model, config_path=config,
                          use_cuda=False, lang_code="gl-ES")
    print("-", voice.config.phoneme_type)
    print(voice.config)
    # Print the phonemes and the ids of the first sentence for inspection.
    phones = voice.phonemize(sentence)
    print(phones)
    print(voice.phonemes_to_ids(phones[0]))

    slug = f"vits_{voice.config.phoneme_type.value}_{voice.config.lang_code}"
    with wave.open(f"{slug}.wav", "wb") as wav_file:
        voice.synthesize_wav(sentence, wav_file, syn_config)

    # Test cotovia phonemizer
    print("\n################")
    print("## cotovia coqui vits")
    model = "/home/miro/PycharmProjects/phoonnx_tts/sabela_cotovia/model.onnx"
    config = "/home/miro/PycharmProjects/phoonnx_tts/sabela_cotovia/config.json"

    sentence = "Este é un sistema de conversión de texto a voz en lingua galega baseado en redes neuronais artificiais. Ten en conta que as funcionalidades incluídas nesta páxina ofrécense unicamente con fins de demostración. Se tes algún comentario, suxestión ou detectas algún problema durante a demostración, ponte en contacto connosco."

    voice = TTSVoice.load(model_path=model, config_path=config,
                          use_cuda=False, lang_code="gl-ES")
    print("-", voice.config.phoneme_type)
    print(voice.config)
    phones = voice.phonemize(sentence)
    print(phones)
    print(voice.phonemes_to_ids(phones[0]))

    # NOTE(review): this slug has the same form as the one in the previous
    # section; if both voices report the same phoneme type and language, this
    # WAV overwrites the earlier one.
    slug = f"vits_{voice.config.phoneme_type.value}_{voice.config.lang_code}"
    with wave.open(f"{slug}.wav", "wb") as wav_file:
        voice.synthesize_wav(sentence, wav_file, syn_config)
|