phoonnx 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. phoonnx/__init__.py +0 -0
  2. phoonnx/config.py +490 -0
  3. phoonnx/locale/ca/phonetic_spellings.txt +2 -0
  4. phoonnx/locale/en/phonetic_spellings.txt +1 -0
  5. phoonnx/locale/gl/phonetic_spellings.txt +2 -0
  6. phoonnx/locale/pt/phonetic_spellings.txt +2 -0
  7. phoonnx/phoneme_ids.py +453 -0
  8. phoonnx/phonemizers/__init__.py +45 -0
  9. phoonnx/phonemizers/ar.py +42 -0
  10. phoonnx/phonemizers/base.py +216 -0
  11. phoonnx/phonemizers/en.py +250 -0
  12. phoonnx/phonemizers/fa.py +46 -0
  13. phoonnx/phonemizers/gl.py +142 -0
  14. phoonnx/phonemizers/he.py +67 -0
  15. phoonnx/phonemizers/ja.py +119 -0
  16. phoonnx/phonemizers/ko.py +97 -0
  17. phoonnx/phonemizers/mul.py +606 -0
  18. phoonnx/phonemizers/vi.py +44 -0
  19. phoonnx/phonemizers/zh.py +308 -0
  20. phoonnx/thirdparty/__init__.py +0 -0
  21. phoonnx/thirdparty/arpa2ipa.py +249 -0
  22. phoonnx/thirdparty/cotovia/cotovia_aarch64 +0 -0
  23. phoonnx/thirdparty/cotovia/cotovia_x86_64 +0 -0
  24. phoonnx/thirdparty/hangul2ipa.py +783 -0
  25. phoonnx/thirdparty/ko_tables/aspiration.csv +20 -0
  26. phoonnx/thirdparty/ko_tables/assimilation.csv +31 -0
  27. phoonnx/thirdparty/ko_tables/double_coda.csv +17 -0
  28. phoonnx/thirdparty/ko_tables/hanja.tsv +8525 -0
  29. phoonnx/thirdparty/ko_tables/ipa.csv +22 -0
  30. phoonnx/thirdparty/ko_tables/neutralization.csv +11 -0
  31. phoonnx/thirdparty/ko_tables/tensification.csv +56 -0
  32. phoonnx/thirdparty/ko_tables/yale.csv +22 -0
  33. phoonnx/thirdparty/kog2p/__init__.py +385 -0
  34. phoonnx/thirdparty/kog2p/rulebook.txt +212 -0
  35. phoonnx/thirdparty/mantoq/__init__.py +67 -0
  36. phoonnx/thirdparty/mantoq/buck/__init__.py +0 -0
  37. phoonnx/thirdparty/mantoq/buck/phonetise_buckwalter.py +569 -0
  38. phoonnx/thirdparty/mantoq/buck/symbols.py +64 -0
  39. phoonnx/thirdparty/mantoq/buck/tokenization.py +105 -0
  40. phoonnx/thirdparty/mantoq/num2words.py +37 -0
  41. phoonnx/thirdparty/mantoq/pyarabic/__init__.py +12 -0
  42. phoonnx/thirdparty/mantoq/pyarabic/arabrepr.py +64 -0
  43. phoonnx/thirdparty/mantoq/pyarabic/araby.py +1647 -0
  44. phoonnx/thirdparty/mantoq/pyarabic/named_const.py +227 -0
  45. phoonnx/thirdparty/mantoq/pyarabic/normalize.py +161 -0
  46. phoonnx/thirdparty/mantoq/pyarabic/number.py +826 -0
  47. phoonnx/thirdparty/mantoq/pyarabic/number_const.py +1704 -0
  48. phoonnx/thirdparty/mantoq/pyarabic/stack.py +52 -0
  49. phoonnx/thirdparty/mantoq/pyarabic/trans.py +517 -0
  50. phoonnx/thirdparty/mantoq/unicode_symbol2label.py +4173 -0
  51. phoonnx/thirdparty/tashkeel/LICENSE +22 -0
  52. phoonnx/thirdparty/tashkeel/SOURCE +1 -0
  53. phoonnx/thirdparty/tashkeel/__init__.py +212 -0
  54. phoonnx/thirdparty/tashkeel/hint_id_map.json +18 -0
  55. phoonnx/thirdparty/tashkeel/input_id_map.json +56 -0
  56. phoonnx/thirdparty/tashkeel/model.onnx +0 -0
  57. phoonnx/thirdparty/tashkeel/target_id_map.json +17 -0
  58. phoonnx/thirdparty/zh_num.py +238 -0
  59. phoonnx/util.py +705 -0
  60. phoonnx/version.py +6 -0
  61. phoonnx/voice.py +521 -0
  62. phoonnx-0.0.0.dist-info/METADATA +255 -0
  63. phoonnx-0.0.0.dist-info/RECORD +86 -0
  64. phoonnx-0.0.0.dist-info/WHEEL +5 -0
  65. phoonnx-0.0.0.dist-info/top_level.txt +2 -0
  66. phoonnx_train/__main__.py +151 -0
  67. phoonnx_train/export_onnx.py +109 -0
  68. phoonnx_train/norm_audio/__init__.py +92 -0
  69. phoonnx_train/norm_audio/trim.py +54 -0
  70. phoonnx_train/norm_audio/vad.py +54 -0
  71. phoonnx_train/preprocess.py +420 -0
  72. phoonnx_train/vits/__init__.py +0 -0
  73. phoonnx_train/vits/attentions.py +427 -0
  74. phoonnx_train/vits/commons.py +147 -0
  75. phoonnx_train/vits/config.py +330 -0
  76. phoonnx_train/vits/dataset.py +214 -0
  77. phoonnx_train/vits/lightning.py +352 -0
  78. phoonnx_train/vits/losses.py +58 -0
  79. phoonnx_train/vits/mel_processing.py +139 -0
  80. phoonnx_train/vits/models.py +732 -0
  81. phoonnx_train/vits/modules.py +527 -0
  82. phoonnx_train/vits/monotonic_align/__init__.py +20 -0
  83. phoonnx_train/vits/monotonic_align/setup.py +13 -0
  84. phoonnx_train/vits/transforms.py +212 -0
  85. phoonnx_train/vits/utils.py +16 -0
  86. phoonnx_train/vits/wavfile.py +860 -0
phoonnx/version.py ADDED
@@ -0,0 +1,6 @@
1
# Version components of the phoonnx package (all zero in this release).
# NOTE(review): the START/END markers presumably delimit this block for
# release tooling that rewrites it - keep them intact; TODO confirm.
# START_VERSION_BLOCK
VERSION_MAJOR = 0
VERSION_MINOR = 0
VERSION_BUILD = 0
VERSION_ALPHA = 0
# END_VERSION_BLOCK
phoonnx/voice.py ADDED
@@ -0,0 +1,521 @@
1
+ import json
2
+ import os.path
3
+ import re
4
+ import wave
5
+ from dataclasses import dataclass, field
6
+ from pathlib import Path
7
+ from typing import Any, Iterable, Optional, Union, Dict
8
+
9
+ import numpy as np
10
+ import onnxruntime
11
+ from langcodes import closest_match
12
+
13
+ from phoonnx.config import PhonemeType, VoiceConfig, SynthesisConfig, get_phonemizer
14
+ from phoonnx.phoneme_ids import phonemes_to_ids, BlankBetween
15
+ from phoonnx.phonemizers import Phonemizer
16
+ from phoonnx.phonemizers.base import PhonemizedChunks
17
+ from phoonnx.thirdparty.tashkeel import TashkeelDiacritizer
18
+
19
# Matches inline phoneme blocks written as [[...]] (non-greedy, single line).
# The capturing group makes re.split() keep the matched blocks in its output.
_PHONEME_BLOCK_PATTERN = re.compile(r"(\[\[.*?\]\])")

# Prefer the ovos_utils logger when that package is installed; otherwise fall
# back to a standard-library logger for this module with its level set to DEBUG.
try:
    from ovos_utils.log import LOG
except ImportError:
    import logging

    LOG = logging.getLogger(__name__)
    LOG.setLevel("DEBUG")
28
+
29
+
30
@dataclass
class PhoneticSpellings:
    """Word-level respellings applied to text before phonemization.

    ``replacements`` maps a word (or phrase) to the text that should be
    substituted for it, e.g. to force a particular pronunciation.
    """

    replacements: Dict[str, str] = field(default_factory=dict)

    @staticmethod
    def from_lang(lang: str, locale_path: str = f"{os.path.dirname(__file__)}/locale"):
        """
        Load the spellings file of the locale that best matches ``lang``.

        The candidates are the entries of ``locale_path``; the closest one
        (per ``langcodes.closest_match``) is used when its match distance is
        at most 10.

        :param lang: Language code to look up.
        :param locale_path: Directory holding one sub-directory per language.
        :return: PhoneticSpellings loaded from ``<lang>/phonetic_spellings.txt``.
        :raises FileNotFoundError: If no locale is close enough, or the
            selected locale has no spellings file.
        """
        langs = os.listdir(locale_path)
        lang2, distance = closest_match(lang, langs)
        if distance <= 10:
            spellings_file = f"{locale_path}/{lang2}/phonetic_spellings.txt"
            return PhoneticSpellings.from_path(spellings_file)
        raise FileNotFoundError(f"Spellings file for '{lang}' not found")

    @staticmethod
    def from_path(spellings_file: str):
        """
        Parse a spellings file with one ``word: spelling`` entry per line.

        Blank lines (including the one produced by a trailing newline) are
        skipped. Only the first ``:`` separates word from spelling, so the
        spelling itself may contain colons.

        :param spellings_file: Path to a UTF-8 encoded spellings file.
        :return: PhoneticSpellings holding the parsed replacements.
        :raises ValueError: If a non-blank line has no ``:`` separator or an
            empty word.
        """
        replacements = {}
        with open(spellings_file, encoding="utf-8") as f:
            for line_number, raw_line in enumerate(f, start=1):
                line = raw_line.strip()
                if not line:
                    continue
                word, separator, spelling = line.partition(":")
                word = word.strip()
                if not separator or not word:
                    # An empty word would compile to r'\b\b' and match at
                    # every word boundary, so reject it explicitly.
                    raise ValueError(
                        f"Malformed entry at {spellings_file}:{line_number}: "
                        f"expected 'word: spelling', got {line!r}"
                    )
                replacements[word] = spelling.strip()
        return PhoneticSpellings(replacements)

    def apply(self, text: str) -> str:
        """
        Replace every whole-word, case-insensitive occurrence of each key.

        Replacements are applied one after another in dictionary order, so
        the output of an earlier replacement can be matched by a later key.

        :param text: Input text.
        :return: Text with all replacements applied.
        """
        for word, spelling in self.replacements.items():
            # Use regex to ensure word boundaries
            pattern = r'\b' + re.escape(word) + r'\b'
            # A callable replacement inserts the spelling literally; a plain
            # string would have its backslashes parsed as escapes/group refs.
            text = re.sub(pattern, lambda _match, repl=spelling: repl, text,
                          flags=re.IGNORECASE)
        return text
60
+
61
+
62
@dataclass
class AudioChunk:
    """Chunk of raw audio."""

    sample_rate: int
    """Rate of chunk samples in Hertz."""

    sample_width: int
    """Width of chunk samples in bytes."""

    sample_channels: int
    """Number of channels in chunk samples."""

    audio_float_array: np.ndarray
    """Audio data as float numpy array in [-1, 1]."""

    # Lazily filled caches for the int16 conversions below.
    _audio_int16_array: Optional[np.ndarray] = None
    _audio_int16_bytes: Optional[bytes] = None
    # Scale factor (and clipping bound) used to map floats to int16 samples.
    _MAX_WAV_VALUE: float = 32767.0

    @property
    def audio_int16_array(self) -> np.ndarray:
        """
        Get audio as an int16 numpy array.

        The float samples are scaled by ``_MAX_WAV_VALUE``, clipped to
        ``[-_MAX_WAV_VALUE, _MAX_WAV_VALUE]`` and cast to int16. The result
        is computed on first access and cached.

        :return: Audio data as int16 numpy array.
        """
        if self._audio_int16_array is None:
            scaled = self.audio_float_array * self._MAX_WAV_VALUE
            self._audio_int16_array = np.clip(
                scaled, -self._MAX_WAV_VALUE, self._MAX_WAV_VALUE
            ).astype(np.int16)

        return self._audio_int16_array

    @property
    def audio_int16_bytes(self) -> bytes:
        """
        Get audio as 16-bit PCM bytes.

        The bytes are serialized once and cached in ``_audio_int16_bytes``.

        :return: Audio data as signed 16-bit sample bytes.
        """
        if self._audio_int16_bytes is None:
            self._audio_int16_bytes = self.audio_int16_array.tobytes()

        return self._audio_int16_bytes
104
+
105
+
106
@dataclass
class TTSVoice:
    """
    A text-to-speech voice backed by an ONNX model.

    Bundles the inference session, the voice configuration and the text
    front-end (phonetic spellings, phonemizer and, for Arabic, a diacritizer).
    """

    session: onnxruntime.InferenceSession

    config: VoiceConfig

    phonetic_spellings: Optional[PhoneticSpellings] = None

    phonemizer: Optional[Phonemizer] = None

    # For Arabic text only
    use_tashkeel: bool = True
    tashkeel_diacritizier: Optional[TashkeelDiacritizer] = None  # For Arabic text only
    taskeen_threshold: Optional[float] = 0.8

    def __post_init__(self):
        """Fill in the helpers that were not supplied by the caller."""
        # Only load the bundled spellings when the caller did not pass their
        # own; otherwise a user-supplied object would be silently replaced.
        if self.phonetic_spellings is None:
            try:
                self.phonetic_spellings = PhoneticSpellings.from_lang(self.config.lang_code)
            except FileNotFoundError:
                # Best effort: a voice works fine without phonetic spellings.
                LOG.debug("No phonetic spellings found for '%s'", self.config.lang_code)

        if self.phonemizer is None:
            self.phonemizer = get_phonemizer(self.config.phoneme_type,
                                             self.config.alphabet,
                                             self.config.phonemizer_model)

        # compat with piper arabic models - TODO move to espeak phonemizer
        if self._is_arabic() and self.use_tashkeel and self.tashkeel_diacritizier is None:
            self.tashkeel_diacritizier = TashkeelDiacritizer()

    def _is_arabic(self) -> bool:
        """Return True when the primary subtag of the voice language is ``ar``."""
        return self.config.lang_code.split("-")[0] == "ar"

    @staticmethod
    def load(
            model_path: Union[str, Path],
            config_path: Optional[Union[str, Path]] = None,
            phonemes_txt: Optional[str] = None,
            phoneme_map: Optional[str] = None,
            lang_code: Optional[str] = None,
            phoneme_type_str: Optional[str] = None,
            use_cuda: bool = False
    ) -> "TTSVoice":
        """
        Load an ONNX model and config.

        :param model_path: Path to ONNX voice model.
        :param config_path: Path to JSON voice config (defaults to model_path + ".json").
        :param phonemes_txt: Forwarded to ``VoiceConfig.from_dict``.
        :param phoneme_map: Accepted for API compatibility; currently not used here.
        :param lang_code: Forwarded to ``VoiceConfig.from_dict``.
        :param phoneme_type_str: Forwarded to ``VoiceConfig.from_dict``.
        :param use_cuda: True if CUDA (GPU) should be used instead of CPU.
        :return: Voice object.
        """
        if config_path is None:
            config_path = f"{model_path}.json"
            LOG.debug("Guessing voice config path: %s", config_path)

        with open(config_path, "r", encoding="utf-8") as config_file:
            config_dict = json.load(config_file)

        providers: list[Union[str, tuple[str, dict[str, Any]]]]
        if use_cuda:
            providers = [
                (
                    "CUDAExecutionProvider",
                    {"cudnn_conv_algo_search": "HEURISTIC"},
                )
            ]
            LOG.debug("Using CUDA")
        else:
            providers = ["CPUExecutionProvider"]

        return TTSVoice(
            config=VoiceConfig.from_dict(config_dict,
                                         phonemes_txt=phonemes_txt,
                                         lang_code=lang_code,
                                         phoneme_type_str=phoneme_type_str),
            session=onnxruntime.InferenceSession(
                str(model_path),
                sess_options=onnxruntime.SessionOptions(),
                providers=providers,
            )
        )

    def phonemize(self, text: str) -> PhonemizedChunks:
        """
        Text to phonemes grouped by sentence.

        Inline ``[[...]]`` blocks are taken verbatim, one phoneme per
        character, and appended to the current sentence; all other text goes
        through the configured phonemizer.

        :param text: Text to phonemize.
        :return: List of phonemes for each sentence.
        """
        phonemes: list[list[str]] = []

        # With a single capturing group, re.split() alternates
        # [text, block, text, block, ..., text]: blocks sit at odd indices and
        # always have a (possibly empty) text neighbour on both sides.
        text_parts = _PHONEME_BLOCK_PATTERN.split(text)
        diacritize = (
            self._is_arabic()
            and self.use_tashkeel
            and self.tashkeel_diacritizier is not None
        )

        for i, text_part in enumerate(text_parts):
            if i % 2 == 1:
                # Phonemes
                if not phonemes:
                    # Start new sentence
                    phonemes.append([])

                if text_parts[i - 1].endswith(" "):
                    phonemes[-1].append(" ")

                # Strip the [[ ]] delimiters; each character is one phoneme
                phonemes[-1].extend(text_part[2:-2].strip())

                if text_parts[i + 1].startswith(" "):
                    phonemes[-1].append(" ")

                continue

            if not text_part:
                # Empty filler produced by the split around a block
                continue

            # Arabic diacritization
            if diacritize:
                text_part = self.tashkeel_diacritizier(
                    text_part, taskeen_threshold=self.taskeen_threshold
                )

            # Phonemization. Keep the result in its own name: rebinding
            # `phonemes` here would drop earlier parts and duplicate this one.
            text_part_phonemes = self.phonemizer.phonemize(
                text_part, self.config.lang_code
            )
            phonemes.extend(text_part_phonemes)

        if phonemes and (not phonemes[-1]):
            # Remove empty phonemes
            phonemes.pop()

        return phonemes

    def phonemes_to_ids(self, phonemes: list[str]) -> list[int]:
        """
        Phonemes to ids.

        :param phonemes: List of phonemes (or characters for grapheme models).
        :return: List of phoneme ids.
        :raises ValueError: If the voice config has no phoneme id map.
        """
        if self.config.phoneme_id_map is None:
            raise ValueError("self.config.phoneme_id_map is None")
        # Resolves to the module-level phonemes_to_ids function, not this method.
        return phonemes_to_ids(phonemes, self.config.phoneme_id_map,
                               blank_token=self.config.blank_token,
                               bos_token=self.config.bos_token,
                               eos_token=self.config.eos_token,
                               word_sep_token=self.config.word_sep_token,
                               include_whitespace=self.config.include_whitespace,
                               blank_at_start=self.config.blank_at_start,
                               blank_at_end=self.config.blank_at_end,
                               blank_between=BlankBetween.TOKENS_AND_WORDS,
                               )

    def synthesize(
            self,
            text: str,
            syn_config: Optional[SynthesisConfig] = None,
    ) -> Iterable[AudioChunk]:
        """
        Synthesize one audio chunk per sentence from text.

        This is a generator: nothing runs until it is iterated.

        :param text: Text to synthesize.
        :param syn_config: Synthesis configuration (defaults to ``SynthesisConfig()``).
        :return: Iterable of mono, 16-bit audio chunks at the voice sample rate.
        """
        if syn_config is None:
            syn_config = SynthesisConfig()

        LOG.debug("text=%s", text)

        # user defined word-level replacements to force correct pronunciation
        if self.phonetic_spellings and syn_config.enable_phonetic_spellings:
            text = self.phonetic_spellings.apply(text)

        # All phonemization goes through the unified self.phonemize method
        sentence_phonemes = self.phonemize(text)
        LOG.debug("phonemes=%s", sentence_phonemes)
        all_phoneme_ids_for_synthesis = [
            self.phonemes_to_ids(phonemes) for phonemes in sentence_phonemes if phonemes
        ]

        for phoneme_ids in all_phoneme_ids_for_synthesis:
            if not phoneme_ids:
                continue

            audio = self.phoneme_ids_to_audio(phoneme_ids, syn_config)

            if syn_config.normalize_audio:
                max_val = np.max(np.abs(audio))
                if max_val < 1e-8:
                    # Prevent division by zero
                    audio = np.zeros_like(audio)
                else:
                    audio = audio / max_val

            if syn_config.volume != 1.0:
                audio = audio * syn_config.volume

            audio = np.clip(audio, -1.0, 1.0).astype(np.float32)

            yield AudioChunk(
                sample_rate=self.config.sample_rate,
                sample_width=2,
                sample_channels=1,
                audio_float_array=audio,
            )

    def synthesize_wav(
            self,
            text: str,
            wav_file: wave.Wave_write,
            syn_config: Optional[SynthesisConfig] = None,
            set_wav_format: bool = True,
            sentence_silence: float = 0.0,
    ) -> None:
        """
        Synthesize and write WAV audio from text.

        :param text: Text to synthesize.
        :param wav_file: WAV file writer.
        :param syn_config: Synthesis configuration.
        :param set_wav_format: True if the WAV format should be set automatically.
        :param sentence_silence: Seconds of silence written between
            consecutive sentences (default 0.0, i.e. none).
        """
        # 16-bit mono samples for silence: whole samples, two bytes each
        silence_samples = int(self.config.sample_rate * max(0.0, sentence_silence))
        silence_int16_bytes = bytes(silence_samples * 2)

        first_chunk = True
        for audio_chunk in self.synthesize(text, syn_config=syn_config):
            if first_chunk:
                if set_wav_format:
                    # Set audio format on first chunk
                    wav_file.setframerate(audio_chunk.sample_rate)
                    wav_file.setsampwidth(audio_chunk.sample_width)
                    wav_file.setnchannels(audio_chunk.sample_channels)

                first_chunk = False
            else:
                # Silence separates sentences; none goes before the first one
                wav_file.writeframes(silence_int16_bytes)

            wav_file.writeframes(audio_chunk.audio_int16_bytes)

    def phoneme_ids_to_audio(
            self, phoneme_ids: list[int], syn_config: Optional[SynthesisConfig] = None
    ) -> np.ndarray:
        """
        Synthesize raw audio from phoneme ids.

        Scale values left as None in ``syn_config`` fall back to the voice
        config. Only the inputs the loaded model actually declares are passed
        to the session.

        :param phoneme_ids: List of phoneme ids.
        :param syn_config: Synthesis configuration.
        :return: Squeezed first output of the model as a numpy array
            (not normalized or clipped here).
        """
        if syn_config is None:
            syn_config = SynthesisConfig()

        langid = syn_config.lang_id or 0
        speaker_id = syn_config.speaker_id or 0
        length_scale = syn_config.length_scale
        noise_scale = syn_config.noise_scale
        noise_w_scale = syn_config.noise_w_scale

        if length_scale is None:
            length_scale = self.config.length_scale
        if noise_scale is None:
            noise_scale = self.config.noise_scale
        if noise_w_scale is None:
            noise_w_scale = self.config.noise_w_scale

        expected_args = [model_input.name for model_input in self.session.get_inputs()]

        # Batch of one: shape (1, len(phoneme_ids))
        phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
        args = {
            "input": phoneme_ids_array,
            "input_lengths": phoneme_ids_lengths
        }

        if "scales" in expected_args:
            args["scales"] = np.array(
                [noise_scale, length_scale, noise_w_scale],
                dtype=np.float32,
            )

        args["langid"] = np.array([langid], dtype=np.int64)
        args["sid"] = np.array([speaker_id], dtype=np.int64)

        # different models can be used and args may differ
        args = {k: v for k, v in args.items() if k in expected_args}
        audio = self.session.run(
            None,
            args,
        )[0].squeeze()

        return audio
392
+
393
+
394
# Manual smoke test. It loads models from hard-coded local paths and writes
# one "<slug>.wav" file per test into the current working directory.
if __name__ == "__main__":
    from phoonnx.phonemizers.gl import CotoviaPhonemizer
    from phoonnx.phonemizers.he import PhonikudPhonemizer
    from phoonnx.phonemizers.mul import (EspeakPhonemizer, EpitranPhonemizer, GruutPhonemizer, ByT5Phonemizer)

    syn_config = SynthesisConfig(enable_phonetic_spellings=True)

    # test hebrew piper
    model = "/home/miro/PycharmProjects/phoonnx_tts/phonikud/model.onnx"
    config = "/home/miro/PycharmProjects/phoonnx_tts/phonikud/model.config.json"

    voice = TTSVoice.load(model_path=model, config_path=config, use_cuda=False)

    print("\n################")
    # hebrew phonemes (raw input model)
    pho = PhonikudPhonemizer(diacritics=True)
    sentence = "הכוח לשנות מתחיל ברגע שבו אתה מאמין שזה אפשרי!"
    # Phonemize up front so the voice receives the phoneme string as its input
    sentence = pho.phonemize_string(sentence, "he")

    print("## piper hebrew (raw)")
    print("-", voice.config.phoneme_type)
    slug = f"piper_{voice.config.phoneme_type.value}_{voice.config.lang_code}"
    with wave.open(f"{slug}.wav", "wb") as wav_file:
        voice.synthesize_wav(sentence, wav_file, syn_config)

    print("\n################")
    # Same sentence again, this time as plain text with the phonemizer
    # attached to the voice itself
    sentence = "הכוח לשנות מתחיל ברגע שבו אתה מאמין שזה אפשרי!"
    voice.config.phoneme_type = PhonemeType.PHONIKUD
    voice.phonemizer = pho

    print("## piper hebrew (phonikud)")
    print("-", voice.config.phoneme_type)
    slug = f"piper_{voice.config.phoneme_type.value}_{voice.config.lang_code}"
    with wave.open(f"{slug}.wav", "wb") as wav_file:
        voice.synthesize_wav(sentence, wav_file, syn_config)

    # NOTE(review): this exit() ends the script here, so every test below is
    # currently unreachable.
    exit()
    # test piper
    model = "/home/miro/PycharmProjects/phoonnx_tts/miro_en-GB.onnx"
    config = "/home/miro/PycharmProjects/phoonnx_tts/piper_espeak.json"

    voice = TTSVoice.load(model_path=model, config_path=config, use_cuda=False)
    byt5_phonemizer = ByT5Phonemizer()
    gruut_phonemizer = GruutPhonemizer()
    espeak_phonemizer = EspeakPhonemizer()
    epitran_phonemizer = EpitranPhonemizer()
    # NOTE(review): cotovia_phonemizer is created but not used in this script
    cotovia_phonemizer = CotoviaPhonemizer()

    sentence = "A rainbow is a meteorological phenomenon that is caused by reflection, refraction and dispersion of light in water droplets resulting in a spectrum of light appearing in the sky. It takes the form of a multi-colored circular arc. Rainbows caused by sunlight always appear in the section of sky directly opposite the Sun."

    print("\n################")
    print("## piper")
    # Synthesize the same sentence once per phonemizer, swapping it on the voice
    for phonemizer_type, phonemizer in [
        (PhonemeType.ESPEAK, espeak_phonemizer),
        (PhonemeType.BYT5, byt5_phonemizer),
        (PhonemeType.GRUUT, gruut_phonemizer),
        (PhonemeType.EPITRAN, epitran_phonemizer)
    ]:
        voice.config.phoneme_type = phonemizer_type
        voice.phonemizer = phonemizer
        print("-", phonemizer_type)

        slug = f"piper_{phonemizer_type.value}_{voice.config.lang_code}"
        with wave.open(f"{slug}.wav", "wb") as wav_file:
            voice.synthesize_wav(sentence, wav_file, syn_config)

    print("\n################")
    print("## mimic3")
    model = "/home/miro/PycharmProjects/phoonnx_tts/mimic3_ap/generator.onnx"
    config = "/home/miro/PycharmProjects/phoonnx_tts/mimic3_ap/config.json"
    phonemes_txt = "/home/miro/PycharmProjects/phoonnx_tts/mimic3_ap/phonemes.txt"
    phoneme_map = "/home/miro/PycharmProjects/phoonnx_tts/mimic3_ap/phoneme_map.txt"

    voice = TTSVoice.load(model_path=model, config_path=config,
                          phonemes_txt=phonemes_txt, phoneme_map=phoneme_map,
                          use_cuda=False)
    for phonemizer_type, phonemizer in [
        (PhonemeType.ESPEAK, espeak_phonemizer),
        (PhonemeType.BYT5, byt5_phonemizer),
        (PhonemeType.GRUUT, gruut_phonemizer),
        (PhonemeType.EPITRAN, epitran_phonemizer)
    ]:
        voice.config.phoneme_type = phonemizer_type
        voice.phonemizer = phonemizer
        print("-", phonemizer_type)
        slug = f"mimic3_{voice.config.phoneme_type.value}_{voice.config.lang_code}"
        with wave.open(f"{slug}.wav", "wb") as wav_file:
            voice.synthesize_wav(sentence, wav_file, syn_config)

    # Test grapheme model directly
    print("\n################")
    print("## coqui vits")
    model = "/home/miro/PycharmProjects/phoonnx_tts/celtia_vits/model.onnx"
    config = "/home/miro/PycharmProjects/phoonnx_tts/celtia_vits/config.json"

    sentence = "Este é un sistema de conversión de texto a voz en lingua galega baseado en redes neuronais artificiais. Ten en conta que as funcionalidades incluídas nesta páxina ofrécense unicamente con fins de demostración. Se tes algún comentario, suxestión ou detectas algún problema durante a demostración, ponte en contacto connosco."

    voice = TTSVoice.load(model_path=model, config_path=config,
                          use_cuda=False, lang_code="gl-ES")
    print("-", voice.config.phoneme_type)
    print(voice.config)
    phones = voice.phonemize(sentence)
    print(phones)
    # Ids of the first sentence only
    print(voice.phonemes_to_ids(phones[0]))

    slug = f"vits_{voice.config.phoneme_type.value}_{voice.config.lang_code}"
    with wave.open(f"{slug}.wav", "wb") as wav_file:
        voice.synthesize_wav(sentence, wav_file, syn_config)

    # Test cotovia phonemizer
    print("\n################")
    print("## cotovia coqui vits")
    model = "/home/miro/PycharmProjects/phoonnx_tts/sabela_cotovia/model.onnx"
    config = "/home/miro/PycharmProjects/phoonnx_tts/sabela_cotovia/config.json"

    sentence = "Este é un sistema de conversión de texto a voz en lingua galega baseado en redes neuronais artificiais. Ten en conta que as funcionalidades incluídas nesta páxina ofrécense unicamente con fins de demostración. Se tes algún comentario, suxestión ou detectas algún problema durante a demostración, ponte en contacto connosco."

    voice = TTSVoice.load(model_path=model, config_path=config,
                          use_cuda=False, lang_code="gl-ES")
    print("-", voice.config.phoneme_type)
    print(voice.config)
    phones = voice.phonemize(sentence)
    print(phones)
    # Ids of the first sentence only
    print(voice.phonemes_to_ids(phones[0]))

    # NOTE(review): this slug matches the one in the previous test, so this
    # file overwrites that test's output if both resolve to the same phoneme
    # type and language.
    slug = f"vits_{voice.config.phoneme_type.value}_{voice.config.lang_code}"
    with wave.open(f"{slug}.wav", "wb") as wav_file:
        voice.synthesize_wav(sentence, wav_file, syn_config)