phoonnx 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. phoonnx/__init__.py +0 -0
  2. phoonnx/config.py +490 -0
  3. phoonnx/locale/ca/phonetic_spellings.txt +2 -0
  4. phoonnx/locale/en/phonetic_spellings.txt +1 -0
  5. phoonnx/locale/gl/phonetic_spellings.txt +2 -0
  6. phoonnx/locale/pt/phonetic_spellings.txt +2 -0
  7. phoonnx/phoneme_ids.py +453 -0
  8. phoonnx/phonemizers/__init__.py +45 -0
  9. phoonnx/phonemizers/ar.py +42 -0
  10. phoonnx/phonemizers/base.py +216 -0
  11. phoonnx/phonemizers/en.py +250 -0
  12. phoonnx/phonemizers/fa.py +46 -0
  13. phoonnx/phonemizers/gl.py +142 -0
  14. phoonnx/phonemizers/he.py +67 -0
  15. phoonnx/phonemizers/ja.py +119 -0
  16. phoonnx/phonemizers/ko.py +97 -0
  17. phoonnx/phonemizers/mul.py +606 -0
  18. phoonnx/phonemizers/vi.py +44 -0
  19. phoonnx/phonemizers/zh.py +308 -0
  20. phoonnx/thirdparty/__init__.py +0 -0
  21. phoonnx/thirdparty/arpa2ipa.py +249 -0
  22. phoonnx/thirdparty/cotovia/cotovia_aarch64 +0 -0
  23. phoonnx/thirdparty/cotovia/cotovia_x86_64 +0 -0
  24. phoonnx/thirdparty/hangul2ipa.py +783 -0
  25. phoonnx/thirdparty/ko_tables/aspiration.csv +20 -0
  26. phoonnx/thirdparty/ko_tables/assimilation.csv +31 -0
  27. phoonnx/thirdparty/ko_tables/double_coda.csv +17 -0
  28. phoonnx/thirdparty/ko_tables/hanja.tsv +8525 -0
  29. phoonnx/thirdparty/ko_tables/ipa.csv +22 -0
  30. phoonnx/thirdparty/ko_tables/neutralization.csv +11 -0
  31. phoonnx/thirdparty/ko_tables/tensification.csv +56 -0
  32. phoonnx/thirdparty/ko_tables/yale.csv +22 -0
  33. phoonnx/thirdparty/kog2p/__init__.py +385 -0
  34. phoonnx/thirdparty/kog2p/rulebook.txt +212 -0
  35. phoonnx/thirdparty/mantoq/__init__.py +67 -0
  36. phoonnx/thirdparty/mantoq/buck/__init__.py +0 -0
  37. phoonnx/thirdparty/mantoq/buck/phonetise_buckwalter.py +569 -0
  38. phoonnx/thirdparty/mantoq/buck/symbols.py +64 -0
  39. phoonnx/thirdparty/mantoq/buck/tokenization.py +105 -0
  40. phoonnx/thirdparty/mantoq/num2words.py +37 -0
  41. phoonnx/thirdparty/mantoq/pyarabic/__init__.py +12 -0
  42. phoonnx/thirdparty/mantoq/pyarabic/arabrepr.py +64 -0
  43. phoonnx/thirdparty/mantoq/pyarabic/araby.py +1647 -0
  44. phoonnx/thirdparty/mantoq/pyarabic/named_const.py +227 -0
  45. phoonnx/thirdparty/mantoq/pyarabic/normalize.py +161 -0
  46. phoonnx/thirdparty/mantoq/pyarabic/number.py +826 -0
  47. phoonnx/thirdparty/mantoq/pyarabic/number_const.py +1704 -0
  48. phoonnx/thirdparty/mantoq/pyarabic/stack.py +52 -0
  49. phoonnx/thirdparty/mantoq/pyarabic/trans.py +517 -0
  50. phoonnx/thirdparty/mantoq/unicode_symbol2label.py +4173 -0
  51. phoonnx/thirdparty/tashkeel/LICENSE +22 -0
  52. phoonnx/thirdparty/tashkeel/SOURCE +1 -0
  53. phoonnx/thirdparty/tashkeel/__init__.py +212 -0
  54. phoonnx/thirdparty/tashkeel/hint_id_map.json +18 -0
  55. phoonnx/thirdparty/tashkeel/input_id_map.json +56 -0
  56. phoonnx/thirdparty/tashkeel/model.onnx +0 -0
  57. phoonnx/thirdparty/tashkeel/target_id_map.json +17 -0
  58. phoonnx/thirdparty/zh_num.py +238 -0
  59. phoonnx/util.py +705 -0
  60. phoonnx/version.py +6 -0
  61. phoonnx/voice.py +521 -0
  62. phoonnx-0.0.0.dist-info/METADATA +255 -0
  63. phoonnx-0.0.0.dist-info/RECORD +86 -0
  64. phoonnx-0.0.0.dist-info/WHEEL +5 -0
  65. phoonnx-0.0.0.dist-info/top_level.txt +2 -0
  66. phoonnx_train/__main__.py +151 -0
  67. phoonnx_train/export_onnx.py +109 -0
  68. phoonnx_train/norm_audio/__init__.py +92 -0
  69. phoonnx_train/norm_audio/trim.py +54 -0
  70. phoonnx_train/norm_audio/vad.py +54 -0
  71. phoonnx_train/preprocess.py +420 -0
  72. phoonnx_train/vits/__init__.py +0 -0
  73. phoonnx_train/vits/attentions.py +427 -0
  74. phoonnx_train/vits/commons.py +147 -0
  75. phoonnx_train/vits/config.py +330 -0
  76. phoonnx_train/vits/dataset.py +214 -0
  77. phoonnx_train/vits/lightning.py +352 -0
  78. phoonnx_train/vits/losses.py +58 -0
  79. phoonnx_train/vits/mel_processing.py +139 -0
  80. phoonnx_train/vits/models.py +732 -0
  81. phoonnx_train/vits/modules.py +527 -0
  82. phoonnx_train/vits/monotonic_align/__init__.py +20 -0
  83. phoonnx_train/vits/monotonic_align/setup.py +13 -0
  84. phoonnx_train/vits/transforms.py +212 -0
  85. phoonnx_train/vits/utils.py +16 -0
  86. phoonnx_train/vits/wavfile.py +860 -0
@@ -0,0 +1,606 @@
+ """multilingual phonemizers"""
+
+ import json
+ import os
+ import subprocess
+ from typing import List, Dict, Optional
+
+ import numpy as np
+ import onnxruntime
+ import requests
+ from phoonnx.config import Alphabet
+ from phoonnx.phonemizers.base import BasePhonemizer
+
+
+ class EspeakError(Exception):
+     """Custom exception for espeak-ng related errors."""
+     pass
+
+
+ class ByT5Phonemizer(BasePhonemizer):
+     """
+     A phonemizer class that uses a ByT5 ONNX model to convert text into phonemes.
+     """
+     MODEL2URL = {
+         "OpenVoiceOS/g2p-mbyt5-12l-ipa-childes-espeak-onnx": "https://huggingface.co/OpenVoiceOS/g2p-mbyt5-12l-ipa-childes-espeak-onnx/resolve/main/fdemelo_g2p-mbyt5-12l-ipa-childes-espeak.onnx",
+         # "OpenVoiceOS/g2p-multilingual-byt5-tiny-8l-ipa-childes-onnx": "https://huggingface.co/OpenVoiceOS/g2p-multilingual-byt5-tiny-8l-ipa-childes-onnx/resolve/main/byt5_g2p_model.onnx"
+     }
+     TOKENIZER_CONFIG_URL = "https://huggingface.co/OpenVoiceOS/g2p-multilingual-byt5-tiny-8l-ipa-childes-onnx/resolve/main/tokenizer_config.json"
+
+     BYT5_LANGS = ['ca-ES', 'cy-GB', 'da-DK', 'de-DE', 'en-GB', 'en-US', 'es-ES', 'et-EE', 'eu-ES', 'fa-IR', 'fr-FR',
+                   'ga-IE', 'hr-HR', 'hu-HU', 'id-ID', 'is-IS', 'it-IT', 'ja-JP', 'ko-KR', 'nb-NO', 'nl-NL', 'pl-PL',
+                   'pt-BR', 'pt-PT', 'qu-PE', 'ro-RO', 'sr-RS', 'sv-SE', 'tr-TR', 'yue-CN', 'zh-CN']
+
+     _LEGACY_MODELS = ["g2p-multilingual-byt5-tiny-8l-ipa-childes-onnx"]
+     _LEGACY_LANGS = ['ca', 'cy', 'da', 'de', 'en-na', 'en-uk', 'es', 'et', 'eu', 'fa', 'fr', 'ga', 'hr', 'hu', 'id', 'is',
+                      'it', 'ja', 'ko', 'nl', 'no', 'pl', 'pt', 'pt-br', 'qu', 'ro', 'sr', 'sv', 'tr', 'zh', 'zh-yue']
+
+     def __init__(self, model: Optional[str] = None, tokenizer_config: Optional[str] = None,
+                  use_cuda=bool(os.environ.get("CUDA", False))):
+         """
+         Initializes the ByT5Phonemizer with the ONNX model and tokenizer configuration.
+         If paths are not provided, it attempts to download them to a local directory.
+
+         Args:
+             model (str, optional): Path to the ONNX model file. If None, it will be downloaded.
+             tokenizer_config (str, optional): Path to the tokenizer configuration JSON file. If None, it will be downloaded.
+             use_cuda (bool, optional): Run inference with the CUDA execution provider. Defaults to True when the CUDA environment variable is set to a non-empty value.
+         """
+         super().__init__(Alphabet.IPA)
+         model = model or "OpenVoiceOS/g2p-mbyt5-12l-ipa-childes-espeak-onnx"
+         # Define the local data path for models and configs
+         data_path = os.path.expanduser("~/.local/share/phoonnx")
+         os.makedirs(data_path, exist_ok=True)  # Ensure the directory exists
+
+         # Determine the actual paths for the model and tokenizer config
+         if model in self.MODEL2URL:
+             base = os.path.join(data_path, model)
+             os.makedirs(base, exist_ok=True)
+             self.onnx_model_path = os.path.join(base, self.MODEL2URL[model].split("/")[-1])
+         else:
+             self.onnx_model_path = model
+
+         if tokenizer_config is None:
+             self.tokenizer_config = os.path.join(data_path, "tokenizer_config.json")
+         else:
+             self.tokenizer_config = tokenizer_config
+
+         # Download the model if it doesn't exist
+         if not os.path.exists(self.onnx_model_path):
+             if model not in self.MODEL2URL:
+                 raise ValueError(f"unknown model: {model}")
+             print(f"Downloading ONNX model from {self.MODEL2URL[model]} to {self.onnx_model_path}...")
+             try:
+                 response = requests.get(self.MODEL2URL[model], stream=True)
+                 response.raise_for_status()  # Raise an exception for HTTP errors
+                 with open(self.onnx_model_path, 'wb') as f:
+                     for chunk in response.iter_content(chunk_size=8192):
+                         f.write(chunk)
+                 print("ONNX model downloaded successfully.")
+             except requests.exceptions.RequestException as e:
+                 raise IOError(f"Failed to download ONNX model: {e}")
+
+         # Download the tokenizer config if it doesn't exist
+         if not os.path.exists(self.tokenizer_config):
+             print(f"Downloading tokenizer config from {self.TOKENIZER_CONFIG_URL} to {self.tokenizer_config}...")
+             try:
+                 response = requests.get(self.TOKENIZER_CONFIG_URL, stream=True)
+                 response.raise_for_status()  # Raise an exception for HTTP errors
+                 with open(self.tokenizer_config, 'wb') as f:
+                     for chunk in response.iter_content(chunk_size=8192):
+                         f.write(chunk)
+                 print("Tokenizer config downloaded successfully.")
+             except requests.exceptions.RequestException as e:
+                 raise IOError(f"Failed to download tokenizer config: {e}")
+
+         if use_cuda:
+             providers = [
+                 (
+                     "CUDAExecutionProvider",
+                     {"cudnn_conv_algo_search": "HEURISTIC"},
+                 )
+             ]
+         else:
+             providers = ["CPUExecutionProvider"]
+         self.session = onnxruntime.InferenceSession(self.onnx_model_path, providers=providers)
+         with open(self.tokenizer_config, "r") as f:
+             self.tokens: Dict[str, int] = json.load(f).get("added_tokens_decoder", {})
+
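+     # Path resolution example (a sketch of the logic above, assuming the default
+     # model key): the ONNX file lands at
+     #   ~/.local/share/phoonnx/OpenVoiceOS/g2p-mbyt5-12l-ipa-childes-espeak-onnx/fdemelo_g2p-mbyt5-12l-ipa-childes-espeak.onnx
+     # and the tokenizer config at ~/.local/share/phoonnx/tokenizer_config.json.
+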
+     @classmethod
+     def get_lang(cls, target_lang: str) -> str:
+         """
+         Validates and returns the closest supported language code.
+
+         Args:
+             target_lang (str): The language code to validate.
+
+         Returns:
+             str: The validated language code.
+
+         Raises:
+             ValueError: If the language code is unsupported.
+         """
+         # Find the closest match
+         return cls.match_lang(target_lang, cls.BYT5_LANGS)
+
+     def _decode_phones(self, preds: List[int]) -> str:
+         """
+         Decodes predicted token IDs back into phonemes.
+
+         Args:
+             preds (list): A list of predicted token IDs from the ONNX model.
+
+         Returns:
+             str: The decoded phoneme string.
+         """
+         # Convert token IDs back to bytes, excluding special/added tokens
+         phone_bytes = [
+             bytes([token - 3]) for token in preds
+             if str(token) not in self.tokens
+         ]
+         # Join bytes and decode to UTF-8, ignoring errors
+         phones = b''.join(phone_bytes).decode("utf-8", errors="ignore")
+         return phones
+
+     @staticmethod
+     def _encode_text(text: str, lang: str) -> np.ndarray:
+         """
+         Encodes input text and language into a numpy array suitable for the model.
+         This function replaces the Hugging Face tokenizer for input preparation.
+
+         Args:
+             text (str): The input text to encode.
+             lang (str): The language code for the text.
+
+         Returns:
+             numpy.ndarray: A numpy array of encoded input IDs.
+         """
+         lang = ByT5Phonemizer.get_lang(lang)  # match lang code
+         # Prepend language tag and encode the string to bytes
+         encoded_bytes = f"<{lang}>: {text}".encode("utf-8")
+         # Convert bytes to a list of integers, adding a shift to account for special tokens
+         # (<pad>, </s>, <unk> are typically 0, 1, 2, so we shift by 3 to avoid collision)
+         model_inputs = np.array([[byte + 3 for byte in encoded_bytes]], dtype=np.int64)
+         return model_inputs
+
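+     # Worked example of the +3 shift (illustrative only, not called anywhere):
+     # _encode_text("hi", "en-US") tags and encodes "<en-US>: hi"; the first
+     # UTF-8 bytes 60 ("<"), 101 ("e"), 110 ("n") become IDs 63, 104, 113, and
+     # _decode_phones reverses it, mapping a predicted ID of 108 back to byte
+     # 105, i.e. "i".
+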
+     def _infer_onnx(self, text: str, lang: str) -> str:
+         """
+         Performs inference using ONNX Runtime without relying on a Hugging Face tokenizer.
+
+         Args:
+             text (str): The input text for G2P conversion.
+             lang (str): The language of the input text.
+
+         Returns:
+             str: The predicted phoneme string. Returns an empty string if the input text is empty.
+         """
+         if not text.strip():
+             return ""
+
+         # Get the names of the model's output tensors
+         onnx_output_names: List[str] = [out.name for out in self.session.get_outputs()]
+
+         # Use the custom _encode_text function to prepare input_ids
+         input_ids_np: np.ndarray = self._encode_text(text, lang)
+
+         # Manually create attention_mask (all ones for ByT5, indicating all tokens are attended to)
+         attention_mask_np: np.ndarray = np.ones_like(input_ids_np, dtype=np.int64)
+
+         # Hardcode decoder_start_token_id for ByT5 (typically 0 for pad_token_id)
+         # This is the initial token fed to the decoder to start generation.
+         decoder_start_token_id: int = 0  # Corresponds to <pad> for ByT5
+
+         generated_ids: List[int] = []
+         # Initialize the decoder input with the start token
+         decoder_input_ids_np: np.ndarray = np.array([[decoder_start_token_id]], dtype=np.int64)
+
+         max_length: int = 512  # Maximum length for the generated sequence
+
+         # Greedy decoding loop
+         for _ in range(max_length):
+             # Prepare inputs for the ONNX session
+             onnx_inputs: Dict[str, np.ndarray] = {
+                 "input_ids": input_ids_np,
+                 "attention_mask": attention_mask_np,
+                 "decoder_input_ids": decoder_input_ids_np
+             }
+
+             # Run inference
+             outputs: List[np.ndarray] = self.session.run(onnx_output_names, onnx_inputs)
+             logits: np.ndarray = outputs[0]  # Get the logits from the model output
+
+             # Get the logits for the last token in the sequence
+             next_token_logits: np.ndarray = logits[0, -1, :]
+             # Predict the next token by taking the argmax of the logits
+             next_token_id: int = np.argmax(next_token_logits).item()  # .item() to get a Python scalar
+             generated_ids.append(next_token_id)
+
+             # Assuming EOS token ID for ByT5 is 1 (corresponds to </s>)
+             # This is a common convention for T5 models.
+             eos_token_id: int = 1
+             # If the EOS token is generated, stop decoding
+             if next_token_id == eos_token_id:
+                 break
+
+             # Append the newly generated token to the decoder input for the next step
+             decoder_input_ids_np = np.concatenate((decoder_input_ids_np,
+                                                    np.array([[next_token_id]],
+                                                             dtype=np.int64)),
+                                                   axis=1)
+
+         # Decode the generated token IDs into phonemes
+         return self._decode_phones(generated_ids)
+
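+     # Shape of the greedy loop above: decoder_input_ids grows [[0]] -> [[0, t1]]
+     # -> [[0, t1, t2]] -> ... until </s> (ID 1) appears or max_length is hit.
+     # The full decoder is re-run at every step (no KV cache), so cost grows
+     # quadratically with output length.
+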
+     def phonemize_string(self, text: str, lang: str) -> str:
+         return self._infer_onnx(text, lang)
+
+
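+ # Minimal usage sketch (downloads the default model on first run):
+ #   g2p = ByT5Phonemizer()
+ #   ipa = g2p.phonemize_string("hello world", "en-US")
+
+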
+ class CharsiuPhonemizer(ByT5Phonemizer):
+     """
+     A phonemizer class that uses a Charsiu ByT5 ONNX model to convert text into phonemes.
+     """
+     # TODO - more models
+     MODEL2URL = {
+         "Jarbas/charsiu_g2p_multilingual_byT5_tiny_16_layers_100_onnx": "https://huggingface.co/Jarbas/charsiu_g2p_multilingual_byT5_tiny_16_layers_100_onnx/resolve/main/charsiu_g2p_multilingual_byT5_tiny_16_layers_100.onnx"
+     }
+     BYT5_LANGS = ['ady', 'afr', 'sqi', 'amh', 'ara', 'arg', 'arm-e', 'arm-w', 'aze', 'bak', 'eus', 'bel', 'ben', 'bos',
+                   'bul', 'bur', 'cat', 'yue', 'zho-t', 'zho-s', 'min', 'cze', 'dan', 'dut', 'eng-uk', 'eng-us', 'epo',
+                   'est', 'fin', 'fra', 'fra-qu', 'gla', 'geo', 'ger', 'gre', 'grc', 'grn', 'guj', 'hin', 'hun', 'ido',
+                   'ind', 'ina', 'ita', 'jam', 'jpn', 'kaz', 'khm', 'kor', 'kur', 'lat-clas', 'lat-eccl', 'lit', 'ltz',
+                   'mac', 'mlt', 'tts', 'nob', 'ori', 'pap', 'fas', 'pol', 'por-po', 'por-bz', 'ron', 'rus', 'san',
+                   'srp', 'hbs-latn', 'hbs-cyrl', 'snd', 'slo', 'slv', 'spa', 'spa-latin', 'spa-me', 'swa', 'swe', 'tgl',
+                   'tam', 'tat', 'tha', 'tur', 'tuk', 'ukr', 'vie-n', 'vie-c', 'vie-s', 'wel-nw', 'wel-sw', 'ice', 'ang',
+                   'gle', 'enm', 'syc', 'glg', 'sme', 'egy']
+
+     def __init__(self, model: Optional[str] = None, tokenizer_config: Optional[str] = None,
+                  use_cuda=bool(os.environ.get("CUDA", False))):
+         """
+         Initializes the CharsiuPhonemizer with the ONNX model and tokenizer configuration.
+         If paths are not provided, it attempts to download them to a local directory.
+
+         Args:
+             model (str, optional): Path to the ONNX model file. If None, it will be downloaded.
+             tokenizer_config (str, optional): Path to the tokenizer configuration JSON file. If None, it will be downloaded.
+         """
+         model = model or "Jarbas/charsiu_g2p_multilingual_byT5_tiny_16_layers_100_onnx"
+         super().__init__(model, tokenizer_config, use_cuda)
+
+     @classmethod
+     def get_lang(cls, target_lang: str) -> str:
+         """
+         Validates and returns the closest supported language code.
+
+         Args:
+             target_lang (str): The language code to validate.
+
+         Returns:
+             str: The validated language code.
+
+         Raises:
+             ValueError: If the language code is unsupported.
+         """
+         # Find the closest match
+         return cls.match_lang(target_lang, cls.BYT5_LANGS)
+
+     def phonemize_string(self, text: str, lang: str) -> str:
+         # Charsiu models can't handle whitespace, so text must be phonemized word by word
+         return " ".join([self._infer_onnx(w, lang) for w in text.split()])
+
+
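+ # e.g. phonemize_string("hello world", "eng-us") issues two _infer_onnx calls,
+ # one per word, and rejoins the per-word phoneme strings with single spaces.
+
+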
+ class EspeakPhonemizer(BasePhonemizer):
+     """
+     A phonemizer class that uses the espeak-ng command-line tool to convert text into phonemes.
+     It segments the input text heuristically based on punctuation to mimic clause-by-clause processing.
+     """
+     ESPEAK_LANGS = ['es-419', 'ca', 'qya', 'ga', 'et', 'ky', 'io', 'fa-latn', 'en-gb', 'fo', 'haw', 'kl',
+                     'ta', 'ml', 'gd', 'sd', 'es', 'hy', 'ur', 'ro', 'hi', 'or', 'ti', 'ca-va', 'om', 'tr', 'pa',
+                     'smj', 'mk', 'bg', 'cv', 'fr', 'fi', 'en-gb-x-rp', 'ru', 'mt', 'an', 'mr', 'pap', 'vi', 'id',
+                     'fr-be', 'ltg', 'my', 'nl', 'shn', 'ba', 'az', 'cmn', 'da', 'as', 'sw',
+                     'piqd', 'en-us', 'hr', 'it', 'ug', 'th', 'mi', 'cy', 'ru-lv', 'ia', 'tt', 'hu', 'xex', 'te', 'ne',
+                     'eu', 'ja', 'bpy', 'hak', 'cs', 'en-gb-scotland', 'hyw', 'uk', 'pt', 'bn', 'mto', 'yue',
+                     'be', 'gu', 'sv', 'sl', 'cmn-latn-pinyin', 'lfn', 'lv', 'fa', 'sjn', 'nog', 'ms',
+                     'vi-vn-x-central', 'lt', 'kn', 'he', 'qu', 'ca-ba', 'quc', 'nb', 'sk', 'tn', 'py', 'si', 'de',
+                     'ar', 'en-gb-x-gbcwmd', 'bs', 'qdb', 'sq', 'sr', 'tk', 'en-029', 'ht', 'ru-cl', 'af', 'pt-br',
+                     'fr-ch', 'ka', 'en-gb-x-gbclan', 'ko', 'is', 'ca-nw', 'gn', 'kok', 'la', 'lb', 'am', 'kk', 'ku',
+                     'kaa', 'jbo', 'eo', 'uz', 'nci', 'vi-vn-x-south', 'el', 'pl', 'grc']
+
+     def __init__(self):
+         super().__init__(Alphabet.IPA)
+
+     @classmethod
+     def get_lang(cls, target_lang: str) -> str:
+         """
+         Validates and returns the closest supported language code.
+
+         Args:
+             target_lang (str): The language code to validate.
+
+         Returns:
+             str: The validated language code.
+
+         Raises:
+             ValueError: If the language code is unsupported.
+         """
+         if target_lang.lower() == "en-gb":
+             return "en-gb-x-rp"
+         if target_lang in cls.ESPEAK_LANGS:
+             return target_lang
+         if target_lang.lower().split("-")[0] in cls.ESPEAK_LANGS:
+             return target_lang.lower().split("-")[0]
+         return cls.match_lang(target_lang, cls.ESPEAK_LANGS)
+
+     @staticmethod
+     def _run_espeak_command(args: List[str], input_text: Optional[str] = None, check: bool = True) -> str:
+         """
+         Helper function to run espeak-ng commands via subprocess.
+         Executes 'espeak-ng' with the given arguments and input text.
+         Captures stdout and stderr, and raises EspeakError on failure.
+
+         Args:
+             args (List[str]): A list of command-line arguments for espeak-ng.
+             input_text (str, optional): The text to pass to espeak-ng's stdin. Defaults to None.
+             check (bool, optional): If True, raises a CalledProcessError if the command returns a non-zero exit code. Defaults to True.
+
+         Returns:
+             str: The stripped standard output from the espeak-ng command.
+
+         Raises:
+             EspeakError: If the espeak-ng command is not found, or if the subprocess call fails.
+         """
+         command: List[str] = ['espeak-ng'] + args
+         try:
+             process: subprocess.CompletedProcess = subprocess.run(
+                 command,
+                 input=input_text,
+                 capture_output=True,
+                 text=True,
+                 check=check,
+                 encoding='utf-8',
+                 errors='replace'  # Replaces unencodable characters with a placeholder
+             )
+             return process.stdout.strip()
+         except FileNotFoundError:
+             raise EspeakError(
+                 "espeak-ng command not found. Please ensure espeak-ng is installed "
+                 "and available in your system's PATH."
+             )
+         except subprocess.CalledProcessError as e:
+             raise EspeakError(
+                 f"espeak-ng command failed with error code {e.returncode}:\n"
+                 f"STDOUT: {e.stdout}\n"
+                 f"STDERR: {e.stderr}"
+             )
+         except Exception as e:
+             raise EspeakError(f"An unexpected error occurred while running espeak-ng: {e}")
+
+     def phonemize_string(self, text: str, lang: str) -> str:
+         lang = self.get_lang(lang)
+         return self._run_espeak_command(
+             ['-q', '-x', '--ipa', '-v', lang],
+             input_text=text
+         )
+
+
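+ # The phonemize_string call above is equivalent to piping text on stdin, e.g.:
+ #   echo "Hello, world." | espeak-ng -q -x --ipa -v en-gb-x-rp
+ # (-q suppresses audio output; -x/--ipa print phonemes instead of speaking)
+
+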
+ class GruutPhonemizer(BasePhonemizer):
+     """
+     A phonemizer class that uses the Gruut library to convert text into phonemes.
+     Note: Gruut's internal segmentation is sentence-based.
+     """
+     GRUUT_LANGS = ["en", "ar", "ca", "cs", "de", "es", "fa", "fr", "it",
+                    "lb", "nl", "pt", "ru", "sv", "sw"]
+
+     def __init__(self):
+         super().__init__(Alphabet.IPA)
+
+     @classmethod
+     def get_lang(cls, target_lang: str) -> str:
+         """
+         Validates and returns the closest supported language code.
+
+         Args:
+             target_lang (str): The language code to validate.
+
+         Returns:
+             str: The validated language code.
+
+         Raises:
+             ValueError: If the language code is unsupported.
+         """
+         return cls.match_lang(target_lang, cls.GRUUT_LANGS)
+
+     def _text_to_phonemes(self, text: str, lang: Optional[str] = None):
+         """
+         Generates phonemes for text using Gruut's sentence processing.
+         Yields lists of word phonemes for each sentence.
+         """
+         lang = self.get_lang(lang)
+         import gruut
+         for sentence in gruut.sentences(text, lang=lang):
+             sent_phonemes = [w.phonemes for w in sentence if w.phonemes]
+             if sentence and not sent_phonemes:
+                 raise RuntimeError(f"did you install gruut[{lang}] ?")
+             # Replace Gruut's sentence-final break with the sentence's own punctuation
+             if sentence.text.endswith("?"):
+                 sent_phonemes[-1] = ["?"]
+             elif sentence.text.endswith("!"):
+                 sent_phonemes[-1] = ["!"]
+             elif sentence.text.endswith(".") or sent_phonemes[-1] == ["‖"]:
+                 sent_phonemes[-1] = ["."]
+             if sent_phonemes:
+                 yield sent_phonemes
+
+     def phonemize_string(self, text: str, lang: str) -> str:
+         pho = ""
+         for sent_phonemes in self._text_to_phonemes(text, lang):
+             pho += " ".join(["".join(w) for w in sent_phonemes]) + " "
+         return pho.strip()
+
+
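+ # e.g. for a sentence ending in "?", the final break token Gruut yields
+ # (typically ["‖"]) is overwritten with ["?"], so the joined phoneme string
+ # keeps the sentence punctuation.
+
+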
+ class EpitranPhonemizer(BasePhonemizer):
+     """
+     A phonemizer class that uses the Epitran library to convert text into phonemes.
+     """
+     EPITRAN_LANGS = ['hsn-Latn', 'ful-Latn', 'jpn-Ktkn-red', 'tel-Telu', 'nld-Latn', 'aze-Latn', 'amh-Ethi-pp',
+                      'msa-Latn', 'spa-Latn-eu', 'ori-Orya', 'bxk-Latn', 'spa-Latn', 'kir-Cyrl', 'lij-Latn', 'kin-Latn',
+                      'ces-Latn', 'sin-Sinh', 'urd-Arab', 'vie-Latn', 'gan-Latn', 'fra-Latn', 'nan-Latn', 'kaz-Latn',
+                      'swe-Latn', 'jpn-Ktkn', 'tam-Taml', 'sag-Latn', 'csb-Latn', 'pii-latn_Holopainen2019', 'yue-Latn',
+                      'got-Latn', 'tur-Latn', 'aar-Latn', 'jav-Latn', 'ita-Latn', 'sna-Latn', 'ilo-Latn', 'tam-Taml-red',
+                      'kmr-Latn-red', 'uzb-Cyrl', 'amh-Ethi', 'mya-Mymr', 'aii-Syrc', 'lit-Latn', 'kmr-Latn',
+                      'hat-Latn-bab', 'ltc-Latn-bax', 'Goth2Latn', 'quy-Latn', 'hau-Latn', 'ood-Latn-alv', 'vie-Latn-so',
+                      'run-Latn', 'orm-Latn', 'ind-Latn', 'kir-Latn', 'mal-Mlym', 'ben-Beng-red', 'hun-Latn', 'uew',
+                      'sqi-Latn', 'jpn-Hrgn', 'deu-Latn-np', 'xho-Latn', 'fra-Latn-rev', 'fra-Latn-np', 'kaz-Cyrl-bab',
+                      'jpn-Hrgn-red', 'Latn2Goth', 'glg-Latn', 'uig-Arab', 'amh-Ethi-red', 'zul-Latn', 'hin-Deva',
+                      'uzb-Latn', 'tir-Ethi-red', 'kaz-Cyrl', 'mlt-Latn', 'deu-Latn-nar', 'est-Latn', 'eng-Latn',
+                      'pii-latn_Wiktionary', 'ckb-Arab', 'nya-Latn', 'mon-Cyrl-bab', 'fra-Latn-p', 'ood-Latn-sax',
+                      'ukr-Cyrl', 'tgl-Latn-red', 'lsm-Latn', 'kor-Hang', 'lav-Latn', 'generic-Latn', 'tur-Latn-red',
+                      'srp-Latn', 'tir-Ethi', 'kbd-Cyrl', 'hrv-Latn', 'srp-Cyrl', 'tpi-Latn', 'khm-Khmr', 'jam-Latn',
+                      'ben-Beng-east', 'por-Latn', 'cmn-Latn', 'cat-Latn', 'tha-Thai', 'ara-Arab', 'ben-Beng',
+                      'fin-Latn', 'hmn-Latn', 'lez-Cyrl', 'fas-Arab', 'lao-Laoo-prereform', 'mar-Deva', 'yor-Latn',
+                      'ron-Latn', 'tgl-Latn', 'lao-Laoo', 'deu-Latn', 'pan-Guru', 'tuk-Latn', 'tir-Ethi-pp', 'rus-Cyrl',
+                      'swa-Latn-red', 'ceb-Latn', 'wuu-Latn', 'hak-Latn', 'mri-Latn', 'epo-Latn', 'pol-Latn',
+                      'tur-Latn-bab', 'kat-Geor', 'tgk-Cyrl', 'aze-Cyrl', 'vie-Latn-ce', 'swa-Latn', 'tuk-Cyrl',
+                      'vie-Latn-no', 'nan-Latn-tl', 'zha-Latn', 'cjy-Latn', 'ava-Cyrl', 'som-Latn', 'kir-Arab']
+
+     def __init__(self):
+         super().__init__(Alphabet.IPA)
+         import epitran
+         self.epitran = epitran
+         self._epis: Dict[str, epitran.Epitran] = {}
+
+     @classmethod
+     def get_lang(cls, target_lang: str) -> str:
+         """
+         Validates and returns the closest supported language code.
+
+         Args:
+             target_lang (str): The language code to validate.
+
+         Returns:
+             str: The validated language code.
+
+         Raises:
+             ValueError: If the language code is unsupported.
+         """
+         return cls.match_lang(target_lang, cls.EPITRAN_LANGS)
+
+     def phonemize_string(self, text: str, lang: str) -> str:
+         lang = self.get_lang(lang)
+         # Cache one Epitran instance per language code
+         epi = self._epis.get(lang)
+         if epi is None:
+             epi = self.epitran.Epitran(lang)
+             self._epis[lang] = epi
+         return epi.transliterate(text)
+
+
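+ # Epitran instances are cached per language code: the first call for, say,
+ # "spa-Latn" constructs Epitran("spa-Latn"); subsequent calls reuse it.
+
+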
+ class MisakiPhonemizer(BasePhonemizer):
+     """
+     https://github.com/hexgrad/misaki
+     """
+     MISAKI_LANGS = ['en-US', 'en-GB', 'ko', 'ja', 'vi', 'zh']
+
+     def __init__(self):
+         super().__init__(Alphabet.IPA)
+         self.g2p_en = self.g2p_zh = self.g2p_ko = self.g2p_vi = self.g2p_ja = None
+
+     @classmethod
+     def get_lang(cls, target_lang: str) -> str:
+         """
+         Validates and returns the closest supported language code.
+
+         Args:
+             target_lang (str): The language code to validate.
+
+         Returns:
+             str: The validated language code.
+
+         Raises:
+             ValueError: If the language code is unsupported.
+         """
+         return cls.match_lang(target_lang, cls.MISAKI_LANGS)
+
+     def _get_phonemizer(self, lang: str):
+         """Lazily load the language-specific phonemizer on first use.
+         NOTE: this can be slow.
+         """
+         lang = self.get_lang(lang)
+
+         if lang == "zh":
+             if self.g2p_zh is None:
+                 from misaki.zh import ZHG2P
+                 self.g2p_zh = ZHG2P()
+             return self.g2p_zh
+         elif lang == "ko":
+             if self.g2p_ko is None:
+                 from misaki.ko import KOG2P
+                 self.g2p_ko = KOG2P()
+             return self.g2p_ko
+         elif lang == "vi":
+             if self.g2p_vi is None:
+                 from misaki.vi import VIG2P
+                 self.g2p_vi = VIG2P()
+             return self.g2p_vi
+         elif lang == "ja":
+             if self.g2p_ja is None:
+                 from misaki.ja import JAG2P
+                 self.g2p_ja = JAG2P()
+             return self.g2p_ja
+         else:
+             if self.g2p_en is None:
+                 from misaki import en
+                 self.g2p_en = en.G2P()
+             if lang == "en-GB":
+                 self.g2p_en.british = True
+             elif lang == "en-US":
+                 self.g2p_en.british = False
+             return self.g2p_en
+
+     def phonemize_string(self, text: str, lang: str) -> str:
+         pho = self._get_phonemizer(lang)
+         # Misaki G2P callables return a (phonemes, tokens) tuple; keep the phonemes
+         phonemes, tokens = pho(text)
+         return phonemes
+
+
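+ # Each Misaki backend pulls in its own optional dependencies (the English G2P,
+ # for instance, is typically installed as `pip install misaki[en]`), which is
+ # why the imports above are deferred until a language is first requested.
+
+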
+ if __name__ == "__main__":
+     # for comparison
+
+     byt5 = ByT5Phonemizer()
+     espeak = EspeakPhonemizer()
+     gruut = GruutPhonemizer()
+     epitr = EpitranPhonemizer()
+     charsiu = CharsiuPhonemizer()
+     misaki = MisakiPhonemizer()
+
+     lang = "en-gb"
+
+     text1 = "Hello, world. How are you?"
+
+     print("\n--- Getting phonemes for 'Hello, world. How are you?' ---")
+     phonemes1 = espeak.phonemize(text1, lang)
+     phonemes1b = gruut.phonemize(text1, lang)
+     phonemes1c = byt5.phonemize(text1, lang)
+     phonemes1d = epitr.phonemize(text1, lang)
+     phonemes1e = charsiu.phonemize(text1, lang)
+     phonemes1f = misaki.phonemize(text1, lang)
+     print(f" Espeak Phonemes: {phonemes1}")
+     print(f" Gruut Phonemes: {phonemes1b}")
+     print(f" byt5 Phonemes: {phonemes1c}")
+     print(f" Epitran Phonemes: {phonemes1d}")
+     print(f" Charsiu Phonemes: {phonemes1e}")
+     print(f" Misaki Phonemes: {phonemes1f}")
+
+     lang = "nl"
+     # alternative test sentence, kept for reference:
+     # sentence = "DJ's en bezoekers van Tomorrowland waren woensdagavond dolblij toen het paradepaardje van het festival alsnog opende in Oostenrijk op de Mainstage.\nWant het optreden van Metallica, waar iedereen zo blij mee was, zou hoe dan ook doorgaan, aldus de DJ die het nieuws aankondigde."
+     sentence = "Een regenboog is een gekleurde cirkelboog die aan de hemel waargenomen kan worden als de, laagstaande, zon tegen een nevel van waterdruppeltjes aan schijnt en de zon zich achter de waarnemer bevindt. Het is een optisch effect dat wordt veroorzaakt door de breking en weerspiegeling van licht in de waterdruppels."
+     print(f"\n--- Getting phonemes for '{sentence}' ---")
+     text1 = sentence
+     phonemes1 = espeak.phonemize(text1, lang)
+     phonemes1b = gruut.phonemize(text1, lang)
+     phonemes1c = byt5.phonemize(text1, lang)
+     phonemes1d = epitr.phonemize(text1, lang)
+     phonemes1e = charsiu.phonemize(text1, lang)
+     print(f" Espeak Phonemes: {phonemes1}")
+     print(f" Gruut Phonemes: {phonemes1b}")
+     print(f" byt5 Phonemes: {phonemes1c}")
+     print(f" Epitran Phonemes: {phonemes1d}")
+     print(f" Charsiu Phonemes: {phonemes1e}")
@@ -0,0 +1,44 @@
+ from phoonnx.phonemizers.base import BasePhonemizer
+ from phoonnx.config import Alphabet
+
+
+ class VIPhonemePhonemizer(BasePhonemizer):
+     """https://github.com/v-nhandt21/Viphoneme"""
+
+     def __init__(self):
+         from viphoneme import vi2IPA
+         self.g2p = vi2IPA
+         super().__init__(Alphabet.IPA)
+
+     @classmethod
+     def get_lang(cls, target_lang: str) -> str:
+         """
+         Validates and returns the closest supported language code.
+
+         Args:
+             target_lang (str): The language code to validate.
+
+         Returns:
+             str: The validated language code.
+
+         Raises:
+             ValueError: If the language code is unsupported.
+         """
+         # this check is here only to throw an exception if an invalid language is provided
+         return cls.match_lang(target_lang, ["vi"])
+
+     def phonemize_string(self, text: str, lang: str = "vi") -> str:
+         """Convert Vietnamese text into an IPA phoneme string via viphoneme's vi2IPA."""
+         lang = self.get_lang(lang)
+         return self.g2p(text)
+
+
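+ # Assumes the viphoneme package is installed (https://github.com/v-nhandt21/Viphoneme);
+ # vi2IPA also handles mixed Vietnamese/English input, as in the demo below.
+
+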
+ if __name__ == "__main__":
+     text = "Được viết vào 6/4/2020, có thể xử lí những trường hợp chứa English"
+
+     pho = VIPhonemePhonemizer()
+     lang = "vi"
+
+     print(f"\n--- Getting phonemes for '{text}' ---")
+     phonemes = pho.phonemize(text, lang)
+     print(f" Phonemes: {phonemes}")