phoonnx-0.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phoonnx/__init__.py +0 -0
- phoonnx/config.py +490 -0
- phoonnx/locale/ca/phonetic_spellings.txt +2 -0
- phoonnx/locale/en/phonetic_spellings.txt +1 -0
- phoonnx/locale/gl/phonetic_spellings.txt +2 -0
- phoonnx/locale/pt/phonetic_spellings.txt +2 -0
- phoonnx/phoneme_ids.py +453 -0
- phoonnx/phonemizers/__init__.py +45 -0
- phoonnx/phonemizers/ar.py +42 -0
- phoonnx/phonemizers/base.py +216 -0
- phoonnx/phonemizers/en.py +250 -0
- phoonnx/phonemizers/fa.py +46 -0
- phoonnx/phonemizers/gl.py +142 -0
- phoonnx/phonemizers/he.py +67 -0
- phoonnx/phonemizers/ja.py +119 -0
- phoonnx/phonemizers/ko.py +97 -0
- phoonnx/phonemizers/mul.py +606 -0
- phoonnx/phonemizers/vi.py +44 -0
- phoonnx/phonemizers/zh.py +308 -0
- phoonnx/thirdparty/__init__.py +0 -0
- phoonnx/thirdparty/arpa2ipa.py +249 -0
- phoonnx/thirdparty/cotovia/cotovia_aarch64 +0 -0
- phoonnx/thirdparty/cotovia/cotovia_x86_64 +0 -0
- phoonnx/thirdparty/hangul2ipa.py +783 -0
- phoonnx/thirdparty/ko_tables/aspiration.csv +20 -0
- phoonnx/thirdparty/ko_tables/assimilation.csv +31 -0
- phoonnx/thirdparty/ko_tables/double_coda.csv +17 -0
- phoonnx/thirdparty/ko_tables/hanja.tsv +8525 -0
- phoonnx/thirdparty/ko_tables/ipa.csv +22 -0
- phoonnx/thirdparty/ko_tables/neutralization.csv +11 -0
- phoonnx/thirdparty/ko_tables/tensification.csv +56 -0
- phoonnx/thirdparty/ko_tables/yale.csv +22 -0
- phoonnx/thirdparty/kog2p/__init__.py +385 -0
- phoonnx/thirdparty/kog2p/rulebook.txt +212 -0
- phoonnx/thirdparty/mantoq/__init__.py +67 -0
- phoonnx/thirdparty/mantoq/buck/__init__.py +0 -0
- phoonnx/thirdparty/mantoq/buck/phonetise_buckwalter.py +569 -0
- phoonnx/thirdparty/mantoq/buck/symbols.py +64 -0
- phoonnx/thirdparty/mantoq/buck/tokenization.py +105 -0
- phoonnx/thirdparty/mantoq/num2words.py +37 -0
- phoonnx/thirdparty/mantoq/pyarabic/__init__.py +12 -0
- phoonnx/thirdparty/mantoq/pyarabic/arabrepr.py +64 -0
- phoonnx/thirdparty/mantoq/pyarabic/araby.py +1647 -0
- phoonnx/thirdparty/mantoq/pyarabic/named_const.py +227 -0
- phoonnx/thirdparty/mantoq/pyarabic/normalize.py +161 -0
- phoonnx/thirdparty/mantoq/pyarabic/number.py +826 -0
- phoonnx/thirdparty/mantoq/pyarabic/number_const.py +1704 -0
- phoonnx/thirdparty/mantoq/pyarabic/stack.py +52 -0
- phoonnx/thirdparty/mantoq/pyarabic/trans.py +517 -0
- phoonnx/thirdparty/mantoq/unicode_symbol2label.py +4173 -0
- phoonnx/thirdparty/tashkeel/LICENSE +22 -0
- phoonnx/thirdparty/tashkeel/SOURCE +1 -0
- phoonnx/thirdparty/tashkeel/__init__.py +212 -0
- phoonnx/thirdparty/tashkeel/hint_id_map.json +18 -0
- phoonnx/thirdparty/tashkeel/input_id_map.json +56 -0
- phoonnx/thirdparty/tashkeel/model.onnx +0 -0
- phoonnx/thirdparty/tashkeel/target_id_map.json +17 -0
- phoonnx/thirdparty/zh_num.py +238 -0
- phoonnx/util.py +705 -0
- phoonnx/version.py +6 -0
- phoonnx/voice.py +521 -0
- phoonnx-0.0.0.dist-info/METADATA +255 -0
- phoonnx-0.0.0.dist-info/RECORD +86 -0
- phoonnx-0.0.0.dist-info/WHEEL +5 -0
- phoonnx-0.0.0.dist-info/top_level.txt +2 -0
- phoonnx_train/__main__.py +151 -0
- phoonnx_train/export_onnx.py +109 -0
- phoonnx_train/norm_audio/__init__.py +92 -0
- phoonnx_train/norm_audio/trim.py +54 -0
- phoonnx_train/norm_audio/vad.py +54 -0
- phoonnx_train/preprocess.py +420 -0
- phoonnx_train/vits/__init__.py +0 -0
- phoonnx_train/vits/attentions.py +427 -0
- phoonnx_train/vits/commons.py +147 -0
- phoonnx_train/vits/config.py +330 -0
- phoonnx_train/vits/dataset.py +214 -0
- phoonnx_train/vits/lightning.py +352 -0
- phoonnx_train/vits/losses.py +58 -0
- phoonnx_train/vits/mel_processing.py +139 -0
- phoonnx_train/vits/models.py +732 -0
- phoonnx_train/vits/modules.py +527 -0
- phoonnx_train/vits/monotonic_align/__init__.py +20 -0
- phoonnx_train/vits/monotonic_align/setup.py +13 -0
- phoonnx_train/vits/transforms.py +212 -0
- phoonnx_train/vits/utils.py +16 -0
- phoonnx_train/vits/wavfile.py +860 -0
phoonnx/phonemizers/mul.py
@@ -0,0 +1,606 @@
"""multilingual phonemizers"""

import json
import os
import subprocess
from typing import List, Dict, Optional

import numpy as np
import onnxruntime
import requests
from phoonnx.config import Alphabet
from phoonnx.phonemizers.base import BasePhonemizer


class EspeakError(Exception):
    """Custom exception for espeak-ng related errors."""
    pass

class ByT5Phonemizer(BasePhonemizer):
    """
    A phonemizer class that uses a ByT5 ONNX model to convert text into phonemes.
    """
    MODEL2URL = {
        "OpenVoiceOS/g2p-mbyt5-12l-ipa-childes-espeak-onnx": "https://huggingface.co/OpenVoiceOS/g2p-mbyt5-12l-ipa-childes-espeak-onnx/resolve/main/fdemelo_g2p-mbyt5-12l-ipa-childes-espeak.onnx",
        # "OpenVoiceOS/g2p-multilingual-byt5-tiny-8l-ipa-childes-onnx": "https://huggingface.co/OpenVoiceOS/g2p-multilingual-byt5-tiny-8l-ipa-childes-onnx/resolve/main/byt5_g2p_model.onnx"
    }
    TOKENIZER_CONFIG_URL = "https://huggingface.co/OpenVoiceOS/g2p-multilingual-byt5-tiny-8l-ipa-childes-onnx/resolve/main/tokenizer_config.json"

    BYT5_LANGS = ['ca-ES', 'cy-GB', 'da-DK', 'de-DE', 'en-GB', 'en-US', 'es-ES', 'et-EE', 'eu-ES', 'fa-IR', 'fr-FR',
                  'ga-IE', 'hr-HR', 'hu-HU', 'id-ID', 'is-IS', 'it-IT', 'ja-JP', 'ko-KR', 'nb-NO', 'nl-NL', 'pl-PL',
                  'pt-BR', 'pt-PT', 'qu-PE', 'ro-RO', 'sr-RS', 'sv-SE', 'tr-TR', 'yue-CN', 'zh-CN']

    _LEGACY_MODELS = ["g2p-multilingual-byt5-tiny-8l-ipa-childes-onnx"]
    _LEGACY_LANGS = ['ca', 'cy', 'da', 'de', 'en-na', 'en-uk', 'es', 'et', 'eu', 'fa', 'fr', 'ga', 'hr', 'hu', 'id', 'is',
                     'it', 'ja', 'ko', 'nl', 'no', 'pl', 'pt', 'pt-br', 'qu', 'ro', 'sr', 'sv', 'tr', 'zh', 'zh-yue']

    def __init__(self, model: Optional[str] = None, tokenizer_config: Optional[str] = None,
                 use_cuda=bool(os.environ.get("CUDA", False))):
        """
        Initializes the ByT5Phonemizer with the ONNX model and tokenizer configuration.
        If paths are not provided, it attempts to download them to a local directory.

        Args:
            model (str, optional): Path to the ONNX model file. If None, it will be downloaded.
            tokenizer_config (str, optional): Path to the tokenizer configuration JSON file. If None, it will be downloaded.
        """
        super().__init__(Alphabet.IPA)
        model = model or "OpenVoiceOS/g2p-mbyt5-12l-ipa-childes-espeak-onnx"
        # Define the local data path for models and configs
        data_path = os.path.expanduser("~/.local/share/phoonnx")
        os.makedirs(data_path, exist_ok=True)  # Ensure the directory exists

        # Determine the actual paths for the model and tokenizer config
        if model in self.MODEL2URL:
            base = os.path.join(data_path, model)
            os.makedirs(base, exist_ok=True)
            self.onnx_model_path = os.path.join(base, self.MODEL2URL[model].split("/")[-1])
        else:
            self.onnx_model_path = model

        if tokenizer_config is None:
            self.tokenizer_config = os.path.join(data_path, "tokenizer_config.json")
        else:
            self.tokenizer_config = tokenizer_config

        # Download model if it doesn't exist
        if not os.path.exists(self.onnx_model_path):
            if model not in self.MODEL2URL:
                raise ValueError("unknown model")
            print(f"Downloading ONNX model from {self.MODEL2URL[model]} to {self.onnx_model_path}...")
            try:
                response = requests.get(self.MODEL2URL[model], stream=True)
                response.raise_for_status()  # Raise an exception for HTTP errors
                with open(self.onnx_model_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                print("ONNX model downloaded successfully.")
            except requests.exceptions.RequestException as e:
                raise IOError(f"Failed to download ONNX model: {e}")

        # Download tokenizer config if it doesn't exist
        if not os.path.exists(self.tokenizer_config):
            print(f"Downloading tokenizer config from {self.TOKENIZER_CONFIG_URL} to {self.tokenizer_config}...")
            try:
                response = requests.get(self.TOKENIZER_CONFIG_URL, stream=True)
                response.raise_for_status()  # Raise an exception for HTTP errors
                with open(self.tokenizer_config, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                print("Tokenizer config downloaded successfully.")
            except requests.exceptions.RequestException as e:
                raise IOError(f"Failed to download tokenizer config: {e}")

        if use_cuda:
            providers = [
                (
                    "CUDAExecutionProvider",
                    {"cudnn_conv_algo_search": "HEURISTIC"},
                )
            ]
            # LOG.debug("Using CUDA")
        else:
            providers = ["CPUExecutionProvider"]
        self.session = onnxruntime.InferenceSession(self.onnx_model_path, providers=providers)
        with open(self.tokenizer_config, "r") as f:
            self.tokens: Dict[str, int] = json.load(f).get("added_tokens_decoder", {})

    @classmethod
    def get_lang(cls, target_lang: str) -> str:
        """
        Validates and returns the closest supported language code.

        Args:
            target_lang (str): The language code to validate.

        Returns:
            str: The validated language code.

        Raises:
            ValueError: If the language code is unsupported.
        """
        # Find the closest match
        return cls.match_lang(target_lang, cls.BYT5_LANGS)

    def _decode_phones(self, preds: List[int]) -> str:
        """
        Decodes predicted token IDs back into phonemes.

        Args:
            preds (list): A list of predicted token IDs from the ONNX model.

        Returns:
            str: The decoded phoneme string.
        """
        # Convert token IDs back to bytes, excluding special/added tokens
        phone_bytes = [
            bytes([token - 3]) for token in preds
            if str(token) not in self.tokens
        ]
        # Join bytes and decode to UTF-8, ignoring errors
        phones = b''.join(phone_bytes).decode("utf-8", errors="ignore")
        return phones

    @staticmethod
    def _encode_text(text: str, lang: str) -> np.ndarray:
        """
        Encodes input text and language into a numpy array suitable for the model.
        This function replaces the Hugging Face tokenizer for input preparation.

        Args:
            text (str): The input text to encode.
            lang (str): The language code for the text.

        Returns:
            numpy.ndarray: A numpy array of encoded input IDs.
        """
        lang = ByT5Phonemizer.get_lang(lang)  # match lang code
        # Prepend language tag and encode the string to bytes
        encoded_bytes = f"<{lang}>: {text}".encode("utf-8")
        # Convert bytes to a list of integers, adding a shift to account for special tokens
        # (<pad>, </s>, <unk> are typically 0, 1, 2, so we shift by 3 to avoid collision)
        model_inputs = np.array([[byte + 3 for byte in encoded_bytes]], dtype=np.int64)
        return model_inputs

    def _infer_onnx(self, text: str, lang: str) -> str:
        """
        Performs inference using ONNX Runtime without relying on the Hugging Face tokenizer.

        Args:
            text (str): The input text for G2P conversion.
            lang (str): The language of the input text.

        Returns:
            str: The predicted phoneme string. Returns an empty string if the input text is empty.
        """
        if not text.strip():
            return ""

        # Get the names of the model's output tensors
        onnx_output_names: List[str] = [out.name for out in self.session.get_outputs()]

        # Use the custom _encode_text function to prepare input_ids
        input_ids_np: np.ndarray = self._encode_text(text, lang)

        # Manually create attention_mask (all ones for ByT5, indicating all tokens are attended to)
        attention_mask_np: np.ndarray = np.ones_like(input_ids_np, dtype=np.int64)

        # Hardcode decoder_start_token_id for ByT5 (typically 0 for pad_token_id)
        # This is the initial token fed to the decoder to start generation.
        decoder_start_token_id: int = 0  # Corresponds to <pad> for ByT5

        generated_ids: List[int] = []
        # Initialize the decoder input with the start token
        decoder_input_ids_np: np.ndarray = np.array([[decoder_start_token_id]], dtype=np.int64)

        max_length: int = 512  # Maximum length for the generated sequence

        # Greedy decoding loop
        for _ in range(max_length):
            # Prepare inputs for the ONNX session
            onnx_inputs: Dict[str, np.ndarray] = {
                "input_ids": input_ids_np,
                "attention_mask": attention_mask_np,
                "decoder_input_ids": decoder_input_ids_np
            }

            # Run inference
            outputs: List[np.ndarray] = self.session.run(onnx_output_names, onnx_inputs)
            logits: np.ndarray = outputs[0]  # Get the logits from the model output

            # Get the logits for the last token in the sequence
            next_token_logits: np.ndarray = logits[0, -1, :]
            # Predict the next token by taking the argmax of the logits
            next_token_id: int = np.argmax(next_token_logits).item()  # .item() to get a scalar from the numpy array
            generated_ids.append(next_token_id)

            # Assuming EOS token ID for ByT5 is 1 (corresponds to </s>)
            # This is a common convention for T5 models.
            eos_token_id: int = 1
            # If the EOS token is generated, stop decoding
            if next_token_id == eos_token_id:
                break

            # Append the newly generated token to the decoder input for the next step
            decoder_input_ids_np = np.concatenate((decoder_input_ids_np,
                                                   np.array([[next_token_id]], dtype=np.int64)),
                                                  axis=1)

        # Decode the generated token IDs into phonemes
        return self._decode_phones(generated_ids)

    def phonemize_string(self, text: str, lang: str) -> str:
        return self._infer_onnx(text, lang)

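# A minimal round-trip sketch of the byte-level convention that
# _encode_text and _decode_phones rely on: UTF-8 bytes are shifted by 3 so
# that ids 0-2 stay reserved for the <pad>, </s> and <unk> special tokens.
# The sample string below is illustrative only; no ONNX model is involved.
def _byte_shift_roundtrip_example() -> None:
    sample = "<nl-NL>: hallo"
    ids = [b + 3 for b in sample.encode("utf-8")]
    decoded = bytes(i - 3 for i in ids if i >= 3).decode("utf-8")
    assert decoded == sample
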
class CharsiuPhonemizer(ByT5Phonemizer):
    """
    A phonemizer class that uses a Charsiu ByT5 ONNX model to convert text into phonemes.
    """
    # TODO - more models
    MODEL2URL = {
        "Jarbas/charsiu_g2p_multilingual_byT5_tiny_16_layers_100_onnx": "https://huggingface.co/Jarbas/charsiu_g2p_multilingual_byT5_tiny_16_layers_100_onnx/resolve/main/charsiu_g2p_multilingual_byT5_tiny_16_layers_100.onnx"
    }
    BYT5_LANGS = ['ady', 'afr', 'sqi', 'amh', 'ara', 'arg', 'arm-e', 'arm-w', 'aze', 'bak', 'eus', 'bel', 'ben', 'bos',
                  'bul', 'bur', 'cat', 'yue', 'zho-t', 'zho-s', 'min', 'cze', 'dan', 'dut', 'eng-uk', 'eng-us', 'epo',
                  'est', 'fin', 'fra', 'fra-qu', 'gla', 'geo', 'ger', 'gre', 'grc', 'grn', 'guj', 'hin', 'hun', 'ido',
                  'ind', 'ina', 'ita', 'jam', 'jpn', 'kaz', 'khm', 'kor', 'kur', 'lat-clas', 'lat-eccl', 'lit', 'ltz',
                  'mac', 'mlt', 'tts', 'nob', 'ori', 'pap', 'fas', 'pol', 'por-po', 'por-bz', 'ron', 'rus', 'san',
                  'srp', 'hbs-latn', 'hbs-cyrl', 'snd', 'slo', 'slv', 'spa', 'spa-latin', 'spa-me', 'swa', 'swe', 'tgl',
                  'tam', 'tat', 'tha', 'tur', 'tuk', 'ukr', 'vie-n', 'vie-c', 'vie-s', 'wel-nw', 'wel-sw', 'ice', 'ang',
                  'gle', 'enm', 'syc', 'glg', 'sme', 'egy']

    def __init__(self, model: Optional[str] = None, tokenizer_config: Optional[str] = None,
                 use_cuda=bool(os.environ.get("CUDA", False))):
        """
        Initializes the CharsiuPhonemizer with the ONNX model and tokenizer configuration.
        If paths are not provided, it attempts to download them to a local directory.

        Args:
            model (str, optional): Path to the ONNX model file. If None, it will be downloaded.
            tokenizer_config (str, optional): Path to the tokenizer configuration JSON file. If None, it will be downloaded.
        """
        model = model or "Jarbas/charsiu_g2p_multilingual_byT5_tiny_16_layers_100_onnx"
        super().__init__(model, tokenizer_config, use_cuda)

    @classmethod
    def get_lang(cls, target_lang: str) -> str:
        """
        Validates and returns the closest supported language code.

        Args:
            target_lang (str): The language code to validate.

        Returns:
            str: The validated language code.

        Raises:
            ValueError: If the language code is unsupported.
        """
        # Find the closest match
        return cls.match_lang(target_lang, cls.BYT5_LANGS)

    def phonemize_string(self, text: str, lang: str) -> str:
        # charsiu models can't handle whitespace, so text is phonemized word by word
        return " ".join([self._infer_onnx(w, lang) for w in text.split()])

289
|
+
class EspeakPhonemizer(BasePhonemizer):
|
290
|
+
"""
|
291
|
+
A phonemizer class that uses the espeak-ng command-line tool to convert text into phonemes.
|
292
|
+
It segments the input text heuristically based on punctuation to mimic clause-by-clause processing.
|
293
|
+
"""
|
294
|
+
ESPEAK_LANGS = ['es-419', 'ca', 'qya', 'ga', 'et', 'ky', 'io', 'fa-latn', 'en-gb', 'fo', 'haw', 'kl',
|
295
|
+
'ta', 'ml', 'gd', 'sd', 'es', 'hy', 'ur', 'ro', 'hi', 'or', 'ti', 'ca-va', 'om', 'tr', 'pa',
|
296
|
+
'smj', 'mk', 'bg', 'cv', "fr", 'fi', 'en-gb-x-rp', 'ru', 'mt', 'an', 'mr', 'pap', 'vi', 'id',
|
297
|
+
'fr-be', 'ltg', 'my', 'nl', 'shn', 'ba', 'az', 'cmn', 'da', 'as', 'sw',
|
298
|
+
'piqd', 'en-us', 'hr', 'it', 'ug', 'th', 'mi', 'cy', 'ru-lv', 'ia', 'tt', 'hu', 'xex', 'te', 'ne',
|
299
|
+
'eu', 'ja', 'bpy', 'hak', 'cs', 'en-gb-scotland', 'hyw', 'uk', 'pt', 'bn', 'mto', 'yue',
|
300
|
+
'be', 'gu', 'sv', 'sl', 'cmn-latn-pinyin', 'lfn', 'lv', 'fa', 'sjn', 'nog', 'ms',
|
301
|
+
'vi-vn-x-central', 'lt', 'kn', 'he', 'qu', 'ca-ba', 'quc', 'nb', 'sk', 'tn', 'py', 'si', 'de',
|
302
|
+
'ar', 'en-gb-x-gbcwmd', 'bs', 'qdb', 'sq', 'sr', 'tk', 'en-029', 'ht', 'ru-cl', 'af', 'pt-br',
|
303
|
+
'fr-ch', 'ka', 'en-gb-x-gbclan', 'ko', 'is', 'ca-nw', 'gn', 'kok', 'la', 'lb', 'am', 'kk', 'ku',
|
304
|
+
'kaa', 'jbo', 'eo', 'uz', 'nci', 'vi-vn-x-south', 'el', 'pl', 'grc', ]
|
305
|
+
|
306
|
+
def __init__(self):
|
307
|
+
super().__init__(Alphabet.IPA)
|
308
|
+
|
309
|
+
@classmethod
|
310
|
+
def get_lang(cls, target_lang: str) -> str:
|
311
|
+
"""
|
312
|
+
Validates and returns the closest supported language code.
|
313
|
+
|
314
|
+
Args:
|
315
|
+
target_lang (str): The language code to validate.
|
316
|
+
|
317
|
+
Returns:
|
318
|
+
str: The validated language code.
|
319
|
+
|
320
|
+
Raises:
|
321
|
+
ValueError: If the language code is unsupported.
|
322
|
+
"""
|
323
|
+
if target_lang.lower() == "en-gb":
|
324
|
+
return "en-gb-x-rp"
|
325
|
+
if target_lang in cls.ESPEAK_LANGS:
|
326
|
+
return target_lang
|
327
|
+
if target_lang.lower().split("-")[0] in cls.ESPEAK_LANGS:
|
328
|
+
return target_lang.lower().split("-")[0]
|
329
|
+
return cls.match_lang(target_lang, cls.ESPEAK_LANGS)
|
330
|
+
|
331
|
+
@staticmethod
|
332
|
+
def _run_espeak_command(args: List[str], input_text: str = None, check: bool = True) -> str:
|
333
|
+
"""
|
334
|
+
Helper function to run espeak-ng commands via subprocess.
|
335
|
+
Executes 'espeak-ng' with the given arguments and input text.
|
336
|
+
Captures stdout and stderr, and raises EspeakError on failure.
|
337
|
+
|
338
|
+
Args:
|
339
|
+
args (List[str]): A list of command-line arguments for espeak-ng.
|
340
|
+
input_text (str, optional): The text to pass to espeak-ng's stdin. Defaults to None.
|
341
|
+
check (bool, optional): If True, raises a CalledProcessError if the command returns a non-zero exit code. Defaults to True.
|
342
|
+
|
343
|
+
Returns:
|
344
|
+
str: The stripped standard output from the espeak-ng command.
|
345
|
+
|
346
|
+
Raises:
|
347
|
+
EspeakError: If espeak-ng command is not found, or if the subprocess call fails.
|
348
|
+
"""
|
349
|
+
command: List[str] = ['espeak-ng'] + args
|
350
|
+
try:
|
351
|
+
process: subprocess.CompletedProcess = subprocess.run(
|
352
|
+
command,
|
353
|
+
input=input_text,
|
354
|
+
capture_output=True,
|
355
|
+
text=True,
|
356
|
+
check=check,
|
357
|
+
encoding='utf-8',
|
358
|
+
errors='replace' # Replaces unencodable characters with a placeholder
|
359
|
+
)
|
360
|
+
return process.stdout.strip()
|
361
|
+
except FileNotFoundError:
|
362
|
+
raise EspeakError(
|
363
|
+
"espeak-ng command not found. Please ensure espeak-ng is installed "
|
364
|
+
"and available in your system's PATH."
|
365
|
+
)
|
366
|
+
except subprocess.CalledProcessError as e:
|
367
|
+
raise EspeakError(
|
368
|
+
f"espeak-ng command failed with error code {e.returncode}:\n"
|
369
|
+
f"STDOUT: {e.stdout}\n"
|
370
|
+
f"STDERR: {e.stderr}"
|
371
|
+
)
|
372
|
+
except Exception as e:
|
373
|
+
raise EspeakError(f"An unexpected error occurred while running espeak-ng: {e}")
|
374
|
+
|
375
|
+
def phonemize_string(self, text: str, lang: str) -> str:
|
376
|
+
lang = self.get_lang(lang)
|
377
|
+
return self._run_espeak_command(
|
378
|
+
['-q', '-x', '--ipa', '-v', lang],
|
379
|
+
input_text=text
|
380
|
+
)
|
381
|
+
|
382
|
+
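# For reference, EspeakPhonemizer is equivalent to piping text into
# `espeak-ng -q -x --ipa -v <voice>`; a hedged sketch using the helper
# above (requires espeak-ng on the system PATH):
def _espeak_cli_example() -> str:
    return EspeakPhonemizer._run_espeak_command(
        ['-q', '-x', '--ipa', '-v', 'en-gb-x-rp'], input_text="hello world"
    )
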
class GruutPhonemizer(BasePhonemizer):
    """
    A phonemizer class that uses the Gruut library to convert text into phonemes.
    Note: Gruut's internal segmentation is sentence-based.
    """
    GRUUT_LANGS = ["en", "ar", "ca", "cs", "de", "es", "fa", "fr", "it",
                   "lb", "nl", "pt", "ru", "sv", "sw"]

    def __init__(self):
        super().__init__(Alphabet.IPA)

    @classmethod
    def get_lang(cls, target_lang: str) -> str:
        """
        Validates and returns the closest supported language code.

        Args:
            target_lang (str): The language code to validate.

        Returns:
            str: The validated language code.

        Raises:
            ValueError: If the language code is unsupported.
        """
        return cls.match_lang(target_lang, cls.GRUUT_LANGS)

    def _text_to_phonemes(self, text: str, lang: Optional[str] = None):
        """
        Generates phonemes for text using Gruut's sentence processing.
        Yields lists of word phonemes for each sentence.
        """
        lang = self.get_lang(lang)
        import gruut
        for sentence in gruut.sentences(text, lang=lang):
            sent_phonemes = [w.phonemes for w in sentence if w.phonemes]
            if sentence and not sent_phonemes:
                raise RuntimeError(f"did you install gruut[{lang}] ?")
            if sentence.text.endswith("?"):
                sent_phonemes[-1] = ["?"]
            elif sentence.text.endswith("!"):
                sent_phonemes[-1] = ["!"]
            elif sentence.text.endswith(".") or sent_phonemes[-1] == ["‖"]:
                sent_phonemes[-1] = ["."]
            if sent_phonemes:
                yield sent_phonemes

    def phonemize_string(self, text: str, lang: str) -> str:
        pho = ""
        for sent_phonemes in self._text_to_phonemes(text, lang):
            pho += " ".join(["".join(w) for w in sent_phonemes]) + " "
        return pho.strip()

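# Hedged sketch of the gruut API that _text_to_phonemes builds on (assumes
# `pip install gruut[en]`): gruut.sentences() yields sentence objects whose
# words already carry their IPA phonemes.
def _gruut_api_example() -> None:
    import gruut
    for sentence in gruut.sentences("Hello world.", lang="en"):
        print([w.phonemes for w in sentence if w.phonemes])
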
class EpitranPhonemizer(BasePhonemizer):
    """
    A phonemizer class that uses the Epitran library to convert text into phonemes.
    """
    EPITRAN_LANGS = ['hsn-Latn', 'ful-Latn', 'jpn-Ktkn-red', 'tel-Telu', 'nld-Latn', 'aze-Latn', 'amh-Ethi-pp',
                     'msa-Latn', 'spa-Latn-eu', 'ori-Orya', 'bxk-Latn', 'spa-Latn', 'kir-Cyrl', 'lij-Latn', 'kin-Latn',
                     'ces-Latn', 'sin-Sinh', 'urd-Arab', 'vie-Latn', 'gan-Latn', 'fra-Latn', 'nan-Latn', 'kaz-Latn',
                     'swe-Latn', 'jpn-Ktkn', 'tam-Taml', 'sag-Latn', 'csb-Latn', 'pii-latn_Holopainen2019', 'yue-Latn',
                     'got-Latn', 'tur-Latn', 'aar-Latn', 'jav-Latn', 'ita-Latn', 'sna-Latn', 'ilo-Latn', 'tam-Taml-red',
                     'kmr-Latn-red', 'uzb-Cyrl', 'amh-Ethi', 'mya-Mymr', 'aii-Syrc', 'lit-Latn', 'kmr-Latn',
                     'hat-Latn-bab', 'ltc-Latn-bax', 'Goth2Latn', 'quy-Latn', 'hau-Latn', 'ood-Latn-alv', 'vie-Latn-so',
                     'run-Latn', 'orm-Latn', 'ind-Latn', 'kir-Latn', 'mal-Mlym', 'ben-Beng-red', 'hun-Latn', 'uew',
                     'sqi-Latn', 'jpn-Hrgn', 'deu-Latn-np', 'xho-Latn', 'fra-Latn-rev', 'fra-Latn-np', 'kaz-Cyrl-bab',
                     'jpn-Hrgn-red', 'Latn2Goth', 'glg-Latn', 'uig-Arab', 'amh-Ethi-red', 'zul-Latn', 'hin-Deva',
                     'uzb-Latn', 'tir-Ethi-red', 'kaz-Cyrl', 'mlt-Latn', 'deu-Latn-nar', 'est-Latn', 'eng-Latn',
                     'pii-latn_Wiktionary', 'ckb-Arab', 'nya-Latn', 'mon-Cyrl-bab', 'fra-Latn-p', 'ood-Latn-sax',
                     'ukr-Cyrl', 'tgl-Latn-red', 'lsm-Latn', 'kor-Hang', 'lav-Latn', 'generic-Latn', 'tur-Latn-red',
                     'srp-Latn', 'tir-Ethi', 'kbd-Cyrl', 'hrv-Latn', 'srp-Cyrl', 'tpi-Latn', 'khm-Khmr', 'jam-Latn',
                     'ben-Beng-east', 'por-Latn', 'cmn-Latn', 'cat-Latn', 'tha-Thai', 'ara-Arab', 'ben-Beng',
                     'fin-Latn', 'hmn-Latn', 'lez-Cyrl', 'fas-Arab', 'lao-Laoo-prereform', 'mar-Deva', 'yor-Latn',
                     'ron-Latn', 'tgl-Latn', 'lao-Laoo', 'deu-Latn', 'pan-Guru', 'tuk-Latn', 'tir-Ethi-pp', 'rus-Cyrl',
                     'swa-Latn-red', 'ceb-Latn', 'wuu-Latn', 'hak-Latn', 'mri-Latn', 'epo-Latn', 'pol-Latn',
                     'tur-Latn-bab', 'kat-Geor', 'tgk-Cyrl', 'aze-Cyrl', 'vie-Latn-ce', 'swa-Latn', 'tuk-Cyrl',
                     'vie-Latn-no', 'nan-Latn-tl', 'zha-Latn', 'cjy-Latn', 'ava-Cyrl', 'som-Latn', 'kir-Arab']

    def __init__(self):
        super().__init__(Alphabet.IPA)
        import epitran
        self.epitran = epitran
        self._epis: Dict[str, epitran.Epitran] = {}

    @classmethod
    def get_lang(cls, target_lang: str) -> str:
        """
        Validates and returns the closest supported language code.

        Args:
            target_lang (str): The language code to validate.

        Returns:
            str: The validated language code.

        Raises:
            ValueError: If the language code is unsupported.
        """
        return cls.match_lang(target_lang, cls.EPITRAN_LANGS)

    def phonemize_string(self, text: str, lang: str) -> str:
        lang = self.get_lang(lang)
        epi = self._epis.get(lang)
        if epi is None:
            epi = self.epitran.Epitran(lang)
            self._epis[lang] = epi
        return epi.transliterate(text)

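# Hedged sketch of the underlying epitran API (assumes `pip install
# epitran`); EpitranPhonemizer adds language-code matching on top and
# caches one Epitran instance per code.
def _epitran_api_example() -> str:
    import epitran
    return epitran.Epitran("spa-Latn").transliterate("hola mundo")
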
class MisakiPhonemizer(BasePhonemizer):
    """
    https://github.com/hexgrad/misaki
    """
    MISAKI_LANGS = ['en-US', 'en-GB', 'ko', 'ja', 'vi', 'zh']

    def __init__(self):
        super().__init__(Alphabet.IPA)
        self.g2p_en = self.g2p_zh = self.g2p_ko = self.g2p_vi = self.g2p_ja = None

    @classmethod
    def get_lang(cls, target_lang: str) -> str:
        """
        Validates and returns the closest supported language code.

        Args:
            target_lang (str): The language code to validate.

        Returns:
            str: The validated language code.

        Raises:
            ValueError: If the language code is unsupported.
        """
        return cls.match_lang(target_lang, cls.MISAKI_LANGS)

    def _get_phonemizer(self, lang: str):
        """Lazily load the language-specific phonemizer on first use.
        NOTE: this can be slow
        """
        lang = self.get_lang(lang)

        if lang == "zh":
            if self.g2p_zh is None:
                from misaki.zh import ZHG2P
                self.g2p_zh = ZHG2P()
            return self.g2p_zh
        elif lang == "ko":
            if self.g2p_ko is None:
                from misaki.ko import KOG2P
                self.g2p_ko = KOG2P()
            return self.g2p_ko
        elif lang == "vi":
            if self.g2p_vi is None:
                from misaki.vi import VIG2P
                self.g2p_vi = VIG2P()
            return self.g2p_vi
        elif lang == "ja":
            if self.g2p_ja is None:
                from misaki.ja import JAG2P
                self.g2p_ja = JAG2P()
            return self.g2p_ja
        else:
            if self.g2p_en is None:
                from misaki import en
                self.g2p_en = en.G2P()
            if lang == "en-GB":
                self.g2p_en.british = True
            elif lang == "en-US":
                self.g2p_en.british = False
            return self.g2p_en

    def phonemize_string(self, text: str, lang: str) -> str:
        pho = self._get_phonemizer(lang)
        phonemes, tokens = pho(text)
        return phonemes

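# Hedged usage sketch for MisakiPhonemizer: the first call for a given
# language imports and builds that language's G2P (which can be slow), and
# misaki G2P objects return a (phonemes, tokens) pair, hence the unpacking
# in phonemize_string above.
def _misaki_usage_example() -> str:
    pho = MisakiPhonemizer()
    return pho.phonemize_string("Hello there!", "en-US")
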
if __name__ == "__main__":
    # for comparison

    byt5 = ByT5Phonemizer()
    espeak = EspeakPhonemizer()
    gruut = GruutPhonemizer()
    epitr = EpitranPhonemizer()
    charsiu = CharsiuPhonemizer()
    misaki = MisakiPhonemizer()

    lang = "en-gb"

    text1 = "Hello, world. How are you?"

    print("\n--- Getting phonemes for 'Hello, world. How are you?' ---")
    phonemes1 = espeak.phonemize(text1, lang)
    phonemes1b = gruut.phonemize(text1, lang)
    phonemes1c = byt5.phonemize(text1, lang)
    phonemes1d = epitr.phonemize(text1, lang)
    phonemes1e = charsiu.phonemize(text1, lang)
    phonemes1f = misaki.phonemize(text1, lang)
    print(f"  Espeak Phonemes: {phonemes1}")
    print(f"  Gruut Phonemes: {phonemes1b}")
    print(f"  byt5 Phonemes: {phonemes1c}")
    print(f"  Epitran Phonemes: {phonemes1d}")
    print(f"  Charsiu Phonemes: {phonemes1e}")
    print(f"  Misaki Phonemes: {phonemes1f}")

    lang = "nl"
    sentence = "DJ's en bezoekers van Tomorrowland waren woensdagavond dolblij toen het paradepaardje van het festival alsnog opende in Oostenrijk op de Mainstage.\nWant het optreden van Metallica, waar iedereen zo blij mee was, zou hoe dan ook doorgaan, aldus de DJ die het nieuws aankondigde."
    sentence = "Een regenboog is een gekleurde cirkelboog die aan de hemel waargenomen kan worden als de, laagstaande, zon tegen een nevel van waterdruppeltjes aan schijnt en de zon zich achter de waarnemer bevindt. Het is een optisch effect dat wordt veroorzaakt door de breking en weerspiegeling van licht in de waterdruppels."
    print(f"\n--- Getting phonemes for '{sentence}' ---")
    text1 = sentence
    phonemes1 = espeak.phonemize(text1, lang)
    phonemes1b = gruut.phonemize(text1, lang)
    phonemes1c = byt5.phonemize(text1, lang)
    phonemes1d = epitr.phonemize(text1, lang)
    phonemes1e = charsiu.phonemize(text1, lang)
    print(f"  Espeak Phonemes: {phonemes1}")
    print(f"  Gruut Phonemes: {phonemes1b}")
    print(f"  byt5 Phonemes: {phonemes1c}")
    print(f"  Epitran Phonemes: {phonemes1d}")
    print(f"  Charsiu Phonemes: {phonemes1e}")
phoonnx/phonemizers/vi.py
@@ -0,0 +1,44 @@
from phoonnx.phonemizers.base import BasePhonemizer
from phoonnx.config import Alphabet


class VIPhonemePhonemizer(BasePhonemizer):
    """https://github.com/v-nhandt21/Viphoneme"""
    def __init__(self):
        from viphoneme import vi2IPA
        self.g2p = vi2IPA
        super().__init__(Alphabet.IPA)

    @classmethod
    def get_lang(cls, target_lang: str) -> str:
        """
        Validates and returns the closest supported language code.

        Args:
            target_lang (str): The language code to validate.

        Returns:
            str: The validated language code.

        Raises:
            ValueError: If the language code is unsupported.
        """
        # this check is here only to throw an exception if an invalid language is provided
        return cls.match_lang(target_lang, ["vi"])

    def phonemize_string(self, text: str, lang: str = "vi") -> str:
        """Phonemize Vietnamese text into IPA using viphoneme's vi2IPA."""
        lang = self.get_lang(lang)
        return self.g2p(text)


if __name__ == "__main__":
    text = "Được viết vào 6/4/2020, có thể xử lí những trường hợp chứa English"

    pho = VIPhonemePhonemizer()
    lang = "vi"

    print(f"\n--- Getting phonemes for '{text}' ---")
    phonemes = pho.phonemize(text, lang)
    print(f"  Phonemes: {phonemes}")