phoonnx 0.1.0a1__py3-none-any.whl → 0.1.0a3__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- phoonnx/config.py +4 -1
- phoonnx/phonemizers/ar.py +36 -44
- phoonnx/phonemizers/base.py +27 -1
- phoonnx/phonemizers/he.py +6 -25
- phoonnx/phonemizers/mul.py +617 -4
- phoonnx/thirdparty/hangul2ipa.py +1 -0
- phoonnx/thirdparty/mantoq/__init__.py +1 -26
- phoonnx/thirdparty/phonikud/__init__.py +24 -0
- phoonnx/version.py +5 -1
- phoonnx/voice.py +4 -16
- {phoonnx-0.1.0a1.dist-info → phoonnx-0.1.0a3.dist-info}/METADATA +2 -1
- {phoonnx-0.1.0a1.dist-info → phoonnx-0.1.0a3.dist-info}/RECORD +17 -16
- phoonnx_train/export_onnx.py +307 -56
- phoonnx_train/preprocess.py +36 -9
- phoonnx_train/vits/dataset.py +4 -0
- {phoonnx-0.1.0a1.dist-info → phoonnx-0.1.0a3.dist-info}/WHEEL +0 -0
- {phoonnx-0.1.0a1.dist-info → phoonnx-0.1.0a3.dist-info}/top_level.txt +0 -0
phoonnx_train/preprocess.py
CHANGED
@@ -19,8 +19,8 @@ from phoonnx.phoneme_ids import (phonemes_to_ids, DEFAULT_IPA_PHONEME_ID_MAP, DE
|
|
19
19
|
DEFAULT_BOS_TOKEN, DEFAULT_EOS_TOKEN, DEFAULT_BLANK_WORD_TOKEN)
|
20
20
|
from phoonnx_train.norm_audio import cache_norm_audio, make_silence_detector
|
21
21
|
from tqdm import tqdm
|
22
|
+
from phoonnx.version import VERSION_STR
|
22
23
|
|
23
|
-
_VERSION = "0.0.0"
|
24
24
|
_LOGGER = logging.getLogger("preprocess")
|
25
25
|
|
26
26
|
# Base phoneme map
|
@@ -105,7 +105,9 @@ def ljspeech_dataset(args: argparse.Namespace) -> Iterable[Utterance]:
|
|
105
105
|
|
106
106
|
wav_path = None
|
107
107
|
for wav_dir in wav_dirs:
|
108
|
-
potential_paths = [wav_dir / filename,
|
108
|
+
potential_paths = [wav_dir / filename,
|
109
|
+
wav_dir / f"{filename}.wav",
|
110
|
+
wav_dir / f"{filename.lstrip('0')}.wav"]
|
109
111
|
for path in potential_paths:
|
110
112
|
if path.exists():
|
111
113
|
wav_path = path
|
@@ -153,9 +155,17 @@ def phonemize_worker(
|
|
153
155
|
|
154
156
|
for utt in utterance_batch:
|
155
157
|
try:
|
158
|
+
# normalize text (case, numbers....)
|
159
|
+
utterance = casing(normalize( utt.text, args.language))
|
160
|
+
|
161
|
+
# add diacritics
|
162
|
+
if args.add_diacritics:
|
163
|
+
utterance = phonemizer.add_diacritics(utterance, args.language)
|
164
|
+
|
156
165
|
# Phonemize the text
|
157
|
-
|
158
|
-
utt.phonemes
|
166
|
+
utt.phonemes = phonemizer.phonemize_to_list(utterance, args.language)
|
167
|
+
if not utt.phonemes:
|
168
|
+
raise RuntimeError(f"Phonemes not found for '{utterance}'")
|
159
169
|
|
160
170
|
# Process audio if not skipping
|
161
171
|
if not args.skip_audio:
|
@@ -242,6 +252,9 @@ def main() -> None:
|
|
242
252
|
parser.add_argument(
|
243
253
|
"--debug", action="store_true", help="Print DEBUG messages to the console"
|
244
254
|
)
|
255
|
+
parser.add_argument(
|
256
|
+
"--add-diacritics", action="store_true", help="Add diacritics to text (phonemizer specific)"
|
257
|
+
)
|
245
258
|
args = parser.parse_args()
|
246
259
|
|
247
260
|
# Setup
|
@@ -293,7 +306,9 @@ def main() -> None:
|
|
293
306
|
_LOGGER.info("Starting single pass processing with %d workers...", args.max_workers)
|
294
307
|
|
295
308
|
# Initialize the phonemizer only once in the main process
|
296
|
-
phonemizer = get_phonemizer(args.phoneme_type,
|
309
|
+
phonemizer = get_phonemizer(args.phoneme_type,
|
310
|
+
args.alphabet,
|
311
|
+
args.phonemizer_model)
|
297
312
|
|
298
313
|
batch_size = max(1, int(num_utterances / (args.max_workers * 2)))
|
299
314
|
|
@@ -367,7 +382,10 @@ def main() -> None:
|
|
367
382
|
"quality": audio_quality,
|
368
383
|
},
|
369
384
|
"lang_code": args.language,
|
370
|
-
"inference": {"noise_scale": 0.667,
|
385
|
+
"inference": {"noise_scale": 0.667,
|
386
|
+
"length_scale": 1,
|
387
|
+
"noise_w": 0.8,
|
388
|
+
"add_diacritics": args.add_diacritics},
|
371
389
|
"alphabet": phonemizer.alphabet.value,
|
372
390
|
"phoneme_type": args.phoneme_type.value,
|
373
391
|
"phonemizer_model": args.phonemizer_model,
|
@@ -375,7 +393,7 @@ def main() -> None:
|
|
375
393
|
"num_symbols": len(final_phoneme_id_map),
|
376
394
|
"num_speakers": len(speaker_counts) if is_multispeaker else 1,
|
377
395
|
"speaker_id_map": speaker_ids,
|
378
|
-
"phoonnx_version":
|
396
|
+
"phoonnx_version": VERSION_STR,
|
379
397
|
}
|
380
398
|
|
381
399
|
with open(args.output_dir / "config.json", "w", encoding="utf-8") as config_file:
|
@@ -383,15 +401,23 @@ def main() -> None:
|
|
383
401
|
|
384
402
|
# --- Apply final phoneme IDs and write dataset.jsonl ---
|
385
403
|
_LOGGER.info("Writing dataset.jsonl...")
|
404
|
+
valid_utterances_count = 0
|
386
405
|
with open(args.output_dir / "dataset.jsonl", "w", encoding="utf-8") as dataset_file:
|
387
406
|
for utt in processed_utterances:
|
388
|
-
if utt.speaker is not None:
|
407
|
+
if is_multispeaker and utt.speaker is not None:
|
408
|
+
if utt.speaker not in speaker_ids:
|
409
|
+
_LOGGER.error("Speaker '%s' not in speaker_id_map. This indicates an issue with your metadata.csv file.", utt.speaker)
|
410
|
+
continue
|
389
411
|
utt.speaker_id = speaker_ids[utt.speaker]
|
390
412
|
|
391
413
|
# Apply the final phoneme ID map to each utterance
|
392
414
|
if utt.phonemes:
|
393
415
|
utt.phoneme_ids = phonemes_to_ids(utt.phonemes, id_map=final_phoneme_id_map)
|
394
416
|
|
417
|
+
if not utt.phoneme_ids:
|
418
|
+
_LOGGER.warning("Skipping utterance with invalid phoneme_ids before writing: %s", utt.audio_path)
|
419
|
+
continue
|
420
|
+
|
395
421
|
json.dump(
|
396
422
|
utt.asdict(),
|
397
423
|
dataset_file,
|
@@ -399,8 +425,9 @@ def main() -> None:
|
|
399
425
|
cls=PathEncoder,
|
400
426
|
)
|
401
427
|
print("", file=dataset_file)
|
428
|
+
valid_utterances_count += 1
|
402
429
|
|
403
|
-
_LOGGER.info("Preprocessing complete.")
|
430
|
+
_LOGGER.info("Preprocessing complete. Wrote %d valid utterances to dataset.jsonl.", valid_utterances_count)
|
404
431
|
|
405
432
|
|
406
433
|
# -----------------------------------------------------------------------------
|
phoonnx_train/vits/dataset.py
CHANGED
@@ -69,6 +69,8 @@ class PiperDataset(Dataset):
|
|
69
69
|
self.utterances.extend(
|
70
70
|
PiperDataset.load_dataset(dataset_path, max_phoneme_ids=max_phoneme_ids)
|
71
71
|
)
|
72
|
+
if not self.utterances:
|
73
|
+
raise ValueError("No utterances loaded")
|
72
74
|
|
73
75
|
def __len__(self):
|
74
76
|
return len(self.utterances)
|
@@ -120,6 +122,8 @@ class PiperDataset(Dataset):
|
|
120
122
|
@staticmethod
|
121
123
|
def load_utterance(line: str) -> Utterance:
|
122
124
|
utt_dict = json.loads(line)
|
125
|
+
if not utt_dict["phoneme_ids"]:
|
126
|
+
raise ValueError(f"invalid utterance line - phoneme_ids not set ({line})")
|
123
127
|
return Utterance(
|
124
128
|
phoneme_ids=utt_dict["phoneme_ids"],
|
125
129
|
audio_norm_path=Path(utt_dict["audio_norm_path"]),
|
File without changes
|
File without changes
|