phoonnx 0.1.0a1__py3-none-any.whl → 0.1.0a3__py3-none-any.whl

This diff shows the differences between the contents of two publicly available versions of this package, each released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -19,8 +19,8 @@ from phoonnx.phoneme_ids import (phonemes_to_ids, DEFAULT_IPA_PHONEME_ID_MAP, DE
19
19
  DEFAULT_BOS_TOKEN, DEFAULT_EOS_TOKEN, DEFAULT_BLANK_WORD_TOKEN)
20
20
  from phoonnx_train.norm_audio import cache_norm_audio, make_silence_detector
21
21
  from tqdm import tqdm
22
+ from phoonnx.version import VERSION_STR
22
23
 
23
- _VERSION = "0.0.0"
24
24
  _LOGGER = logging.getLogger("preprocess")
25
25
 
26
26
  # Base phoneme map
@@ -105,7 +105,9 @@ def ljspeech_dataset(args: argparse.Namespace) -> Iterable[Utterance]:
105
105
 
106
106
  wav_path = None
107
107
  for wav_dir in wav_dirs:
108
- potential_paths = [wav_dir / filename, wav_dir / f"{filename}.wav"]
108
+ potential_paths = [wav_dir / filename,
109
+ wav_dir / f"{filename}.wav",
110
+ wav_dir / f"{filename.lstrip('0')}.wav"]
109
111
  for path in potential_paths:
110
112
  if path.exists():
111
113
  wav_path = path
@@ -153,9 +155,17 @@ def phonemize_worker(
153
155
 
154
156
  for utt in utterance_batch:
155
157
  try:
158
+ # normalize text (case, numbers, ...)
159
+ utterance = casing(normalize( utt.text, args.language))
160
+
161
+ # add diacritics
162
+ if args.add_diacritics:
163
+ utterance = phonemizer.add_diacritics(utterance, args.language)
164
+
156
165
  # Phonemize the text
157
- norm_utt = casing(normalize(utt.text, args.language))
158
- utt.phonemes = phonemizer.phonemize_to_list(norm_utt, args.language)
166
+ utt.phonemes = phonemizer.phonemize_to_list(utterance, args.language)
167
+ if not utt.phonemes:
168
+ raise RuntimeError(f"Phonemes not found for '{utterance}'")
159
169
 
160
170
  # Process audio if not skipping
161
171
  if not args.skip_audio:
@@ -242,6 +252,9 @@ def main() -> None:
242
252
  parser.add_argument(
243
253
  "--debug", action="store_true", help="Print DEBUG messages to the console"
244
254
  )
255
+ parser.add_argument(
256
+ "--add-diacritics", action="store_true", help="Add diacritics to text (phonemizer specific)"
257
+ )
245
258
  args = parser.parse_args()
246
259
 
247
260
  # Setup
@@ -293,7 +306,9 @@ def main() -> None:
293
306
  _LOGGER.info("Starting single pass processing with %d workers...", args.max_workers)
294
307
 
295
308
  # Initialize the phonemizer only once in the main process
296
- phonemizer = get_phonemizer(args.phoneme_type, args.alphabet, args.phonemizer_model)
309
+ phonemizer = get_phonemizer(args.phoneme_type,
310
+ args.alphabet,
311
+ args.phonemizer_model)
297
312
 
298
313
  batch_size = max(1, int(num_utterances / (args.max_workers * 2)))
299
314
 
@@ -367,7 +382,10 @@ def main() -> None:
367
382
  "quality": audio_quality,
368
383
  },
369
384
  "lang_code": args.language,
370
- "inference": {"noise_scale": 0.667, "length_scale": 1, "noise_w": 0.8},
385
+ "inference": {"noise_scale": 0.667,
386
+ "length_scale": 1,
387
+ "noise_w": 0.8,
388
+ "add_diacritics": args.add_diacritics},
371
389
  "alphabet": phonemizer.alphabet.value,
372
390
  "phoneme_type": args.phoneme_type.value,
373
391
  "phonemizer_model": args.phonemizer_model,
@@ -375,7 +393,7 @@ def main() -> None:
375
393
  "num_symbols": len(final_phoneme_id_map),
376
394
  "num_speakers": len(speaker_counts) if is_multispeaker else 1,
377
395
  "speaker_id_map": speaker_ids,
378
- "phoonnx_version": _VERSION,
396
+ "phoonnx_version": VERSION_STR,
379
397
  }
380
398
 
381
399
  with open(args.output_dir / "config.json", "w", encoding="utf-8") as config_file:
@@ -383,15 +401,23 @@ def main() -> None:
383
401
 
384
402
  # --- Apply final phoneme IDs and write dataset.jsonl ---
385
403
  _LOGGER.info("Writing dataset.jsonl...")
404
+ valid_utterances_count = 0
386
405
  with open(args.output_dir / "dataset.jsonl", "w", encoding="utf-8") as dataset_file:
387
406
  for utt in processed_utterances:
388
- if utt.speaker is not None:
407
+ if is_multispeaker and utt.speaker is not None:
408
+ if utt.speaker not in speaker_ids:
409
+ _LOGGER.error("Speaker '%s' not in speaker_id_map. This indicates an issue with your metadata.csv file.", utt.speaker)
410
+ continue
389
411
  utt.speaker_id = speaker_ids[utt.speaker]
390
412
 
391
413
  # Apply the final phoneme ID map to each utterance
392
414
  if utt.phonemes:
393
415
  utt.phoneme_ids = phonemes_to_ids(utt.phonemes, id_map=final_phoneme_id_map)
394
416
 
417
+ if not utt.phoneme_ids:
418
+ _LOGGER.warning("Skipping utterance with invalid phoneme_ids before writing: %s", utt.audio_path)
419
+ continue
420
+
395
421
  json.dump(
396
422
  utt.asdict(),
397
423
  dataset_file,
@@ -399,8 +425,9 @@ def main() -> None:
399
425
  cls=PathEncoder,
400
426
  )
401
427
  print("", file=dataset_file)
428
+ valid_utterances_count += 1
402
429
 
403
- _LOGGER.info("Preprocessing complete.")
430
+ _LOGGER.info("Preprocessing complete. Wrote %d valid utterances to dataset.jsonl.", valid_utterances_count)
404
431
 
405
432
 
406
433
  # -----------------------------------------------------------------------------
@@ -69,6 +69,8 @@ class PiperDataset(Dataset):
69
69
  self.utterances.extend(
70
70
  PiperDataset.load_dataset(dataset_path, max_phoneme_ids=max_phoneme_ids)
71
71
  )
72
+ if not self.utterances:
73
+ raise ValueError("No utterances loaded")
72
74
 
73
75
  def __len__(self):
74
76
  return len(self.utterances)
@@ -120,6 +122,8 @@ class PiperDataset(Dataset):
120
122
  @staticmethod
121
123
  def load_utterance(line: str) -> Utterance:
122
124
  utt_dict = json.loads(line)
125
+ if not utt_dict["phoneme_ids"]:
126
+ raise ValueError(f"invalid utterance line - phoneme_ids not set ({line})")
123
127
  return Utterance(
124
128
  phoneme_ids=utt_dict["phoneme_ids"],
125
129
  audio_norm_path=Path(utt_dict["audio_norm_path"]),