phoonnx 0.0.2a2__py3-none-any.whl → 0.1.0a3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phoonnx/config.py +4 -1
- phoonnx/phonemizers/ar.py +65 -9
- phoonnx/phonemizers/base.py +27 -1
- phoonnx/phonemizers/he.py +6 -25
- phoonnx/phonemizers/mul.py +617 -4
- phoonnx/thirdparty/bw2ipa.py +66 -0
- phoonnx/thirdparty/hangul2ipa.py +1 -0
- phoonnx/thirdparty/mantoq/__init__.py +1 -26
- phoonnx/thirdparty/phonikud/__init__.py +24 -0
- phoonnx/version.py +7 -3
- phoonnx/voice.py +4 -16
- {phoonnx-0.0.2a2.dist-info → phoonnx-0.1.0a3.dist-info}/METADATA +2 -1
- {phoonnx-0.0.2a2.dist-info → phoonnx-0.1.0a3.dist-info}/RECORD +18 -16
- phoonnx_train/export_onnx.py +307 -56
- phoonnx_train/preprocess.py +36 -9
- phoonnx_train/vits/dataset.py +4 -0
- {phoonnx-0.0.2a2.dist-info → phoonnx-0.1.0a3.dist-info}/WHEEL +0 -0
- {phoonnx-0.0.2a2.dist-info → phoonnx-0.1.0a3.dist-info}/top_level.txt +0 -0
phoonnx_train/export_onnx.py
CHANGED
@@ -1,109 +1,360 @@
 #!/usr/bin/env python3
-import
+import click
 import logging
+import json
+import os
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Dict, Any, Tuple
 
 import torch
-
 from phoonnx_train.vits.lightning import VitsModel
+from phoonnx.version import VERSION_STR
 
-
+# Basic logging configuration
+logging.basicConfig(level=logging.DEBUG)
+_LOGGER = logging.getLogger("phoonnx_train.export_onnx")
 
+# ONNX opset version
 OPSET_VERSION = 15
 
 
-
-    """Main entry point"""
-    torch.manual_seed(1234)
+# --- Utility Functions ---
 
-
-
-
+def add_meta_data(filename: Path, meta_data: Dict[str, Any]) -> None:
+    """
+    Add meta data to an ONNX model. The file is modified in-place.
 
-
-
-
-
+    Args:
+      filename:
+        Path to the ONNX model file to be changed.
+      meta_data:
+        Key-value pairs to be stored as metadata. Values will be converted to strings.
+    """
+    try:
+        import onnx
+
+        # Load the ONNX model
+        model = onnx.load(str(filename))
+
+        # Clear existing metadata and add new properties
+        del model.metadata_props[:]
+
+        for key, value in meta_data.items():
+            meta = model.metadata_props.add()
+            meta.key = key
+            # Convert all values to string for ONNX metadata
+            meta.value = str(value)
+
+        onnx.save(model, str(filename))
+        _LOGGER.info(f"Added {len(meta_data)} metadata key/value pairs to ONNX model: {filename}")
+
+    except ImportError:
+        _LOGGER.error("The 'onnx' package is required to add metadata. Please install it with 'pip install onnx'.")
+    except Exception as e:
+        _LOGGER.error(f"Failed to add metadata to ONNX file {filename}: {e}")
+
+
+def export_tokens(config_path: Path, output_path: Path = Path("tokens.txt")) -> None:
+    """
+    Generates a tokens.txt file containing the phoneme-to-id mapping from the model configuration.
+
+    The format is: `<phoneme> <id>` per line.
+
+    Args:
+        config_path: Path to the model configuration JSON file.
+        output_path: Path to save the resulting tokens.txt file.
+    """
+    try:
+        with open(config_path, "r", encoding="utf-8") as file:
+            config: Dict[str, Any] = json.load(file)
+    except Exception as e:
+        _LOGGER.error(f"Failed to load config file at {config_path}: {e}")
+        return
+
+    id_map: Optional[Dict[str, int]] = config.get("phoneme_id_map")
+    if not id_map:
+        _LOGGER.error("Could not find 'phoneme_id_map' in the config file.")
+        return
+
+    tokens_path = output_path
+    try:
+        with open(tokens_path, "w", encoding="utf-8") as f:
+            # Sort by ID to ensure a consistent output order
+            # The type hint for sorted_items is a list of tuples: List[Tuple[str, int]]
+            sorted_items: list[Tuple[str, int]] = sorted(id_map.items(), key=lambda item: item[1])
+
+            for s, i in sorted_items:
+                # Skip newlines or other invalid tokens if present in map
+                if s == "\n" or s == "":
+                    continue
+                f.write(f"{s} {i}\n")
+
+        _LOGGER.info(f"Generated tokens file at {tokens_path}")
+    except Exception as e:
+        _LOGGER.error(f"Failed to write tokens file to {tokens_path}: {e}")
+
+
+def convert_to_piper(config_path: Path, output_path: Path = Path("piper.json")) -> None:
+    """
+    Generates a Piper compatible JSON configuration file from the VITS model configuration.
+
+    This function currently serves as a placeholder for full Piper conversion logic.
+
+    Args:
+        config_path: Path to the VITS model configuration JSON file.
+        output_path: Path to save the resulting Piper JSON file.
+    """
+
+    with open(config_path, "r", encoding="utf-8") as file:
+        config: Dict[str, Any] = json.load(file)
+
+    piper_config = {
+        "phoneme_type": "espeak" if config.get("phoneme_type", "") == "espeak" else "raw",
+        "phoneme_map": {},
+        "audio": config.get("audio", {}),
+        "inference": config.get("inference", {}),
+        "phoneme_id_map": {k: [v] for k, v in config.get("phoneme_id_map", {}).items()},
+        "espeak": {
+            "voice": config.get("lang_code", "")
+        },
+        "language": {
+            "code": config.get("lang_code", "")
+        },
+        "num_symbols": config.get("num_symbols", 256),
+        "num_speakers": config.get("num_speakers", 1),
+        "speaker_id_map": {},
+        "piper_version": f"phoonnx-" + config.get("phoonnx_version", "0.0.0")
+    }
+
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(piper_config, f, indent=4, ensure_ascii=False)
 
-    if args.debug:
-        logging.basicConfig(level=logging.DEBUG)
-    else:
-        logging.basicConfig(level=logging.INFO)
 
-
+# --- Main Logic using Click ---
+@click.command(help="Export a VITS model checkpoint to ONNX format.")
+@click.argument(
+    "checkpoint",
+    type=click.Path(exists=True, path_type=Path),
+    # help="Path to the PyTorch checkpoint file (*.ckpt)."
+)
+@click.option(
+    "-c",
+    "--config",
+    type=click.Path(exists=True, path_type=Path),
+    help="Path to the model configuration JSON file."
+)
+@click.option(
+    "-o",
+    "--output-dir",
+    type=click.Path(path_type=Path),
+    default=Path(os.getcwd()),  # Set default to current working directory
+    help="Output directory for the ONNX model. (Default: current directory)"
+)
+@click.option(
+    "-t",
+    "--generate-tokens",
+    is_flag=True,
+    help="Generate tokens.txt alongside the ONNX model. Some inference engines need this (eg. sherpa)"
+)
+@click.option(
+    "-p",
+    "--piper",
+    is_flag=True,
+    help="Generate a piper compatible .json file alongside the ONNX model."
+)
+def cli(
+    checkpoint: Path,
+    config: Path,
+    output_dir: Path,
+    generate_tokens: bool,
+    piper: bool,
+) -> None:
+    """
+    Main entry point for exporting a VITS model checkpoint to ONNX format.
+
+    Args:
+        checkpoint: Path to the PyTorch checkpoint file (*.ckpt).
+        config: Path to the model configuration JSON file.
+        output_dir: Output directory for the ONNX model and associated files.
+        generate_tokens: Flag to generate a tokens.txt file.
+        piper: Flag to generate a piper compatible .json file.
+    """
+    torch.manual_seed(1234)
+
+    _LOGGER.debug(f"Arguments: {checkpoint=}, {config=}, {output_dir=}, {generate_tokens=}, {piper=}")
 
     # -------------------------------------------------------------------------
+    # Paths and Setup
+
+    # Create output directory if it doesn't exist
+    output_dir.mkdir(parents=True, exist_ok=True)
+    _LOGGER.debug(f"Output directory ensured: {output_dir}")
+
+    # Load the phoonnx configuration
+    try:
+        with open(config, "r", encoding="utf-8") as f:
+            model_config: Dict[str, Any] = json.load(f)
+        _LOGGER.info(f"Loaded phoonnx config from {config}")
+    except Exception as e:
+        _LOGGER.error(f"Error loading config file {config}: {e}")
+        return
+
+
+    alphabet: str = model_config.get("alphabet", "")
+    phoneme_type: str = model_config.get("phoneme_type", "")
+    phonemizer_model: str = model_config.get("phonemizer_model", "")  # depends on phonemizer (eg. byt5)
+    piper_compatible: bool = alphabet == "ipa" and phoneme_type == "espeak"
 
-
-
-
+    # Ensure mandatory keys exist before accessing
+    sample_rate: int = model_config.get("audio", {}).get("sample_rate", 22050)
+    phoneme_id_map: Dict[str, int] = model_config.get("phoneme_id_map", {})
 
-
-
+    if piper:
+        if not piper_compatible:
+            _LOGGER.warning("only models trained with ipa + espeak should be exported to piper. phonemization is not included in exported model.")
+        # Generate the piper.json file
+        piper_output_path = output_dir / f"{checkpoint.name}.piper.json"
+        convert_to_piper(config, piper_output_path)
 
-
-
+    if generate_tokens:
+        # Generate the tokens.txt file
+        tokens_output_path = output_dir / f"{checkpoint.name}.tokens.txt"
+        export_tokens(config, tokens_output_path)
 
-    #
+    # -------------------------------------------------------------------------
+    # Model Loading and Preparation
+    try:
+        model: VitsModel = VitsModel.load_from_checkpoint(
+            checkpoint,
+            dataset=None
+        )
+    except Exception as e:
+        _LOGGER.error(f"Error loading model checkpoint {checkpoint}: {e}")
+        return
+
+    model_g: torch.nn.Module = model.model_g
+    num_symbols: int = model_g.n_vocab
+    num_speakers: int = model_g.n_speakers
+
+    # Inference only setup
     model_g.eval()
 
     with torch.no_grad():
+        # Apply weight norm removal for inference mode
        model_g.dec.remove_weight_norm()
+        _LOGGER.debug("Removed weight normalization from decoder.")
+
+    # -------------------------------------------------------------------------
+    # Define ONNX-compatible forward function
+
+    def infer_forward(text: torch.Tensor, text_lengths: torch.Tensor, scales: torch.Tensor, sid: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """
+        Custom forward pass for ONNX export, simplifying the input scales and
+        returning only the audio tensor with shape [B, 1, T].
 
-
+        Args:
+            text: Input phoneme sequence tensor, shape [B, T_in].
+            text_lengths: Tensor of sequence lengths, shape [B].
+            scales: Tensor containing [noise_scale, length_scale, noise_scale_w], shape [3].
+            sid: Optional speaker ID tensor, shape [B], for multi-speaker models.
 
-
-
-
-
-
+        Returns:
+            Generated audio tensor, shape [B, 1, T_out].
+        """
+        noise_scale: float = scales[0]
+        length_scale: float = scales[1]
+        noise_scale_w: float = scales[2]
+
+        # model_g.infer returns a tuple: (audio, attn, ids_slice, x_mask, z, z_mask, g)
+        audio: torch.Tensor = model_g.infer(
            text,
            text_lengths,
            noise_scale=noise_scale,
            length_scale=length_scale,
            noise_scale_w=noise_scale_w,
            sid=sid,
-        )[0].unsqueeze(1)
+        )[0].unsqueeze(1)  # [0] gets the audio tensor. unsqueeze(1) makes it [B, 1, T]
 
         return audio
 
+    # Replace the default forward with the inference one for ONNX export
     model_g.forward = infer_forward
 
-
-
+    # -------------------------------------------------------------------------
+    # Dummy Input Generation
+
+    dummy_input_length: int = 50
+    sequences: torch.Tensor = torch.randint(
         low=0, high=num_symbols, size=(1, dummy_input_length), dtype=torch.long
     )
-    sequence_lengths = torch.LongTensor([sequences.size(1)])
+    sequence_lengths: torch.Tensor = torch.LongTensor([sequences.size(1)])
 
     sid: Optional[torch.LongTensor] = None
+    input_names: list[str] = ["input", "input_lengths", "scales"]
+    dynamic_axes_map: Dict[str, Dict[int, str]] = {
+        "input": {0: "batch_size", 1: "phonemes"},
+        "input_lengths": {0: "batch_size"},
+        "output": {0: "batch_size", 1: "time"},
+    }
+
     if num_speakers > 1:
         sid = torch.LongTensor([0])
+        input_names.append("sid")
+        dynamic_axes_map["sid"] = {0: "batch_size"}
+        _LOGGER.debug(f"Multi-speaker model detected (n_speakers={num_speakers}). 'sid' included.")
 
-    # noise,
-    scales = torch.FloatTensor([0.667, 1.0, 0.8])
-    dummy_input
+    # noise, length, noise_w scales (hardcoded defaults)
+    scales: torch.Tensor = torch.FloatTensor([0.667, 1.0, 0.8])
+    dummy_input: Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.LongTensor]] = (
+        sequences, sequence_lengths, scales, sid
+    )
 
+    # -------------------------------------------------------------------------
     # Export
-
-
-
-
-
-
-
-
-
-
-
-        "output"
-
-
+    model_output: Path = output_dir / f"{checkpoint.name}.onnx"
+    _LOGGER.info(f"Starting ONNX export to {model_output} (opset={OPSET_VERSION})...")
+
+    try:
+        torch.onnx.export(
+            model=model_g,
+            args=dummy_input,
+            f=str(model_output),
+            verbose=False,
+            opset_version=OPSET_VERSION,
+            input_names=input_names,
+            output_names=["output"],
+            dynamic_axes=dynamic_axes_map,
+        )
+        _LOGGER.info(f"Successfully exported model to {model_output}")
+    except Exception as e:
+        _LOGGER.error(f"Failed during torch.onnx.export: {e}")
+        return
+
+    # -------------------------------------------------------------------------
+    # Add Metadata
+    metadata_dict: Dict[str, Any] = {
+        "model_type": "vits",
+        "n_speakers": num_speakers,
+        "n_vocab": num_symbols,
+        "sample_rate": sample_rate,
+        "alphabet": alphabet,
+        "phoneme_type": phoneme_type,
+        "phonemizer_model": phonemizer_model,
+        "phoneme_id_map": json.dumps(phoneme_id_map),
+        "has_espeak": phoneme_type == "espeak"
+    }
+    if piper_compatible:
+        metadata_dict["comment"] = "piper"
+
+    try:
+        add_meta_data(model_output, metadata_dict)
+    except Exception as e:
+        _LOGGER.error(f"Failed to add metadata to exported model {model_output}: {e}")
 
-    _LOGGER.info("
+    _LOGGER.info("Export complete.")
 
 
 # -----------------------------------------------------------------------------
 
 if __name__ == "__main__":
-
+    cli()
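Example (not part of the package): a minimal sketch of driving the exported model with onnxruntime. The input names, dtypes, and default scales mirror `input_names`, `dummy_input`, and `scales` in the export code above; the file name and phoneme IDs here are made up.

import numpy as np
import onnxruntime as ort

# Hypothetical file name; the export writes {checkpoint.name}.onnx
session = ort.InferenceSession("model.ckpt.onnx", providers=["CPUExecutionProvider"])

phoneme_ids = np.array([[1, 5, 12, 7, 2]], dtype=np.int64)   # "input": [B, T_in], torch.long -> int64
lengths = np.array([phoneme_ids.shape[1]], dtype=np.int64)   # "input_lengths": [B]
scales = np.array([0.667, 1.0, 0.8], dtype=np.float32)       # noise, length, noise_w defaults

inputs = {"input": phoneme_ids, "input_lengths": lengths, "scales": scales}
# Multi-speaker models additionally expect: inputs["sid"] = np.array([0], dtype=np.int64)

audio = session.run(None, inputs)[0]   # [B, 1, T_out], as returned by infer_forward
print(audio.shape)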
phoonnx_train/preprocess.py
CHANGED
@@ -19,8 +19,8 @@ from phoonnx.phoneme_ids import (phonemes_to_ids, DEFAULT_IPA_PHONEME_ID_MAP, DE
     DEFAULT_BOS_TOKEN, DEFAULT_EOS_TOKEN, DEFAULT_BLANK_WORD_TOKEN)
 from phoonnx_train.norm_audio import cache_norm_audio, make_silence_detector
 from tqdm import tqdm
+from phoonnx.version import VERSION_STR
 
-_VERSION = "0.0.0"
 _LOGGER = logging.getLogger("preprocess")
 
 # Base phoneme map
@@ -105,7 +105,9 @@ def ljspeech_dataset(args: argparse.Namespace) -> Iterable[Utterance]:
 
         wav_path = None
         for wav_dir in wav_dirs:
-            potential_paths = [wav_dir / filename,
+            potential_paths = [wav_dir / filename,
+                               wav_dir / f"{filename}.wav",
+                               wav_dir / f"{filename.lstrip('0')}.wav"]
             for path in potential_paths:
                 if path.exists():
                     wav_path = path
@@ -153,9 +155,17 @@ def phonemize_worker(
 
     for utt in utterance_batch:
         try:
+            # normalize text (case, numbers....)
+            utterance = casing(normalize(utt.text, args.language))
+
+            # add diacritics
+            if args.add_diacritics:
+                utterance = phonemizer.add_diacritics(utterance, args.language)
+
             # Phonemize the text
-
-            utt.phonemes
+            utt.phonemes = phonemizer.phonemize_to_list(utterance, args.language)
+            if not utt.phonemes:
+                raise RuntimeError(f"Phonemes not found for '{utterance}'")
 
             # Process audio if not skipping
             if not args.skip_audio:
@@ -242,6 +252,9 @@ def main() -> None:
     parser.add_argument(
         "--debug", action="store_true", help="Print DEBUG messages to the console"
     )
+    parser.add_argument(
+        "--add-diacritics", action="store_true", help="Add diacritics to text (phonemizer specific)"
+    )
     args = parser.parse_args()
 
     # Setup
@@ -293,7 +306,9 @@ def main() -> None:
     _LOGGER.info("Starting single pass processing with %d workers...", args.max_workers)
 
     # Initialize the phonemizer only once in the main process
-    phonemizer = get_phonemizer(args.phoneme_type,
+    phonemizer = get_phonemizer(args.phoneme_type,
+                                args.alphabet,
+                                args.phonemizer_model)
 
     batch_size = max(1, int(num_utterances / (args.max_workers * 2)))
 
@@ -367,7 +382,10 @@ def main() -> None:
             "quality": audio_quality,
         },
         "lang_code": args.language,
-        "inference": {"noise_scale": 0.667,
+        "inference": {"noise_scale": 0.667,
+                      "length_scale": 1,
+                      "noise_w": 0.8,
+                      "add_diacritics": args.add_diacritics},
         "alphabet": phonemizer.alphabet.value,
         "phoneme_type": args.phoneme_type.value,
         "phonemizer_model": args.phonemizer_model,
@@ -375,7 +393,7 @@ def main() -> None:
         "num_symbols": len(final_phoneme_id_map),
         "num_speakers": len(speaker_counts) if is_multispeaker else 1,
         "speaker_id_map": speaker_ids,
-        "phoonnx_version":
+        "phoonnx_version": VERSION_STR,
     }
 
     with open(args.output_dir / "config.json", "w", encoding="utf-8") as config_file:
@@ -383,15 +401,23 @@ def main() -> None:
 
     # --- Apply final phoneme IDs and write dataset.jsonl ---
     _LOGGER.info("Writing dataset.jsonl...")
+    valid_utterances_count = 0
     with open(args.output_dir / "dataset.jsonl", "w", encoding="utf-8") as dataset_file:
         for utt in processed_utterances:
-            if utt.speaker is not None:
+            if is_multispeaker and utt.speaker is not None:
+                if utt.speaker not in speaker_ids:
+                    _LOGGER.error("Speaker '%s' not in speaker_id_map. This indicates an issue with your metadata.csv file.", utt.speaker)
+                    continue
                 utt.speaker_id = speaker_ids[utt.speaker]
 
             # Apply the final phoneme ID map to each utterance
             if utt.phonemes:
                 utt.phoneme_ids = phonemes_to_ids(utt.phonemes, id_map=final_phoneme_id_map)
 
+            if not utt.phoneme_ids:
+                _LOGGER.warning("Skipping utterance with invalid phoneme_ids before writing: %s", utt.audio_path)
+                continue
+
             json.dump(
                 utt.asdict(),
                 dataset_file,
@@ -399,8 +425,9 @@ def main() -> None:
                 cls=PathEncoder,
             )
             print("", file=dataset_file)
+            valid_utterances_count += 1
 
-    _LOGGER.info("Preprocessing complete.")
+    _LOGGER.info("Preprocessing complete. Wrote %d valid utterances to dataset.jsonl.", valid_utterances_count)
 
 
 # -----------------------------------------------------------------------------
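Example (not part of the package): the dataset.jsonl changes above mean utterances with unknown speakers or empty phoneme ID lists are now skipped rather than written. A self-contained sketch of that guard, with a toy `id_map` and a stand-in for `phonemes_to_ids` (both hypothetical):

import json

id_map = {"_": 0, "^": 1, "$": 2, "a": 3, "b": 4}  # toy phoneme_id_map

def phonemes_to_ids_sketch(phonemes, id_map):
    # Stand-in for phoonnx.phoneme_ids.phonemes_to_ids
    return [id_map[p] for p in phonemes if p in id_map]

utterances = [
    {"audio_path": "wavs/0001.wav", "phonemes": ["a", "b"]},
    {"audio_path": "wavs/0002.wav", "phonemes": []},  # produces no IDs, gets skipped
]

valid_utterances_count = 0
with open("dataset.jsonl", "w", encoding="utf-8") as dataset_file:
    for utt in utterances:
        utt["phoneme_ids"] = phonemes_to_ids_sketch(utt["phonemes"], id_map)
        if not utt["phoneme_ids"]:
            continue  # mirrors the new "invalid phoneme_ids" warning + skip
        json.dump(utt, dataset_file)
        print("", file=dataset_file)
        valid_utterances_count += 1

print(f"wrote {valid_utterances_count} valid utterances")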
phoonnx_train/vits/dataset.py
CHANGED
@@ -69,6 +69,8 @@ class PiperDataset(Dataset):
             self.utterances.extend(
                 PiperDataset.load_dataset(dataset_path, max_phoneme_ids=max_phoneme_ids)
             )
+        if not self.utterances:
+            raise ValueError("No utterances loaded")
 
     def __len__(self):
         return len(self.utterances)
@@ -120,6 +122,8 @@ class PiperDataset(Dataset):
     @staticmethod
     def load_utterance(line: str) -> Utterance:
         utt_dict = json.loads(line)
+        if not utt_dict["phoneme_ids"]:
+            raise ValueError(f"invalid utterance line - phoneme_ids not set ({line})")
         return Utterance(
             phoneme_ids=utt_dict["phoneme_ids"],
             audio_norm_path=Path(utt_dict["audio_norm_path"]),
{phoonnx-0.0.2a2.dist-info → phoonnx-0.1.0a3.dist-info}/WHEEL
File without changes
{phoonnx-0.0.2a2.dist-info → phoonnx-0.1.0a3.dist-info}/top_level.txt
File without changes