PyPI - phoonnx - Versions diffs - 0.1.0a1__py3-none-any.whl → 0.1.0a3__py3-none-any.whl - Mend

phoonnx 0.1.0a1__py3-none-any.whl → 0.1.0a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

phoonnx/config.py +4 -1
phoonnx/phonemizers/ar.py +36 -44
phoonnx/phonemizers/base.py +27 -1
phoonnx/phonemizers/he.py +6 -25
phoonnx/phonemizers/mul.py +617 -4
phoonnx/thirdparty/hangul2ipa.py +1 -0
phoonnx/thirdparty/mantoq/__init__.py +1 -26
phoonnx/thirdparty/phonikud/__init__.py +24 -0
phoonnx/version.py +5 -1
phoonnx/voice.py +4 -16
{phoonnx-0.1.0a1.dist-info → phoonnx-0.1.0a3.dist-info}/METADATA +2 -1
{phoonnx-0.1.0a1.dist-info → phoonnx-0.1.0a3.dist-info}/RECORD +17 -16
phoonnx_train/export_onnx.py +307 -56
phoonnx_train/preprocess.py +36 -9
phoonnx_train/vits/dataset.py +4 -0
{phoonnx-0.1.0a1.dist-info → phoonnx-0.1.0a3.dist-info}/WHEEL +0 -0
{phoonnx-0.1.0a1.dist-info → phoonnx-0.1.0a3.dist-info}/top_level.txt +0 -0

phoonnx/thirdparty/hangul2ipa.py CHANGED Viewed

@@ -1,3 +1,4 @@
+# taken from https://github.com/stannam/hangul_to_ipa
 import csv
 import math
 import os.path

phoonnx/thirdparty/mantoq/__init__.py CHANGED Viewed

@@ -3,14 +3,6 @@ from phoonnx.thirdparty.mantoq.buck.tokenization import (arabic_to_phonemes, pho
                                     phonemes_to_tokens, simplify_phonemes)
 from phoonnx.thirdparty.mantoq.buck.tokenization import tokens_to_ids as _tokens_to_id
 from phoonnx.thirdparty.mantoq.num2words import num2words
-import warnings
-from phoonnx.thirdparty.tashkeel import TashkeelDiacritizer
-try:
-    import onnxruntime
-    _TASHKEEL_AVAILABLE = True
-except ImportError:
-    _TASHKEEL_AVAILABLE = False
 _DIACRITIZER_INST = None
@@ -29,29 +21,12 @@ QUOTES_TABLE = str.maketrans(QUOTES, '"' * len(QUOTES))
 BRACKETS_TABLE = str.maketrans("[]{}", "()()")
-def tashkeel(text: str) -> str:
-    global _DIACRITIZER_INST
-    if not _TASHKEEL_AVAILABLE:
-        warnings.warn(
-            "Warning: The Tashkeel feature will not be available. Please re-install with the `libtashkeel` extra.",
-            UserWarning,
-        )
-        return text
-    if _DIACRITIZER_INST is None:
-        _DIACRITIZER_INST = TashkeelDiacritizer()
-    return _DIACRITIZER_INST.diacritize(text)
 def g2p(
     text: str,
-    add_tashkeel: bool = True,
     process_numbers: bool = True,
     append_eos: bool = False,
-) -> list[str]:
+) -> tuple[str, list[str]]:
     text = text.translate(AR_SPECIAL_PUNCS_TABLE).translate(QUOTES_TABLE).translate(BRACKETS_TABLE)
-    if add_tashkeel:
-        text = tashkeel(text)
     if process_numbers:
         text = num2words(text)
     normalized_text = text

phoonnx/thirdparty/phonikud/__init__.py ADDED Viewed

@@ -0,0 +1,24 @@
+import os
+import requests
+class PhonikudDiacritizer:
+    dl_url = "https://huggingface.co/thewh1teagle/phonikud-onnx/resolve/main/phonikud-1.0.int8.onnx"
+    def __init__(self):
+        base_path = os.path.expanduser("~/.local/share/phonikud")
+        fname = self.dl_url.split("/")[-1]
+        model = f"{base_path}/{fname}"
+        if not os.path.isfile(model):
+            os.makedirs(base_path, exist_ok=True)
+            # TODO - streaming download
+            data = requests.get(self.dl_url).content
+            with open(model, "wb") as f:
+                f.write(data)
+        from phonikud_onnx import Phonikud
+        self.phonikud = Phonikud(model)
+    def diacritize(self, text: str) -> str:
+        return self.phonikud.add_diacritics(text)

phoonnx/version.py CHANGED Viewed

@@ -2,5 +2,9 @@
 VERSION_MAJOR = 0
 VERSION_MINOR = 1
 VERSION_BUILD = 0
-VERSION_ALPHA = 1
+VERSION_ALPHA = 3
 # END_VERSION_BLOCK
+VERSION_STR = f"{VERSION_MAJOR}.{VERSION_MINOR}.{VERSION_BUILD}"
+if VERSION_ALPHA:
+    VERSION_STR += f"a{VERSION_ALPHA}"

phoonnx/voice.py CHANGED Viewed

@@ -14,7 +14,6 @@ from phoonnx.config import PhonemeType, VoiceConfig, SynthesisConfig, get_phonem
 from phoonnx.phoneme_ids import phonemes_to_ids, BlankBetween
 from phoonnx.phonemizers import Phonemizer
 from phoonnx.phonemizers.base import PhonemizedChunks
-from phoonnx.thirdparty.tashkeel import TashkeelDiacritizer
 _PHONEME_BLOCK_PATTERN = re.compile(r"(\[\[.*?\]\])")
@@ -113,11 +112,6 @@ class TTSVoice:
     phonemizer: Optional[Phonemizer] = None
-    # For Arabic text only
-    use_tashkeel: bool = True
-    tashkeel_diacritizier: Optional[TashkeelDiacritizer] = None  # For Arabic text only
-    taskeen_threshold: Optional[float] = 0.8
     def __post_init__(self):
         try:
             self.phonetic_spellings = PhoneticSpellings.from_lang(self.config.lang_code)
@@ -128,10 +122,6 @@ class TTSVoice:
                                              self.config.alphabet,
                                              self.config.phonemizer_model)
-        # compat with piper arabic models - TODO move to espeak phonemizer
-        if self.config.lang_code.split("-")[0] == "ar" and self.use_tashkeel and self.tashkeel_diacritizier is None:
-            self.tashkeel_diacritizier = TashkeelDiacritizer()
     @staticmethod
     def load(
             model_path: Union[str, Path],
@@ -209,12 +199,6 @@ class TTSVoice:
                 continue
-            # Arabic diacritization
-            if self.config.lang_code.split("-")[0] == "ar" and self.use_tashkeel:
-                text_part = self.tashkeel_diacritizier(
-                    text_part, taskeen_threshold=self.taskeen_threshold
-                )
             # Phonemization
             phonemes = self.phonemizer.phonemize(
                 text_part, self.config.lang_code
@@ -267,6 +251,10 @@ class TTSVoice:
         if self.phonetic_spellings and syn_config.enable_phonetic_spellings:
             text = self.phonetic_spellings.apply(text)
+        if syn_config.add_diacritics:
+            text = self.phonemizer.add_diacritics(text, self.config.lang_code)
+            LOG.debug("text+diacritics=%s", text)
         # All phonemization goes through the unified self.phonemize method
         sentence_phonemes = self.phonemize(text)
         LOG.debug("phonemes=%s", sentence_phonemes)

{phoonnx-0.1.0a1.dist-info → phoonnx-0.1.0a3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: phoonnx
-Version: 0.1.0a1
+Version: 0.1.0a3
 Home-page: https://github.com/TigreGotico/phoonnx
 Author: JarbasAi
 Author-email: jarbasai@mailfence.com
@@ -220,6 +220,7 @@ Requires-Dist: librosa<1,>=0.9.2; extra == "train"
 Requires-Dist: numpy<2,>=1.19.0; extra == "train"
 Requires-Dist: pytorch-lightning<2.0; extra == "train"
 Requires-Dist: torch<2,>=1.11.0; extra == "train"
+Requires-Dist: click; extra == "train"
 Provides-Extra: uew
 Requires-Dist: epitran; extra == "uew"
 Provides-Extra: ug

{phoonnx-0.1.0a1.dist-info → phoonnx-0.1.0a3.dist-info}/RECORD RENAMED Viewed

@@ -1,29 +1,29 @@
 phoonnx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-phoonnx/config.py,sha256=81H34oPG2BaiOA6UM1KapoT341n068LqRprKb5ER6mY,19451
+phoonnx/config.py,sha256=IYhC-kYjLgYmBroId6YeOE2Vp7SMNGtiGqIIe_09NJk,19531
 phoonnx/phoneme_ids.py,sha256=FiNgZwV6naEsBh6XwFLh3_FyOgPiCsK9qo7S0v-CmI4,13667
 phoonnx/util.py,sha256=XSjFEoqSFcujFTHxednacgC9GrSYyF-Il5L6Utmxmu4,25909
-phoonnx/version.py,sha256=95gLFCt-8xv9DgF7FIF6CljWmhm8SUhevumEBfo7Pl0,114
-phoonnx/voice.py,sha256=FR_LafK1vSi_anPERJjZBuH3Bb9vUIof0MAW6TnALlA,20024
+phoonnx/version.py,sha256=WnY5J2wtSTore9QbKwfk04gQhBsYq4HVmV5CBjEhGnk,236
+phoonnx/voice.py,sha256=JXjmbrhJd4mmTiLgz4O_Pa5_rKGUC9xzuBfqxYDw3Mg,19420
 phoonnx/locale/ca/phonetic_spellings.txt,sha256=igv3t7jxLSRE5GHsdn57HOpxiWNcEmECPql6m02wbO0,47
 phoonnx/locale/en/phonetic_spellings.txt,sha256=xGQlWOABLzbttpQvopl9CU-NnwEJRqKx8iuylsdUoQA,27
 phoonnx/locale/gl/phonetic_spellings.txt,sha256=igv3t7jxLSRE5GHsdn57HOpxiWNcEmECPql6m02wbO0,47
 phoonnx/locale/pt/phonetic_spellings.txt,sha256=KntS8QMynEJ5A3Clvcjq4qlmL-ThSbhfD6v0nKSrlqs,49
 phoonnx/phonemizers/__init__.py,sha256=QGBZk0QUgJdg2MwUWY9Kpk6ucwrEJYtHb07YcNvXCV4,1647
-phoonnx/phonemizers/ar.py,sha256=29bCfYhlhx0QX3PQyx3EkUghzh8YfkxNAnMAICXX6I8,4148
-phoonnx/phonemizers/base.py,sha256=yPg6-dvscYpl3rR3JEULG1PRF-i8DWC_C3HAZGLbxOo,7648
+phoonnx/phonemizers/ar.py,sha256=xxILq5iyH0kcI-NqFfRK4abGtpdUbykBjt_dZmPuO2w,3216
+phoonnx/phonemizers/base.py,sha256=FHvAsvSjAl_oSa1GoeEi96CQ_JO_xkKXWq0ukuMxiuo,8660
 phoonnx/phonemizers/en.py,sha256=N2SVoVhplQao7Ej5TXbxJU-YkAgkY0Fr9iYBFnsjFSE,9271
 phoonnx/phonemizers/fa.py,sha256=d_DZM2wqomf4gcRH_rFcNA3VkQWKHru8vwBwaNG8Ll8,1452
 phoonnx/phonemizers/gl.py,sha256=jEFKJJViHufZtB7lGNwWQCdWGiNKDCVZ_GRYXTaw_2c,6614
-phoonnx/phonemizers/he.py,sha256=KbRI3XRZa8UtJdNWmn_fd-t5lmFSIp4Mw8UgcO5l-Po,2211
+phoonnx/phonemizers/he.py,sha256=49OFS34wSFvvR9B3z2bGSzSLmlIvnn2HtkHBOkHS9Ns,1383
 phoonnx/phonemizers/ja.py,sha256=Xojsrt715ihnIiEk9K6giYqDo9Iykw-SHfIidrHtHSU,3834
 phoonnx/phonemizers/ko.py,sha256=kwWoOFqanCB8kv2JRx17A0hP78P1wbXlX6e8VBn1ezQ,2989
-phoonnx/phonemizers/mul.py,sha256=37G_G58aGnVpdEm9vZEAOdGEHJ9TLBE17bU1HFvQ2rU,27291
+phoonnx/phonemizers/mul.py,sha256=-h6uN_laUD-unNRGThzjyiOZpN6pSl4uinCndg5-0TA,94184
 phoonnx/phonemizers/vi.py,sha256=_XJc-Xeawr1Lxr7o8mE_hJao1aGcj4g01XYAOxC_Scg,1311
 phoonnx/phonemizers/zh.py,sha256=88Ywq8h9LDanlyz8RHjRSCY_PRK_Dq808tBADyrgaP8,9657
 phoonnx/thirdparty/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 phoonnx/thirdparty/arpa2ipa.py,sha256=Uj1G5NgP5oBBfSm26LGB8QoumdT-NqCLQTZHT165-_o,5850
 phoonnx/thirdparty/bw2ipa.py,sha256=5FiWC4AP4KXkqtbclbinoXEsUnSYEjk4VWAPasMMcbg,2328
-phoonnx/thirdparty/hangul2ipa.py,sha256=e2c0WOy5lFMcf6GS7pNqIbauMKBX07S84lCczZAZJGA,27518
+phoonnx/thirdparty/hangul2ipa.py,sha256=Pj06lL-GkOH4ZkLuakwQAT045fEVsijGhwoY_EEEVKc,27572
 phoonnx/thirdparty/zh_num.py,sha256=SESA6gvSJW3LZ0FLoybXn2SpbxqhQTi9Tg_U2IZ5JYY,7147
 phoonnx/thirdparty/cotovia/cotovia_aarch64,sha256=BsAWZN452Lm9kDU4i6rQGHFSlmxP3GfHRKhbJMUQrfA,6764592
 phoonnx/thirdparty/cotovia/cotovia_x86_64,sha256=-6BNx_cd49nnDreOAsGtVtePs_X76esrqcNAfmksN1o,1379832
@@ -37,7 +37,7 @@ phoonnx/thirdparty/ko_tables/tensification.csv,sha256=V4Xf3A1G1iMBzwZevBKQuk_lPa
 phoonnx/thirdparty/ko_tables/yale.csv,sha256=UhtDbPXRAAyAKoQMXmwhVBwJ5pfZQ_Duk28qBtRUdsU,297
 phoonnx/thirdparty/kog2p/__init__.py,sha256=yLizadg7RXM-3dQyftD4XSk8r2jb0QOlHQ6as9uUa4U,10267
 phoonnx/thirdparty/kog2p/rulebook.txt,sha256=FQE3nej8wojl6ilVUBYo7f8bIk0Hjci-B7HPXhM-xNc,9303
-phoonnx/thirdparty/mantoq/__init__.py,sha256=4kZuZ3RA5ZhQwTOQGkHF9jQYSvetNTn9uWi5Dsx101k,2106
+phoonnx/thirdparty/mantoq/__init__.py,sha256=02FftO4Onmp_S-XdukbBQ3aRVvqEQyo1frCLWgcF9cY,1428
 phoonnx/thirdparty/mantoq/num2words.py,sha256=9-ncMtxV1FusD9rNur1lu7l2DWhwUwI1mFiqiPSMH_Q,1264
 phoonnx/thirdparty/mantoq/unicode_symbol2label.py,sha256=CeZNv7qWeQS4Ejvz-sKgK--5eNYdVVv04WHPaOeK4gk,259409
 phoonnx/thirdparty/mantoq/buck/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -53,6 +53,7 @@ phoonnx/thirdparty/mantoq/pyarabic/number.py,sha256=NjFZPWRu-9dZDLgxfv9oDjmh-kWY
 phoonnx/thirdparty/mantoq/pyarabic/number_const.py,sha256=vAvRVENxTrl9gWPllSXF-yqK9fAW6htuA2d041btC_A,42361
 phoonnx/thirdparty/mantoq/pyarabic/stack.py,sha256=aJeSzQxVNdomDTWXuxIXWXVOc2BW_3iRWnwmBLkB8jM,1022
 phoonnx/thirdparty/mantoq/pyarabic/trans.py,sha256=cusyHk9Y01iuvMLJXxgCnIiGyAORzEdSosDKX4cAhPc,13713
+phoonnx/thirdparty/phonikud/__init__.py,sha256=g1dCelCZbwlKT0Ibaky6Ckp59wMw5g_1DDyDXauqFTg,760
 phoonnx/thirdparty/tashkeel/LICENSE,sha256=mQjTJ6MGAXzmYkO7x4O2VuEeSwCMx7lncbc26TnrVjw,1067
 phoonnx/thirdparty/tashkeel/SOURCE,sha256=SmnRz-Am5EXv-n2-RokJVEhnn8zeF1QZJVvMQDA_Qds,38
 phoonnx/thirdparty/tashkeel/__init__.py,sha256=FRdGNCTQaai9X077vlNh4tFOvWgm1U2lIUgnQKO5q0s,7119
@@ -61,8 +62,8 @@ phoonnx/thirdparty/tashkeel/input_id_map.json,sha256=cnpJqjx-k53AbzKyfC4GxMS771l
 phoonnx/thirdparty/tashkeel/model.onnx,sha256=UsQNQsoJT_n_B6CR0KHq_XuqXPI4jmCpzIm6zY5elV8,4788213
 phoonnx/thirdparty/tashkeel/target_id_map.json,sha256=baNAJL_UwP9U91mLt01aAEBRRNdGr-csFB_O6roh7TA,181
 phoonnx_train/__main__.py,sha256=FUAIsbQ-w2i_hoNiBuriQFk4uoryhL4ydyVY-hVjw1U,5086
-phoonnx_train/export_onnx.py,sha256=dcFJRZl4YvBk_Dj3j0aNAQVEqKfBHTzV22pzvQwSETQ,2909
-phoonnx_train/preprocess.py,sha256=0kto9Holywby6lnoQucBXq2wYEKDItRvdkvYbQnLJeo,14447
+phoonnx_train/export_onnx.py,sha256=CPfgNEm0hnXPSlgme0R9jr-6jZ5fKFpG5DZJFMkC-h4,12820
+phoonnx_train/preprocess.py,sha256=8_Opy5QVNjVmSVmh1_IF23bcNebVIEXuK2KcollIy28,15793
 phoonnx_train/norm_audio/__init__.py,sha256=Al_YwqMnENXRWp0c79cDZqbdd7pFYARXKxCfBaedr1c,3030
 phoonnx_train/norm_audio/trim.py,sha256=_ZsE3SYhahQSdEdBLeSwyFJGcvEbt-5E_lnWwTT4tcY,1698
 phoonnx_train/norm_audio/vad.py,sha256=DXHfRD0qqFJ52FjPvrL5LlN6keJWuc9Nf6TNhxpwC_4,1600
@@ -70,7 +71,7 @@ phoonnx_train/vits/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuF
 phoonnx_train/vits/attentions.py,sha256=yc_ViF8zR8z68DzphmVVVn27f9xK_5wi8S4ITLXVQL0,15134
 phoonnx_train/vits/commons.py,sha256=JsD8CdZ3ZcYYubYhw8So5hICBziFlCrKLrv1lMDRCDM,4645
 phoonnx_train/vits/config.py,sha256=oSuUIhw9Am7BQ5JwDgtCO-P1zRyN7nPgR-U1XuncJls,10789
-phoonnx_train/vits/dataset.py,sha256=DLLGSCkn3GF9uktoTprH1ERblZ18GO6-QsClQKWa98o,6804
+phoonnx_train/vits/dataset.py,sha256=1V1tVh5dSLjFMBsuzrAsoGtYWSBT4iU64Jdqi8oG-y0,7016
 phoonnx_train/vits/lightning.py,sha256=ZBuSIiJ7EUU1Za2V8Uh6-_HGGRW_qwpXLLs1cEDirHA,12301
 phoonnx_train/vits/losses.py,sha256=j-uINhBcYxVXFvFutiewQpTuw-qF-J6M6hdJVeOKqNE,1401
 phoonnx_train/vits/mel_processing.py,sha256=huIjbQgewSmM39hdzRZvZUCI7fTNSMmLcAv3f8zYb8k,3956
@@ -81,7 +82,7 @@ phoonnx_train/vits/utils.py,sha256=exiyrtPHbnnGvcHWSbaH9-gR6srH5ZPHlKiqV2IHUrQ,4
 phoonnx_train/vits/wavfile.py,sha256=oQZiTIrdw0oLTbcVwKfGXye1WtKte6qK_52qVwiMvfc,26396
 phoonnx_train/vits/monotonic_align/__init__.py,sha256=5IdAOD1Z7UloMb6d_9NRFsXoNIjEQ3h9mvOSh_AtO3k,636
 phoonnx_train/vits/monotonic_align/setup.py,sha256=0K5iJJ2mKIklx6ncEfCQS34skm5hHPiz9vRlQEvevvY,266
-phoonnx-0.1.0a1.dist-info/METADATA,sha256=9FZiRhA48da6ZbX1qCrKKVqsWMWQwfedz-bUXATd6Sk,8145
-phoonnx-0.1.0a1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-phoonnx-0.1.0a1.dist-info/top_level.txt,sha256=ZrnHXe-4HqbOSX6fbdY-JiP7YEu2Bok9T0ji351MrmM,22
-phoonnx-0.1.0a1.dist-info/RECORD,,
+phoonnx-0.1.0a3.dist-info/METADATA,sha256=3U1Ea0g2HxtWPsIs7NCxzPdo7ZTr4s_lRs9gIOC6MWY,8184
+phoonnx-0.1.0a3.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+phoonnx-0.1.0a3.dist-info/top_level.txt,sha256=ZrnHXe-4HqbOSX6fbdY-JiP7YEu2Bok9T0ji351MrmM,22
+phoonnx-0.1.0a3.dist-info/RECORD,,

phoonnx_train/export_onnx.py CHANGED Viewed

@@ -1,109 +1,360 @@
 #!/usr/bin/env python3
-import argparse
+import click
 import logging
+import json
+import os
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Dict, Any, Tuple
 import torch
 from phoonnx_train.vits.lightning import VitsModel
+from phoonnx.version import VERSION_STR
-_LOGGER = logging.getLogger("piper_train.export_onnx")
+# Basic logging configuration
+logging.basicConfig(level=logging.DEBUG)
+_LOGGER = logging.getLogger("phoonnx_train.export_onnx")
+# ONNX opset version
 OPSET_VERSION = 15
-def main() -> None:
-    """Main entry point"""
-    torch.manual_seed(1234)
+# --- Utility Functions ---
-    parser = argparse.ArgumentParser()
-    parser.add_argument("checkpoint", help="Path to model checkpoint (.ckpt)")
-    parser.add_argument("output", help="Path to output model (.onnx)")
+def add_meta_data(filename: Path, meta_data: Dict[str, Any]) -> None:
+    """
+    Add meta data to an ONNX model. The file is modified in-place.
-    parser.add_argument(
-        "--debug", action="store_true", help="Print DEBUG messages to the console"
-    )
-    args = parser.parse_args()
+    Args:
+      filename:
+        Path to the ONNX model file to be changed.
+      meta_data:
+        Key-value pairs to be stored as metadata. Values will be converted to strings.
+    """
+    try:
+        import onnx
+        # Load the ONNX model
+        model = onnx.load(str(filename))
+        # Clear existing metadata and add new properties
+        del model.metadata_props[:]
+        for key, value in meta_data.items():
+            meta = model.metadata_props.add()
+            meta.key = key
+            # Convert all values to string for ONNX metadata
+            meta.value = str(value)
+        onnx.save(model, str(filename))
+        _LOGGER.info(f"Added {len(meta_data)} metadata key/value pairs to ONNX model: {filename}")
+    except ImportError:
+        _LOGGER.error("The 'onnx' package is required to add metadata. Please install it with 'pip install onnx'.")
+    except Exception as e:
+        _LOGGER.error(f"Failed to add metadata to ONNX file {filename}: {e}")
+def export_tokens(config_path: Path, output_path: Path = Path("tokens.txt")) -> None:
+    """
+    Generates a tokens.txt file containing phoneme-to-id mapping from the model configuration.
+    The format is: `<phoneme> <id>` per line.
+    Args:
+        config_path: Path to the model configuration JSON file.
+        output_path: Path to save the resulting tokens.txt file.
+    """
+    try:
+        with open(config_path, "r", encoding="utf-8") as file:
+            config: Dict[str, Any] = json.load(file)
+    except Exception as e:
+        _LOGGER.error(f"Failed to load config file at {config_path}: {e}")
+        return
+    id_map: Optional[Dict[str, int]] = config.get("phoneme_id_map")
+    if not id_map:
+        _LOGGER.error("Could not find 'phoneme_id_map' in the config file.")
+        return
+    tokens_path = output_path
+    try:
+        with open(tokens_path, "w", encoding="utf-8") as f:
+            # Sort by ID to ensure a consistent output order
+            # The type hint for sorted_items is a list of tuples: List[Tuple[str, int]]
+            sorted_items: list[Tuple[str, int]] = sorted(id_map.items(), key=lambda item: item[1])
+            for s, i in sorted_items:
+                # Skip newlines or other invalid tokens if present in map
+                if s == "\n" or s == "":
+                    continue
+                f.write(f"{s} {i}\n")
+        _LOGGER.info(f"Generated tokens file at {tokens_path}")
+    except Exception as e:
+        _LOGGER.error(f"Failed to write tokens file to {tokens_path}: {e}")
+def convert_to_piper(config_path: Path, output_path: Path = Path("piper.json")) -> None:
+    """
+    Generates a Piper compatible JSON configuration file from the VITS model configuration.
+    This function currently serves as a placeholder for full Piper conversion logic.
+    Args:
+        config_path: Path to the VITS model configuration JSON file.
+        output_path: Path to save the resulting Piper JSON file.
+    """
+    with open(config_path, "r", encoding="utf-8") as file:
+        config: Dict[str, Any] = json.load(file)
+    piper_config = {
+        "phoneme_type": "espeak" if config.get("phoneme_type", "") == "espeak" else "raw",
+        "phoneme_map": {},
+        "audio": config.get("audio", {}),
+        "inference": config.get("inference", {}),
+        "phoneme_id_map": {k: [v] for k, v in config.get("phoneme_id_map", {}).items()},
+        "espeak": {
+            "voice": config.get("lang_code", "")
+        },
+        "language": {
+            "code": config.get("lang_code", "")
+        },
+        "num_symbols": config.get("num_symbols", 256),
+        "num_speakers": config.get("num_speakers", 1),
+        "speaker_id_map": {},
+        "piper_version": f"phoonnx-" + config.get("phoonnx_version", "0.0.0")
+    }
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(piper_config, f, indent=4, ensure_ascii=False)
-    if args.debug:
-        logging.basicConfig(level=logging.DEBUG)
-    else:
-        logging.basicConfig(level=logging.INFO)
-    _LOGGER.debug(args)
+# --- Main Logic using Click ---
+@click.command(help="Export a VITS model checkpoint to ONNX format.")
+@click.argument(
+    "checkpoint",
+    type=click.Path(exists=True, path_type=Path),
+  #  help="Path to the PyTorch checkpoint file (*.ckpt)."
+)
+@click.option(
+    "-c",
+    "--config",
+    type=click.Path(exists=True, path_type=Path),
+    help="Path to the model configuration JSON file."
+)
+@click.option(
+    "-o",
+    "--output-dir",
+    type=click.Path(path_type=Path),
+    default=Path(os.getcwd()), # Set default to current working directory
+    help="Output directory for the ONNX model. (Default: current directory)"
+)
+@click.option(
+    "-t",
+    "--generate-tokens",
+    is_flag=True,
+    help="Generate tokens.txt alongside the ONNX model. Some inference engines need this (eg. sherpa)"
+)
+@click.option(
+    "-p",
+    "--piper",
+    is_flag=True,
+    help="Generate a piper compatible .json file alongside the ONNX model."
+)
+def cli(
+        checkpoint: Path,
+        config: Path,
+        output_dir: Path,
+        generate_tokens: bool,
+        piper: bool,
+) -> None:
+    """
+    Main entry point for exporting a VITS model checkpoint to ONNX format.
+    Args:
+        checkpoint: Path to the PyTorch checkpoint file (*.ckpt).
+        config: Path to the model configuration JSON file.
+        output_dir: Output directory for the ONNX model and associated files.
+        generate_tokens: Flag to generate a tokens.txt file.
+        piper: Flag to generate a piper compatible .json file.
+    """
+    torch.manual_seed(1234)
+    _LOGGER.debug(f"Arguments: {checkpoint=}, {config=}, {output_dir=}, {generate_tokens=}, {piper=}")
     # -------------------------------------------------------------------------
+    # Paths and Setup
+    # Create output directory if it doesn't exist
+    output_dir.mkdir(parents=True, exist_ok=True)
+    _LOGGER.debug(f"Output directory ensured: {output_dir}")
+    # Load the phoonnx configuration
+    try:
+        with open(config, "r", encoding="utf-8") as f:
+            model_config: Dict[str, Any] = json.load(f)
+        _LOGGER.info(f"Loaded phoonnx config from {config}")
+    except Exception as e:
+        _LOGGER.error(f"Error loading config file {config}: {e}")
+        return
+    alphabet: str = model_config.get("alphabet", "")
+    phoneme_type: str = model_config.get("phoneme_type", "")
+    phonemizer_model: str = model_config.get("phonemizer_model", "")  # depends on phonemizer (eg. byt5)
+    piper_compatible: bool = alphabet == "ipa" and phoneme_type == "espeak"
-    args.checkpoint = Path(args.checkpoint)
-    args.output = Path(args.output)
-    args.output.parent.mkdir(parents=True, exist_ok=True)
+    # Ensure mandatory keys exist before accessing
+    sample_rate: int = model_config.get("audio", {}).get("sample_rate", 22050)
+    phoneme_id_map: Dict[str, int] = model_config.get("phoneme_id_map", {})
-    model = VitsModel.load_from_checkpoint(args.checkpoint, dataset=None)
-    model_g = model.model_g
+    if piper:
+        if not piper_compatible:
+            _LOGGER.warning("only models trained with ipa + espeak should be exported to piper. phonemization is not included in exported model.")
+        # Generate the piper.json file
+        piper_output_path = output_dir / f"{checkpoint.name}.piper.json"
+        convert_to_piper(config, piper_output_path)
-    num_symbols = model_g.n_vocab
-    num_speakers = model_g.n_speakers
+    if generate_tokens:
+        # Generate the tokens.txt file
+        tokens_output_path = output_dir / f"{checkpoint.name}.tokens.txt"
+        export_tokens(config, tokens_output_path)
-    # Inference only
+    # -------------------------------------------------------------------------
+    # Model Loading and Preparation
+    try:
+        model: VitsModel = VitsModel.load_from_checkpoint(
+            checkpoint,
+            dataset=None
+        )
+    except Exception as e:
+        _LOGGER.error(f"Error loading model checkpoint {checkpoint}: {e}")
+        return
+    model_g: torch.nn.Module = model.model_g
+    num_symbols: int = model_g.n_vocab
+    num_speakers: int = model_g.n_speakers
+    # Inference only setup
     model_g.eval()
     with torch.no_grad():
+        # Apply weight norm removal for inference mode
         model_g.dec.remove_weight_norm()
+        _LOGGER.debug("Removed weight normalization from decoder.")
+    # -------------------------------------------------------------------------
+    # Define ONNX-compatible forward function
+    def infer_forward(text: torch.Tensor, text_lengths: torch.Tensor, scales: torch.Tensor, sid: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """
+        Custom forward pass for ONNX export, simplifying the input scales and
+        returning only the audio tensor with shape [B, 1, T].
-    # old_forward = model_g.infer
+        Args:
+            text: Input phoneme sequence tensor, shape [B, T_in].
+            text_lengths: Tensor of sequence lengths, shape [B].
+            scales: Tensor containing [noise_scale, length_scale, noise_scale_w], shape [3].
+            sid: Optional speaker ID tensor, shape [B], for multi-speaker models.
-    def infer_forward(text, text_lengths, scales, sid=None):
-        noise_scale = scales[0]
-        length_scale = scales[1]
-        noise_scale_w = scales[2]
-        audio = model_g.infer(
+        Returns:
+            Generated audio tensor, shape [B, 1, T_out].
+        """
+        noise_scale: float = scales[0]
+        length_scale: float = scales[1]
+        noise_scale_w: float = scales[2]
+        # model_g.infer returns a tuple: (audio, attn, ids_slice, x_mask, z, z_mask, g)
+        audio: torch.Tensor = model_g.infer(
             text,
             text_lengths,
             noise_scale=noise_scale,
             length_scale=length_scale,
             noise_scale_w=noise_scale_w,
             sid=sid,
-        )[0].unsqueeze(1)
+        )[0].unsqueeze(1)  # [0] gets the audio tensor. unsqueeze(1) makes it [B, 1, T]
         return audio
+    # Replace the default forward with the inference one for ONNX export
     model_g.forward = infer_forward
-    dummy_input_length = 50
-    sequences = torch.randint(
+    # -------------------------------------------------------------------------
+    # Dummy Input Generation
+    dummy_input_length: int = 50
+    sequences: torch.Tensor = torch.randint(
         low=0, high=num_symbols, size=(1, dummy_input_length), dtype=torch.long
     )
-    sequence_lengths = torch.LongTensor([sequences.size(1)])
+    sequence_lengths: torch.Tensor = torch.LongTensor([sequences.size(1)])
     sid: Optional[torch.LongTensor] = None
+    input_names: list[str] = ["input", "input_lengths", "scales"]
+    dynamic_axes_map: Dict[str, Dict[int, str]] = {
+        "input": {0: "batch_size", 1: "phonemes"},
+        "input_lengths": {0: "batch_size"},
+        "output": {0: "batch_size", 1: "time"},
+    }
     if num_speakers > 1:
         sid = torch.LongTensor([0])
+        input_names.append("sid")
+        dynamic_axes_map["sid"] = {0: "batch_size"}
+        _LOGGER.debug(f"Multi-speaker model detected (n_speakers={num_speakers}). 'sid' included.")
-    # noise, noise_w, length
-    scales = torch.FloatTensor([0.667, 1.0, 0.8])
-    dummy_input = (sequences, sequence_lengths, scales, sid)
+    # noise, length, noise_w scales (hardcoded defaults)
+    scales: torch.Tensor = torch.FloatTensor([0.667, 1.0, 0.8])
+    dummy_input: Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.LongTensor]] = (
+        sequences, sequence_lengths, scales, sid
+    )
+    # -------------------------------------------------------------------------
     # Export
-    torch.onnx.export(
-        model=model_g,
-        args=dummy_input,
-        f=str(args.output),
-        verbose=False,
-        opset_version=OPSET_VERSION,
-        input_names=["input", "input_lengths", "scales", "sid"],
-        output_names=["output"],
-        dynamic_axes={
-            "input": {0: "batch_size", 1: "phonemes"},
-            "input_lengths": {0: "batch_size"},
-            "output": {0: "batch_size", 1: "time"},
-        },
-    )
+    model_output: Path = output_dir / f"{checkpoint.name}.onnx"
+    _LOGGER.info(f"Starting ONNX export to {model_output} (opset={OPSET_VERSION})...")
+    try:
+        torch.onnx.export(
+            model=model_g,
+            args=dummy_input,
+            f=str(model_output),
+            verbose=False,
+            opset_version=OPSET_VERSION,
+            input_names=input_names,
+            output_names=["output"],
+            dynamic_axes=dynamic_axes_map,
+        )
+        _LOGGER.info(f"Successfully exported model to {model_output}")
+    except Exception as e:
+        _LOGGER.error(f"Failed during torch.onnx.export: {e}")
+        return
+    # -------------------------------------------------------------------------
+    # Add Metadata
+    metadata_dict: Dict[str, Any] = {
+        "model_type": "vits",
+        "n_speakers": num_speakers,
+        "n_vocab": num_symbols,
+        "sample_rate": sample_rate,
+        "alphabet": alphabet,
+        "phoneme_type": phoneme_type,
+        "phonemizer_model": phonemizer_model,
+        "phoneme_id_map": json.dumps(phoneme_id_map),
+        "has_espeak": phoneme_type == "espeak"
+    }
+    if piper_compatible:
+        metadata_dict["comment"] = "piper"
+    try:
+        add_meta_data(model_output, metadata_dict)
+    except Exception as e:
+        _LOGGER.error(f"Failed to add metadata to exported model {model_output}: {e}")
-    _LOGGER.info("Exported model to %s", args.output)
+    _LOGGER.info("Export complete.")
 # -----------------------------------------------------------------------------
 if __name__ == "__main__":
-    main()
+    cli()

phoonnx 0.1.0a1__py3-none-any.whl → 0.1.0a3__py3-none-any.whl

phoonnx 0.1.0a1__py3-none-any.whl → 0.1.0a3__py3-none-any.whl