vox-indextts 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,17 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *$py.class
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .eggs/
8
+ *.egg
9
+ .venv/
10
+ venv/
11
+ .env
12
+ .pytest_cache/
13
+ .ruff_cache/
14
+ .mypy_cache/
15
+ *.so
16
+ *.dylib
17
+ .coverage
@@ -0,0 +1,39 @@
1
+ Metadata-Version: 2.4
2
+ Name: vox-indextts
3
+ Version: 0.1.0
4
+ Summary: IndexTTS adapters for Vox
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: numpy<2.4,>=1.26.0
7
+ Requires-Dist: soundfile>=0.13.1
8
+ Requires-Dist: vox-runtime>=0.2.2
9
+ Description-Content-Type: text/markdown
10
+
11
+ # vox-indextts
12
+
13
+ `vox-indextts` provides a Vox TTS adapter for IndexTTS2.
14
+
15
+ Adapters:
16
+
17
+ - `indextts-tts-torch` - IndexTTS2 voice cloning backend
18
+
19
+ ## Install
20
+
21
+ ```bash
22
+ pip install vox-indextts
23
+ ```
24
+
25
+ ## Runtime Dependencies
26
+
27
+ The adapter package is intentionally light. The upstream IndexTTS runtime is
28
+ installed on demand from GitHub into the isolated target runtime
29
+ `$VOX_HOME/runtime/indextts`.
30
+
31
+ ## Use with Vox
32
+
33
+ ```bash
34
+ vox pull indextts-tts-torch:2
35
+ vox run indextts-tts-torch:2 "Hello from IndexTTS"
36
+ ```
37
+
38
+ IndexTTS is a voice-cloning backend and needs a reference speaker. Pass `reference_audio`
39
+ through the Vox API, or set `voice` to the path of a local WAV file; otherwise synthesis fails.
@@ -0,0 +1,29 @@
1
+ # vox-indextts
2
+
3
+ `vox-indextts` provides a Vox TTS adapter for IndexTTS2.
4
+
5
+ Adapters:
6
+
7
+ - `indextts-tts-torch` - IndexTTS2 voice cloning backend
8
+
9
+ ## Install
10
+
11
+ ```bash
12
+ pip install vox-indextts
13
+ ```
14
+
15
+ ## Runtime Dependencies
16
+
17
+ The adapter package is intentionally light. The upstream IndexTTS runtime is
18
+ installed on demand from GitHub into the isolated target runtime
19
+ `$VOX_HOME/runtime/indextts`.
20
+
21
+ ## Use with Vox
22
+
23
+ ```bash
24
+ vox pull indextts-tts-torch:2
25
+ vox run indextts-tts-torch:2 "Hello from IndexTTS"
26
+ ```
27
+
28
+ IndexTTS is a voice-cloning backend and needs a reference speaker. Pass `reference_audio`
29
+ through the Vox API, or set `voice` to the path of a local WAV file; otherwise synthesis fails.
@@ -0,0 +1,27 @@
1
+ [project]
2
+ name = "vox-indextts"
3
+ version = "0.1.0"
4
+ description = "IndexTTS adapters for Vox"
5
+ readme = { file = "README.md", content-type = "text/markdown" }
6
+ requires-python = ">=3.11"
7
+ dependencies = [
8
+ "vox-runtime>=0.2.2",
9
+ "numpy>=1.26.0,<2.4",
10
+ "soundfile>=0.13.1",
11
+ ]
12
+
13
+ [tool.vox.adapter]
14
+ import-package = "vox_indextts"
15
+ runtime-policy = "target-runtime"
16
+ runtime-names = ["indextts"]
17
+ adapter-types = ["tts"]
18
+
19
+ [project.entry-points."vox.adapters"]
20
+ indextts-tts-torch = "vox_indextts.adapter:IndexTTSAdapter"
21
+
22
+ [build-system]
23
+ requires = ["hatchling"]
24
+ build-backend = "hatchling.build"
25
+
26
+ [tool.hatch.build.targets.wheel]
27
+ packages = ["src/vox_indextts"]
@@ -0,0 +1,3 @@
1
+ from vox_indextts.adapter import IndexTTSAdapter
2
+
3
+ __all__ = ["IndexTTSAdapter"]
@@ -0,0 +1,281 @@
1
+ from __future__ import annotations
2
+
3
+ import importlib
4
+ import logging
5
+ import subprocess
6
+ import tempfile
7
+ from collections.abc import AsyncIterator, Callable
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ import numpy as np
12
+ import soundfile as sf
13
+ from numpy.typing import NDArray
14
+
15
+ from vox.core.adapter import TTSAdapter
16
+ from vox.core.adapter_runtime import (
17
+ activate_runtime_path,
18
+ install_target_runtime_requirements,
19
+ purge_runtime_modules,
20
+ )
21
+ from vox.core.adapter_runtime import (
22
+ runtime_root as vox_runtime_root,
23
+ )
24
+ from vox.core.types import AdapterInfo, ModelFormat, ModelType, SynthesizeChunk, VoiceInfo
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+ INDEXTTS_SAMPLE_RATE = 24_000
29
+ INDEXTTS_REPO = "git+https://github.com/index-tts/index-tts.git"
30
+
31
+
32
def _runtime_root() -> Path:
    """Return the directory reserved for the isolated IndexTTS runtime.

    The directory is the ``indextts`` child of whatever
    ``vox.core.adapter_runtime.runtime_root`` reports. Nothing is created
    on disk here.
    """
    shared_root = vox_runtime_root()
    return shared_root / "indextts"
34
+
35
+
36
def _ensure_runtime_path() -> str:
    """Make sure the IndexTTS runtime directory exists and activate it.

    Returns:
        The value produced by ``activate_runtime_path`` for the runtime
        directory (presumably the activated path as a string -- confirm
        against ``vox.core.adapter_runtime``).
    """
    target_dir = _runtime_root()
    # Safe to call repeatedly: missing parents are created, an existing
    # directory is left untouched.
    target_dir.mkdir(parents=True, exist_ok=True)
    activated = activate_runtime_path(target_dir, root=target_dir.parent)
    return activated
40
+
41
+
42
+ def _run_install_command(cmd: list[str], timeout: int) -> subprocess.CompletedProcess[str]:
43
+ return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
44
+
45
+
46
def _install_indextts_runtime() -> None:
    """Install the upstream IndexTTS package into the isolated runtime.

    The requirement installed is ``INDEXTTS_REPO`` (a ``git+https`` URL),
    with a 1200 second timeout, using ``_run_install_command`` as the
    command runner.

    Raises:
        RuntimeError: If ``install_target_runtime_requirements`` returns a
            falsy value.
    """
    succeeded = install_target_runtime_requirements(
        _ensure_runtime_path(),
        (INDEXTTS_REPO,),
        timeout=1200,
        install_runner=_run_install_command,
        context="IndexTTS runtime install",
    )
    if succeeded:
        return
    raise RuntimeError("Failed to install IndexTTS runtime from GitHub.")
56
+
57
+
58
def _clear_indextts_modules() -> None:
    """Ask the Vox runtime helper to purge the ``indextts`` modules.

    Delegates to ``purge_runtime_modules``; presumably this evicts already
    imported ``indextts`` modules so that a later import picks up a fresh
    install -- confirm against ``vox.core.adapter_runtime``.
    """
    stale_packages = ("indextts",)
    purge_runtime_modules(stale_packages)
60
+
61
+
62
def _load_indextts_class() -> type[Any]:
    """Import ``indextts.infer_v2`` and return its ``IndexTTS2`` class.

    The runtime path is activated first. If the import fails, the IndexTTS
    runtime is installed on demand, previously imported ``indextts``
    modules are purged, the import system's caches are invalidated, and the
    import is retried exactly once.

    Returns:
        The ``IndexTTS2`` class object exposed by ``indextts.infer_v2``.

    Raises:
        RuntimeError: If the on-demand install fails, or if the module
            imports but has no ``IndexTTS2`` attribute.
        ImportError: If the module still cannot be imported after the
            install.
    """
    _ensure_runtime_path()
    try:
        module = importlib.import_module("indextts.infer_v2")
    except ImportError:
        _install_indextts_runtime()
        _clear_indextts_modules()
        # The package files were written after this interpreter started.
        # Path finders cache directory listings, so without invalidating
        # them the retry can miss the freshly installed package.
        importlib.invalidate_caches()
        module = importlib.import_module("indextts.infer_v2")

    cls = getattr(module, "IndexTTS2", None)
    if cls is None:
        raise RuntimeError("IndexTTS runtime is installed, but indextts.infer_v2.IndexTTS2 was not found.")
    return cls
75
+
76
+
77
+ def _voice_path(voice: str | None) -> str | None:
78
+ if not voice:
79
+ return None
80
+ path = Path(voice).expanduser()
81
+ return str(path) if path.is_file() else None
82
+
83
+
84
def _write_reference_audio(path: Path, reference_audio: NDArray[np.float32], sample_rate: int) -> None:
    """Write reference samples to *path* with ``soundfile``.

    Args:
        path: Destination file; missing parent directories are created.
        reference_audio: Samples to write; converted to ``float32`` first.
        sample_rate: Sample rate recorded in the written file.
    """
    destination_dir = path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)
    samples = np.asarray(reference_audio, dtype=np.float32)
    sf.write(path, samples, sample_rate)
87
+
88
+
89
def _read_audio(path: Path) -> tuple[NDArray[np.float32], int]:
    """Load an audio file as a flat mono ``float32`` signal.

    Args:
        path: Audio file readable by ``soundfile``.

    Returns:
        A ``(samples, sample_rate)`` pair where ``samples`` is a 1-D
        ``float32`` array and ``sample_rate`` is an ``int``.
    """
    audio, sample_rate = sf.read(path, dtype="float32", always_2d=False)
    samples = np.asarray(audio, dtype=np.float32)
    if samples.ndim > 1:
        # soundfile returns multi-channel files as (frames, channels).
        # Flattening that directly would interleave the channels into one
        # stream of channels * frames samples; the chunks this adapter
        # emits carry only a sample rate, so average down to mono instead.
        samples = samples.mean(axis=1, dtype=np.float32)
    return samples.reshape(-1), int(sample_rate)
92
+
93
+
94
def _audio_from_result(result: Any, fallback_path: Path) -> tuple[NDArray[np.float32], int]:
    """Turn whatever ``infer`` produced into ``(samples, sample_rate)``.

    Sources are tried in this order:

    1. *fallback_path*, when a file exists there.
    2. *result* itself, when it is a ``str`` or ``Path``.
    3. A path stored in a dict under ``audio_path``, ``wav_path`` or
       ``output_path``.
    4. Raw audio stored in a dict under ``audio``, ``wav`` or ``waveform``.
    5. *result* itself, treated as raw audio, when it is not ``None``.

    Raw audio (cases 4 and 5) is reported at ``INDEXTTS_SAMPLE_RATE``
    because no rate accompanies it.

    Raises:
        RuntimeError: If *result* is ``None`` and no fallback file exists.
    """
    if fallback_path.is_file():
        return _read_audio(fallback_path)

    path_like = (str, Path)
    if isinstance(result, path_like):
        return _read_audio(Path(result))

    if isinstance(result, dict):
        located = next(
            (
                result[key]
                for key in ("audio_path", "wav_path", "output_path")
                if isinstance(result.get(key), path_like)
            ),
            None,
        )
        if located is not None:
            return _read_audio(Path(located))

        raw_key = next((key for key in ("audio", "wav", "waveform") if key in result), None)
        if raw_key is not None:
            return _audio_array(result[raw_key]), INDEXTTS_SAMPLE_RATE

    if result is None:
        raise RuntimeError("IndexTTS produced no audio.")
    return _audio_array(result), INDEXTTS_SAMPLE_RATE
110
+
111
+
112
+ def _audio_array(audio: Any) -> NDArray[np.float32]:
113
+ if hasattr(audio, "detach"):
114
+ audio = audio.detach()
115
+ if hasattr(audio, "cpu"):
116
+ audio = audio.cpu()
117
+ if hasattr(audio, "numpy"):
118
+ audio = audio.numpy()
119
+ array = np.asarray(audio, dtype=np.float32).reshape(-1)
120
+ if array.size == 0:
121
+ raise RuntimeError("IndexTTS produced no audio.")
122
+ return array
123
+
124
+
125
+ def _candidate_model_configs(model_root: Path) -> list[Path]:
126
+ candidates = [
127
+ model_root / "config.yaml",
128
+ model_root / "config.yml",
129
+ model_root / "indextts2.yaml",
130
+ model_root / "checkpoints" / "config.yaml",
131
+ ]
132
+ return [candidate for candidate in candidates if candidate.is_file()]
133
+
134
+
135
def _construct_model(cls: type[Any], model_path: Path, device: str) -> Any:
    """Instantiate *cls* by trying a series of constructor signatures.

    For every config file found under *model_path*, three call shapes are
    tried: full keywords including the fp16 / CUDA-kernel / DeepSpeed
    flags, the keywords without those flags, and a positional form. Three
    config-less shapes follow. The first call that does not raise
    ``TypeError`` wins; any other exception propagates immediately.

    Args:
        cls: The model class to instantiate.
        model_path: Directory holding the model files.
        device: Device name; fp16 and the CUDA kernel are requested only
            when this is exactly ``"cuda"``.

    Returns:
        The constructed model instance.

    Raises:
        RuntimeError: If every call shape raised ``TypeError``; the
            collected messages are chained as the cause.
    """
    model_dir = str(model_path)
    on_cuda = device == "cuda"

    # Each entry is (positional args, keyword args) for one constructor call.
    call_specs: list[tuple[tuple[Any, ...], dict[str, Any]]] = []
    for cfg_file in _candidate_model_configs(model_path):
        cfg = str(cfg_file)
        call_specs.append(
            (
                (),
                {
                    "cfg_path": cfg,
                    "model_dir": model_dir,
                    "device": device,
                    "use_fp16": on_cuda,
                    "use_cuda_kernel": on_cuda,
                    "use_deepspeed": False,
                },
            )
        )
        call_specs.append(((), {"cfg_path": cfg, "model_dir": model_dir, "device": device}))
        call_specs.append(((cfg, model_dir), {"device": device}))
    call_specs += [
        ((), {"model_dir": model_dir, "device": device}),
        ((model_dir,), {"device": device}),
        ((model_dir,), {}),
    ]

    failures: list[str] = []
    for positional, keywords in call_specs:
        try:
            return cls(*positional, **keywords)
        except TypeError as exc:
            failures.append(str(exc))

    cause = TypeError("; ".join(failures)) if failures else None
    raise RuntimeError("Could not initialize IndexTTS2 with the available constructor signatures.") from cause
169
+
170
+
171
+ def _infer_to_file(model: Any, text: str, reference_path: str, output_path: Path) -> Any:
172
+ attempts: list[Callable[[], Any]] = [
173
+ lambda: model.infer(spk_audio_prompt=reference_path, text=text, output_path=str(output_path)),
174
+ lambda: model.infer(audio_prompt=reference_path, text=text, output_path=str(output_path)),
175
+ lambda: model.infer(text=text, audio_prompt=reference_path, output_path=str(output_path)),
176
+ lambda: model.infer(reference_path, text, str(output_path)),
177
+ ]
178
+ errors: list[str] = []
179
+ for attempt in attempts:
180
+ try:
181
+ return attempt()
182
+ except TypeError as exc:
183
+ errors.append(str(exc))
184
+ raise RuntimeError("Could not call IndexTTS2.infer with the supported adapter signatures.") from TypeError(
185
+ "; ".join(errors)
186
+ )
187
+
188
+
189
class IndexTTSAdapter(TTSAdapter):
    """Vox TTS adapter that drives an IndexTTS2 model for voice cloning.

    The upstream runtime is imported lazily in :meth:`load`. Synthesis is
    file based: a reference recording and an output path are handed to the
    model, and the resulting audio is read back and emitted in chunks.
    """

    def __init__(self) -> None:
        """Start unloaded, targeting the CPU at the default sample rate."""
        # Live IndexTTS2 instance; None until load() succeeds.
        self._model: Any | None = None
        # Device string recorded by the most recent load().
        self._device = "cpu"
        # Rate used when writing caller-supplied reference audio to disk.
        self._sample_rate = INDEXTTS_SAMPLE_RATE

    def info(self) -> AdapterInfo:
        """Describe this adapter: name, type, formats and capabilities."""
        return AdapterInfo(
            name="indextts-tts-torch",
            type=ModelType.TTS,
            architectures=("indextts-tts-torch", "indextts2", "indextts"),
            default_sample_rate=INDEXTTS_SAMPLE_RATE,
            supported_formats=(ModelFormat.PYTORCH,),
            supports_streaming=False,
            supports_voice_cloning=True,
            supported_languages=("en", "zh"),
        )

    def load(self, model_path: str, device: str, **kwargs: Any) -> None:
        """Load the IndexTTS2 model; a no-op when one is already loaded.

        Args:
            model_path: Directory holding the model files.
            device: Device string passed on to the model constructor.
            **kwargs: Extra options. ``_source`` is discarded; nothing else
                is read.

        Raises:
            RuntimeError: If the runtime cannot be installed or no
                supported constructor signature works.
        """
        if self._model is not None:
            return

        kwargs.pop("_source", None)
        self._device = device
        cls = _load_indextts_class()
        logger.info("Loading IndexTTS2 runtime from %s (device=%s)", model_path, self._device)
        self._model = _construct_model(cls, Path(model_path), self._device)

    def unload(self) -> None:
        """Drop the model reference and restore the initial settings."""
        self._model = None
        self._device = "cpu"
        self._sample_rate = INDEXTTS_SAMPLE_RATE

    @property
    def is_loaded(self) -> bool:
        """Whether a model instance is currently held."""
        return self._model is not None

    async def synthesize(
        self,
        text: str,
        *,
        voice: str | None = None,
        speed: float = 1.0,
        language: str | None = None,
        reference_audio: NDArray[np.float32] | None = None,
        reference_text: str | None = None,
    ) -> AsyncIterator[SynthesizeChunk]:
        """Synthesize *text* in the voice of a reference speaker.

        Args:
            text: Text to speak. Empty or whitespace-only text yields
                nothing at all.
            voice: Path of an existing audio file to use as the reference
                speaker when *reference_audio* is not given.
            speed: Accepted for interface compatibility; not used here.
            language: Accepted for interface compatibility; not used here.
            reference_audio: Reference speaker samples. They are written to
                a temporary WAV file at this adapter's sample rate and take
                precedence over *voice*.
            reference_text: Accepted for interface compatibility; not used
                here.

        Yields:
            Chunks of raw ``float32`` sample bytes, each covering up to two
            seconds of audio, followed by an empty chunk flagged
            ``is_final=True``.

        Raises:
            RuntimeError: If no model is loaded or no audio was produced.
            ValueError: If neither *reference_audio* nor a usable *voice*
                path is supplied.
        """
        if self._model is None:
            raise RuntimeError("IndexTTS model is not loaded — call load() first")
        if not text or not text.strip():
            return

        voice_file = _voice_path(voice)
        if reference_audio is None and voice_file is None:
            raise ValueError("IndexTTS requires reference_audio or a voice path for speaker cloning.")

        with tempfile.TemporaryDirectory(prefix="vox-indextts-") as tmpdir:
            tmpdir_path = Path(tmpdir)
            if reference_audio is not None:
                ref_path = tmpdir_path / "reference.wav"
                _write_reference_audio(ref_path, reference_audio, self._sample_rate)
                reference_path = str(ref_path)
            else:
                # The check above guarantees voice_file is a str here;
                # str() is a no-op that also narrows the type without
                # relying on assert, which is stripped under -O.
                reference_path = str(voice_file)

            output_path = tmpdir_path / "output.wav"
            result = _infer_to_file(self._model, text, reference_path, output_path)
            audio, sample_rate = _audio_from_result(result, output_path)

        # The audio is fully in memory at this point, so the scratch
        # directory is removed before the first yield. A consumer that
        # pauses or abandons this generator therefore leaves no temporary
        # files behind.
        chunk_size = sample_rate * 2
        for i in range(0, len(audio), chunk_size):
            yield SynthesizeChunk(
                audio=audio[i:i + chunk_size].tobytes(),
                sample_rate=sample_rate,
                is_final=False,
            )

        yield SynthesizeChunk(audio=b"", sample_rate=sample_rate, is_final=True)

    def list_voices(self) -> list[VoiceInfo]:
        """Return the single placeholder voice describing cloning usage."""
        return [
            VoiceInfo(
                id="reference",
                name="Reference audio",
                language=None,
                description="Pass reference_audio or a voice path to clone a speaker.",
                is_cloned=True,
            )
        ]

    def estimate_vram_bytes(self, **kwargs: Any) -> int:
        """Return a fixed estimate of 6,000,000,000 bytes; *kwargs* are ignored."""
        return 6_000_000_000