vox-indextts 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vox-indextts
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: IndexTTS adapters for Vox
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: numpy<2.4,>=1.26.0
|
|
7
|
+
Requires-Dist: soundfile>=0.13.1
|
|
8
|
+
Requires-Dist: vox-runtime>=0.2.2
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
|
|
11
|
+
# vox-indextts
|
|
12
|
+
|
|
13
|
+
`vox-indextts` provides a Vox TTS adapter for IndexTTS2.
|
|
14
|
+
|
|
15
|
+
Adapters:
|
|
16
|
+
|
|
17
|
+
- `indextts-tts-torch` - IndexTTS2 voice cloning backend
|
|
18
|
+
|
|
19
|
+
## Install
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
pip install vox-indextts
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Runtime Dependencies
|
|
26
|
+
|
|
27
|
+
The adapter package is intentionally light. The upstream IndexTTS runtime is
|
|
28
|
+
installed on demand from GitHub into the isolated target runtime
|
|
29
|
+
`$VOX_HOME/runtime/indextts`.
|
|
30
|
+
|
|
31
|
+
## Use with Vox
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
vox pull indextts-tts-torch:2
|
|
35
|
+
vox run indextts-tts-torch:2 "Hello from IndexTTS"
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
IndexTTS is a voice-cloning backend. Pass `reference_audio` through the Vox API
|
|
39
|
+
or use a voice value that points to a local WAV file; requests that supply neither are rejected.
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# vox-indextts
|
|
2
|
+
|
|
3
|
+
`vox-indextts` provides a Vox TTS adapter for IndexTTS2.
|
|
4
|
+
|
|
5
|
+
Adapters:
|
|
6
|
+
|
|
7
|
+
- `indextts-tts-torch` - IndexTTS2 voice cloning backend
|
|
8
|
+
|
|
9
|
+
## Install
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install vox-indextts
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Runtime Dependencies
|
|
16
|
+
|
|
17
|
+
The adapter package is intentionally light. The upstream IndexTTS runtime is
|
|
18
|
+
installed on demand from GitHub into the isolated target runtime
|
|
19
|
+
`$VOX_HOME/runtime/indextts`.
|
|
20
|
+
|
|
21
|
+
## Use with Vox
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
vox pull indextts-tts-torch:2
|
|
25
|
+
vox run indextts-tts-torch:2 "Hello from IndexTTS"
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
IndexTTS is a voice-cloning backend. Pass `reference_audio` through the Vox API
|
|
29
|
+
or use a voice value that points to a local WAV file; requests that supply neither are rejected.
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "vox-indextts"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "IndexTTS adapters for Vox"
|
|
5
|
+
readme = { file = "README.md", content-type = "text/markdown" }
|
|
6
|
+
requires-python = ">=3.11"
|
|
7
|
+
dependencies = [
|
|
8
|
+
"vox-runtime>=0.2.2",
|
|
9
|
+
"numpy>=1.26.0,<2.4",
|
|
10
|
+
"soundfile>=0.13.1",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
[tool.vox.adapter]
|
|
14
|
+
import-package = "vox_indextts"
|
|
15
|
+
runtime-policy = "target-runtime"
|
|
16
|
+
runtime-names = ["indextts"]
|
|
17
|
+
adapter-types = ["tts"]
|
|
18
|
+
|
|
19
|
+
[project.entry-points."vox.adapters"]
|
|
20
|
+
indextts-tts-torch = "vox_indextts.adapter:IndexTTSAdapter"
|
|
21
|
+
|
|
22
|
+
[build-system]
|
|
23
|
+
requires = ["hatchling"]
|
|
24
|
+
build-backend = "hatchling.build"
|
|
25
|
+
|
|
26
|
+
[tool.hatch.build.targets.wheel]
|
|
27
|
+
packages = ["src/vox_indextts"]
|
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import importlib
|
|
4
|
+
import logging
|
|
5
|
+
import subprocess
|
|
6
|
+
import tempfile
|
|
7
|
+
from collections.abc import AsyncIterator, Callable
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
import soundfile as sf
|
|
13
|
+
from numpy.typing import NDArray
|
|
14
|
+
|
|
15
|
+
from vox.core.adapter import TTSAdapter
|
|
16
|
+
from vox.core.adapter_runtime import (
|
|
17
|
+
activate_runtime_path,
|
|
18
|
+
install_target_runtime_requirements,
|
|
19
|
+
purge_runtime_modules,
|
|
20
|
+
)
|
|
21
|
+
from vox.core.adapter_runtime import (
|
|
22
|
+
runtime_root as vox_runtime_root,
|
|
23
|
+
)
|
|
24
|
+
from vox.core.types import AdapterInfo, ModelFormat, ModelType, SynthesizeChunk, VoiceInfo
|
|
25
|
+
|
|
26
|
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Sample rate (Hz) advertised in AdapterInfo, used when in-memory reference
# audio is written to disk, and paired with results that arrive as raw arrays.
INDEXTTS_SAMPLE_RATE = 24_000
# Requirement string handed to the target-runtime installer.
# NOTE(review): the URL carries no tag or commit, so presumably every fresh
# install tracks the repository's default branch — confirm this is intended.
INDEXTTS_REPO = "git+https://github.com/index-tts/index-tts.git"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _runtime_root() -> Path:
    """Return the ``indextts`` directory located under the Vox runtime root."""
    base_dir = vox_runtime_root()
    return base_dir / "indextts"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _ensure_runtime_path() -> str:
    """Create the IndexTTS runtime directory if needed and activate it.

    Returns:
        Whatever ``activate_runtime_path`` returns for that directory; the
        directory's parent is passed to it as ``root``.
    """
    target = _runtime_root()
    # Safe to call repeatedly: existing directories are left alone.
    target.mkdir(exist_ok=True, parents=True)
    activated = activate_runtime_path(target, root=target.parent)
    return activated
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _run_install_command(cmd: list[str], timeout: int) -> subprocess.CompletedProcess[str]:
    """Run *cmd* to completion, capturing stdout and stderr as text.

    The exit status is not checked here; the caller inspects the returned
    ``CompletedProcess``. ``subprocess.TimeoutExpired`` propagates when the
    command runs longer than *timeout* seconds.
    """
    completed = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        timeout=timeout,
    )
    return completed
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _install_indextts_runtime() -> None:
    """Install the upstream IndexTTS package into the target runtime.

    Raises:
        RuntimeError: if ``install_target_runtime_requirements`` reports
            failure by returning a falsy value.
    """
    installed = install_target_runtime_requirements(
        _ensure_runtime_path(),
        (INDEXTTS_REPO,),
        timeout=1200,
        install_runner=_run_install_command,
        context="IndexTTS runtime install",
    )
    if installed:
        return
    raise RuntimeError("Failed to install IndexTTS runtime from GitHub.")
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _clear_indextts_modules() -> None:
    """Pass the ``indextts`` name to ``purge_runtime_modules``.

    Called between installing the runtime and re-importing it; presumably the
    helper drops cached ``indextts`` modules — confirm against vox-runtime.
    """
    module_names = ("indextts",)
    purge_runtime_modules(module_names)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _load_indextts_class() -> type[Any]:
    """Import ``indextts.infer_v2`` and return its ``IndexTTS2`` attribute.

    If the first import raises ``ImportError``, the runtime is installed, the
    module purge helper is called, and the import is retried once. A second
    ``ImportError`` propagates to the caller.

    Raises:
        RuntimeError: if the module imports but ``IndexTTS2`` is missing
            (or is ``None``).
    """
    module_name = "indextts.infer_v2"
    _ensure_runtime_path()
    try:
        infer_module = importlib.import_module(module_name)
    except ImportError:
        # Not importable yet: install on demand, then import again.
        _install_indextts_runtime()
        _clear_indextts_modules()
        infer_module = importlib.import_module(module_name)

    model_cls = getattr(infer_module, "IndexTTS2", None)
    if model_cls is not None:
        return model_cls
    raise RuntimeError("IndexTTS runtime is installed, but indextts.infer_v2.IndexTTS2 was not found.")
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _voice_path(voice: str | None) -> str | None:
    """Interpret *voice* as the path of an existing file.

    Returns:
        The ``~``-expanded path as a string when it names a regular file;
        ``None`` when *voice* is empty/``None`` or no such file exists.
    """
    if voice:
        candidate = Path(voice).expanduser()
        if candidate.is_file():
            return str(candidate)
    return None
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _write_reference_audio(path: Path, reference_audio: NDArray[np.float32], sample_rate: int) -> None:
    """Write *reference_audio* to *path* through ``soundfile``.

    Missing parent directories of *path* are created first. The samples are
    converted to a float32 array before being handed to ``sf.write`` together
    with *sample_rate*.
    """
    path.parent.mkdir(exist_ok=True, parents=True)
    samples = np.asarray(reference_audio, dtype=np.float32)
    sf.write(path, samples, sample_rate)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _read_audio(path: Path) -> tuple[NDArray[np.float32], int]:
    """Read an audio file and return ``(mono float32 samples, sample_rate)``.

    Args:
        path: Audio file to read with ``soundfile``.

    Returns:
        A one-dimensional float32 array and the file's sample rate as ``int``.
        Single-channel files are returned unchanged; multi-channel files are
        downmixed to mono by averaging the channels.
    """
    audio, sample_rate = sf.read(path, dtype="float32", always_2d=False)
    samples = np.asarray(audio, dtype=np.float32)
    if samples.ndim > 1:
        # soundfile yields multi-channel data as (frames, channels). Flattening
        # that array would interleave the channels into one stream that is
        # N times too long for the reported sample rate, so average instead.
        samples = samples.mean(axis=1)
    return np.asarray(samples, dtype=np.float32).reshape(-1), int(sample_rate)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _audio_from_result(result: Any, fallback_path: Path) -> tuple[NDArray[np.float32], int]:
    """Turn whatever ``infer`` produced into ``(samples, sample_rate)``.

    Sources are tried in this order:

    1. *fallback_path*, when a file exists there;
    2. *result* itself, when it is a ``str`` or ``Path``;
    3. for a ``dict`` result, the first of ``audio_path`` / ``wav_path`` /
       ``output_path`` holding a ``str`` or ``Path``, then the first of
       ``audio`` / ``wav`` / ``waveform`` present in the dict;
    4. any other non-``None`` result, treated as raw samples.

    Raw samples are paired with ``INDEXTTS_SAMPLE_RATE``; files report their
    own rate.

    Raises:
        RuntimeError: if no file exists and *result* is ``None``.
    """
    if fallback_path.is_file():
        return _read_audio(fallback_path)

    path_types = (str, Path)
    if isinstance(result, path_types):
        return _read_audio(Path(result))

    if isinstance(result, dict):
        path_values = (result.get(name) for name in ("audio_path", "wav_path", "output_path"))
        located = next((value for value in path_values if isinstance(value, path_types)), None)
        if located is not None:
            return _read_audio(Path(located))

        present = next((name for name in ("audio", "wav", "waveform") if name in result), None)
        if present is not None:
            return _audio_array(result[present]), INDEXTTS_SAMPLE_RATE

    if result is None:
        raise RuntimeError("IndexTTS produced no audio.")
    # A dict without any recognised key also lands here, as in every other
    # non-None case.
    return _audio_array(result), INDEXTTS_SAMPLE_RATE
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _audio_array(audio: Any) -> NDArray[np.float32]:
    """Coerce *audio* into a flat float32 NumPy array.

    If the object exposes ``detach``, ``cpu`` or ``numpy`` (as tensor-like
    objects do), each available method is called in that order and its result
    replaces the object before the NumPy conversion.

    Raises:
        RuntimeError: if the resulting array has no elements.
    """
    for method_name in ("detach", "cpu", "numpy"):
        if hasattr(audio, method_name):
            audio = getattr(audio, method_name)()

    flat = np.asarray(audio, dtype=np.float32).reshape(-1)
    if flat.size != 0:
        return flat
    raise RuntimeError("IndexTTS produced no audio.")
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _candidate_model_configs(model_root: Path) -> list[Path]:
    """List the known config-file locations that exist under *model_root*.

    The result keeps the probing order: ``config.yaml``, ``config.yml``,
    ``indextts2.yaml``, then ``checkpoints/config.yaml``. Only regular files
    are included.
    """
    relative_locations = (
        ("config.yaml",),
        ("config.yml",),
        ("indextts2.yaml",),
        ("checkpoints", "config.yaml"),
    )
    found: list[Path] = []
    for parts in relative_locations:
        location = model_root.joinpath(*parts)
        if location.is_file():
            found.append(location)
    return found
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _construct_model(cls: type[Any], model_path: Path, device: str) -> Any:
    """Instantiate *cls* by probing a series of constructor signatures.

    For every config file found under *model_path* three calls are tried, from
    the most specific keyword form to a positional form; afterwards three
    config-less forms are tried. The first call that does not raise
    ``TypeError`` wins.

    Args:
        cls: The model class to instantiate.
        model_path: Directory holding the model files.
        device: Device string; fp16 and the CUDA kernel are requested only
            when it equals ``"cuda"``.

    Raises:
        RuntimeError: if every attempt raised ``TypeError``; the collected
            messages are chained as the cause.
    """
    model_dir = str(model_path)
    on_cuda = device == "cuda"

    # Each entry is (positional args, keyword args) for one constructor call.
    call_specs: list[tuple[tuple[Any, ...], dict[str, Any]]] = []
    for cfg in _candidate_model_configs(model_path):
        cfg_str = str(cfg)
        call_specs += [
            (
                (),
                {
                    "cfg_path": cfg_str,
                    "model_dir": model_dir,
                    "device": device,
                    "use_fp16": on_cuda,
                    "use_cuda_kernel": on_cuda,
                    "use_deepspeed": False,
                },
            ),
            ((), {"cfg_path": cfg_str, "model_dir": model_dir, "device": device}),
            ((cfg_str, model_dir), {"device": device}),
        ]
    call_specs += [
        ((), {"model_dir": model_dir, "device": device}),
        ((model_dir,), {"device": device}),
        ((model_dir,), {}),
    ]

    failures: list[str] = []
    for args, kwargs in call_specs:
        try:
            return cls(*args, **kwargs)
        except TypeError as exc:
            # Treated as "signature did not match". Note that a TypeError
            # raised from inside the constructor body is handled the same way.
            failures.append(str(exc))

    cause = TypeError("; ".join(failures)) if failures else None
    raise RuntimeError("Could not initialize IndexTTS2 with the available constructor signatures.") from cause
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _infer_to_file(model: Any, text: str, reference_path: str, output_path: Path) -> Any:
    """Call ``model.infer`` using the first calling convention it accepts.

    Conventions are tried in this order:

    1. ``infer(spk_audio_prompt=..., text=..., output_path=...)``
    2. ``infer(audio_prompt=..., text=..., output_path=...)``
    3. ``infer(reference_path, text, output_path)`` (positional)

    Args:
        model: Object exposing an ``infer`` method.
        text: Text to synthesize.
        reference_path: Path of the speaker reference audio.
        output_path: Where the model should write its audio; passed as ``str``.

    Returns:
        Whatever the first successful ``infer`` call returns.

    Raises:
        RuntimeError: if every convention raised ``TypeError``; the collected
            messages are chained as the cause.
    """
    target = str(output_path)
    # Keyword order never affects argument binding, so each keyword set is
    # listed once: repeating one in a different order would only re-run a
    # call that has already failed.
    call_specs: tuple[tuple[tuple[Any, ...], dict[str, Any]], ...] = (
        ((), {"spk_audio_prompt": reference_path, "text": text, "output_path": target}),
        ((), {"audio_prompt": reference_path, "text": text, "output_path": target}),
        ((reference_path, text, target), {}),
    )

    errors: list[str] = []
    for args, kwargs in call_specs:
        try:
            return model.infer(*args, **kwargs)
        except TypeError as exc:
            # Treated as "signature did not match"; try the next convention.
            errors.append(str(exc))

    raise RuntimeError("Could not call IndexTTS2.infer with the supported adapter signatures.") from TypeError(
        "; ".join(errors)
    )
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
class IndexTTSAdapter(TTSAdapter):
    """Vox TTS adapter that drives an IndexTTS2 model for voice cloning."""

    def __init__(self) -> None:
        # Nothing is loaded until load() runs; these are the idle defaults
        # that unload() restores as well.
        self._model: Any | None = None
        self._device = "cpu"
        self._sample_rate = INDEXTTS_SAMPLE_RATE

    def info(self) -> AdapterInfo:
        """Return the static description of this adapter."""
        return AdapterInfo(
            name="indextts-tts-torch",
            type=ModelType.TTS,
            architectures=("indextts-tts-torch", "indextts2", "indextts"),
            default_sample_rate=INDEXTTS_SAMPLE_RATE,
            supported_formats=(ModelFormat.PYTORCH,),
            supports_streaming=False,
            supports_voice_cloning=True,
            supported_languages=("en", "zh"),
        )

    def load(self, model_path: str, device: str, **kwargs: Any) -> None:
        """Load the IndexTTS2 model from *model_path* onto *device*.

        Does nothing when a model is already loaded. A ``_source`` keyword is
        discarded; any other extra keyword arguments are ignored.
        """
        if self._model is not None:
            return

        kwargs.pop("_source", None)
        self._device = device
        model_cls = _load_indextts_class()
        logger.info("Loading IndexTTS2 runtime from %s (device=%s)", model_path, self._device)
        self._model = _construct_model(model_cls, Path(model_path), self._device)

    def unload(self) -> None:
        """Drop the model reference and restore the idle defaults."""
        self._model = None
        self._device = "cpu"
        self._sample_rate = INDEXTTS_SAMPLE_RATE

    @property
    def is_loaded(self) -> bool:
        """Whether a model instance is currently held."""
        return self._model is not None

    async def synthesize(
        self,
        text: str,
        *,
        voice: str | None = None,
        speed: float = 1.0,
        language: str | None = None,
        reference_audio: NDArray[np.float32] | None = None,
        reference_text: str | None = None,
    ) -> AsyncIterator[SynthesizeChunk]:
        """Synthesize *text* in the voice of the supplied reference speaker.

        The speaker comes from *reference_audio* when given, otherwise from
        *voice* interpreted as the path of an existing file. *speed*,
        *language* and *reference_text* are accepted but not used here.

        Yields:
            Chunks holding up to two seconds of float32 sample bytes with
            ``is_final=False``, followed by one empty chunk with
            ``is_final=True``. Blank *text* yields nothing at all.

        Raises:
            RuntimeError: if no model is loaded.
            ValueError: if neither *reference_audio* nor a usable voice file
                was supplied.
        """
        if self._model is None:
            raise RuntimeError("IndexTTS model is not loaded — call load() first")
        if not (text and text.strip()):
            return

        voice_file = _voice_path(voice)
        if reference_audio is None and voice_file is None:
            raise ValueError("IndexTTS requires reference_audio or a voice path for speaker cloning.")

        # All scratch files live in a temporary directory; the audio is read
        # back into memory before the directory is removed.
        with tempfile.TemporaryDirectory(prefix="vox-indextts-") as scratch:
            scratch_dir = Path(scratch)
            if reference_audio is None:
                assert voice_file is not None
                prompt_path = voice_file
            else:
                written_ref = scratch_dir / "reference.wav"
                _write_reference_audio(written_ref, reference_audio, self._sample_rate)
                prompt_path = str(written_ref)

            wav_path = scratch_dir / "output.wav"
            raw_result = _infer_to_file(self._model, text, prompt_path, wav_path)
            samples, rate = _audio_from_result(raw_result, wav_path)

        # Two seconds' worth of samples per chunk.
        step = rate * 2
        for start in range(0, len(samples), step):
            yield SynthesizeChunk(
                audio=samples[start:start + step].tobytes(),
                sample_rate=rate,
                is_final=False,
            )

        yield SynthesizeChunk(audio=b"", sample_rate=rate, is_final=True)

    def list_voices(self) -> list[VoiceInfo]:
        """Return the single placeholder voice describing reference cloning."""
        placeholder = VoiceInfo(
            id="reference",
            name="Reference audio",
            language=None,
            description="Pass reference_audio or a voice path to clone a speaker.",
            is_cloned=True,
        )
        return [placeholder]

    def estimate_vram_bytes(self, **kwargs: Any) -> int:
        """Return a fixed estimate of 6,000,000,000 bytes; *kwargs* are ignored."""
        return 6_000_000_000
|