vox-chatterbox 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vox-chatterbox
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Resemble Chatterbox TTS adapters for Vox
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: numpy<2.4,>=1.26.0
|
|
7
|
+
Requires-Dist: soundfile>=0.13.1
|
|
8
|
+
Requires-Dist: vox-runtime>=0.2.2
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
|
|
11
|
+
# vox-chatterbox
|
|
12
|
+
|
|
13
|
+
`vox-chatterbox` provides Vox TTS adapters for Resemble AI Chatterbox.
|
|
14
|
+
|
|
15
|
+
Adapters:
|
|
16
|
+
|
|
17
|
+
- `chatterbox-tts-turbo` - Chatterbox Turbo backend
|
|
18
|
+
- `chatterbox-tts` - Chatterbox backend
|
|
19
|
+
- `chatterbox-tts-multilingual` - Chatterbox multilingual backend
|
|
20
|
+
|
|
21
|
+
## Install
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install vox-chatterbox
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Runtime Dependencies
|
|
28
|
+
|
|
29
|
+
The adapter package is intentionally light. The Chatterbox backend package is
|
|
30
|
+
installed on demand into the isolated target runtime
|
|
31
|
+
`$VOX_HOME/runtime/chatterbox`.
|
|
32
|
+
|
|
33
|
+
## Use with Vox
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
vox pull chatterbox-tts-turbo:0.1.7
|
|
37
|
+
vox run chatterbox-tts-turbo:0.1.7 "Hello from Chatterbox"
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
For voice cloning, pass a reference audio sample through the Vox API or use a
|
|
41
|
+
voice value that points to a local WAV file.
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# vox-chatterbox
|
|
2
|
+
|
|
3
|
+
`vox-chatterbox` provides Vox TTS adapters for Resemble AI Chatterbox.
|
|
4
|
+
|
|
5
|
+
Adapters:
|
|
6
|
+
|
|
7
|
+
- `chatterbox-tts-turbo` - Chatterbox Turbo backend
|
|
8
|
+
- `chatterbox-tts` - Chatterbox backend
|
|
9
|
+
- `chatterbox-tts-multilingual` - Chatterbox multilingual backend
|
|
10
|
+
|
|
11
|
+
## Install
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
pip install vox-chatterbox
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Runtime Dependencies
|
|
18
|
+
|
|
19
|
+
The adapter package is intentionally light. The Chatterbox backend package is
|
|
20
|
+
installed on demand into the isolated target runtime
|
|
21
|
+
`$VOX_HOME/runtime/chatterbox`.
|
|
22
|
+
|
|
23
|
+
## Use with Vox
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
vox pull chatterbox-tts-turbo:0.1.7
|
|
27
|
+
vox run chatterbox-tts-turbo:0.1.7 "Hello from Chatterbox"
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
For voice cloning, pass a reference audio sample through the Vox API or use a
|
|
31
|
+
voice value that points to a local WAV file.
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "vox-chatterbox"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Resemble Chatterbox TTS adapters for Vox"
|
|
5
|
+
readme = { file = "README.md", content-type = "text/markdown" }
|
|
6
|
+
requires-python = ">=3.11"
|
|
7
|
+
dependencies = [
|
|
8
|
+
"vox-runtime>=0.2.2",
|
|
9
|
+
"numpy>=1.26.0,<2.4",
|
|
10
|
+
"soundfile>=0.13.1",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
[tool.vox.adapter]
|
|
14
|
+
import-package = "vox_chatterbox"
|
|
15
|
+
runtime-policy = "target-runtime"
|
|
16
|
+
runtime-names = ["chatterbox"]
|
|
17
|
+
adapter-types = ["tts"]
|
|
18
|
+
|
|
19
|
+
[project.entry-points."vox.adapters"]
|
|
20
|
+
chatterbox-tts-turbo = "vox_chatterbox.adapter:ChatterboxTurboAdapter"
|
|
21
|
+
chatterbox-tts = "vox_chatterbox.adapter:ChatterboxAdapter"
|
|
22
|
+
chatterbox-tts-multilingual = "vox_chatterbox.adapter:ChatterboxMultilingualAdapter"
|
|
23
|
+
|
|
24
|
+
[build-system]
|
|
25
|
+
requires = ["hatchling"]
|
|
26
|
+
build-backend = "hatchling.build"
|
|
27
|
+
|
|
28
|
+
[tool.hatch.build.targets.wheel]
|
|
29
|
+
packages = ["src/vox_chatterbox"]
|
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import importlib
|
|
4
|
+
import logging
|
|
5
|
+
import subprocess
|
|
6
|
+
import tempfile
|
|
7
|
+
from collections.abc import AsyncIterator
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
import soundfile as sf
|
|
13
|
+
from numpy.typing import NDArray
|
|
14
|
+
|
|
15
|
+
from vox.core.adapter import TTSAdapter
|
|
16
|
+
from vox.core.adapter_runtime import (
|
|
17
|
+
activate_runtime_path,
|
|
18
|
+
install_target_runtime_requirements,
|
|
19
|
+
purge_runtime_modules,
|
|
20
|
+
)
|
|
21
|
+
from vox.core.adapter_runtime import (
|
|
22
|
+
runtime_root as vox_runtime_root,
|
|
23
|
+
)
|
|
24
|
+
from vox.core.types import AdapterInfo, ModelFormat, ModelType, SynthesizeChunk, VoiceInfo
|
|
25
|
+
|
|
26
|
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Sample rate used until a loaded model reports its own (see ``_sample_rate``).
CHATTERBOX_SAMPLE_RATE = 24_000

# Requirement specifiers installed on demand into the Chatterbox target runtime.
CHATTERBOX_RUNTIME_DEPS = ("chatterbox-tts>=0.1.7,<0.2.0",)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _runtime_root() -> Path:
    """Return the ``chatterbox`` directory located under the Vox runtime root."""
    base_dir = vox_runtime_root()
    return base_dir / "chatterbox"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _ensure_runtime_path() -> str:
    """Create the Chatterbox runtime directory if needed and activate it.

    Returns:
        The value returned by ``activate_runtime_path`` for that directory.
    """
    target = _runtime_root()
    # Idempotent: an already existing directory is not an error.
    target.mkdir(parents=True, exist_ok=True)
    activated = activate_runtime_path(target, root=target.parent)
    return activated
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _run_install_command(cmd: list[str], timeout: int) -> subprocess.CompletedProcess[str]:
    """Run ``cmd`` as a subprocess, capturing stdout/stderr as text.

    Args:
        cmd: Argument vector to execute (no shell is involved).
        timeout: Maximum run time in seconds, forwarded to ``subprocess.run``.

    Returns:
        The ``CompletedProcess`` describing the finished command.
    """
    run_options = {
        "capture_output": True,
        "text": True,
        "timeout": timeout,
    }
    return subprocess.run(cmd, **run_options)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _install_chatterbox_runtime() -> None:
    """Install ``CHATTERBOX_RUNTIME_DEPS`` into the Chatterbox target runtime.

    Raises:
        RuntimeError: If ``install_target_runtime_requirements`` returns a
            falsy result.
    """
    target_path = _ensure_runtime_path()
    installed = install_target_runtime_requirements(
        target_path,
        CHATTERBOX_RUNTIME_DEPS,
        timeout=900,
        install_runner=_run_install_command,
        context="Chatterbox runtime install",
    )
    if installed:
        return
    raise RuntimeError("Failed to install Chatterbox runtime package.")
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _clear_chatterbox_modules() -> None:
    """Ask ``purge_runtime_modules`` to drop the Chatterbox-related modules."""
    module_names = ("chatterbox", "s3tokenizer")
    purge_runtime_modules(module_names)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _load_chatterbox_class(module_name: str, class_name: str) -> type[Any]:
    """Import ``module_name`` from the Chatterbox runtime and return ``class_name``.

    If the first import fails, the runtime is installed, the Chatterbox
    modules are purged, and the import is attempted once more.

    Args:
        module_name: Dotted module path inside the Chatterbox package.
        class_name: Attribute to look up on the imported module.

    Raises:
        RuntimeError: If the install fails or the module lacks ``class_name``.
        ImportError: If the module still cannot be imported after installing.
    """
    _ensure_runtime_path()
    try:
        runtime_module = importlib.import_module(module_name)
    except ImportError:
        # Runtime missing or incomplete: install it, then retry exactly once.
        _install_chatterbox_runtime()
        _clear_chatterbox_modules()
        runtime_module = importlib.import_module(module_name)

    resolved = getattr(runtime_module, class_name, None)
    if resolved is not None:
        return resolved
    raise RuntimeError(
        f"Chatterbox runtime is installed, but {module_name}.{class_name} was not found."
    )
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _float_audio(audio: Any) -> NDArray[np.float32]:
    """Convert a model's ``generate`` result into a flat float32 sample array.

    Supported inputs:
      * a dict holding the samples under ``"audio"``, ``"wav"`` or
        ``"waveform"`` (the first key present is used);
      * a non-empty tuple/list whose first element is not a plain number, in
        which case only that first element is converted;
      * an object exposing ``detach``/``cpu``/``numpy`` (each is called when
        present, in that order);
      * anything else ``np.asarray`` can turn into float32 data.

    Args:
        audio: Raw value returned by the model.

    Returns:
        A one-dimensional float32 array of samples.

    Raises:
        RuntimeError: If ``audio`` is ``None``, is a dict without any of the
            known keys, or contains no samples.
    """
    if audio is None:
        # np.asarray(None, dtype=float32) would silently yield one NaN sample.
        raise RuntimeError("Chatterbox produced no audio.")

    if isinstance(audio, dict):
        for key in ("audio", "wav", "waveform"):
            if key in audio:
                return _float_audio(audio[key])
        raise RuntimeError("Chatterbox returned a dict without audio data.")

    # A sequence whose head is not a number is treated as a wrapper around the
    # samples; a sequence of plain numbers is the samples themselves.
    is_sequence = isinstance(audio, (tuple, list))
    if is_sequence and audio and not isinstance(audio[0], (int, float, np.number)):
        return _float_audio(audio[0])

    # Tensor-like objects: call each conversion hook that exists, in order.
    for hook in ("detach", "cpu", "numpy"):
        if hasattr(audio, hook):
            audio = getattr(audio, hook)()

    samples = np.asarray(audio, dtype=np.float32).reshape(-1)
    if samples.size == 0:
        raise RuntimeError("Chatterbox produced no audio.")
    return samples
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _sample_rate(model: Any) -> int:
    """Return the sample rate advertised by ``model``, or the module default.

    The attributes ``sr``, ``sample_rate`` and ``sampling_rate`` are checked in
    that order; the first positive integer value wins. NumPy integer scalars
    are accepted and converted to ``int``. Booleans are ignored even though
    ``bool`` is a subclass of ``int``.

    Args:
        model: Any object; missing attributes are simply skipped.

    Returns:
        The detected rate as a plain ``int``, or ``CHATTERBOX_SAMPLE_RATE``
        when no usable attribute is found.
    """
    for attr in ("sr", "sample_rate", "sampling_rate"):
        value = getattr(model, attr, None)
        if isinstance(value, bool) or not isinstance(value, (int, np.integer)):
            continue
        if value > 0:
            return int(value)
    return CHATTERBOX_SAMPLE_RATE
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _load_model(cls: type[Any], device: str) -> Any:
    """Instantiate a Chatterbox model class, preferring ``from_pretrained``.

    The chosen constructor is first called with ``device=device``; if that
    raises ``TypeError`` it is called again with no arguments.

    Args:
        cls: Model class to instantiate.
        device: Device identifier passed as the ``device`` keyword.

    Returns:
        The constructed model instance.
    """
    # Use the ``from_pretrained`` factory when the class has one, else the class.
    constructor = cls.from_pretrained if hasattr(cls, "from_pretrained") else cls
    try:
        return constructor(device=device)
    except TypeError:
        return constructor()
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _voice_path(voice: str | None) -> str | None:
    """Resolve ``voice`` to an existing file path, if it names one.

    Args:
        voice: Voice value supplied by the caller; may be empty or ``None``.

    Returns:
        The user-expanded path as a string when it points to a regular file,
        otherwise ``None``.
    """
    if voice:
        candidate = Path(voice).expanduser()
        if candidate.is_file():
            return str(candidate)
    return None
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _write_reference_audio(path: Path, reference_audio: NDArray[np.float32], sample_rate: int) -> None:
    """Write ``reference_audio`` to ``path`` through ``soundfile``.

    Args:
        path: Destination file; missing parent directories are created.
        reference_audio: Samples to write; converted to float32 first.
        sample_rate: Sample rate passed to ``sf.write``.
    """
    destination_dir = path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)
    samples = np.asarray(reference_audio, dtype=np.float32)
    sf.write(path, samples, sample_rate)
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
class _BaseChatterboxAdapter(TTSAdapter):
    """Shared Vox TTS adapter implementation for the Chatterbox backends.

    Concrete adapters select a backend by overriding the class attributes
    below; all loading and synthesis logic lives here.
    """

    # Name reported through ``info()``.
    adapter_name = "chatterbox-tts"
    # Architecture identifiers reported through ``info()``.
    architectures = ("chatterbox-tts", "chatterbox")
    # Module path and class name imported from the Chatterbox runtime in ``load()``.
    runtime_module = "chatterbox.tts"
    runtime_class = "ChatterboxTTS"
    # Languages reported through ``info()``. Any value other than ("en",) makes
    # ``synthesize()`` pass a ``language_id`` to the model.
    supported_languages: tuple[str, ...] = ("en",)
    supports_streaming = False

    def __init__(self) -> None:
        """Create an adapter with no model loaded."""
        self._model: Any | None = None
        self._device = "cpu"
        self._sample_rate = CHATTERBOX_SAMPLE_RATE

    def info(self) -> AdapterInfo:
        """Describe this adapter using its class-level configuration."""
        return AdapterInfo(
            name=self.adapter_name,
            type=ModelType.TTS,
            architectures=self.architectures,
            default_sample_rate=CHATTERBOX_SAMPLE_RATE,
            supported_formats=(ModelFormat.PYTORCH,),
            supports_streaming=self.supports_streaming,
            supports_voice_cloning=True,
            supported_languages=self.supported_languages,
        )

    def load(self, model_path: str, device: str, **kwargs: Any) -> None:
        """Load the Chatterbox model onto ``device``; a no-op if already loaded.

        Args:
            model_path: Accepted for interface compatibility; not used here.
            device: Device identifier handed to the model constructor.
            **kwargs: Extra options; ``_source`` is discarded, the rest ignored.
        """
        if self._model is not None:
            return

        kwargs.pop("_source", None)
        self._device = device
        cls = _load_chatterbox_class(self.runtime_module, self.runtime_class)
        logger.info("Loading Chatterbox runtime %s (device=%s)", self.runtime_class, self._device)
        self._model = _load_model(cls, self._device)
        self._sample_rate = _sample_rate(self._model)

    def unload(self) -> None:
        """Drop the model reference and reset device/sample rate to defaults."""
        self._model = None
        self._device = "cpu"
        self._sample_rate = CHATTERBOX_SAMPLE_RATE

    @property
    def is_loaded(self) -> bool:
        """Whether a model is currently loaded."""
        return self._model is not None

    @staticmethod
    def _accepted_generate_kwargs(generate: Any, kwargs: dict[str, Any]) -> dict[str, Any]:
        """Return the subset of ``kwargs`` that ``generate`` can receive by keyword.

        Everything is kept when the signature cannot be inspected or when
        ``generate`` declares ``**kwargs``. Otherwise names that ``generate``
        does not declare are dropped and logged, so an unsupported option does
        not abort synthesis with ``TypeError``.
        """
        import inspect

        try:
            parameters = list(inspect.signature(generate).parameters.values())
        except (TypeError, ValueError):
            # Not introspectable: keep the plain pass-through behaviour.
            return dict(kwargs)

        if any(p.kind is inspect.Parameter.VAR_KEYWORD for p in parameters):
            return dict(kwargs)

        keyword_kinds = (
            inspect.Parameter.POSITIONAL_OR_KEYWORD,
            inspect.Parameter.KEYWORD_ONLY,
        )
        declared = {p.name for p in parameters if p.kind in keyword_kinds}
        accepted = {name: value for name, value in kwargs.items() if name in declared}
        dropped = sorted(set(kwargs) - set(accepted))
        if dropped:
            logger.warning(
                "Chatterbox %s.generate() does not accept %s; ignoring",
                type(getattr(generate, "__self__", None)).__name__,
                ", ".join(dropped),
            )
        return accepted

    def _generate_audio(self, text: str, kwargs: dict[str, Any]) -> NDArray[np.float32]:
        """Run the loaded model on ``text`` and return flat float32 samples."""
        generate = self._model.generate
        accepted = self._accepted_generate_kwargs(generate, kwargs)
        return _float_audio(generate(text, **accepted))

    async def synthesize(
        self,
        text: str,
        *,
        voice: str | None = None,
        speed: float = 1.0,
        language: str | None = None,
        reference_audio: NDArray[np.float32] | None = None,
        reference_text: str | None = None,
    ) -> AsyncIterator[SynthesizeChunk]:
        """Synthesize ``text`` and yield the audio in chunks.

        Args:
            text: Text to speak; empty or whitespace-only text yields nothing.
            voice: Optional path to an existing audio file used as voice prompt.
            speed: Requested speed; forwarded only when different from 1.0.
            language: Language code for multilingual adapters; defaults to "en".
            reference_audio: Optional reference samples; written to a temporary
                WAV file and used as voice prompt, taking precedence over ``voice``.
            reference_text: Optional transcript of the reference audio.

        Yields:
            Chunks of raw float32 sample bytes, each covering up to two
            seconds, followed by one empty chunk with ``is_final=True``.

        Raises:
            RuntimeError: If no model is loaded or the model produced no audio.
        """
        if self._model is None:
            raise RuntimeError("Chatterbox model is not loaded — call load() first")
        if not text or not text.strip():
            return

        # NOTE(review): the published ChatterboxTTS.generate signature does not
        # appear to declare ``speed`` or ``audio_prompt_text``; unsupported
        # options are filtered in ``_generate_audio`` — confirm against the
        # pinned chatterbox-tts release.
        kwargs: dict[str, Any] = {}
        if speed and speed != 1.0:
            kwargs["speed"] = speed
        if reference_text:
            kwargs["audio_prompt_text"] = reference_text
        if self.supported_languages != ("en",):
            kwargs["language_id"] = language or "en"

        voice_file = _voice_path(voice)
        if voice_file is not None:
            kwargs["audio_prompt_path"] = voice_file

        if reference_audio is not None:
            # The temporary file must exist for the whole generate() call.
            with tempfile.TemporaryDirectory(prefix="vox-chatterbox-") as tmpdir:
                ref_path = Path(tmpdir) / "reference.wav"
                # NOTE(review): written at the model's output rate; assumes the
                # caller supplies reference audio at that rate — confirm.
                _write_reference_audio(ref_path, reference_audio, self._sample_rate)
                kwargs["audio_prompt_path"] = str(ref_path)
                audio = self._generate_audio(text, kwargs)
        else:
            audio = self._generate_audio(text, kwargs)

        # Two seconds of samples per chunk.
        chunk_size = self._sample_rate * 2
        for start in range(0, len(audio), chunk_size):
            yield SynthesizeChunk(
                audio=audio[start:start + chunk_size].tobytes(),
                sample_rate=self._sample_rate,
                is_final=False,
            )

        yield SynthesizeChunk(audio=b"", sample_rate=self._sample_rate, is_final=True)

    def list_voices(self) -> list[VoiceInfo]:
        """Return the single placeholder voice describing reference-based cloning."""
        return [
            VoiceInfo(
                id="reference",
                name="Reference audio",
                language=None,
                description="Pass reference_audio or a voice path to clone a speaker.",
                is_cloned=True,
            )
        ]

    def estimate_vram_bytes(self, **kwargs: Any) -> int:
        """Return a fixed VRAM estimate of 2,000,000,000 bytes."""
        return 2_000_000_000
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
class ChatterboxTurboAdapter(_BaseChatterboxAdapter):
    """Adapter bound to ``chatterbox.tts_turbo.ChatterboxTurboTTS``."""

    # Runtime class to import.
    runtime_module = "chatterbox.tts_turbo"
    runtime_class = "ChatterboxTurboTTS"

    # Identity reported to Vox.
    adapter_name = "chatterbox-tts-turbo"
    architectures = ("chatterbox-tts-turbo", "chatterbox-turbo")
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
class ChatterboxAdapter(_BaseChatterboxAdapter):
    """Adapter bound to ``ChatterboxTTS``; the runtime module is inherited from the base."""

    # Runtime class to import.
    runtime_class = "ChatterboxTTS"

    # Identity reported to Vox.
    adapter_name = "chatterbox-tts"
    architectures = ("chatterbox-tts", "chatterbox")
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
class ChatterboxMultilingualAdapter(_BaseChatterboxAdapter):
    """Adapter bound to ``chatterbox.mtl_tts.ChatterboxMultilingualTTS``."""

    # Runtime class to import.
    runtime_module = "chatterbox.mtl_tts"
    runtime_class = "ChatterboxMultilingualTTS"

    # Identity reported to Vox.
    adapter_name = "chatterbox-tts-multilingual"
    architectures = ("chatterbox-tts-multilingual", "chatterbox-multilingual")

    # Language codes advertised by this adapter, in alphabetical order.
    supported_languages = (
        "ar", "da", "de", "el", "en", "es",
        "fi", "fr", "he", "hi", "it", "ja",
        "ko", "ms", "nl", "no", "pl", "pt",
        "ru", "sv", "sw", "tr", "zh",
    )
|