vox-chatterbox 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,17 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *$py.class
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .eggs/
8
+ *.egg
9
+ .venv/
10
+ venv/
11
+ .env
12
+ .pytest_cache/
13
+ .ruff_cache/
14
+ .mypy_cache/
15
+ *.so
16
+ *.dylib
17
+ .coverage
@@ -0,0 +1,41 @@
1
+ Metadata-Version: 2.4
2
+ Name: vox-chatterbox
3
+ Version: 0.1.0
4
+ Summary: Resemble Chatterbox TTS adapters for Vox
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: numpy<2.4,>=1.26.0
7
+ Requires-Dist: soundfile>=0.13.1
8
+ Requires-Dist: vox-runtime>=0.2.2
9
+ Description-Content-Type: text/markdown
10
+
11
+ # vox-chatterbox
12
+
13
+ `vox-chatterbox` provides Vox TTS adapters for Resemble AI Chatterbox.
14
+
15
+ Adapters:
16
+
17
+ - `chatterbox-tts-turbo` - Chatterbox Turbo backend
18
+ - `chatterbox-tts` - Chatterbox backend
19
+ - `chatterbox-tts-multilingual` - Chatterbox multilingual backend
20
+
21
+ ## Install
22
+
23
+ ```bash
24
+ pip install vox-chatterbox
25
+ ```
26
+
27
+ ## Runtime Dependencies
28
+
29
+ The adapter package is intentionally light. The Chatterbox backend package is
30
+ installed on demand into the isolated target runtime
31
+ `$VOX_HOME/runtime/chatterbox`.
32
+
33
+ ## Use with Vox
34
+
35
+ ```bash
36
+ vox pull chatterbox-tts-turbo:0.1.7
37
+ vox run chatterbox-tts-turbo:0.1.7 "Hello from Chatterbox"
38
+ ```
39
+
40
+ For voice cloning, pass a reference audio sample through the Vox API or use a
41
+ voice value that points to a local WAV file.
@@ -0,0 +1,31 @@
1
+ # vox-chatterbox
2
+
3
+ `vox-chatterbox` provides Vox TTS adapters for Resemble AI Chatterbox.
4
+
5
+ Adapters:
6
+
7
+ - `chatterbox-tts-turbo` - Chatterbox Turbo backend
8
+ - `chatterbox-tts` - Chatterbox backend
9
+ - `chatterbox-tts-multilingual` - Chatterbox multilingual backend
10
+
11
+ ## Install
12
+
13
+ ```bash
14
+ pip install vox-chatterbox
15
+ ```
16
+
17
+ ## Runtime Dependencies
18
+
19
+ The adapter package is intentionally light. The Chatterbox backend package is
20
+ installed on demand into the isolated target runtime
21
+ `$VOX_HOME/runtime/chatterbox`.
22
+
23
+ ## Use with Vox
24
+
25
+ ```bash
26
+ vox pull chatterbox-tts-turbo:0.1.7
27
+ vox run chatterbox-tts-turbo:0.1.7 "Hello from Chatterbox"
28
+ ```
29
+
30
+ For voice cloning, pass a reference audio sample through the Vox API or use a
31
+ voice value that points to a local WAV file.
@@ -0,0 +1,29 @@
1
+ [project]
2
+ name = "vox-chatterbox"
3
+ version = "0.1.0"
4
+ description = "Resemble Chatterbox TTS adapters for Vox"
5
+ readme = { file = "README.md", content-type = "text/markdown" }
6
+ requires-python = ">=3.11"
7
+ dependencies = [
8
+ "vox-runtime>=0.2.2",
9
+ "numpy>=1.26.0,<2.4",
10
+ "soundfile>=0.13.1",
11
+ ]
12
+
13
+ [tool.vox.adapter]
14
+ import-package = "vox_chatterbox"
15
+ runtime-policy = "target-runtime"
16
+ runtime-names = ["chatterbox"]
17
+ adapter-types = ["tts"]
18
+
19
+ [project.entry-points."vox.adapters"]
20
+ chatterbox-tts-turbo = "vox_chatterbox.adapter:ChatterboxTurboAdapter"
21
+ chatterbox-tts = "vox_chatterbox.adapter:ChatterboxAdapter"
22
+ chatterbox-tts-multilingual = "vox_chatterbox.adapter:ChatterboxMultilingualAdapter"
23
+
24
+ [build-system]
25
+ requires = ["hatchling"]
26
+ build-backend = "hatchling.build"
27
+
28
+ [tool.hatch.build.targets.wheel]
29
+ packages = ["src/vox_chatterbox"]
@@ -0,0 +1,11 @@
1
+ from vox_chatterbox.adapter import (
2
+ ChatterboxAdapter,
3
+ ChatterboxMultilingualAdapter,
4
+ ChatterboxTurboAdapter,
5
+ )
6
+
7
+ __all__ = [
8
+ "ChatterboxAdapter",
9
+ "ChatterboxMultilingualAdapter",
10
+ "ChatterboxTurboAdapter",
11
+ ]
@@ -0,0 +1,282 @@
1
+ from __future__ import annotations
2
+
3
+ import importlib
4
+ import logging
5
+ import subprocess
6
+ import tempfile
7
+ from collections.abc import AsyncIterator
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ import numpy as np
12
+ import soundfile as sf
13
+ from numpy.typing import NDArray
14
+
15
+ from vox.core.adapter import TTSAdapter
16
+ from vox.core.adapter_runtime import (
17
+ activate_runtime_path,
18
+ install_target_runtime_requirements,
19
+ purge_runtime_modules,
20
+ )
21
+ from vox.core.adapter_runtime import (
22
+ runtime_root as vox_runtime_root,
23
+ )
24
+ from vox.core.types import AdapterInfo, ModelFormat, ModelType, SynthesizeChunk, VoiceInfo
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+ CHATTERBOX_SAMPLE_RATE = 24_000
29
+ CHATTERBOX_RUNTIME_DEPS = ("chatterbox-tts>=0.1.7,<0.2.0",)
30
+
31
+
32
def _runtime_root() -> Path:
    """Return the ``chatterbox`` directory under the Vox runtime root."""
    base_dir = vox_runtime_root()
    return base_dir / "chatterbox"
34
+
35
+
36
def _ensure_runtime_path() -> str:
    """Create the Chatterbox runtime directory if needed and activate it.

    Returns:
        The value produced by ``activate_runtime_path`` for that directory.
    """
    target_dir = _runtime_root()
    target_dir.mkdir(parents=True, exist_ok=True)
    # The parent of the Chatterbox directory is passed as the ``root`` argument.
    enclosing_root = target_dir.parent
    return activate_runtime_path(target_dir, root=enclosing_root)
40
+
41
+
42
+ def _run_install_command(cmd: list[str], timeout: int) -> subprocess.CompletedProcess[str]:
43
+ return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
44
+
45
+
46
def _install_chatterbox_runtime() -> None:
    """Install ``CHATTERBOX_RUNTIME_DEPS`` into the Chatterbox target runtime.

    Raises:
        RuntimeError: If ``install_target_runtime_requirements`` returns a
            falsy result.
    """
    succeeded = install_target_runtime_requirements(
        _ensure_runtime_path(),
        CHATTERBOX_RUNTIME_DEPS,
        timeout=900,
        install_runner=_run_install_command,
        context="Chatterbox runtime install",
    )
    if succeeded:
        return
    raise RuntimeError("Failed to install Chatterbox runtime package.")
56
+
57
+
58
def _clear_chatterbox_modules() -> None:
    """Purge the ``chatterbox`` and ``s3tokenizer`` runtime modules via Vox."""
    stale_module_names = ("chatterbox", "s3tokenizer")
    purge_runtime_modules(stale_module_names)
60
+
61
+
62
def _load_chatterbox_class(module_name: str, class_name: str) -> type[Any]:
    """Import *module_name* from the Chatterbox runtime and return *class_name*.

    If the first import fails with ``ImportError``, the runtime requirements
    are installed, the cached Chatterbox modules are purged, and the import is
    attempted exactly once more.

    Raises:
        ImportError: If the module still cannot be imported after installing.
        RuntimeError: If installation fails, or the module lacks *class_name*.
    """
    _ensure_runtime_path()
    try:
        runtime_module = importlib.import_module(module_name)
    except ImportError:
        # Backend not importable yet: install it, drop stale module entries,
        # then retry the import a single time.
        _install_chatterbox_runtime()
        _clear_chatterbox_modules()
        runtime_module = importlib.import_module(module_name)

    resolved = getattr(runtime_module, class_name, None)
    if resolved is not None:
        return resolved
    raise RuntimeError(
        f"Chatterbox runtime is installed, but {module_name}.{class_name} was not found."
    )
77
+
78
+
79
+ def _float_audio(audio: Any) -> NDArray[np.float32]:
80
+ if isinstance(audio, dict):
81
+ for key in ("audio", "wav", "waveform"):
82
+ if key in audio:
83
+ return _float_audio(audio[key])
84
+ raise RuntimeError("Chatterbox returned a dict without audio data.")
85
+
86
+ if isinstance(audio, tuple | list) and audio and not isinstance(audio[0], (int, float, np.number)):
87
+ return _float_audio(audio[0])
88
+
89
+ if hasattr(audio, "detach"):
90
+ audio = audio.detach()
91
+ if hasattr(audio, "cpu"):
92
+ audio = audio.cpu()
93
+ if hasattr(audio, "numpy"):
94
+ audio = audio.numpy()
95
+
96
+ array = np.asarray(audio, dtype=np.float32).reshape(-1)
97
+ if array.size == 0:
98
+ raise RuntimeError("Chatterbox produced no audio.")
99
+ return array
100
+
101
+
102
+ def _sample_rate(model: Any) -> int:
103
+ for attr in ("sr", "sample_rate", "sampling_rate"):
104
+ value = getattr(model, attr, None)
105
+ if isinstance(value, int) and value > 0:
106
+ return value
107
+ return CHATTERBOX_SAMPLE_RATE
108
+
109
+
110
+ def _load_model(cls: type[Any], device: str) -> Any:
111
+ if hasattr(cls, "from_pretrained"):
112
+ try:
113
+ return cls.from_pretrained(device=device)
114
+ except TypeError:
115
+ return cls.from_pretrained()
116
+ try:
117
+ return cls(device=device)
118
+ except TypeError:
119
+ return cls()
120
+
121
+
122
+ def _voice_path(voice: str | None) -> str | None:
123
+ if not voice:
124
+ return None
125
+ path = Path(voice).expanduser()
126
+ return str(path) if path.is_file() else None
127
+
128
+
129
def _write_reference_audio(
    path: Path,
    reference_audio: NDArray[np.float32],
    sample_rate: int,
) -> None:
    """Write *reference_audio* to *path* with ``soundfile`` as float32 samples.

    Missing parent directories of *path* are created first.
    """
    destination_dir = path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)
    samples = np.asarray(reference_audio, dtype=np.float32)
    sf.write(path, samples, sample_rate)
132
+
133
+
134
class _BaseChatterboxAdapter(TTSAdapter):
    """Shared Vox TTS adapter behaviour for the Chatterbox backends.

    Subclasses pick a backend by overriding the class attributes below. The
    model class named by ``runtime_module`` / ``runtime_class`` is resolved
    lazily in :meth:`load`.
    """

    adapter_name = "chatterbox-tts"
    architectures = ("chatterbox-tts", "chatterbox")
    runtime_module = "chatterbox.tts"
    runtime_class = "ChatterboxTTS"
    supported_languages: tuple[str, ...] = ("en",)
    supports_streaming = False

    def __init__(self) -> None:
        self._model: Any | None = None
        self._device = "cpu"
        self._sample_rate = CHATTERBOX_SAMPLE_RATE

    def info(self) -> AdapterInfo:
        """Describe this adapter from its class-level configuration."""
        return AdapterInfo(
            name=self.adapter_name,
            type=ModelType.TTS,
            architectures=self.architectures,
            default_sample_rate=CHATTERBOX_SAMPLE_RATE,
            supported_formats=(ModelFormat.PYTORCH,),
            supports_streaming=self.supports_streaming,
            supports_voice_cloning=True,
            supported_languages=self.supported_languages,
        )

    def load(self, model_path: str, device: str, **kwargs: Any) -> None:
        """Resolve the backend class and instantiate it on *device*.

        Does nothing when a model is already loaded. ``model_path`` is not
        consulted by this method, and extra keyword arguments are ignored.
        """
        if self._model is not None:
            return

        kwargs.pop("_source", None)
        self._device = device
        cls = _load_chatterbox_class(self.runtime_module, self.runtime_class)
        logger.info("Loading Chatterbox runtime %s (device=%s)", self.runtime_class, self._device)
        self._model = _load_model(cls, self._device)
        self._sample_rate = _sample_rate(self._model)

    def unload(self) -> None:
        """Drop the model reference and reset device and sample rate."""
        self._model = None
        self._device = "cpu"
        self._sample_rate = CHATTERBOX_SAMPLE_RATE

    @property
    def is_loaded(self) -> bool:
        """Whether a backend model is currently held."""
        return self._model is not None

    def _filter_generate_options(self, options: dict[str, Any]) -> dict[str, Any]:
        """Keep only the options the loaded backend's ``generate()`` declares.

        Forwarding a keyword that ``generate()`` does not declare raises
        ``TypeError`` and aborts the whole request, so such options are
        dropped with a warning instead. If the signature cannot be inspected,
        or it takes ``**kwargs``, every option is forwarded unchanged.
        """
        import inspect

        try:
            parameters = inspect.signature(self._model.generate).parameters
        except (TypeError, ValueError):
            return options
        if any(p.kind is inspect.Parameter.VAR_KEYWORD for p in parameters.values()):
            return options

        accepted = {name: value for name, value in options.items() if name in parameters}
        dropped = sorted(set(options) - set(accepted))
        if dropped:
            logger.warning(
                "Chatterbox %s.generate() does not accept %s; ignoring",
                self.runtime_class,
                ", ".join(dropped),
            )
        return accepted

    def _chatterbox_generate(self, text: str, options: dict[str, Any]) -> NDArray[np.float32]:
        """Call the backend ``generate()`` and normalise its result."""
        accepted = self._filter_generate_options(options)
        return _float_audio(self._model.generate(text, **accepted))

    async def synthesize(
        self,
        text: str,
        *,
        voice: str | None = None,
        speed: float = 1.0,
        language: str | None = None,
        reference_audio: NDArray[np.float32] | None = None,
        reference_text: str | None = None,
    ) -> AsyncIterator[SynthesizeChunk]:
        """Synthesize *text* and yield it as raw float32 audio chunks.

        Args:
            text: Text to speak; empty or whitespace-only text yields nothing.
            voice: If it names an existing file, used as the audio prompt.
            speed: Requested as ``speed`` when truthy and not ``1.0``.
            language: Requested as ``language_id`` (default ``"en"``) for
                adapters whose ``supported_languages`` is not just ``("en",)``.
            reference_audio: Samples written to a temporary WAV file and used
                as the audio prompt; takes precedence over *voice*.
            reference_text: Requested as ``audio_prompt_text`` when given.

        Requested options that the backend's ``generate()`` does not declare
        are dropped with a warning rather than raising ``TypeError``.

        Yields:
            Chunks of up to two seconds of audio (``sample_rate * 2`` samples),
            followed by one empty chunk with ``is_final=True``.

        Raises:
            RuntimeError: If no model is loaded or the backend returns no audio.
        """
        if self._model is None:
            raise RuntimeError("Chatterbox model is not loaded — call load() first")
        if not text or not text.strip():
            return

        kwargs: dict[str, Any] = {}
        if speed and speed != 1.0:
            kwargs["speed"] = speed
        if reference_text:
            kwargs["audio_prompt_text"] = reference_text
        if self.supported_languages != ("en",):
            kwargs["language_id"] = language or "en"

        voice_file = _voice_path(voice)
        if voice_file is not None:
            kwargs["audio_prompt_path"] = voice_file

        # NOTE: generate() is called synchronously inside this coroutine.
        if reference_audio is not None:
            # The temporary WAV must outlive generate(), so the call stays
            # inside the ``with`` block.
            with tempfile.TemporaryDirectory(prefix="vox-chatterbox-") as tmpdir:
                ref_path = Path(tmpdir) / "reference.wav"
                # NOTE(review): assumes reference_audio is sampled at the
                # model's rate; confirm against the caller.
                _write_reference_audio(ref_path, reference_audio, self._sample_rate)
                kwargs["audio_prompt_path"] = str(ref_path)
                audio = self._chatterbox_generate(text, kwargs)
        else:
            audio = self._chatterbox_generate(text, kwargs)

        chunk_size = self._sample_rate * 2
        for start in range(0, len(audio), chunk_size):
            yield SynthesizeChunk(
                audio=audio[start:start + chunk_size].tobytes(),
                sample_rate=self._sample_rate,
                is_final=False,
            )

        yield SynthesizeChunk(audio=b"", sample_rate=self._sample_rate, is_final=True)

    def list_voices(self) -> list[VoiceInfo]:
        """Return the single placeholder voice describing reference cloning."""
        return [
            VoiceInfo(
                id="reference",
                name="Reference audio",
                language=None,
                description="Pass reference_audio or a voice path to clone a speaker.",
                is_cloned=True,
            )
        ]

    def estimate_vram_bytes(self, **kwargs: Any) -> int:
        """Return a fixed estimate of 2,000,000,000 bytes."""
        return 2_000_000_000
238
+
239
+
240
class ChatterboxTurboAdapter(_BaseChatterboxAdapter):
    """Adapter bound to ``chatterbox.tts_turbo.ChatterboxTurboTTS``."""

    # Backend class resolved when the adapter is loaded.
    runtime_module = "chatterbox.tts_turbo"
    runtime_class = "ChatterboxTurboTTS"

    # Identity reported through ``info()``.
    adapter_name = "chatterbox-tts-turbo"
    architectures = ("chatterbox-tts-turbo", "chatterbox-turbo")
245
+
246
+
247
class ChatterboxAdapter(_BaseChatterboxAdapter):
    """Adapter for the ``ChatterboxTTS`` backend class.

    ``runtime_module`` is not overridden here, so the base class value is used.
    """

    runtime_class = "ChatterboxTTS"

    # Identity reported through ``info()``.
    adapter_name = "chatterbox-tts"
    architectures = ("chatterbox-tts", "chatterbox")
251
+
252
+
253
class ChatterboxMultilingualAdapter(_BaseChatterboxAdapter):
    """Adapter bound to ``chatterbox.mtl_tts.ChatterboxMultilingualTTS``.

    Its ``supported_languages`` differs from the base ``("en",)``, which is
    the condition the base ``synthesize`` uses to pass a ``language_id``.
    """

    # Backend class resolved when the adapter is loaded.
    runtime_module = "chatterbox.mtl_tts"
    runtime_class = "ChatterboxMultilingualTTS"

    # Identity reported through ``info()``.
    adapter_name = "chatterbox-tts-multilingual"
    architectures = ("chatterbox-tts-multilingual", "chatterbox-multilingual")

    # Language codes, in alphabetical order.
    supported_languages = (
        "ar", "da", "de", "el", "en", "es", "fi", "fr",
        "he", "hi", "it", "ja", "ko", "ms", "nl", "no",
        "pl", "pt", "ru", "sv", "sw", "tr", "zh",
    )