vox-cosyvoice 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,17 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *$py.class
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .eggs/
8
+ *.egg
9
+ .venv/
10
+ venv/
11
+ .env
12
+ .pytest_cache/
13
+ .ruff_cache/
14
+ .mypy_cache/
15
+ *.so
16
+ *.dylib
17
+ .coverage
@@ -0,0 +1,40 @@
1
+ Metadata-Version: 2.4
2
+ Name: vox-cosyvoice
3
+ Version: 0.1.0
4
+ Summary: CosyVoice 2 TTS adapter for Vox
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: numpy<2.4,>=1.26.0
7
+ Requires-Dist: soundfile>=0.13.1
8
+ Requires-Dist: vox-runtime>=0.2.2
9
+ Description-Content-Type: text/markdown
10
+
11
+ # vox-cosyvoice
12
+
13
+ `vox-cosyvoice` provides a Vox TTS adapter for CosyVoice 2.
14
+
15
+ Adapters:
16
+
17
+ - `cosyvoice2-tts-torch` - CosyVoice 2 zero-shot streaming backend
18
+
19
+ ## Install
20
+
21
+ ```bash
22
+ pip install vox-cosyvoice
23
+ ```
24
+
25
+ ## Runtime Dependencies
26
+
27
+ The adapter package is intentionally light. The official CosyVoice runtime is
28
+ installed on demand from GitHub into the isolated target runtime
29
+ `$VOX_HOME/runtime/cosyvoice`.
30
+
31
+ ## Use with Vox
32
+
33
+ ```bash
34
+ vox pull cosyvoice2-tts-torch:0.5b
35
+ vox run cosyvoice2-tts-torch:0.5b "Hello from CosyVoice 2"
36
+ ```
37
+
38
+ CosyVoice 2 requires a voice for every request. Pass `reference_audio` and
39
+ `reference_text` through the Vox API, or use a voice value that is either a
40
+ path to a local WAV file or a saved zero-shot speaker ID.
@@ -0,0 +1,30 @@
1
+ # vox-cosyvoice
2
+
3
+ `vox-cosyvoice` provides a Vox TTS adapter for CosyVoice 2.
4
+
5
+ Adapters:
6
+
7
+ - `cosyvoice2-tts-torch` - CosyVoice 2 zero-shot streaming backend
8
+
9
+ ## Install
10
+
11
+ ```bash
12
+ pip install vox-cosyvoice
13
+ ```
14
+
15
+ ## Runtime Dependencies
16
+
17
+ The adapter package is intentionally light. The official CosyVoice runtime is
18
+ installed on demand from GitHub into the isolated target runtime
19
+ `$VOX_HOME/runtime/cosyvoice`.
20
+
21
+ ## Use with Vox
22
+
23
+ ```bash
24
+ vox pull cosyvoice2-tts-torch:0.5b
25
+ vox run cosyvoice2-tts-torch:0.5b "Hello from CosyVoice 2"
26
+ ```
27
+
28
+ CosyVoice 2 requires a voice for every request. Pass `reference_audio` and
29
+ `reference_text` through the Vox API, or use a voice value that is either a
30
+ path to a local WAV file or a saved zero-shot speaker ID.
@@ -0,0 +1,27 @@
1
+ [project]
2
+ name = "vox-cosyvoice"
3
+ version = "0.1.0"
4
+ description = "CosyVoice 2 TTS adapter for Vox"
5
+ readme = { file = "README.md", content-type = "text/markdown" }
6
+ requires-python = ">=3.11"
7
+ dependencies = [
8
+ "vox-runtime>=0.2.2",
9
+ "numpy>=1.26.0,<2.4",
10
+ "soundfile>=0.13.1",
11
+ ]
12
+
13
+ [tool.vox.adapter]
14
+ import-package = "vox_cosyvoice"
15
+ runtime-policy = "target-runtime"
16
+ runtime-names = ["cosyvoice"]
17
+ adapter-types = ["tts"]
18
+
19
+ [project.entry-points."vox.adapters"]
20
+ cosyvoice2-tts-torch = "vox_cosyvoice.adapter:CosyVoice2Adapter"
21
+
22
+ [build-system]
23
+ requires = ["hatchling"]
24
+ build-backend = "hatchling.build"
25
+
26
+ [tool.hatch.build.targets.wheel]
27
+ packages = ["src/vox_cosyvoice"]
@@ -0,0 +1,3 @@
1
+ from vox_cosyvoice.adapter import CosyVoice2Adapter
2
+
3
+ __all__ = ["CosyVoice2Adapter"]
@@ -0,0 +1,217 @@
1
+ from __future__ import annotations
2
+
3
+ import importlib
4
+ import logging
5
+ import subprocess
6
+ import tempfile
7
+ from collections.abc import AsyncIterator
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ import numpy as np
12
+ import soundfile as sf
13
+ from numpy.typing import NDArray
14
+
15
+ from vox.core.adapter import TTSAdapter
16
+ from vox.core.adapter_runtime import (
17
+ activate_runtime_path,
18
+ install_target_runtime_requirements,
19
+ purge_runtime_modules,
20
+ )
21
+ from vox.core.adapter_runtime import (
22
+ runtime_root as vox_runtime_root,
23
+ )
24
+ from vox.core.types import AdapterInfo, ModelFormat, ModelType, SynthesizeChunk, VoiceInfo
25
+
26
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Sample rate (Hz) reported for every emitted chunk and used when reference
# audio is written to a temporary WAV file.
COSYVOICE_SAMPLE_RATE = 24_000
# Git requirement handed to the runtime installer.
# NOTE(review): no tag or commit is pinned here, so installs presumably track
# the repository's default branch — confirm this is intended.
COSYVOICE_REPO = "git+https://github.com/FunAudioLLM/CosyVoice.git"
30
+
31
+
32
def _runtime_root() -> Path:
    """Return the ``cosyvoice`` directory located under the Vox runtime root."""
    base_dir = vox_runtime_root()
    return base_dir / "cosyvoice"
34
+
35
+
36
def _ensure_runtime_path() -> str:
    """Create the CosyVoice runtime directory if needed and activate it.

    Returns:
        Whatever ``activate_runtime_path`` returns for the runtime directory,
        with the directory's parent passed as ``root``.
    """
    target = _runtime_root()
    # Idempotent: missing parents are created, an existing directory is fine.
    target.mkdir(parents=True, exist_ok=True)
    activated = activate_runtime_path(target, root=target.parent)
    return activated
40
+
41
+
42
def _run_install_command(cmd: list[str], timeout: int) -> subprocess.CompletedProcess[str]:
    """Run an install command and return the completed process.

    Args:
        cmd: Command and arguments, executed without a shell.
        timeout: Maximum run time in seconds.

    Returns:
        The completed process with stdout and stderr captured as text.

    Raises:
        subprocess.TimeoutExpired: If the command exceeds ``timeout``.
    """
    completed = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    return completed
44
+
45
+
46
def _install_cosyvoice_runtime() -> None:
    """Install the CosyVoice runtime from GitHub into the target runtime path.

    Raises:
        RuntimeError: If the installer reports failure.
    """
    installed = install_target_runtime_requirements(
        _ensure_runtime_path(),
        (COSYVOICE_REPO,),
        timeout=1800,  # seconds, i.e. up to 30 minutes for the install
        install_runner=_run_install_command,
        context="CosyVoice runtime install",
    )
    if installed:
        return
    raise RuntimeError("Failed to install CosyVoice runtime from GitHub.")
56
+
57
+
58
def _clear_cosyvoice_modules() -> None:
    """Ask the Vox runtime helper to purge the CosyVoice-related packages."""
    runtime_packages = ("cosyvoice", "matcha", "wetext")
    purge_runtime_modules(runtime_packages)
60
+
61
+
62
def _load_cosyvoice_class() -> type[Any]:
    """Import and return the ``CosyVoice2`` class, installing the runtime on demand.

    The runtime path is activated first. If ``cosyvoice.cli.cosyvoice`` cannot
    be imported, the runtime is installed, the related modules are purged and
    the import is retried once.

    Returns:
        The ``CosyVoice2`` class exposed by ``cosyvoice.cli.cosyvoice``.

    Raises:
        RuntimeError: If the install fails, or the module is importable but
            does not define ``CosyVoice2``.
        ImportError: If the module still cannot be imported after the install.
    """
    _ensure_runtime_path()
    module_name = "cosyvoice.cli.cosyvoice"
    try:
        module = importlib.import_module(module_name)
    except ImportError:
        _install_cosyvoice_runtime()
        _clear_cosyvoice_modules()
        # The package was written to disk after the interpreter started; the
        # import system's finders may still hold a stale directory listing, so
        # reset their caches before retrying.
        importlib.invalidate_caches()
        module = importlib.import_module(module_name)

    cls = getattr(module, "CosyVoice2", None)
    if cls is None:
        raise RuntimeError("CosyVoice runtime is installed, but cosyvoice.cli.cosyvoice.CosyVoice2 was not found.")
    return cls
75
+
76
+
77
def _voice_path(voice: str | None) -> Path | None:
    """Interpret ``voice`` as a path to an existing file, if it is one.

    Args:
        voice: Voice value supplied by the caller; may be a file path, any
            other identifier, empty, or ``None``.

    Returns:
        The user-expanded path when it names an existing regular file,
        otherwise ``None``.
    """
    if not voice:
        return None
    try:
        candidate = Path(voice).expanduser()
        return candidate if candidate.is_file() else None
    except (OSError, RuntimeError, ValueError):
        # Not usable as a path: ``expanduser`` raises RuntimeError when a
        # leading "~user" has no resolvable home directory, and the file check
        # can raise for over-long or otherwise invalid names. Such values are
        # simply not voice files.
        return None
82
+
83
+
84
def _write_reference_audio(path: Path, reference_audio: NDArray[np.float32], sample_rate: int) -> None:
    """Write ``reference_audio`` to ``path`` as float32 samples.

    Args:
        path: Destination file; missing parent directories are created.
        reference_audio: Samples to write, converted to ``float32`` first.
        sample_rate: Sample rate passed to ``soundfile.write``.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    samples = np.asarray(reference_audio, dtype=np.float32)
    sf.write(path, samples, sample_rate)
87
+
88
+
89
def _extract_audio(output: Any) -> NDArray[np.float32]:
    """Convert one CosyVoice output item into a flat ``float32`` array.

    Dicts are searched for the keys ``tts_speech``, ``audio``, ``wav`` and
    ``waveform`` (in that order). A non-empty tuple or list whose first element
    is not a number is unwrapped to that first element. Objects exposing
    ``detach``/``cpu``/``numpy`` have those methods called in turn before the
    value is converted with NumPy.

    Raises:
        RuntimeError: If a dict holds none of the known keys, or the
            resulting array is empty.
    """
    if isinstance(output, dict):
        known_keys = ("tts_speech", "audio", "wav", "waveform")
        found = next((name for name in known_keys if name in output), None)
        if found is None:
            raise RuntimeError("CosyVoice returned a dict without audio data.")
        return _extract_audio(output[found])

    is_sequence = isinstance(output, (tuple, list))
    if is_sequence and output and not isinstance(output[0], (int, float, np.number)):
        return _extract_audio(output[0])

    # Each step is checked against the value produced by the previous one.
    for method_name in ("detach", "cpu", "numpy"):
        if hasattr(output, method_name):
            output = getattr(output, method_name)()

    samples = np.asarray(output, dtype=np.float32)
    flat = samples.reshape(-1)
    if flat.size == 0:
        raise RuntimeError("CosyVoice produced no audio.")
    return flat
107
+
108
+
109
class CosyVoice2Adapter(TTSAdapter):
    """Vox TTS adapter that synthesizes speech with a CosyVoice 2 model.

    The model object is created by :meth:`load` and driven through its
    ``inference_zero_shot`` method in streaming mode by :meth:`synthesize`.
    """

    def __init__(self) -> None:
        """Create an adapter with no model loaded."""
        self._model: Any | None = None
        self._model_id = ""
        self._device = "cpu"

    def info(self) -> AdapterInfo:
        """Return the static adapter description advertised to Vox."""
        return AdapterInfo(
            name="cosyvoice2-tts-torch",
            type=ModelType.TTS,
            architectures=("cosyvoice2-tts-torch", "cosyvoice2", "cosyvoice"),
            default_sample_rate=COSYVOICE_SAMPLE_RATE,
            supported_formats=(ModelFormat.PYTORCH,),
            supports_streaming=True,
            supports_voice_cloning=True,
            supported_languages=(),
        )

    def load(self, model_path: str, device: str, **kwargs: Any) -> None:
        """Load the CosyVoice2 model from ``model_path``; a no-op when already loaded.

        Args:
            model_path: Passed as the first argument to the ``CosyVoice2`` class.
            device: Device name. Any CUDA device string (``"cuda"``,
                ``"cuda:0"``, ...) enables fp16.
            **kwargs: Extra options; ``_source`` is discarded and the rest are
                not used.

        Raises:
            RuntimeError: If the CosyVoice runtime cannot be installed or does
                not expose ``CosyVoice2``.
        """
        if self._model is not None:
            return

        kwargs.pop("_source", None)
        self._model_id = model_path
        self._device = device
        cls = _load_cosyvoice_class()

        logger.info("Loading CosyVoice2 model from %s (device=%s)", model_path, self._device)
        # Prefix match so indexed device strings such as "cuda:0" also get fp16.
        use_fp16 = str(device).startswith("cuda")
        try:
            self._model = cls(model_path, load_jit=False, load_trt=False, load_vllm=False, fp16=use_fp16)
        except TypeError as exc:
            # The constructor did not accept the extended keyword arguments;
            # record why before retrying with the model path only.
            logger.warning(
                "CosyVoice2 constructor rejected the extended arguments (%s); retrying with the model path only.",
                exc,
            )
            self._model = cls(model_path)

    def unload(self) -> None:
        """Drop the model reference and reset the adapter to its initial state."""
        self._model = None
        self._model_id = ""
        self._device = "cpu"

    @property
    def is_loaded(self) -> bool:
        """Whether a model object is currently held."""
        return self._model is not None

    async def synthesize(
        self,
        text: str,
        *,
        voice: str | None = None,
        speed: float = 1.0,
        language: str | None = None,
        reference_audio: NDArray[np.float32] | None = None,
        reference_text: str | None = None,
    ) -> AsyncIterator[SynthesizeChunk]:
        """Stream synthesized audio for ``text`` as ``SynthesizeChunk`` objects.

        Args:
            text: Text to speak. Empty or whitespace-only text yields nothing.
            voice: Either a path to an existing audio file, used as the prompt
                recording, or any other string, passed on as ``zero_shot_spk_id``.
            speed: Forwarded to ``inference_zero_shot``.
            language: Accepted for interface compatibility; not used here.
            reference_audio: Prompt samples. They are written to a temporary
                WAV file and take precedence over a ``voice`` file path.
            reference_text: Transcript of the prompt; an empty string is sent
                when omitted.

        Yields:
            One non-final chunk of float32 sample bytes per model output, then
            an empty chunk with ``is_final=True``.

        Raises:
            RuntimeError: If no model is loaded or no audio was produced.
            ValueError: If no reference audio, voice path or speaker ID is given.
        """
        if self._model is None:
            raise RuntimeError("CosyVoice2 model is not loaded — call load() first")
        if not text or not text.strip():
            return

        voice_file = _voice_path(voice)
        # A voice that is not an existing file is treated as a saved speaker ID.
        zero_shot_spk_id = "" if voice_file is not None else (voice or "")
        if reference_audio is None and voice_file is None and not zero_shot_spk_id:
            raise ValueError("CosyVoice2 requires reference_audio, a voice path, or a zero_shot_spk_id voice value.")

        # The temporary directory stays open while chunks are yielded, so the
        # reference WAV exists for as long as the model output is consumed.
        with tempfile.TemporaryDirectory(prefix="vox-cosyvoice-") as tmpdir:
            prompt_wav = ""
            if reference_audio is not None:
                ref_path = Path(tmpdir) / "reference.wav"
                _write_reference_audio(ref_path, reference_audio, COSYVOICE_SAMPLE_RATE)
                prompt_wav = str(ref_path)
            elif voice_file is not None:
                prompt_wav = str(voice_file)

            outputs = self._model.inference_zero_shot(
                text,
                reference_text or "",
                prompt_wav,
                zero_shot_spk_id=zero_shot_spk_id,
                stream=True,
                speed=speed,
            )

            yielded = False
            for output in outputs:
                audio = _extract_audio(output)
                if audio.size:
                    yielded = True
                    yield SynthesizeChunk(
                        audio=audio.tobytes(),
                        sample_rate=COSYVOICE_SAMPLE_RATE,
                        is_final=False,
                    )

            if not yielded:
                raise RuntimeError("CosyVoice2 produced no audio.")

            # Empty terminal chunk marks the end of the stream.
            yield SynthesizeChunk(audio=b"", sample_rate=COSYVOICE_SAMPLE_RATE, is_final=True)

    def list_voices(self) -> list[VoiceInfo]:
        """Return the single placeholder voice describing how to supply a reference."""
        return [
            VoiceInfo(
                id="reference",
                name="Reference audio",
                language=None,
                description="Pass reference_audio/reference_text, a voice path, or a saved zero_shot_spk_id.",
                is_cloned=True,
            )
        ]

    def estimate_vram_bytes(self, **kwargs: Any) -> int:
        """Return a fixed estimate of 5,000,000,000 bytes; ``kwargs`` are ignored."""
        return 5_000_000_000