vox-cosyvoice 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vox-cosyvoice
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: CosyVoice 2 TTS adapter for Vox
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: numpy<2.4,>=1.26.0
|
|
7
|
+
Requires-Dist: soundfile>=0.13.1
|
|
8
|
+
Requires-Dist: vox-runtime>=0.2.2
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
|
|
11
|
+
# vox-cosyvoice
|
|
12
|
+
|
|
13
|
+
`vox-cosyvoice` provides a Vox TTS adapter for CosyVoice 2.
|
|
14
|
+
|
|
15
|
+
Adapters:
|
|
16
|
+
|
|
17
|
+
- `cosyvoice2-tts-torch` - CosyVoice 2 zero-shot streaming backend
|
|
18
|
+
|
|
19
|
+
## Install
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
pip install vox-cosyvoice
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Runtime Dependencies
|
|
26
|
+
|
|
27
|
+
The adapter package is intentionally light. The official CosyVoice runtime is
|
|
28
|
+
installed on demand from GitHub into the isolated target runtime
|
|
29
|
+
`$VOX_HOME/runtime/cosyvoice`.
|
|
30
|
+
|
|
31
|
+
## Use with Vox
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
vox pull cosyvoice2-tts-torch:0.5b
|
|
35
|
+
vox run cosyvoice2-tts-torch:0.5b "Hello from CosyVoice 2"
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
CosyVoice 2 is best used with a reference voice. Pass `reference_audio` and
|
|
39
|
+
`reference_text` through the Vox API, or use a voice value that points to a
|
|
40
|
+
local WAV file.
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# vox-cosyvoice
|
|
2
|
+
|
|
3
|
+
`vox-cosyvoice` provides a Vox TTS adapter for CosyVoice 2.
|
|
4
|
+
|
|
5
|
+
Adapters:
|
|
6
|
+
|
|
7
|
+
- `cosyvoice2-tts-torch` - CosyVoice 2 zero-shot streaming backend
|
|
8
|
+
|
|
9
|
+
## Install
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install vox-cosyvoice
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Runtime Dependencies
|
|
16
|
+
|
|
17
|
+
The adapter package is intentionally light. The official CosyVoice runtime is
|
|
18
|
+
installed on demand from GitHub into the isolated target runtime
|
|
19
|
+
`$VOX_HOME/runtime/cosyvoice`.
|
|
20
|
+
|
|
21
|
+
## Use with Vox
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
vox pull cosyvoice2-tts-torch:0.5b
|
|
25
|
+
vox run cosyvoice2-tts-torch:0.5b "Hello from CosyVoice 2"
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
CosyVoice 2 is best used with a reference voice. Pass `reference_audio` and
|
|
29
|
+
`reference_text` through the Vox API, or use a voice value that points to a
|
|
30
|
+
local WAV file.
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "vox-cosyvoice"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "CosyVoice 2 TTS adapter for Vox"
|
|
5
|
+
readme = { file = "README.md", content-type = "text/markdown" }
|
|
6
|
+
requires-python = ">=3.11"
|
|
7
|
+
dependencies = [
|
|
8
|
+
"vox-runtime>=0.2.2",
|
|
9
|
+
"numpy>=1.26.0,<2.4",
|
|
10
|
+
"soundfile>=0.13.1",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
[tool.vox.adapter]
|
|
14
|
+
import-package = "vox_cosyvoice"
|
|
15
|
+
runtime-policy = "target-runtime"
|
|
16
|
+
runtime-names = ["cosyvoice"]
|
|
17
|
+
adapter-types = ["tts"]
|
|
18
|
+
|
|
19
|
+
[project.entry-points."vox.adapters"]
|
|
20
|
+
cosyvoice2-tts-torch = "vox_cosyvoice.adapter:CosyVoice2Adapter"
|
|
21
|
+
|
|
22
|
+
[build-system]
|
|
23
|
+
requires = ["hatchling"]
|
|
24
|
+
build-backend = "hatchling.build"
|
|
25
|
+
|
|
26
|
+
[tool.hatch.build.targets.wheel]
|
|
27
|
+
packages = ["src/vox_cosyvoice"]
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import importlib
|
|
4
|
+
import logging
|
|
5
|
+
import subprocess
|
|
6
|
+
import tempfile
|
|
7
|
+
from collections.abc import AsyncIterator
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
import soundfile as sf
|
|
13
|
+
from numpy.typing import NDArray
|
|
14
|
+
|
|
15
|
+
from vox.core.adapter import TTSAdapter
|
|
16
|
+
from vox.core.adapter_runtime import (
|
|
17
|
+
activate_runtime_path,
|
|
18
|
+
install_target_runtime_requirements,
|
|
19
|
+
purge_runtime_modules,
|
|
20
|
+
)
|
|
21
|
+
from vox.core.adapter_runtime import (
|
|
22
|
+
runtime_root as vox_runtime_root,
|
|
23
|
+
)
|
|
24
|
+
from vox.core.types import AdapterInfo, ModelFormat, ModelType, SynthesizeChunk, VoiceInfo
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
# Sample rate (Hz) reported as the adapter default, stamped on every emitted
# chunk, and used when reference audio is written to a temporary WAV file.
COSYVOICE_SAMPLE_RATE = 24_000
|
|
29
|
+
# VCS requirement handed to the target-runtime installer.
# NOTE(review): no revision is pinned, so installs presumably track the
# repository's default branch — consider pinning a tag or commit.
COSYVOICE_REPO = "git+https://github.com/FunAudioLLM/CosyVoice.git"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _runtime_root() -> Path:
    """Return the ``cosyvoice`` directory located under the Vox runtime root."""
    base_dir = vox_runtime_root()
    return base_dir / "cosyvoice"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _ensure_runtime_path() -> str:
    """Create the CosyVoice runtime directory if missing and activate it.

    Returns:
        The value produced by ``activate_runtime_path`` for that directory,
        with the directory's parent passed as ``root``.
    """
    target = _runtime_root()
    # Safe to call repeatedly: existing directories are left untouched.
    target.mkdir(exist_ok=True, parents=True)
    activated = activate_runtime_path(target, root=target.parent)
    return activated
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _run_install_command(cmd: list[str], timeout: int) -> subprocess.CompletedProcess[str]:
    """Execute an install command and capture its output as text.

    Args:
        cmd: Argument vector to execute; it is passed as a list, so no shell
            is involved.
        timeout: Seconds to wait before ``subprocess.run`` raises
            ``subprocess.TimeoutExpired``.

    Returns:
        The completed process with decoded ``stdout``/``stderr``. A non-zero
        exit status is returned to the caller, not raised.
    """
    completed = subprocess.run(
        cmd,
        timeout=timeout,
        text=True,
        capture_output=True,
    )
    return completed
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _install_cosyvoice_runtime() -> None:
    """Install the CosyVoice requirement into the activated target runtime.

    Raises:
        RuntimeError: If ``install_target_runtime_requirements`` returns a
            falsy result.
    """
    installed = install_target_runtime_requirements(
        _ensure_runtime_path(),
        (COSYVOICE_REPO,),
        timeout=1800,  # seconds handed to the installer
        install_runner=_run_install_command,
        context="CosyVoice runtime install",
    )
    if installed:
        return
    raise RuntimeError("Failed to install CosyVoice runtime from GitHub.")
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _clear_cosyvoice_modules() -> None:
    """Hand the CosyVoice-related package names to ``purge_runtime_modules``.

    Presumably this drops already-imported copies so the next import resolves
    against the freshly installed runtime — confirm against the Vox helper.
    """
    stale_packages = ("cosyvoice", "matcha", "wetext")
    purge_runtime_modules(stale_packages)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _load_cosyvoice_class() -> type[Any]:
    """Import and return the ``CosyVoice2`` class, installing the runtime on demand.

    The runtime path is activated first. If importing
    ``cosyvoice.cli.cosyvoice`` fails with ``ImportError``, the CosyVoice
    runtime is installed, the related modules are purged, the import system's
    finder caches are invalidated, and the import is retried once.

    Returns:
        The ``CosyVoice2`` attribute of ``cosyvoice.cli.cosyvoice``.

    Raises:
        RuntimeError: If the install fails (raised by the install helper) or
            the imported module has no ``CosyVoice2`` attribute.
        ImportError: If the module still cannot be imported after the install.
    """
    module_name = "cosyvoice.cli.cosyvoice"
    _ensure_runtime_path()
    try:
        module = importlib.import_module(module_name)
    except ImportError as exc:
        # Record why the first import failed; the cause may be a missing
        # transitive dependency and not a missing CosyVoice package.
        logger.info("CosyVoice import failed (%s); installing runtime", exc)
        _install_cosyvoice_runtime()
        _clear_cosyvoice_modules()
        # Packages were written to disk after the interpreter started, so the
        # path finders may hold stale directory listings.
        importlib.invalidate_caches()
        module = importlib.import_module(module_name)

    cls = getattr(module, "CosyVoice2", None)
    if cls is None:
        raise RuntimeError("CosyVoice runtime is installed, but cosyvoice.cli.cosyvoice.CosyVoice2 was not found.")
    return cls
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _voice_path(voice: str | None) -> Path | None:
    """Interpret ``voice`` as a filesystem path when it names an existing file.

    Args:
        voice: Voice value supplied by the caller; may be ``None`` or empty.

    Returns:
        The user-expanded path if it points at a regular file, otherwise
        ``None`` (including for ``None``/empty input and directories).
    """
    candidate = Path(voice).expanduser() if voice else None
    if candidate is not None and candidate.is_file():
        return candidate
    return None
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _write_reference_audio(path: Path, reference_audio: NDArray[np.float32], sample_rate: int) -> None:
    """Write reference audio to ``path`` through ``soundfile``.

    Args:
        path: Destination file; missing parent directories are created.
        reference_audio: Samples to write; converted to a float32 array first.
        sample_rate: Sample rate passed to ``soundfile.write``.
    """
    path.parent.mkdir(exist_ok=True, parents=True)
    samples = np.asarray(reference_audio, dtype=np.float32)
    sf.write(path, samples, sample_rate)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _extract_audio(output: Any) -> NDArray[np.float32]:
    """Reduce one model output to a flat float32 sample array.

    Dicts are searched for the first of the keys ``tts_speech``, ``audio``,
    ``wav``, ``waveform``. Non-empty tuples/lists whose first element is not
    a number are unwrapped through that first element. Objects exposing
    ``detach``/``cpu``/``numpy`` (tensor-like) have those methods called in
    that order before conversion.

    Args:
        output: A value produced by the model's inference generator.

    Returns:
        A one-dimensional ``float32`` array of samples.

    Raises:
        RuntimeError: If a dict carries none of the known audio keys, or the
            resulting array is empty.
    """
    if isinstance(output, dict):
        audio_key = next(
            (name for name in ("tts_speech", "audio", "wav", "waveform") if name in output),
            None,
        )
        if audio_key is None:
            raise RuntimeError("CosyVoice returned a dict without audio data.")
        return _extract_audio(output[audio_key])

    is_sequence = isinstance(output, (tuple, list))
    if is_sequence and output and not isinstance(output[0], (int, float, np.number)):
        # A container of outputs rather than raw samples: use its first item.
        return _extract_audio(output[0])

    # Each step is re-checked against the result of the previous one.
    for method_name in ("detach", "cpu", "numpy"):
        if hasattr(output, method_name):
            output = getattr(output, method_name)()

    samples = np.asarray(output, dtype=np.float32).reshape(-1)
    if samples.size == 0:
        raise RuntimeError("CosyVoice produced no audio.")
    return samples
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
class CosyVoice2Adapter(TTSAdapter):
    """Vox TTS adapter that drives a CosyVoice 2 model through zero-shot inference.

    The model class is imported lazily (and its runtime installed on demand)
    when ``load`` is called. Synthesis streams chunks tagged with
    ``COSYVOICE_SAMPLE_RATE`` and ends with an empty final chunk.
    """

    # Voice id advertised by ``list_voices``. It is a placeholder telling
    # callers to supply their own reference, not a saved speaker id.
    _PLACEHOLDER_VOICE_ID = "reference"

    def __init__(self) -> None:
        """Create an adapter with no model loaded."""
        self._model: Any | None = None
        self._model_id = ""
        self._device = "cpu"

    def info(self) -> AdapterInfo:
        """Describe this adapter: name, type, architectures and capabilities."""
        return AdapterInfo(
            name="cosyvoice2-tts-torch",
            type=ModelType.TTS,
            architectures=("cosyvoice2-tts-torch", "cosyvoice2", "cosyvoice"),
            default_sample_rate=COSYVOICE_SAMPLE_RATE,
            supported_formats=(ModelFormat.PYTORCH,),
            supports_streaming=True,
            supports_voice_cloning=True,
            supported_languages=(),
        )

    def load(self, model_path: str, device: str, **kwargs: Any) -> None:
        """Load the CosyVoice 2 model; a no-op when a model is already loaded.

        Args:
            model_path: Model location passed to the ``CosyVoice2`` constructor.
            device: Device label; fp16 is requested only when it equals
                ``"cuda"`` exactly.
            **kwargs: Extra loader options. ``_source`` is discarded and the
                rest are currently unused.

        Raises:
            RuntimeError: If the CosyVoice runtime cannot be installed or the
                ``CosyVoice2`` class cannot be found.
        """
        if self._model is not None:
            return

        kwargs.pop("_source", None)
        cls = _load_cosyvoice_class()

        logger.info("Loading CosyVoice2 model from %s (device=%s)", model_path, device)
        try:
            model = cls(model_path, load_jit=False, load_trt=False, load_vllm=False, fp16=device == "cuda")
        except TypeError:
            # Fallback for constructors that do not accept the keyword options.
            # NOTE(review): a TypeError raised inside the constructor itself
            # also lands here and triggers a second construction attempt.
            model = cls(model_path)

        # Commit state only after construction succeeded, so a failed load
        # does not leave a stale model id/device behind.
        self._model = model
        self._model_id = model_path
        self._device = device

    def unload(self) -> None:
        """Drop the model reference and reset the adapter to its initial state."""
        self._model = None
        self._model_id = ""
        self._device = "cpu"

    @property
    def is_loaded(self) -> bool:
        """Whether a model is currently loaded."""
        return self._model is not None

    async def synthesize(
        self,
        text: str,
        *,
        voice: str | None = None,
        speed: float = 1.0,
        language: str | None = None,
        reference_audio: NDArray[np.float32] | None = None,
        reference_text: str | None = None,
    ) -> AsyncIterator[SynthesizeChunk]:
        """Stream synthesized audio for ``text`` using zero-shot inference.

        The prompt comes from ``reference_audio`` if given, otherwise from
        ``voice`` when it names an existing file. Any other non-empty
        ``voice`` is forwarded as ``zero_shot_spk_id``, except the placeholder
        id advertised by ``list_voices``, which is never forwarded.

        Args:
            text: Text to speak. Empty or whitespace-only text yields nothing.
            voice: Path to a reference audio file, or a saved speaker id.
            speed: Speed factor forwarded to the model.
            language: Accepted for interface compatibility; unused here.
            reference_audio: Reference samples, written to a temporary WAV
                file at ``COSYVOICE_SAMPLE_RATE``.
            reference_text: Transcript of the reference; empty when omitted.

        Yields:
            Non-final chunks of float32 sample bytes, then one empty chunk
            with ``is_final=True``.

        Raises:
            RuntimeError: If no model is loaded or no audio was produced.
            ValueError: If no reference audio, voice file, or speaker id is
                available.
        """
        if self._model is None:
            raise RuntimeError("CosyVoice2 model is not loaded — call load() first")
        if not text or not text.strip():
            return

        voice_file = _voice_path(voice)
        if voice_file is not None or voice == self._PLACEHOLDER_VOICE_ID:
            # A file path is a prompt, and the placeholder names no saved speaker.
            # NOTE(review): CosyVoice appears to ignore the prompt audio when a
            # non-empty zero_shot_spk_id is passed — confirm against the runtime.
            zero_shot_spk_id = ""
        else:
            zero_shot_spk_id = voice or ""
        if reference_audio is None and voice_file is None and not zero_shot_spk_id:
            raise ValueError("CosyVoice2 requires reference_audio, a voice path, or a zero_shot_spk_id voice value.")

        yielded = False
        # The temporary directory must outlive the iteration below, because the
        # model may read the prompt file while producing output.
        with tempfile.TemporaryDirectory(prefix="vox-cosyvoice-") as tmpdir:
            prompt_wav = ""
            if reference_audio is not None:
                ref_path = Path(tmpdir) / "reference.wav"
                # NOTE(review): assumes reference_audio is already sampled at
                # COSYVOICE_SAMPLE_RATE — confirm against the caller.
                _write_reference_audio(ref_path, reference_audio, COSYVOICE_SAMPLE_RATE)
                prompt_wav = str(ref_path)
            elif voice_file is not None:
                prompt_wav = str(voice_file)

            outputs = self._model.inference_zero_shot(
                text,
                reference_text or "",
                prompt_wav,
                zero_shot_spk_id=zero_shot_spk_id,
                stream=True,
                speed=speed,
            )

            # NOTE(review): iteration is synchronous, so inference presumably
            # blocks the event loop between chunks.
            for output in outputs:
                audio = _extract_audio(output)
                if audio.size:
                    yielded = True
                    yield SynthesizeChunk(
                        audio=audio.tobytes(),
                        sample_rate=COSYVOICE_SAMPLE_RATE,
                        is_final=False,
                    )

        if not yielded:
            raise RuntimeError("CosyVoice2 produced no audio.")

        yield SynthesizeChunk(audio=b"", sample_rate=COSYVOICE_SAMPLE_RATE, is_final=True)

    def list_voices(self) -> list[VoiceInfo]:
        """Return the single placeholder voice explaining how to supply a reference."""
        return [
            VoiceInfo(
                id=self._PLACEHOLDER_VOICE_ID,
                name="Reference audio",
                language=None,
                description="Pass reference_audio/reference_text, a voice path, or a saved zero_shot_spk_id.",
                is_cloned=True,
            )
        ]

    def estimate_vram_bytes(self, **kwargs: Any) -> int:
        """Return a fixed VRAM estimate of 5,000,000,000 bytes; ``kwargs`` are ignored."""
        return 5_000_000_000
|