vox-orpheus 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vox-orpheus
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Orpheus TTS adapter for Vox
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: numpy<2.4,>=1.26.0
|
|
7
|
+
Requires-Dist: vox-runtime>=0.2.2
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
|
|
10
|
+
# vox-orpheus
|
|
11
|
+
|
|
12
|
+
`vox-orpheus` provides a Vox TTS adapter for Orpheus.
|
|
13
|
+
|
|
14
|
+
Adapters:
|
|
15
|
+
|
|
16
|
+
- `orpheus-tts-vllm` - Orpheus medium 3B backend through vLLM
|
|
17
|
+
|
|
18
|
+
## Install
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install vox-orpheus
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Runtime Dependencies
|
|
25
|
+
|
|
26
|
+
The adapter package is intentionally light. The `orpheus-speech` backend and
|
|
27
|
+
its vLLM/SNAC runtime dependencies are installed on demand into the isolated
|
|
28
|
+
target runtime `$VOX_HOME/runtime/orpheus`.
|
|
29
|
+
|
|
30
|
+
## Use with Vox
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
vox pull orpheus-tts-vllm:medium-3b
|
|
34
|
+
vox run orpheus-tts-vllm:medium-3b "Hello from Orpheus" --voice tara
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Orpheus is GPU-oriented. Expect this adapter to require a CUDA-capable runtime
|
|
38
|
+
for practical latency.
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# vox-orpheus
|
|
2
|
+
|
|
3
|
+
`vox-orpheus` provides a Vox TTS adapter for Orpheus.
|
|
4
|
+
|
|
5
|
+
Adapters:
|
|
6
|
+
|
|
7
|
+
- `orpheus-tts-vllm` - Orpheus medium 3B backend through vLLM
|
|
8
|
+
|
|
9
|
+
## Install
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install vox-orpheus
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Runtime Dependencies
|
|
16
|
+
|
|
17
|
+
The adapter package is intentionally light. The `orpheus-speech` backend and
|
|
18
|
+
its vLLM/SNAC runtime dependencies are installed on demand into the isolated
|
|
19
|
+
target runtime `$VOX_HOME/runtime/orpheus`.
|
|
20
|
+
|
|
21
|
+
## Use with Vox
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
vox pull orpheus-tts-vllm:medium-3b
|
|
25
|
+
vox run orpheus-tts-vllm:medium-3b "Hello from Orpheus" --voice tara
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Orpheus is GPU-oriented. Expect this adapter to require a CUDA-capable runtime
|
|
29
|
+
for practical latency.
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "vox-orpheus"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Orpheus TTS adapter for Vox"
|
|
5
|
+
readme = { file = "README.md", content-type = "text/markdown" }
|
|
6
|
+
requires-python = ">=3.11"
|
|
7
|
+
dependencies = [
|
|
8
|
+
"vox-runtime>=0.2.2",
|
|
9
|
+
"numpy>=1.26.0,<2.4",
|
|
10
|
+
]
|
|
11
|
+
|
|
12
|
+
[tool.vox.adapter]
|
|
13
|
+
import-package = "vox_orpheus"
|
|
14
|
+
runtime-policy = "target-runtime"
|
|
15
|
+
runtime-names = ["orpheus"]
|
|
16
|
+
adapter-types = ["tts"]
|
|
17
|
+
|
|
18
|
+
[project.entry-points."vox.adapters"]
|
|
19
|
+
orpheus-tts-vllm = "vox_orpheus.adapter:OrpheusAdapter"
|
|
20
|
+
|
|
21
|
+
[build-system]
|
|
22
|
+
requires = ["hatchling"]
|
|
23
|
+
build-backend = "hatchling.build"
|
|
24
|
+
|
|
25
|
+
[tool.hatch.build.targets.wheel]
|
|
26
|
+
packages = ["src/vox_orpheus"]
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import importlib
|
|
4
|
+
import logging
|
|
5
|
+
import subprocess
|
|
6
|
+
from collections.abc import AsyncIterator
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
from numpy.typing import NDArray
|
|
11
|
+
|
|
12
|
+
from vox.core.adapter import TTSAdapter
|
|
13
|
+
from vox.core.adapter_runtime import (
|
|
14
|
+
activate_runtime_path,
|
|
15
|
+
install_target_runtime_requirements,
|
|
16
|
+
purge_runtime_modules,
|
|
17
|
+
)
|
|
18
|
+
from vox.core.adapter_runtime import (
|
|
19
|
+
runtime_root as vox_runtime_root,
|
|
20
|
+
)
|
|
21
|
+
from vox.core.types import AdapterInfo, ModelFormat, ModelType, SynthesizeChunk, VoiceInfo
|
|
22
|
+
|
|
23
|
+
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Sample rate (Hz) the adapter reports in info() and on every emitted chunk.
ORPHEUS_SAMPLE_RATE: int = 24_000

# Requirement specifiers passed to the target-runtime installer.
ORPHEUS_RUNTIME_DEPS: tuple[str, ...] = ("orpheus-speech==0.1.0",)

# Preset voice ids advertised by OrpheusAdapter.list_voices().
ORPHEUS_VOICES: tuple[str, ...] = (
    "tara",
    "leah",
    "jess",
    "leo",
    "dan",
    "mia",
    "zoe",
    "zac",
)

# Voice used when the caller does not choose one.
DEFAULT_VOICE: str = "tara"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _runtime_root():
    """Return the ``orpheus`` entry beneath the shared Vox runtime root."""
    shared_root = vox_runtime_root()
    return shared_root / "orpheus"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _ensure_runtime_path() -> str:
    """Create the Orpheus runtime directory if needed and activate it.

    Returns:
        The value produced by ``activate_runtime_path`` for that directory.
    """
    orpheus_dir = _runtime_root()
    # Missing parents are created; an already existing directory is not an error.
    orpheus_dir.mkdir(parents=True, exist_ok=True)
    parent_dir = orpheus_dir.parent
    return activate_runtime_path(orpheus_dir, root=parent_dir)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _run_install_command(cmd: list[str], timeout: int) -> subprocess.CompletedProcess[str]:
    """Run ``cmd`` to completion and return the finished process.

    Both stdout and stderr are captured and decoded as text. A non-zero exit
    status does not raise; the caller inspects the returned object.

    Args:
        cmd: Argument vector to execute (no shell involved).
        timeout: Seconds to wait before ``subprocess.TimeoutExpired`` is raised.

    Returns:
        The ``CompletedProcess`` holding the return code and captured output.
    """
    return subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        timeout=timeout,
    )
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _install_orpheus_runtime() -> None:
    """Install ``ORPHEUS_RUNTIME_DEPS`` into the Orpheus target runtime.

    Raises:
        RuntimeError: If ``install_target_runtime_requirements`` reports
            failure by returning a falsy value.
    """
    target_path = _ensure_runtime_path()
    installed = install_target_runtime_requirements(
        target_path,
        ORPHEUS_RUNTIME_DEPS,
        timeout=1800,  # seconds allowed for the install to finish
        install_runner=_run_install_command,
        context="Orpheus runtime install",
    )
    if installed:
        return
    raise RuntimeError("Failed to install Orpheus runtime package.")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _clear_orpheus_modules() -> None:
    """Hand the Orpheus-related module names to ``purge_runtime_modules``.

    NOTE(review): presumably this evicts cached imports so a fresh install is
    picked up; the exact semantics live in ``vox.core.adapter_runtime``.
    """
    backend_modules = ("orpheus_tts", "snac", "vllm")
    purge_runtime_modules(backend_modules)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _load_orpheus_model_class() -> type[Any]:
    """Import ``orpheus_tts`` and return its ``OrpheusModel`` class.

    The runtime path is activated first. If the import fails, the runtime is
    installed, the related modules are purged, and the import is attempted
    once more; a second ``ImportError`` propagates to the caller.

    Raises:
        RuntimeError: If the module imports but exposes no ``OrpheusModel``.
    """
    _ensure_runtime_path()

    def _import_backend() -> Any:
        return importlib.import_module("orpheus_tts")

    try:
        backend = _import_backend()
    except ImportError:
        # Backend missing: install on demand, then retry with a clean slate.
        _install_orpheus_runtime()
        _clear_orpheus_modules()
        backend = _import_backend()

    model_cls = getattr(backend, "OrpheusModel", None)
    if model_cls is not None:
        return model_cls
    raise RuntimeError("Orpheus runtime is installed, but orpheus_tts.OrpheusModel was not found.")
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _select_dtype(device: str) -> Any:
    """Choose the torch dtype to request for ``device``.

    Args:
        device: Device string, e.g. ``"cuda"``, ``"cuda:0"`` or ``"cpu"``.

    Returns:
        ``torch.bfloat16`` for CUDA devices, ``torch.float32`` for any other
        device, or ``None`` when torch cannot be imported.
    """
    try:
        import torch
    except ImportError:
        return None
    # An indexed device such as "cuda:0" is still CUDA; an exact comparison
    # against "cuda" would silently send it down the float32 path.
    is_cuda = device == "cuda" or device.startswith("cuda:")
    return torch.bfloat16 if is_cuda else torch.float32
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _pcm16_bytes_to_float32(pcm: bytes) -> NDArray[np.float32]:
    """Decode 16-bit PCM bytes into float32 samples scaled by 1/32768.

    The bytes are read as ``np.int16`` (native byte order). An empty input
    yields an empty float32 array.

    Args:
        pcm: Raw sample bytes; ``np.frombuffer`` raises ``ValueError`` if the
            length is not a multiple of two.

    Returns:
        A one-dimensional float32 array with one entry per input sample.
    """
    samples = np.frombuffer(pcm, dtype=np.int16)
    as_float = samples.astype(np.float32)
    return as_float / 32768.0 if as_float.size else as_float
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _audio_array(chunk: Any) -> NDArray[np.float32]:
    """Normalise one backend chunk into a flat float32 array.

    ``bytes``/``bytearray`` chunks are decoded as 16-bit PCM. Any other object
    is first passed through its ``detach()``, ``cpu()`` and ``numpy()``
    methods, each only if present, and then converted with ``np.asarray``.

    Raises:
        RuntimeError: If a non-bytes chunk converts to an empty array.
    """
    if isinstance(chunk, (bytes, bytearray)):
        return _pcm16_bytes_to_float32(bytes(chunk))

    # Tensor-like objects expose these conversions; plain arrays and lists
    # have none of them and fall straight through.
    for step in ("detach", "cpu", "numpy"):
        if hasattr(chunk, step):
            chunk = getattr(chunk, step)()

    flat = np.ravel(np.asarray(chunk, dtype=np.float32))
    if not flat.size:
        raise RuntimeError("Orpheus produced an empty audio chunk.")
    return flat
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class OrpheusAdapter(TTSAdapter):
    """Vox TTS adapter backed by an ``orpheus_tts.OrpheusModel`` instance."""

    def __init__(self) -> None:
        # Populated by load(); None means "not loaded".
        self._model: Any | None = None
        self._model_id = ""
        self._device = "cpu"

    def info(self) -> AdapterInfo:
        """Return the static capability description of this adapter."""
        return AdapterInfo(
            name="orpheus-tts-vllm",
            type=ModelType.TTS,
            architectures=("orpheus-tts-vllm", "orpheus"),
            default_sample_rate=ORPHEUS_SAMPLE_RATE,
            supported_formats=(ModelFormat.PYTORCH,),
            supports_streaming=True,
            supports_voice_cloning=False,
            supported_languages=("en",),
        )

    def load(self, model_path: str, device: str, **kwargs: Any) -> None:
        """Instantiate the Orpheus model; a no-op if one is already loaded.

        Args:
            model_path: Identifier passed to the model when ``_source`` is
                absent or falsy.
            device: Device string; used to choose the dtype and for logging.
            **kwargs: Only ``_source`` is consumed; when truthy it replaces
                ``model_path`` as the model identifier.
        """
        if self._model is not None:
            return

        source = kwargs.pop("_source", None)
        self._model_id = source if source else model_path
        self._device = device
        cls = _load_orpheus_model_class()
        dtype = _select_dtype(device)

        logger.info("Loading Orpheus model: %s (device=%s)", self._model_id, self._device)
        if dtype is None:
            # torch could not be imported here, so there is no dtype to ask
            # for. Let the model use its own default rather than passing None.
            self._model = cls(self._model_id)
            return
        try:
            self._model = cls(self._model_id, dtype=dtype)
        except TypeError:
            # Fallback for a constructor that does not accept ``dtype``.
            self._model = cls(self._model_id)

    def unload(self) -> None:
        """Drop the model reference and reset the adapter to its initial state.

        Only the adapter's own reference is cleared; no explicit device-memory
        release is performed here.
        """
        self._model = None
        self._model_id = ""
        self._device = "cpu"

    @property
    def is_loaded(self) -> bool:
        """Whether a model instance is currently held."""
        return self._model is not None

    async def synthesize(
        self,
        text: str,
        *,
        voice: str | None = None,
        speed: float = 1.0,
        language: str | None = None,
        reference_audio: NDArray[np.float32] | None = None,
        reference_text: str | None = None,
    ) -> AsyncIterator[SynthesizeChunk]:
        """Stream synthesized audio for ``text`` as float32 sample bytes.

        Args:
            text: Prompt to speak. Empty or whitespace-only text yields nothing.
            voice: Voice id; ``DEFAULT_VOICE`` is used when omitted or empty.
            speed: Accepted for interface compatibility; ignored apart from a
                debug log, because no speed control is passed to the backend.
            language: Accepted for interface compatibility; unused.
            reference_audio: Must be ``None``.
            reference_text: Must be ``None``.

        Yields:
            One ``SynthesizeChunk`` per non-empty backend chunk, followed by an
            empty chunk with ``is_final=True``.

        Raises:
            RuntimeError: If no model is loaded.
            ValueError: If reference audio or text is supplied.
        """
        # Function-scope import keeps this block self-contained.
        import asyncio

        if self._model is None:
            raise RuntimeError("Orpheus model is not loaded — call load() first")
        if reference_audio is not None or reference_text is not None:
            raise ValueError("Orpheus does not support reference_audio/reference_text.")
        if not text or not text.strip():
            return

        selected_voice = voice or DEFAULT_VOICE
        kwargs: dict[str, Any] = {
            "prompt": text,
            "voice": selected_voice,
        }
        if speed and speed != 1.0:
            logger.debug("Orpheus runtime does not expose a speed control; ignoring speed=%s", speed)

        # generate_speech() returns a synchronous iterator. Looping over it
        # directly inside this coroutine would block the event loop for the
        # whole synthesis, so each chunk is fetched on a worker thread. The
        # sentinel default avoids a StopIteration crossing the thread boundary.
        chunks = iter(self._model.generate_speech(**kwargs))
        exhausted = object()
        while True:
            raw_chunk = await asyncio.to_thread(next, chunks, exhausted)
            if raw_chunk is exhausted:
                break
            audio = _audio_array(raw_chunk)
            if audio.size:
                yield SynthesizeChunk(
                    audio=audio.tobytes(),
                    sample_rate=ORPHEUS_SAMPLE_RATE,
                    is_final=False,
                )

        yield SynthesizeChunk(audio=b"", sample_rate=ORPHEUS_SAMPLE_RATE, is_final=True)

    def list_voices(self) -> list[VoiceInfo]:
        """Return one ``VoiceInfo`` per entry in ``ORPHEUS_VOICES``."""
        return [
            VoiceInfo(
                id=voice,
                name=voice.title(),
                language="en",
                description="Orpheus preset voice",
            )
            for voice in ORPHEUS_VOICES
        ]

    def estimate_vram_bytes(self, **kwargs: Any) -> int:
        """Return a fixed estimate of 10 GB (decimal); ``kwargs`` are ignored."""
        return 10_000_000_000
|