vox-orpheus 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,17 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *$py.class
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .eggs/
8
+ *.egg
9
+ .venv/
10
+ venv/
11
+ .env
12
+ .pytest_cache/
13
+ .ruff_cache/
14
+ .mypy_cache/
15
+ *.so
16
+ *.dylib
17
+ .coverage
@@ -0,0 +1,38 @@
1
+ Metadata-Version: 2.4
2
+ Name: vox-orpheus
3
+ Version: 0.1.0
4
+ Summary: Orpheus TTS adapter for Vox
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: numpy<2.4,>=1.26.0
7
+ Requires-Dist: vox-runtime>=0.2.2
8
+ Description-Content-Type: text/markdown
9
+
10
+ # vox-orpheus
11
+
12
+ `vox-orpheus` provides a Vox TTS adapter for Orpheus.
13
+
14
+ Adapters:
15
+
16
+ - `orpheus-tts-vllm` - Orpheus medium 3B backend through vLLM
17
+
18
+ ## Install
19
+
20
+ ```bash
21
+ pip install vox-orpheus
22
+ ```
23
+
24
+ ## Runtime Dependencies
25
+
26
+ The adapter package is intentionally light. The `orpheus-speech` backend and
27
+ its vLLM/SNAC runtime dependencies are installed on demand into the isolated
28
+ target runtime `$VOX_HOME/runtime/orpheus`.
29
+
30
+ ## Use with Vox
31
+
32
+ ```bash
33
+ vox pull orpheus-tts-vllm:medium-3b
34
+ vox run orpheus-tts-vllm:medium-3b "Hello from Orpheus" --voice tara
35
+ ```
36
+
37
+ Orpheus is GPU-oriented. Expect this adapter to require a CUDA-capable runtime
38
+ for practical latency.
@@ -0,0 +1,29 @@
1
+ # vox-orpheus
2
+
3
+ `vox-orpheus` provides a Vox TTS adapter for Orpheus.
4
+
5
+ Adapters:
6
+
7
+ - `orpheus-tts-vllm` - Orpheus medium 3B backend through vLLM
8
+
9
+ ## Install
10
+
11
+ ```bash
12
+ pip install vox-orpheus
13
+ ```
14
+
15
+ ## Runtime Dependencies
16
+
17
+ The adapter package is intentionally light. The `orpheus-speech` backend and
18
+ its vLLM/SNAC runtime dependencies are installed on demand into the isolated
19
+ target runtime `$VOX_HOME/runtime/orpheus`.
20
+
21
+ ## Use with Vox
22
+
23
+ ```bash
24
+ vox pull orpheus-tts-vllm:medium-3b
25
+ vox run orpheus-tts-vllm:medium-3b "Hello from Orpheus" --voice tara
26
+ ```
27
+
28
+ Orpheus is GPU-oriented. Expect this adapter to require a CUDA-capable runtime
29
+ for practical latency.
@@ -0,0 +1,26 @@
1
+ [project]
2
+ name = "vox-orpheus"
3
+ version = "0.1.0"
4
+ description = "Orpheus TTS adapter for Vox"
5
+ readme = { file = "README.md", content-type = "text/markdown" }
6
+ requires-python = ">=3.11"
7
+ dependencies = [
8
+ "vox-runtime>=0.2.2",
9
+ "numpy>=1.26.0,<2.4",
10
+ ]
11
+
12
+ [tool.vox.adapter]
13
+ import-package = "vox_orpheus"
14
+ runtime-policy = "target-runtime"
15
+ runtime-names = ["orpheus"]
16
+ adapter-types = ["tts"]
17
+
18
+ [project.entry-points."vox.adapters"]
19
+ orpheus-tts-vllm = "vox_orpheus.adapter:OrpheusAdapter"
20
+
21
+ [build-system]
22
+ requires = ["hatchling"]
23
+ build-backend = "hatchling.build"
24
+
25
+ [tool.hatch.build.targets.wheel]
26
+ packages = ["src/vox_orpheus"]
@@ -0,0 +1,3 @@
1
+ from vox_orpheus.adapter import OrpheusAdapter
2
+
3
+ __all__ = ["OrpheusAdapter"]
@@ -0,0 +1,199 @@
1
+ from __future__ import annotations
2
+
3
+ import importlib
4
+ import logging
5
+ import subprocess
6
+ from collections.abc import AsyncIterator
7
+ from typing import Any
8
+
9
+ import numpy as np
10
+ from numpy.typing import NDArray
11
+
12
+ from vox.core.adapter import TTSAdapter
13
+ from vox.core.adapter_runtime import (
14
+ activate_runtime_path,
15
+ install_target_runtime_requirements,
16
+ purge_runtime_modules,
17
+ )
18
+ from vox.core.adapter_runtime import (
19
+ runtime_root as vox_runtime_root,
20
+ )
21
+ from vox.core.types import AdapterInfo, ModelFormat, ModelType, SynthesizeChunk, VoiceInfo
22
+
23
# Preset speaker names exposed by this adapter.
ORPHEUS_VOICES = ("tara", "leah", "jess", "leo", "dan", "mia", "zoe", "zac")
# Voice used when the caller does not pick one (the first preset, "tara").
DEFAULT_VOICE = ORPHEUS_VOICES[0]

# Sample rate, in Hz, reported for every synthesized chunk.
ORPHEUS_SAMPLE_RATE = 24_000
# Requirement specifiers installed on demand into the Orpheus target runtime.
ORPHEUS_RUNTIME_DEPS = ("orpheus-speech==0.1.0",)

logger = logging.getLogger(__name__)
29
+
30
+
31
def _runtime_root():
    """Return the ``orpheus`` directory beneath the shared Vox runtime root."""
    base_dir = vox_runtime_root()
    return base_dir / "orpheus"
33
+
34
+
35
def _ensure_runtime_path() -> str:
    """Create the Orpheus runtime directory if missing and activate it.

    Returns:
        The value produced by ``activate_runtime_path`` for that directory.
    """
    target = _runtime_root()
    # Idempotent: an already existing directory is not an error.
    target.mkdir(exist_ok=True, parents=True)
    return activate_runtime_path(target, root=target.parent)
39
+
40
+
41
def _run_install_command(cmd: list[str], timeout: int) -> subprocess.CompletedProcess[str]:
    """Run *cmd* as a subprocess, capturing stdout and stderr as text.

    Args:
        cmd: Argument vector to execute (no shell is involved).
        timeout: Seconds to wait before ``subprocess.TimeoutExpired`` is raised.

    Returns:
        The completed process; a non-zero exit status does not raise.
    """
    completed = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        timeout=timeout,
    )
    return completed
43
+
44
+
45
def _install_orpheus_runtime() -> None:
    """Install the pinned Orpheus requirements into the target runtime.

    Raises:
        RuntimeError: If ``install_target_runtime_requirements`` returns a
            falsy result.
    """
    installed = install_target_runtime_requirements(
        _ensure_runtime_path(),
        ORPHEUS_RUNTIME_DEPS,
        timeout=1800,
        install_runner=_run_install_command,
        context="Orpheus runtime install",
    )
    if installed:
        return
    raise RuntimeError("Failed to install Orpheus runtime package.")
55
+
56
+
57
def _clear_orpheus_modules() -> None:
    """Ask the Vox runtime helper to purge the Orpheus-related packages.

    NOTE(review): presumably this drops cached module entries so that a
    following import sees the freshly installed runtime; confirm against
    ``purge_runtime_modules``.
    """
    stale_packages = ("orpheus_tts", "snac", "vllm")
    purge_runtime_modules(stale_packages)
59
+
60
+
61
def _load_orpheus_model_class() -> type[Any]:
    """Import ``orpheus_tts`` and return its ``OrpheusModel`` class.

    If the first import fails, the runtime requirements are installed, the
    related modules are purged, and the import is attempted once more.

    Raises:
        RuntimeError: If the module imports but has no ``OrpheusModel``.
        ImportError: If the module is still not importable after installing.
    """
    backend_name = "orpheus_tts"
    _ensure_runtime_path()
    try:
        backend = importlib.import_module(backend_name)
    except ImportError:
        # First use on this machine: install on demand, then retry once.
        _install_orpheus_runtime()
        _clear_orpheus_modules()
        backend = importlib.import_module(backend_name)

    model_cls = getattr(backend, "OrpheusModel", None)
    if model_cls is not None:
        return model_cls
    raise RuntimeError("Orpheus runtime is installed, but orpheus_tts.OrpheusModel was not found.")
74
+
75
+
76
def _select_dtype(device: str) -> Any:
    """Pick the torch dtype for *device*.

    Returns:
        ``torch.bfloat16`` when *device* is exactly ``"cuda"``,
        ``torch.float32`` for any other device, or ``None`` when torch
        cannot be imported.
    """
    try:
        import torch
    except ImportError:
        return None
    return torch.bfloat16 if device == "cuda" else torch.float32
84
+
85
+
86
def _pcm16_bytes_to_float32(pcm: bytes) -> NDArray[np.float32]:
    """Decode native-endian 16-bit PCM bytes into float32 samples.

    Each sample is divided by 32768, so the result lies in [-1.0, 1.0).
    Empty input yields an empty float32 array.
    """
    samples = np.frombuffer(pcm, dtype=np.int16)
    as_float = samples.astype(np.float32)
    if as_float.size:
        as_float = as_float / 32768.0
    return as_float
91
+
92
+
93
+ def _audio_array(chunk: Any) -> NDArray[np.float32]:
94
+ if isinstance(chunk, bytes | bytearray):
95
+ return _pcm16_bytes_to_float32(bytes(chunk))
96
+ if hasattr(chunk, "detach"):
97
+ chunk = chunk.detach()
98
+ if hasattr(chunk, "cpu"):
99
+ chunk = chunk.cpu()
100
+ if hasattr(chunk, "numpy"):
101
+ chunk = chunk.numpy()
102
+ audio = np.asarray(chunk, dtype=np.float32).reshape(-1)
103
+ if audio.size == 0:
104
+ raise RuntimeError("Orpheus produced an empty audio chunk.")
105
+ return audio
106
+
107
+
108
class OrpheusAdapter(TTSAdapter):
    """Vox TTS adapter that drives an ``orpheus_tts.OrpheusModel`` instance.

    The backend class is resolved lazily in :meth:`load`; :meth:`synthesize`
    streams float32 audio chunks at ``ORPHEUS_SAMPLE_RATE``.
    """

    def __init__(self) -> None:
        # Backend model instance; None while nothing is loaded.
        self._model: Any | None = None
        # Identifier passed to the backend constructor for the loaded model.
        self._model_id = ""
        # Device string supplied to load(); reset to "cpu" on unload().
        self._device = "cpu"

    def info(self) -> AdapterInfo:
        """Return the static capability description of this adapter."""
        return AdapterInfo(
            name="orpheus-tts-vllm",
            type=ModelType.TTS,
            architectures=("orpheus-tts-vllm", "orpheus"),
            default_sample_rate=ORPHEUS_SAMPLE_RATE,
            supported_formats=(ModelFormat.PYTORCH,),
            supports_streaming=True,
            supports_voice_cloning=False,
            supported_languages=("en",),
        )

    def load(self, model_path: str, device: str, **kwargs: Any) -> None:
        """Instantiate the Orpheus backend model; a no-op if already loaded.

        Args:
            model_path: Model location, used as the model id unless a truthy
                ``_source`` keyword is supplied.
            device: Device string; only used to choose the dtype and for
                logging.
            **kwargs: ``_source`` (optional) overrides ``model_path`` as the
                id handed to the backend. Other keys are ignored.

        Raises:
            RuntimeError: If the backend runtime cannot be installed or does
                not expose ``OrpheusModel``.
        """
        if self._model is not None:
            return

        source = kwargs.pop("_source", None)
        model_id = source if source else model_path
        cls = _load_orpheus_model_class()
        dtype = _select_dtype(device)

        logger.info("Loading Orpheus model: %s (device=%s)", model_id, device)
        if dtype is None:
            # torch is unavailable here: let the backend use its own default
            # dtype instead of overriding it with None.
            model = cls(model_id)
        else:
            try:
                model = cls(model_id, dtype=dtype)
            except TypeError:
                # Fallback for backend constructors without a dtype keyword.
                model = cls(model_id)

        # Commit state only after construction succeeded, so a failed load
        # leaves the adapter in its unloaded state.
        self._model = model
        self._model_id = model_id
        self._device = device

    def unload(self) -> None:
        """Drop the model reference and reset the adapter to its initial state."""
        self._model = None
        self._model_id = ""
        self._device = "cpu"

    @property
    def is_loaded(self) -> bool:
        """Whether a backend model instance is currently held."""
        return self._model is not None

    async def synthesize(
        self,
        text: str,
        *,
        voice: str | None = None,
        speed: float = 1.0,
        language: str | None = None,
        reference_audio: NDArray[np.float32] | None = None,
        reference_text: str | None = None,
    ) -> AsyncIterator[SynthesizeChunk]:
        """Stream synthesized speech for *text*.

        Args:
            text: Prompt to speak. Empty or whitespace-only text yields nothing.
            voice: Voice name passed to the backend; defaults to ``DEFAULT_VOICE``.
            speed: Ignored (a debug message is logged when it is not 1.0).
            language: Accepted for interface compatibility; not used.
            reference_audio: Must be ``None``.
            reference_text: Must be ``None``.

        Yields:
            One non-final ``SynthesizeChunk`` of float32 sample bytes per
            non-empty backend chunk, then a final empty chunk with
            ``is_final=True``.

        Raises:
            RuntimeError: If no model is loaded.
            ValueError: If reference audio or text is supplied.
        """
        # Imported locally because the file-level import block is outside this class.
        import asyncio

        if self._model is None:
            raise RuntimeError("Orpheus model is not loaded — call load() first")
        if reference_audio is not None or reference_text is not None:
            raise ValueError("Orpheus does not support reference_audio/reference_text.")
        if not text or not text.strip():
            return

        selected_voice = voice or DEFAULT_VOICE
        kwargs: dict[str, Any] = {
            "prompt": text,
            "voice": selected_voice,
        }
        if speed and speed != 1.0:
            logger.debug("Orpheus runtime does not expose a speed control; ignoring speed=%s", speed)

        # generate_speech is a synchronous iterator. Pull each chunk in a
        # worker thread so waiting on the backend does not stall the event
        # loop. The sentinel default keeps StopIteration from being raised
        # through the awaited call.
        stream = iter(self._model.generate_speech(**kwargs))
        exhausted = object()
        while True:
            raw_chunk = await asyncio.to_thread(next, stream, exhausted)
            if raw_chunk is exhausted:
                break
            audio = _audio_array(raw_chunk)
            if audio.size:
                yield SynthesizeChunk(
                    audio=audio.tobytes(),
                    sample_rate=ORPHEUS_SAMPLE_RATE,
                    is_final=False,
                )

        yield SynthesizeChunk(audio=b"", sample_rate=ORPHEUS_SAMPLE_RATE, is_final=True)

    def list_voices(self) -> list[VoiceInfo]:
        """Return one ``VoiceInfo`` per preset in ``ORPHEUS_VOICES``."""
        return [
            VoiceInfo(
                id=voice,
                name=voice.title(),
                language="en",
                description="Orpheus preset voice",
            )
            for voice in ORPHEUS_VOICES
        ]

    def estimate_vram_bytes(self, **kwargs: Any) -> int:
        """Return a fixed estimate of 10**10 bytes, regardless of *kwargs*."""
        return 10_000_000_000