vox-piper 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,17 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *$py.class
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .eggs/
8
+ *.egg
9
+ .venv/
10
+ venv/
11
+ .env
12
+ .pytest_cache/
13
+ .ruff_cache/
14
+ .mypy_cache/
15
+ *.so
16
+ *.dylib
17
+ .coverage
@@ -0,0 +1,8 @@
1
+ Metadata-Version: 2.4
2
+ Name: vox-piper
3
+ Version: 0.1.0
4
+ Summary: Piper TTS adapter for Vox
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: numpy<2.4,>=1.26.0
7
+ Requires-Dist: piper-tts<2.0.0,>=1.2.0
8
+ Requires-Dist: vox>=0.1.0
@@ -0,0 +1,20 @@
1
+ [project]
2
+ name = "vox-piper"
3
+ version = "0.1.0"
4
+ description = "Piper TTS adapter for Vox"
5
+ requires-python = ">=3.11"
6
+ dependencies = [
7
+ "vox>=0.1.0",
8
+ "piper-tts>=1.2.0,<2.0.0",
9
+ "numpy>=1.26.0,<2.4",
10
+ ]
11
+
12
+ [project.entry-points."vox.adapters"]
13
+ piper = "vox_piper.adapter:PiperAdapter"
14
+
15
+ [build-system]
16
+ requires = ["hatchling"]
17
+ build-backend = "hatchling.build"
18
+
19
+ [tool.hatch.build.targets.wheel]
20
+ packages = ["src/vox_piper"]
@@ -0,0 +1,5 @@
1
+ from __future__ import annotations
2
+
3
+ from vox_piper.adapter import PiperAdapter
4
+
5
+ __all__ = ["PiperAdapter"]
@@ -0,0 +1,301 @@
1
+ from __future__ import annotations
2
+
3
+ import importlib.util
4
+ import json
5
+ import logging
6
+ import subprocess
7
+ import sys
8
+ from collections.abc import AsyncIterator
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ import numpy as np
13
+ import torch
14
+ from numpy.typing import NDArray
15
+
16
+ from vox.core.adapter import TTSAdapter
17
+ from vox.core.types import (
18
+ AdapterInfo,
19
+ ModelFormat,
20
+ ModelType,
21
+ SynthesizeChunk,
22
+ VoiceInfo,
23
+ )
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
# Fallback sample rate: used when a model config declares none, and reported
# as the adapter's default_sample_rate.
+ PIPER_SAMPLE_RATE = 22_050
28
# Voice id/name reported when the config has neither a speaker map nor a
# "voice"/"name" entry.
+ _DEFAULT_VOICE_ID = "default"
29
+
30
+
31
def _select_device(device: str) -> str:
    """Resolve a requested device name to ``"cpu"``, ``"cuda"`` or ``"mps"``.

    ``"cpu"`` is returned directly without querying torch.  ``"cuda"`` and
    ``"mps"`` are honoured only when torch reports that backend as available,
    and ``"auto"`` tries CUDA first, then MPS.  Anything else (an unknown
    name, or a backend that is not available) resolves to ``"cpu"``.
    """
    # NOTE(review): torch is imported at module level but is not listed in this
    # package's declared dependencies - presumably supplied by the host
    # install; confirm.
    if device != "cpu":
        automatic = device == "auto"
        if (device == "cuda" or automatic) and torch.cuda.is_available():
            return "cuda"
        if (device == "mps" or automatic) and torch.backends.mps.is_available():
            return "mps"
    return "cpu"
39
+
40
+
41
def _read_json(path: Path) -> dict[str, Any]:
    """Load and return the JSON document stored at *path*.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's locale encoding.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)
44
+
45
+
46
def _extract_sample_rate(config: dict[str, Any]) -> int:
    """Return the sample rate declared by a model config.

    Lookup order is ``config["audio"]["sample_rate"]``, then
    ``config["sample_rate"]``, then the module default ``PIPER_SAMPLE_RATE``.
    A candidate is accepted only if it is a positive ``int``; booleans are
    rejected even though ``bool`` is a subclass of ``int``.
    """

    def _usable(value: object) -> bool:
        # bool passes isinstance(..., int), so exclude it explicitly.
        return isinstance(value, int) and not isinstance(value, bool) and value > 0

    audio = config.get("audio")
    if isinstance(audio, dict) and _usable(audio.get("sample_rate")):
        return audio["sample_rate"]
    if _usable(config.get("sample_rate")):
        return config["sample_rate"]
    return PIPER_SAMPLE_RATE
53
+
54
+
55
def _extract_speakers(config: dict[str, Any]) -> list[VoiceInfo]:
    """Build the list of voices described by a model config.

    When ``config["speaker_id_map"]`` is a non-empty dict, one ``VoiceInfo``
    is produced per entry, ordered by speaker id.  Otherwise a single voice is
    returned, named after ``config["voice"]``, then ``config["name"]``, then
    the module default voice id.  The language is always reported as
    ``"en-us"``.
    """
    mapping = config.get("speaker_id_map")
    if not isinstance(mapping, dict) or not mapping:
        # Single-speaker model: expose exactly one voice.
        label = str(config.get("voice") or config.get("name") or _DEFAULT_VOICE_ID)
        return [
            VoiceInfo(
                id=label,
                name=label,
                language="en-us",
                description="Default Piper voice",
            )
        ]

    ordered = sorted(mapping.items(), key=lambda entry: entry[1])
    return [
        VoiceInfo(
            id=str(key),
            name=str(key),
            language="en-us",
            description=f"Piper speaker {index}",
        )
        for key, index in ordered
    ]
79
+
80
+
81
def _find_model_files(model_dir: Path) -> tuple[Path, Path]:
    """Locate a Piper ONNX model and its JSON config under *model_dir*.

    The directory is searched recursively for ``*.onnx`` files.  Candidates
    are tried in sorted order and the first one with a sibling config wins.
    For a model ``voice.onnx`` the config may be ``voice.onnx.json``
    (preferred) or ``voice.json``.

    Returns:
        ``(model_file, config_file)``.

    Raises:
        FileNotFoundError: if no ``.onnx`` file exists, or none of the models
            found has a config next to it.
    """
    # Skip directories that merely happen to be named "*.onnx".
    models = sorted(p for p in model_dir.rglob("*.onnx") if p.is_file())
    if not models:
        raise FileNotFoundError(f"No Piper ONNX model found in {model_dir}")

    # Try every model, not just the first: an unrelated .onnx without a config
    # must not hide a valid model/config pair that sorts after it.
    for model_file in models:
        for config_file in (
            model_file.with_suffix(".onnx.json"),
            model_file.with_suffix(".json"),
        ):
            if config_file.is_file():
                return model_file, config_file

    raise FileNotFoundError(f"No Piper config JSON found next to {models[0]}")
96
+
97
+
98
def _pcm16_to_float32_bytes(pcm: bytes) -> NDArray[np.float32]:
    """Convert raw 16-bit PCM bytes into a float32 sample array.

    The buffer is interpreted as native-endian ``int16`` and each sample is
    divided by 32768.  Despite the ``_bytes`` suffix in the name, the result
    is a NumPy array, not ``bytes``.  An empty buffer yields an empty float32
    array.
    """
    samples = np.frombuffer(pcm, dtype=np.int16).astype(np.float32)
    return samples / 32768.0 if samples.size else samples
103
+
104
+
105
+ def _speaker_id_for_voice(config: dict[str, Any], voice: str | None) -> int | None:
106
+ speaker_map = config.get("speaker_id_map")
107
+ if not isinstance(speaker_map, dict) or not speaker_map:
108
+ return None
109
+
110
+ if voice is None:
111
+ default_voice = config.get("default_voice")
112
+ if isinstance(default_voice, str) and default_voice in speaker_map:
113
+ return int(speaker_map[default_voice])
114
+ first_name = next(iter(sorted(speaker_map.items(), key=lambda item: item[1])))[0]
115
+ return int(speaker_map[first_name])
116
+
117
+ if voice in speaker_map:
118
+ return int(speaker_map[voice])
119
+
120
+ try:
121
+ return int(voice)
122
+ except ValueError as exc:
123
+ available = ", ".join(sorted(speaker_map))
124
+ raise ValueError(f"Unknown Piper voice '{voice}'. Available voices: {available}") from exc
125
+
126
+
127
def _load_piper_voice_class() -> Any:
    """Return the ``PiperVoice`` class, installing ``piper-tts`` if needed.

    If ``piper`` cannot be imported, the runtime package is installed via
    ``_install_piper_runtime()`` and the import is retried once.

    Raises:
        RuntimeError: if the install fails, or ``piper`` is still not
            importable afterwards.
    """
    try:
        from piper import PiperVoice
    except ImportError:
        _install_piper_runtime()
        # The package was installed by a subprocess after this interpreter
        # started; drop stale finder caches so the retry can see it.
        importlib.invalidate_caches()
        try:
            from piper import PiperVoice
        except ImportError as exc:  # pragma: no cover - depends on runtime image
            raise RuntimeError("Piper requires the piper-tts runtime package") from exc
    return PiperVoice
138
+
139
+
140
def _build_synthesis_config(
    config: dict[str, Any],
    voice: str | None,
    speed: float,
) -> Any:
    """Create the Piper ``SynthesisConfig`` for one synthesis request.

    The speaker id is resolved from *voice* via ``_speaker_id_for_voice``.
    *speed* is turned into Piper's ``length_scale`` as its reciprocal: the
    speed is floored at 0.01 before dividing, and the resulting scale is
    clamped to the range 0.25..4.0.
    """
    from piper.config import SynthesisConfig

    resolved_speaker = _speaker_id_for_voice(config, voice)

    floored_speed = max(speed, 0.01)
    scale = min(4.0, 1.0 / floored_speed)
    scale = max(0.25, scale)

    return SynthesisConfig(speaker_id=resolved_speaker, length_scale=scale)
150
+
151
+
152
def _ensure_pip_available() -> None:
    """Make sure ``pip`` is importable in the running interpreter.

    Does nothing when a ``pip`` module spec can already be found.  Otherwise
    runs ``python -m ensurepip --default-pip`` with the current interpreter
    and a 300 second timeout.

    Raises:
        RuntimeError: if ``ensurepip`` exits with a non-zero status; the
            message carries the captured stderr.
    """
    pip_missing = importlib.util.find_spec("pip") is None
    if pip_missing:
        command = [sys.executable, "-m", "ensurepip", "--default-pip"]
        bootstrap = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=300,
        )
        if bootstrap.returncode != 0:
            details = bootstrap.stderr.strip()
            raise RuntimeError(
                "Failed to bootstrap pip for Piper runtime install. "
                f"stderr: {details}"
            )
167
+
168
+
169
def _install_piper_runtime() -> None:
    """Install the ``piper-tts`` runtime package into the current interpreter.

    Ensures pip is present first, then runs
    ``python -m pip install "piper-tts>=1.2.0,<2.0.0"`` with a 900 second
    timeout.

    Raises:
        RuntimeError: if pip cannot be bootstrapped or the install exits with
            a non-zero status; the message carries the captured stderr.
    """
    _ensure_pip_available()

    command = [sys.executable, "-m", "pip", "install", "piper-tts>=1.2.0,<2.0.0"]
    install = subprocess.run(
        command,
        capture_output=True,
        text=True,
        timeout=900,
    )
    if install.returncode == 0:
        return

    details = install.stderr.strip()
    raise RuntimeError(
        "Failed to install Piper runtime package. "
        f"stderr: {details}"
    )
182
+
183
+
184
class PiperAdapter(TTSAdapter):
    """Vox TTS adapter backed by a Piper ONNX voice.

    Lifecycle: ``load()`` finds the model and config on disk and creates the
    Piper voice; ``synthesize()`` yields float32 audio; ``unload()`` drops the
    voice and resets all per-model state.
    """

    def __init__(self) -> None:
        """Create an adapter with no model loaded."""
        self._voice: Any = None  # loaded PiperVoice, or None when unloaded
        self._config: dict[str, Any] = {}  # parsed model config JSON
        self._model_id: str = ""  # identifier used in log messages
        self._device: str = "cpu"  # resolved device name
        self._sample_rate: int = PIPER_SAMPLE_RATE
        self._voices: list[VoiceInfo] = []

    def info(self) -> AdapterInfo:
        """Return the static capability description of this adapter."""
        return AdapterInfo(
            name="piper",
            type=ModelType.TTS,
            architectures=("piper",),
            default_sample_rate=PIPER_SAMPLE_RATE,
            supported_formats=(ModelFormat.ONNX,),
            supports_streaming=True,
            supports_voice_cloning=False,
            supported_languages=("en-us",),
        )

    def load(self, model_path: str, device: str, **kwargs: Any) -> None:
        """Load the Piper model found under *model_path*.

        A no-op when a model is already loaded.  Adapter state is only
        updated once the voice has been created, so a failed load leaves the
        adapter cleanly unloaded.

        Args:
            model_path: Directory searched recursively for the ``.onnx`` model
                and its JSON config.
            device: Requested device (``"cpu"``, ``"cuda"``, ``"mps"`` or
                ``"auto"``); CUDA is enabled in Piper only when it resolves to
                ``"cuda"``.
            **kwargs: ``_source``, if given and truthy, is used as the model
                id in place of the model file path.

        Raises:
            FileNotFoundError: if no model or no config is found.
            RuntimeError: if the Piper runtime is unavailable.
        """
        if self._voice is not None:
            return

        model_dir = Path(model_path)
        model_file, config_file = _find_model_files(model_dir)
        config = _read_json(config_file)
        sample_rate = _extract_sample_rate(config)
        voices = _extract_speakers(config)
        model_id = kwargs.pop("_source", None) or str(model_file)
        resolved_device = _select_device(device)

        use_cuda = resolved_device == "cuda"
        PiperVoice = _load_piper_voice_class()
        logger.info(
            "Loading Piper model: %s (device=%s, sample_rate=%s)",
            model_id,
            resolved_device,
            sample_rate,
        )
        voice = PiperVoice.load(str(model_file), str(config_file), use_cuda=use_cuda)

        # Commit everything together, only after the model loaded successfully.
        self._config = config
        self._sample_rate = sample_rate
        self._voices = voices
        self._model_id = model_id
        self._device = resolved_device
        self._voice = voice

    def unload(self) -> None:
        """Drop the loaded voice and reset all per-model state."""
        self._voice = None
        self._config = {}
        self._voices = []
        self._model_id = ""
        self._device = "cpu"
        # Do not let the previous model's rate leak into the next load.
        self._sample_rate = PIPER_SAMPLE_RATE

    @property
    def is_loaded(self) -> bool:
        """Whether a Piper voice is currently loaded."""
        return self._voice is not None

    @staticmethod
    def _chunk_to_float32(chunk: Any) -> NDArray[np.float32] | None:
        """Return one Piper chunk as a flat float32 array, or ``None``.

        ``audio_float_array`` is used as-is when present.  The int16 fallbacks
        (``_audio_int16_array``, then ``_audio_int16_bytes``) are divided by
        32768 so every path yields samples on the same scale.
        """
        floats = getattr(chunk, "audio_float_array", None)
        if floats is not None:
            return np.asarray(floats, dtype=np.float32).reshape(-1)

        pcm = getattr(chunk, "_audio_int16_array", None)
        if pcm is None:
            raw = getattr(chunk, "_audio_int16_bytes", None)
            if raw is None:
                return None
            pcm = np.frombuffer(raw, dtype=np.int16)
        return np.asarray(pcm, dtype=np.float32).reshape(-1) / 32768.0

    async def synthesize(
        self,
        text: str,
        *,
        voice: str | None = None,
        speed: float = 1.0,
        language: str | None = None,
        reference_audio: NDArray[np.float32] | None = None,
        reference_text: str | None = None,
    ) -> AsyncIterator[SynthesizeChunk]:
        """Synthesize *text* and yield it as float32 audio chunks.

        The whole utterance is synthesized first, then emitted in slices of
        ``sample_rate * 2`` samples (``is_final=False``), followed by one
        empty chunk with ``is_final=True``.  Empty or whitespace-only text
        yields nothing.  *language* is accepted but not used.

        Raises:
            RuntimeError: if no model is loaded or Piper returns no audio.
            ValueError: if reference audio/text is supplied, or *voice* is
                unknown.
        """
        if self._voice is None:
            raise RuntimeError("Piper model is not loaded — call load() first")

        if reference_audio is not None or reference_text is not None:
            raise ValueError("Piper does not support reference_audio/reference_text")

        if not text or not text.strip():
            return

        syn_config = _build_synthesis_config(self._config, voice, speed)
        # NOTE: synthesis runs synchronously inside this coroutine and is
        # fully materialized before the first chunk is yielded.
        audio_chunks = list(self._voice.synthesize(text, syn_config=syn_config))
        if not audio_chunks:
            raise RuntimeError("Piper produced no audio")

        sample_rate = int(getattr(audio_chunks[0], "sample_rate", self._sample_rate))
        audio_arrays = [
            samples
            for samples in map(self._chunk_to_float32, audio_chunks)
            if samples is not None
        ]
        if not audio_arrays:
            raise RuntimeError("Piper produced no audio")

        audio = np.concatenate(audio_arrays)
        if audio.size == 0:
            raise RuntimeError("Piper produced no audio")

        chunk_size = sample_rate * 2
        for start in range(0, len(audio), chunk_size):
            yield SynthesizeChunk(
                audio=audio[start:start + chunk_size].tobytes(),
                sample_rate=sample_rate,
                is_final=False,
            )

        yield SynthesizeChunk(audio=b"", sample_rate=sample_rate, is_final=True)

    def list_voices(self) -> list[VoiceInfo]:
        """Return a copy of the voices exposed by the loaded model."""
        return list(self._voices)

    def estimate_vram_bytes(self, **kwargs: Any) -> int:
        """Return a fixed memory estimate in bytes; *kwargs* are ignored."""
        return 220_000_000