vox-piper 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vox_piper-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "vox-piper"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Piper TTS adapter for Vox"
|
|
5
|
+
requires-python = ">=3.11"
|
|
6
|
+
dependencies = [
|
|
7
|
+
"vox>=0.1.0",
|
|
8
|
+
"piper-tts>=1.2.0,<2.0.0",
|
|
9
|
+
"numpy>=1.26.0,<2.4",
|
|
10
|
+
]
|
|
11
|
+
|
|
12
|
+
[project.entry-points."vox.adapters"]
|
|
13
|
+
piper = "vox_piper.adapter:PiperAdapter"
|
|
14
|
+
|
|
15
|
+
[build-system]
|
|
16
|
+
requires = ["hatchling"]
|
|
17
|
+
build-backend = "hatchling.build"
|
|
18
|
+
|
|
19
|
+
[tool.hatch.build.targets.wheel]
|
|
20
|
+
packages = ["src/vox_piper"]
|
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import importlib.util
|
|
4
|
+
import json
|
|
5
|
+
import logging
|
|
6
|
+
import subprocess
|
|
7
|
+
import sys
|
|
8
|
+
from collections.abc import AsyncIterator
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
import torch
|
|
14
|
+
from numpy.typing import NDArray
|
|
15
|
+
|
|
16
|
+
from vox.core.adapter import TTSAdapter
|
|
17
|
+
from vox.core.types import (
|
|
18
|
+
AdapterInfo,
|
|
19
|
+
ModelFormat,
|
|
20
|
+
ModelType,
|
|
21
|
+
SynthesizeChunk,
|
|
22
|
+
VoiceInfo,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
PIPER_SAMPLE_RATE = 22_050
|
|
28
|
+
_DEFAULT_VOICE_ID = "default"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _select_device(device: str) -> str:
|
|
32
|
+
if device == "cpu":
|
|
33
|
+
return "cpu"
|
|
34
|
+
if device in ("cuda", "auto") and torch.cuda.is_available():
|
|
35
|
+
return "cuda"
|
|
36
|
+
if device in ("mps", "auto") and torch.backends.mps.is_available():
|
|
37
|
+
return "mps"
|
|
38
|
+
return "cpu"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _read_json(path: Path) -> dict[str, Any]:
|
|
42
|
+
with open(path) as f:
|
|
43
|
+
return json.load(f)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _extract_sample_rate(config: dict[str, Any]) -> int:
|
|
47
|
+
audio = config.get("audio")
|
|
48
|
+
if isinstance(audio, dict) and isinstance(audio.get("sample_rate"), int):
|
|
49
|
+
return audio["sample_rate"]
|
|
50
|
+
if isinstance(config.get("sample_rate"), int):
|
|
51
|
+
return config["sample_rate"]
|
|
52
|
+
return PIPER_SAMPLE_RATE
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _extract_speakers(config: dict[str, Any]) -> list[VoiceInfo]:
    """Build the list of voices exposed by a voice config.

    When ``speaker_id_map`` is a non-empty dict, one ``VoiceInfo`` is produced
    per entry, ordered by speaker id. Otherwise a single voice is returned,
    named after ``config["voice"]``, ``config["name"]`` or the default voice
    id, in that order of preference. The language is always ``"en-us"``.
    """
    speaker_map = config.get("speaker_id_map")
    if not (isinstance(speaker_map, dict) and speaker_map):
        # Single-speaker model: expose exactly one voice.
        label = str(config.get("voice") or config.get("name") or _DEFAULT_VOICE_ID)
        return [
            VoiceInfo(
                id=label,
                name=label,
                language="en-us",
                description="Default Piper voice",
            )
        ]

    by_speaker_id = sorted(speaker_map.items(), key=lambda item: item[1])
    return [
        VoiceInfo(
            id=str(name),
            name=str(name),
            language="en-us",
            description=f"Piper speaker {speaker_id}",
        )
        for name, speaker_id in by_speaker_id
    ]
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _find_model_files(model_dir: Path) -> tuple[Path, Path]:
|
|
82
|
+
candidates = sorted(model_dir.rglob("*.onnx"))
|
|
83
|
+
if not candidates:
|
|
84
|
+
raise FileNotFoundError(f"No Piper ONNX model found in {model_dir}")
|
|
85
|
+
|
|
86
|
+
model_file = candidates[0]
|
|
87
|
+
config_candidates = [
|
|
88
|
+
model_file.with_suffix(".onnx.json"),
|
|
89
|
+
model_file.with_suffix(".json"),
|
|
90
|
+
]
|
|
91
|
+
for config_file in config_candidates:
|
|
92
|
+
if config_file.is_file():
|
|
93
|
+
return model_file, config_file
|
|
94
|
+
|
|
95
|
+
raise FileNotFoundError(f"No Piper config JSON found next to {model_file}")
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _pcm16_to_float32_bytes(pcm: bytes) -> NDArray[np.float32]:
|
|
99
|
+
audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32)
|
|
100
|
+
if audio.size == 0:
|
|
101
|
+
return audio
|
|
102
|
+
return audio / 32768.0
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _speaker_id_for_voice(config: dict[str, Any], voice: str | None) -> int | None:
|
|
106
|
+
speaker_map = config.get("speaker_id_map")
|
|
107
|
+
if not isinstance(speaker_map, dict) or not speaker_map:
|
|
108
|
+
return None
|
|
109
|
+
|
|
110
|
+
if voice is None:
|
|
111
|
+
default_voice = config.get("default_voice")
|
|
112
|
+
if isinstance(default_voice, str) and default_voice in speaker_map:
|
|
113
|
+
return int(speaker_map[default_voice])
|
|
114
|
+
first_name = next(iter(sorted(speaker_map.items(), key=lambda item: item[1])))[0]
|
|
115
|
+
return int(speaker_map[first_name])
|
|
116
|
+
|
|
117
|
+
if voice in speaker_map:
|
|
118
|
+
return int(speaker_map[voice])
|
|
119
|
+
|
|
120
|
+
try:
|
|
121
|
+
return int(voice)
|
|
122
|
+
except ValueError as exc:
|
|
123
|
+
available = ", ".join(sorted(speaker_map))
|
|
124
|
+
raise ValueError(f"Unknown Piper voice '{voice}'. Available voices: {available}") from exc
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _load_piper_voice_class() -> Any:
    """Return the ``PiperVoice`` class, installing the runtime if missing.

    If ``piper`` cannot be imported, the runtime package is installed via
    ``_install_piper_runtime()`` and the import is retried once.

    Raises:
        RuntimeError: if the install fails, or ``piper`` is still not
            importable afterwards.
    """
    try:
        from piper import PiperVoice
    except ImportError:
        # NOTE(review): this installs a package at runtime into the current
        # interpreter's environment; confirm that is acceptable for deployment.
        _install_piper_runtime()
        # Packages installed while the interpreter is running may not be
        # visible to the import system until its finder caches are dropped.
        importlib.invalidate_caches()
        try:
            from piper import PiperVoice
        except ImportError as exc:  # pragma: no cover - depends on runtime image
            raise RuntimeError("Piper requires the piper-tts runtime package") from exc
    return PiperVoice
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _build_synthesis_config(
    config: dict[str, Any],
    voice: str | None,
    speed: float,
) -> Any:
    """Create a Piper ``SynthesisConfig`` for one synthesis request.

    The speaker id is resolved from *voice* via ``_speaker_id_for_voice``.
    *speed* is converted to a length scale of ``1 / speed``, with speed
    floored at 0.01 and the result clamped to the range 0.25 to 4.0.
    """
    from piper.config import SynthesisConfig

    resolved_speaker = _speaker_id_for_voice(config, voice)

    floored_speed = max(speed, 0.01)
    inverse = 1.0 / floored_speed
    capped = min(4.0, inverse)
    scale = max(0.25, capped)

    return SynthesisConfig(speaker_id=resolved_speaker, length_scale=scale)
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _ensure_pip_available() -> None:
    """Make sure ``pip`` is importable in the running interpreter.

    Does nothing when ``pip`` can already be found. Otherwise runs
    ``python -m ensurepip --default-pip`` with a 300-second timeout.

    Raises:
        RuntimeError: if ensurepip exits with a non-zero status.
    """
    pip_present = importlib.util.find_spec("pip") is not None
    if not pip_present:
        bootstrap_cmd = [sys.executable, "-m", "ensurepip", "--default-pip"]
        outcome = subprocess.run(
            bootstrap_cmd,
            capture_output=True,
            text=True,
            timeout=300,
        )
        if outcome.returncode != 0:
            raise RuntimeError(
                "Failed to bootstrap pip for Piper runtime install. "
                f"stderr: {outcome.stderr.strip()}"
            )
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _install_piper_runtime() -> None:
    """Install the ``piper-tts`` package with pip into this interpreter.

    Ensures pip is available first, then runs ``python -m pip install`` with
    a 900-second timeout.

    Raises:
        RuntimeError: if pip cannot be bootstrapped or the install exits
            with a non-zero status.
    """
    _ensure_pip_available()

    pip_cmd = [sys.executable, "-m", "pip", "install", "piper-tts>=1.2.0,<2.0.0"]
    completed = subprocess.run(
        pip_cmd,
        capture_output=True,
        text=True,
        timeout=900,
    )
    if completed.returncode == 0:
        return

    raise RuntimeError(
        "Failed to install Piper runtime package. "
        f"stderr: {completed.stderr.strip()}"
    )
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
class PiperAdapter(TTSAdapter):
    """Vox TTS adapter backed by a Piper ONNX voice.

    The adapter holds at most one loaded voice. ``load()`` reads the model
    and its JSON config from disk, ``synthesize()`` yields float32 audio in
    chunks, and ``unload()`` returns the adapter to its initial state.
    """

    def __init__(self) -> None:
        # Loaded PiperVoice instance; None while nothing is loaded.
        self._voice: Any = None
        # Parsed voice config JSON.
        self._config: dict[str, Any] = {}
        # Identifier used in log messages (source name or model file path).
        self._model_id: str = ""
        # Device name resolved by _select_device().
        self._device: str = "cpu"
        # Sample rate from the config; fallback when a chunk reports none.
        self._sample_rate: int = PIPER_SAMPLE_RATE
        # Voices advertised by list_voices().
        self._voices: list[VoiceInfo] = []

    def info(self) -> AdapterInfo:
        """Return the static capability description of this adapter."""
        return AdapterInfo(
            name="piper",
            type=ModelType.TTS,
            architectures=("piper",),
            default_sample_rate=PIPER_SAMPLE_RATE,
            supported_formats=(ModelFormat.ONNX,),
            supports_streaming=True,
            supports_voice_cloning=False,
            supported_languages=("en-us",),
        )

    def load(self, model_path: str, device: str, **kwargs: Any) -> None:
        """Load a Piper voice from *model_path*.

        Does nothing if a voice is already loaded.

        Args:
            model_path: Location searched for the ONNX model and its config.
            device: Requested device name; resolved via ``_select_device``.
                Only a resolved ``"cuda"`` enables ``use_cuda`` for Piper.
            **kwargs: ``_source`` (optional) overrides the identifier used
                in log messages; other keys are ignored.

        Raises:
            FileNotFoundError: if the model or its config cannot be found.
            RuntimeError: if the Piper runtime is unavailable.
        """
        if self._voice is not None:
            return

        model_dir = Path(model_path)
        model_file, config_file = _find_model_files(model_dir)
        self._config = _read_json(config_file)
        self._sample_rate = _extract_sample_rate(self._config)
        self._voices = _extract_speakers(self._config)
        self._model_id = kwargs.pop("_source", None) or str(model_file)
        self._device = _select_device(device)

        use_cuda = self._device == "cuda"
        PiperVoice = _load_piper_voice_class()
        logger.info(
            "Loading Piper model: %s (device=%s, sample_rate=%s)",
            self._model_id,
            self._device,
            self._sample_rate,
        )
        self._voice = PiperVoice.load(str(model_file), str(config_file), use_cuda=use_cuda)

    def unload(self) -> None:
        """Drop the loaded voice and reset all per-model state."""
        self._voice = None
        self._config = {}
        self._voices = []
        self._model_id = ""
        self._device = "cpu"
        # Reset as well so no value from the previous model lingers.
        self._sample_rate = PIPER_SAMPLE_RATE

    @property
    def is_loaded(self) -> bool:
        """Whether a voice is currently loaded."""
        return self._voice is not None

    @staticmethod
    def _chunk_samples(chunk: Any) -> NDArray[np.float32] | None:
        """Return one Piper chunk as flat float32 samples, or None.

        ``audio_float_array`` is used as-is. The int16 fallbacks
        (``_audio_int16_array``, then ``_audio_int16_bytes``) are divided by
        32768 so all three sources share one amplitude scale.
        """
        float_audio = getattr(chunk, "audio_float_array", None)
        if float_audio is not None:
            return np.asarray(float_audio, dtype=np.float32).reshape(-1)

        pcm_array = getattr(chunk, "_audio_int16_array", None)
        if pcm_array is not None:
            return np.asarray(pcm_array, dtype=np.float32).reshape(-1) / 32768.0

        pcm_bytes = getattr(chunk, "_audio_int16_bytes", None)
        if pcm_bytes is not None:
            return _pcm16_to_float32_bytes(pcm_bytes)

        return None

    async def synthesize(
        self,
        text: str,
        *,
        voice: str | None = None,
        speed: float = 1.0,
        language: str | None = None,
        reference_audio: NDArray[np.float32] | None = None,
        reference_text: str | None = None,
    ) -> AsyncIterator[SynthesizeChunk]:
        """Synthesize *text* and yield it as float32 audio chunks.

        Args:
            text: Text to speak. Empty or whitespace-only text yields nothing.
            voice: Speaker name or id; ``None`` selects the default speaker.
            speed: Speaking-rate multiplier, converted to a length scale.
            language: Accepted for interface compatibility; not used here.
            reference_audio: Must be ``None``.
            reference_text: Must be ``None``.

        Yields:
            Chunks of up to ``2 * sample_rate`` samples as raw float32 bytes,
            then one empty chunk with ``is_final=True``.

        Raises:
            RuntimeError: if no model is loaded or no audio was produced.
            ValueError: if reference audio/text is supplied or *voice* is
                unknown.
        """
        if self._voice is None:
            raise RuntimeError("Piper model is not loaded — call load() first")

        if reference_audio is not None or reference_text is not None:
            raise ValueError("Piper does not support reference_audio/reference_text")

        if not text or not text.strip():
            return

        syn_config = _build_synthesis_config(self._config, voice, speed)
        # NOTE: the whole utterance is synthesized synchronously here, before
        # the first chunk is yielded.
        audio_chunks = list(self._voice.synthesize(text, syn_config=syn_config))
        if not audio_chunks:
            raise RuntimeError("Piper produced no audio")

        sample_rate = int(getattr(audio_chunks[0], "sample_rate", self._sample_rate))
        audio_arrays: list[NDArray[np.float32]] = []
        for piper_chunk in audio_chunks:
            samples = self._chunk_samples(piper_chunk)
            if samples is not None:
                audio_arrays.append(samples)

        if not audio_arrays:
            raise RuntimeError("Piper produced no audio")

        audio = np.concatenate(audio_arrays)
        if audio.size == 0:
            raise RuntimeError("Piper produced no audio")

        # Two samples-per-second multiples, i.e. two seconds of audio per chunk.
        chunk_size = sample_rate * 2
        for start in range(0, len(audio), chunk_size):
            yield SynthesizeChunk(
                audio=audio[start:start + chunk_size].tobytes(),
                sample_rate=sample_rate,
                is_final=False,
            )

        yield SynthesizeChunk(audio=b"", sample_rate=sample_rate, is_final=True)

    def list_voices(self) -> list[VoiceInfo]:
        """Return a copy of the voices found in the loaded config."""
        return list(self._voices)

    def estimate_vram_bytes(self, **kwargs: Any) -> int:
        """Return a fixed memory estimate of 220,000,000 bytes."""
        return 220_000_000
|