agent_cli-0.70.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_cli/__init__.py +5 -0
- agent_cli/__main__.py +6 -0
- agent_cli/_extras.json +14 -0
- agent_cli/_requirements/.gitkeep +0 -0
- agent_cli/_requirements/audio.txt +79 -0
- agent_cli/_requirements/faster-whisper.txt +215 -0
- agent_cli/_requirements/kokoro.txt +425 -0
- agent_cli/_requirements/llm.txt +183 -0
- agent_cli/_requirements/memory.txt +355 -0
- agent_cli/_requirements/mlx-whisper.txt +222 -0
- agent_cli/_requirements/piper.txt +176 -0
- agent_cli/_requirements/rag.txt +402 -0
- agent_cli/_requirements/server.txt +154 -0
- agent_cli/_requirements/speed.txt +77 -0
- agent_cli/_requirements/vad.txt +155 -0
- agent_cli/_requirements/wyoming.txt +71 -0
- agent_cli/_tools.py +368 -0
- agent_cli/agents/__init__.py +23 -0
- agent_cli/agents/_voice_agent_common.py +136 -0
- agent_cli/agents/assistant.py +383 -0
- agent_cli/agents/autocorrect.py +284 -0
- agent_cli/agents/chat.py +496 -0
- agent_cli/agents/memory/__init__.py +31 -0
- agent_cli/agents/memory/add.py +190 -0
- agent_cli/agents/memory/proxy.py +160 -0
- agent_cli/agents/rag_proxy.py +128 -0
- agent_cli/agents/speak.py +209 -0
- agent_cli/agents/transcribe.py +671 -0
- agent_cli/agents/transcribe_daemon.py +499 -0
- agent_cli/agents/voice_edit.py +291 -0
- agent_cli/api.py +22 -0
- agent_cli/cli.py +106 -0
- agent_cli/config.py +503 -0
- agent_cli/config_cmd.py +307 -0
- agent_cli/constants.py +27 -0
- agent_cli/core/__init__.py +1 -0
- agent_cli/core/audio.py +461 -0
- agent_cli/core/audio_format.py +299 -0
- agent_cli/core/chroma.py +88 -0
- agent_cli/core/deps.py +191 -0
- agent_cli/core/openai_proxy.py +139 -0
- agent_cli/core/process.py +195 -0
- agent_cli/core/reranker.py +120 -0
- agent_cli/core/sse.py +87 -0
- agent_cli/core/transcription_logger.py +70 -0
- agent_cli/core/utils.py +526 -0
- agent_cli/core/vad.py +175 -0
- agent_cli/core/watch.py +65 -0
- agent_cli/dev/__init__.py +14 -0
- agent_cli/dev/cli.py +1588 -0
- agent_cli/dev/coding_agents/__init__.py +19 -0
- agent_cli/dev/coding_agents/aider.py +24 -0
- agent_cli/dev/coding_agents/base.py +167 -0
- agent_cli/dev/coding_agents/claude.py +39 -0
- agent_cli/dev/coding_agents/codex.py +24 -0
- agent_cli/dev/coding_agents/continue_dev.py +15 -0
- agent_cli/dev/coding_agents/copilot.py +24 -0
- agent_cli/dev/coding_agents/cursor_agent.py +48 -0
- agent_cli/dev/coding_agents/gemini.py +28 -0
- agent_cli/dev/coding_agents/opencode.py +15 -0
- agent_cli/dev/coding_agents/registry.py +49 -0
- agent_cli/dev/editors/__init__.py +19 -0
- agent_cli/dev/editors/base.py +89 -0
- agent_cli/dev/editors/cursor.py +15 -0
- agent_cli/dev/editors/emacs.py +46 -0
- agent_cli/dev/editors/jetbrains.py +56 -0
- agent_cli/dev/editors/nano.py +31 -0
- agent_cli/dev/editors/neovim.py +33 -0
- agent_cli/dev/editors/registry.py +59 -0
- agent_cli/dev/editors/sublime.py +20 -0
- agent_cli/dev/editors/vim.py +42 -0
- agent_cli/dev/editors/vscode.py +15 -0
- agent_cli/dev/editors/zed.py +20 -0
- agent_cli/dev/project.py +568 -0
- agent_cli/dev/registry.py +52 -0
- agent_cli/dev/skill/SKILL.md +141 -0
- agent_cli/dev/skill/examples.md +571 -0
- agent_cli/dev/terminals/__init__.py +19 -0
- agent_cli/dev/terminals/apple_terminal.py +82 -0
- agent_cli/dev/terminals/base.py +56 -0
- agent_cli/dev/terminals/gnome.py +51 -0
- agent_cli/dev/terminals/iterm2.py +84 -0
- agent_cli/dev/terminals/kitty.py +77 -0
- agent_cli/dev/terminals/registry.py +48 -0
- agent_cli/dev/terminals/tmux.py +58 -0
- agent_cli/dev/terminals/warp.py +132 -0
- agent_cli/dev/terminals/zellij.py +78 -0
- agent_cli/dev/worktree.py +856 -0
- agent_cli/docs_gen.py +417 -0
- agent_cli/example-config.toml +185 -0
- agent_cli/install/__init__.py +5 -0
- agent_cli/install/common.py +89 -0
- agent_cli/install/extras.py +174 -0
- agent_cli/install/hotkeys.py +48 -0
- agent_cli/install/services.py +87 -0
- agent_cli/memory/__init__.py +7 -0
- agent_cli/memory/_files.py +250 -0
- agent_cli/memory/_filters.py +63 -0
- agent_cli/memory/_git.py +157 -0
- agent_cli/memory/_indexer.py +142 -0
- agent_cli/memory/_ingest.py +408 -0
- agent_cli/memory/_persistence.py +182 -0
- agent_cli/memory/_prompt.py +91 -0
- agent_cli/memory/_retrieval.py +294 -0
- agent_cli/memory/_store.py +169 -0
- agent_cli/memory/_streaming.py +44 -0
- agent_cli/memory/_tasks.py +48 -0
- agent_cli/memory/api.py +113 -0
- agent_cli/memory/client.py +272 -0
- agent_cli/memory/engine.py +361 -0
- agent_cli/memory/entities.py +43 -0
- agent_cli/memory/models.py +112 -0
- agent_cli/opts.py +433 -0
- agent_cli/py.typed +0 -0
- agent_cli/rag/__init__.py +3 -0
- agent_cli/rag/_indexer.py +67 -0
- agent_cli/rag/_indexing.py +226 -0
- agent_cli/rag/_prompt.py +30 -0
- agent_cli/rag/_retriever.py +156 -0
- agent_cli/rag/_store.py +48 -0
- agent_cli/rag/_utils.py +218 -0
- agent_cli/rag/api.py +175 -0
- agent_cli/rag/client.py +299 -0
- agent_cli/rag/engine.py +302 -0
- agent_cli/rag/models.py +55 -0
- agent_cli/scripts/.runtime/.gitkeep +0 -0
- agent_cli/scripts/__init__.py +1 -0
- agent_cli/scripts/check_plugin_skill_sync.py +50 -0
- agent_cli/scripts/linux-hotkeys/README.md +63 -0
- agent_cli/scripts/linux-hotkeys/toggle-autocorrect.sh +45 -0
- agent_cli/scripts/linux-hotkeys/toggle-transcription.sh +58 -0
- agent_cli/scripts/linux-hotkeys/toggle-voice-edit.sh +58 -0
- agent_cli/scripts/macos-hotkeys/README.md +45 -0
- agent_cli/scripts/macos-hotkeys/skhd-config-example +5 -0
- agent_cli/scripts/macos-hotkeys/toggle-autocorrect.sh +12 -0
- agent_cli/scripts/macos-hotkeys/toggle-transcription.sh +37 -0
- agent_cli/scripts/macos-hotkeys/toggle-voice-edit.sh +37 -0
- agent_cli/scripts/nvidia-asr-server/README.md +99 -0
- agent_cli/scripts/nvidia-asr-server/pyproject.toml +27 -0
- agent_cli/scripts/nvidia-asr-server/server.py +255 -0
- agent_cli/scripts/nvidia-asr-server/shell.nix +32 -0
- agent_cli/scripts/nvidia-asr-server/uv.lock +4654 -0
- agent_cli/scripts/run-openwakeword.sh +11 -0
- agent_cli/scripts/run-piper-windows.ps1 +30 -0
- agent_cli/scripts/run-piper.sh +24 -0
- agent_cli/scripts/run-whisper-linux.sh +40 -0
- agent_cli/scripts/run-whisper-macos.sh +6 -0
- agent_cli/scripts/run-whisper-windows.ps1 +51 -0
- agent_cli/scripts/run-whisper.sh +9 -0
- agent_cli/scripts/run_faster_whisper_server.py +136 -0
- agent_cli/scripts/setup-linux-hotkeys.sh +72 -0
- agent_cli/scripts/setup-linux.sh +108 -0
- agent_cli/scripts/setup-macos-hotkeys.sh +61 -0
- agent_cli/scripts/setup-macos.sh +76 -0
- agent_cli/scripts/setup-windows.ps1 +63 -0
- agent_cli/scripts/start-all-services-windows.ps1 +53 -0
- agent_cli/scripts/start-all-services.sh +178 -0
- agent_cli/scripts/sync_extras.py +138 -0
- agent_cli/server/__init__.py +3 -0
- agent_cli/server/cli.py +721 -0
- agent_cli/server/common.py +222 -0
- agent_cli/server/model_manager.py +288 -0
- agent_cli/server/model_registry.py +225 -0
- agent_cli/server/proxy/__init__.py +3 -0
- agent_cli/server/proxy/api.py +444 -0
- agent_cli/server/streaming.py +67 -0
- agent_cli/server/tts/__init__.py +3 -0
- agent_cli/server/tts/api.py +335 -0
- agent_cli/server/tts/backends/__init__.py +82 -0
- agent_cli/server/tts/backends/base.py +139 -0
- agent_cli/server/tts/backends/kokoro.py +403 -0
- agent_cli/server/tts/backends/piper.py +253 -0
- agent_cli/server/tts/model_manager.py +201 -0
- agent_cli/server/tts/model_registry.py +28 -0
- agent_cli/server/tts/wyoming_handler.py +249 -0
- agent_cli/server/whisper/__init__.py +3 -0
- agent_cli/server/whisper/api.py +413 -0
- agent_cli/server/whisper/backends/__init__.py +89 -0
- agent_cli/server/whisper/backends/base.py +97 -0
- agent_cli/server/whisper/backends/faster_whisper.py +225 -0
- agent_cli/server/whisper/backends/mlx.py +270 -0
- agent_cli/server/whisper/languages.py +116 -0
- agent_cli/server/whisper/model_manager.py +157 -0
- agent_cli/server/whisper/model_registry.py +28 -0
- agent_cli/server/whisper/wyoming_handler.py +203 -0
- agent_cli/services/__init__.py +343 -0
- agent_cli/services/_wyoming_utils.py +64 -0
- agent_cli/services/asr.py +506 -0
- agent_cli/services/llm.py +228 -0
- agent_cli/services/tts.py +450 -0
- agent_cli/services/wake_word.py +142 -0
- agent_cli-0.70.5.dist-info/METADATA +2118 -0
- agent_cli-0.70.5.dist-info/RECORD +196 -0
- agent_cli-0.70.5.dist-info/WHEEL +4 -0
- agent_cli-0.70.5.dist-info/entry_points.txt +4 -0
- agent_cli-0.70.5.dist-info/licenses/LICENSE +21 -0
agent_cli/server/tts/backends/kokoro.py
@@ -0,0 +1,403 @@
+"""Kokoro TTS backend using PyTorch-based synthesis."""
+
+from __future__ import annotations
+
+import asyncio
+import io
+import logging
+import time
+import wave
+from concurrent.futures import ProcessPoolExecutor
+from dataclasses import dataclass, field
+from multiprocessing import Manager, get_context
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+from agent_cli import constants
+from agent_cli.core.process import set_process_title
+from agent_cli.server.streaming import AsyncQueueReader, QueueWriter
+from agent_cli.server.tts.backends.base import (
+    BackendConfig,
+    InvalidTextError,
+    SynthesisResult,
+    get_backend_cache_dir,
+    get_torch_device,
+)
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncIterator
+
+logger = logging.getLogger(__name__)
+
+# HuggingFace repository for Kokoro model and voices
+KOKORO_HF_REPO = "hexgrad/Kokoro-82M"
+
+# Default voice if none specified
+DEFAULT_VOICE = "af_heart"
+
+
+# --- Subprocess state (only used within subprocess worker) ---
+# This state persists across function calls within the subprocess because:
+# 1. Model loading is expensive and must be reused across synthesis calls
+# 2. PyTorch models cannot be pickled/passed through IPC queues
+# 3. The subprocess is long-lived (ProcessPoolExecutor reuses workers)
+
+
+@dataclass
+class _SubprocessState:
+    """Container for subprocess-local state. Not shared with main process."""
+
+    model: Any = None
+    device: str | None = None
+    pipelines: dict[str, Any] = field(default_factory=dict)
+
+
+_state = _SubprocessState()
+
+
+# --- Subprocess worker functions (run in isolated process) ---
+
+
+def _hf_download(filename: str, local_dir: Path) -> Path:
+    """Download a file from Kokoro HuggingFace repo."""
+    from huggingface_hub import hf_hub_download  # noqa: PLC0415
+
+    local_dir.mkdir(parents=True, exist_ok=True)
+    hf_hub_download(repo_id=KOKORO_HF_REPO, filename=filename, local_dir=local_dir)
+    return local_dir / Path(filename).name
+
+
+def _ensure_model(cache_dir: Path) -> Path:
+    """Ensure model and config exist, downloading if needed."""
+    model_dir = cache_dir / "model"
+    model_path = model_dir / "kokoro-v1_0.pth"
+    config_path = model_dir / "config.json"
+
+    if not model_path.exists():
+        logger.info("Downloading Kokoro model...")
+        _hf_download("kokoro-v1_0.pth", model_dir)
+    if not config_path.exists():
+        logger.info("Downloading Kokoro config...")
+        _hf_download("config.json", model_dir)
+
+    return model_path
+
+
+def _ensure_voice(voice_name: str, cache_dir: Path) -> Path:
+    """Ensure voice file exists, downloading if needed."""
+    voice_path = cache_dir / "voices" / f"{voice_name}.pt"
+    if not voice_path.exists():
+        logger.info("Downloading voice '%s'...", voice_name)
+        # HuggingFace downloads to local_dir/filename, so pass cache_dir (not cache_dir/voices)
+        _hf_download(f"voices/{voice_name}.pt", cache_dir)
+    return voice_path
+
+
+def _resolve_model_path(model_name: str, cache_dir: Path) -> Path:
+    """Resolve model path, downloading if necessary."""
+    # Explicit path to existing file
+    path = Path(model_name)
+    if path.exists() and path.suffix == ".pth":
+        return path
+
+    # Otherwise download from HuggingFace
+    return _ensure_model(cache_dir)
+
+
+def _resolve_voice_path(voice: str | None, cache_dir: Path) -> tuple[str, str]:
+    """Resolve voice name to path and determine language code."""
+    voice_name = voice or DEFAULT_VOICE
+
+    # Explicit path to existing file
+    path = Path(voice_name)
+    if path.exists() and path.suffix == ".pt":
+        # Kokoro convention: first letter of voice name = language code (a=American, b=British, etc.)
+        return str(path), path.stem[0].lower()
+
+    # Download from HuggingFace if needed
+    voice_path = _ensure_voice(voice_name, cache_dir)
+    return str(voice_path), voice_name[0].lower()
+
+
+def _get_pipeline(voice: str | None, cache_dir: str) -> tuple[Any, str]:
+    """Get or create pipeline for the given voice. Returns (pipeline, voice_path)."""
+    from kokoro import KPipeline  # noqa: PLC0415
+
+    cache_path = Path(cache_dir)
+    voice_path, lang_code = _resolve_voice_path(voice, cache_path)
+
+    if lang_code not in _state.pipelines:
+        _state.pipelines[lang_code] = KPipeline(
+            lang_code=lang_code,
+            model=_state.model,
+            device=_state.device,
+        )
+
+    return _state.pipelines[lang_code], voice_path
+
+
+def _load_model_in_subprocess(
+    model_name: str,
+    device: str,
+    cache_dir: str,
+) -> str:
+    """Load Kokoro model in subprocess. Returns actual device string."""
+    import torch  # noqa: PLC0415
+    from kokoro import KModel, KPipeline  # noqa: PLC0415
+
+    set_process_title("tts-kokoro")
+    cache_path = Path(cache_dir)
+
+    # Resolve model path (downloads if needed)
+    model_path = _resolve_model_path(model_name, cache_path)
+    config_path = model_path.parent / "config.json"
+
+    # Determine actual device
+    if device == "auto":
+        device = get_torch_device()
+
+    # Load and move model to device
+    model = KModel(config=str(config_path), model=str(model_path)).eval()
+    if device == "cuda":
+        model = model.cuda()
+    elif device == "mps":
+        model = model.to(torch.device("mps"))
+
+    # Store in subprocess state for reuse
+    _state.model = model
+    _state.device = device
+    _state.pipelines = {}
+
+    # Warmup pipeline for default language
+    lang = DEFAULT_VOICE[0]
+    logger.info("Warming up pipeline for lang_code '%s'...", lang)
+    _state.pipelines[lang] = KPipeline(lang_code=lang, model=model, device=device)
+
+    return device
+
+
+def _synthesize_in_subprocess(
+    text: str,
+    voice: str | None,
+    speed: float,
+    cache_dir: str,
+) -> dict[str, Any]:
+    """Synthesize text to audio in subprocess."""
+    import numpy as np  # noqa: PLC0415
+
+    pipeline, voice_path = _get_pipeline(voice, cache_dir)
+
+    # Synthesize and collect audio chunks
+    audio_chunks = [
+        r.audio.numpy()
+        for r in pipeline(text, voice=voice_path, speed=speed, model=_state.model)
+        if r.audio is not None
+    ]
+    if not audio_chunks:
+        msg = "No audio generated"
+        raise RuntimeError(msg)
+
+    # Convert to int16 WAV
+    audio = np.concatenate(audio_chunks)
+    audio_int16 = (audio * 32767).astype(np.int16)
+
+    sample_rate = constants.KOKORO_DEFAULT_SAMPLE_RATE
+    buffer = io.BytesIO()
+    with wave.open(buffer, "wb") as wav:
+        wav.setnchannels(1)
+        wav.setsampwidth(2)
+        wav.setframerate(sample_rate)
+        wav.writeframes(audio_int16.tobytes())
+
+    return {
+        "audio": buffer.getvalue(),
+        "sample_rate": sample_rate,
+        "duration": len(audio_int16) / sample_rate,
+    }
+
+
+def _synthesize_stream_in_subprocess(
+    text: str,
+    voice: str | None,
+    speed: float,
+    cache_dir: str,
+    output_queue: Any,  # Manager queue proxy
+) -> None:
+    """Stream audio chunks through queue as Kokoro generates them."""
+    import numpy as np  # noqa: PLC0415
+
+    writer = QueueWriter(output_queue)
+
+    try:
+        pipeline, voice_path = _get_pipeline(voice, cache_dir)
+
+        chunk_count = 0
+        total_samples = 0
+
+        for result in pipeline(text, voice=voice_path, speed=speed, model=_state.model):
+            if result.audio is not None:
+                # Convert to int16 PCM bytes
+                audio_int16 = (result.audio.numpy() * 32767).astype(np.int16)
+                writer.send_data(audio_int16.tobytes())
+                chunk_count += 1
+                total_samples += len(audio_int16)
+
+        sample_rate = constants.KOKORO_DEFAULT_SAMPLE_RATE
+        writer.send_done(
+            {
+                "chunk_count": chunk_count,
+                "total_samples": total_samples,
+                "duration": total_samples / sample_rate,
+                "sample_rate": sample_rate,
+            },
+        )
+
+    except Exception as e:
+        writer.send_error(e)
+
+
+class KokoroBackend:
+    """Kokoro TTS backend with subprocess isolation.
+
+    Uses kokoro library for high-quality neural TTS on CUDA, MPS, or CPU.
+    Models and voices auto-download from HuggingFace on first use.
+    Subprocess terminates on unload, releasing all GPU/CPU memory.
+    """
+
+    def __init__(self, config: BackendConfig) -> None:
+        """Initialize the Kokoro backend."""
+        self._config = config
+        self._executor: ProcessPoolExecutor | None = None
+        self._device: str | None = None
+        self._cache_dir = config.cache_dir or get_backend_cache_dir("kokoro")
+
+    @property
+    def is_loaded(self) -> bool:
+        """Check if the model is currently loaded."""
+        return self._executor is not None
+
+    @property
+    def device(self) -> str | None:
+        """Get the device the model is loaded on."""
+        return self._device
+
+    async def load(self) -> float:
+        """Load model in subprocess. Downloads from HuggingFace if needed."""
+        if self._executor is not None:
+            return 0.0
+
+        start_time = time.time()
+        ctx = get_context("spawn")
+        self._executor = ProcessPoolExecutor(max_workers=1, mp_context=ctx)
+
+        loop = asyncio.get_running_loop()
+        self._device = await loop.run_in_executor(
+            self._executor,
+            _load_model_in_subprocess,
+            self._config.model_name,
+            self._config.device,
+            str(self._cache_dir),
+        )
+
+        load_duration = time.time() - start_time
+        logger.info("Loaded Kokoro model on %s in %.2fs", self._device, load_duration)
+        return load_duration
+
+    async def unload(self) -> None:
+        """Shutdown subprocess, releasing all memory."""
+        if self._executor is None:
+            return
+        self._executor.shutdown(wait=False, cancel_futures=True)
+        self._executor = None
+        self._device = None
+        logger.info("Kokoro model unloaded (subprocess terminated)")
+
+    async def synthesize(
+        self,
+        text: str,
+        *,
+        voice: str | None = None,
+        speed: float = 1.0,
+    ) -> SynthesisResult:
+        """Synthesize text to audio."""
+        if self._executor is None:
+            msg = "Model not loaded. Call load() first."
+            raise RuntimeError(msg)
+
+        if not text or not text.strip():
+            msg = "Text cannot be empty"
+            raise InvalidTextError(msg)
+
+        loop = asyncio.get_running_loop()
+        result = await loop.run_in_executor(
+            self._executor,
+            _synthesize_in_subprocess,
+            text,
+            voice,
+            speed,
+            str(self._cache_dir),
+        )
+
+        return SynthesisResult(
+            audio=result["audio"],
+            sample_rate=result["sample_rate"],
+            sample_width=2,
+            channels=1,
+            duration=result["duration"],
+        )
+
+    @property
+    def supports_streaming(self) -> bool:
+        """Kokoro backend supports streaming synthesis."""
+        return True
+
+    async def synthesize_stream(
+        self,
+        text: str,
+        *,
+        voice: str | None = None,
+        speed: float = 1.0,
+    ) -> AsyncIterator[bytes]:
+        """Stream synthesized audio chunks as they are generated."""
+        if self._executor is None:
+            msg = "Model not loaded. Call load() first."
+            raise RuntimeError(msg)
+
+        if not text or not text.strip():
+            msg = "Text cannot be empty"
+            raise InvalidTextError(msg)
+
+        # Use Manager queue for cross-process communication
+        # Manager queues work with already-running subprocesses
+        manager = Manager()
+        try:
+            queue = manager.Queue(maxsize=10)  # Backpressure control
+            loop = asyncio.get_running_loop()
+
+            # Submit streaming worker to subprocess
+            # Manager queue is a proxy that works with already-running subprocesses
+            future = loop.run_in_executor(
+                self._executor,
+                _synthesize_stream_in_subprocess,
+                text,
+                voice,
+                speed,
+                str(self._cache_dir),
+                queue,  # type: ignore[arg-type]
+            )
+
+            # Yield chunks as they arrive
+            reader = AsyncQueueReader(queue, timeout=30.0)  # type: ignore[arg-type]
+            async for chunk in reader:
+                if chunk.chunk_type == "done":
+                    break
+                if chunk.chunk_type == "error":
+                    msg = str(chunk.payload)
+                    raise RuntimeError(msg)
+                if chunk.payload is not None:
+                    yield chunk.payload  # type: ignore[misc]
+
+            # Ensure subprocess completes
+            await future
+        finally:
+            manager.shutdown()
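For orientation, here is a minimal usage sketch of the KokoroBackend above. The keyword arguments passed to BackendConfig are an assumption (its definition lives in agent_cli/server/tts/backends/base.py and is not part of this diff); the rest follows the methods shown in the hunk: load() spawns a spawn-context worker process, synthesize() returns a complete WAV payload, and synthesize_stream() yields raw 16-bit PCM chunks.

```python
# Hypothetical usage sketch -- BackendConfig's constructor arguments are assumed here.
import asyncio
from pathlib import Path

from agent_cli.server.tts.backends.base import BackendConfig
from agent_cli.server.tts.backends.kokoro import KokoroBackend


async def main() -> None:
    # "auto" lets the worker process pick cuda/mps/cpu via get_torch_device().
    config = BackendConfig(model_name="kokoro-v1_0.pth", device="auto", cache_dir=None)
    backend = KokoroBackend(config)
    await backend.load()  # spawns the subprocess and loads the model there
    try:
        result = await backend.synthesize("Hello from Kokoro.", voice="af_heart", speed=1.0)
        Path("kokoro-example.wav").write_bytes(result.audio)  # result.audio is WAV bytes

        # Streaming path: chunks are raw int16 PCM (no WAV header).
        async for pcm_chunk in backend.synthesize_stream("Streaming example."):
            _ = pcm_chunk  # feed to an audio sink / HTTP response
    finally:
        await backend.unload()  # terminates the subprocess, freeing GPU/CPU memory


asyncio.run(main())
```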
agent_cli/server/tts/backends/piper.py
@@ -0,0 +1,253 @@
+"""Piper TTS backend using piper-tts library."""
+
+from __future__ import annotations
+
+import asyncio
+import io
+import logging
+import time
+import wave
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, NoReturn
+
+from agent_cli import constants
+from agent_cli.server.tts.backends.base import (
+    BackendConfig,
+    InvalidTextError,
+    SynthesisResult,
+    get_backend_cache_dir,
+)
+
+if TYPE_CHECKING:
+    from piper import PiperVoice
+
+logger = logging.getLogger(__name__)
+
+
+def _load_model_sync(
+    model_name: str,
+    cache_dir: str | None,
+) -> tuple[Any, int]:
+    """Load Piper model synchronously (for use in process pool).
+
+    Args:
+        model_name: Model name (e.g., 'en_US-lessac-medium') or path to .onnx file.
+        cache_dir: Optional cache directory for downloaded models.
+
+    Returns:
+        Tuple of (PiperVoice, sample_rate).
+
+    """
+    from piper import PiperVoice  # noqa: PLC0415
+    from piper.download_voices import download_voice  # noqa: PLC0415
+
+    # Use default cache dir if not specified
+    download_dir = Path(cache_dir) if cache_dir else get_backend_cache_dir("piper")
+    download_dir.mkdir(parents=True, exist_ok=True)
+
+    # Check if model_name is already a path to an existing file
+    model_path = Path(model_name)
+    if model_path.exists() and model_path.suffix == ".onnx":
+        # Direct path to model file
+        voice = PiperVoice.load(str(model_path), use_cuda=False)
+        return voice, voice.config.sample_rate
+
+    # Otherwise, treat as a voice name and download if needed
+    voice_code = model_name.strip()
+    expected_model_path = download_dir / f"{voice_code}.onnx"
+
+    if not expected_model_path.exists():
+        logger.info("Downloading Piper voice: %s", voice_code)
+        download_voice(voice_code, download_dir)
+
+    # Load the voice
+    voice = PiperVoice.load(str(expected_model_path), use_cuda=False)
+
+    return voice, voice.config.sample_rate
+
+
+def _synthesize_sync(
+    voice: PiperVoice,
+    text: str,
+    sample_rate: int,
+    length_scale: float,
+) -> tuple[bytes, float]:
+    """Synthesize text to audio synchronously.
+
+    Args:
+        voice: Loaded PiperVoice instance.
+        text: Text to synthesize.
+        sample_rate: Sample rate from model config.
+        length_scale: Length scale (inverse of speed).
+
+    Returns:
+        Tuple of (audio_bytes, duration_seconds).
+
+    """
+    from piper import SynthesisConfig  # noqa: PLC0415
+
+    # Create synthesis config with speed adjustment
+    syn_config = SynthesisConfig(length_scale=length_scale)
+
+    # Create WAV buffer
+    buffer = io.BytesIO()
+    with wave.open(buffer, "wb") as wav_file:
+        wav_file.setnchannels(1)
+        wav_file.setsampwidth(2)  # 16-bit
+        wav_file.setframerate(sample_rate)
+
+        # Synthesize and write audio chunks
+        for audio_chunk in voice.synthesize(text, syn_config):
+            wav_file.writeframes(audio_chunk.audio_int16_bytes)
+
+    audio_data = buffer.getvalue()
+
+    # Calculate duration: PCM data size / (sample_rate * channels * bytes_per_sample)
+    data_size = len(audio_data) - constants.WAV_HEADER_SIZE
+    duration = data_size / (sample_rate * 1 * 2)
+
+    return audio_data, duration
+
+
+class PiperBackend:
+    """Piper TTS backend using ONNX-based synthesis.
+
+    This backend uses the piper-tts library for fast, CPU-friendly TTS.
+    Models are downloaded from HuggingFace on first use.
+    """
+
+    def __init__(self, config: BackendConfig) -> None:
+        """Initialize the Piper backend.
+
+        Args:
+            config: Backend configuration.
+
+        """
+        self._config = config
+        self._voice: PiperVoice | None = None
+        self._sample_rate: int = constants.PIPER_DEFAULT_SAMPLE_RATE  # Updated on load
+        self._device: str | None = None
+
+    @property
+    def is_loaded(self) -> bool:
+        """Check if the model is currently loaded."""
+        return self._voice is not None
+
+    @property
+    def device(self) -> str | None:
+        """Get the device the model is loaded on, or None if not loaded."""
+        return self._device
+
+    async def load(self) -> float:
+        """Load the model into memory.
+
+        Returns:
+            Load duration in seconds.
+
+        """
+        if self._voice is not None:
+            return 0.0
+
+        start_time = time.time()
+
+        # Load synchronously since Piper is fast and CPU-only
+        loop = asyncio.get_running_loop()
+        voice, sample_rate = await loop.run_in_executor(
+            None,
+            _load_model_sync,
+            self._config.model_name,
+            str(self._config.cache_dir) if self._config.cache_dir else None,
+        )
+
+        self._voice = voice
+        self._sample_rate = sample_rate
+        self._device = "cpu"  # Piper is CPU-only
+
+        load_duration = time.time() - start_time
+        logger.info(
+            "Loaded Piper model %s in %.2fs (sample_rate=%d)",
+            self._config.model_name,
+            load_duration,
+            self._sample_rate,
+        )
+
+        return load_duration
+
+    async def unload(self) -> None:
+        """Unload the model and free memory."""
+        if self._voice is not None:
+            logger.info("Unloading Piper model %s", self._config.model_name)
+            self._voice = None
+            self._device = None
+
+    async def synthesize(
+        self,
+        text: str,
+        *,
+        voice: str | None = None,  # noqa: ARG002
+        speed: float = 1.0,
+    ) -> SynthesisResult:
+        """Synthesize text to audio.
+
+        Args:
+            text: Text to synthesize.
+            voice: Voice to use (not used for Piper - voice is the model).
+            speed: Speech speed multiplier (0.25 to 4.0).
+
+        Returns:
+            SynthesisResult with audio data and metadata.
+
+        Raises:
+            InvalidTextError: If the text is empty or invalid.
+            RuntimeError: If the model is not loaded.
+
+        """
+        if self._voice is None:
+            msg = "Model not loaded"
+            raise RuntimeError(msg)
+
+        if not text or not text.strip():
+            msg = "Text cannot be empty"
+            raise InvalidTextError(msg)
+
+        # Convert speed to length_scale (inverse relationship)
+        # Speed is already validated/clamped by the API layer
+        # length_scale < 1.0 = faster, > 1.0 = slower
+        length_scale = 1.0 / speed
+
+        # Run synthesis in executor to avoid blocking.
+        # Thread-safe: ONNX Runtime InferenceSession.run() is thread-safe since v1.10+,
+        # so concurrent requests can share the same PiperVoice instance safely.
+        loop = asyncio.get_running_loop()
+        audio_data, duration = await loop.run_in_executor(
+            None,
+            _synthesize_sync,
+            self._voice,
+            text,
+            self._sample_rate,
+            length_scale,
+        )
+
+        return SynthesisResult(
+            audio=audio_data,
+            sample_rate=self._sample_rate,
+            sample_width=2,  # 16-bit
+            channels=1,  # Mono
+            duration=duration,
+        )
+
+    @property
+    def supports_streaming(self) -> bool:
+        """Piper backend does not support streaming synthesis."""
+        return False
+
+    def synthesize_stream(
+        self,
+        text: str,
+        *,
+        voice: str | None = None,
+        speed: float = 1.0,
+    ) -> NoReturn:
+        """Streaming is not supported by Piper backend."""
+        msg = "Streaming synthesis is not supported by Piper backend"
+        raise NotImplementedError(msg)