agent_cli-0.70.5-py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- agent_cli/__init__.py +5 -0
- agent_cli/__main__.py +6 -0
- agent_cli/_extras.json +14 -0
- agent_cli/_requirements/.gitkeep +0 -0
- agent_cli/_requirements/audio.txt +79 -0
- agent_cli/_requirements/faster-whisper.txt +215 -0
- agent_cli/_requirements/kokoro.txt +425 -0
- agent_cli/_requirements/llm.txt +183 -0
- agent_cli/_requirements/memory.txt +355 -0
- agent_cli/_requirements/mlx-whisper.txt +222 -0
- agent_cli/_requirements/piper.txt +176 -0
- agent_cli/_requirements/rag.txt +402 -0
- agent_cli/_requirements/server.txt +154 -0
- agent_cli/_requirements/speed.txt +77 -0
- agent_cli/_requirements/vad.txt +155 -0
- agent_cli/_requirements/wyoming.txt +71 -0
- agent_cli/_tools.py +368 -0
- agent_cli/agents/__init__.py +23 -0
- agent_cli/agents/_voice_agent_common.py +136 -0
- agent_cli/agents/assistant.py +383 -0
- agent_cli/agents/autocorrect.py +284 -0
- agent_cli/agents/chat.py +496 -0
- agent_cli/agents/memory/__init__.py +31 -0
- agent_cli/agents/memory/add.py +190 -0
- agent_cli/agents/memory/proxy.py +160 -0
- agent_cli/agents/rag_proxy.py +128 -0
- agent_cli/agents/speak.py +209 -0
- agent_cli/agents/transcribe.py +671 -0
- agent_cli/agents/transcribe_daemon.py +499 -0
- agent_cli/agents/voice_edit.py +291 -0
- agent_cli/api.py +22 -0
- agent_cli/cli.py +106 -0
- agent_cli/config.py +503 -0
- agent_cli/config_cmd.py +307 -0
- agent_cli/constants.py +27 -0
- agent_cli/core/__init__.py +1 -0
- agent_cli/core/audio.py +461 -0
- agent_cli/core/audio_format.py +299 -0
- agent_cli/core/chroma.py +88 -0
- agent_cli/core/deps.py +191 -0
- agent_cli/core/openai_proxy.py +139 -0
- agent_cli/core/process.py +195 -0
- agent_cli/core/reranker.py +120 -0
- agent_cli/core/sse.py +87 -0
- agent_cli/core/transcription_logger.py +70 -0
- agent_cli/core/utils.py +526 -0
- agent_cli/core/vad.py +175 -0
- agent_cli/core/watch.py +65 -0
- agent_cli/dev/__init__.py +14 -0
- agent_cli/dev/cli.py +1588 -0
- agent_cli/dev/coding_agents/__init__.py +19 -0
- agent_cli/dev/coding_agents/aider.py +24 -0
- agent_cli/dev/coding_agents/base.py +167 -0
- agent_cli/dev/coding_agents/claude.py +39 -0
- agent_cli/dev/coding_agents/codex.py +24 -0
- agent_cli/dev/coding_agents/continue_dev.py +15 -0
- agent_cli/dev/coding_agents/copilot.py +24 -0
- agent_cli/dev/coding_agents/cursor_agent.py +48 -0
- agent_cli/dev/coding_agents/gemini.py +28 -0
- agent_cli/dev/coding_agents/opencode.py +15 -0
- agent_cli/dev/coding_agents/registry.py +49 -0
- agent_cli/dev/editors/__init__.py +19 -0
- agent_cli/dev/editors/base.py +89 -0
- agent_cli/dev/editors/cursor.py +15 -0
- agent_cli/dev/editors/emacs.py +46 -0
- agent_cli/dev/editors/jetbrains.py +56 -0
- agent_cli/dev/editors/nano.py +31 -0
- agent_cli/dev/editors/neovim.py +33 -0
- agent_cli/dev/editors/registry.py +59 -0
- agent_cli/dev/editors/sublime.py +20 -0
- agent_cli/dev/editors/vim.py +42 -0
- agent_cli/dev/editors/vscode.py +15 -0
- agent_cli/dev/editors/zed.py +20 -0
- agent_cli/dev/project.py +568 -0
- agent_cli/dev/registry.py +52 -0
- agent_cli/dev/skill/SKILL.md +141 -0
- agent_cli/dev/skill/examples.md +571 -0
- agent_cli/dev/terminals/__init__.py +19 -0
- agent_cli/dev/terminals/apple_terminal.py +82 -0
- agent_cli/dev/terminals/base.py +56 -0
- agent_cli/dev/terminals/gnome.py +51 -0
- agent_cli/dev/terminals/iterm2.py +84 -0
- agent_cli/dev/terminals/kitty.py +77 -0
- agent_cli/dev/terminals/registry.py +48 -0
- agent_cli/dev/terminals/tmux.py +58 -0
- agent_cli/dev/terminals/warp.py +132 -0
- agent_cli/dev/terminals/zellij.py +78 -0
- agent_cli/dev/worktree.py +856 -0
- agent_cli/docs_gen.py +417 -0
- agent_cli/example-config.toml +185 -0
- agent_cli/install/__init__.py +5 -0
- agent_cli/install/common.py +89 -0
- agent_cli/install/extras.py +174 -0
- agent_cli/install/hotkeys.py +48 -0
- agent_cli/install/services.py +87 -0
- agent_cli/memory/__init__.py +7 -0
- agent_cli/memory/_files.py +250 -0
- agent_cli/memory/_filters.py +63 -0
- agent_cli/memory/_git.py +157 -0
- agent_cli/memory/_indexer.py +142 -0
- agent_cli/memory/_ingest.py +408 -0
- agent_cli/memory/_persistence.py +182 -0
- agent_cli/memory/_prompt.py +91 -0
- agent_cli/memory/_retrieval.py +294 -0
- agent_cli/memory/_store.py +169 -0
- agent_cli/memory/_streaming.py +44 -0
- agent_cli/memory/_tasks.py +48 -0
- agent_cli/memory/api.py +113 -0
- agent_cli/memory/client.py +272 -0
- agent_cli/memory/engine.py +361 -0
- agent_cli/memory/entities.py +43 -0
- agent_cli/memory/models.py +112 -0
- agent_cli/opts.py +433 -0
- agent_cli/py.typed +0 -0
- agent_cli/rag/__init__.py +3 -0
- agent_cli/rag/_indexer.py +67 -0
- agent_cli/rag/_indexing.py +226 -0
- agent_cli/rag/_prompt.py +30 -0
- agent_cli/rag/_retriever.py +156 -0
- agent_cli/rag/_store.py +48 -0
- agent_cli/rag/_utils.py +218 -0
- agent_cli/rag/api.py +175 -0
- agent_cli/rag/client.py +299 -0
- agent_cli/rag/engine.py +302 -0
- agent_cli/rag/models.py +55 -0
- agent_cli/scripts/.runtime/.gitkeep +0 -0
- agent_cli/scripts/__init__.py +1 -0
- agent_cli/scripts/check_plugin_skill_sync.py +50 -0
- agent_cli/scripts/linux-hotkeys/README.md +63 -0
- agent_cli/scripts/linux-hotkeys/toggle-autocorrect.sh +45 -0
- agent_cli/scripts/linux-hotkeys/toggle-transcription.sh +58 -0
- agent_cli/scripts/linux-hotkeys/toggle-voice-edit.sh +58 -0
- agent_cli/scripts/macos-hotkeys/README.md +45 -0
- agent_cli/scripts/macos-hotkeys/skhd-config-example +5 -0
- agent_cli/scripts/macos-hotkeys/toggle-autocorrect.sh +12 -0
- agent_cli/scripts/macos-hotkeys/toggle-transcription.sh +37 -0
- agent_cli/scripts/macos-hotkeys/toggle-voice-edit.sh +37 -0
- agent_cli/scripts/nvidia-asr-server/README.md +99 -0
- agent_cli/scripts/nvidia-asr-server/pyproject.toml +27 -0
- agent_cli/scripts/nvidia-asr-server/server.py +255 -0
- agent_cli/scripts/nvidia-asr-server/shell.nix +32 -0
- agent_cli/scripts/nvidia-asr-server/uv.lock +4654 -0
- agent_cli/scripts/run-openwakeword.sh +11 -0
- agent_cli/scripts/run-piper-windows.ps1 +30 -0
- agent_cli/scripts/run-piper.sh +24 -0
- agent_cli/scripts/run-whisper-linux.sh +40 -0
- agent_cli/scripts/run-whisper-macos.sh +6 -0
- agent_cli/scripts/run-whisper-windows.ps1 +51 -0
- agent_cli/scripts/run-whisper.sh +9 -0
- agent_cli/scripts/run_faster_whisper_server.py +136 -0
- agent_cli/scripts/setup-linux-hotkeys.sh +72 -0
- agent_cli/scripts/setup-linux.sh +108 -0
- agent_cli/scripts/setup-macos-hotkeys.sh +61 -0
- agent_cli/scripts/setup-macos.sh +76 -0
- agent_cli/scripts/setup-windows.ps1 +63 -0
- agent_cli/scripts/start-all-services-windows.ps1 +53 -0
- agent_cli/scripts/start-all-services.sh +178 -0
- agent_cli/scripts/sync_extras.py +138 -0
- agent_cli/server/__init__.py +3 -0
- agent_cli/server/cli.py +721 -0
- agent_cli/server/common.py +222 -0
- agent_cli/server/model_manager.py +288 -0
- agent_cli/server/model_registry.py +225 -0
- agent_cli/server/proxy/__init__.py +3 -0
- agent_cli/server/proxy/api.py +444 -0
- agent_cli/server/streaming.py +67 -0
- agent_cli/server/tts/__init__.py +3 -0
- agent_cli/server/tts/api.py +335 -0
- agent_cli/server/tts/backends/__init__.py +82 -0
- agent_cli/server/tts/backends/base.py +139 -0
- agent_cli/server/tts/backends/kokoro.py +403 -0
- agent_cli/server/tts/backends/piper.py +253 -0
- agent_cli/server/tts/model_manager.py +201 -0
- agent_cli/server/tts/model_registry.py +28 -0
- agent_cli/server/tts/wyoming_handler.py +249 -0
- agent_cli/server/whisper/__init__.py +3 -0
- agent_cli/server/whisper/api.py +413 -0
- agent_cli/server/whisper/backends/__init__.py +89 -0
- agent_cli/server/whisper/backends/base.py +97 -0
- agent_cli/server/whisper/backends/faster_whisper.py +225 -0
- agent_cli/server/whisper/backends/mlx.py +270 -0
- agent_cli/server/whisper/languages.py +116 -0
- agent_cli/server/whisper/model_manager.py +157 -0
- agent_cli/server/whisper/model_registry.py +28 -0
- agent_cli/server/whisper/wyoming_handler.py +203 -0
- agent_cli/services/__init__.py +343 -0
- agent_cli/services/_wyoming_utils.py +64 -0
- agent_cli/services/asr.py +506 -0
- agent_cli/services/llm.py +228 -0
- agent_cli/services/tts.py +450 -0
- agent_cli/services/wake_word.py +142 -0
- agent_cli-0.70.5.dist-info/METADATA +2118 -0
- agent_cli-0.70.5.dist-info/RECORD +196 -0
- agent_cli-0.70.5.dist-info/WHEEL +4 -0
- agent_cli-0.70.5.dist-info/entry_points.txt +4 -0
- agent_cli-0.70.5.dist-info/licenses/LICENSE +21 -0
agent_cli/server/streaming.py
@@ -0,0 +1,67 @@
+"""Core streaming types for subprocess-based audio generation."""
+
+from __future__ import annotations
+
+import asyncio
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Literal
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncIterator
+    from multiprocessing import Queue
+
+
+@dataclass
+class StreamChunk:
+    """A chunk of streamed data from a subprocess worker."""
+
+    chunk_type: Literal["data", "error", "done"]
+    payload: bytes | str | None = None
+    metadata: dict[str, Any] | None = None
+
+
+class AsyncQueueReader:
+    """Async iterator over multiprocessing.Queue with timeout handling."""
+
+    def __init__(self, queue: Queue, *, timeout: float = 30.0) -> None:
+        """Initialize reader with queue and timeout."""
+        self._queue = queue
+        self._timeout = timeout
+
+    def __aiter__(self) -> AsyncIterator[StreamChunk]:
+        """Return async iterator."""
+        return self
+
+    async def __anext__(self) -> StreamChunk:
+        """Get the next chunk from the queue."""
+        loop = asyncio.get_running_loop()
+        try:
+            chunk = await asyncio.wait_for(
+                loop.run_in_executor(None, self._queue.get),
+                timeout=self._timeout,
+            )
+        except TimeoutError as e:
+            msg = f"Queue read timeout after {self._timeout}s"
+            raise TimeoutError(msg) from e
+        return chunk
+
+
+class QueueWriter:
+    """Helper for subprocess to send chunks/errors/done sentinel."""
+
+    def __init__(self, queue: Queue) -> None:
+        """Initialize writer with queue."""
+        self._queue = queue
+
+    def send_data(self, data: bytes, metadata: dict[str, Any] | None = None) -> None:
+        """Send a data chunk."""
+        self._queue.put(StreamChunk("data", data, metadata))
+
+    def send_error(self, error: str | Exception) -> None:
+        """Send an error chunk."""
+        error_msg = str(error) if isinstance(error, Exception) else error
+        self._queue.put(StreamChunk("error", error_msg))
+
+    def send_done(self, metadata: dict[str, Any] | None = None) -> None:
+        """Send the done sentinel."""
+        self._queue.put(StreamChunk("done", metadata=metadata))
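The reader/writer pair above is generic plumbing: a worker process pushes `StreamChunk`s onto a `multiprocessing.Queue` via `QueueWriter`, and the parent drains them with `async for`, breaking on the `"done"` sentinel. Below is a minimal, hypothetical wiring sketch, not part of the package; it assumes the module is importable as `agent_cli.server.streaming` (the path the file list suggests), and the worker payloads are made up.

```python
# Hypothetical wiring sketch, not part of the package. A child process
# sends two data chunks and the sentinel; the parent consumes them.
import asyncio
import multiprocessing as mp

from agent_cli.server.streaming import AsyncQueueReader, QueueWriter  # assumed path


def worker(queue) -> None:
    """Runs in the child process: emit two chunks, then the done sentinel."""
    writer = QueueWriter(queue)
    writer.send_data(b"\x00\x01", metadata={"sample_rate": 22050})
    writer.send_data(b"\x02\x03")
    writer.send_done(metadata={"chunks": 2})


async def consume() -> None:
    queue = mp.Queue()
    proc = mp.Process(target=worker, args=(queue,))
    proc.start()
    # AsyncQueueReader never raises StopAsyncIteration itself, so the
    # consumer must break on the "done" sentinel (or raise on "error").
    async for chunk in AsyncQueueReader(queue, timeout=10.0):
        if chunk.chunk_type == "error":
            raise RuntimeError(chunk.payload)
        if chunk.chunk_type == "done":
            break
        print(f"received {len(chunk.payload)} bytes")
    proc.join()


if __name__ == "__main__":
    asyncio.run(consume())
```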
agent_cli/server/tts/api.py
@@ -0,0 +1,335 @@
+"""FastAPI application for TTS server with OpenAI-compatible API."""
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Annotated, Literal
+
+from fastapi import FastAPI, HTTPException, Query
+from fastapi.responses import StreamingResponse
+from pydantic import BaseModel
+
+from agent_cli import constants
+from agent_cli.core.audio_format import check_ffmpeg_available, convert_to_mp3
+from agent_cli.server.common import configure_app, create_lifespan
+from agent_cli.server.tts.backends.base import InvalidTextError
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncIterator
+
+    from agent_cli.server.tts.model_registry import TTSModelRegistry
+
+logger = logging.getLogger(__name__)
+
+
+def _format_audio_response(
+    audio: bytes,
+    response_format: str,
+    sample_rate: int,
+    sample_width: int,
+    channels: int,
+) -> StreamingResponse:
+    """Format audio data as a streaming response."""
+    if response_format == "wav":
+        return StreamingResponse(iter([audio]), media_type="audio/wav")
+
+    if response_format == "pcm":
+        pcm_data = (
+            audio[constants.WAV_HEADER_SIZE :] if len(audio) > constants.WAV_HEADER_SIZE else audio
+        )
+        return StreamingResponse(
+            iter([pcm_data]),
+            media_type="audio/pcm",
+            headers={
+                "X-Sample-Rate": str(sample_rate),
+                "X-Sample-Width": str(sample_width),
+                "X-Channels": str(channels),
+            },
+        )
+
+    if response_format == "mp3":
+        if not check_ffmpeg_available():
+            raise HTTPException(
+                status_code=422,
+                detail="MP3 format requires ffmpeg to be installed",
+            )
+        try:
+            mp3_data = convert_to_mp3(audio, input_format="wav")
+        except RuntimeError as e:
+            raise HTTPException(status_code=500, detail=str(e)) from e
+        return StreamingResponse(iter([mp3_data]), media_type="audio/mpeg")
+
+    # Unreachable due to early validation
+    msg = f"Unsupported response_format: {response_format}"
+    raise HTTPException(status_code=422, detail=msg)  # pragma: no cover
+
+
+# --- Pydantic Models ---
+
+
+class ModelStatusResponse(BaseModel):
+    """Status of a single model."""
+
+    name: str
+    loaded: bool
+    device: str | None
+    ttl_seconds: int
+    ttl_remaining: float | None
+    active_requests: int
+    # Stats
+    load_count: int
+    unload_count: int
+    total_requests: int
+    total_characters: int
+    total_audio_seconds: float
+    total_synthesis_seconds: float
+    last_load_time: float | None
+    last_request_time: float | None
+    load_duration_seconds: float | None
+
+
+class HealthResponse(BaseModel):
+    """Health check response."""
+
+    status: str
+    models: list[ModelStatusResponse]
+
+
+class UnloadResponse(BaseModel):
+    """Response from model unload request."""
+
+    status: str
+    model: str
+    was_loaded: bool
+
+
+class SpeechRequest(BaseModel):
+    """Request body for JSON speech synthesis endpoint."""
+
+    input: str
+    model: str = "tts-1"
+    voice: str = "alloy"
+    response_format: Literal["mp3", "wav", "pcm"] = "mp3"
+    speed: float = 1.0
+    stream_format: Literal["audio"] | None = None
+
+
+class VoiceInfo(BaseModel):
+    """Information about an available voice."""
+
+    voice_id: str
+    name: str
+    description: str
+    preview_url: str | None = None
+    labels: dict[str, str] | None = None
+
+
+class VoicesResponse(BaseModel):
+    """Response containing available voices."""
+
+    voices: list[VoiceInfo]
+
+
+# --- App Factory ---
+
+
+def create_app(
+    registry: TTSModelRegistry,
+    *,
+    enable_wyoming: bool = True,
+    wyoming_uri: str = "tcp://0.0.0.0:10200",
+) -> FastAPI:
+    """Create the FastAPI application.
+
+    Args:
+        registry: The model registry to use.
+        enable_wyoming: Whether to start Wyoming server.
+        wyoming_uri: URI for Wyoming server.
+
+    Returns:
+        Configured FastAPI application.
+
+    """
+    lifespan = create_lifespan(
+        registry,
+        wyoming_handler_module="agent_cli.server.tts.wyoming_handler",
+        enable_wyoming=enable_wyoming,
+        wyoming_uri=wyoming_uri,
+    )
+
+    app = FastAPI(
+        title="TTS Server",
+        description="OpenAI-compatible TTS server with TTL-based model unloading",
+        version="1.0.0",
+        lifespan=lifespan,
+    )
+
+    configure_app(app)
+
+    # --- Health & Status Endpoints ---
+
+    @app.get("/health", response_model=HealthResponse)
+    async def health_check() -> HealthResponse:
+        """Health check endpoint."""
+        models = [
+            ModelStatusResponse(
+                name=s.name,
+                loaded=s.loaded,
+                device=s.device,
+                ttl_seconds=s.ttl_seconds,
+                ttl_remaining=s.ttl_remaining,
+                active_requests=s.active_requests,
+                load_count=s.load_count,
+                unload_count=s.unload_count,
+                total_requests=s.total_requests,
+                total_characters=int(s.extra.get("total_characters", 0.0)),
+                total_audio_seconds=s.total_audio_seconds,
+                total_synthesis_seconds=s.extra.get("total_synthesis_seconds", 0.0),
+                last_load_time=s.last_load_time,
+                last_request_time=s.last_request_time,
+                load_duration_seconds=s.load_duration_seconds,
+            )
+            for s in registry.list_status()
+        ]
+        return HealthResponse(status="healthy", models=models)
+
+    @app.post("/v1/model/unload", response_model=UnloadResponse)
+    async def unload_model(
+        model: Annotated[str | None, Query(description="Model to unload")] = None,
+    ) -> UnloadResponse:
+        """Manually unload a model from memory."""
+        try:
+            manager = registry.get_manager(model)
+            was_loaded = await manager.unload()
+            return UnloadResponse(
+                status="success",
+                model=manager.config.model_name,
+                was_loaded=was_loaded,
+            )
+        except ValueError as e:
+            raise HTTPException(status_code=404, detail=str(e)) from e
+
+    @app.get("/v1/voices", response_model=VoicesResponse)
+    async def list_voices() -> VoicesResponse:
+        """List available voices (models).
+
+        For Piper TTS, each model IS a voice. This endpoint returns
+        the list of registered models as available voices.
+        """
+        voices = [
+            VoiceInfo(
+                voice_id=s.name,
+                name=s.name,
+                description=f"Piper TTS voice: {s.name}",
+                labels={"language": s.name.split("_")[0] if "_" in s.name else "en"},
+            )
+            for s in registry.list_status()
+        ]
+        return VoicesResponse(voices=voices)
+
+    # --- OpenAI-Compatible TTS Endpoint ---
+
+    async def _synthesize(
+        input_text: str,
+        model: str,
+        voice: str,
+        response_format: str,
+        speed: float,
+        stream_format: str | None,
+    ) -> StreamingResponse:
+        """Core synthesis logic shared by JSON and form endpoints."""
+        # Resolve model name - "tts-1" and "tts-1-hd" are OpenAI's model names
+        model_name = None if model in ("tts-1", "tts-1-hd") else model
+
+        try:
+            manager = registry.get_manager(model_name)
+        except ValueError as e:
+            raise HTTPException(status_code=400, detail=str(e)) from e
+
+        if not input_text.strip():
+            raise HTTPException(status_code=400, detail="Input text cannot be empty")
+
+        # Clamp speed to valid range
+        speed = max(0.25, min(4.0, speed))
+
+        # Handle streaming mode (OpenAI uses stream_format=audio with response_format=pcm)
+        if stream_format is not None:
+            if stream_format != "audio":
+                raise HTTPException(
+                    status_code=422,
+                    detail="Only 'audio' stream_format is supported",
+                )
+            if response_format != "pcm":
+                raise HTTPException(
+                    status_code=422,
+                    detail="Streaming requires response_format=pcm",
+                )
+            if not manager.supports_streaming:
+                raise HTTPException(
+                    status_code=422,
+                    detail="This model does not support streaming synthesis",
+                )
+
+            async def generate_audio() -> AsyncIterator[bytes]:
+                async for chunk in manager.synthesize_stream(
+                    input_text,
+                    voice=voice,
+                    speed=speed,
+                ):
+                    yield chunk
+
+            return StreamingResponse(
+                generate_audio(),
+                media_type="audio/pcm",
+                headers={
+                    "X-Sample-Rate": str(constants.KOKORO_DEFAULT_SAMPLE_RATE),
+                    "X-Sample-Width": "2",
+                    "X-Channels": "1",
+                },
+            )
+
+        # Non-streaming mode: validate format and synthesize complete audio
+        valid_formats = ("wav", "pcm", "mp3")
+        if response_format not in valid_formats:
+            raise HTTPException(
+                status_code=422,
+                detail=f"Unsupported response_format: {response_format}. Supported: {', '.join(valid_formats)}",
+            )
+
+        try:
+            result = await manager.synthesize(
+                input_text,
+                voice=voice,
+                speed=speed,
+            )
+        except InvalidTextError as e:
+            raise HTTPException(status_code=400, detail=str(e)) from e
+        except Exception as e:
+            logger.exception("Synthesis failed")
+            raise HTTPException(status_code=500, detail=str(e)) from e
+
+        return _format_audio_response(
+            result.audio,
+            response_format,
+            result.sample_rate,
+            result.sample_width,
+            result.channels,
+        )
+
+    @app.post("/v1/audio/speech")
+    async def synthesize_speech(request: SpeechRequest) -> StreamingResponse:
+        """OpenAI-compatible text-to-speech endpoint.
+
+        Accepts JSON body with input, model, voice, response_format, speed,
+        and optional stream_format parameters.
+        """
+        return await _synthesize(
+            input_text=request.input,
+            model=request.model,
+            voice=request.voice,
+            response_format=request.response_format,
+            speed=request.speed,
+            stream_format=request.stream_format,
+        )

    return app
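Because `/v1/audio/speech` mirrors OpenAI's JSON request shape, any HTTP client can drive it. A hypothetical stdlib-only sketch, not part of the package; the host and port are assumptions, since this diff does not fix where the server listens.

```python
# Hypothetical client sketch, not part of the package. Assumes the TTS
# server is reachable at http://localhost:8000 (a deployment choice).
import json
import urllib.request

payload = {
    "input": "Hello from the TTS server.",
    "model": "tts-1",          # OpenAI alias, resolved to the registry default
    "voice": "alloy",
    "response_format": "wav",  # "mp3" additionally requires ffmpeg server-side
    "speed": 1.0,
}
req = urllib.request.Request(
    "http://localhost:8000/v1/audio/speech",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp, open("speech.wav", "wb") as f:
    f.write(resp.read())
```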
agent_cli/server/tts/backends/__init__.py
@@ -0,0 +1,82 @@
+"""TTS backend factory with platform auto-detection."""
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Literal
+
+if TYPE_CHECKING:
+    from agent_cli.server.tts.backends.base import TTSBackend
+
+from agent_cli.server.tts.backends.base import (
+    BackendConfig,
+    SynthesisResult,
+    has_gpu,
+)
+
+logger = logging.getLogger(__name__)
+
+BackendType = Literal["piper", "kokoro", "auto"]
+
+
+def detect_backend() -> Literal["piper", "kokoro"]:
+    """Detect the best backend for the current platform.
+
+    Returns:
+        "kokoro" if GPU is available and kokoro is installed, otherwise "piper".
+
+    """
+    if has_gpu():
+        try:
+            import kokoro  # noqa: F401, PLC0415
+
+            return "kokoro"
+        except ImportError:
+            pass
+    return "piper"
+
+
+def create_backend(
+    config: BackendConfig,
+    backend_type: BackendType = "auto",
+) -> TTSBackend:
+    """Create a TTS backend instance.
+
+    Args:
+        config: Backend configuration.
+        backend_type: Backend to use, or "auto" for platform detection.
+
+    Returns:
+        Configured TTSBackend instance.
+
+    Raises:
+        ImportError: If the required backend package is not installed.
+        ValueError: If an unknown backend type is specified.
+
+    """
+    if backend_type == "auto":
+        backend_type = detect_backend()
+
+    logger.debug("Creating %s backend for model %s", backend_type, config.model_name)
+
+    if backend_type == "piper":
+        from agent_cli.server.tts.backends.piper import PiperBackend  # noqa: PLC0415
+
+        return PiperBackend(config)
+
+    if backend_type == "kokoro":
+        from agent_cli.server.tts.backends.kokoro import KokoroBackend  # noqa: PLC0415
+
+        return KokoroBackend(config)
+
+    msg = f"Unknown backend type: {backend_type}"
+    raise ValueError(msg)
+
+
+__all__ = [
+    "BackendConfig",
+    "BackendType",
+    "SynthesisResult",
+    "create_backend",
+    "detect_backend",
+]
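With the default `backend_type="auto"`, `create_backend` defers to `detect_backend()`, so the same call picks Kokoro on a GPU machine with the `kokoro` package installed and falls back to Piper everywhere else. A hypothetical usage sketch, not part of the package; the model name below is an assumption, so substitute one your Piper or Kokoro installation actually provides.

```python
# Hypothetical usage sketch, not part of the package. The model name is an
# assumption; create_backend raises ImportError if the chosen backend's
# package is missing.
import asyncio

from agent_cli.server.tts.backends import BackendConfig, create_backend


async def main() -> None:
    backend = create_backend(BackendConfig(model_name="en_US-lessac-medium"))
    load_seconds = await backend.load()
    result = await backend.synthesize("Testing, one two three.", speed=1.0)
    print(
        f"loaded in {load_seconds:.2f}s; "
        f"{result.duration:.2f}s of audio at {result.sample_rate} Hz"
    )
    await backend.unload()


asyncio.run(main())
```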
agent_cli/server/tts/backends/base.py
@@ -0,0 +1,139 @@
+"""Base types and protocol for TTS backends."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING, Protocol, runtime_checkable
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncIterator
+
+
+def get_torch_device() -> str:
+    """Detect the best available PyTorch device."""
+    try:
+        import torch  # noqa: PLC0415
+
+        if torch.cuda.is_available():
+            return "cuda"
+        if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            return "mps"
+    except ImportError:
+        pass
+    return "cpu"
+
+
+def has_gpu() -> bool:
+    """Check if a GPU (CUDA or MPS) is available."""
+    return get_torch_device() in ("cuda", "mps")
+
+
+def get_backend_cache_dir(backend_name: str) -> Path:
+    """Get default cache directory for a TTS backend."""
+    cache_dir = Path.home() / ".cache" / backend_name
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    return cache_dir
+
+
+@dataclass
+class SynthesisResult:
+    """Result of a synthesis operation."""
+
+    audio: bytes
+    sample_rate: int
+    sample_width: int
+    channels: int
+    duration: float
+
+
+@dataclass
+class BackendConfig:
+    """Configuration for a TTS backend."""
+
+    model_name: str
+    device: str = "auto"
+    cache_dir: Path | None = None
+
+
+class InvalidTextError(ValueError):
+    """Raised when the input text is invalid or unsupported."""
+
+
+@runtime_checkable
+class TTSBackend(Protocol):
+    """Protocol for TTS synthesis backends.
+
+    Backends handle model loading, unloading, and synthesis.
+    The ModelManager handles TTL, stats, and lifecycle.
+    """
+
+    @property
+    def is_loaded(self) -> bool:
+        """Check if the model is currently loaded."""
+        ...
+
+    @property
+    def device(self) -> str | None:
+        """Get the device the model is loaded on, or None if not loaded."""
+        ...
+
+    async def load(self) -> float:
+        """Load the model into memory.
+
+        Returns:
+            Load duration in seconds.
+
+        """
+        ...
+
+    async def unload(self) -> None:
+        """Unload the model and free memory."""
+        ...
+
+    async def synthesize(
+        self,
+        text: str,
+        *,
+        voice: str | None = None,
+        speed: float = 1.0,
+    ) -> SynthesisResult:
+        """Synthesize text to audio.
+
+        Args:
+            text: Text to synthesize.
+            voice: Voice to use (optional, uses model default if not specified).
+            speed: Speech speed multiplier (0.25 to 4.0).
+
+        Returns:
+            SynthesisResult with audio data and metadata.
+
+        """
+        ...
+
+    @property
+    def supports_streaming(self) -> bool:
+        """Check if backend supports streaming synthesis."""
+        return False
+
+    def synthesize_stream(
+        self,
+        text: str,
+        *,
+        voice: str | None = None,
+        speed: float = 1.0,
+    ) -> AsyncIterator[bytes]:
+        """Stream synthesized audio chunks as they are generated.
+
+        Implementations should be async generators (async def with yield).
+
+        Args:
+            text: Text to synthesize.
+            voice: Voice to use (optional).
+            speed: Speech speed multiplier (0.25 to 4.0).
+
+        Yields:
+            Raw PCM audio chunks (int16, mono).
+
+        """
+        ...