agent-cli 0.70.5 (agent_cli-0.70.5-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_cli/__init__.py +5 -0
- agent_cli/__main__.py +6 -0
- agent_cli/_extras.json +14 -0
- agent_cli/_requirements/.gitkeep +0 -0
- agent_cli/_requirements/audio.txt +79 -0
- agent_cli/_requirements/faster-whisper.txt +215 -0
- agent_cli/_requirements/kokoro.txt +425 -0
- agent_cli/_requirements/llm.txt +183 -0
- agent_cli/_requirements/memory.txt +355 -0
- agent_cli/_requirements/mlx-whisper.txt +222 -0
- agent_cli/_requirements/piper.txt +176 -0
- agent_cli/_requirements/rag.txt +402 -0
- agent_cli/_requirements/server.txt +154 -0
- agent_cli/_requirements/speed.txt +77 -0
- agent_cli/_requirements/vad.txt +155 -0
- agent_cli/_requirements/wyoming.txt +71 -0
- agent_cli/_tools.py +368 -0
- agent_cli/agents/__init__.py +23 -0
- agent_cli/agents/_voice_agent_common.py +136 -0
- agent_cli/agents/assistant.py +383 -0
- agent_cli/agents/autocorrect.py +284 -0
- agent_cli/agents/chat.py +496 -0
- agent_cli/agents/memory/__init__.py +31 -0
- agent_cli/agents/memory/add.py +190 -0
- agent_cli/agents/memory/proxy.py +160 -0
- agent_cli/agents/rag_proxy.py +128 -0
- agent_cli/agents/speak.py +209 -0
- agent_cli/agents/transcribe.py +671 -0
- agent_cli/agents/transcribe_daemon.py +499 -0
- agent_cli/agents/voice_edit.py +291 -0
- agent_cli/api.py +22 -0
- agent_cli/cli.py +106 -0
- agent_cli/config.py +503 -0
- agent_cli/config_cmd.py +307 -0
- agent_cli/constants.py +27 -0
- agent_cli/core/__init__.py +1 -0
- agent_cli/core/audio.py +461 -0
- agent_cli/core/audio_format.py +299 -0
- agent_cli/core/chroma.py +88 -0
- agent_cli/core/deps.py +191 -0
- agent_cli/core/openai_proxy.py +139 -0
- agent_cli/core/process.py +195 -0
- agent_cli/core/reranker.py +120 -0
- agent_cli/core/sse.py +87 -0
- agent_cli/core/transcription_logger.py +70 -0
- agent_cli/core/utils.py +526 -0
- agent_cli/core/vad.py +175 -0
- agent_cli/core/watch.py +65 -0
- agent_cli/dev/__init__.py +14 -0
- agent_cli/dev/cli.py +1588 -0
- agent_cli/dev/coding_agents/__init__.py +19 -0
- agent_cli/dev/coding_agents/aider.py +24 -0
- agent_cli/dev/coding_agents/base.py +167 -0
- agent_cli/dev/coding_agents/claude.py +39 -0
- agent_cli/dev/coding_agents/codex.py +24 -0
- agent_cli/dev/coding_agents/continue_dev.py +15 -0
- agent_cli/dev/coding_agents/copilot.py +24 -0
- agent_cli/dev/coding_agents/cursor_agent.py +48 -0
- agent_cli/dev/coding_agents/gemini.py +28 -0
- agent_cli/dev/coding_agents/opencode.py +15 -0
- agent_cli/dev/coding_agents/registry.py +49 -0
- agent_cli/dev/editors/__init__.py +19 -0
- agent_cli/dev/editors/base.py +89 -0
- agent_cli/dev/editors/cursor.py +15 -0
- agent_cli/dev/editors/emacs.py +46 -0
- agent_cli/dev/editors/jetbrains.py +56 -0
- agent_cli/dev/editors/nano.py +31 -0
- agent_cli/dev/editors/neovim.py +33 -0
- agent_cli/dev/editors/registry.py +59 -0
- agent_cli/dev/editors/sublime.py +20 -0
- agent_cli/dev/editors/vim.py +42 -0
- agent_cli/dev/editors/vscode.py +15 -0
- agent_cli/dev/editors/zed.py +20 -0
- agent_cli/dev/project.py +568 -0
- agent_cli/dev/registry.py +52 -0
- agent_cli/dev/skill/SKILL.md +141 -0
- agent_cli/dev/skill/examples.md +571 -0
- agent_cli/dev/terminals/__init__.py +19 -0
- agent_cli/dev/terminals/apple_terminal.py +82 -0
- agent_cli/dev/terminals/base.py +56 -0
- agent_cli/dev/terminals/gnome.py +51 -0
- agent_cli/dev/terminals/iterm2.py +84 -0
- agent_cli/dev/terminals/kitty.py +77 -0
- agent_cli/dev/terminals/registry.py +48 -0
- agent_cli/dev/terminals/tmux.py +58 -0
- agent_cli/dev/terminals/warp.py +132 -0
- agent_cli/dev/terminals/zellij.py +78 -0
- agent_cli/dev/worktree.py +856 -0
- agent_cli/docs_gen.py +417 -0
- agent_cli/example-config.toml +185 -0
- agent_cli/install/__init__.py +5 -0
- agent_cli/install/common.py +89 -0
- agent_cli/install/extras.py +174 -0
- agent_cli/install/hotkeys.py +48 -0
- agent_cli/install/services.py +87 -0
- agent_cli/memory/__init__.py +7 -0
- agent_cli/memory/_files.py +250 -0
- agent_cli/memory/_filters.py +63 -0
- agent_cli/memory/_git.py +157 -0
- agent_cli/memory/_indexer.py +142 -0
- agent_cli/memory/_ingest.py +408 -0
- agent_cli/memory/_persistence.py +182 -0
- agent_cli/memory/_prompt.py +91 -0
- agent_cli/memory/_retrieval.py +294 -0
- agent_cli/memory/_store.py +169 -0
- agent_cli/memory/_streaming.py +44 -0
- agent_cli/memory/_tasks.py +48 -0
- agent_cli/memory/api.py +113 -0
- agent_cli/memory/client.py +272 -0
- agent_cli/memory/engine.py +361 -0
- agent_cli/memory/entities.py +43 -0
- agent_cli/memory/models.py +112 -0
- agent_cli/opts.py +433 -0
- agent_cli/py.typed +0 -0
- agent_cli/rag/__init__.py +3 -0
- agent_cli/rag/_indexer.py +67 -0
- agent_cli/rag/_indexing.py +226 -0
- agent_cli/rag/_prompt.py +30 -0
- agent_cli/rag/_retriever.py +156 -0
- agent_cli/rag/_store.py +48 -0
- agent_cli/rag/_utils.py +218 -0
- agent_cli/rag/api.py +175 -0
- agent_cli/rag/client.py +299 -0
- agent_cli/rag/engine.py +302 -0
- agent_cli/rag/models.py +55 -0
- agent_cli/scripts/.runtime/.gitkeep +0 -0
- agent_cli/scripts/__init__.py +1 -0
- agent_cli/scripts/check_plugin_skill_sync.py +50 -0
- agent_cli/scripts/linux-hotkeys/README.md +63 -0
- agent_cli/scripts/linux-hotkeys/toggle-autocorrect.sh +45 -0
- agent_cli/scripts/linux-hotkeys/toggle-transcription.sh +58 -0
- agent_cli/scripts/linux-hotkeys/toggle-voice-edit.sh +58 -0
- agent_cli/scripts/macos-hotkeys/README.md +45 -0
- agent_cli/scripts/macos-hotkeys/skhd-config-example +5 -0
- agent_cli/scripts/macos-hotkeys/toggle-autocorrect.sh +12 -0
- agent_cli/scripts/macos-hotkeys/toggle-transcription.sh +37 -0
- agent_cli/scripts/macos-hotkeys/toggle-voice-edit.sh +37 -0
- agent_cli/scripts/nvidia-asr-server/README.md +99 -0
- agent_cli/scripts/nvidia-asr-server/pyproject.toml +27 -0
- agent_cli/scripts/nvidia-asr-server/server.py +255 -0
- agent_cli/scripts/nvidia-asr-server/shell.nix +32 -0
- agent_cli/scripts/nvidia-asr-server/uv.lock +4654 -0
- agent_cli/scripts/run-openwakeword.sh +11 -0
- agent_cli/scripts/run-piper-windows.ps1 +30 -0
- agent_cli/scripts/run-piper.sh +24 -0
- agent_cli/scripts/run-whisper-linux.sh +40 -0
- agent_cli/scripts/run-whisper-macos.sh +6 -0
- agent_cli/scripts/run-whisper-windows.ps1 +51 -0
- agent_cli/scripts/run-whisper.sh +9 -0
- agent_cli/scripts/run_faster_whisper_server.py +136 -0
- agent_cli/scripts/setup-linux-hotkeys.sh +72 -0
- agent_cli/scripts/setup-linux.sh +108 -0
- agent_cli/scripts/setup-macos-hotkeys.sh +61 -0
- agent_cli/scripts/setup-macos.sh +76 -0
- agent_cli/scripts/setup-windows.ps1 +63 -0
- agent_cli/scripts/start-all-services-windows.ps1 +53 -0
- agent_cli/scripts/start-all-services.sh +178 -0
- agent_cli/scripts/sync_extras.py +138 -0
- agent_cli/server/__init__.py +3 -0
- agent_cli/server/cli.py +721 -0
- agent_cli/server/common.py +222 -0
- agent_cli/server/model_manager.py +288 -0
- agent_cli/server/model_registry.py +225 -0
- agent_cli/server/proxy/__init__.py +3 -0
- agent_cli/server/proxy/api.py +444 -0
- agent_cli/server/streaming.py +67 -0
- agent_cli/server/tts/__init__.py +3 -0
- agent_cli/server/tts/api.py +335 -0
- agent_cli/server/tts/backends/__init__.py +82 -0
- agent_cli/server/tts/backends/base.py +139 -0
- agent_cli/server/tts/backends/kokoro.py +403 -0
- agent_cli/server/tts/backends/piper.py +253 -0
- agent_cli/server/tts/model_manager.py +201 -0
- agent_cli/server/tts/model_registry.py +28 -0
- agent_cli/server/tts/wyoming_handler.py +249 -0
- agent_cli/server/whisper/__init__.py +3 -0
- agent_cli/server/whisper/api.py +413 -0
- agent_cli/server/whisper/backends/__init__.py +89 -0
- agent_cli/server/whisper/backends/base.py +97 -0
- agent_cli/server/whisper/backends/faster_whisper.py +225 -0
- agent_cli/server/whisper/backends/mlx.py +270 -0
- agent_cli/server/whisper/languages.py +116 -0
- agent_cli/server/whisper/model_manager.py +157 -0
- agent_cli/server/whisper/model_registry.py +28 -0
- agent_cli/server/whisper/wyoming_handler.py +203 -0
- agent_cli/services/__init__.py +343 -0
- agent_cli/services/_wyoming_utils.py +64 -0
- agent_cli/services/asr.py +506 -0
- agent_cli/services/llm.py +228 -0
- agent_cli/services/tts.py +450 -0
- agent_cli/services/wake_word.py +142 -0
- agent_cli-0.70.5.dist-info/METADATA +2118 -0
- agent_cli-0.70.5.dist-info/RECORD +196 -0
- agent_cli-0.70.5.dist-info/WHEEL +4 -0
- agent_cli-0.70.5.dist-info/entry_points.txt +4 -0
- agent_cli-0.70.5.dist-info/licenses/LICENSE +21 -0

agent_cli/server/whisper/api.py
@@ -0,0 +1,413 @@
+"""FastAPI application for Whisper ASR server with OpenAI-compatible API."""
+
+from __future__ import annotations
+
+import contextlib
+import io
+import logging
+import wave
+from typing import TYPE_CHECKING, Annotated, Any, Literal
+
+from fastapi import FastAPI, File, Form, HTTPException, Query, UploadFile, WebSocket
+from fastapi.responses import PlainTextResponse
+from pydantic import BaseModel
+
+from agent_cli.server.common import configure_app, create_lifespan, setup_wav_file
+from agent_cli.server.whisper.backends.base import InvalidAudioError
+
+if TYPE_CHECKING:
+    from agent_cli.server.whisper.model_registry import WhisperModelRegistry
+
+logger = logging.getLogger(__name__)
+
+
+def _split_seconds(seconds: float) -> tuple[int, int, int, int]:
+    """Split seconds into (hours, minutes, seconds, milliseconds)."""
+    hours = int(seconds // 3600)
+    minutes = int((seconds % 3600) // 60)
+    secs = int(seconds % 60)
+    millis = int((seconds % 1) * 1000)
+    return hours, minutes, secs, millis
+
+
+def _format_timestamp(seconds: float, *, always_include_hours: bool = False) -> str:
+    """Format seconds as HH:MM:SS,mmm for SRT format."""
+    hours, minutes, secs, millis = _split_seconds(seconds)
+    if always_include_hours or hours > 0:
+        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
+    return f"{minutes:02d}:{secs:02d},{millis:03d}"
+
+
+def _format_vtt_timestamp(seconds: float) -> str:
+    """Format seconds as HH:MM:SS.mmm for VTT format."""
+    hours, minutes, secs, millis = _split_seconds(seconds)
+    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"
+
+
+def _format_srt(segments: list[dict[str, Any]]) -> str:
+    """Format segments as SRT subtitles."""
+    lines = []
+    for i, seg in enumerate(segments, 1):
+        start = _format_timestamp(seg["start"], always_include_hours=True)
+        end = _format_timestamp(seg["end"], always_include_hours=True)
+        text = seg["text"].strip()
+        lines.append(f"{i}\n{start} --> {end}\n{text}\n")
+    return "\n".join(lines)
+
+
+def _format_vtt(segments: list[dict[str, Any]]) -> str:
+    """Format segments as WebVTT subtitles."""
+    lines = ["WEBVTT", ""]
+    for seg in segments:
+        start = _format_vtt_timestamp(seg["start"])
+        end = _format_vtt_timestamp(seg["end"])
+        text = seg["text"].strip()
+        lines.append(f"{start} --> {end}\n{text}\n")
+    return "\n".join(lines)
+
+
+# --- Pydantic Models ---
+
+
+class TranscriptionResponse(BaseModel):
+    """OpenAI-compatible transcription response."""
+
+    text: str
+
+
+class VerboseTranscriptionResponse(BaseModel):
+    """OpenAI-compatible verbose transcription response."""
+
+    task: Literal["transcribe", "translate"]
+    language: str
+    duration: float
+    text: str
+    segments: list[dict[str, Any]]
+
+
+class ModelStatusResponse(BaseModel):
+    """Status of a single model."""
+
+    name: str
+    loaded: bool
+    device: str | None
+    ttl_seconds: int
+    ttl_remaining: float | None
+    active_requests: int
+    # Stats
+    load_count: int
+    unload_count: int
+    total_requests: int
+    total_audio_seconds: float
+    total_transcription_seconds: float
+    last_load_time: float | None
+    last_request_time: float | None
+    load_duration_seconds: float | None
+
+
+class HealthResponse(BaseModel):
+    """Health check response."""
+
+    status: str
+    models: list[ModelStatusResponse]
+
+
+class UnloadResponse(BaseModel):
+    """Response from model unload request."""
+
+    status: str
+    model: str
+    was_loaded: bool
+
+
+# --- App Factory ---
+
+
+def create_app(  # noqa: C901, PLR0915
+    registry: WhisperModelRegistry,
+    *,
+    enable_wyoming: bool = True,
+    wyoming_uri: str = "tcp://0.0.0.0:10300",
+) -> FastAPI:
+    """Create the FastAPI application.
+
+    Args:
+        registry: The model registry to use.
+        enable_wyoming: Whether to start Wyoming server.
+        wyoming_uri: URI for Wyoming server.
+
+    Returns:
+        Configured FastAPI application.
+
+    """
+    lifespan = create_lifespan(
+        registry,
+        wyoming_handler_module="agent_cli.server.whisper.wyoming_handler",
+        enable_wyoming=enable_wyoming,
+        wyoming_uri=wyoming_uri,
+    )
+
+    app = FastAPI(
+        title="Whisper ASR Server",
+        description="OpenAI-compatible Whisper ASR server with TTL-based model unloading",
+        version="1.0.0",
+        lifespan=lifespan,
+    )
+
+    configure_app(app)
+
+    # --- Health & Status Endpoints ---
+
+    @app.get("/health", response_model=HealthResponse)
+    async def health_check() -> HealthResponse:
+        """Health check endpoint."""
+        models = [
+            ModelStatusResponse(
+                name=s.name,
+                loaded=s.loaded,
+                device=s.device,
+                ttl_seconds=s.ttl_seconds,
+                ttl_remaining=s.ttl_remaining,
+                active_requests=s.active_requests,
+                load_count=s.load_count,
+                unload_count=s.unload_count,
+                total_requests=s.total_requests,
+                total_audio_seconds=s.total_audio_seconds,
+                total_transcription_seconds=s.extra.get("total_transcription_seconds", 0.0),
+                last_load_time=s.last_load_time,
+                last_request_time=s.last_request_time,
+                load_duration_seconds=s.load_duration_seconds,
+            )
+            for s in registry.list_status()
+        ]
+        return HealthResponse(status="healthy", models=models)
+
+    @app.post("/v1/model/unload", response_model=UnloadResponse)
+    async def unload_model(
+        model: Annotated[str | None, Query(description="Model to unload")] = None,
+    ) -> UnloadResponse:
+        """Manually unload a model from memory."""
+        try:
+            manager = registry.get_manager(model)
+            was_loaded = await manager.unload()
+            return UnloadResponse(
+                status="success",
+                model=manager.config.model_name,
+                was_loaded=was_loaded,
+            )
+        except ValueError as e:
+            raise HTTPException(status_code=404, detail=str(e)) from e
+
+    # --- OpenAI-Compatible Transcription Endpoints ---
+
+    @app.post("/v1/audio/transcriptions", response_model=None)
+    async def transcribe_audio(
+        file: Annotated[UploadFile, File(description="Audio file to transcribe")],
+        model: Annotated[str, Form(description="Model to use")] = "whisper-1",
+        language: Annotated[str | None, Form(description="Language code")] = None,
+        prompt: Annotated[str | None, Form(description="Initial prompt")] = None,
+        response_format: Annotated[
+            Literal["json", "text", "srt", "verbose_json", "vtt"],
+            Form(description="Response format"),
+        ] = "json",
+        temperature: Annotated[float, Form(description="Sampling temperature")] = 0.0,
+    ) -> TranscriptionResponse | VerboseTranscriptionResponse | PlainTextResponse:
+        """OpenAI-compatible audio transcription endpoint."""
+        return await _do_transcription(
+            file=file,
+            model=model,
+            language=language,
+            prompt=prompt,
+            response_format=response_format,
+            temperature=temperature,
+            task="transcribe",
+        )
+
+    @app.post("/v1/audio/translations", response_model=None)
+    async def translate_audio(
+        file: Annotated[UploadFile, File(description="Audio file to translate")],
+        model: Annotated[str, Form(description="Model to use")] = "whisper-1",
+        prompt: Annotated[str | None, Form(description="Initial prompt")] = None,
+        response_format: Annotated[
+            Literal["json", "text", "srt", "verbose_json", "vtt"],
+            Form(description="Response format"),
+        ] = "json",
+        temperature: Annotated[float, Form(description="Sampling temperature")] = 0.0,
+    ) -> TranscriptionResponse | VerboseTranscriptionResponse | PlainTextResponse:
+        """OpenAI-compatible audio translation endpoint (always to English)."""
+        return await _do_transcription(
+            file=file,
+            model=model,
+            language=None,  # Translation always outputs English
+            prompt=prompt,
+            response_format=response_format,
+            temperature=temperature,
+            task="translate",
+        )
+
+    async def _do_transcription(
+        *,
+        file: UploadFile,
+        model: str,
+        language: str | None,
+        prompt: str | None,
+        response_format: Literal["json", "text", "srt", "verbose_json", "vtt"],
+        temperature: float,
+        task: Literal["transcribe", "translate"],
+    ) -> TranscriptionResponse | VerboseTranscriptionResponse | PlainTextResponse:
+        """Perform transcription with the specified parameters."""
+        # Resolve model name - "whisper-1" is OpenAI's model name, use default
+        model_name = None if model in ("whisper-1", "whisper-large-v3") else model
+
+        try:
+            manager = registry.get_manager(model_name)
+        except ValueError as e:
+            raise HTTPException(status_code=400, detail=str(e)) from e
+
+        # Read audio data
+        audio_data = await file.read()
+
+        if not audio_data:
+            raise HTTPException(status_code=400, detail="Empty audio file")
+
+        try:
+            result = await manager.transcribe(
+                audio_data,
+                source_filename=file.filename,
+                language=language,
+                task=task,
+                initial_prompt=prompt,
+                temperature=temperature,
+            )
+        except InvalidAudioError as e:
+            raise HTTPException(status_code=400, detail=str(e)) from e
+        except Exception as e:
+            logger.exception("Transcription failed")
+            raise HTTPException(status_code=500, detail=str(e)) from e
+
+        # Format response
+        if response_format == "text":
+            return PlainTextResponse(content=result.text)
+
+        if response_format == "srt":
+            srt_content = _format_srt(result.segments)
+            return PlainTextResponse(content=srt_content, media_type="text/plain")
+
+        if response_format == "vtt":
+            vtt_content = _format_vtt(result.segments)
+            return PlainTextResponse(content=vtt_content, media_type="text/vtt")
+
+        if response_format == "verbose_json":
+            return VerboseTranscriptionResponse(
+                task=task,
+                language=result.language,
+                duration=result.duration,
+                text=result.text,
+                segments=result.segments,
+            )
+
+        # Default is json format
+        return TranscriptionResponse(text=result.text)
+
+    # --- WebSocket Streaming Endpoint ---
+
+    @app.websocket("/v1/audio/transcriptions/stream")
+    async def stream_transcription(
+        websocket: WebSocket,
+        model: Annotated[str | None, Query(description="Model to use")] = None,
+        language: Annotated[str | None, Query(description="Language code")] = None,
+    ) -> None:
+        """WebSocket endpoint for streaming transcription.
+
+        Protocol:
+        - Client sends binary audio chunks (16kHz, 16-bit, mono PCM)
+        - Client sends b"EOS" to signal end of audio
+        - Server sends JSON messages with transcription results
+
+        Message format from server:
+        {"type": "partial", "text": "...", "is_final": false}
+        {"type": "final", "text": "...", "is_final": true, "segments": [...]}
+        {"type": "error", "message": "..."}
+        """
+        await websocket.accept()
+
+        try:
+            # Match OpenAI model aliases to the default model, like REST endpoints.
+            resolved_model = None if model in ("whisper-1", "whisper-large-v3") else model
+            manager = registry.get_manager(resolved_model)
+        except ValueError as e:
+            await websocket.send_json({"type": "error", "message": str(e)})
+            await websocket.close()
+            return
+
+        # Collect audio data
+        audio_buffer = io.BytesIO()
+        wav_file: wave.Wave_write | None = None
+
+        try:
+            while True:
+                data = await websocket.receive_bytes()
+
+                # Initialize WAV file on first chunk (before EOS check)
+                if wav_file is None:
+                    wav_file = wave.open(audio_buffer, "wb")  # noqa: SIM115
+                    setup_wav_file(wav_file)
+
+                # Check for end of stream (EOS marker)
+                eos_marker = b"EOS"
+                eos_len = len(eos_marker)
+                if data == eos_marker:
+                    break
+                if data[-eos_len:] == eos_marker:
+                    # Write remaining data before EOS marker
+                    if len(data) > eos_len:
+                        wav_file.writeframes(data[:-eos_len])
+                    break
+
+                wav_file.writeframes(data)
+
+            # Close WAV file
+            if wav_file is not None:
+                wav_file.close()
+
+            # Get audio data
+            audio_buffer.seek(0)
+            audio_data = audio_buffer.read()
+
+            if not audio_data:
+                await websocket.send_json({"type": "error", "message": "No audio received"})
+                await websocket.close()
+                return
+
+            # Transcribe
+            try:
+                result = await manager.transcribe(
+                    audio_data,
+                    language=language,
+                    task="transcribe",
+                )
+
+                await websocket.send_json(
+                    {
+                        "type": "final",
+                        "text": result.text,
+                        "is_final": True,
+                        "language": result.language,
+                        "duration": result.duration,
+                        "segments": result.segments,
+                    },
+                )
+
+            except Exception as e:
+                await websocket.send_json({"type": "error", "message": str(e)})
+
+        except Exception as e:
+            logger.exception("WebSocket error")
+            with contextlib.suppress(Exception):
+                await websocket.send_json({"type": "error", "message": str(e)})
+
+        finally:
+            with contextlib.suppress(Exception):
+                await websocket.close()
+
+    return app
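
Because the REST endpoints above mirror OpenAI's audio API, the stock `openai` Python client can be pointed at this server directly. A minimal client sketch, assuming the server listens on `localhost:8000` and that no API key is enforced; the host, port, and file name are illustrative, not taken from this diff:

```python
# Hypothetical client for the OpenAI-compatible REST endpoint above.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")

with open("speech.wav", "rb") as audio:  # illustrative file name
    transcript = client.audio.transcriptions.create(
        model="whisper-1",  # alias; the server resolves it to its default model
        file=audio,
        response_format="text",  # "json", "srt", "verbose_json", "vtt" also accepted
    )
print(transcript)
```

The WebSocket endpoint instead expects raw 16 kHz, 16-bit, mono PCM frames followed by the `b"EOS"` marker, per the docstring. A matching sketch using the third-party `websockets` package (URL and chunk source again assumed):

```python
# Hypothetical streaming client for /v1/audio/transcriptions/stream.
import json

import websockets


async def stream_pcm(chunks: list[bytes]) -> str:
    uri = "ws://localhost:8000/v1/audio/transcriptions/stream"
    async with websockets.connect(uri) as ws:
        for chunk in chunks:
            await ws.send(chunk)  # binary PCM frames
        await ws.send(b"EOS")  # end-of-stream marker defined by the protocol
        reply = json.loads(await ws.recv())
        if reply["type"] == "error":
            raise RuntimeError(reply["message"])
        return reply["text"]  # the "final" message carries the transcript
```
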

agent_cli/server/whisper/backends/__init__.py
@@ -0,0 +1,89 @@
+"""Whisper backend factory with platform auto-detection."""
+
+from __future__ import annotations
+
+import logging
+import platform
+import sys
+from typing import TYPE_CHECKING, Literal
+
+if TYPE_CHECKING:
+    from agent_cli.server.whisper.backends.base import WhisperBackend
+
+from agent_cli.server.whisper.backends.base import (
+    BackendConfig,
+    TranscriptionResult,
+)
+
+logger = logging.getLogger(__name__)
+
+BackendType = Literal["faster-whisper", "mlx", "auto"]
+
+
+def detect_backend() -> Literal["faster-whisper", "mlx"]:
+    """Detect the best backend for the current platform.
+
+    Returns:
+        "mlx" on macOS ARM with mlx-whisper installed,
+        "faster-whisper" otherwise.
+
+    """
+    # Check for macOS ARM (Apple Silicon)
+    if sys.platform == "darwin" and platform.machine() == "arm64":
+        try:
+            import mlx_whisper  # noqa: F401, PLC0415
+
+            logger.debug("Detected macOS ARM with mlx-whisper available")
+            return "mlx"
+        except ImportError:
+            logger.debug("macOS ARM detected but mlx-whisper not installed")
+
+    return "faster-whisper"
+
+
+def create_backend(
+    config: BackendConfig,
+    backend_type: BackendType = "auto",
+) -> WhisperBackend:
+    """Create a Whisper backend instance.
+
+    Args:
+        config: Backend configuration.
+        backend_type: Backend to use, or "auto" for platform detection.
+
+    Returns:
+        Configured WhisperBackend instance.
+
+    Raises:
+        ImportError: If the required backend package is not installed.
+        ValueError: If an unknown backend type is specified.
+
+    """
+    if backend_type == "auto":
+        backend_type = detect_backend()
+
+    logger.debug("Creating %s backend for model %s", backend_type, config.model_name)
+
+    if backend_type == "mlx":
+        from agent_cli.server.whisper.backends.mlx import MLXWhisperBackend  # noqa: PLC0415
+
+        return MLXWhisperBackend(config)
+
+    if backend_type == "faster-whisper":
+        from agent_cli.server.whisper.backends.faster_whisper import (  # noqa: PLC0415
+            FasterWhisperBackend,
+        )
+
+        return FasterWhisperBackend(config)
+
+    msg = f"Unknown backend type: {backend_type}"
+    raise ValueError(msg)
+
+
+__all__ = [
+    "BackendConfig",
+    "BackendType",
+    "TranscriptionResult",
+    "create_backend",
+    "detect_backend",
+]
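
In normal operation the model manager drives these backends, but the factory can also be exercised on its own. A minimal sketch of direct use, assuming a local WAV file and the `large-v3` model name (both illustrative):

```python
# Hypothetical direct use of the backend factory above.
import asyncio

from agent_cli.server.whisper.backends import BackendConfig, create_backend


async def main() -> None:
    # "auto" picks mlx on Apple Silicon (if mlx-whisper is installed),
    # otherwise faster-whisper.
    backend = create_backend(BackendConfig(model_name="large-v3"))
    load_seconds = await backend.load()
    print(f"loaded in {load_seconds:.1f}s on {backend.device}")
    try:
        with open("speech.wav", "rb") as f:  # illustrative file name
            result = await backend.transcribe(f.read())
        print(result.language, result.text)
    finally:
        await backend.unload()


asyncio.run(main())
```
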

agent_cli/server/whisper/backends/base.py
@@ -0,0 +1,97 @@
+"""Base types and protocol for Whisper backends."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any, Literal, Protocol, runtime_checkable
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+@dataclass
+class TranscriptionResult:
+    """Result of a transcription."""
+
+    text: str
+    language: str
+    language_probability: float
+    duration: float
+    segments: list[dict[str, Any]] = field(default_factory=list)
+
+
+@dataclass
+class BackendConfig:
+    """Configuration for a Whisper backend."""
+
+    model_name: str
+    device: str = "auto"
+    compute_type: str = "auto"
+    cpu_threads: int = 4
+    cache_dir: Path | None = None
+
+
+class InvalidAudioError(ValueError):
+    """Raised when the input audio is invalid or unsupported."""
+
+
+@runtime_checkable
+class WhisperBackend(Protocol):
+    """Protocol for Whisper transcription backends.
+
+    Backends handle model loading, unloading, and transcription.
+    The ModelManager handles TTL, stats, and lifecycle.
+    """
+
+    @property
+    def is_loaded(self) -> bool:
+        """Check if the model is currently loaded."""
+        ...
+
+    @property
+    def device(self) -> str | None:
+        """Get the device the model is loaded on, or None if not loaded."""
+        ...
+
+    async def load(self) -> float:
+        """Load the model into memory.
+
+        Returns:
+            Load duration in seconds.
+
+        """
+        ...
+
+    async def unload(self) -> None:
+        """Unload the model and free memory."""
+        ...
+
+    async def transcribe(
+        self,
+        audio: bytes,
+        *,
+        source_filename: str | None = None,
+        language: str | None = None,
+        task: Literal["transcribe", "translate"] = "transcribe",
+        initial_prompt: str | None = None,
+        temperature: float = 0.0,
+        vad_filter: bool = True,
+        word_timestamps: bool = False,
+    ) -> TranscriptionResult:
+        """Transcribe audio data.
+
+        Args:
+            audio: Audio data as bytes (WAV format, 16kHz, 16-bit, mono)
+            source_filename: Optional filename to help detect audio format.
+            language: Language code or None for auto-detection
+            task: "transcribe" or "translate" (to English)
+            initial_prompt: Optional prompt to guide transcription
+            temperature: Sampling temperature
+            vad_filter: Whether to use VAD filtering
+            word_timestamps: Whether to include word-level timestamps
+
+        Returns:
+            TranscriptionResult with text and metadata.
+
+        """
+        ...
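
Because `WhisperBackend` is a `runtime_checkable` `Protocol`, any class with matching members satisfies it structurally; no inheritance is required. A toy sketch (the `EchoBackend` below is hypothetical and not part of the package):

```python
# Hypothetical backend that "transcribes" by echoing metadata; it exists only
# to show structural conformance with the WhisperBackend protocol.
from __future__ import annotations

import time
from typing import Literal

from agent_cli.server.whisper.backends.base import (
    BackendConfig,
    TranscriptionResult,
    WhisperBackend,
)


class EchoBackend:
    def __init__(self, config: BackendConfig) -> None:
        self._config = config
        self._device: str | None = None

    @property
    def is_loaded(self) -> bool:
        return self._device is not None

    @property
    def device(self) -> str | None:
        return self._device

    async def load(self) -> float:
        start = time.perf_counter()
        self._device = "cpu"  # nothing real to load
        return time.perf_counter() - start

    async def unload(self) -> None:
        self._device = None

    async def transcribe(
        self,
        audio: bytes,
        *,
        source_filename: str | None = None,
        language: str | None = None,
        task: Literal["transcribe", "translate"] = "transcribe",
        initial_prompt: str | None = None,
        temperature: float = 0.0,
        vad_filter: bool = True,
        word_timestamps: bool = False,
    ) -> TranscriptionResult:
        return TranscriptionResult(
            text=f"<{len(audio)} bytes of {task} audio>",
            language=language or "en",
            language_probability=1.0,
            duration=0.0,
        )


# Structural check: isinstance works because the protocol is runtime_checkable.
assert isinstance(EchoBackend(BackendConfig(model_name="echo")), WhisperBackend)
```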