agent-cli 0.70.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. agent_cli/__init__.py +5 -0
  2. agent_cli/__main__.py +6 -0
  3. agent_cli/_extras.json +14 -0
  4. agent_cli/_requirements/.gitkeep +0 -0
  5. agent_cli/_requirements/audio.txt +79 -0
  6. agent_cli/_requirements/faster-whisper.txt +215 -0
  7. agent_cli/_requirements/kokoro.txt +425 -0
  8. agent_cli/_requirements/llm.txt +183 -0
  9. agent_cli/_requirements/memory.txt +355 -0
  10. agent_cli/_requirements/mlx-whisper.txt +222 -0
  11. agent_cli/_requirements/piper.txt +176 -0
  12. agent_cli/_requirements/rag.txt +402 -0
  13. agent_cli/_requirements/server.txt +154 -0
  14. agent_cli/_requirements/speed.txt +77 -0
  15. agent_cli/_requirements/vad.txt +155 -0
  16. agent_cli/_requirements/wyoming.txt +71 -0
  17. agent_cli/_tools.py +368 -0
  18. agent_cli/agents/__init__.py +23 -0
  19. agent_cli/agents/_voice_agent_common.py +136 -0
  20. agent_cli/agents/assistant.py +383 -0
  21. agent_cli/agents/autocorrect.py +284 -0
  22. agent_cli/agents/chat.py +496 -0
  23. agent_cli/agents/memory/__init__.py +31 -0
  24. agent_cli/agents/memory/add.py +190 -0
  25. agent_cli/agents/memory/proxy.py +160 -0
  26. agent_cli/agents/rag_proxy.py +128 -0
  27. agent_cli/agents/speak.py +209 -0
  28. agent_cli/agents/transcribe.py +671 -0
  29. agent_cli/agents/transcribe_daemon.py +499 -0
  30. agent_cli/agents/voice_edit.py +291 -0
  31. agent_cli/api.py +22 -0
  32. agent_cli/cli.py +106 -0
  33. agent_cli/config.py +503 -0
  34. agent_cli/config_cmd.py +307 -0
  35. agent_cli/constants.py +27 -0
  36. agent_cli/core/__init__.py +1 -0
  37. agent_cli/core/audio.py +461 -0
  38. agent_cli/core/audio_format.py +299 -0
  39. agent_cli/core/chroma.py +88 -0
  40. agent_cli/core/deps.py +191 -0
  41. agent_cli/core/openai_proxy.py +139 -0
  42. agent_cli/core/process.py +195 -0
  43. agent_cli/core/reranker.py +120 -0
  44. agent_cli/core/sse.py +87 -0
  45. agent_cli/core/transcription_logger.py +70 -0
  46. agent_cli/core/utils.py +526 -0
  47. agent_cli/core/vad.py +175 -0
  48. agent_cli/core/watch.py +65 -0
  49. agent_cli/dev/__init__.py +14 -0
  50. agent_cli/dev/cli.py +1588 -0
  51. agent_cli/dev/coding_agents/__init__.py +19 -0
  52. agent_cli/dev/coding_agents/aider.py +24 -0
  53. agent_cli/dev/coding_agents/base.py +167 -0
  54. agent_cli/dev/coding_agents/claude.py +39 -0
  55. agent_cli/dev/coding_agents/codex.py +24 -0
  56. agent_cli/dev/coding_agents/continue_dev.py +15 -0
  57. agent_cli/dev/coding_agents/copilot.py +24 -0
  58. agent_cli/dev/coding_agents/cursor_agent.py +48 -0
  59. agent_cli/dev/coding_agents/gemini.py +28 -0
  60. agent_cli/dev/coding_agents/opencode.py +15 -0
  61. agent_cli/dev/coding_agents/registry.py +49 -0
  62. agent_cli/dev/editors/__init__.py +19 -0
  63. agent_cli/dev/editors/base.py +89 -0
  64. agent_cli/dev/editors/cursor.py +15 -0
  65. agent_cli/dev/editors/emacs.py +46 -0
  66. agent_cli/dev/editors/jetbrains.py +56 -0
  67. agent_cli/dev/editors/nano.py +31 -0
  68. agent_cli/dev/editors/neovim.py +33 -0
  69. agent_cli/dev/editors/registry.py +59 -0
  70. agent_cli/dev/editors/sublime.py +20 -0
  71. agent_cli/dev/editors/vim.py +42 -0
  72. agent_cli/dev/editors/vscode.py +15 -0
  73. agent_cli/dev/editors/zed.py +20 -0
  74. agent_cli/dev/project.py +568 -0
  75. agent_cli/dev/registry.py +52 -0
  76. agent_cli/dev/skill/SKILL.md +141 -0
  77. agent_cli/dev/skill/examples.md +571 -0
  78. agent_cli/dev/terminals/__init__.py +19 -0
  79. agent_cli/dev/terminals/apple_terminal.py +82 -0
  80. agent_cli/dev/terminals/base.py +56 -0
  81. agent_cli/dev/terminals/gnome.py +51 -0
  82. agent_cli/dev/terminals/iterm2.py +84 -0
  83. agent_cli/dev/terminals/kitty.py +77 -0
  84. agent_cli/dev/terminals/registry.py +48 -0
  85. agent_cli/dev/terminals/tmux.py +58 -0
  86. agent_cli/dev/terminals/warp.py +132 -0
  87. agent_cli/dev/terminals/zellij.py +78 -0
  88. agent_cli/dev/worktree.py +856 -0
  89. agent_cli/docs_gen.py +417 -0
  90. agent_cli/example-config.toml +185 -0
  91. agent_cli/install/__init__.py +5 -0
  92. agent_cli/install/common.py +89 -0
  93. agent_cli/install/extras.py +174 -0
  94. agent_cli/install/hotkeys.py +48 -0
  95. agent_cli/install/services.py +87 -0
  96. agent_cli/memory/__init__.py +7 -0
  97. agent_cli/memory/_files.py +250 -0
  98. agent_cli/memory/_filters.py +63 -0
  99. agent_cli/memory/_git.py +157 -0
  100. agent_cli/memory/_indexer.py +142 -0
  101. agent_cli/memory/_ingest.py +408 -0
  102. agent_cli/memory/_persistence.py +182 -0
  103. agent_cli/memory/_prompt.py +91 -0
  104. agent_cli/memory/_retrieval.py +294 -0
  105. agent_cli/memory/_store.py +169 -0
  106. agent_cli/memory/_streaming.py +44 -0
  107. agent_cli/memory/_tasks.py +48 -0
  108. agent_cli/memory/api.py +113 -0
  109. agent_cli/memory/client.py +272 -0
  110. agent_cli/memory/engine.py +361 -0
  111. agent_cli/memory/entities.py +43 -0
  112. agent_cli/memory/models.py +112 -0
  113. agent_cli/opts.py +433 -0
  114. agent_cli/py.typed +0 -0
  115. agent_cli/rag/__init__.py +3 -0
  116. agent_cli/rag/_indexer.py +67 -0
  117. agent_cli/rag/_indexing.py +226 -0
  118. agent_cli/rag/_prompt.py +30 -0
  119. agent_cli/rag/_retriever.py +156 -0
  120. agent_cli/rag/_store.py +48 -0
  121. agent_cli/rag/_utils.py +218 -0
  122. agent_cli/rag/api.py +175 -0
  123. agent_cli/rag/client.py +299 -0
  124. agent_cli/rag/engine.py +302 -0
  125. agent_cli/rag/models.py +55 -0
  126. agent_cli/scripts/.runtime/.gitkeep +0 -0
  127. agent_cli/scripts/__init__.py +1 -0
  128. agent_cli/scripts/check_plugin_skill_sync.py +50 -0
  129. agent_cli/scripts/linux-hotkeys/README.md +63 -0
  130. agent_cli/scripts/linux-hotkeys/toggle-autocorrect.sh +45 -0
  131. agent_cli/scripts/linux-hotkeys/toggle-transcription.sh +58 -0
  132. agent_cli/scripts/linux-hotkeys/toggle-voice-edit.sh +58 -0
  133. agent_cli/scripts/macos-hotkeys/README.md +45 -0
  134. agent_cli/scripts/macos-hotkeys/skhd-config-example +5 -0
  135. agent_cli/scripts/macos-hotkeys/toggle-autocorrect.sh +12 -0
  136. agent_cli/scripts/macos-hotkeys/toggle-transcription.sh +37 -0
  137. agent_cli/scripts/macos-hotkeys/toggle-voice-edit.sh +37 -0
  138. agent_cli/scripts/nvidia-asr-server/README.md +99 -0
  139. agent_cli/scripts/nvidia-asr-server/pyproject.toml +27 -0
  140. agent_cli/scripts/nvidia-asr-server/server.py +255 -0
  141. agent_cli/scripts/nvidia-asr-server/shell.nix +32 -0
  142. agent_cli/scripts/nvidia-asr-server/uv.lock +4654 -0
  143. agent_cli/scripts/run-openwakeword.sh +11 -0
  144. agent_cli/scripts/run-piper-windows.ps1 +30 -0
  145. agent_cli/scripts/run-piper.sh +24 -0
  146. agent_cli/scripts/run-whisper-linux.sh +40 -0
  147. agent_cli/scripts/run-whisper-macos.sh +6 -0
  148. agent_cli/scripts/run-whisper-windows.ps1 +51 -0
  149. agent_cli/scripts/run-whisper.sh +9 -0
  150. agent_cli/scripts/run_faster_whisper_server.py +136 -0
  151. agent_cli/scripts/setup-linux-hotkeys.sh +72 -0
  152. agent_cli/scripts/setup-linux.sh +108 -0
  153. agent_cli/scripts/setup-macos-hotkeys.sh +61 -0
  154. agent_cli/scripts/setup-macos.sh +76 -0
  155. agent_cli/scripts/setup-windows.ps1 +63 -0
  156. agent_cli/scripts/start-all-services-windows.ps1 +53 -0
  157. agent_cli/scripts/start-all-services.sh +178 -0
  158. agent_cli/scripts/sync_extras.py +138 -0
  159. agent_cli/server/__init__.py +3 -0
  160. agent_cli/server/cli.py +721 -0
  161. agent_cli/server/common.py +222 -0
  162. agent_cli/server/model_manager.py +288 -0
  163. agent_cli/server/model_registry.py +225 -0
  164. agent_cli/server/proxy/__init__.py +3 -0
  165. agent_cli/server/proxy/api.py +444 -0
  166. agent_cli/server/streaming.py +67 -0
  167. agent_cli/server/tts/__init__.py +3 -0
  168. agent_cli/server/tts/api.py +335 -0
  169. agent_cli/server/tts/backends/__init__.py +82 -0
  170. agent_cli/server/tts/backends/base.py +139 -0
  171. agent_cli/server/tts/backends/kokoro.py +403 -0
  172. agent_cli/server/tts/backends/piper.py +253 -0
  173. agent_cli/server/tts/model_manager.py +201 -0
  174. agent_cli/server/tts/model_registry.py +28 -0
  175. agent_cli/server/tts/wyoming_handler.py +249 -0
  176. agent_cli/server/whisper/__init__.py +3 -0
  177. agent_cli/server/whisper/api.py +413 -0
  178. agent_cli/server/whisper/backends/__init__.py +89 -0
  179. agent_cli/server/whisper/backends/base.py +97 -0
  180. agent_cli/server/whisper/backends/faster_whisper.py +225 -0
  181. agent_cli/server/whisper/backends/mlx.py +270 -0
  182. agent_cli/server/whisper/languages.py +116 -0
  183. agent_cli/server/whisper/model_manager.py +157 -0
  184. agent_cli/server/whisper/model_registry.py +28 -0
  185. agent_cli/server/whisper/wyoming_handler.py +203 -0
  186. agent_cli/services/__init__.py +343 -0
  187. agent_cli/services/_wyoming_utils.py +64 -0
  188. agent_cli/services/asr.py +506 -0
  189. agent_cli/services/llm.py +228 -0
  190. agent_cli/services/tts.py +450 -0
  191. agent_cli/services/wake_word.py +142 -0
  192. agent_cli-0.70.5.dist-info/METADATA +2118 -0
  193. agent_cli-0.70.5.dist-info/RECORD +196 -0
  194. agent_cli-0.70.5.dist-info/WHEEL +4 -0
  195. agent_cli-0.70.5.dist-info/entry_points.txt +4 -0
  196. agent_cli-0.70.5.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,413 @@
1
+ """FastAPI application for Whisper ASR server with OpenAI-compatible API."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import contextlib
6
+ import io
7
+ import logging
8
+ import wave
9
+ from typing import TYPE_CHECKING, Annotated, Any, Literal
10
+
11
+ from fastapi import FastAPI, File, Form, HTTPException, Query, UploadFile, WebSocket
12
+ from fastapi.responses import PlainTextResponse
13
+ from pydantic import BaseModel
14
+
15
+ from agent_cli.server.common import configure_app, create_lifespan, setup_wav_file
16
+ from agent_cli.server.whisper.backends.base import InvalidAudioError
17
+
18
+ if TYPE_CHECKING:
19
+ from agent_cli.server.whisper.model_registry import WhisperModelRegistry
20
+
21
logger = logging.getLogger(__name__)  # Module-level logger, named after this module.
22
+
23
+
24
+ def _split_seconds(seconds: float) -> tuple[int, int, int, int]:
25
+ """Split seconds into (hours, minutes, seconds, milliseconds)."""
26
+ hours = int(seconds // 3600)
27
+ minutes = int((seconds % 3600) // 60)
28
+ secs = int(seconds % 60)
29
+ millis = int((seconds % 1) * 1000)
30
+ return hours, minutes, secs, millis
31
+
32
+
33
def _format_timestamp(seconds: float, *, always_include_hours: bool = False) -> str:
    """Format seconds as an SRT timestamp (``HH:MM:SS,mmm``, or ``MM:SS,mmm`` when short)."""
    hours, minutes, secs, millis = _split_seconds(seconds)
    tail = f"{minutes:02d}:{secs:02d},{millis:03d}"
    # The hour field is emitted when requested or when the duration needs it.
    show_hours = always_include_hours or hours > 0
    return f"{hours:02d}:{tail}" if show_hours else tail
39
+
40
+
41
def _format_vtt_timestamp(seconds: float) -> str:
    """Format seconds as a WebVTT timestamp (``HH:MM:SS.mmm``)."""
    hours, minutes, secs, millis = _split_seconds(seconds)
    # WebVTT always carries the hour field and uses a dot (not a comma) before ms.
    clock = ":".join((f"{hours:02d}", f"{minutes:02d}", f"{secs:02d}"))
    return f"{clock}.{millis:03d}"
45
+
46
+
47
def _format_srt(segments: list[dict[str, Any]]) -> str:
    """Render transcription segments as an SRT subtitle document.

    Each entry is a 1-based index, a start/end timestamp line, and the
    stripped segment text; entries are separated by a blank line.
    """
    entries = [
        f"{index}\n"
        f"{_format_timestamp(segment['start'], always_include_hours=True)}"
        f" --> "
        f"{_format_timestamp(segment['end'], always_include_hours=True)}\n"
        f"{segment['text'].strip()}\n"
        for index, segment in enumerate(segments, 1)
    ]
    return "\n".join(entries)
56
+
57
+
58
def _format_vtt(segments: list[dict[str, Any]]) -> str:
    """Render transcription segments as a WebVTT subtitle document."""
    # A WebVTT file opens with the "WEBVTT" magic line and a blank line.
    parts = ["WEBVTT", ""]
    parts.extend(
        f"{_format_vtt_timestamp(segment['start'])} --> "
        f"{_format_vtt_timestamp(segment['end'])}\n"
        f"{segment['text'].strip()}\n"
        for segment in segments
    )
    return "\n".join(parts)
67
+
68
+
69
+ # --- Pydantic Models ---
70
+
71
+
72
class TranscriptionResponse(BaseModel):
    """OpenAI-compatible transcription response."""

    # Full transcribed text of the submitted audio.
    text: str
76
+
77
+
78
class VerboseTranscriptionResponse(BaseModel):
    """OpenAI-compatible verbose transcription response."""

    # Which operation produced the text: transcription, or translation to English.
    task: Literal["transcribe", "translate"]
    # Language code reported by the backend.
    language: str
    # Audio duration in seconds.
    duration: float
    # Full transcribed text.
    text: str
    # Per-segment details; the SRT/VTT formatters read "start", "end" and "text" keys.
    segments: list[dict[str, Any]]
86
+
87
+
88
class ModelStatusResponse(BaseModel):
    """Status of a single model."""

    # Model identity and current lifecycle state.
    name: str
    loaded: bool
    # Device the model is loaded on, or None when not loaded.
    device: str | None
    # Idle time-to-live before automatic unload (server uses TTL-based unloading).
    ttl_seconds: int
    # Seconds remaining before auto-unload, or None — presumably when not loaded; verify in manager.
    ttl_remaining: float | None
    active_requests: int
    # Stats
    load_count: int
    unload_count: int
    total_requests: int
    total_audio_seconds: float
    total_transcription_seconds: float
    last_load_time: float | None
    last_request_time: float | None
    load_duration_seconds: float | None
106
+
107
+
108
class HealthResponse(BaseModel):
    """Health check response."""

    # Overall service status; the /health endpoint always reports "healthy".
    status: str
    # Per-model status snapshots from the registry.
    models: list[ModelStatusResponse]
113
+
114
+
115
class UnloadResponse(BaseModel):
    """Response from model unload request."""

    # "success" on completion; unknown models are reported via HTTP 404 instead.
    status: str
    # Resolved name of the model that was targeted.
    model: str
    # True if the model was actually loaded before the unload call.
    was_loaded: bool
121
+
122
+
123
+ # --- App Factory ---
124
+
125
+
126
def create_app(  # noqa: C901, PLR0915
    registry: WhisperModelRegistry,
    *,
    enable_wyoming: bool = True,
    wyoming_uri: str = "tcp://0.0.0.0:10300",
) -> FastAPI:
    """Create the FastAPI application.

    Args:
        registry: The model registry to use.
        enable_wyoming: Whether to start Wyoming server.
        wyoming_uri: URI for Wyoming server.

    Returns:
        Configured FastAPI application.

    """
    # The lifespan context manages registry startup/shutdown and, when enabled,
    # runs a Wyoming protocol server alongside the HTTP API.
    lifespan = create_lifespan(
        registry,
        wyoming_handler_module="agent_cli.server.whisper.wyoming_handler",
        enable_wyoming=enable_wyoming,
        wyoming_uri=wyoming_uri,
    )

    app = FastAPI(
        title="Whisper ASR Server",
        description="OpenAI-compatible Whisper ASR server with TTL-based model unloading",
        version="1.0.0",
        lifespan=lifespan,
    )

    configure_app(app)

    # --- Health & Status Endpoints ---

    @app.get("/health", response_model=HealthResponse)
    async def health_check() -> HealthResponse:
        """Health check endpoint."""
        models = [
            ModelStatusResponse(
                name=s.name,
                loaded=s.loaded,
                device=s.device,
                ttl_seconds=s.ttl_seconds,
                ttl_remaining=s.ttl_remaining,
                active_requests=s.active_requests,
                load_count=s.load_count,
                unload_count=s.unload_count,
                total_requests=s.total_requests,
                total_audio_seconds=s.total_audio_seconds,
                # Whisper-specific counter lives in the generic "extra" mapping.
                total_transcription_seconds=s.extra.get("total_transcription_seconds", 0.0),
                last_load_time=s.last_load_time,
                last_request_time=s.last_request_time,
                load_duration_seconds=s.load_duration_seconds,
            )
            for s in registry.list_status()
        ]
        return HealthResponse(status="healthy", models=models)

    @app.post("/v1/model/unload", response_model=UnloadResponse)
    async def unload_model(
        model: Annotated[str | None, Query(description="Model to unload")] = None,
    ) -> UnloadResponse:
        """Manually unload a model from memory."""
        try:
            manager = registry.get_manager(model)
            was_loaded = await manager.unload()
            return UnloadResponse(
                status="success",
                model=manager.config.model_name,
                was_loaded=was_loaded,
            )
        except ValueError as e:
            # Unknown model name: surface as 404 rather than a server error.
            raise HTTPException(status_code=404, detail=str(e)) from e

    # --- OpenAI-Compatible Transcription Endpoints ---

    @app.post("/v1/audio/transcriptions", response_model=None)
    async def transcribe_audio(
        file: Annotated[UploadFile, File(description="Audio file to transcribe")],
        model: Annotated[str, Form(description="Model to use")] = "whisper-1",
        language: Annotated[str | None, Form(description="Language code")] = None,
        prompt: Annotated[str | None, Form(description="Initial prompt")] = None,
        response_format: Annotated[
            Literal["json", "text", "srt", "verbose_json", "vtt"],
            Form(description="Response format"),
        ] = "json",
        temperature: Annotated[float, Form(description="Sampling temperature")] = 0.0,
    ) -> TranscriptionResponse | VerboseTranscriptionResponse | PlainTextResponse:
        """OpenAI-compatible audio transcription endpoint."""
        return await _do_transcription(
            file=file,
            model=model,
            language=language,
            prompt=prompt,
            response_format=response_format,
            temperature=temperature,
            task="transcribe",
        )

    @app.post("/v1/audio/translations", response_model=None)
    async def translate_audio(
        file: Annotated[UploadFile, File(description="Audio file to translate")],
        model: Annotated[str, Form(description="Model to use")] = "whisper-1",
        prompt: Annotated[str | None, Form(description="Initial prompt")] = None,
        response_format: Annotated[
            Literal["json", "text", "srt", "verbose_json", "vtt"],
            Form(description="Response format"),
        ] = "json",
        temperature: Annotated[float, Form(description="Sampling temperature")] = 0.0,
    ) -> TranscriptionResponse | VerboseTranscriptionResponse | PlainTextResponse:
        """OpenAI-compatible audio translation endpoint (always to English)."""
        return await _do_transcription(
            file=file,
            model=model,
            language=None,  # Translation always outputs English
            prompt=prompt,
            response_format=response_format,
            temperature=temperature,
            task="translate",
        )

    async def _do_transcription(
        *,
        file: UploadFile,
        model: str,
        language: str | None,
        prompt: str | None,
        response_format: Literal["json", "text", "srt", "verbose_json", "vtt"],
        temperature: float,
        task: Literal["transcribe", "translate"],
    ) -> TranscriptionResponse | VerboseTranscriptionResponse | PlainTextResponse:
        """Perform transcription with the specified parameters."""
        # Resolve model name - "whisper-1" is OpenAI's model name, use default
        model_name = None if model in ("whisper-1", "whisper-large-v3") else model

        try:
            manager = registry.get_manager(model_name)
        except ValueError as e:
            raise HTTPException(status_code=400, detail=str(e)) from e

        # Read audio data
        audio_data = await file.read()

        if not audio_data:
            raise HTTPException(status_code=400, detail="Empty audio file")

        try:
            result = await manager.transcribe(
                audio_data,
                source_filename=file.filename,
                language=language,
                task=task,
                initial_prompt=prompt,
                temperature=temperature,
            )
        except InvalidAudioError as e:
            # Malformed/unsupported audio is a client error, not a server fault.
            raise HTTPException(status_code=400, detail=str(e)) from e
        except Exception as e:
            logger.exception("Transcription failed")
            raise HTTPException(status_code=500, detail=str(e)) from e

        # Format response
        if response_format == "text":
            return PlainTextResponse(content=result.text)

        if response_format == "srt":
            srt_content = _format_srt(result.segments)
            return PlainTextResponse(content=srt_content, media_type="text/plain")

        if response_format == "vtt":
            vtt_content = _format_vtt(result.segments)
            return PlainTextResponse(content=vtt_content, media_type="text/vtt")

        if response_format == "verbose_json":
            return VerboseTranscriptionResponse(
                task=task,
                language=result.language,
                duration=result.duration,
                text=result.text,
                segments=result.segments,
            )

        # Default is json format
        return TranscriptionResponse(text=result.text)

    # --- WebSocket Streaming Endpoint ---

    @app.websocket("/v1/audio/transcriptions/stream")
    async def stream_transcription(
        websocket: WebSocket,
        model: Annotated[str | None, Query(description="Model to use")] = None,
        language: Annotated[str | None, Query(description="Language code")] = None,
    ) -> None:
        """WebSocket endpoint for streaming transcription.

        Protocol:
        - Client sends binary audio chunks (16kHz, 16-bit, mono PCM)
        - Client sends b"EOS" to signal end of audio
        - Server sends JSON messages with transcription results

        Message format from server:
        {"type": "partial", "text": "...", "is_final": false}
        {"type": "final", "text": "...", "is_final": true, "segments": [...]}
        {"type": "error", "message": "..."}
        """
        await websocket.accept()

        try:
            # Match OpenAI model aliases to the default model, like REST endpoints.
            resolved_model = None if model in ("whisper-1", "whisper-large-v3") else model
            manager = registry.get_manager(resolved_model)
        except ValueError as e:
            await websocket.send_json({"type": "error", "message": str(e)})
            await websocket.close()
            return

        # Collect audio data
        audio_buffer = io.BytesIO()
        wav_file: wave.Wave_write | None = None

        try:
            while True:
                data = await websocket.receive_bytes()

                # Initialize WAV file on first chunk (before EOS check)
                if wav_file is None:
                    wav_file = wave.open(audio_buffer, "wb")  # noqa: SIM115
                    setup_wav_file(wav_file)

                # Check for end of stream (EOS marker)
                eos_marker = b"EOS"
                eos_len = len(eos_marker)
                if data == eos_marker:
                    break
                # NOTE(review): a chunk whose audio payload happens to end with the
                # bytes b"EOS" is truncated here - the protocol has no escaping.
                if data[-eos_len:] == eos_marker:
                    # Write remaining data before EOS marker
                    if len(data) > eos_len:
                        wav_file.writeframes(data[:-eos_len])
                    break

                wav_file.writeframes(data)

            # Close WAV file
            if wav_file is not None:
                wav_file.close()

            # Get audio data
            audio_buffer.seek(0)
            audio_data = audio_buffer.read()

            if not audio_data:
                await websocket.send_json({"type": "error", "message": "No audio received"})
                await websocket.close()
                return

            # Transcribe
            try:
                result = await manager.transcribe(
                    audio_data,
                    language=language,
                    task="transcribe",
                )

                await websocket.send_json(
                    {
                        "type": "final",
                        "text": result.text,
                        "is_final": True,
                        "language": result.language,
                        "duration": result.duration,
                        "segments": result.segments,
                    },
                )

            except Exception as e:
                await websocket.send_json({"type": "error", "message": str(e)})

        except Exception as e:
            logger.exception("WebSocket error")
            with contextlib.suppress(Exception):
                await websocket.send_json({"type": "error", "message": str(e)})

        finally:
            # Best-effort close; the client may already have disconnected.
            with contextlib.suppress(Exception):
                await websocket.close()

    return app
@@ -0,0 +1,89 @@
1
+ """Whisper backend factory with platform auto-detection."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import platform
7
+ import sys
8
+ from typing import TYPE_CHECKING, Literal
9
+
10
+ if TYPE_CHECKING:
11
+ from agent_cli.server.whisper.backends.base import WhisperBackend
12
+
13
+ from agent_cli.server.whisper.backends.base import (
14
+ BackendConfig,
15
+ TranscriptionResult,
16
+ )
17
+
18
# Module-level logger for backend-selection diagnostics.
logger = logging.getLogger(__name__)

# Allowed backend identifiers; "auto" selects one via platform detection.
BackendType = Literal["faster-whisper", "mlx", "auto"]
21
+
22
+
23
def detect_backend() -> Literal["faster-whisper", "mlx"]:
    """Pick the best backend for the machine this process is running on.

    Returns:
        "mlx" on macOS ARM with mlx-whisper installed,
        "faster-whisper" otherwise.

    """
    # MLX is only worthwhile on Apple Silicon, and only if the package exists.
    on_apple_silicon = sys.platform == "darwin" and platform.machine() == "arm64"
    if on_apple_silicon:
        try:
            import mlx_whisper  # noqa: F401, PLC0415
        except ImportError:
            logger.debug("macOS ARM detected but mlx-whisper not installed")
        else:
            logger.debug("Detected macOS ARM with mlx-whisper available")
            return "mlx"

    return "faster-whisper"
42
+
43
+
44
def create_backend(
    config: BackendConfig,
    backend_type: BackendType = "auto",
) -> WhisperBackend:
    """Create a Whisper backend instance.

    Args:
        config: Backend configuration.
        backend_type: Backend to use, or "auto" for platform detection.

    Returns:
        Configured WhisperBackend instance.

    Raises:
        ImportError: If the required backend package is not installed.
        ValueError: If an unknown backend type is specified.

    """
    # Resolve "auto" up front so logging and dispatch see the concrete choice.
    resolved = detect_backend() if backend_type == "auto" else backend_type

    logger.debug("Creating %s backend for model %s", resolved, config.model_name)

    # Backend modules are imported lazily so an uninstalled extra only fails
    # when that backend is actually requested.
    if resolved == "faster-whisper":
        from agent_cli.server.whisper.backends.faster_whisper import (  # noqa: PLC0415
            FasterWhisperBackend,
        )

        return FasterWhisperBackend(config)

    if resolved == "mlx":
        from agent_cli.server.whisper.backends.mlx import MLXWhisperBackend  # noqa: PLC0415

        return MLXWhisperBackend(config)

    msg = f"Unknown backend type: {resolved}"
    raise ValueError(msg)
81
+
82
+
83
# Public API of the backends package: re-exported base types plus the factory helpers.
__all__ = [
    "BackendConfig",
    "BackendType",
    "TranscriptionResult",
    "create_backend",
    "detect_backend",
]
@@ -0,0 +1,97 @@
1
+ """Base types and protocol for Whisper backends."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import TYPE_CHECKING, Any, Literal, Protocol, runtime_checkable
7
+
8
+ if TYPE_CHECKING:
9
+ from pathlib import Path
10
+
11
+
12
@dataclass
class TranscriptionResult:
    """Result of a transcription."""

    # Full transcribed (or translated) text.
    text: str
    # Language code reported by the backend.
    language: str
    # Backend's confidence in the detected language — presumably 0.0-1.0; confirm per backend.
    language_probability: float
    # Audio duration in seconds.
    duration: float
    # Per-segment metadata; the API layer's formatters read "start", "end" and "text" keys.
    segments: list[dict[str, Any]] = field(default_factory=list)
21
+
22
+
23
@dataclass
class BackendConfig:
    """Configuration for a Whisper backend."""

    # Model identifier; interpretation is backend-specific.
    model_name: str
    # Target device; "auto" lets the backend decide.
    device: str = "auto"
    # Numeric precision for inference; "auto" lets the backend decide.
    compute_type: str = "auto"
    # CPU worker threads — assumes CPU inference path; confirm semantics per backend.
    cpu_threads: int = 4
    # Optional model download/cache directory; None uses the backend's default.
    cache_dir: Path | None = None
32
+
33
+
34
class InvalidAudioError(ValueError):
    """Raised when the input audio is invalid or unsupported.

    Subclasses ValueError so callers catching ValueError keep working;
    the HTTP API layer maps this exception to a 400 response.
    """
36
+
37
+
38
@runtime_checkable
class WhisperBackend(Protocol):
    """Protocol for Whisper transcription backends.

    Backends handle model loading, unloading, and transcription.
    The ModelManager handles TTL, stats, and lifecycle.

    Note: ``runtime_checkable`` means ``isinstance`` checks verify only that
    these members exist, not that their signatures match.
    """

    @property
    def is_loaded(self) -> bool:
        """Check if the model is currently loaded."""
        ...

    @property
    def device(self) -> str | None:
        """Get the device the model is loaded on, or None if not loaded."""
        ...

    async def load(self) -> float:
        """Load the model into memory.

        Returns:
            Load duration in seconds.

        """
        ...

    async def unload(self) -> None:
        """Unload the model and free memory."""
        ...

    async def transcribe(
        self,
        audio: bytes,
        *,
        source_filename: str | None = None,
        language: str | None = None,
        task: Literal["transcribe", "translate"] = "transcribe",
        initial_prompt: str | None = None,
        temperature: float = 0.0,
        vad_filter: bool = True,
        word_timestamps: bool = False,
    ) -> TranscriptionResult:
        """Transcribe audio data.

        Args:
            audio: Audio data as bytes (WAV format, 16kHz, 16-bit, mono)
            source_filename: Optional filename to help detect audio format.
            language: Language code or None for auto-detection
            task: "transcribe" or "translate" (to English)
            initial_prompt: Optional prompt to guide transcription
            temperature: Sampling temperature
            vad_filter: Whether to use VAD filtering
            word_timestamps: Whether to include word-level timestamps

        Returns:
            TranscriptionResult with text and metadata.

        """
        ...