agent-cli 0.70.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_cli/__init__.py +5 -0
- agent_cli/__main__.py +6 -0
- agent_cli/_extras.json +14 -0
- agent_cli/_requirements/.gitkeep +0 -0
- agent_cli/_requirements/audio.txt +79 -0
- agent_cli/_requirements/faster-whisper.txt +215 -0
- agent_cli/_requirements/kokoro.txt +425 -0
- agent_cli/_requirements/llm.txt +183 -0
- agent_cli/_requirements/memory.txt +355 -0
- agent_cli/_requirements/mlx-whisper.txt +222 -0
- agent_cli/_requirements/piper.txt +176 -0
- agent_cli/_requirements/rag.txt +402 -0
- agent_cli/_requirements/server.txt +154 -0
- agent_cli/_requirements/speed.txt +77 -0
- agent_cli/_requirements/vad.txt +155 -0
- agent_cli/_requirements/wyoming.txt +71 -0
- agent_cli/_tools.py +368 -0
- agent_cli/agents/__init__.py +23 -0
- agent_cli/agents/_voice_agent_common.py +136 -0
- agent_cli/agents/assistant.py +383 -0
- agent_cli/agents/autocorrect.py +284 -0
- agent_cli/agents/chat.py +496 -0
- agent_cli/agents/memory/__init__.py +31 -0
- agent_cli/agents/memory/add.py +190 -0
- agent_cli/agents/memory/proxy.py +160 -0
- agent_cli/agents/rag_proxy.py +128 -0
- agent_cli/agents/speak.py +209 -0
- agent_cli/agents/transcribe.py +671 -0
- agent_cli/agents/transcribe_daemon.py +499 -0
- agent_cli/agents/voice_edit.py +291 -0
- agent_cli/api.py +22 -0
- agent_cli/cli.py +106 -0
- agent_cli/config.py +503 -0
- agent_cli/config_cmd.py +307 -0
- agent_cli/constants.py +27 -0
- agent_cli/core/__init__.py +1 -0
- agent_cli/core/audio.py +461 -0
- agent_cli/core/audio_format.py +299 -0
- agent_cli/core/chroma.py +88 -0
- agent_cli/core/deps.py +191 -0
- agent_cli/core/openai_proxy.py +139 -0
- agent_cli/core/process.py +195 -0
- agent_cli/core/reranker.py +120 -0
- agent_cli/core/sse.py +87 -0
- agent_cli/core/transcription_logger.py +70 -0
- agent_cli/core/utils.py +526 -0
- agent_cli/core/vad.py +175 -0
- agent_cli/core/watch.py +65 -0
- agent_cli/dev/__init__.py +14 -0
- agent_cli/dev/cli.py +1588 -0
- agent_cli/dev/coding_agents/__init__.py +19 -0
- agent_cli/dev/coding_agents/aider.py +24 -0
- agent_cli/dev/coding_agents/base.py +167 -0
- agent_cli/dev/coding_agents/claude.py +39 -0
- agent_cli/dev/coding_agents/codex.py +24 -0
- agent_cli/dev/coding_agents/continue_dev.py +15 -0
- agent_cli/dev/coding_agents/copilot.py +24 -0
- agent_cli/dev/coding_agents/cursor_agent.py +48 -0
- agent_cli/dev/coding_agents/gemini.py +28 -0
- agent_cli/dev/coding_agents/opencode.py +15 -0
- agent_cli/dev/coding_agents/registry.py +49 -0
- agent_cli/dev/editors/__init__.py +19 -0
- agent_cli/dev/editors/base.py +89 -0
- agent_cli/dev/editors/cursor.py +15 -0
- agent_cli/dev/editors/emacs.py +46 -0
- agent_cli/dev/editors/jetbrains.py +56 -0
- agent_cli/dev/editors/nano.py +31 -0
- agent_cli/dev/editors/neovim.py +33 -0
- agent_cli/dev/editors/registry.py +59 -0
- agent_cli/dev/editors/sublime.py +20 -0
- agent_cli/dev/editors/vim.py +42 -0
- agent_cli/dev/editors/vscode.py +15 -0
- agent_cli/dev/editors/zed.py +20 -0
- agent_cli/dev/project.py +568 -0
- agent_cli/dev/registry.py +52 -0
- agent_cli/dev/skill/SKILL.md +141 -0
- agent_cli/dev/skill/examples.md +571 -0
- agent_cli/dev/terminals/__init__.py +19 -0
- agent_cli/dev/terminals/apple_terminal.py +82 -0
- agent_cli/dev/terminals/base.py +56 -0
- agent_cli/dev/terminals/gnome.py +51 -0
- agent_cli/dev/terminals/iterm2.py +84 -0
- agent_cli/dev/terminals/kitty.py +77 -0
- agent_cli/dev/terminals/registry.py +48 -0
- agent_cli/dev/terminals/tmux.py +58 -0
- agent_cli/dev/terminals/warp.py +132 -0
- agent_cli/dev/terminals/zellij.py +78 -0
- agent_cli/dev/worktree.py +856 -0
- agent_cli/docs_gen.py +417 -0
- agent_cli/example-config.toml +185 -0
- agent_cli/install/__init__.py +5 -0
- agent_cli/install/common.py +89 -0
- agent_cli/install/extras.py +174 -0
- agent_cli/install/hotkeys.py +48 -0
- agent_cli/install/services.py +87 -0
- agent_cli/memory/__init__.py +7 -0
- agent_cli/memory/_files.py +250 -0
- agent_cli/memory/_filters.py +63 -0
- agent_cli/memory/_git.py +157 -0
- agent_cli/memory/_indexer.py +142 -0
- agent_cli/memory/_ingest.py +408 -0
- agent_cli/memory/_persistence.py +182 -0
- agent_cli/memory/_prompt.py +91 -0
- agent_cli/memory/_retrieval.py +294 -0
- agent_cli/memory/_store.py +169 -0
- agent_cli/memory/_streaming.py +44 -0
- agent_cli/memory/_tasks.py +48 -0
- agent_cli/memory/api.py +113 -0
- agent_cli/memory/client.py +272 -0
- agent_cli/memory/engine.py +361 -0
- agent_cli/memory/entities.py +43 -0
- agent_cli/memory/models.py +112 -0
- agent_cli/opts.py +433 -0
- agent_cli/py.typed +0 -0
- agent_cli/rag/__init__.py +3 -0
- agent_cli/rag/_indexer.py +67 -0
- agent_cli/rag/_indexing.py +226 -0
- agent_cli/rag/_prompt.py +30 -0
- agent_cli/rag/_retriever.py +156 -0
- agent_cli/rag/_store.py +48 -0
- agent_cli/rag/_utils.py +218 -0
- agent_cli/rag/api.py +175 -0
- agent_cli/rag/client.py +299 -0
- agent_cli/rag/engine.py +302 -0
- agent_cli/rag/models.py +55 -0
- agent_cli/scripts/.runtime/.gitkeep +0 -0
- agent_cli/scripts/__init__.py +1 -0
- agent_cli/scripts/check_plugin_skill_sync.py +50 -0
- agent_cli/scripts/linux-hotkeys/README.md +63 -0
- agent_cli/scripts/linux-hotkeys/toggle-autocorrect.sh +45 -0
- agent_cli/scripts/linux-hotkeys/toggle-transcription.sh +58 -0
- agent_cli/scripts/linux-hotkeys/toggle-voice-edit.sh +58 -0
- agent_cli/scripts/macos-hotkeys/README.md +45 -0
- agent_cli/scripts/macos-hotkeys/skhd-config-example +5 -0
- agent_cli/scripts/macos-hotkeys/toggle-autocorrect.sh +12 -0
- agent_cli/scripts/macos-hotkeys/toggle-transcription.sh +37 -0
- agent_cli/scripts/macos-hotkeys/toggle-voice-edit.sh +37 -0
- agent_cli/scripts/nvidia-asr-server/README.md +99 -0
- agent_cli/scripts/nvidia-asr-server/pyproject.toml +27 -0
- agent_cli/scripts/nvidia-asr-server/server.py +255 -0
- agent_cli/scripts/nvidia-asr-server/shell.nix +32 -0
- agent_cli/scripts/nvidia-asr-server/uv.lock +4654 -0
- agent_cli/scripts/run-openwakeword.sh +11 -0
- agent_cli/scripts/run-piper-windows.ps1 +30 -0
- agent_cli/scripts/run-piper.sh +24 -0
- agent_cli/scripts/run-whisper-linux.sh +40 -0
- agent_cli/scripts/run-whisper-macos.sh +6 -0
- agent_cli/scripts/run-whisper-windows.ps1 +51 -0
- agent_cli/scripts/run-whisper.sh +9 -0
- agent_cli/scripts/run_faster_whisper_server.py +136 -0
- agent_cli/scripts/setup-linux-hotkeys.sh +72 -0
- agent_cli/scripts/setup-linux.sh +108 -0
- agent_cli/scripts/setup-macos-hotkeys.sh +61 -0
- agent_cli/scripts/setup-macos.sh +76 -0
- agent_cli/scripts/setup-windows.ps1 +63 -0
- agent_cli/scripts/start-all-services-windows.ps1 +53 -0
- agent_cli/scripts/start-all-services.sh +178 -0
- agent_cli/scripts/sync_extras.py +138 -0
- agent_cli/server/__init__.py +3 -0
- agent_cli/server/cli.py +721 -0
- agent_cli/server/common.py +222 -0
- agent_cli/server/model_manager.py +288 -0
- agent_cli/server/model_registry.py +225 -0
- agent_cli/server/proxy/__init__.py +3 -0
- agent_cli/server/proxy/api.py +444 -0
- agent_cli/server/streaming.py +67 -0
- agent_cli/server/tts/__init__.py +3 -0
- agent_cli/server/tts/api.py +335 -0
- agent_cli/server/tts/backends/__init__.py +82 -0
- agent_cli/server/tts/backends/base.py +139 -0
- agent_cli/server/tts/backends/kokoro.py +403 -0
- agent_cli/server/tts/backends/piper.py +253 -0
- agent_cli/server/tts/model_manager.py +201 -0
- agent_cli/server/tts/model_registry.py +28 -0
- agent_cli/server/tts/wyoming_handler.py +249 -0
- agent_cli/server/whisper/__init__.py +3 -0
- agent_cli/server/whisper/api.py +413 -0
- agent_cli/server/whisper/backends/__init__.py +89 -0
- agent_cli/server/whisper/backends/base.py +97 -0
- agent_cli/server/whisper/backends/faster_whisper.py +225 -0
- agent_cli/server/whisper/backends/mlx.py +270 -0
- agent_cli/server/whisper/languages.py +116 -0
- agent_cli/server/whisper/model_manager.py +157 -0
- agent_cli/server/whisper/model_registry.py +28 -0
- agent_cli/server/whisper/wyoming_handler.py +203 -0
- agent_cli/services/__init__.py +343 -0
- agent_cli/services/_wyoming_utils.py +64 -0
- agent_cli/services/asr.py +506 -0
- agent_cli/services/llm.py +228 -0
- agent_cli/services/tts.py +450 -0
- agent_cli/services/wake_word.py +142 -0
- agent_cli-0.70.5.dist-info/METADATA +2118 -0
- agent_cli-0.70.5.dist-info/RECORD +196 -0
- agent_cli-0.70.5.dist-info/WHEEL +4 -0
- agent_cli-0.70.5.dist-info/entry_points.txt +4 -0
- agent_cli-0.70.5.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
"""Faster-whisper backend for Linux/CUDA systems."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import logging
|
|
7
|
+
import tempfile
|
|
8
|
+
from concurrent.futures import ProcessPoolExecutor
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from multiprocessing import get_context
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any, Literal
|
|
13
|
+
|
|
14
|
+
from agent_cli.core.process import set_process_title
|
|
15
|
+
from agent_cli.server.whisper.backends.base import (
|
|
16
|
+
BackendConfig,
|
|
17
|
+
TranscriptionResult,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# --- Subprocess state (only used within subprocess worker) ---
|
|
24
|
+
# This state persists across function calls within the subprocess because:
|
|
25
|
+
# 1. Model loading is expensive and must be reused across transcription calls
|
|
26
|
+
# 2. CTranslate2 models cannot be pickled/passed through IPC queues
|
|
27
|
+
# 3. The subprocess is long-lived (ProcessPoolExecutor reuses workers)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
|
|
31
|
+
class _SubprocessState:
|
|
32
|
+
"""Container for subprocess-local state. Not shared with main process."""
|
|
33
|
+
|
|
34
|
+
model: Any = None
|
|
35
|
+
device: str | None = None
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
_state = _SubprocessState()
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# --- Subprocess worker functions (run in isolated process) ---
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _load_model_in_subprocess(
    model_name: str,
    device: str,
    compute_type: str,
    cpu_threads: int,
    download_root: str | None,
) -> str:
    """Instantiate the WhisperModel inside the worker process.

    Caches the model in the module-level ``_state`` so subsequent
    transcription calls in the same worker reuse it, and returns the
    device string the loaded model actually ended up on.
    """
    from faster_whisper import WhisperModel  # noqa: PLC0415

    set_process_title("whisper-faster")
    _state.model = WhisperModel(
        model_name,
        device=device,
        compute_type=compute_type,
        cpu_threads=cpu_threads,
        download_root=download_root,
    )
    # Record the actual device so the parent can report it.
    actual_device = str(_state.model.model.device)
    _state.device = actual_device
    return actual_device
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _transcribe_in_subprocess(
    audio_bytes: bytes,
    kwargs: dict[str, Any],
) -> dict[str, Any]:
    """Transcribe one audio payload using the worker's cached model.

    Raises:
        RuntimeError: if the worker has not loaded a model yet.
    """
    if _state.model is None:
        msg = "Model not loaded in subprocess. Call _load_model_in_subprocess first."
        raise RuntimeError(msg)

    # faster-whisper wants a file path, so spill the bytes to a temp WAV.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp.write(audio_bytes)
        tmp_path = tmp.name

    try:
        segments, info = _state.model.transcribe(tmp_path, **kwargs)
        collected = list(segments)  # drain the lazy generator before cleanup
    finally:
        Path(tmp_path).unlink(missing_ok=True)

    joined_text = " ".join(seg.text.strip() for seg in collected)
    segment_dicts = [
        {
            "id": seg.id,
            "start": seg.start,
            "end": seg.end,
            "text": seg.text,
            "tokens": seg.tokens,
            "avg_logprob": seg.avg_logprob,
            "no_speech_prob": seg.no_speech_prob,
        }
        for seg in collected
    ]
    return {
        "text": joined_text,
        "language": info.language,
        "language_probability": info.language_probability,
        "duration": info.duration,
        "segments": segment_dicts,
    }
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class FasterWhisperBackend:
    """Whisper backend using faster-whisper (CTranslate2).

    Uses subprocess isolation: when unloaded, the subprocess terminates
    and the OS reclaims ALL memory (Python's pymalloc doesn't return
    freed memory to OS otherwise).
    """

    def __init__(self, config: BackendConfig) -> None:
        """Initialize the backend (model is not loaded yet)."""
        self._config = config
        # Single-worker pool hosting the model subprocess; None when unloaded.
        self._executor: ProcessPoolExecutor | None = None
        # Device string reported by the loaded model; None when unloaded.
        self._device: str | None = None

    @property
    def is_loaded(self) -> bool:
        """Check if the model is loaded."""
        return self._executor is not None

    @property
    def device(self) -> str | None:
        """Get the device the model is on."""
        return self._device

    async def load(self) -> float:
        """Start subprocess, load the model, and return the load time in seconds.

        If loading fails, the freshly spawned executor is shut down and the
        backend state is reset before re-raising, so a failed load never
        leaks the worker subprocess or leaves ``is_loaded`` reporting True.
        """
        import time  # noqa: PLC0415

        logger.debug(
            "Starting faster-whisper subprocess for model %s (device=%s, compute_type=%s)",
            self._config.model_name,
            self._config.device,
            self._config.compute_type,
        )

        start_time = time.time()

        # Subprocess isolation: spawn context for clean state
        ctx = get_context("spawn")
        self._executor = ProcessPoolExecutor(max_workers=1, mp_context=ctx)

        download_root = str(self._config.cache_dir) if self._config.cache_dir else None
        loop = asyncio.get_running_loop()
        try:
            self._device = await loop.run_in_executor(
                self._executor,
                _load_model_in_subprocess,
                self._config.model_name,
                self._config.device,
                self._config.compute_type,
                self._config.cpu_threads,
                download_root,
            )
        except BaseException:
            # Fix: don't leak the subprocess (or report loaded) on failure.
            self._executor.shutdown(wait=False, cancel_futures=True)
            self._executor = None
            self._device = None
            raise

        load_duration = time.time() - start_time
        logger.info(
            "Model %s loaded on %s in %.2fs",
            self._config.model_name,
            self._device,
            load_duration,
        )
        return load_duration

    async def unload(self) -> None:
        """Shutdown subprocess, releasing ALL memory."""
        if self._executor is None:
            return
        logger.debug(
            "Shutting down faster-whisper subprocess for model %s",
            self._config.model_name,
        )
        self._executor.shutdown(wait=False, cancel_futures=True)
        self._executor = None
        self._device = None
        logger.info("Model %s unloaded (subprocess terminated)", self._config.model_name)

    async def transcribe(
        self,
        audio: bytes,
        *,
        source_filename: str | None = None,  # noqa: ARG002
        language: str | None = None,
        task: Literal["transcribe", "translate"] = "transcribe",
        initial_prompt: str | None = None,
        temperature: float = 0.0,
        vad_filter: bool = True,
        word_timestamps: bool = False,
    ) -> TranscriptionResult:
        """Transcribe audio using faster-whisper in subprocess.

        Raises:
            RuntimeError: if the model has not been loaded via ``load()``.
        """
        if self._executor is None:
            msg = "Model not loaded. Call load() first."
            raise RuntimeError(msg)

        kwargs: dict[str, Any] = {
            "language": language,
            "task": task,
            "initial_prompt": initial_prompt,
            "temperature": temperature,
            "vad_filter": vad_filter,
            "word_timestamps": word_timestamps,
        }

        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(
            self._executor,
            _transcribe_in_subprocess,
            audio,
            kwargs,
        )

        return TranscriptionResult(
            text=result["text"],
            language=result["language"],
            language_probability=result["language_probability"],
            duration=result["duration"],
            segments=result["segments"],
        )
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
"""MLX Whisper backend for macOS Apple Silicon."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import logging
|
|
7
|
+
import wave
|
|
8
|
+
from concurrent.futures import ProcessPoolExecutor
|
|
9
|
+
from multiprocessing import get_context
|
|
10
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
11
|
+
|
|
12
|
+
from agent_cli import constants
|
|
13
|
+
from agent_cli.core.audio_format import (
|
|
14
|
+
convert_audio_to_wyoming_format,
|
|
15
|
+
extract_pcm_from_wav,
|
|
16
|
+
)
|
|
17
|
+
from agent_cli.core.process import set_process_title
|
|
18
|
+
from agent_cli.server.whisper.backends.base import (
|
|
19
|
+
BackendConfig,
|
|
20
|
+
InvalidAudioError,
|
|
21
|
+
TranscriptionResult,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
if TYPE_CHECKING:
|
|
25
|
+
import numpy as np
|
|
26
|
+
from numpy.typing import NDArray
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
# MLX model name mapping: canonical name -> HuggingFace repo
|
|
31
|
+
_MLX_MODEL_MAP: dict[str, str] = {
|
|
32
|
+
"tiny": "mlx-community/whisper-tiny",
|
|
33
|
+
"small": "mlx-community/whisper-small-mlx",
|
|
34
|
+
"medium": "mlx-community/whisper-medium-mlx",
|
|
35
|
+
"large": "mlx-community/whisper-large-v3-mlx",
|
|
36
|
+
"large-v2": "mlx-community/whisper-large-v2-mlx",
|
|
37
|
+
"large-v3": "mlx-community/whisper-large-v3-mlx",
|
|
38
|
+
"large-v3-turbo": "mlx-community/whisper-large-v3-turbo",
|
|
39
|
+
"turbo": "mlx-community/whisper-large-v3-turbo",
|
|
40
|
+
"large-v3-turbo-q4": "mlx-community/whisper-large-v3-turbo-q4",
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _resolve_mlx_model_name(model_name: str) -> str:
|
|
45
|
+
"""Resolve a model name to an MLX HuggingFace repo."""
|
|
46
|
+
if model_name.startswith("mlx-community/"):
|
|
47
|
+
return model_name
|
|
48
|
+
if model_name in _MLX_MODEL_MAP:
|
|
49
|
+
return _MLX_MODEL_MAP[model_name]
|
|
50
|
+
for prefix in ("whisper-", "openai/whisper-"):
|
|
51
|
+
if model_name.startswith(prefix):
|
|
52
|
+
stripped = model_name[len(prefix) :]
|
|
53
|
+
if stripped in _MLX_MODEL_MAP:
|
|
54
|
+
return _MLX_MODEL_MAP[stripped]
|
|
55
|
+
return model_name
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _pcm_to_float(audio_bytes: bytes) -> NDArray[np.float32]:
|
|
59
|
+
"""Convert 16-bit PCM audio bytes to float32 array normalized to [-1, 1]."""
|
|
60
|
+
import numpy as np # noqa: PLC0415
|
|
61
|
+
|
|
62
|
+
return np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _convert_audio_to_pcm(audio_bytes: bytes, source_filename: str | None) -> bytes:
    """Transcode arbitrary audio bytes to Wyoming-format raw PCM via FFmpeg.

    Raises:
        InvalidAudioError: when FFmpeg conversion fails (unsupported
            format or ffmpeg not installed).
    """
    try:
        return convert_audio_to_wyoming_format(audio_bytes, source_filename or "audio")
    except RuntimeError as exc:
        logger.warning("FFmpeg conversion failed for MLX Whisper: %s", exc)
        msg = (
            "Unsupported audio format for MLX Whisper. "
            "Provide a 16kHz mono 16-bit WAV file or install ffmpeg to convert uploads."
        )
        raise InvalidAudioError(msg) from exc
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _prepare_audio_pcm(audio: bytes, source_filename: str | None) -> bytes:
    """Return raw PCM for *audio*: parse as WAV, falling back to FFmpeg."""
    try:
        wav = extract_pcm_from_wav(audio)
    except (wave.Error, EOFError) as exc:
        logger.debug("WAV parsing failed (%s); converting with FFmpeg", exc)
        return _convert_audio_to_pcm(audio, source_filename)

    matches_target = (
        wav.sample_rate == constants.AUDIO_RATE
        and wav.num_channels == constants.AUDIO_CHANNELS
        and wav.sample_width == constants.AUDIO_FORMAT_WIDTH
    )
    if matches_target:
        return wav.pcm_data

    logger.debug(
        "WAV format mismatch (rate=%s, channels=%s, width=%s); converting",
        wav.sample_rate,
        wav.num_channels,
        wav.sample_width,
    )
    # Keep a .wav filename so the converter identifies the container.
    if source_filename and source_filename.lower().endswith(".wav"):
        name = source_filename
    else:
        name = "audio.wav"
    return _convert_audio_to_pcm(audio, name)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# --- Subprocess worker functions (run in isolated process) ---
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _load_model_in_subprocess(model_name: str) -> None:
    """Warm the MLX model inside the worker process.

    Runs once when the executor starts; ``ModelHolder.get_model`` keeps
    the model available for later transcriptions in this worker.
    """
    import mlx.core as mx  # noqa: PLC0415
    from mlx_whisper.transcribe import ModelHolder  # noqa: PLC0415

    set_process_title("whisper-mlx")
    ModelHolder.get_model(model_name, mx.float16)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _transcribe_in_subprocess(
    model_name: str,
    audio_bytes: bytes,
    audio_shape: tuple[int, ...],
    audio_dtype: str,
    kwargs: dict[str, Any],
) -> dict[str, Any]:
    """Run one transcription in the worker; the model stays resident."""
    import mlx_whisper  # noqa: PLC0415
    import numpy as np  # noqa: PLC0415

    # Reassemble the ndarray that the parent flattened to bytes for IPC.
    samples = np.frombuffer(audio_bytes, dtype=audio_dtype).reshape(audio_shape)
    output = mlx_whisper.transcribe(samples, path_or_hf_repo=model_name, **kwargs)

    return {
        "text": output.get("text", ""),
        "language": output.get("language", "en"),
        "segments": output.get("segments", []),
    }
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
class MLXWhisperBackend:
    """Whisper backend using mlx-whisper for Apple Silicon.

    Uses subprocess isolation: when unloaded, the subprocess terminates
    and the OS reclaims ALL memory (Python's pymalloc doesn't return
    freed memory to OS otherwise).
    """

    def __init__(self, config: BackendConfig) -> None:
        """Initialize the backend (model is not loaded yet)."""
        self._config = config
        # Canonical name resolved to an mlx-community HF repo.
        self._resolved_model = _resolve_mlx_model_name(config.model_name)
        # Single-worker pool hosting the model subprocess; None when unloaded.
        self._executor: ProcessPoolExecutor | None = None

    @property
    def is_loaded(self) -> bool:
        """Check if the model is loaded."""
        return self._executor is not None

    @property
    def device(self) -> str | None:
        """Get the device - always 'mps' (Metal) for MLX."""
        return "mps" if self._executor is not None else None

    async def load(self) -> float:
        """Start subprocess, load the model, and return the load time in seconds.

        If loading fails, the freshly spawned executor is shut down before
        re-raising, so a failed load never leaks the worker subprocess or
        leaves ``is_loaded`` reporting True.
        """
        import time  # noqa: PLC0415

        logger.debug(
            "Starting MLX subprocess for model %s (resolved: %s)",
            self._config.model_name,
            self._resolved_model,
        )

        start_time = time.time()

        # Subprocess isolation: spawn context for clean state
        ctx = get_context("spawn")
        self._executor = ProcessPoolExecutor(max_workers=1, mp_context=ctx)

        loop = asyncio.get_running_loop()
        try:
            await loop.run_in_executor(
                self._executor,
                _load_model_in_subprocess,
                self._resolved_model,
            )
        except BaseException:
            # Fix: don't leak the subprocess (or report loaded) on failure.
            self._executor.shutdown(wait=False, cancel_futures=True)
            self._executor = None
            raise

        load_duration = time.time() - start_time
        logger.info(
            "Model %s loaded in subprocess in %.2fs",
            self._config.model_name,
            load_duration,
        )
        return load_duration

    async def unload(self) -> None:
        """Shutdown subprocess, releasing ALL memory."""
        if self._executor is None:
            return
        logger.debug("Shutting down MLX subprocess for model %s", self._resolved_model)
        self._executor.shutdown(wait=False, cancel_futures=True)
        self._executor = None
        logger.info("Model %s unloaded (subprocess terminated)", self._config.model_name)

    async def transcribe(
        self,
        audio: bytes,
        *,
        source_filename: str | None = None,
        language: str | None = None,
        task: Literal["transcribe", "translate"] = "transcribe",
        initial_prompt: str | None = None,
        temperature: float = 0.0,
        vad_filter: bool = True,  # noqa: ARG002 - not supported by mlx-whisper
        word_timestamps: bool = False,
    ) -> TranscriptionResult:
        """Transcribe audio using mlx-whisper in subprocess.

        Raises:
            RuntimeError: if the model has not been loaded via ``load()``.
            InvalidAudioError: if the audio cannot be converted to PCM.
        """
        if self._executor is None:
            msg = "Model not loaded. Call load() first."
            raise RuntimeError(msg)

        pcm_data = _prepare_audio_pcm(audio, source_filename)
        audio_array = _pcm_to_float(pcm_data)

        kwargs: dict[str, Any] = {
            "temperature": temperature,
            "word_timestamps": word_timestamps,
        }
        if language:
            kwargs["language"] = language
        if task == "translate":
            kwargs["task"] = "translate"
        if initial_prompt:
            kwargs["initial_prompt"] = initial_prompt

        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(
            self._executor,
            _transcribe_in_subprocess,
            self._resolved_model,
            audio_array.tobytes(),
            audio_array.shape,
            str(audio_array.dtype),
            kwargs,
        )

        text = result.get("text", "").strip()
        detected_language = result.get("language", "en")
        # mlx-whisper does not report a probability; use a fixed heuristic.
        language_probability = 1.0 if language else 0.95
        segments = result.get("segments", [])
        # Fallback duration assumes 16 kHz 16-bit mono PCM (32000 bytes/s).
        duration = segments[-1].get("end", 0.0) if segments else len(pcm_data) / 32000.0

        return TranscriptionResult(
            text=text,
            language=detected_language,
            language_probability=language_probability,
            duration=duration,
            segments=[
                {
                    "id": i,
                    "start": seg.get("start", 0.0),
                    "end": seg.get("end", 0.0),
                    "text": seg.get("text", ""),
                    "tokens": seg.get("tokens", []),
                    "avg_logprob": seg.get("avg_logprob", 0.0),
                    "no_speech_prob": seg.get("no_speech_prob", 0.0),
                }
                for i, seg in enumerate(segments)
            ],
        )
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""Language codes supported by Whisper ASR models.
|
|
2
|
+
|
|
3
|
+
This list is derived from the OpenAI Whisper model's supported languages.
|
|
4
|
+
Source: https://github.com/openai/whisper/blob/main/whisper/tokenizer.py
|
|
5
|
+
|
|
6
|
+
The codes are ISO 639-1 (2-letter) or ISO 639-2 (3-letter) codes where
|
|
7
|
+
2-letter codes are not available.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
# Language codes supported by Whisper models
|
|
13
|
+
# fmt: off
|
|
14
|
+
WHISPER_LANGUAGE_CODES: list[str] = [
    "af",  # Afrikaans
    "am",  # Amharic
    "ar",  # Arabic
    "as",  # Assamese
    "az",  # Azerbaijani
    "ba",  # Bashkir
    "be",  # Belarusian
    "bg",  # Bulgarian
    "bn",  # Bengali
    "bo",  # Tibetan
    "br",  # Breton
    "bs",  # Bosnian
    "ca",  # Catalan
    "cs",  # Czech
    "cy",  # Welsh
    "da",  # Danish
    "de",  # German
    "el",  # Greek
    "en",  # English
    "es",  # Spanish
    "et",  # Estonian
    "eu",  # Basque
    "fa",  # Persian
    "fi",  # Finnish
    "fo",  # Faroese
    "fr",  # French
    "gl",  # Galician
    "gu",  # Gujarati
    "ha",  # Hausa
    "haw",  # Hawaiian
    "he",  # Hebrew
    "hi",  # Hindi
    "hr",  # Croatian
    "ht",  # Haitian Creole
    "hu",  # Hungarian
    "hy",  # Armenian
    "id",  # Indonesian
    "is",  # Icelandic
    "it",  # Italian
    "ja",  # Japanese
    "jw",  # Javanese
    "ka",  # Georgian
    "kk",  # Kazakh
    "km",  # Khmer
    "kn",  # Kannada
    "ko",  # Korean
    "la",  # Latin
    "lb",  # Luxembourgish
    "ln",  # Lingala
    "lo",  # Lao
    "lt",  # Lithuanian
    "lv",  # Latvian
    "mg",  # Malagasy
    "mi",  # Maori
    "mk",  # Macedonian
    "ml",  # Malayalam
    "mn",  # Mongolian
    "mr",  # Marathi
    "ms",  # Malay
    "mt",  # Maltese
    "my",  # Myanmar (Burmese)
    "ne",  # Nepali
    "nl",  # Dutch
    "nn",  # Norwegian Nynorsk
    "no",  # Norwegian
    "oc",  # Occitan
    "pa",  # Punjabi
    "pl",  # Polish
    "ps",  # Pashto
    "pt",  # Portuguese
    "ro",  # Romanian
    "ru",  # Russian
    "sa",  # Sanskrit
    "sd",  # Sindhi
    "si",  # Sinhala
    "sk",  # Slovak
    "sl",  # Slovenian
    "sn",  # Shona
    "so",  # Somali
    "sq",  # Albanian
    "sr",  # Serbian
    "su",  # Sundanese
    "sv",  # Swedish
    "sw",  # Swahili
    "ta",  # Tamil
    "te",  # Telugu
    "tg",  # Tajik
    "th",  # Thai
    "tk",  # Turkmen
    "tl",  # Tagalog
    "tr",  # Turkish
    "tt",  # Tatar
    "uk",  # Ukrainian
    "ur",  # Urdu
    "uz",  # Uzbek
    "vi",  # Vietnamese
    "yi",  # Yiddish
    "yo",  # Yoruba
    "zh",  # Chinese
    # NOTE(review): "yue" breaks the otherwise-alphabetical ordering —
    # presumably appended after the rest; confirm nothing relies on order
    # before re-sorting.
    "yue",  # Cantonese
]
# fmt: on
|