agent-cli 0.70.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196) hide show
  1. agent_cli/__init__.py +5 -0
  2. agent_cli/__main__.py +6 -0
  3. agent_cli/_extras.json +14 -0
  4. agent_cli/_requirements/.gitkeep +0 -0
  5. agent_cli/_requirements/audio.txt +79 -0
  6. agent_cli/_requirements/faster-whisper.txt +215 -0
  7. agent_cli/_requirements/kokoro.txt +425 -0
  8. agent_cli/_requirements/llm.txt +183 -0
  9. agent_cli/_requirements/memory.txt +355 -0
  10. agent_cli/_requirements/mlx-whisper.txt +222 -0
  11. agent_cli/_requirements/piper.txt +176 -0
  12. agent_cli/_requirements/rag.txt +402 -0
  13. agent_cli/_requirements/server.txt +154 -0
  14. agent_cli/_requirements/speed.txt +77 -0
  15. agent_cli/_requirements/vad.txt +155 -0
  16. agent_cli/_requirements/wyoming.txt +71 -0
  17. agent_cli/_tools.py +368 -0
  18. agent_cli/agents/__init__.py +23 -0
  19. agent_cli/agents/_voice_agent_common.py +136 -0
  20. agent_cli/agents/assistant.py +383 -0
  21. agent_cli/agents/autocorrect.py +284 -0
  22. agent_cli/agents/chat.py +496 -0
  23. agent_cli/agents/memory/__init__.py +31 -0
  24. agent_cli/agents/memory/add.py +190 -0
  25. agent_cli/agents/memory/proxy.py +160 -0
  26. agent_cli/agents/rag_proxy.py +128 -0
  27. agent_cli/agents/speak.py +209 -0
  28. agent_cli/agents/transcribe.py +671 -0
  29. agent_cli/agents/transcribe_daemon.py +499 -0
  30. agent_cli/agents/voice_edit.py +291 -0
  31. agent_cli/api.py +22 -0
  32. agent_cli/cli.py +106 -0
  33. agent_cli/config.py +503 -0
  34. agent_cli/config_cmd.py +307 -0
  35. agent_cli/constants.py +27 -0
  36. agent_cli/core/__init__.py +1 -0
  37. agent_cli/core/audio.py +461 -0
  38. agent_cli/core/audio_format.py +299 -0
  39. agent_cli/core/chroma.py +88 -0
  40. agent_cli/core/deps.py +191 -0
  41. agent_cli/core/openai_proxy.py +139 -0
  42. agent_cli/core/process.py +195 -0
  43. agent_cli/core/reranker.py +120 -0
  44. agent_cli/core/sse.py +87 -0
  45. agent_cli/core/transcription_logger.py +70 -0
  46. agent_cli/core/utils.py +526 -0
  47. agent_cli/core/vad.py +175 -0
  48. agent_cli/core/watch.py +65 -0
  49. agent_cli/dev/__init__.py +14 -0
  50. agent_cli/dev/cli.py +1588 -0
  51. agent_cli/dev/coding_agents/__init__.py +19 -0
  52. agent_cli/dev/coding_agents/aider.py +24 -0
  53. agent_cli/dev/coding_agents/base.py +167 -0
  54. agent_cli/dev/coding_agents/claude.py +39 -0
  55. agent_cli/dev/coding_agents/codex.py +24 -0
  56. agent_cli/dev/coding_agents/continue_dev.py +15 -0
  57. agent_cli/dev/coding_agents/copilot.py +24 -0
  58. agent_cli/dev/coding_agents/cursor_agent.py +48 -0
  59. agent_cli/dev/coding_agents/gemini.py +28 -0
  60. agent_cli/dev/coding_agents/opencode.py +15 -0
  61. agent_cli/dev/coding_agents/registry.py +49 -0
  62. agent_cli/dev/editors/__init__.py +19 -0
  63. agent_cli/dev/editors/base.py +89 -0
  64. agent_cli/dev/editors/cursor.py +15 -0
  65. agent_cli/dev/editors/emacs.py +46 -0
  66. agent_cli/dev/editors/jetbrains.py +56 -0
  67. agent_cli/dev/editors/nano.py +31 -0
  68. agent_cli/dev/editors/neovim.py +33 -0
  69. agent_cli/dev/editors/registry.py +59 -0
  70. agent_cli/dev/editors/sublime.py +20 -0
  71. agent_cli/dev/editors/vim.py +42 -0
  72. agent_cli/dev/editors/vscode.py +15 -0
  73. agent_cli/dev/editors/zed.py +20 -0
  74. agent_cli/dev/project.py +568 -0
  75. agent_cli/dev/registry.py +52 -0
  76. agent_cli/dev/skill/SKILL.md +141 -0
  77. agent_cli/dev/skill/examples.md +571 -0
  78. agent_cli/dev/terminals/__init__.py +19 -0
  79. agent_cli/dev/terminals/apple_terminal.py +82 -0
  80. agent_cli/dev/terminals/base.py +56 -0
  81. agent_cli/dev/terminals/gnome.py +51 -0
  82. agent_cli/dev/terminals/iterm2.py +84 -0
  83. agent_cli/dev/terminals/kitty.py +77 -0
  84. agent_cli/dev/terminals/registry.py +48 -0
  85. agent_cli/dev/terminals/tmux.py +58 -0
  86. agent_cli/dev/terminals/warp.py +132 -0
  87. agent_cli/dev/terminals/zellij.py +78 -0
  88. agent_cli/dev/worktree.py +856 -0
  89. agent_cli/docs_gen.py +417 -0
  90. agent_cli/example-config.toml +185 -0
  91. agent_cli/install/__init__.py +5 -0
  92. agent_cli/install/common.py +89 -0
  93. agent_cli/install/extras.py +174 -0
  94. agent_cli/install/hotkeys.py +48 -0
  95. agent_cli/install/services.py +87 -0
  96. agent_cli/memory/__init__.py +7 -0
  97. agent_cli/memory/_files.py +250 -0
  98. agent_cli/memory/_filters.py +63 -0
  99. agent_cli/memory/_git.py +157 -0
  100. agent_cli/memory/_indexer.py +142 -0
  101. agent_cli/memory/_ingest.py +408 -0
  102. agent_cli/memory/_persistence.py +182 -0
  103. agent_cli/memory/_prompt.py +91 -0
  104. agent_cli/memory/_retrieval.py +294 -0
  105. agent_cli/memory/_store.py +169 -0
  106. agent_cli/memory/_streaming.py +44 -0
  107. agent_cli/memory/_tasks.py +48 -0
  108. agent_cli/memory/api.py +113 -0
  109. agent_cli/memory/client.py +272 -0
  110. agent_cli/memory/engine.py +361 -0
  111. agent_cli/memory/entities.py +43 -0
  112. agent_cli/memory/models.py +112 -0
  113. agent_cli/opts.py +433 -0
  114. agent_cli/py.typed +0 -0
  115. agent_cli/rag/__init__.py +3 -0
  116. agent_cli/rag/_indexer.py +67 -0
  117. agent_cli/rag/_indexing.py +226 -0
  118. agent_cli/rag/_prompt.py +30 -0
  119. agent_cli/rag/_retriever.py +156 -0
  120. agent_cli/rag/_store.py +48 -0
  121. agent_cli/rag/_utils.py +218 -0
  122. agent_cli/rag/api.py +175 -0
  123. agent_cli/rag/client.py +299 -0
  124. agent_cli/rag/engine.py +302 -0
  125. agent_cli/rag/models.py +55 -0
  126. agent_cli/scripts/.runtime/.gitkeep +0 -0
  127. agent_cli/scripts/__init__.py +1 -0
  128. agent_cli/scripts/check_plugin_skill_sync.py +50 -0
  129. agent_cli/scripts/linux-hotkeys/README.md +63 -0
  130. agent_cli/scripts/linux-hotkeys/toggle-autocorrect.sh +45 -0
  131. agent_cli/scripts/linux-hotkeys/toggle-transcription.sh +58 -0
  132. agent_cli/scripts/linux-hotkeys/toggle-voice-edit.sh +58 -0
  133. agent_cli/scripts/macos-hotkeys/README.md +45 -0
  134. agent_cli/scripts/macos-hotkeys/skhd-config-example +5 -0
  135. agent_cli/scripts/macos-hotkeys/toggle-autocorrect.sh +12 -0
  136. agent_cli/scripts/macos-hotkeys/toggle-transcription.sh +37 -0
  137. agent_cli/scripts/macos-hotkeys/toggle-voice-edit.sh +37 -0
  138. agent_cli/scripts/nvidia-asr-server/README.md +99 -0
  139. agent_cli/scripts/nvidia-asr-server/pyproject.toml +27 -0
  140. agent_cli/scripts/nvidia-asr-server/server.py +255 -0
  141. agent_cli/scripts/nvidia-asr-server/shell.nix +32 -0
  142. agent_cli/scripts/nvidia-asr-server/uv.lock +4654 -0
  143. agent_cli/scripts/run-openwakeword.sh +11 -0
  144. agent_cli/scripts/run-piper-windows.ps1 +30 -0
  145. agent_cli/scripts/run-piper.sh +24 -0
  146. agent_cli/scripts/run-whisper-linux.sh +40 -0
  147. agent_cli/scripts/run-whisper-macos.sh +6 -0
  148. agent_cli/scripts/run-whisper-windows.ps1 +51 -0
  149. agent_cli/scripts/run-whisper.sh +9 -0
  150. agent_cli/scripts/run_faster_whisper_server.py +136 -0
  151. agent_cli/scripts/setup-linux-hotkeys.sh +72 -0
  152. agent_cli/scripts/setup-linux.sh +108 -0
  153. agent_cli/scripts/setup-macos-hotkeys.sh +61 -0
  154. agent_cli/scripts/setup-macos.sh +76 -0
  155. agent_cli/scripts/setup-windows.ps1 +63 -0
  156. agent_cli/scripts/start-all-services-windows.ps1 +53 -0
  157. agent_cli/scripts/start-all-services.sh +178 -0
  158. agent_cli/scripts/sync_extras.py +138 -0
  159. agent_cli/server/__init__.py +3 -0
  160. agent_cli/server/cli.py +721 -0
  161. agent_cli/server/common.py +222 -0
  162. agent_cli/server/model_manager.py +288 -0
  163. agent_cli/server/model_registry.py +225 -0
  164. agent_cli/server/proxy/__init__.py +3 -0
  165. agent_cli/server/proxy/api.py +444 -0
  166. agent_cli/server/streaming.py +67 -0
  167. agent_cli/server/tts/__init__.py +3 -0
  168. agent_cli/server/tts/api.py +335 -0
  169. agent_cli/server/tts/backends/__init__.py +82 -0
  170. agent_cli/server/tts/backends/base.py +139 -0
  171. agent_cli/server/tts/backends/kokoro.py +403 -0
  172. agent_cli/server/tts/backends/piper.py +253 -0
  173. agent_cli/server/tts/model_manager.py +201 -0
  174. agent_cli/server/tts/model_registry.py +28 -0
  175. agent_cli/server/tts/wyoming_handler.py +249 -0
  176. agent_cli/server/whisper/__init__.py +3 -0
  177. agent_cli/server/whisper/api.py +413 -0
  178. agent_cli/server/whisper/backends/__init__.py +89 -0
  179. agent_cli/server/whisper/backends/base.py +97 -0
  180. agent_cli/server/whisper/backends/faster_whisper.py +225 -0
  181. agent_cli/server/whisper/backends/mlx.py +270 -0
  182. agent_cli/server/whisper/languages.py +116 -0
  183. agent_cli/server/whisper/model_manager.py +157 -0
  184. agent_cli/server/whisper/model_registry.py +28 -0
  185. agent_cli/server/whisper/wyoming_handler.py +203 -0
  186. agent_cli/services/__init__.py +343 -0
  187. agent_cli/services/_wyoming_utils.py +64 -0
  188. agent_cli/services/asr.py +506 -0
  189. agent_cli/services/llm.py +228 -0
  190. agent_cli/services/tts.py +450 -0
  191. agent_cli/services/wake_word.py +142 -0
  192. agent_cli-0.70.5.dist-info/METADATA +2118 -0
  193. agent_cli-0.70.5.dist-info/RECORD +196 -0
  194. agent_cli-0.70.5.dist-info/WHEEL +4 -0
  195. agent_cli-0.70.5.dist-info/entry_points.txt +4 -0
  196. agent_cli-0.70.5.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,225 @@
1
+ """Faster-whisper backend for Linux/CUDA systems."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import logging
7
+ import tempfile
8
+ from concurrent.futures import ProcessPoolExecutor
9
+ from dataclasses import dataclass
10
+ from multiprocessing import get_context
11
+ from pathlib import Path
12
+ from typing import Any, Literal
13
+
14
+ from agent_cli.core.process import set_process_title
15
+ from agent_cli.server.whisper.backends.base import (
16
+ BackendConfig,
17
+ TranscriptionResult,
18
+ )
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
# --- Subprocess state (only used within subprocess worker) ---
# This state persists across function calls within the subprocess because:
# 1. Model loading is expensive and must be reused across transcription calls
# 2. CTranslate2 models cannot be pickled/passed through IPC queues
# 3. The subprocess is long-lived (ProcessPoolExecutor reuses workers)


@dataclass
class _SubprocessState:
    """Container for subprocess-local state. Not shared with main process."""

    # Loaded faster_whisper.WhisperModel instance. Typed Any so the heavy
    # dependency is only imported inside the worker subprocess.
    model: Any = None
    # Device string reported by the loaded model (e.g. "cpu" or "cuda").
    device: str | None = None


# Module-level singleton; each spawned worker process gets its own copy.
_state = _SubprocessState()
39
+
40
+
41
+ # --- Subprocess worker functions (run in isolated process) ---
42
+
43
+
44
def _load_model_in_subprocess(
    model_name: str,
    device: str,
    compute_type: str,
    cpu_threads: int,
    download_root: str | None,
) -> str:
    """Load model in subprocess. Returns actual device string."""
    from faster_whisper import WhisperModel  # noqa: PLC0415

    set_process_title("whisper-faster")
    loaded = WhisperModel(
        model_name,
        device=device,
        compute_type=compute_type,
        cpu_threads=cpu_threads,
        download_root=download_root,
    )

    # Cache in subprocess-local state so later transcription calls in this
    # worker reuse the already-loaded model.
    _state.model = loaded
    _state.device = str(loaded.model.device)
    return _state.device
68
+
69
+
70
def _transcribe_in_subprocess(
    audio_bytes: bytes,
    kwargs: dict[str, Any],
) -> dict[str, Any]:
    """Run transcription in subprocess. Reuses model from _state.

    Args:
        audio_bytes: Encoded audio (e.g. a WAV container) as raw bytes.
        kwargs: Keyword arguments forwarded to ``WhisperModel.transcribe``.

    Returns:
        A plain-dict payload (picklable for IPC) with the joined text,
        detected language info, duration, and per-segment details.

    Raises:
        RuntimeError: If the model has not been loaded in this subprocess.
    """
    import io  # noqa: PLC0415

    if _state.model is None:
        msg = "Model not loaded in subprocess. Call _load_model_in_subprocess first."
        raise RuntimeError(msg)

    # faster-whisper accepts a binary file-like object directly, so decode
    # from memory instead of round-tripping through a temp file on disk.
    segments, info = _state.model.transcribe(io.BytesIO(audio_bytes), **kwargs)
    segment_list = list(segments)  # Consume lazy generator

    return {
        "text": " ".join(seg.text.strip() for seg in segment_list),
        "language": info.language,
        "language_probability": info.language_probability,
        "duration": info.duration,
        "segments": [
            {
                "id": seg.id,
                "start": seg.start,
                "end": seg.end,
                "text": seg.text,
                "tokens": seg.tokens,
                "avg_logprob": seg.avg_logprob,
                "no_speech_prob": seg.no_speech_prob,
            }
            for seg in segment_list
        ],
    }
108
+
109
+
110
class FasterWhisperBackend:
    """Whisper backend using faster-whisper (CTranslate2).

    Uses subprocess isolation: when unloaded, the subprocess terminates
    and the OS reclaims ALL memory (Python's pymalloc doesn't return
    freed memory to OS otherwise).
    """

    def __init__(self, config: BackendConfig) -> None:
        """Initialize the backend."""
        self._config = config
        self._executor: ProcessPoolExecutor | None = None
        self._device: str | None = None

    @property
    def is_loaded(self) -> bool:
        """Check if the model is loaded."""
        return self._executor is not None

    @property
    def device(self) -> str | None:
        """Get the device the model is on."""
        return self._device

    async def load(self) -> float:
        """Start subprocess and load model."""
        import time  # noqa: PLC0415

        cfg = self._config
        logger.debug(
            "Starting faster-whisper subprocess for model %s (device=%s, compute_type=%s)",
            cfg.model_name,
            cfg.device,
            cfg.compute_type,
        )
        t0 = time.time()

        # Subprocess isolation: spawn context for clean state
        self._executor = ProcessPoolExecutor(
            max_workers=1,
            mp_context=get_context("spawn"),
        )

        self._device = await asyncio.get_running_loop().run_in_executor(
            self._executor,
            _load_model_in_subprocess,
            cfg.model_name,
            cfg.device,
            cfg.compute_type,
            cfg.cpu_threads,
            str(cfg.cache_dir) if cfg.cache_dir else None,
        )

        elapsed = time.time() - t0
        logger.info(
            "Model %s loaded on %s in %.2fs",
            cfg.model_name,
            self._device,
            elapsed,
        )
        return elapsed

    async def unload(self) -> None:
        """Shutdown subprocess, releasing ALL memory."""
        executor = self._executor
        if executor is None:
            return
        logger.debug(
            "Shutting down faster-whisper subprocess for model %s",
            self._config.model_name,
        )
        executor.shutdown(wait=False, cancel_futures=True)
        self._executor = None
        self._device = None
        logger.info("Model %s unloaded (subprocess terminated)", self._config.model_name)

    async def transcribe(
        self,
        audio: bytes,
        *,
        source_filename: str | None = None,  # noqa: ARG002
        language: str | None = None,
        task: Literal["transcribe", "translate"] = "transcribe",
        initial_prompt: str | None = None,
        temperature: float = 0.0,
        vad_filter: bool = True,
        word_timestamps: bool = False,
    ) -> TranscriptionResult:
        """Transcribe audio using faster-whisper in subprocess."""
        if self._executor is None:
            msg = "Model not loaded. Call load() first."
            raise RuntimeError(msg)

        options: dict[str, Any] = {
            "language": language,
            "task": task,
            "initial_prompt": initial_prompt,
            "temperature": temperature,
            "vad_filter": vad_filter,
            "word_timestamps": word_timestamps,
        }

        payload = await asyncio.get_running_loop().run_in_executor(
            self._executor,
            _transcribe_in_subprocess,
            audio,
            options,
        )

        # Subprocess payload keys match TranscriptionResult fields 1:1.
        return TranscriptionResult(**payload)
@@ -0,0 +1,270 @@
1
+ """MLX Whisper backend for macOS Apple Silicon."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import logging
7
+ import wave
8
+ from concurrent.futures import ProcessPoolExecutor
9
+ from multiprocessing import get_context
10
+ from typing import TYPE_CHECKING, Any, Literal
11
+
12
+ from agent_cli import constants
13
+ from agent_cli.core.audio_format import (
14
+ convert_audio_to_wyoming_format,
15
+ extract_pcm_from_wav,
16
+ )
17
+ from agent_cli.core.process import set_process_title
18
+ from agent_cli.server.whisper.backends.base import (
19
+ BackendConfig,
20
+ InvalidAudioError,
21
+ TranscriptionResult,
22
+ )
23
+
24
+ if TYPE_CHECKING:
25
+ import numpy as np
26
+ from numpy.typing import NDArray
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+ # MLX model name mapping: canonical name -> HuggingFace repo
31
+ _MLX_MODEL_MAP: dict[str, str] = {
32
+ "tiny": "mlx-community/whisper-tiny",
33
+ "small": "mlx-community/whisper-small-mlx",
34
+ "medium": "mlx-community/whisper-medium-mlx",
35
+ "large": "mlx-community/whisper-large-v3-mlx",
36
+ "large-v2": "mlx-community/whisper-large-v2-mlx",
37
+ "large-v3": "mlx-community/whisper-large-v3-mlx",
38
+ "large-v3-turbo": "mlx-community/whisper-large-v3-turbo",
39
+ "turbo": "mlx-community/whisper-large-v3-turbo",
40
+ "large-v3-turbo-q4": "mlx-community/whisper-large-v3-turbo-q4",
41
+ }
42
+
43
+
44
+ def _resolve_mlx_model_name(model_name: str) -> str:
45
+ """Resolve a model name to an MLX HuggingFace repo."""
46
+ if model_name.startswith("mlx-community/"):
47
+ return model_name
48
+ if model_name in _MLX_MODEL_MAP:
49
+ return _MLX_MODEL_MAP[model_name]
50
+ for prefix in ("whisper-", "openai/whisper-"):
51
+ if model_name.startswith(prefix):
52
+ stripped = model_name[len(prefix) :]
53
+ if stripped in _MLX_MODEL_MAP:
54
+ return _MLX_MODEL_MAP[stripped]
55
+ return model_name
56
+
57
+
58
+ def _pcm_to_float(audio_bytes: bytes) -> NDArray[np.float32]:
59
+ """Convert 16-bit PCM audio bytes to float32 array normalized to [-1, 1]."""
60
+ import numpy as np # noqa: PLC0415
61
+
62
+ return np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
63
+
64
+
65
def _convert_audio_to_pcm(audio_bytes: bytes, source_filename: str | None) -> bytes:
    """Convert audio bytes to raw PCM using FFmpeg."""
    try:
        # FFmpeg uses the filename extension as a format hint.
        return convert_audio_to_wyoming_format(audio_bytes, source_filename or "audio")
    except RuntimeError as exc:
        logger.warning("FFmpeg conversion failed for MLX Whisper: %s", exc)
        msg = (
            "Unsupported audio format for MLX Whisper. "
            "Provide a 16kHz mono 16-bit WAV file or install ffmpeg to convert uploads."
        )
        raise InvalidAudioError(msg) from exc
77
+
78
+
79
def _prepare_audio_pcm(audio: bytes, source_filename: str | None) -> bytes:
    """Extract PCM from WAV or convert with FFmpeg if needed."""
    try:
        wav = extract_pcm_from_wav(audio)
    except (wave.Error, EOFError) as exc:
        # Not a parseable WAV container at all - hand the bytes to FFmpeg.
        logger.debug("WAV parsing failed (%s); converting with FFmpeg", exc)
        return _convert_audio_to_pcm(audio, source_filename)

    matches_target = (
        wav.sample_rate == constants.AUDIO_RATE
        and wav.num_channels == constants.AUDIO_CHANNELS
        and wav.sample_width == constants.AUDIO_FORMAT_WIDTH
    )
    if matches_target:
        return wav.pcm_data

    logger.debug(
        "WAV format mismatch (rate=%s, channels=%s, width=%s); converting",
        wav.sample_rate,
        wav.num_channels,
        wav.sample_width,
    )
    # Keep the caller's .wav name for FFmpeg's format detection; otherwise
    # supply a generic .wav name since we know it parsed as WAV.
    if source_filename and source_filename.lower().endswith(".wav"):
        name = source_filename
    else:
        name = "audio.wav"
    return _convert_audio_to_pcm(audio, name)
106
+
107
+
108
+ # --- Subprocess worker functions (run in isolated process) ---
109
+
110
+
111
def _load_model_in_subprocess(model_name: str) -> None:
    """Load model in subprocess. Called once when executor starts.

    Populates mlx_whisper's process-level ModelHolder cache so later
    transcription calls in this worker reuse the loaded weights.

    Args:
        model_name: HuggingFace repo or local path of the MLX model.
    """
    import mlx.core as mx  # noqa: PLC0415
    from mlx_whisper.transcribe import ModelHolder  # noqa: PLC0415

    set_process_title("whisper-mlx")
    # Request float16 weights; ModelHolder caches by (name, dtype).
    ModelHolder.get_model(model_name, mx.float16)
118
+
119
+
120
def _transcribe_in_subprocess(
    model_name: str,
    audio_bytes: bytes,
    audio_shape: tuple[int, ...],
    audio_dtype: str,
    kwargs: dict[str, Any],
) -> dict[str, Any]:
    """Run transcription in subprocess. Model stays loaded between calls."""
    import mlx_whisper  # noqa: PLC0415
    import numpy as np  # noqa: PLC0415

    # Rebuild the ndarray from the raw bytes shipped over IPC.
    samples = np.frombuffer(audio_bytes, dtype=audio_dtype).reshape(audio_shape)
    output = mlx_whisper.transcribe(samples, path_or_hf_repo=model_name, **kwargs)

    # Return only plain picklable types to the parent process.
    return {
        "text": output.get("text", ""),
        "language": output.get("language", "en"),
        "segments": output.get("segments", []),
    }
139
+
140
+
141
class MLXWhisperBackend:
    """Whisper backend using mlx-whisper for Apple Silicon.

    Uses subprocess isolation: when unloaded, the subprocess terminates
    and the OS reclaims ALL memory (Python's pymalloc doesn't return
    freed memory to OS otherwise).
    """

    def __init__(self, config: BackendConfig) -> None:
        """Initialize the backend."""
        self._config = config
        self._resolved_model = _resolve_mlx_model_name(config.model_name)
        self._executor: ProcessPoolExecutor | None = None

    @property
    def is_loaded(self) -> bool:
        """Check if the model is loaded."""
        return self._executor is not None

    @property
    def device(self) -> str | None:
        """Get the device - always 'mps' (Metal) for MLX."""
        if self._executor is None:
            return None
        return "mps"

    async def load(self) -> float:
        """Start subprocess and load model."""
        import time  # noqa: PLC0415

        logger.debug(
            "Starting MLX subprocess for model %s (resolved: %s)",
            self._config.model_name,
            self._resolved_model,
        )
        t0 = time.time()

        # Subprocess isolation: spawn context for clean state
        self._executor = ProcessPoolExecutor(
            max_workers=1,
            mp_context=get_context("spawn"),
        )
        await asyncio.get_running_loop().run_in_executor(
            self._executor,
            _load_model_in_subprocess,
            self._resolved_model,
        )

        elapsed = time.time() - t0
        logger.info(
            "Model %s loaded in subprocess in %.2fs",
            self._config.model_name,
            elapsed,
        )
        return elapsed

    async def unload(self) -> None:
        """Shutdown subprocess, releasing ALL memory."""
        executor = self._executor
        if executor is None:
            return
        logger.debug("Shutting down MLX subprocess for model %s", self._resolved_model)
        executor.shutdown(wait=False, cancel_futures=True)
        self._executor = None
        logger.info("Model %s unloaded (subprocess terminated)", self._config.model_name)

    async def transcribe(
        self,
        audio: bytes,
        *,
        source_filename: str | None = None,
        language: str | None = None,
        task: Literal["transcribe", "translate"] = "transcribe",
        initial_prompt: str | None = None,
        temperature: float = 0.0,
        vad_filter: bool = True,  # noqa: ARG002 - not supported by mlx-whisper
        word_timestamps: bool = False,
    ) -> TranscriptionResult:
        """Transcribe audio using mlx-whisper in subprocess."""
        if self._executor is None:
            msg = "Model not loaded. Call load() first."
            raise RuntimeError(msg)

        pcm_data = _prepare_audio_pcm(audio, source_filename)
        samples = _pcm_to_float(pcm_data)

        # Only forward options mlx-whisper understands; omit unset ones.
        options: dict[str, Any] = {
            "temperature": temperature,
            "word_timestamps": word_timestamps,
        }
        if language:
            options["language"] = language
        if task == "translate":
            options["task"] = "translate"
        if initial_prompt:
            options["initial_prompt"] = initial_prompt

        raw = await asyncio.get_running_loop().run_in_executor(
            self._executor,
            _transcribe_in_subprocess,
            self._resolved_model,
            samples.tobytes(),
            samples.shape,
            str(samples.dtype),
            options,
        )

        segments = raw.get("segments", [])
        if segments:
            duration = segments[-1].get("end", 0.0)
        else:
            # 16 kHz mono 16-bit PCM -> 32000 bytes per second of audio.
            duration = len(pcm_data) / 32000.0

        return TranscriptionResult(
            text=raw.get("text", "").strip(),
            language=raw.get("language", "en"),
            # mlx-whisper does not report a probability; use a heuristic.
            language_probability=1.0 if language else 0.95,
            duration=duration,
            segments=[
                {
                    "id": idx,
                    "start": seg.get("start", 0.0),
                    "end": seg.get("end", 0.0),
                    "text": seg.get("text", ""),
                    "tokens": seg.get("tokens", []),
                    "avg_logprob": seg.get("avg_logprob", 0.0),
                    "no_speech_prob": seg.get("no_speech_prob", 0.0),
                }
                for idx, seg in enumerate(segments)
            ],
        )
@@ -0,0 +1,116 @@
1
+ """Language codes supported by Whisper ASR models.
2
+
3
+ This list is derived from the OpenAI Whisper model's supported languages.
4
+ Source: https://github.com/openai/whisper/blob/main/whisper/tokenizer.py
5
+
6
+ The codes are ISO 639-1 (2-letter) or ISO 639-2 (3-letter) codes where
7
+ 2-letter codes are not available.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
# Language codes supported by Whisper models
# NOTE: codes mirror Whisper's tokenizer, including its nonstandard choices
# (e.g. "jw" for Javanese rather than ISO "jv").
# fmt: off
WHISPER_LANGUAGE_CODES: list[str] = [
    "af",  # Afrikaans
    "am",  # Amharic
    "ar",  # Arabic
    "as",  # Assamese
    "az",  # Azerbaijani
    "ba",  # Bashkir
    "be",  # Belarusian
    "bg",  # Bulgarian
    "bn",  # Bengali
    "bo",  # Tibetan
    "br",  # Breton
    "bs",  # Bosnian
    "ca",  # Catalan
    "cs",  # Czech
    "cy",  # Welsh
    "da",  # Danish
    "de",  # German
    "el",  # Greek
    "en",  # English
    "es",  # Spanish
    "et",  # Estonian
    "eu",  # Basque
    "fa",  # Persian
    "fi",  # Finnish
    "fo",  # Faroese
    "fr",  # French
    "gl",  # Galician
    "gu",  # Gujarati
    "ha",  # Hausa
    "haw",  # Hawaiian
    "he",  # Hebrew
    "hi",  # Hindi
    "hr",  # Croatian
    "ht",  # Haitian Creole
    "hu",  # Hungarian
    "hy",  # Armenian
    "id",  # Indonesian
    "is",  # Icelandic
    "it",  # Italian
    "ja",  # Japanese
    "jw",  # Javanese (Whisper's nonstandard code)
    "ka",  # Georgian
    "kk",  # Kazakh
    "km",  # Khmer
    "kn",  # Kannada
    "ko",  # Korean
    "la",  # Latin
    "lb",  # Luxembourgish
    "ln",  # Lingala
    "lo",  # Lao
    "lt",  # Lithuanian
    "lv",  # Latvian
    "mg",  # Malagasy
    "mi",  # Maori
    "mk",  # Macedonian
    "ml",  # Malayalam
    "mn",  # Mongolian
    "mr",  # Marathi
    "ms",  # Malay
    "mt",  # Maltese
    "my",  # Myanmar (Burmese)
    "ne",  # Nepali
    "nl",  # Dutch
    "nn",  # Norwegian Nynorsk
    "no",  # Norwegian
    "oc",  # Occitan
    "pa",  # Punjabi
    "pl",  # Polish
    "ps",  # Pashto
    "pt",  # Portuguese
    "ro",  # Romanian
    "ru",  # Russian
    "sa",  # Sanskrit
    "sd",  # Sindhi
    "si",  # Sinhala
    "sk",  # Slovak
    "sl",  # Slovenian
    "sn",  # Shona
    "so",  # Somali
    "sq",  # Albanian
    "sr",  # Serbian
    "su",  # Sundanese
    "sv",  # Swedish
    "sw",  # Swahili
    "ta",  # Tamil
    "te",  # Telugu
    "tg",  # Tajik
    "th",  # Thai
    "tk",  # Turkmen
    "tl",  # Tagalog
    "tr",  # Turkish
    "tt",  # Tatar
    "uk",  # Ukrainian
    "ur",  # Urdu
    "uz",  # Uzbek
    "vi",  # Vietnamese
    "yi",  # Yiddish
    "yo",  # Yoruba
    "zh",  # Chinese
    "yue",  # Cantonese
]
# fmt: on