agent-cli 0.70.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_cli/__init__.py +5 -0
- agent_cli/__main__.py +6 -0
- agent_cli/_extras.json +14 -0
- agent_cli/_requirements/.gitkeep +0 -0
- agent_cli/_requirements/audio.txt +79 -0
- agent_cli/_requirements/faster-whisper.txt +215 -0
- agent_cli/_requirements/kokoro.txt +425 -0
- agent_cli/_requirements/llm.txt +183 -0
- agent_cli/_requirements/memory.txt +355 -0
- agent_cli/_requirements/mlx-whisper.txt +222 -0
- agent_cli/_requirements/piper.txt +176 -0
- agent_cli/_requirements/rag.txt +402 -0
- agent_cli/_requirements/server.txt +154 -0
- agent_cli/_requirements/speed.txt +77 -0
- agent_cli/_requirements/vad.txt +155 -0
- agent_cli/_requirements/wyoming.txt +71 -0
- agent_cli/_tools.py +368 -0
- agent_cli/agents/__init__.py +23 -0
- agent_cli/agents/_voice_agent_common.py +136 -0
- agent_cli/agents/assistant.py +383 -0
- agent_cli/agents/autocorrect.py +284 -0
- agent_cli/agents/chat.py +496 -0
- agent_cli/agents/memory/__init__.py +31 -0
- agent_cli/agents/memory/add.py +190 -0
- agent_cli/agents/memory/proxy.py +160 -0
- agent_cli/agents/rag_proxy.py +128 -0
- agent_cli/agents/speak.py +209 -0
- agent_cli/agents/transcribe.py +671 -0
- agent_cli/agents/transcribe_daemon.py +499 -0
- agent_cli/agents/voice_edit.py +291 -0
- agent_cli/api.py +22 -0
- agent_cli/cli.py +106 -0
- agent_cli/config.py +503 -0
- agent_cli/config_cmd.py +307 -0
- agent_cli/constants.py +27 -0
- agent_cli/core/__init__.py +1 -0
- agent_cli/core/audio.py +461 -0
- agent_cli/core/audio_format.py +299 -0
- agent_cli/core/chroma.py +88 -0
- agent_cli/core/deps.py +191 -0
- agent_cli/core/openai_proxy.py +139 -0
- agent_cli/core/process.py +195 -0
- agent_cli/core/reranker.py +120 -0
- agent_cli/core/sse.py +87 -0
- agent_cli/core/transcription_logger.py +70 -0
- agent_cli/core/utils.py +526 -0
- agent_cli/core/vad.py +175 -0
- agent_cli/core/watch.py +65 -0
- agent_cli/dev/__init__.py +14 -0
- agent_cli/dev/cli.py +1588 -0
- agent_cli/dev/coding_agents/__init__.py +19 -0
- agent_cli/dev/coding_agents/aider.py +24 -0
- agent_cli/dev/coding_agents/base.py +167 -0
- agent_cli/dev/coding_agents/claude.py +39 -0
- agent_cli/dev/coding_agents/codex.py +24 -0
- agent_cli/dev/coding_agents/continue_dev.py +15 -0
- agent_cli/dev/coding_agents/copilot.py +24 -0
- agent_cli/dev/coding_agents/cursor_agent.py +48 -0
- agent_cli/dev/coding_agents/gemini.py +28 -0
- agent_cli/dev/coding_agents/opencode.py +15 -0
- agent_cli/dev/coding_agents/registry.py +49 -0
- agent_cli/dev/editors/__init__.py +19 -0
- agent_cli/dev/editors/base.py +89 -0
- agent_cli/dev/editors/cursor.py +15 -0
- agent_cli/dev/editors/emacs.py +46 -0
- agent_cli/dev/editors/jetbrains.py +56 -0
- agent_cli/dev/editors/nano.py +31 -0
- agent_cli/dev/editors/neovim.py +33 -0
- agent_cli/dev/editors/registry.py +59 -0
- agent_cli/dev/editors/sublime.py +20 -0
- agent_cli/dev/editors/vim.py +42 -0
- agent_cli/dev/editors/vscode.py +15 -0
- agent_cli/dev/editors/zed.py +20 -0
- agent_cli/dev/project.py +568 -0
- agent_cli/dev/registry.py +52 -0
- agent_cli/dev/skill/SKILL.md +141 -0
- agent_cli/dev/skill/examples.md +571 -0
- agent_cli/dev/terminals/__init__.py +19 -0
- agent_cli/dev/terminals/apple_terminal.py +82 -0
- agent_cli/dev/terminals/base.py +56 -0
- agent_cli/dev/terminals/gnome.py +51 -0
- agent_cli/dev/terminals/iterm2.py +84 -0
- agent_cli/dev/terminals/kitty.py +77 -0
- agent_cli/dev/terminals/registry.py +48 -0
- agent_cli/dev/terminals/tmux.py +58 -0
- agent_cli/dev/terminals/warp.py +132 -0
- agent_cli/dev/terminals/zellij.py +78 -0
- agent_cli/dev/worktree.py +856 -0
- agent_cli/docs_gen.py +417 -0
- agent_cli/example-config.toml +185 -0
- agent_cli/install/__init__.py +5 -0
- agent_cli/install/common.py +89 -0
- agent_cli/install/extras.py +174 -0
- agent_cli/install/hotkeys.py +48 -0
- agent_cli/install/services.py +87 -0
- agent_cli/memory/__init__.py +7 -0
- agent_cli/memory/_files.py +250 -0
- agent_cli/memory/_filters.py +63 -0
- agent_cli/memory/_git.py +157 -0
- agent_cli/memory/_indexer.py +142 -0
- agent_cli/memory/_ingest.py +408 -0
- agent_cli/memory/_persistence.py +182 -0
- agent_cli/memory/_prompt.py +91 -0
- agent_cli/memory/_retrieval.py +294 -0
- agent_cli/memory/_store.py +169 -0
- agent_cli/memory/_streaming.py +44 -0
- agent_cli/memory/_tasks.py +48 -0
- agent_cli/memory/api.py +113 -0
- agent_cli/memory/client.py +272 -0
- agent_cli/memory/engine.py +361 -0
- agent_cli/memory/entities.py +43 -0
- agent_cli/memory/models.py +112 -0
- agent_cli/opts.py +433 -0
- agent_cli/py.typed +0 -0
- agent_cli/rag/__init__.py +3 -0
- agent_cli/rag/_indexer.py +67 -0
- agent_cli/rag/_indexing.py +226 -0
- agent_cli/rag/_prompt.py +30 -0
- agent_cli/rag/_retriever.py +156 -0
- agent_cli/rag/_store.py +48 -0
- agent_cli/rag/_utils.py +218 -0
- agent_cli/rag/api.py +175 -0
- agent_cli/rag/client.py +299 -0
- agent_cli/rag/engine.py +302 -0
- agent_cli/rag/models.py +55 -0
- agent_cli/scripts/.runtime/.gitkeep +0 -0
- agent_cli/scripts/__init__.py +1 -0
- agent_cli/scripts/check_plugin_skill_sync.py +50 -0
- agent_cli/scripts/linux-hotkeys/README.md +63 -0
- agent_cli/scripts/linux-hotkeys/toggle-autocorrect.sh +45 -0
- agent_cli/scripts/linux-hotkeys/toggle-transcription.sh +58 -0
- agent_cli/scripts/linux-hotkeys/toggle-voice-edit.sh +58 -0
- agent_cli/scripts/macos-hotkeys/README.md +45 -0
- agent_cli/scripts/macos-hotkeys/skhd-config-example +5 -0
- agent_cli/scripts/macos-hotkeys/toggle-autocorrect.sh +12 -0
- agent_cli/scripts/macos-hotkeys/toggle-transcription.sh +37 -0
- agent_cli/scripts/macos-hotkeys/toggle-voice-edit.sh +37 -0
- agent_cli/scripts/nvidia-asr-server/README.md +99 -0
- agent_cli/scripts/nvidia-asr-server/pyproject.toml +27 -0
- agent_cli/scripts/nvidia-asr-server/server.py +255 -0
- agent_cli/scripts/nvidia-asr-server/shell.nix +32 -0
- agent_cli/scripts/nvidia-asr-server/uv.lock +4654 -0
- agent_cli/scripts/run-openwakeword.sh +11 -0
- agent_cli/scripts/run-piper-windows.ps1 +30 -0
- agent_cli/scripts/run-piper.sh +24 -0
- agent_cli/scripts/run-whisper-linux.sh +40 -0
- agent_cli/scripts/run-whisper-macos.sh +6 -0
- agent_cli/scripts/run-whisper-windows.ps1 +51 -0
- agent_cli/scripts/run-whisper.sh +9 -0
- agent_cli/scripts/run_faster_whisper_server.py +136 -0
- agent_cli/scripts/setup-linux-hotkeys.sh +72 -0
- agent_cli/scripts/setup-linux.sh +108 -0
- agent_cli/scripts/setup-macos-hotkeys.sh +61 -0
- agent_cli/scripts/setup-macos.sh +76 -0
- agent_cli/scripts/setup-windows.ps1 +63 -0
- agent_cli/scripts/start-all-services-windows.ps1 +53 -0
- agent_cli/scripts/start-all-services.sh +178 -0
- agent_cli/scripts/sync_extras.py +138 -0
- agent_cli/server/__init__.py +3 -0
- agent_cli/server/cli.py +721 -0
- agent_cli/server/common.py +222 -0
- agent_cli/server/model_manager.py +288 -0
- agent_cli/server/model_registry.py +225 -0
- agent_cli/server/proxy/__init__.py +3 -0
- agent_cli/server/proxy/api.py +444 -0
- agent_cli/server/streaming.py +67 -0
- agent_cli/server/tts/__init__.py +3 -0
- agent_cli/server/tts/api.py +335 -0
- agent_cli/server/tts/backends/__init__.py +82 -0
- agent_cli/server/tts/backends/base.py +139 -0
- agent_cli/server/tts/backends/kokoro.py +403 -0
- agent_cli/server/tts/backends/piper.py +253 -0
- agent_cli/server/tts/model_manager.py +201 -0
- agent_cli/server/tts/model_registry.py +28 -0
- agent_cli/server/tts/wyoming_handler.py +249 -0
- agent_cli/server/whisper/__init__.py +3 -0
- agent_cli/server/whisper/api.py +413 -0
- agent_cli/server/whisper/backends/__init__.py +89 -0
- agent_cli/server/whisper/backends/base.py +97 -0
- agent_cli/server/whisper/backends/faster_whisper.py +225 -0
- agent_cli/server/whisper/backends/mlx.py +270 -0
- agent_cli/server/whisper/languages.py +116 -0
- agent_cli/server/whisper/model_manager.py +157 -0
- agent_cli/server/whisper/model_registry.py +28 -0
- agent_cli/server/whisper/wyoming_handler.py +203 -0
- agent_cli/services/__init__.py +343 -0
- agent_cli/services/_wyoming_utils.py +64 -0
- agent_cli/services/asr.py +506 -0
- agent_cli/services/llm.py +228 -0
- agent_cli/services/tts.py +450 -0
- agent_cli/services/wake_word.py +142 -0
- agent_cli-0.70.5.dist-info/METADATA +2118 -0
- agent_cli-0.70.5.dist-info/RECORD +196 -0
- agent_cli-0.70.5.dist-info/WHEEL +4 -0
- agent_cli-0.70.5.dist-info/entry_points.txt +4 -0
- agent_cli-0.70.5.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
"""Registry for managing multiple models.
|
|
2
|
+
|
|
3
|
+
This module provides a concrete model registry that handles:
|
|
4
|
+
- Registration of multiple models with independent configurations
|
|
5
|
+
- Default model selection
|
|
6
|
+
- Lifecycle management (start/stop)
|
|
7
|
+
- Model preloading
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import logging
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
from typing import TYPE_CHECKING, Any, Generic, Protocol, TypeVar, runtime_checkable
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from collections.abc import Callable
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@runtime_checkable
class ManagerProtocol(Protocol):
    """Protocol defining the interface for model managers.

    Any object exposing these members may be stored in a model registry.
    ``runtime_checkable`` permits ``isinstance`` checks against this
    protocol (note: only member *presence* is checked at runtime, not
    signatures).
    """

    @property
    def is_loaded(self) -> bool:
        """Check if the model is loaded."""
        ...

    async def start(self) -> None:
        """Start the manager."""
        ...

    async def stop(self) -> None:
        """Stop the manager."""
        ...

    async def get_model(self) -> Any:
        """Get the model, loading if needed."""
        ...
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# Type variable for manager type; bound to ManagerProtocol so registry code
# can rely on is_loaded/start/stop/get_model being present.
ManagerT = TypeVar("ManagerT", bound=ManagerProtocol)
# Type variable for the per-model configuration object (opaque to the
# registry; only the factory and status getter inspect it).
ConfigT = TypeVar("ConfigT")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass
class ModelStatus:
    """Status of a registered model.

    A snapshot of one manager's state and cumulative usage counters, as
    assembled by the registry's default status getter.
    """

    # Registered model name.
    name: str
    # True while the model is resident in memory.
    loaded: bool
    # Device reported by the manager; None if not applicable/unknown.
    device: str | None
    # Idle TTL configured for this model (from the manager's config).
    ttl_seconds: int
    # Seconds until automatic unload; None when no countdown is active.
    ttl_remaining: float | None
    # Requests currently in flight on this model.
    active_requests: int
    # Stats — cumulative counters copied from the manager's stats object.
    load_count: int
    unload_count: int
    total_requests: int
    total_audio_seconds: float
    total_processing_seconds: float
    last_load_time: float | None
    last_request_time: float | None
    load_duration_seconds: float | None
    # Backend-specific extra metrics.
    extra: dict[str, float]
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class ModelRegistry(Generic[ManagerT, ConfigT]):
    """Registry for managing multiple models with independent TTLs.

    Each model can have its own configuration (device, TTL).
    Models are loaded lazily and unloaded independently based on their TTL.
    """

    def __init__(
        self,
        manager_factory: Callable[[ConfigT], ManagerT],
        default_model: str | None = None,
    ) -> None:
        """Initialize the registry.

        Args:
            manager_factory: Function to create a manager from config.
            default_model: Name of the default model to use when not specified.

        """
        self._manager_factory = manager_factory
        self._managers: dict[str, ManagerT] = {}
        self._default_model = default_model
        self._started = False

    @staticmethod
    def _default_get_status(name: str, manager: Any) -> ModelStatus:
        """Default status getter for managers with standard interface."""
        # Copy the manager's counters into a plain snapshot object.
        stats = manager.stats
        return ModelStatus(
            name=name,
            loaded=manager.is_loaded,
            device=manager.device,
            ttl_seconds=manager.config.ttl_seconds,
            ttl_remaining=manager.ttl_remaining,
            active_requests=manager.active_requests,
            load_count=stats.load_count,
            unload_count=stats.unload_count,
            total_requests=stats.total_requests,
            total_audio_seconds=stats.total_audio_seconds,
            total_processing_seconds=stats.total_processing_seconds,
            last_load_time=stats.last_load_time,
            last_request_time=stats.last_request_time,
            load_duration_seconds=stats.load_duration_seconds,
            extra=stats.extra,
        )

    @property
    def default_model(self) -> str | None:
        """Get the default model name."""
        return self._default_model

    @default_model.setter
    def default_model(self, name: str | None) -> None:
        """Set the default model name; the model must already be registered."""
        if name is not None and name not in self._managers:
            msg = f"Model '{name}' is not registered"
            raise ValueError(msg)
        self._default_model = name

    @property
    def models(self) -> list[str]:
        """Get list of registered model names."""
        return list(self._managers)

    def register(self, config: ConfigT) -> None:
        """Register a model with the given configuration.

        Args:
            config: Model configuration including name, device, TTL, etc.
                Must have a model_name attribute.

        Raises:
            ValueError: If a model with this name is already registered.

        """
        model_name: str = config.model_name  # type: ignore[attr-defined]
        if model_name in self._managers:
            msg = f"Model '{model_name}' is already registered"
            raise ValueError(msg)

        self._managers[model_name] = self._manager_factory(config)

        # The first registration becomes the default automatically.
        if self._default_model is None:
            self._default_model = model_name

        logger.debug("Registered model %s", model_name)

    def get_manager(self, model_name: str | None = None) -> ManagerT:
        """Get the manager for a specific model.

        Args:
            model_name: Name of the model, or None to use the default.

        Returns:
            The manager for the requested model.

        Raises:
            ValueError: If the model is not registered or no default is set.

        """
        name = model_name or self._default_model
        if name is None:
            msg = "No model specified and no default model set"
            raise ValueError(msg)

        manager = self._managers.get(name)
        if manager is None:
            msg = f"Model '{name}' is not registered. Available: {list(self._managers.keys())}"
            raise ValueError(msg)
        return manager

    def list_status(self) -> list[ModelStatus]:
        """Get status of all registered models."""
        return [
            self._default_get_status(model_name, mgr)
            for model_name, mgr in self._managers.items()
        ]

    async def start(self) -> None:
        """Start all model managers (TTL watchers)."""
        if self._started:
            return  # idempotent: already running

        for mgr in self._managers.values():
            await mgr.start()

        self._started = True
        logger.debug("Started registry with %d model(s)", len(self._managers))

    async def stop(self) -> None:
        """Stop all model managers and unload all models."""
        for mgr in self._managers.values():
            await mgr.stop()

        self._started = False
        logger.debug("Stopped registry")

    async def preload(self, model_names: list[str] | None = None) -> None:
        """Preload models into memory.

        Args:
            model_names: List of model names to preload, or None for all.

        """
        targets = model_names or list(self._managers.keys())

        for target in targets:
            mgr = self._managers.get(target)
            if mgr is None:
                logger.warning("Cannot preload unknown model: %s", target)
                continue
            if not mgr.is_loaded:
                logger.debug("Preloading model %s", target)
                await mgr.get_model()
|
|
@@ -0,0 +1,444 @@
|
|
|
1
|
+
"""FastAPI web service for Agent CLI transcription."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import os
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import TYPE_CHECKING, Annotated, Any
|
|
9
|
+
|
|
10
|
+
from fastapi import Depends, FastAPI, File, Form, HTTPException, Request, UploadFile
|
|
11
|
+
from pydantic import BaseModel
|
|
12
|
+
|
|
13
|
+
from agent_cli import config, opts
|
|
14
|
+
from agent_cli.agents.transcribe import (
|
|
15
|
+
AGENT_INSTRUCTIONS,
|
|
16
|
+
INSTRUCTION,
|
|
17
|
+
SYSTEM_PROMPT,
|
|
18
|
+
_build_context_payload,
|
|
19
|
+
)
|
|
20
|
+
from agent_cli.core.audio_format import (
|
|
21
|
+
VALID_EXTENSIONS,
|
|
22
|
+
convert_audio_to_wyoming_format,
|
|
23
|
+
is_valid_audio_file,
|
|
24
|
+
)
|
|
25
|
+
from agent_cli.core.transcription_logger import TranscriptionLogger, get_default_logger
|
|
26
|
+
from agent_cli.server.common import log_requests_middleware
|
|
27
|
+
from agent_cli.services import asr
|
|
28
|
+
from agent_cli.services.llm import process_and_update_clipboard
|
|
29
|
+
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
from typer.models import OptionInfo
|
|
32
|
+
|
|
33
|
+
# Configure logging
# NOTE: basicConfig at import time configures the root logger for any
# process that imports this module.
logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger(__name__)

# FastAPI application exposing /health and /transcribe.
app = FastAPI(
    title="Agent CLI Transcription API",
    description="Web service for audio transcription and text cleanup",
    version="1.0.0",
)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@app.on_event("startup")
async def log_effective_config() -> None:
    """Log effective configuration on startup to help debug env var issues."""
    (
        providers,
        wyoming,
        openai_asr,
        gemini_asr,
        ollama_llm,
        openai_llm,
        gemini_llm,
        _,
    ) = _load_transcription_configs()

    asr_provider = providers.asr_provider
    LOGGER.info("ASR provider: %s", asr_provider)
    if asr_provider == "wyoming":
        LOGGER.info(" Wyoming: %s:%d", wyoming.asr_wyoming_ip, wyoming.asr_wyoming_port)
    elif asr_provider == "openai":
        LOGGER.info(" Model: %s", openai_asr.asr_openai_model)
        LOGGER.info(" Base URL: %s", openai_asr.openai_base_url or "https://api.openai.com/v1")
    elif asr_provider == "gemini":
        LOGGER.info(" Model: %s", gemini_asr.asr_gemini_model)

    llm_provider = providers.llm_provider
    LOGGER.info("LLM provider: %s", llm_provider)
    if llm_provider == "ollama":
        LOGGER.info(" Model: %s", ollama_llm.llm_ollama_model)
        LOGGER.info(" Host: %s", ollama_llm.llm_ollama_host)
    elif llm_provider == "openai":
        LOGGER.info(" Model: %s", openai_llm.llm_openai_model)
        LOGGER.info(" Base URL: %s", openai_llm.openai_base_url or "https://api.openai.com/v1")
    elif llm_provider == "gemini":
        LOGGER.info(" Model: %s", gemini_llm.llm_gemini_model)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@app.middleware("http")
async def log_requests(request: Request, call_next) -> Any:  # type: ignore[no-untyped-def] # noqa: ANN001
    """Log basic request information."""
    # Thin adapter: delegates to the shared middleware helper so all servers
    # in the package log requests consistently.
    return await log_requests_middleware(request, call_next)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class TranscriptionResponse(BaseModel):
    """Response model for transcription endpoint."""

    # Verbatim ASR output; empty string when transcription failed.
    raw_transcript: str
    # LLM-cleaned transcript; None when cleanup was skipped or failed.
    cleaned_transcript: str | None = None
    # True when transcription succeeded (cleanup failure does not flip this).
    success: bool
    # Human-readable error/diagnostic message, if any.
    error: str | None = None
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class HealthResponse(BaseModel):
    """Response model for health check."""

    # Liveness indicator; the /health endpoint always reports "healthy".
    status: str
    # API version string (matches the FastAPI app version).
    version: str
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class TranscriptionRequest(BaseModel):
    """Request model for transcription endpoint."""

    # Whether to run LLM cleanup on the raw transcript (on by default).
    cleanup: bool = True
    # Extra instructions appended to the cleanup prompt, if provided.
    extra_instructions: str | None = None
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
async def _parse_transcription_form(
    cleanup: Annotated[str | bool, Form()] = True,
    extra_instructions: Annotated[str | None, Form()] = None,
) -> TranscriptionRequest:
    """Parse form data into TranscriptionRequest model."""
    # HTML forms submit strings; accept common truthy spellings. Non-string
    # values (already bool) pass through unchanged.
    if isinstance(cleanup, str):
        cleanup_flag = cleanup.lower() in {"true", "1", "yes"}
    else:
        cleanup_flag = cleanup
    return TranscriptionRequest(cleanup=cleanup_flag, extra_instructions=extra_instructions)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
@app.get("/health", response_model=HealthResponse)
async def health_check() -> HealthResponse:
    """Health check endpoint."""
    # Static liveness response; does not probe ASR/LLM backends.
    return HealthResponse(status="healthy", version="1.0.0")
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
async def _transcribe_with_provider(
    audio_data: bytes,
    filename: str,
    provider_cfg: config.ProviderSelection,
    wyoming_asr_cfg: config.WyomingASR,
    openai_asr_cfg: config.OpenAIASR,
    gemini_asr_cfg: config.GeminiASR,
) -> str:
    """Transcribe audio using the configured provider."""
    transcriber = asr.create_recorded_audio_transcriber(provider_cfg)
    file_suffix = Path(filename).suffix.lower() or ".wav"
    provider = provider_cfg.asr_provider

    # Select the provider-specific keyword arguments for the transcriber.
    if provider == "wyoming":
        provider_kwargs: dict[str, Any] = {"wyoming_asr_cfg": wyoming_asr_cfg}
    elif provider == "openai":
        provider_kwargs = {"openai_asr_cfg": openai_asr_cfg, "file_suffix": file_suffix}
    elif provider == "gemini":
        provider_kwargs = {"gemini_asr_cfg": gemini_asr_cfg, "file_suffix": file_suffix}
    else:
        msg = f"Unsupported ASR provider: {provider_cfg.asr_provider}"
        raise NotImplementedError(msg)

    return await transcriber(audio_data=audio_data, logger=LOGGER, **provider_kwargs)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
async def _extract_audio_file_from_request(
    request: Request,
    audio: UploadFile | None,
) -> UploadFile:
    """Extract and validate audio file from request."""
    # Standard path: the client used the expected 'audio' field.
    if audio is not None:
        return audio

    # Fallback for clients (e.g. iOS Shortcuts) that use another field name:
    # scan every form field for something that looks like an audio upload.
    LOGGER.info("No 'audio' parameter found, scanning form fields for audio files")
    form = await request.form()
    for field_name, field_value in form.items():
        if is_valid_audio_file(field_value):
            LOGGER.info("Found audio file in field '%s': %s", field_name, field_value.filename)
            return field_value

    # Nothing usable anywhere in the form.
    raise HTTPException(
        status_code=422,
        detail="No audio file provided. Ensure the form field is named 'audio' and type is 'File'.",
    )
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _validate_audio_file(audio: UploadFile) -> None:
    """Validate that an upload has a filename with a supported audio extension.

    Args:
        audio: The uploaded file to check.

    Raises:
        HTTPException: 400 when no filename is present or the extension is
            not in ``VALID_EXTENSIONS``.

    """
    # Fix: the old docstring claimed this "returns the file extension";
    # it validates only and returns None.
    if not audio or not audio.filename:
        LOGGER.error("No filename provided in request")
        raise HTTPException(status_code=400, detail="No filename provided")

    file_ext = Path(audio.filename).suffix.lower()

    if file_ext not in VALID_EXTENSIONS:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported audio format: {file_ext}. Supported: {', '.join(VALID_EXTENSIONS)}",
        )
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def _cfg(key: str, defaults: dict[str, Any], opt: OptionInfo) -> Any:
    """Get config with priority: env var > config file > option default.

    Args:
        key: Config-file key to look up in ``defaults``.
        defaults: Merged config-file values for this command.
        opt: Typer option carrying ``envvar`` and ``default``.

    Returns:
        The environment value (coerced to match a bool/int default's type),
        else the config-file value, else the option's default.

    """
    if opt.envvar and (env_val := os.environ.get(opt.envvar)):
        # Coerce env strings to the default's type. Check bool BEFORE int:
        # bool is a subclass of int, and int("true") would raise ValueError.
        if isinstance(opt.default, bool):
            return env_val.lower() in ("true", "1", "yes")
        if isinstance(opt.default, int):
            return int(env_val)
        return env_val
    return defaults.get(key, opt.default)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def _load_transcription_configs() -> tuple[
    config.ProviderSelection,
    config.WyomingASR,
    config.OpenAIASR,
    config.GeminiASR,
    config.Ollama,
    config.OpenAILLM,
    config.GeminiLLM,
    dict[str, Any],
]:
    """Load config objects. Priority: env var > config file > default."""
    file_cfg = config.load_config()
    # Command-specific [transcribe] values override the shared [defaults].
    merged = {**file_cfg.get("defaults", {}), **file_cfg.get("transcribe", {})}

    providers = config.ProviderSelection(
        asr_provider=_cfg("asr_provider", merged, opts.ASR_PROVIDER),
        llm_provider=_cfg("llm_provider", merged, opts.LLM_PROVIDER),
        tts_provider=_cfg("tts_provider", merged, opts.TTS_PROVIDER),
    )
    wyoming_asr = config.WyomingASR(
        asr_wyoming_ip=_cfg("asr_wyoming_ip", merged, opts.ASR_WYOMING_IP),
        asr_wyoming_port=_cfg("asr_wyoming_port", merged, opts.ASR_WYOMING_PORT),
    )
    openai_asr = config.OpenAIASR(
        asr_openai_model=_cfg("asr_openai_model", merged, opts.ASR_OPENAI_MODEL),
        openai_api_key=_cfg("openai_api_key", merged, opts.OPENAI_API_KEY),
        openai_base_url=_cfg("asr_openai_base_url", merged, opts.ASR_OPENAI_BASE_URL),
        asr_openai_prompt=_cfg("asr_openai_prompt", merged, opts.ASR_OPENAI_PROMPT),
    )
    gemini_asr = config.GeminiASR(
        asr_gemini_model=_cfg("asr_gemini_model", merged, opts.ASR_GEMINI_MODEL),
        gemini_api_key=_cfg("gemini_api_key", merged, opts.GEMINI_API_KEY),
    )
    ollama_llm = config.Ollama(
        llm_ollama_model=_cfg("llm_ollama_model", merged, opts.LLM_OLLAMA_MODEL),
        llm_ollama_host=_cfg("llm_ollama_host", merged, opts.LLM_OLLAMA_HOST),
    )
    openai_llm = config.OpenAILLM(
        llm_openai_model=_cfg("llm_openai_model", merged, opts.LLM_OPENAI_MODEL),
        openai_api_key=_cfg("openai_api_key", merged, opts.OPENAI_API_KEY),
        openai_base_url=_cfg("openai_base_url", merged, opts.OPENAI_BASE_URL),
    )
    gemini_llm = config.GeminiLLM(
        llm_gemini_model=_cfg("llm_gemini_model", merged, opts.LLM_GEMINI_MODEL),
        gemini_api_key=_cfg("gemini_api_key", merged, opts.GEMINI_API_KEY),
    )

    return (
        providers,
        wyoming_asr,
        openai_asr,
        gemini_asr,
        ollama_llm,
        openai_llm,
        gemini_llm,
        merged,
    )
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def _convert_audio_for_local_asr(audio_data: bytes, filename: str) -> bytes:
    """Convert audio to Wyoming format if needed for local ASR."""
    LOGGER.info("Converting %s audio to Wyoming format", filename)
    wyoming_audio = convert_audio_to_wyoming_format(audio_data, filename)
    # Reaching this point means conversion did not raise.
    LOGGER.info("Audio conversion successful")
    return wyoming_audio
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
async def _process_transcript_cleanup(
    raw_transcript: str,
    cleanup: bool,
    extra_instructions: str | None,
    defaults: dict[str, Any],
    provider_cfg: config.ProviderSelection,
    ollama_cfg: config.Ollama,
    openai_llm_cfg: config.OpenAILLM,
    gemini_llm_cfg: config.GeminiLLM,
    transcription_log: Path | None,
) -> str | None:
    """Process transcript cleanup with LLM if requested."""
    if not cleanup:
        # Caller did not ask for cleanup; nothing to do.
        return None

    # Assemble the instruction prompt: base instructions, then optional
    # extras from the config file and the request, separated by blank lines.
    segments = [AGENT_INSTRUCTIONS]
    if config_extra := defaults.get("extra_instructions", ""):
        segments.append(config_extra)
    if extra_instructions:
        segments.append(extra_instructions)
    instructions = "\n\n".join(segments)

    combined_context, context_note = _build_context_payload(
        transcription_log=transcription_log,
        clipboard_snapshot=None,
    )
    if context_note:
        instructions += context_note

    return await process_and_update_clipboard(
        system_prompt=SYSTEM_PROMPT,
        agent_instructions=instructions,
        provider_cfg=provider_cfg,
        ollama_cfg=ollama_cfg,
        openai_cfg=openai_llm_cfg,
        gemini_cfg=gemini_llm_cfg,
        logger=LOGGER,
        original_text=raw_transcript,
        instruction=INSTRUCTION,
        clipboard=False,  # Don't copy to clipboard in web service
        quiet=True,
        live=None,
        context=combined_context,
    )
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
@app.post("/transcribe", response_model=TranscriptionResponse)
async def transcribe_audio(
    request: Request,
    form_data: Annotated[TranscriptionRequest, Depends(_parse_transcription_form)],
    audio: Annotated[UploadFile | None, File()] = None,
) -> TranscriptionResponse:
    """Transcribe audio file and optionally clean up the text.

    Args:
        request: FastAPI request object
        audio: Audio file (wav, mp3, m4a, etc.)
        form_data: Form data with cleanup and extra_instructions

    Returns:
        TranscriptionResponse with raw and cleaned transcripts

    """
    # Initialize variables outside try block to ensure they exist in finally block
    raw_transcript = ""
    cleaned_transcript = None
    transcription_logger: TranscriptionLogger | None = None

    try:
        # Extract and validate audio file (falls back to scanning form fields
        # for clients that do not use the 'audio' field name).
        audio_file = await _extract_audio_file_from_request(request, audio)
        _validate_audio_file(audio_file)

        # Extract form data (Pydantic handles string->bool conversion automatically)
        cleanup = form_data.cleanup
        extra_instructions = form_data.extra_instructions

        # Load all configurations (env var > config file > option default).
        (
            provider_cfg,
            wyoming_asr_cfg,
            openai_asr_cfg,
            gemini_asr_cfg,
            ollama_cfg,
            openai_llm_cfg,
            gemini_llm_cfg,
            defaults,
        ) = _load_transcription_configs()

        # Read uploaded file fully into memory before transcription.
        audio_data = await audio_file.read()
        LOGGER.info(
            "Received audio: filename=%s, size=%d bytes, content_type=%s",
            audio_file.filename,
            len(audio_data),
            audio_file.content_type,
        )

        # Convert audio to Wyoming format if using local ASR
        if provider_cfg.asr_provider == "wyoming":
            audio_data = _convert_audio_for_local_asr(audio_data, audio_file.filename)

        # Transcribe audio using the configured provider
        raw_transcript = await _transcribe_with_provider(
            audio_data,
            audio_file.filename or "audio.wav",
            provider_cfg,
            wyoming_asr_cfg,
            openai_asr_cfg,
            gemini_asr_cfg,
        )

        # An empty transcript is treated as a failure, not an empty success.
        if not raw_transcript:
            return TranscriptionResponse(
                raw_transcript="",
                success=False,
                error="No transcript generated from audio",
            )

        # transcription_logger is still None at this point on this code path;
        # the guard is kept defensive. Logger init failure is non-fatal.
        if transcription_logger is None:
            try:
                transcription_logger = get_default_logger()
            except Exception as log_init_error:
                LOGGER.warning("Failed to initialize transcription logger: %s", log_init_error)

        # Process transcript cleanup if requested (returns None when
        # cleanup is disabled or the LLM step produced nothing).
        cleaned_transcript = await _process_transcript_cleanup(
            raw_transcript,
            cleanup,
            extra_instructions,
            defaults,
            provider_cfg,
            ollama_cfg,
            openai_llm_cfg,
            gemini_llm_cfg,
            transcription_logger.log_file if transcription_logger else None,
        )

        # If cleanup was requested but failed, indicate partial success
        if cleanup and cleaned_transcript is None:
            return TranscriptionResponse(
                raw_transcript=raw_transcript,
                cleaned_transcript=None,
                success=True,  # Transcription succeeded even if cleanup failed
                error="Transcription successful but cleanup failed. Check LLM configuration.",
            )

        return TranscriptionResponse(
            raw_transcript=raw_transcript,
            cleaned_transcript=cleaned_transcript,
            success=True,
        )

    except HTTPException:
        # Re-raise HTTPExceptions so FastAPI handles them properly
        raise
    except Exception as e:
        # Top-level boundary: convert unexpected failures into an error
        # response rather than a 500 with no body.
        LOGGER.exception("Error during transcription")
        return TranscriptionResponse(raw_transcript="", success=False, error=str(e))
    finally:
        # Log the transcription automatically (even if it failed)
        # Only log if we have something to log
        if raw_transcript or cleaned_transcript:
            try:
                transcription_logger = transcription_logger or get_default_logger()
                transcription_logger.log_transcription(
                    raw=raw_transcript,
                    processed=cleaned_transcript,
                )
            except Exception as log_error:
                # Logging is best-effort; never mask the response/exception.
                LOGGER.warning("Failed to log transcription: %s", log_error)
|