agent-cli 0.70.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. agent_cli/__init__.py +5 -0
  2. agent_cli/__main__.py +6 -0
  3. agent_cli/_extras.json +14 -0
  4. agent_cli/_requirements/.gitkeep +0 -0
  5. agent_cli/_requirements/audio.txt +79 -0
  6. agent_cli/_requirements/faster-whisper.txt +215 -0
  7. agent_cli/_requirements/kokoro.txt +425 -0
  8. agent_cli/_requirements/llm.txt +183 -0
  9. agent_cli/_requirements/memory.txt +355 -0
  10. agent_cli/_requirements/mlx-whisper.txt +222 -0
  11. agent_cli/_requirements/piper.txt +176 -0
  12. agent_cli/_requirements/rag.txt +402 -0
  13. agent_cli/_requirements/server.txt +154 -0
  14. agent_cli/_requirements/speed.txt +77 -0
  15. agent_cli/_requirements/vad.txt +155 -0
  16. agent_cli/_requirements/wyoming.txt +71 -0
  17. agent_cli/_tools.py +368 -0
  18. agent_cli/agents/__init__.py +23 -0
  19. agent_cli/agents/_voice_agent_common.py +136 -0
  20. agent_cli/agents/assistant.py +383 -0
  21. agent_cli/agents/autocorrect.py +284 -0
  22. agent_cli/agents/chat.py +496 -0
  23. agent_cli/agents/memory/__init__.py +31 -0
  24. agent_cli/agents/memory/add.py +190 -0
  25. agent_cli/agents/memory/proxy.py +160 -0
  26. agent_cli/agents/rag_proxy.py +128 -0
  27. agent_cli/agents/speak.py +209 -0
  28. agent_cli/agents/transcribe.py +671 -0
  29. agent_cli/agents/transcribe_daemon.py +499 -0
  30. agent_cli/agents/voice_edit.py +291 -0
  31. agent_cli/api.py +22 -0
  32. agent_cli/cli.py +106 -0
  33. agent_cli/config.py +503 -0
  34. agent_cli/config_cmd.py +307 -0
  35. agent_cli/constants.py +27 -0
  36. agent_cli/core/__init__.py +1 -0
  37. agent_cli/core/audio.py +461 -0
  38. agent_cli/core/audio_format.py +299 -0
  39. agent_cli/core/chroma.py +88 -0
  40. agent_cli/core/deps.py +191 -0
  41. agent_cli/core/openai_proxy.py +139 -0
  42. agent_cli/core/process.py +195 -0
  43. agent_cli/core/reranker.py +120 -0
  44. agent_cli/core/sse.py +87 -0
  45. agent_cli/core/transcription_logger.py +70 -0
  46. agent_cli/core/utils.py +526 -0
  47. agent_cli/core/vad.py +175 -0
  48. agent_cli/core/watch.py +65 -0
  49. agent_cli/dev/__init__.py +14 -0
  50. agent_cli/dev/cli.py +1588 -0
  51. agent_cli/dev/coding_agents/__init__.py +19 -0
  52. agent_cli/dev/coding_agents/aider.py +24 -0
  53. agent_cli/dev/coding_agents/base.py +167 -0
  54. agent_cli/dev/coding_agents/claude.py +39 -0
  55. agent_cli/dev/coding_agents/codex.py +24 -0
  56. agent_cli/dev/coding_agents/continue_dev.py +15 -0
  57. agent_cli/dev/coding_agents/copilot.py +24 -0
  58. agent_cli/dev/coding_agents/cursor_agent.py +48 -0
  59. agent_cli/dev/coding_agents/gemini.py +28 -0
  60. agent_cli/dev/coding_agents/opencode.py +15 -0
  61. agent_cli/dev/coding_agents/registry.py +49 -0
  62. agent_cli/dev/editors/__init__.py +19 -0
  63. agent_cli/dev/editors/base.py +89 -0
  64. agent_cli/dev/editors/cursor.py +15 -0
  65. agent_cli/dev/editors/emacs.py +46 -0
  66. agent_cli/dev/editors/jetbrains.py +56 -0
  67. agent_cli/dev/editors/nano.py +31 -0
  68. agent_cli/dev/editors/neovim.py +33 -0
  69. agent_cli/dev/editors/registry.py +59 -0
  70. agent_cli/dev/editors/sublime.py +20 -0
  71. agent_cli/dev/editors/vim.py +42 -0
  72. agent_cli/dev/editors/vscode.py +15 -0
  73. agent_cli/dev/editors/zed.py +20 -0
  74. agent_cli/dev/project.py +568 -0
  75. agent_cli/dev/registry.py +52 -0
  76. agent_cli/dev/skill/SKILL.md +141 -0
  77. agent_cli/dev/skill/examples.md +571 -0
  78. agent_cli/dev/terminals/__init__.py +19 -0
  79. agent_cli/dev/terminals/apple_terminal.py +82 -0
  80. agent_cli/dev/terminals/base.py +56 -0
  81. agent_cli/dev/terminals/gnome.py +51 -0
  82. agent_cli/dev/terminals/iterm2.py +84 -0
  83. agent_cli/dev/terminals/kitty.py +77 -0
  84. agent_cli/dev/terminals/registry.py +48 -0
  85. agent_cli/dev/terminals/tmux.py +58 -0
  86. agent_cli/dev/terminals/warp.py +132 -0
  87. agent_cli/dev/terminals/zellij.py +78 -0
  88. agent_cli/dev/worktree.py +856 -0
  89. agent_cli/docs_gen.py +417 -0
  90. agent_cli/example-config.toml +185 -0
  91. agent_cli/install/__init__.py +5 -0
  92. agent_cli/install/common.py +89 -0
  93. agent_cli/install/extras.py +174 -0
  94. agent_cli/install/hotkeys.py +48 -0
  95. agent_cli/install/services.py +87 -0
  96. agent_cli/memory/__init__.py +7 -0
  97. agent_cli/memory/_files.py +250 -0
  98. agent_cli/memory/_filters.py +63 -0
  99. agent_cli/memory/_git.py +157 -0
  100. agent_cli/memory/_indexer.py +142 -0
  101. agent_cli/memory/_ingest.py +408 -0
  102. agent_cli/memory/_persistence.py +182 -0
  103. agent_cli/memory/_prompt.py +91 -0
  104. agent_cli/memory/_retrieval.py +294 -0
  105. agent_cli/memory/_store.py +169 -0
  106. agent_cli/memory/_streaming.py +44 -0
  107. agent_cli/memory/_tasks.py +48 -0
  108. agent_cli/memory/api.py +113 -0
  109. agent_cli/memory/client.py +272 -0
  110. agent_cli/memory/engine.py +361 -0
  111. agent_cli/memory/entities.py +43 -0
  112. agent_cli/memory/models.py +112 -0
  113. agent_cli/opts.py +433 -0
  114. agent_cli/py.typed +0 -0
  115. agent_cli/rag/__init__.py +3 -0
  116. agent_cli/rag/_indexer.py +67 -0
  117. agent_cli/rag/_indexing.py +226 -0
  118. agent_cli/rag/_prompt.py +30 -0
  119. agent_cli/rag/_retriever.py +156 -0
  120. agent_cli/rag/_store.py +48 -0
  121. agent_cli/rag/_utils.py +218 -0
  122. agent_cli/rag/api.py +175 -0
  123. agent_cli/rag/client.py +299 -0
  124. agent_cli/rag/engine.py +302 -0
  125. agent_cli/rag/models.py +55 -0
  126. agent_cli/scripts/.runtime/.gitkeep +0 -0
  127. agent_cli/scripts/__init__.py +1 -0
  128. agent_cli/scripts/check_plugin_skill_sync.py +50 -0
  129. agent_cli/scripts/linux-hotkeys/README.md +63 -0
  130. agent_cli/scripts/linux-hotkeys/toggle-autocorrect.sh +45 -0
  131. agent_cli/scripts/linux-hotkeys/toggle-transcription.sh +58 -0
  132. agent_cli/scripts/linux-hotkeys/toggle-voice-edit.sh +58 -0
  133. agent_cli/scripts/macos-hotkeys/README.md +45 -0
  134. agent_cli/scripts/macos-hotkeys/skhd-config-example +5 -0
  135. agent_cli/scripts/macos-hotkeys/toggle-autocorrect.sh +12 -0
  136. agent_cli/scripts/macos-hotkeys/toggle-transcription.sh +37 -0
  137. agent_cli/scripts/macos-hotkeys/toggle-voice-edit.sh +37 -0
  138. agent_cli/scripts/nvidia-asr-server/README.md +99 -0
  139. agent_cli/scripts/nvidia-asr-server/pyproject.toml +27 -0
  140. agent_cli/scripts/nvidia-asr-server/server.py +255 -0
  141. agent_cli/scripts/nvidia-asr-server/shell.nix +32 -0
  142. agent_cli/scripts/nvidia-asr-server/uv.lock +4654 -0
  143. agent_cli/scripts/run-openwakeword.sh +11 -0
  144. agent_cli/scripts/run-piper-windows.ps1 +30 -0
  145. agent_cli/scripts/run-piper.sh +24 -0
  146. agent_cli/scripts/run-whisper-linux.sh +40 -0
  147. agent_cli/scripts/run-whisper-macos.sh +6 -0
  148. agent_cli/scripts/run-whisper-windows.ps1 +51 -0
  149. agent_cli/scripts/run-whisper.sh +9 -0
  150. agent_cli/scripts/run_faster_whisper_server.py +136 -0
  151. agent_cli/scripts/setup-linux-hotkeys.sh +72 -0
  152. agent_cli/scripts/setup-linux.sh +108 -0
  153. agent_cli/scripts/setup-macos-hotkeys.sh +61 -0
  154. agent_cli/scripts/setup-macos.sh +76 -0
  155. agent_cli/scripts/setup-windows.ps1 +63 -0
  156. agent_cli/scripts/start-all-services-windows.ps1 +53 -0
  157. agent_cli/scripts/start-all-services.sh +178 -0
  158. agent_cli/scripts/sync_extras.py +138 -0
  159. agent_cli/server/__init__.py +3 -0
  160. agent_cli/server/cli.py +721 -0
  161. agent_cli/server/common.py +222 -0
  162. agent_cli/server/model_manager.py +288 -0
  163. agent_cli/server/model_registry.py +225 -0
  164. agent_cli/server/proxy/__init__.py +3 -0
  165. agent_cli/server/proxy/api.py +444 -0
  166. agent_cli/server/streaming.py +67 -0
  167. agent_cli/server/tts/__init__.py +3 -0
  168. agent_cli/server/tts/api.py +335 -0
  169. agent_cli/server/tts/backends/__init__.py +82 -0
  170. agent_cli/server/tts/backends/base.py +139 -0
  171. agent_cli/server/tts/backends/kokoro.py +403 -0
  172. agent_cli/server/tts/backends/piper.py +253 -0
  173. agent_cli/server/tts/model_manager.py +201 -0
  174. agent_cli/server/tts/model_registry.py +28 -0
  175. agent_cli/server/tts/wyoming_handler.py +249 -0
  176. agent_cli/server/whisper/__init__.py +3 -0
  177. agent_cli/server/whisper/api.py +413 -0
  178. agent_cli/server/whisper/backends/__init__.py +89 -0
  179. agent_cli/server/whisper/backends/base.py +97 -0
  180. agent_cli/server/whisper/backends/faster_whisper.py +225 -0
  181. agent_cli/server/whisper/backends/mlx.py +270 -0
  182. agent_cli/server/whisper/languages.py +116 -0
  183. agent_cli/server/whisper/model_manager.py +157 -0
  184. agent_cli/server/whisper/model_registry.py +28 -0
  185. agent_cli/server/whisper/wyoming_handler.py +203 -0
  186. agent_cli/services/__init__.py +343 -0
  187. agent_cli/services/_wyoming_utils.py +64 -0
  188. agent_cli/services/asr.py +506 -0
  189. agent_cli/services/llm.py +228 -0
  190. agent_cli/services/tts.py +450 -0
  191. agent_cli/services/wake_word.py +142 -0
  192. agent_cli-0.70.5.dist-info/METADATA +2118 -0
  193. agent_cli-0.70.5.dist-info/RECORD +196 -0
  194. agent_cli-0.70.5.dist-info/WHEEL +4 -0
  195. agent_cli-0.70.5.dist-info/entry_points.txt +4 -0
  196. agent_cli-0.70.5.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,225 @@
1
+ """Registry for managing multiple models.
2
+
3
+ This module provides a concrete model registry that handles:
4
+ - Registration of multiple models with independent configurations
5
+ - Default model selection
6
+ - Lifecycle management (start/stop)
7
+ - Model preloading
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import logging
13
+ from dataclasses import dataclass
14
+ from typing import TYPE_CHECKING, Any, Generic, Protocol, TypeVar, runtime_checkable
15
+
16
+ if TYPE_CHECKING:
17
+ from collections.abc import Callable
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
@runtime_checkable
class ManagerProtocol(Protocol):
    """Structural interface every model manager in the registry must offer.

    The registry only relies on the four members below; anything else a
    concrete manager exposes is outside this protocol.
    """

    @property
    def is_loaded(self) -> bool:
        """Report whether the underlying model is currently loaded."""

    async def start(self) -> None:
        """Start the manager."""

    async def stop(self) -> None:
        """Stop the manager."""

    async def get_model(self) -> Any:
        """Return the model, loading it first when necessary."""
42
+
43
+
44
# Type variable for the manager type; constrained to objects that satisfy
# ManagerProtocol so the registry can start/stop/load through it.
ManagerT = TypeVar("ManagerT", bound=ManagerProtocol)
# Type variable for the configuration object handed to the manager factory.
# Unbounded here; ModelRegistry.register reads a ``model_name`` attribute from it.
ConfigT = TypeVar("ConfigT")
47
+
48
+
49
@dataclass
class ModelStatus:
    """Status of a registered model.

    A plain record; ``ModelRegistry._default_get_status`` fills it from a
    manager's ``is_loaded``, ``device``, ``config``, ``ttl_remaining``,
    ``active_requests`` and ``stats`` attributes.
    """

    # Name the model was registered under.
    name: str
    # Mirrors the manager's ``is_loaded`` flag.
    loaded: bool
    # Mirrors the manager's ``device``; may be None.
    device: str | None
    # Configured TTL (presumably seconds, per the field name).
    ttl_seconds: int
    # Remaining TTL as reported by the manager; may be None.
    ttl_remaining: float | None
    # Number of requests the manager reports as in flight.
    active_requests: int
    # Stats
    load_count: int
    unload_count: int
    total_requests: int
    total_audio_seconds: float
    total_processing_seconds: float
    # NOTE(review): the time base of the two timestamps below is not visible
    # here (epoch vs. monotonic) - confirm against the manager implementation.
    last_load_time: float | None
    last_request_time: float | None
    load_duration_seconds: float | None
    # Additional numeric stats keyed by name, taken from ``stats.extra``.
    extra: dict[str, float]
69
+
70
+
71
class ModelRegistry(Generic[ManagerT, ConfigT]):
    """Registry for managing multiple models with independent TTLs.

    Each model can have its own configuration (device, TTL).
    Models are loaded lazily and unloaded independently based on their TTL.
    """

    def __init__(
        self,
        manager_factory: Callable[[ConfigT], ManagerT],
        default_model: str | None = None,
    ) -> None:
        """Initialize the registry.

        Args:
            manager_factory: Function to create a manager from config.
            default_model: Name of the default model to use when not specified.
                It is stored as-is; it is not checked against registered models.

        """
        self._manager_factory = manager_factory
        self._managers: dict[str, ManagerT] = {}
        self._default_model = default_model
        self._started = False

    @staticmethod
    def _default_get_status(name: str, manager: Any) -> ModelStatus:
        """Build a ``ModelStatus`` from a manager with the standard interface.

        Args:
            name: Name the manager is registered under.
            manager: Object exposing ``is_loaded``, ``device``, ``config``,
                ``ttl_remaining``, ``active_requests`` and ``stats``.

        Returns:
            A status record describing the manager at call time.

        """
        stats = manager.stats
        return ModelStatus(
            name=name,
            loaded=manager.is_loaded,
            device=manager.device,
            ttl_seconds=manager.config.ttl_seconds,
            ttl_remaining=manager.ttl_remaining,
            active_requests=manager.active_requests,
            load_count=stats.load_count,
            unload_count=stats.unload_count,
            total_requests=stats.total_requests,
            total_audio_seconds=stats.total_audio_seconds,
            total_processing_seconds=stats.total_processing_seconds,
            last_load_time=stats.last_load_time,
            last_request_time=stats.last_request_time,
            load_duration_seconds=stats.load_duration_seconds,
            # Copy so the returned status does not alias the manager's live
            # dict and keep changing (or get mutated) after it is handed out.
            extra=dict(stats.extra),
        )

    @property
    def default_model(self) -> str | None:
        """Get the default model name."""
        return self._default_model

    @default_model.setter
    def default_model(self, name: str | None) -> None:
        """Set the default model name.

        Raises:
            ValueError: If ``name`` is not None and is not a registered model.

        """
        if name is not None and name not in self._managers:
            msg = f"Model '{name}' is not registered"
            raise ValueError(msg)
        self._default_model = name

    @property
    def models(self) -> list[str]:
        """Get list of registered model names."""
        return list(self._managers.keys())

    def register(self, config: ConfigT) -> None:
        """Register a model with the given configuration.

        The first model registered while no default is set becomes the default.

        Args:
            config: Model configuration including name, device, TTL, etc.
                Must have a model_name attribute.

        Raises:
            ValueError: If a model with this name is already registered.

        """
        model_name: str = config.model_name  # type: ignore[attr-defined]

        if model_name in self._managers:
            msg = f"Model '{model_name}' is already registered"
            raise ValueError(msg)

        manager = self._manager_factory(config)
        self._managers[model_name] = manager

        # Set as default if it's the first model
        if self._default_model is None:
            self._default_model = model_name

        logger.debug("Registered model %s", model_name)

    def get_manager(self, model_name: str | None = None) -> ManagerT:
        """Get the manager for a specific model.

        Args:
            model_name: Name of the model, or None to use the default.
                An empty string is treated like None.

        Returns:
            The manager for the requested model.

        Raises:
            ValueError: If the model is not registered or no default is set.

        """
        name = model_name or self._default_model

        if name is None:
            msg = "No model specified and no default model set"
            raise ValueError(msg)

        if name not in self._managers:
            msg = f"Model '{name}' is not registered. Available: {list(self._managers.keys())}"
            raise ValueError(msg)

        return self._managers[name]

    def list_status(self) -> list[ModelStatus]:
        """Get status of all registered models."""
        return [self._default_get_status(name, manager) for name, manager in self._managers.items()]

    async def start(self) -> None:
        """Start all model managers (TTL watchers).

        Does nothing when the registry is already started.
        """
        if self._started:
            return

        for manager in self._managers.values():
            await manager.start()

        self._started = True
        logger.debug("Started registry with %d model(s)", len(self._managers))

    async def stop(self) -> None:
        """Stop all model managers and unload all models."""
        for manager in self._managers.values():
            await manager.stop()

        self._started = False
        logger.debug("Stopped registry")

    async def preload(self, model_names: list[str] | None = None) -> None:
        """Preload models into memory.

        Unknown names are logged and skipped; models already loaded are left
        untouched.

        Args:
            model_names: List of model names to preload, or None for all.
                An explicit empty list preloads nothing.

        """
        # Compare against None rather than using truthiness: an empty list is
        # an explicit "no models" request, not "all models".
        names = list(self._managers) if model_names is None else model_names

        for name in names:
            manager = self._managers.get(name)
            if manager is None:
                logger.warning("Cannot preload unknown model: %s", name)
                continue

            if not manager.is_loaded:
                logger.debug("Preloading model %s", name)
                await manager.get_model()
@@ -0,0 +1,3 @@
1
+ """Proxy server for transcription services."""
2
+
3
+ from __future__ import annotations
@@ -0,0 +1,444 @@
1
+ """FastAPI web service for Agent CLI transcription."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import os
7
+ from pathlib import Path
8
+ from typing import TYPE_CHECKING, Annotated, Any
9
+
10
+ from fastapi import Depends, FastAPI, File, Form, HTTPException, Request, UploadFile
11
+ from pydantic import BaseModel
12
+
13
+ from agent_cli import config, opts
14
+ from agent_cli.agents.transcribe import (
15
+ AGENT_INSTRUCTIONS,
16
+ INSTRUCTION,
17
+ SYSTEM_PROMPT,
18
+ _build_context_payload,
19
+ )
20
+ from agent_cli.core.audio_format import (
21
+ VALID_EXTENSIONS,
22
+ convert_audio_to_wyoming_format,
23
+ is_valid_audio_file,
24
+ )
25
+ from agent_cli.core.transcription_logger import TranscriptionLogger, get_default_logger
26
+ from agent_cli.server.common import log_requests_middleware
27
+ from agent_cli.services import asr
28
+ from agent_cli.services.llm import process_and_update_clipboard
29
+
30
+ if TYPE_CHECKING:
31
+ from typer.models import OptionInfo
32
+
33
# Configure logging.
# NOTE: this runs at import time, so importing this module configures the
# root logger at INFO level as a side effect.
logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger(__name__)

# The FastAPI application object; the route, middleware and startup handlers
# defined in this module register themselves on it via decorators.
app = FastAPI(
    title="Agent CLI Transcription API",
    description="Web service for audio transcription and text cleanup",
    version="1.0.0",
)
42
+
43
+
44
@app.on_event("startup")
async def log_effective_config() -> None:
    """Emit the resolved ASR and LLM settings once at application startup.

    Makes it visible in the logs which provider, model and endpoint were
    picked up after environment variables and the config file were merged.
    """
    (
        providers,
        wyoming,
        openai_asr,
        gemini_asr,
        ollama,
        openai_llm,
        gemini_llm,
        _unused_defaults,
    ) = _load_transcription_configs()

    asr_name = providers.asr_provider
    LOGGER.info("ASR provider: %s", asr_name)
    if asr_name == "wyoming":
        LOGGER.info(" Wyoming: %s:%d", wyoming.asr_wyoming_ip, wyoming.asr_wyoming_port)
    elif asr_name == "openai":
        LOGGER.info(" Model: %s", openai_asr.asr_openai_model)
        asr_url = openai_asr.openai_base_url or "https://api.openai.com/v1"
        LOGGER.info(" Base URL: %s", asr_url)
    elif asr_name == "gemini":
        LOGGER.info(" Model: %s", gemini_asr.asr_gemini_model)

    llm_name = providers.llm_provider
    LOGGER.info("LLM provider: %s", llm_name)
    if llm_name == "ollama":
        LOGGER.info(" Model: %s", ollama.llm_ollama_model)
        LOGGER.info(" Host: %s", ollama.llm_ollama_host)
    elif llm_name == "openai":
        LOGGER.info(" Model: %s", openai_llm.llm_openai_model)
        llm_url = openai_llm.openai_base_url or "https://api.openai.com/v1"
        LOGGER.info(" Base URL: %s", llm_url)
    elif llm_name == "gemini":
        LOGGER.info(" Model: %s", gemini_llm.llm_gemini_model)
76
+
77
+
78
@app.middleware("http")
async def log_requests(request: Request, call_next) -> Any:  # type: ignore[no-untyped-def] # noqa: ANN001
    """HTTP middleware that hands each request to the shared logging helper."""
    response = await log_requests_middleware(request, call_next)
    return response
82
+
83
+
84
class TranscriptionResponse(BaseModel):
    """Response model for transcription endpoint."""

    # Text returned by the ASR provider; the endpoint sends "" when
    # transcription failed or produced nothing.
    raw_transcript: str
    # LLM-cleaned text; None when cleanup was not requested or did not
    # produce a result.
    cleaned_transcript: str | None = None
    # True when a transcript was produced, even if the cleanup step failed.
    success: bool
    # Human-readable problem description; None on full success.
    error: str | None = None
91
+
92
+
93
class HealthResponse(BaseModel):
    """Response model for health check."""

    # Service state string; the health endpoint reports "healthy".
    status: str
    # API version string reported by the health endpoint.
    version: str
98
+
99
+
100
class TranscriptionRequest(BaseModel):
    """Request model for transcription endpoint."""

    # Whether to run the LLM cleanup pass on the raw transcript.
    cleanup: bool = True
    # Optional per-request text appended to the cleanup instructions.
    extra_instructions: str | None = None
105
+
106
+
107
async def _parse_transcription_form(
    cleanup: Annotated[str | bool, Form()] = True,
    extra_instructions: Annotated[str | None, Form()] = None,
) -> TranscriptionRequest:
    """Turn the multipart form fields into a ``TranscriptionRequest``.

    A string ``cleanup`` value counts as true only when, case-insensitively,
    it is one of ``"true"``, ``"1"`` or ``"yes"``; a real bool passes through.
    """
    if isinstance(cleanup, str):
        wants_cleanup = cleanup.lower() in ("true", "1", "yes")
    else:
        wants_cleanup = cleanup
    return TranscriptionRequest(
        cleanup=wants_cleanup,
        extra_instructions=extra_instructions,
    )
114
+
115
+
116
@app.get("/health", response_model=HealthResponse)
async def health_check() -> HealthResponse:
    """Health check endpoint.

    Returns:
        A ``HealthResponse`` with status ``"healthy"`` and the API version.

    """
    # Read the version from the application object instead of repeating the
    # literal, so the two values cannot drift apart.
    return HealthResponse(status="healthy", version=app.version)
120
+
121
+
122
async def _transcribe_with_provider(
    audio_data: bytes,
    filename: str,
    provider_cfg: config.ProviderSelection,
    wyoming_asr_cfg: config.WyomingASR,
    openai_asr_cfg: config.OpenAIASR,
    gemini_asr_cfg: config.GeminiASR,
) -> str:
    """Run the recorded-audio transcriber for the selected ASR provider.

    The transcriber is obtained from ``asr.create_recorded_audio_transcriber``
    and called with the provider-specific config. The OpenAI and Gemini paths
    additionally receive the upload's lower-cased file suffix (``".wav"``
    when the filename has none).

    Raises:
        NotImplementedError: If the provider is not wyoming, openai or gemini.

    """
    transcriber = asr.create_recorded_audio_transcriber(provider_cfg)
    suffix = Path(filename).suffix.lower() or ".wav"
    provider = provider_cfg.asr_provider

    # Collect only the keyword arguments that differ between providers.
    if provider == "wyoming":
        provider_kwargs: dict[str, Any] = {"wyoming_asr_cfg": wyoming_asr_cfg}
    elif provider == "openai":
        provider_kwargs = {"openai_asr_cfg": openai_asr_cfg, "file_suffix": suffix}
    elif provider == "gemini":
        provider_kwargs = {"gemini_asr_cfg": gemini_asr_cfg, "file_suffix": suffix}
    else:
        msg = f"Unsupported ASR provider: {provider_cfg.asr_provider}"
        raise NotImplementedError(msg)

    return await transcriber(audio_data=audio_data, logger=LOGGER, **provider_kwargs)
156
+
157
+
158
async def _extract_audio_file_from_request(
    request: Request,
    audio: UploadFile | None,
) -> UploadFile:
    """Return the uploaded audio file, looking beyond the ``audio`` field if needed.

    Raises:
        HTTPException: 422 when neither the ``audio`` parameter nor any other
            form field holds a valid audio file.

    """
    # The regular 'audio' parameter wins whenever it is present.
    if audio is not None:
        return audio

    # Some clients (e.g. iOS Shortcuts) may upload under another field name,
    # so fall back to inspecting every form field.
    LOGGER.info("No 'audio' parameter found, scanning form fields for audio files")
    fields = await request.form()

    for field_name, candidate in fields.items():
        if not is_valid_audio_file(candidate):
            continue
        LOGGER.info("Found audio file in field '%s': %s", field_name, candidate.filename)
        return candidate

    # Nothing usable in the whole form.
    raise HTTPException(
        status_code=422,
        detail="No audio file provided. Ensure the form field is named 'audio' and type is 'File'.",
    )
181
+
182
+
183
def _validate_audio_file(audio: UploadFile) -> None:
    """Reject uploads without a filename or with an unsupported extension.

    Raises:
        HTTPException: 400 when the filename is missing, or when its
            lower-cased suffix is not in ``VALID_EXTENSIONS``.

    """
    has_name = bool(audio) and bool(audio.filename)
    if not has_name:
        LOGGER.error("No filename provided in request")
        raise HTTPException(status_code=400, detail="No filename provided")

    file_ext = Path(audio.filename).suffix.lower()
    if file_ext in VALID_EXTENSIONS:
        return

    raise HTTPException(
        status_code=400,
        detail=f"Unsupported audio format: {file_ext}. Supported: {', '.join(VALID_EXTENSIONS)}",
    )
196
+
197
+
198
def _cfg(key: str, defaults: dict[str, Any], opt: OptionInfo) -> Any:
    """Get config with priority: env var > config file > option default.

    Args:
        key: Name of the setting in the merged config-file mapping.
        defaults: Merged config-file values.
        opt: Option whose ``envvar`` (a name or a list of names) and
            ``default`` are consulted.

    Returns:
        The first non-empty environment value, coerced to the type of
        ``opt.default`` when that is a bool or an int; otherwise the
        config-file value for ``key``; otherwise ``opt.default``.

    Raises:
        ValueError: If an integer option's environment value is not a
            valid integer.

    """
    env_names = opt.envvar
    if isinstance(env_names, str):
        env_names = [env_names]

    for env_name in env_names or ():
        env_val = os.environ.get(env_name)
        if not env_val:
            # Unset and empty variables both count as "not provided".
            continue
        # bool is a subclass of int, so it has to be tested first; otherwise
        # a value such as "true" would be fed to int() and raise ValueError.
        if isinstance(opt.default, bool):
            return env_val.strip().lower() in ("true", "1", "yes")
        if isinstance(opt.default, int):
            return int(env_val)
        return env_val

    return defaults.get(key, opt.default)
203
+
204
+
205
def _load_transcription_configs() -> tuple[
    config.ProviderSelection,
    config.WyomingASR,
    config.OpenAIASR,
    config.GeminiASR,
    config.Ollama,
    config.OpenAILLM,
    config.GeminiLLM,
    dict[str, Any],
]:
    """Build every provider config object. Priority: env var > config file > default.

    The ``[defaults]`` section of the config file is overlaid with the
    ``[transcribe]`` section, and each field is then resolved through ``_cfg``.
    The merged mapping is returned as the last tuple element.
    """
    file_config = config.load_config()
    defaults = {**file_config.get("defaults", {}), **file_config.get("transcribe", {})}

    def pick(key: str, opt: OptionInfo) -> Any:
        # Shorthand so each field below only names its key and option.
        return _cfg(key, defaults, opt)

    providers = config.ProviderSelection(
        asr_provider=pick("asr_provider", opts.ASR_PROVIDER),
        llm_provider=pick("llm_provider", opts.LLM_PROVIDER),
        tts_provider=pick("tts_provider", opts.TTS_PROVIDER),
    )
    wyoming_asr = config.WyomingASR(
        asr_wyoming_ip=pick("asr_wyoming_ip", opts.ASR_WYOMING_IP),
        asr_wyoming_port=pick("asr_wyoming_port", opts.ASR_WYOMING_PORT),
    )
    openai_asr = config.OpenAIASR(
        asr_openai_model=pick("asr_openai_model", opts.ASR_OPENAI_MODEL),
        openai_api_key=pick("openai_api_key", opts.OPENAI_API_KEY),
        # ASR reads its own base-URL key, separate from the LLM one below.
        openai_base_url=pick("asr_openai_base_url", opts.ASR_OPENAI_BASE_URL),
        asr_openai_prompt=pick("asr_openai_prompt", opts.ASR_OPENAI_PROMPT),
    )
    gemini_asr = config.GeminiASR(
        asr_gemini_model=pick("asr_gemini_model", opts.ASR_GEMINI_MODEL),
        gemini_api_key=pick("gemini_api_key", opts.GEMINI_API_KEY),
    )
    ollama = config.Ollama(
        llm_ollama_model=pick("llm_ollama_model", opts.LLM_OLLAMA_MODEL),
        llm_ollama_host=pick("llm_ollama_host", opts.LLM_OLLAMA_HOST),
    )
    openai_llm = config.OpenAILLM(
        llm_openai_model=pick("llm_openai_model", opts.LLM_OPENAI_MODEL),
        openai_api_key=pick("openai_api_key", opts.OPENAI_API_KEY),
        openai_base_url=pick("openai_base_url", opts.OPENAI_BASE_URL),
    )
    gemini_llm = config.GeminiLLM(
        llm_gemini_model=pick("llm_gemini_model", opts.LLM_GEMINI_MODEL),
        gemini_api_key=pick("gemini_api_key", opts.GEMINI_API_KEY),
    )

    return (
        providers,
        wyoming_asr,
        openai_asr,
        gemini_asr,
        ollama,
        openai_llm,
        gemini_llm,
        defaults,
    )
264
+
265
+
266
def _convert_audio_for_local_asr(audio_data: bytes, filename: str) -> bytes:
    """Convert uploaded audio bytes to Wyoming format, logging before and after."""
    LOGGER.info("Converting %s audio to Wyoming format", filename)
    wyoming_audio = convert_audio_to_wyoming_format(audio_data, filename)
    # Only reached when the conversion call did not raise.
    LOGGER.info("Audio conversion successful")
    return wyoming_audio
272
+
273
+
274
async def _process_transcript_cleanup(
    raw_transcript: str,
    cleanup: bool,
    extra_instructions: str | None,
    defaults: dict[str, Any],
    provider_cfg: config.ProviderSelection,
    ollama_cfg: config.Ollama,
    openai_llm_cfg: config.OpenAILLM,
    gemini_llm_cfg: config.GeminiLLM,
    transcription_log: Path | None,
) -> str | None:
    """Clean up the raw transcript with the configured LLM when asked to.

    The agent instructions are extended, in order, with the config file's
    ``extra_instructions``, the per-request ``extra_instructions``, and any
    context note produced from the transcription log.

    Returns:
        None when ``cleanup`` is false; otherwise whatever
        ``process_and_update_clipboard`` returns.

    """
    if not cleanup:
        return None

    # Base instructions plus the optional extras, separated by blank lines.
    sections = [AGENT_INSTRUCTIONS]
    config_extra = defaults.get("extra_instructions", "")
    if config_extra:
        sections.append(f"{config_extra}")
    if extra_instructions:
        sections.append(f"{extra_instructions}")
    instructions = "\n\n".join(sections)

    combined_context, context_note = _build_context_payload(
        transcription_log=transcription_log,
        clipboard_snapshot=None,
    )
    if context_note:
        # The note is appended as-is, without an extra separator.
        instructions = instructions + context_note

    cleaned = await process_and_update_clipboard(
        system_prompt=SYSTEM_PROMPT,
        agent_instructions=instructions,
        provider_cfg=provider_cfg,
        ollama_cfg=ollama_cfg,
        openai_cfg=openai_llm_cfg,
        gemini_cfg=gemini_llm_cfg,
        logger=LOGGER,
        original_text=raw_transcript,
        instruction=INSTRUCTION,
        clipboard=False,  # Don't copy to clipboard in web service
        quiet=True,
        live=None,
        context=combined_context,
    )
    return cleaned
318
+
319
+
320
@app.post("/transcribe", response_model=TranscriptionResponse)
async def transcribe_audio(
    request: Request,
    form_data: Annotated[TranscriptionRequest, Depends(_parse_transcription_form)],
    audio: Annotated[UploadFile | None, File()] = None,
) -> TranscriptionResponse:
    """Transcribe audio file and optionally clean up the text.

    Args:
        request: FastAPI request object; used to scan the form for an audio
            file when the ``audio`` field is absent.
        form_data: Parsed form data with cleanup and extra_instructions.
        audio: Audio file (wav, mp3, m4a, etc.)

    Returns:
        TranscriptionResponse with raw and cleaned transcripts. Non-HTTP
        errors are reported as ``success=False`` with the exception text
        in ``error`` rather than raised.

    Raises:
        HTTPException: Propagated unchanged from audio extraction/validation
            (missing audio file, missing filename, unsupported extension).

    """
    # Initialize variables outside try block to ensure they exist in finally block
    raw_transcript = ""
    cleaned_transcript = None
    transcription_logger: TranscriptionLogger | None = None

    try:
        # Extract and validate audio file
        audio_file = await _extract_audio_file_from_request(request, audio)
        _validate_audio_file(audio_file)

        # Form values were already normalised by _parse_transcription_form
        # (it performs the string -> bool conversion for ``cleanup``).
        cleanup = form_data.cleanup
        extra_instructions = form_data.extra_instructions

        # Load all configurations (re-read on every request)
        (
            provider_cfg,
            wyoming_asr_cfg,
            openai_asr_cfg,
            gemini_asr_cfg,
            ollama_cfg,
            openai_llm_cfg,
            gemini_llm_cfg,
            defaults,
        ) = _load_transcription_configs()

        # Read uploaded file fully into memory
        audio_data = await audio_file.read()
        LOGGER.info(
            "Received audio: filename=%s, size=%d bytes, content_type=%s",
            audio_file.filename,
            len(audio_data),
            audio_file.content_type,
        )

        # Convert audio to Wyoming format if using local ASR
        if provider_cfg.asr_provider == "wyoming":
            audio_data = _convert_audio_for_local_asr(audio_data, audio_file.filename)

        # Transcribe audio using the configured provider
        raw_transcript = await _transcribe_with_provider(
            audio_data,
            audio_file.filename or "audio.wav",
            provider_cfg,
            wyoming_asr_cfg,
            openai_asr_cfg,
            gemini_asr_cfg,
        )

        if not raw_transcript:
            return TranscriptionResponse(
                raw_transcript="",
                success=False,
                error="No transcript generated from audio",
            )

        # NOTE(review): transcription_logger is still None at this point on
        # every path, so this condition is always true. Failure to create
        # the logger is tolerated; cleanup then runs without a log file.
        if transcription_logger is None:
            try:
                transcription_logger = get_default_logger()
            except Exception as log_init_error:
                LOGGER.warning("Failed to initialize transcription logger: %s", log_init_error)

        # Process transcript cleanup if requested
        cleaned_transcript = await _process_transcript_cleanup(
            raw_transcript,
            cleanup,
            extra_instructions,
            defaults,
            provider_cfg,
            ollama_cfg,
            openai_llm_cfg,
            gemini_llm_cfg,
            transcription_logger.log_file if transcription_logger else None,
        )

        # If cleanup was requested but failed, indicate partial success
        if cleanup and cleaned_transcript is None:
            return TranscriptionResponse(
                raw_transcript=raw_transcript,
                cleaned_transcript=None,
                success=True,  # Transcription succeeded even if cleanup failed
                error="Transcription successful but cleanup failed. Check LLM configuration.",
            )

        return TranscriptionResponse(
            raw_transcript=raw_transcript,
            cleaned_transcript=cleaned_transcript,
            success=True,
        )

    except HTTPException:
        # Re-raise HTTPExceptions so FastAPI handles them properly
        raise
    except Exception as e:
        # Any other failure becomes a normal response body with success=False.
        LOGGER.exception("Error during transcription")
        return TranscriptionResponse(raw_transcript="", success=False, error=str(e))
    finally:
        # Log the transcription automatically (even if it failed)
        # Only log if we have something to log
        if raw_transcript or cleaned_transcript:
            try:
                # Retry logger creation here in case the earlier attempt failed.
                transcription_logger = transcription_logger or get_default_logger()
                transcription_logger.log_transcription(
                    raw=raw_transcript,
                    processed=cleaned_transcript,
                )
            except Exception as log_error:
                # Logging is best-effort; never let it change the response.
                LOGGER.warning("Failed to log transcription: %s", log_error)