agent_cli-0.70.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. agent_cli/__init__.py +5 -0
  2. agent_cli/__main__.py +6 -0
  3. agent_cli/_extras.json +14 -0
  4. agent_cli/_requirements/.gitkeep +0 -0
  5. agent_cli/_requirements/audio.txt +79 -0
  6. agent_cli/_requirements/faster-whisper.txt +215 -0
  7. agent_cli/_requirements/kokoro.txt +425 -0
  8. agent_cli/_requirements/llm.txt +183 -0
  9. agent_cli/_requirements/memory.txt +355 -0
  10. agent_cli/_requirements/mlx-whisper.txt +222 -0
  11. agent_cli/_requirements/piper.txt +176 -0
  12. agent_cli/_requirements/rag.txt +402 -0
  13. agent_cli/_requirements/server.txt +154 -0
  14. agent_cli/_requirements/speed.txt +77 -0
  15. agent_cli/_requirements/vad.txt +155 -0
  16. agent_cli/_requirements/wyoming.txt +71 -0
  17. agent_cli/_tools.py +368 -0
  18. agent_cli/agents/__init__.py +23 -0
  19. agent_cli/agents/_voice_agent_common.py +136 -0
  20. agent_cli/agents/assistant.py +383 -0
  21. agent_cli/agents/autocorrect.py +284 -0
  22. agent_cli/agents/chat.py +496 -0
  23. agent_cli/agents/memory/__init__.py +31 -0
  24. agent_cli/agents/memory/add.py +190 -0
  25. agent_cli/agents/memory/proxy.py +160 -0
  26. agent_cli/agents/rag_proxy.py +128 -0
  27. agent_cli/agents/speak.py +209 -0
  28. agent_cli/agents/transcribe.py +671 -0
  29. agent_cli/agents/transcribe_daemon.py +499 -0
  30. agent_cli/agents/voice_edit.py +291 -0
  31. agent_cli/api.py +22 -0
  32. agent_cli/cli.py +106 -0
  33. agent_cli/config.py +503 -0
  34. agent_cli/config_cmd.py +307 -0
  35. agent_cli/constants.py +27 -0
  36. agent_cli/core/__init__.py +1 -0
  37. agent_cli/core/audio.py +461 -0
  38. agent_cli/core/audio_format.py +299 -0
  39. agent_cli/core/chroma.py +88 -0
  40. agent_cli/core/deps.py +191 -0
  41. agent_cli/core/openai_proxy.py +139 -0
  42. agent_cli/core/process.py +195 -0
  43. agent_cli/core/reranker.py +120 -0
  44. agent_cli/core/sse.py +87 -0
  45. agent_cli/core/transcription_logger.py +70 -0
  46. agent_cli/core/utils.py +526 -0
  47. agent_cli/core/vad.py +175 -0
  48. agent_cli/core/watch.py +65 -0
  49. agent_cli/dev/__init__.py +14 -0
  50. agent_cli/dev/cli.py +1588 -0
  51. agent_cli/dev/coding_agents/__init__.py +19 -0
  52. agent_cli/dev/coding_agents/aider.py +24 -0
  53. agent_cli/dev/coding_agents/base.py +167 -0
  54. agent_cli/dev/coding_agents/claude.py +39 -0
  55. agent_cli/dev/coding_agents/codex.py +24 -0
  56. agent_cli/dev/coding_agents/continue_dev.py +15 -0
  57. agent_cli/dev/coding_agents/copilot.py +24 -0
  58. agent_cli/dev/coding_agents/cursor_agent.py +48 -0
  59. agent_cli/dev/coding_agents/gemini.py +28 -0
  60. agent_cli/dev/coding_agents/opencode.py +15 -0
  61. agent_cli/dev/coding_agents/registry.py +49 -0
  62. agent_cli/dev/editors/__init__.py +19 -0
  63. agent_cli/dev/editors/base.py +89 -0
  64. agent_cli/dev/editors/cursor.py +15 -0
  65. agent_cli/dev/editors/emacs.py +46 -0
  66. agent_cli/dev/editors/jetbrains.py +56 -0
  67. agent_cli/dev/editors/nano.py +31 -0
  68. agent_cli/dev/editors/neovim.py +33 -0
  69. agent_cli/dev/editors/registry.py +59 -0
  70. agent_cli/dev/editors/sublime.py +20 -0
  71. agent_cli/dev/editors/vim.py +42 -0
  72. agent_cli/dev/editors/vscode.py +15 -0
  73. agent_cli/dev/editors/zed.py +20 -0
  74. agent_cli/dev/project.py +568 -0
  75. agent_cli/dev/registry.py +52 -0
  76. agent_cli/dev/skill/SKILL.md +141 -0
  77. agent_cli/dev/skill/examples.md +571 -0
  78. agent_cli/dev/terminals/__init__.py +19 -0
  79. agent_cli/dev/terminals/apple_terminal.py +82 -0
  80. agent_cli/dev/terminals/base.py +56 -0
  81. agent_cli/dev/terminals/gnome.py +51 -0
  82. agent_cli/dev/terminals/iterm2.py +84 -0
  83. agent_cli/dev/terminals/kitty.py +77 -0
  84. agent_cli/dev/terminals/registry.py +48 -0
  85. agent_cli/dev/terminals/tmux.py +58 -0
  86. agent_cli/dev/terminals/warp.py +132 -0
  87. agent_cli/dev/terminals/zellij.py +78 -0
  88. agent_cli/dev/worktree.py +856 -0
  89. agent_cli/docs_gen.py +417 -0
  90. agent_cli/example-config.toml +185 -0
  91. agent_cli/install/__init__.py +5 -0
  92. agent_cli/install/common.py +89 -0
  93. agent_cli/install/extras.py +174 -0
  94. agent_cli/install/hotkeys.py +48 -0
  95. agent_cli/install/services.py +87 -0
  96. agent_cli/memory/__init__.py +7 -0
  97. agent_cli/memory/_files.py +250 -0
  98. agent_cli/memory/_filters.py +63 -0
  99. agent_cli/memory/_git.py +157 -0
  100. agent_cli/memory/_indexer.py +142 -0
  101. agent_cli/memory/_ingest.py +408 -0
  102. agent_cli/memory/_persistence.py +182 -0
  103. agent_cli/memory/_prompt.py +91 -0
  104. agent_cli/memory/_retrieval.py +294 -0
  105. agent_cli/memory/_store.py +169 -0
  106. agent_cli/memory/_streaming.py +44 -0
  107. agent_cli/memory/_tasks.py +48 -0
  108. agent_cli/memory/api.py +113 -0
  109. agent_cli/memory/client.py +272 -0
  110. agent_cli/memory/engine.py +361 -0
  111. agent_cli/memory/entities.py +43 -0
  112. agent_cli/memory/models.py +112 -0
  113. agent_cli/opts.py +433 -0
  114. agent_cli/py.typed +0 -0
  115. agent_cli/rag/__init__.py +3 -0
  116. agent_cli/rag/_indexer.py +67 -0
  117. agent_cli/rag/_indexing.py +226 -0
  118. agent_cli/rag/_prompt.py +30 -0
  119. agent_cli/rag/_retriever.py +156 -0
  120. agent_cli/rag/_store.py +48 -0
  121. agent_cli/rag/_utils.py +218 -0
  122. agent_cli/rag/api.py +175 -0
  123. agent_cli/rag/client.py +299 -0
  124. agent_cli/rag/engine.py +302 -0
  125. agent_cli/rag/models.py +55 -0
  126. agent_cli/scripts/.runtime/.gitkeep +0 -0
  127. agent_cli/scripts/__init__.py +1 -0
  128. agent_cli/scripts/check_plugin_skill_sync.py +50 -0
  129. agent_cli/scripts/linux-hotkeys/README.md +63 -0
  130. agent_cli/scripts/linux-hotkeys/toggle-autocorrect.sh +45 -0
  131. agent_cli/scripts/linux-hotkeys/toggle-transcription.sh +58 -0
  132. agent_cli/scripts/linux-hotkeys/toggle-voice-edit.sh +58 -0
  133. agent_cli/scripts/macos-hotkeys/README.md +45 -0
  134. agent_cli/scripts/macos-hotkeys/skhd-config-example +5 -0
  135. agent_cli/scripts/macos-hotkeys/toggle-autocorrect.sh +12 -0
  136. agent_cli/scripts/macos-hotkeys/toggle-transcription.sh +37 -0
  137. agent_cli/scripts/macos-hotkeys/toggle-voice-edit.sh +37 -0
  138. agent_cli/scripts/nvidia-asr-server/README.md +99 -0
  139. agent_cli/scripts/nvidia-asr-server/pyproject.toml +27 -0
  140. agent_cli/scripts/nvidia-asr-server/server.py +255 -0
  141. agent_cli/scripts/nvidia-asr-server/shell.nix +32 -0
  142. agent_cli/scripts/nvidia-asr-server/uv.lock +4654 -0
  143. agent_cli/scripts/run-openwakeword.sh +11 -0
  144. agent_cli/scripts/run-piper-windows.ps1 +30 -0
  145. agent_cli/scripts/run-piper.sh +24 -0
  146. agent_cli/scripts/run-whisper-linux.sh +40 -0
  147. agent_cli/scripts/run-whisper-macos.sh +6 -0
  148. agent_cli/scripts/run-whisper-windows.ps1 +51 -0
  149. agent_cli/scripts/run-whisper.sh +9 -0
  150. agent_cli/scripts/run_faster_whisper_server.py +136 -0
  151. agent_cli/scripts/setup-linux-hotkeys.sh +72 -0
  152. agent_cli/scripts/setup-linux.sh +108 -0
  153. agent_cli/scripts/setup-macos-hotkeys.sh +61 -0
  154. agent_cli/scripts/setup-macos.sh +76 -0
  155. agent_cli/scripts/setup-windows.ps1 +63 -0
  156. agent_cli/scripts/start-all-services-windows.ps1 +53 -0
  157. agent_cli/scripts/start-all-services.sh +178 -0
  158. agent_cli/scripts/sync_extras.py +138 -0
  159. agent_cli/server/__init__.py +3 -0
  160. agent_cli/server/cli.py +721 -0
  161. agent_cli/server/common.py +222 -0
  162. agent_cli/server/model_manager.py +288 -0
  163. agent_cli/server/model_registry.py +225 -0
  164. agent_cli/server/proxy/__init__.py +3 -0
  165. agent_cli/server/proxy/api.py +444 -0
  166. agent_cli/server/streaming.py +67 -0
  167. agent_cli/server/tts/__init__.py +3 -0
  168. agent_cli/server/tts/api.py +335 -0
  169. agent_cli/server/tts/backends/__init__.py +82 -0
  170. agent_cli/server/tts/backends/base.py +139 -0
  171. agent_cli/server/tts/backends/kokoro.py +403 -0
  172. agent_cli/server/tts/backends/piper.py +253 -0
  173. agent_cli/server/tts/model_manager.py +201 -0
  174. agent_cli/server/tts/model_registry.py +28 -0
  175. agent_cli/server/tts/wyoming_handler.py +249 -0
  176. agent_cli/server/whisper/__init__.py +3 -0
  177. agent_cli/server/whisper/api.py +413 -0
  178. agent_cli/server/whisper/backends/__init__.py +89 -0
  179. agent_cli/server/whisper/backends/base.py +97 -0
  180. agent_cli/server/whisper/backends/faster_whisper.py +225 -0
  181. agent_cli/server/whisper/backends/mlx.py +270 -0
  182. agent_cli/server/whisper/languages.py +116 -0
  183. agent_cli/server/whisper/model_manager.py +157 -0
  184. agent_cli/server/whisper/model_registry.py +28 -0
  185. agent_cli/server/whisper/wyoming_handler.py +203 -0
  186. agent_cli/services/__init__.py +343 -0
  187. agent_cli/services/_wyoming_utils.py +64 -0
  188. agent_cli/services/asr.py +506 -0
  189. agent_cli/services/llm.py +228 -0
  190. agent_cli/services/tts.py +450 -0
  191. agent_cli/services/wake_word.py +142 -0
  192. agent_cli-0.70.5.dist-info/METADATA +2118 -0
  193. agent_cli-0.70.5.dist-info/RECORD +196 -0
  194. agent_cli-0.70.5.dist-info/WHEEL +4 -0
  195. agent_cli-0.70.5.dist-info/entry_points.txt +4 -0
  196. agent_cli-0.70.5.dist-info/licenses/LICENSE +21 -0
agent_cli/agents/transcribe.py
@@ -0,0 +1,671 @@
+ """Wyoming ASR Client for streaming microphone audio to a transcription server."""
+
+ from __future__ import annotations
+
+ import asyncio
+ import json
+ import logging
+ import platform
+ import time
+ from contextlib import suppress
+ from datetime import UTC, datetime, timedelta
+ from pathlib import Path  # noqa: TC003
+ from typing import Any, TypedDict
+
+ import typer
+
+ from agent_cli import config, opts
+ from agent_cli.cli import app
+ from agent_cli.core import process
+ from agent_cli.core.audio import setup_devices
+ from agent_cli.core.deps import requires_extras
+ from agent_cli.core.utils import (
+     enable_json_mode,
+     format_short_timedelta,
+     iter_lines_from_file_end,
+     maybe_live,
+     parse_json_line,
+     print_command_line_args,
+     print_input_panel,
+     print_output_panel,
+     print_with_style,
+     setup_logging,
+     signal_handling_context,
+     stop_or_status_or_toggle,
+ )
+ from agent_cli.services import (
+     GEMINI_SUPPORTED_FORMATS,
+     OPENAI_SUPPORTED_FORMATS,
+     asr,
+ )
+ from agent_cli.services.asr import (
+     create_recorded_audio_transcriber,
+     get_last_recording,
+     load_audio_from_file,
+ )
+ from agent_cli.services.llm import process_and_update_clipboard
+
+ LOGGER = logging.getLogger()
+
+
+ class TranscriptResult(TypedDict, total=False):
+     """Result of transcription with optional LLM processing."""
+
+     raw_transcript: str | None
+     transcript: str | None
+     llm_enabled: bool
+
+
+ SYSTEM_PROMPT = """
+ CRITICAL: You must respond with ONLY the cleaned transcription text. Do NOT add any prefixes, explanations, or commentary whatsoever.
+
+ WRONG responses (DO NOT DO THIS):
+ - "Sure. Here's the cleaned-up text: [text]"
+ - "Here is the cleaned text: [text]"
+ - "Certainly. Here's the cleaned-up text: [text]"
+ - Any text wrapped in quotes like "[text]"
+
+ CORRECT response: Just the cleaned text directly, nothing else.
+
+ You are an AI transcription cleanup assistant. Your purpose is to improve and refine raw speech-to-text transcriptions by correcting errors, adding proper punctuation, and enhancing readability while preserving the original meaning and intent.
+
+ Your tasks include:
+ - Correcting obvious speech recognition errors and mishearing
+ - Adding appropriate punctuation (periods, commas, question marks, etc.)
+ - Fixing capitalization where needed
+ - Removing filler words, false starts, and repeated words when they clearly weren't intentional
+ - Improving sentence structure and flow while maintaining the speaker's voice and meaning
+ - Formatting the text for better readability
+
+ Important rules:
+ - Do not change the core meaning or content of the transcription
+ - Do not add information that wasn't spoken
+ - Do not remove content unless it's clearly an error or filler
+ - Do not wrap your output in markdown or code blocks
+ """
+
+ AGENT_INSTRUCTIONS = """
+ REMINDER: Respond with ONLY the cleaned text. No prefixes like "Here's the cleaned text:" or quotes around your response.
+
+ You will be given a block of raw transcribed text enclosed in <original-text> tags, and a cleanup instruction enclosed in <instruction> tags.
+
+ Your job is to process the transcribed text according to the instruction, which will typically involve:
+ - Correcting speech recognition errors
+ - Adding proper punctuation and capitalization
+ - Removing obvious filler words and false starts
+ - Improving readability while preserving meaning
+
+ Your response must be JUST the cleaned text - nothing before it, nothing after it, no quotes around it.
+ """
+
+ INSTRUCTION = """
+ Please clean up this transcribed text by correcting any speech recognition errors, adding appropriate punctuation and capitalization, removing obvious filler words or false starts, and improving overall readability while preserving the original meaning and intent of the speaker.
+ """
+
+ RECENT_CONTEXT_LOOKBACK_SECONDS = 60 * 60  # 1 hour
+ RECENT_CONTEXT_MAX_ENTRIES = 3
+ RECENT_CONTEXT_MAX_CHARS = 500
+ RECENT_CONTEXT_READ_CHUNK_BYTES = 4096
+ CLIPBOARD_CONTEXT_MAX_CHARS = 500
+
+
+ def _build_context_line(
+     entry: dict[str, Any],
+     *,
+     now: datetime,
+     cutoff: datetime,
+     max_chars_per_entry: int,
+ ) -> tuple[str | None, bool]:
+     timestamp_str = entry.get("timestamp")
+     if not timestamp_str:
+         return None, False
+
+     try:
+         entry_ts = datetime.fromisoformat(timestamp_str)
+     except ValueError:
+         return None, False
+
+     if entry_ts < cutoff:
+         return None, True
+
+     # Both the CLI (`raw_output`/`processed_output`) and API (`raw`/`processed`)
+     # logging formats are supported, preferring the raw transcript when present.
+     text = (entry.get("raw_output") or entry.get("raw") or "").strip()
+     if not text:
+         return None, False
+
+     if max_chars_per_entry > 0 and len(text) > max_chars_per_entry:
+         text = text[:max_chars_per_entry].rstrip() + "..."
+
+     delta_str = format_short_timedelta(now - entry_ts)
+     return f"- {delta_str} ago (raw transcript): {text}", False
+
+
+ def _gather_recent_transcription_context(
+     log_file: Path,
+     *,
+     max_age_seconds: int = RECENT_CONTEXT_LOOKBACK_SECONDS,
+     max_entries: int = RECENT_CONTEXT_MAX_ENTRIES,
+     max_chars_per_entry: int = RECENT_CONTEXT_MAX_CHARS,
+     now: datetime | None = None,
+     chunk_size: int = RECENT_CONTEXT_READ_CHUNK_BYTES,
+ ) -> str | None:
+     """Return recent transcription snippets to give the LLM additional context."""
+     if max_entries <= 0 or max_age_seconds <= 0:
+         return None
+     if not log_file.exists():
+         return None
+     if chunk_size <= 0:
+         chunk_size = RECENT_CONTEXT_READ_CHUNK_BYTES
+
+     now = now or datetime.now(UTC)
+     cutoff = now - timedelta(seconds=max_age_seconds)
+     context_entries: list[str] = []
+
+     try:
+         for line in iter_lines_from_file_end(log_file, chunk_size):
+             entry = parse_json_line(line)
+             if not entry:
+                 continue
+             context_line, should_stop = _build_context_line(
+                 entry,
+                 now=now,
+                 cutoff=cutoff,
+                 max_chars_per_entry=max_chars_per_entry,
+             )
+             if should_stop:
+                 break
+             if context_line:
+                 context_entries.append(context_line)
+                 if len(context_entries) >= max_entries:
+                     break
+     except OSError as exc:
+         LOGGER.debug("Unable to read transcription log %s: %s", log_file, exc)
+         return None
+
+     if not context_entries:
+         return None
+
+     history_lines = "\n".join(reversed(context_entries))
+     header = "Recent transcript history (time deltas relative to now):\n"
+     return header + history_lines
+
+
+ def _build_context_payload(
+     *,
+     transcription_log: Path | None,
+     clipboard_snapshot: str | None,
+ ) -> tuple[str | None, str | None]:
+     """Return combined context text and the note to append to instructions."""
+     context_sections: list[str] = []
+
+     if transcription_log:
+         log_context = _gather_recent_transcription_context(transcription_log)
+         if log_context:
+             context_sections.append(log_context)
+
+     if clipboard_snapshot:
+         clipboard_text = clipboard_snapshot.strip()
+         if clipboard_text:
+             if len(clipboard_text) > CLIPBOARD_CONTEXT_MAX_CHARS:
+                 clipboard_text = clipboard_text[:CLIPBOARD_CONTEXT_MAX_CHARS].rstrip() + "..."
+             context_sections.append(
+                 "Clipboard content captured before this recording "
+                 "(truncated for safety; may be unrelated to the new request):\n"
+                 f"- {clipboard_text}",
+             )
+
+     if not context_sections:
+         return None, None
+
+     combined_context = "\n\n".join(context_sections)
+     instructions_note = (
+         "\n\n<context> contains recent log transcripts and/or clipboard text. "
+         "Treat it as optional background and clean only the text inside <original-text>."
+     )
+     return combined_context, instructions_note
+
+
+ def log_transcription(
+     log_file: Path,
+     role: str,
+     raw_transcript: str,
+     processed_transcript: str | None = None,
+     model_info: str | None = None,
+ ) -> None:
+     """Log transcription results with metadata."""
+     log_entry = {
+         "timestamp": datetime.now(UTC).isoformat(),
+         "hostname": platform.node(),
+         "role": role,
+         "model": model_info,
+         "raw_output": raw_transcript,
+         "processed_output": processed_transcript,
+     }
+
+     # Append to log file
+     with log_file.open("a", encoding="utf-8") as f:
+         f.write(json.dumps(log_entry) + "\n")
+
+
+ async def _async_main(  # noqa: PLR0912, PLR0915, C901
+     *,
+     extra_instructions: str | None,
+     provider_cfg: config.ProviderSelection,
+     general_cfg: config.General,
+     audio_in_cfg: config.AudioInput | None = None,
+     wyoming_asr_cfg: config.WyomingASR,
+     openai_asr_cfg: config.OpenAIASR,
+     gemini_asr_cfg: config.GeminiASR,
+     ollama_cfg: config.Ollama,
+     openai_llm_cfg: config.OpenAILLM,
+     gemini_llm_cfg: config.GeminiLLM,
+     llm_enabled: bool,
+     transcription_log: Path | None,
+     # Optional parameters for file-based transcription
+     audio_file_path: Path | None = None,
+     save_recording: bool = True,
+     process_name: str | None = None,
+ ) -> TranscriptResult:
+     """Unified async entry point for both live and file-based transcription."""
+     start_time = time.monotonic()
+     transcript: str | None
+
+     with maybe_live(not general_cfg.quiet) as live:
+         if audio_file_path:
+             # File-based transcription
+             # Determine if we can use native format support (skip PCM conversion)
+             suffix = audio_file_path.suffix.lower()
+             use_native_format = (
+                 provider_cfg.asr_provider == "openai" and suffix in OPENAI_SUPPORTED_FORMATS
+             ) or (provider_cfg.asr_provider == "gemini" and suffix in GEMINI_SUPPORTED_FORMATS)
+
+             # Wyoming always needs PCM, OpenAI/Gemini can use native formats
+             audio_data = load_audio_from_file(
+                 audio_file_path,
+                 LOGGER,
+                 convert_to_pcm=not use_native_format,
+             )
+             if not audio_data:
+                 print_with_style(
+                     f"❌ Failed to load audio from {audio_file_path}",
+                     style="red",
+                 )
+                 return TranscriptResult(
+                     raw_transcript=None,
+                     transcript=None,
+                     llm_enabled=False,
+                 )
+
+             recorded_transcriber = create_recorded_audio_transcriber(provider_cfg)
+
+             # Call with appropriate arguments based on provider
+             if provider_cfg.asr_provider == "openai":
+                 transcript = await recorded_transcriber(
+                     audio_data,
+                     openai_asr_cfg,
+                     LOGGER,
+                     quiet=general_cfg.quiet,
+                     file_suffix=suffix if use_native_format else ".wav",
+                     extra_instructions=extra_instructions,
+                 )
+             elif provider_cfg.asr_provider == "gemini":
+                 transcript = await recorded_transcriber(
+                     audio_data,
+                     gemini_asr_cfg,
+                     LOGGER,
+                     quiet=general_cfg.quiet,
+                     file_suffix=suffix if use_native_format else ".wav",
+                     extra_instructions=extra_instructions,
+                 )
+             elif provider_cfg.asr_provider == "wyoming":
+                 transcript = await recorded_transcriber(
+                     audio_data=audio_data,
+                     wyoming_asr_cfg=wyoming_asr_cfg,
+                     logger=LOGGER,
+                     quiet=general_cfg.quiet,
+                     extra_instructions=extra_instructions,
+                 )
+             else:
+                 msg = f"Unsupported ASR provider: {provider_cfg.asr_provider}"
+                 raise NotImplementedError(msg)
+         else:
+             # Live recording transcription
+             if not audio_in_cfg:
+                 msg = "Missing audio configuration for live recording"
+                 raise ValueError(msg)
+
+             with signal_handling_context(LOGGER, general_cfg.quiet, process_name) as stop_event:
+                 live_transcriber = asr.create_transcriber(
+                     provider_cfg,
+                     audio_in_cfg,
+                     wyoming_asr_cfg,
+                     openai_asr_cfg,
+                     gemini_asr_cfg,
+                 )
+                 transcript = await live_transcriber(
+                     logger=LOGGER,
+                     stop_event=stop_event,
+                     quiet=general_cfg.quiet,
+                     live=live,
+                     save_recording=save_recording,
+                     extra_instructions=extra_instructions,
+                 )
+
+         elapsed = time.monotonic() - start_time
+
+         if llm_enabled and transcript:
+             if not general_cfg.quiet:
+                 print_input_panel(
+                     transcript,
+                     title="📝 Raw Transcript",
+                     subtitle=f"[dim]took {elapsed:.2f}s[/dim]",
+                 )
+             clipboard_snapshot: str | None = None
+             if general_cfg.clipboard:
+                 import pyperclip  # noqa: PLC0415
+
+                 clipboard_snapshot = pyperclip.paste()
+                 pyperclip.copy(transcript)
+                 LOGGER.info("Copied raw transcript to clipboard before LLM processing.")
+             instructions = AGENT_INSTRUCTIONS
+             if extra_instructions:
+                 instructions += f"\n\n{extra_instructions}"
+
+             combined_context, context_note = _build_context_payload(
+                 transcription_log=transcription_log,
+                 clipboard_snapshot=clipboard_snapshot,
+             )
+             if context_note:
+                 instructions += context_note
+
+             # Get model info for logging
+             if provider_cfg.llm_provider == "ollama":
+                 model_info = f"{provider_cfg.llm_provider}:{ollama_cfg.llm_ollama_model}"
+             elif provider_cfg.llm_provider == "openai":
+                 model_info = f"{provider_cfg.llm_provider}:{openai_llm_cfg.llm_openai_model}"
+             elif provider_cfg.llm_provider == "gemini":
+                 model_info = f"{provider_cfg.llm_provider}:{gemini_llm_cfg.llm_gemini_model}"
+             else:
+                 msg = f"Unsupported LLM provider: {provider_cfg.llm_provider}"
+                 raise ValueError(msg)
+             processed_transcript = await process_and_update_clipboard(
+                 system_prompt=SYSTEM_PROMPT,
+                 agent_instructions=instructions,
+                 provider_cfg=provider_cfg,
+                 ollama_cfg=ollama_cfg,
+                 openai_cfg=openai_llm_cfg,
+                 gemini_cfg=gemini_llm_cfg,
+                 logger=LOGGER,
+                 original_text=transcript,
+                 instruction=INSTRUCTION,
+                 clipboard=general_cfg.clipboard,
+                 quiet=general_cfg.quiet,
+                 live=live,
+                 context=combined_context,
+             )
+
+             # Log transcription if requested
+             if transcription_log:
+                 log_transcription(
+                     log_file=transcription_log,
+                     role="assistant",
+                     raw_transcript=transcript,
+                     processed_transcript=processed_transcript,
+                     model_info=model_info,
+                 )
+             return TranscriptResult(
+                 raw_transcript=transcript,
+                 transcript=processed_transcript,
+                 llm_enabled=True,
+             )
+
+         # When not using LLM, show transcript in output panel for consistency
+         if transcript:
+             if general_cfg.quiet:
+                 # Quiet mode: print result to stdout for Keyboard Maestro to capture
+                 print(transcript)
+             else:
+                 print_output_panel(
+                     transcript,
+                     title="📝 Transcript",
+                     subtitle="[dim]Copied to clipboard[/dim]" if general_cfg.clipboard else "",
+                 )
+
+             # Log transcription if requested (raw only)
+             if transcription_log:
+                 asr_model_info = f"{provider_cfg.asr_provider}"
+                 if provider_cfg.asr_provider == "openai":
+                     asr_model_info += f":{openai_asr_cfg.asr_openai_model}"
+                 log_transcription(
+                     log_file=transcription_log,
+                     role="user",
+                     raw_transcript=transcript,
+                     processed_transcript=None,
+                     model_info=asr_model_info,
+                 )
+
+             if general_cfg.clipboard:
+                 import pyperclip  # noqa: PLC0415
+
+                 pyperclip.copy(transcript)
+                 LOGGER.info("Copied transcript to clipboard.")
+             else:
+                 LOGGER.info("Clipboard copy disabled.")
+         else:
+             LOGGER.info("Transcript empty.")
+             if not general_cfg.quiet:
+                 print_with_style("⚠️ No transcript captured.", style="yellow")
+
+     return TranscriptResult(
+         raw_transcript=transcript,
+         transcript=transcript,
+         llm_enabled=False,
+     )
+
+
+ @app.command("transcribe", rich_help_panel="Voice Commands")
+ @requires_extras("audio", "llm")
+ def transcribe(  # noqa: PLR0912
+     *,
+     extra_instructions: str | None = typer.Option(
+         None,
+         "--extra-instructions",
+         help="Additional instructions for the LLM to process the transcription.",
+         rich_help_panel="LLM Configuration",
+     ),
+     from_file: Path | None = opts.FROM_FILE,
+     last_recording: int = opts.LAST_RECORDING,
+     save_recording: bool = opts.SAVE_RECORDING,
+     # --- Provider Selection ---
+     asr_provider: str = opts.ASR_PROVIDER,
+     llm_provider: str = opts.LLM_PROVIDER,
+     # --- ASR (Audio) Configuration ---
+     input_device_index: int | None = opts.INPUT_DEVICE_INDEX,
+     input_device_name: str | None = opts.INPUT_DEVICE_NAME,
+     asr_wyoming_ip: str = opts.ASR_WYOMING_IP,
+     asr_wyoming_port: int = opts.ASR_WYOMING_PORT,
+     asr_openai_model: str = opts.ASR_OPENAI_MODEL,
+     asr_openai_base_url: str | None = opts.ASR_OPENAI_BASE_URL,
+     asr_openai_prompt: str | None = opts.ASR_OPENAI_PROMPT,
+     asr_gemini_model: str = opts.ASR_GEMINI_MODEL,
+     # --- LLM Configuration ---
+     llm_ollama_model: str = opts.LLM_OLLAMA_MODEL,
+     llm_ollama_host: str = opts.LLM_OLLAMA_HOST,
+     llm_openai_model: str = opts.LLM_OPENAI_MODEL,
+     openai_api_key: str | None = opts.OPENAI_API_KEY,
+     openai_base_url: str | None = opts.OPENAI_BASE_URL,
+     llm_gemini_model: str = opts.LLM_GEMINI_MODEL,
+     gemini_api_key: str | None = opts.GEMINI_API_KEY,
+     llm: bool = opts.LLM,
+     # --- Process Management ---
+     stop: bool = opts.STOP,
+     status: bool = opts.STATUS,
+     toggle: bool = opts.TOGGLE,
+     # --- General Options ---
+     clipboard: bool = opts.CLIPBOARD,
+     log_level: opts.LogLevel = opts.LOG_LEVEL,
+     log_file: str | None = opts.LOG_FILE,
+     list_devices: bool = opts.LIST_DEVICES,
+     quiet: bool = opts.QUIET,
+     json_output: bool = opts.JSON_OUTPUT,
+     config_file: str | None = opts.CONFIG_FILE,
+     print_args: bool = opts.PRINT_ARGS,
+     transcription_log: Path | None = opts.TRANSCRIPTION_LOG,
+ ) -> None:
+     """Wyoming ASR Client for streaming microphone audio to a transcription server."""
+     if print_args:
+         print_command_line_args(locals())
+
+     # JSON output implies quiet mode and no clipboard - set this early before any output
+     effective_quiet = quiet or json_output
+     if json_output:
+         enable_json_mode()
+
+     setup_logging(log_level, log_file, quiet=effective_quiet)
+
+     # Expand user path for transcription log
+     if transcription_log:
+         transcription_log = transcription_log.expanduser()
+
+     # Handle recovery options
+     if last_recording and from_file:
+         print_with_style("❌ Cannot use both --last-recording and --from-file", style="red")
+         return
+
+     # Determine audio source
+     audio_file_path = None
+     if last_recording > 0:  # 0 means disabled
+         audio_file_path = get_last_recording(last_recording)
+         if not audio_file_path:
+             if last_recording == 1:
+                 print_with_style("❌ No saved recordings found", style="red")
+             else:
+                 print_with_style(
+                     f"❌ Recording #{last_recording} not found (not enough recordings)",
+                     style="red",
+                 )
+             return
+         if not quiet:
+             ordinal = "most recent" if last_recording == 1 else f"#{last_recording}"
+             print_with_style(
+                 f"📁 Using {ordinal} recording: {audio_file_path.name}",
+                 style="blue",
+             )
+     elif from_file:
+         audio_file_path = from_file.expanduser()
+         if not audio_file_path.exists():
+             print_with_style(f"❌ File not found: {audio_file_path}", style="red")
+             return
+
+     # Create all config objects once
+     effective_clipboard = clipboard and not json_output
+     general_cfg = config.General(
+         log_level=log_level,
+         log_file=log_file,
+         quiet=effective_quiet,
+         list_devices=list_devices,
+         clipboard=effective_clipboard,
+     )
+     provider_cfg = config.ProviderSelection(
+         asr_provider=asr_provider,
+         llm_provider=llm_provider,
+         tts_provider="wyoming",  # Not used in transcribe
+     )
+     wyoming_asr_cfg = config.WyomingASR(
+         asr_wyoming_ip=asr_wyoming_ip,
+         asr_wyoming_port=asr_wyoming_port,
+     )
+     openai_asr_cfg = config.OpenAIASR(
+         asr_openai_model=asr_openai_model,
+         openai_api_key=openai_api_key,
+         openai_base_url=asr_openai_base_url or openai_base_url,
+         asr_openai_prompt=asr_openai_prompt,
+     )
+     gemini_asr_cfg = config.GeminiASR(
+         asr_gemini_model=asr_gemini_model,
+         gemini_api_key=gemini_api_key,
+     )
+     ollama_cfg = config.Ollama(
+         llm_ollama_model=llm_ollama_model,
+         llm_ollama_host=llm_ollama_host,
+     )
+     openai_llm_cfg = config.OpenAILLM(
+         llm_openai_model=llm_openai_model,
+         openai_api_key=openai_api_key,
+         openai_base_url=openai_base_url,
+     )
+     gemini_llm_cfg = config.GeminiLLM(
+         llm_gemini_model=llm_gemini_model,
+         gemini_api_key=gemini_api_key,
+     )
+
+     # Handle recovery mode (transcribing from file)
+     if audio_file_path:
+         # We're transcribing from a saved file
+         result = asyncio.run(
+             _async_main(
+                 audio_file_path=audio_file_path,
+                 extra_instructions=extra_instructions,
+                 provider_cfg=provider_cfg,
+                 general_cfg=general_cfg,
+                 wyoming_asr_cfg=wyoming_asr_cfg,
+                 openai_asr_cfg=openai_asr_cfg,
+                 gemini_asr_cfg=gemini_asr_cfg,
+                 ollama_cfg=ollama_cfg,
+                 openai_llm_cfg=openai_llm_cfg,
+                 gemini_llm_cfg=gemini_llm_cfg,
+                 llm_enabled=llm,
+                 transcription_log=transcription_log,
+             ),
+         )
+         if json_output:
+             print(json.dumps(result))
+         return
+
+     # Normal recording mode
+     process_name = "transcribe"
+     if stop_or_status_or_toggle(
+         process_name,
+         "transcribe",
+         stop,
+         status,
+         toggle,
+         quiet=general_cfg.quiet,
+     ):
+         return
+
+     audio_in_cfg = config.AudioInput(
+         input_device_index=input_device_index,
+         input_device_name=input_device_name,
+     )
+
+     # We only use setup_devices for its input device handling
+     device_info = setup_devices(general_cfg, audio_in_cfg, None)
+     if device_info is None:
+         return
+     input_device_index, _, _ = device_info
+     audio_in_cfg.input_device_index = input_device_index
+
+     # Use context manager for PID file management
+     with process.pid_file_context(process_name), suppress(KeyboardInterrupt):
+         result = asyncio.run(
+             _async_main(
+                 extra_instructions=extra_instructions,
+                 provider_cfg=provider_cfg,
+                 general_cfg=general_cfg,
+                 audio_in_cfg=audio_in_cfg,
+                 wyoming_asr_cfg=wyoming_asr_cfg,
+                 openai_asr_cfg=openai_asr_cfg,
+                 gemini_asr_cfg=gemini_asr_cfg,
+                 ollama_cfg=ollama_cfg,
+                 openai_llm_cfg=openai_llm_cfg,
+                 gemini_llm_cfg=gemini_llm_cfg,
+                 llm_enabled=llm,
+                 transcription_log=transcription_log,
+                 save_recording=save_recording,
+                 process_name=process_name,
+             ),
+         )
+         if json_output:
+             print(json.dumps(result))