agent-cli 0.70.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_cli/__init__.py +5 -0
- agent_cli/__main__.py +6 -0
- agent_cli/_extras.json +14 -0
- agent_cli/_requirements/.gitkeep +0 -0
- agent_cli/_requirements/audio.txt +79 -0
- agent_cli/_requirements/faster-whisper.txt +215 -0
- agent_cli/_requirements/kokoro.txt +425 -0
- agent_cli/_requirements/llm.txt +183 -0
- agent_cli/_requirements/memory.txt +355 -0
- agent_cli/_requirements/mlx-whisper.txt +222 -0
- agent_cli/_requirements/piper.txt +176 -0
- agent_cli/_requirements/rag.txt +402 -0
- agent_cli/_requirements/server.txt +154 -0
- agent_cli/_requirements/speed.txt +77 -0
- agent_cli/_requirements/vad.txt +155 -0
- agent_cli/_requirements/wyoming.txt +71 -0
- agent_cli/_tools.py +368 -0
- agent_cli/agents/__init__.py +23 -0
- agent_cli/agents/_voice_agent_common.py +136 -0
- agent_cli/agents/assistant.py +383 -0
- agent_cli/agents/autocorrect.py +284 -0
- agent_cli/agents/chat.py +496 -0
- agent_cli/agents/memory/__init__.py +31 -0
- agent_cli/agents/memory/add.py +190 -0
- agent_cli/agents/memory/proxy.py +160 -0
- agent_cli/agents/rag_proxy.py +128 -0
- agent_cli/agents/speak.py +209 -0
- agent_cli/agents/transcribe.py +671 -0
- agent_cli/agents/transcribe_daemon.py +499 -0
- agent_cli/agents/voice_edit.py +291 -0
- agent_cli/api.py +22 -0
- agent_cli/cli.py +106 -0
- agent_cli/config.py +503 -0
- agent_cli/config_cmd.py +307 -0
- agent_cli/constants.py +27 -0
- agent_cli/core/__init__.py +1 -0
- agent_cli/core/audio.py +461 -0
- agent_cli/core/audio_format.py +299 -0
- agent_cli/core/chroma.py +88 -0
- agent_cli/core/deps.py +191 -0
- agent_cli/core/openai_proxy.py +139 -0
- agent_cli/core/process.py +195 -0
- agent_cli/core/reranker.py +120 -0
- agent_cli/core/sse.py +87 -0
- agent_cli/core/transcription_logger.py +70 -0
- agent_cli/core/utils.py +526 -0
- agent_cli/core/vad.py +175 -0
- agent_cli/core/watch.py +65 -0
- agent_cli/dev/__init__.py +14 -0
- agent_cli/dev/cli.py +1588 -0
- agent_cli/dev/coding_agents/__init__.py +19 -0
- agent_cli/dev/coding_agents/aider.py +24 -0
- agent_cli/dev/coding_agents/base.py +167 -0
- agent_cli/dev/coding_agents/claude.py +39 -0
- agent_cli/dev/coding_agents/codex.py +24 -0
- agent_cli/dev/coding_agents/continue_dev.py +15 -0
- agent_cli/dev/coding_agents/copilot.py +24 -0
- agent_cli/dev/coding_agents/cursor_agent.py +48 -0
- agent_cli/dev/coding_agents/gemini.py +28 -0
- agent_cli/dev/coding_agents/opencode.py +15 -0
- agent_cli/dev/coding_agents/registry.py +49 -0
- agent_cli/dev/editors/__init__.py +19 -0
- agent_cli/dev/editors/base.py +89 -0
- agent_cli/dev/editors/cursor.py +15 -0
- agent_cli/dev/editors/emacs.py +46 -0
- agent_cli/dev/editors/jetbrains.py +56 -0
- agent_cli/dev/editors/nano.py +31 -0
- agent_cli/dev/editors/neovim.py +33 -0
- agent_cli/dev/editors/registry.py +59 -0
- agent_cli/dev/editors/sublime.py +20 -0
- agent_cli/dev/editors/vim.py +42 -0
- agent_cli/dev/editors/vscode.py +15 -0
- agent_cli/dev/editors/zed.py +20 -0
- agent_cli/dev/project.py +568 -0
- agent_cli/dev/registry.py +52 -0
- agent_cli/dev/skill/SKILL.md +141 -0
- agent_cli/dev/skill/examples.md +571 -0
- agent_cli/dev/terminals/__init__.py +19 -0
- agent_cli/dev/terminals/apple_terminal.py +82 -0
- agent_cli/dev/terminals/base.py +56 -0
- agent_cli/dev/terminals/gnome.py +51 -0
- agent_cli/dev/terminals/iterm2.py +84 -0
- agent_cli/dev/terminals/kitty.py +77 -0
- agent_cli/dev/terminals/registry.py +48 -0
- agent_cli/dev/terminals/tmux.py +58 -0
- agent_cli/dev/terminals/warp.py +132 -0
- agent_cli/dev/terminals/zellij.py +78 -0
- agent_cli/dev/worktree.py +856 -0
- agent_cli/docs_gen.py +417 -0
- agent_cli/example-config.toml +185 -0
- agent_cli/install/__init__.py +5 -0
- agent_cli/install/common.py +89 -0
- agent_cli/install/extras.py +174 -0
- agent_cli/install/hotkeys.py +48 -0
- agent_cli/install/services.py +87 -0
- agent_cli/memory/__init__.py +7 -0
- agent_cli/memory/_files.py +250 -0
- agent_cli/memory/_filters.py +63 -0
- agent_cli/memory/_git.py +157 -0
- agent_cli/memory/_indexer.py +142 -0
- agent_cli/memory/_ingest.py +408 -0
- agent_cli/memory/_persistence.py +182 -0
- agent_cli/memory/_prompt.py +91 -0
- agent_cli/memory/_retrieval.py +294 -0
- agent_cli/memory/_store.py +169 -0
- agent_cli/memory/_streaming.py +44 -0
- agent_cli/memory/_tasks.py +48 -0
- agent_cli/memory/api.py +113 -0
- agent_cli/memory/client.py +272 -0
- agent_cli/memory/engine.py +361 -0
- agent_cli/memory/entities.py +43 -0
- agent_cli/memory/models.py +112 -0
- agent_cli/opts.py +433 -0
- agent_cli/py.typed +0 -0
- agent_cli/rag/__init__.py +3 -0
- agent_cli/rag/_indexer.py +67 -0
- agent_cli/rag/_indexing.py +226 -0
- agent_cli/rag/_prompt.py +30 -0
- agent_cli/rag/_retriever.py +156 -0
- agent_cli/rag/_store.py +48 -0
- agent_cli/rag/_utils.py +218 -0
- agent_cli/rag/api.py +175 -0
- agent_cli/rag/client.py +299 -0
- agent_cli/rag/engine.py +302 -0
- agent_cli/rag/models.py +55 -0
- agent_cli/scripts/.runtime/.gitkeep +0 -0
- agent_cli/scripts/__init__.py +1 -0
- agent_cli/scripts/check_plugin_skill_sync.py +50 -0
- agent_cli/scripts/linux-hotkeys/README.md +63 -0
- agent_cli/scripts/linux-hotkeys/toggle-autocorrect.sh +45 -0
- agent_cli/scripts/linux-hotkeys/toggle-transcription.sh +58 -0
- agent_cli/scripts/linux-hotkeys/toggle-voice-edit.sh +58 -0
- agent_cli/scripts/macos-hotkeys/README.md +45 -0
- agent_cli/scripts/macos-hotkeys/skhd-config-example +5 -0
- agent_cli/scripts/macos-hotkeys/toggle-autocorrect.sh +12 -0
- agent_cli/scripts/macos-hotkeys/toggle-transcription.sh +37 -0
- agent_cli/scripts/macos-hotkeys/toggle-voice-edit.sh +37 -0
- agent_cli/scripts/nvidia-asr-server/README.md +99 -0
- agent_cli/scripts/nvidia-asr-server/pyproject.toml +27 -0
- agent_cli/scripts/nvidia-asr-server/server.py +255 -0
- agent_cli/scripts/nvidia-asr-server/shell.nix +32 -0
- agent_cli/scripts/nvidia-asr-server/uv.lock +4654 -0
- agent_cli/scripts/run-openwakeword.sh +11 -0
- agent_cli/scripts/run-piper-windows.ps1 +30 -0
- agent_cli/scripts/run-piper.sh +24 -0
- agent_cli/scripts/run-whisper-linux.sh +40 -0
- agent_cli/scripts/run-whisper-macos.sh +6 -0
- agent_cli/scripts/run-whisper-windows.ps1 +51 -0
- agent_cli/scripts/run-whisper.sh +9 -0
- agent_cli/scripts/run_faster_whisper_server.py +136 -0
- agent_cli/scripts/setup-linux-hotkeys.sh +72 -0
- agent_cli/scripts/setup-linux.sh +108 -0
- agent_cli/scripts/setup-macos-hotkeys.sh +61 -0
- agent_cli/scripts/setup-macos.sh +76 -0
- agent_cli/scripts/setup-windows.ps1 +63 -0
- agent_cli/scripts/start-all-services-windows.ps1 +53 -0
- agent_cli/scripts/start-all-services.sh +178 -0
- agent_cli/scripts/sync_extras.py +138 -0
- agent_cli/server/__init__.py +3 -0
- agent_cli/server/cli.py +721 -0
- agent_cli/server/common.py +222 -0
- agent_cli/server/model_manager.py +288 -0
- agent_cli/server/model_registry.py +225 -0
- agent_cli/server/proxy/__init__.py +3 -0
- agent_cli/server/proxy/api.py +444 -0
- agent_cli/server/streaming.py +67 -0
- agent_cli/server/tts/__init__.py +3 -0
- agent_cli/server/tts/api.py +335 -0
- agent_cli/server/tts/backends/__init__.py +82 -0
- agent_cli/server/tts/backends/base.py +139 -0
- agent_cli/server/tts/backends/kokoro.py +403 -0
- agent_cli/server/tts/backends/piper.py +253 -0
- agent_cli/server/tts/model_manager.py +201 -0
- agent_cli/server/tts/model_registry.py +28 -0
- agent_cli/server/tts/wyoming_handler.py +249 -0
- agent_cli/server/whisper/__init__.py +3 -0
- agent_cli/server/whisper/api.py +413 -0
- agent_cli/server/whisper/backends/__init__.py +89 -0
- agent_cli/server/whisper/backends/base.py +97 -0
- agent_cli/server/whisper/backends/faster_whisper.py +225 -0
- agent_cli/server/whisper/backends/mlx.py +270 -0
- agent_cli/server/whisper/languages.py +116 -0
- agent_cli/server/whisper/model_manager.py +157 -0
- agent_cli/server/whisper/model_registry.py +28 -0
- agent_cli/server/whisper/wyoming_handler.py +203 -0
- agent_cli/services/__init__.py +343 -0
- agent_cli/services/_wyoming_utils.py +64 -0
- agent_cli/services/asr.py +506 -0
- agent_cli/services/llm.py +228 -0
- agent_cli/services/tts.py +450 -0
- agent_cli/services/wake_word.py +142 -0
- agent_cli-0.70.5.dist-info/METADATA +2118 -0
- agent_cli-0.70.5.dist-info/RECORD +196 -0
- agent_cli-0.70.5.dist-info/WHEEL +4 -0
- agent_cli-0.70.5.dist-info/entry_points.txt +4 -0
- agent_cli-0.70.5.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,671 @@
|
|
|
1
|
+
"""Wyoming ASR Client for streaming microphone audio to a transcription server."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
import platform
|
|
9
|
+
import time
|
|
10
|
+
from contextlib import suppress
|
|
11
|
+
from datetime import UTC, datetime, timedelta
|
|
12
|
+
from pathlib import Path # noqa: TC003
|
|
13
|
+
from typing import Any, TypedDict
|
|
14
|
+
|
|
15
|
+
import typer
|
|
16
|
+
|
|
17
|
+
from agent_cli import config, opts
|
|
18
|
+
from agent_cli.cli import app
|
|
19
|
+
from agent_cli.core import process
|
|
20
|
+
from agent_cli.core.audio import setup_devices
|
|
21
|
+
from agent_cli.core.deps import requires_extras
|
|
22
|
+
from agent_cli.core.utils import (
|
|
23
|
+
enable_json_mode,
|
|
24
|
+
format_short_timedelta,
|
|
25
|
+
iter_lines_from_file_end,
|
|
26
|
+
maybe_live,
|
|
27
|
+
parse_json_line,
|
|
28
|
+
print_command_line_args,
|
|
29
|
+
print_input_panel,
|
|
30
|
+
print_output_panel,
|
|
31
|
+
print_with_style,
|
|
32
|
+
setup_logging,
|
|
33
|
+
signal_handling_context,
|
|
34
|
+
stop_or_status_or_toggle,
|
|
35
|
+
)
|
|
36
|
+
from agent_cli.services import (
|
|
37
|
+
GEMINI_SUPPORTED_FORMATS,
|
|
38
|
+
OPENAI_SUPPORTED_FORMATS,
|
|
39
|
+
asr,
|
|
40
|
+
)
|
|
41
|
+
from agent_cli.services.asr import (
|
|
42
|
+
create_recorded_audio_transcriber,
|
|
43
|
+
get_last_recording,
|
|
44
|
+
load_audio_from_file,
|
|
45
|
+
)
|
|
46
|
+
from agent_cli.services.llm import process_and_update_clipboard
|
|
47
|
+
|
|
48
|
+
# Module-wide logger. Calling getLogger() without a name returns the root logger,
# so records emitted here go through whatever handlers are installed on the root.
LOGGER = logging.getLogger()
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class TranscriptResult(TypedDict, total=False):
    """Outcome of one transcription run, with the optional LLM clean-up result.

    Declared with ``total=False``, so every key may be omitted.
    """

    # Text exactly as returned by the ASR step; None when nothing was produced.
    raw_transcript: str | None
    # Final text: the LLM-processed version when LLM clean-up ran, otherwise the
    # same value as ``raw_transcript``.
    transcript: str | None
    # True when ``transcript`` came from the LLM clean-up step.
    llm_enabled: bool
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# System prompt for the LLM clean-up step: sets the "output only the cleaned text"
# contract first, then describes the assistant's role, tasks and rules.
SYSTEM_PROMPT = """
CRITICAL: You must respond with ONLY the cleaned transcription text. Do NOT add any prefixes, explanations, or commentary whatsoever.

WRONG responses (DO NOT DO THIS):
- "Sure. Here's the cleaned-up text: [text]"
- "Here is the cleaned text: [text]"
- "Certainly. Here's the cleaned-up text: [text]"
- Any text wrapped in quotes like "[text]"

CORRECT response: Just the cleaned text directly, nothing else.

You are an AI transcription cleanup assistant. Your purpose is to improve and refine raw speech-to-text transcriptions by correcting errors, adding proper punctuation, and enhancing readability while preserving the original meaning and intent.

Your tasks include:
- Correcting obvious speech recognition errors and mishearing
- Adding appropriate punctuation (periods, commas, question marks, etc.)
- Fixing capitalization where needed
- Removing filler words, false starts, and repeated words when they clearly weren't intentional
- Improving sentence structure and flow while maintaining the speaker's voice and meaning
- Formatting the text for better readability

Important rules:
- Do not change the core meaning or content of the transcription
- Do not add information that wasn't spoken
- Do not remove content unless it's clearly an error or filler
- Do not wrap your output in markdown or code blocks
"""

# Per-request instructions: explain the <original-text>/<instruction> tag layout
# of the request. `_async_main` appends extra instructions and a context note to
# this text before sending it.
AGENT_INSTRUCTIONS = """
REMINDER: Respond with ONLY the cleaned text. No prefixes like "Here's the cleaned text:" or quotes around your response.

You will be given a block of raw transcribed text enclosed in <original-text> tags, and a cleanup instruction enclosed in <instruction> tags.

Your job is to process the transcribed text according to the instruction, which will typically involve:
- Correcting speech recognition errors
- Adding proper punctuation and capitalization
- Removing obvious filler words and false starts
- Improving readability while preserving meaning

Your response must be JUST the cleaned text - nothing before it, nothing after it, no quotes around it.
"""

# The fixed clean-up instruction passed as `instruction=` to the LLM call.
INSTRUCTION = """
Please clean up this transcribed text by correcting any speech recognition errors, adding appropriate punctuation and capitalization, removing obvious filler words or false starts, and improving overall readability while preserving the original meaning and intent of the speaker.
"""
|
|
104
|
+
|
|
105
|
+
# Defaults for `_gather_recent_transcription_context`:
# how far back log entries are considered (seconds),
RECENT_CONTEXT_LOOKBACK_SECONDS = 60 * 60  # 1 hour
# how many entries are returned at most,
RECENT_CONTEXT_MAX_ENTRIES = 3
# how many characters of each entry are kept before it is cut with "...",
RECENT_CONTEXT_MAX_CHARS = 500
# and the chunk size (bytes) handed to the reader that scans the log file.
RECENT_CONTEXT_READ_CHUNK_BYTES = 4096
# Clipboard text longer than this is cut with "..." in `_build_context_payload`.
CLIPBOARD_CONTEXT_MAX_CHARS = 500
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _build_context_line(
    entry: dict[str, Any],
    *,
    now: datetime,
    cutoff: datetime,
    max_chars_per_entry: int,
) -> tuple[str | None, bool]:
    """Format one transcription-log entry as a bullet line of recent context.

    Malformed entries are skipped instead of raising, so a single bad log line
    cannot abort the transcription that is asking for context.

    Args:
        entry: Parsed log record. Reads ``timestamp`` (ISO 8601 string) and the
            raw transcript from ``raw_output`` or, failing that, ``raw``.
        now: Reference time used to compute the "... ago" label.
        cutoff: Entries with a timestamp earlier than this are rejected.
        max_chars_per_entry: Maximum transcript length; longer text is cut and
            suffixed with ``...``. Values ``<= 0`` disable truncation.

    Returns:
        ``(line, older_than_cutoff)``. ``line`` is the formatted bullet, or
        ``None`` when the entry is unusable or too old. ``older_than_cutoff`` is
        ``True`` only when the entry's timestamp lies before ``cutoff``.
    """
    timestamp_str = entry.get("timestamp")
    if not timestamp_str:
        return None, False

    try:
        entry_ts = datetime.fromisoformat(timestamp_str)
    except (TypeError, ValueError):
        # TypeError: the field is not a string; ValueError: not valid ISO 8601.
        return None, False

    # A timestamp written without a UTC offset cannot be compared with an aware
    # cutoff (TypeError). Interpret it in the cutoff's timezone instead.
    # NOTE(review): assumes naive log timestamps use the same clock as `cutoff`.
    if entry_ts.tzinfo is None and cutoff.tzinfo is not None:
        entry_ts = entry_ts.replace(tzinfo=cutoff.tzinfo)

    if entry_ts < cutoff:
        return None, True

    # Both the CLI (`raw_output`/`processed_output`) and API (`raw`/`processed`)
    # logging formats are supported, preferring the raw transcript when present.
    raw_text = entry.get("raw_output") or entry.get("raw")
    if not isinstance(raw_text, str):
        # Missing (None) or a non-text value: nothing usable to show.
        return None, False

    text = raw_text.strip()
    if not text:
        return None, False

    if max_chars_per_entry > 0 and len(text) > max_chars_per_entry:
        text = text[:max_chars_per_entry].rstrip() + "..."

    delta_str = format_short_timedelta(now - entry_ts)
    return f"- {delta_str} ago (raw transcript): {text}", False
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _gather_recent_transcription_context(
    log_file: Path,
    *,
    max_age_seconds: int = RECENT_CONTEXT_LOOKBACK_SECONDS,
    max_entries: int = RECENT_CONTEXT_MAX_ENTRIES,
    max_chars_per_entry: int = RECENT_CONTEXT_MAX_CHARS,
    now: datetime | None = None,
    chunk_size: int = RECENT_CONTEXT_READ_CHUNK_BYTES,
) -> str | None:
    """Collect recent transcript snippets from the log as extra LLM context.

    Args:
        log_file: Transcription log with one JSON record per line.
        max_age_seconds: Entries older than this many seconds are not used.
        max_entries: Upper bound on the number of snippets returned.
        max_chars_per_entry: Per-snippet length limit passed to the formatter.
        now: Reference time; defaults to the current UTC time.
        chunk_size: Read size handed to the file reader; non-positive values
            fall back to ``RECENT_CONTEXT_READ_CHUNK_BYTES``.

    Returns:
        A header line followed by the snippets, or ``None`` when the limits are
        non-positive, the file is missing or unreadable, or nothing qualifies.
    """
    limits_disabled = max_entries <= 0 or max_age_seconds <= 0
    if limits_disabled or not log_file.exists():
        return None

    read_size = chunk_size if chunk_size > 0 else RECENT_CONTEXT_READ_CHUNK_BYTES
    reference_time = now or datetime.now(UTC)
    oldest_allowed = reference_time - timedelta(seconds=max_age_seconds)

    # Snippets in the order the reader yields them (presumably newest first,
    # since the helper reads from the end of the file); reversed before return.
    collected: list[str] = []
    try:
        for raw_line in iter_lines_from_file_end(log_file, read_size):
            record = parse_json_line(raw_line)
            if not record:
                continue
            snippet, past_cutoff = _build_context_line(
                record,
                now=reference_time,
                cutoff=oldest_allowed,
                max_chars_per_entry=max_chars_per_entry,
            )
            if past_cutoff:
                # First entry older than the cutoff ends the scan.
                break
            if not snippet:
                continue
            collected.append(snippet)
            if len(collected) >= max_entries:
                break
    except OSError as exc:
        LOGGER.debug("Unable to read transcription log %s: %s", log_file, exc)
        return None

    if not collected:
        return None

    return "Recent transcript history (time deltas relative to now):\n" + "\n".join(
        collected[::-1],
    )
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _build_context_payload(
    *,
    transcription_log: Path | None,
    clipboard_snapshot: str | None,
) -> tuple[str | None, str | None]:
    """Assemble the optional context block and the matching instruction note.

    Args:
        transcription_log: Log file to pull recent transcripts from, if any.
        clipboard_snapshot: Clipboard text captured before the recording, if any.

    Returns:
        ``(context, note)``: the context sections joined by blank lines and the
        note to append to the agent instructions, or ``(None, None)`` when
        neither source yields any text.
    """
    sections: list[str] = []

    history = (
        _gather_recent_transcription_context(transcription_log) if transcription_log else None
    )
    if history:
        sections.append(history)

    clipped = clipboard_snapshot.strip() if clipboard_snapshot else ""
    if clipped:
        if len(clipped) > CLIPBOARD_CONTEXT_MAX_CHARS:
            clipped = clipped[:CLIPBOARD_CONTEXT_MAX_CHARS].rstrip() + "..."
        sections.append(
            "Clipboard content captured before this recording "
            "(truncated for safety; may be unrelated to the new request):\n"
            f"- {clipped}",
        )

    if not sections:
        return None, None

    note = (
        "\n\n<context> contains recent log transcripts and/or clipboard text. "
        "Treat it as optional background and clean only the text inside <original-text>."
    )
    return "\n\n".join(sections), note
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def log_transcription(
|
|
230
|
+
log_file: Path,
|
|
231
|
+
role: str,
|
|
232
|
+
raw_transcript: str,
|
|
233
|
+
processed_transcript: str | None = None,
|
|
234
|
+
model_info: str | None = None,
|
|
235
|
+
) -> None:
|
|
236
|
+
"""Log transcription results with metadata."""
|
|
237
|
+
log_entry = {
|
|
238
|
+
"timestamp": datetime.now(UTC).isoformat(),
|
|
239
|
+
"hostname": platform.node(),
|
|
240
|
+
"role": role,
|
|
241
|
+
"model": model_info,
|
|
242
|
+
"raw_output": raw_transcript,
|
|
243
|
+
"processed_output": processed_transcript,
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
# Append to log file
|
|
247
|
+
with log_file.open("a", encoding="utf-8") as f:
|
|
248
|
+
f.write(json.dumps(log_entry) + "\n")
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
async def _async_main(  # noqa: PLR0912, PLR0915, C901
    *,
    extra_instructions: str | None,
    provider_cfg: config.ProviderSelection,
    general_cfg: config.General,
    audio_in_cfg: config.AudioInput | None = None,
    wyoming_asr_cfg: config.WyomingASR,
    openai_asr_cfg: config.OpenAIASR,
    gemini_asr_cfg: config.GeminiASR,
    ollama_cfg: config.Ollama,
    openai_llm_cfg: config.OpenAILLM,
    gemini_llm_cfg: config.GeminiLLM,
    llm_enabled: bool,
    transcription_log: Path | None,
    # Optional parameters for file-based transcription
    audio_file_path: Path | None = None,
    save_recording: bool = True,
    process_name: str | None = None,
) -> TranscriptResult:
    """Transcribe a file or a live recording, optionally cleaning it with an LLM.

    When ``audio_file_path`` is given the file is transcribed; otherwise audio
    is recorded live, which requires ``audio_in_cfg``. If ``llm_enabled`` is set
    and a transcript was produced, it is passed through the LLM clean-up step.
    The result is logged to ``transcription_log`` when one is configured.

    Returns:
        A ``TranscriptResult`` with the raw transcript, the final transcript and
        whether the LLM step produced it. All-``None`` values are returned when
        the audio file cannot be loaded.

    Raises:
        NotImplementedError: Unsupported ASR provider in file mode.
        ValueError: ``audio_in_cfg`` missing in live mode, or unsupported LLM
            provider when the LLM step runs.
    """
    started_at = time.monotonic()
    quiet = general_cfg.quiet
    asr_provider = provider_cfg.asr_provider
    transcript: str | None

    with maybe_live(not quiet) as live:
        if audio_file_path:
            # File-based transcription
            # Determine if we can use native format support (skip PCM conversion)
            suffix = audio_file_path.suffix.lower()
            use_native_format = (
                asr_provider == "openai" and suffix in OPENAI_SUPPORTED_FORMATS
            ) or (asr_provider == "gemini" and suffix in GEMINI_SUPPORTED_FORMATS)

            # Wyoming always needs PCM, OpenAI/Gemini can use native formats
            audio_data = load_audio_from_file(
                audio_file_path,
                LOGGER,
                convert_to_pcm=not use_native_format,
            )
            if not audio_data:
                print_with_style(
                    f"❌ Failed to load audio from {audio_file_path}",
                    style="red",
                )
                return TranscriptResult(
                    raw_transcript=None,
                    transcript=None,
                    llm_enabled=False,
                )

            recorded_transcriber = create_recorded_audio_transcriber(provider_cfg)
            # Converted audio is submitted as WAV; native uploads keep their suffix.
            upload_suffix = suffix if use_native_format else ".wav"

            # Call with appropriate arguments based on provider
            if asr_provider == "openai":
                transcript = await recorded_transcriber(
                    audio_data,
                    openai_asr_cfg,
                    LOGGER,
                    quiet=quiet,
                    file_suffix=upload_suffix,
                    extra_instructions=extra_instructions,
                )
            elif asr_provider == "gemini":
                transcript = await recorded_transcriber(
                    audio_data,
                    gemini_asr_cfg,
                    LOGGER,
                    quiet=quiet,
                    file_suffix=upload_suffix,
                    extra_instructions=extra_instructions,
                )
            elif asr_provider == "wyoming":
                transcript = await recorded_transcriber(
                    audio_data=audio_data,
                    wyoming_asr_cfg=wyoming_asr_cfg,
                    logger=LOGGER,
                    quiet=quiet,
                    extra_instructions=extra_instructions,
                )
            else:
                msg = f"Unsupported ASR provider: {provider_cfg.asr_provider}"
                raise NotImplementedError(msg)
        else:
            # Live recording transcription
            if not audio_in_cfg:
                msg = "Missing audio configuration for live recording"
                raise ValueError(msg)

            with signal_handling_context(LOGGER, quiet, process_name) as stop_event:
                live_transcriber = asr.create_transcriber(
                    provider_cfg,
                    audio_in_cfg,
                    wyoming_asr_cfg,
                    openai_asr_cfg,
                    gemini_asr_cfg,
                )
                transcript = await live_transcriber(
                    logger=LOGGER,
                    stop_event=stop_event,
                    quiet=quiet,
                    live=live,
                    save_recording=save_recording,
                    extra_instructions=extra_instructions,
                )

        elapsed = time.monotonic() - started_at

        if llm_enabled and transcript:
            if not quiet:
                print_input_panel(
                    transcript,
                    title="📝 Raw Transcript",
                    subtitle=f"[dim]took {elapsed:.2f}s[/dim]",
                )

            clipboard_snapshot: str | None = None
            if general_cfg.clipboard:
                import pyperclip  # noqa: PLC0415

                # Capture the previous clipboard text for use as context, then
                # put the raw transcript on the clipboard before the LLM call.
                clipboard_snapshot = pyperclip.paste()
                pyperclip.copy(transcript)
                LOGGER.info("Copied raw transcript to clipboard before LLM processing.")

            instructions = AGENT_INSTRUCTIONS
            if extra_instructions:
                instructions += f"\n\n{extra_instructions}"

            combined_context, context_note = _build_context_payload(
                transcription_log=transcription_log,
                clipboard_snapshot=clipboard_snapshot,
            )
            if context_note:
                instructions += context_note

            # Get model info for logging
            llm_provider = provider_cfg.llm_provider
            if llm_provider == "ollama":
                model_info = f"{llm_provider}:{ollama_cfg.llm_ollama_model}"
            elif llm_provider == "openai":
                model_info = f"{llm_provider}:{openai_llm_cfg.llm_openai_model}"
            elif llm_provider == "gemini":
                model_info = f"{llm_provider}:{gemini_llm_cfg.llm_gemini_model}"
            else:
                msg = f"Unsupported LLM provider: {provider_cfg.llm_provider}"
                raise ValueError(msg)

            processed_transcript = await process_and_update_clipboard(
                system_prompt=SYSTEM_PROMPT,
                agent_instructions=instructions,
                provider_cfg=provider_cfg,
                ollama_cfg=ollama_cfg,
                openai_cfg=openai_llm_cfg,
                gemini_cfg=gemini_llm_cfg,
                logger=LOGGER,
                original_text=transcript,
                instruction=INSTRUCTION,
                clipboard=general_cfg.clipboard,
                quiet=quiet,
                live=live,
                context=combined_context,
            )

            # Log transcription if requested
            if transcription_log:
                log_transcription(
                    log_file=transcription_log,
                    role="assistant",
                    raw_transcript=transcript,
                    processed_transcript=processed_transcript,
                    model_info=model_info,
                )
            return TranscriptResult(
                raw_transcript=transcript,
                transcript=processed_transcript,
                llm_enabled=True,
            )

    # From here on no LLM processing took place: the raw transcript is the result.
    unprocessed = TranscriptResult(
        raw_transcript=transcript,
        transcript=transcript,
        llm_enabled=False,
    )

    if not transcript:
        LOGGER.info("Transcript empty.")
        if not quiet:
            print_with_style("⚠️ No transcript captured.", style="yellow")
        return unprocessed

    # When not using LLM, show transcript in output panel for consistency
    if quiet:
        # Quiet mode: print result to stdout for Keyboard Maestro to capture
        print(transcript)
    else:
        print_output_panel(
            transcript,
            title="📝 Transcript",
            subtitle="[dim]Copied to clipboard[/dim]" if general_cfg.clipboard else "",
        )

    # Log transcription if requested (raw only)
    if transcription_log:
        asr_model_info = f"{asr_provider}"
        if asr_provider == "openai":
            asr_model_info = f"{asr_model_info}:{openai_asr_cfg.asr_openai_model}"
        log_transcription(
            log_file=transcription_log,
            role="user",
            raw_transcript=transcript,
            processed_transcript=None,
            model_info=asr_model_info,
        )

    if general_cfg.clipboard:
        import pyperclip  # noqa: PLC0415

        pyperclip.copy(transcript)
        LOGGER.info("Copied transcript to clipboard.")
    else:
        LOGGER.info("Clipboard copy disabled.")

    return unprocessed
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
@app.command("transcribe", rich_help_panel="Voice Commands")
@requires_extras("audio", "llm")
def transcribe(  # noqa: PLR0912
    *,
    extra_instructions: str | None = typer.Option(
        None,
        "--extra-instructions",
        help="Additional instructions for the LLM to process the transcription.",
        rich_help_panel="LLM Configuration",
    ),
    from_file: Path | None = opts.FROM_FILE,
    last_recording: int = opts.LAST_RECORDING,
    save_recording: bool = opts.SAVE_RECORDING,
    # --- Provider Selection ---
    asr_provider: str = opts.ASR_PROVIDER,
    llm_provider: str = opts.LLM_PROVIDER,
    # --- ASR (Audio) Configuration ---
    input_device_index: int | None = opts.INPUT_DEVICE_INDEX,
    input_device_name: str | None = opts.INPUT_DEVICE_NAME,
    asr_wyoming_ip: str = opts.ASR_WYOMING_IP,
    asr_wyoming_port: int = opts.ASR_WYOMING_PORT,
    asr_openai_model: str = opts.ASR_OPENAI_MODEL,
    asr_openai_base_url: str | None = opts.ASR_OPENAI_BASE_URL,
    asr_openai_prompt: str | None = opts.ASR_OPENAI_PROMPT,
    asr_gemini_model: str = opts.ASR_GEMINI_MODEL,
    # --- LLM Configuration ---
    llm_ollama_model: str = opts.LLM_OLLAMA_MODEL,
    llm_ollama_host: str = opts.LLM_OLLAMA_HOST,
    llm_openai_model: str = opts.LLM_OPENAI_MODEL,
    openai_api_key: str | None = opts.OPENAI_API_KEY,
    openai_base_url: str | None = opts.OPENAI_BASE_URL,
    llm_gemini_model: str = opts.LLM_GEMINI_MODEL,
    gemini_api_key: str | None = opts.GEMINI_API_KEY,
    llm: bool = opts.LLM,
    # --- Process Management ---
    stop: bool = opts.STOP,
    status: bool = opts.STATUS,
    toggle: bool = opts.TOGGLE,
    # --- General Options ---
    clipboard: bool = opts.CLIPBOARD,
    log_level: opts.LogLevel = opts.LOG_LEVEL,
    log_file: str | None = opts.LOG_FILE,
    list_devices: bool = opts.LIST_DEVICES,
    quiet: bool = opts.QUIET,
    json_output: bool = opts.JSON_OUTPUT,
    config_file: str | None = opts.CONFIG_FILE,
    print_args: bool = opts.PRINT_ARGS,
    transcription_log: Path | None = opts.TRANSCRIPTION_LOG,
) -> None:
    """Wyoming ASR Client for streaming microphone audio to a transcription server."""
    # NOTE: the docstring above is deliberately left as a single user-facing
    # sentence; Typer surfaces a command's docstring as its --help text.
    #
    # Flow of this command:
    #   1. Resolve the effective quiet/clipboard flags (JSON output forces
    #      quiet and disables the clipboard) and configure logging.
    #   2. Pick an audio source: a saved recording (--last-recording), an
    #      explicit file (--from-file), or the live microphone.
    #   3. Build the provider/ASR/LLM config objects once.
    #   4. Run `_async_main` either in recovery mode (from a file) or in normal
    #      recording mode (guarded by a PID file), optionally dumping the
    #      result as JSON.

    # Must stay the very first statement: locals() is captured before any
    # other local name is bound, so it contains only the CLI parameters.
    if print_args:
        print_command_line_args(locals())

    # JSON output implies quiet mode and no clipboard - set this early before any output
    effective_quiet = quiet or json_output
    if json_output:
        enable_json_mode()

    setup_logging(log_level, log_file, quiet=effective_quiet)

    # Expand user path for transcription log
    if transcription_log:
        transcription_log = transcription_log.expanduser()

    # Handle recovery options: the two file-based sources are mutually exclusive.
    if last_recording and from_file:
        print_with_style("❌ Cannot use both --last-recording and --from-file", style="red")
        return

    # Determine audio source (None means "record from the microphone").
    audio_file_path = None
    if last_recording > 0:  # 0 means disabled
        audio_file_path = get_last_recording(last_recording)
        if not audio_file_path:
            if last_recording == 1:
                print_with_style("❌ No saved recordings found", style="red")
            else:
                print_with_style(
                    f"❌ Recording #{last_recording} not found (not enough recordings)",
                    style="red",
                )
            return
        # Use the *effective* quiet flag so that --json, which implies quiet,
        # also silences this informational notice.
        if not effective_quiet:
            ordinal = "most recent" if last_recording == 1 else f"#{last_recording}"
            print_with_style(
                f"📁 Using {ordinal} recording: {audio_file_path.name}",
                style="blue",
            )
    elif from_file:
        audio_file_path = from_file.expanduser()
        if not audio_file_path.exists():
            print_with_style(f"❌ File not found: {audio_file_path}", style="red")
            return

    # Create all config objects once
    effective_clipboard = clipboard and not json_output
    general_cfg = config.General(
        log_level=log_level,
        log_file=log_file,
        quiet=effective_quiet,
        list_devices=list_devices,
        clipboard=effective_clipboard,
    )
    provider_cfg = config.ProviderSelection(
        asr_provider=asr_provider,
        llm_provider=llm_provider,
        tts_provider="wyoming",  # Not used in transcribe
    )
    wyoming_asr_cfg = config.WyomingASR(
        asr_wyoming_ip=asr_wyoming_ip,
        asr_wyoming_port=asr_wyoming_port,
    )
    openai_asr_cfg = config.OpenAIASR(
        asr_openai_model=asr_openai_model,
        openai_api_key=openai_api_key,
        # An ASR-specific base URL takes precedence over the shared OpenAI one.
        openai_base_url=asr_openai_base_url or openai_base_url,
        asr_openai_prompt=asr_openai_prompt,
    )
    gemini_asr_cfg = config.GeminiASR(
        asr_gemini_model=asr_gemini_model,
        gemini_api_key=gemini_api_key,
    )
    ollama_cfg = config.Ollama(
        llm_ollama_model=llm_ollama_model,
        llm_ollama_host=llm_ollama_host,
    )
    openai_llm_cfg = config.OpenAILLM(
        llm_openai_model=llm_openai_model,
        openai_api_key=openai_api_key,
        openai_base_url=openai_base_url,
    )
    gemini_llm_cfg = config.GeminiLLM(
        llm_gemini_model=llm_gemini_model,
        gemini_api_key=gemini_api_key,
    )

    # Handle recovery mode (transcribing from file)
    if audio_file_path:
        # We're transcribing from a saved file
        result = asyncio.run(
            _async_main(
                audio_file_path=audio_file_path,
                extra_instructions=extra_instructions,
                provider_cfg=provider_cfg,
                general_cfg=general_cfg,
                wyoming_asr_cfg=wyoming_asr_cfg,
                openai_asr_cfg=openai_asr_cfg,
                gemini_asr_cfg=gemini_asr_cfg,
                ollama_cfg=ollama_cfg,
                openai_llm_cfg=openai_llm_cfg,
                gemini_llm_cfg=gemini_llm_cfg,
                llm_enabled=llm,
                transcription_log=transcription_log,
            ),
        )
        if json_output:
            print(json.dumps(result))
        return

    # Normal recording mode
    process_name = "transcribe"
    # A truthy return means a --stop/--status/--toggle request was handled,
    # so there is nothing left to record.
    if stop_or_status_or_toggle(
        process_name,
        "transcribe",
        stop,
        status,
        toggle,
        quiet=general_cfg.quiet,
    ):
        return

    audio_in_cfg = config.AudioInput(
        input_device_index=input_device_index,
        input_device_name=input_device_name,
    )

    # We only use setup_devices for its input device handling
    device_info = setup_devices(general_cfg, audio_in_cfg, None)
    if device_info is None:
        return
    input_device_index, _, _ = device_info
    audio_in_cfg.input_device_index = input_device_index

    # Use context manager for PID file management
    with process.pid_file_context(process_name), suppress(KeyboardInterrupt):
        result = asyncio.run(
            _async_main(
                extra_instructions=extra_instructions,
                provider_cfg=provider_cfg,
                general_cfg=general_cfg,
                audio_in_cfg=audio_in_cfg,
                wyoming_asr_cfg=wyoming_asr_cfg,
                openai_asr_cfg=openai_asr_cfg,
                gemini_asr_cfg=gemini_asr_cfg,
                ollama_cfg=ollama_cfg,
                openai_llm_cfg=openai_llm_cfg,
                gemini_llm_cfg=gemini_llm_cfg,
                llm_enabled=llm,
                transcription_log=transcription_log,
                save_recording=save_recording,
                process_name=process_name,
            ),
        )
        # Kept inside the `with` block: if a KeyboardInterrupt is suppressed
        # above, `result` is never bound and this print is skipped with it.
        if json_output:
            print(json.dumps(result))
|