agent-cli 0.70.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_cli/__init__.py +5 -0
- agent_cli/__main__.py +6 -0
- agent_cli/_extras.json +14 -0
- agent_cli/_requirements/.gitkeep +0 -0
- agent_cli/_requirements/audio.txt +79 -0
- agent_cli/_requirements/faster-whisper.txt +215 -0
- agent_cli/_requirements/kokoro.txt +425 -0
- agent_cli/_requirements/llm.txt +183 -0
- agent_cli/_requirements/memory.txt +355 -0
- agent_cli/_requirements/mlx-whisper.txt +222 -0
- agent_cli/_requirements/piper.txt +176 -0
- agent_cli/_requirements/rag.txt +402 -0
- agent_cli/_requirements/server.txt +154 -0
- agent_cli/_requirements/speed.txt +77 -0
- agent_cli/_requirements/vad.txt +155 -0
- agent_cli/_requirements/wyoming.txt +71 -0
- agent_cli/_tools.py +368 -0
- agent_cli/agents/__init__.py +23 -0
- agent_cli/agents/_voice_agent_common.py +136 -0
- agent_cli/agents/assistant.py +383 -0
- agent_cli/agents/autocorrect.py +284 -0
- agent_cli/agents/chat.py +496 -0
- agent_cli/agents/memory/__init__.py +31 -0
- agent_cli/agents/memory/add.py +190 -0
- agent_cli/agents/memory/proxy.py +160 -0
- agent_cli/agents/rag_proxy.py +128 -0
- agent_cli/agents/speak.py +209 -0
- agent_cli/agents/transcribe.py +671 -0
- agent_cli/agents/transcribe_daemon.py +499 -0
- agent_cli/agents/voice_edit.py +291 -0
- agent_cli/api.py +22 -0
- agent_cli/cli.py +106 -0
- agent_cli/config.py +503 -0
- agent_cli/config_cmd.py +307 -0
- agent_cli/constants.py +27 -0
- agent_cli/core/__init__.py +1 -0
- agent_cli/core/audio.py +461 -0
- agent_cli/core/audio_format.py +299 -0
- agent_cli/core/chroma.py +88 -0
- agent_cli/core/deps.py +191 -0
- agent_cli/core/openai_proxy.py +139 -0
- agent_cli/core/process.py +195 -0
- agent_cli/core/reranker.py +120 -0
- agent_cli/core/sse.py +87 -0
- agent_cli/core/transcription_logger.py +70 -0
- agent_cli/core/utils.py +526 -0
- agent_cli/core/vad.py +175 -0
- agent_cli/core/watch.py +65 -0
- agent_cli/dev/__init__.py +14 -0
- agent_cli/dev/cli.py +1588 -0
- agent_cli/dev/coding_agents/__init__.py +19 -0
- agent_cli/dev/coding_agents/aider.py +24 -0
- agent_cli/dev/coding_agents/base.py +167 -0
- agent_cli/dev/coding_agents/claude.py +39 -0
- agent_cli/dev/coding_agents/codex.py +24 -0
- agent_cli/dev/coding_agents/continue_dev.py +15 -0
- agent_cli/dev/coding_agents/copilot.py +24 -0
- agent_cli/dev/coding_agents/cursor_agent.py +48 -0
- agent_cli/dev/coding_agents/gemini.py +28 -0
- agent_cli/dev/coding_agents/opencode.py +15 -0
- agent_cli/dev/coding_agents/registry.py +49 -0
- agent_cli/dev/editors/__init__.py +19 -0
- agent_cli/dev/editors/base.py +89 -0
- agent_cli/dev/editors/cursor.py +15 -0
- agent_cli/dev/editors/emacs.py +46 -0
- agent_cli/dev/editors/jetbrains.py +56 -0
- agent_cli/dev/editors/nano.py +31 -0
- agent_cli/dev/editors/neovim.py +33 -0
- agent_cli/dev/editors/registry.py +59 -0
- agent_cli/dev/editors/sublime.py +20 -0
- agent_cli/dev/editors/vim.py +42 -0
- agent_cli/dev/editors/vscode.py +15 -0
- agent_cli/dev/editors/zed.py +20 -0
- agent_cli/dev/project.py +568 -0
- agent_cli/dev/registry.py +52 -0
- agent_cli/dev/skill/SKILL.md +141 -0
- agent_cli/dev/skill/examples.md +571 -0
- agent_cli/dev/terminals/__init__.py +19 -0
- agent_cli/dev/terminals/apple_terminal.py +82 -0
- agent_cli/dev/terminals/base.py +56 -0
- agent_cli/dev/terminals/gnome.py +51 -0
- agent_cli/dev/terminals/iterm2.py +84 -0
- agent_cli/dev/terminals/kitty.py +77 -0
- agent_cli/dev/terminals/registry.py +48 -0
- agent_cli/dev/terminals/tmux.py +58 -0
- agent_cli/dev/terminals/warp.py +132 -0
- agent_cli/dev/terminals/zellij.py +78 -0
- agent_cli/dev/worktree.py +856 -0
- agent_cli/docs_gen.py +417 -0
- agent_cli/example-config.toml +185 -0
- agent_cli/install/__init__.py +5 -0
- agent_cli/install/common.py +89 -0
- agent_cli/install/extras.py +174 -0
- agent_cli/install/hotkeys.py +48 -0
- agent_cli/install/services.py +87 -0
- agent_cli/memory/__init__.py +7 -0
- agent_cli/memory/_files.py +250 -0
- agent_cli/memory/_filters.py +63 -0
- agent_cli/memory/_git.py +157 -0
- agent_cli/memory/_indexer.py +142 -0
- agent_cli/memory/_ingest.py +408 -0
- agent_cli/memory/_persistence.py +182 -0
- agent_cli/memory/_prompt.py +91 -0
- agent_cli/memory/_retrieval.py +294 -0
- agent_cli/memory/_store.py +169 -0
- agent_cli/memory/_streaming.py +44 -0
- agent_cli/memory/_tasks.py +48 -0
- agent_cli/memory/api.py +113 -0
- agent_cli/memory/client.py +272 -0
- agent_cli/memory/engine.py +361 -0
- agent_cli/memory/entities.py +43 -0
- agent_cli/memory/models.py +112 -0
- agent_cli/opts.py +433 -0
- agent_cli/py.typed +0 -0
- agent_cli/rag/__init__.py +3 -0
- agent_cli/rag/_indexer.py +67 -0
- agent_cli/rag/_indexing.py +226 -0
- agent_cli/rag/_prompt.py +30 -0
- agent_cli/rag/_retriever.py +156 -0
- agent_cli/rag/_store.py +48 -0
- agent_cli/rag/_utils.py +218 -0
- agent_cli/rag/api.py +175 -0
- agent_cli/rag/client.py +299 -0
- agent_cli/rag/engine.py +302 -0
- agent_cli/rag/models.py +55 -0
- agent_cli/scripts/.runtime/.gitkeep +0 -0
- agent_cli/scripts/__init__.py +1 -0
- agent_cli/scripts/check_plugin_skill_sync.py +50 -0
- agent_cli/scripts/linux-hotkeys/README.md +63 -0
- agent_cli/scripts/linux-hotkeys/toggle-autocorrect.sh +45 -0
- agent_cli/scripts/linux-hotkeys/toggle-transcription.sh +58 -0
- agent_cli/scripts/linux-hotkeys/toggle-voice-edit.sh +58 -0
- agent_cli/scripts/macos-hotkeys/README.md +45 -0
- agent_cli/scripts/macos-hotkeys/skhd-config-example +5 -0
- agent_cli/scripts/macos-hotkeys/toggle-autocorrect.sh +12 -0
- agent_cli/scripts/macos-hotkeys/toggle-transcription.sh +37 -0
- agent_cli/scripts/macos-hotkeys/toggle-voice-edit.sh +37 -0
- agent_cli/scripts/nvidia-asr-server/README.md +99 -0
- agent_cli/scripts/nvidia-asr-server/pyproject.toml +27 -0
- agent_cli/scripts/nvidia-asr-server/server.py +255 -0
- agent_cli/scripts/nvidia-asr-server/shell.nix +32 -0
- agent_cli/scripts/nvidia-asr-server/uv.lock +4654 -0
- agent_cli/scripts/run-openwakeword.sh +11 -0
- agent_cli/scripts/run-piper-windows.ps1 +30 -0
- agent_cli/scripts/run-piper.sh +24 -0
- agent_cli/scripts/run-whisper-linux.sh +40 -0
- agent_cli/scripts/run-whisper-macos.sh +6 -0
- agent_cli/scripts/run-whisper-windows.ps1 +51 -0
- agent_cli/scripts/run-whisper.sh +9 -0
- agent_cli/scripts/run_faster_whisper_server.py +136 -0
- agent_cli/scripts/setup-linux-hotkeys.sh +72 -0
- agent_cli/scripts/setup-linux.sh +108 -0
- agent_cli/scripts/setup-macos-hotkeys.sh +61 -0
- agent_cli/scripts/setup-macos.sh +76 -0
- agent_cli/scripts/setup-windows.ps1 +63 -0
- agent_cli/scripts/start-all-services-windows.ps1 +53 -0
- agent_cli/scripts/start-all-services.sh +178 -0
- agent_cli/scripts/sync_extras.py +138 -0
- agent_cli/server/__init__.py +3 -0
- agent_cli/server/cli.py +721 -0
- agent_cli/server/common.py +222 -0
- agent_cli/server/model_manager.py +288 -0
- agent_cli/server/model_registry.py +225 -0
- agent_cli/server/proxy/__init__.py +3 -0
- agent_cli/server/proxy/api.py +444 -0
- agent_cli/server/streaming.py +67 -0
- agent_cli/server/tts/__init__.py +3 -0
- agent_cli/server/tts/api.py +335 -0
- agent_cli/server/tts/backends/__init__.py +82 -0
- agent_cli/server/tts/backends/base.py +139 -0
- agent_cli/server/tts/backends/kokoro.py +403 -0
- agent_cli/server/tts/backends/piper.py +253 -0
- agent_cli/server/tts/model_manager.py +201 -0
- agent_cli/server/tts/model_registry.py +28 -0
- agent_cli/server/tts/wyoming_handler.py +249 -0
- agent_cli/server/whisper/__init__.py +3 -0
- agent_cli/server/whisper/api.py +413 -0
- agent_cli/server/whisper/backends/__init__.py +89 -0
- agent_cli/server/whisper/backends/base.py +97 -0
- agent_cli/server/whisper/backends/faster_whisper.py +225 -0
- agent_cli/server/whisper/backends/mlx.py +270 -0
- agent_cli/server/whisper/languages.py +116 -0
- agent_cli/server/whisper/model_manager.py +157 -0
- agent_cli/server/whisper/model_registry.py +28 -0
- agent_cli/server/whisper/wyoming_handler.py +203 -0
- agent_cli/services/__init__.py +343 -0
- agent_cli/services/_wyoming_utils.py +64 -0
- agent_cli/services/asr.py +506 -0
- agent_cli/services/llm.py +228 -0
- agent_cli/services/tts.py +450 -0
- agent_cli/services/wake_word.py +142 -0
- agent_cli-0.70.5.dist-info/METADATA +2118 -0
- agent_cli-0.70.5.dist-info/RECORD +196 -0
- agent_cli-0.70.5.dist-info/WHEEL +4 -0
- agent_cli-0.70.5.dist-info/entry_points.txt +4 -0
- agent_cli-0.70.5.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,499 @@
|
|
|
1
|
+
"""Continuous transcription daemon with voice activity detection."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
import platform
|
|
9
|
+
import signal
|
|
10
|
+
from contextlib import suppress
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from datetime import UTC, datetime
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import TYPE_CHECKING
|
|
15
|
+
|
|
16
|
+
import typer
|
|
17
|
+
|
|
18
|
+
from agent_cli import config, constants, opts
|
|
19
|
+
from agent_cli.agents.transcribe import (
|
|
20
|
+
AGENT_INSTRUCTIONS,
|
|
21
|
+
INSTRUCTION,
|
|
22
|
+
SYSTEM_PROMPT,
|
|
23
|
+
)
|
|
24
|
+
from agent_cli.cli import app
|
|
25
|
+
from agent_cli.core import process
|
|
26
|
+
from agent_cli.core.audio import open_audio_stream, setup_devices, setup_input_stream
|
|
27
|
+
from agent_cli.core.audio_format import check_ffmpeg_available, save_audio_as_mp3
|
|
28
|
+
from agent_cli.core.deps import requires_extras
|
|
29
|
+
from agent_cli.core.utils import (
|
|
30
|
+
console,
|
|
31
|
+
print_command_line_args,
|
|
32
|
+
print_with_style,
|
|
33
|
+
setup_logging,
|
|
34
|
+
)
|
|
35
|
+
from agent_cli.services.asr import create_recorded_audio_transcriber
|
|
36
|
+
from agent_cli.services.llm import process_and_update_clipboard
|
|
37
|
+
|
|
38
|
+
if TYPE_CHECKING:
|
|
39
|
+
from agent_cli.core.vad import VoiceActivityDetector
|
|
40
|
+
|
|
41
|
+
# Root logger (``getLogger()`` called without a name); used by every function
# in this module.
LOGGER = logging.getLogger()

# Fallback locations used by ``transcribe_daemon`` when ``--audio-dir`` /
# ``--transcription-log`` are not given.
_DEFAULT_AUDIO_DIR = Path.home() / ".config" / "agent-cli" / "audio"
_DEFAULT_LOG_FILE = Path.home() / ".config" / "agent-cli" / "transcriptions.jsonl"
# Segments shorter than this many seconds are dropped by ``_process_segment``
# before any saving or transcription happens.
_MIN_SEGMENT_DURATION_SECONDS = 0.3
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class DaemonConfig:
    """Bundle of all daemon configuration.

    Built once by ``transcribe_daemon`` and then read by ``_daemon_loop`` and
    ``_process_segment``.
    """

    # Label written to every log entry and printed in front of each transcript.
    role: str
    # Voice activity detector that turns audio chunks into speech segments.
    vad: VoiceActivityDetector
    # Input device handed to ``setup_input_stream``.
    input_device_index: int | None
    # Which ASR / LLM provider is active.
    provider: config.ProviderSelection
    # Per-provider ASR settings; the one matching ``provider.asr_provider`` is used.
    wyoming_asr: config.WyomingASR
    openai_asr: config.OpenAIASR
    gemini_asr: config.GeminiASR
    # Per-provider LLM settings, forwarded to the LLM cleanup step.
    ollama: config.Ollama
    openai_llm: config.OpenAILLM
    gemini_llm: config.GeminiLLM
    # Whether each transcript is additionally cleaned up by the LLM.
    llm_enabled: bool
    # Whether each segment is saved as an MP3 file below ``audio_dir``.
    save_audio: bool
    audio_dir: Path
    # JSON Lines file that transcription entries are appended to.
    log_file: Path
    # Suppress console status output.
    quiet: bool
    # Copy each transcription to the clipboard.
    clipboard: bool
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _generate_audio_path(audio_dir: Path, timestamp: datetime) -> Path:
|
|
71
|
+
"""Generate a path for an audio file based on timestamp."""
|
|
72
|
+
date_dir = audio_dir / timestamp.strftime("%Y/%m/%d")
|
|
73
|
+
date_dir.mkdir(parents=True, exist_ok=True)
|
|
74
|
+
filename = timestamp.strftime("%H%M%S") + f"_{timestamp.microsecond // 1000:03d}.mp3"
|
|
75
|
+
return date_dir / filename
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _log_segment(
|
|
79
|
+
log_file: Path,
|
|
80
|
+
*,
|
|
81
|
+
timestamp: datetime,
|
|
82
|
+
role: str,
|
|
83
|
+
raw_output: str,
|
|
84
|
+
processed_output: str | None,
|
|
85
|
+
audio_file: Path | None,
|
|
86
|
+
duration_seconds: float,
|
|
87
|
+
model_info: str | None = None,
|
|
88
|
+
) -> None:
|
|
89
|
+
"""Append a transcription segment to the log file."""
|
|
90
|
+
entry = {
|
|
91
|
+
"timestamp": timestamp.isoformat(),
|
|
92
|
+
"hostname": platform.node(),
|
|
93
|
+
"role": role,
|
|
94
|
+
"model": model_info,
|
|
95
|
+
"raw_output": raw_output,
|
|
96
|
+
"processed_output": processed_output,
|
|
97
|
+
"audio_file": str(audio_file) if audio_file else None,
|
|
98
|
+
"duration_seconds": round(duration_seconds, 2),
|
|
99
|
+
}
|
|
100
|
+
log_file.parent.mkdir(parents=True, exist_ok=True)
|
|
101
|
+
with log_file.open("a", encoding="utf-8") as f:
|
|
102
|
+
f.write(json.dumps(entry, ensure_ascii=False) + "\n")
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
async def _process_segment(  # noqa: PLR0912
    cfg: DaemonConfig,
    segment: bytes,
    timestamp: datetime,
) -> None:
    """Process a speech segment: transcribe, optionally LLM-clean, and log.

    Steps, in order: skip segments shorter than
    ``_MIN_SEGMENT_DURATION_SECONDS``; optionally save the audio as MP3;
    transcribe with the configured ASR provider; print the transcript;
    optionally run the LLM cleanup; optionally copy the result to the
    clipboard; append an entry to the transcription log.

    Args:
        cfg: Daemon configuration.
        segment: Audio bytes of one speech segment produced by ``cfg.vad``.
        timestamp: Time associated with the segment; used for the audio file
            name, the console output and the log entry.

    Raises:
        NotImplementedError: If ``cfg.provider.asr_provider`` is not
            ``"openai"``, ``"gemini"`` or ``"wyoming"``.
    """
    duration = cfg.vad.get_segment_duration_seconds(segment)
    if duration < _MIN_SEGMENT_DURATION_SECONDS:
        LOGGER.debug("Skipping very short segment: %.2fs", duration)
        return

    # Save audio as MP3 if requested (run in thread to avoid blocking event loop)
    audio_path: Path | None = None
    if cfg.save_audio:
        try:
            audio_path = _generate_audio_path(cfg.audio_dir, timestamp)
            await asyncio.to_thread(save_audio_as_mp3, segment, audio_path)
            LOGGER.debug("Saved audio to %s", audio_path)
        except (RuntimeError, OSError):
            # Saving is best-effort: a failed encode or an unwritable audio
            # directory must not prevent the segment from being transcribed.
            LOGGER.exception("Failed to save audio as MP3")
            # Do not record a path in the log for a file that was not written.
            audio_path = None

    # Transcribe
    transcriber = create_recorded_audio_transcriber(cfg.provider)
    if cfg.provider.asr_provider == "openai":
        transcript = await transcriber(segment, cfg.openai_asr, LOGGER, quiet=cfg.quiet)
    elif cfg.provider.asr_provider == "gemini":
        transcript = await transcriber(segment, cfg.gemini_asr, LOGGER, quiet=cfg.quiet)
    elif cfg.provider.asr_provider == "wyoming":
        transcript = await transcriber(
            audio_data=segment,
            wyoming_asr_cfg=cfg.wyoming_asr,
            logger=LOGGER,
            quiet=cfg.quiet,
        )
    else:
        msg = f"Unsupported ASR provider: {cfg.provider.asr_provider}"
        raise NotImplementedError(msg)

    if not transcript or not transcript.strip():
        LOGGER.debug("Empty transcript, skipping")
        if not cfg.quiet:
            # Trailing spaces overwrite the remainder of the previous status line.
            console.print("[green]👂 Listening...[/green]" + " " * 20, end="\r")
        return

    if not cfg.quiet:
        # Blank out the in-place status line before printing the transcript.
        console.print(" " * 50, end="\r")
        console.print(
            f"[dim]{timestamp.strftime('%H:%M:%S')}[/dim] [cyan]{cfg.role}[/cyan]: {transcript}",
        )
        console.file.flush()

    # LLM cleanup if enabled
    processed: str | None = None
    model_info: str | None = None

    if cfg.llm_enabled:
        models = {
            "ollama": cfg.ollama.llm_ollama_model,
            "openai": cfg.openai_llm.llm_openai_model,
            "gemini": cfg.gemini_llm.llm_gemini_model,
        }
        model_info = f"{cfg.provider.llm_provider}:{models.get(cfg.provider.llm_provider, '')}"

        # clipboard=False here: copying is handled below via ``cfg.clipboard``.
        processed = await process_and_update_clipboard(
            system_prompt=SYSTEM_PROMPT,
            agent_instructions=AGENT_INSTRUCTIONS,
            provider_cfg=cfg.provider,
            ollama_cfg=cfg.ollama,
            openai_cfg=cfg.openai_llm,
            gemini_cfg=cfg.gemini_llm,
            logger=LOGGER,
            original_text=transcript,
            instruction=INSTRUCTION,
            clipboard=False,
            quiet=True,
            live=None,
            context=None,
        )

        if not cfg.quiet and processed and processed != transcript:
            console.print(f" [dim]→[/dim] [green]{processed}[/green]")

    # Copy to clipboard if enabled
    if cfg.clipboard:
        import pyperclip  # noqa: PLC0415

        # Prefer the LLM-cleaned text; fall back to the raw transcript.
        text_to_copy = processed if processed else transcript
        pyperclip.copy(text_to_copy)

    # Log
    asr_model: str = cfg.provider.asr_provider
    if cfg.provider.asr_provider == "openai":
        asr_model += f":{cfg.openai_asr.asr_openai_model}"

    _log_segment(
        cfg.log_file,
        timestamp=timestamp,
        role=cfg.role,
        raw_output=transcript,
        processed_output=processed,
        audio_file=audio_path,
        duration_seconds=duration,
        # The LLM model description wins when LLM cleanup ran; otherwise the
        # ASR provider/model is recorded.
        model_info=model_info or asr_model,
    )

    if not cfg.quiet:
        console.print("[green]👂 Listening...[/green]" + " " * 20, end="\r")
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
async def _daemon_loop(cfg: DaemonConfig) -> None:  # noqa: PLR0912, PLR0915
    """Main daemon loop: continuously capture audio and process speech segments.

    Reads fixed-size chunks from the input stream in a worker thread, feeds
    them to ``cfg.vad`` and starts a background ``_process_segment`` task for
    every completed segment. The loop ends when SIGINT/SIGTERM sets the
    shutdown event, or when the coroutine is cancelled or interrupted. On
    exit the stream is aborted and unfinished segment tasks are cancelled.

    Args:
        cfg: Daemon configuration.
    """
    stream_config = setup_input_stream(cfg.input_device_index)
    # Strong references keep in-flight segment tasks from being garbage collected.
    background_tasks: set[asyncio.Task[None]] = set()

    if not cfg.quiet:
        print_with_style("🎙️ Transcribe daemon started. Listening...", style="green")
        print_with_style(f" Role: {cfg.role}", style="dim")
        print_with_style(f" Log file: {cfg.log_file}", style="dim")
        if cfg.save_audio:
            print_with_style(f" Audio dir: {cfg.audio_dir}", style="dim")
        print_with_style(" Press Ctrl+C to stop.", style="dim")
        console.print()

    was_speaking = False
    shutdown_event = asyncio.Event()

    loop = asyncio.get_running_loop()
    for sig in (signal.SIGINT, signal.SIGTERM):
        # add_signal_handler is only implemented on Unix event loops. Where it
        # is unavailable (e.g. Windows), Ctrl+C still stops the daemon through
        # the KeyboardInterrupt / CancelledError handling below.
        with suppress(NotImplementedError):
            loop.add_signal_handler(sig, shutdown_event.set)

    with open_audio_stream(stream_config) as stream:
        try:
            while not shutdown_event.is_set():
                try:
                    # The blocking read runs in a thread so the event loop can
                    # keep driving the background segment tasks.
                    data, _ = await asyncio.to_thread(stream.read, constants.AUDIO_CHUNK_SIZE)
                    chunk = data.tobytes()
                except asyncio.CancelledError:
                    break
                except Exception:
                    # Keep the daemon alive on read errors; back off briefly
                    # before retrying.
                    LOGGER.exception("Error reading audio stream")
                    await asyncio.sleep(0.1)
                    continue

                is_speaking, segment = cfg.vad.process_chunk(chunk)

                if not cfg.quiet:
                    if is_speaking and not was_speaking:
                        console.print("[red]🔴 Recording...[/red]", end="\r")
                    elif not is_speaking and was_speaking and segment is None:
                        console.print("[yellow]⏸️ Pause detected...[/yellow]", end="\r")

                was_speaking = is_speaking

                if segment:
                    timestamp = datetime.now(UTC).astimezone()
                    duration = cfg.vad.get_segment_duration_seconds(segment)

                    if not cfg.quiet:
                        console.print(
                            f"[blue]⏳ Processing {duration:.1f}s segment...[/blue]",
                            end="\r",
                        )

                    LOGGER.debug("Speech segment detected, %.2f seconds", duration)

                    # Process in the background so capture continues while the
                    # segment is being transcribed.
                    task = asyncio.create_task(_process_segment(cfg, segment, timestamp))
                    background_tasks.add(task)
                    task.add_done_callback(background_tasks.discard)

        except (KeyboardInterrupt, asyncio.CancelledError):
            LOGGER.debug("Shutdown signal received")
        finally:
            for sig in (signal.SIGINT, signal.SIGTERM):
                # NotImplementedError mirrors the registration above on
                # platforms without loop signal handlers.
                with suppress(ValueError, NotImplementedError):
                    loop.remove_signal_handler(sig)
            with suppress(Exception):
                stream.abort()
            for task in background_tasks:
                if not task.done():
                    task.cancel()
            if background_tasks:
                # Give cancelled tasks a short grace period to unwind.
                with suppress(asyncio.TimeoutError):
                    await asyncio.wait(background_tasks, timeout=2.0)
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
@app.command("transcribe-daemon", rich_help_panel="Voice Commands")
@requires_extras("audio", "vad", "llm")
def transcribe_daemon(  # noqa: PLR0912
    *,
    # Daemon-specific options
    role: str = typer.Option(
        "user",
        "--role",
        "-r",
        help="Role name for logging (e.g., 'meeting', 'notes', 'user').",
    ),
    silence_threshold: float = typer.Option(
        1.0,
        "--silence-threshold",
        "-s",
        help="Seconds of silence to end a speech segment.",
    ),
    min_segment: float = typer.Option(
        0.25,
        "--min-segment",
        "-m",
        help="Minimum speech duration in seconds to trigger a segment.",
    ),
    vad_threshold: float = typer.Option(
        0.3,
        "--vad-threshold",
        help="VAD speech detection threshold (0.0-1.0). Higher = more aggressive filtering.",
    ),
    save_audio: bool = typer.Option(
        True,  # noqa: FBT003
        "--save-audio/--no-save-audio",
        help="Save audio segments as MP3 files.",
    ),
    audio_dir: Path | None = typer.Option(  # noqa: B008
        None,
        "--audio-dir",
        help="Directory for MP3 files. Default: ~/.config/agent-cli/audio",
    ),
    transcription_log: Path | None = typer.Option(  # noqa: B008
        None,
        "--transcription-log",
        "-t",
        help="JSON Lines log file path. Default: ~/.config/agent-cli/transcriptions.jsonl",
    ),
    clipboard: bool = typer.Option(
        False,  # noqa: FBT003
        "--clipboard/--no-clipboard",
        help="Copy each transcription to clipboard.",
    ),
    # --- Provider Selection ---
    asr_provider: str = opts.ASR_PROVIDER,
    llm_provider: str = opts.LLM_PROVIDER,
    # --- ASR (Audio) Configuration ---
    input_device_index: int | None = opts.INPUT_DEVICE_INDEX,
    input_device_name: str | None = opts.INPUT_DEVICE_NAME,
    asr_wyoming_ip: str = opts.ASR_WYOMING_IP,
    asr_wyoming_port: int = opts.ASR_WYOMING_PORT,
    asr_openai_model: str = opts.ASR_OPENAI_MODEL,
    asr_openai_base_url: str | None = opts.ASR_OPENAI_BASE_URL,
    asr_openai_prompt: str | None = opts.ASR_OPENAI_PROMPT,
    asr_gemini_model: str = opts.ASR_GEMINI_MODEL,
    # --- LLM Configuration ---
    llm_ollama_model: str = opts.LLM_OLLAMA_MODEL,
    llm_ollama_host: str = opts.LLM_OLLAMA_HOST,
    llm_openai_model: str = opts.LLM_OPENAI_MODEL,
    openai_api_key: str | None = opts.OPENAI_API_KEY,
    openai_base_url: str | None = opts.OPENAI_BASE_URL,
    llm_gemini_model: str = opts.LLM_GEMINI_MODEL,
    gemini_api_key: str | None = opts.GEMINI_API_KEY,
    llm: bool = opts.LLM,
    # --- Process Management ---
    stop: bool = opts.STOP,
    status: bool = opts.STATUS,
    # --- General Options ---
    log_level: opts.LogLevel = opts.LOG_LEVEL,
    log_file_logging: str | None = opts.LOG_FILE,
    list_devices: bool = opts.LIST_DEVICES,
    quiet: bool = opts.QUIET,
    config_file: str | None = opts.CONFIG_FILE,
    print_args: bool = opts.PRINT_ARGS,
) -> None:
    """Run a continuous transcription daemon with voice activity detection.

    This command runs indefinitely, capturing audio from your microphone,
    detecting speech segments using Silero VAD, transcribing them, and
    logging results with timestamps.

    Examples:
        # Basic daemon
        agent-cli transcribe-daemon

        # With role and custom silence threshold
        agent-cli transcribe-daemon --role meeting --silence-threshold 1.5

        # With LLM cleanup
        agent-cli transcribe-daemon --llm --role notes

        # Custom log file and audio directory
        agent-cli transcribe-daemon --transcription-log ~/meeting.jsonl --audio-dir ~/audio

    """
    # NOTE: keep this as the first statement. ``locals()`` is snapshotted here,
    # so at this point it holds exactly the command parameters.
    if print_args:
        print_command_line_args(locals())
    setup_logging(log_level, log_file_logging, quiet=quiet)

    # Name under which the PID file is managed; also used by --stop / --status.
    process_name = "transcribe-daemon"

    # Handle stop/status commands
    if stop:
        if process.kill_process(process_name):
            if not quiet:
                print_with_style(f"✅ Stopped {process_name}", style="green")
        elif not quiet:
            print_with_style(f"⚠️ {process_name} is not running", style="yellow")
        return

    if status:
        if process.is_process_running(process_name):
            if not quiet:
                print_with_style(f"✅ {process_name} is running", style="green")
        elif not quiet:
            print_with_style(f"⚠️ {process_name} is not running", style="yellow")
        return

    # Validate VAD threshold
    if vad_threshold < 0.0 or vad_threshold > 1.0:
        print_with_style("❌ VAD threshold must be 0.0-1.0", style="red")
        raise typer.Exit(1)

    # Check FFmpeg availability if saving audio
    if save_audio and not check_ffmpeg_available():
        # Degrade gracefully: keep transcribing, just without MP3 files.
        print_with_style(
            "⚠️ FFmpeg not found. Audio saving disabled. Install FFmpeg for MP3 support.",
            style="yellow",
        )
        save_audio = False

    # Setup audio device
    general_cfg = config.General(
        log_level=log_level,
        log_file=log_file_logging,
        quiet=quiet,
        list_devices=list_devices,
        clipboard=False,
    )
    audio_in_cfg = config.AudioInput(
        input_device_index=input_device_index,
        input_device_name=input_device_name,
    )
    device_info = setup_devices(general_cfg, audio_in_cfg, None)
    # ``None`` means there is nothing to run — presumably the --list-devices
    # path; confirm against ``setup_devices``.
    if device_info is None:
        return
    resolved_input_device_index, _, _ = device_info

    # Import VAD here to avoid loading torch/numpy at module import time
    from agent_cli.core.vad import VoiceActivityDetector  # noqa: PLC0415

    # Create daemon config
    cfg = DaemonConfig(
        role=role,
        vad=VoiceActivityDetector(
            threshold=vad_threshold,
            # CLI values are in seconds; the detector takes milliseconds.
            silence_threshold_ms=int(silence_threshold * 1000),
            min_speech_duration_ms=int(min_segment * 1000),
        ),
        input_device_index=resolved_input_device_index,
        provider=config.ProviderSelection(
            asr_provider=asr_provider,
            llm_provider=llm_provider,
            # This command exposes no TTS option; a fixed value is passed.
            tts_provider="wyoming",
        ),
        wyoming_asr=config.WyomingASR(
            asr_wyoming_ip=asr_wyoming_ip,
            asr_wyoming_port=asr_wyoming_port,
        ),
        openai_asr=config.OpenAIASR(
            asr_openai_model=asr_openai_model,
            openai_api_key=openai_api_key,
            # The ASR-specific base URL takes precedence over the shared one.
            openai_base_url=asr_openai_base_url or openai_base_url,
            asr_openai_prompt=asr_openai_prompt,
        ),
        gemini_asr=config.GeminiASR(
            asr_gemini_model=asr_gemini_model,
            gemini_api_key=gemini_api_key,
        ),
        ollama=config.Ollama(llm_ollama_model=llm_ollama_model, llm_ollama_host=llm_ollama_host),
        openai_llm=config.OpenAILLM(
            llm_openai_model=llm_openai_model,
            openai_api_key=openai_api_key,
            openai_base_url=openai_base_url,
        ),
        gemini_llm=config.GeminiLLM(
            llm_gemini_model=llm_gemini_model,
            gemini_api_key=gemini_api_key,
        ),
        llm_enabled=llm,
        save_audio=save_audio,
        # Expand ``~`` in user-supplied paths; otherwise use the module defaults.
        audio_dir=audio_dir.expanduser() if audio_dir else _DEFAULT_AUDIO_DIR,
        log_file=transcription_log.expanduser() if transcription_log else _DEFAULT_LOG_FILE,
        quiet=quiet,
        clipboard=clipboard,
    )

    # Run the daemon
    # A KeyboardInterrupt escaping the event loop counts as a normal stop.
    with process.pid_file_context(process_name), suppress(KeyboardInterrupt):
        asyncio.run(_daemon_loop(cfg))

    if not quiet:
        console.print()
        print_with_style("👋 Transcribe daemon stopped.", style="yellow")
|