agent-cli 0.70.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_cli/__init__.py +5 -0
- agent_cli/__main__.py +6 -0
- agent_cli/_extras.json +14 -0
- agent_cli/_requirements/.gitkeep +0 -0
- agent_cli/_requirements/audio.txt +79 -0
- agent_cli/_requirements/faster-whisper.txt +215 -0
- agent_cli/_requirements/kokoro.txt +425 -0
- agent_cli/_requirements/llm.txt +183 -0
- agent_cli/_requirements/memory.txt +355 -0
- agent_cli/_requirements/mlx-whisper.txt +222 -0
- agent_cli/_requirements/piper.txt +176 -0
- agent_cli/_requirements/rag.txt +402 -0
- agent_cli/_requirements/server.txt +154 -0
- agent_cli/_requirements/speed.txt +77 -0
- agent_cli/_requirements/vad.txt +155 -0
- agent_cli/_requirements/wyoming.txt +71 -0
- agent_cli/_tools.py +368 -0
- agent_cli/agents/__init__.py +23 -0
- agent_cli/agents/_voice_agent_common.py +136 -0
- agent_cli/agents/assistant.py +383 -0
- agent_cli/agents/autocorrect.py +284 -0
- agent_cli/agents/chat.py +496 -0
- agent_cli/agents/memory/__init__.py +31 -0
- agent_cli/agents/memory/add.py +190 -0
- agent_cli/agents/memory/proxy.py +160 -0
- agent_cli/agents/rag_proxy.py +128 -0
- agent_cli/agents/speak.py +209 -0
- agent_cli/agents/transcribe.py +671 -0
- agent_cli/agents/transcribe_daemon.py +499 -0
- agent_cli/agents/voice_edit.py +291 -0
- agent_cli/api.py +22 -0
- agent_cli/cli.py +106 -0
- agent_cli/config.py +503 -0
- agent_cli/config_cmd.py +307 -0
- agent_cli/constants.py +27 -0
- agent_cli/core/__init__.py +1 -0
- agent_cli/core/audio.py +461 -0
- agent_cli/core/audio_format.py +299 -0
- agent_cli/core/chroma.py +88 -0
- agent_cli/core/deps.py +191 -0
- agent_cli/core/openai_proxy.py +139 -0
- agent_cli/core/process.py +195 -0
- agent_cli/core/reranker.py +120 -0
- agent_cli/core/sse.py +87 -0
- agent_cli/core/transcription_logger.py +70 -0
- agent_cli/core/utils.py +526 -0
- agent_cli/core/vad.py +175 -0
- agent_cli/core/watch.py +65 -0
- agent_cli/dev/__init__.py +14 -0
- agent_cli/dev/cli.py +1588 -0
- agent_cli/dev/coding_agents/__init__.py +19 -0
- agent_cli/dev/coding_agents/aider.py +24 -0
- agent_cli/dev/coding_agents/base.py +167 -0
- agent_cli/dev/coding_agents/claude.py +39 -0
- agent_cli/dev/coding_agents/codex.py +24 -0
- agent_cli/dev/coding_agents/continue_dev.py +15 -0
- agent_cli/dev/coding_agents/copilot.py +24 -0
- agent_cli/dev/coding_agents/cursor_agent.py +48 -0
- agent_cli/dev/coding_agents/gemini.py +28 -0
- agent_cli/dev/coding_agents/opencode.py +15 -0
- agent_cli/dev/coding_agents/registry.py +49 -0
- agent_cli/dev/editors/__init__.py +19 -0
- agent_cli/dev/editors/base.py +89 -0
- agent_cli/dev/editors/cursor.py +15 -0
- agent_cli/dev/editors/emacs.py +46 -0
- agent_cli/dev/editors/jetbrains.py +56 -0
- agent_cli/dev/editors/nano.py +31 -0
- agent_cli/dev/editors/neovim.py +33 -0
- agent_cli/dev/editors/registry.py +59 -0
- agent_cli/dev/editors/sublime.py +20 -0
- agent_cli/dev/editors/vim.py +42 -0
- agent_cli/dev/editors/vscode.py +15 -0
- agent_cli/dev/editors/zed.py +20 -0
- agent_cli/dev/project.py +568 -0
- agent_cli/dev/registry.py +52 -0
- agent_cli/dev/skill/SKILL.md +141 -0
- agent_cli/dev/skill/examples.md +571 -0
- agent_cli/dev/terminals/__init__.py +19 -0
- agent_cli/dev/terminals/apple_terminal.py +82 -0
- agent_cli/dev/terminals/base.py +56 -0
- agent_cli/dev/terminals/gnome.py +51 -0
- agent_cli/dev/terminals/iterm2.py +84 -0
- agent_cli/dev/terminals/kitty.py +77 -0
- agent_cli/dev/terminals/registry.py +48 -0
- agent_cli/dev/terminals/tmux.py +58 -0
- agent_cli/dev/terminals/warp.py +132 -0
- agent_cli/dev/terminals/zellij.py +78 -0
- agent_cli/dev/worktree.py +856 -0
- agent_cli/docs_gen.py +417 -0
- agent_cli/example-config.toml +185 -0
- agent_cli/install/__init__.py +5 -0
- agent_cli/install/common.py +89 -0
- agent_cli/install/extras.py +174 -0
- agent_cli/install/hotkeys.py +48 -0
- agent_cli/install/services.py +87 -0
- agent_cli/memory/__init__.py +7 -0
- agent_cli/memory/_files.py +250 -0
- agent_cli/memory/_filters.py +63 -0
- agent_cli/memory/_git.py +157 -0
- agent_cli/memory/_indexer.py +142 -0
- agent_cli/memory/_ingest.py +408 -0
- agent_cli/memory/_persistence.py +182 -0
- agent_cli/memory/_prompt.py +91 -0
- agent_cli/memory/_retrieval.py +294 -0
- agent_cli/memory/_store.py +169 -0
- agent_cli/memory/_streaming.py +44 -0
- agent_cli/memory/_tasks.py +48 -0
- agent_cli/memory/api.py +113 -0
- agent_cli/memory/client.py +272 -0
- agent_cli/memory/engine.py +361 -0
- agent_cli/memory/entities.py +43 -0
- agent_cli/memory/models.py +112 -0
- agent_cli/opts.py +433 -0
- agent_cli/py.typed +0 -0
- agent_cli/rag/__init__.py +3 -0
- agent_cli/rag/_indexer.py +67 -0
- agent_cli/rag/_indexing.py +226 -0
- agent_cli/rag/_prompt.py +30 -0
- agent_cli/rag/_retriever.py +156 -0
- agent_cli/rag/_store.py +48 -0
- agent_cli/rag/_utils.py +218 -0
- agent_cli/rag/api.py +175 -0
- agent_cli/rag/client.py +299 -0
- agent_cli/rag/engine.py +302 -0
- agent_cli/rag/models.py +55 -0
- agent_cli/scripts/.runtime/.gitkeep +0 -0
- agent_cli/scripts/__init__.py +1 -0
- agent_cli/scripts/check_plugin_skill_sync.py +50 -0
- agent_cli/scripts/linux-hotkeys/README.md +63 -0
- agent_cli/scripts/linux-hotkeys/toggle-autocorrect.sh +45 -0
- agent_cli/scripts/linux-hotkeys/toggle-transcription.sh +58 -0
- agent_cli/scripts/linux-hotkeys/toggle-voice-edit.sh +58 -0
- agent_cli/scripts/macos-hotkeys/README.md +45 -0
- agent_cli/scripts/macos-hotkeys/skhd-config-example +5 -0
- agent_cli/scripts/macos-hotkeys/toggle-autocorrect.sh +12 -0
- agent_cli/scripts/macos-hotkeys/toggle-transcription.sh +37 -0
- agent_cli/scripts/macos-hotkeys/toggle-voice-edit.sh +37 -0
- agent_cli/scripts/nvidia-asr-server/README.md +99 -0
- agent_cli/scripts/nvidia-asr-server/pyproject.toml +27 -0
- agent_cli/scripts/nvidia-asr-server/server.py +255 -0
- agent_cli/scripts/nvidia-asr-server/shell.nix +32 -0
- agent_cli/scripts/nvidia-asr-server/uv.lock +4654 -0
- agent_cli/scripts/run-openwakeword.sh +11 -0
- agent_cli/scripts/run-piper-windows.ps1 +30 -0
- agent_cli/scripts/run-piper.sh +24 -0
- agent_cli/scripts/run-whisper-linux.sh +40 -0
- agent_cli/scripts/run-whisper-macos.sh +6 -0
- agent_cli/scripts/run-whisper-windows.ps1 +51 -0
- agent_cli/scripts/run-whisper.sh +9 -0
- agent_cli/scripts/run_faster_whisper_server.py +136 -0
- agent_cli/scripts/setup-linux-hotkeys.sh +72 -0
- agent_cli/scripts/setup-linux.sh +108 -0
- agent_cli/scripts/setup-macos-hotkeys.sh +61 -0
- agent_cli/scripts/setup-macos.sh +76 -0
- agent_cli/scripts/setup-windows.ps1 +63 -0
- agent_cli/scripts/start-all-services-windows.ps1 +53 -0
- agent_cli/scripts/start-all-services.sh +178 -0
- agent_cli/scripts/sync_extras.py +138 -0
- agent_cli/server/__init__.py +3 -0
- agent_cli/server/cli.py +721 -0
- agent_cli/server/common.py +222 -0
- agent_cli/server/model_manager.py +288 -0
- agent_cli/server/model_registry.py +225 -0
- agent_cli/server/proxy/__init__.py +3 -0
- agent_cli/server/proxy/api.py +444 -0
- agent_cli/server/streaming.py +67 -0
- agent_cli/server/tts/__init__.py +3 -0
- agent_cli/server/tts/api.py +335 -0
- agent_cli/server/tts/backends/__init__.py +82 -0
- agent_cli/server/tts/backends/base.py +139 -0
- agent_cli/server/tts/backends/kokoro.py +403 -0
- agent_cli/server/tts/backends/piper.py +253 -0
- agent_cli/server/tts/model_manager.py +201 -0
- agent_cli/server/tts/model_registry.py +28 -0
- agent_cli/server/tts/wyoming_handler.py +249 -0
- agent_cli/server/whisper/__init__.py +3 -0
- agent_cli/server/whisper/api.py +413 -0
- agent_cli/server/whisper/backends/__init__.py +89 -0
- agent_cli/server/whisper/backends/base.py +97 -0
- agent_cli/server/whisper/backends/faster_whisper.py +225 -0
- agent_cli/server/whisper/backends/mlx.py +270 -0
- agent_cli/server/whisper/languages.py +116 -0
- agent_cli/server/whisper/model_manager.py +157 -0
- agent_cli/server/whisper/model_registry.py +28 -0
- agent_cli/server/whisper/wyoming_handler.py +203 -0
- agent_cli/services/__init__.py +343 -0
- agent_cli/services/_wyoming_utils.py +64 -0
- agent_cli/services/asr.py +506 -0
- agent_cli/services/llm.py +228 -0
- agent_cli/services/tts.py +450 -0
- agent_cli/services/wake_word.py +142 -0
- agent_cli-0.70.5.dist-info/METADATA +2118 -0
- agent_cli-0.70.5.dist-info/RECORD +196 -0
- agent_cli-0.70.5.dist-info/WHEEL +4 -0
- agent_cli-0.70.5.dist-info/entry_points.txt +4 -0
- agent_cli-0.70.5.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
r"""Common functionalities for voice-based agents."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import time
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
from agent_cli.core.utils import print_input_panel, print_with_style
|
|
10
|
+
from agent_cli.services import asr
|
|
11
|
+
from agent_cli.services.llm import process_and_update_clipboard
|
|
12
|
+
from agent_cli.services.tts import handle_tts_playback
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from rich.live import Live
|
|
16
|
+
|
|
17
|
+
from agent_cli import config
|
|
18
|
+
|
|
19
|
+
# Root logger (``logging.getLogger()`` is called without a name). The
# functions in this module do not reference it; they take an explicit
# ``logger`` argument instead.
LOGGER = logging.getLogger()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
async def get_instruction_from_audio(
    *,
    audio_data: bytes,
    provider_cfg: config.ProviderSelection,
    audio_input_cfg: config.AudioInput,
    wyoming_asr_cfg: config.WyomingASR,
    openai_asr_cfg: config.OpenAIASR,
    gemini_asr_cfg: config.GeminiASR,
    ollama_cfg: config.Ollama,
    logger: logging.Logger,
    quiet: bool,
) -> str | None:
    """Run ASR over a recorded clip and hand back the spoken instruction.

    A transcriber is obtained from ``asr.create_recorded_audio_transcriber``
    for ``provider_cfg`` and awaited with the audio bytes and the ASR-related
    configuration objects.

    Returns:
        The transcript exactly as the transcriber produced it, or ``None``
        when the transcript is empty / whitespace-only or when any exception
        is raised while transcribing or reporting. Failures are logged with
        ``logger.exception``; console output is skipped when ``quiet`` is set.
    """
    verbose = not quiet
    try:
        began = time.monotonic()
        transcribe = asr.create_recorded_audio_transcriber(provider_cfg)
        transcript = await transcribe(
            audio_data=audio_data,
            provider_cfg=provider_cfg,
            audio_input_cfg=audio_input_cfg,
            wyoming_asr_cfg=wyoming_asr_cfg,
            openai_asr_cfg=openai_asr_cfg,
            gemini_asr_cfg=gemini_asr_cfg,
            ollama_cfg=ollama_cfg,
            logger=logger,
            quiet=quiet,
        )
        took = time.monotonic() - began

        # A transcript only counts when it has non-whitespace content.
        has_speech = bool(transcript) and bool(transcript.strip())
        if has_speech:
            if verbose:
                print_input_panel(
                    transcript,
                    title="🎯 Instruction",
                    style="bold yellow",
                    subtitle=f"[dim]took {took:.2f}s[/dim]",
                )
            return transcript

        if verbose:
            print_with_style(
                "No speech detected in recording",
                style="yellow",
            )
        return None
    except Exception as exc:
        # Boundary handler: log with traceback and report failure as ``None``.
        logger.exception("Failed to process audio with ASR")
        if verbose:
            print_with_style(f"ASR processing failed: {exc}", style="red")
        return None
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
async def process_instruction_and_respond(
    *,
    instruction: str,
    original_text: str,
    provider_cfg: config.ProviderSelection,
    general_cfg: config.General,
    ollama_cfg: config.Ollama,
    openai_llm_cfg: config.OpenAILLM,
    gemini_llm_cfg: config.GeminiLLM,
    audio_output_cfg: config.AudioOutput,
    wyoming_tts_cfg: config.WyomingTTS,
    openai_tts_cfg: config.OpenAITTS,
    kokoro_tts_cfg: config.KokoroTTS,
    gemini_tts_cfg: config.GeminiTTS | None = None,
    system_prompt: str,
    agent_instructions: str,
    live: Live | None,
    logger: logging.Logger,
) -> str | None:
    """Send the instruction to the LLM and optionally speak the answer.

    The LLM step runs only when ``general_cfg.clipboard`` is truthy; without
    it there is no text to speak and ``None`` is returned. When the LLM step
    yields non-blank text and ``audio_output_cfg.enable_tts`` is set, the
    text is passed to ``handle_tts_playback``.

    Returns:
        The text returned by ``process_and_update_clipboard``, or ``None``
        when the LLM step was skipped.
    """
    # Without clipboard mode no LLM call is made, so nothing can be spoken.
    if not general_cfg.clipboard:
        return None

    reply: str | None = await process_and_update_clipboard(
        system_prompt=system_prompt,
        agent_instructions=agent_instructions,
        provider_cfg=provider_cfg,
        ollama_cfg=ollama_cfg,
        openai_cfg=openai_llm_cfg,
        gemini_cfg=gemini_llm_cfg,
        logger=logger,
        original_text=original_text,
        instruction=instruction,
        clipboard=general_cfg.clipboard,
        quiet=general_cfg.quiet,
        live=live,
    )

    # Speak only when TTS is enabled and the reply has non-blank content.
    speakable = audio_output_cfg.enable_tts and reply and reply.strip()
    if speakable:
        await handle_tts_playback(
            text=reply,
            provider_cfg=provider_cfg,
            audio_output_cfg=audio_output_cfg,
            wyoming_tts_cfg=wyoming_tts_cfg,
            openai_tts_cfg=openai_tts_cfg,
            kokoro_tts_cfg=kokoro_tts_cfg,
            gemini_tts_cfg=gemini_tts_cfg,
            save_file=general_cfg.save_file,
            quiet=general_cfg.quiet,
            logger=logger,
            # Playback is skipped when the audio is being saved to a file.
            play_audio=not general_cfg.save_file,
            status_message="🔊 Speaking response...",
            description="TTS audio",
            live=live,
        )

    return reply
|
|
@@ -0,0 +1,383 @@
|
|
|
1
|
+
r"""Wake word-based voice assistant that records when wake word is detected.
|
|
2
|
+
|
|
3
|
+
This agent uses Wyoming wake word detection to implement a hands-free voice assistant that:
|
|
4
|
+
1. Continuously listens for a wake word
|
|
5
|
+
2. When the wake word is detected, starts recording user speech
|
|
6
|
+
3. When the wake word is detected again, stops recording and processes the speech
|
|
7
|
+
4. Sends the recorded speech to ASR for transcription
|
|
8
|
+
5. Optionally processes the transcript with an LLM and speaks the response
|
|
9
|
+
|
|
10
|
+
WORKFLOW:
|
|
11
|
+
1. Agent starts listening for the specified wake word
|
|
12
|
+
2. First wake word detection -> start recording user speech
|
|
13
|
+
3. Second wake word detection -> stop recording and process the speech
|
|
14
|
+
4. Transcribe the recorded speech using Wyoming ASR
|
|
15
|
+
5. Optionally process with LLM and respond with TTS
|
|
16
|
+
|
|
17
|
+
USAGE:
|
|
18
|
+
- Start the agent: assistant --wake-word "ok_nabu" --input-device-index 1
|
|
19
|
+
- The agent runs continuously until stopped with Ctrl+C or --stop
|
|
20
|
+
- Uses background process management for daemon-like operation
|
|
21
|
+
|
|
22
|
+
REQUIREMENTS:
|
|
23
|
+
- Wyoming wake word server (e.g., wyoming-openwakeword)
|
|
24
|
+
- Wyoming ASR server (e.g., wyoming-whisper)
|
|
25
|
+
- Optional: Wyoming TTS server for responses
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
from __future__ import annotations
|
|
29
|
+
|
|
30
|
+
import asyncio
|
|
31
|
+
import logging
|
|
32
|
+
from contextlib import suppress
|
|
33
|
+
from pathlib import Path # noqa: TC003
|
|
34
|
+
from typing import TYPE_CHECKING
|
|
35
|
+
|
|
36
|
+
from agent_cli import config, opts
|
|
37
|
+
from agent_cli.agents._voice_agent_common import (
|
|
38
|
+
get_instruction_from_audio,
|
|
39
|
+
process_instruction_and_respond,
|
|
40
|
+
)
|
|
41
|
+
from agent_cli.cli import app
|
|
42
|
+
from agent_cli.core import audio, process
|
|
43
|
+
from agent_cli.core.audio import setup_devices
|
|
44
|
+
from agent_cli.core.deps import requires_extras
|
|
45
|
+
from agent_cli.core.utils import (
|
|
46
|
+
InteractiveStopEvent,
|
|
47
|
+
maybe_live,
|
|
48
|
+
print_command_line_args,
|
|
49
|
+
print_with_style,
|
|
50
|
+
setup_logging,
|
|
51
|
+
signal_handling_context,
|
|
52
|
+
stop_or_status_or_toggle,
|
|
53
|
+
)
|
|
54
|
+
from agent_cli.services import asr
|
|
55
|
+
from agent_cli.services.wake_word import create_wake_word_detector
|
|
56
|
+
|
|
57
|
+
if TYPE_CHECKING:
|
|
58
|
+
import sounddevice as sd
|
|
59
|
+
from rich.live import Live
|
|
60
|
+
|
|
61
|
+
# Root logger (``logging.getLogger()`` is called without a name); ``_async_main``
# passes it to the recording, transcription and response helpers.
LOGGER = logging.getLogger()
|
|
62
|
+
|
|
63
|
+
# Maps a wake-word identifier to spellings of it that may show up in a
# transcript. The ``assistant`` command comma-joins the list for the configured
# wake word and substitutes it for ``{variations}`` in the prompt templates;
# an identifier that is not listed here produces an empty string there.
WAKE_WORD_VARIATIONS = {
    "ok_nabu": ["ok nabu", "okay nabu", "okay, nabu", "ok, nabu", "ok naboo", "okay naboo"],
    "alexa": ["alexa"],
    "hey_jarvis": ["hey jarvis"],
}
|
|
68
|
+
|
|
69
|
+
# LLM Prompts for wake word assistant
|
|
70
|
+
SYSTEM_PROMPT_TEMPLATE = """\
|
|
71
|
+
You are a helpful voice assistant. Respond to user questions and commands in a conversational, friendly manner.
|
|
72
|
+
|
|
73
|
+
The user is using a wake word to start and stop the recording, so the wake word will always appear at the END of the transcription.
|
|
74
|
+
The wake word is "{wake_word}". You should ignore the wake word and any variations of it (e.g., "{variations}") when processing the user's command.
|
|
75
|
+
|
|
76
|
+
Keep your responses concise but informative. If the user asks you to perform an action that requires external tools or systems, explain what you would do if you had access to those capabilities.
|
|
77
|
+
|
|
78
|
+
Always be helpful, accurate, and engaging in your responses.
|
|
79
|
+
"""
|
|
80
|
+
|
|
81
|
+
AGENT_INSTRUCTIONS_TEMPLATE = """\
|
|
82
|
+
The user has spoken a voice command or question. The user is using a wake word to start and stop the recording. The wake word is "{wake_word}". You should ignore the wake word and any variations of it (e.g., "{variations}") when processing the user's command.
|
|
83
|
+
|
|
84
|
+
Provide a helpful, conversational response.
|
|
85
|
+
|
|
86
|
+
If it's a question, answer it clearly and concisely.
|
|
87
|
+
If it's a command, explain what you would do or provide guidance on how to accomplish it.
|
|
88
|
+
If it's unclear, ask for clarification in a friendly way.
|
|
89
|
+
|
|
90
|
+
Respond as if you're having a natural conversation.
|
|
91
|
+
"""
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
async def _record_audio_with_wake_word(
    stream: sd.InputStream,
    stop_event: InteractiveStopEvent,
    logger: logging.Logger,
    *,
    wake_word_cfg: config.WakeWord,
    quiet: bool = False,
    live: Live | None = None,
) -> bytes | None:
    """Record audio to a buffer using wake word detection to start and stop.

    The input stream is fanned out through ``audio.tee_audio_stream``. One
    queue feeds the wake word detector for both the start and the stop
    detection; a second queue, added after the first detection, feeds
    ``asr.record_audio_to_buffer`` running as a background task.

    Args:
        stream: Open input stream to read audio from.
        stop_event: Checked after each detection; when set, the recording is
            discarded.
        logger: Logger handed to the tee, the detector and the recorder.
        wake_word_cfg: Wake word settings used to build the detector and to
            display the configured wake word.
        quiet: Suppress console messages when true.
        live: Optional live display forwarded to the detector.

    Returns:
        The result of the recording task, or ``None`` when either detection
        returned nothing or ``stop_event`` was set.
    """
    if not quiet:
        print_with_style(
            f"👂 Listening for wake word: [bold yellow]{wake_word_cfg.wake_word}[/bold yellow]",
        )
        print_with_style(
            "Say the wake word to start recording, then say it again to stop and process.",
            style="dim",
        )

    async with audio.tee_audio_stream(stream, stop_event, logger) as tee:
        # Create a queue for wake word detection
        wake_queue = await tee.add_queue()

        detector = create_wake_word_detector(wake_word_cfg)
        # First detection: wait for the wake word that starts the recording.
        detected_word = await detector(
            logger=logger,
            queue=wake_queue,
            quiet=quiet,
            live=live,
        )

        if not detected_word or stop_event.is_set():
            # Clean up the queue if we exit early
            await tee.remove_queue(wake_queue)
            return None

        if not quiet:
            print_with_style(
                f"✅ Wake word '{detected_word}' detected! Starting recording...",
                style="green",
            )

        # Add a new queue for recording
        record_queue = await tee.add_queue()
        record_task = asyncio.create_task(asr.record_audio_to_buffer(record_queue, logger))

        # Use the same wake_queue for stop-word detection
        # NOTE(review): if this call raises, ``record_task`` is neither awaited
        # nor cancelled here — confirm that leaving the tee context is enough
        # to end it.
        stop_detected_word = await detector(
            logger=logger,
            queue=wake_queue,
            quiet=quiet,
            live=live,
            progress_message="Recording... (say wake word to stop)",
        )

        # Stop the recording task by removing its queue
        await tee.remove_queue(record_queue)
        audio_data = await record_task

        # Clean up the wake queue
        await tee.remove_queue(wake_queue)

        # The recording is dropped when no stop word was reported or a stop
        # was requested in the meantime.
        if not stop_detected_word or stop_event.is_set():
            return None

        if not quiet:
            print_with_style(
                f"🛑 Wake word '{stop_detected_word}' detected! Stopping recording...",
                style="yellow",
            )

        return audio_data
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
async def _async_main(
    *,
    provider_cfg: config.ProviderSelection,
    general_cfg: config.General,
    audio_in_cfg: config.AudioInput,
    wyoming_asr_cfg: config.WyomingASR,
    openai_asr_cfg: config.OpenAIASR,
    gemini_asr_cfg: config.GeminiASR,
    ollama_cfg: config.Ollama,
    openai_llm_cfg: config.OpenAILLM,
    gemini_llm_cfg: config.GeminiLLM,
    audio_out_cfg: config.AudioOutput,
    wyoming_tts_cfg: config.WyomingTTS,
    openai_tts_cfg: config.OpenAITTS,
    kokoro_tts_cfg: config.KokoroTTS,
    gemini_tts_cfg: config.GeminiTTS,
    wake_word_cfg: config.WakeWord,
    system_prompt: str,
    agent_instructions: str,
    live: Live | None,
) -> None:
    """Drive the record -> transcribe -> respond loop of the assistant.

    Resolves the audio devices (returning immediately when ``setup_devices``
    yields ``None``), writes the resolved indices back onto the audio configs,
    opens the input stream and then repeats until the stop event is set:
    record a wake-word-delimited clip, transcribe it, and pass the transcript
    to ``process_instruction_and_respond``.
    """
    devices = setup_devices(general_cfg, audio_in_cfg, audio_out_cfg)
    if devices is None:
        return
    mic_index, _, speaker_index = devices
    audio_in_cfg.input_device_index = mic_index
    audio_out_cfg.output_device_index = speaker_index

    # Arguments that stay the same for every iteration of the loop.
    transcription_kwargs = {
        "provider_cfg": provider_cfg,
        "audio_input_cfg": audio_in_cfg,
        "wyoming_asr_cfg": wyoming_asr_cfg,
        "openai_asr_cfg": openai_asr_cfg,
        "gemini_asr_cfg": gemini_asr_cfg,
        "ollama_cfg": ollama_cfg,
        "logger": LOGGER,
    }
    response_kwargs = {
        "provider_cfg": provider_cfg,
        "general_cfg": general_cfg,
        "ollama_cfg": ollama_cfg,
        "openai_llm_cfg": openai_llm_cfg,
        "gemini_llm_cfg": gemini_llm_cfg,
        "audio_output_cfg": audio_out_cfg,
        "wyoming_tts_cfg": wyoming_tts_cfg,
        "openai_tts_cfg": openai_tts_cfg,
        "kokoro_tts_cfg": kokoro_tts_cfg,
        "gemini_tts_cfg": gemini_tts_cfg,
        "system_prompt": system_prompt,
        "agent_instructions": agent_instructions,
        "live": live,
        "logger": LOGGER,
    }

    with (
        audio.open_audio_stream(audio.setup_input_stream(mic_index)) as stream,
        signal_handling_context(LOGGER, general_cfg.quiet) as stop_event,
    ):
        while not stop_event.is_set():
            recording = await _record_audio_with_wake_word(
                stream,
                stop_event,
                LOGGER,
                wake_word_cfg=wake_word_cfg,
                quiet=general_cfg.quiet,
                live=live,
            )

            if not recording:
                if not general_cfg.quiet:
                    print_with_style("No audio recorded", style="yellow")
                continue

            # A stop request that arrived while recording ends the loop here.
            if stop_event.is_set():
                break

            spoken = await get_instruction_from_audio(
                audio_data=recording,
                quiet=general_cfg.quiet,
                **transcription_kwargs,
            )
            if not spoken:
                continue

            await process_instruction_and_respond(
                instruction=spoken,
                original_text="",
                **response_kwargs,
            )

            if not general_cfg.quiet:
                print_with_style("✨ Ready for next command...", style="green")
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
# NOTE(review): the docstring below is presumably what the CLI framework shows
# as this command's help text, so it is left unchanged; details are documented
# with comments instead.
@app.command("assistant", rich_help_panel="Voice Commands")
@requires_extras("audio", "llm")
def assistant(
    *,
    # --- Provider Selection ---
    asr_provider: str = opts.ASR_PROVIDER,
    llm_provider: str = opts.LLM_PROVIDER,
    tts_provider: str = opts.TTS_PROVIDER,
    # --- Wake Word Configuration ---
    wake_server_ip: str = opts.WAKE_SERVER_IP,
    wake_server_port: int = opts.WAKE_SERVER_PORT,
    wake_word: str = opts.WAKE_WORD,
    # --- ASR (Audio) Configuration ---
    input_device_index: int | None = opts.INPUT_DEVICE_INDEX,
    input_device_name: str | None = opts.INPUT_DEVICE_NAME,
    asr_wyoming_ip: str = opts.ASR_WYOMING_IP,
    asr_wyoming_port: int = opts.ASR_WYOMING_PORT,
    asr_openai_model: str = opts.ASR_OPENAI_MODEL,
    asr_gemini_model: str = opts.ASR_GEMINI_MODEL,
    # --- LLM Configuration ---
    llm_ollama_model: str = opts.LLM_OLLAMA_MODEL,
    llm_ollama_host: str = opts.LLM_OLLAMA_HOST,
    llm_openai_model: str = opts.LLM_OPENAI_MODEL,
    openai_api_key: str | None = opts.OPENAI_API_KEY,
    openai_base_url: str | None = opts.OPENAI_BASE_URL,
    llm_gemini_model: str = opts.LLM_GEMINI_MODEL,
    gemini_api_key: str | None = opts.GEMINI_API_KEY,
    # --- TTS Configuration ---
    enable_tts: bool = opts.ENABLE_TTS,
    output_device_index: int | None = opts.OUTPUT_DEVICE_INDEX,
    output_device_name: str | None = opts.OUTPUT_DEVICE_NAME,
    tts_speed: float = opts.TTS_SPEED,
    tts_wyoming_ip: str = opts.TTS_WYOMING_IP,
    tts_wyoming_port: int = opts.TTS_WYOMING_PORT,
    tts_wyoming_voice: str | None = opts.TTS_WYOMING_VOICE,
    tts_wyoming_language: str | None = opts.TTS_WYOMING_LANGUAGE,
    tts_wyoming_speaker: str | None = opts.TTS_WYOMING_SPEAKER,
    tts_openai_model: str = opts.TTS_OPENAI_MODEL,
    tts_openai_voice: str = opts.TTS_OPENAI_VOICE,
    tts_openai_base_url: str | None = opts.TTS_OPENAI_BASE_URL,
    tts_kokoro_model: str = opts.TTS_KOKORO_MODEL,
    tts_kokoro_voice: str = opts.TTS_KOKORO_VOICE,
    tts_kokoro_host: str = opts.TTS_KOKORO_HOST,
    tts_gemini_model: str = opts.TTS_GEMINI_MODEL,
    tts_gemini_voice: str = opts.TTS_GEMINI_VOICE,
    # --- Process Management ---
    stop: bool = opts.STOP,
    status: bool = opts.STATUS,
    toggle: bool = opts.TOGGLE,
    # --- General Options ---
    save_file: Path | None = opts.SAVE_FILE,
    clipboard: bool = opts.CLIPBOARD,
    log_level: opts.LogLevel = opts.LOG_LEVEL,
    log_file: str | None = opts.LOG_FILE,
    list_devices: bool = opts.LIST_DEVICES,
    quiet: bool = opts.QUIET,
    config_file: str | None = opts.CONFIG_FILE,
    print_args: bool = opts.PRINT_ARGS,
) -> None:
    """Wake word-based voice assistant using local or remote services."""
    # No other local has been assigned yet, so ``locals()`` holds exactly the
    # command's parameters at this point.
    if print_args:
        print_command_line_args(locals())

    setup_logging(log_level, log_file, quiet=quiet)
    general_cfg = config.General(
        log_level=log_level,
        log_file=log_file,
        quiet=quiet,
        list_devices=list_devices,
        clipboard=clipboard,
        save_file=save_file,
    )
    process_name = "assistant"
    # A truthy result ends the command here — presumably the stop/status/toggle
    # request was fully handled by the helper.
    if stop_or_status_or_toggle(
        process_name,
        "wake word assistant",
        stop,
        status,
        toggle,
        quiet=general_cfg.quiet,
    ):
        return

    # The PID file is registered under the same name that stop/status/toggle
    # used above; KeyboardInterrupt is suppressed for the whole run.
    with (
        process.pid_file_context(process_name),
        suppress(KeyboardInterrupt),
        maybe_live(not general_cfg.quiet) as live,
    ):
        # Provider configs are built from this function's local namespace —
        # presumably looked up by parameter name, so renaming parameters or
        # introducing locals above this line may change what the helper sees.
        cfgs = config.create_provider_configs_from_locals(locals())
        wake_word_cfg = config.WakeWord(
            wake_server_ip=wake_server_ip,
            wake_server_port=wake_server_port,
            wake_word=wake_word,
        )

        # Wake words without an entry in WAKE_WORD_VARIATIONS yield an empty
        # string, which is substituted into both prompts as-is.
        variations = ", ".join(WAKE_WORD_VARIATIONS.get(wake_word_cfg.wake_word, []))
        system_prompt = SYSTEM_PROMPT_TEMPLATE.format(
            wake_word=wake_word_cfg.wake_word,
            variations=variations,
        )
        agent_instructions = AGENT_INSTRUCTIONS_TEMPLATE.format(
            wake_word=wake_word_cfg.wake_word,
            variations=variations,
        )

        asyncio.run(
            _async_main(
                provider_cfg=cfgs.provider,
                general_cfg=general_cfg,
                audio_in_cfg=cfgs.audio_in,
                wyoming_asr_cfg=cfgs.wyoming_asr,
                openai_asr_cfg=cfgs.openai_asr,
                gemini_asr_cfg=cfgs.gemini_asr,
                ollama_cfg=cfgs.ollama,
                openai_llm_cfg=cfgs.openai_llm,
                gemini_llm_cfg=cfgs.gemini_llm,
                audio_out_cfg=cfgs.audio_out,
                wyoming_tts_cfg=cfgs.wyoming_tts,
                openai_tts_cfg=cfgs.openai_tts,
                kokoro_tts_cfg=cfgs.kokoro_tts,
                gemini_tts_cfg=cfgs.gemini_tts,
                wake_word_cfg=wake_word_cfg,
                system_prompt=system_prompt,
                agent_instructions=agent_instructions,
                live=live,
            ),
        )
|