agent-cli 0.70.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_cli/__init__.py +5 -0
- agent_cli/__main__.py +6 -0
- agent_cli/_extras.json +14 -0
- agent_cli/_requirements/.gitkeep +0 -0
- agent_cli/_requirements/audio.txt +79 -0
- agent_cli/_requirements/faster-whisper.txt +215 -0
- agent_cli/_requirements/kokoro.txt +425 -0
- agent_cli/_requirements/llm.txt +183 -0
- agent_cli/_requirements/memory.txt +355 -0
- agent_cli/_requirements/mlx-whisper.txt +222 -0
- agent_cli/_requirements/piper.txt +176 -0
- agent_cli/_requirements/rag.txt +402 -0
- agent_cli/_requirements/server.txt +154 -0
- agent_cli/_requirements/speed.txt +77 -0
- agent_cli/_requirements/vad.txt +155 -0
- agent_cli/_requirements/wyoming.txt +71 -0
- agent_cli/_tools.py +368 -0
- agent_cli/agents/__init__.py +23 -0
- agent_cli/agents/_voice_agent_common.py +136 -0
- agent_cli/agents/assistant.py +383 -0
- agent_cli/agents/autocorrect.py +284 -0
- agent_cli/agents/chat.py +496 -0
- agent_cli/agents/memory/__init__.py +31 -0
- agent_cli/agents/memory/add.py +190 -0
- agent_cli/agents/memory/proxy.py +160 -0
- agent_cli/agents/rag_proxy.py +128 -0
- agent_cli/agents/speak.py +209 -0
- agent_cli/agents/transcribe.py +671 -0
- agent_cli/agents/transcribe_daemon.py +499 -0
- agent_cli/agents/voice_edit.py +291 -0
- agent_cli/api.py +22 -0
- agent_cli/cli.py +106 -0
- agent_cli/config.py +503 -0
- agent_cli/config_cmd.py +307 -0
- agent_cli/constants.py +27 -0
- agent_cli/core/__init__.py +1 -0
- agent_cli/core/audio.py +461 -0
- agent_cli/core/audio_format.py +299 -0
- agent_cli/core/chroma.py +88 -0
- agent_cli/core/deps.py +191 -0
- agent_cli/core/openai_proxy.py +139 -0
- agent_cli/core/process.py +195 -0
- agent_cli/core/reranker.py +120 -0
- agent_cli/core/sse.py +87 -0
- agent_cli/core/transcription_logger.py +70 -0
- agent_cli/core/utils.py +526 -0
- agent_cli/core/vad.py +175 -0
- agent_cli/core/watch.py +65 -0
- agent_cli/dev/__init__.py +14 -0
- agent_cli/dev/cli.py +1588 -0
- agent_cli/dev/coding_agents/__init__.py +19 -0
- agent_cli/dev/coding_agents/aider.py +24 -0
- agent_cli/dev/coding_agents/base.py +167 -0
- agent_cli/dev/coding_agents/claude.py +39 -0
- agent_cli/dev/coding_agents/codex.py +24 -0
- agent_cli/dev/coding_agents/continue_dev.py +15 -0
- agent_cli/dev/coding_agents/copilot.py +24 -0
- agent_cli/dev/coding_agents/cursor_agent.py +48 -0
- agent_cli/dev/coding_agents/gemini.py +28 -0
- agent_cli/dev/coding_agents/opencode.py +15 -0
- agent_cli/dev/coding_agents/registry.py +49 -0
- agent_cli/dev/editors/__init__.py +19 -0
- agent_cli/dev/editors/base.py +89 -0
- agent_cli/dev/editors/cursor.py +15 -0
- agent_cli/dev/editors/emacs.py +46 -0
- agent_cli/dev/editors/jetbrains.py +56 -0
- agent_cli/dev/editors/nano.py +31 -0
- agent_cli/dev/editors/neovim.py +33 -0
- agent_cli/dev/editors/registry.py +59 -0
- agent_cli/dev/editors/sublime.py +20 -0
- agent_cli/dev/editors/vim.py +42 -0
- agent_cli/dev/editors/vscode.py +15 -0
- agent_cli/dev/editors/zed.py +20 -0
- agent_cli/dev/project.py +568 -0
- agent_cli/dev/registry.py +52 -0
- agent_cli/dev/skill/SKILL.md +141 -0
- agent_cli/dev/skill/examples.md +571 -0
- agent_cli/dev/terminals/__init__.py +19 -0
- agent_cli/dev/terminals/apple_terminal.py +82 -0
- agent_cli/dev/terminals/base.py +56 -0
- agent_cli/dev/terminals/gnome.py +51 -0
- agent_cli/dev/terminals/iterm2.py +84 -0
- agent_cli/dev/terminals/kitty.py +77 -0
- agent_cli/dev/terminals/registry.py +48 -0
- agent_cli/dev/terminals/tmux.py +58 -0
- agent_cli/dev/terminals/warp.py +132 -0
- agent_cli/dev/terminals/zellij.py +78 -0
- agent_cli/dev/worktree.py +856 -0
- agent_cli/docs_gen.py +417 -0
- agent_cli/example-config.toml +185 -0
- agent_cli/install/__init__.py +5 -0
- agent_cli/install/common.py +89 -0
- agent_cli/install/extras.py +174 -0
- agent_cli/install/hotkeys.py +48 -0
- agent_cli/install/services.py +87 -0
- agent_cli/memory/__init__.py +7 -0
- agent_cli/memory/_files.py +250 -0
- agent_cli/memory/_filters.py +63 -0
- agent_cli/memory/_git.py +157 -0
- agent_cli/memory/_indexer.py +142 -0
- agent_cli/memory/_ingest.py +408 -0
- agent_cli/memory/_persistence.py +182 -0
- agent_cli/memory/_prompt.py +91 -0
- agent_cli/memory/_retrieval.py +294 -0
- agent_cli/memory/_store.py +169 -0
- agent_cli/memory/_streaming.py +44 -0
- agent_cli/memory/_tasks.py +48 -0
- agent_cli/memory/api.py +113 -0
- agent_cli/memory/client.py +272 -0
- agent_cli/memory/engine.py +361 -0
- agent_cli/memory/entities.py +43 -0
- agent_cli/memory/models.py +112 -0
- agent_cli/opts.py +433 -0
- agent_cli/py.typed +0 -0
- agent_cli/rag/__init__.py +3 -0
- agent_cli/rag/_indexer.py +67 -0
- agent_cli/rag/_indexing.py +226 -0
- agent_cli/rag/_prompt.py +30 -0
- agent_cli/rag/_retriever.py +156 -0
- agent_cli/rag/_store.py +48 -0
- agent_cli/rag/_utils.py +218 -0
- agent_cli/rag/api.py +175 -0
- agent_cli/rag/client.py +299 -0
- agent_cli/rag/engine.py +302 -0
- agent_cli/rag/models.py +55 -0
- agent_cli/scripts/.runtime/.gitkeep +0 -0
- agent_cli/scripts/__init__.py +1 -0
- agent_cli/scripts/check_plugin_skill_sync.py +50 -0
- agent_cli/scripts/linux-hotkeys/README.md +63 -0
- agent_cli/scripts/linux-hotkeys/toggle-autocorrect.sh +45 -0
- agent_cli/scripts/linux-hotkeys/toggle-transcription.sh +58 -0
- agent_cli/scripts/linux-hotkeys/toggle-voice-edit.sh +58 -0
- agent_cli/scripts/macos-hotkeys/README.md +45 -0
- agent_cli/scripts/macos-hotkeys/skhd-config-example +5 -0
- agent_cli/scripts/macos-hotkeys/toggle-autocorrect.sh +12 -0
- agent_cli/scripts/macos-hotkeys/toggle-transcription.sh +37 -0
- agent_cli/scripts/macos-hotkeys/toggle-voice-edit.sh +37 -0
- agent_cli/scripts/nvidia-asr-server/README.md +99 -0
- agent_cli/scripts/nvidia-asr-server/pyproject.toml +27 -0
- agent_cli/scripts/nvidia-asr-server/server.py +255 -0
- agent_cli/scripts/nvidia-asr-server/shell.nix +32 -0
- agent_cli/scripts/nvidia-asr-server/uv.lock +4654 -0
- agent_cli/scripts/run-openwakeword.sh +11 -0
- agent_cli/scripts/run-piper-windows.ps1 +30 -0
- agent_cli/scripts/run-piper.sh +24 -0
- agent_cli/scripts/run-whisper-linux.sh +40 -0
- agent_cli/scripts/run-whisper-macos.sh +6 -0
- agent_cli/scripts/run-whisper-windows.ps1 +51 -0
- agent_cli/scripts/run-whisper.sh +9 -0
- agent_cli/scripts/run_faster_whisper_server.py +136 -0
- agent_cli/scripts/setup-linux-hotkeys.sh +72 -0
- agent_cli/scripts/setup-linux.sh +108 -0
- agent_cli/scripts/setup-macos-hotkeys.sh +61 -0
- agent_cli/scripts/setup-macos.sh +76 -0
- agent_cli/scripts/setup-windows.ps1 +63 -0
- agent_cli/scripts/start-all-services-windows.ps1 +53 -0
- agent_cli/scripts/start-all-services.sh +178 -0
- agent_cli/scripts/sync_extras.py +138 -0
- agent_cli/server/__init__.py +3 -0
- agent_cli/server/cli.py +721 -0
- agent_cli/server/common.py +222 -0
- agent_cli/server/model_manager.py +288 -0
- agent_cli/server/model_registry.py +225 -0
- agent_cli/server/proxy/__init__.py +3 -0
- agent_cli/server/proxy/api.py +444 -0
- agent_cli/server/streaming.py +67 -0
- agent_cli/server/tts/__init__.py +3 -0
- agent_cli/server/tts/api.py +335 -0
- agent_cli/server/tts/backends/__init__.py +82 -0
- agent_cli/server/tts/backends/base.py +139 -0
- agent_cli/server/tts/backends/kokoro.py +403 -0
- agent_cli/server/tts/backends/piper.py +253 -0
- agent_cli/server/tts/model_manager.py +201 -0
- agent_cli/server/tts/model_registry.py +28 -0
- agent_cli/server/tts/wyoming_handler.py +249 -0
- agent_cli/server/whisper/__init__.py +3 -0
- agent_cli/server/whisper/api.py +413 -0
- agent_cli/server/whisper/backends/__init__.py +89 -0
- agent_cli/server/whisper/backends/base.py +97 -0
- agent_cli/server/whisper/backends/faster_whisper.py +225 -0
- agent_cli/server/whisper/backends/mlx.py +270 -0
- agent_cli/server/whisper/languages.py +116 -0
- agent_cli/server/whisper/model_manager.py +157 -0
- agent_cli/server/whisper/model_registry.py +28 -0
- agent_cli/server/whisper/wyoming_handler.py +203 -0
- agent_cli/services/__init__.py +343 -0
- agent_cli/services/_wyoming_utils.py +64 -0
- agent_cli/services/asr.py +506 -0
- agent_cli/services/llm.py +228 -0
- agent_cli/services/tts.py +450 -0
- agent_cli/services/wake_word.py +142 -0
- agent_cli-0.70.5.dist-info/METADATA +2118 -0
- agent_cli-0.70.5.dist-info/RECORD +196 -0
- agent_cli-0.70.5.dist-info/WHEEL +4 -0
- agent_cli-0.70.5.dist-info/entry_points.txt +4 -0
- agent_cli-0.70.5.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,343 @@
|
|
|
1
|
+
"""Module for interacting with online services like OpenAI and Gemini."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import io
|
|
6
|
+
import wave
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
from agent_cli import constants
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
import logging
|
|
13
|
+
|
|
14
|
+
from openai import AsyncOpenAI
|
|
15
|
+
|
|
16
|
+
from agent_cli import config
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
_RIFF_HEADER = b"RIFF"
|
|
20
|
+
_LOG_TRUNCATE_LENGTH = 100
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _is_wav_file(data: bytes) -> bool:
|
|
24
|
+
"""Check if data is a WAV file by looking for RIFF header."""
|
|
25
|
+
return len(data) >= len(_RIFF_HEADER) and data[: len(_RIFF_HEADER)] == _RIFF_HEADER
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def pcm_to_wav(
    pcm_data: bytes,
    *,
    sample_rate: int = 16000,
    sample_width: int = 2,
    channels: int = 1,
) -> bytes:
    """Wrap headerless PCM samples in an in-memory WAV container.

    Args:
        pcm_data: Raw PCM audio bytes, written to the container unchanged.
        sample_rate: Frames per second in Hz (default: 16000).
        sample_width: Size of one sample in bytes (default: 2, i.e. 16-bit).
        channels: Channel count (default: 1, i.e. mono).

    Returns:
        The complete WAV file (header followed by ``pcm_data``) as bytes.

    """
    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as writer:
        # A frame count of 0 lets the writer derive the real count from the
        # data it is handed; "NONE" is the only compression type it accepts.
        writer.setparams((channels, sample_width, sample_rate, 0, "NONE", "not compressed"))
        writer.writeframes(pcm_data)
    # Read the buffer only after the writer is closed so the header is final.
    return buffer.getvalue()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# MIME type sent to Gemini for each recognised (lower-case) file extension.
_GEMINI_MIME_TYPES: dict[str, str] = {
    ".wav": "audio/wav",
    ".mp3": "audio/mp3",
    ".aiff": "audio/aiff",
    ".aac": "audio/aac",
    ".ogg": "audio/ogg",
    ".flac": "audio/flac",
    ".m4a": "audio/mp4",  # .m4a is audio stored in an MP4 container
}

# Extensions accepted for Gemini -- exactly the keys of the MIME table above.
GEMINI_SUPPORTED_FORMATS: frozenset[str] = frozenset(_GEMINI_MIME_TYPES)

# Extensions accepted by the OpenAI Whisper API.
OPENAI_SUPPORTED_FORMATS: frozenset[str] = frozenset(
    (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm"),
)


# Base instruction sent with every Gemini transcription request.
_GEMINI_TRANSCRIPTION_PROMPT = (
    "Transcribe this audio accurately. Return only the transcription text, "
    "nothing else. Do not include any prefixes, labels, or explanations."
)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
async def transcribe_audio_gemini(
    audio_data: bytes,
    gemini_asr_cfg: config.GeminiASR,
    logger: logging.Logger,
    *,
    file_suffix: str = ".wav",
    extra_instructions: str | None = None,
    **_kwargs: object,
) -> str:
    """Transcribe audio using Gemini's native audio understanding.

    The audio is sent inline together with a fixed transcription prompt.
    Supports WAV, MP3, AIFF, AAC, OGG, FLAC, and M4A input; an unrecognised
    ``file_suffix`` is sent with the ``audio/wav`` MIME type.

    Args:
        audio_data: Audio bytes (can be raw PCM or complete audio file)
        gemini_asr_cfg: Gemini ASR configuration
        logger: Logger instance
        file_suffix: File extension for MIME type detection (default: .wav)
        extra_instructions: Additional context/instructions to improve transcription
        **_kwargs: Accepted and ignored.

    Returns:
        The transcription with surrounding whitespace removed, or an empty
        string when Gemini returned no text.

    Raises:
        ValueError: If the Gemini API key is not set.

    """
    from google import genai  # noqa: PLC0415
    from google.genai import types  # noqa: PLC0415

    if not gemini_asr_cfg.gemini_api_key:
        msg = "Gemini API key is not set."
        raise ValueError(msg)

    logger.info("Transcribing audio with Gemini %s...", gemini_asr_cfg.asr_gemini_model)

    # Determine MIME type from file suffix
    suffix = file_suffix.lower()
    mime_type = _GEMINI_MIME_TYPES.get(suffix, "audio/wav")
    has_wav_header = _is_wav_file(audio_data)

    logger.debug(
        "Received audio: size=%d bytes, file_suffix=%s, is_wav=%s",
        len(audio_data),
        file_suffix,
        has_wav_header,
    )

    # A ".wav" payload without a RIFF header is taken to be raw PCM and gets a
    # WAV header; payloads with any other suffix are forwarded untouched.
    if suffix == ".wav" and not has_wav_header:
        logger.debug("Wrapping raw PCM data with WAV header (16kHz, 16-bit, mono)")
        audio_data = pcm_to_wav(
            audio_data,
            sample_rate=constants.AUDIO_RATE,
            sample_width=constants.AUDIO_FORMAT_WIDTH,
            channels=constants.AUDIO_CHANNELS,
        )

    logger.debug("Using MIME type: %s", mime_type)

    # Build the transcription prompt with optional context
    effective_prompt = gemini_asr_cfg.get_effective_prompt(extra_instructions)
    if effective_prompt:
        prompt = f"{_GEMINI_TRANSCRIPTION_PROMPT}\n\nContext: {effective_prompt}"
        logger.debug("Using Gemini ASR with context prompt")
    else:
        prompt = _GEMINI_TRANSCRIPTION_PROMPT

    client = genai.Client(api_key=gemini_asr_cfg.gemini_api_key)

    response = await client.aio.models.generate_content(
        model=gemini_asr_cfg.asr_gemini_model,
        contents=[
            prompt,
            types.Part.from_bytes(data=audio_data, mime_type=mime_type),
        ],
    )
    # `response.text` is optional in google-genai: it is None when the reply
    # carries no text part. Treat that as an empty transcription so the
    # warning below is reached instead of an AttributeError on `.strip()`.
    text = (response.text or "").strip()

    if text:
        logger.info(
            "Transcription result: %s",
            text[:_LOG_TRUNCATE_LENGTH] + "..." if len(text) > _LOG_TRUNCATE_LENGTH else text,
        )
    else:
        logger.warning(
            "Empty transcription returned - audio may be silent, corrupted, or in wrong format",
        )

    return text
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _get_openai_client(api_key: str | None, base_url: str | None = None) -> AsyncOpenAI:
    """Build an ``AsyncOpenAI`` client for the official API or a custom endpoint.

    A missing or empty ``api_key`` is replaced with a placeholder value, because
    custom endpoints (``base_url`` set) may not require authentication.
    """
    from openai import AsyncOpenAI  # noqa: PLC0415

    if not api_key:
        # No key supplied: fall back to a dummy one.
        api_key = "dummy-api-key"
    return AsyncOpenAI(api_key=api_key, base_url=base_url)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
async def transcribe_audio_openai(
    audio_data: bytes,
    openai_asr_cfg: config.OpenAIASR,
    logger: logging.Logger,
    *,
    file_suffix: str = ".wav",
    extra_instructions: str | None = None,
    **_kwargs: object,  # Accept extra kwargs for consistency with Wyoming
) -> str:
    """Transcribe audio through OpenAI's Whisper API or a compatible server.

    OpenAI Whisper accepts mp3, mp4, mpeg, mpga, m4a, wav, and webm uploads.

    If ``openai_base_url`` is configured the request goes to that endpoint
    (for example a self-hosted Whisper server) and no API key is required;
    otherwise the official OpenAI API is used and a key is mandatory.

    Args:
        audio_data: Audio bytes (either raw PCM or a complete audio file)
        openai_asr_cfg: OpenAI ASR configuration
        logger: Logger instance
        file_suffix: Extension used for the uploaded filename (default: .wav)
        extra_instructions: Extra context/instructions to improve transcription

    Returns:
        The transcription text as returned by the endpoint.

    Raises:
        ValueError: If no custom endpoint is set and the API key is missing.

    """
    custom_endpoint = openai_asr_cfg.openai_base_url
    if custom_endpoint:
        logger.info(
            "Transcribing audio with custom OpenAI-compatible endpoint: %s",
            custom_endpoint,
        )
    else:
        logger.info("Transcribing audio with OpenAI Whisper...")
        if not openai_asr_cfg.openai_api_key:
            msg = "OpenAI API key is not set."
            raise ValueError(msg)

    client = _get_openai_client(
        api_key=openai_asr_cfg.openai_api_key,
        base_url=openai_asr_cfg.openai_base_url,
    )

    logger.debug(
        "Received audio: size=%d bytes, file_suffix=%s, is_wav=%s",
        len(audio_data),
        file_suffix,
        _is_wav_file(audio_data),
    )

    # A ".wav" payload lacking a RIFF header is taken to be raw PCM; custom
    # endpoints such as faster-whisper need a real container, so add one.
    looks_like_raw_pcm = file_suffix.lower() == ".wav" and not _is_wav_file(audio_data)
    if looks_like_raw_pcm:
        logger.debug("Wrapping raw PCM data with WAV header (16kHz, 16-bit, mono)")
        audio_data = pcm_to_wav(
            audio_data,
            sample_rate=constants.AUDIO_RATE,
            sample_width=constants.AUDIO_FORMAT_WIDTH,
            channels=constants.AUDIO_CHANNELS,
        )

    # The filename's extension is how the endpoint learns the audio format.
    upload = io.BytesIO(audio_data)
    upload.name = f"audio{file_suffix}"

    logger.debug("Sending to OpenAI with filename: %s", upload.name)

    request_kwargs: dict[str, object] = {
        "model": openai_asr_cfg.asr_openai_model,
        "file": upload,
    }

    # Only send a prompt when the config and/or caller actually supplied one.
    prompt = openai_asr_cfg.get_effective_prompt(extra_instructions)
    if prompt:
        request_kwargs["prompt"] = prompt
        logger.debug("Using OpenAI ASR with prompt")

    response = await client.audio.transcriptions.create(**request_kwargs)
    text = response.text

    if not text:
        logger.warning(
            "Empty transcription returned - audio may be silent, corrupted, or in wrong format",
        )
        return text

    # Keep log lines short: long transcriptions are cut and marked with "...".
    if len(text) > _LOG_TRUNCATE_LENGTH:
        preview = text[:_LOG_TRUNCATE_LENGTH] + "..."
    else:
        preview = text
    logger.info("Transcription result: %s", preview)
    return text
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
async def synthesize_speech_openai(
    text: str,
    openai_tts_cfg: config.OpenAITTS,
    logger: logging.Logger,
) -> bytes:
    """Turn ``text`` into WAV audio via OpenAI's TTS API or a compatible server.

    When ``tts_openai_base_url`` is configured the request is sent there and no
    API key is required; otherwise a key must be set.

    Raises:
        ValueError: If no custom endpoint is set and the API key is missing.

    """
    custom_endpoint = openai_tts_cfg.tts_openai_base_url
    if custom_endpoint:
        logger.info(
            "Synthesizing speech with custom OpenAI-compatible endpoint: %s",
            custom_endpoint,
        )
    else:
        logger.info("Synthesizing speech with OpenAI TTS...")
        if not openai_tts_cfg.openai_api_key:
            msg = "OpenAI API key is not set."
            raise ValueError(msg)

    client = _get_openai_client(
        api_key=openai_tts_cfg.openai_api_key,
        base_url=openai_tts_cfg.tts_openai_base_url,
    )
    # Ask for WAV explicitly so the returned bytes are a complete audio file.
    speech = await client.audio.speech.create(
        model=openai_tts_cfg.tts_openai_model,
        voice=openai_tts_cfg.tts_openai_voice,
        input=text,
        response_format="wav",
    )
    return speech.content
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
async def synthesize_speech_gemini(
    text: str,
    gemini_tts_cfg: config.GeminiTTS,
    logger: logging.Logger,
) -> bytes:
    """Turn ``text`` into WAV audio using Gemini's native TTS.

    The audio payload of the first part of the first candidate is wrapped in a
    WAV header before being returned.

    Raises:
        ValueError: If the Gemini API key is not set.

    """
    from google import genai  # noqa: PLC0415
    from google.genai import types  # noqa: PLC0415

    if not gemini_tts_cfg.gemini_api_key:
        msg = "Gemini API key is not set."
        raise ValueError(msg)

    logger.info(
        "Synthesizing speech with Gemini %s (voice: %s)...",
        gemini_tts_cfg.tts_gemini_model,
        gemini_tts_cfg.tts_gemini_voice,
    )

    client = genai.Client(api_key=gemini_tts_cfg.gemini_api_key)

    # Request audio output only, spoken with the configured prebuilt voice.
    voice = types.VoiceConfig(
        prebuilt_voice_config=types.PrebuiltVoiceConfig(
            voice_name=gemini_tts_cfg.tts_gemini_voice,
        ),
    )
    generation_config = types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(voice_config=voice),
    )

    response = await client.aio.models.generate_content(
        model=gemini_tts_cfg.tts_gemini_model,
        contents=text,
        config=generation_config,
    )

    # Gemini returns raw PCM: 24kHz, 16-bit, mono
    audio_part = response.candidates[0].content.parts[0]
    return pcm_to_wav(audio_part.inline_data.data, sample_rate=24000)
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Utility functions for Wyoming protocol interactions to eliminate code duplication."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from contextlib import asynccontextmanager
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
7
|
+
|
|
8
|
+
from agent_cli.core.utils import print_error_message
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
import logging
|
|
12
|
+
from collections.abc import AsyncGenerator
|
|
13
|
+
|
|
14
|
+
from wyoming.client import AsyncClient
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@asynccontextmanager
async def wyoming_client_context(
    server_ip: str,
    server_port: int,
    server_type: str,
    logger: logging.Logger,
    *,
    quiet: bool = False,
) -> AsyncGenerator[AsyncClient, None]:
    """Open a Wyoming client connection with shared logging and error reporting.

    The ``yield`` sits inside the ``try`` block, so exceptions raised by the
    caller's ``async with`` body are also logged and reported here before
    being re-raised.

    Args:
        server_ip: Wyoming server IP
        server_port: Wyoming server port
        server_type: Kind of server, used in messages (e.g., "ASR", "TTS", "wake word")
        logger: Logger instance
        quiet: If True, errors are logged but not printed to the console

    Yields:
        The connected Wyoming client

    Raises:
        ConnectionRefusedError: If the server refuses the connection
        Exception: Any other error, re-raised unchanged after logging

    """
    from wyoming.client import AsyncClient  # noqa: PLC0415

    endpoint = f"tcp://{server_ip}:{server_port}"
    logger.info("Connecting to Wyoming %s server at %s", server_type, endpoint)

    try:
        async with AsyncClient.from_uri(endpoint) as connection:
            logger.info("%s connection established", server_type)
            yield connection
    except ConnectionRefusedError:
        logger.exception("%s connection refused.", server_type)
        if quiet:
            raise
        print_error_message(
            f"{server_type} connection refused.",
            f"Is the Wyoming {server_type.lower()} server running at {endpoint}?",
        )
        raise
    except Exception as exc:
        logger.exception("An error occurred during %s connection", server_type.lower())
        if quiet:
            raise
        print_error_message(f"{server_type} error: {exc}")
        raise
|