agent-cli 0.70.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. agent_cli/__init__.py +5 -0
  2. agent_cli/__main__.py +6 -0
  3. agent_cli/_extras.json +14 -0
  4. agent_cli/_requirements/.gitkeep +0 -0
  5. agent_cli/_requirements/audio.txt +79 -0
  6. agent_cli/_requirements/faster-whisper.txt +215 -0
  7. agent_cli/_requirements/kokoro.txt +425 -0
  8. agent_cli/_requirements/llm.txt +183 -0
  9. agent_cli/_requirements/memory.txt +355 -0
  10. agent_cli/_requirements/mlx-whisper.txt +222 -0
  11. agent_cli/_requirements/piper.txt +176 -0
  12. agent_cli/_requirements/rag.txt +402 -0
  13. agent_cli/_requirements/server.txt +154 -0
  14. agent_cli/_requirements/speed.txt +77 -0
  15. agent_cli/_requirements/vad.txt +155 -0
  16. agent_cli/_requirements/wyoming.txt +71 -0
  17. agent_cli/_tools.py +368 -0
  18. agent_cli/agents/__init__.py +23 -0
  19. agent_cli/agents/_voice_agent_common.py +136 -0
  20. agent_cli/agents/assistant.py +383 -0
  21. agent_cli/agents/autocorrect.py +284 -0
  22. agent_cli/agents/chat.py +496 -0
  23. agent_cli/agents/memory/__init__.py +31 -0
  24. agent_cli/agents/memory/add.py +190 -0
  25. agent_cli/agents/memory/proxy.py +160 -0
  26. agent_cli/agents/rag_proxy.py +128 -0
  27. agent_cli/agents/speak.py +209 -0
  28. agent_cli/agents/transcribe.py +671 -0
  29. agent_cli/agents/transcribe_daemon.py +499 -0
  30. agent_cli/agents/voice_edit.py +291 -0
  31. agent_cli/api.py +22 -0
  32. agent_cli/cli.py +106 -0
  33. agent_cli/config.py +503 -0
  34. agent_cli/config_cmd.py +307 -0
  35. agent_cli/constants.py +27 -0
  36. agent_cli/core/__init__.py +1 -0
  37. agent_cli/core/audio.py +461 -0
  38. agent_cli/core/audio_format.py +299 -0
  39. agent_cli/core/chroma.py +88 -0
  40. agent_cli/core/deps.py +191 -0
  41. agent_cli/core/openai_proxy.py +139 -0
  42. agent_cli/core/process.py +195 -0
  43. agent_cli/core/reranker.py +120 -0
  44. agent_cli/core/sse.py +87 -0
  45. agent_cli/core/transcription_logger.py +70 -0
  46. agent_cli/core/utils.py +526 -0
  47. agent_cli/core/vad.py +175 -0
  48. agent_cli/core/watch.py +65 -0
  49. agent_cli/dev/__init__.py +14 -0
  50. agent_cli/dev/cli.py +1588 -0
  51. agent_cli/dev/coding_agents/__init__.py +19 -0
  52. agent_cli/dev/coding_agents/aider.py +24 -0
  53. agent_cli/dev/coding_agents/base.py +167 -0
  54. agent_cli/dev/coding_agents/claude.py +39 -0
  55. agent_cli/dev/coding_agents/codex.py +24 -0
  56. agent_cli/dev/coding_agents/continue_dev.py +15 -0
  57. agent_cli/dev/coding_agents/copilot.py +24 -0
  58. agent_cli/dev/coding_agents/cursor_agent.py +48 -0
  59. agent_cli/dev/coding_agents/gemini.py +28 -0
  60. agent_cli/dev/coding_agents/opencode.py +15 -0
  61. agent_cli/dev/coding_agents/registry.py +49 -0
  62. agent_cli/dev/editors/__init__.py +19 -0
  63. agent_cli/dev/editors/base.py +89 -0
  64. agent_cli/dev/editors/cursor.py +15 -0
  65. agent_cli/dev/editors/emacs.py +46 -0
  66. agent_cli/dev/editors/jetbrains.py +56 -0
  67. agent_cli/dev/editors/nano.py +31 -0
  68. agent_cli/dev/editors/neovim.py +33 -0
  69. agent_cli/dev/editors/registry.py +59 -0
  70. agent_cli/dev/editors/sublime.py +20 -0
  71. agent_cli/dev/editors/vim.py +42 -0
  72. agent_cli/dev/editors/vscode.py +15 -0
  73. agent_cli/dev/editors/zed.py +20 -0
  74. agent_cli/dev/project.py +568 -0
  75. agent_cli/dev/registry.py +52 -0
  76. agent_cli/dev/skill/SKILL.md +141 -0
  77. agent_cli/dev/skill/examples.md +571 -0
  78. agent_cli/dev/terminals/__init__.py +19 -0
  79. agent_cli/dev/terminals/apple_terminal.py +82 -0
  80. agent_cli/dev/terminals/base.py +56 -0
  81. agent_cli/dev/terminals/gnome.py +51 -0
  82. agent_cli/dev/terminals/iterm2.py +84 -0
  83. agent_cli/dev/terminals/kitty.py +77 -0
  84. agent_cli/dev/terminals/registry.py +48 -0
  85. agent_cli/dev/terminals/tmux.py +58 -0
  86. agent_cli/dev/terminals/warp.py +132 -0
  87. agent_cli/dev/terminals/zellij.py +78 -0
  88. agent_cli/dev/worktree.py +856 -0
  89. agent_cli/docs_gen.py +417 -0
  90. agent_cli/example-config.toml +185 -0
  91. agent_cli/install/__init__.py +5 -0
  92. agent_cli/install/common.py +89 -0
  93. agent_cli/install/extras.py +174 -0
  94. agent_cli/install/hotkeys.py +48 -0
  95. agent_cli/install/services.py +87 -0
  96. agent_cli/memory/__init__.py +7 -0
  97. agent_cli/memory/_files.py +250 -0
  98. agent_cli/memory/_filters.py +63 -0
  99. agent_cli/memory/_git.py +157 -0
  100. agent_cli/memory/_indexer.py +142 -0
  101. agent_cli/memory/_ingest.py +408 -0
  102. agent_cli/memory/_persistence.py +182 -0
  103. agent_cli/memory/_prompt.py +91 -0
  104. agent_cli/memory/_retrieval.py +294 -0
  105. agent_cli/memory/_store.py +169 -0
  106. agent_cli/memory/_streaming.py +44 -0
  107. agent_cli/memory/_tasks.py +48 -0
  108. agent_cli/memory/api.py +113 -0
  109. agent_cli/memory/client.py +272 -0
  110. agent_cli/memory/engine.py +361 -0
  111. agent_cli/memory/entities.py +43 -0
  112. agent_cli/memory/models.py +112 -0
  113. agent_cli/opts.py +433 -0
  114. agent_cli/py.typed +0 -0
  115. agent_cli/rag/__init__.py +3 -0
  116. agent_cli/rag/_indexer.py +67 -0
  117. agent_cli/rag/_indexing.py +226 -0
  118. agent_cli/rag/_prompt.py +30 -0
  119. agent_cli/rag/_retriever.py +156 -0
  120. agent_cli/rag/_store.py +48 -0
  121. agent_cli/rag/_utils.py +218 -0
  122. agent_cli/rag/api.py +175 -0
  123. agent_cli/rag/client.py +299 -0
  124. agent_cli/rag/engine.py +302 -0
  125. agent_cli/rag/models.py +55 -0
  126. agent_cli/scripts/.runtime/.gitkeep +0 -0
  127. agent_cli/scripts/__init__.py +1 -0
  128. agent_cli/scripts/check_plugin_skill_sync.py +50 -0
  129. agent_cli/scripts/linux-hotkeys/README.md +63 -0
  130. agent_cli/scripts/linux-hotkeys/toggle-autocorrect.sh +45 -0
  131. agent_cli/scripts/linux-hotkeys/toggle-transcription.sh +58 -0
  132. agent_cli/scripts/linux-hotkeys/toggle-voice-edit.sh +58 -0
  133. agent_cli/scripts/macos-hotkeys/README.md +45 -0
  134. agent_cli/scripts/macos-hotkeys/skhd-config-example +5 -0
  135. agent_cli/scripts/macos-hotkeys/toggle-autocorrect.sh +12 -0
  136. agent_cli/scripts/macos-hotkeys/toggle-transcription.sh +37 -0
  137. agent_cli/scripts/macos-hotkeys/toggle-voice-edit.sh +37 -0
  138. agent_cli/scripts/nvidia-asr-server/README.md +99 -0
  139. agent_cli/scripts/nvidia-asr-server/pyproject.toml +27 -0
  140. agent_cli/scripts/nvidia-asr-server/server.py +255 -0
  141. agent_cli/scripts/nvidia-asr-server/shell.nix +32 -0
  142. agent_cli/scripts/nvidia-asr-server/uv.lock +4654 -0
  143. agent_cli/scripts/run-openwakeword.sh +11 -0
  144. agent_cli/scripts/run-piper-windows.ps1 +30 -0
  145. agent_cli/scripts/run-piper.sh +24 -0
  146. agent_cli/scripts/run-whisper-linux.sh +40 -0
  147. agent_cli/scripts/run-whisper-macos.sh +6 -0
  148. agent_cli/scripts/run-whisper-windows.ps1 +51 -0
  149. agent_cli/scripts/run-whisper.sh +9 -0
  150. agent_cli/scripts/run_faster_whisper_server.py +136 -0
  151. agent_cli/scripts/setup-linux-hotkeys.sh +72 -0
  152. agent_cli/scripts/setup-linux.sh +108 -0
  153. agent_cli/scripts/setup-macos-hotkeys.sh +61 -0
  154. agent_cli/scripts/setup-macos.sh +76 -0
  155. agent_cli/scripts/setup-windows.ps1 +63 -0
  156. agent_cli/scripts/start-all-services-windows.ps1 +53 -0
  157. agent_cli/scripts/start-all-services.sh +178 -0
  158. agent_cli/scripts/sync_extras.py +138 -0
  159. agent_cli/server/__init__.py +3 -0
  160. agent_cli/server/cli.py +721 -0
  161. agent_cli/server/common.py +222 -0
  162. agent_cli/server/model_manager.py +288 -0
  163. agent_cli/server/model_registry.py +225 -0
  164. agent_cli/server/proxy/__init__.py +3 -0
  165. agent_cli/server/proxy/api.py +444 -0
  166. agent_cli/server/streaming.py +67 -0
  167. agent_cli/server/tts/__init__.py +3 -0
  168. agent_cli/server/tts/api.py +335 -0
  169. agent_cli/server/tts/backends/__init__.py +82 -0
  170. agent_cli/server/tts/backends/base.py +139 -0
  171. agent_cli/server/tts/backends/kokoro.py +403 -0
  172. agent_cli/server/tts/backends/piper.py +253 -0
  173. agent_cli/server/tts/model_manager.py +201 -0
  174. agent_cli/server/tts/model_registry.py +28 -0
  175. agent_cli/server/tts/wyoming_handler.py +249 -0
  176. agent_cli/server/whisper/__init__.py +3 -0
  177. agent_cli/server/whisper/api.py +413 -0
  178. agent_cli/server/whisper/backends/__init__.py +89 -0
  179. agent_cli/server/whisper/backends/base.py +97 -0
  180. agent_cli/server/whisper/backends/faster_whisper.py +225 -0
  181. agent_cli/server/whisper/backends/mlx.py +270 -0
  182. agent_cli/server/whisper/languages.py +116 -0
  183. agent_cli/server/whisper/model_manager.py +157 -0
  184. agent_cli/server/whisper/model_registry.py +28 -0
  185. agent_cli/server/whisper/wyoming_handler.py +203 -0
  186. agent_cli/services/__init__.py +343 -0
  187. agent_cli/services/_wyoming_utils.py +64 -0
  188. agent_cli/services/asr.py +506 -0
  189. agent_cli/services/llm.py +228 -0
  190. agent_cli/services/tts.py +450 -0
  191. agent_cli/services/wake_word.py +142 -0
  192. agent_cli-0.70.5.dist-info/METADATA +2118 -0
  193. agent_cli-0.70.5.dist-info/RECORD +196 -0
  194. agent_cli-0.70.5.dist-info/WHEEL +4 -0
  195. agent_cli-0.70.5.dist-info/entry_points.txt +4 -0
  196. agent_cli-0.70.5.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,343 @@
1
+ """Module for interacting with online services like OpenAI and Gemini."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import io
6
+ import wave
7
+ from typing import TYPE_CHECKING
8
+
9
+ from agent_cli import constants
10
+
11
+ if TYPE_CHECKING:
12
+ import logging
13
+
14
+ from openai import AsyncOpenAI
15
+
16
+ from agent_cli import config
17
+
18
+
19
+ _RIFF_HEADER = b"RIFF"
20
+ _LOG_TRUNCATE_LENGTH = 100
21
+
22
+
23
+ def _is_wav_file(data: bytes) -> bool:
24
+ """Check if data is a WAV file by looking for RIFF header."""
25
+ return len(data) >= len(_RIFF_HEADER) and data[: len(_RIFF_HEADER)] == _RIFF_HEADER
26
+
27
+
28
def pcm_to_wav(
    pcm_data: bytes,
    *,
    sample_rate: int = 16000,
    sample_width: int = 2,
    channels: int = 1,
) -> bytes:
    """Wrap raw PCM samples in a WAV container.

    Args:
        pcm_data: Raw PCM audio bytes
        sample_rate: Sample rate in Hz (default: 16000)
        sample_width: Bytes per sample (default: 2 for 16-bit)
        channels: Number of audio channels (default: 1 for mono)

    Returns:
        The complete WAV file (header followed by the samples) as bytes

    """
    container = io.BytesIO()
    writer = wave.open(container, "wb")
    with writer:
        # The frame count of 0 is a placeholder; the writer derives the real
        # count from the data handed to writeframes().
        writer.setparams(
            (channels, sample_width, sample_rate, 0, "NONE", "not compressed"),
        )
        writer.writeframes(pcm_data)
    # Closing the wave writer leaves the caller-supplied buffer open, so its
    # contents can still be read here.
    return container.getvalue()
54
+
55
+
56
# File extension -> MIME type sent to Gemini alongside the audio bytes.
_GEMINI_MIME_TYPES: dict[str, str] = {
    ".wav": "audio/wav",
    ".mp3": "audio/mp3",
    ".aiff": "audio/aiff",
    ".aac": "audio/aac",
    ".ogg": "audio/ogg",
    ".flac": "audio/flac",
    # m4a is an MP4 audio container
    ".m4a": "audio/mp4",
}

# Extensions accepted for Gemini: exactly the keys of the MIME mapping.
GEMINI_SUPPORTED_FORMATS: frozenset[str] = frozenset(_GEMINI_MIME_TYPES)

# Extensions accepted by the OpenAI Whisper API.
OPENAI_SUPPORTED_FORMATS: frozenset[str] = frozenset(
    (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm"),
)


# Base instruction sent with every Gemini transcription request.
_GEMINI_TRANSCRIPTION_PROMPT = (
    "Transcribe this audio accurately. "
    "Return only the transcription text, nothing else. "
    "Do not include any prefixes, labels, or explanations."
)
79
+ )
80
+
81
+
82
async def transcribe_audio_gemini(
    audio_data: bytes,
    gemini_asr_cfg: config.GeminiASR,
    logger: logging.Logger,
    *,
    file_suffix: str = ".wav",
    extra_instructions: str | None = None,
    **_kwargs: object,
) -> str:
    """Transcribe audio using Gemini's native audio understanding.

    Gemini can process audio natively and return transcriptions.
    Supports WAV, MP3, AIFF, AAC, OGG, FLAC, and M4A formats.

    Args:
        audio_data: Audio bytes (can be raw PCM or complete audio file)
        gemini_asr_cfg: Gemini ASR configuration
        logger: Logger instance
        file_suffix: File extension for MIME type detection (default: .wav)
        extra_instructions: Additional context/instructions to improve transcription

    Returns:
        The transcription with surrounding whitespace removed, or an empty
        string when the response contains no text.

    Raises:
        ValueError: If no Gemini API key is configured.

    """
    from google import genai  # noqa: PLC0415
    from google.genai import types  # noqa: PLC0415

    if not gemini_asr_cfg.gemini_api_key:
        msg = "Gemini API key is not set."
        raise ValueError(msg)

    logger.info("Transcribing audio with Gemini %s...", gemini_asr_cfg.asr_gemini_model)

    # Determine MIME type from file suffix; unrecognized suffixes are sent as WAV
    mime_type = _GEMINI_MIME_TYPES.get(file_suffix.lower(), "audio/wav")

    logger.debug(
        "Received audio: size=%d bytes, file_suffix=%s, is_wav=%s",
        len(audio_data),
        file_suffix,
        _is_wav_file(audio_data),
    )

    # If raw PCM (no recognized format header), convert to WAV
    # Only do this if file_suffix is .wav but data doesn't have WAV header (indicating raw PCM)
    if not _is_wav_file(audio_data) and file_suffix.lower() == ".wav":
        logger.debug("Wrapping raw PCM data with WAV header (16kHz, 16-bit, mono)")
        audio_data = pcm_to_wav(
            audio_data,
            sample_rate=constants.AUDIO_RATE,
            sample_width=constants.AUDIO_FORMAT_WIDTH,
            channels=constants.AUDIO_CHANNELS,
        )

    logger.debug("Using MIME type: %s", mime_type)

    # Build the transcription prompt with optional context
    effective_prompt = gemini_asr_cfg.get_effective_prompt(extra_instructions)
    if effective_prompt:
        prompt = f"{_GEMINI_TRANSCRIPTION_PROMPT}\n\nContext: {effective_prompt}"
        logger.debug("Using Gemini ASR with context prompt")
    else:
        prompt = _GEMINI_TRANSCRIPTION_PROMPT

    client = genai.Client(api_key=gemini_asr_cfg.gemini_api_key)

    response = await client.aio.models.generate_content(
        model=gemini_asr_cfg.asr_gemini_model,
        contents=[
            prompt,
            types.Part.from_bytes(data=audio_data, mime_type=mime_type),
        ],
    )
    # ``response.text`` may be None when the reply carries no text part;
    # normalise it to "" so the empty-transcription warning below is reached
    # instead of failing with AttributeError on ``None.strip()``.
    text = (response.text or "").strip()

    if text:
        # Keep the log line short: only an excerpt of long transcriptions is shown.
        logger.info(
            "Transcription result: %s",
            text[:_LOG_TRUNCATE_LENGTH] + "..." if len(text) > _LOG_TRUNCATE_LENGTH else text,
        )
    else:
        logger.warning(
            "Empty transcription returned - audio may be silent, corrupted, or in wrong format",
        )

    return text
166
+
167
+
168
def _get_openai_client(api_key: str | None, base_url: str | None = None) -> AsyncOpenAI:
    """Build an ``AsyncOpenAI`` client for the given key and optional endpoint.

    Custom endpoints (base_url is set) may not require authentication, so a
    missing or empty API key is replaced by a dummy placeholder value.
    """
    from openai import AsyncOpenAI  # noqa: PLC0415

    if api_key:
        key = api_key
    else:
        # Placeholder used when no key was provided
        key = "dummy-api-key"
    return AsyncOpenAI(base_url=base_url, api_key=key)
179
+
180
+
181
async def transcribe_audio_openai(
    audio_data: bytes,
    openai_asr_cfg: config.OpenAIASR,
    logger: logging.Logger,
    *,
    file_suffix: str = ".wav",
    extra_instructions: str | None = None,
    **_kwargs: object,  # Accept extra kwargs for consistency with Wyoming
) -> str:
    """Transcribe audio through OpenAI's Whisper API or a compatible endpoint.

    OpenAI Whisper supports: mp3, mp4, mpeg, mpga, m4a, wav, and webm formats.

    If openai_base_url is configured, requests go to that endpoint rather than
    the official OpenAI API, which makes self-hosted Whisper models and other
    compatible services usable. An API key is only mandatory when no custom
    endpoint is configured.

    Args:
        audio_data: Audio bytes (can be raw PCM or complete audio file)
        openai_asr_cfg: OpenAI ASR configuration
        logger: Logger instance
        file_suffix: File extension for filename (default: .wav)
        extra_instructions: Additional context/instructions to improve transcription

    """
    endpoint = openai_asr_cfg.openai_base_url
    if not endpoint:
        logger.info("Transcribing audio with OpenAI Whisper...")
        if not openai_asr_cfg.openai_api_key:
            msg = "OpenAI API key is not set."
            raise ValueError(msg)
    else:
        logger.info(
            "Transcribing audio with custom OpenAI-compatible endpoint: %s",
            endpoint,
        )

    client = _get_openai_client(
        api_key=openai_asr_cfg.openai_api_key,
        base_url=endpoint,
    )

    has_wav_header = _is_wav_file(audio_data)
    logger.debug(
        "Received audio: size=%d bytes, file_suffix=%s, is_wav=%s",
        len(audio_data),
        file_suffix,
        has_wav_header,
    )

    # A ".wav" suffix without a WAV header indicates raw PCM; wrap it so that
    # custom endpoints such as faster-whisper receive a proper file.
    if file_suffix.lower() == ".wav" and not has_wav_header:
        logger.debug("Wrapping raw PCM data with WAV header (16kHz, 16-bit, mono)")
        audio_data = pcm_to_wav(
            audio_data,
            sample_rate=constants.AUDIO_RATE,
            sample_width=constants.AUDIO_FORMAT_WIDTH,
            channels=constants.AUDIO_CHANNELS,
        )

    # The filename extension is how the format is communicated to the API
    upload = io.BytesIO(audio_data)
    upload.name = f"audio{file_suffix}"
    logger.debug("Sending to OpenAI with filename: %s", upload.name)

    request_kwargs: dict[str, object] = {
        "model": openai_asr_cfg.asr_openai_model,
        "file": upload,
    }

    # Prompt combines the configured prompt with extra_instructions
    prompt = openai_asr_cfg.get_effective_prompt(extra_instructions)
    if prompt:
        request_kwargs["prompt"] = prompt
        logger.debug("Using OpenAI ASR with prompt")

    response = await client.audio.transcriptions.create(**request_kwargs)
    text = response.text

    if not text:
        logger.warning(
            "Empty transcription returned - audio may be silent, corrupted, or in wrong format",
        )
        return text

    # Log only an excerpt of long transcriptions
    preview = text
    if len(text) > _LOG_TRUNCATE_LENGTH:
        preview = text[:_LOG_TRUNCATE_LENGTH] + "..."
    logger.info("Transcription result: %s", preview)
    return text
270
+
271
+
272
async def synthesize_speech_openai(
    text: str,
    openai_tts_cfg: config.OpenAITTS,
    logger: logging.Logger,
) -> bytes:
    """Turn ``text`` into WAV audio via OpenAI's TTS API or a compatible endpoint.

    An API key is only mandatory when no custom endpoint (tts_openai_base_url)
    is configured; in that case a missing key raises ValueError.
    """
    endpoint = openai_tts_cfg.tts_openai_base_url
    if not endpoint:
        logger.info("Synthesizing speech with OpenAI TTS...")
        if not openai_tts_cfg.openai_api_key:
            msg = "OpenAI API key is not set."
            raise ValueError(msg)
    else:
        logger.info(
            "Synthesizing speech with custom OpenAI-compatible endpoint: %s",
            endpoint,
        )

    client = _get_openai_client(
        api_key=openai_tts_cfg.openai_api_key,
        base_url=endpoint,
    )
    speech = await client.audio.speech.create(
        model=openai_tts_cfg.tts_openai_model,
        voice=openai_tts_cfg.tts_openai_voice,
        input=text,
        response_format="wav",
    )
    return speech.content
300
+
301
+
302
async def synthesize_speech_gemini(
    text: str,
    gemini_tts_cfg: config.GeminiTTS,
    logger: logging.Logger,
) -> bytes:
    """Synthesize speech using Gemini's native TTS.

    Args:
        text: Text to speak
        gemini_tts_cfg: Gemini TTS configuration (API key, model, voice)
        logger: Logger instance

    Returns:
        WAV audio data (converted from Gemini's raw PCM output).

    Raises:
        ValueError: If no Gemini API key is configured.
        RuntimeError: If the response contains no audio data.

    """
    from google import genai  # noqa: PLC0415
    from google.genai import types  # noqa: PLC0415

    if not gemini_tts_cfg.gemini_api_key:
        msg = "Gemini API key is not set."
        raise ValueError(msg)

    logger.info(
        "Synthesizing speech with Gemini %s (voice: %s)...",
        gemini_tts_cfg.tts_gemini_model,
        gemini_tts_cfg.tts_gemini_voice,
    )

    client = genai.Client(api_key=gemini_tts_cfg.gemini_api_key)

    response = await client.aio.models.generate_content(
        model=gemini_tts_cfg.tts_gemini_model,
        contents=text,
        config=types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name=gemini_tts_cfg.tts_gemini_voice,
                    ),
                ),
            ),
        ),
    )

    # Walk the response defensively: any level (candidates, content, parts,
    # inline_data) may be missing or empty, and indexing blindly would surface
    # as an opaque IndexError/AttributeError far from the real cause.
    candidates = response.candidates or []
    content = candidates[0].content if candidates else None
    parts = (content.parts if content is not None else None) or []
    pcm_data = next(
        (part.inline_data.data for part in parts if part.inline_data and part.inline_data.data),
        None,
    )
    if pcm_data is None:
        msg = "Gemini TTS response did not contain any audio data."
        raise RuntimeError(msg)

    # Gemini returns raw PCM: 24kHz, 16-bit, mono
    return pcm_to_wav(pcm_data, sample_rate=24000)
@@ -0,0 +1,64 @@
1
+ """Utility functions for Wyoming protocol interactions to eliminate code duplication."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from contextlib import asynccontextmanager
6
+ from typing import TYPE_CHECKING
7
+
8
+ from agent_cli.core.utils import print_error_message
9
+
10
+ if TYPE_CHECKING:
11
+ import logging
12
+ from collections.abc import AsyncGenerator
13
+
14
+ from wyoming.client import AsyncClient
15
+
16
+
17
@asynccontextmanager
async def wyoming_client_context(
    server_ip: str,
    server_port: int,
    server_type: str,
    logger: logging.Logger,
    *,
    quiet: bool = False,
) -> AsyncGenerator[AsyncClient, None]:
    """Open a Wyoming client connection and report failures in one place.

    Any exception raised while connecting or while the caller is using the
    client is logged, optionally printed to the console, and then re-raised.

    Args:
        server_ip: Wyoming server IP
        server_port: Wyoming server port
        server_type: Type of server (e.g., "ASR", "TTS", "wake word")
        logger: Logger instance
        quiet: If True, suppress console error messages

    Yields:
        Connected Wyoming client

    Raises:
        ConnectionRefusedError: If connection fails
        Exception: For other connection errors

    """
    from wyoming.client import AsyncClient  # noqa: PLC0415

    address = f"tcp://{server_ip}:{server_port}"
    logger.info("Connecting to Wyoming %s server at %s", server_type, address)

    try:
        async with AsyncClient.from_uri(address) as connection:
            logger.info("%s connection established", server_type)
            yield connection
    except Exception as exc:
        if isinstance(exc, ConnectionRefusedError):
            # A refused connection gets a dedicated hint about the server
            logger.exception("%s connection refused.", server_type)
            if not quiet:
                print_error_message(
                    f"{server_type} connection refused.",
                    f"Is the Wyoming {server_type.lower()} server running at {address}?",
                )
        else:
            logger.exception("An error occurred during %s connection", server_type.lower())
            if not quiet:
                print_error_message(f"{server_type} error: {exc}")
        # Reporting only; the caller still sees the original exception
        raise