devduck 0.7.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of devduck might be problematic. Click here for more details.
- devduck/__init__.py +151 -67
- devduck/_version.py +2 -2
- devduck/tools/speech_to_speech.py +750 -0
- devduck-1.1.0.dist-info/METADATA +716 -0
- {devduck-0.7.0.dist-info → devduck-1.1.0.dist-info}/RECORD +9 -8
- {devduck-0.7.0.dist-info → devduck-1.1.0.dist-info}/entry_points.txt +1 -0
- devduck-0.7.0.dist-info/METADATA +0 -589
- {devduck-0.7.0.dist-info → devduck-1.1.0.dist-info}/WHEEL +0 -0
- {devduck-0.7.0.dist-info → devduck-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {devduck-0.7.0.dist-info → devduck-1.1.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,750 @@
|
|
|
1
|
+
"""Real-time speech-to-speech bidirectional streaming tool for DevDuck.
|
|
2
|
+
|
|
3
|
+
Provides background speech-to-speech conversation capability using Strands
|
|
4
|
+
experimental bidirectional streaming with full model provider support, tool
|
|
5
|
+
inheritance, and comprehensive configuration options.
|
|
6
|
+
|
|
7
|
+
This tool creates isolated bidirectional agent sessions that run in background
|
|
8
|
+
threads, enabling real-time voice conversations with AI models while the parent
|
|
9
|
+
agent remains responsive.
|
|
10
|
+
|
|
11
|
+
Key Features:
|
|
12
|
+
- **Background Execution:** Runs in separate thread - parent agent stays responsive
|
|
13
|
+
- **Real-Time Audio:** Microphone input and speaker output with pyaudio
|
|
14
|
+
- **Tool Inheritance:** Automatically inherits ALL tools from parent agent
|
|
15
|
+
- **System Prompt Inheritance:** Combines parent agent's prompt with custom prompts
|
|
16
|
+
- **Multiple Providers:** Nova Sonic, OpenAI Realtime API, Gemini Live
|
|
17
|
+
- **Full Configuration:** Per-provider custom settings and parameters
|
|
18
|
+
- **Environment API Keys:** Auto-loads API keys from environment variables
|
|
19
|
+
- **Built-in Stop:** Uses SDK's stop_conversation tool for graceful termination
|
|
20
|
+
- **Auto-Interruption:** Built-in VAD for natural conversation flow
|
|
21
|
+
- **Conversation History:** Automatically saves transcripts to files
|
|
22
|
+
|
|
23
|
+
Supported Providers:
|
|
24
|
+
-------------------
|
|
25
|
+
1. **Nova Sonic (AWS Bedrock):**
|
|
26
|
+
- Region: us-east-1, eu-north-1, ap-northeast-1
|
|
27
|
+
- Model: amazon.nova-2-sonic-v1:0 (configurable)
|
|
28
|
+
- Voices: tiffany, matthew, amy, ambre, florian, beatrice, lorenzo, greta, lennart, lupe, carlos
|
|
29
|
+
- Requires AWS credentials (boto3 credential chain)
|
|
30
|
+
|
|
31
|
+
2. **OpenAI Realtime API:**
|
|
32
|
+
- Models: gpt-realtime, gpt-4o-realtime-preview (configurable)
|
|
33
|
+
- Requires OPENAI_API_KEY environment variable
|
|
34
|
+
- Custom session config support
|
|
35
|
+
|
|
36
|
+
3. **Gemini Live:**
|
|
37
|
+
- Model: gemini-2.5-flash-native-audio-preview-09-2025 (configurable)
|
|
38
|
+
- Requires GOOGLE_API_KEY or GEMINI_API_KEY environment variable
|
|
39
|
+
- Live config customization
|
|
40
|
+
|
|
41
|
+
Configuration Examples:
|
|
42
|
+
----------------------
|
|
43
|
+
# Nova Sonic with custom voice
|
|
44
|
+
model_settings = {
|
|
45
|
+
"provider_config": {
|
|
46
|
+
"audio": {"voice": "matthew"}
|
|
47
|
+
},
|
|
48
|
+
"client_config": {"region": "us-east-1"}
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
# OpenAI Realtime with custom model
|
|
52
|
+
model_settings = {
|
|
53
|
+
"model_id": "gpt-4o-realtime-preview",
|
|
54
|
+
"provider_config": {
|
|
55
|
+
"audio": {"voice": "coral"}
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
# Gemini Live with custom voice
|
|
60
|
+
model_settings = {
|
|
61
|
+
"model_id": "gemini-2.5-flash-native-audio-preview-09-2025",
|
|
62
|
+
"provider_config": {
|
|
63
|
+
"audio": {"voice": "Kore"}
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
"""
|
|
67
|
+
|
|
68
|
+
import os
|
|
69
|
+
import asyncio
|
|
70
|
+
import tempfile
|
|
71
|
+
import json
|
|
72
|
+
import logging
|
|
73
|
+
import threading
|
|
74
|
+
import traceback
|
|
75
|
+
from datetime import datetime
|
|
76
|
+
from pathlib import Path
|
|
77
|
+
from typing import Any, Dict, List, Optional
|
|
78
|
+
|
|
79
|
+
from strands import tool
|
|
80
|
+
from strands.experimental.bidi.agent.agent import BidiAgent
|
|
81
|
+
from strands.experimental.bidi.models.gemini_live import BidiGeminiLiveModel
|
|
82
|
+
from strands.experimental.bidi.models.nova_sonic import BidiNovaSonicModel
|
|
83
|
+
from strands.experimental.bidi.models.openai_realtime import BidiOpenAIRealtimeModel
|
|
84
|
+
from strands.experimental.bidi.io.audio import BidiAudioIO
|
|
85
|
+
|
|
86
|
+
logger = logging.getLogger(__name__)
|
|
87
|
+
|
|
88
|
+
# Global session tracking
|
|
89
|
+
_active_sessions = {}
|
|
90
|
+
_session_lock = threading.Lock()
|
|
91
|
+
|
|
92
|
+
# Session history storage location
|
|
93
|
+
BASE_DIR = Path(os.getenv("DEVDUCK_HOME", tempfile.gettempdir()))
|
|
94
|
+
HISTORY_DIR = BASE_DIR / ".devduck" / "speech_sessions"
|
|
95
|
+
HISTORY_DIR.mkdir(parents=True, exist_ok=True)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class SpeechSession:
    """Manages a speech-to-speech conversation session with full lifecycle management.

    The session owns a background thread that runs its own asyncio event loop.
    ``start()`` spawns the thread; ``stop()`` signals the agent to stop from the
    calling thread via ``run_coroutine_threadsafe``, joins the thread, and then
    persists the conversation transcript to ``history_file``.
    """

    def __init__(
        self,
        session_id: str,
        agent: BidiAgent,
    ):
        """Initialize speech session.

        Args:
            session_id: Unique session identifier
            agent: BidiAgent instance
        """
        self.session_id = session_id
        self.agent = agent
        # True while the background session is (or should be) running.
        self.active = False
        # Background thread running _run_session; created lazily by start().
        self.thread: Optional[threading.Thread] = None
        # Event loop owned by the background thread; stop() uses it to
        # schedule agent.stop() across threads.
        self.loop: Optional[asyncio.AbstractEventLoop] = None
        # Transcript destination, written by _save_history() after stop().
        self.history_file = HISTORY_DIR / f"{session_id}.json"

    def start(self) -> None:
        """Start the speech session in background thread.

        Raises:
            ValueError: If the session is already running.
        """
        if self.active:
            raise ValueError("Session already active")

        self.active = True
        # Daemon thread so a forgotten session cannot block interpreter exit.
        self.thread = threading.Thread(target=self._run_session, daemon=True)
        self.thread.start()

    def stop(self) -> None:
        """Stop the speech session and cleanup resources.

        Idempotent: returns immediately if the session is not active.
        """
        if not self.active:
            return

        self.active = False

        # Stop the bidi agent using its event loop
        if self.loop and self.loop.is_running():
            # Schedule stop in the session's event loop and wait for it.
            # run_coroutine_threadsafe is required because stop() is called
            # from a different thread than the one running the loop.
            future = asyncio.run_coroutine_threadsafe(self.agent.stop(), self.loop)
            try:
                # Wait up to 3 seconds for stop to complete
                future.result(timeout=3.0)
                logger.info(
                    f"Successfully stopped bidi agent for session {self.session_id}"
                )
            except Exception as e:
                # Best-effort shutdown: log and continue to thread join.
                logger.warning(f"Error stopping bidi agent: {e}")

        if self.thread:
            # Bounded join so a wedged session thread cannot hang the caller.
            self.thread.join(timeout=5.0)

        # Save conversation history after session ends
        self._save_history()

    def _save_history(self) -> None:
        """Save conversation history to file.

        Best-effort: failures are logged, never raised, so a history-write
        error cannot break session shutdown.
        """
        try:
            history_data = {
                "session_id": self.session_id,
                "timestamp": datetime.now().isoformat(),
                "messages": self.agent.messages,
            }

            with open(self.history_file, "w") as f:
                json.dump(history_data, f, indent=2)

            logger.info(f"Saved conversation history to {self.history_file}")
        except Exception as e:
            logger.error(f"Failed to save history: {e}")

    def _run_session(self) -> None:
        """Main session runner in background thread.

        Creates and owns a fresh event loop for this thread, runs the async
        session to completion, and always closes the loop on exit.
        """
        try:
            # Create event loop for this thread
            self.loop = asyncio.new_event_loop()
            asyncio.set_event_loop(self.loop)

            # Run the async session
            self.loop.run_until_complete(self._async_session())
        except Exception as e:
            error_msg = f"Session error: {e}\n{traceback.format_exc()}"
            logger.debug(error_msg)
            print(f"\n🦆 Session error: {e}")
        finally:
            if self.loop:
                self.loop.close()

    async def _async_session(self) -> None:
        """Async session management using BidiAudioIO.

        Wires microphone input and speaker output to the bidi agent and
        blocks until the agent run finishes (e.g. after agent.stop()).
        """
        try:
            # Create audio I/O
            audio_io = BidiAudioIO()

            # Run agent with audio I/O
            await self.agent.run(inputs=[audio_io.input()], outputs=[audio_io.output()])

        except Exception as e:
            logger.debug(f"Async session error: {e}\n{traceback.format_exc()}")
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
@tool
def speech_to_speech(
    action: str,
    provider: str = "novasonic",
    system_prompt: Optional[str] = None,
    session_id: Optional[str] = None,
    model_settings: Optional[Dict[str, Any]] = None,
    tools: Optional[List[str]] = None,
    agent: Optional[Any] = None,
    load_history_from: Optional[str] = None,
) -> str:
    """Start, stop, or manage speech-to-speech conversations.

    Creates a background bidirectional streaming session for real-time voice
    conversations with AI. Supports full model configuration, tool inheritance,
    and multiple model providers with custom settings.

    Args:
        action: One of:
            - "start": Start a new speech session
            - "stop": Stop session(s)
            - "status": Get session status
            - "list_history": List saved conversation histories
            - "read_history": Read a specific conversation history
        provider: Model provider — "novasonic" (AWS Bedrock Nova Sonic),
            "openai" (OpenAI Realtime API), or "gemini_live" (Google Gemini Live).
        system_prompt: Custom system prompt, appended to the parent agent's
            prompt when one exists; defaults to a tool-encouraging prompt.
        session_id: Session identifier. Auto-generated on "start" if omitted;
            selects the session for "stop" (all sessions when omitted) and
            "read_history"; unused for "status".
        model_settings: Provider-specific configuration, e.g.::

            {
                "model_id": "model-name",
                "provider_config": {"audio": {"voice": "voice-name"}},
                "client_config": {"region": "us-east-1", "api_key": "key"},
            }

            API keys are auto-loaded from the environment when not supplied
            (OPENAI_API_KEY, GOOGLE_API_KEY / GEMINI_API_KEY; AWS credentials
            via the boto3 default chain for Nova Sonic).
        tools: Tool names to expose to the session; inherits ALL parent
            agent tools when omitted.
        agent: Parent agent (automatically passed by the Strands framework).
        load_history_from: Session ID whose saved transcript seeds the new
            session's context.

    Returns:
        str: Status message with session details or error information.
    """
    # "start" takes the full argument set, so dispatch it directly.
    if action == "start":
        return _start_speech_session(
            provider,
            system_prompt,
            session_id,
            model_settings,
            tools,
            agent,
            load_history_from,
        )

    # Remaining actions take at most a session_id — use a dispatch table.
    handlers = {
        "stop": lambda: _stop_speech_session(session_id),
        "status": _get_session_status,
        "list_history": _list_conversation_histories,
        "read_history": lambda: _read_conversation_history(session_id),
    }
    handler = handlers.get(action)
    if handler is None:
        return f"Unknown action: {action}"
    return handler()
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def _create_speech_session_tool(current_session_id: str, bidi_agent: BidiAgent):
    """Build a per-session ``speech_session`` management tool.

    The returned tool closes over the session ID so the user can control the
    running conversation from inside the voice session itself.
    """

    @tool
    def speech_session(
        action: str,
        session_id: Optional[str] = None,
    ) -> str:
        """Manage the current speech conversation session.

        Actions:
        - "stop": Stop the current conversation
        - "status": Get session status
        - "list_history": List all saved conversation histories
        - "read_history": Read a specific conversation history

        Args:
            action: Action to perform
            session_id: Session ID (required for read_history)

        Returns:
            Status message
        """
        # Read-only actions first — no locking or state changes needed.
        if action == "status":
            return _get_session_status()
        if action == "list_history":
            return _list_conversation_histories()
        if action == "read_history":
            return _read_conversation_history(session_id)
        if action != "stop":
            return f"Unknown action: {action}. Available: stop, status, list_history, read_history"

        # action == "stop": tear down this session via its SpeechSession,
        # which calls bidi_agent.stop() on the correct event loop.
        try:
            with _session_lock:
                if current_session_id not in _active_sessions:
                    return f"Session {current_session_id} not found in active sessions."
                _active_sessions[current_session_id].stop()
                del _active_sessions[current_session_id]
                return "Conversation stopped successfully."
        except Exception as e:
            logger.error(f"Error stopping conversation: {e}")
            return f"Error stopping conversation: {e}"

    return speech_session
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def _merge_settings(
    defaults: Dict[str, Any], overrides: Optional[Dict[str, Any]]
) -> Dict[str, Any]:
    """Merge user-supplied model settings into provider defaults (one level deep).

    Top-level values that are dicts on both sides are merged key-by-key;
    anything else in *overrides* replaces the default. Mutates and returns
    *defaults*. Extracted so the three providers share one merge implementation
    instead of three duplicated loops.
    """
    for key, value in (overrides or {}).items():
        if (
            key in defaults
            and isinstance(defaults[key], dict)
            and isinstance(value, dict)
        ):
            defaults[key].update(value)
        else:
            defaults[key] = value
    return defaults


def _configured_voice(settings: Dict[str, Any], default: str) -> str:
    """Return the voice configured under provider_config.audio, or *default*."""
    return settings.get("provider_config", {}).get("audio", {}).get("voice", default)


def _start_speech_session(
    provider: str,
    system_prompt: Optional[str],
    session_id: Optional[str],
    model_settings: Optional[Dict[str, Any]],
    tool_names: Optional[List[str]],
    parent_agent: Optional[Any],
    load_history_from: Optional[str],
) -> str:
    """Start a speech-to-speech session with full configuration support.

    Args:
        provider: "novasonic", "openai", or "gemini_live".
        system_prompt: Extra prompt text, appended to the parent's prompt.
        session_id: Optional explicit session ID (auto-generated if None).
        model_settings: Per-provider overrides merged onto provider defaults.
        tool_names: Names of parent tools to expose; None inherits all.
        parent_agent: Parent Strands agent (source of tools and prompt).
        load_history_from: Session ID whose transcript seeds this session.

    Returns:
        Human-readable status message (success summary or ❌ error text).
    """
    try:
        # Generate session ID if not provided
        if not session_id:
            session_id = f"speech_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

        # Refuse to clobber an existing session with the same ID.
        with _session_lock:
            if session_id in _active_sessions:
                return f"❌ Session already exists: {session_id}"

        model_settings = model_settings or {}
        model_info = provider

        # Build the provider model; configuration errors are reported, not raised.
        try:
            if provider == "novasonic":
                # Nova Sonic only available in: us-east-1, eu-north-1, ap-northeast-1
                settings = _merge_settings(
                    {
                        "model_id": "amazon.nova-2-sonic-v1:0",
                        "provider_config": {
                            "audio": {
                                "voice": "tiffany",
                            },
                        },
                        "client_config": {"region": "us-east-1"},
                    },
                    model_settings,
                )
                model = BidiNovaSonicModel(**settings)
                region = settings.get("client_config", {}).get("region", "us-east-1")
                voice = _configured_voice(settings, "tiffany")
                model_info = f"Nova Sonic ({region}, voice: {voice})"

            elif provider == "openai":
                # Read API key from environment if not provided in model_settings
                settings = _merge_settings(
                    {
                        "model_id": "gpt-realtime",
                        "client_config": {
                            "api_key": os.getenv("OPENAI_API_KEY"),
                        },
                    },
                    model_settings,
                )

                # Check if API key is available
                if not settings.get("client_config", {}).get("api_key"):
                    return "❌ OpenAI API key not found. Set OPENAI_API_KEY environment variable or provide in model_settings['client_config']['api_key']"

                model = BidiOpenAIRealtimeModel(**settings)
                model_id = settings.get("model_id", "gpt-realtime")
                voice = _configured_voice(settings, "default")
                model_info = f"OpenAI Realtime ({model_id}, voice: {voice})"

            elif provider == "gemini_live":
                # Read API key from environment if not provided in model_settings
                api_key = os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY")
                settings = _merge_settings(
                    {
                        "model_id": "gemini-2.5-flash-native-audio-preview-09-2025",
                        "client_config": {
                            "api_key": api_key,
                        },
                    },
                    model_settings,
                )

                # Check if API key is available
                if not settings.get("client_config", {}).get("api_key"):
                    return "❌ Google/Gemini API key not found. Set GOOGLE_API_KEY or GEMINI_API_KEY environment variable or provide in model_settings['client_config']['api_key']"

                model = BidiGeminiLiveModel(**settings)
                model_id = settings.get("model_id", "gemini-2.5-flash-live")
                voice = _configured_voice(settings, "default")
                model_info = f"Gemini Live ({model_id}, voice: {voice})"

            else:
                return f"❌ Unknown provider: {provider}. Supported: novasonic, openai, gemini_live"
        except Exception as e:
            return f"❌ Error creating {provider} model: {e}\n\nCheck your configuration and credentials."

        # Inherit tools from the parent agent's registry.
        tools = []
        inherited_count = 0

        if parent_agent and hasattr(parent_agent, "tool_registry"):
            try:
                # Get all tool functions from parent agent's registry
                registry_dict = parent_agent.tool_registry.registry

                # speech_to_speech itself is never forwarded (would allow
                # recursive session spawning from inside the conversation).
                if tool_names:
                    # User specified tool names - only include those
                    for tool_name in tool_names:
                        if tool_name not in ["speech_to_speech"]:
                            tool_func = registry_dict.get(tool_name)
                            if tool_func:
                                tools.append(tool_func)
                                inherited_count += 1
                            else:
                                logger.warning(
                                    f"Tool '{tool_name}' not found in parent agent's registry"
                                )
                else:
                    # No specific tools - inherit all except excluded
                    for tool_name, tool_func in registry_dict.items():
                        if tool_name not in ["speech_to_speech"]:
                            tools.append(tool_func)
                            inherited_count += 1

            except Exception as e:
                logger.warning(f"Could not inherit tools from parent agent: {e}")

        # Load conversation history if requested (context continuity).
        messages = None
        if load_history_from:
            history_file = HISTORY_DIR / f"{load_history_from}.json"
            if history_file.exists():
                try:
                    with open(history_file, "r") as f:
                        history_data = json.load(f)
                    messages = history_data.get("messages", [])
                    logger.info(
                        f"Loaded {len(messages)} messages from {load_history_from}"
                    )
                except Exception as e:
                    logger.warning(
                        f"Failed to load history from {load_history_from}: {e}"
                    )

        # Build system prompt: parent prompt + custom prompt
        final_system_prompt = ""

        # Get parent agent's system prompt if available
        if parent_agent and hasattr(parent_agent, "system_prompt"):
            parent_prompt = parent_agent.system_prompt or ""
            if parent_prompt:
                final_system_prompt = parent_prompt

        # Add custom system prompt
        if system_prompt:
            if final_system_prompt:
                final_system_prompt = f"{final_system_prompt}\n\n{system_prompt}"
            else:
                final_system_prompt = system_prompt

        # Use default system prompt if nothing provided
        if not final_system_prompt:
            final_system_prompt = """You are a helpful AI assistant with access to powerful tools.
- To stop the conversation → Use speech_session tool with action="stop"
Keep your voice responses brief and natural."""

        # Create bidirectional agent with inherited tools (speech_session will be added after)
        bidi_agent = BidiAgent(
            model=model,
            tools=tools,
            system_prompt=final_system_prompt,
            messages=messages,
        )

        # Create and add speech_session tool to agent's registry
        # This allows user to manage the session from within the conversation
        speech_session_tool = _create_speech_session_tool(session_id, bidi_agent)
        bidi_agent.tool_registry.registry["speech_session"] = speech_session_tool

        # Create and start session
        session = SpeechSession(
            session_id=session_id,
            agent=bidi_agent,
        )

        session.start()

        # Register session
        with _session_lock:
            _active_sessions[session_id] = session

        # Build settings summary
        settings_summary = ""
        if model_settings:
            settings_lines = []
            for key, value in model_settings.items():
                if key not in ["api_key", "secret"]:  # Hide sensitive data
                    settings_lines.append(f"  - {key}: {value}")
            if settings_lines:
                settings_summary = "\n**Model Settings:**\n" + "\n".join(settings_lines)

        # Add history info if loaded
        history_info = ""
        if messages:
            history_info = f"\n**Loaded History:** {len(messages)} messages from session '{load_history_from}'"

        return f"""✅ Speech session started!

**Session ID:** {session_id}
**Provider:** {model_info}
**Tools:** {inherited_count + 1} tools available (includes speech_session){settings_summary}{history_info}
**History Location:** {session.history_file}

The session is running in the background. Speak into your microphone to interact!

**To manage the session during conversation:**
- Stop: Say "stop the session" or "end conversation"
- Check status: Say "check session status"
- List histories: Say "list conversation histories"

**External Commands:**
- Check status: speech_to_speech(action="status")
- Stop session: speech_to_speech(action="stop", session_id="{session_id}")
- List histories: speech_to_speech(action="list_history")
- Read history: speech_to_speech(action="read_history", session_id="{session_id}")
"""

    except Exception as e:
        logger.error(f"Error starting speech session: {e}\n{traceback.format_exc()}")
        return f"❌ Error starting session: {e}\n\nCheck logs for details."
|
|
631
|
+
|
|
632
|
+
|
|
633
|
+
def _stop_speech_session(session_id: Optional[str]) -> str:
    """Stop one named session, or every active session when no ID is given.

    Args:
        session_id: Target session; falsy stops all active sessions.

    Returns:
        ✅/❌ status message.
    """
    with _session_lock:
        # Named session: guard first, then stop and deregister.
        if session_id:
            if session_id not in _active_sessions:
                return f"❌ Session not found: {session_id}"
            _active_sessions[session_id].stop()
            del _active_sessions[session_id]
            return f"✅ Session stopped: {session_id}"

        # No ID: stop everything currently registered.
        all_ids = list(_active_sessions.keys())
        if not all_ids:
            return "❌ No active sessions"
        for sid in all_ids:
            _active_sessions[sid].stop()
            del _active_sessions[sid]
        return f"✅ Stopped {len(all_ids)} session(s)"
|
|
654
|
+
|
|
655
|
+
|
|
656
|
+
def _get_session_status() -> str:
    """Render a markdown summary of every active speech session."""
    with _session_lock:
        if not _active_sessions:
            return "No active speech sessions"

        entries = [
            f"- **{sid}**\n"
            f"  - Active: {'✅' if sess.active else '❌'}\n"
            f"  - History File: {sess.history_file}"
            for sid, sess in _active_sessions.items()
        ]
        return "\n".join(["**Active Speech Sessions:**\n", *entries])
|
|
671
|
+
|
|
672
|
+
|
|
673
|
+
def _list_conversation_histories() -> str:
    """List all saved conversation histories, most recently modified first."""
    history_files = sorted(
        HISTORY_DIR.glob("*.json"), key=lambda p: p.stat().st_mtime, reverse=True
    )

    if not history_files:
        return f"No saved conversation histories found in {HISTORY_DIR}"

    lines = [
        f"**Saved Conversation Histories** ({len(history_files)} total):\n",
        f"Location: {HISTORY_DIR}\n",
    ]

    for path in history_files:
        # A corrupt file yields an inline error entry instead of aborting.
        try:
            data = json.loads(path.read_text())
            lines.append(
                f"- **{data.get('session_id', path.stem)}**\n"
                f"  - Timestamp: {data.get('timestamp', 'unknown')}\n"
                f"  - Messages: {len(data.get('messages', []))}\n"
                f"  - File: {path.name}"
            )
        except Exception as e:
            lines.append(f"- **{path.stem}** (error reading: {e})")

    return "\n".join(lines)
|
|
703
|
+
|
|
704
|
+
|
|
705
|
+
def _read_conversation_history(session_id: Optional[str]) -> str:
    """Read a specific conversation history and render it as markdown.

    Args:
        session_id: Session whose saved transcript should be rendered.

    Returns:
        Formatted transcript, or a ❌ message when the ID is missing,
        the file does not exist, or the file cannot be parsed.
    """
    if not session_id:
        return "❌ session_id required for read_history action"

    history_file = HISTORY_DIR / f"{session_id}.json"

    if not history_file.exists():
        return f"❌ No history found for session: {session_id}\n\nAvailable histories:\n{_list_conversation_histories()}"

    try:
        with open(history_file, "r") as f:
            data = json.load(f)

        messages = data.get("messages", [])
        timestamp = data.get("timestamp", "unknown")

        lines = [
            f"**Conversation History: {session_id}**\n",
            f"Timestamp: {timestamp}",
            f"Messages: {len(messages)}\n",
            "---\n",
        ]

        # Format messages
        for i, msg in enumerate(messages, 1):
            role = msg.get("role", "unknown")
            content_blocks = msg.get("content", [])

            lines.append(f"**{i}. {role.upper()}:**")

            for block in content_blocks:
                if "text" in block:
                    lines.append(f"  {block['text']}")
                elif "toolUse" in block:
                    # Tolerate malformed entries: a missing "name" key should
                    # not abort rendering of the whole transcript.
                    tool_name = block["toolUse"].get("name", "unknown")
                    lines.append(f"  [Tool Call: {tool_name}]")
                elif "toolResult" in block:
                    lines.append("  [Tool Result]")

            lines.append("")

        return "\n".join(lines)

    except Exception as e:
        return f"❌ Error reading history: {e}"
|