devduck 0.1.1766644714__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of devduck might be problematic. Click here for more details.

Files changed (36) hide show
  1. devduck/__init__.py +591 -1092
  2. devduck/_version.py +2 -2
  3. devduck/install.sh +42 -0
  4. devduck/test_redduck.py +1 -0
  5. devduck/tools/__init__.py +4 -44
  6. devduck/tools/install_tools.py +2 -103
  7. devduck/tools/mcp_server.py +6 -34
  8. devduck/tools/tcp.py +7 -6
  9. devduck/tools/websocket.py +2 -8
  10. devduck-0.2.0.dist-info/METADATA +143 -0
  11. devduck-0.2.0.dist-info/RECORD +16 -0
  12. {devduck-0.1.1766644714.dist-info → devduck-0.2.0.dist-info}/entry_points.txt +0 -1
  13. devduck-0.2.0.dist-info/licenses/LICENSE +21 -0
  14. devduck/agentcore_handler.py +0 -76
  15. devduck/tools/_ambient_input.py +0 -423
  16. devduck/tools/_tray_app.py +0 -530
  17. devduck/tools/agentcore_agents.py +0 -197
  18. devduck/tools/agentcore_config.py +0 -441
  19. devduck/tools/agentcore_invoke.py +0 -423
  20. devduck/tools/agentcore_logs.py +0 -320
  21. devduck/tools/ambient.py +0 -157
  22. devduck/tools/create_subagent.py +0 -659
  23. devduck/tools/fetch_github_tool.py +0 -201
  24. devduck/tools/ipc.py +0 -546
  25. devduck/tools/scraper.py +0 -935
  26. devduck/tools/speech_to_speech.py +0 -850
  27. devduck/tools/state_manager.py +0 -292
  28. devduck/tools/store_in_kb.py +0 -187
  29. devduck/tools/system_prompt.py +0 -608
  30. devduck/tools/tray.py +0 -247
  31. devduck/tools/use_github.py +0 -438
  32. devduck-0.1.1766644714.dist-info/METADATA +0 -717
  33. devduck-0.1.1766644714.dist-info/RECORD +0 -33
  34. devduck-0.1.1766644714.dist-info/licenses/LICENSE +0 -201
  35. {devduck-0.1.1766644714.dist-info → devduck-0.2.0.dist-info}/WHEEL +0 -0
  36. {devduck-0.1.1766644714.dist-info → devduck-0.2.0.dist-info}/top_level.txt +0 -0
@@ -1,850 +0,0 @@
1
- """Real-time speech-to-speech bidirectional streaming tool for DevDuck.
2
-
3
- Provides background speech-to-speech conversation capability using Strands
4
- experimental bidirectional streaming with full model provider support, tool
5
- inheritance, and comprehensive configuration options.
6
-
7
- This tool creates isolated bidirectional agent sessions that run in background
8
- threads, enabling real-time voice conversations with AI models while the parent
9
- agent remains responsive.
10
-
11
- Key Features:
12
- - **Background Execution:** Runs in separate thread - parent agent stays responsive
13
- - **Real-Time Audio:** Microphone input and speaker output with pyaudio
14
- - **Tool Inheritance:** Automatically inherits ALL tools from parent agent
15
- - **System Prompt Inheritance:** Combines parent agent's prompt with custom prompts
16
- - **Multiple Providers:** Nova Sonic, OpenAI Realtime API, Gemini Live
17
- - **Full Configuration:** Per-provider custom settings and parameters
18
- - **Environment API Keys:** Auto-loads API keys from environment variables
19
- - **Built-in Stop:** Attaches a per-session speech_session tool (action="stop") for graceful termination
20
- - **Auto-Interruption:** Built-in VAD for natural conversation flow
21
- - **Conversation History:** Automatically saves transcripts to files
22
-
23
- Supported Providers:
24
- -------------------
25
- 1. **Nova Sonic (AWS Bedrock):**
26
- - Region: us-east-1, eu-north-1, ap-northeast-1
27
- - Model: amazon.nova-2-sonic-v1:0 (configurable)
28
- - Voices: tiffany, matthew, amy, ambre, florian, beatrice, lorenzo, greta, lennart, lupe, carlos
29
- - Requires AWS credentials (boto3 credential chain)
30
-
31
- 2. **OpenAI Realtime API:**
32
- - Models: gpt-realtime, gpt-4o-realtime-preview (configurable)
33
- - Requires OPENAI_API_KEY environment variable
34
- - Custom session config support
35
-
36
- 3. **Gemini Live:**
37
- - Model: gemini-2.5-flash-native-audio-preview-09-2025 (configurable)
38
- - Requires GOOGLE_API_KEY or GEMINI_API_KEY environment variable
39
- - Live config customization
40
-
41
- Configuration Examples:
42
- ----------------------
43
- # Nova Sonic with custom voice
44
- model_settings = {
45
- "provider_config": {
46
- "audio": {"voice": "matthew"}
47
- },
48
- "client_config": {"region": "us-east-1"}
49
- }
50
-
51
- # OpenAI Realtime with custom model
52
- model_settings = {
53
- "model_id": "gpt-4o-realtime-preview",
54
- "provider_config": {
55
- "audio": {"voice": "coral"}
56
- }
57
- }
58
-
59
- # Gemini Live with custom voice
60
- model_settings = {
61
- "model_id": "gemini-2.5-flash-native-audio-preview-09-2025",
62
- "provider_config": {
63
- "audio": {"voice": "Kore"}
64
- }
65
- }
66
- """
67
-
68
- import os
69
- import asyncio
70
- import tempfile
71
- import json
72
- import logging
73
- import threading
74
- import traceback
75
- from datetime import datetime
76
- from pathlib import Path
77
- from typing import Any, Dict, List, Optional
78
-
79
- from strands import tool
80
- from strands.experimental.bidi.agent.agent import BidiAgent
81
- from strands.experimental.bidi.models.gemini_live import BidiGeminiLiveModel
82
- from strands.experimental.bidi.models.nova_sonic import BidiNovaSonicModel
83
- from strands.experimental.bidi.models.openai_realtime import BidiOpenAIRealtimeModel
84
- from strands.experimental.bidi.io.audio import BidiAudioIO
85
-
86
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Global session tracking: session_id -> SpeechSession for every session
# registered by this module. Functions in this module access it while
# holding _session_lock.
_active_sessions = {}
_session_lock = threading.Lock()

# Session history storage location: $DEVDUCK_HOME when set, otherwise the
# system temporary directory. The directory is created at import time.
BASE_DIR = Path(os.getenv("DEVDUCK_HOME", tempfile.gettempdir()))
HISTORY_DIR = BASE_DIR / ".devduck" / "speech_sessions"
HISTORY_DIR.mkdir(parents=True, exist_ok=True)
96
-
97
-
98
class SpeechSession:
    """Run one speech-to-speech conversation on a dedicated background thread.

    The session owns a daemon thread with its own asyncio event loop. The
    BidiAgent is run on that loop with audio input/output from BidiAudioIO.
    When the session is stopped, the agent's messages are written as JSON to
    ``HISTORY_DIR/<session_id>.json``.
    """

    def __init__(
        self,
        session_id: str,
        agent: BidiAgent,
        input_device_index: Optional[int] = None,
        output_device_index: Optional[int] = None,
    ):
        """Initialize speech session.

        Args:
            session_id: Unique session identifier (also the history file stem)
            agent: BidiAgent instance
            input_device_index: PyAudio input device index
            output_device_index: PyAudio output device index
        """
        self.session_id = session_id
        self.agent = agent
        self.input_device_index = input_device_index
        self.output_device_index = output_device_index
        self.active = False
        self.thread = None  # background thread, created by start()
        self.loop = None  # event loop created inside the background thread
        self.history_file = HISTORY_DIR / f"{session_id}.json"

    def start(self) -> None:
        """Start the speech session in a background daemon thread.

        Raises:
            ValueError: If the session is already active.
        """
        if self.active:
            raise ValueError("Session already active")

        self.active = True
        self.thread = threading.Thread(target=self._run_session, daemon=True)
        self.thread.start()

    def stop(self) -> None:
        """Stop the speech session, wait for the thread, and save history.

        Does nothing when the session is not active. Safe to call from the
        session's own thread: in that case the agent stop is only scheduled
        and the thread is not joined.
        """
        if not self.active:
            return

        self.active = False

        # When stop() runs on the session thread itself, waiting on the
        # future would block the very loop that must run agent.stop(), and
        # Thread.join() on the current thread raises RuntimeError.
        on_session_thread = threading.current_thread() is self.thread

        # Stop the bidi agent using its event loop
        if self.loop and self.loop.is_running():
            future = asyncio.run_coroutine_threadsafe(self.agent.stop(), self.loop)
            if on_session_thread:
                logger.info(
                    f"Scheduled bidi agent stop for session {self.session_id}"
                )
            else:
                try:
                    # Wait up to 3 seconds for stop to complete
                    future.result(timeout=3.0)
                    logger.info(
                        f"Successfully stopped bidi agent for session {self.session_id}"
                    )
                except Exception as e:
                    logger.warning(f"Error stopping bidi agent: {e}")

        if self.thread and not on_session_thread:
            self.thread.join(timeout=5.0)

        # Save conversation history after session ends
        self._save_history()

    def _save_history(self) -> None:
        """Save conversation history to ``self.history_file`` (best effort)."""
        try:
            history_data = {
                "session_id": self.session_id,
                "timestamp": datetime.now().isoformat(),
                "messages": self.agent.messages,
            }

            # Serialize before opening: open(..., "w") truncates the file, so
            # a serialization error must not leave a partial JSON file behind.
            payload = json.dumps(history_data, indent=2)

            with open(self.history_file, "w") as f:
                f.write(payload)

            logger.info(f"Saved conversation history to {self.history_file}")
        except Exception as e:
            logger.error(f"Failed to save history: {e}")

    def _run_session(self) -> None:
        """Thread target: create an event loop and run the async session on it."""
        try:
            # Create event loop for this thread
            self.loop = asyncio.new_event_loop()
            asyncio.set_event_loop(self.loop)

            # Run the async session
            self.loop.run_until_complete(self._async_session())
        except Exception as e:
            error_msg = f"Session error: {e}\n{traceback.format_exc()}"
            logger.debug(error_msg)
            print(f"\n🦆 Session error: {e}")
        finally:
            if self.loop:
                self.loop.close()

    async def _async_session(self) -> None:
        """Run the agent with BidiAudioIO input/output; errors are logged at debug level."""
        try:
            # Create audio I/O with device indices
            audio_io = BidiAudioIO(
                input_device_index=self.input_device_index,
                output_device_index=self.output_device_index,
            )

            # Run agent with audio I/O
            await self.agent.run(inputs=[audio_io.input()], outputs=[audio_io.output()])

        except Exception as e:
            logger.debug(f"Async session error: {e}\n{traceback.format_exc()}")
207
-
208
-
209
@tool
def speech_to_speech(
    action: str,
    provider: str = "novasonic",
    system_prompt: Optional[str] = None,
    session_id: Optional[str] = None,
    model_settings: Optional[Dict[str, Any]] = None,
    tools: Optional[List[str]] = None,
    agent: Optional[Any] = None,
    load_history_from: Optional[str] = None,
    inherit_system_prompt: bool = False,
    input_device_index: Optional[int] = None,
    output_device_index: Optional[int] = None,
) -> str:
    """Start, stop, or manage speech-to-speech conversations.

    Creates a background bidirectional streaming session for real-time voice
    conversations with AI. Supports full model configuration, tool inheritance,
    and multiple model providers with custom settings.

    Args:
        action: Action to perform:
            - "start": Start new speech session
            - "stop": Stop session(s)
            - "status": Get session status
            - "list_history": List saved conversation histories
            - "read_history": Read a specific conversation history
            - "list_audio_devices": List all available audio input/output devices
        provider: Model provider to use:
            - "novasonic": AWS Bedrock Nova Sonic
            - "openai": OpenAI Realtime API
            - "gemini_live": Google Gemini Live
        system_prompt: Custom system prompt for the agent. This will be appended
            to the parent agent's system prompt (if inherit_system_prompt=True).
            If not provided, uses default prompt that encourages tool usage.
        session_id: Session identifier:
            - For "start": Custom ID (auto-generated if not provided)
            - For "stop": Specific session to stop (stops all if not provided)
            - For "read_history": Session ID to read history from
            - For "status": Not used
        inherit_system_prompt: Whether to inherit parent agent's system prompt.
            Set to False to use only the custom system_prompt (useful for OpenAI
            which has 16K token limit). Default: False
        model_settings: Provider-specific configuration dictionary. Structure:
            {
                "model_id": "model-name",
                "provider_config": {
                    "audio": {"voice": "voice-name"},
                    "inference": {...}
                },
                "client_config": {
                    "region": "us-east-1",  # for Nova Sonic
                    "api_key": "key"  # for OpenAI/Gemini (auto-loaded from env if not provided)
                }
            }

            Examples:
            - Nova Sonic with custom voice:
              {"provider_config": {"audio": {"voice": "matthew"}}}

            - OpenAI with custom model:
              {"model_id": "gpt-4o-realtime-preview"}

            - Gemini with custom voice:
              {"provider_config": {"audio": {"voice": "Kore"}}}
        tools: List of tool names to make available. If not provided,
            inherits ALL tools from parent agent.
        agent: Parent agent (automatically passed by Strands framework)
        load_history_from: Optional session ID to load conversation history from
            when starting a new session (provides context continuity)
        input_device_index: Optional PyAudio input device index. If not specified,
            uses system default. Use action="list_audio_devices" to see available devices.
        output_device_index: Optional PyAudio output device index. If not specified,
            uses system default. Use action="list_audio_devices" to see available devices.

    Returns:
        str: Status message with session details or error information

    Environment Variables:
        - OPENAI_API_KEY: Required for OpenAI Realtime API (if not in model_settings)
        - GOOGLE_API_KEY or GEMINI_API_KEY: Required for Gemini Live (if not in model_settings)
        - AWS credentials: Required for Nova Sonic (boto3 default credential chain)

    Nova Sonic Voice Options:
        - English (US): tiffany (feminine), matthew (masculine)
        - English (GB): amy (feminine)
        - French: ambre (feminine), florian (masculine)
        - Italian: beatrice (feminine), lorenzo (masculine)
        - German: greta (feminine), lennart (masculine)
        - Spanish: lupe (feminine), carlos (masculine)
    """
    # Each action maps to exactly one module-level helper; anything else is
    # reported back to the caller as an unknown action.
    if action == "start":
        return _start_speech_session(
            provider,
            system_prompt,
            session_id,
            model_settings,
            tools,
            agent,
            load_history_from,
            inherit_system_prompt,
            input_device_index,
            output_device_index,
        )
    if action == "stop":
        return _stop_speech_session(session_id)
    if action == "status":
        return _get_session_status()
    if action == "list_history":
        return _list_conversation_histories()
    if action == "read_history":
        return _read_conversation_history(session_id)
    if action == "list_audio_devices":
        return _list_audio_devices()
    return f"Unknown action: {action}"
326
-
327
-
328
def _create_speech_session_tool(current_session_id: str, bidi_agent: BidiAgent):
    """Build the ``speech_session`` tool bound to one session.

    The returned tool closes over ``current_session_id`` so that the
    conversation running inside that session can stop itself or query
    status/history. ``bidi_agent`` is accepted but not used by the tool body.
    """

    @tool
    def speech_session(
        action: str,
        session_id: Optional[str] = None,
    ) -> str:
        """Manage the current speech conversation session.

        Actions:
        - "stop": Stop the current conversation
        - "status": Get session status
        - "list_history": List all saved conversation histories
        - "read_history": Read a specific conversation history

        Args:
            action: Action to perform
            session_id: Session ID (required for read_history)

        Returns:
            Status message
        """
        if action == "status":
            return _get_session_status()

        if action == "list_history":
            return _list_conversation_histories()

        if action == "read_history":
            return _read_conversation_history(session_id)

        if action != "stop":
            return f"Unknown action: {action}. Available: stop, status, list_history, read_history"

        try:
            # Stopping the registered session is what stops the bidi agent.
            with _session_lock:
                if current_session_id not in _active_sessions:
                    return f"Session {current_session_id} not found in active sessions."
                _active_sessions[current_session_id].stop()
                del _active_sessions[current_session_id]
                return "Conversation stopped successfully."
        except Exception as e:
            logger.error(f"Error stopping conversation: {e}")
            return f"Error stopping conversation: {e}"

    return speech_session
382
-
383
-
384
# Setting keys whose values must never be echoed back in status text.
_SENSITIVE_SETTING_KEYS = frozenset({"api_key", "secret"})


def _merge_model_settings(defaults: Dict[str, Any], overrides: Optional[Dict[str, Any]]) -> Dict[str, Any]:
    """Overlay *overrides* on *defaults* in place and return *defaults*.

    When both sides hold a dict for the same key, the default dict is updated
    with the override (one level deep); otherwise the override replaces it.
    """
    for key, value in (overrides or {}).items():
        current = defaults.get(key)
        if isinstance(current, dict) and isinstance(value, dict):
            current.update(value)
        else:
            defaults[key] = value
    return defaults


def _redact_sensitive(value: Any) -> Any:
    """Return a copy of *value* with sensitive dict entries masked at any depth."""
    if isinstance(value, dict):
        return {
            key: "***" if key in _SENSITIVE_SETTING_KEYS else _redact_sensitive(item)
            for key, item in value.items()
        }
    if isinstance(value, (list, tuple)):
        return [_redact_sensitive(item) for item in value]
    return value


def _configured_voice(settings: Dict[str, Any], fallback: str) -> str:
    """Read provider_config.audio.voice from *settings*, or *fallback* if absent."""
    return settings.get("provider_config", {}).get("audio", {}).get("voice", fallback)


def _is_plain_history_name(name: Any) -> bool:
    """Return True when ``<name>.json`` is a bare file name with no directory part."""
    filename = f"{name}.json"
    return Path(filename).name == filename


def _start_speech_session(
    provider: str,
    system_prompt: Optional[str],
    session_id: Optional[str],
    model_settings: Optional[Dict[str, Any]],
    tool_names: Optional[List[str]],
    parent_agent: Optional[Any],
    load_history_from: Optional[str],
    inherit_system_prompt: bool,
    input_device_index: Optional[int],
    output_device_index: Optional[int],
) -> str:
    """Start a speech-to-speech session with full configuration support.

    Builds the provider model from defaults merged with ``model_settings``,
    collects tools from the parent agent's registry, optionally loads a saved
    history, creates the BidiAgent and a SpeechSession, starts it, and
    registers it in ``_active_sessions``.

    Args:
        provider: "novasonic", "openai" or "gemini_live".
        system_prompt: Custom prompt; appended to the parent's prompt when
            ``inherit_system_prompt`` is true.
        session_id: Session name; generated from the current time when empty.
            Must not contain a directory component because it names the
            history file.
        model_settings: Overrides merged over the provider defaults.
        tool_names: Names of parent tools to expose; all (except
            ``speech_to_speech``) when empty.
        parent_agent: Agent whose ``tool_registry`` / ``system_prompt`` are read.
        load_history_from: Session name whose saved messages seed the agent.
        inherit_system_prompt: Whether to prepend the parent's system prompt.
        input_device_index: PyAudio input device index.
        output_device_index: PyAudio output device index.

    Returns:
        A human-readable status message; failures are returned as "❌ ..."
        strings rather than raised.
    """
    try:
        # Generate session ID if not provided; reject names that would place
        # the history file outside HISTORY_DIR.
        if not session_id:
            session_id = f"speech_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        elif not _is_plain_history_name(session_id):
            return f"❌ Invalid session_id: {session_id!r}. Use a plain name without path separators."

        # Check if session already exists
        with _session_lock:
            if session_id in _active_sessions:
                return f"❌ Session already exists: {session_id}"

        # Create model based on provider with custom settings
        model_settings = model_settings or {}
        model_info = f"{provider}"

        try:
            if provider == "novasonic":
                # Nova Sonic only available in: us-east-1, eu-north-1, ap-northeast-1
                settings = _merge_model_settings(
                    {
                        "model_id": os.getenv("BIDI_MODEL_ID", "amazon.nova-2-sonic-v1:0"),
                        "provider_config": {
                            "audio": {
                                "voice": "tiffany",
                            },
                        },
                        "client_config": {"region": "us-east-1"},
                    },
                    model_settings,
                )

                model = BidiNovaSonicModel(**settings)
                region = settings.get("client_config", {}).get("region", "us-east-1")
                voice = _configured_voice(settings, "tiffany")
                model_info = f"Nova Sonic ({region}, voice: {voice})"

            elif provider == "openai":
                # API key comes from the environment unless model_settings overrides it
                settings = _merge_model_settings(
                    {
                        "model_id": os.getenv("BIDI_MODEL_ID", "gpt-realtime"),
                        "client_config": {
                            "api_key": os.getenv("OPENAI_API_KEY"),
                        },
                    },
                    model_settings,
                )

                # Check if API key is available
                if not settings.get("client_config", {}).get("api_key"):
                    return "❌ OpenAI API key not found. Set OPENAI_API_KEY environment variable or provide in model_settings['client_config']['api_key']"

                model = BidiOpenAIRealtimeModel(**settings)
                model_id = settings.get("model_id", "gpt-realtime")
                voice = _configured_voice(settings, "default")
                model_info = f"OpenAI Realtime ({model_id}, voice: {voice})"

            elif provider == "gemini_live":
                # API key comes from the environment unless model_settings overrides it
                api_key = os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY")

                settings = _merge_model_settings(
                    {
                        "model_id": os.getenv(
                            "BIDI_MODEL_ID", "gemini-2.5-flash-native-audio-preview-09-2025"
                        ),
                        "client_config": {
                            "api_key": api_key,
                        },
                    },
                    model_settings,
                )

                # Check if API key is available
                if not settings.get("client_config", {}).get("api_key"):
                    return "❌ Google/Gemini API key not found. Set GOOGLE_API_KEY or GEMINI_API_KEY environment variable or provide in model_settings['client_config']['api_key']"

                model = BidiGeminiLiveModel(**settings)
                model_id = settings.get("model_id", "gemini-2.5-flash-live")
                voice = _configured_voice(settings, "default")
                model_info = f"Gemini Live ({model_id}, voice: {voice})"

            else:
                return f"❌ Unknown provider: {provider}. Supported: novasonic, openai, gemini_live"
        except Exception as e:
            return f"❌ Error creating {provider} model: {e}\n\nCheck your configuration and credentials."

        # Get parent agent's tools
        tools = []
        inherited_count = 0

        if parent_agent and hasattr(parent_agent, "tool_registry"):
            try:
                # Get all tool functions from parent agent's registry
                registry_dict = parent_agent.tool_registry.registry

                # If specific tools requested, filter; otherwise inherit all
                if tool_names:
                    # User specified tool names - only include those
                    for tool_name in tool_names:
                        if tool_name not in ["speech_to_speech"]:
                            tool_func = registry_dict.get(tool_name)
                            if tool_func:
                                tools.append(tool_func)
                                inherited_count += 1
                            else:
                                logger.warning(
                                    f"Tool '{tool_name}' not found in parent agent's registry"
                                )
                else:
                    # No specific tools - inherit all except excluded
                    for tool_name, tool_func in registry_dict.items():
                        if tool_name not in ["speech_to_speech"]:
                            tools.append(tool_func)
                            inherited_count += 1

            except Exception as e:
                logger.warning(f"Could not inherit tools from parent agent: {e}")

        # Load conversation history if requested
        messages = None
        if load_history_from:
            if not _is_plain_history_name(load_history_from):
                # Never follow a name that points outside HISTORY_DIR.
                logger.warning(
                    f"Ignoring invalid history name {load_history_from!r}"
                )
            else:
                history_file = HISTORY_DIR / f"{load_history_from}.json"
                if history_file.exists():
                    try:
                        with open(history_file, "r") as f:
                            history_data = json.load(f)
                        messages = history_data.get("messages", [])
                        logger.info(
                            f"Loaded {len(messages)} messages from {load_history_from}"
                        )
                    except Exception as e:
                        logger.warning(
                            f"Failed to load history from {load_history_from}: {e}"
                        )

        # Build system prompt: parent prompt + custom prompt
        final_system_prompt = ""

        # Get parent agent's system prompt if available and inheritance enabled
        if (
            inherit_system_prompt
            and parent_agent
            and hasattr(parent_agent, "system_prompt")
        ):
            parent_prompt = parent_agent.system_prompt or ""
            if parent_prompt:
                final_system_prompt = parent_prompt

        # Add custom system prompt
        if system_prompt:
            if final_system_prompt:
                final_system_prompt = f"{final_system_prompt}\n\n{system_prompt}"
            else:
                final_system_prompt = system_prompt

        # Use default system prompt if nothing provided
        if not final_system_prompt:
            final_system_prompt = """You are a helpful AI assistant with access to powerful tools.
- To stop the conversation → Use speech_session tool with action="stop"
Keep your voice responses brief and natural."""

        # Create bidirectional agent with inherited tools (speech_session will be added after)
        bidi_agent = BidiAgent(
            model=model,
            tools=tools,
            system_prompt=final_system_prompt,
            messages=messages,
        )

        # Create and add speech_session tool to agent's registry
        # This allows user to manage the session from within the conversation
        speech_session_tool = _create_speech_session_tool(session_id, bidi_agent)
        bidi_agent.tool_registry.registry["speech_session"] = speech_session_tool

        # Create and start session
        session = SpeechSession(
            session_id=session_id,
            agent=bidi_agent,
            input_device_index=input_device_index,
            output_device_index=output_device_index,
        )

        session.start()

        # Register session
        with _session_lock:
            _active_sessions[session_id] = session

        # Build settings summary. Values are redacted recursively because
        # api_key normally sits nested under client_config, not at top level.
        settings_summary = ""
        if model_settings:
            settings_lines = [
                f" - {key}: {_redact_sensitive(value)}"
                for key, value in model_settings.items()
                if key not in _SENSITIVE_SETTING_KEYS
            ]
            if settings_lines:
                settings_summary = "\n**Model Settings:**\n" + "\n".join(settings_lines)

        # Add history info if loaded
        history_info = ""
        if messages:
            history_info = f"\n**Loaded History:** {len(messages)} messages from session '{load_history_from}'"

        return f"""✅ Speech session started!

**Session ID:** {session_id}
**Provider:** {model_info}
**Tools:** {inherited_count + 1} tools available (includes speech_session){settings_summary}{history_info}
**History Location:** {session.history_file}

The session is running in the background. Speak into your microphone to interact!

**To manage the session during conversation:**
- Stop: Say "stop the session" or "end conversation"
- Check status: Say "check session status"
- List histories: Say "list conversation histories"

**External Commands:**
- Check status: speech_to_speech(action="status")
- Stop session: speech_to_speech(action="stop", session_id="{session_id}")
- List histories: speech_to_speech(action="list_history")
- Read history: speech_to_speech(action="read_history", session_id="{session_id}")
"""

    except Exception as e:
        logger.error(f"Error starting speech session: {e}\n{traceback.format_exc()}")
        return f"❌ Error starting session: {e}\n\nCheck logs for details."
667
-
668
-
669
def _stop_speech_session(session_id: Optional[str]) -> str:
    """Stop one registered session, or every session when no ID is given.

    Args:
        session_id: Session to stop; a falsy value means "stop all".

    Returns:
        A status message describing what was stopped, or why nothing was.
    """
    with _session_lock:
        if session_id:
            # Targeted stop of a single session.
            if session_id not in _active_sessions:
                return f"❌ Session not found: {session_id}"
            _active_sessions[session_id].stop()
            del _active_sessions[session_id]
            return f"✅ Session stopped: {session_id}"

        if not _active_sessions:
            return "❌ No active sessions"

        # Snapshot the keys first: entries are deleted while iterating.
        pending = list(_active_sessions.keys())
        for sid in pending:
            _active_sessions[sid].stop()
            del _active_sessions[sid]
        return f"✅ Stopped {len(pending)} session(s)"
690
-
691
-
692
def _get_session_status() -> str:
    """Describe every registered session: ID, active flag, and history file path."""
    with _session_lock:
        if not _active_sessions:
            return "No active speech sessions"

        report = ["**Active Speech Sessions:**\n"]
        for sid, sess in _active_sessions.items():
            marker = "✅" if sess.active else "❌"
            entry = (
                f"- **{sid}**\n"
                f" - Active: {marker}\n"
                f" - History File: {sess.history_file}"
            )
            report.append(entry)

        return "\n".join(report)
707
-
708
-
709
def _list_conversation_histories() -> str:
    """Summarize every ``*.json`` history in HISTORY_DIR, newest first.

    Each file contributes its session ID, timestamp, and message count; a file
    that cannot be read or parsed is listed with the error instead.
    """
    found = list(HISTORY_DIR.glob("*.json"))
    # Most recently modified first.
    found.sort(key=lambda path: path.stat().st_mtime, reverse=True)

    if not found:
        return f"No saved conversation histories found in {HISTORY_DIR}"

    out = [
        f"**Saved Conversation Histories** ({len(found)} total):\n",
        f"Location: {HISTORY_DIR}\n",
    ]

    for path in found:
        try:
            with open(path, "r") as fh:
                record = json.load(fh)
            sid = record.get("session_id", path.stem)
            stamp = record.get("timestamp", "unknown")
            n_messages = len(record.get("messages", []))
            summary = (
                f"- **{sid}**\n"
                f" - Timestamp: {stamp}\n"
                f" - Messages: {n_messages}\n"
                f" - File: {path.name}"
            )
        except Exception as e:
            summary = f"- **{path.stem}** (error reading: {e})"
        out.append(summary)

    return "\n".join(out)
739
-
740
-
741
- def _read_conversation_history(session_id: Optional[str]) -> str:
742
- """Read a specific conversation history."""
743
- if not session_id:
744
- return "❌ session_id required for read_history action"
745
-
746
- history_file = HISTORY_DIR / f"{session_id}.json"
747
-
748
- if not history_file.exists():
749
- return f"❌ No history found for session: {session_id}\n\nAvailable histories:\n{_list_conversation_histories()}"
750
-
751
- try:
752
- with open(history_file, "r") as f:
753
- data = json.load(f)
754
-
755
- messages = data.get("messages", [])
756
- timestamp = data.get("timestamp", "unknown")
757
-
758
- lines = [
759
- f"**Conversation History: {session_id}**\n",
760
- f"Timestamp: {timestamp}",
761
- f"Messages: {len(messages)}\n",
762
- "---\n",
763
- ]
764
-
765
- # Format messages
766
- for i, msg in enumerate(messages, 1):
767
- role = msg.get("role", "unknown")
768
- content_blocks = msg.get("content", [])
769
-
770
- lines.append(f"**{i}. {role.upper()}:**")
771
-
772
- for block in content_blocks:
773
- if "text" in block:
774
- lines.append(f" {block['text']}")
775
- elif "toolUse" in block:
776
- tool_use = block["toolUse"]
777
- lines.append(f" [Tool Call: {tool_use['name']}]")
778
- elif "toolResult" in block:
779
- lines.append(f" [Tool Result]")
780
-
781
- lines.append("")
782
-
783
- return "\n".join(lines)
784
-
785
- except Exception as e:
786
- return f"❌ Error reading history: {e}"
787
-
788
-
789
- def _list_audio_devices() -> str:
790
- """List all available audio input and output devices."""
791
- try:
792
- import pyaudio
793
-
794
- p = pyaudio.PyAudio()
795
-
796
- lines = ["**Available Audio Devices:**\n"]
797
-
798
- # List all devices
799
- device_count = p.get_device_count()
800
- default_input = p.get_default_input_device_info()["index"]
801
- default_output = p.get_default_output_device_info()["index"]
802
-
803
- lines.append(f"Total devices: {device_count}\n")
804
-
805
- for i in range(device_count):
806
- try:
807
- info = p.get_device_info_by_index(i)
808
- name = info["name"]
809
- max_input_channels = info["maxInputChannels"]
810
- max_output_channels = info["maxOutputChannels"]
811
-
812
- device_type = []
813
- is_default = []
814
-
815
- if max_input_channels > 0:
816
- device_type.append("INPUT")
817
- if i == default_input:
818
- is_default.append("default input")
819
-
820
- if max_output_channels > 0:
821
- device_type.append("OUTPUT")
822
- if i == default_output:
823
- is_default.append("default output")
824
-
825
- type_str = "/".join(device_type) if device_type else "NONE"
826
- default_str = f" [{', '.join(is_default)}]" if is_default else ""
827
-
828
- lines.append(
829
- f"- **Index {i}:** {name}\n"
830
- f" Type: {type_str}{default_str}\n"
831
- f" Input Channels: {max_input_channels}, Output Channels: {max_output_channels}"
832
- )
833
-
834
- except Exception as e:
835
- lines.append(f"- **Index {i}:** Error reading device info - {e}")
836
-
837
- p.terminate()
838
-
839
- lines.append(
840
- "\n**Usage:**\n"
841
- "To use a specific device, pass the index:\n"
842
- ' speech_to_speech(action="start", input_device_index=2, output_device_index=5)'
843
- )
844
-
845
- return "\n".join(lines)
846
-
847
- except ImportError:
848
- return "❌ PyAudio not installed. Install with: pip install pyaudio"
849
- except Exception as e:
850
- return f"❌ Error listing audio devices: {e}"