solana-agent 20.1.2-py3-none-any.whl → 31.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. solana_agent/__init__.py +10 -5
  2. solana_agent/adapters/ffmpeg_transcoder.py +375 -0
  3. solana_agent/adapters/mongodb_adapter.py +15 -2
  4. solana_agent/adapters/openai_adapter.py +679 -0
  5. solana_agent/adapters/openai_realtime_ws.py +1813 -0
  6. solana_agent/adapters/pinecone_adapter.py +543 -0
  7. solana_agent/cli.py +128 -0
  8. solana_agent/client/solana_agent.py +180 -20
  9. solana_agent/domains/agent.py +13 -13
  10. solana_agent/domains/routing.py +18 -8
  11. solana_agent/factories/agent_factory.py +239 -38
  12. solana_agent/guardrails/pii.py +107 -0
  13. solana_agent/interfaces/client/client.py +95 -12
  14. solana_agent/interfaces/guardrails/guardrails.py +26 -0
  15. solana_agent/interfaces/plugins/plugins.py +2 -1
  16. solana_agent/interfaces/providers/__init__.py +0 -0
  17. solana_agent/interfaces/providers/audio.py +40 -0
  18. solana_agent/interfaces/providers/data_storage.py +9 -2
  19. solana_agent/interfaces/providers/llm.py +86 -9
  20. solana_agent/interfaces/providers/memory.py +13 -1
  21. solana_agent/interfaces/providers/realtime.py +212 -0
  22. solana_agent/interfaces/providers/vector_storage.py +53 -0
  23. solana_agent/interfaces/services/agent.py +27 -12
  24. solana_agent/interfaces/services/knowledge_base.py +59 -0
  25. solana_agent/interfaces/services/query.py +41 -8
  26. solana_agent/interfaces/services/routing.py +0 -1
  27. solana_agent/plugins/manager.py +37 -16
  28. solana_agent/plugins/registry.py +34 -19
  29. solana_agent/plugins/tools/__init__.py +0 -5
  30. solana_agent/plugins/tools/auto_tool.py +1 -0
  31. solana_agent/repositories/memory.py +332 -111
  32. solana_agent/services/__init__.py +1 -1
  33. solana_agent/services/agent.py +390 -241
  34. solana_agent/services/knowledge_base.py +768 -0
  35. solana_agent/services/query.py +1858 -153
  36. solana_agent/services/realtime.py +626 -0
  37. solana_agent/services/routing.py +104 -51
  38. solana_agent-31.4.0.dist-info/METADATA +1070 -0
  39. solana_agent-31.4.0.dist-info/RECORD +49 -0
  40. {solana_agent-20.1.2.dist-info → solana_agent-31.4.0.dist-info}/WHEEL +1 -1
  41. solana_agent-31.4.0.dist-info/entry_points.txt +3 -0
  42. solana_agent/adapters/llm_adapter.py +0 -160
  43. solana_agent-20.1.2.dist-info/METADATA +0 -464
  44. solana_agent-20.1.2.dist-info/RECORD +0 -35
  45. {solana_agent-20.1.2.dist-info → solana_agent-31.4.0.dist-info/licenses}/LICENSE +0 -0
solana_agent/services/realtime.py
@@ -0,0 +1,626 @@
+ from __future__ import annotations
+
+ import asyncio
+ import logging
+ from typing import Any, AsyncGenerator, Dict, Optional
+
+ from solana_agent.interfaces.providers.realtime import (
+     BaseRealtimeSession,
+     RealtimeSessionOptions,
+     RealtimeChunk,
+ )
+ from solana_agent.interfaces.providers.audio import AudioTranscoder
+
+ logger = logging.getLogger(__name__)
+
+
+ class RealtimeService:
+     """High-level service to manage a realtime audio session.
+
+     Responsibilities:
+     - Connect/close a realtime session (WebSocket-based)
+     - Update voice and VAD at runtime via session.update
+     - Append/commit/clear input audio buffers
+     - Expose separate async generators for audio and input/output transcripts
+     - Allow out-of-band response.create (e.g., text-to-speech without new audio)
+     """
+
+     def __init__(
+         self,
+         session: BaseRealtimeSession,
+         options: Optional[RealtimeSessionOptions] = None,
+         transcoder: Optional[AudioTranscoder] = None,
+         accept_compressed_input: bool = False,
+         client_input_mime: str = "audio/mp4",
+         encode_output: bool = False,
+         client_output_mime: str = "audio/aac",
+     ) -> None:
+         self._session = session
+         self._options = options or RealtimeSessionOptions()
+         self._connected = False
+         self._lock = asyncio.Lock()
+         self._transcoder = transcoder
+         # Client-side transport controls (do not affect OpenAI session formats)
+         self._accept_compressed_input = accept_compressed_input
+         self._client_input_mime = client_input_mime
+         self._encode_output = encode_output
+         self._client_output_mime = client_output_mime
+
+     async def start(self) -> None:  # pragma: no cover
+         async with self._lock:
+             if self._connected:
+                 return
+             logger.info("RealtimeService: starting session")
+             await self._session.connect()
+             self._connected = True
+
+     async def stop(self) -> None:  # pragma: no cover
+         async with self._lock:
+             if not self._connected:
+                 return
+             logger.info("RealtimeService: stopping session")
+             await self._session.close()
+             self._connected = False
+
+     # --- Configuration ---
+     async def configure(
+         self,
+         *,
+         voice: Optional[str] = None,
+         vad_enabled: Optional[bool] = None,
+         instructions: Optional[str] = None,
+         input_rate_hz: Optional[int] = None,
+         output_rate_hz: Optional[int] = None,
+         input_mime: Optional[str] = None,
+         output_mime: Optional[str] = None,
+         tools: Optional[list[dict[str, Any]]] = None,
+         tool_choice: Optional[str] = None,
+     ) -> None:  # pragma: no cover
+         """Update session settings (voice, VAD, formats, tools)."""
+         patch: Dict[str, Any] = {}
+
+         audio_patch: Dict[str, Any] = {}
+         if input_mime or input_rate_hz is not None or vad_enabled is not None:
+             turn_detection = None
+             if vad_enabled is not None:
+                 if vad_enabled:
+                     turn_detection = {
+                         "type": "semantic_vad",
+                         "create_response": True,
+                     }
+                 else:
+                     turn_detection = None
+             audio_patch["input"] = {
+                 "format": "pcm16",  # session is fixed to PCM16 server-side
+                 "turn_detection": turn_detection,
+             }
+
+         if output_mime or output_rate_hz is not None or voice is not None:
+             # Only configure audio output if audio is in the output modalities
+             modalities = (
+                 self._options.output_modalities
+                 if self._options.output_modalities is not None
+                 else ["audio"]
+             )
+             if "audio" in modalities:
+                 audio_patch["output"] = {
+                     "format": "pcm16",  # session is fixed to PCM16 server-side
+                     "voice": voice or self._options.voice,
+                     "speed": 1.0,
+                 }
+
+         if audio_patch:
+             patch["audio"] = audio_patch
+
+         if instructions is not None:
+             patch["instructions"] = instructions
+         if tools is not None:
+             patch["tools"] = tools
+         if tool_choice is not None:
+             patch["tool_choice"] = tool_choice
+
+         if patch:
+             logger.debug("RealtimeService.configure patch: %s", patch)
+             await self._session.update_session(patch)
+
+         # Update local options snapshot
+         if voice is not None:
+             self._options.voice = voice
+         if vad_enabled is not None:
+             self._options.vad_enabled = vad_enabled
+         if instructions is not None:
+             self._options.instructions = instructions
+         if input_rate_hz is not None:
+             self._options.input_rate_hz = input_rate_hz
+         if output_rate_hz is not None:
+             self._options.output_rate_hz = output_rate_hz
+         if input_mime is not None:
+             self._options.input_mime = input_mime
+         if output_mime is not None:
+             self._options.output_mime = output_mime
+         if tools is not None:
+             self._options.tools = tools
+         if tool_choice is not None:
+             self._options.tool_choice = tool_choice
+
+     # --- Audio input ---
+     async def append_audio(self, chunk_bytes: bytes) -> None:  # pragma: no cover
+         """Accepts PCM16 by default; if accept_compressed_input is True, transcodes client audio to PCM16.
+
+         This keeps the server session configured for PCM while allowing mobile clients to send MP4/AAC.
+         """
+         logger.debug(
+             "RealtimeService.append_audio: len=%d, accept_compressed_input=%s, client_input_mime=%s",
+             len(chunk_bytes),
+             self._accept_compressed_input,
+             self._client_input_mime,
+         )
+         if self._accept_compressed_input:
+             if not self._transcoder:
+                 raise ValueError(
+                     "Compressed input enabled but no transcoder configured"
+                 )
+             pcm16 = await self._transcoder.to_pcm16(
+                 chunk_bytes, self._client_input_mime, self._options.input_rate_hz
+             )
+             await self._session.append_audio(pcm16)
+             logger.debug("RealtimeService.append_audio: sent PCM16 len=%d", len(pcm16))
+             return
+         # Default: pass-through PCM16
+         await self._session.append_audio(chunk_bytes)
+         logger.debug(
+             "RealtimeService.append_audio: sent passthrough len=%d", len(chunk_bytes)
+         )
+
+     async def commit_input(self) -> None:  # pragma: no cover
+         logger.debug("RealtimeService.commit_input")
+         await self._session.commit_input()
+
+     async def clear_input(self) -> None:  # pragma: no cover
+         logger.debug("RealtimeService.clear_input")
+         await self._session.clear_input()
+
+     # --- Out-of-band response (e.g., TTS without new audio) ---
+     async def create_conversation_item(
+         self, item: Dict[str, Any]
+     ) -> None:  # pragma: no cover
+         """Create a conversation item (e.g., for text input)."""
+         await self._session.create_conversation_item(item)
+
+     async def create_response(  # pragma: no cover
+         self, response_patch: Optional[Dict[str, Any]] = None
+     ) -> None:
+         await self._session.create_response(response_patch)
+
+     # --- Streams ---
+     def iter_events(self) -> AsyncGenerator[Dict[str, Any], None]:  # pragma: no cover
+         return self._session.iter_events()
+
+     def iter_output_audio(self) -> AsyncGenerator[bytes, None]:  # pragma: no cover
+         return self._session.iter_output_audio()
+
+     def reset_output_stream(self) -> None:  # pragma: no cover
+         try:
+             if hasattr(self._session, "reset_output_stream"):
+                 self._session.reset_output_stream()
+         except Exception:
+             pass
+
+     async def iter_output_audio_encoded(
+         self,
+     ) -> AsyncGenerator[RealtimeChunk, None]:  # pragma: no cover
+         """Stream PCM16 audio as RealtimeChunk objects, tolerating long tool executions by waiting while calls are pending.
+
+         - If no audio arrives immediately, we keep waiting as long as a function/tool call is pending.
+         - Bridge across multiple audio segments (e.g., pre-call and post-call responses).
+         - Only end the stream when no audio is available and no pending tool call remains.
+         """
+
+         def _has_pending_tool() -> bool:
+             try:
+                 return bool(
+                     getattr(self._session, "has_pending_tool_call", lambda: False)()
+                 )
+             except Exception:
+                 return False
+
+         async def _produce_pcm():
+             max_wait_pending_sec = 600.0  # allow up to 10 minutes while tools run
+             waited_while_pending = 0.0
+             base_idle_timeout = 12.0
+             idle_slice = 1.0
+
+             while True:
+                 gen = self._session.iter_output_audio()
+                 try:
+                     # Inner loop for one segment until generator ends
+                     while True:
+                         try:
+                             chunk = await asyncio.wait_for(
+                                 gen.__anext__(), timeout=idle_slice
+                             )
+                         except asyncio.TimeoutError:
+                             if _has_pending_tool():
+                                 waited_while_pending += idle_slice
+                                 if waited_while_pending <= max_wait_pending_sec:
+                                     continue
+                                 else:
+                                     logger.warning(
+                                         "RealtimeService: exceeded max pending-tool wait; ending stream"
+                                     )
+                                     return
+                             else:
+                                 # No pending tool: accumulate idle time; stop after base timeout
+                                 waited_while_pending += idle_slice
+                                 if waited_while_pending >= base_idle_timeout:
+                                     logger.warning(
+                                         "RealtimeService: idle with no pending tool; ending stream"
+                                     )
+                                     return
+                                 continue
+                         # Got a chunk; reset idle counter and yield
+                         waited_while_pending = 0.0
+                         if not chunk:
+                             continue
+                         yield chunk
+                 except StopAsyncIteration:
+                     # Segment ended; if a tool is pending, continue to next segment
+                     if _has_pending_tool():
+                         await asyncio.sleep(0.25)
+                         continue
+                     # Otherwise, no more audio segments expected
+                     return
+
+         if self._encode_output and self._transcoder:
+             async for out in self._transcoder.stream_from_pcm16(
+                 _produce_pcm(), self._client_output_mime, self._options.output_rate_hz
+             ):
+                 yield RealtimeChunk(modality="audio", data=out)
+         else:
+             async for chunk in _produce_pcm():
+                 yield RealtimeChunk(modality="audio", data=chunk)
+
+     async def iter_output_combined(
+         self,
+     ) -> AsyncGenerator[RealtimeChunk, None]:  # pragma: no cover
+         """Stream both audio and text chunks as RealtimeChunk objects.
+
+         This method combines audio and text streams when both modalities are enabled.
+         Audio chunks are yielded as they arrive, and text chunks are yielded as transcript deltas arrive.
+         """
+
+         # Determine which modalities to stream based on session options
+         modalities = (
+             self._options.output_modalities
+             if self._options.output_modalities is not None
+             else ["audio"]
+         )
+         should_stream_audio = "audio" in modalities
+         should_stream_text = "text" in modalities
+
+         if not should_stream_audio and not should_stream_text:
+             return  # No modalities requested
+
+         # Create tasks for both streams if needed
+         tasks = []
+         queues = []
+
+         if should_stream_audio:
+             audio_queue = asyncio.Queue()
+             queues.append(audio_queue)
+
+             async def _collect_audio():
+                 try:
+                     async for chunk in self.iter_output_audio_encoded():
+                         await audio_queue.put(chunk)
+                 finally:
+                     await audio_queue.put(None)  # Sentinel
+
+             tasks.append(asyncio.create_task(_collect_audio()))
+
+         if should_stream_text:
+             text_queue = asyncio.Queue()
+             queues.append(text_queue)
+
+             async def _collect_text():
+                 try:
+                     async for text_chunk in self.iter_output_transcript():
+                         if text_chunk:  # Only yield non-empty text chunks
+                             await text_queue.put(
+                                 RealtimeChunk(modality="text", data=text_chunk)
+                             )
+                 finally:
+                     await text_queue.put(None)  # Sentinel
+
+             tasks.append(asyncio.create_task(_collect_text()))
+
+         try:
+             # Collect chunks from all queues
+             active_queues = len(queues)
+
+             while active_queues > 0:
+                 for queue in queues:
+                     try:
+                         chunk = queue.get_nowait()
+                         if chunk is None:
+                             active_queues -= 1
+                         else:
+                             yield chunk
+                     except asyncio.QueueEmpty:
+                         continue
+
+                 # Small delay to prevent busy waiting
+                 if active_queues > 0:
+                     await asyncio.sleep(0.01)
+
+         finally:
+             # Cancel all tasks
+             for task in tasks:
+                 if not task.done():
+                     task.cancel()
+
+     def iter_input_transcript(self) -> AsyncGenerator[str, None]:  # pragma: no cover
+         return self._session.iter_input_transcript()
+
+     def iter_output_transcript(self) -> AsyncGenerator[str, None]:  # pragma: no cover
+         return self._session.iter_output_transcript()
+
+
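An aside, not part of the diff: a minimal sketch of driving the RealtimeService class above through one manual turn, assuming it was constructed with a concrete, connectable BaseRealtimeSession. The mic_chunks input is a stand-in for client-captured PCM16 frames; everything else uses only methods defined above.

    from solana_agent.services.realtime import RealtimeService

    async def run_turn(service: RealtimeService, mic_chunks: list[bytes]) -> bytes:
        await service.start()                        # connect the underlying session
        await service.configure(vad_enabled=False)   # manual turn-taking for this sketch
        for frame in mic_chunks:                     # PCM16 frames from the client
            await service.append_audio(frame)
        await service.commit_input()                 # commit manually since VAD is off
        await service.create_response()
        audio = bytearray()
        async for chunk in service.iter_output_audio_encoded():
            audio.extend(chunk.data)                 # PCM16, or AAC if encode_output=True
        return bytes(audio)

With VAD enabled instead, the commit and response creation happen server-side, so the loop would reduce to appending audio and draining iter_output_audio_encoded().
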
+ class TwinRealtimeService:
+     """Orchestrates two realtime sessions in parallel:
+
+     - conversation: full duplex (audio out + assistant transcript, tools, etc.)
+     - transcription: transcription-only session per GA (input transcript deltas)
+
+     Audio input is fanned out to both sessions. Output audio is sourced from the
+     conversation session only. Input transcript is sourced from the transcription
+     session only. This aligns with the GA guidance to use a dedicated
+     transcription session for reliable realtime STT, while the conversation
+     session handles assistant speech.
+     """
+
+     def __init__(
+         self,
+         conversation: BaseRealtimeSession,
+         transcription: BaseRealtimeSession,
+         *,
+         conv_options: Optional[RealtimeSessionOptions] = None,
+         trans_options: Optional[RealtimeSessionOptions] = None,
+         transcoder: Optional[AudioTranscoder] = None,
+         accept_compressed_input: bool = False,
+         client_input_mime: str = "audio/mp4",
+         encode_output: bool = False,
+         client_output_mime: str = "audio/aac",
+     ) -> None:
+         self._conv = conversation
+         self._trans = transcription
+         self._conv_opts = conv_options or RealtimeSessionOptions()
+         self._trans_opts = trans_options or RealtimeSessionOptions()
+         self._transcoder = transcoder
+         self._accept_compressed_input = accept_compressed_input
+         self._client_input_mime = client_input_mime
+         self._encode_output = encode_output
+         self._client_output_mime = client_output_mime
+         self._connected = False
+         self._lock = asyncio.Lock()
+
+     async def start(self) -> None:  # pragma: no cover
+         async with self._lock:
+             if self._connected:
+                 return
+             logger.info("TwinRealtimeService: starting conversation + transcription")
+             await asyncio.gather(self._conv.connect(), self._trans.connect())
+             self._connected = True
+
+     async def stop(self) -> None:  # pragma: no cover
+         async with self._lock:
+             if not self._connected:
+                 return
+             logger.info("TwinRealtimeService: stopping both sessions")
+             try:
+                 await asyncio.gather(self._conv.close(), self._trans.close())
+             finally:
+                 self._connected = False
+
+     async def reconnect(self) -> None:  # pragma: no cover
+         async with self._lock:
+             try:
+                 await asyncio.gather(self._conv.close(), self._trans.close())
+             except Exception:
+                 pass
+             self._connected = False
+         await self.start()
+
+     async def configure(
+         self,
+         *,
+         voice: Optional[str] = None,
+         vad_enabled: Optional[bool] = None,
+         instructions: Optional[str] = None,
+         input_rate_hz: Optional[int] = None,
+         output_rate_hz: Optional[int] = None,
+         input_mime: Optional[str] = None,
+         output_mime: Optional[str] = None,
+         tools: Optional[list[dict[str, Any]]] = None,
+         tool_choice: Optional[str] = None,
+     ) -> None:  # pragma: no cover
+         # Only the conversation session needs voice/tools; transcription session
+         # already has its own VAD model configured at connect-time.
+         patch: Dict[str, Any] = {}
+         audio_patch: Dict[str, Any] = {}
+         if (
+             vad_enabled is not None
+             or input_rate_hz is not None
+             or input_mime is not None
+         ):
+             turn_detection = None
+             if vad_enabled is not None:
+                 if vad_enabled:
+                     turn_detection = {"type": "semantic_vad", "create_response": True}
+                 else:
+                     turn_detection = None
+             audio_patch["input"] = {"format": "pcm16", "turn_detection": turn_detection}
+         if output_rate_hz is not None or output_mime is not None or voice is not None:
+             # Only configure audio output if audio is in the output modalities
+             modalities = (
+                 self._conv_opts.output_modalities
+                 if self._conv_opts.output_modalities is not None
+                 else ["audio"]
+             )
+             if "audio" in modalities:
+                 audio_patch["output"] = {
+                     "format": "pcm16",
+                     "voice": voice or self._conv_opts.voice,
+                     "speed": 1.0,
+                 }
+         if audio_patch:
+             patch["audio"] = audio_patch
+         if instructions is not None:
+             patch["instructions"] = instructions
+         if tools is not None:
+             patch["tools"] = tools
+         if tool_choice is not None:
+             patch["tool_choice"] = tool_choice
+
+         if patch:
+             logger.debug("TwinRealtimeService.configure patch (conv): %s", patch)
+             await self._conv.update_session(patch)
+
+         # Update local snapshots
+         if voice is not None:
+             self._conv_opts.voice = voice
+         if vad_enabled is not None:
+             self._conv_opts.vad_enabled = vad_enabled
+             self._trans_opts.vad_enabled = vad_enabled
+         if instructions is not None:
+             self._conv_opts.instructions = instructions
+         if input_rate_hz is not None:
+             self._conv_opts.input_rate_hz = input_rate_hz
+             self._trans_opts.input_rate_hz = input_rate_hz
+         if output_rate_hz is not None:
+             self._conv_opts.output_rate_hz = output_rate_hz
+         if input_mime is not None:
+             self._conv_opts.input_mime = input_mime
+             self._trans_opts.input_mime = input_mime
+         if output_mime is not None:
+             self._conv_opts.output_mime = output_mime
+         if tools is not None:
+             self._conv_opts.tools = tools
+         if tool_choice is not None:
+             self._conv_opts.tool_choice = tool_choice
+
+     async def append_audio(self, chunk_bytes: bytes) -> None:  # pragma: no cover
+         # Transcode once if needed, then fan out to both
+         if self._accept_compressed_input:
+             if not self._transcoder:
+                 raise ValueError(
+                     "Compressed input enabled but no transcoder configured"
+                 )
+             pcm16 = await self._transcoder.to_pcm16(
+                 chunk_bytes, self._client_input_mime, self._conv_opts.input_rate_hz
+             )
+             await asyncio.gather(
+                 self._conv.append_audio(pcm16), self._trans.append_audio(pcm16)
+             )
+             return
+         await asyncio.gather(
+             self._conv.append_audio(chunk_bytes),
+             self._trans.append_audio(chunk_bytes),
+         )
+
+     async def commit_input(self) -> None:  # pragma: no cover
+         await asyncio.gather(self._conv.commit_input(), self._trans.commit_input())
+
+     async def commit_conversation(self) -> None:  # pragma: no cover
+         await self._conv.commit_input()
+
+     async def commit_transcription(self) -> None:  # pragma: no cover
+         await self._trans.commit_input()
+
+     async def clear_input(self) -> None:  # pragma: no cover
+         await asyncio.gather(self._conv.clear_input(), self._trans.clear_input())
+
+     async def create_conversation_item(
+         self, item: Dict[str, Any]
+     ) -> None:  # pragma: no cover
+         """Create a conversation item (e.g., for text input)."""
+         await self._conv.create_conversation_item(item)
+
+     async def create_response(
+         self, response_patch: Optional[Dict[str, Any]] = None
+     ) -> None:  # pragma: no cover
+         # Only conversation session creates assistant responses
+         await self._conv.create_response(response_patch)
+
+     # --- Streams ---
+     def iter_events(self) -> AsyncGenerator[Dict[str, Any], None]:  # pragma: no cover
+         # Prefer conversation events; caller can listen to transcription via iter_input_transcript
+         return self._conv.iter_events()
+
+     def iter_output_audio(self) -> AsyncGenerator[bytes, None]:  # pragma: no cover
+         return self._conv.iter_output_audio()
+
+     def reset_output_stream(self) -> None:  # pragma: no cover
+         try:
+             if hasattr(self._conv, "reset_output_stream"):
+                 self._conv.reset_output_stream()
+         except Exception:
+             pass
+
+     async def iter_output_audio_encoded(
+         self,
+     ) -> AsyncGenerator[RealtimeChunk, None]:  # pragma: no cover
+         # Reuse the same encoding pipeline as RealtimeService but source from conversation
+         pcm_gen = self._conv.iter_output_audio()
+
+         try:
+             first_chunk = await asyncio.wait_for(pcm_gen.__anext__(), timeout=12.0)
+         except StopAsyncIteration:
+             logger.warning("TwinRealtimeService: no PCM produced (ended immediately)")
+             return
+         except asyncio.TimeoutError:
+             logger.warning("TwinRealtimeService: no PCM within timeout; closing conv")
+             try:
+                 # Close both sessions to ensure clean restart on next turn
+                 await asyncio.gather(self._conv.close(), self._trans.close())
+                 self._connected = False
+             except Exception:
+                 pass
+             return
+
+         async def _pcm_iter():
+             if first_chunk:
+                 yield first_chunk
+             async for c in pcm_gen:
+                 if not c:
+                     continue
+                 yield c
+
+         if self._encode_output and self._transcoder:
+             async for out in self._transcoder.stream_from_pcm16(
+                 _pcm_iter(), self._client_output_mime, self._conv_opts.output_rate_hz
+             ):
+                 yield RealtimeChunk(modality="audio", data=out)
+         else:
+             async for chunk in _pcm_iter():
+                 yield RealtimeChunk(modality="audio", data=chunk)
+
+     def iter_input_transcript(self) -> AsyncGenerator[str, None]:  # pragma: no cover
+         return self._trans.iter_input_transcript()
+
+     def iter_output_transcript(self) -> AsyncGenerator[str, None]:  # pragma: no cover
+         return self._conv.iter_output_transcript()
+
+     def iter_transcription_events(
+         self,
+     ) -> AsyncGenerator[Dict[str, Any], None]:  # pragma: no cover
+         # Expose transcription session events for completion detection
+         return self._trans.iter_events()
+
+     def is_connected(self) -> bool:  # pragma: no cover
+         return self._connected
+
+     def set_tool_executor(self, executor) -> None:  # pragma: no cover
+         # Forward to conversation session (tools only apply there)
+         if hasattr(self._conv, "set_tool_executor"):
+             self._conv.set_tool_executor(executor)
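A similar sketch for TwinRealtimeService, illustrating the split described in its docstring: assistant audio is drained from the conversation session while user-transcript deltas arrive from the dedicated transcription session. Construction of the two BaseRealtimeSession instances is elided here; captions are simply collected into a list.

    import asyncio

    from solana_agent.services.realtime import TwinRealtimeService

    async def pump(twin: TwinRealtimeService) -> tuple[bytes, list[str]]:
        await twin.start()                           # connects both sessions in parallel
        audio = bytearray()
        captions: list[str] = []

        async def drain_audio() -> None:
            # Assistant speech is sourced from the conversation session only
            async for chunk in twin.iter_output_audio_encoded():
                audio.extend(chunk.data)

        async def drain_captions() -> None:
            # User transcript deltas are sourced from the transcription session only
            async for delta in twin.iter_input_transcript():
                captions.append(delta)

        await asyncio.gather(drain_audio(), drain_captions())
        return bytes(audio), captions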