solana-agent 20.1.2-py3-none-any.whl → 31.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- solana_agent/__init__.py +10 -5
- solana_agent/adapters/ffmpeg_transcoder.py +375 -0
- solana_agent/adapters/mongodb_adapter.py +15 -2
- solana_agent/adapters/openai_adapter.py +679 -0
- solana_agent/adapters/openai_realtime_ws.py +1813 -0
- solana_agent/adapters/pinecone_adapter.py +543 -0
- solana_agent/cli.py +128 -0
- solana_agent/client/solana_agent.py +180 -20
- solana_agent/domains/agent.py +13 -13
- solana_agent/domains/routing.py +18 -8
- solana_agent/factories/agent_factory.py +239 -38
- solana_agent/guardrails/pii.py +107 -0
- solana_agent/interfaces/client/client.py +95 -12
- solana_agent/interfaces/guardrails/guardrails.py +26 -0
- solana_agent/interfaces/plugins/plugins.py +2 -1
- solana_agent/interfaces/providers/__init__.py +0 -0
- solana_agent/interfaces/providers/audio.py +40 -0
- solana_agent/interfaces/providers/data_storage.py +9 -2
- solana_agent/interfaces/providers/llm.py +86 -9
- solana_agent/interfaces/providers/memory.py +13 -1
- solana_agent/interfaces/providers/realtime.py +212 -0
- solana_agent/interfaces/providers/vector_storage.py +53 -0
- solana_agent/interfaces/services/agent.py +27 -12
- solana_agent/interfaces/services/knowledge_base.py +59 -0
- solana_agent/interfaces/services/query.py +41 -8
- solana_agent/interfaces/services/routing.py +0 -1
- solana_agent/plugins/manager.py +37 -16
- solana_agent/plugins/registry.py +34 -19
- solana_agent/plugins/tools/__init__.py +0 -5
- solana_agent/plugins/tools/auto_tool.py +1 -0
- solana_agent/repositories/memory.py +332 -111
- solana_agent/services/__init__.py +1 -1
- solana_agent/services/agent.py +390 -241
- solana_agent/services/knowledge_base.py +768 -0
- solana_agent/services/query.py +1858 -153
- solana_agent/services/realtime.py +626 -0
- solana_agent/services/routing.py +104 -51
- solana_agent-31.4.0.dist-info/METADATA +1070 -0
- solana_agent-31.4.0.dist-info/RECORD +49 -0
- {solana_agent-20.1.2.dist-info → solana_agent-31.4.0.dist-info}/WHEEL +1 -1
- solana_agent-31.4.0.dist-info/entry_points.txt +3 -0
- solana_agent/adapters/llm_adapter.py +0 -160
- solana_agent-20.1.2.dist-info/METADATA +0 -464
- solana_agent-20.1.2.dist-info/RECORD +0 -35
- {solana_agent-20.1.2.dist-info → solana_agent-31.4.0.dist-info/licenses}/LICENSE +0 -0
solana_agent/services/realtime.py

--- /dev/null
+++ b/solana_agent/services/realtime.py
@@ -0,0 +1,626 @@
+from __future__ import annotations
+
+import asyncio
+import logging
+from typing import Any, AsyncGenerator, Dict, Optional
+
+from solana_agent.interfaces.providers.realtime import (
+    BaseRealtimeSession,
+    RealtimeSessionOptions,
+    RealtimeChunk,
+)
+from solana_agent.interfaces.providers.audio import AudioTranscoder
+
+logger = logging.getLogger(__name__)
+
+
+class RealtimeService:
+    """High-level service to manage a realtime audio session.
+
+    Responsibilities:
+    - Connect/close a realtime session (WebSocket-based)
+    - Update voice and VAD at runtime via session.update
+    - Append/commit/clear input audio buffers
+    - Expose separate async generators for audio and input/output transcripts
+    - Allow out-of-band response.create (e.g., text-to-speech without new audio)
+    """
+
+    def __init__(
+        self,
+        session: BaseRealtimeSession,
+        options: Optional[RealtimeSessionOptions] = None,
+        transcoder: Optional[AudioTranscoder] = None,
+        accept_compressed_input: bool = False,
+        client_input_mime: str = "audio/mp4",
+        encode_output: bool = False,
+        client_output_mime: str = "audio/aac",
+    ) -> None:
+        self._session = session
+        self._options = options or RealtimeSessionOptions()
+        self._connected = False
+        self._lock = asyncio.Lock()
+        self._transcoder = transcoder
+        # Client-side transport controls (do not affect OpenAI session formats)
+        self._accept_compressed_input = accept_compressed_input
+        self._client_input_mime = client_input_mime
+        self._encode_output = encode_output
+        self._client_output_mime = client_output_mime
+
+    async def start(self) -> None:  # pragma: no cover
+        async with self._lock:
+            if self._connected:
+                return
+            logger.info("RealtimeService: starting session")
+            await self._session.connect()
+            self._connected = True
+
+    async def stop(self) -> None:  # pragma: no cover
+        async with self._lock:
+            if not self._connected:
+                return
+            logger.info("RealtimeService: stopping session")
+            await self._session.close()
+            self._connected = False
+
+    # --- Configuration ---
+    async def configure(
+        self,
+        *,
+        voice: Optional[str] = None,
+        vad_enabled: Optional[bool] = None,
+        instructions: Optional[str] = None,
+        input_rate_hz: Optional[int] = None,
+        output_rate_hz: Optional[int] = None,
+        input_mime: Optional[str] = None,
+        output_mime: Optional[str] = None,
+        tools: Optional[list[dict[str, Any]]] = None,
+        tool_choice: Optional[str] = None,
+    ) -> None:  # pragma: no cover
+        """Update session settings (voice, VAD, formats, tools)."""
+        patch: Dict[str, Any] = {}
+
+        audio_patch: Dict[str, Any] = {}
+        if input_mime or input_rate_hz is not None or vad_enabled is not None:
+            turn_detection = None
+            if vad_enabled is not None:
+                if vad_enabled:
+                    turn_detection = {
+                        "type": "semantic_vad",
+                        "create_response": True,
+                    }
+                else:
+                    turn_detection = None
+            audio_patch["input"] = {
+                "format": "pcm16",  # session is fixed to PCM16 server-side
+                "turn_detection": turn_detection,
+            }
+
+        if output_mime or output_rate_hz is not None or voice is not None:
+            # Only configure audio output if audio is in the output modalities
+            modalities = (
+                self._options.output_modalities
+                if self._options.output_modalities is not None
+                else ["audio"]
+            )
+            if "audio" in modalities:
+                audio_patch["output"] = {
+                    "format": "pcm16",  # session is fixed to PCM16 server-side
+                    "voice": voice or self._options.voice,
+                    "speed": 1.0,
+                }
+
+        if audio_patch:
+            patch["audio"] = audio_patch
+
+        if instructions is not None:
+            patch["instructions"] = instructions
+        if tools is not None:
+            patch["tools"] = tools
+        if tool_choice is not None:
+            patch["tool_choice"] = tool_choice
+
+        if patch:
+            logger.debug("RealtimeService.configure patch: %s", patch)
+            await self._session.update_session(patch)
+
+        # Update local options snapshot
+        if voice is not None:
+            self._options.voice = voice
+        if vad_enabled is not None:
+            self._options.vad_enabled = vad_enabled
+        if instructions is not None:
+            self._options.instructions = instructions
+        if input_rate_hz is not None:
+            self._options.input_rate_hz = input_rate_hz
+        if output_rate_hz is not None:
+            self._options.output_rate_hz = output_rate_hz
+        if input_mime is not None:
+            self._options.input_mime = input_mime
+        if output_mime is not None:
+            self._options.output_mime = output_mime
+        if tools is not None:
+            self._options.tools = tools
+        if tool_choice is not None:
+            self._options.tool_choice = tool_choice
+
+    # --- Audio input ---
+    async def append_audio(self, chunk_bytes: bytes) -> None:  # pragma: no cover
+        """Accepts PCM16 by default; if accept_compressed_input is True, transcodes client audio to PCM16.
+
+        This keeps the server session configured for PCM while allowing mobile clients to send MP4/AAC.
+        """
+        logger.debug(
+            "RealtimeService.append_audio: len=%d, accept_compressed_input=%s, client_input_mime=%s",
+            len(chunk_bytes),
+            self._accept_compressed_input,
+            self._client_input_mime,
+        )
+        if self._accept_compressed_input:
+            if not self._transcoder:
+                raise ValueError(
+                    "Compressed input enabled but no transcoder configured"
+                )
+            pcm16 = await self._transcoder.to_pcm16(
+                chunk_bytes, self._client_input_mime, self._options.input_rate_hz
+            )
+            await self._session.append_audio(pcm16)
+            logger.debug("RealtimeService.append_audio: sent PCM16 len=%d", len(pcm16))
+            return
+        # Default: pass-through PCM16
+        await self._session.append_audio(chunk_bytes)
+        logger.debug(
+            "RealtimeService.append_audio: sent passthrough len=%d", len(chunk_bytes)
+        )
+
+    async def commit_input(self) -> None:  # pragma: no cover
+        logger.debug("RealtimeService.commit_input")
+        await self._session.commit_input()
+
+    async def clear_input(self) -> None:  # pragma: no cover
+        logger.debug("RealtimeService.clear_input")
+        await self._session.clear_input()
+
+    # --- Out-of-band response (e.g., TTS without new audio) ---
+    async def create_conversation_item(
+        self, item: Dict[str, Any]
+    ) -> None:  # pragma: no cover
+        """Create a conversation item (e.g., for text input)."""
+        await self._session.create_conversation_item(item)
+
+    async def create_response(  # pragma: no cover
+        self, response_patch: Optional[Dict[str, Any]] = None
+    ) -> None:
+        await self._session.create_response(response_patch)
+
+    # --- Streams ---
+    def iter_events(self) -> AsyncGenerator[Dict[str, Any], None]:  # pragma: no cover
+        return self._session.iter_events()
+
+    def iter_output_audio(self) -> AsyncGenerator[bytes, None]:  # pragma: no cover
+        return self._session.iter_output_audio()
+
+    def reset_output_stream(self) -> None:  # pragma: no cover
+        try:
+            if hasattr(self._session, "reset_output_stream"):
+                self._session.reset_output_stream()
+        except Exception:
+            pass
+
+    async def iter_output_audio_encoded(
+        self,
+    ) -> AsyncGenerator[RealtimeChunk, None]:  # pragma: no cover
+        """Stream PCM16 audio as RealtimeChunk objects, tolerating long tool executions by waiting while calls are pending.
+
+        - If no audio arrives immediately, we keep waiting as long as a function/tool call is pending.
+        - Bridge across multiple audio segments (e.g., pre-call and post-call responses).
+        - Only end the stream when no audio is available and no pending tool call remains.
+        """
+
+        def _has_pending_tool() -> bool:
+            try:
+                return bool(
+                    getattr(self._session, "has_pending_tool_call", lambda: False)()
+                )
+            except Exception:
+                return False
+
+        async def _produce_pcm():
+            max_wait_pending_sec = 600.0  # allow up to 10 minutes while tools run
+            waited_while_pending = 0.0
+            base_idle_timeout = 12.0
+            idle_slice = 1.0
+
+            while True:
+                gen = self._session.iter_output_audio()
+                try:
+                    # Inner loop for one segment until generator ends
+                    while True:
+                        try:
+                            chunk = await asyncio.wait_for(
+                                gen.__anext__(), timeout=idle_slice
+                            )
+                        except asyncio.TimeoutError:
+                            if _has_pending_tool():
+                                waited_while_pending += idle_slice
+                                if waited_while_pending <= max_wait_pending_sec:
+                                    continue
+                                else:
+                                    logger.warning(
+                                        "RealtimeService: exceeded max pending-tool wait; ending stream"
+                                    )
+                                    return
+                            else:
+                                # No pending tool: accumulate idle time; stop after base timeout
+                                waited_while_pending += idle_slice
+                                if waited_while_pending >= base_idle_timeout:
+                                    logger.warning(
+                                        "RealtimeService: idle with no pending tool; ending stream"
+                                    )
+                                    return
+                                continue
+                        # Got a chunk; reset idle counter and yield
+                        waited_while_pending = 0.0
+                        if not chunk:
+                            continue
+                        yield chunk
+                except StopAsyncIteration:
+                    # Segment ended; if a tool is pending, continue to next segment
+                    if _has_pending_tool():
+                        await asyncio.sleep(0.25)
+                        continue
+                    # Otherwise, no more audio segments expected
+                    return
+
+        if self._encode_output and self._transcoder:
+            async for out in self._transcoder.stream_from_pcm16(
+                _produce_pcm(), self._client_output_mime, self._options.output_rate_hz
+            ):
+                yield RealtimeChunk(modality="audio", data=out)
+        else:
+            async for chunk in _produce_pcm():
+                yield RealtimeChunk(modality="audio", data=chunk)
+
+    async def iter_output_combined(
+        self,
+    ) -> AsyncGenerator[RealtimeChunk, None]:  # pragma: no cover
+        """Stream both audio and text chunks as RealtimeChunk objects.
+
+        This method combines audio and text streams when both modalities are enabled.
+        Audio chunks are yielded as they arrive, and text chunks are yielded as transcript deltas arrive.
+        """
+
+        # Determine which modalities to stream based on session options
+        modalities = (
+            self._options.output_modalities
+            if self._options.output_modalities is not None
+            else ["audio"]
+        )
+        should_stream_audio = "audio" in modalities
+        should_stream_text = "text" in modalities
+
+        if not should_stream_audio and not should_stream_text:
+            return  # No modalities requested
+
+        # Create tasks for both streams if needed
+        tasks = []
+        queues = []
+
+        if should_stream_audio:
+            audio_queue = asyncio.Queue()
+            queues.append(audio_queue)
+
+            async def _collect_audio():
+                try:
+                    async for chunk in self.iter_output_audio_encoded():
+                        await audio_queue.put(chunk)
+                finally:
+                    await audio_queue.put(None)  # Sentinel
+
+            tasks.append(asyncio.create_task(_collect_audio()))
+
+        if should_stream_text:
+            text_queue = asyncio.Queue()
+            queues.append(text_queue)
+
+            async def _collect_text():
+                try:
+                    async for text_chunk in self.iter_output_transcript():
+                        if text_chunk:  # Only yield non-empty text chunks
+                            await text_queue.put(
+                                RealtimeChunk(modality="text", data=text_chunk)
+                            )
+                finally:
+                    await text_queue.put(None)  # Sentinel
+
+            tasks.append(asyncio.create_task(_collect_text()))
+
+        try:
+            # Collect chunks from all queues
+            active_queues = len(queues)
+
+            while active_queues > 0:
+                for queue in queues:
+                    try:
+                        chunk = queue.get_nowait()
+                        if chunk is None:
+                            active_queues -= 1
+                        else:
+                            yield chunk
+                    except asyncio.QueueEmpty:
+                        continue
+
+                # Small delay to prevent busy waiting
+                if active_queues > 0:
+                    await asyncio.sleep(0.01)
+
+        finally:
+            # Cancel all tasks
+            for task in tasks:
+                if not task.done():
+                    task.cancel()
+
+    def iter_input_transcript(self) -> AsyncGenerator[str, None]:  # pragma: no cover
+        return self._session.iter_input_transcript()
+
+    def iter_output_transcript(self) -> AsyncGenerator[str, None]:  # pragma: no cover
+        return self._session.iter_output_transcript()
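RealtimeService above is driven push/pull: the caller pushes mic audio in with append_audio, commits the turn, and pulls the reply out of iter_output_audio_encoded. A minimal sketch of one manually committed turn follows. It assumes a concrete BaseRealtimeSession has already been constructed (for example via the OpenAI realtime WebSocket adapter this release adds, whose constructor is not shown in this diff), that "alloy" is a voice the session accepts, and that RealtimeChunk exposes its payload as .data, as the constructor calls above suggest; run_turn and mic_chunks are illustrative names. The file's second class, TwinRealtimeService, continues below.

from solana_agent.services.realtime import RealtimeService

async def run_turn(session, mic_chunks: list[bytes]) -> bytes:
    # `session` is any BaseRealtimeSession implementation; its construction
    # is left to the caller because this diff does not show it.
    service = RealtimeService(session)  # PCM16 in and out by default
    await service.start()
    try:
        # With VAD off, the caller controls turn boundaries explicitly.
        await service.configure(voice="alloy", vad_enabled=False)
        for chunk in mic_chunks:  # raw PCM16 frames from the client
            await service.append_audio(chunk)
        await service.commit_input()
        await service.create_response()

        audio = bytearray()
        async for rt_chunk in service.iter_output_audio_encoded():
            audio.extend(rt_chunk.data)  # RealtimeChunk(modality="audio")
        return bytes(audio)
    finally:
        await service.stop()

The same loop also serves compressed mobile audio: construct the service with accept_compressed_input=True and a transcoder such as the FFmpeg adapter this release adds (adapters/ffmpeg_transcoder.py), and append_audio converts each audio/mp4 chunk to PCM16 before forwarding it, keeping the server session on PCM throughout.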
+
+
+class TwinRealtimeService:
+    """Orchestrates two realtime sessions in parallel:
+
+    - conversation: full duplex (audio out + assistant transcript, tools, etc.)
+    - transcription: transcription-only session per GA (input transcript deltas)
+
+    Audio input is fanned out to both sessions. Output audio is sourced from the
+    conversation session only. Input transcript is sourced from the transcription
+    session only. This aligns with the GA guidance to use a dedicated
+    transcription session for reliable realtime STT, while the conversation
+    session handles assistant speech.
+    """
+
+    def __init__(
+        self,
+        conversation: BaseRealtimeSession,
+        transcription: BaseRealtimeSession,
+        *,
+        conv_options: Optional[RealtimeSessionOptions] = None,
+        trans_options: Optional[RealtimeSessionOptions] = None,
+        transcoder: Optional[AudioTranscoder] = None,
+        accept_compressed_input: bool = False,
+        client_input_mime: str = "audio/mp4",
+        encode_output: bool = False,
+        client_output_mime: str = "audio/aac",
+    ) -> None:
+        self._conv = conversation
+        self._trans = transcription
+        self._conv_opts = conv_options or RealtimeSessionOptions()
+        self._trans_opts = trans_options or RealtimeSessionOptions()
+        self._transcoder = transcoder
+        self._accept_compressed_input = accept_compressed_input
+        self._client_input_mime = client_input_mime
+        self._encode_output = encode_output
+        self._client_output_mime = client_output_mime
+        self._connected = False
+        self._lock = asyncio.Lock()
+
+    async def start(self) -> None:  # pragma: no cover
+        async with self._lock:
+            if self._connected:
+                return
+            logger.info("TwinRealtimeService: starting conversation + transcription")
+            await asyncio.gather(self._conv.connect(), self._trans.connect())
+            self._connected = True
+
+    async def stop(self) -> None:  # pragma: no cover
+        async with self._lock:
+            if not self._connected:
+                return
+            logger.info("TwinRealtimeService: stopping both sessions")
+            try:
+                await asyncio.gather(self._conv.close(), self._trans.close())
+            finally:
+                self._connected = False
+
+    async def reconnect(self) -> None:  # pragma: no cover
+        async with self._lock:
+            try:
+                await asyncio.gather(self._conv.close(), self._trans.close())
+            except Exception:
+                pass
+            self._connected = False
+            await self.start()
+
+    async def configure(
+        self,
+        *,
+        voice: Optional[str] = None,
+        vad_enabled: Optional[bool] = None,
+        instructions: Optional[str] = None,
+        input_rate_hz: Optional[int] = None,
+        output_rate_hz: Optional[int] = None,
+        input_mime: Optional[str] = None,
+        output_mime: Optional[str] = None,
+        tools: Optional[list[dict[str, Any]]] = None,
+        tool_choice: Optional[str] = None,
+    ) -> None:  # pragma: no cover
+        # Only the conversation session needs voice/tools; transcription session
+        # already has its own VAD model configured at connect-time.
+        patch: Dict[str, Any] = {}
+        audio_patch: Dict[str, Any] = {}
+        if (
+            vad_enabled is not None
+            or input_rate_hz is not None
+            or input_mime is not None
+        ):
+            turn_detection = None
+            if vad_enabled is not None:
+                if vad_enabled:
+                    turn_detection = {"type": "semantic_vad", "create_response": True}
+                else:
+                    turn_detection = None
+            audio_patch["input"] = {"format": "pcm16", "turn_detection": turn_detection}
+        if output_rate_hz is not None or output_mime is not None or voice is not None:
+            # Only configure audio output if audio is in the output modalities
+            modalities = (
+                self._conv_opts.output_modalities
+                if self._conv_opts.output_modalities is not None
+                else ["audio"]
+            )
+            if "audio" in modalities:
+                audio_patch["output"] = {
+                    "format": "pcm16",
+                    "voice": voice or self._conv_opts.voice,
+                    "speed": 1.0,
+                }
+        if audio_patch:
+            patch["audio"] = audio_patch
+        if instructions is not None:
+            patch["instructions"] = instructions
+        if tools is not None:
+            patch["tools"] = tools
+        if tool_choice is not None:
+            patch["tool_choice"] = tool_choice
+
+        if patch:
+            logger.debug("TwinRealtimeService.configure patch (conv): %s", patch)
+            await self._conv.update_session(patch)
+
+        # Update local snapshots
+        if voice is not None:
+            self._conv_opts.voice = voice
+        if vad_enabled is not None:
+            self._conv_opts.vad_enabled = vad_enabled
+            self._trans_opts.vad_enabled = vad_enabled
+        if instructions is not None:
+            self._conv_opts.instructions = instructions
+        if input_rate_hz is not None:
+            self._conv_opts.input_rate_hz = input_rate_hz
+            self._trans_opts.input_rate_hz = input_rate_hz
+        if output_rate_hz is not None:
+            self._conv_opts.output_rate_hz = output_rate_hz
+        if input_mime is not None:
+            self._conv_opts.input_mime = input_mime
+            self._trans_opts.input_mime = input_mime
+        if output_mime is not None:
+            self._conv_opts.output_mime = output_mime
+        if tools is not None:
+            self._conv_opts.tools = tools
+        if tool_choice is not None:
+            self._conv_opts.tool_choice = tool_choice
+
+    async def append_audio(self, chunk_bytes: bytes) -> None:  # pragma: no cover
+        # Transcode once if needed, then fan out to both
+        if self._accept_compressed_input:
+            if not self._transcoder:
+                raise ValueError(
+                    "Compressed input enabled but no transcoder configured"
+                )
+            pcm16 = await self._transcoder.to_pcm16(
+                chunk_bytes, self._client_input_mime, self._conv_opts.input_rate_hz
+            )
+            await asyncio.gather(
+                self._conv.append_audio(pcm16), self._trans.append_audio(pcm16)
+            )
+            return
+        await asyncio.gather(
+            self._conv.append_audio(chunk_bytes),
+            self._trans.append_audio(chunk_bytes),
+        )
+
+    async def commit_input(self) -> None:  # pragma: no cover
+        await asyncio.gather(self._conv.commit_input(), self._trans.commit_input())
+
+    async def commit_conversation(self) -> None:  # pragma: no cover
+        await self._conv.commit_input()
+
+    async def commit_transcription(self) -> None:  # pragma: no cover
+        await self._trans.commit_input()
+
+    async def clear_input(self) -> None:  # pragma: no cover
+        await asyncio.gather(self._conv.clear_input(), self._trans.clear_input())
+
+    async def create_conversation_item(
+        self, item: Dict[str, Any]
+    ) -> None:  # pragma: no cover
+        """Create a conversation item (e.g., for text input)."""
+        await self._conv.create_conversation_item(item)
+
+    async def create_response(
+        self, response_patch: Optional[Dict[str, Any]] = None
+    ) -> None:  # pragma: no cover
+        # Only conversation session creates assistant responses
+        await self._conv.create_response(response_patch)
+
+    # --- Streams ---
+    def iter_events(self) -> AsyncGenerator[Dict[str, Any], None]:  # pragma: no cover
+        # Prefer conversation events; caller can listen to transcription via iter_input_transcript
+        return self._conv.iter_events()
+
+    def iter_output_audio(self) -> AsyncGenerator[bytes, None]:  # pragma: no cover
+        return self._conv.iter_output_audio()
+
+    def reset_output_stream(self) -> None:  # pragma: no cover
+        try:
+            if hasattr(self._conv, "reset_output_stream"):
+                self._conv.reset_output_stream()
+        except Exception:
+            pass
+
+    async def iter_output_audio_encoded(
+        self,
+    ) -> AsyncGenerator[RealtimeChunk, None]:  # pragma: no cover
+        # Reuse the same encoding pipeline as RealtimeService but source from conversation
+        pcm_gen = self._conv.iter_output_audio()
+
+        try:
+            first_chunk = await asyncio.wait_for(pcm_gen.__anext__(), timeout=12.0)
+        except StopAsyncIteration:
+            logger.warning("TwinRealtimeService: no PCM produced (ended immediately)")
+            return
+        except asyncio.TimeoutError:
+            logger.warning("TwinRealtimeService: no PCM within timeout; closing conv")
+            try:
+                # Close both sessions to ensure clean restart on next turn
+                await asyncio.gather(self._conv.close(), self._trans.close())
+                self._connected = False
+            except Exception:
+                pass
+            return
+
+        async def _pcm_iter():
+            if first_chunk:
+                yield first_chunk
+            async for c in pcm_gen:
+                if not c:
+                    continue
+                yield c
+
+        if self._encode_output and self._transcoder:
+            async for out in self._transcoder.stream_from_pcm16(
+                _pcm_iter(), self._client_output_mime, self._conv_opts.output_rate_hz
+            ):
+                yield RealtimeChunk(modality="audio", data=out)
+        else:
+            async for chunk in _pcm_iter():
+                yield RealtimeChunk(modality="audio", data=chunk)
+
+    def iter_input_transcript(self) -> AsyncGenerator[str, None]:  # pragma: no cover
+        return self._trans.iter_input_transcript()
+
+    def iter_output_transcript(self) -> AsyncGenerator[str, None]:  # pragma: no cover
+        return self._conv.iter_output_transcript()
+
+    def iter_transcription_events(
+        self,
+    ) -> AsyncGenerator[Dict[str, Any], None]:  # pragma: no cover
+        # Expose transcription session events for completion detection
+        return self._trans.iter_events()
+
+    def is_connected(self) -> bool:  # pragma: no cover
+        return self._connected
+
+    def set_tool_executor(self, executor) -> None:  # pragma: no cover
+        # Forward to conversation session (tools only apply there)
+        if hasattr(self._conv, "set_tool_executor"):
+            self._conv.set_tool_executor(executor)
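TwinRealtimeService exists so that user STT and assistant speech come from separate sessions while input audio is fanned out to both. A sketch of consuming both sides of one turn, again assuming two already-constructed BaseRealtimeSession instances (their constructors are not shown in this diff) and the same .data payload attribute on RealtimeChunk:

import asyncio

from solana_agent.services.realtime import TwinRealtimeService

async def stream_turn(conversation, transcription, mic_chunks: list[bytes]) -> None:
    twin = TwinRealtimeService(conversation, transcription)
    await twin.start()  # connects both sessions concurrently
    try:
        for chunk in mic_chunks:
            await twin.append_audio(chunk)  # fanned out to both sessions
        await twin.commit_input()

        # User transcript deltas come from the transcription session only.
        async def print_user_transcript() -> None:
            async for delta in twin.iter_input_transcript():
                print("user:", delta, flush=True)

        transcript_task = asyncio.create_task(print_user_transcript())
        total = 0
        try:
            # Assistant audio comes from the conversation session only;
            # chunks are PCM16, or encoded when encode_output=True.
            async for rt_chunk in twin.iter_output_audio_encoded():
                total += len(rt_chunk.data)  # hand rt_chunk.data to playback instead
        finally:
            transcript_task.cancel()
    finally:
        await twin.stop()

When the client wants to drive the two buffers independently, commit_conversation and commit_transcription commit only one side, for example a transcription-only turn that should not trigger an assistant response.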