pygpt-net 2.7.7__py3-none-any.whl → 2.7.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pygpt_net/CHANGELOG.txt +12 -0
- pygpt_net/__init__.py +3 -3
- pygpt_net/app.py +5 -1
- pygpt_net/controller/assistant/batch.py +2 -2
- pygpt_net/controller/assistant/files.py +7 -6
- pygpt_net/controller/assistant/threads.py +0 -0
- pygpt_net/controller/chat/command.py +0 -0
- pygpt_net/controller/dialogs/confirm.py +35 -58
- pygpt_net/controller/lang/mapping.py +9 -9
- pygpt_net/controller/realtime/realtime.py +13 -1
- pygpt_net/controller/remote_store/{google/batch.py → batch.py} +209 -252
- pygpt_net/controller/remote_store/remote_store.py +982 -13
- pygpt_net/core/command/command.py +0 -0
- pygpt_net/core/db/viewer.py +1 -1
- pygpt_net/core/realtime/worker.py +3 -1
- pygpt_net/{controller/remote_store/google → core/remote_store/anthropic}/__init__.py +0 -1
- pygpt_net/core/remote_store/anthropic/files.py +211 -0
- pygpt_net/core/remote_store/anthropic/store.py +208 -0
- pygpt_net/core/remote_store/openai/store.py +5 -4
- pygpt_net/core/remote_store/remote_store.py +5 -1
- pygpt_net/{controller/remote_store/openai → core/remote_store/xai}/__init__.py +0 -1
- pygpt_net/core/remote_store/xai/files.py +225 -0
- pygpt_net/core/remote_store/xai/store.py +219 -0
- pygpt_net/data/config/config.json +10 -6
- pygpt_net/data/config/models.json +38 -22
- pygpt_net/data/config/settings.json +54 -1
- pygpt_net/data/icons/folder_eye.svg +1 -0
- pygpt_net/data/icons/folder_eye_filled.svg +1 -0
- pygpt_net/data/icons/folder_open.svg +1 -0
- pygpt_net/data/icons/folder_open_filled.svg +1 -0
- pygpt_net/data/locale/locale.de.ini +4 -3
- pygpt_net/data/locale/locale.en.ini +14 -4
- pygpt_net/data/locale/locale.es.ini +4 -3
- pygpt_net/data/locale/locale.fr.ini +4 -3
- pygpt_net/data/locale/locale.it.ini +4 -3
- pygpt_net/data/locale/locale.pl.ini +5 -4
- pygpt_net/data/locale/locale.uk.ini +4 -3
- pygpt_net/data/locale/locale.zh.ini +4 -3
- pygpt_net/icons.qrc +4 -0
- pygpt_net/icons_rc.py +282 -138
- pygpt_net/provider/api/anthropic/__init__.py +2 -0
- pygpt_net/provider/api/anthropic/chat.py +84 -1
- pygpt_net/provider/api/anthropic/store.py +307 -0
- pygpt_net/provider/api/anthropic/stream.py +75 -0
- pygpt_net/provider/api/anthropic/worker/__init__.py +0 -0
- pygpt_net/provider/api/anthropic/worker/importer.py +278 -0
- pygpt_net/provider/api/google/chat.py +59 -2
- pygpt_net/provider/api/google/realtime/client.py +70 -24
- pygpt_net/provider/api/google/realtime/realtime.py +48 -12
- pygpt_net/provider/api/google/store.py +124 -3
- pygpt_net/provider/api/google/stream.py +91 -24
- pygpt_net/provider/api/google/worker/importer.py +16 -28
- pygpt_net/provider/api/openai/assistants.py +2 -2
- pygpt_net/provider/api/openai/realtime/realtime.py +26 -6
- pygpt_net/provider/api/openai/store.py +4 -1
- pygpt_net/provider/api/openai/worker/importer.py +19 -61
- pygpt_net/provider/api/openai/worker/importer_assistants.py +230 -0
- pygpt_net/provider/api/x_ai/__init__.py +27 -6
- pygpt_net/provider/api/x_ai/audio.py +43 -11
- pygpt_net/provider/api/x_ai/chat.py +92 -4
- pygpt_net/provider/api/x_ai/realtime/__init__.py +12 -0
- pygpt_net/provider/api/x_ai/realtime/client.py +1864 -0
- pygpt_net/provider/api/x_ai/realtime/realtime.py +213 -0
- pygpt_net/provider/api/x_ai/remote_tools.py +102 -1
- pygpt_net/provider/api/x_ai/store.py +610 -0
- pygpt_net/provider/api/x_ai/stream.py +30 -9
- pygpt_net/provider/api/x_ai/tools.py +51 -0
- pygpt_net/provider/api/x_ai/worker/importer.py +308 -0
- pygpt_net/provider/audio_input/xai_grok_voice.py +390 -0
- pygpt_net/provider/audio_output/xai_tts.py +325 -0
- pygpt_net/provider/core/config/patch.py +29 -3
- pygpt_net/provider/core/config/patches/patch_before_2_6_42.py +2 -2
- pygpt_net/provider/core/model/patch.py +49 -1
- pygpt_net/tools/image_viewer/tool.py +334 -34
- pygpt_net/tools/image_viewer/ui/dialogs.py +317 -21
- pygpt_net/ui/dialog/assistant.py +1 -1
- pygpt_net/ui/dialog/plugins.py +13 -5
- pygpt_net/ui/dialog/remote_store.py +552 -0
- pygpt_net/ui/dialogs.py +3 -5
- pygpt_net/ui/layout/ctx/ctx_list.py +58 -7
- pygpt_net/ui/menu/tools.py +6 -13
- pygpt_net/ui/widget/dialog/{remote_store_google.py → remote_store.py} +10 -10
- pygpt_net/ui/widget/element/button.py +4 -4
- pygpt_net/ui/widget/image/display.py +2 -2
- pygpt_net/ui/widget/lists/context.py +2 -2
- {pygpt_net-2.7.7.dist-info → pygpt_net-2.7.9.dist-info}/METADATA +14 -2
- {pygpt_net-2.7.7.dist-info → pygpt_net-2.7.9.dist-info}/RECORD +87 -75
- pygpt_net/controller/remote_store/google/store.py +0 -615
- pygpt_net/controller/remote_store/openai/batch.py +0 -524
- pygpt_net/controller/remote_store/openai/store.py +0 -699
- pygpt_net/ui/dialog/remote_store_google.py +0 -539
- pygpt_net/ui/dialog/remote_store_openai.py +0 -539
- pygpt_net/ui/widget/dialog/remote_store_openai.py +0 -56
- pygpt_net/ui/widget/lists/remote_store_google.py +0 -248
- pygpt_net/ui/widget/lists/remote_store_openai.py +0 -317
- {pygpt_net-2.7.7.dist-info → pygpt_net-2.7.9.dist-info}/LICENSE +0 -0
- {pygpt_net-2.7.7.dist-info → pygpt_net-2.7.9.dist-info}/WHEEL +0 -0
- {pygpt_net-2.7.7.dist-info → pygpt_net-2.7.9.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,1864 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
# ================================================== #
|
|
4
|
+
# This file is a part of PYGPT package #
|
|
5
|
+
# Website: https://pygpt.net #
|
|
6
|
+
# GitHub: https://github.com/szczyglis-dev/py-gpt #
|
|
7
|
+
# MIT License #
|
|
8
|
+
# Created By : Marcin Szczygliński #
|
|
9
|
+
# Updated Date: 2026.01.07 23:00:00 #
|
|
10
|
+
# ================================================== #
|
|
11
|
+
|
|
12
|
+
import asyncio
|
|
13
|
+
import base64
|
|
14
|
+
import io
|
|
15
|
+
import json
|
|
16
|
+
import websockets
|
|
17
|
+
|
|
18
|
+
from typing import Optional, Callable, Awaitable
|
|
19
|
+
from urllib.parse import urlencode
|
|
20
|
+
|
|
21
|
+
from pygpt_net.core.events import RealtimeEvent
|
|
22
|
+
from pygpt_net.item.ctx import CtxItem
|
|
23
|
+
from pygpt_net.core.text.utils import has_unclosed_code_tag
|
|
24
|
+
|
|
25
|
+
# shared
|
|
26
|
+
from pygpt_net.core.realtime.shared.loop import BackgroundLoop
|
|
27
|
+
from pygpt_net.core.realtime.shared.audio import (
|
|
28
|
+
coerce_to_pcm16_mono,
|
|
29
|
+
resample_pcm16_mono,
|
|
30
|
+
iter_pcm_chunks,
|
|
31
|
+
DEFAULT_24K,
|
|
32
|
+
)
|
|
33
|
+
from pygpt_net.core.realtime.shared.tools import (
|
|
34
|
+
sanitize_function_tools,
|
|
35
|
+
tools_signature,
|
|
36
|
+
build_tool_outputs_payload,
|
|
37
|
+
)
|
|
38
|
+
from pygpt_net.core.realtime.shared.turn import TurnMode, apply_turn_mode_openai
|
|
39
|
+
from pygpt_net.core.realtime.shared.session import set_ctx_rt_handle, set_rt_session_expires_at
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class xAIIRealtimeClient:
|
|
43
|
+
"""
|
|
44
|
+
xAI Realtime API client with persistent session and a dedicated background event loop.
|
|
45
|
+
|
|
46
|
+
Key points:
|
|
47
|
+
- A single background asyncio loop runs in its own thread for the lifetime of the client.
|
|
48
|
+
- One websocket connection (session) at a time; multiple "turns" (send_turn) are serialized.
|
|
49
|
+
- Supports server VAD (auto-turn) and manual turn control (input_audio_buffer.* + response.create).
|
|
50
|
+
- Safe to call run()/send_turn()/reset()/shutdown() from any thread or event loop.
|
|
51
|
+
|
|
52
|
+
Session resumption:
|
|
53
|
+
- The official Realtime API does not expose a documented server-side "resume" for closed WS sessions.
|
|
54
|
+
We still persist the server-provided handle (session or conversation id) and surface it via ctx.extra["rt_session_id"].
|
|
55
|
+
If opts.rt_session_id is provided and differs from the current in-memory handle, we reset the connection and attempt
|
|
56
|
+
to reconnect with a "session_id" query parameter. If that fails, we fall back to the standard URL.
|
|
57
|
+
"""
|
|
58
|
+
|
|
59
|
+
WS_URL = "wss://api.x.ai/v1/realtime"
|
|
60
|
+
|
|
61
|
+
def __init__(self, window=None, debug: bool = False):
    """
    Set up an idle xAI Realtime client.

    Only attributes are initialized here; the websocket, the receiver task
    and the asyncio primitives start out unset.

    :param window: Window instance
    :param debug: Enable debug logging
    """
    self.window = window
    self.debug = debug

    # Background loop wrapper and default audio sample rate
    self._bg = BackgroundLoop(name="xAI-RT-Loop")
    self._DEFAULT_RATE = DEFAULT_24K

    # Websocket, receiver task and running flag
    self.ws: Optional[websockets.WebSocketClientProtocol] = None
    self._rx_task: Optional[asyncio.Task] = None
    self._running: bool = False

    # Send serialization and response-completion signalling
    self._send_lock: Optional[asyncio.Lock] = None
    self._response_done: Optional[asyncio.Event] = None
    self._response_active: bool = False

    # Caller-supplied callbacks, context and the last options object
    # (the options are remembered so a reset can reuse them)
    self._on_text: Optional[Callable[[str], Awaitable[None]]] = None
    self._on_audio: Optional[Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]]] = None
    self._should_stop: Optional[Callable[[], bool]] = None
    self._ctx: Optional[CtxItem] = None
    self._last_opts = None

    # Per-response extraction state (tools/images/citations/usage/text)
    self._rt_state = None

    # Input transcription buffers keyed by item_id
    self._input_tr_buffers: dict[str, io.StringIO] = {}

    # Signature of the tools last attached to the session
    self._cached_session_tools_sig: Optional[str] = None

    # Snapshot of the most recent tool calls
    self._last_tool_calls: list[dict] = []

    # In-memory session handle and its expiry (epoch seconds, if known)
    self._rt_session_id: Optional[str] = None
    self._rt_session_expires_at: Optional[int] = None
|
|
108
|
+
|
|
109
|
+
# -----------------------------
|
|
110
|
+
# Public high-level entrypoints
|
|
111
|
+
# -----------------------------
|
|
112
|
+
|
|
113
|
+
async def run(
    self,
    ctx: CtxItem,
    opts,
    on_text: Callable[[str], Awaitable[None]],
    on_audio: Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]],
    should_stop: Callable[[], bool] = lambda: False,
):
    """
    Execute a single turn, opening (or re-opening) the session when required.

    :param ctx: CtxItem with model and conversation
    :param opts: Options object with prompt/audio/voice/etc.
    :param on_text: Async callback for text deltas
    :param on_audio: Async callback for audio chunks
    :param should_stop: Sync callback to signal barge-in (cancel active response)
    """
    self._ensure_background_loop()
    self._ctx = ctx

    # A handle that differs from the live one triggers a best-effort reset;
    # any failure here is ignored and the normal open path below takes over.
    try:
        requested = getattr(opts, "rt_session_id", None)
        if isinstance(requested, str):
            requested = requested.strip()
        if self.ws is not None and requested:
            if requested != (self._rt_session_id or ""):
                reset_coro = self._reset_session_internal(ctx, opts, on_text, on_audio, should_stop)
                await self._run_on_owner(reset_coro)
    except Exception:
        pass

    # Open the session on the owner loop when no websocket is present
    if not self.ws:
        open_coro = self._open_session_internal(ctx, opts, on_text, on_audio, should_stop)
        await self._run_on_owner(open_coro)

    # Dispatch the turn; wait_for_done is disabled when opts.streaming is truthy
    turn_coro = self._send_turn_internal(
        getattr(opts, "prompt", None),
        getattr(opts, "audio_data", None),
        getattr(opts, "audio_format", None),
        getattr(opts, "audio_rate", None),
        wait_for_done=not getattr(opts, "streaming", False),
    )
    await self._run_on_owner(turn_coro)
|
|
155
|
+
|
|
156
|
+
async def open_session(
    self,
    ctx: CtxItem,
    opts,
    on_text: Callable[[str], Awaitable[None]],
    on_audio: Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]],
    should_stop: Callable[[], bool] = lambda: False,
):
    """
    Open a websocket session explicitly (run() otherwise does it on demand).

    :param ctx: CtxItem with model and conversation
    :param opts: Options object
    :param on_text: Async callback for text deltas
    :param on_audio: Async callback for audio chunks
    :param should_stop: Sync callback to signal barge-in
    """
    self._ensure_background_loop()

    # An already-open session with a different requested handle is reset
    # instead; on success nothing else is done. Errors fall through to the
    # regular open below.
    try:
        requested = getattr(opts, "rt_session_id", None)
        if isinstance(requested, str):
            requested = requested.strip()
        if self.ws is not None and requested:
            if requested != (self._rt_session_id or ""):
                reset_coro = self._reset_session_internal(ctx, opts, on_text, on_audio, should_stop)
                await self._run_on_owner(reset_coro)
                return
    except Exception:
        pass

    open_coro = self._open_session_internal(ctx, opts, on_text, on_audio, should_stop)
    await self._run_on_owner(open_coro)
|
|
181
|
+
|
|
182
|
+
async def close_session(self):
    """Close the websocket session on the owner loop without stopping that loop."""
    # Nothing to close when the background loop was never created
    if self._bg.loop:
        await self._run_on_owner(self._close_session_internal())
|
|
187
|
+
|
|
188
|
+
async def reset_session(
    self,
    ctx: Optional[CtxItem] = None,
    opts=None,
    on_text: Optional[Callable[[str], Awaitable[None]]] = None,
    on_audio: Optional[Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]]] = None,
    should_stop: Optional[Callable[[], bool]] = None,
):
    """
    Replace the current session with a freshly opened one.

    Arguments left out are passed on as None to the internal reset coroutine.

    :param ctx: CtxItem (optional)
    :param opts: Options object (optional)
    :param on_text: Async callback for text deltas (optional)
    :param on_audio: Async callback for audio chunks (optional)
    :param should_stop: Sync callback to signal barge-in (optional)
    """
    self._ensure_background_loop()
    reset_coro = self._reset_session_internal(ctx, opts, on_text, on_audio, should_stop)
    await self._run_on_owner(reset_coro)
|
|
202
|
+
|
|
203
|
+
async def shutdown(self):
    """
    Close the current session, if a background loop exists.

    The background loop itself is left untouched; stop_loop_sync() or
    shutdown_and_stop() stop it as well.
    """
    if self._bg.loop:
        await self._run_on_owner(self._close_session_internal())
|
|
211
|
+
|
|
212
|
+
async def shutdown_and_stop(self):
    """Close the session first, then stop the background loop."""
    # Order matters: the session is closed while the loop is still available
    await self.shutdown()
    self.stop_loop_sync()
|
|
216
|
+
|
|
217
|
+
# -----------------------------
|
|
218
|
+
# Synchronous convenience calls
|
|
219
|
+
# -----------------------------
|
|
220
|
+
|
|
221
|
+
def close_session_sync(self, timeout: float = 5.0):
    """
    Blocking variant of close_session().

    :param timeout: Timeout passed to the background loop's run_sync()
    """
    loop = self._bg.loop
    # Only dispatch when a background loop exists and is running
    if loop and loop.is_running():
        self._bg.run_sync(self._close_session_internal(), timeout=timeout)
|
|
226
|
+
|
|
227
|
+
def reset_session_sync(
    self,
    ctx: Optional[CtxItem] = None,
    opts=None,
    on_text: Optional[Callable[[str], Awaitable[None]]] = None,
    on_audio: Optional[Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]]] = None,
    should_stop: Optional[Callable[[], bool]] = None,
    timeout: float = 10.0,
):
    """
    Blocking variant of reset_session().

    The on_audio annotation uses the same five-argument callback signature
    as the async entry points and the internal reset coroutine it is
    forwarded to.

    :param ctx: CtxItem (optional)
    :param opts: Options object (optional)
    :param on_text: Async callback for text deltas (optional)
    :param on_audio: Async callback for audio chunks (optional)
    :param should_stop: Sync callback to signal barge-in (optional)
    :param timeout: Timeout passed to the background loop's run_sync()
    """
    self._ensure_background_loop()
    reset_coro = self._reset_session_internal(ctx, opts, on_text, on_audio, should_stop)
    self._bg.run_sync(reset_coro, timeout=timeout)
|
|
239
|
+
|
|
240
|
+
def shutdown_sync(self, timeout: float = 5.0):
    """
    Blocking variant of shutdown(): closes the WS, the loop is not stopped here.

    :param timeout: Timeout passed to the background loop's run_sync()
    """
    loop = self._bg.loop
    # Skip silently when there is no running background loop
    if loop and loop.is_running():
        self._bg.run_sync(self._close_session_internal(), timeout=timeout)
|
|
245
|
+
|
|
246
|
+
def stop_loop_sync(self, timeout: float = 2.0):
    """
    Ask the background loop wrapper to stop.

    :param timeout: Timeout forwarded to the wrapper's stop()
    """
    background = self._bg
    background.stop(timeout=timeout)
|
|
249
|
+
|
|
250
|
+
# -----------------------------
|
|
251
|
+
# Tools helpers
|
|
252
|
+
# -----------------------------
|
|
253
|
+
|
|
254
|
+
def _update_last_opts_tools(self, tools: Optional[list], remote_tools: Optional[list]) -> None:
|
|
255
|
+
"""
|
|
256
|
+
Update self._last_opts with tools/remote_tools if fields are present.
|
|
257
|
+
"""
|
|
258
|
+
lo = self._last_opts
|
|
259
|
+
if not lo:
|
|
260
|
+
return
|
|
261
|
+
try:
|
|
262
|
+
if tools is not None and hasattr(lo, "tools"):
|
|
263
|
+
setattr(lo, "tools", tools)
|
|
264
|
+
except Exception:
|
|
265
|
+
pass
|
|
266
|
+
try:
|
|
267
|
+
if remote_tools is not None and hasattr(lo, "remote_tools"):
|
|
268
|
+
setattr(lo, "remote_tools", remote_tools)
|
|
269
|
+
except Exception:
|
|
270
|
+
pass
|
|
271
|
+
|
|
272
|
+
def _xai_tool_shape(self, tool: dict) -> dict:
|
|
273
|
+
"""
|
|
274
|
+
Ensure xAI-compatible tool shape:
|
|
275
|
+
- function tools use top-level name/parameters (no nested "function" object)
|
|
276
|
+
- known provider tools: file_search (vector_store_ids), web_search, x_search
|
|
277
|
+
Unknown provider-only tools are dropped to avoid server-side validation issues.
|
|
278
|
+
"""
|
|
279
|
+
try:
|
|
280
|
+
if not isinstance(tool, dict):
|
|
281
|
+
return tool
|
|
282
|
+
|
|
283
|
+
t = dict(tool)
|
|
284
|
+
|
|
285
|
+
# Convert OpenAI Realtime "function": {...} into xAI top-level form
|
|
286
|
+
if t.get("type") == "function":
|
|
287
|
+
if "function" in t and isinstance(t["function"], dict):
|
|
288
|
+
f = t["function"]
|
|
289
|
+
name = f.get("name") or t.get("name")
|
|
290
|
+
desc = f.get("description") or t.get("description") or ""
|
|
291
|
+
params = f.get("parameters") or t.get("parameters") or {"type": "object"}
|
|
292
|
+
return {
|
|
293
|
+
"type": "function",
|
|
294
|
+
"name": name,
|
|
295
|
+
"description": desc,
|
|
296
|
+
"parameters": params if isinstance(params, dict) else {"type": "object"},
|
|
297
|
+
}
|
|
298
|
+
# Already top-level form, return as-is
|
|
299
|
+
return {
|
|
300
|
+
"type": "function",
|
|
301
|
+
"name": t.get("name"),
|
|
302
|
+
"description": t.get("description") or "",
|
|
303
|
+
"parameters": t.get("parameters") or {"type": "object"},
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
# Map collections_search -> file_search
|
|
307
|
+
if t.get("type") == "collections_search":
|
|
308
|
+
vec_ids = t.get("collection_ids") or t.get("vector_store_ids") or []
|
|
309
|
+
max_num = t.get("max_num_results") if isinstance(t.get("max_num_results"), int) else None
|
|
310
|
+
out = {
|
|
311
|
+
"type": "file_search",
|
|
312
|
+
"vector_store_ids": vec_ids if isinstance(vec_ids, list) else [],
|
|
313
|
+
}
|
|
314
|
+
if max_num is not None:
|
|
315
|
+
out["max_num_results"] = max_num
|
|
316
|
+
return out
|
|
317
|
+
|
|
318
|
+
# Pass-through for known provider tools
|
|
319
|
+
if t.get("type") in ("file_search", "web_search", "x_search"):
|
|
320
|
+
return t
|
|
321
|
+
|
|
322
|
+
# code_interpreter is not documented for xAI Voice Agent; drop it
|
|
323
|
+
if t.get("type") == "code_interpreter":
|
|
324
|
+
return {}
|
|
325
|
+
|
|
326
|
+
return t
|
|
327
|
+
except Exception:
|
|
328
|
+
return tool
|
|
329
|
+
|
|
330
|
+
def _compose_xai_tools(self, tools: Optional[list], remote_tools: Optional[list]) -> list:
    """
    Build the combined tool list in xAI shape.

    Remote (provider) tools come first, followed by function tools. Entries
    whose shaped form is not a non-empty dict are left out. If anything
    raises, whatever was collected up to that point is returned.

    :param tools: Function tools (optional)
    :param remote_tools: Remote/provider tools (optional)
    :return: List of shaped tool dicts
    """
    composed: list = []
    try:
        functions = tools or []
        provider = remote_tools or []

        # Run function tools through the shared sanitizer; keep the raw list
        # when the sanitizer returns nothing
        functions = sanitize_function_tools(functions) or functions

        for group in (provider, functions):
            for item in group:
                shaped = self._xai_tool_shape(item)
                if isinstance(shaped, dict) and shaped:
                    composed.append(shaped)
    except Exception:
        pass
    return composed
|
|
354
|
+
|
|
355
|
+
# -----------------------------
|
|
356
|
+
# Internal: background loop/dispatch
|
|
357
|
+
# -----------------------------
|
|
358
|
+
|
|
359
|
+
def _ensure_background_loop(self):
|
|
360
|
+
"""Start the background asyncio loop once and keep it running."""
|
|
361
|
+
self._bg.ensure()
|
|
362
|
+
|
|
363
|
+
async def _run_on_owner(self, coro):
|
|
364
|
+
"""Await a coroutine scheduled on the owner loop from any thread/loop."""
|
|
365
|
+
return await self._bg.run(coro)
|
|
366
|
+
|
|
367
|
+
# -----------------------------
|
|
368
|
+
# Internal: session lifecycle
|
|
369
|
+
# -----------------------------
|
|
370
|
+
|
|
371
|
+
async def _open_session_internal(
    self,
    ctx: CtxItem,
    opts,
    on_text: Callable[[str], Awaitable[None]],
    on_audio: Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]],
    should_stop: Callable[[], bool] = lambda: False,
):
    """
    Open WS and configure the Realtime session on the owner loop.

    Does nothing when a websocket is already present. Otherwise connects
    (plain URL first, then the URL with query parameters), sends a single
    session.update with voice, audio formats, optional instructions, turn
    detection and tools, and starts the receiver task.

    :param ctx: CtxItem with model and conversation
    :param opts: Options object (model, voice, system_prompt, auto_turn, tools, ...)
    :param on_text: Async callback for text deltas
    :param on_audio: Async callback for audio chunks
    :param should_stop: Sync callback to signal barge-in
    :raises RuntimeError: if the API key is not configured or both connect attempts fail
    """
    if self.ws is not None:
        if self.debug:
            print("[open_session] already open")
        return

    api_key = self.window.core.config.get("api_key_xai")
    if not api_key:
        raise RuntimeError("xAI API key not configured")

    model_id = getattr(opts, "model", None) or (ctx.model if ctx and ctx.model else "grok-3")
    voice = getattr(opts, "voice", None) or self._preferred_voice()

    # Optional: requested resume handle from opts (best effort)
    resume_sid = None
    try:
        provided = getattr(opts, "rt_session_id", None)
        if isinstance(provided, str):
            provided = provided.strip()
        if provided and provided != (self._rt_session_id or ""):
            resume_sid = provided
            self._rt_session_id = resume_sid
            # Use the ctx of this session: self._ctx is only updated further
            # below and may still refer to an earlier context here
            set_ctx_rt_handle(ctx, resume_sid, self.window)
    except Exception as e:
        if self.debug:
            print(f"[open_session] resume handle error: {e!r}")

    # Prefer plain WS URL; fallback to query-parameter variant
    url_plain = self.WS_URL
    q = {"model": model_id}
    if resume_sid:
        q["session_id"] = resume_sid
    url_with_q = f"{self.WS_URL}?{urlencode(q)}"

    headers = {
        "Authorization": f"Bearer {api_key}",
    }

    # Save callbacks and context
    self._on_text = on_text
    self._on_audio = on_audio
    self._should_stop = should_stop or (lambda: False)
    self._ctx = ctx
    self._last_opts = opts

    # Control primitives
    self._response_done = asyncio.Event()
    self._send_lock = asyncio.Lock()

    if self.debug:
        print(f"[open_session] owner_loop={id(asyncio.get_running_loop())}")

    # Connect WS with robust fallback; both attempts share the same options
    connect_kwargs = {
        "additional_headers": headers,
        "max_size": 16 * 1024 * 1024,
        "ping_interval": 20,
        "ping_timeout": 20,
        "close_timeout": 5,
    }
    attempts = (
        (url_plain, "[open_session] connect plain failed"),
        (url_with_q, "[open_session] fallback connect failed"),
    )
    last_error = None
    for url, failure_label in attempts:
        try:
            self.ws = await websockets.connect(url, **connect_kwargs)
            break
        except Exception as e:
            last_error = e
            self.ws = None
            if self.debug:
                print(f"{failure_label}: {e!r}")

    if not self.ws:
        # Chain the last connect error so the root cause is not lost
        raise RuntimeError("xAI Realtime: WebSocket connect failed") from last_error

    if self.debug:
        print("[open_session] WS connected")

    # Session payload compatible with xAI Voice Agent
    session_payload = {
        "type": "session.update",
        "session": {
            "voice": voice,
            "audio": {
                "input": {"format": {"type": "audio/pcm", "rate": self._DEFAULT_RATE}},
                "output": {"format": {"type": "audio/pcm", "rate": self._DEFAULT_RATE}},
            },
        },
    }
    if getattr(opts, "system_prompt", None):
        session_payload["session"]["instructions"] = str(getattr(opts, "system_prompt"))

    # Turn detection (server VAD) or manual turns
    turn_mode = TurnMode.AUTO if bool(getattr(opts, "auto_turn", False)) else TurnMode.MANUAL
    apply_turn_mode_openai(session_payload, turn_mode)
    self._tune_openai_vad(session_payload, opts)

    # Attach tools to session (xAI expects tools only in session.update)
    try:
        session_tools = self._compose_xai_tools(
            getattr(opts, "tools", None),
            getattr(opts, "remote_tools", None),
        )
        if session_tools:
            session_payload["session"]["tools"] = session_tools
            self._cached_session_tools_sig = tools_signature(session_tools)
            if self.debug:
                print(f"[open_session] session.tools attached: {len(session_tools)}")
        else:
            self._cached_session_tools_sig = tools_signature([])
    except Exception as _e:
        if self.debug:
            print(f"[open_session] tools sanitize error: {_e}")
        self._cached_session_tools_sig = tools_signature([])

    if self.debug:
        print(f"[open_session] session_payload: {json.dumps(session_payload)}")

    await self.ws.send(json.dumps(session_payload))
    if self.debug:
        print("[open_session] session.update sent")

    # Start a single receiver task
    if self._rx_task is None or self._rx_task.done():
        self._running = True
        self._rx_task = asyncio.create_task(self._recv_loop(), name="realtime-recv")
        if self.debug:
            print("[open_session] _recv_loop started")
|
|
516
|
+
|
|
517
|
+
async def _close_session_internal(self):
|
|
518
|
+
"""Close WS and stop the receiver; keep the background loop alive for reuse."""
|
|
519
|
+
self._running = False
|
|
520
|
+
|
|
521
|
+
# Cancel active response if any
|
|
522
|
+
if self.ws and self._response_active:
|
|
523
|
+
try:
|
|
524
|
+
await self.ws.send(json.dumps({"type": "response.cancel"}))
|
|
525
|
+
except Exception:
|
|
526
|
+
pass
|
|
527
|
+
|
|
528
|
+
# Unblock any waiters before clearing handles
|
|
529
|
+
try:
|
|
530
|
+
if self._response_done and not self._response_done.is_set():
|
|
531
|
+
self._response_done.set()
|
|
532
|
+
except Exception:
|
|
533
|
+
pass
|
|
534
|
+
|
|
535
|
+
# Close the socket
|
|
536
|
+
if self.ws:
|
|
537
|
+
try:
|
|
538
|
+
await self.ws.close()
|
|
539
|
+
except Exception:
|
|
540
|
+
pass
|
|
541
|
+
self.ws = None
|
|
542
|
+
|
|
543
|
+
# Await receiver
|
|
544
|
+
if self._rx_task:
|
|
545
|
+
try:
|
|
546
|
+
await self._rx_task
|
|
547
|
+
except Exception:
|
|
548
|
+
pass
|
|
549
|
+
self._rx_task = None
|
|
550
|
+
|
|
551
|
+
# Reset control primitives
|
|
552
|
+
self._response_active = False
|
|
553
|
+
self._response_done = None
|
|
554
|
+
self._send_lock = None
|
|
555
|
+
self._cached_session_tools_sig = None
|
|
556
|
+
|
|
557
|
+
# Clear in-memory handle; do not wipe persisted ctx.extra["rt_session_id"]
|
|
558
|
+
self._rt_session_id = None
|
|
559
|
+
self._rt_session_expires_at = None
|
|
560
|
+
|
|
561
|
+
if self.debug:
|
|
562
|
+
print("[close_session] closed")
|
|
563
|
+
|
|
564
|
+
async def _reset_session_internal(
    self,
    ctx: Optional[CtxItem] = None,
    opts=None,
    on_text: Optional[Callable[[str], Awaitable[None]]] = None,
    on_audio: Optional[Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]]] = None,
    should_stop: Optional[Callable[[], bool]] = None,
):
    """
    Close the current session, then open a new one.

    Every argument that is falsy is replaced by the value remembered on the
    client; should_stop finally defaults to a callable returning False.

    :param ctx: CtxItem (optional)
    :param opts: Options object (optional)
    :param on_text: Async callback for text deltas (optional)
    :param on_audio: Async callback for audio chunks (optional)
    :param should_stop: Sync callback to signal barge-in (optional)
    :raises RuntimeError: if ctx, opts, on_text or on_audio is still missing
    """
    # Fall back to last-known values for anything not supplied
    if not ctx:
        ctx = self._ctx
    if not opts:
        opts = self._last_opts
    if not on_text:
        on_text = self._on_text
    if not on_audio:
        on_audio = self._on_audio
    if not should_stop:
        should_stop = self._should_stop or (lambda: False)

    if not ctx or not opts or not on_text or not on_audio:
        raise RuntimeError("reset_session requires previous or explicit ctx/opts/callbacks")

    await self._close_session_internal()
    await self._open_session_internal(ctx, opts, on_text, on_audio, should_stop)
|
|
587
|
+
|
|
588
|
+
# -----------------------------
|
|
589
|
+
# Internal: one "turn"
|
|
590
|
+
# -----------------------------
|
|
591
|
+
|
|
592
|
+
async def _send_turn_internal(
|
|
593
|
+
self,
|
|
594
|
+
prompt: Optional[str] = None,
|
|
595
|
+
audio_data: Optional[bytes] = None,
|
|
596
|
+
audio_format: Optional[str] = None,
|
|
597
|
+
audio_rate: Optional[int] = None,
|
|
598
|
+
wait_for_done: bool = True,
|
|
599
|
+
):
|
|
600
|
+
"""
|
|
601
|
+
Send one manual turn (optional text + optional audio) and trigger response.create.
|
|
602
|
+
"""
|
|
603
|
+
if not self.ws:
|
|
604
|
+
# If session dropped remotely, try to reopen from last state
|
|
605
|
+
if self._ctx and self._last_opts:
|
|
606
|
+
await self._open_session_internal(self._ctx, self._last_opts, self._on_text, self._on_audio, self._should_stop)
|
|
607
|
+
else:
|
|
608
|
+
raise RuntimeError("Session not open. Call open_session(...) first.")
|
|
609
|
+
|
|
610
|
+
# Serialize all sends to a single WS writer
|
|
611
|
+
if self._send_lock is None:
|
|
612
|
+
self._send_lock = asyncio.Lock()
|
|
613
|
+
|
|
614
|
+
# Determine whether we should trigger a response for this turn
|
|
615
|
+
def _bool(v) -> bool:
|
|
616
|
+
try:
|
|
617
|
+
return bool(v)
|
|
618
|
+
except Exception:
|
|
619
|
+
return False
|
|
620
|
+
|
|
621
|
+
is_auto_turn = _bool(getattr(self._last_opts or object(), "auto_turn", False))
|
|
622
|
+
has_text = False
|
|
623
|
+
if prompt is not None:
|
|
624
|
+
p = str(prompt).strip()
|
|
625
|
+
has_text = bool(p and p != "...")
|
|
626
|
+
has_audio = bool(audio_data)
|
|
627
|
+
reply_hint = False
|
|
628
|
+
try:
|
|
629
|
+
extra = getattr(self._last_opts, "extra", None)
|
|
630
|
+
if isinstance(extra, dict):
|
|
631
|
+
reply_hint = bool(extra.get("reply", False))
|
|
632
|
+
except Exception:
|
|
633
|
+
pass
|
|
634
|
+
|
|
635
|
+
if not has_text and not has_audio and not reply_hint:
|
|
636
|
+
if self.debug:
|
|
637
|
+
print("[send_turn] skipped: manual mode with empty input; waiting for explicit commit")
|
|
638
|
+
return
|
|
639
|
+
|
|
640
|
+
wait_prev: Optional[asyncio.Event] = None
|
|
641
|
+
wait_curr: Optional[asyncio.Event] = None
|
|
642
|
+
|
|
643
|
+
async with self._send_lock:
|
|
644
|
+
# Ensure previous response is finished (snapshot the handle to avoid race with close)
|
|
645
|
+
if self._response_active and self._response_done:
|
|
646
|
+
wait_prev = self._response_done
|
|
647
|
+
|
|
648
|
+
# Optional text
|
|
649
|
+
if has_text:
|
|
650
|
+
if self.debug:
|
|
651
|
+
print(f"[send_turn] prompt len={len(prompt)}")
|
|
652
|
+
await self.ws.send(json.dumps({
|
|
653
|
+
"type": "conversation.item.create",
|
|
654
|
+
"item": {
|
|
655
|
+
"type": "message",
|
|
656
|
+
"role": "user",
|
|
657
|
+
"content": [{"type": "input_text", "text": str(prompt)}],
|
|
658
|
+
},
|
|
659
|
+
}))
|
|
660
|
+
|
|
661
|
+
# Optional audio (manual turn control flow)
|
|
662
|
+
if has_audio:
|
|
663
|
+
sr, _ch, pcm = coerce_to_pcm16_mono(audio_data, audio_format, audio_rate, fallback_rate=self._DEFAULT_RATE)
|
|
664
|
+
|
|
665
|
+
if sr != self._DEFAULT_RATE:
|
|
666
|
+
try:
|
|
667
|
+
pcm = resample_pcm16_mono(pcm, sr, self._DEFAULT_RATE)
|
|
668
|
+
if self.debug:
|
|
669
|
+
print(f"[audio] resampled {sr} -> {self._DEFAULT_RATE}")
|
|
670
|
+
sr = self._DEFAULT_RATE
|
|
671
|
+
except Exception as e:
|
|
672
|
+
if self.debug:
|
|
673
|
+
print(f"[audio] resample failed {sr}->{self._DEFAULT_RATE}: {e}")
|
|
674
|
+
|
|
675
|
+
# Append PCM and commit input buffer
|
|
676
|
+
for chunk in iter_pcm_chunks(pcm, sr, ms=50):
|
|
677
|
+
if not chunk:
|
|
678
|
+
continue
|
|
679
|
+
await self.ws.send(json.dumps({
|
|
680
|
+
"type": "input_audio_buffer.append",
|
|
681
|
+
"audio": base64.b64encode(chunk).decode("utf-8"),
|
|
682
|
+
}))
|
|
683
|
+
await self.ws.send(json.dumps({"type": "input_audio_buffer.commit"}))
|
|
684
|
+
|
|
685
|
+
# If we were waiting for a previous response, do it inside lock handoff-safe
|
|
686
|
+
if wait_prev:
|
|
687
|
+
try:
|
|
688
|
+
if self.debug:
|
|
689
|
+
print("[send_turn] waiting for previous response")
|
|
690
|
+
await wait_prev.wait()
|
|
691
|
+
except Exception:
|
|
692
|
+
pass
|
|
693
|
+
|
|
694
|
+
# Prepare wait handle for the response about to start
|
|
695
|
+
if self._response_done is None:
|
|
696
|
+
self._response_done = asyncio.Event()
|
|
697
|
+
else:
|
|
698
|
+
try:
|
|
699
|
+
self._response_done.clear()
|
|
700
|
+
except Exception:
|
|
701
|
+
self._response_done = asyncio.Event()
|
|
702
|
+
wait_curr = self._response_done # snapshot for race-free waiting
|
|
703
|
+
|
|
704
|
+
# Build minimal response payload for xAI (tools are configured only via session.update)
|
|
705
|
+
payload = {
|
|
706
|
+
"type": "response.create",
|
|
707
|
+
"response": {"modalities": ["text", "audio"]},
|
|
708
|
+
}
|
|
709
|
+
|
|
710
|
+
await self.ws.send(json.dumps(payload))
|
|
711
|
+
if self.debug:
|
|
712
|
+
print("[send_turn] response.create sent")
|
|
713
|
+
|
|
714
|
+
# Optionally wait for response.done (otherwise return immediately)
|
|
715
|
+
if wait_for_done and wait_curr:
|
|
716
|
+
if self.debug:
|
|
717
|
+
print("[send_turn] waiting for response.done")
|
|
718
|
+
try:
|
|
719
|
+
await wait_curr.wait()
|
|
720
|
+
except Exception:
|
|
721
|
+
pass
|
|
722
|
+
if self.debug:
|
|
723
|
+
print("[send_turn] response.done received")
|
|
724
|
+
|
|
725
|
+
async def _cancel_active_response_internal(self):
    """
    Cancel the response currently in flight (barge-in).

    A `response.cancel` event is sent only when a socket exists and a response
    is marked active. Any error raised while sending is ignored.
    """
    if not self.ws or not self._response_active:
        return
    cancel_msg = json.dumps({"type": "response.cancel"})
    try:
        await self.ws.send(cancel_msg)
    except Exception:
        # Best effort only: a failed cancel must not propagate to the caller
        pass
|
|
732
|
+
|
|
733
|
+
# -----------------------------
|
|
734
|
+
# Internal: audio input (auto-turn mode)
|
|
735
|
+
# -----------------------------
|
|
736
|
+
|
|
737
|
+
def rt_handle_audio_input_sync(self, event: RealtimeEvent, timeout: float = 0.5):
    """
    Blocking entrypoint for continuous microphone input in auto-turn mode.

    The event's `data` (or the dict nested under its "payload" key) is checked
    first; when it holds no audio bytes the call returns immediately. Otherwise
    the background loop is ensured and the internal coroutine is executed via
    self._bg.run_sync(). Every exception is swallowed so nothing is raised
    back into the audio callback.

    :param event: realtime event carrying the audio payload
    :param timeout: maximum time passed to run_sync()
    """
    # Cheap pre-check: skip scheduling entirely when there is nothing to send
    try:
        body = getattr(event, "data", {}) or {}
        if isinstance(body, dict):
            nested = body.get("payload")
            if isinstance(nested, dict):
                body = nested
        if not body or not body.get("data"):
            return
    except Exception:
        return

    self._ensure_background_loop()
    try:
        self._bg.run_sync(self._rt_handle_audio_input_internal(event), timeout=timeout)
    except Exception:
        # Never raise to caller from audio callback
        pass
|
|
758
|
+
|
|
759
|
+
async def _rt_handle_audio_input_internal(self, event: RealtimeEvent):
    """
    Owner-loop implementation: push live audio to the input buffer in auto-turn mode.

    Returns without sending anything when the socket is missing, the client is
    not running, auto-turn is disabled, the event payload is not a dict, or the
    payload carries no audio bytes. Otherwise the audio is converted with
    coerce_to_pcm16_mono(), resampled to self._DEFAULT_RATE when the rates
    differ, and appended as `input_audio_buffer.append` events in ~50 ms chunks
    under the send lock. No commit event is sent from here.

    :param event: realtime event whose `data` (or the dict nested under its
                  "payload" key) provides `data` (audio bytes) and optionally
                  `mime`, `rate` and `final`
    """
    if not self.ws or not self._running:
        if self.debug:
            print("[_rt_handle_audio_input] Socket not open!")
        return
    try:
        if not bool(getattr(self._last_opts, "auto_turn", False)):
            return
    except Exception:
        return

    # Extract normalized payload (event.data itself, or its nested "payload" dict)
    payload = getattr(event, "data", {}) or {}
    if isinstance(payload, dict) and isinstance(payload.get("payload"), dict):
        payload = payload["payload"]
    if not isinstance(payload, dict):
        # Malformed event: there is no mapping to read audio from
        return

    data: bytes = payload.get("data") or b""
    if not data:
        return
    mime = str(payload.get("mime") or "audio/pcm")
    try:
        rate = int(payload.get("rate") or 0) or self._DEFAULT_RATE
    except (TypeError, ValueError):
        # An unparsable rate hint must not drop the audio; use the default rate
        rate = self._DEFAULT_RATE
    is_final = bool(payload.get("final", False))

    # Convert to PCM16 mono at self._DEFAULT_RATE
    fmt_hint = "pcm16" if mime.startswith("audio/pcm") else None
    try:
        sr, _ch, pcm = coerce_to_pcm16_mono(data, fmt_hint, rate, fallback_rate=self._DEFAULT_RATE)
        if sr != self._DEFAULT_RATE:
            try:
                pcm = resample_pcm16_mono(pcm, sr, self._DEFAULT_RATE)
                sr = self._DEFAULT_RATE
            except Exception:
                # Resampling failed: the un-resampled PCM is still sent,
                # chunked as if it were at the default rate
                sr = self._DEFAULT_RATE
    except Exception:
        return

    # Serialize writes to the websocket
    if self._send_lock is None:
        self._send_lock = asyncio.Lock()

    async with self._send_lock:
        # Append in ~50 ms chunks to keep frames small
        for chunk in iter_pcm_chunks(pcm, sr, ms=50):
            if not chunk:
                continue
            try:
                await self.ws.send(json.dumps({
                    "type": "input_audio_buffer.append",
                    "audio": base64.b64encode(chunk).decode("utf-8"),
                }))
            except Exception:
                # Stop on the first failed send; remaining chunks are dropped
                return

        # With server VAD enabled, the server commits the buffer automatically.
        if is_final:
            if self.debug:
                print("[_rt_handle_audio_input] final chunk sent (server VAD will commit)")
|
|
820
|
+
|
|
821
|
+
def commit_audio_input_sync(self, timeout: float = 0.5):
    """
    Blocking entrypoint that commits the input audio buffer in auto-turn mode.

    Ensures the background loop, then executes the internal commit coroutine
    via self._bg.run_sync(). Exceptions are swallowed so nothing is raised
    back to the caller.

    :param timeout: maximum time passed to run_sync()
    """
    self._ensure_background_loop()
    try:
        commit_coro = self._commit_audio_input_internal()
        self._bg.run_sync(commit_coro, timeout=timeout)
    except Exception:
        # Never raise to caller from audio callback
        pass
|
|
832
|
+
|
|
833
|
+
async def _commit_audio_input_internal(self):
    """
    Owner-loop implementation: commit the input audio buffer in auto-turn mode.

    Nothing is sent when the socket is missing, the client is not running, or
    auto-turn is disabled. Otherwise a single `input_audio_buffer.commit` event
    is sent under the send lock; a failed send is ignored.
    """
    if not (self.ws and self._running):
        return
    try:
        auto_turn = bool(getattr(self._last_opts, "auto_turn", False))
    except Exception:
        return
    if not auto_turn:
        return

    # Lazily create the lock that serializes websocket writes
    if self._send_lock is None:
        self._send_lock = asyncio.Lock()

    async with self._send_lock:
        try:
            await self.ws.send(json.dumps({"type": "input_audio_buffer.commit"}))
        except Exception:
            pass
|
|
851
|
+
|
|
852
|
+
def force_response_now_sync(self, timeout: float = 5.0):
    """
    Blocking wrapper that forces the model to respond to the current input buffer.

    Ensures the background loop, then executes the internal coroutine via
    self._bg.run_sync(). Any exception is swallowed.

    :param timeout: maximum time passed to run_sync()
    """
    self._ensure_background_loop()
    try:
        force_coro = self._force_response_now_internal()
        self._bg.run_sync(force_coro, timeout=timeout)
    except Exception:
        pass
|
|
859
|
+
|
|
860
|
+
async def _force_response_now_internal(self):
    """
    Owner-loop: commit the current input buffer and trigger response.create.

    Does nothing when the socket is missing, the client is not running, or
    auto-turn is disabled. Under the send lock it sends
    `input_audio_buffer.commit`, resets (or creates) self._response_done, and
    then sends `response.create`. A failed send ends the call silently.
    """
    if not (self.ws and self._running):
        return
    try:
        auto_turn = bool(getattr(self._last_opts, "auto_turn", False))
    except Exception:
        return
    if not auto_turn:
        # This helper is intended for auto-turn; manual flow already does commit+response.create.
        return

    if self._send_lock is None:
        self._send_lock = asyncio.Lock()

    commit_msg = json.dumps({"type": "input_audio_buffer.commit"})
    create_msg = json.dumps({
        "type": "response.create",
        "response": {"modalities": ["text", "audio"]},
    })

    async with self._send_lock:
        # 1) Finalize current input buffer
        try:
            await self.ws.send(commit_msg)
        except Exception:
            return

        # 2) Prepare wait handle for this response
        done_event = self._response_done
        if done_event is None:
            self._response_done = asyncio.Event()
        else:
            try:
                done_event.clear()
            except Exception:
                # Existing handle cannot be cleared; replace it with a new one
                self._response_done = asyncio.Event()

        # 3) Trigger the assistant response now
        try:
            await self.ws.send(create_msg)
        except Exception:
            return
|
|
898
|
+
|
|
899
|
+
# -----------------------------
|
|
900
|
+
# Public: live tools update
|
|
901
|
+
# -----------------------------
|
|
902
|
+
|
|
903
|
+
async def update_session_tools(
    self,
    tools: Optional[list] = None,
    remote_tools: Optional[list] = None,
    force: bool = False
):
    """
    Update session tools live via session.update.

    Ensures the background loop and awaits the internal update coroutine
    through self._run_on_owner(), returning whatever that call returns.

    :param tools: function tools, or None
    :param remote_tools: provider (remote) tools, or None
    :param force: forwarded to the internal update
    :return: result of self._run_on_owner()
    """
    self._ensure_background_loop()
    update_coro = self._update_session_tools_internal(tools, remote_tools, force)
    outcome = await self._run_on_owner(update_coro)
    return outcome
|
|
917
|
+
|
|
918
|
+
def update_session_tools_sync(
    self,
    tools: Optional[list] = None,
    remote_tools: Optional[list] = None,
    force: bool = False,
    timeout: float = 5.0
):
    """
    Blocking counterpart of update_session_tools().

    Ensures the background loop and executes the internal update coroutine via
    self._bg.run_sync(). Exceptions from run_sync() propagate; nothing is returned.

    :param tools: function tools, or None
    :param remote_tools: provider (remote) tools, or None
    :param force: forwarded to the internal update
    :param timeout: maximum time passed to run_sync()
    """
    self._ensure_background_loop()
    update_coro = self._update_session_tools_internal(tools, remote_tools, force)
    self._bg.run_sync(update_coro, timeout=timeout)
|
|
928
|
+
|
|
929
|
+
async def _update_session_tools_internal(
    self,
    tools: Optional[list],
    remote_tools: Optional[list],
    force: bool
):
    """
    Owner-loop implementation for session tools update.

    When no socket is open, the tools are only stored via
    self._update_last_opts_tools() and the cached signature is reset. With an
    open socket the tools are composed with self._compose_xai_tools() (an empty
    list is used if that raises) and a `session.update` event is sent under the
    send lock, unless the signature matches the cached one and `force` is false.

    :param tools: function tools; None falls back to self._last_opts.tools
    :param remote_tools: provider tools; None falls back to self._last_opts.remote_tools
    :param force: send session.update even when the signature is unchanged
    """
    # If socket is not open, just cache into last opts
    if not self.ws:
        self._update_last_opts_tools(tools, remote_tools)
        self._cached_session_tools_sig = None
        if self.debug:
            print("[update_session_tools] WS not open; cached for next session")
        return

    # Compose xAI-shaped session tools (provider tools + function tools)
    try:
        effective_tools = tools
        if effective_tools is None:
            effective_tools = getattr(self._last_opts, "tools", None)
        effective_remote = remote_tools
        if effective_remote is None:
            effective_remote = getattr(self._last_opts, "remote_tools", None)
        session_tools = self._compose_xai_tools(effective_tools, effective_remote)
    except Exception as e:
        if self.debug:
            print(f"[update_session_tools] sanitize error: {e}")
        session_tools = []

    new_sig = tools_signature(session_tools)

    # Compare with cached signature
    unchanged = self._cached_session_tools_sig == new_sig if not force else False
    if unchanged:
        if self.debug:
            print("[update_session_tools] no changes; skipping session.update")
        self._update_last_opts_tools(tools, remote_tools)
        return

    # Send session.update under the single writer lock
    if self._send_lock is None:
        self._send_lock = asyncio.Lock()
    async with self._send_lock:
        try:
            message = {
                "type": "session.update",
                "session": {"tools": session_tools}
            }
            await self.ws.send(json.dumps(message))
            # Record the new state only after the send succeeded
            self._cached_session_tools_sig = new_sig
            self._update_last_opts_tools(tools, remote_tools)
            if self.debug:
                print(f"[update_session_tools] session.update sent; tools={len(session_tools)}")
        except Exception as e:
            if self.debug:
                print(f"[update_session_tools] send error: {e}")
|
|
983
|
+
|
|
984
|
+
# -----------------------------
|
|
985
|
+
# Public: send tool results back to the model
|
|
986
|
+
# -----------------------------
|
|
987
|
+
|
|
988
|
+
async def send_tool_results(
    self,
    results,
    continue_turn: bool = True,
    wait_for_done: bool = True,
):
    """
    Send tool results back to the Realtime session.

    Ensures the background loop and awaits the internal coroutine through
    self._run_on_owner(), returning whatever that call returns.

    :param results: tool results forwarded to the internal implementation
    :param continue_turn: forwarded; whether a follow-up response is requested
    :param wait_for_done: forwarded; whether to wait for that response to finish
    :return: result of self._run_on_owner()
    """
    self._ensure_background_loop()
    results_coro = self._send_tool_results_internal(results, continue_turn, wait_for_done)
    outcome = await self._run_on_owner(results_coro)
    return outcome
|
|
1001
|
+
|
|
1002
|
+
def send_tool_results_sync(
    self,
    results,
    continue_turn: bool = True,
    wait_for_done: bool = True,
    timeout: float = 20.0,
):
    """
    Blocking counterpart of send_tool_results().

    Ensures the background loop and executes the internal coroutine via
    self._bg.run_sync(), returning its result. Exceptions from run_sync() propagate.

    :param results: tool results forwarded to the internal implementation
    :param continue_turn: forwarded; whether a follow-up response is requested
    :param wait_for_done: forwarded; whether to wait for that response to finish
    :param timeout: maximum time passed to run_sync()
    :return: result of self._bg.run_sync()
    """
    self._ensure_background_loop()
    results_coro = self._send_tool_results_internal(results, continue_turn, wait_for_done)
    return self._bg.run_sync(results_coro, timeout=timeout)
|
|
1015
|
+
|
|
1016
|
+
async def _send_tool_results_internal(
    self,
    results,
    continue_turn: bool,
    wait_for_done: bool,
):
    """
    Owner-loop implementation. Serializes sends under the WS writer lock.

    Builds the outputs with build_tool_outputs_payload() and sends one
    `conversation.item.create` (type `function_call_output`) per output. When
    `continue_turn` is true, self._response_done is reset (or created) and a
    `response.create` event is sent; with `wait_for_done` the call then waits
    on that event outside the lock.

    :param results: tool results passed to build_tool_outputs_payload()
    :param continue_turn: request a follow-up response after the outputs
    :param wait_for_done: wait for the follow-up response to complete
    :raises RuntimeError: if the socket is not open
    """
    if not self.ws:
        raise RuntimeError("Live session is not open")

    outputs = build_tool_outputs_payload(results, self._last_tool_calls)
    if not outputs:
        return

    if self._send_lock is None:
        self._send_lock = asyncio.Lock()

    done_handle: Optional[asyncio.Event] = None
    async with self._send_lock:
        # Emit one conversation.item.create per tool output
        for entry in outputs:
            item = {
                "type": "function_call_output",
                "call_id": entry["call_id"],
                "output": entry["output"],
            }
            message = {"type": "conversation.item.create", "item": item}
            if entry.get("previous_item_id"):
                message["previous_item_id"] = entry["previous_item_id"]
            await self.ws.send(json.dumps(message))

        # Optionally ask the model to continue
        if continue_turn:
            current = self._response_done
            if current is None:
                self._response_done = asyncio.Event()
            else:
                try:
                    current.clear()
                except Exception:
                    self._response_done = asyncio.Event()
            done_handle = self._response_done  # snapshot for race-free waiting
            await self.ws.send(json.dumps({
                "type": "response.create",
                "response": {"modalities": ["text", "audio"]},
            }))

    # Wait for the follow-up response to complete
    if continue_turn and wait_for_done and done_handle:
        try:
            await done_handle.wait()
        except Exception:
            pass
|
|
1072
|
+
|
|
1073
|
+
# -----------------------------
|
|
1074
|
+
# Internal: receive loop
|
|
1075
|
+
# -----------------------------
|
|
1076
|
+
|
|
1077
|
+
async def _recv_loop(self):
|
|
1078
|
+
"""
|
|
1079
|
+
Single receiver loop for the entire session.
|
|
1080
|
+
Processes incoming events and dispatches to callbacks.
|
|
1081
|
+
"""
|
|
1082
|
+
if self.debug:
|
|
1083
|
+
print("[_recv_loop] started")
|
|
1084
|
+
|
|
1085
|
+
DEFAULT_RATE = self._DEFAULT_RATE
|
|
1086
|
+
audio_done = True
|
|
1087
|
+
|
|
1088
|
+
try:
|
|
1089
|
+
while self._running and self.ws:
|
|
1090
|
+
# Do not hard-stop the session on should_stop; only cancel active response if requested.
|
|
1091
|
+
if self._should_stop and self._should_stop():
|
|
1092
|
+
await self._cancel_active_response_internal()
|
|
1093
|
+
|
|
1094
|
+
try:
|
|
1095
|
+
raw = await asyncio.wait_for(self.ws.recv(), timeout=60)
|
|
1096
|
+
except asyncio.TimeoutError:
|
|
1097
|
+
continue
|
|
1098
|
+
except Exception as e:
|
|
1099
|
+
if self.debug:
|
|
1100
|
+
print(f"[_recv_loop] recv error: {e!r}")
|
|
1101
|
+
break
|
|
1102
|
+
|
|
1103
|
+
if isinstance(raw, bytes):
|
|
1104
|
+
continue
|
|
1105
|
+
|
|
1106
|
+
try:
|
|
1107
|
+
ev = json.loads(raw)
|
|
1108
|
+
except Exception:
|
|
1109
|
+
continue
|
|
1110
|
+
|
|
1111
|
+
etype = ev.get("type")
|
|
1112
|
+
|
|
1113
|
+
# ---- session / conversation lifecycle ----
|
|
1114
|
+
if etype in ("session.created", "session.updated"):
|
|
1115
|
+
sess = ev.get("session") or {}
|
|
1116
|
+
sid = sess.get("id")
|
|
1117
|
+
if isinstance(sid, str) and sid.strip():
|
|
1118
|
+
self._rt_session_id = sid.strip()
|
|
1119
|
+
set_ctx_rt_handle(self._ctx, self._rt_session_id, self.window)
|
|
1120
|
+
if self.debug:
|
|
1121
|
+
print(f"[_recv_loop] session id: {self._rt_session_id}")
|
|
1122
|
+
exp = sess.get("expires_at") or sess.get("expiresAt")
|
|
1123
|
+
try:
|
|
1124
|
+
if isinstance(exp, (int, float)) and exp > 0:
|
|
1125
|
+
self._rt_session_expires_at = int(exp)
|
|
1126
|
+
set_rt_session_expires_at(self._ctx, self._rt_session_expires_at, self.window)
|
|
1127
|
+
except Exception:
|
|
1128
|
+
pass
|
|
1129
|
+
continue
|
|
1130
|
+
|
|
1131
|
+
if etype == "conversation.created":
|
|
1132
|
+
conv = ev.get("conversation") or {}
|
|
1133
|
+
cid = conv.get("id")
|
|
1134
|
+
if isinstance(cid, str) and cid.strip():
|
|
1135
|
+
self._rt_session_id = cid.strip()
|
|
1136
|
+
set_ctx_rt_handle(self._ctx, self._rt_session_id, self.window)
|
|
1137
|
+
if self.debug:
|
|
1138
|
+
print(f"[_recv_loop] conversation id: {self._rt_session_id}")
|
|
1139
|
+
continue
|
|
1140
|
+
|
|
1141
|
+
if etype == "response.created":
|
|
1142
|
+
if self.debug:
|
|
1143
|
+
print("[_recv_loop] response created")
|
|
1144
|
+
self._response_active = True
|
|
1145
|
+
audio_done = False
|
|
1146
|
+
self._rt_reset_state()
|
|
1147
|
+
|
|
1148
|
+
elif etype == "input_audio_buffer.speech_started":
|
|
1149
|
+
if self.debug:
|
|
1150
|
+
print("[_recv_loop] speech_started")
|
|
1151
|
+
|
|
1152
|
+
elif etype == "input_audio_buffer.speech_stopped":
|
|
1153
|
+
if self.debug:
|
|
1154
|
+
print("[_recv_loop] speech_stopped")
|
|
1155
|
+
|
|
1156
|
+
elif etype in ("conversation.item.committed", "input_audio_buffer.committed"):
|
|
1157
|
+
if self.debug:
|
|
1158
|
+
print("[_recv_loop] audio_buffer committed")
|
|
1159
|
+
if self._last_opts:
|
|
1160
|
+
self._last_opts.rt_signals.response.emit(RealtimeEvent(RealtimeEvent.RT_OUTPUT_AUDIO_COMMIT, {
|
|
1161
|
+
"ctx": self._ctx,
|
|
1162
|
+
}))
|
|
1163
|
+
|
|
1164
|
+
elif etype == "input_audio_buffer.cleared":
|
|
1165
|
+
if self.debug:
|
|
1166
|
+
print("[_recv_loop] audio_buffer.cleared")
|
|
1167
|
+
|
|
1168
|
+
# ---- input transcription (user speech) ----
|
|
1169
|
+
elif etype == "conversation.item.input_audio_transcription.delta":
|
|
1170
|
+
if self._transcribe_enabled():
|
|
1171
|
+
buf = self._input_tr_buffers.get(ev.get("item_id"))
|
|
1172
|
+
if buf is None:
|
|
1173
|
+
buf = io.StringIO()
|
|
1174
|
+
self._input_tr_buffers[ev.get("item_id")] = buf
|
|
1175
|
+
delta = ev.get("delta") or ev.get("text") or ev.get("transcript") or ""
|
|
1176
|
+
if delta:
|
|
1177
|
+
buf.write(str(delta))
|
|
1178
|
+
|
|
1179
|
+
elif etype in ("conversation.item.input_audio_transcription.completed",
|
|
1180
|
+
"conversation.item.input_audio_transcription.done"):
|
|
1181
|
+
if self._transcribe_enabled():
|
|
1182
|
+
item_id = ev.get("item_id")
|
|
1183
|
+
tr = ev.get("transcript") or ""
|
|
1184
|
+
buf = self._input_tr_buffers.pop(item_id, None)
|
|
1185
|
+
if buf is not None:
|
|
1186
|
+
try:
|
|
1187
|
+
v = buf.getvalue()
|
|
1188
|
+
if v and not tr:
|
|
1189
|
+
tr = v
|
|
1190
|
+
finally:
|
|
1191
|
+
try:
|
|
1192
|
+
buf.close()
|
|
1193
|
+
except Exception:
|
|
1194
|
+
pass
|
|
1195
|
+
if tr:
|
|
1196
|
+
self._save_input_transcript(tr)
|
|
1197
|
+
|
|
1198
|
+
elif etype in ("conversation.item.created", "conversation.item.added"):
|
|
1199
|
+
if self.debug:
|
|
1200
|
+
print("[_recv_loop] conversation item event")
|
|
1201
|
+
if self._transcribe_enabled():
|
|
1202
|
+
item = ev.get("item") or {}
|
|
1203
|
+
if item.get("role") == "user":
|
|
1204
|
+
for c in (item.get("content") or []):
|
|
1205
|
+
if isinstance(c, dict) and c.get("type") in ("input_audio", "audio"):
|
|
1206
|
+
tr = c.get("transcript")
|
|
1207
|
+
if tr:
|
|
1208
|
+
self._save_input_transcript(str(tr))
|
|
1209
|
+
|
|
1210
|
+
# ---- assistant text vs assistant audio transcript deltas ----
|
|
1211
|
+
elif etype in ("response.text.delta", "response.output_text.delta"):
|
|
1212
|
+
delta = ev.get("delta") or ev.get("text")
|
|
1213
|
+
if isinstance(delta, dict) and "text" in delta:
|
|
1214
|
+
delta = delta["text"]
|
|
1215
|
+
if delta:
|
|
1216
|
+
self._rt_append_text(delta)
|
|
1217
|
+
if self._on_text:
|
|
1218
|
+
try:
|
|
1219
|
+
await self._on_text(str(delta))
|
|
1220
|
+
except Exception:
|
|
1221
|
+
pass
|
|
1222
|
+
|
|
1223
|
+
elif etype in ("response.audio_transcript.delta", "response.output_audio_transcript.delta"):
|
|
1224
|
+
if self._transcribe_enabled():
|
|
1225
|
+
delta = ev.get("delta") or ev.get("text")
|
|
1226
|
+
if isinstance(delta, dict) and "text" in delta:
|
|
1227
|
+
delta = delta["text"]
|
|
1228
|
+
if delta:
|
|
1229
|
+
self._rt_append_text(delta)
|
|
1230
|
+
if self._on_text:
|
|
1231
|
+
try:
|
|
1232
|
+
await self._on_text(str(delta))
|
|
1233
|
+
except Exception:
|
|
1234
|
+
pass
|
|
1235
|
+
|
|
1236
|
+
elif etype in ("response.text.done", "response.output_text.done",
|
|
1237
|
+
"response.audio_transcript.done", "response.output_audio_transcript.done"):
|
|
1238
|
+
if self.debug:
|
|
1239
|
+
print("[_recv_loop] text/transcript done")
|
|
1240
|
+
|
|
1241
|
+
elif etype == "response.content_part.added":
|
|
1242
|
+
part = ev.get("part") or {}
|
|
1243
|
+
ptype = part.get("type")
|
|
1244
|
+
if ptype == "text":
|
|
1245
|
+
txt = part.get("text") or ""
|
|
1246
|
+
if txt:
|
|
1247
|
+
self._rt_append_text(txt)
|
|
1248
|
+
if self._on_text:
|
|
1249
|
+
try:
|
|
1250
|
+
await self._on_text(str(txt))
|
|
1251
|
+
except Exception:
|
|
1252
|
+
pass
|
|
1253
|
+
elif ptype == "audio":
|
|
1254
|
+
b64 = part.get("audio")
|
|
1255
|
+
if b64 and self._on_audio:
|
|
1256
|
+
try:
|
|
1257
|
+
data = base64.b64decode(b64)
|
|
1258
|
+
await self._on_audio(data, "audio/pcm", DEFAULT_RATE, 1, False)
|
|
1259
|
+
except Exception:
|
|
1260
|
+
pass
|
|
1261
|
+
tr = part.get("transcript")
|
|
1262
|
+
if tr and self._transcribe_enabled():
|
|
1263
|
+
self._rt_append_text(tr)
|
|
1264
|
+
if self._on_text:
|
|
1265
|
+
try:
|
|
1266
|
+
await self._on_text(str(tr))
|
|
1267
|
+
except Exception:
|
|
1268
|
+
pass
|
|
1269
|
+
|
|
1270
|
+
elif etype in ("response.audio.delta", "response.output_audio.delta"):
|
|
1271
|
+
b64 = ev.get("delta")
|
|
1272
|
+
if b64 and self._on_audio:
|
|
1273
|
+
try:
|
|
1274
|
+
data = base64.b64decode(b64)
|
|
1275
|
+
await self._on_audio(data, "audio/pcm", DEFAULT_RATE, 1, False)
|
|
1276
|
+
except Exception:
|
|
1277
|
+
pass
|
|
1278
|
+
|
|
1279
|
+
elif etype in ("response.audio.done", "response.output_audio.done"):
|
|
1280
|
+
if self.debug:
|
|
1281
|
+
print("[_recv_loop] audio done")
|
|
1282
|
+
if not audio_done and self._on_audio:
|
|
1283
|
+
try:
|
|
1284
|
+
await self._on_audio(b"", "audio/pcm", DEFAULT_RATE, 1, True)
|
|
1285
|
+
except Exception:
|
|
1286
|
+
pass
|
|
1287
|
+
audio_done = True
|
|
1288
|
+
|
|
1289
|
+
# ---- function calling (tools) ----
|
|
1290
|
+
elif etype == "response.output_item.added":
|
|
1291
|
+
if self.debug:
|
|
1292
|
+
print("[_recv_loop] output_item added")
|
|
1293
|
+
item = ev.get("item") or {}
|
|
1294
|
+
if item.get("type") == "function_call":
|
|
1295
|
+
fid = item.get("id") or item.get("item_id") or ""
|
|
1296
|
+
call_id = item.get("call_id") or ""
|
|
1297
|
+
name = item.get("name") or ""
|
|
1298
|
+
self._rt_state["tool_calls"].append({
|
|
1299
|
+
"id": fid,
|
|
1300
|
+
"call_id": call_id,
|
|
1301
|
+
"type": "function",
|
|
1302
|
+
"function": {"name": name, "arguments": ""}
|
|
1303
|
+
})
|
|
1304
|
+
if fid and fid not in self._rt_state["fn_args_buffers"]:
|
|
1305
|
+
self._rt_state["fn_args_buffers"][fid] = io.StringIO()
|
|
1306
|
+
|
|
1307
|
+
elif etype == "response.function_call_arguments.delta":
|
|
1308
|
+
buf = self._rt_state["fn_args_buffers"].get(ev.get("item_id"))
|
|
1309
|
+
if buf is not None:
|
|
1310
|
+
delta = ev.get("delta") or ""
|
|
1311
|
+
if delta:
|
|
1312
|
+
buf.write(delta)
|
|
1313
|
+
|
|
1314
|
+
elif etype == "response.function_call_arguments.done":
|
|
1315
|
+
item_id = ev.get("item_id")
|
|
1316
|
+
args_val = ev.get("arguments") or ""
|
|
1317
|
+
buf = self._rt_state["fn_args_buffers"].pop(item_id, None)
|
|
1318
|
+
if buf is not None:
|
|
1319
|
+
try:
|
|
1320
|
+
concat = buf.getvalue()
|
|
1321
|
+
if concat:
|
|
1322
|
+
args_val = concat
|
|
1323
|
+
finally:
|
|
1324
|
+
try:
|
|
1325
|
+
buf.close()
|
|
1326
|
+
except Exception:
|
|
1327
|
+
pass
|
|
1328
|
+
for tc in self._rt_state["tool_calls"]:
|
|
1329
|
+
if tc.get("id") == item_id:
|
|
1330
|
+
tc["function"]["arguments"] = args_val
|
|
1331
|
+
break
|
|
1332
|
+
self._rt_state["force_func_call"] = True
|
|
1333
|
+
|
|
1334
|
+
elif etype == "response.output_item.done":
|
|
1335
|
+
if self.debug:
|
|
1336
|
+
print("[_recv_loop] output_item done")
|
|
1337
|
+
item = ev.get("item") or {}
|
|
1338
|
+
if item.get("type") == "function_call":
|
|
1339
|
+
fid = item.get("id") or item.get("item_id") or ""
|
|
1340
|
+
name = item.get("name") or ""
|
|
1341
|
+
args_val = item.get("arguments") or ""
|
|
1342
|
+
for tc in self._rt_state["tool_calls"]:
|
|
1343
|
+
if fid and tc.get("id") == fid:
|
|
1344
|
+
if name:
|
|
1345
|
+
tc["function"]["name"] = name
|
|
1346
|
+
if args_val:
|
|
1347
|
+
tc["function"]["arguments"] = args_val
|
|
1348
|
+
break
|
|
1349
|
+
self._rt_state["force_func_call"] = True
|
|
1350
|
+
|
|
1351
|
+
# ---- code interpreter (delta/done) ----
|
|
1352
|
+
elif etype in ("response.code_interpreter_call_code.delta", "response.code_interpreter_call.code.delta"):
|
|
1353
|
+
code_delta = ev.get("delta") or ""
|
|
1354
|
+
if code_delta:
|
|
1355
|
+
if not self._rt_state["is_code"]:
|
|
1356
|
+
hdr = "\n\n**Code interpreter**\n```python\n"
|
|
1357
|
+
self._rt_append_text(hdr + code_delta)
|
|
1358
|
+
if self._on_text:
|
|
1359
|
+
try:
|
|
1360
|
+
await self._on_text(hdr + code_delta)
|
|
1361
|
+
except Exception:
|
|
1362
|
+
pass
|
|
1363
|
+
self._rt_state["is_code"] = True
|
|
1364
|
+
else:
|
|
1365
|
+
self._rt_append_text(code_delta)
|
|
1366
|
+
if self._on_text:
|
|
1367
|
+
try:
|
|
1368
|
+
await self._on_text(code_delta)
|
|
1369
|
+
except Exception:
|
|
1370
|
+
pass
|
|
1371
|
+
|
|
1372
|
+
elif etype in ("response.code_interpreter_call_code.done", "response.code_interpreter_call.code.done"):
|
|
1373
|
+
if self.debug:
|
|
1374
|
+
print("[_recv_loop] code done")
|
|
1375
|
+
if self._rt_state["is_code"]:
|
|
1376
|
+
tail = "\n\n```\n-----------\n"
|
|
1377
|
+
self._rt_append_text(tail)
|
|
1378
|
+
if self._on_text:
|
|
1379
|
+
try:
|
|
1380
|
+
await self._on_text(tail)
|
|
1381
|
+
except Exception:
|
|
1382
|
+
pass
|
|
1383
|
+
self._rt_state["is_code"] = False
|
|
1384
|
+
|
|
1385
|
+
# ---- annotations (citations/files) ----
|
|
1386
|
+
elif etype == "response.output_text.annotation.added":
|
|
1387
|
+
if self.debug:
|
|
1388
|
+
print("[_recv_loop] annotation added")
|
|
1389
|
+
ann = ev.get("annotation") or {}
|
|
1390
|
+
atype = ann.get("type")
|
|
1391
|
+
if atype == "url_citation":
|
|
1392
|
+
url = ann.get("url")
|
|
1393
|
+
self._rt_add_citation(url)
|
|
1394
|
+
elif atype == "container_file_citation":
|
|
1395
|
+
self._rt_state["files"].append({
|
|
1396
|
+
"container_id": ann.get("container_id"),
|
|
1397
|
+
"file_id": ann.get("file_id"),
|
|
1398
|
+
})
|
|
1399
|
+
|
|
1400
|
+
# ---- partial images (defensive) ----
|
|
1401
|
+
elif etype == "response.image_generation_call.partial_image":
|
|
1402
|
+
image_b64 = ev.get("partial_image_b64")
|
|
1403
|
+
if image_b64:
|
|
1404
|
+
try:
|
|
1405
|
+
img_bytes = base64.b64decode(image_b64)
|
|
1406
|
+
save_path = self.window.core.image.gen_unique_path(self._ctx)
|
|
1407
|
+
with open(save_path, "wb") as f:
|
|
1408
|
+
f.write(img_bytes)
|
|
1409
|
+
self._rt_state["image_paths"].append(save_path)
|
|
1410
|
+
self._rt_state["is_image"] = True
|
|
1411
|
+
if not isinstance(self._ctx.images, list):
|
|
1412
|
+
self._ctx.images = []
|
|
1413
|
+
if save_path not in self._ctx.images:
|
|
1414
|
+
self._ctx.images.append(save_path)
|
|
1415
|
+
except Exception:
|
|
1416
|
+
pass
|
|
1417
|
+
|
|
1418
|
+
elif etype == "response.done":
|
|
1419
|
+
if self.debug:
|
|
1420
|
+
print("[_recv_loop] response done")
|
|
1421
|
+
if not audio_done and self._on_audio:
|
|
1422
|
+
try:
|
|
1423
|
+
await self._on_audio(b"", "audio/pcm", DEFAULT_RATE, 1, True)
|
|
1424
|
+
except Exception:
|
|
1425
|
+
pass
|
|
1426
|
+
audio_done = True
|
|
1427
|
+
|
|
1428
|
+
self._response_active = False
|
|
1429
|
+
|
|
1430
|
+
try:
|
|
1431
|
+
resp_obj = ev.get("response") or {}
|
|
1432
|
+
self._rt_capture_usage(resp_obj)
|
|
1433
|
+
except Exception:
|
|
1434
|
+
pass
|
|
1435
|
+
|
|
1436
|
+
output = "".join(self._rt_state["output_parts"]) if self._rt_state else ""
|
|
1437
|
+
if has_unclosed_code_tag(output):
|
|
1438
|
+
output += "\n```"
|
|
1439
|
+
if not output:
|
|
1440
|
+
try:
|
|
1441
|
+
transcript = self._extract_text_from_response_done(ev)
|
|
1442
|
+
if transcript:
|
|
1443
|
+
output = transcript
|
|
1444
|
+
except Exception:
|
|
1445
|
+
pass
|
|
1446
|
+
|
|
1447
|
+
try:
|
|
1448
|
+
if self._ctx:
|
|
1449
|
+
self._ctx.output = output or (self._ctx.output or "")
|
|
1450
|
+
up = self._rt_state.get("usage_payload") if self._rt_state else None
|
|
1451
|
+
if up:
|
|
1452
|
+
in_tok = up.get("in")
|
|
1453
|
+
out_tok = up.get("out")
|
|
1454
|
+
if in_tok is None:
|
|
1455
|
+
in_tok = self._ctx.input_tokens if self._ctx.input_tokens is not None else 0
|
|
1456
|
+
if out_tok is None:
|
|
1457
|
+
out_tok = 0
|
|
1458
|
+
self._ctx.set_tokens(in_tok, out_tok)
|
|
1459
|
+
try:
|
|
1460
|
+
if not isinstance(self._ctx.extra, dict):
|
|
1461
|
+
self._ctx.extra = {}
|
|
1462
|
+
self._ctx.extra["usage"] = {
|
|
1463
|
+
"vendor": "openai",
|
|
1464
|
+
"input_tokens": in_tok,
|
|
1465
|
+
"output_tokens": out_tok,
|
|
1466
|
+
"reasoning_tokens": up.get("reasoning", 0),
|
|
1467
|
+
"total_reported": up.get("total"),
|
|
1468
|
+
}
|
|
1469
|
+
except Exception:
|
|
1470
|
+
pass
|
|
1471
|
+
|
|
1472
|
+
if self._rt_state and self._rt_state["citations"]:
|
|
1473
|
+
if self._ctx.urls is None:
|
|
1474
|
+
self._ctx.urls = []
|
|
1475
|
+
for u in self._rt_state["citations"]:
|
|
1476
|
+
if u not in self._ctx.urls:
|
|
1477
|
+
self._ctx.urls.append(u)
|
|
1478
|
+
|
|
1479
|
+
if self._rt_state and self._rt_state["image_paths"]:
|
|
1480
|
+
if not isinstance(self._ctx.images, list):
|
|
1481
|
+
self._ctx.images = []
|
|
1482
|
+
for p in self._rt_state["image_paths"]:
|
|
1483
|
+
if p not in self._ctx.images:
|
|
1484
|
+
self._ctx.images.append(p)
|
|
1485
|
+
|
|
1486
|
+
self.window.core.ctx.update_item(self._ctx)
|
|
1487
|
+
except Exception:
|
|
1488
|
+
pass
|
|
1489
|
+
|
|
1490
|
+
try:
|
|
1491
|
+
files = (self._rt_state or {}).get("files") or []
|
|
1492
|
+
if files:
|
|
1493
|
+
self.window.core.api.openai.container.download_files(self._ctx, files)
|
|
1494
|
+
except Exception:
|
|
1495
|
+
pass
|
|
1496
|
+
|
|
1497
|
+
try:
|
|
1498
|
+
tcs = (self._rt_state or {}).get("tool_calls") or []
|
|
1499
|
+
if tcs:
|
|
1500
|
+
for tc in tcs:
|
|
1501
|
+
fn = tc.get("function") or {}
|
|
1502
|
+
if isinstance(fn.get("arguments"), dict):
|
|
1503
|
+
fn["arguments"] = json.dumps(fn["arguments"], ensure_ascii=False)
|
|
1504
|
+
self._ctx.force_call = bool((self._rt_state or {}).get("force_func_call"))
|
|
1505
|
+
self.window.core.debug.info("[realtime] Tool calls found, unpacking...")
|
|
1506
|
+
self.window.core.command.unpack_tool_calls_chunks(self._ctx, tcs)
|
|
1507
|
+
self.window.core.ctx.update_item(self._ctx)
|
|
1508
|
+
except Exception:
|
|
1509
|
+
pass
|
|
1510
|
+
|
|
1511
|
+
try:
|
|
1512
|
+
tcs = (self._rt_state or {}).get("tool_calls") or []
|
|
1513
|
+
if tcs:
|
|
1514
|
+
self._last_tool_calls = list(tcs)
|
|
1515
|
+
except Exception:
|
|
1516
|
+
pass
|
|
1517
|
+
|
|
1518
|
+
if self._response_done:
|
|
1519
|
+
self._response_done.set()
|
|
1520
|
+
|
|
1521
|
+
if self._last_opts:
|
|
1522
|
+
self._last_opts.rt_signals.response.emit(RealtimeEvent(RealtimeEvent.RT_OUTPUT_TURN_END, {
|
|
1523
|
+
"ctx": self._ctx,
|
|
1524
|
+
}))
|
|
1525
|
+
|
|
1526
|
+
self._rt_state = None
|
|
1527
|
+
|
|
1528
|
+
elif etype == "error":
|
|
1529
|
+
if self.debug:
|
|
1530
|
+
print(f"[_recv_loop] error event: {ev}")
|
|
1531
|
+
err = ev.get("error") or {}
|
|
1532
|
+
msg = (err.get("message") or "")
|
|
1533
|
+
code = (err.get("code") or "")
|
|
1534
|
+
if isinstance(code, str) and code.strip().lower() == "session_expired":
|
|
1535
|
+
self._rt_session_id = None
|
|
1536
|
+
if self.debug:
|
|
1537
|
+
print("[_recv_loop] session expired")
|
|
1538
|
+
if "already has an active response" in (msg or "").lower():
|
|
1539
|
+
if self._response_done:
|
|
1540
|
+
self._response_done.set()
|
|
1541
|
+
continue
|
|
1542
|
+
if self._response_done:
|
|
1543
|
+
self._response_done.set()
|
|
1544
|
+
if self.debug:
|
|
1545
|
+
print(f"[_recv_loop] error: {msg}")
|
|
1546
|
+
|
|
1547
|
+
# Other events are ignored
|
|
1548
|
+
|
|
1549
|
+
except Exception as e:
|
|
1550
|
+
if self.debug:
|
|
1551
|
+
print(f"[_recv_loop] exception: {e!r}")
|
|
1552
|
+
finally:
|
|
1553
|
+
if self.debug:
|
|
1554
|
+
print("[_recv_loop] stopped")
|
|
1555
|
+
try:
|
|
1556
|
+
if self._response_done and not self._response_done.is_set():
|
|
1557
|
+
self._response_done.set()
|
|
1558
|
+
except Exception:
|
|
1559
|
+
pass
|
|
1560
|
+
try:
|
|
1561
|
+
if self.ws:
|
|
1562
|
+
await self.ws.close()
|
|
1563
|
+
except Exception:
|
|
1564
|
+
pass
|
|
1565
|
+
self.ws = None
|
|
1566
|
+
self._running = False
|
|
1567
|
+
|
|
1568
|
+
# -----------------------------
|
|
1569
|
+
# Helpers
|
|
1570
|
+
# -----------------------------
|
|
1571
|
+
|
|
1572
|
+
def _preferred_voice(self) -> str:
    """
    Return the voice name to use for audio output.

    The value is read from the ``openai_voice`` option of the
    ``audio_output`` plugin and converted to a string. When the option is
    empty, or reading it fails for any reason, ``"Ara"`` is returned.

    :return: configured voice name or the ``"Ara"`` fallback
    """
    fallback = "Ara"
    try:
        configured = self.window.core.plugins.get_option("audio_output", "openai_voice")
        return str(configured) if configured else fallback
    except Exception:
        # Best effort: a missing plugin/option must never break the session.
        return fallback
|
|
1583
|
+
|
|
1584
|
+
def _extract_text_from_response_done(self, ev: dict) -> str:
    """
    Extract assistant text from a ``response.done`` event payload.

    Walks ``ev["response"]["output"]`` and, for every dict item, scans its
    ``content`` list collecting in order:

    - the ``transcript`` of ``audio`` parts, only when
      ``self._transcribe_enabled()`` returns a truthy value,
    - the ``text`` of ``text`` / ``output_text`` / ``input_text`` parts; the
      text may be a plain value or a dict holding it under ``"text"`` or
      ``"value"``.

    Items are scanned regardless of their ``type``. Malformed payloads
    (non-dict event/response, non-list output/content, non-dict entries) are
    skipped instead of raising.

    :param ev: decoded event dictionary
    :return: collected fragments, each stripped, joined with newlines;
             empty string when nothing usable was found
    """
    if not isinstance(ev, dict):
        return ""
    response = ev.get("response")
    if not isinstance(response, dict):
        return ""
    items = response.get("output")
    if not isinstance(items, (list, tuple)):
        return ""

    parts: list[str] = []
    for item in items:
        if not isinstance(item, dict):
            continue
        content_list = item.get("content")
        if not isinstance(content_list, (list, tuple)):
            continue
        for chunk in content_list:
            if not isinstance(chunk, dict):
                continue
            ctype = chunk.get("type")
            if ctype == "audio" and self._transcribe_enabled():
                value = chunk.get("transcript")
            elif ctype in ("text", "output_text", "input_text"):
                value = chunk.get("text")
                if isinstance(value, dict):
                    value = value.get("text") or value.get("value")
            else:
                continue
            if not value:
                continue
            # Whitespace-only fragments carry no text and are dropped.
            fragment = str(value).strip()
            if fragment:
                parts.append(fragment)

    return "\n".join(parts)
|
|
1615
|
+
|
|
1616
|
+
# ---- per-response state helpers ----
|
|
1617
|
+
|
|
1618
|
+
def _rt_reset_state(self):
    """
    Replace ``self._rt_state`` with a fresh per-response state dictionary.

    Every call creates new, empty containers, so nothing is shared with the
    previous state object.
    """
    self._rt_state = dict(
        output_parts=[],
        begin=True,
        fn_args_buffers={},
        tool_calls=[],
        citations=[],
        files=[],
        image_paths=[],
        is_image=False,
        is_code=False,
        force_func_call=False,
        usage_payload={},
    )
|
|
1633
|
+
|
|
1634
|
+
def _rt_append_text(self, s: str):
    """
    Append a text delta to the assembled output.

    The per-response state is created on demand. Empty deltas received
    before any real text are skipped, and ``None`` deltas are ignored at any
    point so the literal string ``"None"`` never ends up in the output.

    :param s: text delta to append
    """
    if self._rt_state is None:
        self._rt_reset_state()
    if s is None:
        return
    state = self._rt_state
    if state["begin"] and s == "":
        # Still waiting for the first non-empty chunk.
        return
    state["output_parts"].append(str(s))
    state["begin"] = False
|
|
1642
|
+
|
|
1643
|
+
def _rt_add_citation(self, url: Optional[str]):
    """
    Record a URL citation in the per-response state and on the current ctx.

    Only non-empty strings that start with ``http://`` or ``https://`` (after
    stripping surrounding whitespace) are accepted. Duplicates are skipped in
    both places. The per-response state is created on demand, as in
    ``_rt_append_text``, instead of raising when it is missing.

    :param url: candidate citation URL
    """
    if not url or not isinstance(url, str):
        return
    url = url.strip()
    if not url.startswith(("http://", "https://")):
        return

    if self._rt_state is None:
        self._rt_reset_state()
    citations = self._rt_state["citations"]
    if url not in citations:
        citations.append(url)

    try:
        ctx = self._ctx
        if ctx:
            if ctx.urls is None:
                ctx.urls = []
            if url not in ctx.urls:
                ctx.urls.append(url)
    except Exception:
        # Mirroring onto ctx is best effort; the state list is authoritative.
        pass
|
|
1660
|
+
|
|
1661
|
+
def _rt_capture_usage(self, response_obj: dict):
    """
    Capture token usage from a response object into the per-response state.

    Reads ``response_obj["usage"]`` and stores ``in`` / ``out`` / ``total`` /
    ``reasoning`` under ``self._rt_state["usage_payload"]``. Input tokens are
    taken from ``input_tokens`` (falling back to ``prompt_tokens``), output
    tokens from ``output_tokens`` (falling back to ``completion_tokens``).
    A reported value of ``0`` is kept as ``0`` and does not trigger the
    fallback key. ``reasoning`` is always stored as ``0``.

    Nothing is stored when there is no usage data, when no per-response state
    exists, or when a value cannot be converted to ``int``.

    :param response_obj: the ``response`` object of the event
    """
    try:
        usage = (response_obj or {}).get("usage") or {}
        if not usage or self._rt_state is None:
            return

        def first_int(*keys):
            # First key that is actually present (not None) wins.
            for key in keys:
                value = usage.get(key)
                if value is not None:
                    return int(value)
            return None

        self._rt_state["usage_payload"] = {
            "in": first_int("input_tokens", "prompt_tokens"),
            "out": first_int("output_tokens", "completion_tokens"),
            "total": first_int("total_tokens"),
            "reasoning": 0,
        }
    except Exception:
        # Usage is informational only; malformed data must not break the turn.
        pass
|
|
1680
|
+
|
|
1681
|
+
# ---- transcription helpers ----
|
|
1682
|
+
|
|
1683
|
+
def _transcribe_enabled(self) -> bool:
    """
    Tell whether transcription is switched on for the current options.

    :return: truthiness of ``self._last_opts.transcribe``; ``False`` when the
             attribute is missing or cannot be read
    """
    try:
        opts = self._last_opts
        flag = getattr(opts, "transcribe", False)
        return True if flag else False
    except Exception:
        return False
|
|
1689
|
+
|
|
1690
|
+
def _save_input_transcript(self, transcript: str):
    """
    Store the transcript of the user's input on the current ctx item.

    The text is saved under ``ctx.extra["input_transcript"]`` (``extra`` is
    replaced with a new dict when it is not one). When ``self._last_opts`` has
    no truthy ``prompt``, ``ctx.input`` is set to the transcript as well. The
    ctx item is then passed to ``self.window.core.ctx.update_item``.

    Empty transcripts and a missing ctx are ignored; any error is swallowed.

    :param transcript: transcribed input text
    """
    if not transcript:
        return
    try:
        ctx = self._ctx
        if not ctx:
            return
        text = str(transcript)
        if not isinstance(ctx.extra, dict):
            ctx.extra = {}
        ctx.extra["input_transcript"] = text
        typed_prompt = getattr(self._last_opts, "prompt", None)
        if not typed_prompt:
            ctx.input = text
        self.window.core.ctx.update_item(ctx)
    except Exception:
        pass
|
|
1707
|
+
|
|
1708
|
+
def _tune_openai_vad(self, session_payload: dict, opts) -> None:
    """
    Adjust the turn-detection timing values of a session payload in place.

    Operates on ``session_payload["session"]["turn_detection"]`` and does
    nothing when that entry is not a dict.

    - ``silence_duration_ms`` is set to ``opts.vad_end_silence_ms`` when that
      is a positive number; otherwise to the larger of the value already in
      the payload (500 when absent) and 2000.
    - ``prefix_padding_ms`` is set to ``opts.vad_prefix_padding_ms`` when that
      is a non-negative number.

    Any error is swallowed.

    :param session_payload: ``session.update`` style payload to modify
    :param opts: options object the overrides are read from
    """
    try:
        detection = (session_payload.get("session") or {}).get("turn_detection")
        if not isinstance(detection, dict):
            return

        hold_ms = getattr(opts, "vad_end_silence_ms", None)
        if not isinstance(hold_ms, (int, float)) or hold_ms <= 0:
            current = int(detection.get("silence_duration_ms") or 500)
            hold_ms = max(current, 2000)
        detection["silence_duration_ms"] = int(hold_ms)

        padding_ms = getattr(opts, "vad_prefix_padding_ms", None)
        if isinstance(padding_ms, (int, float)) and padding_ms >= 0:
            detection["prefix_padding_ms"] = int(padding_ms)
    except Exception:
        pass
|
|
1730
|
+
|
|
1731
|
+
def update_session_autoturn_sync(
    self,
    enabled: bool,
    silence_ms: Optional[int] = None,
    prefix_ms: Optional[int] = None,
    timeout: float = 5.0,
):
    """
    Blocking wrapper that switches auto-turn (VAD) mode on or off.

    Ensures the background loop exists, then runs
    ``_update_session_autoturn_internal`` through ``self._bg.run_sync`` with
    the given timeout. Errors raised while scheduling or running the update
    are swallowed.

    :param enabled: True to enable auto-turn, False to disable it
    :param silence_ms: optional silence duration override in milliseconds
    :param prefix_ms: optional prefix padding override in milliseconds
    :param timeout: value passed to ``run_sync`` as ``timeout``
    """
    self._ensure_background_loop()
    try:
        run_blocking = self._bg.run_sync
        run_blocking(
            self._update_session_autoturn_internal(enabled, silence_ms, prefix_ms),
            timeout=timeout,
        )
    except Exception:
        pass
|
|
1751
|
+
|
|
1752
|
+
async def _update_session_autoturn_internal(
    self,
    enabled: bool,
    silence_ms: Optional[int] = None,
    prefix_ms: Optional[int] = None,
):
    """
    Toggle auto-turn (VAD) mode at runtime, with optional timing overrides.

    When no WebSocket is open, the requested values are only stored on
    ``self._last_opts`` and the method returns. Otherwise a ``session.update``
    payload is built via ``apply_turn_mode_openai`` and, when enabling,
    its ``turn_detection`` entry is refined from ``self._last_opts``
    (``vad_type``, ``vad_threshold``, ``vad_create_response``,
    ``vad_interrupt_response``), tuned by ``_tune_openai_vad`` and finally
    overridden by ``silence_ms`` / ``prefix_ms``. The payload is sent under
    ``self._send_lock``; after a successful send the values are stored on
    ``self._last_opts``. Send errors are swallowed (printed in debug mode).

    :param enabled: True for automatic turn mode, False for manual
    :param silence_ms: optional silence duration override in milliseconds
    :param prefix_ms: optional prefix padding override in milliseconds
    """

    def remember_settings() -> None:
        # Store the requested values on the cached options; best effort.
        try:
            opts = self._last_opts
            if not opts:
                return
            setattr(opts, "auto_turn", bool(enabled))
            if silence_ms is not None:
                setattr(opts, "vad_end_silence_ms", int(silence_ms))
            if prefix_ms is not None:
                setattr(opts, "vad_prefix_padding_ms", int(prefix_ms))
        except Exception:
            pass

    # Without an open socket there is nothing to send; only cache the values.
    if not self.ws:
        remember_settings()
        if self.debug:
            print("[update_session_autoturn] WS not open; cached for next session")
        return

    if self._send_lock is None:
        self._send_lock = asyncio.Lock()

    async with self._send_lock:
        try:
            payload: dict = {"type": "session.update", "session": {}}
            apply_turn_mode_openai(
                payload,
                TurnMode.AUTO if enabled else TurnMode.MANUAL,
            )

            if enabled:
                detection = payload.get("session", {}).get("turn_detection")

                # Optional detector type override.
                try:
                    wanted_type = getattr(self._last_opts, "vad_type", None)
                    if isinstance(wanted_type, str) and wanted_type in ("server_vad", "semantic_vad"):
                        if isinstance(detection, dict):
                            detection["type"] = wanted_type
                except Exception:
                    pass

                # Optional threshold, applied only for the "server_vad" type.
                try:
                    threshold = getattr(self._last_opts, "vad_threshold", None)
                    if (
                        isinstance(threshold, (int, float))
                        and isinstance(detection, dict)
                        and detection.get("type") == "server_vad"
                    ):
                        detection["threshold"] = float(threshold)
                except Exception:
                    pass

                self._tune_openai_vad(payload, self._last_opts)

                # Explicit arguments win over the tuned values.
                if isinstance(detection, dict):
                    if silence_ms is not None:
                        detection["silence_duration_ms"] = int(silence_ms)
                    if prefix_ms is not None:
                        detection["prefix_padding_ms"] = int(prefix_ms)

                # Optional boolean flags copied from the cached options.
                for opt_name, field in (
                    ("vad_create_response", "create_response"),
                    ("vad_interrupt_response", "interrupt_response"),
                ):
                    try:
                        flag = getattr(self._last_opts, opt_name, None)
                        if isinstance(flag, bool):
                            detection[field] = flag
                    except Exception:
                        pass

            await self.ws.send(json.dumps(payload))

            remember_settings()

            if self.debug:
                td_dbg = (payload.get("session", {}) or {}).get("turn_detection")
                print(f"[update_session_autoturn] session.update sent; auto_turn={enabled}, td={td_dbg}")

        except Exception as e:
            if self.debug:
                print(f"[update_session_autoturn] send error: {e}")
|
|
1845
|
+
|
|
1846
|
+
def set_debug(self, enabled: bool):
    """
    Switch debug output on or off.

    :param enabled: truthy to enable debug output, falsy to disable it;
                    stored on ``self.debug`` as a ``bool``
    """
    self.debug = True if enabled else False
|
|
1853
|
+
|
|
1854
|
+
def is_session_active(self) -> bool:
    """Check that a WS object is present and the client is marked as running."""
    if self.ws is None:
        return False
    return self._running
|
|
1857
|
+
|
|
1858
|
+
def is_session(self) -> bool:
    """Check whether a WS object is currently held (``self.ws`` is not None)."""
    has_socket = self.ws is not None
    return has_socket
|
|
1861
|
+
|
|
1862
|
+
def update_ctx(self, ctx: CtxItem):
    """
    Replace the ctx item this client currently works with.

    :param ctx: ctx item to store on ``self._ctx``
    """
    self._ctx = ctx
|