pygpt-net 2.7.6__py3-none-any.whl → 2.7.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pygpt_net/CHANGELOG.txt +13 -0
- pygpt_net/__init__.py +3 -3
- pygpt_net/app.py +5 -1
- pygpt_net/controller/assistant/batch.py +2 -2
- pygpt_net/controller/assistant/files.py +7 -6
- pygpt_net/controller/assistant/threads.py +0 -0
- pygpt_net/controller/chat/command.py +0 -0
- pygpt_net/controller/chat/remote_tools.py +3 -9
- pygpt_net/controller/chat/stream.py +2 -2
- pygpt_net/controller/chat/{handler/worker.py → stream_worker.py} +13 -35
- pygpt_net/controller/dialogs/confirm.py +35 -58
- pygpt_net/controller/lang/mapping.py +9 -9
- pygpt_net/controller/remote_store/{google/batch.py → batch.py} +209 -252
- pygpt_net/controller/remote_store/remote_store.py +982 -13
- pygpt_net/core/command/command.py +0 -0
- pygpt_net/core/db/viewer.py +1 -1
- pygpt_net/core/debug/models.py +2 -2
- pygpt_net/core/realtime/worker.py +3 -1
- pygpt_net/{controller/remote_store/google → core/remote_store/anthropic}/__init__.py +0 -1
- pygpt_net/core/remote_store/anthropic/files.py +211 -0
- pygpt_net/core/remote_store/anthropic/store.py +208 -0
- pygpt_net/core/remote_store/openai/store.py +5 -4
- pygpt_net/core/remote_store/remote_store.py +5 -1
- pygpt_net/{controller/remote_store/openai → core/remote_store/xai}/__init__.py +0 -1
- pygpt_net/core/remote_store/xai/files.py +225 -0
- pygpt_net/core/remote_store/xai/store.py +219 -0
- pygpt_net/data/config/config.json +18 -5
- pygpt_net/data/config/models.json +193 -4
- pygpt_net/data/config/settings.json +179 -36
- pygpt_net/data/icons/folder_eye.svg +1 -0
- pygpt_net/data/icons/folder_eye_filled.svg +1 -0
- pygpt_net/data/icons/folder_open.svg +1 -0
- pygpt_net/data/icons/folder_open_filled.svg +1 -0
- pygpt_net/data/locale/locale.de.ini +6 -3
- pygpt_net/data/locale/locale.en.ini +46 -12
- pygpt_net/data/locale/locale.es.ini +6 -3
- pygpt_net/data/locale/locale.fr.ini +6 -3
- pygpt_net/data/locale/locale.it.ini +6 -3
- pygpt_net/data/locale/locale.pl.ini +7 -4
- pygpt_net/data/locale/locale.uk.ini +6 -3
- pygpt_net/data/locale/locale.zh.ini +6 -3
- pygpt_net/icons.qrc +4 -0
- pygpt_net/icons_rc.py +282 -138
- pygpt_net/plugin/cmd_mouse_control/worker.py +2 -1
- pygpt_net/plugin/cmd_mouse_control/worker_sandbox.py +2 -1
- pygpt_net/provider/api/anthropic/__init__.py +10 -3
- pygpt_net/provider/api/anthropic/chat.py +342 -11
- pygpt_net/provider/api/anthropic/computer.py +844 -0
- pygpt_net/provider/api/anthropic/remote_tools.py +172 -0
- pygpt_net/provider/api/anthropic/store.py +307 -0
- pygpt_net/{controller/chat/handler/anthropic_stream.py → provider/api/anthropic/stream.py} +99 -10
- pygpt_net/provider/api/anthropic/tools.py +32 -77
- pygpt_net/provider/api/anthropic/utils.py +30 -0
- pygpt_net/{controller/chat/handler → provider/api/anthropic/worker}/__init__.py +0 -0
- pygpt_net/provider/api/anthropic/worker/importer.py +278 -0
- pygpt_net/provider/api/google/chat.py +62 -9
- pygpt_net/provider/api/google/store.py +124 -3
- pygpt_net/{controller/chat/handler/google_stream.py → provider/api/google/stream.py} +92 -25
- pygpt_net/provider/api/google/utils.py +185 -0
- pygpt_net/provider/api/google/worker/importer.py +16 -28
- pygpt_net/provider/api/langchain/__init__.py +0 -0
- pygpt_net/{controller/chat/handler/langchain_stream.py → provider/api/langchain/stream.py} +1 -1
- pygpt_net/provider/api/llama_index/__init__.py +0 -0
- pygpt_net/{controller/chat/handler/llamaindex_stream.py → provider/api/llama_index/stream.py} +1 -1
- pygpt_net/provider/api/openai/assistants.py +2 -2
- pygpt_net/provider/api/openai/image.py +2 -2
- pygpt_net/provider/api/openai/store.py +4 -1
- pygpt_net/{controller/chat/handler/openai_stream.py → provider/api/openai/stream.py} +1 -1
- pygpt_net/provider/api/openai/utils.py +69 -3
- pygpt_net/provider/api/openai/worker/importer.py +19 -61
- pygpt_net/provider/api/openai/worker/importer_assistants.py +230 -0
- pygpt_net/provider/api/x_ai/__init__.py +138 -15
- pygpt_net/provider/api/x_ai/audio.py +43 -11
- pygpt_net/provider/api/x_ai/chat.py +92 -4
- pygpt_net/provider/api/x_ai/image.py +149 -47
- pygpt_net/provider/api/x_ai/realtime/__init__.py +12 -0
- pygpt_net/provider/api/x_ai/realtime/client.py +1825 -0
- pygpt_net/provider/api/x_ai/realtime/realtime.py +198 -0
- pygpt_net/provider/api/x_ai/{remote.py → remote_tools.py} +183 -70
- pygpt_net/provider/api/x_ai/responses.py +507 -0
- pygpt_net/provider/api/x_ai/store.py +610 -0
- pygpt_net/{controller/chat/handler/xai_stream.py → provider/api/x_ai/stream.py} +42 -10
- pygpt_net/provider/api/x_ai/tools.py +59 -8
- pygpt_net/{controller/chat/handler → provider/api/x_ai}/utils.py +1 -2
- pygpt_net/provider/api/x_ai/vision.py +1 -4
- pygpt_net/provider/api/x_ai/worker/importer.py +308 -0
- pygpt_net/provider/audio_input/xai_grok_voice.py +390 -0
- pygpt_net/provider/audio_output/xai_tts.py +325 -0
- pygpt_net/provider/core/config/patch.py +39 -3
- pygpt_net/provider/core/config/patches/patch_before_2_6_42.py +2 -2
- pygpt_net/provider/core/model/patch.py +39 -1
- pygpt_net/tools/image_viewer/tool.py +334 -34
- pygpt_net/tools/image_viewer/ui/dialogs.py +319 -22
- pygpt_net/tools/text_editor/ui/dialogs.py +3 -2
- pygpt_net/tools/text_editor/ui/widgets.py +0 -0
- pygpt_net/ui/dialog/assistant.py +1 -1
- pygpt_net/ui/dialog/plugins.py +13 -5
- pygpt_net/ui/dialog/remote_store.py +552 -0
- pygpt_net/ui/dialogs.py +3 -5
- pygpt_net/ui/layout/ctx/ctx_list.py +58 -7
- pygpt_net/ui/menu/tools.py +6 -13
- pygpt_net/ui/widget/dialog/base.py +16 -5
- pygpt_net/ui/widget/dialog/{remote_store_google.py → remote_store.py} +10 -10
- pygpt_net/ui/widget/element/button.py +4 -4
- pygpt_net/ui/widget/image/display.py +2 -2
- pygpt_net/ui/widget/lists/context.py +2 -2
- pygpt_net/ui/widget/textarea/editor.py +0 -0
- {pygpt_net-2.7.6.dist-info → pygpt_net-2.7.8.dist-info}/METADATA +15 -2
- {pygpt_net-2.7.6.dist-info → pygpt_net-2.7.8.dist-info}/RECORD +107 -89
- pygpt_net/controller/remote_store/google/store.py +0 -615
- pygpt_net/controller/remote_store/openai/batch.py +0 -524
- pygpt_net/controller/remote_store/openai/store.py +0 -699
- pygpt_net/ui/dialog/remote_store_google.py +0 -539
- pygpt_net/ui/dialog/remote_store_openai.py +0 -539
- pygpt_net/ui/widget/dialog/remote_store_openai.py +0 -56
- pygpt_net/ui/widget/lists/remote_store_google.py +0 -248
- pygpt_net/ui/widget/lists/remote_store_openai.py +0 -317
- {pygpt_net-2.7.6.dist-info → pygpt_net-2.7.8.dist-info}/LICENSE +0 -0
- {pygpt_net-2.7.6.dist-info → pygpt_net-2.7.8.dist-info}/WHEEL +0 -0
- {pygpt_net-2.7.6.dist-info → pygpt_net-2.7.8.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,1825 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
# ================================================== #
|
|
4
|
+
# This file is a part of PYGPT package #
|
|
5
|
+
# Website: https://pygpt.net #
|
|
6
|
+
# GitHub: https://github.com/szczyglis-dev/py-gpt #
|
|
7
|
+
# MIT License #
|
|
8
|
+
# Created By : Marcin Szczygliński #
|
|
9
|
+
# Updated Date: 2026.01.06 20:00:00 #
|
|
10
|
+
# ================================================== #
|
|
11
|
+
|
|
12
|
+
import asyncio
|
|
13
|
+
import base64
|
|
14
|
+
import io
|
|
15
|
+
import json
|
|
16
|
+
import websockets
|
|
17
|
+
|
|
18
|
+
from typing import Optional, Callable, Awaitable
|
|
19
|
+
from urllib.parse import urlencode
|
|
20
|
+
|
|
21
|
+
from pygpt_net.core.events import RealtimeEvent
|
|
22
|
+
from pygpt_net.core.types import MODE_AUDIO
|
|
23
|
+
from pygpt_net.item.ctx import CtxItem
|
|
24
|
+
from pygpt_net.core.text.utils import has_unclosed_code_tag
|
|
25
|
+
|
|
26
|
+
# shared
|
|
27
|
+
from pygpt_net.core.realtime.shared.loop import BackgroundLoop
|
|
28
|
+
from pygpt_net.core.realtime.shared.audio import (
|
|
29
|
+
coerce_to_pcm16_mono,
|
|
30
|
+
resample_pcm16_mono,
|
|
31
|
+
iter_pcm_chunks,
|
|
32
|
+
DEFAULT_24K,
|
|
33
|
+
)
|
|
34
|
+
from pygpt_net.core.realtime.shared.tools import (
|
|
35
|
+
sanitize_function_tools,
|
|
36
|
+
sanitize_remote_tools,
|
|
37
|
+
prepare_tools_for_session,
|
|
38
|
+
prepare_tools_for_response,
|
|
39
|
+
tools_signature,
|
|
40
|
+
build_tool_outputs_payload,
|
|
41
|
+
)
|
|
42
|
+
from pygpt_net.core.realtime.shared.turn import TurnMode, apply_turn_mode_openai
|
|
43
|
+
from pygpt_net.core.realtime.shared.session import set_ctx_rt_handle, set_rt_session_expires_at
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class xAIIRealtimeClient:
|
|
47
|
+
"""
|
|
48
|
+
xAI Realtime API client with persistent session and a dedicated background event loop.
|
|
49
|
+
|
|
50
|
+
Key points:
|
|
51
|
+
- A single background asyncio loop runs in its own thread for the lifetime of the client.
|
|
52
|
+
- One websocket connection (session) at a time; multiple "turns" (send_turn) are serialized.
|
|
53
|
+
- No server VAD: manual turn control via input_audio_buffer.* + response.create.
|
|
54
|
+
- Safe to call run()/send_turn()/reset()/shutdown() from any thread or event loop.
|
|
55
|
+
|
|
56
|
+
Session resumption:
|
|
57
|
+
- The official Realtime API does not expose a documented server-side "resume" for closed WS sessions.
|
|
58
|
+
We still persist the server-provided session.id and surface it via ctx.extra["rt_session_id"].
|
|
59
|
+
- If opts.rt_session_id is provided and differs from the current in-memory handle, we reset the
|
|
60
|
+
connection and attempt to reconnect with a "session_id" query parameter. If that fails, we fall
|
|
61
|
+
back to the standard URL to avoid breaking existing functionality.
|
|
62
|
+
"""
|
|
63
|
+
|
|
64
|
+
WS_URL = "wss://api.x.ai/v1/realtime"
|
|
65
|
+
|
|
66
|
+
def __init__(self, window=None, debug: bool = False):
    """
    Create an idle xAI Realtime client; no connection is opened here.

    :param window: Window instance
    :param debug: Enable debug logging
    """
    self.window = window
    self.debug = debug

    # Sample rate used when incoming audio does not match the default
    self._DEFAULT_RATE = DEFAULT_24K

    # Dedicated background event loop wrapper
    self._bg = BackgroundLoop(name="xAI-RT-Loop")

    # Connection state: websocket, receiver task and running flag
    self.ws: Optional[websockets.WebSocketClientProtocol] = None
    self._rx_task: Optional[asyncio.Task] = None
    self._running: bool = False

    # Send serialization and response-completion signalling
    # (instantiated when a session is opened)
    self._send_lock: Optional[asyncio.Lock] = None
    self._response_done: Optional[asyncio.Event] = None
    self._response_active: bool = False

    # Caller-supplied callbacks, context and the last options object
    self._on_text: Optional[Callable[[str], Awaitable[None]]] = None
    self._on_audio: Optional[Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]]] = None
    self._should_stop: Optional[Callable[[], bool]] = None
    self._ctx: Optional[CtxItem] = None
    self._last_opts = None  # remembered so a reset can reuse it

    # Per-response extraction state (tools/images/citations/usage/assembled text);
    # stays None until a response populates it
    self._rt_state = None

    # Input transcription text buffers, one per item_id
    self._input_tr_buffers: dict[str, io.StringIO] = {}

    # Signature of the tools last attached to the session
    self._cached_session_tools_sig: Optional[str] = None

    # Snapshot of the most recent tool calls
    self._last_tool_calls: list[dict] = []

    # In-memory session handle and its optional expiry (epoch seconds)
    self._rt_session_id: Optional[str] = None
    self._rt_session_expires_at: Optional[int] = None
|
|
113
|
+
|
|
114
|
+
# -----------------------------
|
|
115
|
+
# Public high-level entrypoints
|
|
116
|
+
# -----------------------------
|
|
117
|
+
|
|
118
|
+
async def run(
    self,
    ctx: CtxItem,
    opts,
    on_text: Callable[[str], Awaitable[None]],
    on_audio: Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]],
    should_stop: Callable[[], bool] = lambda: False,
):
    """
    Perform a single turn: make sure a session exists, then send the prompt/audio.

    :param ctx: CtxItem with model and conversation
    :param opts: Options object with prompt/audio/voice/etc.
    :param on_text: Async callback for text deltas
    :param on_audio: Async callback for audio chunks
    :param should_stop: Sync callback to signal barge-in (cancel active response)
    """
    self._ensure_background_loop()
    self._ctx = ctx

    # Best effort: when the caller asks for a session handle other than the
    # one held in memory, reset the open session. Failures are ignored here.
    try:
        requested_sid = getattr(opts, "rt_session_id", None)
        if isinstance(requested_sid, str):
            requested_sid = requested_sid.strip()
        if self.ws is not None and requested_sid:
            if requested_sid != (self._rt_session_id or ""):
                reset_coro = self._reset_session_internal(ctx, opts, on_text, on_audio, should_stop)
                await self._run_on_owner(reset_coro)
    except Exception:
        pass

    # No websocket yet (or the reset above left none): open one
    if not self.ws:
        open_coro = self._open_session_internal(ctx, opts, on_text, on_audio, should_stop)
        await self._run_on_owner(open_coro)

    # Collect the turn inputs from opts and dispatch the turn
    prompt = getattr(opts, "prompt", None)
    audio_data = getattr(opts, "audio_data", None)
    audio_format = getattr(opts, "audio_format", None)
    audio_rate = getattr(opts, "audio_rate", None)
    streaming = bool(getattr(opts, "streaming", False))
    turn_coro = self._send_turn_internal(
        prompt,
        audio_data,
        audio_format,
        audio_rate,
        wait_for_done=not streaming,
    )
    await self._run_on_owner(turn_coro)
|
|
160
|
+
|
|
161
|
+
async def open_session(
    self,
    ctx: CtxItem,
    opts,
    on_text: Callable[[str], Awaitable[None]],
    on_audio: Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]],
    should_stop: Callable[[], bool] = lambda: False,
):
    """
    Open the websocket session explicitly (run() otherwise opens it lazily).

    :param ctx: CtxItem with model and conversation
    :param opts: Options object with prompt/audio/voice/etc.
    :param on_text: Async callback for text deltas
    :param on_audio: Async callback for audio chunks
    :param should_stop: Sync callback to signal barge-in
    """
    self._ensure_background_loop()

    # A session is open but a different handle is requested: reset it and
    # stop here. Any error in this attempt falls through to a plain open.
    try:
        requested_sid = getattr(opts, "rt_session_id", None)
        if isinstance(requested_sid, str):
            requested_sid = requested_sid.strip()
        if self.ws is not None and requested_sid:
            if requested_sid != (self._rt_session_id or ""):
                reset_coro = self._reset_session_internal(ctx, opts, on_text, on_audio, should_stop)
                await self._run_on_owner(reset_coro)
                return
    except Exception:
        pass

    open_coro = self._open_session_internal(ctx, opts, on_text, on_audio, should_stop)
    await self._run_on_owner(open_coro)
|
|
186
|
+
|
|
187
|
+
async def close_session(self):
|
|
188
|
+
"""Close the websocket session but keep the background loop alive."""
|
|
189
|
+
if not self._bg.loop:
|
|
190
|
+
return
|
|
191
|
+
await self._run_on_owner(self._close_session_internal())
|
|
192
|
+
|
|
193
|
+
async def reset_session(
|
|
194
|
+
self,
|
|
195
|
+
ctx: Optional[CtxItem] = None,
|
|
196
|
+
opts=None,
|
|
197
|
+
on_text: Optional[Callable[[str], Awaitable[None]]] = None,
|
|
198
|
+
on_audio: Optional[Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]]] = None,
|
|
199
|
+
should_stop: Optional[Callable[[], bool]] = None,
|
|
200
|
+
):
|
|
201
|
+
"""
|
|
202
|
+
Close the current session and open a fresh one (new conversation on the server).
|
|
203
|
+
If parameters are omitted, last-known ones are used.
|
|
204
|
+
"""
|
|
205
|
+
self._ensure_background_loop()
|
|
206
|
+
await self._run_on_owner(self._reset_session_internal(ctx, opts, on_text, on_audio, should_stop))
|
|
207
|
+
|
|
208
|
+
async def shutdown(self):
|
|
209
|
+
"""
|
|
210
|
+
Gracefully close the current session (if any).
|
|
211
|
+
Does NOT stop the background loop; use stop_loop_sync() or shutdown_and_stop() to also stop the loop.
|
|
212
|
+
"""
|
|
213
|
+
if not self._bg.loop:
|
|
214
|
+
return
|
|
215
|
+
await self._run_on_owner(self._close_session_internal())
|
|
216
|
+
|
|
217
|
+
async def shutdown_and_stop(self):
|
|
218
|
+
"""Close session and stop the background loop thread."""
|
|
219
|
+
await self.shutdown()
|
|
220
|
+
self.stop_loop_sync()
|
|
221
|
+
|
|
222
|
+
# -----------------------------
|
|
223
|
+
# Synchronous convenience calls
|
|
224
|
+
# -----------------------------
|
|
225
|
+
|
|
226
|
+
def close_session_sync(self, timeout: float = 5.0):
|
|
227
|
+
"""Synchronous wrapper around close_session()."""
|
|
228
|
+
if not self._bg.loop or not self._bg.loop.is_running():
|
|
229
|
+
return
|
|
230
|
+
self._bg.run_sync(self._close_session_internal(), timeout=timeout)
|
|
231
|
+
|
|
232
|
+
def reset_session_sync(
|
|
233
|
+
self,
|
|
234
|
+
ctx: Optional[CtxItem] = None,
|
|
235
|
+
opts=None,
|
|
236
|
+
on_text: Optional[Callable[[str], Awaitable[None]]] = None,
|
|
237
|
+
on_audio: Optional[Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]]] = None,
|
|
238
|
+
should_stop: Optional[Callable[[], bool]] = None,
|
|
239
|
+
timeout: float = 10.0,
|
|
240
|
+
):
|
|
241
|
+
"""Synchronous wrapper around reset_session()."""
|
|
242
|
+
self._ensure_background_loop()
|
|
243
|
+
self._bg.run_sync(self._reset_session_internal(ctx, opts, on_text, on_audio, should_stop), timeout=timeout)
|
|
244
|
+
|
|
245
|
+
def shutdown_sync(self, timeout: float = 5.0):
|
|
246
|
+
"""Synchronous wrapper around shutdown() — closes the WS but leaves the loop alive."""
|
|
247
|
+
if not self._bg.loop or not self._bg.loop.is_running():
|
|
248
|
+
return
|
|
249
|
+
self._bg.run_sync(self._close_session_internal(), timeout=timeout)
|
|
250
|
+
|
|
251
|
+
def stop_loop_sync(self, timeout: float = 2.0):
|
|
252
|
+
"""Stop the background event loop thread."""
|
|
253
|
+
self._bg.stop(timeout=timeout)
|
|
254
|
+
|
|
255
|
+
# -----------------------------
|
|
256
|
+
# Tools helpers
|
|
257
|
+
# -----------------------------
|
|
258
|
+
|
|
259
|
+
def _update_last_opts_tools(self, tools: Optional[list], remote_tools: Optional[list]) -> None:
|
|
260
|
+
"""
|
|
261
|
+
Update self._last_opts with tools/remote_tools if fields are present.
|
|
262
|
+
"""
|
|
263
|
+
lo = self._last_opts
|
|
264
|
+
if not lo:
|
|
265
|
+
return
|
|
266
|
+
try:
|
|
267
|
+
if tools is not None and hasattr(lo, "tools"):
|
|
268
|
+
setattr(lo, "tools", tools)
|
|
269
|
+
except Exception:
|
|
270
|
+
pass
|
|
271
|
+
try:
|
|
272
|
+
if remote_tools is not None and hasattr(lo, "remote_tools"):
|
|
273
|
+
setattr(lo, "remote_tools", remote_tools)
|
|
274
|
+
except Exception:
|
|
275
|
+
pass
|
|
276
|
+
|
|
277
|
+
# -----------------------------
|
|
278
|
+
# Internal: background loop/dispatch
|
|
279
|
+
# -----------------------------
|
|
280
|
+
|
|
281
|
+
def _ensure_background_loop(self):
|
|
282
|
+
"""Start the background asyncio loop once and keep it running."""
|
|
283
|
+
self._bg.ensure()
|
|
284
|
+
|
|
285
|
+
async def _run_on_owner(self, coro):
|
|
286
|
+
"""Await a coroutine scheduled on the owner loop from any thread/loop."""
|
|
287
|
+
return await self._bg.run(coro)
|
|
288
|
+
|
|
289
|
+
# -----------------------------
|
|
290
|
+
# Internal: session lifecycle
|
|
291
|
+
# -----------------------------
|
|
292
|
+
|
|
293
|
+
async def _open_session_internal(
|
|
294
|
+
self,
|
|
295
|
+
ctx: CtxItem,
|
|
296
|
+
opts,
|
|
297
|
+
on_text: Callable[[str], Awaitable[None]],
|
|
298
|
+
on_audio: Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]],
|
|
299
|
+
should_stop: Callable[[], bool] = lambda: False,
|
|
300
|
+
):
|
|
301
|
+
"""
|
|
302
|
+
Open WS and configure the Realtime session on the owner loop.
|
|
303
|
+
"""
|
|
304
|
+
if self.ws is not None:
|
|
305
|
+
if self.debug:
|
|
306
|
+
print("[open_session] already open")
|
|
307
|
+
return
|
|
308
|
+
|
|
309
|
+
core = self.window.core
|
|
310
|
+
api_key = self.window.core.config.get("api_key_xai")
|
|
311
|
+
if not api_key:
|
|
312
|
+
raise RuntimeError("xAPI key not configured")
|
|
313
|
+
|
|
314
|
+
model_id = getattr(opts, "model", None) or (ctx.model if ctx and ctx.model else "grok-3")
|
|
315
|
+
voice = getattr(opts, "voice", None) or self._preferred_voice()
|
|
316
|
+
|
|
317
|
+
# Optional: requested resume handle from opts
|
|
318
|
+
resume_sid = None
|
|
319
|
+
try:
|
|
320
|
+
provided = getattr(opts, "rt_session_id", None)
|
|
321
|
+
if isinstance(provided, str):
|
|
322
|
+
provided = provided.strip()
|
|
323
|
+
if provided and provided != (self._rt_session_id or ""):
|
|
324
|
+
resume_sid = provided
|
|
325
|
+
self._rt_session_id = resume_sid
|
|
326
|
+
set_ctx_rt_handle(self._ctx, resume_sid, self.window)
|
|
327
|
+
except Exception:
|
|
328
|
+
pass
|
|
329
|
+
|
|
330
|
+
# Build WS URL with model and optional session_id for best-effort resume
|
|
331
|
+
base_q = {"model": model_id}
|
|
332
|
+
if resume_sid:
|
|
333
|
+
base_q["session_id"] = resume_sid # if unsupported by server, connect fallback will ignore
|
|
334
|
+
url_with_sid = f"{self.WS_URL}?{urlencode(base_q)}"
|
|
335
|
+
url_no_sid = f"{self.WS_URL}?{urlencode({'model': model_id})}"
|
|
336
|
+
|
|
337
|
+
headers = {
|
|
338
|
+
"Authorization": f"Bearer {api_key}",
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
# Transcription toggle
|
|
342
|
+
transcribe_enabled = bool(getattr(opts, "transcribe", False))
|
|
343
|
+
|
|
344
|
+
# Save callbacks and context
|
|
345
|
+
self._on_text = on_text
|
|
346
|
+
self._on_audio = on_audio
|
|
347
|
+
self._should_stop = should_stop or (lambda: False)
|
|
348
|
+
self._ctx = ctx
|
|
349
|
+
self._last_opts = opts
|
|
350
|
+
|
|
351
|
+
# Control primitives
|
|
352
|
+
self._response_done = asyncio.Event()
|
|
353
|
+
self._send_lock = asyncio.Lock()
|
|
354
|
+
|
|
355
|
+
if self.debug:
|
|
356
|
+
print(f"[open_session] owner_loop={id(asyncio.get_running_loop())}")
|
|
357
|
+
|
|
358
|
+
# Connect WS: first try with session_id if provided; on failure, fall back to plain URL.
|
|
359
|
+
try:
|
|
360
|
+
target_url = url_with_sid if resume_sid else url_no_sid
|
|
361
|
+
self.ws = await websockets.connect(
|
|
362
|
+
target_url,
|
|
363
|
+
additional_headers=headers,
|
|
364
|
+
max_size=16 * 1024 * 1024,
|
|
365
|
+
ping_interval=20,
|
|
366
|
+
ping_timeout=20,
|
|
367
|
+
close_timeout=5,
|
|
368
|
+
)
|
|
369
|
+
except Exception as e:
|
|
370
|
+
if resume_sid and self.debug:
|
|
371
|
+
print(f"[open_session] connect with session_id failed ({e!r}); falling back to plain URL")
|
|
372
|
+
if resume_sid:
|
|
373
|
+
self.ws = await websockets.connect(
|
|
374
|
+
url_no_sid,
|
|
375
|
+
additional_headers=headers,
|
|
376
|
+
max_size=16 * 1024 * 1024,
|
|
377
|
+
ping_interval=20,
|
|
378
|
+
ping_timeout=20,
|
|
379
|
+
close_timeout=5,
|
|
380
|
+
)
|
|
381
|
+
if self.debug:
|
|
382
|
+
print("[open_session] WS connected")
|
|
383
|
+
|
|
384
|
+
# Session payload (manual by default; prepared for auto)
|
|
385
|
+
session_payload = {
|
|
386
|
+
"type": "session.update",
|
|
387
|
+
"session": {
|
|
388
|
+
"modalities": ["text", "audio"],
|
|
389
|
+
"voice": voice,
|
|
390
|
+
"input_audio_format": "pcm16",
|
|
391
|
+
"output_audio_format": "pcm16",
|
|
392
|
+
# turn_detection set below via apply_turn_mode_openai
|
|
393
|
+
**({"instructions": str(getattr(opts, "system_prompt"))} if getattr(opts, "system_prompt", None) else {}),
|
|
394
|
+
},
|
|
395
|
+
}
|
|
396
|
+
turn_mode = TurnMode.AUTO if bool(getattr(opts, "auto_turn", False)) else TurnMode.MANUAL
|
|
397
|
+
apply_turn_mode_openai(session_payload, turn_mode)
|
|
398
|
+
self._tune_openai_vad(session_payload, opts)
|
|
399
|
+
|
|
400
|
+
# Attach tools to session (remote + functions)
|
|
401
|
+
try:
|
|
402
|
+
session_tools = prepare_tools_for_session(opts)
|
|
403
|
+
if session_tools:
|
|
404
|
+
session_payload["session"]["tools"] = session_tools
|
|
405
|
+
self._cached_session_tools_sig = tools_signature(session_tools)
|
|
406
|
+
if self.debug:
|
|
407
|
+
print(f"[open_session] session.tools attached: {len(session_tools)}")
|
|
408
|
+
else:
|
|
409
|
+
self._cached_session_tools_sig = tools_signature([])
|
|
410
|
+
except Exception as _e:
|
|
411
|
+
if self.debug:
|
|
412
|
+
print(f"[open_session] tools sanitize error: {_e}")
|
|
413
|
+
self._cached_session_tools_sig = tools_signature([])
|
|
414
|
+
|
|
415
|
+
# Attach native input transcription if requested
|
|
416
|
+
try:
|
|
417
|
+
if transcribe_enabled:
|
|
418
|
+
iat = {"model": "whisper-1"}
|
|
419
|
+
lang = getattr(opts, "transcribe_language", None) or getattr(opts, "language", None)
|
|
420
|
+
if lang:
|
|
421
|
+
iat["language"] = str(lang)
|
|
422
|
+
session_payload["session"]["input_audio_transcription"] = iat
|
|
423
|
+
except Exception:
|
|
424
|
+
pass
|
|
425
|
+
|
|
426
|
+
if self.debug:
|
|
427
|
+
print(f"[open_session] session_payload: {json.dumps(session_payload)}")
|
|
428
|
+
|
|
429
|
+
await self.ws.send(json.dumps(session_payload))
|
|
430
|
+
if self.debug:
|
|
431
|
+
print("[open_session] session.update sent")
|
|
432
|
+
|
|
433
|
+
# Start a single receiver task
|
|
434
|
+
if self._rx_task is None or self._rx_task.done():
|
|
435
|
+
self._running = True
|
|
436
|
+
self._rx_task = asyncio.create_task(self._recv_loop(), name="realtime-recv")
|
|
437
|
+
if self.debug:
|
|
438
|
+
print("[open_session] _recv_loop started")
|
|
439
|
+
|
|
440
|
+
async def _close_session_internal(self):
|
|
441
|
+
"""Close WS and stop the receiver; keep the background loop alive for reuse."""
|
|
442
|
+
self._running = False
|
|
443
|
+
|
|
444
|
+
# Cancel active response if any
|
|
445
|
+
if self.ws and self._response_active:
|
|
446
|
+
try:
|
|
447
|
+
await self.ws.send(json.dumps({"type": "response.cancel"}))
|
|
448
|
+
except Exception:
|
|
449
|
+
pass
|
|
450
|
+
|
|
451
|
+
# Unblock any waiters before clearing handles
|
|
452
|
+
try:
|
|
453
|
+
if self._response_done and not self._response_done.is_set():
|
|
454
|
+
self._response_done.set()
|
|
455
|
+
except Exception:
|
|
456
|
+
pass
|
|
457
|
+
|
|
458
|
+
# Close the socket
|
|
459
|
+
if self.ws:
|
|
460
|
+
try:
|
|
461
|
+
await self.ws.close()
|
|
462
|
+
except Exception:
|
|
463
|
+
pass
|
|
464
|
+
self.ws = None
|
|
465
|
+
|
|
466
|
+
# Await receiver
|
|
467
|
+
if self._rx_task:
|
|
468
|
+
try:
|
|
469
|
+
await self._rx_task
|
|
470
|
+
except Exception:
|
|
471
|
+
pass
|
|
472
|
+
self._rx_task = None
|
|
473
|
+
|
|
474
|
+
# Reset control primitives
|
|
475
|
+
self._response_active = False
|
|
476
|
+
self._response_done = None
|
|
477
|
+
self._send_lock = None
|
|
478
|
+
self._cached_session_tools_sig = None
|
|
479
|
+
|
|
480
|
+
# Clear in-memory handle; do not wipe persisted ctx.extra["rt_session_id"]
|
|
481
|
+
self._rt_session_id = None
|
|
482
|
+
self._rt_session_expires_at = None
|
|
483
|
+
|
|
484
|
+
if self.debug:
|
|
485
|
+
print("[close_session] closed")
|
|
486
|
+
|
|
487
|
+
async def _reset_session_internal(
|
|
488
|
+
self,
|
|
489
|
+
ctx: Optional[CtxItem] = None,
|
|
490
|
+
opts=None,
|
|
491
|
+
on_text: Optional[Callable[[str], Awaitable[None]]] = None,
|
|
492
|
+
on_audio: Optional[Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]]] = None,
|
|
493
|
+
should_stop: Optional[Callable[[], bool]] = None,
|
|
494
|
+
):
|
|
495
|
+
"""
|
|
496
|
+
Close current session and open a new one with provided or last-known parameters.
|
|
497
|
+
"""
|
|
498
|
+
# Determine params to reuse if not provided
|
|
499
|
+
ctx = ctx or self._ctx
|
|
500
|
+
opts = opts or self._last_opts
|
|
501
|
+
on_text = on_text or self._on_text
|
|
502
|
+
on_audio = on_audio or self._on_audio
|
|
503
|
+
should_stop = should_stop or self._should_stop or (lambda: False)
|
|
504
|
+
|
|
505
|
+
if not (ctx and opts and on_text and on_audio):
|
|
506
|
+
raise RuntimeError("reset_session requires previous or explicit ctx/opts/callbacks")
|
|
507
|
+
|
|
508
|
+
await self._close_session_internal()
|
|
509
|
+
await self._open_session_internal(ctx, opts, on_text, on_audio, should_stop)
|
|
510
|
+
|
|
511
|
+
# -----------------------------
|
|
512
|
+
# Internal: one "turn"
|
|
513
|
+
# -----------------------------
|
|
514
|
+
|
|
515
|
+
async def _send_turn_internal(
|
|
516
|
+
self,
|
|
517
|
+
prompt: Optional[str] = None,
|
|
518
|
+
audio_data: Optional[bytes] = None,
|
|
519
|
+
audio_format: Optional[str] = None,
|
|
520
|
+
audio_rate: Optional[int] = None,
|
|
521
|
+
wait_for_done: bool = True,
|
|
522
|
+
):
|
|
523
|
+
"""
|
|
524
|
+
Send one manual turn (optional text + optional audio) and trigger response.create.
|
|
525
|
+
"""
|
|
526
|
+
if not self.ws:
|
|
527
|
+
# If session dropped remotely, try to reopen from last state
|
|
528
|
+
if self._ctx and self._last_opts:
|
|
529
|
+
await self._open_session_internal(self._ctx, self._last_opts, self._on_text, self._on_audio, self._should_stop)
|
|
530
|
+
else:
|
|
531
|
+
raise RuntimeError("Session not open. Call open_session(...) first.")
|
|
532
|
+
|
|
533
|
+
# Serialize all sends to a single WS writer
|
|
534
|
+
if self._send_lock is None:
|
|
535
|
+
self._send_lock = asyncio.Lock()
|
|
536
|
+
|
|
537
|
+
# Determine whether we should trigger a response for this turn
|
|
538
|
+
def _bool(v) -> bool:
|
|
539
|
+
try:
|
|
540
|
+
return bool(v)
|
|
541
|
+
except Exception:
|
|
542
|
+
return False
|
|
543
|
+
|
|
544
|
+
is_auto_turn = _bool(getattr(self._last_opts or object(), "auto_turn", False))
|
|
545
|
+
has_text = bool(prompt and str(prompt).strip() and str(prompt).strip() != "...")
|
|
546
|
+
has_audio = bool(audio_data)
|
|
547
|
+
# Honor explicit "reply" hint if provided by caller (e.g., opts.extra.reply == True)
|
|
548
|
+
reply_hint = False
|
|
549
|
+
try:
|
|
550
|
+
extra = getattr(self._last_opts, "extra", None)
|
|
551
|
+
if isinstance(extra, dict):
|
|
552
|
+
reply_hint = bool(extra.get("reply", False))
|
|
553
|
+
except Exception:
|
|
554
|
+
pass
|
|
555
|
+
|
|
556
|
+
# In manual mode, do not auto-trigger response.create when there is no user input and no explicit reply request.
|
|
557
|
+
if not has_text and not has_audio and not reply_hint:
|
|
558
|
+
if self.debug:
|
|
559
|
+
print("[send_turn] skipped: manual mode with empty input; waiting for explicit commit")
|
|
560
|
+
return
|
|
561
|
+
|
|
562
|
+
wait_prev: Optional[asyncio.Event] = None
|
|
563
|
+
wait_curr: Optional[asyncio.Event] = None
|
|
564
|
+
|
|
565
|
+
async with self._send_lock:
|
|
566
|
+
# Ensure previous response is finished (snapshot the handle to avoid race with close)
|
|
567
|
+
if self._response_active and self._response_done:
|
|
568
|
+
wait_prev = self._response_done
|
|
569
|
+
|
|
570
|
+
# Optional text
|
|
571
|
+
if has_text:
|
|
572
|
+
if self.debug:
|
|
573
|
+
print(f"[send_turn] prompt len={len(prompt)}")
|
|
574
|
+
await self.ws.send(json.dumps({
|
|
575
|
+
"type": "conversation.item.create",
|
|
576
|
+
"item": {
|
|
577
|
+
"type": "message",
|
|
578
|
+
"role": "user",
|
|
579
|
+
"content": [{"type": "input_text", "text": str(prompt)}],
|
|
580
|
+
},
|
|
581
|
+
}))
|
|
582
|
+
|
|
583
|
+
# Optional audio
|
|
584
|
+
if has_audio:
|
|
585
|
+
sr, _ch, pcm = coerce_to_pcm16_mono(audio_data, audio_format, audio_rate, fallback_rate=self._DEFAULT_RATE)
|
|
586
|
+
|
|
587
|
+
if sr != self._DEFAULT_RATE:
|
|
588
|
+
try:
|
|
589
|
+
pcm = resample_pcm16_mono(pcm, sr, self._DEFAULT_RATE)
|
|
590
|
+
if self.debug:
|
|
591
|
+
print(f"[audio] resampled {sr} -> {self._DEFAULT_RATE}")
|
|
592
|
+
sr = self._DEFAULT_RATE
|
|
593
|
+
except Exception as e:
|
|
594
|
+
if self.debug:
|
|
595
|
+
print(f"[audio] resample failed {sr}->{self._DEFAULT_RATE}: {e}")
|
|
596
|
+
|
|
597
|
+
await self.ws.send(json.dumps({"type": "input_audio_buffer.clear"}))
|
|
598
|
+
for chunk in iter_pcm_chunks(pcm, sr, ms=50):
|
|
599
|
+
if not chunk:
|
|
600
|
+
continue
|
|
601
|
+
await self.ws.send(json.dumps({
|
|
602
|
+
"type": "input_audio_buffer.append",
|
|
603
|
+
"audio": base64.b64encode(chunk).decode("utf-8"),
|
|
604
|
+
}))
|
|
605
|
+
await self.ws.send(json.dumps({"type": "input_audio_buffer.commit"}))
|
|
606
|
+
|
|
607
|
+
# If we were waiting for a previous response, do it inside lock handoff-safe
|
|
608
|
+
if wait_prev:
|
|
609
|
+
try:
|
|
610
|
+
if self.debug:
|
|
611
|
+
print("[send_turn] waiting for previous response")
|
|
612
|
+
await wait_prev.wait()
|
|
613
|
+
except Exception:
|
|
614
|
+
pass
|
|
615
|
+
|
|
616
|
+
# Prepare wait handle for the response about to start
|
|
617
|
+
if self._response_done is None:
|
|
618
|
+
self._response_done = asyncio.Event()
|
|
619
|
+
else:
|
|
620
|
+
try:
|
|
621
|
+
self._response_done.clear()
|
|
622
|
+
except Exception:
|
|
623
|
+
self._response_done = asyncio.Event()
|
|
624
|
+
wait_curr = self._response_done # snapshot for race-free waiting
|
|
625
|
+
|
|
626
|
+
# Build optional response payload (modalities + tools/tool_choice)
|
|
627
|
+
resp_obj = {"modalities": ["text", "audio"]}
|
|
628
|
+
try:
|
|
629
|
+
resp_tools, tool_choice = prepare_tools_for_response(self._last_opts)
|
|
630
|
+
if resp_tools:
|
|
631
|
+
resp_obj["tools"] = resp_tools
|
|
632
|
+
if tool_choice is None:
|
|
633
|
+
tool_choice = "auto"
|
|
634
|
+
if tool_choice:
|
|
635
|
+
resp_obj["tool_choice"] = tool_choice
|
|
636
|
+
except Exception as _e:
|
|
637
|
+
if self.debug:
|
|
638
|
+
print(f"[send_turn] response tools compose error: {_e}")
|
|
639
|
+
|
|
640
|
+
payload = {"type": "response.create"}
|
|
641
|
+
if len(resp_obj) > 0:
|
|
642
|
+
payload["response"] = resp_obj
|
|
643
|
+
|
|
644
|
+
await self.ws.send(json.dumps(payload))
|
|
645
|
+
if self.debug:
|
|
646
|
+
print("[send_turn] response.create sent")
|
|
647
|
+
|
|
648
|
+
# Optionally wait for response.done (otherwise return immediately)
|
|
649
|
+
if wait_for_done and wait_curr:
|
|
650
|
+
if self.debug:
|
|
651
|
+
print("[send_turn] waiting for response.done")
|
|
652
|
+
try:
|
|
653
|
+
await wait_curr.wait()
|
|
654
|
+
except Exception:
|
|
655
|
+
pass
|
|
656
|
+
if self.debug:
|
|
657
|
+
print("[send_turn] response.done received")
|
|
658
|
+
|
|
659
|
+
async def _cancel_active_response_internal(self):
    """
    Cancel the response currently in progress (barge-in).

    Sends a ``response.cancel`` event when a socket is present and a response
    is flagged as active; otherwise this is a no-op. The send is best-effort:
    a failure is never raised to the caller, but it is reported when
    ``self.debug`` is enabled.
    """
    if not (self.ws and self._response_active):
        return
    # NOTE(review): unlike the other senders in this class, this write does not
    # take self._send_lock - presumably so a cancel can go out while another
    # sender holds the lock; confirm before changing.
    try:
        await self.ws.send(json.dumps({"type": "response.cancel"}))
    except Exception as e:
        # Deliberately swallowed; only surface the failure when debugging.
        if self.debug:
            print(f"[cancel_response] send error: {e!r}")
|
|
666
|
+
|
|
667
|
+
# -----------------------------
|
|
668
|
+
# Internal: audio input (auto-turn mode)
|
|
669
|
+
# -----------------------------
|
|
670
|
+
|
|
671
|
+
def rt_handle_audio_input_sync(self, event: RealtimeEvent, timeout: float = 0.5):
    """
    Blocking entry point for continuous microphone input (auto-turn mode).

    Performs a cheap pre-check on the event payload and, when it carries audio
    bytes, runs ``_rt_handle_audio_input_internal(event)`` through
    ``self._bg.run_sync`` with the given timeout. Errors from the scheduled
    work are swallowed so the audio callback never sees an exception.

    :param event: event whose ``data`` attribute holds the audio payload,
                  optionally wrapped under a ``"payload"`` key
    :param timeout: seconds passed to ``self._bg.run_sync``
    """
    # Cheap rejection first: nothing is scheduled for an empty payload.
    try:
        body = getattr(event, "data", {}) or {}
        if isinstance(body, dict):
            inner = body.get("payload")
            if isinstance(inner, dict):
                body = inner
        if not (body and body.get("data")):
            return
    except Exception:
        # Malformed payload (e.g. not a mapping): treat as nothing to send.
        return

    self._ensure_background_loop()
    try:
        self._bg.run_sync(self._rt_handle_audio_input_internal(event), timeout=timeout)
    except Exception:
        # Never raise to caller from audio callback
        pass
|
|
692
|
+
|
|
693
|
+
async def _rt_handle_audio_input_internal(self, event: RealtimeEvent):
    """
    Push one chunk of live audio into the session's input buffer.

    Does nothing unless a socket is present, the client is running and the
    last session options have ``auto_turn`` enabled. The payload is converted
    with ``coerce_to_pcm16_mono`` (and resampled to ``self._DEFAULT_RATE`` when
    needed), then appended to the remote buffer in ~50 ms frames under the
    send lock. When the payload is flagged ``final`` the buffer is committed.

    :param event: event whose ``data`` holds the audio payload (optionally
                  nested under ``"payload"``) with keys ``data``, ``mime``,
                  ``rate``, ``channels`` and ``final``
    """
    # Session must be open and auto-turn must be enabled
    if not (self.ws and self._running):
        if self.debug:
            print("[_rt_handle_audio_input] Socket not open!")
        return
    try:
        auto_turn = bool(getattr(self._last_opts, "auto_turn", False))
    except Exception:
        return
    if not auto_turn:
        return

    # Unwrap the payload the same way the sync entry point does.
    body = getattr(event, "data", {}) or {}
    if isinstance(body, dict):
        inner = body.get("payload")
        if isinstance(inner, dict):
            body = inner

    raw: bytes = body.get("data") or b""
    if not raw:
        return
    mime = str(body.get("mime") or "audio/pcm")
    rate = int(body.get("rate") or 0) or self._DEFAULT_RATE
    channels = int(body.get("channels") or 1)  # parsed but not used below
    final = bool(body.get("final", False))

    # Only raw PCM gets an explicit format hint; anything else is left to the helper.
    if mime.startswith("audio/pcm"):
        fmt_hint = "pcm16"
    else:
        fmt_hint = None

    try:
        sr, _ch, pcm = coerce_to_pcm16_mono(raw, fmt_hint, rate, fallback_rate=self._DEFAULT_RATE)
        if sr != self._DEFAULT_RATE:
            try:
                pcm = resample_pcm16_mono(pcm, sr, self._DEFAULT_RATE)
            except Exception:
                # On resample failure the un-resampled samples are still sent (defensive).
                pass
            sr = self._DEFAULT_RATE
    except Exception:
        return

    # Serialize writes to the websocket
    if self._send_lock is None:
        self._send_lock = asyncio.Lock()

    async with self._send_lock:
        # ~50 ms frames keep each websocket message small.
        for frame in iter_pcm_chunks(pcm, sr, ms=50):
            if frame:
                try:
                    await self.ws.send(json.dumps({
                        "type": "input_audio_buffer.append",
                        "audio": base64.b64encode(frame).decode("utf-8"),
                    }))
                except Exception:
                    return

        # Stream end reported by the producer: flush the buffer once.
        if final:
            try:
                if self.debug:
                    print("[_rt_handle_audio_input] final chunk; committing")
                await self.ws.send(json.dumps({"type": "input_audio_buffer.commit"}))
            except Exception:
                pass
|
|
760
|
+
|
|
761
|
+
def commit_audio_input_sync(self, timeout: float = 0.5):
    """
    Blocking entry point that commits the input audio buffer (auto-turn mode).

    Ensures the background loop exists, then runs
    ``_commit_audio_input_internal()`` through ``self._bg.run_sync``. Any
    error raised while doing so is swallowed.

    :param timeout: seconds passed to ``self._bg.run_sync``
    """
    self._ensure_background_loop()
    try:
        run_blocking = self._bg.run_sync
        run_blocking(self._commit_audio_input_internal(), timeout=timeout)
    except Exception:
        # Never raise to caller from audio callback
        return
|
|
772
|
+
|
|
773
|
+
async def _commit_audio_input_internal(self):
    """
    Commit the remote input audio buffer (auto-turn mode only).

    No-op when there is no socket, the client is not running, or the last
    session options do not have ``auto_turn`` enabled. The
    ``input_audio_buffer.commit`` event is sent under the send lock; a failed
    send is ignored.
    """
    if not (self.ws and self._running):
        return
    try:
        auto_turn = bool(getattr(self._last_opts, "auto_turn", False))
    except Exception:
        return
    if not auto_turn:
        return

    # Create the shared writer lock on first use.
    if self._send_lock is None:
        self._send_lock = asyncio.Lock()

    commit_msg = json.dumps({"type": "input_audio_buffer.commit"})
    async with self._send_lock:
        try:
            await self.ws.send(commit_msg)
        except Exception:
            pass
|
|
791
|
+
|
|
792
|
+
def force_response_now_sync(self, timeout: float = 5.0):
    """
    Blocking wrapper that forces a response from the current input buffer.

    Ensures the background loop exists and runs
    ``_force_response_now_internal()`` through ``self._bg.run_sync``. Errors
    are swallowed; nothing is returned.

    :param timeout: seconds passed to ``self._bg.run_sync``
    """
    self._ensure_background_loop()
    try:
        run_blocking = self._bg.run_sync
        run_blocking(self._force_response_now_internal(), timeout=timeout)
    except Exception:
        return
|
|
799
|
+
|
|
800
|
+
async def _force_response_now_internal(self):
    """
    Commit the current input buffer and immediately request a response.

    Only acts when a socket is present, the client is running and the last
    session options have ``auto_turn`` enabled. Under the send lock it:

    1. sends ``input_audio_buffer.commit`` (aborting if that send fails),
    2. resets or creates ``self._response_done`` for the upcoming response,
    3. builds the response payload (text + audio modalities, plus tools and
       tool_choice from ``prepare_tools_for_response``),
    4. sends ``response.create``.
    """
    if not (self.ws and self._running):
        return
    try:
        auto_turn = bool(getattr(self._last_opts, "auto_turn", False))
    except Exception:
        return
    if not auto_turn:
        # This helper is intended for auto-turn; manual flow already does commit+response.create.
        return

    if self._send_lock is None:
        self._send_lock = asyncio.Lock()

    async with self._send_lock:
        # 1) Finalize current input buffer
        try:
            await self.ws.send(json.dumps({"type": "input_audio_buffer.commit"}))
        except Exception:
            return

        # 2) Prepare wait handle for this response
        if self._response_done is not None:
            try:
                self._response_done.clear()
            except Exception:
                self._response_done = asyncio.Event()
        else:
            self._response_done = asyncio.Event()

        # 3) Build response payload (modalities + tools/tool_choice like in _send_turn_internal)
        response = {"modalities": ["text", "audio"]}
        try:
            tools, choice = prepare_tools_for_response(self._last_opts)
            # NOTE(review): nesting of the tool_choice checks was reconstructed
            # from a flattened listing - confirm against the original file.
            if tools:
                response["tools"] = tools
                if choice is None:
                    choice = "auto"
            if choice:
                response["tool_choice"] = choice
        except Exception:
            pass

        # 4) Trigger the assistant response now
        try:
            await self.ws.send(json.dumps({"type": "response.create", "response": response}))
        except Exception:
            return
|
|
848
|
+
|
|
849
|
+
# -----------------------------
|
|
850
|
+
# Public: live tools update
|
|
851
|
+
# -----------------------------
|
|
852
|
+
|
|
853
|
+
async def update_session_tools(
    self,
    tools: Optional[list] = None,
    remote_tools: Optional[list] = None,
    force: bool = False
):
    """
    Update the session's tools while it is live.

    Ensures the background loop exists, then hands
    ``_update_session_tools_internal(tools, remote_tools, force)`` to
    ``self._run_on_owner`` and returns whatever that awaitable yields.

    :param tools: function tools, or None
    :param remote_tools: remote tools, or None
    :param force: forwarded unchanged to the internal implementation
    """
    self._ensure_background_loop()
    job = self._update_session_tools_internal(tools, remote_tools, force)
    return await self._run_on_owner(job)
|
|
867
|
+
|
|
868
|
+
def update_session_tools_sync(
    self,
    tools: Optional[list] = None,
    remote_tools: Optional[list] = None,
    force: bool = False,
    timeout: float = 5.0
):
    """
    Blocking counterpart of update_session_tools().

    Runs ``_update_session_tools_internal(tools, remote_tools, force)``
    through ``self._bg.run_sync``. Exceptions from ``run_sync`` propagate;
    its return value is discarded.

    :param tools: function tools, or None
    :param remote_tools: remote tools, or None
    :param force: forwarded unchanged to the internal implementation
    :param timeout: seconds passed to ``self._bg.run_sync``
    """
    self._ensure_background_loop()
    run_blocking = self._bg.run_sync
    run_blocking(
        self._update_session_tools_internal(tools, remote_tools, force),
        timeout=timeout,
    )
|
|
878
|
+
|
|
879
|
+
async def _update_session_tools_internal(
    self,
    tools: Optional[list],
    remote_tools: Optional[list],
    force: bool
):
    """
    Apply a tools update to the session.

    With no socket, the tools are only stored via ``_update_last_opts_tools``
    and the cached signature is cleared. Otherwise the tools are sanitized
    (falling back to the values on ``self._last_opts`` when an argument is
    None), and a ``session.update`` is sent under the send lock unless the
    signature matches the cached one and ``force`` is false.

    :param tools: function tools, or None to reuse ``self._last_opts.tools``
    :param remote_tools: remote tools, or None to reuse ``self._last_opts.remote_tools``
    :param force: send ``session.update`` even when the signature is unchanged
    """
    # If socket is not open, just cache into last opts
    if not self.ws:
        self._update_last_opts_tools(tools, remote_tools)
        self._cached_session_tools_sig = None
        if self.debug:
            print("[update_session_tools] WS not open; cached for next session")
        return

    # Sanitize/compose session tools
    try:
        fn = sanitize_function_tools(
            tools if tools is not None else getattr(self._last_opts, "tools", None)
        )
        rt = sanitize_remote_tools(
            remote_tools if remote_tools is not None else getattr(self._last_opts, "remote_tools", None)
        )
        session_tools = (rt or []) + (fn or [])
    except Exception as e:
        if self.debug:
            print(f"[update_session_tools] sanitize error: {e}")
        # Sanitizing failed: continue with an empty tool list.
        session_tools = []

    new_sig = tools_signature(session_tools)

    # Skip the network round-trip when nothing changed (unless forced).
    if not force and self._cached_session_tools_sig == new_sig:
        if self.debug:
            print("[update_session_tools] no changes; skipping session.update")
        self._update_last_opts_tools(tools, remote_tools)
        return

    # Send session.update under the single writer lock
    if self._send_lock is None:
        self._send_lock = asyncio.Lock()
    async with self._send_lock:
        try:
            message = {"type": "session.update", "session": {"tools": session_tools}}
            await self.ws.send(json.dumps(message))
            # Cache and local options are updated only after a successful send.
            self._cached_session_tools_sig = new_sig
            self._update_last_opts_tools(tools, remote_tools)
            if self.debug:
                print(f"[update_session_tools] session.update sent; tools={len(session_tools)}")
        except Exception as e:
            if self.debug:
                print(f"[update_session_tools] send error: {e}")
|
|
932
|
+
|
|
933
|
+
# -----------------------------
|
|
934
|
+
# Public: send tool results back to the model
|
|
935
|
+
# -----------------------------
|
|
936
|
+
|
|
937
|
+
async def send_tool_results(
    self,
    results,
    continue_turn: bool = True,
    wait_for_done: bool = True,
):
    """
    Send tool results back to the Realtime session.

    Ensures the background loop exists, then hands
    ``_send_tool_results_internal(results, continue_turn, wait_for_done)`` to
    ``self._run_on_owner`` and returns whatever that awaitable yields.

    :param results: tool results, forwarded unchanged
    :param continue_turn: forwarded unchanged to the internal implementation
    :param wait_for_done: forwarded unchanged to the internal implementation
    """
    self._ensure_background_loop()
    job = self._send_tool_results_internal(results, continue_turn, wait_for_done)
    return await self._run_on_owner(job)
|
|
950
|
+
|
|
951
|
+
def send_tool_results_sync(
    self,
    results,
    continue_turn: bool = True,
    wait_for_done: bool = True,
    timeout: float = 20.0,
):
    """
    Blocking counterpart of send_tool_results().

    Runs ``_send_tool_results_internal(results, continue_turn, wait_for_done)``
    through ``self._bg.run_sync`` and returns its result. Exceptions from
    ``run_sync`` propagate to the caller.

    :param results: tool results, forwarded unchanged
    :param continue_turn: forwarded unchanged to the internal implementation
    :param wait_for_done: forwarded unchanged to the internal implementation
    :param timeout: seconds passed to ``self._bg.run_sync``
    """
    self._ensure_background_loop()
    run_blocking = self._bg.run_sync
    outcome = run_blocking(
        self._send_tool_results_internal(results, continue_turn, wait_for_done),
        timeout=timeout,
    )
    return outcome
|
|
964
|
+
|
|
965
|
+
async def _send_tool_results_internal(
    self,
    results,
    continue_turn: bool,
    wait_for_done: bool,
):
    """
    Deliver tool outputs to the session and optionally continue the turn.

    Outputs are built with ``build_tool_outputs_payload`` from ``results`` and
    ``self._last_tool_calls``; nothing is sent when that yields no outputs.
    Under the send lock, one ``conversation.item.create`` is emitted per
    output, followed by a bare ``response.create`` when ``continue_turn`` is
    true. After the lock is released, the method waits on the response-done
    event when ``wait_for_done`` is also true.

    :param results: tool results passed to ``build_tool_outputs_payload``
    :param continue_turn: send ``response.create`` after the outputs
    :param wait_for_done: wait for the follow-up response to finish
    :raises RuntimeError: if there is no open socket
    """
    if not self.ws:
        raise RuntimeError("Live session is not open")

    outputs = build_tool_outputs_payload(results, self._last_tool_calls)
    if not outputs:
        return

    if self._send_lock is None:
        self._send_lock = asyncio.Lock()

    done_event = None
    async with self._send_lock:
        # Emit one conversation.item.create per tool output
        for entry in outputs:
            message = {
                "type": "conversation.item.create",
                "item": {
                    "type": "function_call_output",
                    "call_id": entry["call_id"],
                    "output": entry["output"],  # must be a string (JSON-encoded when dict/list)
                },
            }
            anchor = entry.get("previous_item_id")
            if anchor:
                message["previous_item_id"] = anchor
            await self.ws.send(json.dumps(message))

        # Optionally ask the model to continue
        if continue_turn:
            if self._response_done is not None:
                try:
                    self._response_done.clear()
                except Exception:
                    self._response_done = asyncio.Event()
            else:
                self._response_done = asyncio.Event()
            # Snapshot the event so the wait below does not depend on later reassignment.
            done_event = self._response_done
            await self.ws.send(json.dumps({"type": "response.create"}))

    # Wait for the follow-up response to complete (outside the lock).
    if continue_turn and wait_for_done and done_event:
        try:
            await done_event.wait()
        except Exception:
            pass
|
|
1018
|
+
|
|
1019
|
+
# -----------------------------
|
|
1020
|
+
# Internal: receive loop
|
|
1021
|
+
# -----------------------------
|
|
1022
|
+
|
|
1023
|
+
async def _recv_loop(self):
|
|
1024
|
+
"""
|
|
1025
|
+
Single receiver loop for the entire session.
|
|
1026
|
+
Processes incoming events and dispatches to callbacks.
|
|
1027
|
+
"""
|
|
1028
|
+
if self.debug:
|
|
1029
|
+
print("[_recv_loop] started")
|
|
1030
|
+
|
|
1031
|
+
DEFAULT_RATE = self._DEFAULT_RATE
|
|
1032
|
+
audio_done = True
|
|
1033
|
+
|
|
1034
|
+
try:
|
|
1035
|
+
while self._running and self.ws:
|
|
1036
|
+
# Do not hard-stop the session on should_stop; only cancel active response if requested.
|
|
1037
|
+
if self._should_stop and self._should_stop():
|
|
1038
|
+
await self._cancel_active_response_internal()
|
|
1039
|
+
|
|
1040
|
+
try:
|
|
1041
|
+
raw = await asyncio.wait_for(self.ws.recv(), timeout=60)
|
|
1042
|
+
except asyncio.TimeoutError:
|
|
1043
|
+
continue
|
|
1044
|
+
except Exception as e:
|
|
1045
|
+
if self.debug:
|
|
1046
|
+
print(f"[_recv_loop] recv error: {e!r}")
|
|
1047
|
+
break
|
|
1048
|
+
|
|
1049
|
+
if isinstance(raw, bytes):
|
|
1050
|
+
# Realtime sends JSON text frames; ignore unexpected binary
|
|
1051
|
+
continue
|
|
1052
|
+
|
|
1053
|
+
try:
|
|
1054
|
+
ev = json.loads(raw)
|
|
1055
|
+
except Exception:
|
|
1056
|
+
continue
|
|
1057
|
+
|
|
1058
|
+
etype = ev.get("type")
|
|
1059
|
+
|
|
1060
|
+
# ---- session lifecycle (capture server handle) ----
|
|
1061
|
+
if etype in ("session.created", "session.updated"):
|
|
1062
|
+
sess = ev.get("session") or {}
|
|
1063
|
+
sid = sess.get("id")
|
|
1064
|
+
if isinstance(sid, str) and sid.strip():
|
|
1065
|
+
self._rt_session_id = sid.strip()
|
|
1066
|
+
set_ctx_rt_handle(self._ctx, self._rt_session_id, self.window)
|
|
1067
|
+
if self.debug:
|
|
1068
|
+
print(f"[_recv_loop] session id: {self._rt_session_id}")
|
|
1069
|
+
# Optional: expires_at if present (not always provided)
|
|
1070
|
+
exp = sess.get("expires_at") or sess.get("expiresAt")
|
|
1071
|
+
try:
|
|
1072
|
+
if isinstance(exp, (int, float)) and exp > 0:
|
|
1073
|
+
self._rt_session_expires_at = int(exp)
|
|
1074
|
+
set_rt_session_expires_at(self._ctx, self._rt_session_expires_at, self.window)
|
|
1075
|
+
except Exception:
|
|
1076
|
+
pass
|
|
1077
|
+
continue
|
|
1078
|
+
|
|
1079
|
+
if etype == "response.created":
|
|
1080
|
+
if self.debug:
|
|
1081
|
+
print("[_recv_loop] response created")
|
|
1082
|
+
self._response_active = True
|
|
1083
|
+
audio_done = False
|
|
1084
|
+
self._rt_reset_state()
|
|
1085
|
+
|
|
1086
|
+
elif etype == "input_audio_buffer.speech_started":
|
|
1087
|
+
if self.debug:
|
|
1088
|
+
print("[_recv_loop] speech_started")
|
|
1089
|
+
|
|
1090
|
+
elif etype == "input_audio_buffer.speech_stopped":
|
|
1091
|
+
if self.debug:
|
|
1092
|
+
print("[_recv_loop] speech_stopped")
|
|
1093
|
+
|
|
1094
|
+
elif etype == "input_audio_buffer.committed":
|
|
1095
|
+
if self.debug:
|
|
1096
|
+
print("[_recv_loop] audio_buffer.committed")
|
|
1097
|
+
|
|
1098
|
+
# disable mic input if auto-commit
|
|
1099
|
+
if self._last_opts:
|
|
1100
|
+
self._last_opts.rt_signals.response.emit(RealtimeEvent(RealtimeEvent.RT_OUTPUT_AUDIO_COMMIT, {
|
|
1101
|
+
"ctx": self._ctx,
|
|
1102
|
+
}))
|
|
1103
|
+
|
|
1104
|
+
# ---- input transcription (user speech) ----
|
|
1105
|
+
elif etype == "conversation.item.input_audio_transcription.delta":
|
|
1106
|
+
if self._transcribe_enabled():
|
|
1107
|
+
buf = self._input_tr_buffers.get(ev.get("item_id"))
|
|
1108
|
+
if buf is None:
|
|
1109
|
+
buf = io.StringIO()
|
|
1110
|
+
self._input_tr_buffers[ev.get("item_id")] = buf
|
|
1111
|
+
delta = ev.get("delta") or ev.get("text") or ev.get("transcript") or ""
|
|
1112
|
+
if delta:
|
|
1113
|
+
buf.write(str(delta))
|
|
1114
|
+
|
|
1115
|
+
elif etype in ("conversation.item.input_audio_transcription.completed",
|
|
1116
|
+
"conversation.item.input_audio_transcription.done"):
|
|
1117
|
+
if self._transcribe_enabled():
|
|
1118
|
+
item_id = ev.get("item_id")
|
|
1119
|
+
tr = ev.get("transcript") or ""
|
|
1120
|
+
buf = self._input_tr_buffers.pop(item_id, None)
|
|
1121
|
+
if buf is not None:
|
|
1122
|
+
try:
|
|
1123
|
+
v = buf.getvalue()
|
|
1124
|
+
if v and not tr:
|
|
1125
|
+
tr = v
|
|
1126
|
+
finally:
|
|
1127
|
+
try:
|
|
1128
|
+
buf.close()
|
|
1129
|
+
except Exception:
|
|
1130
|
+
pass
|
|
1131
|
+
if tr:
|
|
1132
|
+
self._save_input_transcript(tr)
|
|
1133
|
+
|
|
1134
|
+
elif etype == "conversation.item.input_audio_transcription.failed":
|
|
1135
|
+
if self.debug:
|
|
1136
|
+
err = (ev.get("error") or {}).get("message") or "input transcription failed"
|
|
1137
|
+
print(f"[_recv_loop] {err}")
|
|
1138
|
+
|
|
1139
|
+
elif etype == "conversation.item.created":
|
|
1140
|
+
if self.debug:
|
|
1141
|
+
print("[_recv_loop] conversation.item.created")
|
|
1142
|
+
# Fallback: some servers may include transcript inside the created user item
|
|
1143
|
+
if self._transcribe_enabled():
|
|
1144
|
+
item = ev.get("item") or {}
|
|
1145
|
+
if item.get("role") == "user":
|
|
1146
|
+
for c in (item.get("content") or []):
|
|
1147
|
+
if isinstance(c, dict) and c.get("type") in ("input_audio", "audio"):
|
|
1148
|
+
tr = c.get("transcript")
|
|
1149
|
+
if tr:
|
|
1150
|
+
self._save_input_transcript(str(tr))
|
|
1151
|
+
|
|
1152
|
+
# ---- assistant text vs assistant audio transcript deltas ----
|
|
1153
|
+
elif etype in ("response.text.delta", "response.output_text.delta"):
|
|
1154
|
+
delta = ev.get("delta") or ev.get("text")
|
|
1155
|
+
if isinstance(delta, dict) and "text" in delta:
|
|
1156
|
+
delta = delta["text"]
|
|
1157
|
+
if delta:
|
|
1158
|
+
self._rt_append_text(delta)
|
|
1159
|
+
if self._on_text:
|
|
1160
|
+
try:
|
|
1161
|
+
await self._on_text(str(delta))
|
|
1162
|
+
except Exception:
|
|
1163
|
+
pass
|
|
1164
|
+
elif etype == "response.audio_transcript.delta":
|
|
1165
|
+
if self._transcribe_enabled():
|
|
1166
|
+
delta = ev.get("delta") or ev.get("text")
|
|
1167
|
+
if isinstance(delta, dict) and "text" in delta:
|
|
1168
|
+
delta = delta["text"]
|
|
1169
|
+
if delta:
|
|
1170
|
+
self._rt_append_text(delta)
|
|
1171
|
+
if self._on_text:
|
|
1172
|
+
try:
|
|
1173
|
+
await self._on_text(str(delta))
|
|
1174
|
+
except Exception:
|
|
1175
|
+
pass
|
|
1176
|
+
|
|
1177
|
+
elif etype in ("response.text.done", "response.output_text.done", "response.audio_transcript.done"):
|
|
1178
|
+
if self.debug:
|
|
1179
|
+
print("[_recv_loop] text done")
|
|
1180
|
+
|
|
1181
|
+
elif etype == "response.content_part.added":
|
|
1182
|
+
part = ev.get("part") or {}
|
|
1183
|
+
ptype = part.get("type")
|
|
1184
|
+
if ptype == "text":
|
|
1185
|
+
txt = part.get("text") or ""
|
|
1186
|
+
if txt:
|
|
1187
|
+
self._rt_append_text(txt)
|
|
1188
|
+
if self._on_text:
|
|
1189
|
+
try:
|
|
1190
|
+
await self._on_text(str(txt))
|
|
1191
|
+
except Exception:
|
|
1192
|
+
pass
|
|
1193
|
+
elif ptype == "audio":
|
|
1194
|
+
b64 = part.get("audio")
|
|
1195
|
+
if b64 and self._on_audio:
|
|
1196
|
+
try:
|
|
1197
|
+
data = base64.b64decode(b64)
|
|
1198
|
+
await self._on_audio(data, "audio/pcm", DEFAULT_RATE, 1, False)
|
|
1199
|
+
except Exception:
|
|
1200
|
+
pass
|
|
1201
|
+
tr = part.get("transcript")
|
|
1202
|
+
if tr and self._transcribe_enabled():
|
|
1203
|
+
self._rt_append_text(tr)
|
|
1204
|
+
if self._on_text:
|
|
1205
|
+
try:
|
|
1206
|
+
await self._on_text(str(tr))
|
|
1207
|
+
except Exception:
|
|
1208
|
+
pass
|
|
1209
|
+
|
|
1210
|
+
elif etype == "response.audio.delta":
|
|
1211
|
+
b64 = ev.get("delta")
|
|
1212
|
+
if b64 and self._on_audio:
|
|
1213
|
+
try:
|
|
1214
|
+
data = base64.b64decode(b64)
|
|
1215
|
+
await self._on_audio(data, "audio/pcm", DEFAULT_RATE, 1, False)
|
|
1216
|
+
except Exception:
|
|
1217
|
+
pass
|
|
1218
|
+
|
|
1219
|
+
elif etype == "response.audio.done":
|
|
1220
|
+
if self.debug:
|
|
1221
|
+
print("[_recv_loop] audio done")
|
|
1222
|
+
if not audio_done and self._on_audio:
|
|
1223
|
+
try:
|
|
1224
|
+
await self._on_audio(b"", "audio/pcm", DEFAULT_RATE, 1, True)
|
|
1225
|
+
except Exception:
|
|
1226
|
+
pass
|
|
1227
|
+
audio_done = True
|
|
1228
|
+
|
|
1229
|
+
# ---- function calling (tools) ----
|
|
1230
|
+
elif etype == "response.output_item.added":
|
|
1231
|
+
if self.debug:
|
|
1232
|
+
print("[_recv_loop] output_item added")
|
|
1233
|
+
item = ev.get("item") or {}
|
|
1234
|
+
if item.get("type") == "function_call":
|
|
1235
|
+
fid = item.get("id") or item.get("item_id") or ""
|
|
1236
|
+
call_id = item.get("call_id") or ""
|
|
1237
|
+
name = item.get("name") or ""
|
|
1238
|
+
self._rt_state["tool_calls"].append({
|
|
1239
|
+
"id": fid,
|
|
1240
|
+
"call_id": call_id,
|
|
1241
|
+
"type": "function",
|
|
1242
|
+
"function": {"name": name, "arguments": ""}
|
|
1243
|
+
})
|
|
1244
|
+
if fid and fid not in self._rt_state["fn_args_buffers"]:
|
|
1245
|
+
self._rt_state["fn_args_buffers"][fid] = io.StringIO()
|
|
1246
|
+
|
|
1247
|
+
elif etype == "response.function_call_arguments.delta":
|
|
1248
|
+
buf = self._rt_state["fn_args_buffers"].get(ev.get("item_id"))
|
|
1249
|
+
if buf is not None:
|
|
1250
|
+
delta = ev.get("delta") or ""
|
|
1251
|
+
if delta:
|
|
1252
|
+
buf.write(delta)
|
|
1253
|
+
|
|
1254
|
+
elif etype == "response.function_call_arguments.done":
|
|
1255
|
+
item_id = ev.get("item_id")
|
|
1256
|
+
args_val = ev.get("arguments") or ""
|
|
1257
|
+
buf = self._rt_state["fn_args_buffers"].pop(item_id, None)
|
|
1258
|
+
if buf is not None:
|
|
1259
|
+
try:
|
|
1260
|
+
concat = buf.getvalue()
|
|
1261
|
+
if concat:
|
|
1262
|
+
args_val = concat
|
|
1263
|
+
finally:
|
|
1264
|
+
try:
|
|
1265
|
+
buf.close()
|
|
1266
|
+
except Exception:
|
|
1267
|
+
pass
|
|
1268
|
+
for tc in self._rt_state["tool_calls"]:
|
|
1269
|
+
if tc.get("id") == item_id:
|
|
1270
|
+
tc["function"]["arguments"] = args_val
|
|
1271
|
+
break
|
|
1272
|
+
self._rt_state["force_func_call"] = True
|
|
1273
|
+
|
|
1274
|
+
elif etype == "response.output_item.done":
|
|
1275
|
+
if self.debug:
|
|
1276
|
+
print("[_recv_loop] output_item done")
|
|
1277
|
+
item = ev.get("item") or {}
|
|
1278
|
+
if item.get("type") == "function_call":
|
|
1279
|
+
fid = item.get("id") or item.get("item_id") or ""
|
|
1280
|
+
name = item.get("name") or ""
|
|
1281
|
+
args_val = item.get("arguments") or ""
|
|
1282
|
+
for tc in self._rt_state["tool_calls"]:
|
|
1283
|
+
if fid and tc.get("id") == fid:
|
|
1284
|
+
if name:
|
|
1285
|
+
tc["function"]["name"] = name
|
|
1286
|
+
if args_val:
|
|
1287
|
+
tc["function"]["arguments"] = args_val
|
|
1288
|
+
break
|
|
1289
|
+
self._rt_state["force_func_call"] = True
|
|
1290
|
+
|
|
1291
|
+
# ---- code interpreter (delta/done) ----
|
|
1292
|
+
elif etype in ("response.code_interpreter_call_code.delta", "response.code_interpreter_call.code.delta"):
|
|
1293
|
+
code_delta = ev.get("delta") or ""
|
|
1294
|
+
if code_delta:
|
|
1295
|
+
if not self._rt_state["is_code"]:
|
|
1296
|
+
hdr = "\n\n**Code interpreter**\n```python\n"
|
|
1297
|
+
self._rt_append_text(hdr + code_delta)
|
|
1298
|
+
if self._on_text:
|
|
1299
|
+
try:
|
|
1300
|
+
await self._on_text(hdr + code_delta)
|
|
1301
|
+
except Exception:
|
|
1302
|
+
pass
|
|
1303
|
+
self._rt_state["is_code"] = True
|
|
1304
|
+
else:
|
|
1305
|
+
self._rt_append_text(code_delta)
|
|
1306
|
+
if self._on_text:
|
|
1307
|
+
try:
|
|
1308
|
+
await self._on_text(code_delta)
|
|
1309
|
+
except Exception:
|
|
1310
|
+
pass
|
|
1311
|
+
|
|
1312
|
+
elif etype in ("response.code_interpreter_call_code.done", "response.code_interpreter_call.code.done"):
|
|
1313
|
+
if self.debug:
|
|
1314
|
+
print("[_recv_loop] code done")
|
|
1315
|
+
if self._rt_state["is_code"]:
|
|
1316
|
+
tail = "\n\n```\n-----------\n"
|
|
1317
|
+
self._rt_append_text(tail)
|
|
1318
|
+
if self._on_text:
|
|
1319
|
+
try:
|
|
1320
|
+
await self._on_text(tail)
|
|
1321
|
+
except Exception:
|
|
1322
|
+
pass
|
|
1323
|
+
self._rt_state["is_code"] = False
|
|
1324
|
+
|
|
1325
|
+
# ---- annotations (citations/files) ----
|
|
1326
|
+
elif etype == "response.output_text.annotation.added":
|
|
1327
|
+
if self.debug:
|
|
1328
|
+
print("[_recv_loop] annotation added")
|
|
1329
|
+
ann = ev.get("annotation") or {}
|
|
1330
|
+
atype = ann.get("type")
|
|
1331
|
+
if atype == "url_citation":
|
|
1332
|
+
url = ann.get("url")
|
|
1333
|
+
self._rt_add_citation(url)
|
|
1334
|
+
elif atype == "container_file_citation":
|
|
1335
|
+
self._rt_state["files"].append({
|
|
1336
|
+
"container_id": ann.get("container_id"),
|
|
1337
|
+
"file_id": ann.get("file_id"),
|
|
1338
|
+
})
|
|
1339
|
+
|
|
1340
|
+
# ---- partial images (defensive) ----
|
|
1341
|
+
elif etype == "response.image_generation_call.partial_image":
|
|
1342
|
+
image_b64 = ev.get("partial_image_b64")
|
|
1343
|
+
if image_b64:
|
|
1344
|
+
try:
|
|
1345
|
+
img_bytes = base64.b64decode(image_b64)
|
|
1346
|
+
save_path = self.window.core.image.gen_unique_path(self._ctx)
|
|
1347
|
+
with open(save_path, "wb") as f:
|
|
1348
|
+
f.write(img_bytes)
|
|
1349
|
+
self._rt_state["image_paths"].append(save_path)
|
|
1350
|
+
self._rt_state["is_image"] = True
|
|
1351
|
+
if not isinstance(self._ctx.images, list):
|
|
1352
|
+
self._ctx.images = []
|
|
1353
|
+
if save_path not in self._ctx.images:
|
|
1354
|
+
self._ctx.images.append(save_path)
|
|
1355
|
+
except Exception:
|
|
1356
|
+
pass
|
|
1357
|
+
|
|
1358
|
+
elif etype == "response.done":
|
|
1359
|
+
if self.debug:
|
|
1360
|
+
print("[_recv_loop] response done")
|
|
1361
|
+
# Ensure audio finalized
|
|
1362
|
+
if not audio_done and self._on_audio:
|
|
1363
|
+
try:
|
|
1364
|
+
await self._on_audio(b"", "audio/pcm", DEFAULT_RATE, 1, True)
|
|
1365
|
+
except Exception:
|
|
1366
|
+
pass
|
|
1367
|
+
audio_done = True
|
|
1368
|
+
|
|
1369
|
+
self._response_active = False
|
|
1370
|
+
|
|
1371
|
+
# Capture usage if present on response
|
|
1372
|
+
try:
|
|
1373
|
+
resp_obj = ev.get("response") or {}
|
|
1374
|
+
self._rt_capture_usage(resp_obj)
|
|
1375
|
+
except Exception:
|
|
1376
|
+
pass
|
|
1377
|
+
|
|
1378
|
+
# Build final output text
|
|
1379
|
+
output = "".join(self._rt_state["output_parts"]) if self._rt_state else ""
|
|
1380
|
+
if has_unclosed_code_tag(output):
|
|
1381
|
+
output += "\n```"
|
|
1382
|
+
if not output:
|
|
1383
|
+
try:
|
|
1384
|
+
transcript = self._extract_text_from_response_done(ev)
|
|
1385
|
+
if transcript:
|
|
1386
|
+
output = transcript
|
|
1387
|
+
except Exception:
|
|
1388
|
+
pass
|
|
1389
|
+
|
|
1390
|
+
# Persist into ctx
|
|
1391
|
+
try:
|
|
1392
|
+
if self._ctx:
|
|
1393
|
+
self._ctx.output = output or (self._ctx.output or "")
|
|
1394
|
+
up = self._rt_state.get("usage_payload") if self._rt_state else None
|
|
1395
|
+
if up:
|
|
1396
|
+
in_tok = up.get("in")
|
|
1397
|
+
out_tok = up.get("out")
|
|
1398
|
+
if in_tok is None:
|
|
1399
|
+
in_tok = self._ctx.input_tokens if self._ctx.input_tokens is not None else 0
|
|
1400
|
+
if out_tok is None:
|
|
1401
|
+
out_tok = 0
|
|
1402
|
+
self._ctx.set_tokens(in_tok, out_tok)
|
|
1403
|
+
try:
|
|
1404
|
+
if not isinstance(self._ctx.extra, dict):
|
|
1405
|
+
self._ctx.extra = {}
|
|
1406
|
+
self._ctx.extra["usage"] = {
|
|
1407
|
+
"vendor": "openai",
|
|
1408
|
+
"input_tokens": in_tok,
|
|
1409
|
+
"output_tokens": out_tok,
|
|
1410
|
+
"reasoning_tokens": up.get("reasoning", 0),
|
|
1411
|
+
"total_reported": up.get("total"),
|
|
1412
|
+
}
|
|
1413
|
+
except Exception:
|
|
1414
|
+
pass
|
|
1415
|
+
|
|
1416
|
+
# Citations
|
|
1417
|
+
if self._rt_state and self._rt_state["citations"]:
|
|
1418
|
+
if self._ctx.urls is None:
|
|
1419
|
+
self._ctx.urls = []
|
|
1420
|
+
for u in self._rt_state["citations"]:
|
|
1421
|
+
if u not in self._ctx.urls:
|
|
1422
|
+
self._ctx.urls.append(u)
|
|
1423
|
+
|
|
1424
|
+
# Images
|
|
1425
|
+
if self._rt_state and self._rt_state["image_paths"]:
|
|
1426
|
+
if not isinstance(self._ctx.images, list):
|
|
1427
|
+
self._ctx.images = []
|
|
1428
|
+
for p in self._rt_state["image_paths"]:
|
|
1429
|
+
if p not in self._ctx.images:
|
|
1430
|
+
self._ctx.images.append(p)
|
|
1431
|
+
|
|
1432
|
+
self.window.core.ctx.update_item(self._ctx)
|
|
1433
|
+
except Exception:
|
|
1434
|
+
pass
|
|
1435
|
+
|
|
1436
|
+
# Download container files if any
|
|
1437
|
+
try:
|
|
1438
|
+
files = (self._rt_state or {}).get("files") or []
|
|
1439
|
+
if files:
|
|
1440
|
+
self.window.core.api.openai.container.download_files(self._ctx, files)
|
|
1441
|
+
except Exception:
|
|
1442
|
+
pass
|
|
1443
|
+
|
|
1444
|
+
# Unpack tool calls if any
|
|
1445
|
+
try:
|
|
1446
|
+
tcs = (self._rt_state or {}).get("tool_calls") or []
|
|
1447
|
+
if tcs:
|
|
1448
|
+
for tc in tcs:
|
|
1449
|
+
fn = tc.get("function") or {}
|
|
1450
|
+
if isinstance(fn.get("arguments"), dict):
|
|
1451
|
+
fn["arguments"] = json.dumps(fn["arguments"], ensure_ascii=False)
|
|
1452
|
+
self._ctx.force_call = bool((self._rt_state or {}).get("force_func_call"))
|
|
1453
|
+
self.window.core.debug.info("[realtime] Tool calls found, unpacking...")
|
|
1454
|
+
self.window.core.command.unpack_tool_calls_chunks(self._ctx, tcs)
|
|
1455
|
+
self.window.core.ctx.update_item(self._ctx)
|
|
1456
|
+
except Exception:
|
|
1457
|
+
pass
|
|
1458
|
+
|
|
1459
|
+
# Persist last tool calls snapshot for mapping tool outputs
|
|
1460
|
+
try:
|
|
1461
|
+
tcs = (self._rt_state or {}).get("tool_calls") or []
|
|
1462
|
+
if tcs:
|
|
1463
|
+
self._last_tool_calls = list(tcs)
|
|
1464
|
+
except Exception:
|
|
1465
|
+
pass
|
|
1466
|
+
|
|
1467
|
+
# Unblock waiters
|
|
1468
|
+
if self._response_done:
|
|
1469
|
+
self._response_done.set()
|
|
1470
|
+
|
|
1471
|
+
# send RT_OUTPUT_TURN_END signal
|
|
1472
|
+
if self._last_opts:
|
|
1473
|
+
self._last_opts.rt_signals.response.emit(RealtimeEvent(RealtimeEvent.RT_OUTPUT_TURN_END, {
|
|
1474
|
+
"ctx": self._ctx,
|
|
1475
|
+
}))
|
|
1476
|
+
|
|
1477
|
+
# Reset per-response extraction state
|
|
1478
|
+
self._rt_state = None
|
|
1479
|
+
|
|
1480
|
+
elif etype == "error":
|
|
1481
|
+
if self.debug:
|
|
1482
|
+
print(f"[_recv_loop] error event: {ev}")
|
|
1483
|
+
# Session expiration and other errors
|
|
1484
|
+
err = ev.get("error") or {}
|
|
1485
|
+
msg = (err.get("message") or "")
|
|
1486
|
+
code = (err.get("code") or "")
|
|
1487
|
+
if isinstance(code, str) and code.strip().lower() == "session_expired":
|
|
1488
|
+
self._rt_session_id = None
|
|
1489
|
+
if self.debug:
|
|
1490
|
+
print("[_recv_loop] session expired")
|
|
1491
|
+
if "already has an active response" in (msg or "").lower():
|
|
1492
|
+
if self._response_done:
|
|
1493
|
+
self._response_done.set()
|
|
1494
|
+
continue
|
|
1495
|
+
if self._response_done:
|
|
1496
|
+
self._response_done.set()
|
|
1497
|
+
if self.debug:
|
|
1498
|
+
print(f"[_recv_loop] error: {msg}")
|
|
1499
|
+
|
|
1500
|
+
# Other events are ignored
|
|
1501
|
+
|
|
1502
|
+
except Exception as e:
|
|
1503
|
+
if self.debug:
|
|
1504
|
+
print(f"[_recv_loop] exception: {e!r}")
|
|
1505
|
+
finally:
|
|
1506
|
+
if self.debug:
|
|
1507
|
+
print("[_recv_loop] stopped")
|
|
1508
|
+
# Ensure any waiters are unblocked on socket teardown
|
|
1509
|
+
try:
|
|
1510
|
+
if self._response_done and not self._response_done.is_set():
|
|
1511
|
+
self._response_done.set()
|
|
1512
|
+
except Exception:
|
|
1513
|
+
pass
|
|
1514
|
+
try:
|
|
1515
|
+
if self.ws:
|
|
1516
|
+
await self.ws.close()
|
|
1517
|
+
except Exception:
|
|
1518
|
+
pass
|
|
1519
|
+
self.ws = None
|
|
1520
|
+
self._running = False
|
|
1521
|
+
|
|
1522
|
+
# -----------------------------
|
|
1523
|
+
# Helpers
|
|
1524
|
+
# -----------------------------
|
|
1525
|
+
|
|
1526
|
+
def _preferred_voice(self) -> str:
    """
    Pick the OpenAI voice name to use for audio output.

    Reads the ``openai_voice`` option of the ``audio_output`` plugin and
    returns it as a string. Falls back to ``"alloy"`` when the option is
    empty/falsy or when reading it raises.

    :return: voice name
    """
    fallback = "alloy"
    try:
        configured = self.window.core.plugins.get_option("audio_output", "openai_voice")
        return str(configured) if configured else fallback
    except Exception:
        # Best-effort lookup: any failure yields the default voice.
        return fallback
|
|
1537
|
+
|
|
1538
|
+
def _extract_text_from_response_done(self, ev: dict) -> str:
    """
    Collect the assistant's text from a ``response.done`` event payload.

    Walks ``ev["response"]["output"]`` and scans the ``content`` list of every
    dict item. From each dict content part it takes:

    - ``transcript`` of ``audio`` parts, only when transcription is enabled;
    - ``text`` of ``text`` / ``output_text`` / ``input_text`` parts (when that
      value is itself a dict, its ``text`` or ``value`` entry is used).

    :param ev: decoded event payload
    :return: non-blank fragments, stripped and joined with newlines
             (empty string when nothing was found)
    """
    text_kinds = ("text", "output_text", "input_text")
    fragments: list[str] = []
    output_items = (ev.get("response") or {}).get("output") or []

    for entry in output_items:
        if not isinstance(entry, dict):
            continue
        # Items are not filtered by their "type": every dict item is scanned.
        for piece in entry.get("content") or []:
            if not isinstance(piece, dict):
                continue
            kind = piece.get("type")
            if kind == "audio" and self._transcribe_enabled():
                value = piece.get("transcript")
            elif kind in text_kinds:
                value = piece.get("text")
                if isinstance(value, dict):
                    value = value.get("text") or value.get("value")
            else:
                continue
            if value:
                fragments.append(str(value))

    stripped = (fragment.strip() for fragment in fragments)
    return "\n".join(chunk for chunk in stripped if chunk)
|
|
1569
|
+
|
|
1570
|
+
# ---- per-response state helpers ----
|
|
1571
|
+
|
|
1572
|
+
def _rt_reset_state(self):
    """
    Install a fresh per-response extraction state on ``self._rt_state``.

    Every call creates new, empty containers, so nothing is shared with the
    state of a previous response.
    """
    self._rt_state = dict(
        output_parts=[],        # text fragments appended by _rt_append_text
        begin=True,             # True until the first fragment is appended
        fn_args_buffers={},
        tool_calls=[],
        citations=[],           # URLs collected by _rt_add_citation
        files=[],
        image_paths=[],
        is_image=False,
        is_code=False,
        force_func_call=False,
        usage_payload={},       # filled by _rt_capture_usage
    )
|
|
1587
|
+
|
|
1588
|
+
def _rt_append_text(self, s: str):
    """
    Append a text delta to the assembled output.

    Creates the per-response state on demand. Empty deltas that arrive before
    any real text are skipped. ``None`` is ignored at any point, so it can
    never end up in the output as the literal string ``"None"``.

    :param s: text delta (``None`` is treated as "no text")
    """
    if self._rt_state is None:
        self._rt_reset_state()
    if s is None:
        # Nothing to append; previously a late None was stored as "None".
        return
    if self._rt_state["begin"] and s == "":
        # Still waiting for the first non-empty fragment.
        return
    self._rt_state["output_parts"].append(str(s))
    self._rt_state["begin"] = False
|
|
1596
|
+
|
|
1597
|
+
def _rt_add_citation(self, url: Optional[str]):
    """
    Record a URL citation in the per-response state and on the current ctx.

    Only non-empty strings that, after stripping whitespace, start with
    ``http://`` or ``https://`` are accepted; anything else is ignored.
    The URL is added to ``self._rt_state["citations"]`` and to
    ``self._ctx.urls`` (when a ctx is set), each without duplicates.
    The per-response state is created on demand, as in ``_rt_append_text``.

    :param url: candidate citation URL
    """
    if not url or not isinstance(url, str):
        return
    url = url.strip()
    if not url.startswith(("http://", "https://")):
        return

    # The state is reset to None after each response; create it on demand
    # instead of failing on the subscript below.
    if self._rt_state is None:
        self._rt_reset_state()

    known = self._rt_state["citations"]
    if url not in known:
        known.append(url)

    # Mirroring onto ctx is best-effort and must not break event handling.
    try:
        ctx = self._ctx
        if ctx:
            if ctx.urls is None:
                ctx.urls = []
            if url not in ctx.urls:
                ctx.urls.append(url)
    except Exception:
        pass
|
|
1614
|
+
|
|
1615
|
+
def _rt_capture_usage(self, response_obj: dict):
    """
    Capture token usage from a ``response.done`` response object, if present.

    Stores ``{"in", "out", "total", "reasoning"}`` under
    ``self._rt_state["usage_payload"]``. Input/output counts are read from
    ``input_tokens`` / ``output_tokens`` and fall back to ``prompt_tokens`` /
    ``completion_tokens`` only when the primary key is missing or ``None``,
    so a reported count of 0 is kept as 0. Missing values are stored as
    ``None``; ``reasoning`` is always 0 here.

    Best-effort: does nothing when there is no usage data, and swallows any
    error (including non-numeric values).

    :param response_obj: the ``response`` object of the event (may be None)
    """
    try:
        usage = (response_obj or {}).get("usage") or {}
        if not usage:
            return

        def pick(*keys):
            # First key holding a non-None value wins; 0 is a valid count.
            for key in keys:
                value = usage.get(key)
                if value is not None:
                    return int(value)
            return None

        self._rt_state["usage_payload"] = {
            "in": pick("input_tokens", "prompt_tokens"),
            "out": pick("output_tokens", "completion_tokens"),
            "total": pick("total_tokens"),
            "reasoning": 0,
        }
    except Exception:
        pass
|
|
1634
|
+
|
|
1635
|
+
# ---- transcription helpers ----
|
|
1636
|
+
|
|
1637
|
+
def _transcribe_enabled(self) -> bool:
    """
    Tell whether transcription is switched on in the last used options.

    :return: truthiness of ``self._last_opts.transcribe``; False when the
             attribute is missing or reading it fails
    """
    try:
        flag = getattr(self._last_opts, "transcribe", False)
        return True if flag else False
    except Exception:
        return False
|
|
1643
|
+
|
|
1644
|
+
def _save_input_transcript(self, transcript: str):
    """
    Persist the transcript of the user's input into the current ctx.

    The transcript is stored under ``ctx.extra["input_transcript"]``. If the
    last options carry no text ``prompt`` for this turn, ``ctx.input`` is set
    to the transcript as well, so downstream code treats it as the user's
    textual message. The ctx item is then saved through
    ``window.core.ctx.update_item``.

    Best-effort: does nothing for an empty transcript or when no ctx is set,
    and swallows any error.

    :param transcript: transcribed user input
    """
    if not transcript:
        return
    try:
        ctx = self._ctx
        if not ctx:
            return
        if not isinstance(ctx.extra, dict):
            ctx.extra = {}
        # Write to ctx.extra (the dict ensured above). The previous target,
        # ctx.input.extra, does not exist on the textual input, so the error
        # was swallowed and nothing was ever saved.
        ctx.extra["input_transcript"] = str(transcript)
        if not getattr(self._last_opts, "prompt", None):
            ctx.input = str(transcript)
        self.window.core.ctx.update_item(ctx)
    except Exception:
        pass
|
|
1661
|
+
|
|
1662
|
+
def _tune_openai_vad(self, session_payload: dict, opts) -> None:
    """
    Adjust the VAD timing fields of a session payload in place.

    Works on ``session_payload["session"]["turn_detection"]`` and returns
    without changes when that value is not a dict.

    - ``silence_duration_ms`` is set to ``opts.vad_end_silence_ms`` when that
      is a positive number; otherwise to the payload's current value (500 when
      absent) raised to at least 2000.
    - ``prefix_padding_ms`` is set from ``opts.vad_prefix_padding_ms`` when
      that is a non-negative number.

    Any error is swallowed.

    :param session_payload: ``session.update`` payload to modify
    :param opts: options object (attributes are read with ``getattr``)
    """
    try:
        session = session_payload.get("session") or {}
        detection = session.get("turn_detection")
        if not isinstance(detection, dict):
            return  # no turn detection configured in this payload

        hold_ms = getattr(opts, "vad_end_silence_ms", None)
        if not isinstance(hold_ms, (int, float)) or hold_ms <= 0:
            # No usable override: keep the current value, but never below 2000 ms.
            current = int(detection.get("silence_duration_ms") or 500)
            hold_ms = current if current > 2000 else 2000
        detection["silence_duration_ms"] = int(hold_ms)

        padding_ms = getattr(opts, "vad_prefix_padding_ms", None)
        if isinstance(padding_ms, (int, float)) and padding_ms >= 0:
            detection["prefix_padding_ms"] = int(padding_ms)
    except Exception:
        pass
|
|
1687
|
+
|
|
1688
|
+
def update_session_autoturn_sync(
    self,
    enabled: bool,
    silence_ms: Optional[int] = None,
    prefix_ms: Optional[int] = None,
    timeout: float = 5.0,
):
    """
    Blocking wrapper that switches auto-turn (VAD) mode on or off.

    Ensures the background loop exists, then runs
    ``_update_session_autoturn_internal`` on it through ``self._bg.run_sync``.
    Any exception raised while doing so is swallowed.

    :param enabled: True to enable auto-turn, False to disable it
    :param silence_ms: optional silence override in milliseconds
    :param prefix_ms: optional prefix override in milliseconds
    :param timeout: timeout passed to ``run_sync``
    """
    self._ensure_background_loop()
    try:
        pending = self._update_session_autoturn_internal(enabled, silence_ms, prefix_ms)
        self._bg.run_sync(pending, timeout=timeout)
    except Exception:
        pass
|
|
1708
|
+
|
|
1709
|
+
async def _update_session_autoturn_internal(
    self,
    enabled: bool,
    silence_ms: Optional[int] = None,
    prefix_ms: Optional[int] = None,
):
    """
    Toggle auto-turn (VAD) on the live session; runs on the owner loop.

    When no socket is open, the requested values are only cached on
    ``self._last_opts`` and the method returns. Otherwise a ``session.update``
    message is built and sent under the send lock, and the same values are
    cached afterwards. Send errors are caught and, in debug mode, printed.

    :param enabled: True for auto-turn, False for manual turns
    :param silence_ms: optional silence override (ms); wins over opts defaults
    :param prefix_ms: optional prefix padding override (ms); wins over opts defaults
    """

    def remember_settings() -> None:
        # Mirror the requested values onto the cached opts; failures are ignored.
        try:
            opts = self._last_opts
            if opts:
                opts.auto_turn = bool(enabled)
                if silence_ms is not None:
                    opts.vad_end_silence_ms = int(silence_ms)
                if prefix_ms is not None:
                    opts.vad_prefix_padding_ms = int(prefix_ms)
        except Exception:
            pass

    # Socket closed: nothing to send, keep the values for the next session.
    if not self.ws:
        remember_settings()
        if self.debug:
            print("[update_session_autoturn] WS not open; cached for next session")
        return

    if self._send_lock is None:
        self._send_lock = asyncio.Lock()

    async with self._send_lock:
        try:
            # Start from an empty session.update; the helper fills in
            # session.turn_detection for the chosen mode.
            message: dict = {"type": "session.update", "session": {}}
            apply_turn_mode_openai(message, TurnMode.AUTO if enabled else TurnMode.MANUAL)

            if enabled:
                detection = message.get("session", {}).get("turn_detection")

                # Optional VAD type from opts.vad_type ("server_vad" | "semantic_vad")
                try:
                    vad_kind = getattr(self._last_opts, "vad_type", None)
                    if (
                        isinstance(vad_kind, str)
                        and vad_kind in ("server_vad", "semantic_vad")
                        and isinstance(detection, dict)
                    ):
                        detection["type"] = vad_kind
                except Exception:
                    pass

                # Optional threshold, applied to server_vad only
                try:
                    level = getattr(self._last_opts, "vad_threshold", None)
                    if (
                        isinstance(level, (int, float))
                        and isinstance(detection, dict)
                        and detection.get("type") == "server_vad"
                    ):
                        detection["threshold"] = float(level)
                except Exception:
                    pass

                # Defaults derived from opts go in first...
                self._tune_openai_vad(message, self._last_opts)

                if isinstance(detection, dict):
                    # ...then explicit arguments override them.
                    if silence_ms is not None:
                        detection["silence_duration_ms"] = int(silence_ms)
                    if prefix_ms is not None:
                        detection["prefix_padding_ms"] = int(prefix_ms)

                    # Optional boolean flags from opts
                    try:
                        create_flag = getattr(self._last_opts, "vad_create_response", None)
                        if isinstance(create_flag, bool):
                            detection["create_response"] = create_flag
                    except Exception:
                        pass
                    try:
                        interrupt_flag = getattr(self._last_opts, "vad_interrupt_response", None)
                        if isinstance(interrupt_flag, bool):
                            detection["interrupt_response"] = interrupt_flag
                    except Exception:
                        pass

            await self.ws.send(json.dumps(message))

            # Keep the local opts snapshot in line with what was just sent.
            remember_settings()

            if self.debug:
                td_dbg = (message.get("session", {}) or {}).get("turn_detection")
                print(f"[update_session_autoturn] session.update sent; auto_turn={enabled}, td={td_dbg}")

        except Exception as e:
            if self.debug:
                print(f"[update_session_autoturn] send error: {e}")
|
|
1810
|
+
|
|
1811
|
+
def set_debug(self, enabled: bool):
    """
    Switch debug logging on or off.

    :param enabled: truthy to turn debug output on, falsy to turn it off
    """
    self.debug = True if enabled else False
|
|
1818
|
+
|
|
1819
|
+
def is_session_active(self) -> bool:
    """
    Report whether the WS session is currently open.

    :return: False when there is no socket object, otherwise the value of
             the ``_running`` flag
    """
    if self.ws is None:
        return False
    return self._running
|
|
1822
|
+
|
|
1823
|
+
def update_ctx(self, ctx: CtxItem):
    """
    Replace the context item this client currently works with.

    Later handlers read it from ``self._ctx``.

    :param ctx: context item to use from now on
    """
    self._ctx = ctx
|