pygpt-net 2.7.6__py3-none-any.whl → 2.7.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. pygpt_net/CHANGELOG.txt +13 -0
  2. pygpt_net/__init__.py +3 -3
  3. pygpt_net/app.py +5 -1
  4. pygpt_net/controller/assistant/batch.py +2 -2
  5. pygpt_net/controller/assistant/files.py +7 -6
  6. pygpt_net/controller/assistant/threads.py +0 -0
  7. pygpt_net/controller/chat/command.py +0 -0
  8. pygpt_net/controller/chat/remote_tools.py +3 -9
  9. pygpt_net/controller/chat/stream.py +2 -2
  10. pygpt_net/controller/chat/{handler/worker.py → stream_worker.py} +13 -35
  11. pygpt_net/controller/dialogs/confirm.py +35 -58
  12. pygpt_net/controller/lang/mapping.py +9 -9
  13. pygpt_net/controller/remote_store/{google/batch.py → batch.py} +209 -252
  14. pygpt_net/controller/remote_store/remote_store.py +982 -13
  15. pygpt_net/core/command/command.py +0 -0
  16. pygpt_net/core/db/viewer.py +1 -1
  17. pygpt_net/core/debug/models.py +2 -2
  18. pygpt_net/core/realtime/worker.py +3 -1
  19. pygpt_net/{controller/remote_store/google → core/remote_store/anthropic}/__init__.py +0 -1
  20. pygpt_net/core/remote_store/anthropic/files.py +211 -0
  21. pygpt_net/core/remote_store/anthropic/store.py +208 -0
  22. pygpt_net/core/remote_store/openai/store.py +5 -4
  23. pygpt_net/core/remote_store/remote_store.py +5 -1
  24. pygpt_net/{controller/remote_store/openai → core/remote_store/xai}/__init__.py +0 -1
  25. pygpt_net/core/remote_store/xai/files.py +225 -0
  26. pygpt_net/core/remote_store/xai/store.py +219 -0
  27. pygpt_net/data/config/config.json +18 -5
  28. pygpt_net/data/config/models.json +193 -4
  29. pygpt_net/data/config/settings.json +179 -36
  30. pygpt_net/data/icons/folder_eye.svg +1 -0
  31. pygpt_net/data/icons/folder_eye_filled.svg +1 -0
  32. pygpt_net/data/icons/folder_open.svg +1 -0
  33. pygpt_net/data/icons/folder_open_filled.svg +1 -0
  34. pygpt_net/data/locale/locale.de.ini +6 -3
  35. pygpt_net/data/locale/locale.en.ini +46 -12
  36. pygpt_net/data/locale/locale.es.ini +6 -3
  37. pygpt_net/data/locale/locale.fr.ini +6 -3
  38. pygpt_net/data/locale/locale.it.ini +6 -3
  39. pygpt_net/data/locale/locale.pl.ini +7 -4
  40. pygpt_net/data/locale/locale.uk.ini +6 -3
  41. pygpt_net/data/locale/locale.zh.ini +6 -3
  42. pygpt_net/icons.qrc +4 -0
  43. pygpt_net/icons_rc.py +282 -138
  44. pygpt_net/plugin/cmd_mouse_control/worker.py +2 -1
  45. pygpt_net/plugin/cmd_mouse_control/worker_sandbox.py +2 -1
  46. pygpt_net/provider/api/anthropic/__init__.py +10 -3
  47. pygpt_net/provider/api/anthropic/chat.py +342 -11
  48. pygpt_net/provider/api/anthropic/computer.py +844 -0
  49. pygpt_net/provider/api/anthropic/remote_tools.py +172 -0
  50. pygpt_net/provider/api/anthropic/store.py +307 -0
  51. pygpt_net/{controller/chat/handler/anthropic_stream.py → provider/api/anthropic/stream.py} +99 -10
  52. pygpt_net/provider/api/anthropic/tools.py +32 -77
  53. pygpt_net/provider/api/anthropic/utils.py +30 -0
  54. pygpt_net/{controller/chat/handler → provider/api/anthropic/worker}/__init__.py +0 -0
  55. pygpt_net/provider/api/anthropic/worker/importer.py +278 -0
  56. pygpt_net/provider/api/google/chat.py +62 -9
  57. pygpt_net/provider/api/google/store.py +124 -3
  58. pygpt_net/{controller/chat/handler/google_stream.py → provider/api/google/stream.py} +92 -25
  59. pygpt_net/provider/api/google/utils.py +185 -0
  60. pygpt_net/provider/api/google/worker/importer.py +16 -28
  61. pygpt_net/provider/api/langchain/__init__.py +0 -0
  62. pygpt_net/{controller/chat/handler/langchain_stream.py → provider/api/langchain/stream.py} +1 -1
  63. pygpt_net/provider/api/llama_index/__init__.py +0 -0
  64. pygpt_net/{controller/chat/handler/llamaindex_stream.py → provider/api/llama_index/stream.py} +1 -1
  65. pygpt_net/provider/api/openai/assistants.py +2 -2
  66. pygpt_net/provider/api/openai/image.py +2 -2
  67. pygpt_net/provider/api/openai/store.py +4 -1
  68. pygpt_net/{controller/chat/handler/openai_stream.py → provider/api/openai/stream.py} +1 -1
  69. pygpt_net/provider/api/openai/utils.py +69 -3
  70. pygpt_net/provider/api/openai/worker/importer.py +19 -61
  71. pygpt_net/provider/api/openai/worker/importer_assistants.py +230 -0
  72. pygpt_net/provider/api/x_ai/__init__.py +138 -15
  73. pygpt_net/provider/api/x_ai/audio.py +43 -11
  74. pygpt_net/provider/api/x_ai/chat.py +92 -4
  75. pygpt_net/provider/api/x_ai/image.py +149 -47
  76. pygpt_net/provider/api/x_ai/realtime/__init__.py +12 -0
  77. pygpt_net/provider/api/x_ai/realtime/client.py +1825 -0
  78. pygpt_net/provider/api/x_ai/realtime/realtime.py +198 -0
  79. pygpt_net/provider/api/x_ai/{remote.py → remote_tools.py} +183 -70
  80. pygpt_net/provider/api/x_ai/responses.py +507 -0
  81. pygpt_net/provider/api/x_ai/store.py +610 -0
  82. pygpt_net/{controller/chat/handler/xai_stream.py → provider/api/x_ai/stream.py} +42 -10
  83. pygpt_net/provider/api/x_ai/tools.py +59 -8
  84. pygpt_net/{controller/chat/handler → provider/api/x_ai}/utils.py +1 -2
  85. pygpt_net/provider/api/x_ai/vision.py +1 -4
  86. pygpt_net/provider/api/x_ai/worker/importer.py +308 -0
  87. pygpt_net/provider/audio_input/xai_grok_voice.py +390 -0
  88. pygpt_net/provider/audio_output/xai_tts.py +325 -0
  89. pygpt_net/provider/core/config/patch.py +39 -3
  90. pygpt_net/provider/core/config/patches/patch_before_2_6_42.py +2 -2
  91. pygpt_net/provider/core/model/patch.py +39 -1
  92. pygpt_net/tools/image_viewer/tool.py +334 -34
  93. pygpt_net/tools/image_viewer/ui/dialogs.py +319 -22
  94. pygpt_net/tools/text_editor/ui/dialogs.py +3 -2
  95. pygpt_net/tools/text_editor/ui/widgets.py +0 -0
  96. pygpt_net/ui/dialog/assistant.py +1 -1
  97. pygpt_net/ui/dialog/plugins.py +13 -5
  98. pygpt_net/ui/dialog/remote_store.py +552 -0
  99. pygpt_net/ui/dialogs.py +3 -5
  100. pygpt_net/ui/layout/ctx/ctx_list.py +58 -7
  101. pygpt_net/ui/menu/tools.py +6 -13
  102. pygpt_net/ui/widget/dialog/base.py +16 -5
  103. pygpt_net/ui/widget/dialog/{remote_store_google.py → remote_store.py} +10 -10
  104. pygpt_net/ui/widget/element/button.py +4 -4
  105. pygpt_net/ui/widget/image/display.py +2 -2
  106. pygpt_net/ui/widget/lists/context.py +2 -2
  107. pygpt_net/ui/widget/textarea/editor.py +0 -0
  108. {pygpt_net-2.7.6.dist-info → pygpt_net-2.7.8.dist-info}/METADATA +15 -2
  109. {pygpt_net-2.7.6.dist-info → pygpt_net-2.7.8.dist-info}/RECORD +107 -89
  110. pygpt_net/controller/remote_store/google/store.py +0 -615
  111. pygpt_net/controller/remote_store/openai/batch.py +0 -524
  112. pygpt_net/controller/remote_store/openai/store.py +0 -699
  113. pygpt_net/ui/dialog/remote_store_google.py +0 -539
  114. pygpt_net/ui/dialog/remote_store_openai.py +0 -539
  115. pygpt_net/ui/widget/dialog/remote_store_openai.py +0 -56
  116. pygpt_net/ui/widget/lists/remote_store_google.py +0 -248
  117. pygpt_net/ui/widget/lists/remote_store_openai.py +0 -317
  118. {pygpt_net-2.7.6.dist-info → pygpt_net-2.7.8.dist-info}/LICENSE +0 -0
  119. {pygpt_net-2.7.6.dist-info → pygpt_net-2.7.8.dist-info}/WHEEL +0 -0
  120. {pygpt_net-2.7.6.dist-info → pygpt_net-2.7.8.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,1825 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # ================================================== #
4
+ # This file is a part of PYGPT package #
5
+ # Website: https://pygpt.net #
6
+ # GitHub: https://github.com/szczyglis-dev/py-gpt #
7
+ # MIT License #
8
+ # Created By : Marcin Szczygliński #
9
+ # Updated Date: 2026.01.06 20:00:00 #
10
+ # ================================================== #
11
+
12
+ import asyncio
13
+ import base64
14
+ import io
15
+ import json
16
+ import websockets
17
+
18
+ from typing import Optional, Callable, Awaitable
19
+ from urllib.parse import urlencode
20
+
21
+ from pygpt_net.core.events import RealtimeEvent
22
+ from pygpt_net.core.types import MODE_AUDIO
23
+ from pygpt_net.item.ctx import CtxItem
24
+ from pygpt_net.core.text.utils import has_unclosed_code_tag
25
+
26
+ # shared
27
+ from pygpt_net.core.realtime.shared.loop import BackgroundLoop
28
+ from pygpt_net.core.realtime.shared.audio import (
29
+ coerce_to_pcm16_mono,
30
+ resample_pcm16_mono,
31
+ iter_pcm_chunks,
32
+ DEFAULT_24K,
33
+ )
34
+ from pygpt_net.core.realtime.shared.tools import (
35
+ sanitize_function_tools,
36
+ sanitize_remote_tools,
37
+ prepare_tools_for_session,
38
+ prepare_tools_for_response,
39
+ tools_signature,
40
+ build_tool_outputs_payload,
41
+ )
42
+ from pygpt_net.core.realtime.shared.turn import TurnMode, apply_turn_mode_openai
43
+ from pygpt_net.core.realtime.shared.session import set_ctx_rt_handle, set_rt_session_expires_at
44
+
45
+
46
+ class xAIIRealtimeClient:
47
+ """
48
+ xAI Realtime API client with persistent session and a dedicated background event loop.
49
+
50
+ Key points:
51
+ - A single background asyncio loop runs in its own thread for the lifetime of the client.
52
+ - One websocket connection (session) at a time; multiple "turns" (send_turn) are serialized.
53
+ - No server VAD: manual turn control via input_audio_buffer.* + response.create.
54
+ - Safe to call run()/send_turn()/reset()/shutdown() from any thread or event loop.
55
+
56
+ Session resumption:
57
+ - The official Realtime API does not expose a documented server-side "resume" for closed WS sessions.
58
+ We still persist the server-provided session.id and surface it via ctx.extra["rt_session_id"].
59
+ - If opts.rt_session_id is provided and differs from the current in-memory handle, we reset the
60
+ connection and attempt to reconnect with a "session_id" query parameter. If that fails, we fall
61
+ back to the standard URL to avoid breaking existing functionality.
62
+ """
63
+
64
+ WS_URL = "wss://api.x.ai/v1/realtime"
65
+
66
def __init__(self, window=None, debug: bool = False):
    """
    Create an idle xAI Realtime client; no websocket is opened here.

    :param window: Window instance
    :param debug: Enable debug logging
    """
    self.window = window
    self.debug = debug

    # Background loop wrapper used to run all session coroutines
    self._bg = BackgroundLoop(name="xAI-RT-Loop")

    # Websocket handle, receiver task and running flag (live on the owner loop)
    self.ws: Optional[websockets.WebSocketClientProtocol] = None
    self._rx_task: Optional[asyncio.Task] = None
    self._running: bool = False

    # Flow-control primitives; instantiated later, on the owner loop
    self._send_lock: Optional[asyncio.Lock] = None
    self._response_done: Optional[asyncio.Event] = None
    self._response_active: bool = False

    # Caller-supplied callbacks and the context of the current conversation
    self._on_text: Optional[Callable[[str], Awaitable[None]]] = None
    self._on_audio: Optional[Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]]] = None
    self._should_stop: Optional[Callable[[], bool]] = None
    self._ctx: Optional[CtxItem] = None
    # Remembered so that reset() can be called without passing options again
    self._last_opts = None

    # Sample rate used as the fallback/target for audio handling
    self._DEFAULT_RATE = DEFAULT_24K

    # Per-response extraction state (tools/images/citations/usage/assembled text);
    # a dict populated on response.created
    self._rt_state = None

    # Input transcription buffers, keyed by item_id
    self._input_tr_buffers: dict[str, io.StringIO] = {}

    # Signature of the tools last attached to the session (avoids redundant session.update)
    self._cached_session_tools_sig: Optional[str] = None

    # Snapshot of the most recent tool calls, used to map tool responses
    self._last_tool_calls: list[dict] = []

    # Live session handle for best-effort resumption
    self._rt_session_id: Optional[str] = None
    # Epoch seconds, if the server provides an expiry
    self._rt_session_expires_at: Optional[int] = None
113
+
114
+ # -----------------------------
115
+ # Public high-level entrypoints
116
+ # -----------------------------
117
+
118
async def run(
    self,
    ctx: CtxItem,
    opts,
    on_text: Callable[[str], Awaitable[None]],
    on_audio: Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]],
    should_stop: Callable[[], bool] = lambda: False,
):
    """
    Execute a single turn: make sure a session exists, then send the prompt/audio.

    :param ctx: CtxItem with model and conversation
    :param opts: Options object with prompt/audio/voice/etc.
    :param on_text: Async callback for text deltas
    :param on_audio: Async callback for audio chunks
    :param should_stop: Sync callback to signal barge-in (cancel active response)
    """
    self._ensure_background_loop()
    self._ctx = ctx

    # A session is open but the caller asks for another handle: reset (best effort,
    # failures are deliberately ignored so the turn can still proceed).
    try:
        requested_sid = getattr(opts, "rt_session_id", None)
        if isinstance(requested_sid, str):
            requested_sid = requested_sid.strip()
        if self.ws is not None and requested_sid:
            if requested_sid != (self._rt_session_id or ""):
                reset_coro = self._reset_session_internal(ctx, opts, on_text, on_audio, should_stop)
                await self._run_on_owner(reset_coro)
    except Exception:
        pass

    # Lazily open the session on the owner loop
    if not self.ws:
        open_coro = self._open_session_internal(ctx, opts, on_text, on_audio, should_stop)
        await self._run_on_owner(open_coro)

    # Dispatch the turn; when streaming is requested we do not block until the response is done
    streaming = bool(getattr(opts, "streaming", False))
    turn_coro = self._send_turn_internal(
        getattr(opts, "prompt", None),
        getattr(opts, "audio_data", None),
        getattr(opts, "audio_format", None),
        getattr(opts, "audio_rate", None),
        wait_for_done=not streaming,
    )
    await self._run_on_owner(turn_coro)
160
+
161
async def open_session(
    self,
    ctx: CtxItem,
    opts,
    on_text: Callable[[str], Awaitable[None]],
    on_audio: Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]],
    should_stop: Callable[[], bool] = lambda: False,
):
    """
    Open the websocket session explicitly (run() normally does this on demand).

    :param ctx: CtxItem with model and conversation
    :param opts: Options object; opts.rt_session_id may request a specific session handle
    :param on_text: Async callback for text deltas
    :param on_audio: Async callback for audio chunks
    :param should_stop: Sync callback to signal barge-in
    """
    self._ensure_background_loop()

    # An open session with a different requested handle is reset instead of re-opened.
    # Any failure here is ignored and we fall through to the regular open below.
    try:
        wanted = getattr(opts, "rt_session_id", None)
        if isinstance(wanted, str):
            wanted = wanted.strip()
        if self.ws is not None and wanted:
            if wanted != (self._rt_session_id or ""):
                reset_coro = self._reset_session_internal(ctx, opts, on_text, on_audio, should_stop)
                await self._run_on_owner(reset_coro)
                return
    except Exception:
        pass

    open_coro = self._open_session_internal(ctx, opts, on_text, on_audio, should_stop)
    await self._run_on_owner(open_coro)
186
+
187
async def close_session(self):
    """Close the websocket session; the background loop keeps running."""
    if self._bg.loop:
        await self._run_on_owner(self._close_session_internal())
192
+
193
async def reset_session(
    self,
    ctx: Optional[CtxItem] = None,
    opts=None,
    on_text: Optional[Callable[[str], Awaitable[None]]] = None,
    on_audio: Optional[Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]]] = None,
    should_stop: Optional[Callable[[], bool]] = None,
):
    """
    Tear down the current session and open a new one.

    Arguments left as None are forwarded unchanged to the internal reset coroutine.

    :param ctx: CtxItem to use for the new session
    :param opts: Options object for the new session
    :param on_text: Async callback for text deltas
    :param on_audio: Async callback for audio chunks
    :param should_stop: Sync callback to signal barge-in
    """
    self._ensure_background_loop()
    reset_coro = self._reset_session_internal(ctx, opts, on_text, on_audio, should_stop)
    await self._run_on_owner(reset_coro)
207
+
208
async def shutdown(self):
    """
    Close the current session, if there is one.

    The background loop is left running; call stop_loop_sync() or shutdown_and_stop()
    to stop it as well.
    """
    if self._bg.loop:
        await self._run_on_owner(self._close_session_internal())
216
+
217
async def shutdown_and_stop(self):
    """Close the session first, then stop the background loop thread."""
    await self.shutdown()
    self.stop_loop_sync()
221
+
222
+ # -----------------------------
223
+ # Synchronous convenience calls
224
+ # -----------------------------
225
+
226
def close_session_sync(self, timeout: float = 5.0):
    """
    Synchronous wrapper around close_session().

    Does nothing when the background loop does not exist or is not running.

    :param timeout: value passed as the timeout to the background loop's run_sync()
    """
    loop = self._bg.loop
    if loop and loop.is_running():
        self._bg.run_sync(self._close_session_internal(), timeout=timeout)
231
+
232
def reset_session_sync(
    self,
    ctx: Optional[CtxItem] = None,
    opts=None,
    on_text: Optional[Callable[[str], Awaitable[None]]] = None,
    on_audio: Optional[Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]]] = None,
    should_stop: Optional[Callable[[], bool]] = None,
    timeout: float = 10.0,
):
    """
    Synchronous wrapper around reset_session().

    :param ctx: CtxItem to use for the new session
    :param opts: Options object for the new session
    :param on_text: Async callback for text deltas
    :param on_audio: Async callback for audio chunks
    :param should_stop: Sync callback to signal barge-in
    :param timeout: value passed as the timeout to the background loop's run_sync()
    """
    self._ensure_background_loop()
    reset_coro = self._reset_session_internal(ctx, opts, on_text, on_audio, should_stop)
    self._bg.run_sync(reset_coro, timeout=timeout)
244
+
245
def shutdown_sync(self, timeout: float = 5.0):
    """
    Synchronous wrapper around shutdown(): closes the WS but leaves the loop alive.

    Does nothing when the background loop does not exist or is not running.

    :param timeout: value passed as the timeout to the background loop's run_sync()
    """
    loop = self._bg.loop
    if loop and loop.is_running():
        self._bg.run_sync(self._close_session_internal(), timeout=timeout)
250
+
251
def stop_loop_sync(self, timeout: float = 2.0):
    """
    Stop the background event loop thread.

    :param timeout: value forwarded to the background loop's stop()
    """
    background = self._bg
    background.stop(timeout=timeout)
254
+
255
+ # -----------------------------
256
+ # Tools helpers
257
+ # -----------------------------
258
+
259
+ def _update_last_opts_tools(self, tools: Optional[list], remote_tools: Optional[list]) -> None:
260
+ """
261
+ Update self._last_opts with tools/remote_tools if fields are present.
262
+ """
263
+ lo = self._last_opts
264
+ if not lo:
265
+ return
266
+ try:
267
+ if tools is not None and hasattr(lo, "tools"):
268
+ setattr(lo, "tools", tools)
269
+ except Exception:
270
+ pass
271
+ try:
272
+ if remote_tools is not None and hasattr(lo, "remote_tools"):
273
+ setattr(lo, "remote_tools", remote_tools)
274
+ except Exception:
275
+ pass
276
+
277
+ # -----------------------------
278
+ # Internal: background loop/dispatch
279
+ # -----------------------------
280
+
281
+ def _ensure_background_loop(self):
282
+ """Start the background asyncio loop once and keep it running."""
283
+ self._bg.ensure()
284
+
285
+ async def _run_on_owner(self, coro):
286
+ """Await a coroutine scheduled on the owner loop from any thread/loop."""
287
+ return await self._bg.run(coro)
288
+
289
+ # -----------------------------
290
+ # Internal: session lifecycle
291
+ # -----------------------------
292
+
293
+ async def _open_session_internal(
294
+ self,
295
+ ctx: CtxItem,
296
+ opts,
297
+ on_text: Callable[[str], Awaitable[None]],
298
+ on_audio: Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]],
299
+ should_stop: Callable[[], bool] = lambda: False,
300
+ ):
301
+ """
302
+ Open WS and configure the Realtime session on the owner loop.
303
+ """
304
+ if self.ws is not None:
305
+ if self.debug:
306
+ print("[open_session] already open")
307
+ return
308
+
309
+ core = self.window.core
310
+ api_key = self.window.core.config.get("api_key_xai")
311
+ if not api_key:
312
+ raise RuntimeError("xAPI key not configured")
313
+
314
+ model_id = getattr(opts, "model", None) or (ctx.model if ctx and ctx.model else "grok-3")
315
+ voice = getattr(opts, "voice", None) or self._preferred_voice()
316
+
317
+ # Optional: requested resume handle from opts
318
+ resume_sid = None
319
+ try:
320
+ provided = getattr(opts, "rt_session_id", None)
321
+ if isinstance(provided, str):
322
+ provided = provided.strip()
323
+ if provided and provided != (self._rt_session_id or ""):
324
+ resume_sid = provided
325
+ self._rt_session_id = resume_sid
326
+ set_ctx_rt_handle(self._ctx, resume_sid, self.window)
327
+ except Exception:
328
+ pass
329
+
330
+ # Build WS URL with model and optional session_id for best-effort resume
331
+ base_q = {"model": model_id}
332
+ if resume_sid:
333
+ base_q["session_id"] = resume_sid # if unsupported by server, connect fallback will ignore
334
+ url_with_sid = f"{self.WS_URL}?{urlencode(base_q)}"
335
+ url_no_sid = f"{self.WS_URL}?{urlencode({'model': model_id})}"
336
+
337
+ headers = {
338
+ "Authorization": f"Bearer {api_key}",
339
+ }
340
+
341
+ # Transcription toggle
342
+ transcribe_enabled = bool(getattr(opts, "transcribe", False))
343
+
344
+ # Save callbacks and context
345
+ self._on_text = on_text
346
+ self._on_audio = on_audio
347
+ self._should_stop = should_stop or (lambda: False)
348
+ self._ctx = ctx
349
+ self._last_opts = opts
350
+
351
+ # Control primitives
352
+ self._response_done = asyncio.Event()
353
+ self._send_lock = asyncio.Lock()
354
+
355
+ if self.debug:
356
+ print(f"[open_session] owner_loop={id(asyncio.get_running_loop())}")
357
+
358
+ # Connect WS: first try with session_id if provided; on failure, fall back to plain URL.
359
+ try:
360
+ target_url = url_with_sid if resume_sid else url_no_sid
361
+ self.ws = await websockets.connect(
362
+ target_url,
363
+ additional_headers=headers,
364
+ max_size=16 * 1024 * 1024,
365
+ ping_interval=20,
366
+ ping_timeout=20,
367
+ close_timeout=5,
368
+ )
369
+ except Exception as e:
370
+ if resume_sid and self.debug:
371
+ print(f"[open_session] connect with session_id failed ({e!r}); falling back to plain URL")
372
+ if resume_sid:
373
+ self.ws = await websockets.connect(
374
+ url_no_sid,
375
+ additional_headers=headers,
376
+ max_size=16 * 1024 * 1024,
377
+ ping_interval=20,
378
+ ping_timeout=20,
379
+ close_timeout=5,
380
+ )
381
+ if self.debug:
382
+ print("[open_session] WS connected")
383
+
384
+ # Session payload (manual by default; prepared for auto)
385
+ session_payload = {
386
+ "type": "session.update",
387
+ "session": {
388
+ "modalities": ["text", "audio"],
389
+ "voice": voice,
390
+ "input_audio_format": "pcm16",
391
+ "output_audio_format": "pcm16",
392
+ # turn_detection set below via apply_turn_mode_openai
393
+ **({"instructions": str(getattr(opts, "system_prompt"))} if getattr(opts, "system_prompt", None) else {}),
394
+ },
395
+ }
396
+ turn_mode = TurnMode.AUTO if bool(getattr(opts, "auto_turn", False)) else TurnMode.MANUAL
397
+ apply_turn_mode_openai(session_payload, turn_mode)
398
+ self._tune_openai_vad(session_payload, opts)
399
+
400
+ # Attach tools to session (remote + functions)
401
+ try:
402
+ session_tools = prepare_tools_for_session(opts)
403
+ if session_tools:
404
+ session_payload["session"]["tools"] = session_tools
405
+ self._cached_session_tools_sig = tools_signature(session_tools)
406
+ if self.debug:
407
+ print(f"[open_session] session.tools attached: {len(session_tools)}")
408
+ else:
409
+ self._cached_session_tools_sig = tools_signature([])
410
+ except Exception as _e:
411
+ if self.debug:
412
+ print(f"[open_session] tools sanitize error: {_e}")
413
+ self._cached_session_tools_sig = tools_signature([])
414
+
415
+ # Attach native input transcription if requested
416
+ try:
417
+ if transcribe_enabled:
418
+ iat = {"model": "whisper-1"}
419
+ lang = getattr(opts, "transcribe_language", None) or getattr(opts, "language", None)
420
+ if lang:
421
+ iat["language"] = str(lang)
422
+ session_payload["session"]["input_audio_transcription"] = iat
423
+ except Exception:
424
+ pass
425
+
426
+ if self.debug:
427
+ print(f"[open_session] session_payload: {json.dumps(session_payload)}")
428
+
429
+ await self.ws.send(json.dumps(session_payload))
430
+ if self.debug:
431
+ print("[open_session] session.update sent")
432
+
433
+ # Start a single receiver task
434
+ if self._rx_task is None or self._rx_task.done():
435
+ self._running = True
436
+ self._rx_task = asyncio.create_task(self._recv_loop(), name="realtime-recv")
437
+ if self.debug:
438
+ print("[open_session] _recv_loop started")
439
+
440
+ async def _close_session_internal(self):
441
+ """Close WS and stop the receiver; keep the background loop alive for reuse."""
442
+ self._running = False
443
+
444
+ # Cancel active response if any
445
+ if self.ws and self._response_active:
446
+ try:
447
+ await self.ws.send(json.dumps({"type": "response.cancel"}))
448
+ except Exception:
449
+ pass
450
+
451
+ # Unblock any waiters before clearing handles
452
+ try:
453
+ if self._response_done and not self._response_done.is_set():
454
+ self._response_done.set()
455
+ except Exception:
456
+ pass
457
+
458
+ # Close the socket
459
+ if self.ws:
460
+ try:
461
+ await self.ws.close()
462
+ except Exception:
463
+ pass
464
+ self.ws = None
465
+
466
+ # Await receiver
467
+ if self._rx_task:
468
+ try:
469
+ await self._rx_task
470
+ except Exception:
471
+ pass
472
+ self._rx_task = None
473
+
474
+ # Reset control primitives
475
+ self._response_active = False
476
+ self._response_done = None
477
+ self._send_lock = None
478
+ self._cached_session_tools_sig = None
479
+
480
+ # Clear in-memory handle; do not wipe persisted ctx.extra["rt_session_id"]
481
+ self._rt_session_id = None
482
+ self._rt_session_expires_at = None
483
+
484
+ if self.debug:
485
+ print("[close_session] closed")
486
+
487
+ async def _reset_session_internal(
488
+ self,
489
+ ctx: Optional[CtxItem] = None,
490
+ opts=None,
491
+ on_text: Optional[Callable[[str], Awaitable[None]]] = None,
492
+ on_audio: Optional[Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]]] = None,
493
+ should_stop: Optional[Callable[[], bool]] = None,
494
+ ):
495
+ """
496
+ Close current session and open a new one with provided or last-known parameters.
497
+ """
498
+ # Determine params to reuse if not provided
499
+ ctx = ctx or self._ctx
500
+ opts = opts or self._last_opts
501
+ on_text = on_text or self._on_text
502
+ on_audio = on_audio or self._on_audio
503
+ should_stop = should_stop or self._should_stop or (lambda: False)
504
+
505
+ if not (ctx and opts and on_text and on_audio):
506
+ raise RuntimeError("reset_session requires previous or explicit ctx/opts/callbacks")
507
+
508
+ await self._close_session_internal()
509
+ await self._open_session_internal(ctx, opts, on_text, on_audio, should_stop)
510
+
511
+ # -----------------------------
512
+ # Internal: one "turn"
513
+ # -----------------------------
514
+
515
+ async def _send_turn_internal(
516
+ self,
517
+ prompt: Optional[str] = None,
518
+ audio_data: Optional[bytes] = None,
519
+ audio_format: Optional[str] = None,
520
+ audio_rate: Optional[int] = None,
521
+ wait_for_done: bool = True,
522
+ ):
523
+ """
524
+ Send one manual turn (optional text + optional audio) and trigger response.create.
525
+ """
526
+ if not self.ws:
527
+ # If session dropped remotely, try to reopen from last state
528
+ if self._ctx and self._last_opts:
529
+ await self._open_session_internal(self._ctx, self._last_opts, self._on_text, self._on_audio, self._should_stop)
530
+ else:
531
+ raise RuntimeError("Session not open. Call open_session(...) first.")
532
+
533
+ # Serialize all sends to a single WS writer
534
+ if self._send_lock is None:
535
+ self._send_lock = asyncio.Lock()
536
+
537
+ # Determine whether we should trigger a response for this turn
538
+ def _bool(v) -> bool:
539
+ try:
540
+ return bool(v)
541
+ except Exception:
542
+ return False
543
+
544
+ is_auto_turn = _bool(getattr(self._last_opts or object(), "auto_turn", False))
545
+ has_text = bool(prompt and str(prompt).strip() and str(prompt).strip() != "...")
546
+ has_audio = bool(audio_data)
547
+ # Honor explicit "reply" hint if provided by caller (e.g., opts.extra.reply == True)
548
+ reply_hint = False
549
+ try:
550
+ extra = getattr(self._last_opts, "extra", None)
551
+ if isinstance(extra, dict):
552
+ reply_hint = bool(extra.get("reply", False))
553
+ except Exception:
554
+ pass
555
+
556
+ # In manual mode, do not auto-trigger response.create when there is no user input and no explicit reply request.
557
+ if not has_text and not has_audio and not reply_hint:
558
+ if self.debug:
559
+ print("[send_turn] skipped: manual mode with empty input; waiting for explicit commit")
560
+ return
561
+
562
+ wait_prev: Optional[asyncio.Event] = None
563
+ wait_curr: Optional[asyncio.Event] = None
564
+
565
+ async with self._send_lock:
566
+ # Ensure previous response is finished (snapshot the handle to avoid race with close)
567
+ if self._response_active and self._response_done:
568
+ wait_prev = self._response_done
569
+
570
+ # Optional text
571
+ if has_text:
572
+ if self.debug:
573
+ print(f"[send_turn] prompt len={len(prompt)}")
574
+ await self.ws.send(json.dumps({
575
+ "type": "conversation.item.create",
576
+ "item": {
577
+ "type": "message",
578
+ "role": "user",
579
+ "content": [{"type": "input_text", "text": str(prompt)}],
580
+ },
581
+ }))
582
+
583
+ # Optional audio
584
+ if has_audio:
585
+ sr, _ch, pcm = coerce_to_pcm16_mono(audio_data, audio_format, audio_rate, fallback_rate=self._DEFAULT_RATE)
586
+
587
+ if sr != self._DEFAULT_RATE:
588
+ try:
589
+ pcm = resample_pcm16_mono(pcm, sr, self._DEFAULT_RATE)
590
+ if self.debug:
591
+ print(f"[audio] resampled {sr} -> {self._DEFAULT_RATE}")
592
+ sr = self._DEFAULT_RATE
593
+ except Exception as e:
594
+ if self.debug:
595
+ print(f"[audio] resample failed {sr}->{self._DEFAULT_RATE}: {e}")
596
+
597
+ await self.ws.send(json.dumps({"type": "input_audio_buffer.clear"}))
598
+ for chunk in iter_pcm_chunks(pcm, sr, ms=50):
599
+ if not chunk:
600
+ continue
601
+ await self.ws.send(json.dumps({
602
+ "type": "input_audio_buffer.append",
603
+ "audio": base64.b64encode(chunk).decode("utf-8"),
604
+ }))
605
+ await self.ws.send(json.dumps({"type": "input_audio_buffer.commit"}))
606
+
607
+ # If we were waiting for a previous response, do it inside lock handoff-safe
608
+ if wait_prev:
609
+ try:
610
+ if self.debug:
611
+ print("[send_turn] waiting for previous response")
612
+ await wait_prev.wait()
613
+ except Exception:
614
+ pass
615
+
616
+ # Prepare wait handle for the response about to start
617
+ if self._response_done is None:
618
+ self._response_done = asyncio.Event()
619
+ else:
620
+ try:
621
+ self._response_done.clear()
622
+ except Exception:
623
+ self._response_done = asyncio.Event()
624
+ wait_curr = self._response_done # snapshot for race-free waiting
625
+
626
+ # Build optional response payload (modalities + tools/tool_choice)
627
+ resp_obj = {"modalities": ["text", "audio"]}
628
+ try:
629
+ resp_tools, tool_choice = prepare_tools_for_response(self._last_opts)
630
+ if resp_tools:
631
+ resp_obj["tools"] = resp_tools
632
+ if tool_choice is None:
633
+ tool_choice = "auto"
634
+ if tool_choice:
635
+ resp_obj["tool_choice"] = tool_choice
636
+ except Exception as _e:
637
+ if self.debug:
638
+ print(f"[send_turn] response tools compose error: {_e}")
639
+
640
+ payload = {"type": "response.create"}
641
+ if len(resp_obj) > 0:
642
+ payload["response"] = resp_obj
643
+
644
+ await self.ws.send(json.dumps(payload))
645
+ if self.debug:
646
+ print("[send_turn] response.create sent")
647
+
648
+ # Optionally wait for response.done (otherwise return immediately)
649
+ if wait_for_done and wait_curr:
650
+ if self.debug:
651
+ print("[send_turn] waiting for response.done")
652
+ try:
653
+ await wait_curr.wait()
654
+ except Exception:
655
+ pass
656
+ if self.debug:
657
+ print("[send_turn] response.done received")
658
+
659
+ async def _cancel_active_response_internal(self):
660
+ """Cancel current response (barge-in)."""
661
+ if self.ws and self._response_active:
662
+ try:
663
+ await self.ws.send(json.dumps({"type": "response.cancel"}))
664
+ except Exception:
665
+ pass
666
+
667
+ # -----------------------------
668
+ # Internal: audio input (auto-turn mode)
669
+ # -----------------------------
670
+
671
def rt_handle_audio_input_sync(self, event: RealtimeEvent, timeout: float = 0.5):
    """
    Blocking entry point for live microphone input in auto-turn mode.

    The event's ``data`` (or its nested ``payload`` dict) is inspected first;
    when it carries no audio bytes the call returns without touching the
    background loop. Otherwise the async handler is run through
    ``self._bg.run_sync`` with the given timeout.

    :param event: realtime event carrying the audio payload
    :param timeout: seconds passed to ``run_sync``
    """
    try:
        body = getattr(event, "data", {}) or {}
        if isinstance(body, dict):
            nested = body.get("payload")
            if isinstance(nested, dict):
                body = nested
        has_audio = bool(body) and bool(body.get("data"))
    except Exception:
        return
    if not has_audio:
        return

    self._ensure_background_loop()
    try:
        job = self._rt_handle_audio_input_internal(event)
        self._bg.run_sync(job, timeout=timeout)
    except Exception:
        # Errors must never propagate into the audio callback.
        return
692
+
693
+ async def _rt_handle_audio_input_internal(self, event: RealtimeEvent):
694
+ """
695
+ Owner-loop implementation: push live audio to input buffer in auto-turn mode.
696
+ """
697
+ # Session must be open and auto-turn must be enabled
698
+ if not self.ws or not self._running:
699
+ if self.debug:
700
+ print("[_rt_handle_audio_input] Socket not open!")
701
+ return
702
+ try:
703
+ if not bool(getattr(self._last_opts, "auto_turn", False)):
704
+ return
705
+ except Exception:
706
+ return
707
+
708
+ # Extract normalized payload
709
+ payload = getattr(event, "data", {}) or {}
710
+ if isinstance(payload, dict) and "payload" in payload and isinstance(payload["payload"], dict):
711
+ payload = payload["payload"]
712
+
713
+ data: bytes = payload.get("data") or b""
714
+ if not data:
715
+ return
716
+ mime = str(payload.get("mime") or "audio/pcm")
717
+ rate = int(payload.get("rate") or 0) or self._DEFAULT_RATE
718
+ channels = int(payload.get("channels") or 1)
719
+ is_final = bool(payload.get("final", False))
720
+
721
+ # Convert to PCM16 mono @ 24kHz as required by our session config
722
+ fmt_hint = "pcm16" if mime.startswith("audio/pcm") else None
723
+ try:
724
+ sr, _ch, pcm = coerce_to_pcm16_mono(data, fmt_hint, rate, fallback_rate=self._DEFAULT_RATE)
725
+ if sr != self._DEFAULT_RATE:
726
+ try:
727
+ pcm = resample_pcm16_mono(pcm, sr, self._DEFAULT_RATE)
728
+ sr = self._DEFAULT_RATE
729
+ except Exception:
730
+ # On resample failure, still try to send raw chunk as-is (defensive)
731
+ sr = self._DEFAULT_RATE
732
+ except Exception:
733
+ return
734
+
735
+ # Serialize writes to the websocket
736
+ if self._send_lock is None:
737
+ self._send_lock = asyncio.Lock()
738
+
739
+ async with self._send_lock:
740
+ # Append in ~50 ms chunks to keep frames small
741
+ for chunk in iter_pcm_chunks(pcm, sr, ms=50):
742
+ if not chunk:
743
+ continue
744
+ try:
745
+ await self.ws.send(json.dumps({
746
+ "type": "input_audio_buffer.append",
747
+ "audio": base64.b64encode(chunk).decode("utf-8"),
748
+ }))
749
+ except Exception:
750
+ return
751
+
752
+ # If plugin reported stream end, flush the buffer once.
753
+ if is_final:
754
+ try:
755
+ if self.debug:
756
+ print("[_rt_handle_audio_input] final chunk; committing")
757
+ await self.ws.send(json.dumps({"type": "input_audio_buffer.commit"}))
758
+ except Exception:
759
+ pass
760
+
761
def commit_audio_input_sync(self, timeout: float = 0.5):
    """
    Blocking entry point that commits the input audio buffer (auto-turn mode).

    Ensures the background loop exists, then runs the async commit through
    ``self._bg.run_sync``. Any error is swallowed.

    :param timeout: seconds passed to ``run_sync``
    """
    self._ensure_background_loop()
    try:
        job = self._commit_audio_input_internal()
        self._bg.run_sync(job, timeout=timeout)
    except Exception:
        # Errors must never propagate into the audio callback.
        return
772
+
773
+ async def _commit_audio_input_internal(self):
774
+ """
775
+ Owner-loop implementation: commit input audio buffer in auto-turn mode.
776
+ """
777
+ if not self.ws or not self._running:
778
+ return
779
+ try:
780
+ if not bool(getattr(self._last_opts, "auto_turn", False)):
781
+ return
782
+ except Exception:
783
+ return
784
+ if self._send_lock is None:
785
+ self._send_lock = asyncio.Lock()
786
+ async with self._send_lock:
787
+ try:
788
+ await self.ws.send(json.dumps({"type": "input_audio_buffer.commit"}))
789
+ except Exception:
790
+ pass
791
+
792
def force_response_now_sync(self, timeout: float = 5.0):
    """
    Blocking wrapper: make the model respond to the current input buffer now.

    Ensures the background loop exists and runs the async implementation
    through ``self._bg.run_sync``. Any error is swallowed.

    :param timeout: seconds passed to ``run_sync``
    """
    self._ensure_background_loop()
    try:
        job = self._force_response_now_internal()
        self._bg.run_sync(job, timeout=timeout)
    except Exception:
        return
799
+
800
+ async def _force_response_now_internal(self):
801
+ """Owner-loop: commit current input buffer and trigger response.create."""
802
+ if not self.ws or not self._running:
803
+ return
804
+ try:
805
+ if not bool(getattr(self._last_opts, "auto_turn", False)):
806
+ # This helper is intended for auto-turn; manual flow already does commit+response.create.
807
+ return
808
+ except Exception:
809
+ return
810
+
811
+ if self._send_lock is None:
812
+ self._send_lock = asyncio.Lock()
813
+
814
+ async with self._send_lock:
815
+ # 1) Finalize current input buffer
816
+ try:
817
+ await self.ws.send(json.dumps({"type": "input_audio_buffer.commit"}))
818
+ except Exception:
819
+ return
820
+
821
+ # 2) Prepare wait handle for this response
822
+ if self._response_done is None:
823
+ self._response_done = asyncio.Event()
824
+ else:
825
+ try:
826
+ self._response_done.clear()
827
+ except Exception:
828
+ self._response_done = asyncio.Event()
829
+
830
+ # 3) Build response payload (modalities + tools/tool_choice like in _send_turn_internal)
831
+ resp_obj = {"modalities": ["text", "audio"]}
832
+ try:
833
+ resp_tools, tool_choice = prepare_tools_for_response(self._last_opts)
834
+ if resp_tools:
835
+ resp_obj["tools"] = resp_tools
836
+ if tool_choice is None:
837
+ tool_choice = "auto"
838
+ if tool_choice:
839
+ resp_obj["tool_choice"] = tool_choice
840
+ except Exception:
841
+ pass
842
+
843
+ # 4) Trigger the assistant response now
844
+ try:
845
+ await self.ws.send(json.dumps({"type": "response.create", "response": resp_obj}))
846
+ except Exception:
847
+ return
848
+
849
+ # -----------------------------
850
+ # Public: live tools update
851
+ # -----------------------------
852
+
853
async def update_session_tools(
    self,
    tools: Optional[list] = None,
    remote_tools: Optional[list] = None,
    force: bool = False
):
    """
    Update the session's tools live via ``session.update``.

    Ensures the background loop exists and runs the internal update through
    ``self._run_on_owner``, returning whatever that yields.

    :param tools: function tools, or None to reuse the ones in the last options
    :param remote_tools: remote tools, or None to reuse the ones in the last options
    :param force: send the update even when the tools signature is unchanged
    """
    self._ensure_background_loop()
    job = self._update_session_tools_internal(tools, remote_tools, force)
    outcome = await self._run_on_owner(job)
    return outcome
867
+
868
def update_session_tools_sync(
    self,
    tools: Optional[list] = None,
    remote_tools: Optional[list] = None,
    force: bool = False,
    timeout: float = 5.0
):
    """
    Blocking counterpart of update_session_tools().

    Runs the internal update through ``self._bg.run_sync``. Errors raised by
    ``run_sync`` propagate to the caller; nothing is returned.

    :param tools: function tools, or None to reuse the ones in the last options
    :param remote_tools: remote tools, or None to reuse the ones in the last options
    :param force: send the update even when the tools signature is unchanged
    :param timeout: seconds passed to ``run_sync``
    """
    self._ensure_background_loop()
    job = self._update_session_tools_internal(tools, remote_tools, force)
    self._bg.run_sync(job, timeout=timeout)
878
+
879
+ async def _update_session_tools_internal(
880
+ self,
881
+ tools: Optional[list],
882
+ remote_tools: Optional[list],
883
+ force: bool
884
+ ):
885
+ """
886
+ Owner-loop implementation for session tools update.
887
+ """
888
+ # If socket is not open, just cache into last opts
889
+ if not self.ws:
890
+ self._update_last_opts_tools(tools, remote_tools)
891
+ self._cached_session_tools_sig = None
892
+ if self.debug:
893
+ print("[update_session_tools] WS not open; cached for next session")
894
+ return
895
+
896
+ # Sanitize/compose session tools
897
+ try:
898
+ fn = sanitize_function_tools(tools if tools is not None else getattr(self._last_opts, "tools", None))
899
+ rt = sanitize_remote_tools(remote_tools if remote_tools is not None else getattr(self._last_opts, "remote_tools", None))
900
+ session_tools = (rt or []) + (fn or [])
901
+ except Exception as e:
902
+ if self.debug:
903
+ print(f"[update_session_tools] sanitize error: {e}")
904
+ session_tools = []
905
+
906
+ new_sig = tools_signature(session_tools)
907
+
908
+ # Compare with cached signature
909
+ if not force and self._cached_session_tools_sig == new_sig:
910
+ if self.debug:
911
+ print("[update_session_tools] no changes; skipping session.update")
912
+ self._update_last_opts_tools(tools, remote_tools)
913
+ return
914
+
915
+ # Send session.update under the single writer lock
916
+ if self._send_lock is None:
917
+ self._send_lock = asyncio.Lock()
918
+ async with self._send_lock:
919
+ try:
920
+ payload = {
921
+ "type": "session.update",
922
+ "session": {"tools": session_tools}
923
+ }
924
+ await self.ws.send(json.dumps(payload))
925
+ self._cached_session_tools_sig = new_sig
926
+ self._update_last_opts_tools(tools, remote_tools)
927
+ if self.debug:
928
+ print(f"[update_session_tools] session.update sent; tools={len(session_tools)}")
929
+ except Exception as e:
930
+ if self.debug:
931
+ print(f"[update_session_tools] send error: {e}")
932
+
933
+ # -----------------------------
934
+ # Public: send tool results back to the model
935
+ # -----------------------------
936
+
937
async def send_tool_results(
    self,
    results,
    continue_turn: bool = True,
    wait_for_done: bool = True,
):
    """
    Send tool results back to the Realtime session.

    Ensures the background loop exists and runs the internal implementation
    through ``self._run_on_owner``, returning whatever that yields.

    :param results: tool results to deliver
    :param continue_turn: ask the model to continue after the results
    :param wait_for_done: wait for the follow-up response to finish
    """
    self._ensure_background_loop()
    job = self._send_tool_results_internal(results, continue_turn, wait_for_done)
    outcome = await self._run_on_owner(job)
    return outcome
950
+
951
def send_tool_results_sync(
    self,
    results,
    continue_turn: bool = True,
    wait_for_done: bool = True,
    timeout: float = 20.0,
):
    """
    Blocking counterpart of send_tool_results().

    Runs the internal implementation through ``self._bg.run_sync`` and
    returns its result; errors from ``run_sync`` propagate.

    :param results: tool results to deliver
    :param continue_turn: ask the model to continue after the results
    :param wait_for_done: wait for the follow-up response to finish
    :param timeout: seconds passed to ``run_sync``
    """
    self._ensure_background_loop()
    job = self._send_tool_results_internal(results, continue_turn, wait_for_done)
    return self._bg.run_sync(job, timeout=timeout)
964
+
965
+ async def _send_tool_results_internal(
966
+ self,
967
+ results,
968
+ continue_turn: bool,
969
+ wait_for_done: bool,
970
+ ):
971
+ """
972
+ Owner-loop implementation. Serializes sends under the WS writer lock.
973
+ """
974
+ if not self.ws:
975
+ raise RuntimeError("Live session is not open")
976
+
977
+ outputs = build_tool_outputs_payload(results, self._last_tool_calls)
978
+ if not outputs:
979
+ return
980
+
981
+ if self._send_lock is None:
982
+ self._send_lock = asyncio.Lock()
983
+
984
+ wait_ev: Optional[asyncio.Event] = None
985
+ async with self._send_lock:
986
+ # Emit one conversation.item.create per tool output
987
+ for it in outputs:
988
+ payload = {
989
+ "type": "conversation.item.create",
990
+ "item": {
991
+ "type": "function_call_output",
992
+ "call_id": it["call_id"],
993
+ "output": it["output"], # must be a string (JSON-encoded when dict/list)
994
+ },
995
+ }
996
+ if it.get("previous_item_id"):
997
+ payload["previous_item_id"] = it["previous_item_id"]
998
+ await self.ws.send(json.dumps(payload))
999
+
1000
+ # Optionally ask the model to continue
1001
+ if continue_turn:
1002
+ if self._response_done is None:
1003
+ self._response_done = asyncio.Event()
1004
+ else:
1005
+ try:
1006
+ self._response_done.clear()
1007
+ except Exception:
1008
+ self._response_done = asyncio.Event()
1009
+ wait_ev = self._response_done # snapshot for race-free waiting
1010
+ await self.ws.send(json.dumps({"type": "response.create"}))
1011
+
1012
+ # Wait for the follow-up response to complete
1013
+ if continue_turn and wait_for_done and wait_ev:
1014
+ try:
1015
+ await wait_ev.wait()
1016
+ except Exception:
1017
+ pass
1018
+
1019
+ # -----------------------------
1020
+ # Internal: receive loop
1021
+ # -----------------------------
1022
+
1023
+ async def _recv_loop(self):
1024
+ """
1025
+ Single receiver loop for the entire session.
1026
+ Processes incoming events and dispatches to callbacks.
1027
+ """
1028
+ if self.debug:
1029
+ print("[_recv_loop] started")
1030
+
1031
+ DEFAULT_RATE = self._DEFAULT_RATE
1032
+ audio_done = True
1033
+
1034
+ try:
1035
+ while self._running and self.ws:
1036
+ # Do not hard-stop the session on should_stop; only cancel active response if requested.
1037
+ if self._should_stop and self._should_stop():
1038
+ await self._cancel_active_response_internal()
1039
+
1040
+ try:
1041
+ raw = await asyncio.wait_for(self.ws.recv(), timeout=60)
1042
+ except asyncio.TimeoutError:
1043
+ continue
1044
+ except Exception as e:
1045
+ if self.debug:
1046
+ print(f"[_recv_loop] recv error: {e!r}")
1047
+ break
1048
+
1049
+ if isinstance(raw, bytes):
1050
+ # Realtime sends JSON text frames; ignore unexpected binary
1051
+ continue
1052
+
1053
+ try:
1054
+ ev = json.loads(raw)
1055
+ except Exception:
1056
+ continue
1057
+
1058
+ etype = ev.get("type")
1059
+
1060
+ # ---- session lifecycle (capture server handle) ----
1061
+ if etype in ("session.created", "session.updated"):
1062
+ sess = ev.get("session") or {}
1063
+ sid = sess.get("id")
1064
+ if isinstance(sid, str) and sid.strip():
1065
+ self._rt_session_id = sid.strip()
1066
+ set_ctx_rt_handle(self._ctx, self._rt_session_id, self.window)
1067
+ if self.debug:
1068
+ print(f"[_recv_loop] session id: {self._rt_session_id}")
1069
+ # Optional: expires_at if present (not always provided)
1070
+ exp = sess.get("expires_at") or sess.get("expiresAt")
1071
+ try:
1072
+ if isinstance(exp, (int, float)) and exp > 0:
1073
+ self._rt_session_expires_at = int(exp)
1074
+ set_rt_session_expires_at(self._ctx, self._rt_session_expires_at, self.window)
1075
+ except Exception:
1076
+ pass
1077
+ continue
1078
+
1079
+ if etype == "response.created":
1080
+ if self.debug:
1081
+ print("[_recv_loop] response created")
1082
+ self._response_active = True
1083
+ audio_done = False
1084
+ self._rt_reset_state()
1085
+
1086
+ elif etype == "input_audio_buffer.speech_started":
1087
+ if self.debug:
1088
+ print("[_recv_loop] speech_started")
1089
+
1090
+ elif etype == "input_audio_buffer.speech_stopped":
1091
+ if self.debug:
1092
+ print("[_recv_loop] speech_stopped")
1093
+
1094
+ elif etype == "input_audio_buffer.committed":
1095
+ if self.debug:
1096
+ print("[_recv_loop] audio_buffer.committed")
1097
+
1098
+ # disable mic input if auto-commit
1099
+ if self._last_opts:
1100
+ self._last_opts.rt_signals.response.emit(RealtimeEvent(RealtimeEvent.RT_OUTPUT_AUDIO_COMMIT, {
1101
+ "ctx": self._ctx,
1102
+ }))
1103
+
1104
+ # ---- input transcription (user speech) ----
1105
+ elif etype == "conversation.item.input_audio_transcription.delta":
1106
+ if self._transcribe_enabled():
1107
+ buf = self._input_tr_buffers.get(ev.get("item_id"))
1108
+ if buf is None:
1109
+ buf = io.StringIO()
1110
+ self._input_tr_buffers[ev.get("item_id")] = buf
1111
+ delta = ev.get("delta") or ev.get("text") or ev.get("transcript") or ""
1112
+ if delta:
1113
+ buf.write(str(delta))
1114
+
1115
+ elif etype in ("conversation.item.input_audio_transcription.completed",
1116
+ "conversation.item.input_audio_transcription.done"):
1117
+ if self._transcribe_enabled():
1118
+ item_id = ev.get("item_id")
1119
+ tr = ev.get("transcript") or ""
1120
+ buf = self._input_tr_buffers.pop(item_id, None)
1121
+ if buf is not None:
1122
+ try:
1123
+ v = buf.getvalue()
1124
+ if v and not tr:
1125
+ tr = v
1126
+ finally:
1127
+ try:
1128
+ buf.close()
1129
+ except Exception:
1130
+ pass
1131
+ if tr:
1132
+ self._save_input_transcript(tr)
1133
+
1134
+ elif etype == "conversation.item.input_audio_transcription.failed":
1135
+ if self.debug:
1136
+ err = (ev.get("error") or {}).get("message") or "input transcription failed"
1137
+ print(f"[_recv_loop] {err}")
1138
+
1139
+ elif etype == "conversation.item.created":
1140
+ if self.debug:
1141
+ print("[_recv_loop] conversation.item.created")
1142
+ # Fallback: some servers may include transcript inside the created user item
1143
+ if self._transcribe_enabled():
1144
+ item = ev.get("item") or {}
1145
+ if item.get("role") == "user":
1146
+ for c in (item.get("content") or []):
1147
+ if isinstance(c, dict) and c.get("type") in ("input_audio", "audio"):
1148
+ tr = c.get("transcript")
1149
+ if tr:
1150
+ self._save_input_transcript(str(tr))
1151
+
1152
+ # ---- assistant text vs assistant audio transcript deltas ----
1153
+ elif etype in ("response.text.delta", "response.output_text.delta"):
1154
+ delta = ev.get("delta") or ev.get("text")
1155
+ if isinstance(delta, dict) and "text" in delta:
1156
+ delta = delta["text"]
1157
+ if delta:
1158
+ self._rt_append_text(delta)
1159
+ if self._on_text:
1160
+ try:
1161
+ await self._on_text(str(delta))
1162
+ except Exception:
1163
+ pass
1164
+ elif etype == "response.audio_transcript.delta":
1165
+ if self._transcribe_enabled():
1166
+ delta = ev.get("delta") or ev.get("text")
1167
+ if isinstance(delta, dict) and "text" in delta:
1168
+ delta = delta["text"]
1169
+ if delta:
1170
+ self._rt_append_text(delta)
1171
+ if self._on_text:
1172
+ try:
1173
+ await self._on_text(str(delta))
1174
+ except Exception:
1175
+ pass
1176
+
1177
+ elif etype in ("response.text.done", "response.output_text.done", "response.audio_transcript.done"):
1178
+ if self.debug:
1179
+ print("[_recv_loop] text done")
1180
+
1181
+ elif etype == "response.content_part.added":
1182
+ part = ev.get("part") or {}
1183
+ ptype = part.get("type")
1184
+ if ptype == "text":
1185
+ txt = part.get("text") or ""
1186
+ if txt:
1187
+ self._rt_append_text(txt)
1188
+ if self._on_text:
1189
+ try:
1190
+ await self._on_text(str(txt))
1191
+ except Exception:
1192
+ pass
1193
+ elif ptype == "audio":
1194
+ b64 = part.get("audio")
1195
+ if b64 and self._on_audio:
1196
+ try:
1197
+ data = base64.b64decode(b64)
1198
+ await self._on_audio(data, "audio/pcm", DEFAULT_RATE, 1, False)
1199
+ except Exception:
1200
+ pass
1201
+ tr = part.get("transcript")
1202
+ if tr and self._transcribe_enabled():
1203
+ self._rt_append_text(tr)
1204
+ if self._on_text:
1205
+ try:
1206
+ await self._on_text(str(tr))
1207
+ except Exception:
1208
+ pass
1209
+
1210
+ elif etype == "response.audio.delta":
1211
+ b64 = ev.get("delta")
1212
+ if b64 and self._on_audio:
1213
+ try:
1214
+ data = base64.b64decode(b64)
1215
+ await self._on_audio(data, "audio/pcm", DEFAULT_RATE, 1, False)
1216
+ except Exception:
1217
+ pass
1218
+
1219
+ elif etype == "response.audio.done":
1220
+ if self.debug:
1221
+ print("[_recv_loop] audio done")
1222
+ if not audio_done and self._on_audio:
1223
+ try:
1224
+ await self._on_audio(b"", "audio/pcm", DEFAULT_RATE, 1, True)
1225
+ except Exception:
1226
+ pass
1227
+ audio_done = True
1228
+
1229
+ # ---- function calling (tools) ----
1230
+ elif etype == "response.output_item.added":
1231
+ if self.debug:
1232
+ print("[_recv_loop] output_item added")
1233
+ item = ev.get("item") or {}
1234
+ if item.get("type") == "function_call":
1235
+ fid = item.get("id") or item.get("item_id") or ""
1236
+ call_id = item.get("call_id") or ""
1237
+ name = item.get("name") or ""
1238
+ self._rt_state["tool_calls"].append({
1239
+ "id": fid,
1240
+ "call_id": call_id,
1241
+ "type": "function",
1242
+ "function": {"name": name, "arguments": ""}
1243
+ })
1244
+ if fid and fid not in self._rt_state["fn_args_buffers"]:
1245
+ self._rt_state["fn_args_buffers"][fid] = io.StringIO()
1246
+
1247
+ elif etype == "response.function_call_arguments.delta":
1248
+ buf = self._rt_state["fn_args_buffers"].get(ev.get("item_id"))
1249
+ if buf is not None:
1250
+ delta = ev.get("delta") or ""
1251
+ if delta:
1252
+ buf.write(delta)
1253
+
1254
+ elif etype == "response.function_call_arguments.done":
1255
+ item_id = ev.get("item_id")
1256
+ args_val = ev.get("arguments") or ""
1257
+ buf = self._rt_state["fn_args_buffers"].pop(item_id, None)
1258
+ if buf is not None:
1259
+ try:
1260
+ concat = buf.getvalue()
1261
+ if concat:
1262
+ args_val = concat
1263
+ finally:
1264
+ try:
1265
+ buf.close()
1266
+ except Exception:
1267
+ pass
1268
+ for tc in self._rt_state["tool_calls"]:
1269
+ if tc.get("id") == item_id:
1270
+ tc["function"]["arguments"] = args_val
1271
+ break
1272
+ self._rt_state["force_func_call"] = True
1273
+
1274
+ elif etype == "response.output_item.done":
1275
+ if self.debug:
1276
+ print("[_recv_loop] output_item done")
1277
+ item = ev.get("item") or {}
1278
+ if item.get("type") == "function_call":
1279
+ fid = item.get("id") or item.get("item_id") or ""
1280
+ name = item.get("name") or ""
1281
+ args_val = item.get("arguments") or ""
1282
+ for tc in self._rt_state["tool_calls"]:
1283
+ if fid and tc.get("id") == fid:
1284
+ if name:
1285
+ tc["function"]["name"] = name
1286
+ if args_val:
1287
+ tc["function"]["arguments"] = args_val
1288
+ break
1289
+ self._rt_state["force_func_call"] = True
1290
+
1291
+ # ---- code interpreter (delta/done) ----
1292
+ elif etype in ("response.code_interpreter_call_code.delta", "response.code_interpreter_call.code.delta"):
1293
+ code_delta = ev.get("delta") or ""
1294
+ if code_delta:
1295
+ if not self._rt_state["is_code"]:
1296
+ hdr = "\n\n**Code interpreter**\n```python\n"
1297
+ self._rt_append_text(hdr + code_delta)
1298
+ if self._on_text:
1299
+ try:
1300
+ await self._on_text(hdr + code_delta)
1301
+ except Exception:
1302
+ pass
1303
+ self._rt_state["is_code"] = True
1304
+ else:
1305
+ self._rt_append_text(code_delta)
1306
+ if self._on_text:
1307
+ try:
1308
+ await self._on_text(code_delta)
1309
+ except Exception:
1310
+ pass
1311
+
1312
+ elif etype in ("response.code_interpreter_call_code.done", "response.code_interpreter_call.code.done"):
1313
+ if self.debug:
1314
+ print("[_recv_loop] code done")
1315
+ if self._rt_state["is_code"]:
1316
+ tail = "\n\n```\n-----------\n"
1317
+ self._rt_append_text(tail)
1318
+ if self._on_text:
1319
+ try:
1320
+ await self._on_text(tail)
1321
+ except Exception:
1322
+ pass
1323
+ self._rt_state["is_code"] = False
1324
+
1325
+ # ---- annotations (citations/files) ----
1326
+ elif etype == "response.output_text.annotation.added":
1327
+ if self.debug:
1328
+ print("[_recv_loop] annotation added")
1329
+ ann = ev.get("annotation") or {}
1330
+ atype = ann.get("type")
1331
+ if atype == "url_citation":
1332
+ url = ann.get("url")
1333
+ self._rt_add_citation(url)
1334
+ elif atype == "container_file_citation":
1335
+ self._rt_state["files"].append({
1336
+ "container_id": ann.get("container_id"),
1337
+ "file_id": ann.get("file_id"),
1338
+ })
1339
+
1340
+ # ---- partial images (defensive) ----
1341
+ elif etype == "response.image_generation_call.partial_image":
1342
+ image_b64 = ev.get("partial_image_b64")
1343
+ if image_b64:
1344
+ try:
1345
+ img_bytes = base64.b64decode(image_b64)
1346
+ save_path = self.window.core.image.gen_unique_path(self._ctx)
1347
+ with open(save_path, "wb") as f:
1348
+ f.write(img_bytes)
1349
+ self._rt_state["image_paths"].append(save_path)
1350
+ self._rt_state["is_image"] = True
1351
+ if not isinstance(self._ctx.images, list):
1352
+ self._ctx.images = []
1353
+ if save_path not in self._ctx.images:
1354
+ self._ctx.images.append(save_path)
1355
+ except Exception:
1356
+ pass
1357
+
1358
+ elif etype == "response.done":
1359
+ if self.debug:
1360
+ print("[_recv_loop] response done")
1361
+ # Ensure audio finalized
1362
+ if not audio_done and self._on_audio:
1363
+ try:
1364
+ await self._on_audio(b"", "audio/pcm", DEFAULT_RATE, 1, True)
1365
+ except Exception:
1366
+ pass
1367
+ audio_done = True
1368
+
1369
+ self._response_active = False
1370
+
1371
+ # Capture usage if present on response
1372
+ try:
1373
+ resp_obj = ev.get("response") or {}
1374
+ self._rt_capture_usage(resp_obj)
1375
+ except Exception:
1376
+ pass
1377
+
1378
+ # Build final output text
1379
+ output = "".join(self._rt_state["output_parts"]) if self._rt_state else ""
1380
+ if has_unclosed_code_tag(output):
1381
+ output += "\n```"
1382
+ if not output:
1383
+ try:
1384
+ transcript = self._extract_text_from_response_done(ev)
1385
+ if transcript:
1386
+ output = transcript
1387
+ except Exception:
1388
+ pass
1389
+
1390
+ # Persist into ctx
1391
+ try:
1392
+ if self._ctx:
1393
+ self._ctx.output = output or (self._ctx.output or "")
1394
+ up = self._rt_state.get("usage_payload") if self._rt_state else None
1395
+ if up:
1396
+ in_tok = up.get("in")
1397
+ out_tok = up.get("out")
1398
+ if in_tok is None:
1399
+ in_tok = self._ctx.input_tokens if self._ctx.input_tokens is not None else 0
1400
+ if out_tok is None:
1401
+ out_tok = 0
1402
+ self._ctx.set_tokens(in_tok, out_tok)
1403
+ try:
1404
+ if not isinstance(self._ctx.extra, dict):
1405
+ self._ctx.extra = {}
1406
+ self._ctx.extra["usage"] = {
1407
+ "vendor": "openai",
1408
+ "input_tokens": in_tok,
1409
+ "output_tokens": out_tok,
1410
+ "reasoning_tokens": up.get("reasoning", 0),
1411
+ "total_reported": up.get("total"),
1412
+ }
1413
+ except Exception:
1414
+ pass
1415
+
1416
+ # Citations
1417
+ if self._rt_state and self._rt_state["citations"]:
1418
+ if self._ctx.urls is None:
1419
+ self._ctx.urls = []
1420
+ for u in self._rt_state["citations"]:
1421
+ if u not in self._ctx.urls:
1422
+ self._ctx.urls.append(u)
1423
+
1424
+ # Images
1425
+ if self._rt_state and self._rt_state["image_paths"]:
1426
+ if not isinstance(self._ctx.images, list):
1427
+ self._ctx.images = []
1428
+ for p in self._rt_state["image_paths"]:
1429
+ if p not in self._ctx.images:
1430
+ self._ctx.images.append(p)
1431
+
1432
+ self.window.core.ctx.update_item(self._ctx)
1433
+ except Exception:
1434
+ pass
1435
+
1436
+ # Download container files if any
1437
+ try:
1438
+ files = (self._rt_state or {}).get("files") or []
1439
+ if files:
1440
+ self.window.core.api.openai.container.download_files(self._ctx, files)
1441
+ except Exception:
1442
+ pass
1443
+
1444
+ # Unpack tool calls if any
1445
+ try:
1446
+ tcs = (self._rt_state or {}).get("tool_calls") or []
1447
+ if tcs:
1448
+ for tc in tcs:
1449
+ fn = tc.get("function") or {}
1450
+ if isinstance(fn.get("arguments"), dict):
1451
+ fn["arguments"] = json.dumps(fn["arguments"], ensure_ascii=False)
1452
+ self._ctx.force_call = bool((self._rt_state or {}).get("force_func_call"))
1453
+ self.window.core.debug.info("[realtime] Tool calls found, unpacking...")
1454
+ self.window.core.command.unpack_tool_calls_chunks(self._ctx, tcs)
1455
+ self.window.core.ctx.update_item(self._ctx)
1456
+ except Exception:
1457
+ pass
1458
+
1459
+ # Persist last tool calls snapshot for mapping tool outputs
1460
+ try:
1461
+ tcs = (self._rt_state or {}).get("tool_calls") or []
1462
+ if tcs:
1463
+ self._last_tool_calls = list(tcs)
1464
+ except Exception:
1465
+ pass
1466
+
1467
+ # Unblock waiters
1468
+ if self._response_done:
1469
+ self._response_done.set()
1470
+
1471
+ # send RT_OUTPUT_TURN_END signal
1472
+ if self._last_opts:
1473
+ self._last_opts.rt_signals.response.emit(RealtimeEvent(RealtimeEvent.RT_OUTPUT_TURN_END, {
1474
+ "ctx": self._ctx,
1475
+ }))
1476
+
1477
+ # Reset per-response extraction state
1478
+ self._rt_state = None
1479
+
1480
+ elif etype == "error":
1481
+ if self.debug:
1482
+ print(f"[_recv_loop] error event: {ev}")
1483
+ # Session expiration and other errors
1484
+ err = ev.get("error") or {}
1485
+ msg = (err.get("message") or "")
1486
+ code = (err.get("code") or "")
1487
+ if isinstance(code, str) and code.strip().lower() == "session_expired":
1488
+ self._rt_session_id = None
1489
+ if self.debug:
1490
+ print("[_recv_loop] session expired")
1491
+ if "already has an active response" in (msg or "").lower():
1492
+ if self._response_done:
1493
+ self._response_done.set()
1494
+ continue
1495
+ if self._response_done:
1496
+ self._response_done.set()
1497
+ if self.debug:
1498
+ print(f"[_recv_loop] error: {msg}")
1499
+
1500
+ # Other events are ignored
1501
+
1502
+ except Exception as e:
1503
+ if self.debug:
1504
+ print(f"[_recv_loop] exception: {e!r}")
1505
+ finally:
1506
+ if self.debug:
1507
+ print("[_recv_loop] stopped")
1508
+ # Ensure any waiters are unblocked on socket teardown
1509
+ try:
1510
+ if self._response_done and not self._response_done.is_set():
1511
+ self._response_done.set()
1512
+ except Exception:
1513
+ pass
1514
+ try:
1515
+ if self.ws:
1516
+ await self.ws.close()
1517
+ except Exception:
1518
+ pass
1519
+ self.ws = None
1520
+ self._running = False
1521
+
1522
+ # -----------------------------
1523
+ # Helpers
1524
+ # -----------------------------
1525
+
1526
+ def _preferred_voice(self) -> str:
1527
+ """
1528
+ Resolve preferred OpenAI voice from settings.
1529
+ """
1530
+ try:
1531
+ v = self.window.core.plugins.get_option("audio_output", "openai_voice")
1532
+ if v:
1533
+ return str(v)
1534
+ except Exception:
1535
+ pass
1536
+ return "alloy"
1537
+
1538
+ def _extract_text_from_response_done(self, ev: dict) -> str:
1539
+ """
1540
+ Extract assistant text from response.done payload.
1541
+ """
1542
+ res = ev.get("response") or {}
1543
+ out = res.get("output") or []
1544
+ parts: list[str] = []
1545
+
1546
+ for item in out:
1547
+ if not isinstance(item, dict):
1548
+ continue
1549
+ if item.get("type") not in ("message", "tool_result", "function_call_result", "response"):
1550
+ pass
1551
+ content_list = item.get("content") or []
1552
+ for c in content_list:
1553
+ if not isinstance(c, dict):
1554
+ continue
1555
+ ctype = c.get("type")
1556
+ if ctype == "audio" and self._transcribe_enabled():
1557
+ tr = c.get("transcript")
1558
+ if tr:
1559
+ parts.append(str(tr))
1560
+ elif ctype in ("text", "output_text", "input_text"):
1561
+ txt = c.get("text")
1562
+ if isinstance(txt, dict):
1563
+ txt = txt.get("text") or txt.get("value")
1564
+ if txt:
1565
+ parts.append(str(txt))
1566
+
1567
+ text = "\n".join(t.strip() for t in parts if t and str(t).strip())
1568
+ return text
1569
+
1570
+ # ---- per-response state helpers ----
1571
+
1572
+ def _rt_reset_state(self):
1573
+ """Reset per-response extraction state."""
1574
+ self._rt_state = {
1575
+ "output_parts": [],
1576
+ "begin": True,
1577
+ "fn_args_buffers": {},
1578
+ "tool_calls": [],
1579
+ "citations": [],
1580
+ "files": [],
1581
+ "image_paths": [],
1582
+ "is_image": False,
1583
+ "is_code": False,
1584
+ "force_func_call": False,
1585
+ "usage_payload": {},
1586
+ }
1587
+
1588
+ def _rt_append_text(self, s: str):
1589
+ """Append text to assembled output, skipping initial empty deltas."""
1590
+ if self._rt_state is None:
1591
+ self._rt_reset_state()
1592
+ if self._rt_state["begin"] and (s is None or s == ""):
1593
+ return
1594
+ self._rt_state["output_parts"].append(str(s))
1595
+ self._rt_state["begin"] = False
1596
+
1597
+ def _rt_add_citation(self, url: Optional[str]):
1598
+ """Add a URL citation to state and ctx (de-duplicated)."""
1599
+ if not url or not isinstance(url, str):
1600
+ return
1601
+ url = url.strip()
1602
+ if not (url.startswith("http://") or url.startswith("https://")):
1603
+ return
1604
+ if url not in self._rt_state["citations"]:
1605
+ self._rt_state["citations"].append(url)
1606
+ try:
1607
+ if self._ctx:
1608
+ if self._ctx.urls is None:
1609
+ self._ctx.urls = []
1610
+ if url not in self._ctx.urls:
1611
+ self._ctx.urls.append(url)
1612
+ except Exception:
1613
+ pass
1614
+
1615
def _rt_capture_usage(self, response_obj: dict):
    """
    Capture token usage from a response object, if it carries any.

    Reads response_obj["usage"] and stores a normalized payload under
    self._rt_state["usage_payload"] with the keys "in", "out", "total"
    and "reasoning" (always 0). Both naming schemes are accepted:
    input_tokens/output_tokens and prompt_tokens/completion_tokens.
    Nothing is stored when there is no usage data; any failure is
    swallowed because usage accounting is best-effort.

    :param response_obj: response payload (dict) or None
    """
    try:
        usage = (response_obj or {}).get("usage") or {}
        if not usage:
            return

        def first_count(*names):
            # A count of 0 is a valid value, so only None means "absent";
            # `a or b` would wrongly discard a zero.
            for name in names:
                value = usage.get(name)
                if value is not None:
                    return int(value)
            return None

        self._rt_state["usage_payload"] = {
            "in": first_count("input_tokens", "prompt_tokens"),
            "out": first_count("output_tokens", "completion_tokens"),
            "total": first_count("total_tokens"),
            "reasoning": 0,
        }
    except Exception:
        pass
1634
+
1635
+ # ---- transcription helpers ----
1636
+
1637
def _transcribe_enabled(self) -> bool:
    """
    Tell whether transcription is switched on in the last used options.

    :return: truthiness of opts.transcribe; False when the option is
             missing or the options cannot be read
    """
    try:
        opts = self._last_opts
        flag = getattr(opts, "transcribe", False)
        return True if flag else False
    except Exception:
        return False
1643
+
1644
def _save_input_transcript(self, transcript: str):
    """
    Persist the input transcript on the current ctx item.

    The transcript is stored under ctx.extra["input_transcript"]. If the
    options of this turn carry no text prompt, ctx.input is populated with
    the transcript too, so downstream code treats it as the user's textual
    message. The item is then saved through window.core.ctx.update_item().
    Failures are swallowed: saving the transcript is best-effort.

    :param transcript: transcribed user input; empty or None is ignored
    """
    if not transcript:
        return
    try:
        ctx = self._ctx
        if not ctx:
            return
        if not isinstance(ctx.extra, dict):
            ctx.extra = {}
        text = str(transcript)
        # The transcript goes into ctx.extra (prepared just above), not
        # into ctx.input: ctx.input is assigned a plain str below and has
        # no `extra` mapping to write to.
        ctx.extra["input_transcript"] = text
        if not getattr(self._last_opts, "prompt", None):
            ctx.input = text
        self.window.core.ctx.update_item(ctx)
    except Exception:
        pass
1661
+
1662
def _tune_openai_vad(self, session_payload: dict, opts) -> None:
    """
    Lengthen the end-of-speech hold of the VAD (auto-turn) settings in place.

    Works on session_payload["session"]["turn_detection"]; when that entry
    is not a dict (manual mode or VAD disabled) nothing is changed.

    :param session_payload: session.update payload, modified in place
    :param opts: options object; a positive vad_end_silence_ms overrides
                 the silence duration, otherwise the current value (500 ms
                 when unset) is raised to at least 2000 ms; a non-negative
                 vad_prefix_padding_ms sets the prefix padding
    """
    try:
        session = session_payload.get("session") or {}
        detection = session.get("turn_detection")
        if not isinstance(detection, dict):
            return  # manual mode or VAD disabled

        requested = getattr(opts, "vad_end_silence_ms", None)
        has_override = isinstance(requested, (int, float)) and not requested <= 0
        if has_override:
            silence = requested
        else:
            # no usable override: keep the current value, but never below 2000 ms
            current = int(detection.get("silence_duration_ms") or 500)
            silence = max(current, 2000)
        detection["silence_duration_ms"] = int(silence)

        # optional padding kept before the detected start of speech
        padding = getattr(opts, "vad_prefix_padding_ms", None)
        if isinstance(padding, (int, float)) and padding >= 0:
            detection["prefix_padding_ms"] = int(padding)
    except Exception:
        pass
1687
+
1688
def update_session_autoturn_sync(
    self,
    enabled: bool,
    silence_ms: Optional[int] = None,
    prefix_ms: Optional[int] = None,
    timeout: float = 5.0,
):
    """
    Blocking wrapper that turns auto-turn (VAD) mode on or off for the session.

    Makes sure the background loop exists, then runs
    _update_session_autoturn_internal() on it via self._bg.run_sync().
    Errors raised while running (including those from run_sync itself)
    are swallowed.

    :param enabled: True to enable auto-turn, False to disable it
    :param silence_ms: optional end-of-speech silence override (ms)
    :param prefix_ms: optional prefix padding override (ms)
    :param timeout: value passed to run_sync() as its timeout
    """
    self._ensure_background_loop()
    try:
        run_blocking = self._bg.run_sync
        pending = self._update_session_autoturn_internal(enabled, silence_ms, prefix_ms)
        run_blocking(pending, timeout=timeout)
    except Exception:
        pass
1708
+
1709
async def _update_session_autoturn_internal(
    self,
    enabled: bool,
    silence_ms: Optional[int] = None,
    prefix_ms: Optional[int] = None,
) -> None:
    """
    Owner-loop implementation for toggling auto-turn (server/semantic VAD) at runtime
    with optional silence and prefix overrides (milliseconds).

    When no WebSocket is open, the requested values are only cached on
    self._last_opts and nothing is sent. Otherwise a "session.update"
    message is built and sent under self._send_lock, and the same values
    are then stored on self._last_opts. Send errors are not raised; they
    are only printed when self.debug is set.

    :param enabled: True selects TurnMode.AUTO, False selects TurnMode.MANUAL
    :param silence_ms: explicit silence_duration_ms for turn detection; wins over opts
    :param prefix_ms: explicit prefix_padding_ms for turn detection; wins over opts
    """
    # If socket is not open, just cache into last opts
    if not self.ws:
        try:
            if self._last_opts:
                setattr(self._last_opts, "auto_turn", bool(enabled))
                if silence_ms is not None:
                    setattr(self._last_opts, "vad_end_silence_ms", int(silence_ms))
                if prefix_ms is not None:
                    setattr(self._last_opts, "vad_prefix_padding_ms", int(prefix_ms))
        except Exception:
            pass
        if self.debug:
            print("[update_session_autoturn] WS not open; cached for next session")
        return

    # The send lock is created lazily on first use
    if self._send_lock is None:
        self._send_lock = asyncio.Lock()

    async with self._send_lock:
        try:
            # Build base session.update; let helper set correct turn_detection shape
            payload: dict = {"type": "session.update", "session": {}}
            turn_mode = TurnMode.AUTO if enabled else TurnMode.MANUAL
            apply_turn_mode_openai(payload, turn_mode)  # sets session.turn_detection (AUTO) or None (MANUAL)

            if enabled:
                sess = payload.get("session", {})
                # td is mutated in place below, so every change lands in payload
                td = sess.get("turn_detection")

                # Optional VAD type override via opts.vad_type ("server_vad" | "semantic_vad")
                try:
                    vad_type = getattr(self._last_opts, "vad_type", None)
                    if isinstance(vad_type, str) and vad_type in ("server_vad", "semantic_vad"):
                        if isinstance(td, dict):
                            td["type"] = vad_type
                except Exception:
                    pass

                # Optional threshold for server_vad
                try:
                    thr = getattr(self._last_opts, "vad_threshold", None)
                    if isinstance(thr, (int, float)) and isinstance(td, dict) and td.get("type") == "server_vad":
                        td["threshold"] = float(thr)
                except Exception:
                    pass

                # Apply defaults based on opts first
                self._tune_openai_vad(payload, self._last_opts)

                # Then hard-override with explicit args (user provided values win)
                if isinstance(td, dict):
                    if silence_ms is not None:
                        td["silence_duration_ms"] = int(silence_ms)
                    if prefix_ms is not None:
                        td["prefix_padding_ms"] = int(prefix_ms)

                # Optional flags from opts
                # (if td is not a dict, the assignment fails and is ignored by the except)
                try:
                    cr = getattr(self._last_opts, "vad_create_response", None)
                    if isinstance(cr, bool):
                        td["create_response"] = cr
                except Exception:
                    pass
                try:
                    ir = getattr(self._last_opts, "vad_interrupt_response", None)
                    if isinstance(ir, bool):
                        td["interrupt_response"] = ir
                except Exception:
                    pass

            # Send the update
            await self.ws.send(json.dumps(payload))

            # Update local opts snapshot so next calls keep the same settings
            try:
                if self._last_opts:
                    setattr(self._last_opts, "auto_turn", bool(enabled))
                    if silence_ms is not None:
                        setattr(self._last_opts, "vad_end_silence_ms", int(silence_ms))
                    if prefix_ms is not None:
                        setattr(self._last_opts, "vad_prefix_padding_ms", int(prefix_ms))
            except Exception:
                pass

            if self.debug:
                td_dbg = (payload.get("session", {}) or {}).get("turn_detection")
                print(f"[update_session_autoturn] session.update sent; auto_turn={enabled}, td={td_dbg}")

        except Exception as e:
            # Failures are reported only in debug mode and never propagate
            if self.debug:
                print(f"[update_session_autoturn] send error: {e}")
1810
+
1811
def set_debug(self, enabled: bool):
    """
    Switch debug logging on or off.

    :param enabled: truthy value turns debug logging on, falsy turns it off
    """
    self.debug = True if enabled else False
1818
+
1819
def is_session_active(self) -> bool:
    """
    Report whether a WS session is currently open.

    :return: False when there is no WebSocket object, otherwise the
             value of the running flag
    """
    if self.ws is None:
        return False
    return self._running
1822
+
1823
def update_ctx(self, ctx: CtxItem) -> None:
    """
    Update the current CtxItem (for session handle persistence).

    Only the reference held in self._ctx is replaced; nothing is copied
    or saved here.

    :param ctx: context item to use from now on
    """
    self._ctx = ctx