pygpt-net 2.6.30__py3-none-any.whl → 2.6.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. pygpt_net/CHANGELOG.txt +15 -0
  2. pygpt_net/__init__.py +3 -3
  3. pygpt_net/app.py +7 -1
  4. pygpt_net/app_core.py +3 -1
  5. pygpt_net/config.py +3 -1
  6. pygpt_net/controller/__init__.py +9 -2
  7. pygpt_net/controller/audio/audio.py +38 -1
  8. pygpt_net/controller/audio/ui.py +2 -2
  9. pygpt_net/controller/chat/audio.py +1 -8
  10. pygpt_net/controller/chat/common.py +23 -62
  11. pygpt_net/controller/chat/handler/__init__.py +0 -0
  12. pygpt_net/controller/chat/handler/stream_worker.py +1124 -0
  13. pygpt_net/controller/chat/output.py +8 -3
  14. pygpt_net/controller/chat/stream.py +3 -1071
  15. pygpt_net/controller/chat/text.py +3 -2
  16. pygpt_net/controller/kernel/kernel.py +11 -3
  17. pygpt_net/controller/kernel/reply.py +5 -1
  18. pygpt_net/controller/lang/custom.py +2 -2
  19. pygpt_net/controller/media/__init__.py +12 -0
  20. pygpt_net/controller/media/media.py +115 -0
  21. pygpt_net/controller/realtime/__init__.py +12 -0
  22. pygpt_net/controller/realtime/manager.py +53 -0
  23. pygpt_net/controller/realtime/realtime.py +293 -0
  24. pygpt_net/controller/ui/mode.py +23 -2
  25. pygpt_net/controller/ui/ui.py +19 -1
  26. pygpt_net/core/audio/audio.py +6 -1
  27. pygpt_net/core/audio/backend/native/__init__.py +12 -0
  28. pygpt_net/core/audio/backend/{native.py → native/native.py} +426 -127
  29. pygpt_net/core/audio/backend/native/player.py +139 -0
  30. pygpt_net/core/audio/backend/native/realtime.py +250 -0
  31. pygpt_net/core/audio/backend/pyaudio/__init__.py +12 -0
  32. pygpt_net/core/audio/backend/pyaudio/playback.py +194 -0
  33. pygpt_net/core/audio/backend/pyaudio/pyaudio.py +923 -0
  34. pygpt_net/core/audio/backend/pyaudio/realtime.py +312 -0
  35. pygpt_net/core/audio/backend/pygame/__init__.py +12 -0
  36. pygpt_net/core/audio/backend/{pygame.py → pygame/pygame.py} +130 -19
  37. pygpt_net/core/audio/backend/shared/__init__.py +38 -0
  38. pygpt_net/core/audio/backend/shared/conversions.py +211 -0
  39. pygpt_net/core/audio/backend/shared/envelope.py +38 -0
  40. pygpt_net/core/audio/backend/shared/player.py +137 -0
  41. pygpt_net/core/audio/backend/shared/rt.py +52 -0
  42. pygpt_net/core/audio/capture.py +5 -0
  43. pygpt_net/core/audio/output.py +14 -2
  44. pygpt_net/core/audio/whisper.py +6 -2
  45. pygpt_net/core/bridge/bridge.py +2 -1
  46. pygpt_net/core/bridge/worker.py +4 -1
  47. pygpt_net/core/dispatcher/dispatcher.py +37 -1
  48. pygpt_net/core/events/__init__.py +2 -1
  49. pygpt_net/core/events/realtime.py +55 -0
  50. pygpt_net/core/image/image.py +56 -5
  51. pygpt_net/core/realtime/__init__.py +0 -0
  52. pygpt_net/core/realtime/options.py +87 -0
  53. pygpt_net/core/realtime/shared/__init__.py +0 -0
  54. pygpt_net/core/realtime/shared/audio.py +213 -0
  55. pygpt_net/core/realtime/shared/loop.py +64 -0
  56. pygpt_net/core/realtime/shared/session.py +59 -0
  57. pygpt_net/core/realtime/shared/text.py +37 -0
  58. pygpt_net/core/realtime/shared/tools.py +276 -0
  59. pygpt_net/core/realtime/shared/turn.py +38 -0
  60. pygpt_net/core/realtime/shared/types.py +16 -0
  61. pygpt_net/core/realtime/worker.py +160 -0
  62. pygpt_net/core/render/web/body.py +24 -3
  63. pygpt_net/core/text/utils.py +54 -2
  64. pygpt_net/core/types/__init__.py +1 -0
  65. pygpt_net/core/types/image.py +54 -0
  66. pygpt_net/core/video/__init__.py +12 -0
  67. pygpt_net/core/video/video.py +290 -0
  68. pygpt_net/data/config/config.json +26 -5
  69. pygpt_net/data/config/models.json +221 -103
  70. pygpt_net/data/config/settings.json +244 -6
  71. pygpt_net/data/css/web-blocks.css +6 -0
  72. pygpt_net/data/css/web-chatgpt.css +6 -0
  73. pygpt_net/data/css/web-chatgpt_wide.css +6 -0
  74. pygpt_net/data/locale/locale.de.ini +35 -7
  75. pygpt_net/data/locale/locale.en.ini +56 -17
  76. pygpt_net/data/locale/locale.es.ini +35 -7
  77. pygpt_net/data/locale/locale.fr.ini +35 -7
  78. pygpt_net/data/locale/locale.it.ini +35 -7
  79. pygpt_net/data/locale/locale.pl.ini +38 -7
  80. pygpt_net/data/locale/locale.uk.ini +35 -7
  81. pygpt_net/data/locale/locale.zh.ini +31 -3
  82. pygpt_net/data/locale/plugin.audio_input.en.ini +4 -0
  83. pygpt_net/data/locale/plugin.audio_output.en.ini +4 -0
  84. pygpt_net/data/locale/plugin.cmd_web.en.ini +8 -0
  85. pygpt_net/item/model.py +22 -1
  86. pygpt_net/plugin/audio_input/plugin.py +37 -4
  87. pygpt_net/plugin/audio_input/simple.py +57 -8
  88. pygpt_net/plugin/cmd_files/worker.py +3 -0
  89. pygpt_net/provider/api/google/__init__.py +76 -7
  90. pygpt_net/provider/api/google/audio.py +8 -1
  91. pygpt_net/provider/api/google/chat.py +45 -6
  92. pygpt_net/provider/api/google/image.py +226 -86
  93. pygpt_net/provider/api/google/realtime/__init__.py +12 -0
  94. pygpt_net/provider/api/google/realtime/client.py +1945 -0
  95. pygpt_net/provider/api/google/realtime/realtime.py +186 -0
  96. pygpt_net/provider/api/google/video.py +364 -0
  97. pygpt_net/provider/api/openai/__init__.py +22 -2
  98. pygpt_net/provider/api/openai/realtime/__init__.py +12 -0
  99. pygpt_net/provider/api/openai/realtime/client.py +1828 -0
  100. pygpt_net/provider/api/openai/realtime/realtime.py +193 -0
  101. pygpt_net/provider/audio_input/google_genai.py +103 -0
  102. pygpt_net/provider/audio_output/google_genai_tts.py +229 -0
  103. pygpt_net/provider/audio_output/google_tts.py +0 -12
  104. pygpt_net/provider/audio_output/openai_tts.py +8 -5
  105. pygpt_net/provider/core/config/patch.py +241 -178
  106. pygpt_net/provider/core/model/patch.py +28 -2
  107. pygpt_net/provider/llms/google.py +8 -9
  108. pygpt_net/provider/web/duckduck_search.py +212 -0
  109. pygpt_net/ui/layout/toolbox/audio.py +55 -0
  110. pygpt_net/ui/layout/toolbox/footer.py +14 -42
  111. pygpt_net/ui/layout/toolbox/image.py +7 -13
  112. pygpt_net/ui/layout/toolbox/raw.py +52 -0
  113. pygpt_net/ui/layout/toolbox/split.py +48 -0
  114. pygpt_net/ui/layout/toolbox/toolbox.py +8 -8
  115. pygpt_net/ui/layout/toolbox/video.py +49 -0
  116. pygpt_net/ui/widget/option/combo.py +15 -1
  117. {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.32.dist-info}/METADATA +46 -22
  118. {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.32.dist-info}/RECORD +121 -73
  119. pygpt_net/core/audio/backend/pyaudio.py +0 -554
  120. {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.32.dist-info}/LICENSE +0 -0
  121. {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.32.dist-info}/WHEEL +0 -0
  122. {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.32.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,1828 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # ================================================== #
4
+ # This file is a part of PYGPT package #
5
+ # Website: https://pygpt.net #
6
+ # GitHub: https://github.com/szczyglis-dev/py-gpt #
7
+ # MIT License #
8
+ # Created By : Marcin Szczygliński #
9
+ # Updated Date: 2025.09.01 00:00:00 #
10
+ # ================================================== #
11
+
12
+ import asyncio
13
+ import base64
14
+ import io
15
+ import json
16
+ import websockets
17
+
18
+ from typing import Optional, Callable, Awaitable
19
+ from urllib.parse import urlencode
20
+
21
+ from pygpt_net.core.events import RealtimeEvent
22
+ from pygpt_net.core.types import MODE_AUDIO
23
+ from pygpt_net.item.ctx import CtxItem
24
+ from pygpt_net.core.text.utils import has_unclosed_code_tag
25
+
26
+ # shared
27
+ from pygpt_net.core.realtime.shared.loop import BackgroundLoop
28
+ from pygpt_net.core.realtime.shared.audio import (
29
+ coerce_to_pcm16_mono,
30
+ resample_pcm16_mono,
31
+ iter_pcm_chunks,
32
+ DEFAULT_24K,
33
+ )
34
+ from pygpt_net.core.realtime.shared.tools import (
35
+ sanitize_function_tools,
36
+ sanitize_remote_tools,
37
+ prepare_tools_for_session,
38
+ prepare_tools_for_response,
39
+ tools_signature,
40
+ build_tool_outputs_payload,
41
+ )
42
+ from pygpt_net.core.realtime.shared.turn import TurnMode, apply_turn_mode_openai
43
+ from pygpt_net.core.realtime.shared.session import set_ctx_rt_handle, set_rt_session_expires_at
44
+
45
+
46
+ class OpenAIRealtimeClient:
47
+ """
48
+ OpenAI Realtime API client with persistent session and a dedicated background event loop.
49
+
50
+ Key points:
51
+ - A single background asyncio loop runs in its own thread for the lifetime of the client.
52
+ - One websocket connection (session) at a time; multiple "turns" (send_turn) are serialized.
53
+ - No server VAD: manual turn control via input_audio_buffer.* + response.create.
54
+ - Safe to call run()/send_turn()/reset()/shutdown() from any thread or event loop.
55
+
56
+ Session resumption:
57
+ - The official Realtime API does not expose a documented server-side "resume" for closed WS sessions.
58
+ We still persist the server-provided session.id and surface it via ctx.extra["rt_session_id"].
59
+ - If opts.rt_session_id is provided and differs from the current in-memory handle, we reset the
60
+ connection and attempt to reconnect with a "session_id" query parameter. If that fails, we fall
61
+ back to the standard URL to avoid breaking existing functionality.
62
+ """
63
+
64
+ WS_URL = "wss://api.openai.com/v1/realtime"
65
+
66
def __init__(self, window=None, debug: bool = False):
    """
    Initialise client state. No websocket is opened here.

    :param window: Window instance
    :param debug: Enable debug logging
    """
    self.window = window
    self.debug = debug

    # Background loop wrapper and the sample rate used as target/fallback for input audio
    self._bg = BackgroundLoop(name="OpenAI-RT-Loop")
    self._DEFAULT_RATE = DEFAULT_24K

    # Socket, receiver task and running flag for the current session
    self.ws: Optional[websockets.WebSocketClientProtocol] = None
    self._rx_task: Optional[asyncio.Task] = None
    self._running: bool = False

    # Send serialisation and response-completion signalling (assigned when a session opens)
    self._send_lock: Optional[asyncio.Lock] = None
    self._response_done: Optional[asyncio.Event] = None
    self._response_active: bool = False

    # Caller-supplied callbacks, context and the most recent options object
    self._on_text: Optional[Callable[[str], Awaitable[None]]] = None
    self._on_audio: Optional[Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]]] = None
    self._should_stop: Optional[Callable[[], bool]] = None
    self._ctx: Optional[CtxItem] = None
    self._last_opts = None  # reused by reset when no new options are passed

    # Per-response extraction state; starts empty
    self._rt_state = None

    # Input transcription buffers, keyed by item_id
    self._input_tr_buffers: dict[str, io.StringIO] = {}

    # Signature of the tools last attached to the session (skips redundant updates)
    self._cached_session_tools_sig: Optional[str] = None

    # Snapshot of the most recent tool calls
    self._last_tool_calls: list[dict] = []

    # In-memory session handle and its expiry (epoch seconds, when known)
    self._rt_session_id: Optional[str] = None
    self._rt_session_expires_at: Optional[int] = None
113
+
114
+ # -----------------------------
115
+ # Public high-level entrypoints
116
+ # -----------------------------
117
+
118
async def run(
    self,
    ctx: CtxItem,
    opts,
    on_text: Callable[[str], Awaitable[None]],
    on_audio: Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]],
    should_stop: Callable[[], bool] = lambda: False,
):
    """
    Execute a single turn: make sure a session exists, then send the prompt/audio.

    :param ctx: CtxItem stored as the current context
    :param opts: Options object; prompt, audio_data, audio_format, audio_rate,
                 streaming and rt_session_id are read from it when present
    :param on_text: Async callback for text deltas
    :param on_audio: Async callback for audio chunks
    :param should_stop: Sync callback polled to signal that the response should stop
    """
    self._ensure_background_loop()
    self._ctx = ctx
    session_args = (ctx, opts, on_text, on_audio, should_stop)

    # A handle different from the live one triggers a reset; failures here are ignored on purpose.
    try:
        requested = getattr(opts, "rt_session_id", None)
        if isinstance(requested, str):
            requested = requested.strip()
        if self.ws is not None and requested and requested != (self._rt_session_id or ""):
            await self._run_on_owner(self._reset_session_internal(*session_args))
    except Exception:
        pass

    # Lazily open the session when there is no socket yet
    if not self.ws:
        await self._run_on_owner(self._open_session_internal(*session_args))

    # Streaming callers return right after response.create; others block until completion
    turn_args = tuple(
        getattr(opts, field, None)
        for field in ("prompt", "audio_data", "audio_format", "audio_rate")
    )
    blocking = not bool(getattr(opts, "streaming", False))
    await self._run_on_owner(self._send_turn_internal(*turn_args, wait_for_done=blocking))
160
+
161
async def open_session(
    self,
    ctx: CtxItem,
    opts,
    on_text: Callable[[str], Awaitable[None]],
    on_audio: Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]],
    should_stop: Callable[[], bool] = lambda: False,
):
    """
    Open a websocket session explicitly (run() otherwise does this on demand).

    When a socket is already open and opts.rt_session_id names a different handle,
    the session is reset instead and this method returns.

    :param ctx: CtxItem for the session
    :param opts: Options object
    :param on_text: Async callback for text deltas
    :param on_audio: Async callback for audio chunks
    :param should_stop: Sync stop-signal callback
    """
    self._ensure_background_loop()
    session_args = (ctx, opts, on_text, on_audio, should_stop)

    # Any failure in the reattach attempt is ignored and a plain open follows.
    try:
        requested = getattr(opts, "rt_session_id", None)
        if isinstance(requested, str):
            requested = requested.strip()
        needs_reattach = self.ws is not None and requested and requested != (self._rt_session_id or "")
        if needs_reattach:
            await self._run_on_owner(self._reset_session_internal(*session_args))
            return
    except Exception:
        pass

    await self._run_on_owner(self._open_session_internal(*session_args))
186
+
187
async def close_session(self):
    """Close the websocket session; does nothing when the background loop does not exist."""
    if self._bg.loop:
        closing = self._close_session_internal()
        await self._run_on_owner(closing)
192
+
193
async def reset_session(
    self,
    ctx: Optional[CtxItem] = None,
    opts=None,
    on_text: Optional[Callable[[str], Awaitable[None]]] = None,
    on_audio: Optional[Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]]] = None,
    should_stop: Optional[Callable[[], bool]] = None,
):
    """
    Close the current session and open a new one.

    Omitted arguments are passed through as None; the internal reset
    substitutes the last-known values for them.
    """
    self._ensure_background_loop()
    restart = self._reset_session_internal(ctx, opts, on_text, on_audio, should_stop)
    await self._run_on_owner(restart)
207
+
208
async def shutdown(self):
    """
    Close the current session, if a background loop exists.

    The background loop itself keeps running; stop_loop_sync() or
    shutdown_and_stop() stop it as well.
    """
    if self._bg.loop:
        closing = self._close_session_internal()
        await self._run_on_owner(closing)
216
+
217
async def shutdown_and_stop(self):
    """Close the session first, then stop the background loop thread."""
    # Order matters: the session is closed before the loop is stopped.
    await self.shutdown()
    self.stop_loop_sync()
221
+
222
+ # -----------------------------
223
+ # Synchronous convenience calls
224
+ # -----------------------------
225
+
226
def close_session_sync(self, timeout: float = 5.0):
    """
    Blocking variant of close_session().

    :param timeout: Passed to the background loop's run_sync()
    """
    loop = self._bg.loop
    if loop and loop.is_running():
        self._bg.run_sync(self._close_session_internal(), timeout=timeout)
231
+
232
def reset_session_sync(
    self,
    ctx: Optional[CtxItem] = None,
    opts=None,
    on_text: Optional[Callable[[str], Awaitable[None]]] = None,
    on_audio: Optional[Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]]] = None,
    should_stop: Optional[Callable[[], bool]] = None,
    timeout: float = 10.0,
):
    """
    Blocking variant of reset_session().

    :param timeout: Passed to the background loop's run_sync()
    """
    self._ensure_background_loop()
    restart = self._reset_session_internal(ctx, opts, on_text, on_audio, should_stop)
    self._bg.run_sync(restart, timeout=timeout)
244
+
245
def shutdown_sync(self, timeout: float = 5.0):
    """
    Blocking variant of shutdown(): closes the websocket, leaves the loop running.

    :param timeout: Passed to the background loop's run_sync()
    """
    loop = self._bg.loop
    if loop and loop.is_running():
        self._bg.run_sync(self._close_session_internal(), timeout=timeout)
250
+
251
def stop_loop_sync(self, timeout: float = 2.0):
    """
    Stop the background event loop thread.

    :param timeout: Forwarded to the background loop's stop()
    """
    background = self._bg
    background.stop(timeout=timeout)
254
+
255
+ # -----------------------------
256
+ # Tools helpers
257
+ # -----------------------------
258
+
259
+ def _update_last_opts_tools(self, tools: Optional[list], remote_tools: Optional[list]) -> None:
260
+ """
261
+ Update self._last_opts with tools/remote_tools if fields are present.
262
+ """
263
+ lo = self._last_opts
264
+ if not lo:
265
+ return
266
+ try:
267
+ if tools is not None and hasattr(lo, "tools"):
268
+ setattr(lo, "tools", tools)
269
+ except Exception:
270
+ pass
271
+ try:
272
+ if remote_tools is not None and hasattr(lo, "remote_tools"):
273
+ setattr(lo, "remote_tools", remote_tools)
274
+ except Exception:
275
+ pass
276
+
277
+ # -----------------------------
278
+ # Internal: background loop/dispatch
279
+ # -----------------------------
280
+
281
+ def _ensure_background_loop(self):
282
+ """Start the background asyncio loop once and keep it running."""
283
+ self._bg.ensure()
284
+
285
+ async def _run_on_owner(self, coro):
286
+ """Await a coroutine scheduled on the owner loop from any thread/loop."""
287
+ return await self._bg.run(coro)
288
+
289
+ # -----------------------------
290
+ # Internal: session lifecycle
291
+ # -----------------------------
292
+
293
async def _open_session_internal(
    self,
    ctx: CtxItem,
    opts,
    on_text: Callable[[str], Awaitable[None]],
    on_audio: Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]],
    should_stop: Callable[[], bool] = lambda: False,
):
    """
    Open the websocket and configure the Realtime session on the owner loop.

    Returns immediately when a websocket is already open. Otherwise connects,
    sends a session.update payload (voice, audio formats, optional instructions,
    turn mode, tools, optional input transcription) and starts the receiver task.

    :param ctx: CtxItem providing the model
    :param opts: Options object (model, voice, rt_session_id, transcribe, system_prompt, auto_turn, ...)
    :param on_text: Async callback for text deltas
    :param on_audio: Async callback for audio chunks
    :param should_stop: Sync stop-signal callback
    :raises RuntimeError: when no API key is configured
    :raises Exception: connection errors are propagated; only a failed attempt
                       with a session_id is retried once against the plain URL
    """
    if self.ws is not None:
        if self.debug:
            print("[open_session] already open")
        return

    core = self.window.core
    # Guard the lookup itself: ctx.model used to be dereferenced before the `if ctx` check.
    model_data = core.models.get(ctx.model) if ctx else None
    args = core.models.prepare_client_args(MODE_AUDIO, model_data)
    api_key = args.get("api_key")
    if not api_key:
        raise RuntimeError("OpenAI API key not configured")

    model_id = getattr(opts, "model", None) or (ctx.model if ctx and ctx.model else "gpt-4o-realtime-preview")
    voice = getattr(opts, "voice", None) or self._preferred_voice()

    # Optional: requested resume handle from opts (best-effort; errors are ignored)
    resume_sid = None
    try:
        provided = getattr(opts, "rt_session_id", None)
        if isinstance(provided, str):
            provided = provided.strip()
        if provided and provided != (self._rt_session_id or ""):
            resume_sid = provided
            self._rt_session_id = resume_sid
            set_ctx_rt_handle(self._ctx, resume_sid, self.window)
    except Exception:
        pass

    # Build WS URL with model and optional session_id for best-effort resume
    base_q = {"model": model_id}
    if resume_sid:
        base_q["session_id"] = resume_sid
    url_with_sid = f"{self.WS_URL}?{urlencode(base_q)}"
    url_no_sid = f"{self.WS_URL}?{urlencode({'model': model_id})}"

    headers = {
        "Authorization": f"Bearer {api_key}",
        "OpenAI-Beta": "realtime=v1",
    }

    # Transcription toggle
    transcribe_enabled = bool(getattr(opts, "transcribe", False))

    # Save callbacks and context
    self._on_text = on_text
    self._on_audio = on_audio
    self._should_stop = should_stop or (lambda: False)
    self._ctx = ctx
    self._last_opts = opts

    # Control primitives
    self._response_done = asyncio.Event()
    self._send_lock = asyncio.Lock()

    if self.debug:
        print(f"[open_session] owner_loop={id(asyncio.get_running_loop())}")

    # Connection settings shared by the first attempt and the fallback
    connect_kwargs = {
        "additional_headers": headers,
        "max_size": 16 * 1024 * 1024,
        "ping_interval": 20,
        "ping_timeout": 20,
        "close_timeout": 5,
    }

    # Connect WS: first try with session_id if provided; on failure, fall back to plain URL.
    try:
        target_url = url_with_sid if resume_sid else url_no_sid
        self.ws = await websockets.connect(target_url, **connect_kwargs)
    except Exception as e:
        if not resume_sid:
            # No fallback applies: surface the real connection error instead of
            # continuing with self.ws == None and failing later on `.send`.
            raise
        if self.debug:
            print(f"[open_session] connect with session_id failed ({e!r}); falling back to plain URL")
        self.ws = await websockets.connect(url_no_sid, **connect_kwargs)
    if self.debug:
        print("[open_session] WS connected")

    # Session payload (manual by default; prepared for auto)
    session_payload = {
        "type": "session.update",
        "session": {
            "modalities": ["text", "audio"],
            "voice": voice,
            "input_audio_format": "pcm16",
            "output_audio_format": "pcm16",
            # turn_detection set below via apply_turn_mode_openai
            **({"instructions": str(getattr(opts, "system_prompt"))} if getattr(opts, "system_prompt", None) else {}),
        },
    }
    turn_mode = TurnMode.AUTO if bool(getattr(opts, "auto_turn", False)) else TurnMode.MANUAL
    apply_turn_mode_openai(session_payload, turn_mode)
    self._tune_openai_vad(session_payload, opts)

    # Attach tools to session; on any error fall back to the signature of an empty tool list
    try:
        session_tools = prepare_tools_for_session(opts)
        if session_tools:
            session_payload["session"]["tools"] = session_tools
            self._cached_session_tools_sig = tools_signature(session_tools)
            if self.debug:
                print(f"[open_session] session.tools attached: {len(session_tools)}")
        else:
            self._cached_session_tools_sig = tools_signature([])
    except Exception as _e:
        if self.debug:
            print(f"[open_session] tools sanitize error: {_e}")
        self._cached_session_tools_sig = tools_signature([])

    # Attach native input transcription if requested
    try:
        if transcribe_enabled:
            iat = {"model": "whisper-1"}
            lang = getattr(opts, "transcribe_language", None) or getattr(opts, "language", None)
            if lang:
                iat["language"] = str(lang)
            session_payload["session"]["input_audio_transcription"] = iat
    except Exception:
        pass

    if self.debug:
        print(f"[open_session] session_payload: {json.dumps(session_payload)}")

    await self.ws.send(json.dumps(session_payload))
    if self.debug:
        print("[open_session] session.update sent")

    # Start a single receiver task
    if self._rx_task is None or self._rx_task.done():
        self._running = True
        self._rx_task = asyncio.create_task(self._recv_loop(), name="realtime-recv")
        if self.debug:
            print("[open_session] _recv_loop started")
442
+
443
+ async def _close_session_internal(self):
444
+ """Close WS and stop the receiver; keep the background loop alive for reuse."""
445
+ self._running = False
446
+
447
+ # Cancel active response if any
448
+ if self.ws and self._response_active:
449
+ try:
450
+ await self.ws.send(json.dumps({"type": "response.cancel"}))
451
+ except Exception:
452
+ pass
453
+
454
+ # Unblock any waiters before clearing handles
455
+ try:
456
+ if self._response_done and not self._response_done.is_set():
457
+ self._response_done.set()
458
+ except Exception:
459
+ pass
460
+
461
+ # Close the socket
462
+ if self.ws:
463
+ try:
464
+ await self.ws.close()
465
+ except Exception:
466
+ pass
467
+ self.ws = None
468
+
469
+ # Await receiver
470
+ if self._rx_task:
471
+ try:
472
+ await self._rx_task
473
+ except Exception:
474
+ pass
475
+ self._rx_task = None
476
+
477
+ # Reset control primitives
478
+ self._response_active = False
479
+ self._response_done = None
480
+ self._send_lock = None
481
+ self._cached_session_tools_sig = None
482
+
483
+ # Clear in-memory handle; do not wipe persisted ctx.extra["rt_session_id"]
484
+ self._rt_session_id = None
485
+ self._rt_session_expires_at = None
486
+
487
+ if self.debug:
488
+ print("[close_session] closed")
489
+
490
+ async def _reset_session_internal(
491
+ self,
492
+ ctx: Optional[CtxItem] = None,
493
+ opts=None,
494
+ on_text: Optional[Callable[[str], Awaitable[None]]] = None,
495
+ on_audio: Optional[Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]]] = None,
496
+ should_stop: Optional[Callable[[], bool]] = None,
497
+ ):
498
+ """
499
+ Close current session and open a new one with provided or last-known parameters.
500
+ """
501
+ # Determine params to reuse if not provided
502
+ ctx = ctx or self._ctx
503
+ opts = opts or self._last_opts
504
+ on_text = on_text or self._on_text
505
+ on_audio = on_audio or self._on_audio
506
+ should_stop = should_stop or self._should_stop or (lambda: False)
507
+
508
+ if not (ctx and opts and on_text and on_audio):
509
+ raise RuntimeError("reset_session requires previous or explicit ctx/opts/callbacks")
510
+
511
+ await self._close_session_internal()
512
+ await self._open_session_internal(ctx, opts, on_text, on_audio, should_stop)
513
+
514
+ # -----------------------------
515
+ # Internal: one "turn"
516
+ # -----------------------------
517
+
518
+ async def _send_turn_internal(
519
+ self,
520
+ prompt: Optional[str] = None,
521
+ audio_data: Optional[bytes] = None,
522
+ audio_format: Optional[str] = None,
523
+ audio_rate: Optional[int] = None,
524
+ wait_for_done: bool = True,
525
+ ):
526
+ """
527
+ Send one manual turn (optional text + optional audio) and trigger response.create.
528
+ """
529
+ if not self.ws:
530
+ # If session dropped remotely, try to reopen from last state
531
+ if self._ctx and self._last_opts:
532
+ await self._open_session_internal(self._ctx, self._last_opts, self._on_text, self._on_audio, self._should_stop)
533
+ else:
534
+ raise RuntimeError("Session not open. Call open_session(...) first.")
535
+
536
+ # Serialize all sends to a single WS writer
537
+ if self._send_lock is None:
538
+ self._send_lock = asyncio.Lock()
539
+
540
+ # Determine whether we should trigger a response for this turn
541
+ def _bool(v) -> bool:
542
+ try:
543
+ return bool(v)
544
+ except Exception:
545
+ return False
546
+
547
+ is_auto_turn = _bool(getattr(self._last_opts or object(), "auto_turn", False))
548
+ has_text = bool(prompt and str(prompt).strip() and str(prompt).strip() != "...")
549
+ has_audio = bool(audio_data)
550
+ # Honor explicit "reply" hint if provided by caller (e.g., opts.extra.reply == True)
551
+ reply_hint = False
552
+ try:
553
+ extra = getattr(self._last_opts, "extra", None)
554
+ if isinstance(extra, dict):
555
+ reply_hint = bool(extra.get("reply", False))
556
+ except Exception:
557
+ pass
558
+
559
+ # In manual mode, do not auto-trigger response.create when there is no user input and no explicit reply request.
560
+ if not has_text and not has_audio and not reply_hint:
561
+ if self.debug:
562
+ print("[send_turn] skipped: manual mode with empty input; waiting for explicit commit")
563
+ return
564
+
565
+ wait_prev: Optional[asyncio.Event] = None
566
+ wait_curr: Optional[asyncio.Event] = None
567
+
568
+ async with self._send_lock:
569
+ # Ensure previous response is finished (snapshot the handle to avoid race with close)
570
+ if self._response_active and self._response_done:
571
+ wait_prev = self._response_done
572
+
573
+ # Optional text
574
+ if has_text:
575
+ if self.debug:
576
+ print(f"[send_turn] prompt len={len(prompt)}")
577
+ await self.ws.send(json.dumps({
578
+ "type": "conversation.item.create",
579
+ "item": {
580
+ "type": "message",
581
+ "role": "user",
582
+ "content": [{"type": "input_text", "text": str(prompt)}],
583
+ },
584
+ }))
585
+
586
+ # Optional audio
587
+ if has_audio:
588
+ sr, _ch, pcm = coerce_to_pcm16_mono(audio_data, audio_format, audio_rate, fallback_rate=self._DEFAULT_RATE)
589
+
590
+ if sr != self._DEFAULT_RATE:
591
+ try:
592
+ pcm = resample_pcm16_mono(pcm, sr, self._DEFAULT_RATE)
593
+ if self.debug:
594
+ print(f"[audio] resampled {sr} -> {self._DEFAULT_RATE}")
595
+ sr = self._DEFAULT_RATE
596
+ except Exception as e:
597
+ if self.debug:
598
+ print(f"[audio] resample failed {sr}->{self._DEFAULT_RATE}: {e}")
599
+
600
+ await self.ws.send(json.dumps({"type": "input_audio_buffer.clear"}))
601
+ for chunk in iter_pcm_chunks(pcm, sr, ms=50):
602
+ if not chunk:
603
+ continue
604
+ await self.ws.send(json.dumps({
605
+ "type": "input_audio_buffer.append",
606
+ "audio": base64.b64encode(chunk).decode("utf-8"),
607
+ }))
608
+ await self.ws.send(json.dumps({"type": "input_audio_buffer.commit"}))
609
+
610
+ # If we were waiting for a previous response, do it inside lock handoff-safe
611
+ if wait_prev:
612
+ try:
613
+ if self.debug:
614
+ print("[send_turn] waiting for previous response")
615
+ await wait_prev.wait()
616
+ except Exception:
617
+ pass
618
+
619
+ # Prepare wait handle for the response about to start
620
+ if self._response_done is None:
621
+ self._response_done = asyncio.Event()
622
+ else:
623
+ try:
624
+ self._response_done.clear()
625
+ except Exception:
626
+ self._response_done = asyncio.Event()
627
+ wait_curr = self._response_done # snapshot for race-free waiting
628
+
629
+ # Build optional response payload (modalities + tools/tool_choice)
630
+ resp_obj = {"modalities": ["text", "audio"]}
631
+ try:
632
+ resp_tools, tool_choice = prepare_tools_for_response(self._last_opts)
633
+ if resp_tools:
634
+ resp_obj["tools"] = resp_tools
635
+ if tool_choice is None:
636
+ tool_choice = "auto"
637
+ if tool_choice:
638
+ resp_obj["tool_choice"] = tool_choice
639
+ except Exception as _e:
640
+ if self.debug:
641
+ print(f"[send_turn] response tools compose error: {_e}")
642
+
643
+ payload = {"type": "response.create"}
644
+ if len(resp_obj) > 0:
645
+ payload["response"] = resp_obj
646
+
647
+ await self.ws.send(json.dumps(payload))
648
+ if self.debug:
649
+ print("[send_turn] response.create sent")
650
+
651
+ # Optionally wait for response.done (otherwise return immediately)
652
+ if wait_for_done and wait_curr:
653
+ if self.debug:
654
+ print("[send_turn] waiting for response.done")
655
+ try:
656
+ await wait_curr.wait()
657
+ except Exception:
658
+ pass
659
+ if self.debug:
660
+ print("[send_turn] response.done received")
661
+
662
+ async def _cancel_active_response_internal(self):
663
+ """Cancel current response (barge-in)."""
664
+ if self.ws and self._response_active:
665
+ try:
666
+ await self.ws.send(json.dumps({"type": "response.cancel"}))
667
+ except Exception:
668
+ pass
669
+
670
+ # -----------------------------
671
+ # Internal: audio input (auto-turn mode)
672
+ # -----------------------------
673
+
674
def rt_handle_audio_input_sync(self, event: RealtimeEvent, timeout: float = 0.5):
    """
    Blocking entrypoint for microphone audio chunks.

    Events without audio bytes are dropped early; otherwise the internal
    handler is run on the background loop. Exceptions are never raised to the caller.

    :param event: Event whose data (or data["payload"]) dict holds the audio under "data"
    :param timeout: Passed to the background loop's run_sync()
    """
    # Cheap pre-check so empty chunks never reach the background loop
    try:
        body = getattr(event, "data", {}) or {}
        if isinstance(body, dict):
            inner = body.get("payload")
            if isinstance(inner, dict):
                body = inner
        has_audio = bool(body) and bool(body.get("data"))
        if not has_audio:
            return
    except Exception:
        return

    self._ensure_background_loop()
    try:
        job = self._rt_handle_audio_input_internal(event)
        self._bg.run_sync(job, timeout=timeout)
    except Exception:
        # Errors here are swallowed by design
        pass
695
+
696
async def _rt_handle_audio_input_internal(self, event: RealtimeEvent):
    """
    Background-loop worker: stream one piece of live microphone audio to the server.

    Returns without sending unless the socket is set, the session is running and
    ``auto_turn`` is truthy on ``self._last_opts``. The audio is coerced to PCM16
    mono at ``self._DEFAULT_RATE`` and written as ``input_audio_buffer.append``
    frames of about 50 ms each while holding the send lock. If the payload is
    flagged ``final``, an ``input_audio_buffer.commit`` is sent afterwards.
    """
    # Preconditions: open session with auto-turn enabled
    if not (self.ws and self._running):
        if self.debug:
            print("[_rt_handle_audio_input] Socket not open!")
        return
    try:
        auto_turn = bool(getattr(self._last_opts, "auto_turn", False))
    except Exception:
        return
    if not auto_turn:
        return

    # Unwrap an optional {"payload": {...}} envelope
    body = getattr(event, "data", {}) or {}
    if isinstance(body, dict) and isinstance(body.get("payload"), dict):
        body = body["payload"]

    audio: bytes = body.get("data") or b""
    if not audio:
        return
    mime = str(body.get("mime") or "audio/pcm")
    src_rate = int(body.get("rate") or 0) or self._DEFAULT_RATE
    channels = int(body.get("channels") or 1)  # parsed, but not used further down
    flush_after = bool(body.get("final", False))

    # Normalize to PCM16 mono at the session rate
    target_rate = self._DEFAULT_RATE
    hint = "pcm16" if mime.startswith("audio/pcm") else None
    try:
        rate, _, samples = coerce_to_pcm16_mono(audio, hint, src_rate, fallback_rate=target_rate)
        if rate != target_rate:
            try:
                samples = resample_pcm16_mono(samples, rate, target_rate)
            except Exception:
                # Resampling failed: the un-resampled samples are still sent
                pass
            rate = target_rate
    except Exception:
        return

    # All websocket writes go through one lock
    if self._send_lock is None:
        self._send_lock = asyncio.Lock()

    async with self._send_lock:
        # ~50 ms pieces keep individual frames small
        for piece in iter_pcm_chunks(samples, rate, ms=50):
            if not piece:
                continue
            try:
                frame = {
                    "type": "input_audio_buffer.append",
                    "audio": base64.b64encode(piece).decode("utf-8"),
                }
                await self.ws.send(json.dumps(frame))
            except Exception:
                return

        # End of stream reported by the producer: commit the buffer once
        if flush_after:
            try:
                if self.debug:
                    print("[_rt_handle_audio_input] final chunk; committing")
                await self.ws.send(json.dumps({"type": "input_audio_buffer.commit"}))
            except Exception:
                pass
763
+
764
def commit_audio_input_sync(self, timeout: float = 0.5):
    """
    Blocking entrypoint that commits the input audio buffer.

    Ensures the background loop exists, then runs the async commit through
    ``self._bg.run_sync`` with the given timeout. Any error raised by that call
    is swallowed, so this method always returns ``None``.
    """
    self._ensure_background_loop()
    try:
        run_blocking = self._bg.run_sync
        run_blocking(self._commit_audio_input_internal(), timeout=timeout)
    except Exception:
        # Errors are intentionally not propagated to the audio callback
        return
775
+
776
+ async def _commit_audio_input_internal(self):
777
+ """
778
+ Owner-loop implementation: commit input audio buffer in auto-turn mode.
779
+ """
780
+ if not self.ws or not self._running:
781
+ return
782
+ try:
783
+ if not bool(getattr(self._last_opts, "auto_turn", False)):
784
+ return
785
+ except Exception:
786
+ return
787
+ if self._send_lock is None:
788
+ self._send_lock = asyncio.Lock()
789
+ async with self._send_lock:
790
+ try:
791
+ await self.ws.send(json.dumps({"type": "input_audio_buffer.commit"}))
792
+ except Exception:
793
+ pass
794
+
795
def force_response_now_sync(self, timeout: float = 5.0):
    """
    Blocking entrypoint that makes the model answer from the current input buffer.

    Runs the async implementation through ``self._bg.run_sync`` with the given
    timeout; errors raised by that call are swallowed.
    """
    self._ensure_background_loop()
    try:
        run_blocking = self._bg.run_sync
        run_blocking(self._force_response_now_internal(), timeout=timeout)
    except Exception:
        return
802
+
803
+ async def _force_response_now_internal(self):
804
+ """Owner-loop: commit current input buffer and trigger response.create."""
805
+ if not self.ws or not self._running:
806
+ return
807
+ try:
808
+ if not bool(getattr(self._last_opts, "auto_turn", False)):
809
+ # This helper is intended for auto-turn; manual flow already does commit+response.create.
810
+ return
811
+ except Exception:
812
+ return
813
+
814
+ if self._send_lock is None:
815
+ self._send_lock = asyncio.Lock()
816
+
817
+ async with self._send_lock:
818
+ # 1) Finalize current input buffer
819
+ try:
820
+ await self.ws.send(json.dumps({"type": "input_audio_buffer.commit"}))
821
+ except Exception:
822
+ return
823
+
824
+ # 2) Prepare wait handle for this response
825
+ if self._response_done is None:
826
+ self._response_done = asyncio.Event()
827
+ else:
828
+ try:
829
+ self._response_done.clear()
830
+ except Exception:
831
+ self._response_done = asyncio.Event()
832
+
833
+ # 3) Build response payload (modalities + tools/tool_choice like in _send_turn_internal)
834
+ resp_obj = {"modalities": ["text", "audio"]}
835
+ try:
836
+ resp_tools, tool_choice = prepare_tools_for_response(self._last_opts)
837
+ if resp_tools:
838
+ resp_obj["tools"] = resp_tools
839
+ if tool_choice is None:
840
+ tool_choice = "auto"
841
+ if tool_choice:
842
+ resp_obj["tool_choice"] = tool_choice
843
+ except Exception:
844
+ pass
845
+
846
+ # 4) Trigger the assistant response now
847
+ try:
848
+ await self.ws.send(json.dumps({"type": "response.create", "response": resp_obj}))
849
+ except Exception:
850
+ return
851
+
852
+ # -----------------------------
853
+ # Public: live tools update
854
+ # -----------------------------
855
+
856
async def update_session_tools(
    self,
    tools: Optional[list] = None,
    remote_tools: Optional[list] = None,
    force: bool = False
):
    """
    Update the session's tools while it is live (``session.update``).

    The work is delegated to ``_update_session_tools_internal`` and executed via
    ``self._run_on_owner``; whatever that returns is returned here.
    """
    self._ensure_background_loop()
    job = self._update_session_tools_internal(tools, remote_tools, force)
    outcome = await self._run_on_owner(job)
    return outcome
870
+
871
def update_session_tools_sync(
    self,
    tools: Optional[list] = None,
    remote_tools: Optional[list] = None,
    force: bool = False,
    timeout: float = 5.0
):
    """
    Blocking variant of the live tools update.

    Runs ``_update_session_tools_internal`` through ``self._bg.run_sync`` with
    the given timeout. Errors from that call propagate; nothing is returned.
    """
    self._ensure_background_loop()
    run_blocking = self._bg.run_sync
    run_blocking(
        self._update_session_tools_internal(tools, remote_tools, force),
        timeout=timeout,
    )
881
+
882
+ async def _update_session_tools_internal(
883
+ self,
884
+ tools: Optional[list],
885
+ remote_tools: Optional[list],
886
+ force: bool
887
+ ):
888
+ """
889
+ Owner-loop implementation for session tools update.
890
+ """
891
+ # If socket is not open, just cache into last opts
892
+ if not self.ws:
893
+ self._update_last_opts_tools(tools, remote_tools)
894
+ self._cached_session_tools_sig = None
895
+ if self.debug:
896
+ print("[update_session_tools] WS not open; cached for next session")
897
+ return
898
+
899
+ # Sanitize/compose session tools
900
+ try:
901
+ fn = sanitize_function_tools(tools if tools is not None else getattr(self._last_opts, "tools", None))
902
+ rt = sanitize_remote_tools(remote_tools if remote_tools is not None else getattr(self._last_opts, "remote_tools", None))
903
+ session_tools = (rt or []) + (fn or [])
904
+ except Exception as e:
905
+ if self.debug:
906
+ print(f"[update_session_tools] sanitize error: {e}")
907
+ session_tools = []
908
+
909
+ new_sig = tools_signature(session_tools)
910
+
911
+ # Compare with cached signature
912
+ if not force and self._cached_session_tools_sig == new_sig:
913
+ if self.debug:
914
+ print("[update_session_tools] no changes; skipping session.update")
915
+ self._update_last_opts_tools(tools, remote_tools)
916
+ return
917
+
918
+ # Send session.update under the single writer lock
919
+ if self._send_lock is None:
920
+ self._send_lock = asyncio.Lock()
921
+ async with self._send_lock:
922
+ try:
923
+ payload = {
924
+ "type": "session.update",
925
+ "session": {"tools": session_tools}
926
+ }
927
+ await self.ws.send(json.dumps(payload))
928
+ self._cached_session_tools_sig = new_sig
929
+ self._update_last_opts_tools(tools, remote_tools)
930
+ if self.debug:
931
+ print(f"[update_session_tools] session.update sent; tools={len(session_tools)}")
932
+ except Exception as e:
933
+ if self.debug:
934
+ print(f"[update_session_tools] send error: {e}")
935
+
936
+ # -----------------------------
937
+ # Public: send tool results back to the model
938
+ # -----------------------------
939
+
940
async def send_tool_results(
    self,
    results,
    continue_turn: bool = True,
    wait_for_done: bool = True,
):
    """
    Deliver tool results to the Realtime session.

    Delegates to ``_send_tool_results_internal``, executed via
    ``self._run_on_owner``, and returns its result.
    """
    self._ensure_background_loop()
    job = self._send_tool_results_internal(results, continue_turn, wait_for_done)
    outcome = await self._run_on_owner(job)
    return outcome
953
+
954
def send_tool_results_sync(
    self,
    results,
    continue_turn: bool = True,
    wait_for_done: bool = True,
    timeout: float = 20.0,
):
    """
    Blocking variant of ``send_tool_results()``.

    Runs ``_send_tool_results_internal`` through ``self._bg.run_sync`` with the
    given timeout and returns whatever ``run_sync`` returns. Errors propagate.
    """
    self._ensure_background_loop()
    run_blocking = self._bg.run_sync
    return run_blocking(
        self._send_tool_results_internal(results, continue_turn, wait_for_done),
        timeout=timeout,
    )
967
+
968
+ async def _send_tool_results_internal(
969
+ self,
970
+ results,
971
+ continue_turn: bool,
972
+ wait_for_done: bool,
973
+ ):
974
+ """
975
+ Owner-loop implementation. Serializes sends under the WS writer lock.
976
+ """
977
+ if not self.ws:
978
+ raise RuntimeError("Live session is not open")
979
+
980
+ outputs = build_tool_outputs_payload(results, self._last_tool_calls)
981
+ if not outputs:
982
+ return
983
+
984
+ if self._send_lock is None:
985
+ self._send_lock = asyncio.Lock()
986
+
987
+ wait_ev: Optional[asyncio.Event] = None
988
+ async with self._send_lock:
989
+ # Emit one conversation.item.create per tool output
990
+ for it in outputs:
991
+ payload = {
992
+ "type": "conversation.item.create",
993
+ "item": {
994
+ "type": "function_call_output",
995
+ "call_id": it["call_id"],
996
+ "output": it["output"], # must be a string (JSON-encoded when dict/list)
997
+ },
998
+ }
999
+ if it.get("previous_item_id"):
1000
+ payload["previous_item_id"] = it["previous_item_id"]
1001
+ await self.ws.send(json.dumps(payload))
1002
+
1003
+ # Optionally ask the model to continue
1004
+ if continue_turn:
1005
+ if self._response_done is None:
1006
+ self._response_done = asyncio.Event()
1007
+ else:
1008
+ try:
1009
+ self._response_done.clear()
1010
+ except Exception:
1011
+ self._response_done = asyncio.Event()
1012
+ wait_ev = self._response_done # snapshot for race-free waiting
1013
+ await self.ws.send(json.dumps({"type": "response.create"}))
1014
+
1015
+ # Wait for the follow-up response to complete
1016
+ if continue_turn and wait_for_done and wait_ev:
1017
+ try:
1018
+ await wait_ev.wait()
1019
+ except Exception:
1020
+ pass
1021
+
1022
+ # -----------------------------
1023
+ # Internal: receive loop
1024
+ # -----------------------------
1025
+
1026
+ async def _recv_loop(self):
1027
+ """
1028
+ Single receiver loop for the entire session.
1029
+ Processes incoming events and dispatches to callbacks.
1030
+ """
1031
+ if self.debug:
1032
+ print("[_recv_loop] started")
1033
+
1034
+ DEFAULT_RATE = self._DEFAULT_RATE
1035
+ audio_done = True
1036
+
1037
+ try:
1038
+ while self._running and self.ws:
1039
+ # Do not hard-stop the session on should_stop; only cancel active response if requested.
1040
+ if self._should_stop and self._should_stop():
1041
+ await self._cancel_active_response_internal()
1042
+
1043
+ try:
1044
+ raw = await asyncio.wait_for(self.ws.recv(), timeout=60)
1045
+ except asyncio.TimeoutError:
1046
+ continue
1047
+ except Exception as e:
1048
+ if self.debug:
1049
+ print(f"[_recv_loop] recv error: {e!r}")
1050
+ break
1051
+
1052
+ if isinstance(raw, bytes):
1053
+ # Realtime sends JSON text frames; ignore unexpected binary
1054
+ continue
1055
+
1056
+ try:
1057
+ ev = json.loads(raw)
1058
+ except Exception:
1059
+ continue
1060
+
1061
+ etype = ev.get("type")
1062
+
1063
+ # ---- session lifecycle (capture server handle) ----
1064
+ if etype in ("session.created", "session.updated"):
1065
+ sess = ev.get("session") or {}
1066
+ sid = sess.get("id")
1067
+ if isinstance(sid, str) and sid.strip():
1068
+ self._rt_session_id = sid.strip()
1069
+ set_ctx_rt_handle(self._ctx, self._rt_session_id, self.window)
1070
+ if self.debug:
1071
+ print(f"[_recv_loop] session id: {self._rt_session_id}")
1072
+ # Optional: expires_at if present (not always provided)
1073
+ exp = sess.get("expires_at") or sess.get("expiresAt")
1074
+ try:
1075
+ if isinstance(exp, (int, float)) and exp > 0:
1076
+ self._rt_session_expires_at = int(exp)
1077
+ set_rt_session_expires_at(self._ctx, self._rt_session_expires_at, self.window)
1078
+ except Exception:
1079
+ pass
1080
+ continue
1081
+
1082
+ if etype == "response.created":
1083
+ if self.debug:
1084
+ print("[_recv_loop] response created")
1085
+ self._response_active = True
1086
+ audio_done = False
1087
+ self._rt_reset_state()
1088
+
1089
+ elif etype == "input_audio_buffer.speech_started":
1090
+ if self.debug:
1091
+ print("[_recv_loop] speech_started")
1092
+
1093
+ elif etype == "input_audio_buffer.speech_stopped":
1094
+ if self.debug:
1095
+ print("[_recv_loop] speech_stopped")
1096
+
1097
+ elif etype == "input_audio_buffer.committed":
1098
+ if self.debug:
1099
+ print("[_recv_loop] audio_buffer.committed")
1100
+
1101
+ # disable mic input if auto-commit
1102
+ if self._last_opts:
1103
+ self._last_opts.rt_signals.response.emit(RealtimeEvent(RealtimeEvent.RT_OUTPUT_AUDIO_COMMIT, {
1104
+ "ctx": self._ctx,
1105
+ }))
1106
+
1107
+ # ---- input transcription (user speech) ----
1108
+ elif etype == "conversation.item.input_audio_transcription.delta":
1109
+ if self._transcribe_enabled():
1110
+ buf = self._input_tr_buffers.get(ev.get("item_id"))
1111
+ if buf is None:
1112
+ buf = io.StringIO()
1113
+ self._input_tr_buffers[ev.get("item_id")] = buf
1114
+ delta = ev.get("delta") or ev.get("text") or ev.get("transcript") or ""
1115
+ if delta:
1116
+ buf.write(str(delta))
1117
+
1118
+ elif etype in ("conversation.item.input_audio_transcription.completed",
1119
+ "conversation.item.input_audio_transcription.done"):
1120
+ if self._transcribe_enabled():
1121
+ item_id = ev.get("item_id")
1122
+ tr = ev.get("transcript") or ""
1123
+ buf = self._input_tr_buffers.pop(item_id, None)
1124
+ if buf is not None:
1125
+ try:
1126
+ v = buf.getvalue()
1127
+ if v and not tr:
1128
+ tr = v
1129
+ finally:
1130
+ try:
1131
+ buf.close()
1132
+ except Exception:
1133
+ pass
1134
+ if tr:
1135
+ self._save_input_transcript(tr)
1136
+
1137
+ elif etype == "conversation.item.input_audio_transcription.failed":
1138
+ if self.debug:
1139
+ err = (ev.get("error") or {}).get("message") or "input transcription failed"
1140
+ print(f"[_recv_loop] {err}")
1141
+
1142
+ elif etype == "conversation.item.created":
1143
+ if self.debug:
1144
+ print("[_recv_loop] conversation.item.created")
1145
+ # Fallback: some servers may include transcript inside the created user item
1146
+ if self._transcribe_enabled():
1147
+ item = ev.get("item") or {}
1148
+ if item.get("role") == "user":
1149
+ for c in (item.get("content") or []):
1150
+ if isinstance(c, dict) and c.get("type") in ("input_audio", "audio"):
1151
+ tr = c.get("transcript")
1152
+ if tr:
1153
+ self._save_input_transcript(str(tr))
1154
+
1155
+ # ---- assistant text vs assistant audio transcript deltas ----
1156
+ elif etype in ("response.text.delta", "response.output_text.delta"):
1157
+ delta = ev.get("delta") or ev.get("text")
1158
+ if isinstance(delta, dict) and "text" in delta:
1159
+ delta = delta["text"]
1160
+ if delta:
1161
+ self._rt_append_text(delta)
1162
+ if self._on_text:
1163
+ try:
1164
+ await self._on_text(str(delta))
1165
+ except Exception:
1166
+ pass
1167
+ elif etype == "response.audio_transcript.delta":
1168
+ if self._transcribe_enabled():
1169
+ delta = ev.get("delta") or ev.get("text")
1170
+ if isinstance(delta, dict) and "text" in delta:
1171
+ delta = delta["text"]
1172
+ if delta:
1173
+ self._rt_append_text(delta)
1174
+ if self._on_text:
1175
+ try:
1176
+ await self._on_text(str(delta))
1177
+ except Exception:
1178
+ pass
1179
+
1180
+ elif etype in ("response.text.done", "response.output_text.done", "response.audio_transcript.done"):
1181
+ if self.debug:
1182
+ print("[_recv_loop] text done")
1183
+
1184
+ elif etype == "response.content_part.added":
1185
+ part = ev.get("part") or {}
1186
+ ptype = part.get("type")
1187
+ if ptype == "text":
1188
+ txt = part.get("text") or ""
1189
+ if txt:
1190
+ self._rt_append_text(txt)
1191
+ if self._on_text:
1192
+ try:
1193
+ await self._on_text(str(txt))
1194
+ except Exception:
1195
+ pass
1196
+ elif ptype == "audio":
1197
+ b64 = part.get("audio")
1198
+ if b64 and self._on_audio:
1199
+ try:
1200
+ data = base64.b64decode(b64)
1201
+ await self._on_audio(data, "audio/pcm", DEFAULT_RATE, 1, False)
1202
+ except Exception:
1203
+ pass
1204
+ tr = part.get("transcript")
1205
+ if tr and self._transcribe_enabled():
1206
+ self._rt_append_text(tr)
1207
+ if self._on_text:
1208
+ try:
1209
+ await self._on_text(str(tr))
1210
+ except Exception:
1211
+ pass
1212
+
1213
+ elif etype == "response.audio.delta":
1214
+ b64 = ev.get("delta")
1215
+ if b64 and self._on_audio:
1216
+ try:
1217
+ data = base64.b64decode(b64)
1218
+ await self._on_audio(data, "audio/pcm", DEFAULT_RATE, 1, False)
1219
+ except Exception:
1220
+ pass
1221
+
1222
+ elif etype == "response.audio.done":
1223
+ if self.debug:
1224
+ print("[_recv_loop] audio done")
1225
+ if not audio_done and self._on_audio:
1226
+ try:
1227
+ await self._on_audio(b"", "audio/pcm", DEFAULT_RATE, 1, True)
1228
+ except Exception:
1229
+ pass
1230
+ audio_done = True
1231
+
1232
+ # ---- function calling (tools) ----
1233
+ elif etype == "response.output_item.added":
1234
+ if self.debug:
1235
+ print("[_recv_loop] output_item added")
1236
+ item = ev.get("item") or {}
1237
+ if item.get("type") == "function_call":
1238
+ fid = item.get("id") or item.get("item_id") or ""
1239
+ call_id = item.get("call_id") or ""
1240
+ name = item.get("name") or ""
1241
+ self._rt_state["tool_calls"].append({
1242
+ "id": fid,
1243
+ "call_id": call_id,
1244
+ "type": "function",
1245
+ "function": {"name": name, "arguments": ""}
1246
+ })
1247
+ if fid and fid not in self._rt_state["fn_args_buffers"]:
1248
+ self._rt_state["fn_args_buffers"][fid] = io.StringIO()
1249
+
1250
+ elif etype == "response.function_call_arguments.delta":
1251
+ buf = self._rt_state["fn_args_buffers"].get(ev.get("item_id"))
1252
+ if buf is not None:
1253
+ delta = ev.get("delta") or ""
1254
+ if delta:
1255
+ buf.write(delta)
1256
+
1257
+ elif etype == "response.function_call_arguments.done":
1258
+ item_id = ev.get("item_id")
1259
+ args_val = ev.get("arguments") or ""
1260
+ buf = self._rt_state["fn_args_buffers"].pop(item_id, None)
1261
+ if buf is not None:
1262
+ try:
1263
+ concat = buf.getvalue()
1264
+ if concat:
1265
+ args_val = concat
1266
+ finally:
1267
+ try:
1268
+ buf.close()
1269
+ except Exception:
1270
+ pass
1271
+ for tc in self._rt_state["tool_calls"]:
1272
+ if tc.get("id") == item_id:
1273
+ tc["function"]["arguments"] = args_val
1274
+ break
1275
+ self._rt_state["force_func_call"] = True
1276
+
1277
+ elif etype == "response.output_item.done":
1278
+ if self.debug:
1279
+ print("[_recv_loop] output_item done")
1280
+ item = ev.get("item") or {}
1281
+ if item.get("type") == "function_call":
1282
+ fid = item.get("id") or item.get("item_id") or ""
1283
+ name = item.get("name") or ""
1284
+ args_val = item.get("arguments") or ""
1285
+ for tc in self._rt_state["tool_calls"]:
1286
+ if fid and tc.get("id") == fid:
1287
+ if name:
1288
+ tc["function"]["name"] = name
1289
+ if args_val:
1290
+ tc["function"]["arguments"] = args_val
1291
+ break
1292
+ self._rt_state["force_func_call"] = True
1293
+
1294
+ # ---- code interpreter (delta/done) ----
1295
+ elif etype in ("response.code_interpreter_call_code.delta", "response.code_interpreter_call.code.delta"):
1296
+ code_delta = ev.get("delta") or ""
1297
+ if code_delta:
1298
+ if not self._rt_state["is_code"]:
1299
+ hdr = "\n\n**Code interpreter**\n```python\n"
1300
+ self._rt_append_text(hdr + code_delta)
1301
+ if self._on_text:
1302
+ try:
1303
+ await self._on_text(hdr + code_delta)
1304
+ except Exception:
1305
+ pass
1306
+ self._rt_state["is_code"] = True
1307
+ else:
1308
+ self._rt_append_text(code_delta)
1309
+ if self._on_text:
1310
+ try:
1311
+ await self._on_text(code_delta)
1312
+ except Exception:
1313
+ pass
1314
+
1315
+ elif etype in ("response.code_interpreter_call_code.done", "response.code_interpreter_call.code.done"):
1316
+ if self.debug:
1317
+ print("[_recv_loop] code done")
1318
+ if self._rt_state["is_code"]:
1319
+ tail = "\n\n```\n-----------\n"
1320
+ self._rt_append_text(tail)
1321
+ if self._on_text:
1322
+ try:
1323
+ await self._on_text(tail)
1324
+ except Exception:
1325
+ pass
1326
+ self._rt_state["is_code"] = False
1327
+
1328
+ # ---- annotations (citations/files) ----
1329
+ elif etype == "response.output_text.annotation.added":
1330
+ if self.debug:
1331
+ print("[_recv_loop] annotation added")
1332
+ ann = ev.get("annotation") or {}
1333
+ atype = ann.get("type")
1334
+ if atype == "url_citation":
1335
+ url = ann.get("url")
1336
+ self._rt_add_citation(url)
1337
+ elif atype == "container_file_citation":
1338
+ self._rt_state["files"].append({
1339
+ "container_id": ann.get("container_id"),
1340
+ "file_id": ann.get("file_id"),
1341
+ })
1342
+
1343
+ # ---- partial images (defensive) ----
1344
+ elif etype == "response.image_generation_call.partial_image":
1345
+ image_b64 = ev.get("partial_image_b64")
1346
+ if image_b64:
1347
+ try:
1348
+ img_bytes = base64.b64decode(image_b64)
1349
+ save_path = self.window.core.image.gen_unique_path(self._ctx)
1350
+ with open(save_path, "wb") as f:
1351
+ f.write(img_bytes)
1352
+ self._rt_state["image_paths"].append(save_path)
1353
+ self._rt_state["is_image"] = True
1354
+ if not isinstance(self._ctx.images, list):
1355
+ self._ctx.images = []
1356
+ if save_path not in self._ctx.images:
1357
+ self._ctx.images.append(save_path)
1358
+ except Exception:
1359
+ pass
1360
+
1361
+ elif etype == "response.done":
1362
+ if self.debug:
1363
+ print("[_recv_loop] response done")
1364
+ # Ensure audio finalized
1365
+ if not audio_done and self._on_audio:
1366
+ try:
1367
+ await self._on_audio(b"", "audio/pcm", DEFAULT_RATE, 1, True)
1368
+ except Exception:
1369
+ pass
1370
+ audio_done = True
1371
+
1372
+ self._response_active = False
1373
+
1374
+ # Capture usage if present on response
1375
+ try:
1376
+ resp_obj = ev.get("response") or {}
1377
+ self._rt_capture_usage(resp_obj)
1378
+ except Exception:
1379
+ pass
1380
+
1381
+ # Build final output text
1382
+ output = "".join(self._rt_state["output_parts"]) if self._rt_state else ""
1383
+ if has_unclosed_code_tag(output):
1384
+ output += "\n```"
1385
+ if not output:
1386
+ try:
1387
+ transcript = self._extract_text_from_response_done(ev)
1388
+ if transcript:
1389
+ output = transcript
1390
+ except Exception:
1391
+ pass
1392
+
1393
+ # Persist into ctx
1394
+ try:
1395
+ if self._ctx:
1396
+ self._ctx.output = output or (self._ctx.output or "")
1397
+ up = self._rt_state.get("usage_payload") if self._rt_state else None
1398
+ if up:
1399
+ in_tok = up.get("in")
1400
+ out_tok = up.get("out")
1401
+ if in_tok is None:
1402
+ in_tok = self._ctx.input_tokens if self._ctx.input_tokens is not None else 0
1403
+ if out_tok is None:
1404
+ out_tok = 0
1405
+ self._ctx.set_tokens(in_tok, out_tok)
1406
+ try:
1407
+ if not isinstance(self._ctx.extra, dict):
1408
+ self._ctx.extra = {}
1409
+ self._ctx.extra["usage"] = {
1410
+ "vendor": "openai",
1411
+ "input_tokens": in_tok,
1412
+ "output_tokens": out_tok,
1413
+ "reasoning_tokens": up.get("reasoning", 0),
1414
+ "total_reported": up.get("total"),
1415
+ }
1416
+ except Exception:
1417
+ pass
1418
+
1419
+ # Citations
1420
+ if self._rt_state and self._rt_state["citations"]:
1421
+ if self._ctx.urls is None:
1422
+ self._ctx.urls = []
1423
+ for u in self._rt_state["citations"]:
1424
+ if u not in self._ctx.urls:
1425
+ self._ctx.urls.append(u)
1426
+
1427
+ # Images
1428
+ if self._rt_state and self._rt_state["image_paths"]:
1429
+ if not isinstance(self._ctx.images, list):
1430
+ self._ctx.images = []
1431
+ for p in self._rt_state["image_paths"]:
1432
+ if p not in self._ctx.images:
1433
+ self._ctx.images.append(p)
1434
+
1435
+ self.window.core.ctx.update_item(self._ctx)
1436
+ except Exception:
1437
+ pass
1438
+
1439
+ # Download container files if any
1440
+ try:
1441
+ files = (self._rt_state or {}).get("files") or []
1442
+ if files:
1443
+ self.window.core.api.openai.container.download_files(self._ctx, files)
1444
+ except Exception:
1445
+ pass
1446
+
1447
+ # Unpack tool calls if any
1448
+ try:
1449
+ tcs = (self._rt_state or {}).get("tool_calls") or []
1450
+ if tcs:
1451
+ for tc in tcs:
1452
+ fn = tc.get("function") or {}
1453
+ if isinstance(fn.get("arguments"), dict):
1454
+ fn["arguments"] = json.dumps(fn["arguments"], ensure_ascii=False)
1455
+ self._ctx.force_call = bool((self._rt_state or {}).get("force_func_call"))
1456
+ self.window.core.debug.info("[realtime] Tool calls found, unpacking...")
1457
+ self.window.core.command.unpack_tool_calls_chunks(self._ctx, tcs)
1458
+ self.window.core.ctx.update_item(self._ctx)
1459
+ except Exception:
1460
+ pass
1461
+
1462
+ # Persist last tool calls snapshot for mapping tool outputs
1463
+ try:
1464
+ tcs = (self._rt_state or {}).get("tool_calls") or []
1465
+ if tcs:
1466
+ self._last_tool_calls = list(tcs)
1467
+ except Exception:
1468
+ pass
1469
+
1470
+ # Unblock waiters
1471
+ if self._response_done:
1472
+ self._response_done.set()
1473
+
1474
+ # send RT_OUTPUT_TURN_END signal
1475
+ if self._last_opts:
1476
+ self._last_opts.rt_signals.response.emit(RealtimeEvent(RealtimeEvent.RT_OUTPUT_TURN_END, {
1477
+ "ctx": self._ctx,
1478
+ }))
1479
+
1480
+ # Reset per-response extraction state
1481
+ self._rt_state = None
1482
+
1483
+ elif etype == "error":
1484
+ if self.debug:
1485
+ print(f"[_recv_loop] error event: {ev}")
1486
+ # Session expiration and other errors
1487
+ err = ev.get("error") or {}
1488
+ msg = (err.get("message") or "")
1489
+ code = (err.get("code") or "")
1490
+ if isinstance(code, str) and code.strip().lower() == "session_expired":
1491
+ self._rt_session_id = None
1492
+ if self.debug:
1493
+ print("[_recv_loop] session expired")
1494
+ if "already has an active response" in (msg or "").lower():
1495
+ if self._response_done:
1496
+ self._response_done.set()
1497
+ continue
1498
+ if self._response_done:
1499
+ self._response_done.set()
1500
+ if self.debug:
1501
+ print(f"[_recv_loop] error: {msg}")
1502
+
1503
+ # Other events are ignored
1504
+
1505
+ except Exception as e:
1506
+ if self.debug:
1507
+ print(f"[_recv_loop] exception: {e!r}")
1508
+ finally:
1509
+ if self.debug:
1510
+ print("[_recv_loop] stopped")
1511
+ # Ensure any waiters are unblocked on socket teardown
1512
+ try:
1513
+ if self._response_done and not self._response_done.is_set():
1514
+ self._response_done.set()
1515
+ except Exception:
1516
+ pass
1517
+ try:
1518
+ if self.ws:
1519
+ await self.ws.close()
1520
+ except Exception:
1521
+ pass
1522
+ self.ws = None
1523
+ self._running = False
1524
+
1525
+ # -----------------------------
1526
+ # Helpers
1527
+ # -----------------------------
1528
+
1529
+ def _preferred_voice(self) -> str:
1530
+ """
1531
+ Resolve preferred OpenAI voice from settings.
1532
+ """
1533
+ try:
1534
+ v = self.window.core.plugins.get_option("audio_output", "openai_voice")
1535
+ if v:
1536
+ return str(v)
1537
+ except Exception:
1538
+ pass
1539
+ return "alloy"
1540
+
1541
+ def _extract_text_from_response_done(self, ev: dict) -> str:
1542
+ """
1543
+ Extract assistant text from response.done payload.
1544
+ """
1545
+ res = ev.get("response") or {}
1546
+ out = res.get("output") or []
1547
+ parts: list[str] = []
1548
+
1549
+ for item in out:
1550
+ if not isinstance(item, dict):
1551
+ continue
1552
+ if item.get("type") not in ("message", "tool_result", "function_call_result", "response"):
1553
+ pass
1554
+ content_list = item.get("content") or []
1555
+ for c in content_list:
1556
+ if not isinstance(c, dict):
1557
+ continue
1558
+ ctype = c.get("type")
1559
+ if ctype == "audio" and self._transcribe_enabled():
1560
+ tr = c.get("transcript")
1561
+ if tr:
1562
+ parts.append(str(tr))
1563
+ elif ctype in ("text", "output_text", "input_text"):
1564
+ txt = c.get("text")
1565
+ if isinstance(txt, dict):
1566
+ txt = txt.get("text") or txt.get("value")
1567
+ if txt:
1568
+ parts.append(str(txt))
1569
+
1570
+ text = "\n".join(t.strip() for t in parts if t and str(t).strip())
1571
+ return text
1572
+
1573
+ # ---- per-response state helpers ----
1574
+
1575
+ def _rt_reset_state(self):
1576
+ """Reset per-response extraction state."""
1577
+ self._rt_state = {
1578
+ "output_parts": [],
1579
+ "begin": True,
1580
+ "fn_args_buffers": {},
1581
+ "tool_calls": [],
1582
+ "citations": [],
1583
+ "files": [],
1584
+ "image_paths": [],
1585
+ "is_image": False,
1586
+ "is_code": False,
1587
+ "force_func_call": False,
1588
+ "usage_payload": {},
1589
+ }
1590
+
1591
+ def _rt_append_text(self, s: str):
1592
+ """Append text to assembled output, skipping initial empty deltas."""
1593
+ if self._rt_state is None:
1594
+ self._rt_reset_state()
1595
+ if self._rt_state["begin"] and (s is None or s == ""):
1596
+ return
1597
+ self._rt_state["output_parts"].append(str(s))
1598
+ self._rt_state["begin"] = False
1599
+
1600
+ def _rt_add_citation(self, url: Optional[str]):
1601
+ """Add a URL citation to state and ctx (de-duplicated)."""
1602
+ if not url or not isinstance(url, str):
1603
+ return
1604
+ url = url.strip()
1605
+ if not (url.startswith("http://") or url.startswith("https://")):
1606
+ return
1607
+ if url not in self._rt_state["citations"]:
1608
+ self._rt_state["citations"].append(url)
1609
+ try:
1610
+ if self._ctx:
1611
+ if self._ctx.urls is None:
1612
+ self._ctx.urls = []
1613
+ if url not in self._ctx.urls:
1614
+ self._ctx.urls.append(url)
1615
+ except Exception:
1616
+ pass
1617
+
1618
+ def _rt_capture_usage(self, response_obj: dict):
1619
+ """
1620
+ Capture token usage from response.done if present.
1621
+ """
1622
+ try:
1623
+ usage = (response_obj or {}).get("usage") or {}
1624
+ if not usage:
1625
+ return
1626
+ in_tok = usage.get("input_tokens") or usage.get("prompt_tokens")
1627
+ out_tok = usage.get("output_tokens") or usage.get("completion_tokens")
1628
+ total = usage.get("total_tokens")
1629
+ self._rt_state["usage_payload"] = {
1630
+ "in": int(in_tok) if in_tok is not None else None,
1631
+ "out": int(out_tok) if out_tok is not None else None,
1632
+ "total": int(total) if total is not None else None,
1633
+ "reasoning": 0,
1634
+ }
1635
+ except Exception:
1636
+ pass
1637
+
1638
+ # ---- transcription helpers ----
1639
+
1640
+ def _transcribe_enabled(self) -> bool:
1641
+ """Returns True if transcription (input/output) is enabled via opts.transcribe."""
1642
+ try:
1643
+ return bool(getattr(self._last_opts, "transcribe", False))
1644
+ except Exception:
1645
+ return False
1646
+
1647
def _save_input_transcript(self, transcript: str):
    """
    Persist the input transcript on the current ctx item.

    The transcript is stored under ``ctx.extra["input_transcript"]``. If the
    last options carry no text prompt, ``ctx.input`` is set to the transcript
    as well. The ctx item is then passed to ``window.core.ctx.update_item``.

    Persistence is best-effort: errors never propagate to the caller and are
    only printed when debug mode is on.

    :param transcript: transcribed user speech; empty or None is ignored
    """
    if not transcript:
        return
    try:
        ctx = self._ctx
        if ctx:
            text = str(transcript)
            if not isinstance(ctx.extra, dict):
                ctx.extra = {}
            # The transcript belongs in ctx.extra. ctx.input is assigned a plain
            # string below, so it has no `.extra` to write to.
            ctx.extra["input_transcript"] = text
            if not getattr(self._last_opts, "prompt", None):
                ctx.input = text
            self.window.core.ctx.update_item(ctx)
    except Exception as e:
        if getattr(self, "debug", False):
            print(f"[_save_input_transcript] failed to persist transcript: {e}")
1664
+
1665
def _tune_openai_vad(self, session_payload: dict, opts) -> None:
    """
    Lengthen the end-of-speech silence window of the VAD config in a session payload.

    Works in place on ``session_payload["session"]["turn_detection"]``. If that
    entry is not a dict (manual mode or VAD disabled) nothing is changed.

    - ``silence_duration_ms``: taken from ``opts.vad_end_silence_ms`` when it is
      a positive number; otherwise the current value (500 if absent) raised to
      at least 2000 ms.
    - ``prefix_padding_ms``: set only when ``opts.vad_prefix_padding_ms`` is a
      non-negative number.

    Any error is swallowed.

    :param session_payload: session.update-style dict to mutate
    :param opts: options object read via getattr (may be None)
    """
    try:
        session = session_payload.get("session") or {}
        vad = session.get("turn_detection")
        if isinstance(vad, dict):
            requested = getattr(opts, "vad_end_silence_ms", None)
            if isinstance(requested, (int, float)) and not (requested <= 0):
                hold_ms = requested
            else:
                # No usable override: keep the current value but never go below 2000 ms
                current = int(vad.get("silence_duration_ms") or 500)
                hold_ms = max(current, 2000)
            vad["silence_duration_ms"] = int(hold_ms)

            # Optional padding kept before the detected start of speech
            padding = getattr(opts, "vad_prefix_padding_ms", None)
            if isinstance(padding, (int, float)) and padding >= 0:
                vad["prefix_padding_ms"] = int(padding)
    except Exception:
        pass
1690
+
1691
def update_session_autoturn_sync(
    self,
    enabled: bool,
    silence_ms: Optional[int] = None,
    prefix_ms: Optional[int] = None,
    timeout: float = 5.0,
):
    """
    Blocking wrapper that toggles auto-turn (VAD) mode on the live session.

    Makes sure the background loop exists, then runs
    ``_update_session_autoturn_internal`` through ``self._bg.run_sync``.
    Errors raised while scheduling or running it are ignored.

    :param enabled: True to enable auto-turn, False to disable it
    :param silence_ms: optional end-of-speech silence override (ms)
    :param prefix_ms: optional prefix padding override (ms)
    :param timeout: timeout forwarded to ``run_sync``
    """
    self._ensure_background_loop()
    try:
        run_blocking = self._bg.run_sync
        task = self._update_session_autoturn_internal(enabled, silence_ms, prefix_ms)
        run_blocking(task, timeout=timeout)
    except Exception:
        pass
1711
+
1712
async def _update_session_autoturn_internal(
    self,
    enabled: bool,
    silence_ms: Optional[int] = None,
    prefix_ms: Optional[int] = None,
):
    """
    Toggle auto-turn (server/semantic VAD) on the live session at runtime.

    When no socket is open the requested settings are only written onto
    ``self._last_opts``. Otherwise a ``session.update`` message is built,
    tuned from ``self._last_opts``, overridden with the explicit arguments,
    sent over the socket under ``self._send_lock``, and the settings are then
    written onto ``self._last_opts`` too. Send errors are reported only in
    debug mode.

    :param enabled: True for auto-turn, False for manual turn mode
    :param silence_ms: optional end-of-speech silence override (ms)
    :param prefix_ms: optional prefix padding override (ms)
    """
    def remember_settings():
        # Best-effort mirror of the requested settings onto the cached options
        try:
            cached = self._last_opts
            if cached:
                cached.auto_turn = bool(enabled)
                if silence_ms is not None:
                    cached.vad_end_silence_ms = int(silence_ms)
                if prefix_ms is not None:
                    cached.vad_prefix_padding_ms = int(prefix_ms)
        except Exception:
            pass

    # No open socket: only cache the settings
    if not self.ws:
        remember_settings()
        if self.debug:
            print("[update_session_autoturn] WS not open; cached for next session")
        return

    if self._send_lock is None:
        self._send_lock = asyncio.Lock()

    async with self._send_lock:
        try:
            # Start from an empty session.update; the helper fills in turn_detection
            message: dict = {"type": "session.update", "session": {}}
            mode = TurnMode.AUTO if enabled else TurnMode.MANUAL
            apply_turn_mode_openai(message, mode)

            if enabled:
                vad = message.get("session", {}).get("turn_detection")
                vad_is_dict = isinstance(vad, dict)

                # VAD flavour override from opts.vad_type
                try:
                    flavour = getattr(self._last_opts, "vad_type", None)
                    if isinstance(flavour, str) and flavour in ("server_vad", "semantic_vad"):
                        if vad_is_dict:
                            vad["type"] = flavour
                except Exception:
                    pass

                # Threshold is applied for server_vad only
                try:
                    level = getattr(self._last_opts, "vad_threshold", None)
                    if isinstance(level, (int, float)) and vad_is_dict and vad.get("type") == "server_vad":
                        vad["threshold"] = float(level)
                except Exception:
                    pass

                # Defaults derived from opts come first...
                self._tune_openai_vad(message, self._last_opts)

                # ...then explicit arguments win over them
                if vad_is_dict:
                    if silence_ms is not None:
                        vad["silence_duration_ms"] = int(silence_ms)
                    if prefix_ms is not None:
                        vad["prefix_padding_ms"] = int(prefix_ms)

                    # Boolean flags copied from opts only when explicitly set
                    try:
                        create_flag = getattr(self._last_opts, "vad_create_response", None)
                        if isinstance(create_flag, bool):
                            vad["create_response"] = create_flag
                    except Exception:
                        pass
                    try:
                        interrupt_flag = getattr(self._last_opts, "vad_interrupt_response", None)
                        if isinstance(interrupt_flag, bool):
                            vad["interrupt_response"] = interrupt_flag
                    except Exception:
                        pass

            await self.ws.send(json.dumps(message))

            # Keep the local snapshot in line with what was just sent
            remember_settings()

            if self.debug:
                td_dbg = (message.get("session", {}) or {}).get("turn_detection")
                print(f"[update_session_autoturn] session.update sent; auto_turn={enabled}, td={td_dbg}")

        except Exception as e:
            if self.debug:
                print(f"[update_session_autoturn] send error: {e}")
1813
+
1814
def set_debug(self, enabled: bool):
    """
    Switch debug logging on or off.

    :param enabled: truthy to turn debug logging on, falsy to turn it off
    """
    self.debug = True if enabled else False
1821
+
1822
def is_session_active(self) -> bool:
    """
    Report whether a WS session is currently open.

    :return: False when there is no socket object, otherwise the value of
             ``self._running``
    """
    if self.ws is None:
        return False
    return self._running
1825
+
1826
def update_ctx(self, ctx: CtxItem):
    """
    Replace the ctx item this client works with (for session handle persistence).

    :param ctx: ctx item stored as ``self._ctx``
    """
    self._ctx = ctx