pygpt-net 2.6.30__py3-none-any.whl → 2.6.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. pygpt_net/CHANGELOG.txt +8 -0
  2. pygpt_net/__init__.py +3 -3
  3. pygpt_net/app.py +4 -0
  4. pygpt_net/controller/__init__.py +5 -2
  5. pygpt_net/controller/audio/audio.py +25 -1
  6. pygpt_net/controller/audio/ui.py +2 -2
  7. pygpt_net/controller/chat/audio.py +1 -8
  8. pygpt_net/controller/chat/common.py +29 -3
  9. pygpt_net/controller/chat/handler/__init__.py +0 -0
  10. pygpt_net/controller/chat/handler/stream_worker.py +1124 -0
  11. pygpt_net/controller/chat/output.py +8 -3
  12. pygpt_net/controller/chat/stream.py +3 -1071
  13. pygpt_net/controller/chat/text.py +3 -2
  14. pygpt_net/controller/kernel/kernel.py +11 -3
  15. pygpt_net/controller/kernel/reply.py +5 -1
  16. pygpt_net/controller/realtime/__init__.py +12 -0
  17. pygpt_net/controller/realtime/manager.py +53 -0
  18. pygpt_net/controller/realtime/realtime.py +268 -0
  19. pygpt_net/controller/ui/mode.py +7 -0
  20. pygpt_net/controller/ui/ui.py +19 -1
  21. pygpt_net/core/audio/audio.py +6 -1
  22. pygpt_net/core/audio/backend/native/__init__.py +12 -0
  23. pygpt_net/core/audio/backend/{native.py → native/native.py} +426 -127
  24. pygpt_net/core/audio/backend/native/player.py +139 -0
  25. pygpt_net/core/audio/backend/native/realtime.py +250 -0
  26. pygpt_net/core/audio/backend/pyaudio/__init__.py +12 -0
  27. pygpt_net/core/audio/backend/pyaudio/playback.py +194 -0
  28. pygpt_net/core/audio/backend/pyaudio/pyaudio.py +923 -0
  29. pygpt_net/core/audio/backend/pyaudio/realtime.py +275 -0
  30. pygpt_net/core/audio/backend/pygame/__init__.py +12 -0
  31. pygpt_net/core/audio/backend/{pygame.py → pygame/pygame.py} +130 -19
  32. pygpt_net/core/audio/backend/shared/__init__.py +38 -0
  33. pygpt_net/core/audio/backend/shared/conversions.py +211 -0
  34. pygpt_net/core/audio/backend/shared/envelope.py +38 -0
  35. pygpt_net/core/audio/backend/shared/player.py +137 -0
  36. pygpt_net/core/audio/backend/shared/rt.py +52 -0
  37. pygpt_net/core/audio/capture.py +5 -0
  38. pygpt_net/core/audio/output.py +13 -2
  39. pygpt_net/core/audio/whisper.py +6 -2
  40. pygpt_net/core/bridge/bridge.py +2 -1
  41. pygpt_net/core/bridge/worker.py +4 -1
  42. pygpt_net/core/dispatcher/dispatcher.py +37 -1
  43. pygpt_net/core/events/__init__.py +2 -1
  44. pygpt_net/core/events/realtime.py +55 -0
  45. pygpt_net/core/image/image.py +51 -1
  46. pygpt_net/core/realtime/__init__.py +0 -0
  47. pygpt_net/core/realtime/options.py +87 -0
  48. pygpt_net/core/realtime/shared/__init__.py +0 -0
  49. pygpt_net/core/realtime/shared/audio.py +213 -0
  50. pygpt_net/core/realtime/shared/loop.py +64 -0
  51. pygpt_net/core/realtime/shared/session.py +59 -0
  52. pygpt_net/core/realtime/shared/text.py +37 -0
  53. pygpt_net/core/realtime/shared/tools.py +276 -0
  54. pygpt_net/core/realtime/shared/turn.py +38 -0
  55. pygpt_net/core/realtime/shared/types.py +16 -0
  56. pygpt_net/core/realtime/worker.py +164 -0
  57. pygpt_net/core/types/__init__.py +1 -0
  58. pygpt_net/core/types/image.py +48 -0
  59. pygpt_net/data/config/config.json +10 -4
  60. pygpt_net/data/config/models.json +149 -103
  61. pygpt_net/data/config/settings.json +50 -0
  62. pygpt_net/data/locale/locale.de.ini +5 -5
  63. pygpt_net/data/locale/locale.en.ini +19 -13
  64. pygpt_net/data/locale/locale.es.ini +5 -5
  65. pygpt_net/data/locale/locale.fr.ini +5 -5
  66. pygpt_net/data/locale/locale.it.ini +5 -5
  67. pygpt_net/data/locale/locale.pl.ini +5 -5
  68. pygpt_net/data/locale/locale.uk.ini +5 -5
  69. pygpt_net/data/locale/locale.zh.ini +1 -1
  70. pygpt_net/data/locale/plugin.audio_input.en.ini +4 -0
  71. pygpt_net/data/locale/plugin.audio_output.en.ini +4 -0
  72. pygpt_net/plugin/audio_input/plugin.py +37 -4
  73. pygpt_net/plugin/audio_input/simple.py +57 -8
  74. pygpt_net/plugin/cmd_files/worker.py +3 -0
  75. pygpt_net/provider/api/google/__init__.py +39 -6
  76. pygpt_net/provider/api/google/audio.py +8 -1
  77. pygpt_net/provider/api/google/chat.py +45 -6
  78. pygpt_net/provider/api/google/image.py +226 -86
  79. pygpt_net/provider/api/google/realtime/__init__.py +12 -0
  80. pygpt_net/provider/api/google/realtime/client.py +1945 -0
  81. pygpt_net/provider/api/google/realtime/realtime.py +186 -0
  82. pygpt_net/provider/api/openai/__init__.py +22 -2
  83. pygpt_net/provider/api/openai/realtime/__init__.py +12 -0
  84. pygpt_net/provider/api/openai/realtime/client.py +1828 -0
  85. pygpt_net/provider/api/openai/realtime/realtime.py +194 -0
  86. pygpt_net/provider/audio_input/google_genai.py +103 -0
  87. pygpt_net/provider/audio_output/google_genai_tts.py +229 -0
  88. pygpt_net/provider/audio_output/google_tts.py +0 -12
  89. pygpt_net/provider/audio_output/openai_tts.py +8 -5
  90. pygpt_net/provider/core/config/patch.py +15 -0
  91. pygpt_net/provider/core/model/patch.py +11 -0
  92. pygpt_net/provider/llms/google.py +8 -9
  93. pygpt_net/ui/layout/toolbox/footer.py +16 -0
  94. pygpt_net/ui/layout/toolbox/image.py +5 -0
  95. pygpt_net/ui/widget/option/combo.py +15 -1
  96. {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.31.dist-info}/METADATA +26 -14
  97. {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.31.dist-info}/RECORD +100 -62
  98. pygpt_net/core/audio/backend/pyaudio.py +0 -554
  99. {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.31.dist-info}/LICENSE +0 -0
  100. {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.31.dist-info}/WHEEL +0 -0
  101. {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.31.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,1945 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # ================================================== #
4
+ # This file is a part of PYGPT package #
5
+ # Website: https://pygpt.net #
6
+ # GitHub: https://github.com/szczyglis-dev/py-gpt #
7
+ # MIT License #
8
+ # Created By : Marcin Szczygliński #
9
+ # Updated Date: 2025.08.31 23:00:00 #
10
+ # ================================================== #
11
+
12
+ import asyncio
13
+ import base64
14
+ import json
15
+ from typing import Optional, Callable, Awaitable, Tuple, List, Any
16
+
17
+ from google.genai import types as gtypes # for Schema/FunctionDeclaration/FunctionResponse compatibility
18
+
19
+ from pygpt_net.core.events import RealtimeEvent
20
+ from pygpt_net.core.types import MODE_AUDIO
21
+ from pygpt_net.item.ctx import CtxItem
22
+ from pygpt_net.core.text.utils import has_unclosed_code_tag
23
+
24
+ # shared
25
+ from pygpt_net.core.realtime.shared.loop import BackgroundLoop
26
+ from pygpt_net.core.realtime.shared.audio import to_pcm16_mono
27
+ from pygpt_net.core.realtime.shared.tools import build_function_responses_payload
28
+ from pygpt_net.core.realtime.shared.text import coalesce_text
29
+ from pygpt_net.core.realtime.shared.turn import TurnMode, apply_turn_mode_google
30
+ from pygpt_net.core.realtime.shared.session import set_ctx_rt_handle
31
+
32
+
33
+ class GoogleLiveClient:
34
+ """
35
+ Google Live client with server-side memory and smooth audio:
36
+
37
+ - One persistent Live session; server keeps conversation context across turns.
38
+ - User turns are sent via:
39
+ * text: send_client_content(Content(...), turn_complete=True/False)
40
+ * audio: ActivityStart -> send_realtime_input(audio=Blob...) -> ActivityEnd
41
+ (manual turns; no auto VAD; no inline dicts — SDK serializes wire format)
42
+ - Auto-turn mode (automatic VAD) is fully supported for continuous mic input:
43
+ * push audio chunks via send_realtime_input(audio=...)
44
+ * flush on demand via send_realtime_input(audio_stream_end=True)
45
+ * receiver for one model turn is started automatically on first audio chunk.
46
+ - Each turn has its own receive loop, ending on serverContent.turnComplete or toolCall.
47
+ - Audio is jitter-buffered (~60ms) and de-duplicated (prefer response.data over inline_data).
48
+ - Final transcript is coalesced; preserves hard line breaks only.
49
+ - Tool calls, citations, images and usage are extracted and persisted to ctx to mirror OpenAI provider behavior.
50
+ - Emits RealtimeEvent.RT_OUTPUT_AUDIO_COMMIT when the model starts responding after auto VAD or after an explicit flush,
51
+ and RealtimeEvent.RT_OUTPUT_TURN_END after each turn.
52
+ - Supports sending tool results back to the model (send_tool_results/send_tool_results_sync).
53
+ """
54
def __init__(self, window=None, debug: bool = False):
    """
    Create an idle client; nothing is connected until a session is opened.

    :param window: application window object (its ``core`` is read when a session is opened)
    :param debug: when True, diagnostic messages are printed
    """
    self.window, self.debug = window, debug

    # Background event loop wrapper used to run session coroutines
    self._bg: BackgroundLoop = BackgroundLoop(name="Google-RT-Loop")

    # Live session and the context manager that produced it
    self._session_cm = None
    self._session = None

    # Resumable handle of the current session and the signature of the
    # tools it was opened with (used to avoid redundant restarts)
    self._rt_session_id: Optional[str] = None
    self._cached_session_tools_sig: Optional[str] = None

    # Per-session flow control
    self._turn_task: Optional[asyncio.Task] = None
    self._response_active: bool = False
    self._response_done: Optional[asyncio.Event] = None
    self._send_lock: Optional[asyncio.Lock] = None

    # Caller-supplied callbacks, context item and the last options object
    self._ctx: Optional[CtxItem] = None
    self._last_opts = None
    self._should_stop: Optional[Callable[[], bool]] = None
    self._on_text: Optional[Callable[[str], Awaitable[None]]] = None
    self._on_audio: Optional[Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]]] = None

    # Sample rates: 16 kHz LINEAR16 mono in, 24 kHz PCM16 out
    self._IN_RATE = 16000
    self._OUT_RATE = 24000

    # Output jitter buffer; PCM16 mono means 2 bytes per sample
    self._OUT_CHUNK_MS = 60
    self._OUT_BYTES_PER_MS = int(self._OUT_RATE * 2 / 1000)
    self._audio_buf = bytearray()
    self._saw_data_stream = False  # prefer response.data over inline_data to avoid duplicates

    # Per-turn state: text pieces, last full output transcription (for deltas),
    # extraction state, tool-call snapshot and the auto-turn audio flag
    self._turn_text_parts: List[str] = []
    self._last_out_tr: str = ""
    self._rt_state: Optional[dict] = None
    self._last_tool_calls: list[dict] = []
    self._auto_audio_in_flight: bool = False
110
+
111
+ # -----------------------------
112
+ # Public high-level entrypoints
113
+ # -----------------------------
114
+
115
async def run(
    self,
    ctx: CtxItem,
    opts,
    on_text: Callable[[str], Awaitable[None]],
    on_audio: Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]],
    should_stop: Callable[[], bool] = lambda: False,
):
    """
    Run one turn: open session if needed, send prompt/audio, receive until turn complete.

    If ``opts.rt_session_id`` is a non-empty string that differs from the handle of the
    currently open session, the session is reset first so it can resume from that handle.
    A failed reset is not fatal: the method falls back to opening a session normally.

    :param ctx: context item for this turn (stored as the current context)
    :param opts: options object; ``prompt``, ``audio_data``, ``audio_format``, ``audio_rate``,
                 ``streaming`` and ``rt_session_id`` are read with ``getattr``
    :param on_text: async callback for text output
    :param on_audio: async callback for audio output
    :param should_stop: callable polled to check whether processing should stop
    """
    self._ensure_background_loop()
    self._ctx = ctx

    # Only a non-empty string is a usable resumption handle; this mirrors the check
    # made when the session is actually opened.
    handle = getattr(opts, "rt_session_id", None)
    handle = handle.strip() if isinstance(handle, str) else ""

    if self._session is not None and handle and handle != (self._rt_session_id or ""):
        try:
            await self._run_on_owner(
                self._reset_session_internal(ctx, opts, on_text, on_audio, should_stop)
            )
        except Exception as e:
            # Best-effort: keep going and let the normal open path below handle it,
            # but do not hide the failure when debugging.
            if self.debug:
                print(f"[google.run] session reset failed: {e!r}")

    if not self._session:
        await self._run_on_owner(
            self._open_session_internal(ctx, opts, on_text, on_audio, should_stop)
        )

    # In streaming mode the call returns without waiting for the turn to finish
    await self._run_on_owner(self._send_turn_internal(
        getattr(opts, "prompt", None),
        getattr(opts, "audio_data", None),
        getattr(opts, "audio_format", None),
        getattr(opts, "audio_rate", None),
        wait_for_done=not bool(getattr(opts, "streaming", False)),
    ))
149
+
150
async def open_session(
    self,
    ctx: CtxItem,
    opts,
    on_text: Callable[[str], Awaitable[None]],
    on_audio: Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]],
    should_stop: Callable[[], bool] = lambda: False,
):
    """
    Open persistent Live session (if not already open).

    If a session is already open and ``opts.rt_session_id`` is a non-empty string that
    differs from the current handle, the session is reset (closed and reopened) so it
    can resume from that handle. If that reset fails, a normal open is attempted.

    :param ctx: context item to associate with the session
    :param opts: options object (``rt_session_id`` is read with ``getattr``)
    :param on_text: async callback for text output
    :param on_audio: async callback for audio output
    :param should_stop: callable polled to check whether processing should stop
    """
    self._ensure_background_loop()

    # Only a non-empty string is a usable resumption handle
    handle = getattr(opts, "rt_session_id", None)
    handle = handle.strip() if isinstance(handle, str) else ""

    if self._session is not None and handle and handle != (self._rt_session_id or ""):
        try:
            await self._run_on_owner(
                self._reset_session_internal(ctx, opts, on_text, on_audio, should_stop)
            )
            return
        except Exception as e:
            # Best-effort: fall through to the plain open below, but surface the
            # failure when debugging instead of discarding it.
            if self.debug:
                print(f"[google.open_session] session reset failed: {e!r}")

    await self._run_on_owner(
        self._open_session_internal(ctx, opts, on_text, on_audio, should_stop)
    )
175
+
176
async def close_session(self):
    """Close the persistent Live session; a no-op when the background loop does not exist."""
    if self._bg.loop:
        closing = self._close_session_internal()
        await self._run_on_owner(closing)
181
+
182
async def reset_session(
    self,
    ctx: Optional[CtxItem] = None,
    opts=None,
    on_text: Optional[Callable[[str], Awaitable[None]]] = None,
    on_audio: Optional[Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]]] = None,
    should_stop: Optional[Callable[[], bool]] = None,
):
    """
    Close and reopen the persistent Live session.

    Arguments left as None are passed through unchanged to
    ``_reset_session_internal``, which substitutes the previously stored values.
    """
    self._ensure_background_loop()
    restart = self._reset_session_internal(ctx, opts, on_text, on_audio, should_stop)
    await self._run_on_owner(restart)
195
+
196
async def shutdown(self):
    """
    Close the Live session on the background loop.

    Does nothing when the background loop does not exist. The loop itself is
    not stopped here.
    """
    if self._bg.loop:
        closing = self._close_session_internal()
        await self._run_on_owner(closing)
201
+
202
async def shutdown_and_stop(self):
    """Run ``shutdown()`` and then stop the background loop via ``stop_loop_sync()``."""
    # Order matters: the session is closed before the loop is stopped
    await self.shutdown()
    self.stop_loop_sync()
206
+
207
+ # -----------------------------
208
+ # Synchronous convenience calls
209
+ # -----------------------------
210
+
211
def close_session_sync(self, timeout: float = 5.0):
    """
    Blocking variant of closing the Live session.

    :param timeout: passed to the background loop's ``run_sync``
    """
    # Only dispatch when a background loop exists and is running
    if self._bg.loop and self._bg.loop.is_running():
        self._bg.run_sync(self._close_session_internal(), timeout=timeout)
216
+
217
def reset_session_sync(
    self,
    ctx: Optional[CtxItem] = None,
    opts=None,
    on_text: Optional[Callable[[str], Awaitable[None]]] = None,
    on_audio: Optional[Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]]] = None,
    should_stop: Optional[Callable[[], bool]] = None,
    timeout: float = 10.0,
):
    """
    Blocking variant of ``reset_session``: close and reopen the Live session.

    :param timeout: passed to the background loop's ``run_sync``
    """
    self._ensure_background_loop()
    restart = self._reset_session_internal(ctx, opts, on_text, on_audio, should_stop)
    self._bg.run_sync(restart, timeout=timeout)
231
+
232
def shutdown_sync(self, timeout: float = 5.0):
    """
    Blocking variant of ``shutdown``: close the Live session.

    Does nothing unless a background loop exists and is running.

    :param timeout: passed to the background loop's ``run_sync``
    """
    if self._bg.loop and self._bg.loop.is_running():
        self._bg.run_sync(self._close_session_internal(), timeout=timeout)
239
+
240
def stop_loop_sync(self, timeout: float = 2.0):
    """
    Ask the background loop wrapper to stop.

    :param timeout: forwarded to ``BackgroundLoop.stop``
    """
    background = self._bg
    background.stop(timeout=timeout)
245
+
246
+ # -----------------------------
247
+ # Tools helpers
248
+ # -----------------------------
249
+
250
+ def _update_last_opts_tools(self, tools: Optional[list], remote_tools: Optional[list]) -> None:
251
+ """
252
+ Update self._last_opts with tools/remote_tools if those attributes exist.
253
+ """
254
+ lo = self._last_opts
255
+ if not lo:
256
+ return
257
+ try:
258
+ if tools is not None and hasattr(lo, "tools"):
259
+ setattr(lo, "tools", tools)
260
+ except Exception:
261
+ pass
262
+ try:
263
+ if remote_tools is not None and hasattr(lo, "remote_tools"):
264
+ setattr(lo, "remote_tools", remote_tools)
265
+ except Exception:
266
+ pass
267
+
268
+ def _tools_signature(self, tools_list: list) -> str:
269
+ """
270
+ Build a stable signature string for the given tools list.
271
+ """
272
+ try:
273
+ return json.dumps(tools_list or [], ensure_ascii=False, sort_keys=True, separators=(",", ":"))
274
+ except Exception:
275
+ return str(tools_list or [])
276
+
277
+ # -----------------------------
278
+ # Internal: background loop/dispatch
279
+ # -----------------------------
280
+
281
+ def _ensure_background_loop(self):
282
+ """Ensure background event loop and thread are running."""
283
+ self._bg.ensure()
284
+
285
+ async def _run_on_owner(self, coro):
286
+ """
287
+ Run coroutine on the owner loop and await result.
288
+ """
289
+ return await self._bg.run(coro)
290
+
291
+ # -----------------------------
292
+ # Internal: session lifecycle
293
+ # -----------------------------
294
+
295
async def _open_session_internal(
    self,
    ctx: CtxItem,
    opts,
    on_text: Callable[[str], Awaitable[None]],
    on_audio: Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]],
    should_stop: Callable[[], bool] = lambda: False,
):
    """
    Open persistent Live session (if not already open).

    Builds the Live config (audio response modality, voice, output transcription,
    tools, system prompt, session resumption, turn mode), stores callbacks and
    per-session primitives, then connects.

    :param ctx: context item; its ``model`` is used to look up model data and as the
                default model id
    :param opts: options object; ``model``, ``voice``, ``tools``, ``remote_tools``,
                 ``system_prompt``, ``rt_session_id`` and ``auto_turn`` are read with ``getattr``
    :param on_text: async callback for text output
    :param on_audio: async callback for audio output
    :param should_stop: callable polled to check whether processing should stop
    :raises RuntimeError: when no Google GenAI client is available
    """
    if self._session is not None:
        if self.debug:
            print("[google.open_session] already open")
        return

    core = self.window.core
    ctx_model = ctx.model if ctx and getattr(ctx, "model", None) else None
    model_data = core.models.get(ctx_model) if ctx_model else None
    client = core.api.google.get_client(MODE_AUDIO, model_data)
    if not client:
        raise RuntimeError("Google GenAI client not configured")

    # Select Live-capable model
    model_id = getattr(opts, "model", None) or ctx_model or "gemini-live-2.5-flash-preview"
    voice = getattr(opts, "voice", None) or self._preferred_voice()

    # Compose tools for session
    session_tools = self._sanitize_tools(getattr(opts, "tools", None), getattr(opts, "remote_tools", None))

    # Live config: manual activity boundaries by default; turn mode is applied below
    live_cfg = {
        "response_modalities": ["AUDIO"],
        "speech_config": {"voice_config": {"prebuilt_voice_config": {"voice_name": voice}}},
        "output_audio_transcription": {},
        "realtime_input_config": {"automatic_activity_detection": {"disabled": True}},
    }
    if session_tools:
        live_cfg["tools"] = session_tools

    # Cache current tools signature
    self._cached_session_tools_sig = self._tools_signature(session_tools or [])

    sys_prompt = getattr(opts, "system_prompt", None)
    if sys_prompt:
        live_cfg["system_instruction"] = str(sys_prompt)

    # Session resumption: always enable it; resume only when a different,
    # non-empty string handle is given.
    try:
        provided_handle = getattr(opts, "rt_session_id", None)
        resume_handle = None
        if isinstance(provided_handle, str):
            ph = provided_handle.strip()
            if ph and ph != (self._rt_session_id or ""):
                resume_handle = ph

        live_cfg["session_resumption"] = gtypes.SessionResumptionConfig(handle=resume_handle)

        if resume_handle:
            self._rt_session_id = resume_handle
            # Persist on the ctx being opened; self._ctx is only assigned further
            # down and may still refer to a previous context at this point.
            set_ctx_rt_handle(ctx or self._ctx, resume_handle, self.window)
    except Exception as e:
        # Resumption is optional: continue without it, but report when debugging
        if self.debug:
            print(f"[google.open_session] session resumption setup failed: {e!r}")

    # Apply turn mode (auto/manual VAD)
    turn_mode = TurnMode.AUTO if bool(getattr(opts, "auto_turn", False)) else TurnMode.MANUAL
    apply_turn_mode_google(live_cfg, turn_mode)
    self._tune_google_vad(live_cfg, opts)

    # Save callbacks and ctx
    self._on_text = on_text
    self._on_audio = on_audio
    self._should_stop = should_stop or (lambda: False)
    self._ctx = ctx
    self._last_opts = opts

    # Control primitives
    self._response_done = asyncio.Event()
    self._send_lock = asyncio.Lock()
    self._turn_text_parts = []
    self._last_out_tr = ""
    self._last_tool_calls = []

    # Connect session
    self._session_cm = client.aio.live.connect(model=model_id, config=live_cfg)
    try:
        self._session = await self._session_cm.__aenter__()
    except Exception:
        # Do not keep a context manager that was never entered; a later close
        # would otherwise call __aexit__ on it.
        self._session_cm = None
        raise
    if self.debug:
        print("[google.open_session] live session connected")
382
+
383
+ async def _close_session_internal(self):
384
+ """Close persistent Live session (if open)."""
385
+ if self._turn_task and not self._turn_task.done():
386
+ try:
387
+ await asyncio.wait_for(self._turn_task, timeout=2.0)
388
+ except Exception:
389
+ pass
390
+ self._turn_task = None
391
+
392
+ if self._session_cm:
393
+ try:
394
+ await self._session_cm.__aexit__(None, None, None)
395
+ except Exception:
396
+ pass
397
+ self._session_cm = None
398
+ self._session = None
399
+
400
+ self._response_active = False
401
+ self._response_done = None
402
+ self._send_lock = None
403
+ self._turn_text_parts = []
404
+ self._last_out_tr = ""
405
+ self._audio_buf.clear()
406
+ self._saw_data_stream = False
407
+ self._rt_state = None
408
+ self._last_tool_calls = []
409
+
410
+ # Clear only in-memory handle; keep persisted ctx.extra["rt_session_id"]
411
+ self._rt_session_id = None
412
+
413
+ # Clear cached tools signature
414
+ self._cached_session_tools_sig = None
415
+
416
+ # Auto-turn flags
417
+ self._auto_audio_in_flight = False
418
+
419
+ if self.debug:
420
+ print("[google.close_session] closed")
421
+
422
async def _reset_session_internal(
    self,
    ctx: Optional[CtxItem] = None,
    opts=None,
    on_text: Optional[Callable[[str], Awaitable[None]]] = None,
    on_audio: Optional[Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]]] = None,
    should_stop: Optional[Callable[[], bool]] = None,
):
    """
    Close the Live session and open it again.

    Any falsy argument is replaced by the value stored from the previous session.

    :raises RuntimeError: when ctx, opts, on_text or on_audio is still missing
                          after falling back to the stored values
    """
    if not ctx:
        ctx = self._ctx
    if not opts:
        opts = self._last_opts
    if not on_text:
        on_text = self._on_text
    if not on_audio:
        on_audio = self._on_audio
    if not should_stop:
        should_stop = self._should_stop or (lambda: False)

    incomplete = not ctx or not opts or not on_text or not on_audio
    if incomplete:
        raise RuntimeError("reset_session requires previous or explicit ctx/opts/callbacks")

    await self._close_session_internal()
    await self._open_session_internal(ctx, opts, on_text, on_audio, should_stop)
444
+
445
+ # -----------------------------
446
+ # Internal: one "turn"
447
+ # -----------------------------
448
+
449
+ async def _send_turn_internal(
450
+ self,
451
+ prompt: Optional[str] = None,
452
+ audio_data: Optional[bytes] = None,
453
+ audio_format: Optional[str] = None,
454
+ audio_rate: Optional[int] = None,
455
+ wait_for_done: bool = True,
456
+ ):
457
+ """
458
+ Send one turn: prompt and/or audio, receive until turn complete or tool call.
+
+ If no session is open, it is reopened from the stored ctx/opts/callbacks.
+ A prompt equal to "..." is treated as empty; audio is normalized with to_pcm16_mono (target rate: self._IN_RATE).
+ When neither text nor audio remains, nothing is sent and the method returns.
+
+ :param prompt: user text (optional)
+ :param audio_data: user audio bytes (optional)
+ :param audio_format: format hint passed to to_pcm16_mono
+ :param audio_rate: sample rate hint passed to to_pcm16_mono
+ :param wait_for_done: if True, await the receive task started for this turn (its exceptions are ignored)
+ :raises RuntimeError: when no session is open and no stored ctx/opts exist to reopen it
459
+ """
460
+ if not self._session:
461
+ if self._ctx and self._last_opts:
462
+ await self._open_session_internal(self._ctx, self._last_opts, self._on_text, self._on_audio, self._should_stop)
463
+ else:
464
+ raise RuntimeError("Session not open. Call open_session(...) first.")
465
+
466
+ if self._send_lock is None:
467
+ self._send_lock = asyncio.Lock()
468
+
469
+ async with self._send_lock:
470
+ if self._response_active and self._response_done:
471
+ if self.debug:
472
+ print("[google.send_turn] waiting for previous response")
473
+ await self._response_done.wait()
474
+
475
+ # Reset per-turn collectors (text parts, transcription, audio buffer, extraction state, tool calls, auto-turn flag)
476
+ self._turn_text_parts = []
477
+ self._last_out_tr = ""
478
+ self._audio_buf.clear()
479
+ self._saw_data_stream = False
480
+ self._rt_reset_state()
481
+ self._last_tool_calls = []
482
+ self._auto_audio_in_flight = False
483
+
484
+ # Normalize prompt/audio first to choose a stable path (a literal "..." prompt counts as no text)
485
+ txt = str(prompt).strip() if prompt is not None else ""
486
+ if txt == "...":
487
+ txt = ""
488
+ parts_t = [gtypes.Part(text=txt)] if txt else []
489
+
490
+ pcm = b""
491
+ rate = self._IN_RATE
492
+ if audio_data:
493
+ pcm, rate = to_pcm16_mono(audio_data, audio_format, audio_rate, target_rate=self._IN_RATE)
494
+
495
+ has_text = bool(parts_t)
496
+ has_audio = bool(pcm)
497
+
498
+ # Branches: text-only, audio-only, text+audio, or nothing to send
499
+ if has_text and not has_audio:
500
+ # TEXT-ONLY -> single Content, turn_complete=True
501
+ await self._session.send_client_content(
502
+ turns=gtypes.Content(role="user", parts=parts_t),
503
+ turn_complete=True,
504
+ )
505
+ self._response_active = True
506
+ if self._response_done is None:
507
+ self._response_done = asyncio.Event()
508
+ else:
509
+ try:
510
+ self._response_done.clear()
511
+ except Exception:
512
+ self._response_done = asyncio.Event()
513
+ self._turn_task = asyncio.create_task(self._recv_one_turn(), name="google-live-turn")
514
+
515
+ elif has_audio and not has_text:
516
+ # AUDIO-ONLY
517
+ # If auto-turn is enabled, use auto-VAD path and flush with audio_stream_end.
518
+ # Otherwise, use manual ActivityStart/End boundaries.
519
+ use_auto = False
520
+ try:
521
+ use_auto = bool(getattr(self._last_opts, "auto_turn", False))
522
+ except Exception:
523
+ use_auto = False
524
+
525
+ self._response_active = True
526
+ if self._response_done is None:
527
+ self._response_done = asyncio.Event()
528
+ else:
529
+ try:
530
+ self._response_done.clear()
531
+ except Exception:
532
+ self._response_done = asyncio.Event()
533
+
534
+ # Start receiving before sending any audio
535
+ self._turn_task = asyncio.create_task(self._recv_one_turn(), name="google-live-turn")
536
+
537
+ if use_auto:
538
+ self._auto_audio_in_flight = True
539
+ # Auto-VAD: send a single audio blob and flush explicitly
540
+ try:
541
+ await self._session.send_realtime_input(
542
+ audio=gtypes.Blob(data=pcm, mime_type=f"audio/pcm;rate={int(rate)}")
543
+ )
544
+ await self._session.send_realtime_input(audio_stream_end=True)
545
+ self._emit_audio_commit_signal() # fire once for explicit flush
546
+ except Exception as e:
547
+ if self.debug:
548
+ print(f"[google.audio:auto] send failed: {e!r}")
549
+ raise
550
+ else:
551
+ # Manual activity: start -> audio -> end
552
+ await self._send_audio_realtime_manual(pcm, rate)
553
+
554
+ elif has_text and has_audio:
555
+ # TEXT + AUDIO in one user turn:
556
+ # Respect the configured mode: in manual mode keep ActivityStart/End,
557
+ # in auto-turn mode send text first and then treat audio as auto-VAD stream with explicit flush.
558
+ use_auto = False
559
+ try:
560
+ use_auto = bool(getattr(self._last_opts, "auto_turn", False))
561
+ except Exception:
562
+ use_auto = False
563
+
564
+ # 1) text opens the turn (turn_complete=False)
565
+ await self._session.send_client_content(
566
+ turns=gtypes.Content(role="user", parts=parts_t),
567
+ turn_complete=False,
568
+ )
569
+
570
+ self._response_active = True
571
+ if self._response_done is None:
572
+ self._response_done = asyncio.Event()
573
+ else:
574
+ try:
575
+ self._response_done.clear()
576
+ except Exception:
577
+ self._response_done = asyncio.Event()
578
+
579
+ # Start receiver, then send audio
580
+ self._turn_task = asyncio.create_task(self._recv_one_turn(), name="google-live-turn")
581
+
582
+ if use_auto:
583
+ self._auto_audio_in_flight = True
584
+ try:
585
+ await self._session.send_realtime_input(
586
+ audio=gtypes.Blob(data=pcm, mime_type=f"audio/pcm;rate={int(rate)}")
587
+ )
588
+ await self._session.send_realtime_input(audio_stream_end=True)
589
+ self._emit_audio_commit_signal() # fire once for explicit flush
590
+ except Exception as e:
591
+ if self.debug:
592
+ print(f"[google.audio:auto+text] send failed: {e!r}")
593
+ raise
594
+ else:
595
+ await self._send_audio_realtime_manual(pcm, rate)
596
+
597
+ else:
598
+ # nothing to send
599
+ return
600
+
601
+ if wait_for_done and self._turn_task:
602
+ try:
603
+ await self._turn_task
604
+ except Exception:
605
+ pass
606
+
607
+ async def _send_audio_realtime_manual(self, pcm: bytes, rate: int):
608
+ """
609
+ Manual turn boundaries: ActivityStart -> audio chunks -> ActivityEnd.
610
+ MIME must be audio/pcm;rate=RATE (no space).
611
+ """
612
+ if not pcm:
613
+ return
614
+ mime = f"audio/pcm;rate={int(rate)}"
615
+
616
+ # Activity start
617
+ try:
618
+ await self._session.send_realtime_input(activity_start=gtypes.ActivityStart())
619
+ if self.debug:
620
+ print("[google.audio] activityStart")
621
+ except Exception as e:
622
+ if self.debug:
623
+ print(f"[google.audio] activityStart failed: {e!r}")
624
+ raise
625
+
626
+ # ~100 ms chunks (for 16kHz -> 3200 bytes)
627
+ bytes_per_ms = int(rate * 2 / 1000) # 2 bytes per sample, mono
628
+ chunk = max(bytes_per_ms * 100, 3200)
629
+ for i in range(0, len(pcm), chunk):
630
+ part = pcm[i:i + chunk]
631
+ try:
632
+ await self._session.send_realtime_input(
633
+ audio=gtypes.Blob(data=part, mime_type=mime)
634
+ )
635
+ except Exception as e:
636
+ if self.debug:
637
+ print(f"[google.audio] payload send failed: {e!r}")
638
+ raise
639
+
640
+ # Activity end
641
+ try:
642
+ await self._session.send_realtime_input(activity_end=gtypes.ActivityEnd())
643
+ if self.debug:
644
+ print("[google.audio] activityEnd")
645
+ except Exception as e:
646
+ if self.debug:
647
+ print(f"[google.audio] activityEnd failed: {e!r}")
648
+ raise
649
+
650
+ # -----------------------------
651
+ # Internal: realtime audio input (auto-turn mode)
652
+ # -----------------------------
653
+
654
def rt_handle_audio_input_sync(self, event: RealtimeEvent, timeout: float = 0.5):
    """
    Blocking entrypoint for continuous microphone input when auto-turn is enabled.

    Events without audio data are dropped up front; otherwise the work is handed
    to the background loop. No exception is propagated to the caller.

    :param event: realtime event carrying the audio payload in ``data``
    :param timeout: passed to the background loop's ``run_sync``
    """
    # Cheap pre-check so empty events never reach the background loop
    try:
        body = getattr(event, "data", {}) or {}
        if isinstance(body, dict):
            nested = body.get("payload")
            if isinstance(nested, dict):
                body = nested
        if not (body and body.get("data")):
            return
    except Exception:
        return

    self._ensure_background_loop()
    try:
        self._bg.run_sync(self._rt_handle_audio_input_internal(event), timeout=timeout)
    except Exception:
        # Audio callbacks must never see an exception from here
        pass
675
+
676
async def _rt_handle_audio_input_internal(self, event: RealtimeEvent):
    """
    Owner-loop implementation: push live audio to Gemini Live in auto-turn mode.

    Does nothing when no session is open, when auto-turn is off, or when the event
    carries no usable audio. Malformed chunks are dropped without raising.

    :param event: realtime event whose ``data`` is the audio payload dict, or a dict
                  wrapping that payload under the "payload" key
    """
    if not self._session:
        return
    try:
        if not bool(getattr(self._last_opts, "auto_turn", False)):
            # Only handle here when auto-turn is on; manual mode uses ActivityStart/End path.
            return
    except Exception:
        return

    # Extract normalized payload
    payload = getattr(event, "data", {}) or {}
    if isinstance(payload, dict) and isinstance(payload.get("payload"), dict):
        payload = payload["payload"]
    if not isinstance(payload, dict):
        return

    data: bytes = payload.get("data") or b""
    if not data:
        return
    mime = str(payload.get("mime") or "audio/pcm")
    is_final = bool(payload.get("final", False))
    try:
        rate = int(payload.get("rate") or 0) or self._IN_RATE
    except (TypeError, ValueError):
        # An unparsable rate means the chunk cannot be interpreted; drop it
        return

    # Normalize to LINEAR16 mono at the input rate
    fmt_hint = "pcm16" if mime.startswith("audio/pcm") else None
    try:
        pcm, norm_rate = to_pcm16_mono(data, fmt_hint, rate, target_rate=self._IN_RATE)
    except Exception:
        return

    # Ensure a receiver for this auto-turn is running before sending audio
    self._ensure_auto_receiver_started()

    # Mark that auto-turn audio has been sent in this turn
    self._auto_audio_in_flight = True

    if self._send_lock is None:
        self._send_lock = asyncio.Lock()

    async with self._send_lock:
        try:
            await self._session.send_realtime_input(
                audio=gtypes.Blob(data=pcm, mime_type=f"audio/pcm;rate={int(norm_rate)}")
            )
        except Exception:
            return

        # If stream end is flagged, flush the input stream. This stays under the
        # send lock so it is ordered right after the chunk above, matching how
        # _commit_audio_input_internal sends its flush.
        if is_final:
            try:
                await self._session.send_realtime_input(audio_stream_end=True)
                self._emit_audio_commit_signal()  # fire once for explicit flush
            except Exception:
                pass
734
+
735
def commit_audio_input_sync(self, timeout: float = 0.5):
    """
    Blocking wrapper that flushes the input audio stream (auto-turn mode).

    Makes sure the background loop exists, then runs
    ``_commit_audio_input_internal()`` through ``self._bg.run_sync``.
    Any failure (including a timeout raised by ``run_sync``) is swallowed.

    :param timeout: seconds passed to ``run_sync``
    """
    self._ensure_background_loop()
    try:
        flush_job = self._commit_audio_input_internal()
        self._bg.run_sync(flush_job, timeout=timeout)
    except Exception:
        # Best-effort flush: never raise to caller
        return None
747
+
748
+ async def _commit_audio_input_internal(self):
749
+ """
750
+ Owner-loop implementation: in auto-turn mode flush server-side VAD buffer.
751
+ """
752
+ if not self._session:
753
+ return
754
+ try:
755
+ if not bool(getattr(self._last_opts, "auto_turn", False)):
756
+ return
757
+ except Exception:
758
+ return
759
+
760
+ # Ensure a receiver is running for this turn
761
+ self._ensure_auto_receiver_started()
762
+
763
+ if self._send_lock is None:
764
+ self._send_lock = asyncio.Lock()
765
+ async with self._send_lock:
766
+ try:
767
+ await self._session.send_realtime_input(audio_stream_end=True)
768
+ self._emit_audio_commit_signal() # fire once for explicit flush
769
+ except Exception:
770
+ pass
771
+
772
def force_response_now_sync(self, timeout: float = 5.0):
    """
    Blocking wrapper around ``_force_response_now_internal()``.

    Ensures the background loop exists and runs the coroutine through
    ``self._bg.run_sync``. Errors are not propagated to the caller.

    :param timeout: seconds passed to ``run_sync``
    """
    self._ensure_background_loop()
    try:
        force_job = self._force_response_now_internal()
        self._bg.run_sync(force_job, timeout=timeout)
    except Exception:
        # Defensive: do not propagate errors to caller
        return None
783
+
784
+ async def _force_response_now_internal(self):
785
+ """
786
+ Owner-loop: in auto-turn mode, flush current audio buffer and guarantee that a receive task
787
+ for the current model turn is running. No-op in manual mode.
788
+ """
789
+ if not self._session:
790
+ return
791
+ try:
792
+ if not bool(getattr(self._last_opts, "auto_turn", False)):
793
+ return
794
+ except Exception:
795
+ return
796
+
797
+ # Ensure a receiver is running for this turn
798
+ self._ensure_auto_receiver_started()
799
+
800
+ # Flush server-side buffer to force the model to respond
801
+ if self._send_lock is None:
802
+ self._send_lock = asyncio.Lock()
803
+ async with self._send_lock:
804
+ try:
805
+ await self._session.send_realtime_input(audio_stream_end=True)
806
+ self._emit_audio_commit_signal() # fire once for explicit flush
807
+ except Exception:
808
+ pass
809
+
810
async def _recv_one_turn(self):
    """
    Receive one turn until serverContent.turnComplete or toolCall.

    Iterates ``self._session.receive()`` and, per message, handles in order:
    session resumption updates, usage metadata, raw audio in ``response.data``,
    ``server_content`` (output/input transcription, model turn parts, citations,
    turn-complete flag) and a dedicated ``tool_call`` message.

    On exit (normal, cancelled or failed) the ``finally`` block persists the
    collected text, token usage, citations, images and tool calls into
    ``self._ctx``, sets ``self._response_done``, emits ``RT_OUTPUT_TURN_END``
    and resets the per-turn state.

    Note: ``asyncio.CancelledError`` is caught here and not re-raised.
    """
    if self.debug:
        print("[google._recv_one_turn] start")

    # Set when the turn should stop: turn_complete flag, or any function/tool call
    turn_finished = False

    try:
        async for response in self._session.receive():
            # 0) Session resumption updates (store last resumable handle)
            try:
                sru = getattr(response, "session_resumption_update", None) or getattr(response, "sessionResumptionUpdate", None)
                if sru:
                    resumable = bool(getattr(sru, "resumable", None))
                    new_handle = getattr(sru, "new_handle", None) or getattr(sru, "newHandle", None)
                    if resumable and isinstance(new_handle, str) and new_handle.strip():
                        self._rt_session_id = new_handle.strip()
                        set_ctx_rt_handle(self._ctx, self._rt_session_id, self.window)
                        if self.debug:
                            print(f"[google.live] session handle updated: {self._rt_session_id}")
            except Exception:
                pass

            # 1) Usage (top-level)
            try:
                um = getattr(response, "usage_metadata", None) or getattr(response, "usageMetadata", None)
                if um:
                    self._rt_capture_google_usage(um)
            except Exception:
                pass

            # 2) Preferred audio source: response.data (PCM16@24kHz)
            data = getattr(response, "data", None)
            if isinstance(data, (bytes, bytearray)):
                # First output from model -> emit commit once (auto-turn only)
                self._maybe_emit_auto_commit()
                self._saw_data_stream = True
                await self._audio_push(bytes(data), final=False)

            # 3) Server content
            sc = getattr(response, "server_content", None) or getattr(response, "serverContent", None)
            if sc:
                # Any serverContent reaching here implies the model started processing;
                # emit commit once if not yet emitted (auto-turn only).
                self._maybe_emit_auto_commit()

                # Output transcription (often cumulative)
                out_tr = getattr(sc, "output_transcription", None) or getattr(sc, "outputTranscription", None)
                if out_tr and getattr(out_tr, "text", None) and self._on_text:
                    full = str(out_tr.text)
                    # If the new text extends the previous one, forward only the new suffix;
                    # otherwise treat the whole text as the delta.
                    delta = full[len(self._last_out_tr):] if full.startswith(self._last_out_tr) else full
                    self._last_out_tr = full
                    # Whitespace-only deltas are not forwarded
                    if delta.strip():
                        self._turn_text_parts.append(delta)
                        try:
                            await self._on_text(delta)
                        except Exception:
                            pass

                # Optional: input transcription (handy in manual mode)
                in_tr = getattr(sc, "input_transcription", None) or getattr(sc, "inputTranscription", None)
                if in_tr and getattr(in_tr, "text", None) and self.debug:
                    print("[google.input_tr]", in_tr.text)

                # Model turn parts
                model_turn = getattr(sc, "model_turn", None) or getattr(sc, "modelTurn", None)
                if model_turn:
                    parts = getattr(model_turn, "parts", None) or []
                    for p in parts:
                        # Function call parts (parts may be objects or plain dicts)
                        fc = getattr(p, "function_call", None) or (p.get("function_call") if isinstance(p, dict) else None)
                        if fc:
                            name = getattr(fc, "name", None) or (fc.get("name") if isinstance(fc, dict) else "")
                            args_obj = getattr(fc, "args", None) or (fc.get("args") if isinstance(fc, dict) else {})
                            args_dict = self._to_plain_dict(args_obj) or {}
                            # Arguments are stored as a JSON string
                            self._rt_state["tool_calls"].append({
                                "id": getattr(fc, "id", None) or "",
                                "type": "function",
                                "function": {
                                    "name": name or "",
                                    "arguments": json.dumps(args_dict, ensure_ascii=False),
                                }
                            })
                            self._rt_state["force_func_call"] = True
                            self._last_tool_calls = list(self._rt_state["tool_calls"])
                            turn_finished = True  # let the app run tools now

                        # Text part
                        txt = getattr(p, "text", None) or (p.get("text") if isinstance(p, dict) else None)
                        if txt and self._on_text:
                            s = str(txt)
                            self._turn_text_parts.append(s)
                            try:
                                await self._on_text(s)
                            except Exception:
                                pass

                        # Code execution parts
                        ex = getattr(p, "executable_code", None) or (p.get("executable_code") if isinstance(p, dict) else None)
                        if ex:
                            lang = (getattr(ex, "language", None) or "python").strip() or "python"
                            code_txt = (
                                getattr(ex, "code", None) or
                                getattr(ex, "program", None) or
                                getattr(ex, "source", None) or
                                ""
                            )
                            if not self._rt_state["is_code"]:
                                # First code chunk: open a fenced block with a header
                                hdr = f"\n\n**Code interpreter**\n```{lang.lower()}\n"
                                self._turn_text_parts.append(hdr + str(code_txt))
                                try:
                                    if self._on_text:
                                        await self._on_text(hdr + str(code_txt))
                                except Exception:
                                    pass
                                self._rt_state["is_code"] = True
                            else:
                                # Already inside a code block: append raw code
                                self._turn_text_parts.append(str(code_txt))
                                try:
                                    if self._on_text:
                                        await self._on_text(str(code_txt))
                                except Exception:
                                    pass

                        # Execution result closes the open code block
                        cer = getattr(p, "code_execution_result", None) or (p.get("code_execution_result") if isinstance(p, dict) else None)
                        if cer and self._rt_state["is_code"]:
                            tail = "\n\n```\n-----------\n"
                            self._turn_text_parts.append(tail)
                            try:
                                if self._on_text:
                                    await self._on_text(tail)
                            except Exception:
                                pass
                            self._rt_state["is_code"] = False

                        # Inline images
                        inline = getattr(p, "inline_data", None) or (p.get("inline_data") if isinstance(p, dict) else None)
                        if inline:
                            mime = (getattr(inline, "mime_type", "") or (inline.get("mime_type") if isinstance(inline, dict) else "") or "").lower()
                            if mime.startswith("image/"):
                                pdata = getattr(inline, "data", None) if not isinstance(inline, dict) else inline.get("data")
                                try:
                                    img_bytes = None
                                    if isinstance(pdata, (bytes, bytearray)):
                                        img_bytes = bytes(pdata)
                                    elif isinstance(pdata, str):
                                        # String payloads are treated as base64
                                        img_bytes = base64.b64decode(pdata)
                                    if img_bytes:
                                        save_path = self.window.core.image.gen_unique_path(self._ctx)
                                        with open(save_path, "wb") as f:
                                            f.write(img_bytes)
                                        self._rt_state["image_paths"].append(save_path)
                                        if not isinstance(self._ctx.images, list):
                                            self._ctx.images = []
                                        if save_path not in self._ctx.images:
                                            self._ctx.images.append(save_path)
                                except Exception:
                                    pass

                # Citations (grounding)
                try:
                    self._collect_google_citations_from_server_content(sc)
                except Exception:
                    pass

                # Turn complete signal
                try:
                    if bool(getattr(sc, "turn_complete", None) or getattr(sc, "turnComplete", None)):
                        turn_finished = True
                except Exception:
                    pass

            # 4) Dedicated toolCall message
            tc = getattr(response, "tool_call", None) or getattr(response, "toolCall", None)
            if tc:
                self._maybe_emit_auto_commit()  # ensure commit signaled before handing off to tools
                fcs = getattr(tc, "function_calls", None) or getattr(tc, "functionCalls", None) or []
                new_calls = []
                for fc in fcs:
                    name = getattr(fc, "name", "") or ""
                    args_obj = getattr(fc, "args", {}) or {}
                    args_dict = self._to_plain_dict(args_obj) or {}
                    new_calls.append({
                        "id": getattr(fc, "id", "") or "",
                        "type": "function",
                        "function": {
                            "name": name,
                            "arguments": json.dumps(args_dict, ensure_ascii=False),
                        }
                    })
                if new_calls:
                    # De-duplicate against already collected calls by (name, arguments)
                    seen = {(tc["function"]["name"], tc["function"]["arguments"]) for tc in self._rt_state["tool_calls"]}
                    for c in new_calls:
                        key = (c["function"]["name"], c["function"]["arguments"])
                        if key not in seen:
                            self._rt_state["tool_calls"].append(c)
                            seen.add(key)
                    self._rt_state["force_func_call"] = True
                    self._last_tool_calls = list(self._rt_state["tool_calls"])
                    turn_finished = True

            if turn_finished:
                break

        # Flush jitter buffer
        await self._audio_push(b"", final=True)

    except asyncio.CancelledError:
        # Cancellation: still flush buffered audio; the error is not re-raised
        try:
            await self._audio_push(b"", final=True)
        except Exception:
            pass
    except Exception as e:
        if self.debug:
            print(f"[google._recv_one_turn] exception: {e!r}")
        try:
            await self._audio_push(b"", final=True)
        except Exception:
            pass
    finally:
        # Persist textual output
        try:
            if self.window and self.window.core and self._ctx:
                txt = coalesce_text(self._turn_text_parts)
                if has_unclosed_code_tag(txt):
                    txt += "\n```"
                if txt:
                    self._ctx.output = txt
                # Tokens usage
                up = (self._rt_state or {}).get("usage_payload") or {}
                if up:
                    in_tok = up.get("in")
                    out_tok = up.get("out")
                    # Missing input count falls back to ctx.input_tokens, missing output to 0
                    self._ctx.set_tokens(in_tok if in_tok is not None else (self._ctx.input_tokens or 0),
                                         out_tok if out_tok is not None else 0)
                    try:
                        if not isinstance(self._ctx.extra, dict):
                            self._ctx.extra = {}
                        self._ctx.extra["usage"] = {
                            "vendor": "google",
                            "input_tokens": in_tok,
                            "output_tokens": out_tok,
                            "reasoning_tokens": up.get("reasoning", 0),
                            "total_reported": up.get("total"),
                        }
                    except Exception:
                        pass

                # Citations to ctx.urls
                cites = (self._rt_state or {}).get("citations") or []
                if cites:
                    if self._ctx.urls is None:
                        self._ctx.urls = []
                    for u in cites:
                        if u not in self._ctx.urls:
                            self._ctx.urls.append(u)

                # Images to ctx.images
                imgs = (self._rt_state or {}).get("image_paths") or []
                if imgs:
                    if not isinstance(self._ctx.images, list):
                        self._ctx.images = []
                    for p in imgs:
                        if p not in self._ctx.images:
                            self._ctx.images.append(p)

                # Unpack tool calls
                tcs = (self._rt_state or {}).get("tool_calls") or []
                if tcs:
                    for tc in tcs:
                        fn = tc.get("function") or {}
                        # Make sure arguments are a JSON string before unpacking
                        if isinstance(fn.get("arguments"), dict):
                            fn["arguments"] = json.dumps(fn["arguments"], ensure_ascii=False)
                    self._ctx.force_call = bool((self._rt_state or {}).get("force_func_call"))
                    self.window.core.debug.info("[google.live] Tool calls found, unpacking...")
                    self.window.core.command.unpack_tool_calls_chunks(self._ctx, tcs)

                self.window.core.ctx.update_item(self._ctx)
        except Exception:
            pass

        # Mark done for waiters
        self._response_active = False
        if self._response_done:
            try:
                self._response_done.set()
            except Exception:
                pass

        # Emit end-of-turn event for audio pipeline symmetry with OpenAI
        try:
            if self._last_opts and hasattr(self._last_opts, "rt_signals"):
                self._last_opts.rt_signals.response.emit(RealtimeEvent(RealtimeEvent.RT_OUTPUT_TURN_END, {
                    "ctx": self._ctx,
                }))
        except Exception:
            pass

        # Reset per-turn state
        self._rt_state = None
        self._auto_audio_in_flight = False

        if self.debug:
            print("[google._recv_one_turn] done")
1114
+
1115
+ # -----------------------------
1116
+ # Public: live tools update
1117
+ # -----------------------------
1118
+
1119
async def update_session_tools(
        self,
        tools: Optional[list] = None,
        remote_tools: Optional[list] = None,
        force: bool = False,
):
    """
    Update session tools for Google Live.

    Ensures the background loop exists, then awaits
    ``_update_session_tools_internal(tools, remote_tools, force)`` through
    ``self._run_on_owner`` and returns its result.

    :param tools: new tools list, or None to keep the ones from the last options
    :param remote_tools: new remote tools list, or None to keep the previous ones
    :param force: forwarded to the internal implementation
    """
    self._ensure_background_loop()
    job = self._update_session_tools_internal(tools, remote_tools, force)
    return await self._run_on_owner(job)
1135
+
1136
def update_session_tools_sync(
        self,
        tools: Optional[list] = None,
        remote_tools: Optional[list] = None,
        force: bool = False,
        timeout: float = 10.0,
):
    """
    Blocking variant of update_session_tools().

    Runs ``_update_session_tools_internal(tools, remote_tools, force)`` through
    ``self._bg.run_sync`` and returns whatever ``run_sync`` returns.
    Exceptions raised by ``run_sync`` propagate to the caller.

    :param timeout: seconds passed to ``run_sync``
    """
    self._ensure_background_loop()
    job = self._update_session_tools_internal(tools, remote_tools, force)
    return self._bg.run_sync(job, timeout=timeout)
1149
+
1150
async def _update_session_tools_internal(
        self,
        tools: Optional[list],
        remote_tools: Optional[list],
        force: bool,
):
    """
    Owner-loop implementation for tools update on Google Live.

    Strategy:
    - Sanitize and compute signature of the requested tools set.
    - If session is closed: update last opts and clear local cache.
    - If session is open and tools changed (or force=True):
        * Wait for any active response to finish.
        * Restart the Live session and request resumption using the last known handle.

    :param tools: explicit tools list; None falls back to ``self._last_opts.tools``
    :param remote_tools: explicit remote tools; None falls back to ``self._last_opts.remote_tools``
    :param force: restart even when the computed signature equals the cached one
    """
    # Prepare target tools (prefer explicit args, fallback to last opts)
    try:
        target_tools_raw = tools if tools is not None else getattr(self._last_opts, "tools", None)
    except Exception:
        target_tools_raw = None
    try:
        target_remote_raw = remote_tools if remote_tools is not None else getattr(self._last_opts, "remote_tools", None)
    except Exception:
        target_remote_raw = None

    session_tools = self._sanitize_tools(target_tools_raw, target_remote_raw)
    new_sig = self._tools_signature(session_tools or [])

    # If session is not open, just cache for next open
    if not self._session:
        self._update_last_opts_tools(tools, remote_tools)
        # Drop the cached signature; a later call with an open session will not be skipped as "unchanged"
        self._cached_session_tools_sig = None
        if self.debug:
            print("[google.update_session_tools] session not open; cached for next open")
        return

    # Skip if unchanged
    if not force and self._cached_session_tools_sig == new_sig:
        self._update_last_opts_tools(tools, remote_tools)
        if self.debug:
            print("[google.update_session_tools] no changes; skipping restart")
        return

    # Ensure previous response is finished
    if self._send_lock is None:
        self._send_lock = asyncio.Lock()
    async with self._send_lock:
        if self._response_active and self._response_done:
            if self.debug:
                print("[google.update_session_tools] waiting for active response to finish")
            try:
                await self._response_done.wait()
            except Exception:
                pass

    # Persist new tools into last opts
    self._update_last_opts_tools(tools, remote_tools)

    # Try to resume the session state after restart if possible
    prev_handle = self._rt_session_id

    # Inject resumption handle into opts for the next open
    try:
        if self._last_opts is not None and prev_handle:
            setattr(self._last_opts, "rt_session_id", prev_handle)
    except Exception:
        pass

    if self.debug:
        print("[google.update_session_tools] restarting session to apply new tools")

    # Restart session with updated opts and best-effort resume;
    # the current ctx and callbacks are passed through unchanged
    await self._reset_session_internal(
        ctx=self._ctx,
        opts=self._last_opts,
        on_text=self._on_text,
        on_audio=self._on_audio,
        should_stop=self._should_stop,
    )

    # Cache new signature to suppress redundant restarts
    self._cached_session_tools_sig = new_sig

    if self.debug:
        print(f"[google.update_session_tools] session restarted; tools={len(session_tools)}")
1236
+
1237
+ # -----------------------------
1238
+ # Public: send tool results back to the model
1239
+ # -----------------------------
1240
+
1241
async def send_tool_results(
        self,
        results,
        continue_turn: bool = True,
        wait_for_done: bool = True,
):
    """
    Send tool results back to the Live session.

    Ensures the background loop exists, then awaits
    ``_send_tool_results_internal(results, continue_turn, wait_for_done)``
    through ``self._run_on_owner`` and returns its result.

    :param results: tool results, forwarded unchanged
    :param continue_turn: forwarded to the internal implementation
    :param wait_for_done: forwarded to the internal implementation
    """
    self._ensure_background_loop()
    job = self._send_tool_results_internal(results, continue_turn, wait_for_done)
    return await self._run_on_owner(job)
1254
+
1255
def send_tool_results_sync(
        self,
        results,
        continue_turn: bool = True,
        wait_for_done: bool = True,
        timeout: float = 20.0,
):
    """
    Blocking variant of send_tool_results().

    Runs ``_send_tool_results_internal(results, continue_turn, wait_for_done)``
    through ``self._bg.run_sync`` and returns its result. Exceptions raised by
    ``run_sync`` propagate to the caller.

    :param timeout: seconds passed to ``run_sync``
    """
    self._ensure_background_loop()
    job = self._send_tool_results_internal(results, continue_turn, wait_for_done)
    return self._bg.run_sync(job, timeout=timeout)
1270
+
1271
async def _send_tool_results_internal(
        self,
        results,
        continue_turn: bool,
        wait_for_done: bool,
):
    """
    Internal implementation of send_tool_results.

    Converts ``results`` into ``gtypes.FunctionResponse`` objects (matched
    against ``self._last_tool_calls``), sends them under the send lock and,
    when ``continue_turn`` is set, starts a follow-up ``_recv_one_turn`` task.

    :param results: tool results passed to ``build_function_responses_payload``
    :param continue_turn: reset per-turn state and start receiving the follow-up turn
    :param wait_for_done: when continuing, await the follow-up task before returning
    :raises RuntimeError: if the session is not open, the payload cannot be built,
                          or ``send_tool_response`` fails
    """
    if not self._session:
        raise RuntimeError("Live session is not open")

    # Build neutral list and convert to gtypes.FunctionResponse[]
    try:
        neutral = build_function_responses_payload(results, self._last_tool_calls)
    except Exception as e:
        raise RuntimeError(f"Invalid tool results payload: {e}") from e

    # Nothing to send
    if not neutral:
        return

    fn_responses = [
        gtypes.FunctionResponse(id=e.get("id") or "", name=e.get("name") or "", response=e.get("response") or {})
        for e in neutral
    ]

    if self._send_lock is None:
        self._send_lock = asyncio.Lock()
    async with self._send_lock:
        try:
            await self._session.send_tool_response(function_responses=fn_responses)
        except Exception as e:
            raise RuntimeError(f"send_tool_response failed: {e}") from e

    if continue_turn:
        # Clear per-turn accumulators before receiving the follow-up turn
        self._turn_text_parts = []
        self._last_out_tr = ""
        self._audio_buf.clear()
        self._saw_data_stream = False
        self._rt_reset_state()

        self._response_active = True
        if self._response_done is None:
            self._response_done = asyncio.Event()
        else:
            try:
                self._response_done.clear()
            except Exception:
                # Could not clear the existing event; replace it with a fresh one
                self._response_done = asyncio.Event()

        self._turn_task = asyncio.create_task(self._recv_one_turn(), name="google-live-turn-followup")

        if wait_for_done:
            try:
                await self._turn_task
            except Exception:
                pass
1328
+
1329
+ # -----------------------------
1330
+ # Helpers
1331
+ # -----------------------------
1332
+
1333
+ def _preferred_voice(self) -> str:
1334
+ """
1335
+ Get preferred TTS voice from options or default.
1336
+ """
1337
+ try:
1338
+ v = self.window.core.plugins.get_option("audio_output", "google_genai_tts_voice")
1339
+ if v:
1340
+ mapping = {"kore": "Kore", "puck": "Puck", "charon": "Charon", "verse": "Verse", "legend": "Legend"}
1341
+ return mapping.get(str(v).strip().lower(), str(v))
1342
+ except Exception:
1343
+ pass
1344
+ return "Kore"
1345
+
1346
+ async def _audio_push(self, data: bytes, final: bool = False):
1347
+ """
1348
+ Push audio data to the output callback in ~100 ms chunks.
1349
+ """
1350
+ if not self._on_audio:
1351
+ return
1352
+ if data:
1353
+ self._audio_buf.extend(data)
1354
+ threshold = self._OUT_BYTES_PER_MS * self._OUT_CHUNK_MS
1355
+ while len(self._audio_buf) >= threshold:
1356
+ chunk = self._audio_buf[:threshold]
1357
+ del self._audio_buf[:threshold]
1358
+ try:
1359
+ await self._on_audio(bytes(chunk), "audio/pcm", self._OUT_RATE, 1, False)
1360
+ except Exception:
1361
+ pass
1362
+ if final:
1363
+ if self._audio_buf:
1364
+ try:
1365
+ await self._on_audio(bytes(self._audio_buf), "audio/pcm", self._OUT_RATE, 1, False)
1366
+ except Exception:
1367
+ pass
1368
+ self._audio_buf.clear()
1369
+ try:
1370
+ await self._on_audio(b"", "audio/pcm", self._OUT_RATE, 1, True)
1371
+ except Exception:
1372
+ pass
1373
+
1374
+ def _to_plain_dict(self, obj: Any) -> Any:
1375
+ """
1376
+ Convert various objects (pydantic, dataclass, etc) to plain dict recursively.
1377
+ """
1378
+ try:
1379
+ if hasattr(obj, "to_json_dict"):
1380
+ return obj.to_json_dict()
1381
+ if hasattr(obj, "model_dump"):
1382
+ return obj.model_dump()
1383
+ if hasattr(obj, "to_dict"):
1384
+ return obj.to_dict()
1385
+ except Exception:
1386
+ pass
1387
+ if isinstance(obj, dict):
1388
+ return {k: self._to_plain_dict(v) for k, v in obj.items()}
1389
+ if isinstance(obj, (list, tuple)):
1390
+ return [self._to_plain_dict(x) for x in obj]
1391
+ return obj
1392
+
1393
+ def _rt_reset_state(self):
1394
+ """Reset per-turn realtime state."""
1395
+ self._rt_state = {
1396
+ "tool_calls": [],
1397
+ "citations": [],
1398
+ "files": [],
1399
+ "image_paths": [],
1400
+ "is_image": False,
1401
+ "is_code": False,
1402
+ "force_func_call": False,
1403
+ "usage_payload": {},
1404
+ "auto_commit_signaled": False,
1405
+ }
1406
+
1407
+ def _rt_capture_google_usage(self, um_obj: Any):
1408
+ """
1409
+ Capture Google GenAI token usage from usage_metadata object.
1410
+ """
1411
+ if not um_obj or self._rt_state is None:
1412
+ return
1413
+
1414
+ def as_int(v):
1415
+ try:
1416
+ if v is None:
1417
+ return None
1418
+ return int(v)
1419
+ except Exception:
1420
+ try:
1421
+ return int(float(v))
1422
+ except Exception:
1423
+ return None
1424
+
1425
+ prompt = (getattr(um_obj, "prompt_token_count", None)
1426
+ or getattr(um_obj, "promptTokenCount", None)
1427
+ or getattr(um_obj, "prompt_tokens", None)
1428
+ or None)
1429
+ total = (getattr(um_obj, "total_token_count", None)
1430
+ or getattr(um_obj, "totalTokenCount", None)
1431
+ or getattr(um_obj, "total_tokens", None)
1432
+ or None)
1433
+ candidates = (getattr(um_obj, "candidates_token_count", None)
1434
+ or getattr(um_obj, "candidatesTokenCount", None)
1435
+ or getattr(um_obj, "output_tokens", None)
1436
+ or None)
1437
+ reasoning = (getattr(um_obj, "candidates_reasoning_token_count", None)
1438
+ or getattr(um_obj, "candidatesReasoningTokenCount", None)
1439
+ or getattr(um_obj, "reasoning_tokens", None)
1440
+ or 0)
1441
+ p = as_int(prompt)
1442
+ t = as_int(total)
1443
+ c = as_int(candidates)
1444
+ r = as_int(reasoning) or 0
1445
+ out_total = max(0, (t or 0) - (p or 0)) if (t is not None and p is not None) else c
1446
+ self._rt_state["usage_payload"] = {"in": p, "out": out_total, "reasoning": r, "total": t}
1447
+
1448
+ def _collect_google_citations_from_server_content(self, sc: Any):
1449
+ """
1450
+ Collect citations (URLs) from server_content grounding metadata and add to rt_state and ctx.urls.
1451
+ """
1452
+ if self._rt_state is None:
1453
+ return
1454
+
1455
+ def add_url(url: Optional[str]):
1456
+ if not url or not isinstance(url, str):
1457
+ return
1458
+ u = url.strip()
1459
+ if not (u.startswith("http://") or u.startswith("https://")):
1460
+ return
1461
+ if u not in self._rt_state["citations"]:
1462
+ self._rt_state["citations"].append(u)
1463
+ try:
1464
+ if self._ctx:
1465
+ if self._ctx.urls is None:
1466
+ self._ctx.urls = []
1467
+ if u not in self._ctx.urls:
1468
+ self._ctx.urls.append(u)
1469
+ except Exception:
1470
+ pass
1471
+
1472
+ gm = getattr(sc, "grounding_metadata", None) or getattr(sc, "groundingMetadata", None)
1473
+ if gm:
1474
+ atts = getattr(gm, "grounding_attributions", None) or getattr(gm, "groundingAttributions", None) or []
1475
+ try:
1476
+ for att in atts or []:
1477
+ for path in (
1478
+ "web.uri", "web.url", "source.web.uri", "source.web.url",
1479
+ "source.uri", "source.url", "uri", "url",
1480
+ ):
1481
+ add_url(self._safe_get(att, path))
1482
+ except Exception:
1483
+ pass
1484
+ for path in (
1485
+ "search_entry_point.uri", "search_entry_point.url",
1486
+ "searchEntryPoint.uri", "searchEntryPoint.url",
1487
+ "search_entry_point.rendered_content_uri", "searchEntryPoint.rendered_content_uri",
1488
+ ):
1489
+ add_url(self._safe_get(gm, path))
1490
+
1491
+ try:
1492
+ mt = getattr(sc, "model_turn", None) or getattr(sc, "modelTurn", None)
1493
+ parts = getattr(mt, "parts", None) or []
1494
+ for p in parts:
1495
+ pcm = self._safe_get(p, "citation_metadata") or self._safe_get(p, "citationMetadata")
1496
+ if pcm:
1497
+ arr = (self._safe_get(pcm, "citation_sources")
1498
+ or self._safe_get(pcm, "citationSources")
1499
+ or self._safe_get(pcm, "citations") or []
1500
+ )
1501
+ for cit in arr or []:
1502
+ for path in ("uri", "url", "source.uri", "source.url", "web.uri", "web.url"):
1503
+ add_url(self._safe_get(cit, path))
1504
+ gpa = self._safe_get(p, "grounding_attributions") or self._safe_get(p, "groundingAttributions") or []
1505
+ for att in gpa or []:
1506
+ for path in ("web.uri", "web.url", "source.web.uri", "source.web.url", "uri", "url"):
1507
+ add_url(self._safe_get(att, path))
1508
+ except Exception:
1509
+ pass
1510
+
1511
+ def _safe_get(self, obj, path: str) -> Any:
1512
+ """
1513
+ Safely get a nested attribute or dict key by dot-separated path.
1514
+ """
1515
+ cur = obj
1516
+ for seg in path.split("."):
1517
+ if cur is None:
1518
+ return None
1519
+ if isinstance(cur, dict):
1520
+ cur = cur.get(seg)
1521
+ else:
1522
+ if seg.isdigit() and isinstance(cur, (list, tuple)):
1523
+ idx = int(seg)
1524
+ if 0 <= idx < len(cur):
1525
+ cur = cur[idx]
1526
+ else:
1527
+ return None
1528
+ else:
1529
+ cur = getattr(cur, seg, None)
1530
+ return cur
1531
+
1532
+ # -------- tools sanitizer for Live config (dict-only, robust) --------
1533
+
1534
+ def _sanitize_tools(self, tools: Any, remote_tools: Optional[list] = None) -> list:
1535
+ """
1536
+ Normalize opts.tools into Live API config.tools (list of dicts).
1537
+ Supports gtypes.Tool, dict, or mixed list.
1538
+ """
1539
+ out: list = []
1540
+ sigset: set[str] = set()
1541
+
1542
+ def add(entry: dict):
1543
+ try:
1544
+ sig = json.dumps(entry, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
1545
+ except Exception:
1546
+ sig = str(entry)
1547
+ if sig not in sigset:
1548
+ out.append(entry)
1549
+ sigset.add(sig)
1550
+
1551
+ def handle_tool_obj(t):
1552
+ # Convert gtypes.Tool -> dict
1553
+ fd_list = []
1554
+ fds = getattr(t, "function_declarations", None) or getattr(t, "functionDeclarations", None) or []
1555
+ for fd in fds or []:
1556
+ fd_dict = self._fd_to_dict(fd)
1557
+ if fd_dict:
1558
+ fd_list.append(fd_dict)
1559
+ if fd_list:
1560
+ add({"function_declarations": fd_list})
1561
+
1562
+ # built-ins
1563
+ if getattr(t, "code_execution", None) or getattr(t, "codeExecution", None):
1564
+ add({"code_execution": {}})
1565
+ if getattr(t, "google_search", None) or getattr(t, "googleSearch", None):
1566
+ add({"google_search": {}})
1567
+ if getattr(t, "url_context", None) or getattr(t, "urlContext", None):
1568
+ add({"url_context": {}})
1569
+
1570
+ def handle_tool_dict(d: dict):
1571
+ fds = d.get("function_declarations") or d.get("functionDeclarations")
1572
+ if isinstance(fds, list) and fds:
1573
+ fd_list = []
1574
+ for fd in fds:
1575
+ fd_dict = self._fd_to_dict(fd)
1576
+ if fd_dict:
1577
+ fd_list.append(fd_dict)
1578
+ if fd_list:
1579
+ add({"function_declarations": fd_list})
1580
+
1581
+ if (d.get("type") or "").lower() == "function":
1582
+ fn = d.get("function") if isinstance(d.get("function"), dict) else d
1583
+ name = fn.get("name")
1584
+ if name:
1585
+ fd = {"name": str(name)}
1586
+ if fn.get("description"):
1587
+ fd["description"] = fn["description"]
1588
+ params = fn.get("parameters")
1589
+ fd["parameters"] = self._schema_to_plain(params if params is not None else {"type": "OBJECT"})
1590
+ add({"function_declarations": [fd]})
1591
+
1592
+ for k in ("google_search", "code_execution", "url_context"):
1593
+ if k in d and isinstance(d[k], dict):
1594
+ add({k: dict(d[k])})
1595
+ elif k in d and d[k] is True:
1596
+ add({k: {}})
1597
+
1598
+ if isinstance(tools, (list, tuple)):
1599
+ for t in tools:
1600
+ if t is None:
1601
+ continue
1602
+ if t.__class__.__name__ == "Tool" or isinstance(t, getattr(gtypes, "Tool", ())):
1603
+ handle_tool_obj(t)
1604
+ elif isinstance(t, dict):
1605
+ handle_tool_dict(t)
1606
+
1607
+ if isinstance(remote_tools, (list, tuple)):
1608
+ for t in remote_tools:
1609
+ if isinstance(t, dict):
1610
+ handle_tool_dict(t)
1611
+ elif t.__class__.__name__ == "Tool" or isinstance(t, getattr(gtypes, "Tool", ())):
1612
+ handle_tool_obj(t)
1613
+
1614
+ return out
1615
+
1616
+ def _fd_to_dict(self, fd: Any) -> Optional[dict]:
1617
+ """
1618
+ Convert FunctionDeclaration (gtypes or dict) to plain dict with normalized schema.
1619
+ """
1620
+ if fd.__class__.__name__ == "FunctionDeclaration" or isinstance(fd, getattr(gtypes, "FunctionDeclaration", ())):
1621
+ name = getattr(fd, "name", None)
1622
+ if not name:
1623
+ return None
1624
+ out = {"name": str(name)}
1625
+ desc = getattr(fd, "description", None)
1626
+ if desc:
1627
+ out["description"] = desc
1628
+ params = getattr(fd, "parameters", None)
1629
+ out["parameters"] = self._schema_to_plain(params if params is not None else {"type": "OBJECT"})
1630
+ return out
1631
+
1632
+ if isinstance(fd, dict):
1633
+ name = fd.get("name")
1634
+ if not name:
1635
+ return None
1636
+ out = {"name": str(name)}
1637
+ if fd.get("description"):
1638
+ out["description"] = fd["description"]
1639
+ params = fd.get("parameters")
1640
+ out["parameters"] = self._schema_to_plain(params if params is not None else {"type": "OBJECT"})
1641
+ return out
1642
+
1643
+ return None
1644
+
1645
+ def _schema_to_plain(self, sc: Any) -> dict:
1646
+ """
1647
+ Convert gtypes.Schema or dict to a plain dict acceptable by Live API.
1648
+ """
1649
+ allowed = {"OBJECT", "ARRAY", "STRING", "NUMBER", "INTEGER", "BOOLEAN"}
1650
+ alias = {"INT": "INTEGER", "BOOL": "BOOLEAN", "FLOAT": "NUMBER", "DOUBLE": "NUMBER"}
1651
+
1652
+ def norm_type(val) -> str:
1653
+ n = getattr(val, "name", None)
1654
+ if isinstance(n, str) and n:
1655
+ s = n
1656
+ else:
1657
+ s = str(val or "")
1658
+ if "." in s:
1659
+ s = s.split(".")[-1]
1660
+ s = alias.get(s.upper(), s.upper())
1661
+ return s if s in allowed else "OBJECT"
1662
+
1663
+ if sc is not None and (sc.__class__.__name__ == "Schema" or isinstance(sc, getattr(gtypes, "Schema", ()))):
1664
+ d: dict = {}
1665
+ t = getattr(sc, "type", None)
1666
+ d["type"] = norm_type(t) if t is not None else "OBJECT"
1667
+
1668
+ desc = getattr(sc, "description", None)
1669
+ if desc:
1670
+ d["description"] = desc
1671
+ fmt = getattr(sc, "format", None)
1672
+ if fmt:
1673
+ d["format"] = fmt
1674
+ enum = getattr(sc, "enum", None)
1675
+ if isinstance(enum, list) and enum and d["type"] == "STRING":
1676
+ d["enum"] = enum
1677
+ req = getattr(sc, "required", None)
1678
+ if isinstance(req, list) and req:
1679
+ d["required"] = [x for x in req if isinstance(x, str)]
1680
+
1681
+ props = getattr(sc, "properties", None)
1682
+ if isinstance(props, dict) and props:
1683
+ d["properties"] = {k: self._schema_to_plain(v) for k, v in props.items()}
1684
+
1685
+ items = getattr(sc, "items", None)
1686
+ if items:
1687
+ d["items"] = self._schema_to_plain(items)
1688
+
1689
+ return d
1690
+
1691
+ if isinstance(sc, dict):
1692
+ d = dict(sc)
1693
+ d["type"] = norm_type(d.get("type"))
1694
+ if isinstance(d.get("properties"), dict):
1695
+ d["properties"] = {k: self._schema_to_plain(v) for k, v in d["properties"].items()}
1696
+ if isinstance(d.get("items"), dict):
1697
+ d["items"] = self._schema_to_plain(d["items"])
1698
+ if "enum" in d and d.get("type") != "STRING":
1699
+ d.pop("enum", None)
1700
+ return d
1701
+
1702
+ return {"type": "OBJECT"}
1703
+
1704
+ def _tune_google_vad(self, live_cfg: dict, opts) -> None:
1705
+ """
1706
+ Increase end-of-speech hold for automatic VAD in Gemini Live.
1707
+ """
1708
+ try:
1709
+ ric = live_cfg.setdefault("realtime_input_config", {})
1710
+ aad = ric.setdefault("automatic_activity_detection", {})
1711
+ if aad.get("disabled") is True:
1712
+ return # manual mode, VAD disabled
1713
+
1714
+ # Resolve target silence (default 2000 ms)
1715
+ target_ms = getattr(opts, "vad_end_silence_ms", None)
1716
+ if not isinstance(target_ms, (int, float)) or target_ms <= 0:
1717
+ base = int(aad.get("silence_duration_ms") or 100)
1718
+ target_ms = max(base, 2000)
1719
+
1720
+ aad["silence_duration_ms"] = int(target_ms)
1721
+
1722
+ # Optional: make end-of-speech less aggressive
1723
+ try:
1724
+ aad["end_of_speech_sensitivity"] = gtypes.EndSensitivity.END_SENSITIVITY_LOW
1725
+ except Exception:
1726
+ aad["end_of_speech_sensitivity"] = "END_SENSITIVITY_LOW"
1727
+
1728
+ # Optional: leading padding before detected speech
1729
+ prefix_ms = getattr(opts, "vad_prefix_padding_ms", None)
1730
+ if isinstance(prefix_ms, (int, float)) and prefix_ms >= 0:
1731
+ aad["prefix_padding_ms"] = int(prefix_ms)
1732
+ except Exception:
1733
+ pass
1734
+
1735
def set_debug(self, enabled: bool):
    """
    Turn debug logging on or off.

    :param enabled: truthy to enable debug logging, falsy to disable it
    """
    self.debug = True if enabled else False
1742
+
1743
def is_session_active(self) -> bool:
    """
    Tell whether a session object is currently held.

    :return: True when self._session is not None, False otherwise
    """
    session = self._session
    return session is not None
1746
+
1747
def update_ctx(self, ctx: CtxItem):
    """
    Replace the stored context item.

    :param ctx: CtxItem to keep as the current context (stored on self._ctx)
    """
    self._ctx = ctx
1750
+
1751
+ # -----------------------------
1752
+ # Internal: auto-turn receiver bootstrap
1753
+ # -----------------------------
1754
+
1755
+ def _ensure_auto_receiver_started(self):
1756
+ """
1757
+ Start a receiver task for one model turn in auto-turn mode if not already active.
1758
+ This guarantees we do not miss server responses when sending live audio chunks.
1759
+ """
1760
+ # Only in auto-turn mode and with an open session
1761
+ if not self._session:
1762
+ return
1763
+ try:
1764
+ if not bool(getattr(self._last_opts, "auto_turn", False)):
1765
+ return
1766
+ except Exception:
1767
+ return
1768
+
1769
+ # If a previous task exists but is done, clear the ref
1770
+ if self._turn_task and self._turn_task.done():
1771
+ self._turn_task = None
1772
+
1773
+ if not self._response_active:
1774
+ # Reset per-turn collectors
1775
+ self._turn_text_parts = []
1776
+ self._last_out_tr = ""
1777
+ self._audio_buf.clear()
1778
+ self._saw_data_stream = False
1779
+ self._rt_reset_state()
1780
+
1781
+ self._response_active = True
1782
+ if self._response_done is None:
1783
+ self._response_done = asyncio.Event()
1784
+ else:
1785
+ try:
1786
+ self._response_done.clear()
1787
+ except Exception:
1788
+ self._response_done = asyncio.Event()
1789
+
1790
+ self._turn_task = asyncio.create_task(self._recv_one_turn(), name="google-live-auto-turn")
1791
+
1792
def update_session_autoturn_sync(
    self,
    enabled: bool,
    silence_ms: Optional[int] = None,
    prefix_ms: Optional[int] = None,
    timeout: float = 10.0,
):
    """
    Blocking wrapper that toggles auto-turn (VAD) for Google Live.

    Makes sure the background loop exists, then runs the async update on it
    and returns whatever the background runner returns.

    :param enabled: True to enable auto-turn, False to disable
    :param silence_ms: optional end-of-speech silence override, in milliseconds
    :param prefix_ms: optional prefix padding override, in milliseconds
    :param timeout: timeout passed to the background runner
    :return: result of the background runner call
    """
    self._ensure_background_loop()
    run_sync = self._bg.run_sync
    pending = self._update_session_autoturn_internal(enabled, silence_ms, prefix_ms)
    return run_sync(pending, timeout=timeout)
1810
+
1811
async def _update_session_autoturn_internal(
    self,
    enabled: bool,
    silence_ms: Optional[int] = None,
    prefix_ms: Optional[int] = None,
) -> None:
    """
    Owner-loop: toggle auto-turn (automatic_activity_detection) and optionally
    set silence_duration_ms / prefix_padding_ms. If the session is open,
    perform a safe restart to apply new config. If closed, cache in opts.

    Flow:
      1. No session: write the requested values to the cached opts and return.
      2. Session open but nothing differs from the cached opts: persist and return.
      3. Otherwise: wait for any active response to finish, update the cached opts,
         store the last session handle on the opts and restart the session.

    :param enabled: True to enable auto-turn, False to disable
    :param silence_ms: optional end-of-speech silence override (milliseconds);
                       None leaves the cached value untouched
    :param prefix_ms: optional prefix padding override (milliseconds);
                      None leaves the cached value untouched
    """

    # Helper to update cached opts; each attribute is set independently so a
    # failure on one does not prevent the others from being stored
    def _apply_to_opts():
        if not self._last_opts:
            return
        try:
            setattr(self._last_opts, "auto_turn", bool(enabled))
        except Exception:
            pass
        try:
            if silence_ms is not None:
                setattr(self._last_opts, "vad_end_silence_ms", int(silence_ms))
        except Exception:
            pass
        try:
            if prefix_ms is not None:
                setattr(self._last_opts, "vad_prefix_padding_ms", int(prefix_ms))
        except Exception:
            pass

    # If session not open -> just cache and exit
    if not self._session:
        _apply_to_opts()
        if self.debug:
            print("[google.update_session_autoturn] session not open; cached for next open")
        return

    # Compute whether anything changes to avoid unnecessary restart
    cur_enabled = False
    try:
        cur_enabled = bool(getattr(self._last_opts, "auto_turn", False))
    except Exception:
        pass
    cur_sil = getattr(self._last_opts, "vad_end_silence_ms", None)
    cur_pre = getattr(self._last_opts, "vad_prefix_padding_ms", None)

    # A requested value counts as a change when it differs from the cached one;
    # a non-numeric cached value compares as None and therefore always differs
    change = (cur_enabled != bool(enabled))
    if silence_ms is not None and int(silence_ms) != (int(cur_sil) if isinstance(cur_sil, (int, float)) else None):
        change = True
    if prefix_ms is not None and int(prefix_ms) != (int(cur_pre) if isinstance(cur_pre, (int, float)) else None):
        change = True

    if not change:
        # Nothing to do; still persist values to opts for consistency
        _apply_to_opts()
        if self.debug:
            print("[google.update_session_autoturn] no changes; skipping restart")
        return

    # Wait for any active response to finish before restart
    if self._send_lock is None:
        self._send_lock = asyncio.Lock()
    # NOTE(review): block nesting here was reconstructed from a flattened diff;
    # the restart below is assumed to run while the send lock is held — confirm
    # against the original file.
    async with self._send_lock:
        if self._response_active and self._response_done:
            if self.debug:
                print("[google.update_session_autoturn] waiting for active response to finish")
            try:
                await self._response_done.wait()
            except Exception:
                pass

        # Update cached opts with requested values
        _apply_to_opts()

        # Try to resume after restart using the last known handle (best-effort)
        prev_handle = self._rt_session_id
        try:
            if self._last_opts is not None and prev_handle:
                setattr(self._last_opts, "rt_session_id", prev_handle)
        except Exception:
            pass

        if self.debug:
            # Report the effective values: the override when given, else the cached one
            eff_sil = silence_ms if silence_ms is not None else cur_sil
            eff_pre = prefix_ms if prefix_ms is not None else cur_pre
            print(f"[google.update_session_autoturn] restarting session; auto_turn={enabled}, "
                  f"silence_ms={eff_sil}, prefix_ms={eff_pre}")

        # Restart session with updated config, reusing the stored ctx and callbacks
        await self._reset_session_internal(
            ctx=self._ctx,
            opts=self._last_opts,
            on_text=self._on_text,
            on_audio=self._on_audio,
            should_stop=self._should_stop,
        )

        if self.debug:
            print("[google.update_session_autoturn] session restarted with new VAD settings")
1911
+
1912
+ # -----------------------------
1913
+ # Internal: commit event helpers
1914
+ # -----------------------------
1915
+
1916
+ def _emit_audio_commit_signal(self):
1917
+ """
1918
+ Emit RT_OUTPUT_AUDIO_COMMIT once per turn in auto-turn mode.
1919
+ """
1920
+ if self._rt_state is None:
1921
+ self._rt_reset_state()
1922
+ if self._rt_state.get("auto_commit_signaled"):
1923
+ return
1924
+ try:
1925
+ if not bool(getattr(self._last_opts, "auto_turn", False)):
1926
+ return
1927
+ except Exception:
1928
+ return
1929
+ # Limit to audio turns: only when we actually sent auto-turn audio this turn
1930
+ if not self._auto_audio_in_flight:
1931
+ return
1932
+ try:
1933
+ if self._last_opts and hasattr(self._last_opts, "rt_signals"):
1934
+ self._last_opts.rt_signals.response.emit(
1935
+ RealtimeEvent(RealtimeEvent.RT_OUTPUT_AUDIO_COMMIT, {"ctx": self._ctx})
1936
+ )
1937
+ self._rt_state["auto_commit_signaled"] = True
1938
+ except Exception:
1939
+ pass
1940
+
1941
+ def _maybe_emit_auto_commit(self):
1942
+ """
1943
+ Emit RT_OUTPUT_AUDIO_COMMIT on first sign of model output in auto-turn mode.
1944
+ """
1945
+ self._emit_audio_commit_signal()