pygpt-net 2.6.30__py3-none-any.whl → 2.6.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pygpt_net/CHANGELOG.txt +15 -0
- pygpt_net/__init__.py +3 -3
- pygpt_net/app.py +7 -1
- pygpt_net/app_core.py +3 -1
- pygpt_net/config.py +3 -1
- pygpt_net/controller/__init__.py +9 -2
- pygpt_net/controller/audio/audio.py +38 -1
- pygpt_net/controller/audio/ui.py +2 -2
- pygpt_net/controller/chat/audio.py +1 -8
- pygpt_net/controller/chat/common.py +23 -62
- pygpt_net/controller/chat/handler/__init__.py +0 -0
- pygpt_net/controller/chat/handler/stream_worker.py +1124 -0
- pygpt_net/controller/chat/output.py +8 -3
- pygpt_net/controller/chat/stream.py +3 -1071
- pygpt_net/controller/chat/text.py +3 -2
- pygpt_net/controller/kernel/kernel.py +11 -3
- pygpt_net/controller/kernel/reply.py +5 -1
- pygpt_net/controller/lang/custom.py +2 -2
- pygpt_net/controller/media/__init__.py +12 -0
- pygpt_net/controller/media/media.py +115 -0
- pygpt_net/controller/realtime/__init__.py +12 -0
- pygpt_net/controller/realtime/manager.py +53 -0
- pygpt_net/controller/realtime/realtime.py +293 -0
- pygpt_net/controller/ui/mode.py +23 -2
- pygpt_net/controller/ui/ui.py +19 -1
- pygpt_net/core/audio/audio.py +6 -1
- pygpt_net/core/audio/backend/native/__init__.py +12 -0
- pygpt_net/core/audio/backend/{native.py → native/native.py} +426 -127
- pygpt_net/core/audio/backend/native/player.py +139 -0
- pygpt_net/core/audio/backend/native/realtime.py +250 -0
- pygpt_net/core/audio/backend/pyaudio/__init__.py +12 -0
- pygpt_net/core/audio/backend/pyaudio/playback.py +194 -0
- pygpt_net/core/audio/backend/pyaudio/pyaudio.py +923 -0
- pygpt_net/core/audio/backend/pyaudio/realtime.py +312 -0
- pygpt_net/core/audio/backend/pygame/__init__.py +12 -0
- pygpt_net/core/audio/backend/{pygame.py → pygame/pygame.py} +130 -19
- pygpt_net/core/audio/backend/shared/__init__.py +38 -0
- pygpt_net/core/audio/backend/shared/conversions.py +211 -0
- pygpt_net/core/audio/backend/shared/envelope.py +38 -0
- pygpt_net/core/audio/backend/shared/player.py +137 -0
- pygpt_net/core/audio/backend/shared/rt.py +52 -0
- pygpt_net/core/audio/capture.py +5 -0
- pygpt_net/core/audio/output.py +14 -2
- pygpt_net/core/audio/whisper.py +6 -2
- pygpt_net/core/bridge/bridge.py +2 -1
- pygpt_net/core/bridge/worker.py +4 -1
- pygpt_net/core/dispatcher/dispatcher.py +37 -1
- pygpt_net/core/events/__init__.py +2 -1
- pygpt_net/core/events/realtime.py +55 -0
- pygpt_net/core/image/image.py +56 -5
- pygpt_net/core/realtime/__init__.py +0 -0
- pygpt_net/core/realtime/options.py +87 -0
- pygpt_net/core/realtime/shared/__init__.py +0 -0
- pygpt_net/core/realtime/shared/audio.py +213 -0
- pygpt_net/core/realtime/shared/loop.py +64 -0
- pygpt_net/core/realtime/shared/session.py +59 -0
- pygpt_net/core/realtime/shared/text.py +37 -0
- pygpt_net/core/realtime/shared/tools.py +276 -0
- pygpt_net/core/realtime/shared/turn.py +38 -0
- pygpt_net/core/realtime/shared/types.py +16 -0
- pygpt_net/core/realtime/worker.py +160 -0
- pygpt_net/core/render/web/body.py +24 -3
- pygpt_net/core/text/utils.py +54 -2
- pygpt_net/core/types/__init__.py +1 -0
- pygpt_net/core/types/image.py +54 -0
- pygpt_net/core/video/__init__.py +12 -0
- pygpt_net/core/video/video.py +290 -0
- pygpt_net/data/config/config.json +26 -5
- pygpt_net/data/config/models.json +221 -103
- pygpt_net/data/config/settings.json +244 -6
- pygpt_net/data/css/web-blocks.css +6 -0
- pygpt_net/data/css/web-chatgpt.css +6 -0
- pygpt_net/data/css/web-chatgpt_wide.css +6 -0
- pygpt_net/data/locale/locale.de.ini +35 -7
- pygpt_net/data/locale/locale.en.ini +56 -17
- pygpt_net/data/locale/locale.es.ini +35 -7
- pygpt_net/data/locale/locale.fr.ini +35 -7
- pygpt_net/data/locale/locale.it.ini +35 -7
- pygpt_net/data/locale/locale.pl.ini +38 -7
- pygpt_net/data/locale/locale.uk.ini +35 -7
- pygpt_net/data/locale/locale.zh.ini +31 -3
- pygpt_net/data/locale/plugin.audio_input.en.ini +4 -0
- pygpt_net/data/locale/plugin.audio_output.en.ini +4 -0
- pygpt_net/data/locale/plugin.cmd_web.en.ini +8 -0
- pygpt_net/item/model.py +22 -1
- pygpt_net/plugin/audio_input/plugin.py +37 -4
- pygpt_net/plugin/audio_input/simple.py +57 -8
- pygpt_net/plugin/cmd_files/worker.py +3 -0
- pygpt_net/provider/api/google/__init__.py +76 -7
- pygpt_net/provider/api/google/audio.py +8 -1
- pygpt_net/provider/api/google/chat.py +45 -6
- pygpt_net/provider/api/google/image.py +226 -86
- pygpt_net/provider/api/google/realtime/__init__.py +12 -0
- pygpt_net/provider/api/google/realtime/client.py +1945 -0
- pygpt_net/provider/api/google/realtime/realtime.py +186 -0
- pygpt_net/provider/api/google/video.py +364 -0
- pygpt_net/provider/api/openai/__init__.py +22 -2
- pygpt_net/provider/api/openai/realtime/__init__.py +12 -0
- pygpt_net/provider/api/openai/realtime/client.py +1828 -0
- pygpt_net/provider/api/openai/realtime/realtime.py +193 -0
- pygpt_net/provider/audio_input/google_genai.py +103 -0
- pygpt_net/provider/audio_output/google_genai_tts.py +229 -0
- pygpt_net/provider/audio_output/google_tts.py +0 -12
- pygpt_net/provider/audio_output/openai_tts.py +8 -5
- pygpt_net/provider/core/config/patch.py +241 -178
- pygpt_net/provider/core/model/patch.py +28 -2
- pygpt_net/provider/llms/google.py +8 -9
- pygpt_net/provider/web/duckduck_search.py +212 -0
- pygpt_net/ui/layout/toolbox/audio.py +55 -0
- pygpt_net/ui/layout/toolbox/footer.py +14 -42
- pygpt_net/ui/layout/toolbox/image.py +7 -13
- pygpt_net/ui/layout/toolbox/raw.py +52 -0
- pygpt_net/ui/layout/toolbox/split.py +48 -0
- pygpt_net/ui/layout/toolbox/toolbox.py +8 -8
- pygpt_net/ui/layout/toolbox/video.py +49 -0
- pygpt_net/ui/widget/option/combo.py +15 -1
- {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.32.dist-info}/METADATA +46 -22
- {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.32.dist-info}/RECORD +121 -73
- pygpt_net/core/audio/backend/pyaudio.py +0 -554
- {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.32.dist-info}/LICENSE +0 -0
- {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.32.dist-info}/WHEEL +0 -0
- {pygpt_net-2.6.30.dist-info → pygpt_net-2.6.32.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,1945 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
# ================================================== #
|
|
4
|
+
# This file is a part of PYGPT package #
|
|
5
|
+
# Website: https://pygpt.net #
|
|
6
|
+
# GitHub: https://github.com/szczyglis-dev/py-gpt #
|
|
7
|
+
# MIT License #
|
|
8
|
+
# Created By : Marcin Szczygliński #
|
|
9
|
+
# Updated Date: 2025.08.31 23:00:00 #
|
|
10
|
+
# ================================================== #
|
|
11
|
+
|
|
12
|
+
import asyncio
|
|
13
|
+
import base64
|
|
14
|
+
import json
|
|
15
|
+
from typing import Optional, Callable, Awaitable, Tuple, List, Any
|
|
16
|
+
|
|
17
|
+
from google.genai import types as gtypes # for Schema/FunctionDeclaration/FunctionResponse compatibility
|
|
18
|
+
|
|
19
|
+
from pygpt_net.core.events import RealtimeEvent
|
|
20
|
+
from pygpt_net.core.types import MODE_AUDIO
|
|
21
|
+
from pygpt_net.item.ctx import CtxItem
|
|
22
|
+
from pygpt_net.core.text.utils import has_unclosed_code_tag
|
|
23
|
+
|
|
24
|
+
# shared
|
|
25
|
+
from pygpt_net.core.realtime.shared.loop import BackgroundLoop
|
|
26
|
+
from pygpt_net.core.realtime.shared.audio import to_pcm16_mono
|
|
27
|
+
from pygpt_net.core.realtime.shared.tools import build_function_responses_payload
|
|
28
|
+
from pygpt_net.core.realtime.shared.text import coalesce_text
|
|
29
|
+
from pygpt_net.core.realtime.shared.turn import TurnMode, apply_turn_mode_google
|
|
30
|
+
from pygpt_net.core.realtime.shared.session import set_ctx_rt_handle
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class GoogleLiveClient:
|
|
34
|
+
"""
|
|
35
|
+
Google Live client with server-side memory and smooth audio:
|
|
36
|
+
|
|
37
|
+
- One persistent Live session; server keeps conversation context across turns.
|
|
38
|
+
- User turns are sent via:
|
|
39
|
+
* text: send_client_content(Content(...), turn_complete=True/False)
|
|
40
|
+
* audio: ActivityStart -> send_realtime_input(audio=Blob...) -> ActivityEnd
|
|
41
|
+
(manual turns; no auto VAD; no inline dicts — SDK serializes wire format)
|
|
42
|
+
- Auto-turn mode (automatic VAD) is fully supported for continuous mic input:
|
|
43
|
+
* push audio chunks via send_realtime_input(audio=...)
|
|
44
|
+
* flush on demand via send_realtime_input(audio_stream_end=True)
|
|
45
|
+
* receiver for one model turn is started automatically on first audio chunk.
|
|
46
|
+
- Each turn has its own receive loop, ending on serverContent.turnComplete or toolCall.
|
|
47
|
+
- Audio is jitter-buffered (~60ms) and de-duplicated (prefer response.data over inline_data).
|
|
48
|
+
- Final transcript is coalesced; preserves hard line breaks only.
|
|
49
|
+
- Tool calls, citations, images and usage are extracted and persisted to ctx to mirror OpenAI provider behavior.
|
|
50
|
+
- Emits RealtimeEvent.RT_OUTPUT_AUDIO_COMMIT when the model starts responding after auto VAD or after an explicit flush,
|
|
51
|
+
and RealtimeEvent.RT_OUTPUT_TURN_END after each turn.
|
|
52
|
+
- Supports sending tool results back to the model (send_tool_results/send_tool_results_sync).
|
|
53
|
+
"""
|
|
54
|
+
def __init__(
        self,
        window=None,
        debug: bool = False
):
    """
    Initialize client state only; no session is opened and no connection is made here.

    :param window: application window object; stored and later used to reach `window.core`
    :param debug: when True, diagnostic messages are printed to stdout
    """
    self.window = window
    self.debug = debug

    # Live session resources (owned by background loop)
    self._session = None  # object returned by the connect context manager's __aenter__
    self._session_cm = None  # the connect context manager itself, kept so it can be exited on close

    # Background loop
    self._bg: BackgroundLoop = BackgroundLoop(name="Google-RT-Loop")

    # Flow control (per-session); lock and event are created when a session is opened
    self._send_lock: Optional[asyncio.Lock] = None
    self._response_done: Optional[asyncio.Event] = None
    self._response_active: bool = False
    self._turn_task: Optional[asyncio.Task] = None  # receiver task of the current turn

    # Callbacks and context
    self._on_text: Optional[Callable[[str], Awaitable[None]]] = None
    self._on_audio: Optional[Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]]] = None
    self._should_stop: Optional[Callable[[], bool]] = None
    self._ctx: Optional[CtxItem] = None
    self._last_opts = None  # options of the last opened session; reused when resetting/reopening

    # Per-turn text aggregation
    self._turn_text_parts: List[str] = []
    self._last_out_tr: str = ""  # last full output transcription (to compute deltas)

    # Audio I/O (rates)
    self._IN_RATE = 16000  # input (LINEAR16 mono)
    self._OUT_RATE = 24000  # output (model audio PCM16@24kHz)

    # Output audio jitter buffer
    self._audio_buf = bytearray()
    self._OUT_CHUNK_MS = 60
    # 24000 samples/s * 2 bytes / 1000 = 48 bytes per millisecond
    self._OUT_BYTES_PER_MS = int(self._OUT_RATE * 2 / 1000)  # PCM16 mono (2 bytes/sample)
    self._saw_data_stream = False  # prefer response.data over inline_data to avoid duplicates

    # Per-turn extraction state
    self._rt_state: Optional[dict] = None

    # Last tool calls snapshot
    self._last_tool_calls: list[dict] = []

    # Live session resumption (current session handle)
    self._rt_session_id: Optional[str] = None  # string handle that can be used to resume a session

    # Cached tools signature to avoid redundant restarts
    self._cached_session_tools_sig: Optional[str] = None

    # Auto-turn state
    self._auto_audio_in_flight: bool = False  # True if auto-turn audio has been sent in current turn
|
|
110
|
+
|
|
111
|
+
# -----------------------------
|
|
112
|
+
# Public high-level entrypoints
|
|
113
|
+
# -----------------------------
|
|
114
|
+
|
|
115
|
+
async def run(
        self,
        ctx: CtxItem,
        opts,
        on_text: Callable[[str], Awaitable[None]],
        on_audio: Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]],
        should_stop: Callable[[], bool] = lambda: False,
):
    """
    Run one turn: open session if needed, send prompt/audio, receive until turn complete.

    :param ctx: context item for this turn; stored as the current context
    :param opts: options object; read via getattr for `rt_session_id`, `prompt`,
                 `audio_data`, `audio_format`, `audio_rate` and `streaming`
    :param on_text: async callback, forwarded to the session open/reset routines
    :param on_audio: async callback, forwarded to the session open/reset routines
    :param should_stop: stop predicate, forwarded to the session open/reset routines
    """
    self._ensure_background_loop()
    self._ctx = ctx

    # If a different resumable handle is provided, reset the session to resume there
    try:
        provided = getattr(opts, "rt_session_id", None)
        if isinstance(provided, str):
            provided = provided.strip()
        if self._session is not None and provided and provided != (self._rt_session_id or ""):
            await self._run_on_owner(self._reset_session_internal(ctx, opts, on_text, on_audio, should_stop))
    except Exception as e:
        # Best-effort: a failed reset must not abort the turn (the session is
        # re-opened below if it ended up closed), but it should not be invisible.
        if self.debug:
            print(f"[google.run] session reset failed: {e!r}")

    if not self._session:
        await self._run_on_owner(self._open_session_internal(ctx, opts, on_text, on_audio, should_stop))

    # When opts.streaming is truthy the call returns without waiting for the turn to finish
    await self._run_on_owner(self._send_turn_internal(
        getattr(opts, "prompt", None),
        getattr(opts, "audio_data", None),
        getattr(opts, "audio_format", None),
        getattr(opts, "audio_rate", None),
        wait_for_done=not bool(getattr(opts, "streaming", False)),
    ))
|
|
149
|
+
|
|
150
|
+
async def open_session(
        self,
        ctx: CtxItem,
        opts,
        on_text: Callable[[str], Awaitable[None]],
        on_audio: Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]],
        should_stop: Callable[[], bool] = lambda: False,
):
    """
    Open persistent Live session (if not already open).

    If a session is already open and `opts.rt_session_id` names a different,
    non-empty handle, the session is reset (closed and reopened) instead.

    :param ctx: context item for the session
    :param opts: options object; `rt_session_id` is read here, the rest is forwarded
    :param on_text: async callback, forwarded to the session open/reset routines
    :param on_audio: async callback, forwarded to the session open/reset routines
    :param should_stop: stop predicate, forwarded to the session open/reset routines
    """
    self._ensure_background_loop()

    # If the session is already open but a different handle is requested, reset to resume.
    try:
        provided = getattr(opts, "rt_session_id", None)
        if isinstance(provided, str):
            provided = provided.strip()
        if self._session is not None and provided and provided != (self._rt_session_id or ""):
            await self._run_on_owner(self._reset_session_internal(ctx, opts, on_text, on_audio, should_stop))
            return
    except Exception as e:
        # Best-effort: fall through to a plain open attempt, but report the failure.
        if self.debug:
            print(f"[google.open_session] session reset failed: {e!r}")

    await self._run_on_owner(self._open_session_internal(ctx, opts, on_text, on_audio, should_stop))
|
|
175
|
+
|
|
176
|
+
async def close_session(self):
    """
    Close the persistent Live session on the owner loop.

    Does nothing when the background loop has not been created.
    """
    if self._bg.loop:
        await self._run_on_owner(self._close_session_internal())
|
|
181
|
+
|
|
182
|
+
async def reset_session(
        self,
        ctx: Optional[CtxItem] = None,
        opts=None,
        on_text: Optional[Callable[[str], Awaitable[None]]] = None,
        on_audio: Optional[Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]]] = None,
        should_stop: Optional[Callable[[], bool]] = None,
):
    """
    Close and reopen the persistent Live session on the owner loop.

    All arguments are optional and are forwarded unchanged to the internal
    reset routine.

    :param ctx: context item (optional)
    :param opts: options object (optional)
    :param on_text: async text callback (optional)
    :param on_audio: async audio callback (optional)
    :param should_stop: stop predicate (optional)
    """
    self._ensure_background_loop()
    job = self._reset_session_internal(ctx, opts, on_text, on_audio, should_stop)
    await self._run_on_owner(job)
|
|
195
|
+
|
|
196
|
+
async def shutdown(self):
    """
    Close the Live session on the owner loop.

    The background loop itself is not stopped here; shutdown_and_stop() does
    that in addition. Does nothing when the loop has not been created.
    """
    if self._bg.loop:
        await self._run_on_owner(self._close_session_internal())
|
|
201
|
+
|
|
202
|
+
async def shutdown_and_stop(self):
    """Close the session via shutdown(), then stop the background loop thread."""
    # Order matters: the session is closed on the loop before the loop is stopped.
    await self.shutdown()
    self.stop_loop_sync()
|
|
206
|
+
|
|
207
|
+
# -----------------------------
|
|
208
|
+
# Synchronous convenience calls
|
|
209
|
+
# -----------------------------
|
|
210
|
+
|
|
211
|
+
def close_session_sync(self, timeout: float = 5.0):
    """
    Close the persistent Live session from synchronous code.

    Does nothing unless the background loop exists and is running.

    :param timeout: timeout passed to the background loop's run_sync()
    """
    loop = self._bg.loop
    if loop and loop.is_running():
        self._bg.run_sync(self._close_session_internal(), timeout=timeout)
|
|
216
|
+
|
|
217
|
+
def reset_session_sync(
        self,
        ctx: Optional[CtxItem] = None,
        opts=None,
        on_text: Optional[Callable[[str], Awaitable[None]]] = None,
        on_audio: Optional[Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]]] = None,
        should_stop: Optional[Callable[[], bool]] = None,
        timeout: float = 10.0,
):
    """
    Close and reopen the persistent Live session from synchronous code.

    :param ctx: context item (optional)
    :param opts: options object (optional)
    :param on_text: async text callback (optional)
    :param on_audio: async audio callback (optional)
    :param should_stop: stop predicate (optional)
    :param timeout: timeout passed to the background loop's run_sync()
    """
    self._ensure_background_loop()
    job = self._reset_session_internal(ctx, opts, on_text, on_audio, should_stop)
    self._bg.run_sync(job, timeout=timeout)
|
|
231
|
+
|
|
232
|
+
def shutdown_sync(self, timeout: float = 5.0):
    """
    Close the Live session from synchronous code.

    Does nothing unless the background loop exists and is running. The loop
    itself is left running; see stop_loop_sync().

    :param timeout: timeout passed to the background loop's run_sync()
    """
    loop = self._bg.loop
    if not (loop and loop.is_running()):
        return
    closing = self._close_session_internal()
    self._bg.run_sync(closing, timeout=timeout)
|
|
239
|
+
|
|
240
|
+
def stop_loop_sync(self, timeout: float = 2.0):
    """
    Stop the background loop by delegating to its stop().

    :param timeout: timeout passed to the background loop's stop()
    """
    background = self._bg
    background.stop(timeout=timeout)
|
|
245
|
+
|
|
246
|
+
# -----------------------------
|
|
247
|
+
# Tools helpers
|
|
248
|
+
# -----------------------------
|
|
249
|
+
|
|
250
|
+
def _update_last_opts_tools(self, tools: Optional[list], remote_tools: Optional[list]) -> None:
    """
    Copy tools/remote_tools onto the remembered options object.

    An attribute is overwritten only when the new value is not None and the
    options object already has that attribute. Each attribute is handled
    independently, so a failure on one does not prevent the other.

    :param tools: new value for `tools`, or None to leave it untouched
    :param remote_tools: new value for `remote_tools`, or None to leave it untouched
    """
    target = self._last_opts
    if not target:
        return
    for attr, value in (("tools", tools), ("remote_tools", remote_tools)):
        try:
            if value is not None and hasattr(target, attr):
                setattr(target, attr, value)
        except Exception:
            pass
|
|
267
|
+
|
|
268
|
+
def _tools_signature(self, tools_list: list) -> str:
    """
    Return a deterministic string signature for a tools list.

    The list is serialized as compact JSON with sorted keys; if it cannot be
    serialized, its str() form is returned instead. A falsy input is treated
    as an empty list.

    :param tools_list: tools list (may be None or empty)
    :return: signature string
    """
    normalized = tools_list or []
    try:
        signature = json.dumps(normalized, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
    except Exception:
        signature = str(normalized)
    return signature
|
|
276
|
+
|
|
277
|
+
# -----------------------------
|
|
278
|
+
# Internal: background loop/dispatch
|
|
279
|
+
# -----------------------------
|
|
280
|
+
|
|
281
|
+
def _ensure_background_loop(self):
    """Delegate to the background loop's ensure() so the loop is available before use."""
    background = self._bg
    background.ensure()
|
|
284
|
+
|
|
285
|
+
async def _run_on_owner(self, coro):
    """
    Hand a coroutine to the background loop and await its result.

    :param coro: coroutine to run
    :return: whatever the background loop's run() yields for that coroutine
    """
    outcome = await self._bg.run(coro)
    return outcome
|
|
290
|
+
|
|
291
|
+
# -----------------------------
|
|
292
|
+
# Internal: session lifecycle
|
|
293
|
+
# -----------------------------
|
|
294
|
+
|
|
295
|
+
async def _open_session_internal(
        self,
        ctx: CtxItem,
        opts,
        on_text: Callable[[str], Awaitable[None]],
        on_audio: Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]],
        should_stop: Callable[[], bool] = lambda: False,
):
    """
    Open persistent Live session (if not already open).

    Builds the Live config (audio response, voice, output transcription, tools,
    system instruction, session resumption, turn mode), stores callbacks and
    per-session state, then connects.

    :param ctx: context item; its `model` is used to look up model data and as model fallback
    :param opts: options object; read via getattr for `model`, `voice`, `tools`,
                 `remote_tools`, `system_prompt`, `rt_session_id` and `auto_turn`
    :param on_text: async text callback stored for the session
    :param on_audio: async audio callback stored for the session
    :param should_stop: stop predicate stored for the session
    :raises RuntimeError: if no Google GenAI client is available
    """
    if self._session is not None:
        if self.debug:
            print("[google.open_session] already open")
        return

    core = self.window.core
    # model_data is already None when ctx is missing or has no model
    model_data = core.models.get(ctx.model) if ctx and getattr(ctx, "model", None) else None
    client = core.api.google.get_client(MODE_AUDIO, model_data)
    if not client:
        raise RuntimeError("Google GenAI client not configured")

    # Select Live-capable model
    model_id = getattr(opts, "model", None) or (ctx.model if ctx and getattr(ctx, "model", None) else "gemini-live-2.5-flash-preview")
    voice = getattr(opts, "voice", None) or self._preferred_voice()

    # Compose tools for session
    session_tools = self._sanitize_tools(getattr(opts, "tools", None), getattr(opts, "remote_tools", None))

    # Live config — manual activity boundaries (no auto VAD by default)
    live_cfg = {
        "response_modalities": ["AUDIO"],
        "speech_config": {"voice_config": {"prebuilt_voice_config": {"voice_name": voice}}},
        "output_audio_transcription": {},
        "realtime_input_config": {"automatic_activity_detection": {"disabled": True}},
    }
    if session_tools:
        live_cfg["tools"] = session_tools

    # Cache current tools signature
    self._cached_session_tools_sig = self._tools_signature(session_tools or [])

    sys_prompt = getattr(opts, "system_prompt", None)
    if sys_prompt:
        live_cfg["system_instruction"] = str(sys_prompt)

    # Session resumption: enable updates; resume when a different non-empty handle is given
    try:
        provided_handle = getattr(opts, "rt_session_id", None)
        resume_handle = None
        if isinstance(provided_handle, str):
            ph = provided_handle.strip()
            if ph and ph != (self._rt_session_id or ""):
                resume_handle = ph

        live_cfg["session_resumption"] = gtypes.SessionResumptionConfig(handle=resume_handle)

        if resume_handle:
            self._rt_session_id = resume_handle
            # Use the ctx this session is being opened for: self._ctx is only
            # assigned further below and may still be None or a previous ctx.
            set_ctx_rt_handle(ctx, resume_handle, self.window)
    except Exception as e:
        # Best-effort: the session is still opened, possibly without resumption.
        if self.debug:
            print(f"[google.open_session] session resumption setup failed: {e!r}")

    # Apply turn mode (auto/manual VAD)
    turn_mode = TurnMode.AUTO if bool(getattr(opts, "auto_turn", False)) else TurnMode.MANUAL
    apply_turn_mode_google(live_cfg, turn_mode)
    self._tune_google_vad(live_cfg, opts)

    # Save callbacks and ctx
    self._on_text = on_text
    self._on_audio = on_audio
    self._should_stop = should_stop or (lambda: False)
    self._ctx = ctx
    self._last_opts = opts

    # Control primitives
    self._response_done = asyncio.Event()
    self._send_lock = asyncio.Lock()
    self._turn_text_parts = []
    self._last_out_tr = ""
    self._last_tool_calls = []

    # Connect session; keep the context manager only once it has been entered,
    # so a failed connect does not leave a never-entered manager behind.
    session_cm = client.aio.live.connect(model=model_id, config=live_cfg)
    session = await session_cm.__aenter__()
    self._session_cm = session_cm
    self._session = session
    if self.debug:
        print("[google.open_session] live session connected")
|
|
382
|
+
|
|
383
|
+
async def _close_session_internal(self):
    """
    Tear down the Live session and reset all per-session state.

    Waits up to 2 seconds for a still-running turn task, exits the session
    context manager, then clears flow-control objects, per-turn buffers, the
    in-memory session handle and the cached tools signature. Errors while
    waiting or exiting are ignored.
    """
    pending = self._turn_task
    if pending and not pending.done():
        try:
            await asyncio.wait_for(pending, timeout=2.0)
        except Exception:
            pass
    self._turn_task = None

    session_cm = self._session_cm
    if session_cm:
        try:
            await session_cm.__aexit__(None, None, None)
        except Exception:
            pass
    self._session_cm = None
    self._session = None

    # Flags (including auto-turn state)
    self._response_active = False
    self._saw_data_stream = False
    self._auto_audio_in_flight = False

    # Flow-control objects and per-turn extraction state
    self._response_done = None
    self._send_lock = None
    self._rt_state = None

    # Per-turn collectors
    self._turn_text_parts = []
    self._last_tool_calls = []
    self._last_out_tr = ""
    self._audio_buf.clear()

    # Clear only in-memory handle; keep persisted ctx.extra["rt_session_id"]
    self._rt_session_id = None

    # Clear cached tools signature
    self._cached_session_tools_sig = None

    if self.debug:
        print("[google.close_session] closed")
|
|
421
|
+
|
|
422
|
+
async def _reset_session_internal(
        self,
        ctx: Optional[CtxItem] = None,
        opts=None,
        on_text: Optional[Callable[[str], Awaitable[None]]] = None,
        on_audio: Optional[Callable[[bytes, str, Optional[int], Optional[int], bool], Awaitable[None]]] = None,
        should_stop: Optional[Callable[[], bool]] = None,
):
    """
    Close the current session and open a new one.

    Every argument that is falsy falls back to the value remembered from the
    previous session; `should_stop` finally falls back to a predicate that
    always returns False.

    :param ctx: context item (optional)
    :param opts: options object (optional)
    :param on_text: async text callback (optional)
    :param on_audio: async audio callback (optional)
    :param should_stop: stop predicate (optional)
    :raises RuntimeError: if ctx, opts, on_text or on_audio is still missing after fallback
    """
    use_ctx = ctx or self._ctx
    use_opts = opts or self._last_opts
    text_cb = on_text or self._on_text
    audio_cb = on_audio or self._on_audio
    stop_cb = should_stop or self._should_stop or (lambda: False)

    if not use_ctx or not use_opts or not text_cb or not audio_cb:
        raise RuntimeError("reset_session requires previous or explicit ctx/opts/callbacks")

    await self._close_session_internal()
    await self._open_session_internal(use_ctx, use_opts, text_cb, audio_cb, stop_cb)
|
|
444
|
+
|
|
445
|
+
# -----------------------------
|
|
446
|
+
# Internal: one "turn"
|
|
447
|
+
# -----------------------------
|
|
448
|
+
|
|
449
|
+
async def _send_turn_internal(
        self,
        prompt: Optional[str] = None,
        audio_data: Optional[bytes] = None,
        audio_format: Optional[str] = None,
        audio_rate: Optional[int] = None,
        wait_for_done: bool = True,
):
    """
    Send one turn: prompt and/or audio, receive until turn complete or tool call.

    Three shapes are handled:
    - text only: a single Content with turn_complete=True,
    - audio only: the receiver is started first, then audio is sent,
    - text + audio: text is sent with turn_complete=False, the receiver is
      started, then audio is sent.
    Audio uses the auto-VAD path (one blob + audio_stream_end) when the last
    options have a truthy `auto_turn`, otherwise manual ActivityStart/End.
    If there is neither text nor audio, nothing is sent.

    :param prompt: user text; None, empty or "..." counts as no text
    :param audio_data: raw input audio; converted with to_pcm16_mono()
    :param audio_format: input audio format, passed to to_pcm16_mono()
    :param audio_rate: input sample rate, passed to to_pcm16_mono()
    :param wait_for_done: when True, await the receiver task before returning
    :raises RuntimeError: if no session is open and none can be reopened
    """
    if not self._session:
        if self._ctx and self._last_opts:
            await self._open_session_internal(self._ctx, self._last_opts, self._on_text, self._on_audio, self._should_stop)
        else:
            raise RuntimeError("Session not open. Call open_session(...) first.")

    if self._send_lock is None:
        self._send_lock = asyncio.Lock()

    def auto_turn_enabled() -> bool:
        """Return True when the last session options request auto-turn (auto VAD)."""
        try:
            return bool(getattr(self._last_opts, "auto_turn", False))
        except Exception:
            return False

    def begin_response() -> None:
        """Mark a response as in flight, arm the completion event and start the receiver task."""
        self._response_active = True
        if self._response_done is None:
            self._response_done = asyncio.Event()
        else:
            try:
                self._response_done.clear()
            except Exception:
                self._response_done = asyncio.Event()
        self._turn_task = asyncio.create_task(self._recv_one_turn(), name="google-live-turn")

    async with self._send_lock:
        if self._response_active and self._response_done:
            if self.debug:
                print("[google.send_turn] waiting for previous response")
            await self._response_done.wait()

        # Reset per-turn collectors
        self._turn_text_parts = []
        self._last_out_tr = ""
        self._audio_buf.clear()
        self._saw_data_stream = False
        self._rt_reset_state()
        self._last_tool_calls = []
        self._auto_audio_in_flight = False

        # Normalize prompt/audio first to choose a stable path
        txt = str(prompt).strip() if prompt is not None else ""
        if txt == "...":
            txt = ""
        parts_t = [gtypes.Part(text=txt)] if txt else []

        pcm = b""
        rate = self._IN_RATE
        if audio_data:
            pcm, rate = to_pcm16_mono(audio_data, audio_format, audio_rate, target_rate=self._IN_RATE)

        has_text = bool(parts_t)
        has_audio = bool(pcm)

        if not has_text and not has_audio:
            # nothing to send
            return

        if not has_audio:
            # TEXT-ONLY -> single Content, turn_complete=True
            await self._session.send_client_content(
                turns=gtypes.Content(role="user", parts=parts_t),
                turn_complete=True,
            )
            begin_response()
        else:
            # AUDIO-ONLY or TEXT + AUDIO; both respect the configured turn mode
            use_auto = auto_turn_enabled()

            if has_text:
                # text opens the turn (turn_complete=False); audio completes it
                await self._session.send_client_content(
                    turns=gtypes.Content(role="user", parts=parts_t),
                    turn_complete=False,
                )

            # Start receiving before sending any audio
            begin_response()

            if use_auto:
                self._auto_audio_in_flight = True
                label = "auto+text" if has_text else "auto"
                # Auto-VAD: send a single audio blob and flush explicitly
                try:
                    await self._session.send_realtime_input(
                        audio=gtypes.Blob(data=pcm, mime_type=f"audio/pcm;rate={int(rate)}")
                    )
                    await self._session.send_realtime_input(audio_stream_end=True)
                    self._emit_audio_commit_signal()  # fire once for explicit flush
                except Exception as e:
                    if self.debug:
                        print(f"[google.audio:{label}] send failed: {e!r}")
                    raise
            else:
                # Manual activity: start -> audio -> end
                await self._send_audio_realtime_manual(pcm, rate)

    # NOTE(review): indentation is not visible in the reviewed source; this wait is
    # placed after the send lock is released — confirm against the original file.
    if wait_for_done and self._turn_task:
        try:
            await self._turn_task
        except Exception as e:
            # Receiver failures stay non-fatal for the sender, but are reported.
            if self.debug:
                print(f"[google.send_turn] receiver task failed: {e!r}")
|
|
606
|
+
|
|
607
|
+
async def _send_audio_realtime_manual(self, pcm: bytes, rate: int):
    """
    Send one buffered utterance with explicit (manual) turn boundaries.

    Sequence on the session: ActivityStart, the PCM payload split into
    chunks, ActivityEnd. Does nothing when ``pcm`` is empty. A failure in any
    of the three phases is printed when ``self.debug`` is set and re-raised.

    :param pcm: raw audio bytes (the chunk math assumes 2 bytes per sample, mono)
    :param rate: sample rate in Hz; embedded into the MIME type
    """
    if not pcm:
        return

    # MIME must be "audio/pcm;rate=RATE" with no space after the semicolon
    mime_type = f"audio/pcm;rate={int(rate)}"

    # Phase 1: open the activity window
    try:
        await self._session.send_realtime_input(activity_start=gtypes.ActivityStart())
        if self.debug:
            print("[google.audio] activityStart")
    except Exception as err:
        if self.debug:
            print(f"[google.audio] activityStart failed: {err!r}")
        raise

    # Phase 2: stream the payload in ~100 ms pieces, never smaller than
    # 3200 bytes (which is 100 ms at 16 kHz, 16-bit mono)
    step = max(int(rate * 2 / 1000) * 100, 3200)
    offset = 0
    total = len(pcm)
    while offset < total:
        piece = pcm[offset:offset + step]
        offset += step
        try:
            await self._session.send_realtime_input(
                audio=gtypes.Blob(data=piece, mime_type=mime_type)
            )
        except Exception as err:
            if self.debug:
                print(f"[google.audio] payload send failed: {err!r}")
            raise

    # Phase 3: close the activity window
    try:
        await self._session.send_realtime_input(activity_end=gtypes.ActivityEnd())
        if self.debug:
            print("[google.audio] activityEnd")
    except Exception as err:
        if self.debug:
            print(f"[google.audio] activityEnd failed: {err!r}")
        raise
|
|
649
|
+
|
|
650
|
+
# -----------------------------
|
|
651
|
+
# Internal: realtime audio input (auto-turn mode)
|
|
652
|
+
# -----------------------------
|
|
653
|
+
|
|
654
|
+
def rt_handle_audio_input_sync(self, event: RealtimeEvent, timeout: float = 0.5):
    """
    Blocking entrypoint for continuous microphone input (auto-turn mode).

    Events without audio bytes are dropped immediately. Otherwise the work is
    handed to ``_rt_handle_audio_input_internal`` through ``self._bg.run_sync``.
    Nothing is ever raised to the caller (this is invoked from audio callbacks).

    :param event: realtime event; audio is read from ``event.data`` or
        ``event.data["payload"]``
    :param timeout: seconds passed to ``run_sync``
    """
    # Cheap pre-check so empty events never reach the background loop
    try:
        body = getattr(event, "data", {}) or {}
        if isinstance(body, dict) and isinstance(body.get("payload"), dict):
            body = body["payload"]
        has_audio = bool(body) and bool(body.get("data"))
    except Exception:
        return
    if not has_audio:
        return

    self._ensure_background_loop()
    try:
        job = self._rt_handle_audio_input_internal(event)
        self._bg.run_sync(job, timeout=timeout)
    except Exception:
        # Never raise to caller from audio callback
        return
|
|
675
|
+
|
|
676
|
+
async def _rt_handle_audio_input_internal(self, event: RealtimeEvent):
    """
    Owner-loop implementation: push live audio to Gemini Live in auto-turn mode.

    Returns without sending when there is no session, when ``auto_turn`` is
    not enabled on the last options, when the event carries no usable audio
    payload, or when normalization or sending fails. Failures are swallowed;
    they are printed only when ``self.debug`` is set.

    :param event: realtime event whose ``data`` dict (or ``data["payload"]``)
        holds ``data`` and optionally ``mime``, ``rate`` and ``final``
    """
    if not self._session:
        return
    try:
        if not bool(getattr(self._last_opts, "auto_turn", False)):
            # Only handle here when auto-turn is on; manual mode uses ActivityStart/End path.
            return
    except Exception:
        return

    # Extract normalized payload (optionally nested under "payload")
    payload = getattr(event, "data", {}) or {}
    if isinstance(payload, dict) and isinstance(payload.get("payload"), dict):
        payload = payload["payload"]
    if not isinstance(payload, dict):
        # Unknown payload shape: drop it instead of failing on .get() below
        return

    data: bytes = payload.get("data") or b""
    if not data:
        return
    mime = str(payload.get("mime") or "audio/pcm")
    rate = int(payload.get("rate") or 0) or self._IN_RATE
    is_final = bool(payload.get("final", False))

    # Normalize to LINEAR16 mono at the configured input rate
    fmt_hint = "pcm16" if mime.startswith("audio/pcm") else None
    try:
        pcm, norm_rate = to_pcm16_mono(data, fmt_hint, rate, target_rate=self._IN_RATE)
    except Exception as e:
        if self.debug:
            print(f"[google.audio:auto] normalize failed: {e!r}")
        return

    # Ensure a receiver for this auto-turn is running before sending audio
    self._ensure_auto_receiver_started()

    # Mark that auto-turn audio has been sent in this turn
    self._auto_audio_in_flight = True

    # Send audio blob; Gemini Live handles VAD automatically in auto mode
    if self._send_lock is None:
        self._send_lock = asyncio.Lock()

    async with self._send_lock:
        try:
            await self._session.send_realtime_input(
                audio=gtypes.Blob(data=pcm, mime_type=f"audio/pcm;rate={int(norm_rate)}")
            )
        except Exception as e:
            if self.debug:
                print(f"[google.audio:auto] send failed: {e!r}")
            return

        # If stream end is flagged, flush server-side VAD buffer
        if is_final:
            try:
                await self._session.send_realtime_input(audio_stream_end=True)
                self._emit_audio_commit_signal()  # fire once for explicit flush
            except Exception as e:
                if self.debug:
                    print(f"[google.audio:auto] stream end failed: {e!r}")
|
|
734
|
+
|
|
735
|
+
def commit_audio_input_sync(self, timeout: float = 0.5):
    """
    Blocking entrypoint that flushes the input audio stream (auto-turn mode).

    Runs ``_commit_audio_input_internal`` through ``self._bg.run_sync``; that
    coroutine sends ``audio_stream_end`` so the model processes whatever audio
    is buffered. Errors are swallowed so the caller never sees an exception.

    :param timeout: seconds passed to ``run_sync``
    """
    self._ensure_background_loop()
    try:
        flush_job = self._commit_audio_input_internal()
        self._bg.run_sync(flush_job, timeout=timeout)
    except Exception:
        # Never raise to caller
        return
|
|
747
|
+
|
|
748
|
+
async def _commit_audio_input_internal(self):
    """
    Owner-loop implementation: flush the server-side VAD buffer (auto-turn only).

    No-op when there is no session or when ``auto_turn`` is not enabled on the
    last options. Otherwise makes sure a receiver is running, then sends
    ``audio_stream_end`` under the send lock and emits the commit signal.
    Send errors are swallowed.
    """
    if not self._session:
        return

    try:
        auto_turn = bool(getattr(self._last_opts, "auto_turn", False))
    except Exception:
        return
    if not auto_turn:
        return

    # A receiver must be listening before the flush triggers a response
    self._ensure_auto_receiver_started()

    lock = self._send_lock
    if lock is None:
        lock = self._send_lock = asyncio.Lock()

    async with lock:
        try:
            await self._session.send_realtime_input(audio_stream_end=True)
            self._emit_audio_commit_signal()  # fire once for explicit flush
        except Exception:
            pass
|
|
771
|
+
|
|
772
|
+
def force_response_now_sync(self, timeout: float = 5.0):
    """
    Blocking entrypoint that forces a model response from the current input buffer.

    Runs ``_force_response_now_internal`` through ``self._bg.run_sync``.
    Errors are swallowed; nothing propagates to the caller.

    :param timeout: seconds passed to ``run_sync``
    """
    self._ensure_background_loop()
    try:
        force_job = self._force_response_now_internal()
        self._bg.run_sync(force_job, timeout=timeout)
    except Exception:
        # Defensive: do not propagate errors to caller
        return
|
|
783
|
+
|
|
784
|
+
async def _force_response_now_internal(self):
    """
    Owner-loop: force the model to respond to the audio buffered so far.

    In auto-turn mode this ensures a receive task is running and sends
    ``audio_stream_end``; in manual mode, or without a session, it is a no-op.
    Those steps are exactly what ``_commit_audio_input_internal`` performs, so
    this method delegates to it instead of keeping a second copy of the logic.
    """
    await self._commit_audio_input_internal()
|
|
809
|
+
|
|
810
|
+
async def _recv_one_turn(self):
    """
    Receive one turn until serverContent.turnComplete or a tool call.

    For every message from ``self._session.receive()`` this:
      0. stores a new session-resumption handle, if one is reported,
      1. captures token usage,
      2. forwards raw audio bytes through ``_audio_push``,
      3. processes server content (transcription, text, function calls,
         code execution, inline images, citations, turn-complete flag),
      4. collects calls from a dedicated toolCall message.

    The ``finally`` section always persists collected output into the context
    item, marks the response as done, emits the end-of-turn event and resets
    per-turn state. Exceptions (including cancellation) are not re-raised.

    NOTE(review): block nesting was reconstructed from a flattened listing;
    confirm against the packaged file before applying.
    """
    if self.debug:
        print("[google._recv_one_turn] start")

    # Per-turn state is set to None at the end of every turn. The handlers
    # below index it directly, so make sure a fresh dict exists.
    if self._rt_state is None:
        self._rt_reset_state()

    turn_finished = False

    try:
        async for response in self._session.receive():
            # 0) Session resumption updates (store last resumable handle)
            try:
                sru = getattr(response, "session_resumption_update", None) or getattr(response, "sessionResumptionUpdate", None)
                if sru:
                    resumable = bool(getattr(sru, "resumable", None))
                    new_handle = getattr(sru, "new_handle", None) or getattr(sru, "newHandle", None)
                    if resumable and isinstance(new_handle, str) and new_handle.strip():
                        self._rt_session_id = new_handle.strip()
                        set_ctx_rt_handle(self._ctx, self._rt_session_id, self.window)
                        if self.debug:
                            print(f"[google.live] session handle updated: {self._rt_session_id}")
            except Exception:
                pass

            # 1) Usage (top-level)
            try:
                um = getattr(response, "usage_metadata", None) or getattr(response, "usageMetadata", None)
                if um:
                    self._rt_capture_google_usage(um)
            except Exception:
                pass

            # 2) Preferred audio source: response.data (PCM16@24kHz)
            data = getattr(response, "data", None)
            if isinstance(data, (bytes, bytearray)):
                # First output from model -> emit commit once (auto-turn only)
                self._maybe_emit_auto_commit()
                self._saw_data_stream = True
                await self._audio_push(bytes(data), final=False)

            # 3) Server content
            sc = getattr(response, "server_content", None) or getattr(response, "serverContent", None)
            if sc:
                # Any serverContent reaching here implies the model started processing;
                # emit commit once if not yet emitted (auto-turn only).
                self._maybe_emit_auto_commit()

                # Output transcription (often cumulative)
                out_tr = getattr(sc, "output_transcription", None) or getattr(sc, "outputTranscription", None)
                if out_tr and getattr(out_tr, "text", None) and self._on_text:
                    full = str(out_tr.text)
                    # Forward only the new suffix when the text extends the previous one
                    delta = full[len(self._last_out_tr):] if full.startswith(self._last_out_tr) else full
                    self._last_out_tr = full
                    if delta.strip():
                        self._turn_text_parts.append(delta)
                        try:
                            await self._on_text(delta)
                        except Exception:
                            pass

                # Optional: input transcription (handy in manual mode)
                in_tr = getattr(sc, "input_transcription", None) or getattr(sc, "inputTranscription", None)
                if in_tr and getattr(in_tr, "text", None) and self.debug:
                    print("[google.input_tr]", in_tr.text)

                # Model turn parts
                model_turn = getattr(sc, "model_turn", None) or getattr(sc, "modelTurn", None)
                if model_turn:
                    parts = getattr(model_turn, "parts", None) or []
                    for p in parts:
                        # Function call parts
                        fc = getattr(p, "function_call", None) or (p.get("function_call") if isinstance(p, dict) else None)
                        if fc:
                            name = getattr(fc, "name", None) or (fc.get("name") if isinstance(fc, dict) else "")
                            args_obj = getattr(fc, "args", None) or (fc.get("args") if isinstance(fc, dict) else {})
                            args_dict = self._to_plain_dict(args_obj) or {}
                            # Read the id the same way as name/args so a dict-shaped call keeps it
                            call_id = getattr(fc, "id", None) or (fc.get("id") if isinstance(fc, dict) else None)
                            self._rt_state["tool_calls"].append({
                                "id": call_id or "",
                                "type": "function",
                                "function": {
                                    "name": name or "",
                                    "arguments": json.dumps(args_dict, ensure_ascii=False),
                                }
                            })
                            self._rt_state["force_func_call"] = True
                            self._last_tool_calls = list(self._rt_state["tool_calls"])
                            turn_finished = True  # let the app run tools now

                        # Text part
                        txt = getattr(p, "text", None) or (p.get("text") if isinstance(p, dict) else None)
                        if txt and self._on_text:
                            s = str(txt)
                            self._turn_text_parts.append(s)
                            try:
                                await self._on_text(s)
                            except Exception:
                                pass

                        # Code execution parts
                        ex = getattr(p, "executable_code", None) or (p.get("executable_code") if isinstance(p, dict) else None)
                        if ex:
                            lang = (getattr(ex, "language", None) or "python").strip() or "python"
                            code_txt = (
                                getattr(ex, "code", None) or
                                getattr(ex, "program", None) or
                                getattr(ex, "source", None) or
                                ""
                            )
                            if not self._rt_state["is_code"]:
                                # First code chunk: open a fenced block with a header
                                hdr = f"\n\n**Code interpreter**\n```{lang.lower()}\n"
                                self._turn_text_parts.append(hdr + str(code_txt))
                                try:
                                    if self._on_text:
                                        await self._on_text(hdr + str(code_txt))
                                except Exception:
                                    pass
                                self._rt_state["is_code"] = True
                            else:
                                self._turn_text_parts.append(str(code_txt))
                                try:
                                    if self._on_text:
                                        await self._on_text(str(code_txt))
                                except Exception:
                                    pass

                        cer = getattr(p, "code_execution_result", None) or (p.get("code_execution_result") if isinstance(p, dict) else None)
                        if cer and self._rt_state["is_code"]:
                            # Execution result closes the fenced block opened above
                            tail = "\n\n```\n-----------\n"
                            self._turn_text_parts.append(tail)
                            try:
                                if self._on_text:
                                    await self._on_text(tail)
                            except Exception:
                                pass
                            self._rt_state["is_code"] = False

                        # Inline images
                        inline = getattr(p, "inline_data", None) or (p.get("inline_data") if isinstance(p, dict) else None)
                        if inline:
                            mime = (getattr(inline, "mime_type", "") or (inline.get("mime_type") if isinstance(inline, dict) else "") or "").lower()
                            if mime.startswith("image/"):
                                pdata = getattr(inline, "data", None) if not isinstance(inline, dict) else inline.get("data")
                                try:
                                    img_bytes = None
                                    if isinstance(pdata, (bytes, bytearray)):
                                        img_bytes = bytes(pdata)
                                    elif isinstance(pdata, str):
                                        img_bytes = base64.b64decode(pdata)
                                    if img_bytes:
                                        save_path = self.window.core.image.gen_unique_path(self._ctx)
                                        with open(save_path, "wb") as f:
                                            f.write(img_bytes)
                                        self._rt_state["image_paths"].append(save_path)
                                        if not isinstance(self._ctx.images, list):
                                            self._ctx.images = []
                                        if save_path not in self._ctx.images:
                                            self._ctx.images.append(save_path)
                                except Exception:
                                    pass

                # Citations (grounding)
                try:
                    self._collect_google_citations_from_server_content(sc)
                except Exception:
                    pass

                # Turn complete signal
                try:
                    if bool(getattr(sc, "turn_complete", None) or getattr(sc, "turnComplete", None)):
                        turn_finished = True
                except Exception:
                    pass

            # 4) Dedicated toolCall message
            tc = getattr(response, "tool_call", None) or getattr(response, "toolCall", None)
            if tc:
                self._maybe_emit_auto_commit()  # ensure commit signaled before handing off to tools
                fcs = getattr(tc, "function_calls", None) or getattr(tc, "functionCalls", None) or []
                new_calls = []
                for fc in fcs:
                    name = getattr(fc, "name", "") or ""
                    args_obj = getattr(fc, "args", {}) or {}
                    args_dict = self._to_plain_dict(args_obj) or {}
                    new_calls.append({
                        "id": getattr(fc, "id", "") or "",
                        "type": "function",
                        "function": {
                            "name": name,
                            "arguments": json.dumps(args_dict, ensure_ascii=False),
                        }
                    })
                if new_calls:
                    # Skip calls already recorded from model_turn parts (same name + arguments)
                    seen = {(tc["function"]["name"], tc["function"]["arguments"]) for tc in self._rt_state["tool_calls"]}
                    for c in new_calls:
                        key = (c["function"]["name"], c["function"]["arguments"])
                        if key not in seen:
                            self._rt_state["tool_calls"].append(c)
                            seen.add(key)
                    self._rt_state["force_func_call"] = True
                    self._last_tool_calls = list(self._rt_state["tool_calls"])
                    turn_finished = True

            if turn_finished:
                break

        # Flush jitter buffer
        await self._audio_push(b"", final=True)

    except asyncio.CancelledError:
        # Cancellation is absorbed here: flush audio and fall through to finally
        try:
            await self._audio_push(b"", final=True)
        except Exception:
            pass
    except Exception as e:
        if self.debug:
            print(f"[google._recv_one_turn] exception: {e!r}")
        try:
            await self._audio_push(b"", final=True)
        except Exception:
            pass
    finally:
        # Persist textual output
        try:
            if self.window and self.window.core and self._ctx:
                txt = coalesce_text(self._turn_text_parts)
                if has_unclosed_code_tag(txt):
                    txt += "\n```"
                if txt:
                    self._ctx.output = txt
                # Tokens usage
                up = (self._rt_state or {}).get("usage_payload") or {}
                if up:
                    in_tok = up.get("in")
                    out_tok = up.get("out")
                    self._ctx.set_tokens(in_tok if in_tok is not None else (self._ctx.input_tokens or 0),
                                         out_tok if out_tok is not None else 0)
                    try:
                        if not isinstance(self._ctx.extra, dict):
                            self._ctx.extra = {}
                        self._ctx.extra["usage"] = {
                            "vendor": "google",
                            "input_tokens": in_tok,
                            "output_tokens": out_tok,
                            "reasoning_tokens": up.get("reasoning", 0),
                            "total_reported": up.get("total"),
                        }
                    except Exception:
                        pass

                # Citations to ctx.urls
                cites = (self._rt_state or {}).get("citations") or []
                if cites:
                    if self._ctx.urls is None:
                        self._ctx.urls = []
                    for u in cites:
                        if u not in self._ctx.urls:
                            self._ctx.urls.append(u)

                # Images to ctx.images
                imgs = (self._rt_state or {}).get("image_paths") or []
                if imgs:
                    if not isinstance(self._ctx.images, list):
                        self._ctx.images = []
                    for p in imgs:
                        if p not in self._ctx.images:
                            self._ctx.images.append(p)

                # Unpack tool calls
                tcs = (self._rt_state or {}).get("tool_calls") or []
                if tcs:
                    for tc in tcs:
                        fn = tc.get("function") or {}
                        if isinstance(fn.get("arguments"), dict):
                            fn["arguments"] = json.dumps(fn["arguments"], ensure_ascii=False)
                    self._ctx.force_call = bool((self._rt_state or {}).get("force_func_call"))
                    self.window.core.debug.info("[google.live] Tool calls found, unpacking...")
                    self.window.core.command.unpack_tool_calls_chunks(self._ctx, tcs)

                self.window.core.ctx.update_item(self._ctx)
        except Exception:
            pass

        # Mark done for waiters
        self._response_active = False
        if self._response_done:
            try:
                self._response_done.set()
            except Exception:
                pass

        # Emit end-of-turn event for audio pipeline symmetry with OpenAI
        try:
            if self._last_opts and hasattr(self._last_opts, "rt_signals"):
                self._last_opts.rt_signals.response.emit(RealtimeEvent(RealtimeEvent.RT_OUTPUT_TURN_END, {
                    "ctx": self._ctx,
                }))
        except Exception:
            pass

        # Reset per-turn state
        self._rt_state = None
        self._auto_audio_in_flight = False

        if self.debug:
            print("[google._recv_one_turn] done")
|
|
1114
|
+
|
|
1115
|
+
# -----------------------------
|
|
1116
|
+
# Public: live tools update
|
|
1117
|
+
# -----------------------------
|
|
1118
|
+
|
|
1119
|
+
async def update_session_tools(
    self,
    tools: Optional[list] = None,
    remote_tools: Optional[list] = None,
    force: bool = False,
):
    """
    Update session tools for Google Live.

    Per the original author's note, the Live API cannot change tool config
    mid-session through the SDK, so the internal implementation restarts the
    session (with best-effort resumption) when the tools changed. With no
    open session only the cached options are updated.

    :param tools: function tools to apply; ``None`` falls back to the last options
    :param remote_tools: remote tools to apply; ``None`` falls back to the last options
    :param force: restart even when the tool signature is unchanged
    :return: whatever ``_run_on_owner`` returns for the internal coroutine
    """
    self._ensure_background_loop()
    job = self._update_session_tools_internal(tools, remote_tools, force)
    return await self._run_on_owner(job)
|
|
1135
|
+
|
|
1136
|
+
def update_session_tools_sync(
    self,
    tools: Optional[list] = None,
    remote_tools: Optional[list] = None,
    force: bool = False,
    timeout: float = 10.0,
):
    """
    Blocking variant of ``update_session_tools()``.

    Runs ``_update_session_tools_internal`` through ``self._bg.run_sync``.
    Unlike the audio entrypoints, exceptions are not swallowed here.

    :param tools: function tools to apply; ``None`` falls back to the last options
    :param remote_tools: remote tools to apply; ``None`` falls back to the last options
    :param force: restart even when the tool signature is unchanged
    :param timeout: seconds passed to ``run_sync``
    :return: the result of ``run_sync``
    """
    self._ensure_background_loop()
    job = self._update_session_tools_internal(tools, remote_tools, force)
    return self._bg.run_sync(job, timeout=timeout)
|
|
1149
|
+
|
|
1150
|
+
async def _update_session_tools_internal(
    self,
    tools: Optional[list],
    remote_tools: Optional[list],
    force: bool,
):
    """
    Owner-loop implementation for tools update on Google Live.

    Strategy:
    - Sanitize and compute signature of the requested tools set.
    - If session is closed: update last opts and clear the cached signature.
    - If session is open and tools are unchanged (and not forced): update
      last opts only.
    - If session is open and tools changed (or force=True):
        * Wait for any active response to finish.
        * Restart the Live session and request resumption using the last known handle.

    :param tools: function tools; ``None`` falls back to ``self._last_opts.tools``
    :param remote_tools: remote tools; ``None`` falls back to ``self._last_opts.remote_tools``
    :param force: restart even when the signature matches the cached one

    NOTE(review): block nesting (in particular what runs under the send lock)
    was reconstructed from a flattened listing; confirm against the packaged file.
    """
    # Prepare target tools (prefer explicit args, fallback to last opts)
    try:
        target_tools_raw = tools if tools is not None else getattr(self._last_opts, "tools", None)
    except Exception:
        target_tools_raw = None
    try:
        target_remote_raw = remote_tools if remote_tools is not None else getattr(self._last_opts, "remote_tools", None)
    except Exception:
        target_remote_raw = None

    session_tools = self._sanitize_tools(target_tools_raw, target_remote_raw)
    new_sig = self._tools_signature(session_tools or [])

    # If session is not open, just cache for next open
    if not self._session:
        self._update_last_opts_tools(tools, remote_tools)
        self._cached_session_tools_sig = None
        if self.debug:
            print("[google.update_session_tools] session not open; cached for next open")
        return

    # Skip if unchanged
    if not force and self._cached_session_tools_sig == new_sig:
        self._update_last_opts_tools(tools, remote_tools)
        if self.debug:
            print("[google.update_session_tools] no changes; skipping restart")
        return

    # Ensure previous response is finished
    if self._send_lock is None:
        self._send_lock = asyncio.Lock()
    async with self._send_lock:
        if self._response_active and self._response_done:
            if self.debug:
                print("[google.update_session_tools] waiting for active response to finish")
            try:
                await self._response_done.wait()
            except Exception:
                pass

    # Persist new tools into last opts
    self._update_last_opts_tools(tools, remote_tools)

    # Try to resume the session state after restart if possible
    prev_handle = self._rt_session_id

    # Inject resumption handle into opts for the next open
    try:
        if self._last_opts is not None and prev_handle:
            setattr(self._last_opts, "rt_session_id", prev_handle)
    except Exception:
        pass

    if self.debug:
        print("[google.update_session_tools] restarting session to apply new tools")

    # Restart session with updated opts and best-effort resume
    await self._reset_session_internal(
        ctx=self._ctx,
        opts=self._last_opts,
        on_text=self._on_text,
        on_audio=self._on_audio,
        should_stop=self._should_stop,
    )

    # Cache new signature to suppress redundant restarts
    self._cached_session_tools_sig = new_sig

    if self.debug:
        # session_tools may be None (see the "or []" above), so guard len()
        print(f"[google.update_session_tools] session restarted; tools={len(session_tools or [])}")
|
|
1236
|
+
|
|
1237
|
+
# -----------------------------
|
|
1238
|
+
# Public: send tool results back to the model
|
|
1239
|
+
# -----------------------------
|
|
1240
|
+
|
|
1241
|
+
async def send_tool_results(
    self,
    results,
    continue_turn: bool = True,
    wait_for_done: bool = True,
):
    """
    Send tool results back to the Live session (FunctionResponse list).

    :param results: tool outputs, converted by the internal implementation
    :param continue_turn: start receiving the follow-up model turn after sending
    :param wait_for_done: wait for that follow-up turn to finish
    :return: whatever ``_run_on_owner`` returns for the internal coroutine
    """
    self._ensure_background_loop()
    job = self._send_tool_results_internal(results, continue_turn, wait_for_done)
    return await self._run_on_owner(job)
|
|
1254
|
+
|
|
1255
|
+
def send_tool_results_sync(
    self,
    results,
    continue_turn: bool = True,
    wait_for_done: bool = True,
    timeout: float = 20.0,
):
    """
    Blocking variant of ``send_tool_results()``.

    Runs ``_send_tool_results_internal`` through ``self._bg.run_sync``;
    exceptions are not swallowed here.

    :param results: tool outputs, converted by the internal implementation
    :param continue_turn: start receiving the follow-up model turn after sending
    :param wait_for_done: wait for that follow-up turn to finish
    :param timeout: seconds passed to ``run_sync``
    :return: the result of ``run_sync``
    """
    self._ensure_background_loop()
    job = self._send_tool_results_internal(results, continue_turn, wait_for_done)
    return self._bg.run_sync(job, timeout=timeout)
|
|
1270
|
+
|
|
1271
|
+
async def _send_tool_results_internal(
    self,
    results,
    continue_turn: bool,
    wait_for_done: bool,
):
    """
    Owner-loop implementation of ``send_tool_results``.

    Converts ``results`` into ``gtypes.FunctionResponse`` objects, sends them
    under the send lock and, when ``continue_turn`` is set, resets per-turn
    buffers and starts a receiver task for the follow-up model turn.

    :param results: tool outputs to report back
    :param continue_turn: start receiving the follow-up turn after sending
    :param wait_for_done: await the follow-up receiver task (its errors are ignored)
    :raises RuntimeError: if the session is not open, the payload cannot be
        built, or ``send_tool_response`` fails

    NOTE(review): nesting of the ``continue_turn`` / ``wait_for_done`` sections
    relative to the send lock was reconstructed from a flattened listing;
    confirm against the packaged file.
    """
    if not self._session:
        raise RuntimeError("Live session is not open")

    # Build neutral list and convert to gtypes.FunctionResponse[]
    try:
        entries = build_function_responses_payload(results, self._last_tool_calls)
    except Exception as e:
        raise RuntimeError(f"Invalid tool results payload: {e}") from e

    if not entries:
        return

    responses = []
    for entry in entries:
        responses.append(
            gtypes.FunctionResponse(
                id=entry.get("id") or "",
                name=entry.get("name") or "",
                response=entry.get("response") or {},
            )
        )

    if self._send_lock is None:
        self._send_lock = asyncio.Lock()
    async with self._send_lock:
        try:
            await self._session.send_tool_response(function_responses=responses)
        except Exception as e:
            raise RuntimeError(f"send_tool_response failed: {e}") from e

        if continue_turn:
            # Fresh buffers and state for the follow-up turn
            self._turn_text_parts = []
            self._last_out_tr = ""
            self._audio_buf.clear()
            self._saw_data_stream = False
            self._rt_reset_state()

            self._response_active = True
            if self._response_done is not None:
                try:
                    self._response_done.clear()
                except Exception:
                    self._response_done = asyncio.Event()
            else:
                self._response_done = asyncio.Event()

            self._turn_task = asyncio.create_task(self._recv_one_turn(), name="google-live-turn-followup")

            if wait_for_done:
                try:
                    await self._turn_task
                except Exception:
                    pass
|
|
1328
|
+
|
|
1329
|
+
# -----------------------------
|
|
1330
|
+
# Helpers
|
|
1331
|
+
# -----------------------------
|
|
1332
|
+
|
|
1333
|
+
def _preferred_voice(self) -> str:
    """
    Return the TTS voice configured for the ``audio_output`` plugin.

    Known names are matched case-insensitively (ignoring surrounding
    whitespace) and returned in canonical capitalization; any other non-empty
    value is returned as its string form. Falls back to ``"Kore"`` when the
    option is empty or cannot be read.
    """
    fallback = "Kore"
    try:
        chosen = self.window.core.plugins.get_option("audio_output", "google_genai_tts_voice")
        if not chosen:
            return fallback
        canonical = {"kore": "Kore", "puck": "Puck", "charon": "Charon", "verse": "Verse", "legend": "Legend"}
        lookup_key = str(chosen).strip().lower()
        return canonical.get(lookup_key, str(chosen))
    except Exception:
        return fallback
|
|
1345
|
+
|
|
1346
|
+
async def _audio_push(self, data: bytes, final: bool = False):
    """
    Buffer outgoing audio and hand it to ``self._on_audio`` in fixed-size chunks.

    The chunk size is ``_OUT_BYTES_PER_MS * _OUT_CHUNK_MS`` bytes. With
    ``final=True`` any remainder is delivered, the buffer is cleared, and an
    empty chunk flagged as final is sent. Callback errors are ignored. Does
    nothing at all when no audio callback is set.

    :param data: audio bytes to append (may be empty)
    :param final: flush the remainder and signal end of stream
    """
    if not self._on_audio:
        return

    async def deliver(payload: bytes, last: bool) -> None:
        # One callback invocation; failures must not break the stream
        try:
            await self._on_audio(payload, "audio/pcm", self._OUT_RATE, 1, last)
        except Exception:
            pass

    if data:
        self._audio_buf.extend(data)
        size = self._OUT_BYTES_PER_MS * self._OUT_CHUNK_MS
        while len(self._audio_buf) >= size:
            head = bytes(self._audio_buf[:size])
            del self._audio_buf[:size]
            await deliver(head, False)

    if not final:
        return

    if self._audio_buf:
        await deliver(bytes(self._audio_buf), False)
        self._audio_buf.clear()
    await deliver(b"", True)
|
|
1373
|
+
|
|
1374
|
+
def _to_plain_dict(self, obj: Any) -> Any:
    """
    Convert an object into plain Python containers where possible.

    The first available of ``to_json_dict()``, ``model_dump()`` and
    ``to_dict()`` is used; if that call raises, the object is handled like any
    other value. Dicts are converted value by value, lists and tuples become
    lists, and everything else is returned unchanged.

    :param obj: value to convert
    :return: converted value (or ``obj`` itself)
    """
    try:
        for exporter in ("to_json_dict", "model_dump", "to_dict"):
            if hasattr(obj, exporter):
                return getattr(obj, exporter)()
    except Exception:
        pass

    if isinstance(obj, dict):
        return {key: self._to_plain_dict(val) for key, val in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [self._to_plain_dict(item) for item in obj]
    return obj
|
|
1392
|
+
|
|
1393
|
+
def _rt_reset_state(self):
|
|
1394
|
+
"""Reset per-turn realtime state."""
|
|
1395
|
+
self._rt_state = {
|
|
1396
|
+
"tool_calls": [],
|
|
1397
|
+
"citations": [],
|
|
1398
|
+
"files": [],
|
|
1399
|
+
"image_paths": [],
|
|
1400
|
+
"is_image": False,
|
|
1401
|
+
"is_code": False,
|
|
1402
|
+
"force_func_call": False,
|
|
1403
|
+
"usage_payload": {},
|
|
1404
|
+
"auto_commit_signaled": False,
|
|
1405
|
+
}
|
|
1406
|
+
|
|
1407
|
+
def _rt_capture_google_usage(self, um_obj: Any):
|
|
1408
|
+
"""
|
|
1409
|
+
Capture Google GenAI token usage from usage_metadata object.
|
|
1410
|
+
"""
|
|
1411
|
+
if not um_obj or self._rt_state is None:
|
|
1412
|
+
return
|
|
1413
|
+
|
|
1414
|
+
def as_int(v):
|
|
1415
|
+
try:
|
|
1416
|
+
if v is None:
|
|
1417
|
+
return None
|
|
1418
|
+
return int(v)
|
|
1419
|
+
except Exception:
|
|
1420
|
+
try:
|
|
1421
|
+
return int(float(v))
|
|
1422
|
+
except Exception:
|
|
1423
|
+
return None
|
|
1424
|
+
|
|
1425
|
+
prompt = (getattr(um_obj, "prompt_token_count", None)
|
|
1426
|
+
or getattr(um_obj, "promptTokenCount", None)
|
|
1427
|
+
or getattr(um_obj, "prompt_tokens", None)
|
|
1428
|
+
or None)
|
|
1429
|
+
total = (getattr(um_obj, "total_token_count", None)
|
|
1430
|
+
or getattr(um_obj, "totalTokenCount", None)
|
|
1431
|
+
or getattr(um_obj, "total_tokens", None)
|
|
1432
|
+
or None)
|
|
1433
|
+
candidates = (getattr(um_obj, "candidates_token_count", None)
|
|
1434
|
+
or getattr(um_obj, "candidatesTokenCount", None)
|
|
1435
|
+
or getattr(um_obj, "output_tokens", None)
|
|
1436
|
+
or None)
|
|
1437
|
+
reasoning = (getattr(um_obj, "candidates_reasoning_token_count", None)
|
|
1438
|
+
or getattr(um_obj, "candidatesReasoningTokenCount", None)
|
|
1439
|
+
or getattr(um_obj, "reasoning_tokens", None)
|
|
1440
|
+
or 0)
|
|
1441
|
+
p = as_int(prompt)
|
|
1442
|
+
t = as_int(total)
|
|
1443
|
+
c = as_int(candidates)
|
|
1444
|
+
r = as_int(reasoning) or 0
|
|
1445
|
+
out_total = max(0, (t or 0) - (p or 0)) if (t is not None and p is not None) else c
|
|
1446
|
+
self._rt_state["usage_payload"] = {"in": p, "out": out_total, "reasoning": r, "total": t}
|
|
1447
|
+
|
|
1448
|
+
def _collect_google_citations_from_server_content(self, sc: Any):
|
|
1449
|
+
"""
|
|
1450
|
+
Collect citations (URLs) from server_content grounding metadata and add to rt_state and ctx.urls.
|
|
1451
|
+
"""
|
|
1452
|
+
if self._rt_state is None:
|
|
1453
|
+
return
|
|
1454
|
+
|
|
1455
|
+
def add_url(url: Optional[str]):
|
|
1456
|
+
if not url or not isinstance(url, str):
|
|
1457
|
+
return
|
|
1458
|
+
u = url.strip()
|
|
1459
|
+
if not (u.startswith("http://") or u.startswith("https://")):
|
|
1460
|
+
return
|
|
1461
|
+
if u not in self._rt_state["citations"]:
|
|
1462
|
+
self._rt_state["citations"].append(u)
|
|
1463
|
+
try:
|
|
1464
|
+
if self._ctx:
|
|
1465
|
+
if self._ctx.urls is None:
|
|
1466
|
+
self._ctx.urls = []
|
|
1467
|
+
if u not in self._ctx.urls:
|
|
1468
|
+
self._ctx.urls.append(u)
|
|
1469
|
+
except Exception:
|
|
1470
|
+
pass
|
|
1471
|
+
|
|
1472
|
+
gm = getattr(sc, "grounding_metadata", None) or getattr(sc, "groundingMetadata", None)
|
|
1473
|
+
if gm:
|
|
1474
|
+
atts = getattr(gm, "grounding_attributions", None) or getattr(gm, "groundingAttributions", None) or []
|
|
1475
|
+
try:
|
|
1476
|
+
for att in atts or []:
|
|
1477
|
+
for path in (
|
|
1478
|
+
"web.uri", "web.url", "source.web.uri", "source.web.url",
|
|
1479
|
+
"source.uri", "source.url", "uri", "url",
|
|
1480
|
+
):
|
|
1481
|
+
add_url(self._safe_get(att, path))
|
|
1482
|
+
except Exception:
|
|
1483
|
+
pass
|
|
1484
|
+
for path in (
|
|
1485
|
+
"search_entry_point.uri", "search_entry_point.url",
|
|
1486
|
+
"searchEntryPoint.uri", "searchEntryPoint.url",
|
|
1487
|
+
"search_entry_point.rendered_content_uri", "searchEntryPoint.rendered_content_uri",
|
|
1488
|
+
):
|
|
1489
|
+
add_url(self._safe_get(gm, path))
|
|
1490
|
+
|
|
1491
|
+
try:
|
|
1492
|
+
mt = getattr(sc, "model_turn", None) or getattr(sc, "modelTurn", None)
|
|
1493
|
+
parts = getattr(mt, "parts", None) or []
|
|
1494
|
+
for p in parts:
|
|
1495
|
+
pcm = self._safe_get(p, "citation_metadata") or self._safe_get(p, "citationMetadata")
|
|
1496
|
+
if pcm:
|
|
1497
|
+
arr = (self._safe_get(pcm, "citation_sources")
|
|
1498
|
+
or self._safe_get(pcm, "citationSources")
|
|
1499
|
+
or self._safe_get(pcm, "citations") or []
|
|
1500
|
+
)
|
|
1501
|
+
for cit in arr or []:
|
|
1502
|
+
for path in ("uri", "url", "source.uri", "source.url", "web.uri", "web.url"):
|
|
1503
|
+
add_url(self._safe_get(cit, path))
|
|
1504
|
+
gpa = self._safe_get(p, "grounding_attributions") or self._safe_get(p, "groundingAttributions") or []
|
|
1505
|
+
for att in gpa or []:
|
|
1506
|
+
for path in ("web.uri", "web.url", "source.web.uri", "source.web.url", "uri", "url"):
|
|
1507
|
+
add_url(self._safe_get(att, path))
|
|
1508
|
+
except Exception:
|
|
1509
|
+
pass
|
|
1510
|
+
|
|
1511
|
+
def _safe_get(self, obj, path: str) -> Any:
|
|
1512
|
+
"""
|
|
1513
|
+
Safely get a nested attribute or dict key by dot-separated path.
|
|
1514
|
+
"""
|
|
1515
|
+
cur = obj
|
|
1516
|
+
for seg in path.split("."):
|
|
1517
|
+
if cur is None:
|
|
1518
|
+
return None
|
|
1519
|
+
if isinstance(cur, dict):
|
|
1520
|
+
cur = cur.get(seg)
|
|
1521
|
+
else:
|
|
1522
|
+
if seg.isdigit() and isinstance(cur, (list, tuple)):
|
|
1523
|
+
idx = int(seg)
|
|
1524
|
+
if 0 <= idx < len(cur):
|
|
1525
|
+
cur = cur[idx]
|
|
1526
|
+
else:
|
|
1527
|
+
return None
|
|
1528
|
+
else:
|
|
1529
|
+
cur = getattr(cur, seg, None)
|
|
1530
|
+
return cur
|
|
1531
|
+
|
|
1532
|
+
# -------- tools sanitizer for Live config (dict-only, robust) --------
|
|
1533
|
+
|
|
1534
|
+
def _sanitize_tools(self, tools: Any, remote_tools: Optional[list] = None) -> list:
|
|
1535
|
+
"""
|
|
1536
|
+
Normalize opts.tools into Live API config.tools (list of dicts).
|
|
1537
|
+
Supports gtypes.Tool, dict, or mixed list.
|
|
1538
|
+
"""
|
|
1539
|
+
out: list = []
|
|
1540
|
+
sigset: set[str] = set()
|
|
1541
|
+
|
|
1542
|
+
def add(entry: dict):
|
|
1543
|
+
try:
|
|
1544
|
+
sig = json.dumps(entry, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
|
|
1545
|
+
except Exception:
|
|
1546
|
+
sig = str(entry)
|
|
1547
|
+
if sig not in sigset:
|
|
1548
|
+
out.append(entry)
|
|
1549
|
+
sigset.add(sig)
|
|
1550
|
+
|
|
1551
|
+
def handle_tool_obj(t):
|
|
1552
|
+
# Convert gtypes.Tool -> dict
|
|
1553
|
+
fd_list = []
|
|
1554
|
+
fds = getattr(t, "function_declarations", None) or getattr(t, "functionDeclarations", None) or []
|
|
1555
|
+
for fd in fds or []:
|
|
1556
|
+
fd_dict = self._fd_to_dict(fd)
|
|
1557
|
+
if fd_dict:
|
|
1558
|
+
fd_list.append(fd_dict)
|
|
1559
|
+
if fd_list:
|
|
1560
|
+
add({"function_declarations": fd_list})
|
|
1561
|
+
|
|
1562
|
+
# built-ins
|
|
1563
|
+
if getattr(t, "code_execution", None) or getattr(t, "codeExecution", None):
|
|
1564
|
+
add({"code_execution": {}})
|
|
1565
|
+
if getattr(t, "google_search", None) or getattr(t, "googleSearch", None):
|
|
1566
|
+
add({"google_search": {}})
|
|
1567
|
+
if getattr(t, "url_context", None) or getattr(t, "urlContext", None):
|
|
1568
|
+
add({"url_context": {}})
|
|
1569
|
+
|
|
1570
|
+
def handle_tool_dict(d: dict):
|
|
1571
|
+
fds = d.get("function_declarations") or d.get("functionDeclarations")
|
|
1572
|
+
if isinstance(fds, list) and fds:
|
|
1573
|
+
fd_list = []
|
|
1574
|
+
for fd in fds:
|
|
1575
|
+
fd_dict = self._fd_to_dict(fd)
|
|
1576
|
+
if fd_dict:
|
|
1577
|
+
fd_list.append(fd_dict)
|
|
1578
|
+
if fd_list:
|
|
1579
|
+
add({"function_declarations": fd_list})
|
|
1580
|
+
|
|
1581
|
+
if (d.get("type") or "").lower() == "function":
|
|
1582
|
+
fn = d.get("function") if isinstance(d.get("function"), dict) else d
|
|
1583
|
+
name = fn.get("name")
|
|
1584
|
+
if name:
|
|
1585
|
+
fd = {"name": str(name)}
|
|
1586
|
+
if fn.get("description"):
|
|
1587
|
+
fd["description"] = fn["description"]
|
|
1588
|
+
params = fn.get("parameters")
|
|
1589
|
+
fd["parameters"] = self._schema_to_plain(params if params is not None else {"type": "OBJECT"})
|
|
1590
|
+
add({"function_declarations": [fd]})
|
|
1591
|
+
|
|
1592
|
+
for k in ("google_search", "code_execution", "url_context"):
|
|
1593
|
+
if k in d and isinstance(d[k], dict):
|
|
1594
|
+
add({k: dict(d[k])})
|
|
1595
|
+
elif k in d and d[k] is True:
|
|
1596
|
+
add({k: {}})
|
|
1597
|
+
|
|
1598
|
+
if isinstance(tools, (list, tuple)):
|
|
1599
|
+
for t in tools:
|
|
1600
|
+
if t is None:
|
|
1601
|
+
continue
|
|
1602
|
+
if t.__class__.__name__ == "Tool" or isinstance(t, getattr(gtypes, "Tool", ())):
|
|
1603
|
+
handle_tool_obj(t)
|
|
1604
|
+
elif isinstance(t, dict):
|
|
1605
|
+
handle_tool_dict(t)
|
|
1606
|
+
|
|
1607
|
+
if isinstance(remote_tools, (list, tuple)):
|
|
1608
|
+
for t in remote_tools:
|
|
1609
|
+
if isinstance(t, dict):
|
|
1610
|
+
handle_tool_dict(t)
|
|
1611
|
+
elif t.__class__.__name__ == "Tool" or isinstance(t, getattr(gtypes, "Tool", ())):
|
|
1612
|
+
handle_tool_obj(t)
|
|
1613
|
+
|
|
1614
|
+
return out
|
|
1615
|
+
|
|
1616
|
+
def _fd_to_dict(self, fd: Any) -> Optional[dict]:
|
|
1617
|
+
"""
|
|
1618
|
+
Convert FunctionDeclaration (gtypes or dict) to plain dict with normalized schema.
|
|
1619
|
+
"""
|
|
1620
|
+
if fd.__class__.__name__ == "FunctionDeclaration" or isinstance(fd, getattr(gtypes, "FunctionDeclaration", ())):
|
|
1621
|
+
name = getattr(fd, "name", None)
|
|
1622
|
+
if not name:
|
|
1623
|
+
return None
|
|
1624
|
+
out = {"name": str(name)}
|
|
1625
|
+
desc = getattr(fd, "description", None)
|
|
1626
|
+
if desc:
|
|
1627
|
+
out["description"] = desc
|
|
1628
|
+
params = getattr(fd, "parameters", None)
|
|
1629
|
+
out["parameters"] = self._schema_to_plain(params if params is not None else {"type": "OBJECT"})
|
|
1630
|
+
return out
|
|
1631
|
+
|
|
1632
|
+
if isinstance(fd, dict):
|
|
1633
|
+
name = fd.get("name")
|
|
1634
|
+
if not name:
|
|
1635
|
+
return None
|
|
1636
|
+
out = {"name": str(name)}
|
|
1637
|
+
if fd.get("description"):
|
|
1638
|
+
out["description"] = fd["description"]
|
|
1639
|
+
params = fd.get("parameters")
|
|
1640
|
+
out["parameters"] = self._schema_to_plain(params if params is not None else {"type": "OBJECT"})
|
|
1641
|
+
return out
|
|
1642
|
+
|
|
1643
|
+
return None
|
|
1644
|
+
|
|
1645
|
+
def _schema_to_plain(self, sc: Any) -> dict:
|
|
1646
|
+
"""
|
|
1647
|
+
Convert gtypes.Schema or dict to a plain dict acceptable by Live API.
|
|
1648
|
+
"""
|
|
1649
|
+
allowed = {"OBJECT", "ARRAY", "STRING", "NUMBER", "INTEGER", "BOOLEAN"}
|
|
1650
|
+
alias = {"INT": "INTEGER", "BOOL": "BOOLEAN", "FLOAT": "NUMBER", "DOUBLE": "NUMBER"}
|
|
1651
|
+
|
|
1652
|
+
def norm_type(val) -> str:
|
|
1653
|
+
n = getattr(val, "name", None)
|
|
1654
|
+
if isinstance(n, str) and n:
|
|
1655
|
+
s = n
|
|
1656
|
+
else:
|
|
1657
|
+
s = str(val or "")
|
|
1658
|
+
if "." in s:
|
|
1659
|
+
s = s.split(".")[-1]
|
|
1660
|
+
s = alias.get(s.upper(), s.upper())
|
|
1661
|
+
return s if s in allowed else "OBJECT"
|
|
1662
|
+
|
|
1663
|
+
if sc is not None and (sc.__class__.__name__ == "Schema" or isinstance(sc, getattr(gtypes, "Schema", ()))):
|
|
1664
|
+
d: dict = {}
|
|
1665
|
+
t = getattr(sc, "type", None)
|
|
1666
|
+
d["type"] = norm_type(t) if t is not None else "OBJECT"
|
|
1667
|
+
|
|
1668
|
+
desc = getattr(sc, "description", None)
|
|
1669
|
+
if desc:
|
|
1670
|
+
d["description"] = desc
|
|
1671
|
+
fmt = getattr(sc, "format", None)
|
|
1672
|
+
if fmt:
|
|
1673
|
+
d["format"] = fmt
|
|
1674
|
+
enum = getattr(sc, "enum", None)
|
|
1675
|
+
if isinstance(enum, list) and enum and d["type"] == "STRING":
|
|
1676
|
+
d["enum"] = enum
|
|
1677
|
+
req = getattr(sc, "required", None)
|
|
1678
|
+
if isinstance(req, list) and req:
|
|
1679
|
+
d["required"] = [x for x in req if isinstance(x, str)]
|
|
1680
|
+
|
|
1681
|
+
props = getattr(sc, "properties", None)
|
|
1682
|
+
if isinstance(props, dict) and props:
|
|
1683
|
+
d["properties"] = {k: self._schema_to_plain(v) for k, v in props.items()}
|
|
1684
|
+
|
|
1685
|
+
items = getattr(sc, "items", None)
|
|
1686
|
+
if items:
|
|
1687
|
+
d["items"] = self._schema_to_plain(items)
|
|
1688
|
+
|
|
1689
|
+
return d
|
|
1690
|
+
|
|
1691
|
+
if isinstance(sc, dict):
|
|
1692
|
+
d = dict(sc)
|
|
1693
|
+
d["type"] = norm_type(d.get("type"))
|
|
1694
|
+
if isinstance(d.get("properties"), dict):
|
|
1695
|
+
d["properties"] = {k: self._schema_to_plain(v) for k, v in d["properties"].items()}
|
|
1696
|
+
if isinstance(d.get("items"), dict):
|
|
1697
|
+
d["items"] = self._schema_to_plain(d["items"])
|
|
1698
|
+
if "enum" in d and d.get("type") != "STRING":
|
|
1699
|
+
d.pop("enum", None)
|
|
1700
|
+
return d
|
|
1701
|
+
|
|
1702
|
+
return {"type": "OBJECT"}
|
|
1703
|
+
|
|
1704
|
+
def _tune_google_vad(self, live_cfg: dict, opts) -> None:
|
|
1705
|
+
"""
|
|
1706
|
+
Increase end-of-speech hold for automatic VAD in Gemini Live.
|
|
1707
|
+
"""
|
|
1708
|
+
try:
|
|
1709
|
+
ric = live_cfg.setdefault("realtime_input_config", {})
|
|
1710
|
+
aad = ric.setdefault("automatic_activity_detection", {})
|
|
1711
|
+
if aad.get("disabled") is True:
|
|
1712
|
+
return # manual mode, VAD disabled
|
|
1713
|
+
|
|
1714
|
+
# Resolve target silence (default 2000 ms)
|
|
1715
|
+
target_ms = getattr(opts, "vad_end_silence_ms", None)
|
|
1716
|
+
if not isinstance(target_ms, (int, float)) or target_ms <= 0:
|
|
1717
|
+
base = int(aad.get("silence_duration_ms") or 100)
|
|
1718
|
+
target_ms = max(base, 2000)
|
|
1719
|
+
|
|
1720
|
+
aad["silence_duration_ms"] = int(target_ms)
|
|
1721
|
+
|
|
1722
|
+
# Optional: make end-of-speech less aggressive
|
|
1723
|
+
try:
|
|
1724
|
+
aad["end_of_speech_sensitivity"] = gtypes.EndSensitivity.END_SENSITIVITY_LOW
|
|
1725
|
+
except Exception:
|
|
1726
|
+
aad["end_of_speech_sensitivity"] = "END_SENSITIVITY_LOW"
|
|
1727
|
+
|
|
1728
|
+
# Optional: leading padding before detected speech
|
|
1729
|
+
prefix_ms = getattr(opts, "vad_prefix_padding_ms", None)
|
|
1730
|
+
if isinstance(prefix_ms, (int, float)) and prefix_ms >= 0:
|
|
1731
|
+
aad["prefix_padding_ms"] = int(prefix_ms)
|
|
1732
|
+
except Exception:
|
|
1733
|
+
pass
|
|
1734
|
+
|
|
1735
|
+
def set_debug(self, enabled: bool):
    """
    Switch verbose debug output on or off.

    :param enabled: truthy to enable debug output, falsy to disable it
    """
    flag = bool(enabled)
    self.debug = flag
|
|
1742
|
+
|
|
1743
|
+
def is_session_active(self) -> bool:
    """Return True while a session object is held, i.e. the WS session is open."""
    session = self._session
    return session is not None
|
|
1746
|
+
|
|
1747
|
+
def update_ctx(self, ctx: CtxItem):
    """
    Replace the context item this client is bound to.

    :param ctx: CtxItem to use from now on (used for session handle persistence)
    """
    self._ctx = ctx
|
|
1750
|
+
|
|
1751
|
+
# -----------------------------
|
|
1752
|
+
# Internal: auto-turn receiver bootstrap
|
|
1753
|
+
# -----------------------------
|
|
1754
|
+
|
|
1755
|
+
def _ensure_auto_receiver_started(self):
    """
    Start a receiver task for one model turn in auto-turn mode if not already active.
    This guarantees we do not miss server responses when sending live audio chunks.

    Does nothing when there is no session, when auto_turn is not enabled on the
    last-used options, or when a response is already marked active.
    """
    # Only in auto-turn mode and with an open session
    if not self._session:
        return
    try:
        if not bool(getattr(self._last_opts, "auto_turn", False)):
            return
    except Exception:
        return

    # If a previous task exists but is done, clear the ref
    if self._turn_task and self._turn_task.done():
        self._turn_task = None

    if not self._response_active:
        # Reset per-turn collectors
        self._turn_text_parts = []
        self._last_out_tr = ""
        self._audio_buf.clear()
        self._saw_data_stream = False
        self._rt_reset_state()

        # Mark the turn active before scheduling, so a repeated call takes no action
        self._response_active = True
        if self._response_done is None:
            self._response_done = asyncio.Event()
        else:
            try:
                self._response_done.clear()
            except Exception:
                # Fall back to a fresh event when the existing one cannot be cleared
                self._response_done = asyncio.Event()

        # asyncio.create_task needs a running event loop in the current thread
        self._turn_task = asyncio.create_task(self._recv_one_turn(), name="google-live-auto-turn")
|
|
1791
|
+
|
|
1792
|
+
def update_session_autoturn_sync(
    self,
    enabled: bool,
    silence_ms: Optional[int] = None,
    prefix_ms: Optional[int] = None,
    timeout: float = 10.0,
):
    """
    Blocking wrapper that toggles auto-turn (VAD) for Google Live.

    The actual work is done by _update_session_autoturn_internal, which is run
    through the background loop helper. The Live API cannot change VAD settings
    mid-session, so an open session is restarted to apply them.

    :param enabled: True to enable auto-turn, False to disable it
    :param silence_ms: optional end-of-speech silence override, in milliseconds
    :param prefix_ms: optional prefix padding override, in milliseconds
    :param timeout: timeout passed to the background runner
    :return: whatever the background runner returns for the internal coroutine
    """
    self._ensure_background_loop()
    pending = self._update_session_autoturn_internal(enabled, silence_ms, prefix_ms)
    return self._bg.run_sync(pending, timeout=timeout)
|
|
1810
|
+
|
|
1811
|
+
async def _update_session_autoturn_internal(
    self,
    enabled: bool,
    silence_ms: Optional[int] = None,
    prefix_ms: Optional[int] = None,
):
    """
    Owner-loop: toggle auto-turn (automatic_activity_detection) and optionally
    set silence_duration_ms / prefix_padding_ms. If the session is open,
    perform a safe restart to apply new config. If closed, cache in opts.

    :param enabled: True to enable auto-turn, False to disable it
    :param silence_ms: optional end-of-speech silence override (ms); None keeps the cached value
    :param prefix_ms: optional prefix padding override (ms); None keeps the cached value
    """

    # Helper to update cached opts
    def _apply_to_opts():
        if not self._last_opts:
            return
        # Each attribute is written independently so one failing setattr does not block the others
        try:
            setattr(self._last_opts, "auto_turn", bool(enabled))
        except Exception:
            pass
        try:
            if silence_ms is not None:
                setattr(self._last_opts, "vad_end_silence_ms", int(silence_ms))
        except Exception:
            pass
        try:
            if prefix_ms is not None:
                setattr(self._last_opts, "vad_prefix_padding_ms", int(prefix_ms))
        except Exception:
            pass

    # If session not open -> just cache and exit
    if not self._session:
        _apply_to_opts()
        if self.debug:
            print("[google.update_session_autoturn] session not open; cached for next open")
        return

    # Compute whether anything changes to avoid unnecessary restart
    cur_enabled = False
    try:
        cur_enabled = bool(getattr(self._last_opts, "auto_turn", False))
    except Exception:
        pass
    cur_sil = getattr(self._last_opts, "vad_end_silence_ms", None)
    cur_pre = getattr(self._last_opts, "vad_prefix_padding_ms", None)

    # A cached value that is not numeric compares as None, so any explicit override counts as a change
    change = (cur_enabled != bool(enabled))
    if silence_ms is not None and int(silence_ms) != (int(cur_sil) if isinstance(cur_sil, (int, float)) else None):
        change = True
    if prefix_ms is not None and int(prefix_ms) != (int(cur_pre) if isinstance(cur_pre, (int, float)) else None):
        change = True

    if not change:
        # Nothing to do; still persist values to opts for consistency
        _apply_to_opts()
        if self.debug:
            print("[google.update_session_autoturn] no changes; skipping restart")
        return

    # Wait for any active response to finish before restart
    if self._send_lock is None:
        self._send_lock = asyncio.Lock()
    async with self._send_lock:
        if self._response_active and self._response_done:
            if self.debug:
                print("[google.update_session_autoturn] waiting for active response to finish")
            try:
                await self._response_done.wait()
            except Exception:
                pass

    # Update cached opts with requested values
    _apply_to_opts()

    # Try to resume after restart using the last known handle (best-effort)
    prev_handle = self._rt_session_id
    try:
        if self._last_opts is not None and prev_handle:
            setattr(self._last_opts, "rt_session_id", prev_handle)
    except Exception:
        pass

    if self.debug:
        # Log the effective values: the override when given, otherwise the previously cached one
        eff_sil = silence_ms if silence_ms is not None else cur_sil
        eff_pre = prefix_ms if prefix_ms is not None else cur_pre
        print(f"[google.update_session_autoturn] restarting session; auto_turn={enabled}, "
              f"silence_ms={eff_sil}, prefix_ms={eff_pre}")

    # Restart session with updated config
    await self._reset_session_internal(
        ctx=self._ctx,
        opts=self._last_opts,
        on_text=self._on_text,
        on_audio=self._on_audio,
        should_stop=self._should_stop,
    )

    if self.debug:
        print("[google.update_session_autoturn] session restarted with new VAD settings")
|
|
1911
|
+
|
|
1912
|
+
# -----------------------------
|
|
1913
|
+
# Internal: commit event helpers
|
|
1914
|
+
# -----------------------------
|
|
1915
|
+
|
|
1916
|
+
def _emit_audio_commit_signal(self):
    """
    Emit RT_OUTPUT_AUDIO_COMMIT once per turn in auto-turn mode.

    Skipped when the signal was already sent this turn, when auto_turn is not
    enabled on the last-used options, or when no auto-turn audio is in flight.
    Errors while emitting are ignored.
    """
    # Lazily create the per-turn state so the "already signaled" flag can be read
    if self._rt_state is None:
        self._rt_reset_state()
    if self._rt_state.get("auto_commit_signaled"):
        return
    try:
        if not bool(getattr(self._last_opts, "auto_turn", False)):
            return
    except Exception:
        return
    # Limit to audio turns: only when we actually sent auto-turn audio this turn
    if not self._auto_audio_in_flight:
        return
    try:
        if self._last_opts and hasattr(self._last_opts, "rt_signals"):
            self._last_opts.rt_signals.response.emit(
                RealtimeEvent(RealtimeEvent.RT_OUTPUT_AUDIO_COMMIT, {"ctx": self._ctx})
            )
            # Remember that the commit was sent so it is not repeated within this turn
            self._rt_state["auto_commit_signaled"] = True
    except Exception:
        pass
|
|
1940
|
+
|
|
1941
|
+
def _maybe_emit_auto_commit(self):
|
|
1942
|
+
"""
|
|
1943
|
+
Emit RT_OUTPUT_AUDIO_COMMIT on first sign of model output in auto-turn mode.
|
|
1944
|
+
"""
|
|
1945
|
+
self._emit_audio_commit_signal()
|