livekit-plugins-google 0.3.0__py3-none-any.whl → 1.3.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/google/__init__.py +25 -7
- livekit/plugins/google/beta/__init__.py +13 -0
- livekit/plugins/google/beta/gemini_tts.py +258 -0
- livekit/plugins/google/llm.py +501 -0
- livekit/plugins/google/log.py +3 -0
- livekit/plugins/google/models.py +145 -31
- livekit/plugins/google/realtime/__init__.py +9 -0
- livekit/plugins/google/realtime/api_proto.py +66 -0
- livekit/plugins/google/realtime/realtime_api.py +1252 -0
- livekit/plugins/google/stt.py +518 -272
- livekit/plugins/google/tools.py +11 -0
- livekit/plugins/google/tts.py +447 -0
- livekit/plugins/google/utils.py +286 -0
- livekit/plugins/google/version.py +1 -1
- livekit_plugins_google-1.3.8.dist-info/METADATA +63 -0
- livekit_plugins_google-1.3.8.dist-info/RECORD +18 -0
- {livekit_plugins_google-0.3.0.dist-info → livekit_plugins_google-1.3.8.dist-info}/WHEEL +1 -2
- livekit_plugins_google-0.3.0.dist-info/METADATA +0 -47
- livekit_plugins_google-0.3.0.dist-info/RECORD +0 -9
- livekit_plugins_google-0.3.0.dist-info/top_level.txt +0 -1
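
The bulk of this release is the new Gemini Live integration added as livekit/plugins/google/realtime/realtime_api.py; its full diff follows below. As a quick orientation before the 1,252-line hunk, here is a minimal, hedged sketch of how the RealtimeModel defined there might be constructed. The constructor arguments are taken from the __init__ signature in the diff; the google.realtime import path is an assumption based on the file layout listed above, and the surrounding agent wiring (audio routing, event handling) from livekit-agents is omitted.

import asyncio
import os

from livekit.plugins import google  # import path assumed from the package layout shown above


async def main() -> None:
    # Gemini API (non-Vertex) auth; Vertex AI users would pass vertexai=True and rely on
    # GOOGLE_APPLICATION_CREDENTIALS / GOOGLE_CLOUD_PROJECT instead, per the docstring below.
    os.environ.setdefault("GOOGLE_API_KEY", "<your-api-key>")

    model = google.realtime.RealtimeModel(
        instructions="You are a helpful voice assistant.",
        voice="Puck",      # constructor default in the diff
        temperature=0.8,   # documented default sampling temperature
    )

    # RealtimeSession starts a background task that connects to the Live API,
    # so it must be created inside a running event loop.
    session = model.session()

    # ... push audio with session.push_audio(frame) and trigger turns with
    # session.generate_reply(); both methods appear in the diff below.

    await session.aclose()
    await model.aclose()


asyncio.run(main())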
@@ -0,0 +1,1252 @@
+from __future__ import annotations
+
+import asyncio
+import contextlib
+import json
+import os
+import time
+import weakref
+from collections.abc import Iterator
+from dataclasses import dataclass, field
+from typing import Literal
+
+from google.auth._default_async import default_async
+from google.genai import Client as GenAIClient, types
+from google.genai.live import AsyncSession
+from livekit import rtc
+from livekit.agents import APIConnectionError, llm, utils
+from livekit.agents.metrics import RealtimeModelMetrics
+from livekit.agents.metrics.base import Metadata
+from livekit.agents.types import (
+    DEFAULT_API_CONNECT_OPTIONS,
+    NOT_GIVEN,
+    APIConnectOptions,
+    NotGivenOr,
+)
+from livekit.agents.utils import audio as audio_utils, images, is_given
+from livekit.plugins.google.realtime.api_proto import ClientEvents, LiveAPIModels, Voice
+
+from ..log import logger
+from ..tools import _LLMTool
+from ..utils import create_tools_config, get_tool_results_for_realtime, to_fnc_ctx
+from ..version import __version__
+
+INPUT_AUDIO_SAMPLE_RATE = 16000
+INPUT_AUDIO_CHANNELS = 1
+OUTPUT_AUDIO_SAMPLE_RATE = 24000
+OUTPUT_AUDIO_CHANNELS = 1
+
+DEFAULT_IMAGE_ENCODE_OPTIONS = images.EncodeOptions(
+    format="JPEG",
+    quality=75,
+    resize_options=images.ResizeOptions(width=1024, height=1024, strategy="scale_aspect_fit"),
+)
+
+lk_google_debug = int(os.getenv("LK_GOOGLE_DEBUG", 0))
+
+
+@dataclass
+class InputTranscription:
+    item_id: str
+    transcript: str
+
+
+@dataclass
+class _RealtimeOptions:
+    model: LiveAPIModels | str
+    api_key: str | None
+    voice: Voice | str
+    language: NotGivenOr[str]
+    response_modalities: list[types.Modality]
+    vertexai: bool
+    project: str | None
+    location: str | None
+    candidate_count: int
+    temperature: NotGivenOr[float]
+    max_output_tokens: NotGivenOr[int]
+    top_p: NotGivenOr[float]
+    top_k: NotGivenOr[int]
+    presence_penalty: NotGivenOr[float]
+    frequency_penalty: NotGivenOr[float]
+    instructions: NotGivenOr[str]
+    input_audio_transcription: types.AudioTranscriptionConfig | None
+    output_audio_transcription: types.AudioTranscriptionConfig | None
+    image_encode_options: NotGivenOr[images.EncodeOptions]
+    conn_options: APIConnectOptions
+    http_options: NotGivenOr[types.HttpOptions]
+    enable_affective_dialog: NotGivenOr[bool] = NOT_GIVEN
+    proactivity: NotGivenOr[bool] = NOT_GIVEN
+    realtime_input_config: NotGivenOr[types.RealtimeInputConfig] = NOT_GIVEN
+    context_window_compression: NotGivenOr[types.ContextWindowCompressionConfig] = NOT_GIVEN
+    api_version: NotGivenOr[str] = NOT_GIVEN
+    gemini_tools: NotGivenOr[list[_LLMTool]] = NOT_GIVEN
+    tool_behavior: NotGivenOr[types.Behavior] = NOT_GIVEN
+    tool_response_scheduling: NotGivenOr[types.FunctionResponseScheduling] = NOT_GIVEN
+    thinking_config: NotGivenOr[types.ThinkingConfig] = NOT_GIVEN
+    session_resumption: NotGivenOr[types.SessionResumptionConfig] = NOT_GIVEN
+
+
+@dataclass
+class _ResponseGeneration:
+    message_ch: utils.aio.Chan[llm.MessageGeneration]
+    function_ch: utils.aio.Chan[llm.FunctionCall]
+
+    input_id: str
+    response_id: str
+    text_ch: utils.aio.Chan[str]
+    audio_ch: utils.aio.Chan[rtc.AudioFrame]
+
+    input_transcription: str = ""
+    output_text: str = ""
+
+    _created_timestamp: float = field(default_factory=time.time)
+    """The timestamp when the generation is created"""
+    _first_token_timestamp: float | None = None
+    """The timestamp when the first audio token is received"""
+    _completed_timestamp: float | None = None
+    """The timestamp when the generation is completed"""
+    _done: bool = False
+    """Whether the generation is done (set when the turn is complete)"""
+
+    def push_text(self, text: str) -> None:
+        if self.output_text:
+            self.output_text += text
+        else:
+            self.output_text = text
+
+        self.text_ch.send_nowait(text)
+
+
+class RealtimeModel(llm.RealtimeModel):
+    def __init__(
+        self,
+        *,
+        instructions: NotGivenOr[str] = NOT_GIVEN,
+        model: NotGivenOr[LiveAPIModels | str] = NOT_GIVEN,
+        api_key: NotGivenOr[str] = NOT_GIVEN,
+        voice: Voice | str = "Puck",
+        language: NotGivenOr[str] = NOT_GIVEN,
+        modalities: NotGivenOr[list[types.Modality]] = NOT_GIVEN,
+        vertexai: NotGivenOr[bool] = NOT_GIVEN,
+        project: NotGivenOr[str] = NOT_GIVEN,
+        location: NotGivenOr[str] = NOT_GIVEN,
+        candidate_count: int = 1,
+        temperature: NotGivenOr[float] = NOT_GIVEN,
+        max_output_tokens: NotGivenOr[int] = NOT_GIVEN,
+        top_p: NotGivenOr[float] = NOT_GIVEN,
+        top_k: NotGivenOr[int] = NOT_GIVEN,
+        presence_penalty: NotGivenOr[float] = NOT_GIVEN,
+        frequency_penalty: NotGivenOr[float] = NOT_GIVEN,
+        input_audio_transcription: NotGivenOr[types.AudioTranscriptionConfig | None] = NOT_GIVEN,
+        output_audio_transcription: NotGivenOr[types.AudioTranscriptionConfig | None] = NOT_GIVEN,
+        image_encode_options: NotGivenOr[images.EncodeOptions] = NOT_GIVEN,
+        enable_affective_dialog: NotGivenOr[bool] = NOT_GIVEN,
+        proactivity: NotGivenOr[bool] = NOT_GIVEN,
+        realtime_input_config: NotGivenOr[types.RealtimeInputConfig] = NOT_GIVEN,
+        context_window_compression: NotGivenOr[types.ContextWindowCompressionConfig] = NOT_GIVEN,
+        tool_behavior: NotGivenOr[types.Behavior] = NOT_GIVEN,
+        tool_response_scheduling: NotGivenOr[types.FunctionResponseScheduling] = NOT_GIVEN,
+        session_resumption: NotGivenOr[types.SessionResumptionConfig] = NOT_GIVEN,
+        api_version: NotGivenOr[str] = NOT_GIVEN,
+        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
+        http_options: NotGivenOr[types.HttpOptions] = NOT_GIVEN,
+        _gemini_tools: NotGivenOr[list[_LLMTool]] = NOT_GIVEN,
+        thinking_config: NotGivenOr[types.ThinkingConfig] = NOT_GIVEN,
+    ) -> None:
+        """
+        Initializes a RealtimeModel instance for interacting with Google's Realtime API.
+
+        Environment Requirements:
+        - For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of the service account key file or use any of the other Google Cloud auth methods.
+            The Google Cloud project and location can be set via `project` and `location` arguments or the environment variables
+            `GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_LOCATION`. By default, the project is inferred from the service account key file,
+            and the location defaults to "us-central1".
+        - For Google Gemini API: Set the `api_key` argument or the `GOOGLE_API_KEY` environment variable.
+
+        Args:
+            instructions (str, optional): Initial system instructions for the model. Defaults to "".
+            api_key (str, optional): Google Gemini API key. If None, will attempt to read from the environment variable GOOGLE_API_KEY.
+            modalities (list[Modality], optional): Modalities to use, such as ["TEXT", "AUDIO"]. Defaults to ["AUDIO"].
+            model (str, optional): The name of the model to use. Defaults to "gemini-2.0-flash-live-001" or "gemini-2.0-flash-exp" (vertexai).
+            voice (api_proto.Voice, optional): Voice setting for audio outputs. Defaults to "Puck".
+            language (str, optional): The language(BCP-47 Code) to use for the API. supported languages - https://ai.google.dev/gemini-api/docs/live#supported-languages
+            temperature (float, optional): Sampling temperature for response generation. Defaults to 0.8.
+            vertexai (bool, optional): Whether to use VertexAI for the API. Defaults to False.
+            project (str, optional): The project id to use for the API. Defaults to None. (for vertexai)
+            location (str, optional): The location to use for the API. Defaults to None. (for vertexai)
+            candidate_count (int, optional): The number of candidate responses to generate. Defaults to 1.
+            top_p (float, optional): The top-p value for response generation
+            top_k (int, optional): The top-k value for response generation
+            presence_penalty (float, optional): The presence penalty for response generation
+            frequency_penalty (float, optional): The frequency penalty for response generation
+            input_audio_transcription (AudioTranscriptionConfig | None, optional): The configuration for input audio transcription. Defaults to None.)
+            output_audio_transcription (AudioTranscriptionConfig | None, optional): The configuration for output audio transcription. Defaults to AudioTranscriptionConfig().
+            image_encode_options (images.EncodeOptions, optional): The configuration for image encoding. Defaults to DEFAULT_ENCODE_OPTIONS.
+            enable_affective_dialog (bool, optional): Whether to enable affective dialog. Defaults to False.
+            proactivity (bool, optional): Whether to enable proactive audio. Defaults to False.
+            realtime_input_config (RealtimeInputConfig, optional): The configuration for realtime input. Defaults to None.
+            context_window_compression (ContextWindowCompressionConfig, optional): The configuration for context window compression. Defaults to None.
+            tool_behavior (Behavior, optional): The behavior for tool call. Default behavior is BLOCK in Gemini Realtime API.
+            tool_response_scheduling (FunctionResponseScheduling, optional): The scheduling for tool response. Default scheduling is WHEN_IDLE.
+            session_resumption (SessionResumptionConfig, optional): The configuration for session resumption. Defaults to None.
+            thinking_config (ThinkingConfig, optional): Native audio thinking configuration.
+            conn_options (APIConnectOptions, optional): The configuration for the API connection. Defaults to DEFAULT_API_CONNECT_OPTIONS.
+            _gemini_tools (list[LLMTool], optional): Gemini-specific tools to use for the session. This parameter is experimental and may change.
+
+        Raises:
+            ValueError: If the API key is required but not found.
+        """  # noqa: E501
+        if not is_given(input_audio_transcription):
+            input_audio_transcription = types.AudioTranscriptionConfig()
+        if not is_given(output_audio_transcription):
+            output_audio_transcription = types.AudioTranscriptionConfig()
+
+        server_turn_detection = True
+        if (
+            is_given(realtime_input_config)
+            and realtime_input_config.automatic_activity_detection
+            and realtime_input_config.automatic_activity_detection.disabled
+        ):
+            server_turn_detection = False
+        modalities = modalities if is_given(modalities) else [types.Modality.AUDIO]
+
+        super().__init__(
+            capabilities=llm.RealtimeCapabilities(
+                message_truncation=False,
+                turn_detection=server_turn_detection,
+                user_transcription=input_audio_transcription is not None,
+                auto_tool_reply_generation=True,
+                audio_output=types.Modality.AUDIO in modalities,
+                manual_function_calls=False,
+            )
+        )
+
+        if not is_given(model):
+            if vertexai:
+                model = "gemini-live-2.5-flash-native-audio"
+            else:
+                model = "gemini-2.5-flash-native-audio-preview-12-2025"
+
+        gemini_api_key = api_key if is_given(api_key) else os.environ.get("GOOGLE_API_KEY")
+        gcp_project = project if is_given(project) else os.environ.get("GOOGLE_CLOUD_PROJECT")
+        gcp_location: str | None = (
+            location
+            if is_given(location)
+            else os.environ.get("GOOGLE_CLOUD_LOCATION") or "us-central1"
+        )
+        use_vertexai = (
+            vertexai
+            if is_given(vertexai)
+            else os.environ.get("GOOGLE_GENAI_USE_VERTEXAI", "0").lower() in ["true", "1"]
+        )
+
+        if use_vertexai:
+            if not gcp_project:
+                _, gcp_project = default_async(  # type: ignore
+                    scopes=["https://www.googleapis.com/auth/cloud-platform"]
+                )
+            if not gcp_project or not gcp_location:
+                raise ValueError(
+                    "Project is required for VertexAI via project kwarg or GOOGLE_CLOUD_PROJECT environment variable"  # noqa: E501
+                )
+            gemini_api_key = None  # VertexAI does not require an API key
+        else:
+            gcp_project = None
+            gcp_location = None
+            if not gemini_api_key:
+                raise ValueError(
+                    "API key is required for Google API either via api_key or GOOGLE_API_KEY environment variable"  # noqa: E501
+                )
+
+        self._opts = _RealtimeOptions(
+            model=model,
+            api_key=gemini_api_key,
+            voice=voice,
+            response_modalities=modalities,
+            vertexai=use_vertexai,
+            project=gcp_project,
+            location=gcp_location,
+            candidate_count=candidate_count,
+            temperature=temperature,
+            max_output_tokens=max_output_tokens,
+            top_p=top_p,
+            top_k=top_k,
+            presence_penalty=presence_penalty,
+            frequency_penalty=frequency_penalty,
+            instructions=instructions,
+            input_audio_transcription=input_audio_transcription,
+            output_audio_transcription=output_audio_transcription,
+            language=language,
+            image_encode_options=image_encode_options,
+            enable_affective_dialog=enable_affective_dialog,
+            proactivity=proactivity,
+            realtime_input_config=realtime_input_config,
+            context_window_compression=context_window_compression,
+            api_version=api_version,
+            gemini_tools=_gemini_tools,
+            tool_behavior=tool_behavior,
+            tool_response_scheduling=tool_response_scheduling,
+            conn_options=conn_options,
+            http_options=http_options,
+            thinking_config=thinking_config,
+            session_resumption=session_resumption,
+        )
+
+        self._sessions = weakref.WeakSet[RealtimeSession]()
+
+    @property
+    def model(self) -> str:
+        return self._opts.model
+
+    @property
+    def provider(self) -> str:
+        if self._opts.vertexai:
+            return "Vertex AI"
+        else:
+            return "Gemini"
+
+    def session(self) -> RealtimeSession:
+        sess = RealtimeSession(self)
+        self._sessions.add(sess)
+        return sess
+
+    def update_options(
+        self,
+        *,
+        voice: NotGivenOr[str] = NOT_GIVEN,
+        temperature: NotGivenOr[float] = NOT_GIVEN,
+        tool_behavior: NotGivenOr[types.Behavior] = NOT_GIVEN,
+        tool_response_scheduling: NotGivenOr[types.FunctionResponseScheduling] = NOT_GIVEN,
+    ) -> None:
+        """
+        Update the options for the RealtimeModel.
+
+        Args:
+            voice (str, optional): The voice to use for the session.
+            temperature (float, optional): The temperature to use for the session.
+            tools (list[LLMTool], optional): The tools to use for the session.
+        """
+        if is_given(voice):
+            self._opts.voice = voice
+
+        if is_given(temperature):
+            self._opts.temperature = temperature
+
+        if is_given(tool_behavior):
+            self._opts.tool_behavior = tool_behavior
+
+        if is_given(tool_response_scheduling):
+            self._opts.tool_response_scheduling = tool_response_scheduling
+
+        for sess in self._sessions:
+            sess.update_options(
+                voice=self._opts.voice,
+                temperature=self._opts.temperature,
+                tool_behavior=self._opts.tool_behavior,
+                tool_response_scheduling=self._opts.tool_response_scheduling,
+            )
+
+    async def aclose(self) -> None:
+        pass
+
+
+class RealtimeSession(llm.RealtimeSession):
+    def __init__(self, realtime_model: RealtimeModel) -> None:
+        super().__init__(realtime_model)
+        self._opts = realtime_model._opts
+        self._tools = llm.ToolContext.empty()
+        self._gemini_declarations: list[types.FunctionDeclaration] = []
+        self._chat_ctx = llm.ChatContext.empty()
+        self._msg_ch = utils.aio.Chan[ClientEvents]()
+        self._input_resampler: rtc.AudioResampler | None = None
+
+        # 50ms chunks
+        self._bstream = audio_utils.AudioByteStream(
+            INPUT_AUDIO_SAMPLE_RATE,
+            INPUT_AUDIO_CHANNELS,
+            samples_per_channel=INPUT_AUDIO_SAMPLE_RATE // 20,
+        )
+
+        api_version = self._opts.api_version
+        if not api_version and (self._opts.enable_affective_dialog or self._opts.proactivity):
+            api_version = "v1alpha"
+
+        http_options = self._opts.http_options or types.HttpOptions(
+            timeout=int(self._opts.conn_options.timeout * 1000)
+        )
+        if api_version:
+            http_options.api_version = api_version
+        if not http_options.headers:
+            http_options.headers = {}
+        http_options.headers["x-goog-api-client"] = f"livekit-agents/{__version__}"
+
+        self._client = GenAIClient(
+            api_key=self._opts.api_key,
+            vertexai=self._opts.vertexai,
+            project=self._opts.project,
+            location=self._opts.location,
+            http_options=http_options,
+        )
+
+        self._main_atask = asyncio.create_task(self._main_task(), name="gemini-realtime-session")
+
+        self._current_generation: _ResponseGeneration | None = None
+        self._active_session: AsyncSession | None = None
+        # indicates if the underlying session should end
+        self._session_should_close = asyncio.Event()
+        self._response_created_futures: dict[str, asyncio.Future[llm.GenerationCreatedEvent]] = {}
+        self._pending_generation_fut: asyncio.Future[llm.GenerationCreatedEvent] | None = None
+
+        self._session_resumption_handle: str | None = (
+            self._opts.session_resumption.handle
+            if is_given(self._opts.session_resumption)
+            else None
+        )
+
+        self._in_user_activity = False
+        self._session_lock = asyncio.Lock()
+        self._num_retries = 0
+
+    async def _close_active_session(self) -> None:
+        async with self._session_lock:
+            if self._active_session:
+                try:
+                    await self._active_session.close()
+                except Exception as e:
+                    logger.warning(f"error closing Gemini session: {e}")
+                finally:
+                    self._active_session = None
+
+    def _mark_restart_needed(self, on_error: bool = False) -> None:
+        if not self._session_should_close.is_set():
+            self._session_should_close.set()
+            # reset the msg_ch, do not send messages from previous session
+            if not on_error:
+                while not self._msg_ch.empty():
+                    msg = self._msg_ch.recv_nowait()
+                    if isinstance(msg, types.LiveClientContent) and msg.turn_complete is True:
+                        logger.warning(
+                            "discarding client content for turn completion, may cause generate_reply timeout",
+                            extra={"content": str(msg)},
+                        )
+
+            self._msg_ch = utils.aio.Chan[ClientEvents]()
+
+    def update_options(
+        self,
+        *,
+        voice: NotGivenOr[str] = NOT_GIVEN,
+        temperature: NotGivenOr[float] = NOT_GIVEN,
+        tool_choice: NotGivenOr[llm.ToolChoice | None] = NOT_GIVEN,
+        tool_behavior: NotGivenOr[types.Behavior] = NOT_GIVEN,
+        tool_response_scheduling: NotGivenOr[types.FunctionResponseScheduling] = NOT_GIVEN,
+    ) -> None:
+        should_restart = False
+        if is_given(voice) and self._opts.voice != voice:
+            self._opts.voice = voice
+            should_restart = True
+
+        if is_given(temperature) and self._opts.temperature != temperature:
+            self._opts.temperature = temperature if is_given(temperature) else NOT_GIVEN
+            should_restart = True
+
+        if is_given(tool_behavior) and self._opts.tool_behavior != tool_behavior:
+            self._opts.tool_behavior = tool_behavior
+            should_restart = True
+
+        if (
+            is_given(tool_response_scheduling)
+            and self._opts.tool_response_scheduling != tool_response_scheduling
+        ):
+            self._opts.tool_response_scheduling = tool_response_scheduling
+            # no need to restart
+
+        if is_given(tool_choice):
+            logger.warning("tool_choice is not supported by the Google Realtime API.")
+
+        if should_restart:
+            self._mark_restart_needed()
+
+    async def update_instructions(self, instructions: str) -> None:
+        if not is_given(self._opts.instructions) or self._opts.instructions != instructions:
+            self._opts.instructions = instructions
+            self._mark_restart_needed()
+
+    async def update_chat_ctx(self, chat_ctx: llm.ChatContext) -> None:
+        async with self._session_lock:
+            if not self._active_session:
+                self._chat_ctx = chat_ctx.copy()
+                return
+
+        diff_ops = llm.utils.compute_chat_ctx_diff(self._chat_ctx, chat_ctx)
+
+        if diff_ops.to_remove:
+            logger.warning("Gemini Live does not support removing messages")
+
+        append_ctx = llm.ChatContext.empty()
+        for _, item_id in diff_ops.to_create:
+            item = chat_ctx.get_by_id(item_id)
+            if item:
+                append_ctx.items.append(item)
+
+        if append_ctx.items:
+            turns_dict, _ = append_ctx.copy(
+                exclude_function_call=True,
+            ).to_provider_format(format="google", inject_dummy_user_message=False)
+            # we are not generating, and do not need to inject
+            turns = [types.Content.model_validate(turn) for turn in turns_dict]
+            tool_results = get_tool_results_for_realtime(
+                append_ctx,
+                vertexai=self._opts.vertexai,
+                tool_response_scheduling=self._opts.tool_response_scheduling,
+            )
+            if turns:
+                self._send_client_event(types.LiveClientContent(turns=turns, turn_complete=False))
+            if tool_results:
+                self._send_client_event(tool_results)
+
+        # since we don't have a view of the history on the server side, we'll assume
+        # the current state is accurate. this isn't perfect because removals aren't done.
+        self._chat_ctx = chat_ctx.copy()
+
+    async def update_tools(self, tools: list[llm.FunctionTool | llm.RawFunctionTool]) -> None:
+        new_declarations: list[types.FunctionDeclaration] = to_fnc_ctx(
+            tools, use_parameters_json_schema=False, tool_behavior=self._opts.tool_behavior
+        )
+        current_tool_names = {f.name for f in self._gemini_declarations}
+        new_tool_names = {f.name for f in new_declarations}
+
+        if current_tool_names != new_tool_names:
+            self._gemini_declarations = new_declarations
+            self._tools = llm.ToolContext(tools)
+            self._mark_restart_needed()
+
+    @property
+    def chat_ctx(self) -> llm.ChatContext:
+        return self._chat_ctx.copy()
+
+    @property
+    def tools(self) -> llm.ToolContext:
+        return self._tools.copy()
+
+    @property
+    def _manual_activity_detection(self) -> bool:
+        if (
+            is_given(self._opts.realtime_input_config)
+            and self._opts.realtime_input_config.automatic_activity_detection is not None
+            and self._opts.realtime_input_config.automatic_activity_detection.disabled
+        ):
+            return True
+        return False
+
+    @property
+    def session_resumption_handle(self) -> str | None:
+        return self._session_resumption_handle
+
+    def push_audio(self, frame: rtc.AudioFrame) -> None:
+        for f in self._resample_audio(frame):
+            for nf in self._bstream.write(f.data.tobytes()):
+                realtime_input = types.LiveClientRealtimeInput(
+                    media_chunks=[
+                        types.Blob(
+                            data=nf.data.tobytes(),
+                            mime_type=f"audio/pcm;rate={INPUT_AUDIO_SAMPLE_RATE}",
+                        )
+                    ]
+                )
+                self._send_client_event(realtime_input)
+
+    def push_video(self, frame: rtc.VideoFrame) -> None:
+        encoded_data = images.encode(
+            frame, self._opts.image_encode_options or DEFAULT_IMAGE_ENCODE_OPTIONS
+        )
+        realtime_input = types.LiveClientRealtimeInput(
+            media_chunks=[types.Blob(data=encoded_data, mime_type="image/jpeg")]
+        )
+        self._send_client_event(realtime_input)
+
+    def _send_client_event(self, event: ClientEvents) -> None:
+        with contextlib.suppress(utils.aio.channel.ChanClosed):
+            self._msg_ch.send_nowait(event)
+
+    def generate_reply(
+        self, *, instructions: NotGivenOr[str] = NOT_GIVEN
+    ) -> asyncio.Future[llm.GenerationCreatedEvent]:
+        if self._pending_generation_fut and not self._pending_generation_fut.done():
+            logger.warning(
+                "generate_reply called while another generation is pending, cancelling previous."
+            )
+            self._pending_generation_fut.cancel("Superseded by new generate_reply call")
+
+        fut = asyncio.Future[llm.GenerationCreatedEvent]()
+        self._pending_generation_fut = fut
+
+        if self._in_user_activity:
+            self._send_client_event(
+                types.LiveClientRealtimeInput(
+                    activity_end=types.ActivityEnd(),
+                )
+            )
+            self._in_user_activity = False
+
+        # Gemini requires the last message to end with user's turn
+        # so we need to add a placeholder user turn in order to trigger a new generation
+        turns = []
+        if is_given(instructions):
+            turns.append(types.Content(parts=[types.Part(text=instructions)], role="model"))
+        turns.append(types.Content(parts=[types.Part(text=".")], role="user"))
+        self._send_client_event(types.LiveClientContent(turns=turns, turn_complete=True))
+
+        def _on_timeout() -> None:
+            if not fut.done():
+                fut.set_exception(
+                    llm.RealtimeError(
+                        "generate_reply timed out waiting for generation_created event."
+                    )
+                )
+                if self._pending_generation_fut is fut:
+                    self._pending_generation_fut = None
+
+        timeout_handle = asyncio.get_event_loop().call_later(5.0, _on_timeout)
+        fut.add_done_callback(lambda _: timeout_handle.cancel())
+
+        return fut
+
+    def start_user_activity(self) -> None:
+        if not self._manual_activity_detection:
+            return
+
+        if not self._in_user_activity:
+            self._in_user_activity = True
+            self._send_client_event(
+                types.LiveClientRealtimeInput(
+                    activity_start=types.ActivityStart(),
+                )
+            )
+
+    def interrupt(self) -> None:
+        # Gemini Live treats activity start as interruption, so we rely on start_user_activity
+        # notifications to handle it
+        if (
+            self._opts.realtime_input_config
+            and self._opts.realtime_input_config.activity_handling
+            == types.ActivityHandling.NO_INTERRUPTION
+        ):
+            return
+        self.start_user_activity()
+
+    def truncate(
+        self,
+        *,
+        message_id: str,
+        modalities: list[Literal["text", "audio"]],
+        audio_end_ms: int,
+        audio_transcript: NotGivenOr[str] = NOT_GIVEN,
+    ) -> None:
+        logger.warning("truncate is not supported by the Google Realtime API.")
+        pass
+
+    async def aclose(self) -> None:
+        self._msg_ch.close()
+        self._session_should_close.set()
+
+        if self._main_atask:
+            await utils.aio.cancel_and_wait(self._main_atask)
+
+        await self._close_active_session()
+
+        if self._pending_generation_fut and not self._pending_generation_fut.done():
+            self._pending_generation_fut.cancel("Session closed")
+
+        for fut in self._response_created_futures.values():
+            if not fut.done():
+                fut.set_exception(llm.RealtimeError("Session closed before response created"))
+        self._response_created_futures.clear()
+
+        if self._current_generation:
+            self._mark_current_generation_done()
+
+    @utils.log_exceptions(logger=logger)
+    async def _main_task(self) -> None:
+        max_retries = self._opts.conn_options.max_retry
+
+        while not self._msg_ch.closed:
+            # previous session might not be closed yet, we'll do it here.
+            await self._close_active_session()
+
+            self._session_should_close.clear()
+            config = self._build_connect_config()
+            session = None
+            try:
+                logger.debug("connecting to Gemini Realtime API...")
+                async with self._client.aio.live.connect(
+                    model=self._opts.model, config=config
+                ) as session:
+                    async with self._session_lock:
+                        self._active_session = session
+                    turns_dict, _ = self._chat_ctx.copy(
+                        exclude_function_call=True,
+                    ).to_provider_format(format="google", inject_dummy_user_message=False)
+                    if turns_dict:
+                        turns = [types.Content.model_validate(turn) for turn in turns_dict]
+                        await session.send_client_content(
+                            turns=turns,  # type: ignore
+                            turn_complete=False,
+                        )
+                    # queue up existing chat context
+                    send_task = asyncio.create_task(
+                        self._send_task(session), name="gemini-realtime-send"
+                    )
+                    recv_task = asyncio.create_task(
+                        self._recv_task(session), name="gemini-realtime-recv"
+                    )
+                    restart_wait_task = asyncio.create_task(
+                        self._session_should_close.wait(), name="gemini-restart-wait"
+                    )
+
+                    done, pending = await asyncio.wait(
+                        [send_task, recv_task, restart_wait_task],
+                        return_when=asyncio.FIRST_COMPLETED,
+                    )
+
+                    for task in done:
+                        if task is not restart_wait_task and task.exception():
+                            logger.error(f"error in task {task.get_name()}: {task.exception()}")
+                            raise task.exception() or Exception(f"{task.get_name()} failed")
+
+                    if restart_wait_task not in done and self._msg_ch.closed:
+                        break
+
+                    for task in pending:
+                        await utils.aio.cancel_and_wait(task)
+
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                logger.error(f"Gemini Realtime API error: {e}", exc_info=e)
+                if not self._msg_ch.closed:
+                    # we shouldn't retry when it's not connected, usually this means incorrect
+                    # parameters or setup
+                    if not session or max_retries == 0:
+                        self._emit_error(e, recoverable=False)
+                        raise APIConnectionError(message="Failed to connect to Gemini Live") from e
+
+                    if self._num_retries == max_retries:
+                        self._emit_error(e, recoverable=False)
+                        raise APIConnectionError(
+                            message=f"Failed to connect to Gemini Live after {max_retries} attempts"
+                        ) from e
+
+                    retry_interval = self._opts.conn_options._interval_for_retry(self._num_retries)
+                    logger.warning(
+                        f"Gemini Realtime API connection failed, retrying in {retry_interval}s",
+                        exc_info=e,
+                        extra={"attempt": self._num_retries, "max_retries": max_retries},
+                    )
+                    await asyncio.sleep(retry_interval)
+                    self._num_retries += 1
+            finally:
+                await self._close_active_session()
+
+    async def _send_task(self, session: AsyncSession) -> None:
+        try:
+            async for msg in self._msg_ch:
+                async with self._session_lock:
+                    if self._session_should_close.is_set() or (
+                        not self._active_session or self._active_session != session
+                    ):
+                        break
+                if isinstance(msg, types.LiveClientContent):
+                    await session.send_client_content(
+                        turns=msg.turns,  # type: ignore
+                        turn_complete=msg.turn_complete if msg.turn_complete is not None else True,
+                    )
+                elif isinstance(msg, types.LiveClientToolResponse) and msg.function_responses:
+                    await session.send_tool_response(function_responses=msg.function_responses)
+                elif isinstance(msg, types.LiveClientRealtimeInput):
+                    if msg.media_chunks:
+                        for media_chunk in msg.media_chunks:
+                            await session.send_realtime_input(media=media_chunk)
+                    elif msg.activity_start:
+                        await session.send_realtime_input(activity_start=msg.activity_start)
+                    elif msg.activity_end:
+                        await session.send_realtime_input(activity_end=msg.activity_end)
+                else:
+                    logger.warning(f"Warning: Received unhandled message type: {type(msg)}")
+
+                if lk_google_debug and isinstance(
+                    msg,
+                    (
+                        types.LiveClientContent,
+                        types.LiveClientToolResponse,
+                        types.LiveClientRealtimeInput,
+                    ),
+                ):
+                    if not isinstance(msg, types.LiveClientRealtimeInput) or not msg.media_chunks:
+                        logger.debug(
+                            f">>> sent {type(msg).__name__}",
+                            extra={"content": msg.model_dump(exclude_defaults=True)},
+                        )
+
+        except Exception as e:
+            if not self._session_should_close.is_set():
+                logger.error(f"error in send task: {e}", exc_info=e)
+                self._mark_restart_needed(on_error=True)
+        finally:
+            logger.debug("send task finished.")
+
+    async def _recv_task(self, session: AsyncSession) -> None:
+        try:
+            while True:
+                async with self._session_lock:
+                    if self._session_should_close.is_set() or (
+                        not self._active_session or self._active_session != session
+                    ):
+                        logger.debug("receive task: Session changed or closed, stopping receive.")
+                        break
+
+                async for response in session.receive():
+                    if lk_google_debug:
+                        resp_copy = response.model_dump(exclude_defaults=True)
+                        # remove audio from debugging logs
+                        if (
+                            (sc := resp_copy.get("server_content"))
+                            and (mt := sc.get("model_turn"))
+                            and (parts := mt.get("parts"))
+                        ):
+                            for part in parts:
+                                if part and part.get("inline_data"):
+                                    part["inline_data"] = "<audio>"
+                        logger.debug("<<< received response", extra={"response": resp_copy})
+
+                    if not self._current_generation or self._current_generation._done:
+                        if (sc := response.server_content) and sc.interrupted:
+                            # two cases an interrupted event is sent without an active generation
+                            # 1) the generation is done but playout is not finished (turn_complete -> interrupted)
+                            # 2) the generation is not started (interrupted -> turn_complete)
+                            # for both cases, we interrupt the agent if there is no pending generation from `generate_reply`
+                            # for the second case, the pending generation will be stopped by `turn_complete` event coming later
+                            if not self._pending_generation_fut:
+                                self._handle_input_speech_started()
+
+                            sc.interrupted = None
+                            sc_copy = sc.model_dump(exclude_none=True)
+                            if not sc_copy:
+                                # ignore empty server content
+                                response.server_content = None
+                                if lk_google_debug:
+                                    logger.debug("ignoring empty server content")
+
+                    if self._is_new_generation(response):
+                        self._start_new_generation()
+                        if lk_google_debug:
+                            logger.debug(f"new generation started: {self._current_generation}")
+
+                    if response.session_resumption_update:
+                        if (
+                            response.session_resumption_update.resumable
+                            and response.session_resumption_update.new_handle
+                        ):
+                            self._session_resumption_handle = (
+                                response.session_resumption_update.new_handle
+                            )
+
+                    if response.server_content:
+                        self._handle_server_content(response.server_content)
+                    if response.tool_call:
+                        self._handle_tool_calls(response.tool_call)
+                    if response.tool_call_cancellation:
+                        self._handle_tool_call_cancellation(response.tool_call_cancellation)
+                    if response.usage_metadata:
+                        self._handle_usage_metadata(response.usage_metadata)
+                    if response.go_away:
+                        self._handle_go_away(response.go_away)
+
+                    if self._num_retries > 0:
+                        self._num_retries = 0  # reset the retry counter
+
+                # TODO(dz): a server-side turn is complete
+        except Exception as e:
+            if not self._session_should_close.is_set():
+                logger.error(f"error in receive task: {e}", exc_info=e)
+                self._mark_restart_needed(on_error=True)
+        finally:
+            self._mark_current_generation_done()
+
+    def _build_connect_config(self) -> types.LiveConnectConfig:
+        temp = self._opts.temperature if is_given(self._opts.temperature) else None
+
+        tools_config = create_tools_config(
+            function_tools=self._gemini_declarations,
+            gemini_tools=self._opts.gemini_tools if is_given(self._opts.gemini_tools) else None,
+        )
+        conf = types.LiveConnectConfig(
+            response_modalities=self._opts.response_modalities,
+            generation_config=types.GenerationConfig(
+                candidate_count=self._opts.candidate_count,
+                temperature=temp,
+                max_output_tokens=self._opts.max_output_tokens
+                if is_given(self._opts.max_output_tokens)
+                else None,
+                top_p=self._opts.top_p if is_given(self._opts.top_p) else None,
+                top_k=self._opts.top_k if is_given(self._opts.top_k) else None,
+                presence_penalty=self._opts.presence_penalty
+                if is_given(self._opts.presence_penalty)
+                else None,
+                frequency_penalty=self._opts.frequency_penalty
+                if is_given(self._opts.frequency_penalty)
+                else None,
+                thinking_config=self._opts.thinking_config
+                if is_given(self._opts.thinking_config)
+                else None,
+            ),
+            system_instruction=types.Content(parts=[types.Part(text=self._opts.instructions)])
+            if is_given(self._opts.instructions)
+            else None,
+            speech_config=types.SpeechConfig(
+                voice_config=types.VoiceConfig(
+                    prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=self._opts.voice)
+                ),
+                language_code=self._opts.language if is_given(self._opts.language) else None,
+            ),
+            tools=tools_config,
+            input_audio_transcription=self._opts.input_audio_transcription,
+            output_audio_transcription=self._opts.output_audio_transcription,
+            session_resumption=types.SessionResumptionConfig(
+                handle=self._session_resumption_handle
+            ),
+        )
+
+        if is_given(self._opts.proactivity):
+            conf.proactivity = types.ProactivityConfig(proactive_audio=self._opts.proactivity)
+        if is_given(self._opts.enable_affective_dialog):
+            conf.enable_affective_dialog = self._opts.enable_affective_dialog
+        if is_given(self._opts.realtime_input_config):
+            conf.realtime_input_config = self._opts.realtime_input_config
+        if is_given(self._opts.context_window_compression):
+            conf.context_window_compression = self._opts.context_window_compression
+
+        return conf
+
+    def _start_new_generation(self) -> None:
+        if self._current_generation and not self._current_generation._done:
+            logger.warning("starting new generation while another is active. Finalizing previous.")
+            self._mark_current_generation_done()
+
+        response_id = utils.shortuuid("GR_")
+        self._current_generation = _ResponseGeneration(
+            message_ch=utils.aio.Chan[llm.MessageGeneration](),
+            function_ch=utils.aio.Chan[llm.FunctionCall](),
+            response_id=response_id,
+            input_id=utils.shortuuid("GI_"),
+            text_ch=utils.aio.Chan[str](),
+            audio_ch=utils.aio.Chan[rtc.AudioFrame](),
+            _created_timestamp=time.time(),
+        )
+        if not self._realtime_model.capabilities.audio_output:
+            self._current_generation.audio_ch.close()
+
+        msg_modalities = asyncio.Future[list[Literal["text", "audio"]]]()
+        msg_modalities.set_result(
+            ["audio", "text"] if self._realtime_model.capabilities.audio_output else ["text"]
+        )
+        self._current_generation.message_ch.send_nowait(
+            llm.MessageGeneration(
+                message_id=response_id,
+                text_stream=self._current_generation.text_ch,
+                audio_stream=self._current_generation.audio_ch,
+                modalities=msg_modalities,
+            )
+        )
+
+        generation_event = llm.GenerationCreatedEvent(
+            message_stream=self._current_generation.message_ch,
+            function_stream=self._current_generation.function_ch,
+            user_initiated=False,
+            response_id=self._current_generation.response_id,
+        )
+
+        if self._pending_generation_fut and not self._pending_generation_fut.done():
+            generation_event.user_initiated = True
+            self._pending_generation_fut.set_result(generation_event)
+            self._pending_generation_fut = None
+        else:
+            # emit input_speech_started event before starting an agent initiated generation
+            # to interrupt the previous audio playout if any
+            self._handle_input_speech_started()
+
+        self.emit("generation_created", generation_event)
+
+    def _handle_server_content(self, server_content: types.LiveServerContent) -> None:
+        current_gen = self._current_generation
+        if not current_gen:
+            logger.warning("received server content but no active generation.")
+            return
+
+        if model_turn := server_content.model_turn:
+            for part in model_turn.parts or []:
+                if part.thought:
+                    # bypass reasoning output
+                    continue
+                if part.text:
+                    current_gen.push_text(part.text)
+                if part.inline_data:
+                    if not current_gen._first_token_timestamp:
+                        current_gen._first_token_timestamp = time.time()
+                    frame_data = part.inline_data.data
+                    try:
+                        if not isinstance(frame_data, bytes):
+                            raise ValueError("frame_data is not bytes")
+                        frame = rtc.AudioFrame(
+                            data=frame_data,
+                            sample_rate=OUTPUT_AUDIO_SAMPLE_RATE,
+                            num_channels=OUTPUT_AUDIO_CHANNELS,
+                            samples_per_channel=len(frame_data) // (2 * OUTPUT_AUDIO_CHANNELS),
+                        )
+                        current_gen.audio_ch.send_nowait(frame)
+                    except ValueError as e:
+                        logger.error(f"Error creating audio frame from Gemini data: {e}")
+
+        if input_transcription := server_content.input_transcription:
+            text = input_transcription.text
+            if text:
+                if current_gen.input_transcription == "":
+                    # gemini would start with a space, which doesn't make sense
+                    # at beginning of the transcript
+                    text = text.lstrip()
+                current_gen.input_transcription += text
+                self.emit(
+                    "input_audio_transcription_completed",
+                    llm.InputTranscriptionCompleted(
+                        item_id=current_gen.input_id,
+                        transcript=current_gen.input_transcription,
+                        is_final=False,
+                    ),
+                )
+
+        if output_transcription := server_content.output_transcription:
+            text = output_transcription.text
+            if text:
+                current_gen.push_text(text)
+
+        if server_content.generation_complete or server_content.turn_complete:
+            current_gen._completed_timestamp = time.time()
+
+        if server_content.interrupted and not self._pending_generation_fut:
+            # interrupt agent if there is no pending user initiated generation
+            self._handle_input_speech_started()
+
+        if server_content.turn_complete:
+            self._mark_current_generation_done()
+
+    def _mark_current_generation_done(self) -> None:
+        if not self._current_generation or self._current_generation._done:
+            return
+
+        # emit input_speech_stopped event after the generation is done
+        self._handle_input_speech_stopped()
+
+        gen = self._current_generation
+
+        # The only way we'd know that the transcription is complete is by when they are
+        # done with generation
+        if gen.input_transcription:
+            self.emit(
+                "input_audio_transcription_completed",
+                llm.InputTranscriptionCompleted(
+                    item_id=gen.input_id,
+                    transcript=gen.input_transcription,
+                    is_final=True,
+                ),
+            )
+
+            # since gemini doesn't give us a view of the chat history on the server side,
+            # we would handle it manually here
+            self._chat_ctx.add_message(
+                role="user",
+                content=gen.input_transcription,
+                id=gen.input_id,
+            )
+
+        if gen.output_text:
+            self._chat_ctx.add_message(
+                role="assistant",
+                content=gen.output_text,
+                id=gen.response_id,
+            )
+
+        if not gen.text_ch.closed:
+            if self._opts.output_audio_transcription is None:
+                # close the text data of transcription synchronizer
+                gen.text_ch.send_nowait("")
+            gen.text_ch.close()
+        if not gen.audio_ch.closed:
+            gen.audio_ch.close()
+
+        gen.function_ch.close()
+        gen.message_ch.close()
+        gen._done = True
+        if lk_google_debug:
+            logger.debug(f"generation done {gen}")
+
+    def _handle_input_speech_started(self) -> None:
+        self.emit("input_speech_started", llm.InputSpeechStartedEvent())
+
+    def _handle_input_speech_stopped(self) -> None:
+        self.emit(
+            "input_speech_stopped",
+            llm.InputSpeechStoppedEvent(user_transcription_enabled=False),
+        )
+
+    def _handle_tool_calls(self, tool_call: types.LiveServerToolCall) -> None:
+        if not self._current_generation:
+            logger.warning("received tool call but no active generation.")
+            return
+
+        gen = self._current_generation
+        for fnc_call in tool_call.function_calls or []:
+            arguments = json.dumps(fnc_call.args)
+
+            gen.function_ch.send_nowait(
+                llm.FunctionCall(
+                    call_id=fnc_call.id or utils.shortuuid("fnc-call-"),
+                    name=fnc_call.name,
+                    arguments=arguments,
+                )
+            )
+        self._mark_current_generation_done()
+
+    def _handle_tool_call_cancellation(
+        self, tool_call_cancellation: types.LiveServerToolCallCancellation
+    ) -> None:
+        logger.warning(
+            "server cancelled tool calls",
+            extra={"function_call_ids": tool_call_cancellation.ids},
+        )
+
+    def _handle_usage_metadata(self, usage_metadata: types.UsageMetadata) -> None:
+        current_gen = self._current_generation
+        if not current_gen:
+            logger.warning("no active generation to report metrics for")
+            return
+
+        ttft = (
+            current_gen._first_token_timestamp - current_gen._created_timestamp
+            if current_gen._first_token_timestamp
+            else -1
+        )
+        duration = (
+            current_gen._completed_timestamp or time.time()
+        ) - current_gen._created_timestamp
+
+        def _token_details_map(
+            token_details: list[types.ModalityTokenCount] | None,
+        ) -> dict[str, int]:
+            token_details_map = {"audio_tokens": 0, "text_tokens": 0, "image_tokens": 0}
+            if not token_details:
+                return token_details_map
+
+            for token_detail in token_details:
+                if not token_detail.token_count:
+                    continue
+
+                if token_detail.modality == types.MediaModality.AUDIO:
+                    token_details_map["audio_tokens"] += token_detail.token_count
+                elif token_detail.modality == types.MediaModality.TEXT:
+                    token_details_map["text_tokens"] += token_detail.token_count
+                elif token_detail.modality == types.MediaModality.IMAGE:
+                    token_details_map["image_tokens"] += token_detail.token_count
+            return token_details_map
+
+        metrics = RealtimeModelMetrics(
+            label=self._realtime_model.label,
+            request_id=current_gen.response_id,
+            timestamp=current_gen._created_timestamp,
+            duration=duration,
+            ttft=ttft,
+            cancelled=False,
+            input_tokens=usage_metadata.prompt_token_count or 0,
+            output_tokens=usage_metadata.response_token_count or 0,
+            total_tokens=usage_metadata.total_token_count or 0,
+            tokens_per_second=(usage_metadata.response_token_count or 0) / duration
+            if duration > 0
+            else 0,
+            input_token_details=RealtimeModelMetrics.InputTokenDetails(
+                **_token_details_map(usage_metadata.prompt_tokens_details),
+                cached_tokens=sum(
+                    token_detail.token_count or 0
+                    for token_detail in usage_metadata.cache_tokens_details or []
+                ),
+                cached_tokens_details=RealtimeModelMetrics.CachedTokenDetails(
+                    **_token_details_map(usage_metadata.cache_tokens_details),
+                ),
+            ),
+            output_token_details=RealtimeModelMetrics.OutputTokenDetails(
+                **_token_details_map(usage_metadata.response_tokens_details),
+            ),
+            metadata=Metadata(
+                model_name=self._realtime_model.model, model_provider=self._realtime_model.provider
+            ),
+        )
+        self.emit("metrics_collected", metrics)
+
+    def _handle_go_away(self, go_away: types.LiveServerGoAway) -> None:
+        logger.warning(
+            f"Gemini server indicates disconnection soon. Time left: {go_away.time_left}"
+        )
+        # TODO(dz): this isn't a seamless reconnection just yet
+        self._session_should_close.set()
+
+    def commit_audio(self) -> None:
+        pass
+
+    def clear_audio(self) -> None:
+        pass
+
+    def _resample_audio(self, frame: rtc.AudioFrame) -> Iterator[rtc.AudioFrame]:
+        if self._input_resampler:
+            if frame.sample_rate != self._input_resampler._input_rate:
+                # input audio changed to a different sample rate
+                self._input_resampler = None
+
+        if self._input_resampler is None and (
+            frame.sample_rate != INPUT_AUDIO_SAMPLE_RATE
+            or frame.num_channels != INPUT_AUDIO_CHANNELS
+        ):
+            self._input_resampler = rtc.AudioResampler(
+                input_rate=frame.sample_rate,
+                output_rate=INPUT_AUDIO_SAMPLE_RATE,
+                num_channels=INPUT_AUDIO_CHANNELS,
+            )
+
+        if self._input_resampler:
+            # TODO(long): flush the resampler when the input source is changed
+            yield from self._input_resampler.push(frame)
+        else:
+            yield frame
+
+    def _emit_error(self, error: Exception, recoverable: bool) -> None:
+        self.emit(
+            "error",
+            llm.RealtimeModelError(
+                timestamp=time.time(),
+                label=self._realtime_model._label,
+                error=error,
+                recoverable=recoverable,
+            ),
+        )
+
+    def _is_new_generation(self, resp: types.LiveServerMessage) -> bool:
+        if resp.tool_call:
+            return True
+
+        if (sc := resp.server_content) and (
+            sc.model_turn
+            or (sc.output_transcription and sc.output_transcription is not None)
+            or (sc.input_transcription and sc.input_transcription is not None)
+            or (sc.generation_complete is not None)
+            or (sc.turn_complete is not None)
+        ):
+            return True
+
+        return False