livekit-plugins-google 0.3.0__py3-none-any.whl → 1.3.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1252 @@
+ from __future__ import annotations
+
+ import asyncio
+ import contextlib
+ import json
+ import os
+ import time
+ import weakref
+ from collections.abc import Iterator
+ from dataclasses import dataclass, field
+ from typing import Literal
+
+ from google.auth._default_async import default_async
+ from google.genai import Client as GenAIClient, types
+ from google.genai.live import AsyncSession
+ from livekit import rtc
+ from livekit.agents import APIConnectionError, llm, utils
+ from livekit.agents.metrics import RealtimeModelMetrics
+ from livekit.agents.metrics.base import Metadata
+ from livekit.agents.types import (
+     DEFAULT_API_CONNECT_OPTIONS,
+     NOT_GIVEN,
+     APIConnectOptions,
+     NotGivenOr,
+ )
+ from livekit.agents.utils import audio as audio_utils, images, is_given
+ from livekit.plugins.google.realtime.api_proto import ClientEvents, LiveAPIModels, Voice
+
+ from ..log import logger
+ from ..tools import _LLMTool
+ from ..utils import create_tools_config, get_tool_results_for_realtime, to_fnc_ctx
+ from ..version import __version__
+
+ INPUT_AUDIO_SAMPLE_RATE = 16000
+ INPUT_AUDIO_CHANNELS = 1
+ OUTPUT_AUDIO_SAMPLE_RATE = 24000
+ OUTPUT_AUDIO_CHANNELS = 1
+
+ DEFAULT_IMAGE_ENCODE_OPTIONS = images.EncodeOptions(
+     format="JPEG",
+     quality=75,
+     resize_options=images.ResizeOptions(width=1024, height=1024, strategy="scale_aspect_fit"),
+ )
+
+ lk_google_debug = int(os.getenv("LK_GOOGLE_DEBUG", 0))
+
+
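The constants above pin the wire formats: 16 kHz mono PCM in, 24 kHz mono PCM out. A quick sketch of the chunking arithmetic they imply for the 50 ms input frames used by RealtimeSession further down (illustrative names, not part of the plugin):

# Illustrative sketch (not part of the plugin): how the 50 ms input chunk size
# used by RealtimeSession falls out of these constants, assuming 16-bit PCM.
SAMPLES_PER_50MS_CHUNK = INPUT_AUDIO_SAMPLE_RATE // 20  # 800 samples = 50 ms at 16 kHz
BYTES_PER_50MS_CHUNK = SAMPLES_PER_50MS_CHUNK * INPUT_AUDIO_CHANNELS * 2  # 1600 bytes

assert SAMPLES_PER_50MS_CHUNK == 800 and BYTES_PER_50MS_CHUNK == 1600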
+ @dataclass
+ class InputTranscription:
+     item_id: str
+     transcript: str
+
+
+ @dataclass
+ class _RealtimeOptions:
+     model: LiveAPIModels | str
+     api_key: str | None
+     voice: Voice | str
+     language: NotGivenOr[str]
+     response_modalities: list[types.Modality]
+     vertexai: bool
+     project: str | None
+     location: str | None
+     candidate_count: int
+     temperature: NotGivenOr[float]
+     max_output_tokens: NotGivenOr[int]
+     top_p: NotGivenOr[float]
+     top_k: NotGivenOr[int]
+     presence_penalty: NotGivenOr[float]
+     frequency_penalty: NotGivenOr[float]
+     instructions: NotGivenOr[str]
+     input_audio_transcription: types.AudioTranscriptionConfig | None
+     output_audio_transcription: types.AudioTranscriptionConfig | None
+     image_encode_options: NotGivenOr[images.EncodeOptions]
+     conn_options: APIConnectOptions
+     http_options: NotGivenOr[types.HttpOptions]
+     enable_affective_dialog: NotGivenOr[bool] = NOT_GIVEN
+     proactivity: NotGivenOr[bool] = NOT_GIVEN
+     realtime_input_config: NotGivenOr[types.RealtimeInputConfig] = NOT_GIVEN
+     context_window_compression: NotGivenOr[types.ContextWindowCompressionConfig] = NOT_GIVEN
+     api_version: NotGivenOr[str] = NOT_GIVEN
+     gemini_tools: NotGivenOr[list[_LLMTool]] = NOT_GIVEN
+     tool_behavior: NotGivenOr[types.Behavior] = NOT_GIVEN
+     tool_response_scheduling: NotGivenOr[types.FunctionResponseScheduling] = NOT_GIVEN
+     thinking_config: NotGivenOr[types.ThinkingConfig] = NOT_GIVEN
+     session_resumption: NotGivenOr[types.SessionResumptionConfig] = NOT_GIVEN
+
+
+ @dataclass
+ class _ResponseGeneration:
+     message_ch: utils.aio.Chan[llm.MessageGeneration]
+     function_ch: utils.aio.Chan[llm.FunctionCall]
+
+     input_id: str
+     response_id: str
+     text_ch: utils.aio.Chan[str]
+     audio_ch: utils.aio.Chan[rtc.AudioFrame]
+
+     input_transcription: str = ""
+     output_text: str = ""
+
+     _created_timestamp: float = field(default_factory=time.time)
+     """The timestamp when the generation is created"""
+     _first_token_timestamp: float | None = None
+     """The timestamp when the first audio token is received"""
+     _completed_timestamp: float | None = None
+     """The timestamp when the generation is completed"""
+     _done: bool = False
+     """Whether the generation is done (set when the turn is complete)"""
+
+     def push_text(self, text: str) -> None:
+         if self.output_text:
+             self.output_text += text
+         else:
+             self.output_text = text
+
+         self.text_ch.send_nowait(text)
+
+
+ class RealtimeModel(llm.RealtimeModel):
+     def __init__(
+         self,
+         *,
+         instructions: NotGivenOr[str] = NOT_GIVEN,
+         model: NotGivenOr[LiveAPIModels | str] = NOT_GIVEN,
+         api_key: NotGivenOr[str] = NOT_GIVEN,
+         voice: Voice | str = "Puck",
+         language: NotGivenOr[str] = NOT_GIVEN,
+         modalities: NotGivenOr[list[types.Modality]] = NOT_GIVEN,
+         vertexai: NotGivenOr[bool] = NOT_GIVEN,
+         project: NotGivenOr[str] = NOT_GIVEN,
+         location: NotGivenOr[str] = NOT_GIVEN,
+         candidate_count: int = 1,
+         temperature: NotGivenOr[float] = NOT_GIVEN,
+         max_output_tokens: NotGivenOr[int] = NOT_GIVEN,
+         top_p: NotGivenOr[float] = NOT_GIVEN,
+         top_k: NotGivenOr[int] = NOT_GIVEN,
+         presence_penalty: NotGivenOr[float] = NOT_GIVEN,
+         frequency_penalty: NotGivenOr[float] = NOT_GIVEN,
+         input_audio_transcription: NotGivenOr[types.AudioTranscriptionConfig | None] = NOT_GIVEN,
+         output_audio_transcription: NotGivenOr[types.AudioTranscriptionConfig | None] = NOT_GIVEN,
+         image_encode_options: NotGivenOr[images.EncodeOptions] = NOT_GIVEN,
+         enable_affective_dialog: NotGivenOr[bool] = NOT_GIVEN,
+         proactivity: NotGivenOr[bool] = NOT_GIVEN,
+         realtime_input_config: NotGivenOr[types.RealtimeInputConfig] = NOT_GIVEN,
+         context_window_compression: NotGivenOr[types.ContextWindowCompressionConfig] = NOT_GIVEN,
+         tool_behavior: NotGivenOr[types.Behavior] = NOT_GIVEN,
+         tool_response_scheduling: NotGivenOr[types.FunctionResponseScheduling] = NOT_GIVEN,
+         session_resumption: NotGivenOr[types.SessionResumptionConfig] = NOT_GIVEN,
+         api_version: NotGivenOr[str] = NOT_GIVEN,
+         conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
+         http_options: NotGivenOr[types.HttpOptions] = NOT_GIVEN,
+         _gemini_tools: NotGivenOr[list[_LLMTool]] = NOT_GIVEN,
+         thinking_config: NotGivenOr[types.ThinkingConfig] = NOT_GIVEN,
+     ) -> None:
+         """
+         Initializes a RealtimeModel instance for interacting with Google's Realtime API.
+
+         Environment Requirements:
+         - For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of the service account key file or use any of the other Google Cloud auth methods.
+             The Google Cloud project and location can be set via the `project` and `location` arguments or the environment variables
+             `GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_LOCATION`. By default, the project is inferred from the service account key file,
+             and the location defaults to "us-central1".
+         - For the Google Gemini API: Set the `api_key` argument or the `GOOGLE_API_KEY` environment variable.
+
+         Args:
+             instructions (str, optional): Initial system instructions for the model. Defaults to None (no system instruction).
+             api_key (str, optional): Google Gemini API key. If None, will attempt to read from the environment variable GOOGLE_API_KEY.
+             modalities (list[Modality], optional): Modalities to use, such as ["TEXT", "AUDIO"]. Defaults to ["AUDIO"].
+             model (str, optional): The name of the model to use. Defaults to "gemini-2.5-flash-native-audio-preview-12-2025", or "gemini-live-2.5-flash-native-audio" when using VertexAI.
+             voice (api_proto.Voice, optional): Voice setting for audio outputs. Defaults to "Puck".
+             language (str, optional): The language (BCP-47 code) to use for the API. Supported languages: https://ai.google.dev/gemini-api/docs/live#supported-languages
+             temperature (float, optional): Sampling temperature for response generation. Defaults to 0.8.
+             vertexai (bool, optional): Whether to use VertexAI for the API. Defaults to False.
+             project (str, optional): The project ID to use for the API (for VertexAI). Defaults to None.
+             location (str, optional): The location to use for the API (for VertexAI). Defaults to None.
+             candidate_count (int, optional): The number of candidate responses to generate. Defaults to 1.
+             top_p (float, optional): The top-p value for response generation.
+             top_k (int, optional): The top-k value for response generation.
+             presence_penalty (float, optional): The presence penalty for response generation.
+             frequency_penalty (float, optional): The frequency penalty for response generation.
+             input_audio_transcription (AudioTranscriptionConfig | None, optional): The configuration for input audio transcription. Defaults to AudioTranscriptionConfig().
+             output_audio_transcription (AudioTranscriptionConfig | None, optional): The configuration for output audio transcription. Defaults to AudioTranscriptionConfig().
+             image_encode_options (images.EncodeOptions, optional): The configuration for image encoding. Defaults to DEFAULT_IMAGE_ENCODE_OPTIONS.
+             enable_affective_dialog (bool, optional): Whether to enable affective dialog. Defaults to False.
+             proactivity (bool, optional): Whether to enable proactive audio. Defaults to False.
+             realtime_input_config (RealtimeInputConfig, optional): The configuration for realtime input. Defaults to None.
+             context_window_compression (ContextWindowCompressionConfig, optional): The configuration for context window compression. Defaults to None.
+             tool_behavior (Behavior, optional): The behavior for tool calls. The default behavior is BLOCK in the Gemini Realtime API.
+             tool_response_scheduling (FunctionResponseScheduling, optional): The scheduling for tool responses. The default scheduling is WHEN_IDLE.
+             session_resumption (SessionResumptionConfig, optional): The configuration for session resumption. Defaults to None.
+             thinking_config (ThinkingConfig, optional): Native audio thinking configuration.
+             conn_options (APIConnectOptions, optional): The configuration for the API connection. Defaults to DEFAULT_API_CONNECT_OPTIONS.
+             _gemini_tools (list[LLMTool], optional): Gemini-specific tools to use for the session. This parameter is experimental and may change.
+
+         Raises:
+             ValueError: If the API key is required but not found.
+         """ # noqa: E501
+         if not is_given(input_audio_transcription):
+             input_audio_transcription = types.AudioTranscriptionConfig()
+         if not is_given(output_audio_transcription):
+             output_audio_transcription = types.AudioTranscriptionConfig()
+
+         server_turn_detection = True
+         if (
+             is_given(realtime_input_config)
+             and realtime_input_config.automatic_activity_detection
+             and realtime_input_config.automatic_activity_detection.disabled
+         ):
+             server_turn_detection = False
+         modalities = modalities if is_given(modalities) else [types.Modality.AUDIO]
+
+         super().__init__(
+             capabilities=llm.RealtimeCapabilities(
+                 message_truncation=False,
+                 turn_detection=server_turn_detection,
+                 user_transcription=input_audio_transcription is not None,
+                 auto_tool_reply_generation=True,
+                 audio_output=types.Modality.AUDIO in modalities,
+                 manual_function_calls=False,
+             )
+         )
+
+         if not is_given(model):
+             if vertexai:
+                 model = "gemini-live-2.5-flash-native-audio"
+             else:
+                 model = "gemini-2.5-flash-native-audio-preview-12-2025"
+
+         gemini_api_key = api_key if is_given(api_key) else os.environ.get("GOOGLE_API_KEY")
+         gcp_project = project if is_given(project) else os.environ.get("GOOGLE_CLOUD_PROJECT")
+         gcp_location: str | None = (
+             location
+             if is_given(location)
+             else os.environ.get("GOOGLE_CLOUD_LOCATION") or "us-central1"
+         )
+         use_vertexai = (
+             vertexai
+             if is_given(vertexai)
+             else os.environ.get("GOOGLE_GENAI_USE_VERTEXAI", "0").lower() in ["true", "1"]
+         )
+
+         if use_vertexai:
+             if not gcp_project:
+                 _, gcp_project = default_async( # type: ignore
+                     scopes=["https://www.googleapis.com/auth/cloud-platform"]
+                 )
+             if not gcp_project or not gcp_location:
+                 raise ValueError(
+                     "Project and location are required for VertexAI; set them via the project/location kwargs or the GOOGLE_CLOUD_PROJECT/GOOGLE_CLOUD_LOCATION environment variables" # noqa: E501
+                 )
+             gemini_api_key = None # VertexAI does not require an API key
+         else:
+             gcp_project = None
+             gcp_location = None
+             if not gemini_api_key:
+                 raise ValueError(
+                     "API key is required for the Gemini API; set it via the api_key kwarg or the GOOGLE_API_KEY environment variable" # noqa: E501
+                 )
+
+         self._opts = _RealtimeOptions(
+             model=model,
+             api_key=gemini_api_key,
+             voice=voice,
+             response_modalities=modalities,
+             vertexai=use_vertexai,
+             project=gcp_project,
+             location=gcp_location,
+             candidate_count=candidate_count,
+             temperature=temperature,
+             max_output_tokens=max_output_tokens,
+             top_p=top_p,
+             top_k=top_k,
+             presence_penalty=presence_penalty,
+             frequency_penalty=frequency_penalty,
+             instructions=instructions,
+             input_audio_transcription=input_audio_transcription,
+             output_audio_transcription=output_audio_transcription,
+             language=language,
+             image_encode_options=image_encode_options,
+             enable_affective_dialog=enable_affective_dialog,
+             proactivity=proactivity,
+             realtime_input_config=realtime_input_config,
+             context_window_compression=context_window_compression,
+             api_version=api_version,
+             gemini_tools=_gemini_tools,
+             tool_behavior=tool_behavior,
+             tool_response_scheduling=tool_response_scheduling,
+             conn_options=conn_options,
+             http_options=http_options,
+             thinking_config=thinking_config,
+             session_resumption=session_resumption,
+         )
+
+         self._sessions = weakref.WeakSet[RealtimeSession]()
+
+     @property
+     def model(self) -> str:
+         return self._opts.model
+
+     @property
+     def provider(self) -> str:
+         if self._opts.vertexai:
+             return "Vertex AI"
+         else:
+             return "Gemini"
+
+     def session(self) -> RealtimeSession:
+         sess = RealtimeSession(self)
+         self._sessions.add(sess)
+         return sess
+
+     def update_options(
+         self,
+         *,
+         voice: NotGivenOr[str] = NOT_GIVEN,
+         temperature: NotGivenOr[float] = NOT_GIVEN,
+         tool_behavior: NotGivenOr[types.Behavior] = NOT_GIVEN,
+         tool_response_scheduling: NotGivenOr[types.FunctionResponseScheduling] = NOT_GIVEN,
+     ) -> None:
+         """
+         Update the options for the RealtimeModel.
+
+         Args:
+             voice (str, optional): The voice to use for the session.
+             temperature (float, optional): The temperature to use for the session.
+             tool_behavior (Behavior, optional): The behavior for tool calls.
+             tool_response_scheduling (FunctionResponseScheduling, optional): The scheduling for tool responses.
+         """
+         if is_given(voice):
+             self._opts.voice = voice
+
+         if is_given(temperature):
+             self._opts.temperature = temperature
+
+         if is_given(tool_behavior):
+             self._opts.tool_behavior = tool_behavior
+
+         if is_given(tool_response_scheduling):
+             self._opts.tool_response_scheduling = tool_response_scheduling
+
+         for sess in self._sessions:
+             sess.update_options(
+                 voice=self._opts.voice,
+                 temperature=self._opts.temperature,
+                 tool_behavior=self._opts.tool_behavior,
+                 tool_response_scheduling=self._opts.tool_response_scheduling,
+             )
+
+     async def aclose(self) -> None:
+         pass
+
+
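A minimal construction sketch for both backends, per the environment requirements in the docstring above. The import path is an assumption and may differ across plugin versions:

# Illustrative sketch only; import paths and availability may vary by version.
import os

from livekit.plugins.google.realtime import RealtimeModel  # assumed export path

# Gemini API: reads GOOGLE_API_KEY from the environment if api_key is omitted.
model = RealtimeModel(
    instructions="You are a helpful voice assistant.",
    voice="Puck",
    temperature=0.8,
)

# VertexAI: no API key; project/location come from kwargs or the environment.
vertex_model = RealtimeModel(
    vertexai=True,
    project=os.environ.get("GOOGLE_CLOUD_PROJECT"),
    location="us-central1",
)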
+ class RealtimeSession(llm.RealtimeSession):
+     def __init__(self, realtime_model: RealtimeModel) -> None:
+         super().__init__(realtime_model)
+         self._opts = realtime_model._opts
+         self._tools = llm.ToolContext.empty()
+         self._gemini_declarations: list[types.FunctionDeclaration] = []
+         self._chat_ctx = llm.ChatContext.empty()
+         self._msg_ch = utils.aio.Chan[ClientEvents]()
+         self._input_resampler: rtc.AudioResampler | None = None
+
+         # 50ms chunks
+         self._bstream = audio_utils.AudioByteStream(
+             INPUT_AUDIO_SAMPLE_RATE,
+             INPUT_AUDIO_CHANNELS,
+             samples_per_channel=INPUT_AUDIO_SAMPLE_RATE // 20,
+         )
+
+         api_version = self._opts.api_version
+         if not api_version and (self._opts.enable_affective_dialog or self._opts.proactivity):
+             api_version = "v1alpha"
+
+         http_options = self._opts.http_options or types.HttpOptions(
+             timeout=int(self._opts.conn_options.timeout * 1000)
+         )
+         if api_version:
+             http_options.api_version = api_version
+         if not http_options.headers:
+             http_options.headers = {}
+         http_options.headers["x-goog-api-client"] = f"livekit-agents/{__version__}"
+
+         self._client = GenAIClient(
+             api_key=self._opts.api_key,
+             vertexai=self._opts.vertexai,
+             project=self._opts.project,
+             location=self._opts.location,
+             http_options=http_options,
+         )
+
+         self._main_atask = asyncio.create_task(self._main_task(), name="gemini-realtime-session")
+
+         self._current_generation: _ResponseGeneration | None = None
+         self._active_session: AsyncSession | None = None
+         # indicates if the underlying session should end
+         self._session_should_close = asyncio.Event()
+         self._response_created_futures: dict[str, asyncio.Future[llm.GenerationCreatedEvent]] = {}
+         self._pending_generation_fut: asyncio.Future[llm.GenerationCreatedEvent] | None = None
+
+         self._session_resumption_handle: str | None = (
+             self._opts.session_resumption.handle
+             if is_given(self._opts.session_resumption)
+             else None
+         )
+
+         self._in_user_activity = False
+         self._session_lock = asyncio.Lock()
+         self._num_retries = 0
+
+     async def _close_active_session(self) -> None:
+         async with self._session_lock:
+             if self._active_session:
+                 try:
+                     await self._active_session.close()
+                 except Exception as e:
+                     logger.warning(f"error closing Gemini session: {e}")
+                 finally:
+                     self._active_session = None
+
+     def _mark_restart_needed(self, on_error: bool = False) -> None:
+         if not self._session_should_close.is_set():
+             self._session_should_close.set()
+             # reset the msg_ch, do not send messages from the previous session
+             if not on_error:
+                 while not self._msg_ch.empty():
+                     msg = self._msg_ch.recv_nowait()
+                     if isinstance(msg, types.LiveClientContent) and msg.turn_complete is True:
+                         logger.warning(
+                             "discarding client content for turn completion, may cause generate_reply timeout",
+                             extra={"content": str(msg)},
+                         )
+
+             self._msg_ch = utils.aio.Chan[ClientEvents]()
+
+     def update_options(
+         self,
+         *,
+         voice: NotGivenOr[str] = NOT_GIVEN,
+         temperature: NotGivenOr[float] = NOT_GIVEN,
+         tool_choice: NotGivenOr[llm.ToolChoice | None] = NOT_GIVEN,
+         tool_behavior: NotGivenOr[types.Behavior] = NOT_GIVEN,
+         tool_response_scheduling: NotGivenOr[types.FunctionResponseScheduling] = NOT_GIVEN,
+     ) -> None:
+         should_restart = False
+         if is_given(voice) and self._opts.voice != voice:
+             self._opts.voice = voice
+             should_restart = True
+
+         if is_given(temperature) and self._opts.temperature != temperature:
+             self._opts.temperature = temperature
+             should_restart = True
+
+         if is_given(tool_behavior) and self._opts.tool_behavior != tool_behavior:
+             self._opts.tool_behavior = tool_behavior
+             should_restart = True
+
+         if (
+             is_given(tool_response_scheduling)
+             and self._opts.tool_response_scheduling != tool_response_scheduling
+         ):
+             self._opts.tool_response_scheduling = tool_response_scheduling
+             # no need to restart
+
+         if is_given(tool_choice):
+             logger.warning("tool_choice is not supported by the Google Realtime API.")
+
+         if should_restart:
+             self._mark_restart_needed()
+
+     async def update_instructions(self, instructions: str) -> None:
+         if not is_given(self._opts.instructions) or self._opts.instructions != instructions:
+             self._opts.instructions = instructions
+             self._mark_restart_needed()
+
+     async def update_chat_ctx(self, chat_ctx: llm.ChatContext) -> None:
+         async with self._session_lock:
+             if not self._active_session:
+                 self._chat_ctx = chat_ctx.copy()
+                 return
+
+         diff_ops = llm.utils.compute_chat_ctx_diff(self._chat_ctx, chat_ctx)
+
+         if diff_ops.to_remove:
+             logger.warning("Gemini Live does not support removing messages")
+
+         append_ctx = llm.ChatContext.empty()
+         for _, item_id in diff_ops.to_create:
+             item = chat_ctx.get_by_id(item_id)
+             if item:
+                 append_ctx.items.append(item)
+
+         if append_ctx.items:
+             turns_dict, _ = append_ctx.copy(
+                 exclude_function_call=True,
+             ).to_provider_format(format="google", inject_dummy_user_message=False)
+             # we are not generating, and do not need to inject a dummy user message
+             turns = [types.Content.model_validate(turn) for turn in turns_dict]
+             tool_results = get_tool_results_for_realtime(
+                 append_ctx,
+                 vertexai=self._opts.vertexai,
+                 tool_response_scheduling=self._opts.tool_response_scheduling,
+             )
+             if turns:
+                 self._send_client_event(types.LiveClientContent(turns=turns, turn_complete=False))
+             if tool_results:
+                 self._send_client_event(tool_results)
+
+         # since we don't have a view of the history on the server side, we'll assume
+         # the current state is accurate. this isn't perfect because removals aren't done.
+         self._chat_ctx = chat_ctx.copy()
+
+     async def update_tools(self, tools: list[llm.FunctionTool | llm.RawFunctionTool]) -> None:
+         new_declarations: list[types.FunctionDeclaration] = to_fnc_ctx(
+             tools, use_parameters_json_schema=False, tool_behavior=self._opts.tool_behavior
+         )
+         current_tool_names = {f.name for f in self._gemini_declarations}
+         new_tool_names = {f.name for f in new_declarations}
+
+         if current_tool_names != new_tool_names:
+             self._gemini_declarations = new_declarations
+             self._tools = llm.ToolContext(tools)
+             self._mark_restart_needed()
+
+     @property
+     def chat_ctx(self) -> llm.ChatContext:
+         return self._chat_ctx.copy()
+
+     @property
+     def tools(self) -> llm.ToolContext:
+         return self._tools.copy()
+
+     @property
+     def _manual_activity_detection(self) -> bool:
+         if (
+             is_given(self._opts.realtime_input_config)
+             and self._opts.realtime_input_config.automatic_activity_detection is not None
+             and self._opts.realtime_input_config.automatic_activity_detection.disabled
+         ):
+             return True
+         return False
+
+     @property
+     def session_resumption_handle(self) -> str | None:
+         return self._session_resumption_handle
+
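The resumption handle exposed above rotates as the server issues new handles; persisting it lets a later process resume the session. A hedged sketch, where `load_handle`/`store_handle` are hypothetical persistence helpers and the import path is assumed:

# Illustrative sketch: resuming a Gemini Live session across process restarts.
from google.genai import types

from livekit.plugins.google.realtime import RealtimeModel  # assumed export path


def resumable_session(load_handle, store_handle):
    # load_handle/store_handle are hypothetical helpers supplied by the app
    model = RealtimeModel(
        session_resumption=types.SessionResumptionConfig(handle=load_handle()),
    )
    session = model.session()
    # ... run the session; later, persist the rotated handle for the next run:
    store_handle(session.session_resumption_handle)
    return session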
+     def push_audio(self, frame: rtc.AudioFrame) -> None:
+         for f in self._resample_audio(frame):
+             for nf in self._bstream.write(f.data.tobytes()):
+                 realtime_input = types.LiveClientRealtimeInput(
+                     media_chunks=[
+                         types.Blob(
+                             data=nf.data.tobytes(),
+                             mime_type=f"audio/pcm;rate={INPUT_AUDIO_SAMPLE_RATE}",
+                         )
+                     ]
+                 )
+                 self._send_client_event(realtime_input)
+
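Feeding the session is push-based: frames at other sample rates are resampled to 16 kHz mono and re-chunked into 50 ms blobs before being sent. A minimal sketch, assuming a `session` obtained via RealtimeModel.session():

# Illustrative sketch: pushing 20 ms of mono silence captured at 48 kHz; the
# session resamples to 16 kHz and re-chunks into 50 ms blobs internally.
from livekit import rtc


def push_20ms_silence(session) -> None:
    samples = 48000 // 50  # 960 samples per channel = 20 ms at 48 kHz
    frame = rtc.AudioFrame(
        data=b"\x00" * (samples * 2),  # 16-bit mono PCM
        sample_rate=48000,
        num_channels=1,
        samples_per_channel=samples,
    )
    session.push_audio(frame)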
+     def push_video(self, frame: rtc.VideoFrame) -> None:
+         encoded_data = images.encode(
+             frame, self._opts.image_encode_options or DEFAULT_IMAGE_ENCODE_OPTIONS
+         )
+         realtime_input = types.LiveClientRealtimeInput(
+             media_chunks=[types.Blob(data=encoded_data, mime_type="image/jpeg")]
+         )
+         self._send_client_event(realtime_input)
+
+     def _send_client_event(self, event: ClientEvents) -> None:
+         with contextlib.suppress(utils.aio.channel.ChanClosed):
+             self._msg_ch.send_nowait(event)
+
+     def generate_reply(
+         self, *, instructions: NotGivenOr[str] = NOT_GIVEN
+     ) -> asyncio.Future[llm.GenerationCreatedEvent]:
+         if self._pending_generation_fut and not self._pending_generation_fut.done():
+             logger.warning(
+                 "generate_reply called while another generation is pending, cancelling previous."
+             )
+             self._pending_generation_fut.cancel("Superseded by new generate_reply call")
+
+         fut = asyncio.Future[llm.GenerationCreatedEvent]()
+         self._pending_generation_fut = fut
+
+         if self._in_user_activity:
+             self._send_client_event(
+                 types.LiveClientRealtimeInput(
+                     activity_end=types.ActivityEnd(),
+                 )
+             )
+             self._in_user_activity = False
+
+         # Gemini requires the last message to end with the user's turn,
+         # so we need to add a placeholder user turn in order to trigger a new generation
+         turns = []
+         if is_given(instructions):
+             turns.append(types.Content(parts=[types.Part(text=instructions)], role="model"))
+         turns.append(types.Content(parts=[types.Part(text=".")], role="user"))
+         self._send_client_event(types.LiveClientContent(turns=turns, turn_complete=True))
+
+         def _on_timeout() -> None:
+             if not fut.done():
+                 fut.set_exception(
+                     llm.RealtimeError(
+                         "generate_reply timed out waiting for generation_created event."
+                     )
+                 )
+                 if self._pending_generation_fut is fut:
+                     self._pending_generation_fut = None
+
+         timeout_handle = asyncio.get_event_loop().call_later(5.0, _on_timeout)
+         fut.add_done_callback(lambda _: timeout_handle.cancel())
+
+         return fut
+
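generate_reply returns a future that resolves once the server acknowledges the new generation, or fails after the 5 s timeout above. A hedged sketch of consuming it; the stream-iteration details are assumptions about the livekit-agents channel types:

# Illustrative sketch: triggering an agent-initiated reply and draining the
# text stream. `session` is a RealtimeSession from RealtimeModel.session().
async def say_hello(session) -> None:
    event = await session.generate_reply(instructions="Greet the user briefly.")
    async for msg in event.message_stream:
        async for delta in msg.text_stream:
            print(delta, end="")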
+     def start_user_activity(self) -> None:
+         if not self._manual_activity_detection:
+             return
+
+         if not self._in_user_activity:
+             self._in_user_activity = True
+             self._send_client_event(
+                 types.LiveClientRealtimeInput(
+                     activity_start=types.ActivityStart(),
+                 )
+             )
+
+     def interrupt(self) -> None:
+         # Gemini Live treats an activity start as an interruption, so we rely on
+         # start_user_activity notifications to handle it
+         if (
+             self._opts.realtime_input_config
+             and self._opts.realtime_input_config.activity_handling
+             == types.ActivityHandling.NO_INTERRUPTION
+         ):
+             return
+         self.start_user_activity()
+
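start_user_activity and interrupt only take effect when server-side VAD is disabled via realtime_input_config. A hedged sketch of that configuration; the google.genai type names are assumed from the imports above:

# Illustrative sketch: disabling automatic activity detection so the client
# drives turns via start_user_activity() / generate_reply().
from google.genai import types

from livekit.plugins.google.realtime import RealtimeModel  # assumed export path

model = RealtimeModel(
    realtime_input_config=types.RealtimeInputConfig(
        automatic_activity_detection=types.AutomaticActivityDetection(disabled=True),
    ),
)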
+     def truncate(
+         self,
+         *,
+         message_id: str,
+         modalities: list[Literal["text", "audio"]],
+         audio_end_ms: int,
+         audio_transcript: NotGivenOr[str] = NOT_GIVEN,
+     ) -> None:
+         logger.warning("truncate is not supported by the Google Realtime API.")
+
+     async def aclose(self) -> None:
+         self._msg_ch.close()
+         self._session_should_close.set()
+
+         if self._main_atask:
+             await utils.aio.cancel_and_wait(self._main_atask)
+
+         await self._close_active_session()
+
+         if self._pending_generation_fut and not self._pending_generation_fut.done():
+             self._pending_generation_fut.cancel("Session closed")
+
+         for fut in self._response_created_futures.values():
+             if not fut.done():
+                 fut.set_exception(llm.RealtimeError("Session closed before response created"))
+         self._response_created_futures.clear()
+
+         if self._current_generation:
+             self._mark_current_generation_done()
+
+     @utils.log_exceptions(logger=logger)
+     async def _main_task(self) -> None:
+         max_retries = self._opts.conn_options.max_retry
+
+         while not self._msg_ch.closed:
+             # previous session might not be closed yet, we'll do it here.
+             await self._close_active_session()
+
+             self._session_should_close.clear()
+             config = self._build_connect_config()
+             session = None
+             try:
+                 logger.debug("connecting to Gemini Realtime API...")
+                 async with self._client.aio.live.connect(
+                     model=self._opts.model, config=config
+                 ) as session:
+                     async with self._session_lock:
+                         self._active_session = session
+                     # queue up the existing chat context
+                     turns_dict, _ = self._chat_ctx.copy(
+                         exclude_function_call=True,
+                     ).to_provider_format(format="google", inject_dummy_user_message=False)
+                     if turns_dict:
+                         turns = [types.Content.model_validate(turn) for turn in turns_dict]
+                         await session.send_client_content(
+                             turns=turns, # type: ignore
+                             turn_complete=False,
+                         )
+                     send_task = asyncio.create_task(
+                         self._send_task(session), name="gemini-realtime-send"
+                     )
+                     recv_task = asyncio.create_task(
+                         self._recv_task(session), name="gemini-realtime-recv"
+                     )
+                     restart_wait_task = asyncio.create_task(
+                         self._session_should_close.wait(), name="gemini-restart-wait"
+                     )
+
+                     done, pending = await asyncio.wait(
+                         [send_task, recv_task, restart_wait_task],
+                         return_when=asyncio.FIRST_COMPLETED,
+                     )
+
+                     for task in done:
+                         if task is not restart_wait_task and task.exception():
+                             logger.error(f"error in task {task.get_name()}: {task.exception()}")
+                             raise task.exception() or Exception(f"{task.get_name()} failed")
+
+                     if restart_wait_task not in done and self._msg_ch.closed:
+                         break
+
+                     for task in pending:
+                         await utils.aio.cancel_and_wait(task)
+
+             except asyncio.CancelledError:
+                 break
+             except Exception as e:
+                 logger.error(f"Gemini Realtime API error: {e}", exc_info=e)
+                 if not self._msg_ch.closed:
+                     # we shouldn't retry when it's not connected, usually this means incorrect
+                     # parameters or setup
+                     if not session or max_retries == 0:
+                         self._emit_error(e, recoverable=False)
+                         raise APIConnectionError(message="Failed to connect to Gemini Live") from e
+
+                     if self._num_retries == max_retries:
+                         self._emit_error(e, recoverable=False)
+                         raise APIConnectionError(
+                             message=f"Failed to connect to Gemini Live after {max_retries} attempts"
+                         ) from e
+
+                     retry_interval = self._opts.conn_options._interval_for_retry(self._num_retries)
+                     logger.warning(
+                         f"Gemini Realtime API connection failed, retrying in {retry_interval}s",
+                         exc_info=e,
+                         extra={"attempt": self._num_retries, "max_retries": max_retries},
+                     )
+                     await asyncio.sleep(retry_interval)
+                     self._num_retries += 1
+             finally:
+                 await self._close_active_session()
+
+     async def _send_task(self, session: AsyncSession) -> None:
+         try:
+             async for msg in self._msg_ch:
+                 async with self._session_lock:
+                     if self._session_should_close.is_set() or (
+                         not self._active_session or self._active_session != session
+                     ):
+                         break
+                 if isinstance(msg, types.LiveClientContent):
+                     await session.send_client_content(
+                         turns=msg.turns, # type: ignore
+                         turn_complete=msg.turn_complete if msg.turn_complete is not None else True,
+                     )
+                 elif isinstance(msg, types.LiveClientToolResponse) and msg.function_responses:
+                     await session.send_tool_response(function_responses=msg.function_responses)
+                 elif isinstance(msg, types.LiveClientRealtimeInput):
+                     if msg.media_chunks:
+                         for media_chunk in msg.media_chunks:
+                             await session.send_realtime_input(media=media_chunk)
+                     elif msg.activity_start:
+                         await session.send_realtime_input(activity_start=msg.activity_start)
+                     elif msg.activity_end:
+                         await session.send_realtime_input(activity_end=msg.activity_end)
+                 else:
+                     logger.warning(f"received unhandled message type: {type(msg)}")
+
+                 if lk_google_debug and isinstance(
+                     msg,
+                     (
+                         types.LiveClientContent,
+                         types.LiveClientToolResponse,
+                         types.LiveClientRealtimeInput,
+                     ),
+                 ):
+                     if not isinstance(msg, types.LiveClientRealtimeInput) or not msg.media_chunks:
+                         logger.debug(
+                             f">>> sent {type(msg).__name__}",
+                             extra={"content": msg.model_dump(exclude_defaults=True)},
+                         )
+
+         except Exception as e:
+             if not self._session_should_close.is_set():
+                 logger.error(f"error in send task: {e}", exc_info=e)
+                 self._mark_restart_needed(on_error=True)
+         finally:
+             logger.debug("send task finished.")
+
+     async def _recv_task(self, session: AsyncSession) -> None:
+         try:
+             while True:
+                 async with self._session_lock:
+                     if self._session_should_close.is_set() or (
+                         not self._active_session or self._active_session != session
+                     ):
+                         logger.debug("receive task: session changed or closed, stopping receive.")
+                         break
+
+                 async for response in session.receive():
+                     if lk_google_debug:
+                         resp_copy = response.model_dump(exclude_defaults=True)
+                         # remove audio from debugging logs
+                         if (
+                             (sc := resp_copy.get("server_content"))
+                             and (mt := sc.get("model_turn"))
+                             and (parts := mt.get("parts"))
+                         ):
+                             for part in parts:
+                                 if part and part.get("inline_data"):
+                                     part["inline_data"] = "<audio>"
+                         logger.debug("<<< received response", extra={"response": resp_copy})
+
+                     if not self._current_generation or self._current_generation._done:
+                         if (sc := response.server_content) and sc.interrupted:
+                             # two cases where an interrupted event is sent without an active generation:
+                             # 1) the generation is done but playout is not finished (turn_complete -> interrupted)
+                             # 2) the generation has not started (interrupted -> turn_complete)
+                             # in both cases, we interrupt the agent if there is no pending generation from `generate_reply`;
+                             # in the second case, the pending generation will be stopped by the `turn_complete` event coming later
+                             if not self._pending_generation_fut:
+                                 self._handle_input_speech_started()
+
+                             sc.interrupted = None
+                             sc_copy = sc.model_dump(exclude_none=True)
+                             if not sc_copy:
+                                 # ignore empty server content
+                                 response.server_content = None
+                                 if lk_google_debug:
+                                     logger.debug("ignoring empty server content")
+
+                         if self._is_new_generation(response):
+                             self._start_new_generation()
+                             if lk_google_debug:
+                                 logger.debug(f"new generation started: {self._current_generation}")
+
+                     if response.session_resumption_update:
+                         if (
+                             response.session_resumption_update.resumable
+                             and response.session_resumption_update.new_handle
+                         ):
+                             self._session_resumption_handle = (
+                                 response.session_resumption_update.new_handle
+                             )
+
+                     if response.server_content:
+                         self._handle_server_content(response.server_content)
+                     if response.tool_call:
+                         self._handle_tool_calls(response.tool_call)
+                     if response.tool_call_cancellation:
+                         self._handle_tool_call_cancellation(response.tool_call_cancellation)
+                     if response.usage_metadata:
+                         self._handle_usage_metadata(response.usage_metadata)
+                     if response.go_away:
+                         self._handle_go_away(response.go_away)
+
+                     if self._num_retries > 0:
+                         self._num_retries = 0 # reset the retry counter
+
+                 # TODO(dz): a server-side turn is complete
+         except Exception as e:
+             if not self._session_should_close.is_set():
+                 logger.error(f"error in receive task: {e}", exc_info=e)
+                 self._mark_restart_needed(on_error=True)
+         finally:
+             self._mark_current_generation_done()
+
+     def _build_connect_config(self) -> types.LiveConnectConfig:
+         temp = self._opts.temperature if is_given(self._opts.temperature) else None
+
+         tools_config = create_tools_config(
+             function_tools=self._gemini_declarations,
+             gemini_tools=self._opts.gemini_tools if is_given(self._opts.gemini_tools) else None,
+         )
+         conf = types.LiveConnectConfig(
+             response_modalities=self._opts.response_modalities,
+             generation_config=types.GenerationConfig(
+                 candidate_count=self._opts.candidate_count,
+                 temperature=temp,
+                 max_output_tokens=self._opts.max_output_tokens
+                 if is_given(self._opts.max_output_tokens)
+                 else None,
+                 top_p=self._opts.top_p if is_given(self._opts.top_p) else None,
+                 top_k=self._opts.top_k if is_given(self._opts.top_k) else None,
+                 presence_penalty=self._opts.presence_penalty
+                 if is_given(self._opts.presence_penalty)
+                 else None,
+                 frequency_penalty=self._opts.frequency_penalty
+                 if is_given(self._opts.frequency_penalty)
+                 else None,
+                 thinking_config=self._opts.thinking_config
+                 if is_given(self._opts.thinking_config)
+                 else None,
+             ),
+             system_instruction=types.Content(parts=[types.Part(text=self._opts.instructions)])
+             if is_given(self._opts.instructions)
+             else None,
+             speech_config=types.SpeechConfig(
+                 voice_config=types.VoiceConfig(
+                     prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=self._opts.voice)
+                 ),
+                 language_code=self._opts.language if is_given(self._opts.language) else None,
+             ),
+             tools=tools_config,
+             input_audio_transcription=self._opts.input_audio_transcription,
+             output_audio_transcription=self._opts.output_audio_transcription,
+             session_resumption=types.SessionResumptionConfig(
+                 handle=self._session_resumption_handle
+             ),
+         )
+
+         if is_given(self._opts.proactivity):
+             conf.proactivity = types.ProactivityConfig(proactive_audio=self._opts.proactivity)
+         if is_given(self._opts.enable_affective_dialog):
+             conf.enable_affective_dialog = self._opts.enable_affective_dialog
+         if is_given(self._opts.realtime_input_config):
+             conf.realtime_input_config = self._opts.realtime_input_config
+         if is_given(self._opts.context_window_compression):
+             conf.context_window_compression = self._opts.context_window_compression
+
+         return conf
+
+     def _start_new_generation(self) -> None:
+         if self._current_generation and not self._current_generation._done:
+             logger.warning("starting a new generation while another is active, finalizing the previous one.")
+             self._mark_current_generation_done()
+
+         response_id = utils.shortuuid("GR_")
+         self._current_generation = _ResponseGeneration(
+             message_ch=utils.aio.Chan[llm.MessageGeneration](),
+             function_ch=utils.aio.Chan[llm.FunctionCall](),
+             response_id=response_id,
+             input_id=utils.shortuuid("GI_"),
+             text_ch=utils.aio.Chan[str](),
+             audio_ch=utils.aio.Chan[rtc.AudioFrame](),
+             _created_timestamp=time.time(),
+         )
+         if not self._realtime_model.capabilities.audio_output:
+             self._current_generation.audio_ch.close()
+
+         msg_modalities = asyncio.Future[list[Literal["text", "audio"]]]()
+         msg_modalities.set_result(
+             ["audio", "text"] if self._realtime_model.capabilities.audio_output else ["text"]
+         )
+         self._current_generation.message_ch.send_nowait(
+             llm.MessageGeneration(
+                 message_id=response_id,
+                 text_stream=self._current_generation.text_ch,
+                 audio_stream=self._current_generation.audio_ch,
+                 modalities=msg_modalities,
+             )
+         )
+
+         generation_event = llm.GenerationCreatedEvent(
+             message_stream=self._current_generation.message_ch,
+             function_stream=self._current_generation.function_ch,
+             user_initiated=False,
+             response_id=self._current_generation.response_id,
+         )
+
+         if self._pending_generation_fut and not self._pending_generation_fut.done():
+             generation_event.user_initiated = True
+             self._pending_generation_fut.set_result(generation_event)
+             self._pending_generation_fut = None
+         else:
+             # emit an input_speech_started event before starting an agent-initiated generation
+             # to interrupt the previous audio playout, if any
+             self._handle_input_speech_started()
+
+         self.emit("generation_created", generation_event)
+
+     def _handle_server_content(self, server_content: types.LiveServerContent) -> None:
+         current_gen = self._current_generation
+         if not current_gen:
+             logger.warning("received server content but no active generation.")
+             return
+
+         if model_turn := server_content.model_turn:
+             for part in model_turn.parts or []:
+                 if part.thought:
+                     # bypass reasoning output
+                     continue
+                 if part.text:
+                     current_gen.push_text(part.text)
+                 if part.inline_data:
+                     if not current_gen._first_token_timestamp:
+                         current_gen._first_token_timestamp = time.time()
+                     frame_data = part.inline_data.data
+                     try:
+                         if not isinstance(frame_data, bytes):
+                             raise ValueError("frame_data is not bytes")
+                         frame = rtc.AudioFrame(
+                             data=frame_data,
+                             sample_rate=OUTPUT_AUDIO_SAMPLE_RATE,
+                             num_channels=OUTPUT_AUDIO_CHANNELS,
+                             samples_per_channel=len(frame_data) // (2 * OUTPUT_AUDIO_CHANNELS),
+                         )
+                         current_gen.audio_ch.send_nowait(frame)
+                     except ValueError as e:
+                         logger.error(f"error creating audio frame from Gemini data: {e}")
+
+         if input_transcription := server_content.input_transcription:
+             text = input_transcription.text
+             if text:
+                 if current_gen.input_transcription == "":
+                     # Gemini may start with a space, which doesn't make sense
+                     # at the beginning of the transcript
+                     text = text.lstrip()
+                 current_gen.input_transcription += text
+                 self.emit(
+                     "input_audio_transcription_completed",
+                     llm.InputTranscriptionCompleted(
+                         item_id=current_gen.input_id,
+                         transcript=current_gen.input_transcription,
+                         is_final=False,
+                     ),
+                 )
+
+         if output_transcription := server_content.output_transcription:
+             text = output_transcription.text
+             if text:
+                 current_gen.push_text(text)
+
+         if server_content.generation_complete or server_content.turn_complete:
+             current_gen._completed_timestamp = time.time()
+
+         if server_content.interrupted and not self._pending_generation_fut:
+             # interrupt the agent if there is no pending user-initiated generation
+             self._handle_input_speech_started()
+
+         if server_content.turn_complete:
+             self._mark_current_generation_done()
+
+     def _mark_current_generation_done(self) -> None:
+         if not self._current_generation or self._current_generation._done:
+             return
+
+         # emit an input_speech_stopped event after the generation is done
+         self._handle_input_speech_stopped()
+
+         gen = self._current_generation
+
+         # the only way we know the input transcription is complete is when the
+         # generation is done
+         if gen.input_transcription:
+             self.emit(
+                 "input_audio_transcription_completed",
+                 llm.InputTranscriptionCompleted(
+                     item_id=gen.input_id,
+                     transcript=gen.input_transcription,
+                     is_final=True,
+                 ),
+             )
+
+             # since gemini doesn't give us a view of the chat history on the server side,
+             # we handle it manually here
+             self._chat_ctx.add_message(
+                 role="user",
+                 content=gen.input_transcription,
+                 id=gen.input_id,
+             )
+
+         if gen.output_text:
+             self._chat_ctx.add_message(
+                 role="assistant",
+                 content=gen.output_text,
+                 id=gen.response_id,
+             )
+
+         if not gen.text_ch.closed:
+             if self._opts.output_audio_transcription is None:
+                 # close the text data of the transcription synchronizer
+                 gen.text_ch.send_nowait("")
+             gen.text_ch.close()
+         if not gen.audio_ch.closed:
+             gen.audio_ch.close()
+
+         gen.function_ch.close()
+         gen.message_ch.close()
+         gen._done = True
+         if lk_google_debug:
+             logger.debug(f"generation done {gen}")
+
+     def _handle_input_speech_started(self) -> None:
+         self.emit("input_speech_started", llm.InputSpeechStartedEvent())
+
+     def _handle_input_speech_stopped(self) -> None:
+         self.emit(
+             "input_speech_stopped",
+             llm.InputSpeechStoppedEvent(user_transcription_enabled=False),
+         )
+
+     def _handle_tool_calls(self, tool_call: types.LiveServerToolCall) -> None:
+         if not self._current_generation:
+             logger.warning("received tool call but no active generation.")
+             return
+
+         gen = self._current_generation
+         for fnc_call in tool_call.function_calls or []:
+             arguments = json.dumps(fnc_call.args)
+
+             gen.function_ch.send_nowait(
+                 llm.FunctionCall(
+                     call_id=fnc_call.id or utils.shortuuid("fnc-call-"),
+                     name=fnc_call.name,
+                     arguments=arguments,
+                 )
+             )
+         self._mark_current_generation_done()
+
+     def _handle_tool_call_cancellation(
+         self, tool_call_cancellation: types.LiveServerToolCallCancellation
+     ) -> None:
+         logger.warning(
+             "server cancelled tool calls",
+             extra={"function_call_ids": tool_call_cancellation.ids},
+         )
+
+     def _handle_usage_metadata(self, usage_metadata: types.UsageMetadata) -> None:
+         current_gen = self._current_generation
+         if not current_gen:
+             logger.warning("no active generation to report metrics for")
+             return
+
+         ttft = (
+             current_gen._first_token_timestamp - current_gen._created_timestamp
+             if current_gen._first_token_timestamp
+             else -1
+         )
+         duration = (
+             current_gen._completed_timestamp or time.time()
+         ) - current_gen._created_timestamp
+
+         def _token_details_map(
+             token_details: list[types.ModalityTokenCount] | None,
+         ) -> dict[str, int]:
+             token_details_map = {"audio_tokens": 0, "text_tokens": 0, "image_tokens": 0}
+             if not token_details:
+                 return token_details_map
+
+             for token_detail in token_details:
+                 if not token_detail.token_count:
+                     continue
+
+                 if token_detail.modality == types.MediaModality.AUDIO:
+                     token_details_map["audio_tokens"] += token_detail.token_count
+                 elif token_detail.modality == types.MediaModality.TEXT:
+                     token_details_map["text_tokens"] += token_detail.token_count
+                 elif token_detail.modality == types.MediaModality.IMAGE:
+                     token_details_map["image_tokens"] += token_detail.token_count
+             return token_details_map
+
+         metrics = RealtimeModelMetrics(
+             label=self._realtime_model.label,
+             request_id=current_gen.response_id,
+             timestamp=current_gen._created_timestamp,
+             duration=duration,
+             ttft=ttft,
+             cancelled=False,
+             input_tokens=usage_metadata.prompt_token_count or 0,
+             output_tokens=usage_metadata.response_token_count or 0,
+             total_tokens=usage_metadata.total_token_count or 0,
+             tokens_per_second=(usage_metadata.response_token_count or 0) / duration
+             if duration > 0
+             else 0,
+             input_token_details=RealtimeModelMetrics.InputTokenDetails(
+                 **_token_details_map(usage_metadata.prompt_tokens_details),
+                 cached_tokens=sum(
+                     token_detail.token_count or 0
+                     for token_detail in usage_metadata.cache_tokens_details or []
+                 ),
+                 cached_tokens_details=RealtimeModelMetrics.CachedTokenDetails(
+                     **_token_details_map(usage_metadata.cache_tokens_details),
+                 ),
+             ),
+             output_token_details=RealtimeModelMetrics.OutputTokenDetails(
+                 **_token_details_map(usage_metadata.response_tokens_details),
+             ),
+             metadata=Metadata(
+                 model_name=self._realtime_model.model, model_provider=self._realtime_model.provider
+             ),
+         )
+         self.emit("metrics_collected", metrics)
+
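The session emits a RealtimeModelMetrics event per usage update. A minimal sketch of listening for them, assuming the EventEmitter-style `.on()` registration used across livekit-agents:

# Illustrative sketch: logging token usage and latency per generation.
from livekit.agents.metrics import RealtimeModelMetrics


def watch_metrics(session) -> None:
    # `session` is a RealtimeSession obtained via RealtimeModel.session()
    def _on_metrics(m: RealtimeModelMetrics) -> None:
        print(f"request={m.request_id} ttft={m.ttft:.2f}s tokens={m.total_tokens}")

    session.on("metrics_collected", _on_metrics)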
+     def _handle_go_away(self, go_away: types.LiveServerGoAway) -> None:
+         logger.warning(
+             f"Gemini server indicates disconnection soon. Time left: {go_away.time_left}"
+         )
+         # TODO(dz): this isn't a seamless reconnection just yet
+         self._session_should_close.set()
+
+     def commit_audio(self) -> None:
+         pass
+
+     def clear_audio(self) -> None:
+         pass
+
+     def _resample_audio(self, frame: rtc.AudioFrame) -> Iterator[rtc.AudioFrame]:
+         if self._input_resampler:
+             if frame.sample_rate != self._input_resampler._input_rate:
+                 # input audio changed to a different sample rate
+                 self._input_resampler = None
+
+         if self._input_resampler is None and (
+             frame.sample_rate != INPUT_AUDIO_SAMPLE_RATE
+             or frame.num_channels != INPUT_AUDIO_CHANNELS
+         ):
+             self._input_resampler = rtc.AudioResampler(
+                 input_rate=frame.sample_rate,
+                 output_rate=INPUT_AUDIO_SAMPLE_RATE,
+                 num_channels=INPUT_AUDIO_CHANNELS,
+             )
+
+         if self._input_resampler:
+             # TODO(long): flush the resampler when the input source is changed
+             yield from self._input_resampler.push(frame)
+         else:
+             yield frame
+
+     def _emit_error(self, error: Exception, recoverable: bool) -> None:
+         self.emit(
+             "error",
+             llm.RealtimeModelError(
+                 timestamp=time.time(),
+                 label=self._realtime_model._label,
+                 error=error,
+                 recoverable=recoverable,
+             ),
+         )
+
+     def _is_new_generation(self, resp: types.LiveServerMessage) -> bool:
+         if resp.tool_call:
+             return True
+
+         if (sc := resp.server_content) and (
+             sc.model_turn
+             or sc.output_transcription is not None
+             or sc.input_transcription is not None
+             or sc.generation_complete is not None
+             or sc.turn_complete is not None
+         ):
+             return True
+
+         return False
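Putting the pieces together, a hedged end-to-end sketch of driving a session directly; in production the livekit-agents framework performs this wiring, and the import path and stream semantics are assumptions:

# Illustrative sketch: a bare-bones driver for the realtime session.
import asyncio

from livekit.plugins.google.realtime import RealtimeModel  # assumed export path


async def main() -> None:
    model = RealtimeModel(instructions="Answer in one sentence.")
    session = model.session()
    try:
        event = await session.generate_reply(instructions="Say hello.")
        async for msg in event.message_stream:
            async for frame in msg.audio_stream:
                ...  # play back 24 kHz mono output frames
    finally:
        await session.aclose()
        await model.aclose()


asyncio.run(main())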