livekit-plugins-google 1.0.16__py3-none-any.whl → 1.0.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +1,15 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import asyncio
4
+ import contextlib
4
5
  import json
5
6
  import os
6
7
  import weakref
8
+ from collections.abc import Iterator
7
9
  from dataclasses import dataclass
8
10
 
9
11
  from google import genai
12
+ from google.genai.live import AsyncSession
10
13
  from google.genai.types import (
11
14
  AudioTranscriptionConfig,
12
15
  Blob,
@@ -23,6 +26,7 @@ from google.genai.types import (
23
26
  Modality,
24
27
  Part,
25
28
  PrebuiltVoiceConfig,
29
+ SessionResumptionConfig,
26
30
  SpeechConfig,
27
31
  Tool,
28
32
  UsageMetadata,
@@ -31,15 +35,16 @@ from google.genai.types import (
31
35
  from livekit import rtc
32
36
  from livekit.agents import llm, utils
33
37
  from livekit.agents.types import NOT_GIVEN, NotGivenOr
34
- from livekit.agents.utils import images, is_given
38
+ from livekit.agents.utils import audio as audio_utils, images, is_given
39
+ from livekit.plugins.google.beta.realtime.api_proto import ClientEvents, LiveAPIModels, Voice
35
40
 
36
41
  from ...log import logger
37
42
  from ...utils import _build_gemini_fnc, get_tool_results_for_realtime, to_chat_ctx
38
- from .api_proto import ClientEvents, LiveAPIModels, Voice
39
43
 
40
44
  INPUT_AUDIO_SAMPLE_RATE = 16000
45
+ INPUT_AUDIO_CHANNELS = 1
41
46
  OUTPUT_AUDIO_SAMPLE_RATE = 24000
42
- NUM_CHANNELS = 1
47
+ OUTPUT_AUDIO_CHANNELS = 1
43
48
 
44
49
  DEFAULT_ENCODE_OPTIONS = images.EncodeOptions(
45
50
  format="JPEG",
@@ -59,6 +64,7 @@ class _RealtimeOptions:
59
64
  model: LiveAPIModels | str
60
65
  api_key: str | None
61
66
  voice: Voice | str
67
+ language: NotGivenOr[str]
62
68
  response_modalities: NotGivenOr[list[Modality]]
63
69
  vertexai: bool
64
70
  project: str | None
@@ -98,6 +104,7 @@ class RealtimeModel(llm.RealtimeModel):
98
104
  model: LiveAPIModels | str = "gemini-2.0-flash-live-001",
99
105
  api_key: NotGivenOr[str] = NOT_GIVEN,
100
106
  voice: Voice | str = "Puck",
107
+ language: NotGivenOr[str] = NOT_GIVEN,
101
108
  modalities: NotGivenOr[list[Modality]] = NOT_GIVEN,
102
109
  vertexai: bool = False,
103
110
  project: NotGivenOr[str] = NOT_GIVEN,
@@ -126,8 +133,9 @@ class RealtimeModel(llm.RealtimeModel):
126
133
  instructions (str, optional): Initial system instructions for the model. Defaults to "".
127
134
  api_key (str, optional): Google Gemini API key. If None, will attempt to read from the environment variable GOOGLE_API_KEY.
128
135
  modalities (list[Modality], optional): Modalities to use, such as ["TEXT", "AUDIO"]. Defaults to ["AUDIO"].
129
- model (str, optional): The name of the model to use. Defaults to "gemini-2.0-flash-exp".
136
+ model (str, optional): The name of the model to use. Defaults to "gemini-2.0-flash-live-001".
130
137
  voice (api_proto.Voice, optional): Voice setting for audio outputs. Defaults to "Puck".
138
+ language (str, optional): The language(BCP-47 Code) to use for the API. supported languages - https://ai.google.dev/gemini-api/docs/live#supported-languages
131
139
  temperature (float, optional): Sampling temperature for response generation. Defaults to 0.8.
132
140
  vertexai (bool, optional): Whether to use VertexAI for the API. Defaults to False.
133
141
  project (str, optional): The project id to use for the API. Defaults to None. (for vertexai)
@@ -147,20 +155,20 @@ class RealtimeModel(llm.RealtimeModel):
147
155
  capabilities=llm.RealtimeCapabilities(
148
156
  message_truncation=False,
149
157
  turn_detection=True,
150
- user_transcription=False,
158
+ user_transcription=is_given(input_audio_transcription),
151
159
  )
152
160
  )
153
161
 
154
162
  gemini_api_key = api_key if is_given(api_key) else os.environ.get("GOOGLE_API_KEY")
155
163
  gcp_project = project if is_given(project) else os.environ.get("GOOGLE_CLOUD_PROJECT")
156
164
  gcp_location = location if is_given(location) else os.environ.get("GOOGLE_CLOUD_LOCATION")
165
+
157
166
  if vertexai:
158
167
  if not gcp_project or not gcp_location:
159
168
  raise ValueError(
160
169
  "Project and location are required for VertexAI either via project and location or GOOGLE_CLOUD_PROJECT and GOOGLE_CLOUD_LOCATION environment variables" # noqa: E501
161
170
  )
162
171
  gemini_api_key = None # VertexAI does not require an API key
163
-
164
172
  else:
165
173
  gcp_project = None
166
174
  gcp_location = None
@@ -192,6 +200,7 @@ class RealtimeModel(llm.RealtimeModel):
192
200
  instructions=instructions,
193
201
  input_audio_transcription=input_audio_transcription,
194
202
  output_audio_transcription=output_audio_transcription,
203
+ language=language,
195
204
  )
196
205
 
197
206
  self._sessions = weakref.WeakSet[RealtimeSession]()
@@ -213,7 +222,8 @@ class RealtimeModel(llm.RealtimeModel):
213
222
  for sess in self._sessions:
214
223
  sess.update_options(voice=self._opts.voice, temperature=self._opts.temperature)
215
224
 
216
- async def aclose(self) -> None: ...
225
+ async def aclose(self) -> None:
226
+ pass
217
227
 
218
228
 
219
229
  class RealtimeSession(llm.RealtimeSession):
@@ -221,138 +231,164 @@ class RealtimeSession(llm.RealtimeSession):
221
231
  super().__init__(realtime_model)
222
232
  self._opts = realtime_model._opts
223
233
  self._tools = llm.ToolContext.empty()
234
+ self._gemini_declarations: list[FunctionDeclaration] = []
224
235
  self._chat_ctx = llm.ChatContext.empty()
225
236
  self._msg_ch = utils.aio.Chan[ClientEvents]()
226
- self._gemini_tools: list[Tool] = []
237
+ self._input_resampler: rtc.AudioResampler | None = None
238
+
239
+ # 50ms chunks
240
+ self._bstream = audio_utils.AudioByteStream(
241
+ INPUT_AUDIO_SAMPLE_RATE,
242
+ INPUT_AUDIO_CHANNELS,
243
+ samples_per_channel=INPUT_AUDIO_SAMPLE_RATE // 20,
244
+ )
245
+
227
246
  self._client = genai.Client(
228
247
  api_key=self._opts.api_key,
229
248
  vertexai=self._opts.vertexai,
230
249
  project=self._opts.project,
231
250
  location=self._opts.location,
232
251
  )
252
+
233
253
  self._main_atask = asyncio.create_task(self._main_task(), name="gemini-realtime-session")
234
254
 
235
255
  self._current_generation: _ResponseGeneration | None = None
236
-
237
- self._is_interrupted = False
238
- self._active_response_id = None
239
- self._session = None
240
- self._update_chat_ctx_lock = asyncio.Lock()
241
- self._update_fnc_ctx_lock = asyncio.Lock()
256
+ self._active_session: AsyncSession | None = None
257
+ # indicates if the underlying session should end
258
+ self._session_should_close = asyncio.Event()
242
259
  self._response_created_futures: dict[str, asyncio.Future[llm.GenerationCreatedEvent]] = {}
243
- self._pending_generation_event_id = None
260
+ self._pending_generation_fut: asyncio.Future[llm.GenerationCreatedEvent] | None = None
244
261
 
245
- self._reconnect_event = asyncio.Event()
246
- self._session_lock = asyncio.Lock()
247
- self._gemini_close_task: asyncio.Task | None = None
262
+ self._session_resumption_handle: str | None = None
248
263
 
249
- def _schedule_gemini_session_close(self) -> None:
250
- if self._session is not None:
251
- self._gemini_close_task = asyncio.create_task(self._close_gemini_session())
264
+ self._update_lock = asyncio.Lock()
265
+ self._session_lock = asyncio.Lock()
252
266
 
253
- async def _close_gemini_session(self) -> None:
267
+ async def _close_active_session(self) -> None:
254
268
  async with self._session_lock:
255
- if self._session:
269
+ if self._active_session:
256
270
  try:
257
- await self._session.close()
271
+ await self._active_session.close()
272
+ except Exception as e:
273
+ logger.warning(f"error closing Gemini session: {e}")
258
274
  finally:
259
- self._session = None
275
+ self._active_session = None
260
276
 
261
- def update_options(
277
+ def _mark_restart_needed(self):
278
+ if not self._session_should_close.is_set():
279
+ self._session_should_close.set()
280
+ # reset the msg_ch, do not send messages from previous session
281
+ self._msg_ch = utils.aio.Chan[ClientEvents]()
282
+
283
+ async def update_options(
262
284
  self,
263
285
  *,
264
286
  voice: NotGivenOr[str] = NOT_GIVEN,
265
- tool_choice: NotGivenOr[llm.ToolChoice | None] = NOT_GIVEN,
266
287
  temperature: NotGivenOr[float] = NOT_GIVEN,
288
+ tool_choice: NotGivenOr[llm.ToolChoice | None] = NOT_GIVEN,
267
289
  ) -> None:
268
- if is_given(voice):
269
- self._opts.voice = voice
290
+ async with self._update_lock:
291
+ should_restart = False
292
+ if is_given(voice) and self._opts.voice != voice:
293
+ self._opts.voice = voice
294
+ should_restart = True
270
295
 
271
- if is_given(temperature):
272
- self._opts.temperature = temperature
296
+ if is_given(temperature) and self._opts.temperature != temperature:
297
+ self._opts.temperature = temperature if is_given(temperature) else NOT_GIVEN
298
+ should_restart = True
273
299
 
274
- if self._session:
275
- logger.warning("Updating options; triggering Gemini session reconnect.")
276
- self._reconnect_event.set()
277
- self._schedule_gemini_session_close()
300
+ if should_restart:
301
+ self._mark_restart_needed()
278
302
 
279
303
  async def update_instructions(self, instructions: str) -> None:
280
- self._opts.instructions = instructions
281
- if self._session:
282
- logger.warning("Updating instructions; triggering Gemini session reconnect.")
283
- self._reconnect_event.set()
284
- self._schedule_gemini_session_close()
304
+ async with self._update_lock:
305
+ if not is_given(self._opts.instructions) or self._opts.instructions != instructions:
306
+ self._opts.instructions = instructions
307
+ self._mark_restart_needed()
285
308
 
286
309
  async def update_chat_ctx(self, chat_ctx: llm.ChatContext) -> None:
287
- async with self._update_chat_ctx_lock:
288
- self._chat_ctx = chat_ctx
310
+ async with self._update_lock:
311
+ self._chat_ctx = chat_ctx.copy()
289
312
  turns, _ = to_chat_ctx(self._chat_ctx, id(self), ignore_functions=True)
290
313
  tool_results = get_tool_results_for_realtime(self._chat_ctx)
314
+ # TODO(dz): need to compute delta and then either append or recreate session
291
315
  if turns:
292
- self._msg_ch.send_nowait(LiveClientContent(turns=turns, turn_complete=False))
316
+ self._send_client_event(LiveClientContent(turns=turns, turn_complete=False))
293
317
  if tool_results:
294
- self._msg_ch.send_nowait(tool_results)
318
+ self._send_client_event(tool_results)
295
319
 
296
320
  async def update_tools(self, tools: list[llm.FunctionTool]) -> None:
297
- async with self._update_fnc_ctx_lock:
298
- retained_tools: list[llm.FunctionTool] = []
299
- gemini_function_declarations: list[FunctionDeclaration] = []
300
-
301
- for tool in tools:
302
- gemini_function = _build_gemini_fnc(tool)
303
- gemini_function_declarations.append(gemini_function)
304
- retained_tools.append(tool)
305
-
306
- self._tools = llm.ToolContext(retained_tools)
307
- self._gemini_tools = [Tool(function_declarations=gemini_function_declarations)]
308
- if self._session and gemini_function_declarations:
309
- logger.warning("Updating tools; triggering Gemini session reconnect.")
310
- self._reconnect_event.set()
311
- self._schedule_gemini_session_close()
321
+ async with self._update_lock:
322
+ new_declarations: list[FunctionDeclaration] = [
323
+ _build_gemini_fnc(tool) for tool in tools
324
+ ]
325
+ current_tool_names = {f.name for f in self._gemini_declarations}
326
+ new_tool_names = {f.name for f in new_declarations}
327
+
328
+ if current_tool_names != new_tool_names:
329
+ self._gemini_declarations = new_declarations
330
+ self._tools = llm.ToolContext(tools)
331
+ self._mark_restart_needed()
312
332
 
313
333
  @property
314
334
  def chat_ctx(self) -> llm.ChatContext:
315
- return self._chat_ctx
335
+ return self._chat_ctx.copy()
316
336
 
317
337
  @property
318
338
  def tools(self) -> llm.ToolContext:
319
- return self._tools
339
+ return self._tools.copy()
320
340
 
321
341
  def push_audio(self, frame: rtc.AudioFrame) -> None:
322
- self.push_media(frame.data.tobytes(), "audio/pcm")
342
+ for f in self._resample_audio(frame):
343
+ for nf in self._bstream.write(f.data.tobytes()):
344
+ realtime_input = LiveClientRealtimeInput(
345
+ media_chunks=[Blob(data=nf.data.tobytes(), mime_type="audio/pcm")]
346
+ )
347
+ self._send_client_event(realtime_input)
323
348
 
324
349
  def push_video(self, frame: rtc.VideoFrame) -> None:
325
350
  encoded_data = images.encode(frame, DEFAULT_ENCODE_OPTIONS)
326
- self.push_media(encoded_data, "image/jpeg")
327
-
328
- def push_media(self, bytes: bytes, mime_type: str) -> None:
329
351
  realtime_input = LiveClientRealtimeInput(
330
- media_chunks=[Blob(data=bytes, mime_type=mime_type)]
352
+ media_chunks=[Blob(data=encoded_data, mime_type="image/jpeg")]
331
353
  )
332
- self._msg_ch.send_nowait(realtime_input)
354
+ self._send_client_event(realtime_input)
355
+
356
+ def _send_client_event(self, event: ClientEvents) -> None:
357
+ with contextlib.suppress(utils.aio.channel.ChanClosed):
358
+ self._msg_ch.send_nowait(event)
333
359
 
334
360
  def generate_reply(
335
361
  self, *, instructions: NotGivenOr[str] = NOT_GIVEN
336
362
  ) -> asyncio.Future[llm.GenerationCreatedEvent]:
337
- fut = asyncio.Future()
363
+ if self._pending_generation_fut and not self._pending_generation_fut.done():
364
+ logger.warning(
365
+ "generate_reply called while another generation is pending, cancelling previous."
366
+ )
367
+ self._pending_generation_fut.cancel("Superseded by new generate_reply call")
338
368
 
339
- event_id = utils.shortuuid("gemini-response-")
340
- self._response_created_futures[event_id] = fut
341
- self._pending_generation_event_id = event_id
369
+ fut = asyncio.Future()
370
+ self._pending_generation_fut = fut
342
371
 
343
- instructions_content = instructions if is_given(instructions) else "."
344
- ctx = [Content(parts=[Part(text=instructions_content)], role="user")]
345
- self._msg_ch.send_nowait(LiveClientContent(turns=ctx, turn_complete=True))
372
+ # Gemini requires the last message to end with user's turn
373
+ # so we need to add a placeholder user turn in order to trigger a new generation
374
+ event = LiveClientContent(turns=[], turn_complete=True)
375
+ if is_given(instructions):
376
+ event.turns.append(Content(parts=[Part(text=instructions)], role="model"))
377
+ event.turns.append(Content(parts=[Part(text=".")], role="user"))
378
+ self._send_client_event(event)
346
379
 
347
380
  def _on_timeout() -> None:
348
- if event_id in self._response_created_futures and not fut.done():
349
- fut.set_exception(llm.RealtimeError("generate_reply timed out."))
350
- self._response_created_futures.pop(event_id, None)
351
- if self._pending_generation_event_id == event_id:
352
- self._pending_generation_event_id = None
381
+ if not fut.done():
382
+ fut.set_exception(
383
+ llm.RealtimeError(
384
+ "generate_reply timed out waiting for generation_created event."
385
+ )
386
+ )
387
+ if self._pending_generation_fut is fut:
388
+ self._pending_generation_fut = None
353
389
 
354
- handle = asyncio.get_event_loop().call_later(5.0, _on_timeout)
355
- fut.add_done_callback(lambda _: handle.cancel())
390
+ timeout_handle = asyncio.get_event_loop().call_later(5.0, _on_timeout)
391
+ fut.add_done_callback(lambda _: timeout_handle.cancel())
356
392
 
357
393
  return fut
358
394
 
@@ -360,133 +396,206 @@ class RealtimeSession(llm.RealtimeSession):
360
396
  pass
361
397
 
362
398
  def truncate(self, *, message_id: str, audio_end_ms: int) -> None:
399
+ logger.warning("truncate is not supported by the Google Realtime API.")
363
400
  pass
364
401
 
365
402
  async def aclose(self) -> None:
366
403
  self._msg_ch.close()
367
-
368
- for fut in self._response_created_futures.values():
369
- if not fut.done():
370
- fut.set_exception(llm.RealtimeError("Session closed"))
404
+ self._session_should_close.set()
371
405
 
372
406
  if self._main_atask:
373
407
  await utils.aio.cancel_and_wait(self._main_atask)
374
408
 
375
- if self._gemini_close_task:
376
- await utils.aio.cancel_and_wait(self._gemini_close_task)
409
+ await self._close_active_session()
410
+
411
+ if self._pending_generation_fut and not self._pending_generation_fut.done():
412
+ self._pending_generation_fut.cancel("Session closed")
413
+
414
+ for fut in self._response_created_futures.values():
415
+ if not fut.done():
416
+ fut.set_exception(llm.RealtimeError("Session closed before response created"))
417
+ self._response_created_futures.clear()
418
+
419
+ if self._current_generation:
420
+ self._finalize_response(closed=True)
377
421
 
378
422
  @utils.log_exceptions(logger=logger)
379
423
  async def _main_task(self):
380
- while True:
381
- config = LiveConnectConfig(
382
- response_modalities=self._opts.response_modalities
383
- if is_given(self._opts.response_modalities)
384
- else [Modality.AUDIO],
385
- generation_config=GenerationConfig(
386
- candidate_count=self._opts.candidate_count,
387
- temperature=self._opts.temperature
388
- if is_given(self._opts.temperature)
389
- else None,
390
- max_output_tokens=self._opts.max_output_tokens
391
- if is_given(self._opts.max_output_tokens)
392
- else None,
393
- top_p=self._opts.top_p if is_given(self._opts.top_p) else None,
394
- top_k=self._opts.top_k if is_given(self._opts.top_k) else None,
395
- presence_penalty=self._opts.presence_penalty
396
- if is_given(self._opts.presence_penalty)
397
- else None,
398
- frequency_penalty=self._opts.frequency_penalty
399
- if is_given(self._opts.frequency_penalty)
400
- else None,
401
- ),
402
- system_instruction=Content(parts=[Part(text=self._opts.instructions)])
403
- if is_given(self._opts.instructions)
404
- else None,
405
- speech_config=SpeechConfig(
406
- voice_config=VoiceConfig(
407
- prebuilt_voice_config=PrebuiltVoiceConfig(voice_name=self._opts.voice)
424
+ while not self._msg_ch.closed:
425
+ # previous session might not be closed yet, we'll do it here.
426
+ await self._close_active_session()
427
+
428
+ self._session_should_close.clear()
429
+ config = self._build_connect_config()
430
+ session = None
431
+ try:
432
+ logger.debug("connecting to Gemini Realtime API...")
433
+ async with self._client.aio.live.connect(
434
+ model=self._opts.model, config=config
435
+ ) as session:
436
+ async with self._session_lock:
437
+ self._active_session = session
438
+
439
+ # queue up existing chat context
440
+ send_task = asyncio.create_task(
441
+ self._send_task(session), name="gemini-realtime-send"
442
+ )
443
+ recv_task = asyncio.create_task(
444
+ self._recv_task(session), name="gemini-realtime-recv"
445
+ )
446
+ restart_wait_task = asyncio.create_task(
447
+ self._session_should_close.wait(), name="gemini-restart-wait"
408
448
  )
409
- ),
410
- tools=self._gemini_tools,
411
- input_audio_transcription=self._opts.input_audio_transcription,
412
- output_audio_transcription=self._opts.output_audio_transcription,
413
- )
414
-
415
- async with self._client.aio.live.connect(
416
- model=self._opts.model, config=config
417
- ) as session:
418
- async with self._session_lock:
419
- self._session = session
420
-
421
- @utils.log_exceptions(logger=logger)
422
- async def _send_task():
423
- async for msg in self._msg_ch:
424
- if isinstance(msg, LiveClientContent):
425
- await session.send(input=msg, end_of_turn=True)
426
- else:
427
- await session.send(input=msg)
428
- await session.send(input=".", end_of_turn=True)
429
-
430
- @utils.log_exceptions(logger=logger)
431
- async def _recv_task():
432
- while True:
433
- async for response in session.receive():
434
- if self._active_response_id is None:
435
- self._start_new_generation()
436
- if response.setup_complete:
437
- logger.info("connection established with gemini live api server")
438
- if response.server_content:
439
- self._handle_server_content(response.server_content)
440
- if response.tool_call:
441
- self._handle_tool_calls(response.tool_call)
442
- if response.tool_call_cancellation:
443
- self._handle_tool_call_cancellation(response.tool_call_cancellation)
444
- if response.usage_metadata:
445
- self._handle_usage_metadata(response.usage_metadata)
446
- if response.go_away:
447
- self._handle_go_away(response.go_away)
448
-
449
- send_task = asyncio.create_task(_send_task(), name="gemini-realtime-send")
450
- recv_task = asyncio.create_task(_recv_task(), name="gemini-realtime-recv")
451
- reconnect_task = asyncio.create_task(
452
- self._reconnect_event.wait(), name="reconnect-wait"
453
- )
454
449
 
455
- try:
456
- done, _ = await asyncio.wait(
457
- [send_task, recv_task, reconnect_task],
450
+ done, pending = await asyncio.wait(
451
+ [send_task, recv_task, restart_wait_task],
458
452
  return_when=asyncio.FIRST_COMPLETED,
459
453
  )
454
+
460
455
  for task in done:
461
- if task != reconnect_task:
462
- task.result()
456
+ if task is not restart_wait_task and task.exception():
457
+ logger.error(f"error in task {task.get_name()}: {task.exception()}")
458
+ raise task.exception() or Exception(f"{task.get_name()} failed")
463
459
 
464
- if reconnect_task not in done:
460
+ if restart_wait_task not in done and self._msg_ch.closed:
465
461
  break
466
462
 
467
- self._reconnect_event.clear()
468
- finally:
469
- await utils.aio.cancel_and_wait(send_task, recv_task, reconnect_task)
463
+ for task in pending:
464
+ await utils.aio.cancel_and_wait(task)
465
+
466
+ except asyncio.CancelledError:
467
+ break
468
+ except Exception as e:
469
+ logger.error(f"Gemini Realtime API error: {e}", exc_info=e)
470
+ if not self._msg_ch.closed:
471
+ logger.info("attempting to reconnect after 1 seconds...")
472
+ await asyncio.sleep(1)
473
+ finally:
474
+ await self._close_active_session()
475
+
476
+ async def _send_task(self, session: AsyncSession):
477
+ try:
478
+ async for msg in self._msg_ch:
479
+ async with self._session_lock:
480
+ if self._session_should_close.is_set() or (
481
+ not self._active_session or self._active_session != session
482
+ ):
483
+ break
484
+
485
+ if isinstance(msg, LiveClientContent):
486
+ await session.send(input=msg)
487
+ else:
488
+ await session.send(input=msg)
489
+ except Exception as e:
490
+ if not self._session_should_close.is_set():
491
+ logger.error(f"error in send task: {e}", exc_info=e)
492
+ self._mark_restart_needed()
493
+ finally:
494
+ logger.debug("send task finished.")
495
+
496
+ async def _recv_task(self, session: AsyncSession):
497
+ try:
498
+ while True:
499
+ async with self._session_lock:
500
+ if self._session_should_close.is_set() or (
501
+ not self._active_session or self._active_session != session
502
+ ):
503
+ logger.debug("receive task: Session changed or closed, stopping receive.")
504
+ break
505
+
506
+ async for response in session.receive():
507
+ if not self._current_generation and (
508
+ response.server_content or response.tool_call
509
+ ):
510
+ self._start_new_generation()
511
+
512
+ if response.session_resumption_update:
513
+ if (
514
+ response.session_resumption_update.resumable
515
+ and response.session_resumption_update.new_handle
516
+ ):
517
+ self._session_resumption_handle = (
518
+ response.session_resumption_update.new_handle
519
+ )
520
+
521
+ if response.server_content:
522
+ self._handle_server_content(response.server_content)
523
+ if response.tool_call:
524
+ self._handle_tool_calls(response.tool_call)
525
+ if response.tool_call_cancellation:
526
+ self._handle_tool_call_cancellation(response.tool_call_cancellation)
527
+ if response.usage_metadata:
528
+ self._handle_usage_metadata(response.usage_metadata)
529
+ if response.go_away:
530
+ self._handle_go_away(response.go_away)
531
+
532
+ # TODO(dz): a server-side turn is complete
533
+ except Exception as e:
534
+ if not self._session_should_close.is_set():
535
+ logger.error(f"error in receive task: {e}", exc_info=e)
536
+ self._mark_restart_needed()
537
+ finally:
538
+ self._finalize_response(closed=True)
539
+
540
+ def _build_connect_config(self) -> LiveConnectConfig:
541
+ temp = self._opts.temperature if is_given(self._opts.temperature) else None
542
+
543
+ return LiveConnectConfig(
544
+ response_modalities=self._opts.response_modalities
545
+ if is_given(self._opts.response_modalities)
546
+ else [Modality.AUDIO],
547
+ generation_config=GenerationConfig(
548
+ candidate_count=self._opts.candidate_count,
549
+ temperature=temp,
550
+ max_output_tokens=self._opts.max_output_tokens
551
+ if is_given(self._opts.max_output_tokens)
552
+ else None,
553
+ top_p=self._opts.top_p if is_given(self._opts.top_p) else None,
554
+ top_k=self._opts.top_k if is_given(self._opts.top_k) else None,
555
+ presence_penalty=self._opts.presence_penalty
556
+ if is_given(self._opts.presence_penalty)
557
+ else None,
558
+ frequency_penalty=self._opts.frequency_penalty
559
+ if is_given(self._opts.frequency_penalty)
560
+ else None,
561
+ ),
562
+ system_instruction=Content(parts=[Part(text=self._opts.instructions)])
563
+ if is_given(self._opts.instructions)
564
+ else None,
565
+ speech_config=SpeechConfig(
566
+ voice_config=VoiceConfig(
567
+ prebuilt_voice_config=PrebuiltVoiceConfig(voice_name=self._opts.voice)
568
+ ),
569
+ language_code=self._opts.language if is_given(self._opts.language) else None,
570
+ ),
571
+ tools=[Tool(function_declarations=self._gemini_declarations)],
572
+ input_audio_transcription=self._opts.input_audio_transcription,
573
+ output_audio_transcription=self._opts.output_audio_transcription,
574
+ session_resumption=SessionResumptionConfig(handle=self._session_resumption_handle),
575
+ )
470
576
 
471
577
  def _start_new_generation(self):
472
- self._is_interrupted = False
473
- self._active_response_id = utils.shortuuid("gemini-turn-")
578
+ if self._current_generation:
579
+ logger.warning("starting new generation while another is active. Finalizing previous.")
580
+ self._finalize_response(closed=True)
581
+
582
+ response_id = utils.shortuuid("gemini-turn-")
474
583
  self._current_generation = _ResponseGeneration(
475
584
  message_ch=utils.aio.Chan[llm.MessageGeneration](),
476
585
  function_ch=utils.aio.Chan[llm.FunctionCall](),
477
586
  messages={},
478
587
  )
479
588
 
480
- # We'll assume each chunk belongs to a single message ID self._active_response_id
481
589
  item_generation = _MessageGeneration(
482
- message_id=self._active_response_id,
590
+ message_id=response_id,
483
591
  text_ch=utils.aio.Chan[str](),
484
592
  audio_ch=utils.aio.Chan[rtc.AudioFrame](),
485
593
  )
594
+ self._current_generation.messages[response_id] = item_generation
486
595
 
487
596
  self._current_generation.message_ch.send_nowait(
488
597
  llm.MessageGeneration(
489
- message_id=self._active_response_id,
598
+ message_id=response_id,
490
599
  text_stream=item_generation.text_ch,
491
600
  audio_stream=item_generation.audio_ch,
492
601
  )
@@ -498,84 +607,92 @@ class RealtimeSession(llm.RealtimeSession):
498
607
  user_initiated=False,
499
608
  )
500
609
 
501
- # Resolve any pending future from generate_reply()
502
- if self._pending_generation_event_id and (
503
- fut := self._response_created_futures.pop(self._pending_generation_event_id, None)
504
- ):
505
- fut.set_result(generation_event)
610
+ if self._pending_generation_fut and not self._pending_generation_fut.done():
611
+ generation_event.user_initiated = True
612
+ self._pending_generation_fut.set_result(generation_event)
613
+ self._pending_generation_fut = None
506
614
 
507
- self._pending_generation_event_id = None
508
615
  self.emit("generation_created", generation_event)
509
616
 
510
- self._current_generation.messages[self._active_response_id] = item_generation
511
-
512
617
  def _handle_server_content(self, server_content: LiveServerContent):
513
- if not self._current_generation or not self._active_response_id:
514
- logger.warning(
515
- "gemini-realtime-session: No active response ID, skipping server content"
516
- )
618
+ if not self._current_generation:
619
+ logger.warning("received server content but no active generation.")
517
620
  return
518
621
 
519
- item_generation = self._current_generation.messages[self._active_response_id]
622
+ response_id = list(self._current_generation.messages.keys())[0]
623
+ item_generation = self._current_generation.messages[response_id]
520
624
 
521
- model_turn = server_content.model_turn
522
- if model_turn:
625
+ if model_turn := server_content.model_turn:
523
626
  for part in model_turn.parts:
524
627
  if part.text:
525
628
  item_generation.text_ch.send_nowait(part.text)
526
629
  if part.inline_data:
527
630
  frame_data = part.inline_data.data
528
- frame = rtc.AudioFrame(
529
- data=frame_data,
530
- sample_rate=OUTPUT_AUDIO_SAMPLE_RATE,
531
- num_channels=NUM_CHANNELS,
532
- samples_per_channel=len(frame_data) // 2,
533
- )
534
- item_generation.audio_ch.send_nowait(frame)
535
- input_transcription = server_content.input_transcription
536
- if input_transcription and input_transcription.text:
537
- self.emit(
538
- "input_audio_transcription_completed",
539
- llm.InputTranscriptionCompleted(
540
- item_id=self._active_response_id, transcript=input_transcription.text
541
- ),
542
- )
543
- output_transcription = server_content.output_transcription
544
- if output_transcription and output_transcription.text:
545
- item_generation.text_ch.send_nowait(output_transcription.text)
631
+ try:
632
+ frame = rtc.AudioFrame(
633
+ data=frame_data,
634
+ sample_rate=OUTPUT_AUDIO_SAMPLE_RATE,
635
+ num_channels=OUTPUT_AUDIO_CHANNELS,
636
+ samples_per_channel=len(frame_data) // (2 * OUTPUT_AUDIO_CHANNELS),
637
+ )
638
+ item_generation.audio_ch.send_nowait(frame)
639
+ except ValueError as e:
640
+ logger.error(f"Error creating audio frame from Gemini data: {e}")
641
+
642
+ if input_transcription := server_content.input_transcription:
643
+ if input_transcription.text:
644
+ self.emit(
645
+ "input_audio_transcription_completed",
646
+ llm.InputTranscriptionCompleted(
647
+ item_id=response_id, transcript=input_transcription.text
648
+ ),
649
+ )
650
+ self._handle_input_speech_started()
651
+
652
+ if output_transcription := server_content.output_transcription:
653
+ if output_transcription.text:
654
+ item_generation.text_ch.send_nowait(output_transcription.text)
655
+
546
656
  if server_content.interrupted:
547
- self._finalize_response()
657
+ self._finalize_response(interrupted=True)
548
658
  self._handle_input_speech_started()
549
659
 
550
660
  if server_content.turn_complete:
551
661
  self._finalize_response()
552
662
 
553
- def _finalize_response(self) -> None:
663
+ def _finalize_response(self, interrupted: bool = False, closed: bool = False) -> None:
554
664
  if not self._current_generation:
555
665
  return
556
666
 
557
- for item_generation in self._current_generation.messages.values():
558
- item_generation.text_ch.close()
559
- item_generation.audio_ch.close()
560
-
561
- self._current_generation.function_ch.close()
562
- self._current_generation.message_ch.close()
667
+ gen = self._current_generation
563
668
  self._current_generation = None
564
- self._is_interrupted = True
565
- self._active_response_id = None
669
+
670
+ for item_generation in gen.messages.values():
671
+ if not item_generation.text_ch.closed:
672
+ item_generation.text_ch.close()
673
+ if not item_generation.audio_ch.closed:
674
+ item_generation.audio_ch.close()
675
+
676
+ gen.function_ch.close()
677
+ gen.message_ch.close()
566
678
 
567
679
  def _handle_input_speech_started(self):
568
680
  self.emit("input_speech_started", llm.InputSpeechStartedEvent())
569
681
 
570
682
  def _handle_tool_calls(self, tool_call: LiveServerToolCall):
571
683
  if not self._current_generation:
684
+ logger.warning("received tool call but no active generation.")
572
685
  return
686
+
687
+ gen = self._current_generation
573
688
  for fnc_call in tool_call.function_calls:
574
- self._current_generation.function_ch.send_nowait(
689
+ arguments = json.dumps(fnc_call.args)
690
+
691
+ gen.function_ch.send_nowait(
575
692
  llm.FunctionCall(
576
- call_id=fnc_call.id or "",
693
+ call_id=fnc_call.id or utils.shortuuid("fnc-call-"),
577
694
  name=fnc_call.name,
578
- arguments=json.dumps(fnc_call.args),
695
+ arguments=arguments,
579
696
  )
580
697
  )
581
698
  self._finalize_response()
@@ -584,28 +701,45 @@ class RealtimeSession(llm.RealtimeSession):
584
701
  self, tool_call_cancellation: LiveServerToolCallCancellation
585
702
  ):
586
703
  logger.warning(
587
- "function call cancelled",
588
- extra={
589
- "function_call_ids": tool_call_cancellation.ids,
590
- },
704
+ "server cancelled tool calls",
705
+ extra={"function_call_ids": tool_call_cancellation.ids},
591
706
  )
592
- self.emit("function_calls_cancelled", tool_call_cancellation.ids)
593
707
 
594
708
  def _handle_usage_metadata(self, usage_metadata: UsageMetadata):
595
- # todo: handle metrics
596
- logger.info("Usage metadata", extra={"usage_metadata": usage_metadata})
709
+ # TODO: handle metrics
710
+ logger.debug("usage metadata", extra={"usage_metadata": usage_metadata})
597
711
 
598
712
  def _handle_go_away(self, go_away: LiveServerGoAway):
599
- # should we reconnect?
600
713
  logger.warning(
601
- f"gemini live api server will soon disconnect. time left: {go_away.time_left}"
714
+ f"Gemini server indicates disconnection soon. Time left: {go_away.time_left}"
602
715
  )
716
+ # TODO(dz): this isn't a seamless reconnection just yet
717
+ self._session_should_close.set()
603
718
 
604
719
  def commit_audio(self) -> None:
605
- raise NotImplementedError("commit_audio_buffer is not supported yet")
720
+ pass
606
721
 
607
722
  def clear_audio(self) -> None:
608
- raise NotImplementedError("clear_audio is not supported yet")
723
+ self._bstream.clear()
609
724
 
610
- def server_vad_enabled(self) -> bool:
611
- return True
725
+ def _resample_audio(self, frame: rtc.AudioFrame) -> Iterator[rtc.AudioFrame]:
726
+ if self._input_resampler:
727
+ if frame.sample_rate != self._input_resampler._input_rate:
728
+ # input audio changed to a different sample rate
729
+ self._input_resampler = None
730
+
731
+ if self._input_resampler is None and (
732
+ frame.sample_rate != INPUT_AUDIO_SAMPLE_RATE
733
+ or frame.num_channels != INPUT_AUDIO_CHANNELS
734
+ ):
735
+ self._input_resampler = rtc.AudioResampler(
736
+ input_rate=frame.sample_rate,
737
+ output_rate=INPUT_AUDIO_SAMPLE_RATE,
738
+ num_channels=INPUT_AUDIO_CHANNELS,
739
+ )
740
+
741
+ if self._input_resampler:
742
+ # TODO(long): flush the resampler when the input source is changed
743
+ yield from self._input_resampler.push(frame)
744
+ else:
745
+ yield frame
@@ -105,7 +105,7 @@ class TTS(tts.TTS):
105
105
  self._opts = _TTSOptions(
106
106
  voice=voice_params,
107
107
  audio_config=texttospeech.AudioConfig(
108
- audio_encoding=texttospeech.AudioEncoding.OGG_OPUS,
108
+ audio_encoding=texttospeech.AudioEncoding.PCM,
109
109
  sample_rate_hertz=sample_rate,
110
110
  pitch=pitch,
111
111
  effects_profile_id=effects_profile_id,
@@ -132,11 +132,11 @@ class TTS(tts.TTS):
132
132
  """ # noqa: E501
133
133
  params = {}
134
134
  if is_given(language):
135
- params["language"] = language
135
+ params["language_code"] = str(language)
136
136
  if is_given(gender):
137
- params["gender"] = gender
137
+ params["ssml_gender"] = _gender_from_str(str(gender))
138
138
  if is_given(voice_name):
139
- params["voice_name"] = voice_name
139
+ params["name"] = voice_name
140
140
 
141
141
  if params:
142
142
  self._opts.voice = texttospeech.VoiceSelectionParams(**params)
@@ -28,7 +28,7 @@ def get_tool_results_for_realtime(chat_ctx: llm.ChatContext) -> types.LiveClient
28
28
  types.FunctionResponse(
29
29
  id=msg.call_id,
30
30
  name=msg.name,
31
- response={"text": msg.output},
31
+ response={"output": msg.output},
32
32
  )
33
33
  )
34
34
  return (
@@ -99,9 +99,11 @@ def to_chat_ctx(
99
99
  if current_role is not None and parts:
100
100
  turns.append(types.Content(role=current_role, parts=parts))
101
101
 
102
- if not turns:
103
- # if no turns, add a user message with a placeholder
104
- turns = [types.Content(role="user", parts=[types.Part(text=".")])]
102
+ # # Gemini requires the last message to end with user's turn before they can generate
103
+ # # currently not used because to_chat_ctx should not be used to force a new generation
104
+ # if current_role != "user":
105
+ # turns.append(types.Content(role="user", parts=[types.Part(text=".")]))
106
+
105
107
  return turns, system_instruction
106
108
 
107
109
 
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "1.0.16"
15
+ __version__ = "1.0.18"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: livekit-plugins-google
3
- Version: 1.0.16
3
+ Version: 1.0.18
4
4
  Summary: Agent Framework plugin for services from Google Cloud
5
5
  Project-URL: Documentation, https://docs.livekit.io
6
6
  Project-URL: Website, https://livekit.io/
@@ -21,8 +21,8 @@ Requires-Python: >=3.9.0
21
21
  Requires-Dist: google-auth<3,>=2
22
22
  Requires-Dist: google-cloud-speech<3,>=2
23
23
  Requires-Dist: google-cloud-texttospeech<3,>=2
24
- Requires-Dist: google-genai>=1.10.0
25
- Requires-Dist: livekit-agents>=1.0.16
24
+ Requires-Dist: google-genai>=1.12.1
25
+ Requires-Dist: livekit-agents>=1.0.18
26
26
  Description-Content-Type: text/markdown
27
27
 
28
28
  # LiveKit Plugins Google
@@ -4,13 +4,13 @@ livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA
4
4
  livekit/plugins/google/models.py,sha256=SGjAumdDK97NNLwMFcqZdKR68f1NoGB2Rk1UP2-imG0,1457
5
5
  livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  livekit/plugins/google/stt.py,sha256=AG_lh2fuuduJi0jFbA_QKFXLJ6NUdF1W_FfkLUJML_Q,22413
7
- livekit/plugins/google/tts.py,sha256=xhINokqY8UutXn85N-cbzq68eptbM6TTtIXmLktE_RM,9004
8
- livekit/plugins/google/utils.py,sha256=pbLSOAdQxInWhgI2Yhsrr9KvgvpFXYDdU2yx2p03pFg,9437
9
- livekit/plugins/google/version.py,sha256=JrpH7xxAlurLX7a-QPfPkangHuYRj8VFBapC1AR9TZs,601
7
+ livekit/plugins/google/tts.py,sha256=fmQwW9a1kPsEsrTvIo8fqw479RxWEx0SIc3oTVaj41U,9031
8
+ livekit/plugins/google/utils.py,sha256=TjjTwMbdJdxr3bZjUXxs-J_fipTTM00goW2-d9KWX6w,9582
9
+ livekit/plugins/google/version.py,sha256=cnPu9FVKZV9tFmmz7lEvftrO3B_nWJVFghi3j6UcJLs,601
10
10
  livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
11
11
  livekit/plugins/google/beta/realtime/__init__.py,sha256=_fW2NMN22F-hnQ4xAJ_g5lPbR7CvM_xXzSWlUQY-E-U,188
12
12
  livekit/plugins/google/beta/realtime/api_proto.py,sha256=Fyrejs3SG0EjOPCCFLEnWXKEUxCff47PMWk2VsKJm5E,594
13
- livekit/plugins/google/beta/realtime/realtime_api.py,sha256=HvPYyQXC9OodWaDNxbRt1UAJ8IVdXZGK-PsIEr7UwbY,25078
14
- livekit_plugins_google-1.0.16.dist-info/METADATA,sha256=x7Ugs0szCq26O8ASP_7tjBzTUeMKReQ7K3Ao_MRf2DU,3492
15
- livekit_plugins_google-1.0.16.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
16
- livekit_plugins_google-1.0.16.dist-info/RECORD,,
13
+ livekit/plugins/google/beta/realtime/realtime_api.py,sha256=sXp2oHnTlHrAp5wFmcXj0bRtQKixBYedfbufcbjVHxk,30897
14
+ livekit_plugins_google-1.0.18.dist-info/METADATA,sha256=Vqt0FoqibcKzX_jFXlyFkn-mT7iPC16JlH61VS0fbuw,3492
15
+ livekit_plugins_google-1.0.18.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
16
+ livekit_plugins_google-1.0.18.dist-info/RECORD,,