livekit-plugins-google 1.0.20__py3-none-any.whl → 1.0.22__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in the public registry.
livekit/plugins/google/beta/realtime/realtime_api.py

@@ -4,14 +4,16 @@ import asyncio
 import contextlib
 import json
 import os
+import time
 import weakref
 from collections.abc import Iterator
-from dataclasses import dataclass
+from dataclasses import dataclass, field

 from google import genai
 from google.genai.live import AsyncSession
 from google.genai.types import (
     AudioTranscriptionConfig,
+    AutomaticActivityDetection,
     Blob,
     Content,
     FunctionDeclaration,
@@ -25,8 +27,10 @@ from google.genai.types import (
     LiveServerToolCall,
     LiveServerToolCallCancellation,
     Modality,
+    ModalityTokenCount,
     Part,
     PrebuiltVoiceConfig,
+    RealtimeInputConfig,
     SessionResumptionConfig,
     SpeechConfig,
     Tool,
@@ -35,19 +39,20 @@ from google.genai.types import (
 )
 from livekit import rtc
 from livekit.agents import llm, utils
+from livekit.agents.metrics import RealtimeModelMetrics
 from livekit.agents.types import NOT_GIVEN, NotGivenOr
 from livekit.agents.utils import audio as audio_utils, images, is_given
 from livekit.plugins.google.beta.realtime.api_proto import ClientEvents, LiveAPIModels, Voice

 from ...log import logger
-from ...utils import _build_gemini_fnc, get_tool_results_for_realtime, to_chat_ctx
+from ...utils import get_tool_results_for_realtime, to_chat_ctx, to_fnc_ctx

 INPUT_AUDIO_SAMPLE_RATE = 16000
 INPUT_AUDIO_CHANNELS = 1
 OUTPUT_AUDIO_SAMPLE_RATE = 24000
 OUTPUT_AUDIO_CHANNELS = 1

-DEFAULT_ENCODE_OPTIONS = images.EncodeOptions(
+DEFAULT_IMAGE_ENCODE_OPTIONS = images.EncodeOptions(
     format="JPEG",
     quality=75,
     resize_options=images.ResizeOptions(width=1024, height=1024, strategy="scale_aspect_fit"),
@@ -80,13 +85,7 @@ class _RealtimeOptions:
     instructions: NotGivenOr[str]
     input_audio_transcription: AudioTranscriptionConfig | None
     output_audio_transcription: AudioTranscriptionConfig | None
-
-
-@dataclass
-class _MessageGeneration:
-    message_id: str
-    text_ch: utils.aio.Chan[str]
-    audio_ch: utils.aio.Chan[rtc.AudioFrame]
+    image_encode_options: NotGivenOr[images.EncodeOptions]


 @dataclass
@@ -94,7 +93,19 @@ class _ResponseGeneration:
     message_ch: utils.aio.Chan[llm.MessageGeneration]
     function_ch: utils.aio.Chan[llm.FunctionCall]

-    messages: dict[str, _MessageGeneration]
+    response_id: str
+    text_ch: utils.aio.Chan[str]
+    audio_ch: utils.aio.Chan[rtc.AudioFrame]
+    input_transcription: str = ""
+
+    _created_timestamp: float = field(default_factory=time.time)
+    """The timestamp when the generation is created"""
+    _first_token_timestamp: float | None = None
+    """The timestamp when the first audio token is received"""
+    _completed_timestamp: float | None = None
+    """The timestamp when the generation is completed"""
+    _done: bool = False
+    """Whether the generation is done (set when the turn is complete)"""


 class RealtimeModel(llm.RealtimeModel):
@@ -107,7 +118,7 @@ class RealtimeModel(llm.RealtimeModel):
         voice: Voice | str = "Puck",
         language: NotGivenOr[str] = NOT_GIVEN,
         modalities: NotGivenOr[list[Modality]] = NOT_GIVEN,
-        vertexai: bool = False,
+        vertexai: NotGivenOr[bool] = NOT_GIVEN,
         project: NotGivenOr[str] = NOT_GIVEN,
         location: NotGivenOr[str] = NOT_GIVEN,
         candidate_count: int = 1,
@@ -119,12 +130,13 @@ class RealtimeModel(llm.RealtimeModel):
         frequency_penalty: NotGivenOr[float] = NOT_GIVEN,
         input_audio_transcription: NotGivenOr[AudioTranscriptionConfig | None] = NOT_GIVEN,
         output_audio_transcription: NotGivenOr[AudioTranscriptionConfig | None] = NOT_GIVEN,
+        image_encode_options: NotGivenOr[images.EncodeOptions] = NOT_GIVEN,
     ) -> None:
         """
         Initializes a RealtimeModel instance for interacting with Google's Realtime API.

         Environment Requirements:
-        - For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of the service account key file.
+        - For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of the service account key file or use any of the other Google Cloud auth methods.
         The Google Cloud project and location can be set via `project` and `location` arguments or the environment variables
         `GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_LOCATION`. By default, the project is inferred from the service account key file,
         and the location defaults to "us-central1".
@@ -148,15 +160,22 @@ class RealtimeModel(llm.RealtimeModel):
             frequency_penalty (float, optional): The frequency penalty for response generation
             input_audio_transcription (AudioTranscriptionConfig | None, optional): The configuration for input audio transcription. Defaults to None.)
             output_audio_transcription (AudioTranscriptionConfig | None, optional): The configuration for output audio transcription. Defaults to AudioTranscriptionConfig().
+            image_encode_options (images.EncodeOptions, optional): The configuration for image encoding. Defaults to DEFAULT_ENCODE_OPTIONS.

         Raises:
             ValueError: If the API key is required but not found.
         """  # noqa: E501
+        if not is_given(input_audio_transcription):
+            input_audio_transcription = AudioTranscriptionConfig()
+        if not is_given(output_audio_transcription):
+            output_audio_transcription = AudioTranscriptionConfig()
+
         super().__init__(
             capabilities=llm.RealtimeCapabilities(
                 message_truncation=False,
                 turn_detection=True,
-                user_transcription=is_given(input_audio_transcription),
+                user_transcription=input_audio_transcription is not None,
+                auto_tool_reply_generation=True,
             )
         )

@@ -173,8 +192,13 @@ class RealtimeModel(llm.RealtimeModel):
            if is_given(location)
            else os.environ.get("GOOGLE_CLOUD_LOCATION") or "us-central1"
        )
+        use_vertexai = (
+            vertexai
+            if is_given(vertexai)
+            else os.environ.get("GOOGLE_GENAI_USE_VERTEXAI", "0").lower() in ["true", "1"]
+        )

-        if vertexai:
+        if use_vertexai:
            if not gcp_project or not gcp_location:
                raise ValueError(
                    "Project is required for VertexAI via project kwarg or GOOGLE_CLOUD_PROJECT environment variable"  # noqa: E501
@@ -188,17 +212,12 @@ class RealtimeModel(llm.RealtimeModel):
                 "API key is required for Google API either via api_key or GOOGLE_API_KEY environment variable"  # noqa: E501
             )

-        if not is_given(input_audio_transcription):
-            input_audio_transcription = None
-        if not is_given(output_audio_transcription):
-            output_audio_transcription = AudioTranscriptionConfig()
-
         self._opts = _RealtimeOptions(
             model=model,
             api_key=gemini_api_key,
             voice=voice,
             response_modalities=modalities,
-            vertexai=vertexai,
+            vertexai=use_vertexai,
             project=gcp_project,
             location=gcp_location,
             candidate_count=candidate_count,
@@ -212,6 +231,7 @@ class RealtimeModel(llm.RealtimeModel):
             input_audio_transcription=input_audio_transcription,
             output_audio_transcription=output_audio_transcription,
             language=language,
+            image_encode_options=image_encode_options,
         )

         self._sessions = weakref.WeakSet[RealtimeSession]()
@@ -272,7 +292,6 @@ class RealtimeSession(llm.RealtimeSession):

         self._session_resumption_handle: str | None = None

-        self._update_lock = asyncio.Lock()
         self._session_lock = asyncio.Lock()

     async def _close_active_session(self) -> None:
@@ -291,57 +310,59 @@ class RealtimeSession(llm.RealtimeSession):
         # reset the msg_ch, do not send messages from previous session
         self._msg_ch = utils.aio.Chan[ClientEvents]()

-    async def update_options(
+    def update_options(
         self,
         *,
         voice: NotGivenOr[str] = NOT_GIVEN,
         temperature: NotGivenOr[float] = NOT_GIVEN,
         tool_choice: NotGivenOr[llm.ToolChoice | None] = NOT_GIVEN,
     ) -> None:
-        async with self._update_lock:
-            should_restart = False
-            if is_given(voice) and self._opts.voice != voice:
-                self._opts.voice = voice
-                should_restart = True
+        should_restart = False
+        if is_given(voice) and self._opts.voice != voice:
+            self._opts.voice = voice
+            should_restart = True

-            if is_given(temperature) and self._opts.temperature != temperature:
-                self._opts.temperature = temperature if is_given(temperature) else NOT_GIVEN
-                should_restart = True
+        if is_given(temperature) and self._opts.temperature != temperature:
+            self._opts.temperature = temperature if is_given(temperature) else NOT_GIVEN
+            should_restart = True

-            if should_restart:
-                self._mark_restart_needed()
+        if should_restart:
+            self._mark_restart_needed()

     async def update_instructions(self, instructions: str) -> None:
-        async with self._update_lock:
-            if not is_given(self._opts.instructions) or self._opts.instructions != instructions:
-                self._opts.instructions = instructions
-                self._mark_restart_needed()
+        if not is_given(self._opts.instructions) or self._opts.instructions != instructions:
+            self._opts.instructions = instructions
+            self._mark_restart_needed()

     async def update_chat_ctx(self, chat_ctx: llm.ChatContext) -> None:
-        async with self._update_lock:
-            self._chat_ctx = chat_ctx.copy()
-            turns, _ = to_chat_ctx(self._chat_ctx, id(self), ignore_functions=True)
-            tool_results = get_tool_results_for_realtime(
-                self._chat_ctx, vertexai=self._opts.vertexai
-            )
-            # TODO(dz): need to compute delta and then either append or recreate session
+        diff_ops = llm.utils.compute_chat_ctx_diff(self._chat_ctx, chat_ctx)
+
+        if diff_ops.to_remove:
+            logger.warning("Gemini Live does not support removing messages")
+
+        append_ctx = llm.ChatContext.empty()
+        for _, item_id in diff_ops.to_create:
+            item = chat_ctx.get_by_id(item_id)
+            if item:
+                append_ctx.items.append(item)
+
+        if append_ctx.items:
+            turns, _ = to_chat_ctx(append_ctx, id(self), ignore_functions=True)
+            tool_results = get_tool_results_for_realtime(append_ctx, vertexai=self._opts.vertexai)
             if turns:
                 self._send_client_event(LiveClientContent(turns=turns, turn_complete=False))
             if tool_results:
                 self._send_client_event(tool_results)

     async def update_tools(self, tools: list[llm.FunctionTool]) -> None:
-        async with self._update_lock:
-            new_declarations: list[FunctionDeclaration] = [
-                _build_gemini_fnc(tool) for tool in tools
-            ]
-            current_tool_names = {f.name for f in self._gemini_declarations}
-            new_tool_names = {f.name for f in new_declarations}
-
-            if current_tool_names != new_tool_names:
-                self._gemini_declarations = new_declarations
-                self._tools = llm.ToolContext(tools)
-                self._mark_restart_needed()
+        new_declarations: list[FunctionDeclaration] = to_fnc_ctx(tools)
+        current_tool_names = {f.name for f in self._gemini_declarations}
+        new_tool_names = {f.name for f in new_declarations}
+
+        if current_tool_names != new_tool_names:
+            self._gemini_declarations = new_declarations
+            self._tools = llm.ToolContext(tools)
+            self._mark_restart_needed()

     @property
     def chat_ctx(self) -> llm.ChatContext:
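The rewritten `update_chat_ctx` above resolves the old "resend everything" TODO: it diffs the incoming context against the session's current one, warns on removals (unsupported by Gemini Live), and appends only the newly created items. A caller-side sketch, where `session` stands in for an active `RealtimeSession`:

```python
# Only the delta reaches Gemini Live: copy the current history, add to it,
# and hand the whole context back.
ctx = session.chat_ctx.copy()
ctx.add_message(role="user", content="What's the weather in Tokyo?")
await session.update_chat_ctx(ctx)  # sends just the newly added item

# Removing items is a client-side no-op; the plugin only logs
# "Gemini Live does not support removing messages".
```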
@@ -360,7 +381,9 @@ class RealtimeSession(llm.RealtimeSession):
         self._send_client_event(realtime_input)

     def push_video(self, frame: rtc.VideoFrame) -> None:
-        encoded_data = images.encode(frame, DEFAULT_ENCODE_OPTIONS)
+        encoded_data = images.encode(
+            frame, self._opts.image_encode_options or DEFAULT_IMAGE_ENCODE_OPTIONS
+        )
         realtime_input = LiveClientRealtimeInput(
             media_chunks=[Blob(data=encoded_data, mime_type="image/jpeg")]
         )
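`push_video` now honors the per-model `image_encode_options` introduced above, falling back to the renamed `DEFAULT_IMAGE_ENCODE_OPTIONS` (JPEG, quality 75, fit within 1024x1024). A sketch of overriding it; the values are illustrative:

```python
from livekit.agents.utils import images
from livekit.plugins.google.beta.realtime import RealtimeModel

model = RealtimeModel(
    image_encode_options=images.EncodeOptions(
        format="JPEG",
        quality=50,  # lower quality, less bandwidth per video frame
        resize_options=images.ResizeOptions(
            width=512, height=512, strategy="scale_aspect_fit"
        ),
    )
)
```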
@@ -430,7 +453,7 @@ class RealtimeSession(llm.RealtimeSession):
         self._response_created_futures.clear()

         if self._current_generation:
-            self._finalize_response(closed=True)
+            self._mark_current_generation_done()

     @utils.log_exceptions(logger=logger)
     async def _main_task(self):
@@ -524,7 +547,7 @@ class RealtimeSession(llm.RealtimeSession):
                     break

                 async for response in session.receive():
-                    if not self._current_generation and (
+                    if (not self._current_generation or self._current_generation._done) and (
                         response.server_content or response.tool_call
                     ):
                         self._start_new_generation()
@@ -555,7 +578,7 @@ class RealtimeSession(llm.RealtimeSession):
                 logger.error(f"error in receive task: {e}", exc_info=e)
                 self._mark_restart_needed()
             finally:
-                self._finalize_response(closed=True)
+                self._mark_current_generation_done()

     def _build_connect_config(self) -> LiveConnectConfig:
         temp = self._opts.temperature if is_given(self._opts.temperature) else None
@@ -592,32 +615,31 @@ class RealtimeSession(llm.RealtimeSession):
             input_audio_transcription=self._opts.input_audio_transcription,
             output_audio_transcription=self._opts.output_audio_transcription,
             session_resumption=SessionResumptionConfig(handle=self._session_resumption_handle),
+            realtime_input_config=RealtimeInputConfig(
+                automatic_activity_detection=AutomaticActivityDetection(),
+            ),
         )

     def _start_new_generation(self):
-        if self._current_generation:
+        if self._current_generation and not self._current_generation._done:
             logger.warning("starting new generation while another is active. Finalizing previous.")
-            self._finalize_response(closed=True)
+            self._mark_current_generation_done()

         response_id = utils.shortuuid("gemini-turn-")
         self._current_generation = _ResponseGeneration(
             message_ch=utils.aio.Chan[llm.MessageGeneration](),
             function_ch=utils.aio.Chan[llm.FunctionCall](),
-            messages={},
-        )
-
-        item_generation = _MessageGeneration(
-            message_id=response_id,
+            response_id=response_id,
             text_ch=utils.aio.Chan[str](),
             audio_ch=utils.aio.Chan[rtc.AudioFrame](),
+            _created_timestamp=time.time(),
         )
-        self._current_generation.messages[response_id] = item_generation

         self._current_generation.message_ch.send_nowait(
             llm.MessageGeneration(
                 message_id=response_id,
-                text_stream=item_generation.text_ch,
-                audio_stream=item_generation.audio_ch,
+                text_stream=self._current_generation.text_ch,
+                audio_stream=self._current_generation.audio_ch,
             )
         )

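The connect config now explicitly requests server-side voice activity detection via `realtime_input_config`; a bare `AutomaticActivityDetection()` keeps Gemini's defaults. For reference, google-genai exposes tuning fields on the same type — the field names below come from google-genai's Live API surface and should be double-checked against the installed version:

```python
from google.genai.types import AutomaticActivityDetection, RealtimeInputConfig

# What the plugin sends (server defaults):
RealtimeInputConfig(automatic_activity_detection=AutomaticActivityDetection())

# Tuning sketch; values are illustrative only:
AutomaticActivityDetection(
    prefix_padding_ms=20,     # audio retained before detected speech
    silence_duration_ms=500,  # trailing silence before end-of-speech
)
```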
@@ -635,18 +657,18 @@ class RealtimeSession(llm.RealtimeSession):
         self.emit("generation_created", generation_event)

     def _handle_server_content(self, server_content: LiveServerContent):
-        if not self._current_generation:
+        current_gen = self._current_generation
+        if not current_gen:
             logger.warning("received server content but no active generation.")
             return

-        response_id = list(self._current_generation.messages.keys())[0]
-        item_generation = self._current_generation.messages[response_id]
-
         if model_turn := server_content.model_turn:
             for part in model_turn.parts:
                 if part.text:
-                    item_generation.text_ch.send_nowait(part.text)
+                    current_gen.text_ch.send_nowait(part.text)
                 if part.inline_data:
+                    if not current_gen._first_token_timestamp:
+                        current_gen._first_token_timestamp = time.time()
                     frame_data = part.inline_data.data
                     try:
                         frame = rtc.AudioFrame(
@@ -655,46 +677,65 @@ class RealtimeSession(llm.RealtimeSession):
                             num_channels=OUTPUT_AUDIO_CHANNELS,
                             samples_per_channel=len(frame_data) // (2 * OUTPUT_AUDIO_CHANNELS),
                         )
-                        item_generation.audio_ch.send_nowait(frame)
+                        current_gen.audio_ch.send_nowait(frame)
                     except ValueError as e:
                         logger.error(f"Error creating audio frame from Gemini data: {e}")

         if input_transcription := server_content.input_transcription:
-            if input_transcription.text:
+            text = input_transcription.text
+            if text:
+                if current_gen.input_transcription == "":
+                    # gemini would start with a space, which doesn't make sense
+                    # at beginning of the transcript
+                    text = text.lstrip()
+                current_gen.input_transcription += text
                 self.emit(
                     "input_audio_transcription_completed",
                     llm.InputTranscriptionCompleted(
-                        item_id=response_id, transcript=input_transcription.text
+                        item_id=current_gen.response_id,
+                        transcript=current_gen.input_transcription,
+                        is_final=False,
                     ),
                 )
-                self._handle_input_speech_started()

         if output_transcription := server_content.output_transcription:
-            if output_transcription.text:
-                item_generation.text_ch.send_nowait(output_transcription.text)
+            text = output_transcription.text
+            if text:
+                current_gen.text_ch.send_nowait(text)
+
+        if server_content.generation_complete:
+            # The only way we'd know that the transcription is complete is by when they are
+            # done with generation
+            if current_gen.input_transcription:
+                self.emit(
+                    "input_audio_transcription_completed",
+                    llm.InputTranscriptionCompleted(
+                        item_id=current_gen.response_id,
+                        transcript=current_gen.input_transcription,
+                        is_final=True,
+                    ),
+                )
+            current_gen._completed_timestamp = time.time()

         if server_content.interrupted:
-            self._finalize_response(interrupted=True)
             self._handle_input_speech_started()

         if server_content.turn_complete:
-            self._finalize_response()
+            self._mark_current_generation_done()

-    def _finalize_response(self, interrupted: bool = False, closed: bool = False) -> None:
+    def _mark_current_generation_done(self) -> None:
         if not self._current_generation:
             return

         gen = self._current_generation
-        self._current_generation = None
-
-        for item_generation in gen.messages.values():
-            if not item_generation.text_ch.closed:
-                item_generation.text_ch.close()
-            if not item_generation.audio_ch.closed:
-                item_generation.audio_ch.close()
+        if not gen.text_ch.closed:
+            gen.text_ch.close()
+        if not gen.audio_ch.closed:
+            gen.audio_ch.close()

         gen.function_ch.close()
         gen.message_ch.close()
+        gen._done = True

     def _handle_input_speech_started(self):
         self.emit("input_speech_started", llm.InputSpeechStartedEvent())
@@ -715,7 +756,7 @@ class RealtimeSession(llm.RealtimeSession):
                     arguments=arguments,
                 )
             )
-        self._finalize_response()
+        self._mark_current_generation_done()

     def _handle_tool_call_cancellation(
         self, tool_call_cancellation: LiveServerToolCallCancellation
@@ -726,8 +767,62 @@ class RealtimeSession(llm.RealtimeSession):
         )

     def _handle_usage_metadata(self, usage_metadata: UsageMetadata):
-        # TODO: handle metrics
-        logger.debug("usage metadata", extra={"usage_metadata": usage_metadata})
+        current_gen = self._current_generation
+        if not current_gen:
+            logger.warning("no active generation to report metrics for")
+            return
+
+        ttft = (
+            current_gen._first_token_timestamp - current_gen._created_timestamp
+            if current_gen._first_token_timestamp
+            else -1
+        )
+        duration = (
+            current_gen._completed_timestamp or time.time()
+        ) - current_gen._created_timestamp
+
+        def _token_details_map(
+            token_details: list[ModalityTokenCount] | None,
+        ) -> dict[Modality, int]:
+            token_details_map = {"audio_tokens": 0, "text_tokens": 0, "image_tokens": 0}
+            if not token_details:
+                return token_details_map
+
+            for token_detail in token_details:
+                if token_detail.modality == Modality.AUDIO:
+                    token_details_map["audio_tokens"] += token_detail.token_count
+                elif token_detail.modality == Modality.TEXT:
+                    token_details_map["text_tokens"] += token_detail.token_count
+                elif token_detail.modality == Modality.IMAGE:
+                    token_details_map["image_tokens"] += token_detail.token_count
+            return token_details_map
+
+        metrics = RealtimeModelMetrics(
+            label=self._realtime_model._label,
+            request_id=current_gen.response_id,
+            timestamp=current_gen._created_timestamp,
+            duration=duration,
+            ttft=ttft,
+            cancelled=False,
+            input_tokens=usage_metadata.prompt_token_count or 0,
+            output_tokens=usage_metadata.response_token_count or 0,
+            total_tokens=usage_metadata.total_token_count or 0,
+            tokens_per_second=(usage_metadata.response_token_count or 0) / duration,
+            input_token_details=RealtimeModelMetrics.InputTokenDetails(
+                **_token_details_map(usage_metadata.prompt_tokens_details),
+                cached_tokens=sum(
+                    token_detail.token_count or 0
+                    for token_detail in usage_metadata.cache_tokens_details or []
+                ),
+                cached_tokens_details=RealtimeModelMetrics.CachedTokenDetails(
+                    **_token_details_map(usage_metadata.cache_tokens_details),
+                ),
+            ),
+            output_token_details=RealtimeModelMetrics.OutputTokenDetails(
+                **_token_details_map(usage_metadata.response_tokens_details),
+            ),
+        )
+        self.emit("metrics_collected", metrics)

     def _handle_go_away(self, go_away: LiveServerGoAway):
         logger.warning(
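`_handle_usage_metadata` replaces the old debug log with a real `RealtimeModelMetrics` event: `ttft` is measured from generation start to the first audio token (`-1` if no audio arrived), `duration` runs from creation to `generation_complete` (or now), and per-modality token counts are folded into the input/output detail structs. A consumption sketch:

```python
from livekit.agents.metrics import RealtimeModelMetrics

def on_metrics(m: RealtimeModelMetrics) -> None:
    print(
        f"turn {m.request_id}: ttft={m.ttft:.3f}s duration={m.duration:.2f}s "
        f"in={m.input_tokens} out={m.output_tokens} "
        f"cached={m.input_token_details.cached_tokens}"
    )

session.on("metrics_collected", on_metrics)
```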
livekit/plugins/google/llm.py

@@ -62,7 +62,7 @@ class LLM(llm.LLM):
         *,
         model: ChatModels | str = "gemini-2.0-flash-001",
         api_key: NotGivenOr[str] = NOT_GIVEN,
-        vertexai: NotGivenOr[bool] = False,
+        vertexai: NotGivenOr[bool] = NOT_GIVEN,
         project: NotGivenOr[str] = NOT_GIVEN,
         location: NotGivenOr[str] = NOT_GIVEN,
         temperature: NotGivenOr[float] = NOT_GIVEN,
@@ -78,7 +78,7 @@ class LLM(llm.LLM):
         Create a new instance of Google GenAI LLM.

         Environment Requirements:
-        - For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of the service account key file.
+        - For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of the service account key file or use any of the other Google Cloud auth methods.
         The Google Cloud project and location can be set via `project` and `location` arguments or the environment variables
         `GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_LOCATION`. By default, the project is inferred from the service account key file,
         and the location defaults to "us-central1".
@@ -87,9 +87,9 @@ class LLM(llm.LLM):
         Args:
             model (ChatModels | str, optional): The model name to use. Defaults to "gemini-2.0-flash-001".
             api_key (str, optional): The API key for Google Gemini. If not provided, it attempts to read from the `GOOGLE_API_KEY` environment variable.
-            vertexai (bool, optional): Whether to use VertexAI. Defaults to False.
-            project (str, optional): The Google Cloud project to use (only for VertexAI). Defaults to None.
-            location (str, optional): The location to use for VertexAI API requests. Defaults value is "us-central1".
+            vertexai (bool, optional): Whether to use VertexAI. If not provided, it attempts to read from the `GOOGLE_GENAI_USE_VERTEXAI` environment variable. Defaults to False.
+            project (str, optional): The Google Cloud project to use (only for VertexAI). Defaults to None.
+            location (str, optional): The location to use for VertexAI API requests. Defaults value is "us-central1".
             temperature (float, optional): Sampling temperature for response generation. Defaults to 0.8.
             max_output_tokens (int, optional): Maximum number of tokens to generate in the output. Defaults to None.
             top_p (float, optional): The nucleus sampling probability for response generation. Defaults to None.
@@ -101,15 +101,19 @@ class LLM(llm.LLM):
         """  # noqa: E501
         super().__init__()
         gcp_project = project if is_given(project) else os.environ.get("GOOGLE_CLOUD_PROJECT")
-        gcp_location = location if is_given(location) else os.environ.get("GOOGLE_CLOUD_LOCATION")
+        gcp_location = (
+            location
+            if is_given(location)
+            else os.environ.get("GOOGLE_CLOUD_LOCATION") or "us-central1"
+        )
+        use_vertexai = (
+            vertexai
+            if is_given(vertexai)
+            else os.environ.get("GOOGLE_GENAI_USE_VERTEXAI", "0").lower() in ["true", "1"]
+        )
         gemini_api_key = api_key if is_given(api_key) else os.environ.get("GOOGLE_API_KEY")
-        _gac = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
-        if _gac is None:
-            logger.warning(
-                "`GOOGLE_APPLICATION_CREDENTIALS` environment variable is not set. please set it to the path of the service account key file. Otherwise, use any of the other Google Cloud auth methods."  # noqa: E501
-            )

-        if is_given(vertexai) and vertexai:
+        if use_vertexai:
            if not gcp_project:
                _, gcp_project = default_async(
                    scopes=["https://www.googleapis.com/auth/cloud-platform"]
@@ -144,7 +148,7 @@ class LLM(llm.LLM):
            model=model,
            temperature=temperature,
            tool_choice=tool_choice,
-            vertexai=vertexai,
+            vertexai=use_vertexai,
            project=project,
            location=location,
            max_output_tokens=max_output_tokens,
@@ -156,7 +160,7 @@ class LLM(llm.LLM):
        )
        self._client = genai.Client(
            api_key=gemini_api_key,
-            vertexai=is_given(vertexai) and vertexai,
+            vertexai=use_vertexai,
            project=gcp_project,
            location=gcp_location,
        )
@@ -325,6 +329,7 @@ class LLMStream(llm.LLMStream):
                usage=llm.CompletionUsage(
                    completion_tokens=usage.candidates_token_count or 0,
                    prompt_tokens=usage.prompt_token_count or 0,
+                    prompt_cached_tokens=usage.cached_content_token_count or 0,
                    total_tokens=usage.total_token_count or 0,
                ),
            )
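With `prompt_cached_tokens` wired through, Gemini context-caching savings become visible in the usage block of the final chat chunk. A hedged sketch of reading it, assuming the usual `LLMStream` iteration (`gemini_llm` and `ctx` are placeholders for an `LLM` instance and a `ChatContext`):

```python
stream = gemini_llm.chat(chat_ctx=ctx)
async for chunk in stream:
    if chunk.usage:  # arrives with the final chunk
        print(
            f"prompt={chunk.usage.prompt_tokens} "
            f"cached={chunk.usage.prompt_cached_tokens} "
            f"completion={chunk.usage.completion_tokens}"
        )
await stream.aclose()
```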
livekit/plugins/google/models.py

@@ -95,6 +95,8 @@ SpeechLanguages = Literal[
 Gender = Literal["male", "female", "neutral"]

 ChatModels = Literal[
+    "gemini-2.5-pro-preview-05-06",
+    "gemini-2.5-flash-preview-04-17",
     "gemini-2.0-flash-001",
     "gemini-2.0-flash-lite-preview-02-05",
     "gemini-2.0-pro-exp-02-05",
livekit/plugins/google/tts.py

@@ -14,6 +14,8 @@

 from __future__ import annotations

+import asyncio
+import weakref
 from dataclasses import dataclass

 from google.api_core.client_options import ClientOptions
@@ -25,6 +27,7 @@ from livekit.agents import (
     APIConnectOptions,
     APIStatusError,
     APITimeoutError,
+    tokenize,
     tts,
     utils,
 )
@@ -35,13 +38,21 @@ from livekit.agents.types import (
 )
 from livekit.agents.utils import is_given

+from .log import logger
 from .models import Gender, SpeechLanguages

+BUFFERED_WORDS_COUNT = 8
+NUM_CHANNELS = 1
+DEFAULT_VOICE_NAME = "en-US-Chirp3-HD-Charon"
+DEFAULT_LANGUAGE = "en-US"
+DEFAULT_GENDER = "neutral"
+

 @dataclass
 class _TTSOptions:
     voice: texttospeech.VoiceSelectionParams
     audio_config: texttospeech.AudioConfig
+    tokenizer: tokenize.SentenceTokenizer


 class TTS(tts.TTS):
@@ -59,6 +70,8 @@ class TTS(tts.TTS):
         audio_encoding: texttospeech.AudioEncoding = texttospeech.AudioEncoding.PCM,
         credentials_info: NotGivenOr[dict] = NOT_GIVEN,
         credentials_file: NotGivenOr[str] = NOT_GIVEN,
+        tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
+        use_streaming: NotGivenOr[bool] = NOT_GIVEN,
     ) -> None:
         """
         Create a new instance of Google TTS.
@@ -78,12 +91,14 @@ class TTS(tts.TTS):
             speaking_rate (float, optional): Speed of speech. Default is 1.0.
             credentials_info (dict, optional): Dictionary containing Google Cloud credentials. Default is None.
             credentials_file (str, optional): Path to the Google Cloud credentials JSON file. Default is None.
+            tokenizer (tokenize.SentenceTokenizer, optional): Tokenizer for the TTS. Default is a basic sentence tokenizer.
+            use_streaming (bool, optional): Whether to use streaming synthesis. Default is True.
         """  # noqa: E501
+        if not is_given(use_streaming):
+            use_streaming = True

         super().__init__(
-            capabilities=tts.TTSCapabilities(
-                streaming=False,
-            ),
+            capabilities=tts.TTSCapabilities(streaming=use_streaming),
             sample_rate=sample_rate,
             num_channels=1,
         )
@@ -93,15 +108,17 @@ class TTS(tts.TTS):
         self._credentials_file = credentials_file
         self._location = location

-        lang = language if is_given(language) else "en-US"
-        ssml_gender = _gender_from_str("neutral" if not is_given(gender) else gender)
-        name = "" if not is_given(voice_name) else voice_name
+        lang = language if is_given(language) else DEFAULT_LANGUAGE
+        ssml_gender = _gender_from_str(DEFAULT_GENDER if not is_given(gender) else gender)
+        name = DEFAULT_VOICE_NAME if not is_given(voice_name) else voice_name

         voice_params = texttospeech.VoiceSelectionParams(
             name=name,
             language_code=lang,
             ssml_gender=ssml_gender,
         )
+        if not is_given(tokenizer):
+            tokenizer = tokenize.basic.SentenceTokenizer(min_sentence_len=BUFFERED_WORDS_COUNT)

         self._opts = _TTSOptions(
             voice=voice_params,
@@ -112,7 +129,9 @@ class TTS(tts.TTS):
                 effects_profile_id=effects_profile_id,
                 speaking_rate=speaking_rate,
             ),
+            tokenizer=tokenizer,
         )
+        self._streams = weakref.WeakSet[SynthesizeStream]()

     def update_options(
         self,
@@ -168,6 +187,18 @@ class TTS(tts.TTS):
         assert self._client is not None
         return self._client

+    def stream(
+        self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
+    ) -> SynthesizeStream:
+        stream = SynthesizeStream(
+            tts=self,
+            opts=self._opts,
+            client=self._ensure_client(),
+            conn_options=conn_options,
+        )
+        self._streams.add(stream)
+        return stream
+
     def synthesize(
         self,
         text: str,
@@ -182,6 +213,12 @@ class TTS(tts.TTS):
             client=self._ensure_client(),
         )

+    async def aclose(self) -> None:
+        for stream in list(self._streams):
+            await stream.aclose()
+        self._streams.clear()
+        await super().aclose()
+

 class ChunkedStream(tts.ChunkedStream):
     def __init__(
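Streaming synthesis is the headline TTS change: `use_streaming` defaults to True, the capability flag follows it, and the new `stream()`/`aclose()` pair manages `SynthesizeStream` instances (tracked in a `WeakSet` so `aclose()` can shut them all down). A usage sketch:

```python
from livekit.plugins import google

tts = google.TTS()  # streaming synthesis is on by default in 1.0.22

stream = tts.stream()
stream.push_text("Hello there! ")
stream.push_text("Sentences are synthesized as they complete.")
stream.end_input()

async for audio in stream:
    ...  # consume synthesized audio frames as they arrive

await tts.aclose()  # also closes any still-open streams
```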
@@ -230,8 +267,105 @@ class ChunkedStream(tts.ChunkedStream):
             raise APITimeoutError() from None
         except GoogleAPICallError as e:
             raise APIStatusError(
-                e.message, status_code=e.code or -1, request_id=None, body=None
-            ) from None
+                f"{e.message} {e.details}", status_code=e.code or -1, request_id=None, body=None
+            ) from e
+        except Exception as e:
+            raise APIConnectionError() from e
+
+
+class SynthesizeStream(tts.SynthesizeStream):
+    def __init__(
+        self,
+        *,
+        tts: TTS,
+        opts: _TTSOptions,
+        client: texttospeech.TextToSpeechAsyncClient,
+        conn_options: APIConnectOptions,
+    ):
+        super().__init__(tts=tts, conn_options=conn_options)
+        self._opts, self._client = opts, client
+        self._segments_ch = utils.aio.Chan[tokenize.SentenceStream]()
+
+    async def _run(self) -> None:
+        request_id = utils.shortuuid()
+
+        @utils.log_exceptions(logger=logger)
+        async def _tokenize_input():
+            input_stream = None
+            async for input in self._input_ch:
+                if isinstance(input, str):
+                    if input_stream is None:
+                        input_stream = self._opts.tokenizer.stream()
+                        self._segments_ch.send_nowait(input_stream)
+                    input_stream.push_text(input)
+                elif isinstance(input, self._FlushSentinel):
+                    if input_stream:
+                        input_stream.end_input()
+                    input_stream = None
+            self._segments_ch.close()
+
+        @utils.log_exceptions(logger=logger)
+        async def _run_segments():
+            async for input_stream in self._segments_ch:
+                await self._run_stream(input_stream, request_id)
+
+        tasks = [
+            asyncio.create_task(_tokenize_input()),
+            asyncio.create_task(_run_segments()),
+        ]
+        try:
+            await asyncio.gather(*tasks)
+        except Exception as e:
+            raise APIConnectionError() from e
+
+    async def _run_stream(self, input_stream, request_id):
+        streaming_config = texttospeech.StreamingSynthesizeConfig(
+            voice=self._opts.voice,
+            streaming_audio_config=texttospeech.StreamingAudioConfig(
+                audio_encoding=texttospeech.AudioEncoding.PCM
+            ),
+        )
+        emitter = tts.SynthesizedAudioEmitter(event_ch=self._event_ch, request_id=request_id)
+        audio_bstream = utils.audio.AudioByteStream(
+            sample_rate=self._opts.audio_config.sample_rate_hertz,
+            num_channels=NUM_CHANNELS,
+        )
+
+        @utils.log_exceptions(logger=logger)
+        async def input_generator():
+            try:
+                yield texttospeech.StreamingSynthesizeRequest(streaming_config=streaming_config)
+                async for input in input_stream:
+                    self._mark_started()
+                    yield texttospeech.StreamingSynthesizeRequest(
+                        input=texttospeech.StreamingSynthesisInput(text=input.token)
+                    )
+
+            except Exception:
+                logger.exception("an error occurred while streaming input to google TTS")
+
+        try:
+            stream = await self._client.streaming_synthesize(
+                input_generator(),
+                timeout=self._conn_options.timeout,
+            )
+            async for resp in stream:
+                for frame in audio_bstream.write(resp.audio_content):
+                    emitter.push(frame)
+
+            for frame in audio_bstream.flush():
+                emitter.push(frame)
+            emitter.flush()
+        except DeadlineExceeded as e:
+            logger.debug(f"google tts deadline exceeded: {e}")
+            pass
+        except GoogleAPICallError as e:
+            raise APIStatusError(
+                f"{e.message} {e.details}",
+                status_code=e.code or -1,
+                request_id=request_id,
+                body=None,
+            ) from e
         except Exception as e:
             raise APIConnectionError() from e

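A note on the segmenting above: `_tokenize_input` groups pushed text into sentence streams, and each segment is synthesized over one gRPC `streaming_synthesize` call; the default tokenizer buffers with `min_sentence_len=BUFFERED_WORDS_COUNT` (8). Supplying a different tokenizer is a constructor argument away (the threshold value is illustrative):

```python
from livekit.agents import tokenize
from livekit.plugins import google

# Flush shorter sentences to the synthesizer sooner than the default of 8.
tts = google.TTS(
    tokenizer=tokenize.basic.SentenceTokenizer(min_sentence_len=4),
)
```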
livekit/plugins/google/utils.py

@@ -9,15 +9,31 @@ from pydantic import TypeAdapter

 from google.genai import types
 from livekit.agents import llm
-from livekit.agents.llm import FunctionTool, utils as llm_utils
+from livekit.agents.llm import utils as llm_utils
+from livekit.agents.llm.tool_context import (
+    FunctionTool,
+    RawFunctionTool,
+    get_raw_function_info,
+    is_function_tool,
+    is_raw_function_tool,
+)

 from .log import logger

 __all__ = ["to_chat_ctx", "to_fnc_ctx"]


-def to_fnc_ctx(fncs: list[FunctionTool]) -> list[types.FunctionDeclaration]:
-    return [_build_gemini_fnc(fnc) for fnc in fncs]
+def to_fnc_ctx(fncs: list[FunctionTool | RawFunctionTool]) -> list[types.FunctionDeclaration]:
+    tools: list[types.FunctionDeclaration] = []
+    for fnc in fncs:
+        if is_raw_function_tool(fnc):
+            info = get_raw_function_info(fnc)
+            tools.append(types.FunctionDeclaration(**info.raw_schema))
+
+        elif is_function_tool(fnc):
+            tools.append(_build_gemini_fnc(fnc))
+
+    return tools


 def get_tool_results_for_realtime(
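`to_fnc_ctx` now handles raw tools: for a `RawFunctionTool` the user-supplied schema dict is passed straight into `types.FunctionDeclaration(**info.raw_schema)` instead of being derived from the Python signature. A hedged sketch using livekit-agents' `function_tool(raw_schema=...)` form — check the installed agents version for the exact decorator signature, and note the schema keys must match Gemini's `FunctionDeclaration` fields:

```python
from livekit.agents.llm import function_tool

@function_tool(raw_schema={
    "name": "get_weather",
    "description": "Look up the current weather for a city",
    "parameters": {
        "type": "object",
        "properties": {"city": {"type": "string"}},
        "required": ["city"],
    },
})
async def get_weather(raw_arguments: dict) -> str:
    # raw tools receive the unparsed arguments dict
    return f"sunny in {raw_arguments['city']}"
```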
livekit/plugins/google/version.py

@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-__version__ = "1.0.20"
+__version__ = "1.0.22"
livekit_plugins_google-1.0.20.dist-info/METADATA → livekit_plugins_google-1.0.22.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: livekit-plugins-google
-Version: 1.0.20
+Version: 1.0.22
 Summary: Agent Framework plugin for services from Google Cloud
 Project-URL: Documentation, https://docs.livekit.io
 Project-URL: Website, https://livekit.io/
@@ -20,9 +20,9 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0
 Requires-Dist: google-auth<3,>=2
 Requires-Dist: google-cloud-speech<3,>=2
-Requires-Dist: google-cloud-texttospeech<3,>=2
-Requires-Dist: google-genai>=1.12.1
-Requires-Dist: livekit-agents>=1.0.20
+Requires-Dist: google-cloud-texttospeech<3,>=2.24
+Requires-Dist: google-genai>=1.14.0
+Requires-Dist: livekit-agents>=1.0.22
 Description-Content-Type: text/markdown

 # Google AI plugin for LiveKit Agents
livekit_plugins_google-1.0.22.dist-info/RECORD (added)

@@ -0,0 +1,16 @@
+livekit/plugins/google/__init__.py,sha256=xain2qUzU-YWhYWsLBkW8Q-szV-htpnzHTqymMPo-j0,1364
+livekit/plugins/google/llm.py,sha256=Kr9qeBZ5Dd0WCCBR_-gM3WWsVRZPCSteK8NpBsg2C5Y,16304
+livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
+livekit/plugins/google/models.py,sha256=maGlEM3hK4-5hMnH9UQMJewA7BZMrnStsFLBNoNVySg,1531
+livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+livekit/plugins/google/stt.py,sha256=2jk-1fHiBT8UW_n3CZsIEdMp2iBnUAlTnmefdUd8rAM,23620
+livekit/plugins/google/tts.py,sha256=FfhNfGtW8drmYDDfLLZDjaIp2GvNiIdoovgtZq4t_l8,14211
+livekit/plugins/google/utils.py,sha256=UBAbddYk7G8Nojg6bSC7_xN2pdl9qhs86HGhKYFuf9M,10509
+livekit/plugins/google/version.py,sha256=-8dkOE2vDSF9WN8VoBrSwU2sb5YBGFuwPnSQXQ-uaYM,601
+livekit/plugins/google/beta/__init__.py,sha256=5PnoG3Ux24bjzMSzmTeSVljE9EINivGcbWUEV6egGnM,216
+livekit/plugins/google/beta/realtime/__init__.py,sha256=_fW2NMN22F-hnQ4xAJ_g5lPbR7CvM_xXzSWlUQY-E-U,188
+livekit/plugins/google/beta/realtime/api_proto.py,sha256=Fyrejs3SG0EjOPCCFLEnWXKEUxCff47PMWk2VsKJm5E,594
+livekit/plugins/google/beta/realtime/realtime_api.py,sha256=yYB5fKXl_aaMH_ZSpfUlfOTUg4eRqqRENLTZhZMfBMc,36253
+livekit_plugins_google-1.0.22.dist-info/METADATA,sha256=S4bQZr4NhWrAI6vyJi299sh5lsD5eVMNfxvN9__xAMY,1908
+livekit_plugins_google-1.0.22.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+livekit_plugins_google-1.0.22.dist-info/RECORD,,

livekit_plugins_google-1.0.20.dist-info/RECORD (removed)

@@ -1,16 +0,0 @@
-livekit/plugins/google/__init__.py,sha256=xain2qUzU-YWhYWsLBkW8Q-szV-htpnzHTqymMPo-j0,1364
-livekit/plugins/google/llm.py,sha256=m_lRoUw4RIO1d-LtNYugl99LUNcA1y4NQ17wX7Vv5j0,16189
-livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
-livekit/plugins/google/models.py,sha256=SGjAumdDK97NNLwMFcqZdKR68f1NoGB2Rk1UP2-imG0,1457
-livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-livekit/plugins/google/stt.py,sha256=2jk-1fHiBT8UW_n3CZsIEdMp2iBnUAlTnmefdUd8rAM,23620
-livekit/plugins/google/tts.py,sha256=29R0ieV5sRPBf5Yi0SPFQk7ZZMbELF30bIL9K_j_Wcg,9100
-livekit/plugins/google/utils.py,sha256=zPzmnR-Rs2I87mT_k5S-PVbbuJMH8S-Hp5QcM4wv8vA,10067
-livekit/plugins/google/version.py,sha256=t4KmPVTpEy1pOJ2GRCA-GNJfCQq_-zHNDBxGj4GKfVk,601
-livekit/plugins/google/beta/__init__.py,sha256=5PnoG3Ux24bjzMSzmTeSVljE9EINivGcbWUEV6egGnM,216
-livekit/plugins/google/beta/realtime/__init__.py,sha256=_fW2NMN22F-hnQ4xAJ_g5lPbR7CvM_xXzSWlUQY-E-U,188
-livekit/plugins/google/beta/realtime/api_proto.py,sha256=Fyrejs3SG0EjOPCCFLEnWXKEUxCff47PMWk2VsKJm5E,594
-livekit/plugins/google/beta/realtime/realtime_api.py,sha256=K_YD2CND3PMGV7c3gJY2UdReeLfsOPtIWDys5EU2T_A,31699
-livekit_plugins_google-1.0.20.dist-info/METADATA,sha256=govmSaj6few3t11vreVNKlH9Ki2YzbRGnN3b3il2f20,1905
-livekit_plugins_google-1.0.20.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-livekit_plugins_google-1.0.20.dist-info/RECORD,,