livekit-plugins-google 1.0.19__py3-none-any.whl → 1.0.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,6 +12,13 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

+ """Google AI plugin for LiveKit Agents
+
+ Supports Gemini, Cloud Speech-to-Text, and Cloud Text-to-Speech.
+
+ See https://docs.livekit.io/agents/integrations/stt/google/ for more information.
+ """
+
  from . import beta
  from .llm import LLM
  from .stt import STT, SpeechStream
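The new module docstring documents the plugin's public surface. A minimal import sketch (assuming `livekit-agents` is installed alongside this plugin, as the dependency list later in this diff requires):

```python
# Sketch: the exports shown in the hunk above.
from livekit.plugins import google

llm = google.LLM()  # Gemini chat model (defaults shown later in this diff)
stt = google.STT()  # Cloud Speech-to-Text
```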
@@ -1,3 +1,12 @@
  from . import realtime

  __all__ = ["realtime"]
+
+ # Cleanup docs of unexported modules
+ _module = dir()
+ NOT_IN_ALL = [m for m in _module if m not in __all__]
+
+ __pdoc__ = {}
+
+ for n in NOT_IN_ALL:
+     __pdoc__[n] = False
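The `__pdoc__` dict is a convention of the pdoc documentation generator: mapping a name to `False` excludes it from generated docs. A condensed sketch of what the added block achieves:

```python
# Equivalent one-liner (pdoc convention): hide every module-level name
# that is not explicitly exported through __all__.
__pdoc__ = {name: False for name in dir() if name not in __all__}
```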
@@ -4,14 +4,16 @@ import asyncio
  import contextlib
  import json
  import os
+ import time
  import weakref
  from collections.abc import Iterator
- from dataclasses import dataclass
+ from dataclasses import dataclass, field

  from google import genai
  from google.genai.live import AsyncSession
  from google.genai.types import (
      AudioTranscriptionConfig,
+     AutomaticActivityDetection,
      Blob,
      Content,
      FunctionDeclaration,
@@ -25,8 +27,10 @@ from google.genai.types import (
      LiveServerToolCall,
      LiveServerToolCallCancellation,
      Modality,
+     ModalityTokenCount,
      Part,
      PrebuiltVoiceConfig,
+     RealtimeInputConfig,
      SessionResumptionConfig,
      SpeechConfig,
      Tool,
@@ -35,19 +39,20 @@ from google.genai.types import (
  )
  from livekit import rtc
  from livekit.agents import llm, utils
+ from livekit.agents.metrics import RealtimeModelMetrics
  from livekit.agents.types import NOT_GIVEN, NotGivenOr
  from livekit.agents.utils import audio as audio_utils, images, is_given
  from livekit.plugins.google.beta.realtime.api_proto import ClientEvents, LiveAPIModels, Voice

  from ...log import logger
- from ...utils import _build_gemini_fnc, get_tool_results_for_realtime, to_chat_ctx
+ from ...utils import get_tool_results_for_realtime, to_chat_ctx, to_fnc_ctx

  INPUT_AUDIO_SAMPLE_RATE = 16000
  INPUT_AUDIO_CHANNELS = 1
  OUTPUT_AUDIO_SAMPLE_RATE = 24000
  OUTPUT_AUDIO_CHANNELS = 1

- DEFAULT_ENCODE_OPTIONS = images.EncodeOptions(
+ DEFAULT_IMAGE_ENCODE_OPTIONS = images.EncodeOptions(
      format="JPEG",
      quality=75,
      resize_options=images.ResizeOptions(width=1024, height=1024, strategy="scale_aspect_fit"),
@@ -80,13 +85,7 @@ class _RealtimeOptions:
      instructions: NotGivenOr[str]
      input_audio_transcription: AudioTranscriptionConfig | None
      output_audio_transcription: AudioTranscriptionConfig | None
-
-
- @dataclass
- class _MessageGeneration:
-     message_id: str
-     text_ch: utils.aio.Chan[str]
-     audio_ch: utils.aio.Chan[rtc.AudioFrame]
+     image_encode_options: NotGivenOr[images.EncodeOptions]


  @dataclass
@@ -94,7 +93,19 @@ class _ResponseGeneration:
      message_ch: utils.aio.Chan[llm.MessageGeneration]
      function_ch: utils.aio.Chan[llm.FunctionCall]

-     messages: dict[str, _MessageGeneration]
+     response_id: str
+     text_ch: utils.aio.Chan[str]
+     audio_ch: utils.aio.Chan[rtc.AudioFrame]
+     input_transcription: str = ""
+
+     _created_timestamp: float = field(default_factory=time.time)
+     """The timestamp when the generation is created"""
+     _first_token_timestamp: float | None = None
+     """The timestamp when the first audio token is received"""
+     _completed_timestamp: float | None = None
+     """The timestamp when the generation is completed"""
+     _done: bool = False
+     """Whether the generation is done (set when the turn is complete)"""


  class RealtimeModel(llm.RealtimeModel):
@@ -102,12 +113,12 @@ class RealtimeModel(llm.RealtimeModel):
          self,
          *,
          instructions: NotGivenOr[str] = NOT_GIVEN,
-         model: LiveAPIModels | str = "gemini-2.0-flash-live-001",
+         model: NotGivenOr[LiveAPIModels | str] = NOT_GIVEN,
          api_key: NotGivenOr[str] = NOT_GIVEN,
          voice: Voice | str = "Puck",
          language: NotGivenOr[str] = NOT_GIVEN,
          modalities: NotGivenOr[list[Modality]] = NOT_GIVEN,
-         vertexai: bool = False,
+         vertexai: NotGivenOr[bool] = NOT_GIVEN,
          project: NotGivenOr[str] = NOT_GIVEN,
          location: NotGivenOr[str] = NOT_GIVEN,
          candidate_count: int = 1,
@@ -119,12 +130,13 @@ class RealtimeModel(llm.RealtimeModel):
          frequency_penalty: NotGivenOr[float] = NOT_GIVEN,
          input_audio_transcription: NotGivenOr[AudioTranscriptionConfig | None] = NOT_GIVEN,
          output_audio_transcription: NotGivenOr[AudioTranscriptionConfig | None] = NOT_GIVEN,
+         image_encode_options: NotGivenOr[images.EncodeOptions] = NOT_GIVEN,
      ) -> None:
          """
          Initializes a RealtimeModel instance for interacting with Google's Realtime API.

          Environment Requirements:
-         - For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of the service account key file.
+         - For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of the service account key file or use any of the other Google Cloud auth methods.
          The Google Cloud project and location can be set via `project` and `location` arguments or the environment variables
          `GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_LOCATION`. By default, the project is inferred from the service account key file,
          and the location defaults to "us-central1".
@@ -134,7 +146,7 @@ class RealtimeModel(llm.RealtimeModel):
              instructions (str, optional): Initial system instructions for the model. Defaults to "".
              api_key (str, optional): Google Gemini API key. If None, will attempt to read from the environment variable GOOGLE_API_KEY.
              modalities (list[Modality], optional): Modalities to use, such as ["TEXT", "AUDIO"]. Defaults to ["AUDIO"].
-             model (str, optional): The name of the model to use. Defaults to "gemini-2.0-flash-live-001".
+             model (str, optional): The name of the model to use. Defaults to "gemini-2.0-flash-live-001" or "gemini-2.0-flash-exp" (vertexai).
              voice (api_proto.Voice, optional): Voice setting for audio outputs. Defaults to "Puck".
              language (str, optional): The language(BCP-47 Code) to use for the API. supported languages - https://ai.google.dev/gemini-api/docs/live#supported-languages
              temperature (float, optional): Sampling temperature for response generation. Defaults to 0.8.
@@ -148,26 +160,48 @@ class RealtimeModel(llm.RealtimeModel):
              frequency_penalty (float, optional): The frequency penalty for response generation
              input_audio_transcription (AudioTranscriptionConfig | None, optional): The configuration for input audio transcription. Defaults to None.)
              output_audio_transcription (AudioTranscriptionConfig | None, optional): The configuration for output audio transcription. Defaults to AudioTranscriptionConfig().
+             image_encode_options (images.EncodeOptions, optional): The configuration for image encoding. Defaults to DEFAULT_ENCODE_OPTIONS.

          Raises:
              ValueError: If the API key is required but not found.
          """  # noqa: E501
+         if not is_given(input_audio_transcription):
+             input_audio_transcription = AudioTranscriptionConfig()
+         if not is_given(output_audio_transcription):
+             output_audio_transcription = AudioTranscriptionConfig()
+
          super().__init__(
              capabilities=llm.RealtimeCapabilities(
                  message_truncation=False,
                  turn_detection=True,
-                 user_transcription=is_given(input_audio_transcription),
+                 user_transcription=input_audio_transcription is not None,
+                 auto_tool_reply_generation=True,
              )
          )

+         if not is_given(model):
+             if vertexai:
+                 model = "gemini-2.0-flash-exp"
+             else:
+                 model = "gemini-2.0-flash-live-001"
+
          gemini_api_key = api_key if is_given(api_key) else os.environ.get("GOOGLE_API_KEY")
          gcp_project = project if is_given(project) else os.environ.get("GOOGLE_CLOUD_PROJECT")
-         gcp_location = location if is_given(location) else os.environ.get("GOOGLE_CLOUD_LOCATION")
+         gcp_location = (
+             location
+             if is_given(location)
+             else os.environ.get("GOOGLE_CLOUD_LOCATION") or "us-central1"
+         )
+         use_vertexai = (
+             vertexai
+             if is_given(vertexai)
+             else os.environ.get("GOOGLE_GENAI_USE_VERTEXAI", "0").lower() in ["true", "1"]
+         )

-         if vertexai:
+         if use_vertexai:
              if not gcp_project or not gcp_location:
                  raise ValueError(
-                     "Project and location are required for VertexAI either via project and location or GOOGLE_CLOUD_PROJECT and GOOGLE_CLOUD_LOCATION environment variables"  # noqa: E501
+                     "Project is required for VertexAI via project kwarg or GOOGLE_CLOUD_PROJECT environment variable"  # noqa: E501
                  )
              gemini_api_key = None  # VertexAI does not require an API key
          else:
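With the new `use_vertexai` resolution above, the backend can now be selected purely through environment variables, and the model name defaults differently per backend. A hedged usage sketch (model names, env var names, and fallbacks all taken from this diff; the surrounding agent wiring is assumed):

```python
import os
from livekit.plugins import google

# Option A: Gemini API with a developer key.
os.environ["GOOGLE_API_KEY"] = "<your-key>"  # placeholder
model = google.beta.realtime.RealtimeModel()  # resolves to "gemini-2.0-flash-live-001"

# Option B: Vertex AI, selected via the env var added in this version.
os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "1"  # "true"/"1" both accepted per the diff
os.environ["GOOGLE_CLOUD_PROJECT"] = "<project-id>"  # required; location falls back to "us-central1"
model = google.beta.realtime.RealtimeModel()  # resolves to "gemini-2.0-flash-exp"
```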
@@ -178,17 +212,12 @@ class RealtimeModel(llm.RealtimeModel):
                      "API key is required for Google API either via api_key or GOOGLE_API_KEY environment variable"  # noqa: E501
                  )

-         if not is_given(input_audio_transcription):
-             input_audio_transcription = None
-         if not is_given(output_audio_transcription):
-             output_audio_transcription = AudioTranscriptionConfig()
-
          self._opts = _RealtimeOptions(
              model=model,
              api_key=gemini_api_key,
              voice=voice,
              response_modalities=modalities,
-             vertexai=vertexai,
+             vertexai=use_vertexai,
              project=gcp_project,
              location=gcp_location,
              candidate_count=candidate_count,
@@ -202,6 +231,7 @@
              input_audio_transcription=input_audio_transcription,
              output_audio_transcription=output_audio_transcription,
              language=language,
+             image_encode_options=image_encode_options,
          )

          self._sessions = weakref.WeakSet[RealtimeSession]()
@@ -262,7 +292,6 @@ class RealtimeSession(llm.RealtimeSession):

          self._session_resumption_handle: str | None = None

-         self._update_lock = asyncio.Lock()
          self._session_lock = asyncio.Lock()

      async def _close_active_session(self) -> None:
@@ -281,55 +310,59 @@
          # reset the msg_ch, do not send messages from previous session
          self._msg_ch = utils.aio.Chan[ClientEvents]()

-     async def update_options(
+     def update_options(
          self,
          *,
          voice: NotGivenOr[str] = NOT_GIVEN,
          temperature: NotGivenOr[float] = NOT_GIVEN,
          tool_choice: NotGivenOr[llm.ToolChoice | None] = NOT_GIVEN,
      ) -> None:
-         async with self._update_lock:
-             should_restart = False
-             if is_given(voice) and self._opts.voice != voice:
-                 self._opts.voice = voice
-                 should_restart = True
+         should_restart = False
+         if is_given(voice) and self._opts.voice != voice:
+             self._opts.voice = voice
+             should_restart = True

-             if is_given(temperature) and self._opts.temperature != temperature:
-                 self._opts.temperature = temperature if is_given(temperature) else NOT_GIVEN
-                 should_restart = True
+         if is_given(temperature) and self._opts.temperature != temperature:
+             self._opts.temperature = temperature if is_given(temperature) else NOT_GIVEN
+             should_restart = True

-             if should_restart:
-                 self._mark_restart_needed()
+         if should_restart:
+             self._mark_restart_needed()

      async def update_instructions(self, instructions: str) -> None:
-         async with self._update_lock:
-             if not is_given(self._opts.instructions) or self._opts.instructions != instructions:
-                 self._opts.instructions = instructions
-                 self._mark_restart_needed()
+         if not is_given(self._opts.instructions) or self._opts.instructions != instructions:
+             self._opts.instructions = instructions
+             self._mark_restart_needed()

      async def update_chat_ctx(self, chat_ctx: llm.ChatContext) -> None:
-         async with self._update_lock:
-             self._chat_ctx = chat_ctx.copy()
-             turns, _ = to_chat_ctx(self._chat_ctx, id(self), ignore_functions=True)
-             tool_results = get_tool_results_for_realtime(self._chat_ctx)
-             # TODO(dz): need to compute delta and then either append or recreate session
+         diff_ops = llm.utils.compute_chat_ctx_diff(self._chat_ctx, chat_ctx)
+
+         if diff_ops.to_remove:
+             logger.warning("Gemini Live does not support removing messages")
+
+         append_ctx = llm.ChatContext.empty()
+         for _, item_id in diff_ops.to_create:
+             item = chat_ctx.get_by_id(item_id)
+             if item:
+                 append_ctx.items.append(item)
+
+         if append_ctx.items:
+             turns, _ = to_chat_ctx(append_ctx, id(self), ignore_functions=True)
+             tool_results = get_tool_results_for_realtime(append_ctx, vertexai=self._opts.vertexai)
              if turns:
                  self._send_client_event(LiveClientContent(turns=turns, turn_complete=False))
              if tool_results:
                  self._send_client_event(tool_results)

      async def update_tools(self, tools: list[llm.FunctionTool]) -> None:
-         async with self._update_lock:
-             new_declarations: list[FunctionDeclaration] = [
-                 _build_gemini_fnc(tool) for tool in tools
-             ]
-             current_tool_names = {f.name for f in self._gemini_declarations}
-             new_tool_names = {f.name for f in new_declarations}
-
-             if current_tool_names != new_tool_names:
-                 self._gemini_declarations = new_declarations
-                 self._tools = llm.ToolContext(tools)
-                 self._mark_restart_needed()
+         new_declarations: list[FunctionDeclaration] = to_fnc_ctx(tools)
+         current_tool_names = {f.name for f in self._gemini_declarations}
+         new_tool_names = {f.name for f in new_declarations}
+
+         if current_tool_names != new_tool_names:
+             self._gemini_declarations = new_declarations
+             self._tools = llm.ToolContext(tools)
+             self._mark_restart_needed()

      @property
      def chat_ctx(self) -> llm.ChatContext:
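`update_chat_ctx` now appends only the delta to the live session instead of replaying the whole context. A sketch of the diff mechanics, assuming `ChatContext.add_message` is available as in livekit-agents 1.x (`compute_chat_ctx_diff`, `get_by_id`, and the `(previous_item_id, item_id)` tuple shape all come from the hunk above):

```python
from livekit.agents import llm

old_ctx = llm.ChatContext.empty()
old_ctx.add_message(role="user", content="hello")

new_ctx = old_ctx.copy()
new_ctx.add_message(role="assistant", content="hi there")

diff_ops = llm.utils.compute_chat_ctx_diff(old_ctx, new_ctx)
# Only the assistant message lands in to_create; nothing is removed,
# so the session would send a single incremental LiveClientContent.
for _, item_id in diff_ops.to_create:
    print(new_ctx.get_by_id(item_id))
```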
@@ -348,7 +381,9 @@
          self._send_client_event(realtime_input)

      def push_video(self, frame: rtc.VideoFrame) -> None:
-         encoded_data = images.encode(frame, DEFAULT_ENCODE_OPTIONS)
+         encoded_data = images.encode(
+             frame, self._opts.image_encode_options or DEFAULT_IMAGE_ENCODE_OPTIONS
+         )
          realtime_input = LiveClientRealtimeInput(
              media_chunks=[Blob(data=encoded_data, mime_type="image/jpeg")]
          )
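With the new `image_encode_options` knob, callers can trade video-frame quality for bandwidth instead of relying on the JPEG/quality-75/1024px default defined earlier in this diff. A hedged sketch (the `EncodeOptions`/`ResizeOptions` parameter names are taken verbatim from the diff):

```python
from livekit.agents.utils import images
from livekit.plugins import google

# Assumption: smaller, lower-quality frames to cut upstream bandwidth
# for everything later sent through push_video().
model = google.beta.realtime.RealtimeModel(
    image_encode_options=images.EncodeOptions(
        format="JPEG",
        quality=50,
        resize_options=images.ResizeOptions(width=512, height=512, strategy="scale_aspect_fit"),
    ),
)
```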
@@ -418,7 +453,7 @@
          self._response_created_futures.clear()

          if self._current_generation:
-             self._finalize_response(closed=True)
+             self._mark_current_generation_done()

      @utils.log_exceptions(logger=logger)
      async def _main_task(self):
@@ -512,7 +547,7 @@
                  break

              async for response in session.receive():
-                 if not self._current_generation and (
+                 if (not self._current_generation or self._current_generation._done) and (
                      response.server_content or response.tool_call
                  ):
                      self._start_new_generation()
@@ -543,7 +578,7 @@
                  logger.error(f"error in receive task: {e}", exc_info=e)
                  self._mark_restart_needed()
              finally:
-                 self._finalize_response(closed=True)
+                 self._mark_current_generation_done()
@@ -580,32 +615,31 @@
              input_audio_transcription=self._opts.input_audio_transcription,
              output_audio_transcription=self._opts.output_audio_transcription,
              session_resumption=SessionResumptionConfig(handle=self._session_resumption_handle),
+             realtime_input_config=RealtimeInputConfig(
+                 automatic_activity_detection=AutomaticActivityDetection(),
+             ),
          )

      def _start_new_generation(self):
-         if self._current_generation:
+         if self._current_generation and not self._current_generation._done:
              logger.warning("starting new generation while another is active. Finalizing previous.")
-             self._finalize_response(closed=True)
+             self._mark_current_generation_done()

          response_id = utils.shortuuid("gemini-turn-")
          self._current_generation = _ResponseGeneration(
              message_ch=utils.aio.Chan[llm.MessageGeneration](),
              function_ch=utils.aio.Chan[llm.FunctionCall](),
-             messages={},
-         )
-
-         item_generation = _MessageGeneration(
-             message_id=response_id,
+             response_id=response_id,
              text_ch=utils.aio.Chan[str](),
              audio_ch=utils.aio.Chan[rtc.AudioFrame](),
+             _created_timestamp=time.time(),
          )
-         self._current_generation.messages[response_id] = item_generation

          self._current_generation.message_ch.send_nowait(
              llm.MessageGeneration(
                  message_id=response_id,
-                 text_stream=item_generation.text_ch,
-                 audio_stream=item_generation.audio_ch,
+                 text_stream=self._current_generation.text_ch,
+                 audio_stream=self._current_generation.audio_ch,
              )
          )
@@ -623,18 +657,18 @@
          self.emit("generation_created", generation_event)

      def _handle_server_content(self, server_content: LiveServerContent):
-         if not self._current_generation:
+         current_gen = self._current_generation
+         if not current_gen:
              logger.warning("received server content but no active generation.")
              return

-         response_id = list(self._current_generation.messages.keys())[0]
-         item_generation = self._current_generation.messages[response_id]
-
          if model_turn := server_content.model_turn:
              for part in model_turn.parts:
                  if part.text:
-                     item_generation.text_ch.send_nowait(part.text)
+                     current_gen.text_ch.send_nowait(part.text)
                  if part.inline_data:
+                     if not current_gen._first_token_timestamp:
+                         current_gen._first_token_timestamp = time.time()
                      frame_data = part.inline_data.data
                      try:
                          frame = rtc.AudioFrame(
@@ -643,46 +677,65 @@
                              num_channels=OUTPUT_AUDIO_CHANNELS,
                              samples_per_channel=len(frame_data) // (2 * OUTPUT_AUDIO_CHANNELS),
                          )
-                         item_generation.audio_ch.send_nowait(frame)
+                         current_gen.audio_ch.send_nowait(frame)
                      except ValueError as e:
                          logger.error(f"Error creating audio frame from Gemini data: {e}")

          if input_transcription := server_content.input_transcription:
-             if input_transcription.text:
+             text = input_transcription.text
+             if text:
+                 if current_gen.input_transcription == "":
+                     # gemini would start with a space, which doesn't make sense
+                     # at beginning of the transcript
+                     text = text.lstrip()
+                 current_gen.input_transcription += text
                  self.emit(
                      "input_audio_transcription_completed",
                      llm.InputTranscriptionCompleted(
-                         item_id=response_id, transcript=input_transcription.text
+                         item_id=current_gen.response_id,
+                         transcript=current_gen.input_transcription,
+                         is_final=False,
                      ),
                  )
-                 self._handle_input_speech_started()

          if output_transcription := server_content.output_transcription:
-             if output_transcription.text:
-                 item_generation.text_ch.send_nowait(output_transcription.text)
+             text = output_transcription.text
+             if text:
+                 current_gen.text_ch.send_nowait(text)
+
+         if server_content.generation_complete:
+             # The only way we'd know that the transcription is complete is by when they are
+             # done with generation
+             if current_gen.input_transcription:
+                 self.emit(
+                     "input_audio_transcription_completed",
+                     llm.InputTranscriptionCompleted(
+                         item_id=current_gen.response_id,
+                         transcript=current_gen.input_transcription,
+                         is_final=True,
+                     ),
+                 )
+             current_gen._completed_timestamp = time.time()

          if server_content.interrupted:
-             self._finalize_response(interrupted=True)
              self._handle_input_speech_started()

          if server_content.turn_complete:
-             self._finalize_response()
+             self._mark_current_generation_done()

-     def _finalize_response(self, interrupted: bool = False, closed: bool = False) -> None:
+     def _mark_current_generation_done(self) -> None:
          if not self._current_generation:
              return

          gen = self._current_generation
-         self._current_generation = None
-
-         for item_generation in gen.messages.values():
-             if not item_generation.text_ch.closed:
-                 item_generation.text_ch.close()
-             if not item_generation.audio_ch.closed:
-                 item_generation.audio_ch.close()
+         if not gen.text_ch.closed:
+             gen.text_ch.close()
+         if not gen.audio_ch.closed:
+             gen.audio_ch.close()

          gen.function_ch.close()
          gen.message_ch.close()
+         gen._done = True

      def _handle_input_speech_started(self):
          self.emit("input_speech_started", llm.InputSpeechStartedEvent())
@@ -703,7 +756,7 @@
                  arguments=arguments,
              )
          )
-         self._finalize_response()
+         self._mark_current_generation_done()

      def _handle_tool_call_cancellation(
          self, tool_call_cancellation: LiveServerToolCallCancellation
@@ -714,8 +767,62 @@
          )

      def _handle_usage_metadata(self, usage_metadata: UsageMetadata):
-         # TODO: handle metrics
-         logger.debug("usage metadata", extra={"usage_metadata": usage_metadata})
+         current_gen = self._current_generation
+         if not current_gen:
+             logger.warning("no active generation to report metrics for")
+             return
+
+         ttft = (
+             current_gen._first_token_timestamp - current_gen._created_timestamp
+             if current_gen._first_token_timestamp
+             else -1
+         )
+         duration = (
+             current_gen._completed_timestamp or time.time()
+         ) - current_gen._created_timestamp
+
+         def _token_details_map(
+             token_details: list[ModalityTokenCount] | None,
+         ) -> dict[Modality, int]:
+             token_details_map = {"audio_tokens": 0, "text_tokens": 0, "image_tokens": 0}
+             if not token_details:
+                 return token_details_map
+
+             for token_detail in token_details:
+                 if token_detail.modality == Modality.AUDIO:
+                     token_details_map["audio_tokens"] += token_detail.token_count
+                 elif token_detail.modality == Modality.TEXT:
+                     token_details_map["text_tokens"] += token_detail.token_count
+                 elif token_detail.modality == Modality.IMAGE:
+                     token_details_map["image_tokens"] += token_detail.token_count
+             return token_details_map
+
+         metrics = RealtimeModelMetrics(
+             label=self._realtime_model._label,
+             request_id=current_gen.response_id,
+             timestamp=current_gen._created_timestamp,
+             duration=duration,
+             ttft=ttft,
+             cancelled=False,
+             input_tokens=usage_metadata.prompt_token_count or 0,
+             output_tokens=usage_metadata.response_token_count or 0,
+             total_tokens=usage_metadata.total_token_count or 0,
+             tokens_per_second=(usage_metadata.response_token_count or 0) / duration,
+             input_token_details=RealtimeModelMetrics.InputTokenDetails(
+                 **_token_details_map(usage_metadata.prompt_tokens_details),
+                 cached_tokens=sum(
+                     token_detail.token_count or 0
+                     for token_detail in usage_metadata.cache_tokens_details or []
+                 ),
+                 cached_tokens_details=RealtimeModelMetrics.CachedTokenDetails(
+                     **_token_details_map(usage_metadata.cache_tokens_details),
+                 ),
+             ),
+             output_token_details=RealtimeModelMetrics.OutputTokenDetails(
+                 **_token_details_map(usage_metadata.response_tokens_details),
+             ),
+         )
+         self.emit("metrics_collected", metrics)

      def _handle_go_away(self, go_away: LiveServerGoAway):
          logger.warning(
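`_handle_usage_metadata` now turns Gemini's `UsageMetadata` into a `RealtimeModelMetrics` event instead of a debug log. A hedged listener sketch (the event name and metric fields come from the hunk above; `session` is assumed to be the `RealtimeSession` created by your agent setup):

```python
from livekit.agents.metrics import RealtimeModelMetrics

def on_metrics(metrics: RealtimeModelMetrics) -> None:
    # ttft is -1 when no audio token was received for the turn.
    print(f"ttft={metrics.ttft:.3f}s tokens/s={metrics.tokens_per_second:.1f}")
    print(f"audio output tokens: {metrics.output_token_details.audio_tokens}")

session.on("metrics_collected", on_metrics)
```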
@@ -62,7 +62,7 @@ class LLM(llm.LLM):
          *,
          model: ChatModels | str = "gemini-2.0-flash-001",
          api_key: NotGivenOr[str] = NOT_GIVEN,
-         vertexai: NotGivenOr[bool] = False,
+         vertexai: NotGivenOr[bool] = NOT_GIVEN,
          project: NotGivenOr[str] = NOT_GIVEN,
          location: NotGivenOr[str] = NOT_GIVEN,
          temperature: NotGivenOr[float] = NOT_GIVEN,
@@ -78,7 +78,7 @@ class LLM(llm.LLM):
          Create a new instance of Google GenAI LLM.

          Environment Requirements:
-         - For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of the service account key file.
+         - For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of the service account key file or use any of the other Google Cloud auth methods.
          The Google Cloud project and location can be set via `project` and `location` arguments or the environment variables
          `GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_LOCATION`. By default, the project is inferred from the service account key file,
          and the location defaults to "us-central1".
@@ -87,9 +87,9 @@ class LLM(llm.LLM):
          Args:
              model (ChatModels | str, optional): The model name to use. Defaults to "gemini-2.0-flash-001".
              api_key (str, optional): The API key for Google Gemini. If not provided, it attempts to read from the `GOOGLE_API_KEY` environment variable.
-             vertexai (bool, optional): Whether to use VertexAI. Defaults to False.
-             project (str, optional): The Google Cloud project to use (only for VertexAI). Defaults to None.
-             location (str, optional): The location to use for VertexAI API requests. Defaults value is "us-central1".
+             vertexai (bool, optional): Whether to use VertexAI. If not provided, it attempts to read from the `GOOGLE_GENAI_USE_VERTEXAI` environment variable. Defaults to False.
+             project (str, optional): The Google Cloud project to use (only for VertexAI). Defaults to None.
+             location (str, optional): The location to use for VertexAI API requests. Defaults value is "us-central1".
              temperature (float, optional): Sampling temperature for response generation. Defaults to 0.8.
              max_output_tokens (int, optional): Maximum number of tokens to generate in the output. Defaults to None.
              top_p (float, optional): The nucleus sampling probability for response generation. Defaults to None.
@@ -101,15 +101,19 @@ class LLM(llm.LLM):
          """  # noqa: E501
          super().__init__()
          gcp_project = project if is_given(project) else os.environ.get("GOOGLE_CLOUD_PROJECT")
-         gcp_location = location if is_given(location) else os.environ.get("GOOGLE_CLOUD_LOCATION")
+         gcp_location = (
+             location
+             if is_given(location)
+             else os.environ.get("GOOGLE_CLOUD_LOCATION") or "us-central1"
+         )
+         use_vertexai = (
+             vertexai
+             if is_given(vertexai)
+             else os.environ.get("GOOGLE_GENAI_USE_VERTEXAI", "0").lower() in ["true", "1"]
+         )
          gemini_api_key = api_key if is_given(api_key) else os.environ.get("GOOGLE_API_KEY")
-         _gac = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
-         if _gac is None:
-             logger.warning(
-                 "`GOOGLE_APPLICATION_CREDENTIALS` environment variable is not set. please set it to the path of the service account key file. Otherwise, use any of the other Google Cloud auth methods."  # noqa: E501
-             )

-         if is_given(vertexai) and vertexai:
+         if use_vertexai:
              if not gcp_project:
                  _, gcp_project = default_async(
                      scopes=["https://www.googleapis.com/auth/cloud-platform"]
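The `LLM` class gains the same environment-driven backend selection as the realtime model, and no longer warns when `GOOGLE_APPLICATION_CREDENTIALS` is unset (Application Default Credentials are resolved via `default_async` instead). A hedged sketch using only the parameters documented in the docstring above:

```python
from livekit.plugins import google

# Gemini API: only GOOGLE_API_KEY (or api_key=) is needed.
llm_gemini = google.LLM(model="gemini-2.0-flash-001")

# Vertex AI: the project can come from Application Default Credentials
# via default_async(); the location falls back to "us-central1".
llm_vertex = google.LLM(vertexai=True)
```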
@@ -144,7 +148,7 @@
              model=model,
              temperature=temperature,
              tool_choice=tool_choice,
-             vertexai=vertexai,
+             vertexai=use_vertexai,
              project=project,
              location=location,
              max_output_tokens=max_output_tokens,
@@ -156,7 +160,7 @@
          )
          self._client = genai.Client(
              api_key=gemini_api_key,
-             vertexai=is_given(vertexai) and vertexai,
+             vertexai=use_vertexai,
              project=gcp_project,
              location=gcp_location,
          )
@@ -241,7 +245,7 @@
              client=self._client,
              model=self._opts.model,
              chat_ctx=chat_ctx,
-             tools=tools,
+             tools=tools or [],
              conn_options=conn_options,
              extra_kwargs=extra,
          )
@@ -256,7 +260,7 @@ class LLMStream(llm.LLMStream):
          model: str | ChatModels,
          chat_ctx: llm.ChatContext,
          conn_options: APIConnectOptions,
-         tools: list[FunctionTool] | None,
+         tools: list[FunctionTool],
          extra_kwargs: dict[str, Any],
      ) -> None:
          super().__init__(llm, chat_ctx=chat_ctx, tools=tools, conn_options=conn_options)
@@ -325,6 +329,7 @@
              usage=llm.CompletionUsage(
                  completion_tokens=usage.candidates_token_count or 0,
                  prompt_tokens=usage.prompt_token_count or 0,
+                 prompt_cached_tokens=usage.cached_content_token_count or 0,
                  total_tokens=usage.total_token_count or 0,
              ),
          )
@@ -95,6 +95,8 @@ SpeechLanguages = Literal[
  Gender = Literal["male", "female", "neutral"]

  ChatModels = Literal[
+     "gemini-2.5-pro-preview-05-06",
+     "gemini-2.5-flash-preview-04-17",
      "gemini-2.0-flash-001",
      "gemini-2.0-flash-lite-preview-02-05",
      "gemini-2.0-pro-exp-02-05",
@@ -103,6 +103,7 @@ class STT(stt.STT):
          credentials_info: NotGivenOr[dict] = NOT_GIVEN,
          credentials_file: NotGivenOr[str] = NOT_GIVEN,
          keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN,
+         use_streaming: NotGivenOr[bool] = NOT_GIVEN,
      ):
          """
          Create a new instance of Google STT.
@@ -125,8 +126,13 @@
              credentials_info(dict): the credentials info to use for recognition (default: None)
              credentials_file(str): the credentials file to use for recognition (default: None)
              keywords(List[tuple[str, float]]): list of keywords to recognize (default: None)
+             use_streaming(bool): whether to use streaming for recognition (default: True)
          """
-         super().__init__(capabilities=stt.STTCapabilities(streaming=True, interim_results=True))
+         if not is_given(use_streaming):
+             use_streaming = True
+         super().__init__(
+             capabilities=stt.STTCapabilities(streaming=use_streaming, interim_results=True)
+         )

          self._location = location
          self._credentials_info = credentials_info
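`use_streaming=False` advertises a non-streaming capability to the agents framework, which can then fall back to single-shot recognition instead of a long-lived gRPC stream (the exact fallback behavior lives in the framework, not this plugin). A hedged sketch:

```python
from livekit.plugins import google

# Default: streaming recognition (STTCapabilities(streaming=True, ...)).
stt_streaming = google.STT()

# New in this version: report a non-streaming capability.
stt_batch = google.STT(use_streaming=False)
```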
@@ -251,7 +257,7 @@
          except DeadlineExceeded:
              raise APITimeoutError() from None
          except GoogleAPICallError as e:
-             raise APIStatusError(e.message, status_code=e.code or -1) from None
+             raise APIStatusError(f"{e.message} {e.details}", status_code=e.code or -1) from e
          except Exception as e:
              raise APIConnectionError() from e

@@ -472,6 +478,7 @@ class SpeechStream(stt.SpeechStream):
                  features=cloud_speech.RecognitionFeatures(
                      enable_automatic_punctuation=self._config.punctuate,
                      enable_word_time_offsets=True,
+                     enable_spoken_punctuation=self._config.spoken_punctuation,
                  ),
              ),
              streaming_features=cloud_speech.StreamingRecognitionFeatures(
@@ -505,7 +512,12 @@
                  except DeadlineExceeded:
                      raise APITimeoutError() from None
                  except GoogleAPICallError as e:
-                     raise APIStatusError(e.message, status_code=e.code or -1) from None
+                     if e.code == 409:
+                         logger.debug("stream timed out, restarting.")
+                     else:
+                         raise APIStatusError(
+                             f"{e.message} {e.details}", status_code=e.code or -1
+                         ) from e
                  except Exception as e:
                      raise APIConnectionError() from e
@@ -9,28 +9,48 @@ from pydantic import TypeAdapter

  from google.genai import types
  from livekit.agents import llm
- from livekit.agents.llm import FunctionTool, utils as llm_utils
+ from livekit.agents.llm import utils as llm_utils
+ from livekit.agents.llm.tool_context import (
+     FunctionTool,
+     RawFunctionTool,
+     get_raw_function_info,
+     is_function_tool,
+     is_raw_function_tool,
+ )

  from .log import logger

  __all__ = ["to_chat_ctx", "to_fnc_ctx"]


- def to_fnc_ctx(fncs: list[FunctionTool]) -> list[types.FunctionDeclaration]:
-     return [_build_gemini_fnc(fnc) for fnc in fncs]
+ def to_fnc_ctx(fncs: list[FunctionTool | RawFunctionTool]) -> list[types.FunctionDeclaration]:
+     tools: list[types.FunctionDeclaration] = []
+     for fnc in fncs:
+         if is_raw_function_tool(fnc):
+             info = get_raw_function_info(fnc)
+             tools.append(types.FunctionDeclaration(**info.raw_schema))

+         elif is_function_tool(fnc):
+             tools.append(_build_gemini_fnc(fnc))

- def get_tool_results_for_realtime(chat_ctx: llm.ChatContext) -> types.LiveClientToolResponse | None:
+     return tools
+
+
+ def get_tool_results_for_realtime(
+     chat_ctx: llm.ChatContext, *, vertexai: bool = False
+ ) -> types.LiveClientToolResponse | None:
      function_responses: list[types.FunctionResponse] = []
      for msg in chat_ctx.items:
          if msg.type == "function_call_output":
-             function_responses.append(
-                 types.FunctionResponse(
-                     id=msg.call_id,
-                     name=msg.name,
-                     response={"output": msg.output},
-                 )
+             res = types.FunctionResponse(
+                 name=msg.name,
+                 response={"output": msg.output},
              )
+             if not vertexai:
+                 # vertexai does not support id in FunctionResponse
+                 # see: https://github.com/googleapis/python-genai/blob/85e00bc/google/genai/_live_converters.py#L1435
+                 res.id = msg.call_id
+             function_responses.append(res)
      return (
          types.LiveClientToolResponse(function_responses=function_responses)
          if function_responses
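`to_fnc_ctx` now accepts raw function tools, whose `raw_schema` is passed straight to Gemini's `FunctionDeclaration`, alongside regular typed tools. A hedged sketch, assuming the `function_tool` decorator and its `raw_schema=` form from livekit-agents 1.x (only `to_fnc_ctx` itself is taken from this diff):

```python
from livekit.agents.llm import function_tool

@function_tool
async def get_weather(city: str) -> str:
    """Look up the current weather for a city."""
    return f"sunny in {city}"

# Raw variant: a Gemini-style declaration passed through verbatim.
@function_tool(
    raw_schema={
        "name": "get_weather_raw",
        "description": "Look up the current weather for a city.",
        "parameters": {"type": "object", "properties": {"city": {"type": "string"}}},
    }
)
async def get_weather_raw(raw_arguments: dict) -> str:
    return f"sunny in {raw_arguments['city']}"

declarations = to_fnc_ctx([get_weather, get_weather_raw])
```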
@@ -175,6 +195,15 @@ class _GeminiJsonSchema:
          schema.pop("title", None)
          schema.pop("default", None)
          schema.pop("additionalProperties", None)
+         schema.pop("$schema", None)
+
+         if (const := schema.pop("const", None)) is not None:
+             # Gemini doesn't support const, but it does support enum with a single value
+             schema["enum"] = [const]
+
+         schema.pop("discriminator", None)
+         schema.pop("examples", None)
+
          if ref := schema.pop("$ref", None):
              key = re.sub(r"^#/\$defs/", "", ref)
              if key in refs_stack:
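The `const` to single-value `enum` rewrite matters because Pydantic emits `const` for single-value `Literal` fields, which Gemini's schema dialect rejects. A small illustration of the input side (standard Pydantic behavior, not code from this package; the exact schema shape varies by Pydantic version):

```python
from typing import Literal
from pydantic import TypeAdapter

schema = TypeAdapter(Literal["celsius"]).json_schema()
print(schema)  # typically {'const': 'celsius'}

# After _GeminiJsonSchema's rewrite above, the same constraint becomes:
# {'enum': ['celsius']}
```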
@@ -12,4 +12,4 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- __version__ = "1.0.19"
+ __version__ = "1.0.21"
@@ -0,0 +1,47 @@
+ Metadata-Version: 2.4
+ Name: livekit-plugins-google
+ Version: 1.0.21
+ Summary: Agent Framework plugin for services from Google Cloud
+ Project-URL: Documentation, https://docs.livekit.io
+ Project-URL: Website, https://livekit.io/
+ Project-URL: Source, https://github.com/livekit/agents
+ Author: LiveKit
+ License-Expression: Apache-2.0
+ Keywords: audio,livekit,realtime,video,webrtc
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: Apache Software License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Topic :: Multimedia :: Sound/Audio
+ Classifier: Topic :: Multimedia :: Video
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Requires-Python: >=3.9.0
+ Requires-Dist: google-auth<3,>=2
+ Requires-Dist: google-cloud-speech<3,>=2
+ Requires-Dist: google-cloud-texttospeech<3,>=2.24
+ Requires-Dist: google-genai>=1.14.0
+ Requires-Dist: livekit-agents>=1.0.21
+ Description-Content-Type: text/markdown
+
+ # Google AI plugin for LiveKit Agents
+
+ Support for Gemini, Gemini Live, Cloud Speech-to-Text, and Cloud Text-to-Speech.
+
+ See [https://docs.livekit.io/agents/integrations/google/](https://docs.livekit.io/agents/integrations/google/) for more information.
+
+ ## Installation
+
+ ```bash
+ pip install livekit-plugins-google
+ ```
+
+ ## Pre-requisites
+
+ For credentials, you'll need a Google Cloud account and obtain the correct credentials. Credentials can be passed directly or via Application Default Credentials as specified in [How Application Default Credentials works](https://cloud.google.com/docs/authentication/application-default-credentials).
+
+ To use the STT and TTS API, you'll need to enable the respective services for your Google Cloud project.
+
+ - Cloud Speech-to-Text API
+ - Cloud Text-to-Speech API
@@ -0,0 +1,16 @@
+ livekit/plugins/google/__init__.py,sha256=xain2qUzU-YWhYWsLBkW8Q-szV-htpnzHTqymMPo-j0,1364
+ livekit/plugins/google/llm.py,sha256=Kr9qeBZ5Dd0WCCBR_-gM3WWsVRZPCSteK8NpBsg2C5Y,16304
+ livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
+ livekit/plugins/google/models.py,sha256=maGlEM3hK4-5hMnH9UQMJewA7BZMrnStsFLBNoNVySg,1531
+ livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ livekit/plugins/google/stt.py,sha256=2jk-1fHiBT8UW_n3CZsIEdMp2iBnUAlTnmefdUd8rAM,23620
+ livekit/plugins/google/tts.py,sha256=29R0ieV5sRPBf5Yi0SPFQk7ZZMbELF30bIL9K_j_Wcg,9100
+ livekit/plugins/google/utils.py,sha256=UBAbddYk7G8Nojg6bSC7_xN2pdl9qhs86HGhKYFuf9M,10509
+ livekit/plugins/google/version.py,sha256=5lzQkS1jEPqreexacwMd18b2EOx7R5m8AQMKtQRBgC4,601
+ livekit/plugins/google/beta/__init__.py,sha256=5PnoG3Ux24bjzMSzmTeSVljE9EINivGcbWUEV6egGnM,216
+ livekit/plugins/google/beta/realtime/__init__.py,sha256=_fW2NMN22F-hnQ4xAJ_g5lPbR7CvM_xXzSWlUQY-E-U,188
+ livekit/plugins/google/beta/realtime/api_proto.py,sha256=Fyrejs3SG0EjOPCCFLEnWXKEUxCff47PMWk2VsKJm5E,594
+ livekit/plugins/google/beta/realtime/realtime_api.py,sha256=yYB5fKXl_aaMH_ZSpfUlfOTUg4eRqqRENLTZhZMfBMc,36253
+ livekit_plugins_google-1.0.21.dist-info/METADATA,sha256=mQA8BfvWhAjp3V9GJA5OsZLzP_Q03UuDbRX2HbcEgtY,1908
+ livekit_plugins_google-1.0.21.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ livekit_plugins_google-1.0.21.dist-info/RECORD,,
@@ -1,99 +0,0 @@
- Metadata-Version: 2.4
- Name: livekit-plugins-google
- Version: 1.0.19
- Summary: Agent Framework plugin for services from Google Cloud
- Project-URL: Documentation, https://docs.livekit.io
- Project-URL: Website, https://livekit.io/
- Project-URL: Source, https://github.com/livekit/agents
- Author: LiveKit
- License-Expression: Apache-2.0
- Keywords: audio,livekit,realtime,video,webrtc
- Classifier: Intended Audience :: Developers
- Classifier: License :: OSI Approved :: Apache Software License
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3 :: Only
- Classifier: Programming Language :: Python :: 3.9
- Classifier: Programming Language :: Python :: 3.10
- Classifier: Topic :: Multimedia :: Sound/Audio
- Classifier: Topic :: Multimedia :: Video
- Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
- Requires-Python: >=3.9.0
- Requires-Dist: google-auth<3,>=2
- Requires-Dist: google-cloud-speech<3,>=2
- Requires-Dist: google-cloud-texttospeech<3,>=2
- Requires-Dist: google-genai>=1.12.1
- Requires-Dist: livekit-agents>=1.0.19
- Description-Content-Type: text/markdown
-
- # LiveKit Plugins Google
-
- Agent Framework plugin for services from Google Cloud. Currently supporting Google's [Speech-to-Text](https://cloud.google.com/speech-to-text) API.
-
- ## Installation
-
- ```bash
- pip install livekit-plugins-google
- ```
-
- ## Pre-requisites
-
- For credentials, you'll need a Google Cloud account and obtain the correct credentials. Credentials can be passed directly or via Application Default Credentials as specified in [How Application Default Credentials works](https://cloud.google.com/docs/authentication/application-default-credentials).
-
- To use the STT and TTS API, you'll need to enable the respective services for your Google Cloud project.
-
- - Cloud Speech-to-Text API
- - Cloud Text-to-Speech API
-
-
- ## Gemini Multimodal Live
-
- Gemini Multimodal Live can be used with the `MultimodalAgent` class. See examples/multimodal_agent/gemini_agent.py for an example.
-
- ### Live Video Input (experimental)
-
- You can push video frames to your Gemini Multimodal Live session alongside the audio automatically handled by the `MultimodalAgent`. The basic approach is to subscribe to the video track, create a video stream, sample frames at a suitable frame rate, and push them into the RealtimeSession:
-
- ```
- # Make sure you subscribe to audio and video tracks
- await ctx.connect(auto_subscribe=AutoSubscribe.SUBSCRIBE_ALL)
-
- # Create your RealtimeModel and store a reference
- model = google.beta.realtime.RealtimeModel(
-     # ...
- )
-
- # Create your MultimodalAgent as usual
- agent = MultimodalAgent(
-     model=model,
-     # ...
- )
-
- # Async method to process the video track and push frames to Gemini
- async def _process_video_track(self, track: Track):
-     video_stream = VideoStream(track)
-     last_frame_time = 0
-
-     async for event in video_stream:
-         current_time = asyncio.get_event_loop().time()
-
-         # Sample at 1 FPS
-         if current_time - last_frame_time < 1.0:
-             continue
-
-         last_frame_time = current_time
-         frame = event.frame
-
-         # Push the frame into the RealtimeSession
-         model.sessions[0].push_video(frame)
-
-     await video_stream.aclose()
-
- # Subscribe to new tracks and process them
- @ctx.room.on("track_subscribed")
- def _on_track_subscribed(track: Track, pub, participant):
-     if track.kind == TrackKind.KIND_VIDEO:
-         asyncio.create_task(self._process_video_track(track))
- ```
-
-
-
1
- livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
2
- livekit/plugins/google/llm.py,sha256=NaaT4Zaw6o98VcUHNrQcZZRkD7DPREd76O8fG9IOpXQ,16190
3
- livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
4
- livekit/plugins/google/models.py,sha256=SGjAumdDK97NNLwMFcqZdKR68f1NoGB2Rk1UP2-imG0,1457
5
- livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- livekit/plugins/google/stt.py,sha256=MADnkh0YKWY4bLRgBwFv4emu4YFO-7EVnhxO--dPTlI,23082
7
- livekit/plugins/google/tts.py,sha256=29R0ieV5sRPBf5Yi0SPFQk7ZZMbELF30bIL9K_j_Wcg,9100
8
- livekit/plugins/google/utils.py,sha256=sPZZg5VHf60kSILUIHGIZyN2CWYwnCGNYICn8Mhcv9g,9534
9
- livekit/plugins/google/version.py,sha256=UDC8ahmGgRkv-qMQUY3QibuuVevGMQ9Fd4yIhcQBZwA,601
10
- livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
11
- livekit/plugins/google/beta/realtime/__init__.py,sha256=_fW2NMN22F-hnQ4xAJ_g5lPbR7CvM_xXzSWlUQY-E-U,188
12
- livekit/plugins/google/beta/realtime/api_proto.py,sha256=Fyrejs3SG0EjOPCCFLEnWXKEUxCff47PMWk2VsKJm5E,594
13
- livekit/plugins/google/beta/realtime/realtime_api.py,sha256=yk202S604Eogp_ssBX2BSbAXV67uUyQzVO-bzLnScrs,31423
14
- livekit_plugins_google-1.0.19.dist-info/METADATA,sha256=HuRBvpT9dX3Mz7YOVhZhgQLm3-qQa2vAf2SRDQ5u1vM,3492
15
- livekit_plugins_google-1.0.19.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
16
- livekit_plugins_google-1.0.19.dist-info/RECORD,,