livekit-plugins-google 1.0.20__py3-none-any.whl → 1.0.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/google/beta/realtime/realtime_api.py +188 -93
- livekit/plugins/google/llm.py +19 -14
- livekit/plugins/google/models.py +2 -0
- livekit/plugins/google/utils.py +19 -3
- livekit/plugins/google/version.py +1 -1
- {livekit_plugins_google-1.0.20.dist-info → livekit_plugins_google-1.0.21.dist-info}/METADATA +4 -4
- {livekit_plugins_google-1.0.20.dist-info → livekit_plugins_google-1.0.21.dist-info}/RECORD +8 -8
- {livekit_plugins_google-1.0.20.dist-info → livekit_plugins_google-1.0.21.dist-info}/WHEEL +0 -0
@@ -4,14 +4,16 @@ import asyncio
|
|
4
4
|
import contextlib
|
5
5
|
import json
|
6
6
|
import os
|
7
|
+
import time
|
7
8
|
import weakref
|
8
9
|
from collections.abc import Iterator
|
9
|
-
from dataclasses import dataclass
|
10
|
+
from dataclasses import dataclass, field
|
10
11
|
|
11
12
|
from google import genai
|
12
13
|
from google.genai.live import AsyncSession
|
13
14
|
from google.genai.types import (
|
14
15
|
AudioTranscriptionConfig,
|
16
|
+
AutomaticActivityDetection,
|
15
17
|
Blob,
|
16
18
|
Content,
|
17
19
|
FunctionDeclaration,
|
@@ -25,8 +27,10 @@ from google.genai.types import (
|
|
25
27
|
LiveServerToolCall,
|
26
28
|
LiveServerToolCallCancellation,
|
27
29
|
Modality,
|
30
|
+
ModalityTokenCount,
|
28
31
|
Part,
|
29
32
|
PrebuiltVoiceConfig,
|
33
|
+
RealtimeInputConfig,
|
30
34
|
SessionResumptionConfig,
|
31
35
|
SpeechConfig,
|
32
36
|
Tool,
|
@@ -35,19 +39,20 @@ from google.genai.types import (
|
|
35
39
|
)
|
36
40
|
from livekit import rtc
|
37
41
|
from livekit.agents import llm, utils
|
42
|
+
from livekit.agents.metrics import RealtimeModelMetrics
|
38
43
|
from livekit.agents.types import NOT_GIVEN, NotGivenOr
|
39
44
|
from livekit.agents.utils import audio as audio_utils, images, is_given
|
40
45
|
from livekit.plugins.google.beta.realtime.api_proto import ClientEvents, LiveAPIModels, Voice
|
41
46
|
|
42
47
|
from ...log import logger
|
43
|
-
from ...utils import
|
48
|
+
from ...utils import get_tool_results_for_realtime, to_chat_ctx, to_fnc_ctx
|
44
49
|
|
45
50
|
INPUT_AUDIO_SAMPLE_RATE = 16000
|
46
51
|
INPUT_AUDIO_CHANNELS = 1
|
47
52
|
OUTPUT_AUDIO_SAMPLE_RATE = 24000
|
48
53
|
OUTPUT_AUDIO_CHANNELS = 1
|
49
54
|
|
50
|
-
|
55
|
+
DEFAULT_IMAGE_ENCODE_OPTIONS = images.EncodeOptions(
|
51
56
|
format="JPEG",
|
52
57
|
quality=75,
|
53
58
|
resize_options=images.ResizeOptions(width=1024, height=1024, strategy="scale_aspect_fit"),
|
@@ -80,13 +85,7 @@ class _RealtimeOptions:
|
|
80
85
|
instructions: NotGivenOr[str]
|
81
86
|
input_audio_transcription: AudioTranscriptionConfig | None
|
82
87
|
output_audio_transcription: AudioTranscriptionConfig | None
|
83
|
-
|
84
|
-
|
85
|
-
@dataclass
|
86
|
-
class _MessageGeneration:
|
87
|
-
message_id: str
|
88
|
-
text_ch: utils.aio.Chan[str]
|
89
|
-
audio_ch: utils.aio.Chan[rtc.AudioFrame]
|
88
|
+
image_encode_options: NotGivenOr[images.EncodeOptions]
|
90
89
|
|
91
90
|
|
92
91
|
@dataclass
|
@@ -94,7 +93,19 @@ class _ResponseGeneration:
|
|
94
93
|
message_ch: utils.aio.Chan[llm.MessageGeneration]
|
95
94
|
function_ch: utils.aio.Chan[llm.FunctionCall]
|
96
95
|
|
97
|
-
|
96
|
+
response_id: str
|
97
|
+
text_ch: utils.aio.Chan[str]
|
98
|
+
audio_ch: utils.aio.Chan[rtc.AudioFrame]
|
99
|
+
input_transcription: str = ""
|
100
|
+
|
101
|
+
_created_timestamp: float = field(default_factory=time.time)
|
102
|
+
"""The timestamp when the generation is created"""
|
103
|
+
_first_token_timestamp: float | None = None
|
104
|
+
"""The timestamp when the first audio token is received"""
|
105
|
+
_completed_timestamp: float | None = None
|
106
|
+
"""The timestamp when the generation is completed"""
|
107
|
+
_done: bool = False
|
108
|
+
"""Whether the generation is done (set when the turn is complete)"""
|
98
109
|
|
99
110
|
|
100
111
|
class RealtimeModel(llm.RealtimeModel):
|
@@ -107,7 +118,7 @@ class RealtimeModel(llm.RealtimeModel):
|
|
107
118
|
voice: Voice | str = "Puck",
|
108
119
|
language: NotGivenOr[str] = NOT_GIVEN,
|
109
120
|
modalities: NotGivenOr[list[Modality]] = NOT_GIVEN,
|
110
|
-
vertexai: bool =
|
121
|
+
vertexai: NotGivenOr[bool] = NOT_GIVEN,
|
111
122
|
project: NotGivenOr[str] = NOT_GIVEN,
|
112
123
|
location: NotGivenOr[str] = NOT_GIVEN,
|
113
124
|
candidate_count: int = 1,
|
@@ -119,12 +130,13 @@ class RealtimeModel(llm.RealtimeModel):
|
|
119
130
|
frequency_penalty: NotGivenOr[float] = NOT_GIVEN,
|
120
131
|
input_audio_transcription: NotGivenOr[AudioTranscriptionConfig | None] = NOT_GIVEN,
|
121
132
|
output_audio_transcription: NotGivenOr[AudioTranscriptionConfig | None] = NOT_GIVEN,
|
133
|
+
image_encode_options: NotGivenOr[images.EncodeOptions] = NOT_GIVEN,
|
122
134
|
) -> None:
|
123
135
|
"""
|
124
136
|
Initializes a RealtimeModel instance for interacting with Google's Realtime API.
|
125
137
|
|
126
138
|
Environment Requirements:
|
127
|
-
- For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of the service account key file.
|
139
|
+
- For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of the service account key file or use any of the other Google Cloud auth methods.
|
128
140
|
The Google Cloud project and location can be set via `project` and `location` arguments or the environment variables
|
129
141
|
`GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_LOCATION`. By default, the project is inferred from the service account key file,
|
130
142
|
and the location defaults to "us-central1".
|
@@ -148,15 +160,22 @@ class RealtimeModel(llm.RealtimeModel):
|
|
148
160
|
frequency_penalty (float, optional): The frequency penalty for response generation
|
149
161
|
input_audio_transcription (AudioTranscriptionConfig | None, optional): The configuration for input audio transcription. Defaults to None.)
|
150
162
|
output_audio_transcription (AudioTranscriptionConfig | None, optional): The configuration for output audio transcription. Defaults to AudioTranscriptionConfig().
|
163
|
+
image_encode_options (images.EncodeOptions, optional): The configuration for image encoding. Defaults to DEFAULT_ENCODE_OPTIONS.
|
151
164
|
|
152
165
|
Raises:
|
153
166
|
ValueError: If the API key is required but not found.
|
154
167
|
""" # noqa: E501
|
168
|
+
if not is_given(input_audio_transcription):
|
169
|
+
input_audio_transcription = AudioTranscriptionConfig()
|
170
|
+
if not is_given(output_audio_transcription):
|
171
|
+
output_audio_transcription = AudioTranscriptionConfig()
|
172
|
+
|
155
173
|
super().__init__(
|
156
174
|
capabilities=llm.RealtimeCapabilities(
|
157
175
|
message_truncation=False,
|
158
176
|
turn_detection=True,
|
159
|
-
user_transcription=
|
177
|
+
user_transcription=input_audio_transcription is not None,
|
178
|
+
auto_tool_reply_generation=True,
|
160
179
|
)
|
161
180
|
)
|
162
181
|
|
@@ -173,8 +192,13 @@ class RealtimeModel(llm.RealtimeModel):
|
|
173
192
|
if is_given(location)
|
174
193
|
else os.environ.get("GOOGLE_CLOUD_LOCATION") or "us-central1"
|
175
194
|
)
|
195
|
+
use_vertexai = (
|
196
|
+
vertexai
|
197
|
+
if is_given(vertexai)
|
198
|
+
else os.environ.get("GOOGLE_GENAI_USE_VERTEXAI", "0").lower() in ["true", "1"]
|
199
|
+
)
|
176
200
|
|
177
|
-
if
|
201
|
+
if use_vertexai:
|
178
202
|
if not gcp_project or not gcp_location:
|
179
203
|
raise ValueError(
|
180
204
|
"Project is required for VertexAI via project kwarg or GOOGLE_CLOUD_PROJECT environment variable" # noqa: E501
|
@@ -188,17 +212,12 @@ class RealtimeModel(llm.RealtimeModel):
|
|
188
212
|
"API key is required for Google API either via api_key or GOOGLE_API_KEY environment variable" # noqa: E501
|
189
213
|
)
|
190
214
|
|
191
|
-
if not is_given(input_audio_transcription):
|
192
|
-
input_audio_transcription = None
|
193
|
-
if not is_given(output_audio_transcription):
|
194
|
-
output_audio_transcription = AudioTranscriptionConfig()
|
195
|
-
|
196
215
|
self._opts = _RealtimeOptions(
|
197
216
|
model=model,
|
198
217
|
api_key=gemini_api_key,
|
199
218
|
voice=voice,
|
200
219
|
response_modalities=modalities,
|
201
|
-
vertexai=
|
220
|
+
vertexai=use_vertexai,
|
202
221
|
project=gcp_project,
|
203
222
|
location=gcp_location,
|
204
223
|
candidate_count=candidate_count,
|
@@ -212,6 +231,7 @@ class RealtimeModel(llm.RealtimeModel):
|
|
212
231
|
input_audio_transcription=input_audio_transcription,
|
213
232
|
output_audio_transcription=output_audio_transcription,
|
214
233
|
language=language,
|
234
|
+
image_encode_options=image_encode_options,
|
215
235
|
)
|
216
236
|
|
217
237
|
self._sessions = weakref.WeakSet[RealtimeSession]()
|
@@ -272,7 +292,6 @@ class RealtimeSession(llm.RealtimeSession):
|
|
272
292
|
|
273
293
|
self._session_resumption_handle: str | None = None
|
274
294
|
|
275
|
-
self._update_lock = asyncio.Lock()
|
276
295
|
self._session_lock = asyncio.Lock()
|
277
296
|
|
278
297
|
async def _close_active_session(self) -> None:
|
@@ -291,57 +310,59 @@ class RealtimeSession(llm.RealtimeSession):
|
|
291
310
|
# reset the msg_ch, do not send messages from previous session
|
292
311
|
self._msg_ch = utils.aio.Chan[ClientEvents]()
|
293
312
|
|
294
|
-
|
313
|
+
def update_options(
|
295
314
|
self,
|
296
315
|
*,
|
297
316
|
voice: NotGivenOr[str] = NOT_GIVEN,
|
298
317
|
temperature: NotGivenOr[float] = NOT_GIVEN,
|
299
318
|
tool_choice: NotGivenOr[llm.ToolChoice | None] = NOT_GIVEN,
|
300
319
|
) -> None:
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
should_restart = True
|
320
|
+
should_restart = False
|
321
|
+
if is_given(voice) and self._opts.voice != voice:
|
322
|
+
self._opts.voice = voice
|
323
|
+
should_restart = True
|
306
324
|
|
307
|
-
|
308
|
-
|
309
|
-
|
325
|
+
if is_given(temperature) and self._opts.temperature != temperature:
|
326
|
+
self._opts.temperature = temperature if is_given(temperature) else NOT_GIVEN
|
327
|
+
should_restart = True
|
310
328
|
|
311
|
-
|
312
|
-
|
329
|
+
if should_restart:
|
330
|
+
self._mark_restart_needed()
|
313
331
|
|
314
332
|
async def update_instructions(self, instructions: str) -> None:
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
self._mark_restart_needed()
|
333
|
+
if not is_given(self._opts.instructions) or self._opts.instructions != instructions:
|
334
|
+
self._opts.instructions = instructions
|
335
|
+
self._mark_restart_needed()
|
319
336
|
|
320
337
|
async def update_chat_ctx(self, chat_ctx: llm.ChatContext) -> None:
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
338
|
+
diff_ops = llm.utils.compute_chat_ctx_diff(self._chat_ctx, chat_ctx)
|
339
|
+
|
340
|
+
if diff_ops.to_remove:
|
341
|
+
logger.warning("Gemini Live does not support removing messages")
|
342
|
+
|
343
|
+
append_ctx = llm.ChatContext.empty()
|
344
|
+
for _, item_id in diff_ops.to_create:
|
345
|
+
item = chat_ctx.get_by_id(item_id)
|
346
|
+
if item:
|
347
|
+
append_ctx.items.append(item)
|
348
|
+
|
349
|
+
if append_ctx.items:
|
350
|
+
turns, _ = to_chat_ctx(append_ctx, id(self), ignore_functions=True)
|
351
|
+
tool_results = get_tool_results_for_realtime(append_ctx, vertexai=self._opts.vertexai)
|
328
352
|
if turns:
|
329
353
|
self._send_client_event(LiveClientContent(turns=turns, turn_complete=False))
|
330
354
|
if tool_results:
|
331
355
|
self._send_client_event(tool_results)
|
332
356
|
|
333
357
|
async def update_tools(self, tools: list[llm.FunctionTool]) -> None:
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
self._gemini_declarations = new_declarations
|
343
|
-
self._tools = llm.ToolContext(tools)
|
344
|
-
self._mark_restart_needed()
|
358
|
+
new_declarations: list[FunctionDeclaration] = to_fnc_ctx(tools)
|
359
|
+
current_tool_names = {f.name for f in self._gemini_declarations}
|
360
|
+
new_tool_names = {f.name for f in new_declarations}
|
361
|
+
|
362
|
+
if current_tool_names != new_tool_names:
|
363
|
+
self._gemini_declarations = new_declarations
|
364
|
+
self._tools = llm.ToolContext(tools)
|
365
|
+
self._mark_restart_needed()
|
345
366
|
|
346
367
|
@property
|
347
368
|
def chat_ctx(self) -> llm.ChatContext:
|
@@ -360,7 +381,9 @@ class RealtimeSession(llm.RealtimeSession):
|
|
360
381
|
self._send_client_event(realtime_input)
|
361
382
|
|
362
383
|
def push_video(self, frame: rtc.VideoFrame) -> None:
|
363
|
-
encoded_data = images.encode(
|
384
|
+
encoded_data = images.encode(
|
385
|
+
frame, self._opts.image_encode_options or DEFAULT_IMAGE_ENCODE_OPTIONS
|
386
|
+
)
|
364
387
|
realtime_input = LiveClientRealtimeInput(
|
365
388
|
media_chunks=[Blob(data=encoded_data, mime_type="image/jpeg")]
|
366
389
|
)
|
@@ -430,7 +453,7 @@ class RealtimeSession(llm.RealtimeSession):
|
|
430
453
|
self._response_created_futures.clear()
|
431
454
|
|
432
455
|
if self._current_generation:
|
433
|
-
self.
|
456
|
+
self._mark_current_generation_done()
|
434
457
|
|
435
458
|
@utils.log_exceptions(logger=logger)
|
436
459
|
async def _main_task(self):
|
@@ -524,7 +547,7 @@ class RealtimeSession(llm.RealtimeSession):
|
|
524
547
|
break
|
525
548
|
|
526
549
|
async for response in session.receive():
|
527
|
-
if not self._current_generation and (
|
550
|
+
if (not self._current_generation or self._current_generation._done) and (
|
528
551
|
response.server_content or response.tool_call
|
529
552
|
):
|
530
553
|
self._start_new_generation()
|
@@ -555,7 +578,7 @@ class RealtimeSession(llm.RealtimeSession):
|
|
555
578
|
logger.error(f"error in receive task: {e}", exc_info=e)
|
556
579
|
self._mark_restart_needed()
|
557
580
|
finally:
|
558
|
-
self.
|
581
|
+
self._mark_current_generation_done()
|
559
582
|
|
560
583
|
def _build_connect_config(self) -> LiveConnectConfig:
|
561
584
|
temp = self._opts.temperature if is_given(self._opts.temperature) else None
|
@@ -592,32 +615,31 @@ class RealtimeSession(llm.RealtimeSession):
|
|
592
615
|
input_audio_transcription=self._opts.input_audio_transcription,
|
593
616
|
output_audio_transcription=self._opts.output_audio_transcription,
|
594
617
|
session_resumption=SessionResumptionConfig(handle=self._session_resumption_handle),
|
618
|
+
realtime_input_config=RealtimeInputConfig(
|
619
|
+
automatic_activity_detection=AutomaticActivityDetection(),
|
620
|
+
),
|
595
621
|
)
|
596
622
|
|
597
623
|
def _start_new_generation(self):
|
598
|
-
if self._current_generation:
|
624
|
+
if self._current_generation and not self._current_generation._done:
|
599
625
|
logger.warning("starting new generation while another is active. Finalizing previous.")
|
600
|
-
self.
|
626
|
+
self._mark_current_generation_done()
|
601
627
|
|
602
628
|
response_id = utils.shortuuid("gemini-turn-")
|
603
629
|
self._current_generation = _ResponseGeneration(
|
604
630
|
message_ch=utils.aio.Chan[llm.MessageGeneration](),
|
605
631
|
function_ch=utils.aio.Chan[llm.FunctionCall](),
|
606
|
-
|
607
|
-
)
|
608
|
-
|
609
|
-
item_generation = _MessageGeneration(
|
610
|
-
message_id=response_id,
|
632
|
+
response_id=response_id,
|
611
633
|
text_ch=utils.aio.Chan[str](),
|
612
634
|
audio_ch=utils.aio.Chan[rtc.AudioFrame](),
|
635
|
+
_created_timestamp=time.time(),
|
613
636
|
)
|
614
|
-
self._current_generation.messages[response_id] = item_generation
|
615
637
|
|
616
638
|
self._current_generation.message_ch.send_nowait(
|
617
639
|
llm.MessageGeneration(
|
618
640
|
message_id=response_id,
|
619
|
-
text_stream=
|
620
|
-
audio_stream=
|
641
|
+
text_stream=self._current_generation.text_ch,
|
642
|
+
audio_stream=self._current_generation.audio_ch,
|
621
643
|
)
|
622
644
|
)
|
623
645
|
|
@@ -635,18 +657,18 @@ class RealtimeSession(llm.RealtimeSession):
|
|
635
657
|
self.emit("generation_created", generation_event)
|
636
658
|
|
637
659
|
def _handle_server_content(self, server_content: LiveServerContent):
|
638
|
-
|
660
|
+
current_gen = self._current_generation
|
661
|
+
if not current_gen:
|
639
662
|
logger.warning("received server content but no active generation.")
|
640
663
|
return
|
641
664
|
|
642
|
-
response_id = list(self._current_generation.messages.keys())[0]
|
643
|
-
item_generation = self._current_generation.messages[response_id]
|
644
|
-
|
645
665
|
if model_turn := server_content.model_turn:
|
646
666
|
for part in model_turn.parts:
|
647
667
|
if part.text:
|
648
|
-
|
668
|
+
current_gen.text_ch.send_nowait(part.text)
|
649
669
|
if part.inline_data:
|
670
|
+
if not current_gen._first_token_timestamp:
|
671
|
+
current_gen._first_token_timestamp = time.time()
|
650
672
|
frame_data = part.inline_data.data
|
651
673
|
try:
|
652
674
|
frame = rtc.AudioFrame(
|
@@ -655,46 +677,65 @@ class RealtimeSession(llm.RealtimeSession):
|
|
655
677
|
num_channels=OUTPUT_AUDIO_CHANNELS,
|
656
678
|
samples_per_channel=len(frame_data) // (2 * OUTPUT_AUDIO_CHANNELS),
|
657
679
|
)
|
658
|
-
|
680
|
+
current_gen.audio_ch.send_nowait(frame)
|
659
681
|
except ValueError as e:
|
660
682
|
logger.error(f"Error creating audio frame from Gemini data: {e}")
|
661
683
|
|
662
684
|
if input_transcription := server_content.input_transcription:
|
663
|
-
|
685
|
+
text = input_transcription.text
|
686
|
+
if text:
|
687
|
+
if current_gen.input_transcription == "":
|
688
|
+
# gemini would start with a space, which doesn't make sense
|
689
|
+
# at beginning of the transcript
|
690
|
+
text = text.lstrip()
|
691
|
+
current_gen.input_transcription += text
|
664
692
|
self.emit(
|
665
693
|
"input_audio_transcription_completed",
|
666
694
|
llm.InputTranscriptionCompleted(
|
667
|
-
item_id=response_id,
|
695
|
+
item_id=current_gen.response_id,
|
696
|
+
transcript=current_gen.input_transcription,
|
697
|
+
is_final=False,
|
668
698
|
),
|
669
699
|
)
|
670
|
-
self._handle_input_speech_started()
|
671
700
|
|
672
701
|
if output_transcription := server_content.output_transcription:
|
673
|
-
|
674
|
-
|
702
|
+
text = output_transcription.text
|
703
|
+
if text:
|
704
|
+
current_gen.text_ch.send_nowait(text)
|
705
|
+
|
706
|
+
if server_content.generation_complete:
|
707
|
+
# The only way we'd know that the transcription is complete is by when they are
|
708
|
+
# done with generation
|
709
|
+
if current_gen.input_transcription:
|
710
|
+
self.emit(
|
711
|
+
"input_audio_transcription_completed",
|
712
|
+
llm.InputTranscriptionCompleted(
|
713
|
+
item_id=current_gen.response_id,
|
714
|
+
transcript=current_gen.input_transcription,
|
715
|
+
is_final=True,
|
716
|
+
),
|
717
|
+
)
|
718
|
+
current_gen._completed_timestamp = time.time()
|
675
719
|
|
676
720
|
if server_content.interrupted:
|
677
|
-
self._finalize_response(interrupted=True)
|
678
721
|
self._handle_input_speech_started()
|
679
722
|
|
680
723
|
if server_content.turn_complete:
|
681
|
-
self.
|
724
|
+
self._mark_current_generation_done()
|
682
725
|
|
683
|
-
def
|
726
|
+
def _mark_current_generation_done(self) -> None:
|
684
727
|
if not self._current_generation:
|
685
728
|
return
|
686
729
|
|
687
730
|
gen = self._current_generation
|
688
|
-
|
689
|
-
|
690
|
-
|
691
|
-
|
692
|
-
item_generation.text_ch.close()
|
693
|
-
if not item_generation.audio_ch.closed:
|
694
|
-
item_generation.audio_ch.close()
|
731
|
+
if not gen.text_ch.closed:
|
732
|
+
gen.text_ch.close()
|
733
|
+
if not gen.audio_ch.closed:
|
734
|
+
gen.audio_ch.close()
|
695
735
|
|
696
736
|
gen.function_ch.close()
|
697
737
|
gen.message_ch.close()
|
738
|
+
gen._done = True
|
698
739
|
|
699
740
|
def _handle_input_speech_started(self):
|
700
741
|
self.emit("input_speech_started", llm.InputSpeechStartedEvent())
|
@@ -715,7 +756,7 @@ class RealtimeSession(llm.RealtimeSession):
|
|
715
756
|
arguments=arguments,
|
716
757
|
)
|
717
758
|
)
|
718
|
-
self.
|
759
|
+
self._mark_current_generation_done()
|
719
760
|
|
720
761
|
def _handle_tool_call_cancellation(
|
721
762
|
self, tool_call_cancellation: LiveServerToolCallCancellation
|
@@ -726,8 +767,62 @@ class RealtimeSession(llm.RealtimeSession):
|
|
726
767
|
)
|
727
768
|
|
728
769
|
def _handle_usage_metadata(self, usage_metadata: UsageMetadata):
|
729
|
-
|
730
|
-
|
770
|
+
current_gen = self._current_generation
|
771
|
+
if not current_gen:
|
772
|
+
logger.warning("no active generation to report metrics for")
|
773
|
+
return
|
774
|
+
|
775
|
+
ttft = (
|
776
|
+
current_gen._first_token_timestamp - current_gen._created_timestamp
|
777
|
+
if current_gen._first_token_timestamp
|
778
|
+
else -1
|
779
|
+
)
|
780
|
+
duration = (
|
781
|
+
current_gen._completed_timestamp or time.time()
|
782
|
+
) - current_gen._created_timestamp
|
783
|
+
|
784
|
+
def _token_details_map(
|
785
|
+
token_details: list[ModalityTokenCount] | None,
|
786
|
+
) -> dict[Modality, int]:
|
787
|
+
token_details_map = {"audio_tokens": 0, "text_tokens": 0, "image_tokens": 0}
|
788
|
+
if not token_details:
|
789
|
+
return token_details_map
|
790
|
+
|
791
|
+
for token_detail in token_details:
|
792
|
+
if token_detail.modality == Modality.AUDIO:
|
793
|
+
token_details_map["audio_tokens"] += token_detail.token_count
|
794
|
+
elif token_detail.modality == Modality.TEXT:
|
795
|
+
token_details_map["text_tokens"] += token_detail.token_count
|
796
|
+
elif token_detail.modality == Modality.IMAGE:
|
797
|
+
token_details_map["image_tokens"] += token_detail.token_count
|
798
|
+
return token_details_map
|
799
|
+
|
800
|
+
metrics = RealtimeModelMetrics(
|
801
|
+
label=self._realtime_model._label,
|
802
|
+
request_id=current_gen.response_id,
|
803
|
+
timestamp=current_gen._created_timestamp,
|
804
|
+
duration=duration,
|
805
|
+
ttft=ttft,
|
806
|
+
cancelled=False,
|
807
|
+
input_tokens=usage_metadata.prompt_token_count or 0,
|
808
|
+
output_tokens=usage_metadata.response_token_count or 0,
|
809
|
+
total_tokens=usage_metadata.total_token_count or 0,
|
810
|
+
tokens_per_second=(usage_metadata.response_token_count or 0) / duration,
|
811
|
+
input_token_details=RealtimeModelMetrics.InputTokenDetails(
|
812
|
+
**_token_details_map(usage_metadata.prompt_tokens_details),
|
813
|
+
cached_tokens=sum(
|
814
|
+
token_detail.token_count or 0
|
815
|
+
for token_detail in usage_metadata.cache_tokens_details or []
|
816
|
+
),
|
817
|
+
cached_tokens_details=RealtimeModelMetrics.CachedTokenDetails(
|
818
|
+
**_token_details_map(usage_metadata.cache_tokens_details),
|
819
|
+
),
|
820
|
+
),
|
821
|
+
output_token_details=RealtimeModelMetrics.OutputTokenDetails(
|
822
|
+
**_token_details_map(usage_metadata.response_tokens_details),
|
823
|
+
),
|
824
|
+
)
|
825
|
+
self.emit("metrics_collected", metrics)
|
731
826
|
|
732
827
|
def _handle_go_away(self, go_away: LiveServerGoAway):
|
733
828
|
logger.warning(
|
livekit/plugins/google/llm.py
CHANGED
@@ -62,7 +62,7 @@ class LLM(llm.LLM):
|
|
62
62
|
*,
|
63
63
|
model: ChatModels | str = "gemini-2.0-flash-001",
|
64
64
|
api_key: NotGivenOr[str] = NOT_GIVEN,
|
65
|
-
vertexai: NotGivenOr[bool] =
|
65
|
+
vertexai: NotGivenOr[bool] = NOT_GIVEN,
|
66
66
|
project: NotGivenOr[str] = NOT_GIVEN,
|
67
67
|
location: NotGivenOr[str] = NOT_GIVEN,
|
68
68
|
temperature: NotGivenOr[float] = NOT_GIVEN,
|
@@ -78,7 +78,7 @@ class LLM(llm.LLM):
|
|
78
78
|
Create a new instance of Google GenAI LLM.
|
79
79
|
|
80
80
|
Environment Requirements:
|
81
|
-
- For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of the service account key file.
|
81
|
+
- For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of the service account key file or use any of the other Google Cloud auth methods.
|
82
82
|
The Google Cloud project and location can be set via `project` and `location` arguments or the environment variables
|
83
83
|
`GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_LOCATION`. By default, the project is inferred from the service account key file,
|
84
84
|
and the location defaults to "us-central1".
|
@@ -87,9 +87,9 @@ class LLM(llm.LLM):
|
|
87
87
|
Args:
|
88
88
|
model (ChatModels | str, optional): The model name to use. Defaults to "gemini-2.0-flash-001".
|
89
89
|
api_key (str, optional): The API key for Google Gemini. If not provided, it attempts to read from the `GOOGLE_API_KEY` environment variable.
|
90
|
-
vertexai (bool, optional): Whether to use VertexAI. Defaults to False.
|
91
|
-
|
92
|
-
|
90
|
+
vertexai (bool, optional): Whether to use VertexAI. If not provided, it attempts to read from the `GOOGLE_GENAI_USE_VERTEXAI` environment variable. Defaults to False.
|
91
|
+
project (str, optional): The Google Cloud project to use (only for VertexAI). Defaults to None.
|
92
|
+
location (str, optional): The location to use for VertexAI API requests. Defaults value is "us-central1".
|
93
93
|
temperature (float, optional): Sampling temperature for response generation. Defaults to 0.8.
|
94
94
|
max_output_tokens (int, optional): Maximum number of tokens to generate in the output. Defaults to None.
|
95
95
|
top_p (float, optional): The nucleus sampling probability for response generation. Defaults to None.
|
@@ -101,15 +101,19 @@ class LLM(llm.LLM):
|
|
101
101
|
""" # noqa: E501
|
102
102
|
super().__init__()
|
103
103
|
gcp_project = project if is_given(project) else os.environ.get("GOOGLE_CLOUD_PROJECT")
|
104
|
-
gcp_location =
|
104
|
+
gcp_location = (
|
105
|
+
location
|
106
|
+
if is_given(location)
|
107
|
+
else os.environ.get("GOOGLE_CLOUD_LOCATION") or "us-central1"
|
108
|
+
)
|
109
|
+
use_vertexai = (
|
110
|
+
vertexai
|
111
|
+
if is_given(vertexai)
|
112
|
+
else os.environ.get("GOOGLE_GENAI_USE_VERTEXAI", "0").lower() in ["true", "1"]
|
113
|
+
)
|
105
114
|
gemini_api_key = api_key if is_given(api_key) else os.environ.get("GOOGLE_API_KEY")
|
106
|
-
_gac = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
|
107
|
-
if _gac is None:
|
108
|
-
logger.warning(
|
109
|
-
"`GOOGLE_APPLICATION_CREDENTIALS` environment variable is not set. please set it to the path of the service account key file. Otherwise, use any of the other Google Cloud auth methods." # noqa: E501
|
110
|
-
)
|
111
115
|
|
112
|
-
if
|
116
|
+
if use_vertexai:
|
113
117
|
if not gcp_project:
|
114
118
|
_, gcp_project = default_async(
|
115
119
|
scopes=["https://www.googleapis.com/auth/cloud-platform"]
|
@@ -144,7 +148,7 @@ class LLM(llm.LLM):
|
|
144
148
|
model=model,
|
145
149
|
temperature=temperature,
|
146
150
|
tool_choice=tool_choice,
|
147
|
-
vertexai=
|
151
|
+
vertexai=use_vertexai,
|
148
152
|
project=project,
|
149
153
|
location=location,
|
150
154
|
max_output_tokens=max_output_tokens,
|
@@ -156,7 +160,7 @@ class LLM(llm.LLM):
|
|
156
160
|
)
|
157
161
|
self._client = genai.Client(
|
158
162
|
api_key=gemini_api_key,
|
159
|
-
vertexai=
|
163
|
+
vertexai=use_vertexai,
|
160
164
|
project=gcp_project,
|
161
165
|
location=gcp_location,
|
162
166
|
)
|
@@ -325,6 +329,7 @@ class LLMStream(llm.LLMStream):
|
|
325
329
|
usage=llm.CompletionUsage(
|
326
330
|
completion_tokens=usage.candidates_token_count or 0,
|
327
331
|
prompt_tokens=usage.prompt_token_count or 0,
|
332
|
+
prompt_cached_tokens=usage.cached_content_token_count or 0,
|
328
333
|
total_tokens=usage.total_token_count or 0,
|
329
334
|
),
|
330
335
|
)
|
livekit/plugins/google/models.py
CHANGED
@@ -95,6 +95,8 @@ SpeechLanguages = Literal[
|
|
95
95
|
Gender = Literal["male", "female", "neutral"]
|
96
96
|
|
97
97
|
ChatModels = Literal[
|
98
|
+
"gemini-2.5-pro-preview-05-06",
|
99
|
+
"gemini-2.5-flash-preview-04-17",
|
98
100
|
"gemini-2.0-flash-001",
|
99
101
|
"gemini-2.0-flash-lite-preview-02-05",
|
100
102
|
"gemini-2.0-pro-exp-02-05",
|
livekit/plugins/google/utils.py
CHANGED
@@ -9,15 +9,31 @@ from pydantic import TypeAdapter
|
|
9
9
|
|
10
10
|
from google.genai import types
|
11
11
|
from livekit.agents import llm
|
12
|
-
from livekit.agents.llm import
|
12
|
+
from livekit.agents.llm import utils as llm_utils
|
13
|
+
from livekit.agents.llm.tool_context import (
|
14
|
+
FunctionTool,
|
15
|
+
RawFunctionTool,
|
16
|
+
get_raw_function_info,
|
17
|
+
is_function_tool,
|
18
|
+
is_raw_function_tool,
|
19
|
+
)
|
13
20
|
|
14
21
|
from .log import logger
|
15
22
|
|
16
23
|
__all__ = ["to_chat_ctx", "to_fnc_ctx"]
|
17
24
|
|
18
25
|
|
19
|
-
def to_fnc_ctx(fncs: list[FunctionTool]) -> list[types.FunctionDeclaration]:
|
20
|
-
|
26
|
+
def to_fnc_ctx(fncs: list[FunctionTool | RawFunctionTool]) -> list[types.FunctionDeclaration]:
|
27
|
+
tools: list[types.FunctionDeclaration] = []
|
28
|
+
for fnc in fncs:
|
29
|
+
if is_raw_function_tool(fnc):
|
30
|
+
info = get_raw_function_info(fnc)
|
31
|
+
tools.append(types.FunctionDeclaration(**info.raw_schema))
|
32
|
+
|
33
|
+
elif is_function_tool(fnc):
|
34
|
+
tools.append(_build_gemini_fnc(fnc))
|
35
|
+
|
36
|
+
return tools
|
21
37
|
|
22
38
|
|
23
39
|
def get_tool_results_for_realtime(
|
{livekit_plugins_google-1.0.20.dist-info → livekit_plugins_google-1.0.21.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: livekit-plugins-google
|
3
|
-
Version: 1.0.
|
3
|
+
Version: 1.0.21
|
4
4
|
Summary: Agent Framework plugin for services from Google Cloud
|
5
5
|
Project-URL: Documentation, https://docs.livekit.io
|
6
6
|
Project-URL: Website, https://livekit.io/
|
@@ -20,9 +20,9 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
20
|
Requires-Python: >=3.9.0
|
21
21
|
Requires-Dist: google-auth<3,>=2
|
22
22
|
Requires-Dist: google-cloud-speech<3,>=2
|
23
|
-
Requires-Dist: google-cloud-texttospeech<3,>=2
|
24
|
-
Requires-Dist: google-genai>=1.
|
25
|
-
Requires-Dist: livekit-agents>=1.0.
|
23
|
+
Requires-Dist: google-cloud-texttospeech<3,>=2.24
|
24
|
+
Requires-Dist: google-genai>=1.14.0
|
25
|
+
Requires-Dist: livekit-agents>=1.0.21
|
26
26
|
Description-Content-Type: text/markdown
|
27
27
|
|
28
28
|
# Google AI plugin for LiveKit Agents
|
@@ -1,16 +1,16 @@
|
|
1
1
|
livekit/plugins/google/__init__.py,sha256=xain2qUzU-YWhYWsLBkW8Q-szV-htpnzHTqymMPo-j0,1364
|
2
|
-
livekit/plugins/google/llm.py,sha256=
|
2
|
+
livekit/plugins/google/llm.py,sha256=Kr9qeBZ5Dd0WCCBR_-gM3WWsVRZPCSteK8NpBsg2C5Y,16304
|
3
3
|
livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
|
4
|
-
livekit/plugins/google/models.py,sha256=
|
4
|
+
livekit/plugins/google/models.py,sha256=maGlEM3hK4-5hMnH9UQMJewA7BZMrnStsFLBNoNVySg,1531
|
5
5
|
livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
6
|
livekit/plugins/google/stt.py,sha256=2jk-1fHiBT8UW_n3CZsIEdMp2iBnUAlTnmefdUd8rAM,23620
|
7
7
|
livekit/plugins/google/tts.py,sha256=29R0ieV5sRPBf5Yi0SPFQk7ZZMbELF30bIL9K_j_Wcg,9100
|
8
|
-
livekit/plugins/google/utils.py,sha256=
|
9
|
-
livekit/plugins/google/version.py,sha256=
|
8
|
+
livekit/plugins/google/utils.py,sha256=UBAbddYk7G8Nojg6bSC7_xN2pdl9qhs86HGhKYFuf9M,10509
|
9
|
+
livekit/plugins/google/version.py,sha256=5lzQkS1jEPqreexacwMd18b2EOx7R5m8AQMKtQRBgC4,601
|
10
10
|
livekit/plugins/google/beta/__init__.py,sha256=5PnoG3Ux24bjzMSzmTeSVljE9EINivGcbWUEV6egGnM,216
|
11
11
|
livekit/plugins/google/beta/realtime/__init__.py,sha256=_fW2NMN22F-hnQ4xAJ_g5lPbR7CvM_xXzSWlUQY-E-U,188
|
12
12
|
livekit/plugins/google/beta/realtime/api_proto.py,sha256=Fyrejs3SG0EjOPCCFLEnWXKEUxCff47PMWk2VsKJm5E,594
|
13
|
-
livekit/plugins/google/beta/realtime/realtime_api.py,sha256=
|
14
|
-
livekit_plugins_google-1.0.
|
15
|
-
livekit_plugins_google-1.0.
|
16
|
-
livekit_plugins_google-1.0.
|
13
|
+
livekit/plugins/google/beta/realtime/realtime_api.py,sha256=yYB5fKXl_aaMH_ZSpfUlfOTUg4eRqqRENLTZhZMfBMc,36253
|
14
|
+
livekit_plugins_google-1.0.21.dist-info/METADATA,sha256=mQA8BfvWhAjp3V9GJA5OsZLzP_Q03UuDbRX2HbcEgtY,1908
|
15
|
+
livekit_plugins_google-1.0.21.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
16
|
+
livekit_plugins_google-1.0.21.dist-info/RECORD,,
|
File without changes
|