livekit-plugins-google 1.0.19__py3-none-any.whl → 1.0.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/google/__init__.py +7 -0
- livekit/plugins/google/beta/__init__.py +9 -0
- livekit/plugins/google/beta/realtime/realtime_api.py +202 -95
- livekit/plugins/google/llm.py +21 -16
- livekit/plugins/google/models.py +2 -0
- livekit/plugins/google/stt.py +15 -3
- livekit/plugins/google/utils.py +39 -10
- livekit/plugins/google/version.py +1 -1
- livekit_plugins_google-1.0.21.dist-info/METADATA +47 -0
- livekit_plugins_google-1.0.21.dist-info/RECORD +16 -0
- livekit_plugins_google-1.0.19.dist-info/METADATA +0 -99
- livekit_plugins_google-1.0.19.dist-info/RECORD +0 -16
- {livekit_plugins_google-1.0.19.dist-info → livekit_plugins_google-1.0.21.dist-info}/WHEEL +0 -0
livekit/plugins/google/__init__.py CHANGED
@@ -12,6 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+"""Google AI plugin for LiveKit Agents
+
+Supports Gemini, Cloud Speech-to-Text, and Cloud Text-to-Speech.
+
+See https://docs.livekit.io/agents/integrations/stt/google/ for more information.
+"""
+
 from . import beta
 from .llm import LLM
 from .stt import STT, SpeechStream
livekit/plugins/google/beta/realtime/realtime_api.py CHANGED
@@ -4,14 +4,16 @@ import asyncio
 import contextlib
 import json
 import os
+import time
 import weakref
 from collections.abc import Iterator
-from dataclasses import dataclass
+from dataclasses import dataclass, field

 from google import genai
 from google.genai.live import AsyncSession
 from google.genai.types import (
     AudioTranscriptionConfig,
+    AutomaticActivityDetection,
     Blob,
     Content,
     FunctionDeclaration,
@@ -25,8 +27,10 @@ from google.genai.types import (
     LiveServerToolCall,
     LiveServerToolCallCancellation,
     Modality,
+    ModalityTokenCount,
     Part,
     PrebuiltVoiceConfig,
+    RealtimeInputConfig,
     SessionResumptionConfig,
     SpeechConfig,
     Tool,
@@ -35,19 +39,20 @@ from google.genai.types import (
 )
 from livekit import rtc
 from livekit.agents import llm, utils
+from livekit.agents.metrics import RealtimeModelMetrics
 from livekit.agents.types import NOT_GIVEN, NotGivenOr
 from livekit.agents.utils import audio as audio_utils, images, is_given
 from livekit.plugins.google.beta.realtime.api_proto import ClientEvents, LiveAPIModels, Voice

 from ...log import logger
-from ...utils import to_chat_ctx, to_fnc_ctx
+from ...utils import get_tool_results_for_realtime, to_chat_ctx, to_fnc_ctx

 INPUT_AUDIO_SAMPLE_RATE = 16000
 INPUT_AUDIO_CHANNELS = 1
 OUTPUT_AUDIO_SAMPLE_RATE = 24000
 OUTPUT_AUDIO_CHANNELS = 1

-DEFAULT_ENCODE_OPTIONS = images.EncodeOptions(
+DEFAULT_IMAGE_ENCODE_OPTIONS = images.EncodeOptions(
     format="JPEG",
     quality=75,
     resize_options=images.ResizeOptions(width=1024, height=1024, strategy="scale_aspect_fit"),
@@ -80,13 +85,7 @@ class _RealtimeOptions:
     instructions: NotGivenOr[str]
     input_audio_transcription: AudioTranscriptionConfig | None
     output_audio_transcription: AudioTranscriptionConfig | None
-
-
-@dataclass
-class _MessageGeneration:
-    message_id: str
-    text_ch: utils.aio.Chan[str]
-    audio_ch: utils.aio.Chan[rtc.AudioFrame]
+    image_encode_options: NotGivenOr[images.EncodeOptions]


 @dataclass
@@ -94,7 +93,19 @@ class _ResponseGeneration:
     message_ch: utils.aio.Chan[llm.MessageGeneration]
     function_ch: utils.aio.Chan[llm.FunctionCall]

-    messages: dict[str, _MessageGeneration]
+    response_id: str
+    text_ch: utils.aio.Chan[str]
+    audio_ch: utils.aio.Chan[rtc.AudioFrame]
+    input_transcription: str = ""
+
+    _created_timestamp: float = field(default_factory=time.time)
+    """The timestamp when the generation is created"""
+    _first_token_timestamp: float | None = None
+    """The timestamp when the first audio token is received"""
+    _completed_timestamp: float | None = None
+    """The timestamp when the generation is completed"""
+    _done: bool = False
+    """Whether the generation is done (set when the turn is complete)"""


 class RealtimeModel(llm.RealtimeModel):
@@ -102,12 +113,12 @@ class RealtimeModel(llm.RealtimeModel):
         self,
         *,
         instructions: NotGivenOr[str] = NOT_GIVEN,
-        model: LiveAPIModels | str = "gemini-2.0-flash-live-001",
+        model: NotGivenOr[LiveAPIModels | str] = NOT_GIVEN,
         api_key: NotGivenOr[str] = NOT_GIVEN,
         voice: Voice | str = "Puck",
         language: NotGivenOr[str] = NOT_GIVEN,
         modalities: NotGivenOr[list[Modality]] = NOT_GIVEN,
-        vertexai: bool = False,
+        vertexai: NotGivenOr[bool] = NOT_GIVEN,
         project: NotGivenOr[str] = NOT_GIVEN,
         location: NotGivenOr[str] = NOT_GIVEN,
         candidate_count: int = 1,
@@ -119,12 +130,13 @@ class RealtimeModel(llm.RealtimeModel):
         frequency_penalty: NotGivenOr[float] = NOT_GIVEN,
         input_audio_transcription: NotGivenOr[AudioTranscriptionConfig | None] = NOT_GIVEN,
         output_audio_transcription: NotGivenOr[AudioTranscriptionConfig | None] = NOT_GIVEN,
+        image_encode_options: NotGivenOr[images.EncodeOptions] = NOT_GIVEN,
     ) -> None:
         """
         Initializes a RealtimeModel instance for interacting with Google's Realtime API.

         Environment Requirements:
-        - For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of the service account key file.
+        - For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of the service account key file or use any of the other Google Cloud auth methods.
         The Google Cloud project and location can be set via `project` and `location` arguments or the environment variables
         `GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_LOCATION`. By default, the project is inferred from the service account key file,
         and the location defaults to "us-central1".
@@ -134,7 +146,7 @@ class RealtimeModel(llm.RealtimeModel):
             instructions (str, optional): Initial system instructions for the model. Defaults to "".
             api_key (str, optional): Google Gemini API key. If None, will attempt to read from the environment variable GOOGLE_API_KEY.
             modalities (list[Modality], optional): Modalities to use, such as ["TEXT", "AUDIO"]. Defaults to ["AUDIO"].
-            model (str, optional): The name of the model to use. Defaults to "gemini-2.0-flash-live-001".
+            model (str, optional): The name of the model to use. Defaults to "gemini-2.0-flash-live-001" or "gemini-2.0-flash-exp" (vertexai).
             voice (api_proto.Voice, optional): Voice setting for audio outputs. Defaults to "Puck".
             language (str, optional): The language(BCP-47 Code) to use for the API. supported languages - https://ai.google.dev/gemini-api/docs/live#supported-languages
             temperature (float, optional): Sampling temperature for response generation. Defaults to 0.8.
@@ -148,26 +160,48 @@ class RealtimeModel(llm.RealtimeModel):
             frequency_penalty (float, optional): The frequency penalty for response generation
             input_audio_transcription (AudioTranscriptionConfig | None, optional): The configuration for input audio transcription. Defaults to None.)
             output_audio_transcription (AudioTranscriptionConfig | None, optional): The configuration for output audio transcription. Defaults to AudioTranscriptionConfig().
+            image_encode_options (images.EncodeOptions, optional): The configuration for image encoding. Defaults to DEFAULT_ENCODE_OPTIONS.

         Raises:
             ValueError: If the API key is required but not found.
         """  # noqa: E501
+        if not is_given(input_audio_transcription):
+            input_audio_transcription = AudioTranscriptionConfig()
+        if not is_given(output_audio_transcription):
+            output_audio_transcription = AudioTranscriptionConfig()
+
         super().__init__(
             capabilities=llm.RealtimeCapabilities(
                 message_truncation=False,
                 turn_detection=True,
-                user_transcription=
+                user_transcription=input_audio_transcription is not None,
+                auto_tool_reply_generation=True,
             )
         )

+        if not is_given(model):
+            if vertexai:
+                model = "gemini-2.0-flash-exp"
+            else:
+                model = "gemini-2.0-flash-live-001"
+
         gemini_api_key = api_key if is_given(api_key) else os.environ.get("GOOGLE_API_KEY")
         gcp_project = project if is_given(project) else os.environ.get("GOOGLE_CLOUD_PROJECT")
-        gcp_location =
+        gcp_location = (
+            location
+            if is_given(location)
+            else os.environ.get("GOOGLE_CLOUD_LOCATION") or "us-central1"
+        )
+        use_vertexai = (
+            vertexai
+            if is_given(vertexai)
+            else os.environ.get("GOOGLE_GENAI_USE_VERTEXAI", "0").lower() in ["true", "1"]
+        )

-        if vertexai:
+        if use_vertexai:
             if not gcp_project or not gcp_location:
                 raise ValueError(
-                    "Project
+                    "Project is required for VertexAI via project kwarg or GOOGLE_CLOUD_PROJECT environment variable"  # noqa: E501
                 )
             gemini_api_key = None  # VertexAI does not require an API key
         else:
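Note: the constructor above now derives its defaults from the environment when arguments are omitted. A minimal, self-contained sketch of that resolution logic (a hypothetical helper written for illustration, not part of the plugin's API):

```python
import os

def resolve_realtime_defaults(
    model: str | None = None,
    vertexai: bool | None = None,
    location: str | None = None,
) -> tuple[str, bool, str]:
    # Vertex AI can be opted into via the GOOGLE_GENAI_USE_VERTEXAI env var.
    use_vertexai = (
        vertexai
        if vertexai is not None
        else os.environ.get("GOOGLE_GENAI_USE_VERTEXAI", "0").lower() in ["true", "1"]
    )
    # The Gemini API and Vertex AI expose the live model under different names.
    if model is None:
        model = "gemini-2.0-flash-exp" if use_vertexai else "gemini-2.0-flash-live-001"
    gcp_location = location or os.environ.get("GOOGLE_CLOUD_LOCATION") or "us-central1"
    return model, use_vertexai, gcp_location

# With no relevant env vars set:
print(resolve_realtime_defaults())
# ('gemini-2.0-flash-live-001', False, 'us-central1')
```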
@@ -178,17 +212,12 @@ class RealtimeModel(llm.RealtimeModel):
                     "API key is required for Google API either via api_key or GOOGLE_API_KEY environment variable"  # noqa: E501
                 )

-        if not is_given(input_audio_transcription):
-            input_audio_transcription = None
-        if not is_given(output_audio_transcription):
-            output_audio_transcription = AudioTranscriptionConfig()
-
         self._opts = _RealtimeOptions(
             model=model,
             api_key=gemini_api_key,
             voice=voice,
             response_modalities=modalities,
-            vertexai=vertexai,
+            vertexai=use_vertexai,
             project=gcp_project,
             location=gcp_location,
             candidate_count=candidate_count,
@@ -202,6 +231,7 @@ class RealtimeModel(llm.RealtimeModel):
             input_audio_transcription=input_audio_transcription,
             output_audio_transcription=output_audio_transcription,
             language=language,
+            image_encode_options=image_encode_options,
         )

         self._sessions = weakref.WeakSet[RealtimeSession]()
@@ -262,7 +292,6 @@ class RealtimeSession(llm.RealtimeSession):

         self._session_resumption_handle: str | None = None

-        self._update_lock = asyncio.Lock()
         self._session_lock = asyncio.Lock()

     async def _close_active_session(self) -> None:
@@ -281,55 +310,59 @@ class RealtimeSession(llm.RealtimeSession):
         # reset the msg_ch, do not send messages from previous session
         self._msg_ch = utils.aio.Chan[ClientEvents]()

-
+    def update_options(
         self,
         *,
         voice: NotGivenOr[str] = NOT_GIVEN,
         temperature: NotGivenOr[float] = NOT_GIVEN,
         tool_choice: NotGivenOr[llm.ToolChoice | None] = NOT_GIVEN,
     ) -> None:
-
-
-
-
-        should_restart = True
+        should_restart = False
+        if is_given(voice) and self._opts.voice != voice:
+            self._opts.voice = voice
+            should_restart = True

-
-
-
+        if is_given(temperature) and self._opts.temperature != temperature:
+            self._opts.temperature = temperature if is_given(temperature) else NOT_GIVEN
+            should_restart = True

-
-
+        if should_restart:
+            self._mark_restart_needed()

     async def update_instructions(self, instructions: str) -> None:
-
-
-
-        self._mark_restart_needed()
+        if not is_given(self._opts.instructions) or self._opts.instructions != instructions:
+            self._opts.instructions = instructions
+            self._mark_restart_needed()

     async def update_chat_ctx(self, chat_ctx: llm.ChatContext) -> None:
-
-
-
-
-
+        diff_ops = llm.utils.compute_chat_ctx_diff(self._chat_ctx, chat_ctx)
+
+        if diff_ops.to_remove:
+            logger.warning("Gemini Live does not support removing messages")
+
+        append_ctx = llm.ChatContext.empty()
+        for _, item_id in diff_ops.to_create:
+            item = chat_ctx.get_by_id(item_id)
+            if item:
+                append_ctx.items.append(item)
+
+        if append_ctx.items:
+            turns, _ = to_chat_ctx(append_ctx, id(self), ignore_functions=True)
+            tool_results = get_tool_results_for_realtime(append_ctx, vertexai=self._opts.vertexai)
         if turns:
             self._send_client_event(LiveClientContent(turns=turns, turn_complete=False))
         if tool_results:
             self._send_client_event(tool_results)

     async def update_tools(self, tools: list[llm.FunctionTool]) -> None:
-
-
-
-
-
-
-
-
-        self._gemini_declarations = new_declarations
-        self._tools = llm.ToolContext(tools)
-        self._mark_restart_needed()
+        new_declarations: list[FunctionDeclaration] = to_fnc_ctx(tools)
+        current_tool_names = {f.name for f in self._gemini_declarations}
+        new_tool_names = {f.name for f in new_declarations}
+
+        if current_tool_names != new_tool_names:
+            self._gemini_declarations = new_declarations
+            self._tools = llm.ToolContext(tools)
+            self._mark_restart_needed()

     @property
     def chat_ctx(self) -> llm.ChatContext:
@@ -348,7 +381,9 @@ class RealtimeSession(llm.RealtimeSession):
         self._send_client_event(realtime_input)

     def push_video(self, frame: rtc.VideoFrame) -> None:
-        encoded_data = images.encode(frame, DEFAULT_ENCODE_OPTIONS)
+        encoded_data = images.encode(
+            frame, self._opts.image_encode_options or DEFAULT_IMAGE_ENCODE_OPTIONS
+        )
         realtime_input = LiveClientRealtimeInput(
             media_chunks=[Blob(data=encoded_data, mime_type="image/jpeg")]
         )
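Note: `push_video` now honors a caller-supplied encoder configuration, falling back to `DEFAULT_IMAGE_ENCODE_OPTIONS`. A sketch of passing custom options (the values here are illustrative, not recommendations):

```python
from livekit.agents.utils import images
from livekit.plugins import google

# Smaller frames and lower JPEG quality reduce bandwidth for video input.
model = google.beta.realtime.RealtimeModel(
    image_encode_options=images.EncodeOptions(
        format="JPEG",
        quality=50,
        resize_options=images.ResizeOptions(width=512, height=512, strategy="scale_aspect_fit"),
    ),
)
```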
@@ -418,7 +453,7 @@ class RealtimeSession(llm.RealtimeSession):
         self._response_created_futures.clear()

         if self._current_generation:
-            self.
+            self._mark_current_generation_done()

     @utils.log_exceptions(logger=logger)
     async def _main_task(self):
@@ -512,7 +547,7 @@ class RealtimeSession(llm.RealtimeSession):
                     break

                 async for response in session.receive():
-                    if not self._current_generation and (
+                    if (not self._current_generation or self._current_generation._done) and (
                         response.server_content or response.tool_call
                     ):
                         self._start_new_generation()
@@ -543,7 +578,7 @@ class RealtimeSession(llm.RealtimeSession):
                 logger.error(f"error in receive task: {e}", exc_info=e)
                 self._mark_restart_needed()
             finally:
-                self.
+                self._mark_current_generation_done()

     def _build_connect_config(self) -> LiveConnectConfig:
         temp = self._opts.temperature if is_given(self._opts.temperature) else None
@@ -580,32 +615,31 @@ class RealtimeSession(llm.RealtimeSession):
             input_audio_transcription=self._opts.input_audio_transcription,
             output_audio_transcription=self._opts.output_audio_transcription,
             session_resumption=SessionResumptionConfig(handle=self._session_resumption_handle),
+            realtime_input_config=RealtimeInputConfig(
+                automatic_activity_detection=AutomaticActivityDetection(),
+            ),
         )

     def _start_new_generation(self):
-        if self._current_generation:
+        if self._current_generation and not self._current_generation._done:
             logger.warning("starting new generation while another is active. Finalizing previous.")
-            self.
+            self._mark_current_generation_done()

         response_id = utils.shortuuid("gemini-turn-")
         self._current_generation = _ResponseGeneration(
             message_ch=utils.aio.Chan[llm.MessageGeneration](),
             function_ch=utils.aio.Chan[llm.FunctionCall](),
-            messages={},
-        )
-
-        item_generation = _MessageGeneration(
-            message_id=response_id,
+            response_id=response_id,
             text_ch=utils.aio.Chan[str](),
             audio_ch=utils.aio.Chan[rtc.AudioFrame](),
+            _created_timestamp=time.time(),
         )
-        self._current_generation.messages[response_id] = item_generation

         self._current_generation.message_ch.send_nowait(
             llm.MessageGeneration(
                 message_id=response_id,
-                text_stream=item_generation.text_ch,
-                audio_stream=item_generation.audio_ch,
+                text_stream=self._current_generation.text_ch,
+                audio_stream=self._current_generation.audio_ch,
             )
         )
@@ -623,18 +657,18 @@ class RealtimeSession(llm.RealtimeSession):
         self.emit("generation_created", generation_event)

     def _handle_server_content(self, server_content: LiveServerContent):
-        if not self._current_generation:
+        current_gen = self._current_generation
+        if not current_gen:
             logger.warning("received server content but no active generation.")
             return

-        response_id = list(self._current_generation.messages.keys())[0]
-        item_generation = self._current_generation.messages[response_id]
-
         if model_turn := server_content.model_turn:
             for part in model_turn.parts:
                 if part.text:
-                    item_generation.text_ch.send_nowait(part.text)
+                    current_gen.text_ch.send_nowait(part.text)
                 if part.inline_data:
+                    if not current_gen._first_token_timestamp:
+                        current_gen._first_token_timestamp = time.time()
                     frame_data = part.inline_data.data
                     try:
                         frame = rtc.AudioFrame(
@@ -643,46 +677,65 @@ class RealtimeSession(llm.RealtimeSession):
                             num_channels=OUTPUT_AUDIO_CHANNELS,
                             samples_per_channel=len(frame_data) // (2 * OUTPUT_AUDIO_CHANNELS),
                         )
-                        item_generation.audio_ch.send_nowait(frame)
+                        current_gen.audio_ch.send_nowait(frame)
                     except ValueError as e:
                         logger.error(f"Error creating audio frame from Gemini data: {e}")

         if input_transcription := server_content.input_transcription:
-
+            text = input_transcription.text
+            if text:
+                if current_gen.input_transcription == "":
+                    # gemini would start with a space, which doesn't make sense
+                    # at beginning of the transcript
+                    text = text.lstrip()
+                current_gen.input_transcription += text
             self.emit(
                 "input_audio_transcription_completed",
                 llm.InputTranscriptionCompleted(
-                    item_id=response_id,
+                    item_id=current_gen.response_id,
+                    transcript=current_gen.input_transcription,
+                    is_final=False,
                 ),
             )
-            self._handle_input_speech_started()

         if output_transcription := server_content.output_transcription:
-
-
+            text = output_transcription.text
+            if text:
+                current_gen.text_ch.send_nowait(text)
+
+        if server_content.generation_complete:
+            # The only way we'd know that the transcription is complete is by when they are
+            # done with generation
+            if current_gen.input_transcription:
+                self.emit(
+                    "input_audio_transcription_completed",
+                    llm.InputTranscriptionCompleted(
+                        item_id=current_gen.response_id,
+                        transcript=current_gen.input_transcription,
+                        is_final=True,
+                    ),
+                )
+            current_gen._completed_timestamp = time.time()

         if server_content.interrupted:
-            self._finalize_response(interrupted=True)
             self._handle_input_speech_started()

         if server_content.turn_complete:
-            self.
+            self._mark_current_generation_done()

-    def
+    def _mark_current_generation_done(self) -> None:
         if not self._current_generation:
             return

         gen = self._current_generation
-
-
-
-
-        item_generation.text_ch.close()
-        if not item_generation.audio_ch.closed:
-            item_generation.audio_ch.close()
+        if not gen.text_ch.closed:
+            gen.text_ch.close()
+        if not gen.audio_ch.closed:
+            gen.audio_ch.close()

         gen.function_ch.close()
         gen.message_ch.close()
+        gen._done = True

     def _handle_input_speech_started(self):
         self.emit("input_speech_started", llm.InputSpeechStartedEvent())
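Note: the input-transcription handling above accumulates streamed chunks and strips the leading space Gemini emits on the first one. A standalone illustration:

```python
chunks = [" Hello", " there", "!"]  # as streamed by the server
transcript = ""
for text in chunks:
    if transcript == "":
        # drop the leading space on the first chunk
        text = text.lstrip()
    transcript += text
print(transcript)  # Hello there!
```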
@@ -703,7 +756,7 @@ class RealtimeSession(llm.RealtimeSession):
                 arguments=arguments,
             )
         )
-        self.
+        self._mark_current_generation_done()

     def _handle_tool_call_cancellation(
         self, tool_call_cancellation: LiveServerToolCallCancellation
@@ -714,8 +767,62 @@ class RealtimeSession(llm.RealtimeSession):
         )

     def _handle_usage_metadata(self, usage_metadata: UsageMetadata):
-
-
+        current_gen = self._current_generation
+        if not current_gen:
+            logger.warning("no active generation to report metrics for")
+            return
+
+        ttft = (
+            current_gen._first_token_timestamp - current_gen._created_timestamp
+            if current_gen._first_token_timestamp
+            else -1
+        )
+        duration = (
+            current_gen._completed_timestamp or time.time()
+        ) - current_gen._created_timestamp
+
+        def _token_details_map(
+            token_details: list[ModalityTokenCount] | None,
+        ) -> dict[Modality, int]:
+            token_details_map = {"audio_tokens": 0, "text_tokens": 0, "image_tokens": 0}
+            if not token_details:
+                return token_details_map
+
+            for token_detail in token_details:
+                if token_detail.modality == Modality.AUDIO:
+                    token_details_map["audio_tokens"] += token_detail.token_count
+                elif token_detail.modality == Modality.TEXT:
+                    token_details_map["text_tokens"] += token_detail.token_count
+                elif token_detail.modality == Modality.IMAGE:
+                    token_details_map["image_tokens"] += token_detail.token_count
+            return token_details_map
+
+        metrics = RealtimeModelMetrics(
+            label=self._realtime_model._label,
+            request_id=current_gen.response_id,
+            timestamp=current_gen._created_timestamp,
+            duration=duration,
+            ttft=ttft,
+            cancelled=False,
+            input_tokens=usage_metadata.prompt_token_count or 0,
+            output_tokens=usage_metadata.response_token_count or 0,
+            total_tokens=usage_metadata.total_token_count or 0,
+            tokens_per_second=(usage_metadata.response_token_count or 0) / duration,
+            input_token_details=RealtimeModelMetrics.InputTokenDetails(
+                **_token_details_map(usage_metadata.prompt_tokens_details),
+                cached_tokens=sum(
+                    token_detail.token_count or 0
+                    for token_detail in usage_metadata.cache_tokens_details or []
+                ),
+                cached_tokens_details=RealtimeModelMetrics.CachedTokenDetails(
+                    **_token_details_map(usage_metadata.cache_tokens_details),
+                ),
+            ),
+            output_token_details=RealtimeModelMetrics.OutputTokenDetails(
+                **_token_details_map(usage_metadata.response_tokens_details),
+            ),
+        )
+        self.emit("metrics_collected", metrics)

     def _handle_go_away(self, go_away: LiveServerGoAway):
         logger.warning(
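Note: `_handle_usage_metadata` above means each completed turn now emits a `RealtimeModelMetrics` event. A sketch of consuming it, assuming `session` is a connected `RealtimeSession` obtained from your agent wiring:

```python
from livekit.agents.metrics import RealtimeModelMetrics

def attach_metrics_logger(session) -> None:
    # Log per-turn latency and token usage as reported by the diff above.
    @session.on("metrics_collected")
    def _on_metrics(metrics: RealtimeModelMetrics) -> None:
        print(
            f"turn {metrics.request_id}: ttft={metrics.ttft:.3f}s "
            f"duration={metrics.duration:.3f}s total_tokens={metrics.total_tokens}"
        )
```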
livekit/plugins/google/llm.py CHANGED
@@ -62,7 +62,7 @@ class LLM(llm.LLM):
         *,
         model: ChatModels | str = "gemini-2.0-flash-001",
         api_key: NotGivenOr[str] = NOT_GIVEN,
-        vertexai: NotGivenOr[bool] = False,
+        vertexai: NotGivenOr[bool] = NOT_GIVEN,
         project: NotGivenOr[str] = NOT_GIVEN,
         location: NotGivenOr[str] = NOT_GIVEN,
         temperature: NotGivenOr[float] = NOT_GIVEN,
@@ -78,7 +78,7 @@ class LLM(llm.LLM):
         Create a new instance of Google GenAI LLM.

         Environment Requirements:
-        - For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of the service account key file.
+        - For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of the service account key file or use any of the other Google Cloud auth methods.
         The Google Cloud project and location can be set via `project` and `location` arguments or the environment variables
         `GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_LOCATION`. By default, the project is inferred from the service account key file,
         and the location defaults to "us-central1".
@@ -87,9 +87,9 @@ class LLM(llm.LLM):
         Args:
             model (ChatModels | str, optional): The model name to use. Defaults to "gemini-2.0-flash-001".
             api_key (str, optional): The API key for Google Gemini. If not provided, it attempts to read from the `GOOGLE_API_KEY` environment variable.
-            vertexai (bool, optional): Whether to use VertexAI. Defaults to False.
-
-
+            vertexai (bool, optional): Whether to use VertexAI. If not provided, it attempts to read from the `GOOGLE_GENAI_USE_VERTEXAI` environment variable. Defaults to False.
+            project (str, optional): The Google Cloud project to use (only for VertexAI). Defaults to None.
+            location (str, optional): The location to use for VertexAI API requests. Defaults value is "us-central1".
             temperature (float, optional): Sampling temperature for response generation. Defaults to 0.8.
             max_output_tokens (int, optional): Maximum number of tokens to generate in the output. Defaults to None.
             top_p (float, optional): The nucleus sampling probability for response generation. Defaults to None.
@@ -101,15 +101,19 @@ class LLM(llm.LLM):
         """  # noqa: E501
         super().__init__()
         gcp_project = project if is_given(project) else os.environ.get("GOOGLE_CLOUD_PROJECT")
-        gcp_location =
+        gcp_location = (
+            location
+            if is_given(location)
+            else os.environ.get("GOOGLE_CLOUD_LOCATION") or "us-central1"
+        )
+        use_vertexai = (
+            vertexai
+            if is_given(vertexai)
+            else os.environ.get("GOOGLE_GENAI_USE_VERTEXAI", "0").lower() in ["true", "1"]
+        )
         gemini_api_key = api_key if is_given(api_key) else os.environ.get("GOOGLE_API_KEY")
-        _gac = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
-        if _gac is None:
-            logger.warning(
-                "`GOOGLE_APPLICATION_CREDENTIALS` environment variable is not set. please set it to the path of the service account key file. Otherwise, use any of the other Google Cloud auth methods."  # noqa: E501
-            )

-        if vertexai:
+        if use_vertexai:
             if not gcp_project:
                 _, gcp_project = default_async(
                     scopes=["https://www.googleapis.com/auth/cloud-platform"]
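Note: like the realtime model, `LLM` now resolves Vertex AI usage from `GOOGLE_GENAI_USE_VERTEXAI` instead of warning about a missing `GOOGLE_APPLICATION_CREDENTIALS`. A sketch of the environment-driven setup (project and location values are placeholders):

```python
import os
from livekit.plugins import google

os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "true"
os.environ["GOOGLE_CLOUD_PROJECT"] = "my-gcp-project"  # placeholder
os.environ["GOOGLE_CLOUD_LOCATION"] = "us-central1"

# vertexai, project, and location are all picked up from the environment.
llm = google.LLM(model="gemini-2.0-flash-001")
```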
@@ -144,7 +148,7 @@ class LLM(llm.LLM):
             model=model,
             temperature=temperature,
             tool_choice=tool_choice,
-            vertexai=vertexai,
+            vertexai=use_vertexai,
             project=project,
             location=location,
             max_output_tokens=max_output_tokens,
@@ -156,7 +160,7 @@ class LLM(llm.LLM):
         )
         self._client = genai.Client(
             api_key=gemini_api_key,
-            vertexai=vertexai,
+            vertexai=use_vertexai,
             project=gcp_project,
             location=gcp_location,
         )
@@ -241,7 +245,7 @@ class LLM(llm.LLM):
             client=self._client,
             model=self._opts.model,
             chat_ctx=chat_ctx,
-            tools=tools,
+            tools=tools or [],
             conn_options=conn_options,
             extra_kwargs=extra,
         )
@@ -256,7 +260,7 @@ class LLMStream(llm.LLMStream):
         model: str | ChatModels,
         chat_ctx: llm.ChatContext,
         conn_options: APIConnectOptions,
-        tools: list[FunctionTool]
+        tools: list[FunctionTool],
         extra_kwargs: dict[str, Any],
     ) -> None:
         super().__init__(llm, chat_ctx=chat_ctx, tools=tools, conn_options=conn_options)
@@ -325,6 +329,7 @@ class LLMStream(llm.LLMStream):
                 usage=llm.CompletionUsage(
                     completion_tokens=usage.candidates_token_count or 0,
                     prompt_tokens=usage.prompt_token_count or 0,
+                    prompt_cached_tokens=usage.cached_content_token_count or 0,
                     total_tokens=usage.total_token_count or 0,
                 ),
             )
livekit/plugins/google/models.py CHANGED
@@ -95,6 +95,8 @@ SpeechLanguages = Literal[
 Gender = Literal["male", "female", "neutral"]

 ChatModels = Literal[
+    "gemini-2.5-pro-preview-05-06",
+    "gemini-2.5-flash-preview-04-17",
     "gemini-2.0-flash-001",
     "gemini-2.0-flash-lite-preview-02-05",
     "gemini-2.0-pro-exp-02-05",
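Note: the two new literals make the Gemini 2.5 previews valid values anywhere `ChatModels` is accepted, e.g. in the LLM constructor (API key assumed via `GOOGLE_API_KEY`):

```python
from livekit.plugins import google

llm = google.LLM(model="gemini-2.5-flash-preview-04-17")
```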
livekit/plugins/google/stt.py CHANGED
@@ -103,6 +103,7 @@ class STT(stt.STT):
         credentials_info: NotGivenOr[dict] = NOT_GIVEN,
         credentials_file: NotGivenOr[str] = NOT_GIVEN,
         keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN,
+        use_streaming: NotGivenOr[bool] = NOT_GIVEN,
     ):
         """
         Create a new instance of Google STT.
@@ -125,8 +126,13 @@ class STT(stt.STT):
             credentials_info(dict): the credentials info to use for recognition (default: None)
             credentials_file(str): the credentials file to use for recognition (default: None)
             keywords(List[tuple[str, float]]): list of keywords to recognize (default: None)
+            use_streaming(bool): whether to use streaming for recognition (default: True)
         """
-        super().__init__(capabilities=stt.STTCapabilities(streaming=True, interim_results=True))
+        if not is_given(use_streaming):
+            use_streaming = True
+        super().__init__(
+            capabilities=stt.STTCapabilities(streaming=use_streaming, interim_results=True)
+        )

         self._location = location
         self._credentials_info = credentials_info
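Note: `use_streaming=False` makes the instance advertise `streaming=False` in its STT capabilities; the agent framework can then treat it as a non-streaming recognizer (e.g. wrapping it with its stream-adapter plumbing). A sketch:

```python
from livekit.plugins import google

# Batch (non-streaming) recognition; streaming remains the default.
stt = google.STT(use_streaming=False)
```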
@@ -251,7 +257,7 @@ class STT(stt.STT):
         except DeadlineExceeded:
             raise APITimeoutError() from None
         except GoogleAPICallError as e:
-            raise APIStatusError(e.message, status_code=e.code or -1) from
+            raise APIStatusError(f"{e.message} {e.details}", status_code=e.code or -1) from e
         except Exception as e:
             raise APIConnectionError() from e
@@ -472,6 +478,7 @@ class SpeechStream(stt.SpeechStream):
                     features=cloud_speech.RecognitionFeatures(
                         enable_automatic_punctuation=self._config.punctuate,
                         enable_word_time_offsets=True,
+                        enable_spoken_punctuation=self._config.spoken_punctuation,
                     ),
                 ),
                 streaming_features=cloud_speech.StreamingRecognitionFeatures(
@@ -505,7 +512,12 @@ class SpeechStream(stt.SpeechStream):
         except DeadlineExceeded:
             raise APITimeoutError() from None
         except GoogleAPICallError as e:
-
+            if e.code == 409:
+                logger.debug("stream timed out, restarting.")
+            else:
+                raise APIStatusError(
+                    f"{e.message} {e.details}", status_code=e.code or -1
+                ) from e
         except Exception as e:
             raise APIConnectionError() from e
livekit/plugins/google/utils.py CHANGED
@@ -9,28 +9,48 @@ from pydantic import TypeAdapter

 from google.genai import types
 from livekit.agents import llm
-from livekit.agents.llm import FunctionTool, utils as llm_utils
+from livekit.agents.llm import utils as llm_utils
+from livekit.agents.llm.tool_context import (
+    FunctionTool,
+    RawFunctionTool,
+    get_raw_function_info,
+    is_function_tool,
+    is_raw_function_tool,
+)

 from .log import logger

 __all__ = ["to_chat_ctx", "to_fnc_ctx"]


-def to_fnc_ctx(fncs: list[FunctionTool]) -> list[types.FunctionDeclaration]:
-    return [_build_gemini_fnc(fnc) for fnc in fncs]
+def to_fnc_ctx(fncs: list[FunctionTool | RawFunctionTool]) -> list[types.FunctionDeclaration]:
+    tools: list[types.FunctionDeclaration] = []
+    for fnc in fncs:
+        if is_raw_function_tool(fnc):
+            info = get_raw_function_info(fnc)
+            tools.append(types.FunctionDeclaration(**info.raw_schema))

+        elif is_function_tool(fnc):
+            tools.append(_build_gemini_fnc(fnc))

-def get_tool_results_for_realtime(chat_ctx: llm.ChatContext) -> types.LiveClientToolResponse | None:
+    return tools
+
+
+def get_tool_results_for_realtime(
+    chat_ctx: llm.ChatContext, *, vertexai: bool = False
+) -> types.LiveClientToolResponse | None:
     function_responses: list[types.FunctionResponse] = []
     for msg in chat_ctx.items:
         if msg.type == "function_call_output":
-            function_responses.append(
-                types.FunctionResponse(
-                    id=msg.call_id,
-                    name=msg.name,
-                    response={"output": msg.output},
-                )
+            res = types.FunctionResponse(
+                name=msg.name,
+                response={"output": msg.output},
             )
+            if not vertexai:
+                # vertexai does not support id in FunctionResponse
+                # see: https://github.com/googleapis/python-genai/blob/85e00bc/google/genai/_live_converters.py#L1435
+                res.id = msg.call_id
+            function_responses.append(res)
     return (
         types.LiveClientToolResponse(function_responses=function_responses)
         if function_responses
@@ -175,6 +195,15 @@ class _GeminiJsonSchema:
             schema.pop("title", None)
             schema.pop("default", None)
             schema.pop("additionalProperties", None)
+            schema.pop("$schema", None)
+
+            if (const := schema.pop("const", None)) is not None:
+                # Gemini doesn't support const, but it does support enum with a single value
+                schema["enum"] = [const]
+
+            schema.pop("discriminator", None)
+            schema.pop("examples", None)
+
             if ref := schema.pop("$ref", None):
                 key = re.sub(r"^#/\$defs/", "", ref)
                 if key in refs_stack:
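Note: `_GeminiJsonSchema` now rewrites `const` into a single-value `enum`, since Gemini's schema dialect lacks `const`. A standalone illustration of the transform:

```python
schema = {
    "type": "object",
    "properties": {"kind": {"type": "string", "const": "image"}},
}

prop = schema["properties"]["kind"]
if (const := prop.pop("const", None)) is not None:
    prop["enum"] = [const]

print(schema)
# {'type': 'object', 'properties': {'kind': {'type': 'string', 'enum': ['image']}}}
```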
livekit_plugins_google-1.0.21.dist-info/METADATA ADDED
@@ -0,0 +1,47 @@
+Metadata-Version: 2.4
+Name: livekit-plugins-google
+Version: 1.0.21
+Summary: Agent Framework plugin for services from Google Cloud
+Project-URL: Documentation, https://docs.livekit.io
+Project-URL: Website, https://livekit.io/
+Project-URL: Source, https://github.com/livekit/agents
+Author: LiveKit
+License-Expression: Apache-2.0
+Keywords: audio,livekit,realtime,video,webrtc
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Topic :: Multimedia :: Sound/Audio
+Classifier: Topic :: Multimedia :: Video
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.9.0
+Requires-Dist: google-auth<3,>=2
+Requires-Dist: google-cloud-speech<3,>=2
+Requires-Dist: google-cloud-texttospeech<3,>=2.24
+Requires-Dist: google-genai>=1.14.0
+Requires-Dist: livekit-agents>=1.0.21
+Description-Content-Type: text/markdown
+
+# Google AI plugin for LiveKit Agents
+
+Support for Gemini, Gemini Live, Cloud Speech-to-Text, and Cloud Text-to-Speech.
+
+See [https://docs.livekit.io/agents/integrations/google/](https://docs.livekit.io/agents/integrations/google/) for more information.
+
+## Installation
+
+```bash
+pip install livekit-plugins-google
+```
+
+## Pre-requisites
+
+For credentials, you'll need a Google Cloud account and obtain the correct credentials. Credentials can be passed directly or via Application Default Credentials as specified in [How Application Default Credentials works](https://cloud.google.com/docs/authentication/application-default-credentials).
+
+To use the STT and TTS API, you'll need to enable the respective services for your Google Cloud project.
+
+- Cloud Speech-to-Text API
+- Cloud Text-to-Speech API
livekit_plugins_google-1.0.21.dist-info/RECORD ADDED
@@ -0,0 +1,16 @@
+livekit/plugins/google/__init__.py,sha256=xain2qUzU-YWhYWsLBkW8Q-szV-htpnzHTqymMPo-j0,1364
+livekit/plugins/google/llm.py,sha256=Kr9qeBZ5Dd0WCCBR_-gM3WWsVRZPCSteK8NpBsg2C5Y,16304
+livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
+livekit/plugins/google/models.py,sha256=maGlEM3hK4-5hMnH9UQMJewA7BZMrnStsFLBNoNVySg,1531
+livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+livekit/plugins/google/stt.py,sha256=2jk-1fHiBT8UW_n3CZsIEdMp2iBnUAlTnmefdUd8rAM,23620
+livekit/plugins/google/tts.py,sha256=29R0ieV5sRPBf5Yi0SPFQk7ZZMbELF30bIL9K_j_Wcg,9100
+livekit/plugins/google/utils.py,sha256=UBAbddYk7G8Nojg6bSC7_xN2pdl9qhs86HGhKYFuf9M,10509
+livekit/plugins/google/version.py,sha256=5lzQkS1jEPqreexacwMd18b2EOx7R5m8AQMKtQRBgC4,601
+livekit/plugins/google/beta/__init__.py,sha256=5PnoG3Ux24bjzMSzmTeSVljE9EINivGcbWUEV6egGnM,216
+livekit/plugins/google/beta/realtime/__init__.py,sha256=_fW2NMN22F-hnQ4xAJ_g5lPbR7CvM_xXzSWlUQY-E-U,188
+livekit/plugins/google/beta/realtime/api_proto.py,sha256=Fyrejs3SG0EjOPCCFLEnWXKEUxCff47PMWk2VsKJm5E,594
+livekit/plugins/google/beta/realtime/realtime_api.py,sha256=yYB5fKXl_aaMH_ZSpfUlfOTUg4eRqqRENLTZhZMfBMc,36253
+livekit_plugins_google-1.0.21.dist-info/METADATA,sha256=mQA8BfvWhAjp3V9GJA5OsZLzP_Q03UuDbRX2HbcEgtY,1908
+livekit_plugins_google-1.0.21.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+livekit_plugins_google-1.0.21.dist-info/RECORD,,
livekit_plugins_google-1.0.19.dist-info/METADATA REMOVED
@@ -1,99 +0,0 @@
-Metadata-Version: 2.4
-Name: livekit-plugins-google
-Version: 1.0.19
-Summary: Agent Framework plugin for services from Google Cloud
-Project-URL: Documentation, https://docs.livekit.io
-Project-URL: Website, https://livekit.io/
-Project-URL: Source, https://github.com/livekit/agents
-Author: LiveKit
-License-Expression: Apache-2.0
-Keywords: audio,livekit,realtime,video,webrtc
-Classifier: Intended Audience :: Developers
-Classifier: License :: OSI Approved :: Apache Software License
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Topic :: Multimedia :: Sound/Audio
-Classifier: Topic :: Multimedia :: Video
-Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
-Requires-Python: >=3.9.0
-Requires-Dist: google-auth<3,>=2
-Requires-Dist: google-cloud-speech<3,>=2
-Requires-Dist: google-cloud-texttospeech<3,>=2
-Requires-Dist: google-genai>=1.12.1
-Requires-Dist: livekit-agents>=1.0.19
-Description-Content-Type: text/markdown
-
-# LiveKit Plugins Google
-
-Agent Framework plugin for services from Google Cloud. Currently supporting Google's [Speech-to-Text](https://cloud.google.com/speech-to-text) API.
-
-## Installation
-
-```bash
-pip install livekit-plugins-google
-```
-
-## Pre-requisites
-
-For credentials, you'll need a Google Cloud account and obtain the correct credentials. Credentials can be passed directly or via Application Default Credentials as specified in [How Application Default Credentials works](https://cloud.google.com/docs/authentication/application-default-credentials).
-
-To use the STT and TTS API, you'll need to enable the respective services for your Google Cloud project.
-
-- Cloud Speech-to-Text API
-- Cloud Text-to-Speech API
-
-
-## Gemini Multimodal Live
-
-Gemini Multimodal Live can be used with the `MultimodalAgent` class. See examples/multimodal_agent/gemini_agent.py for an example.
-
-### Live Video Input (experimental)
-
-You can push video frames to your Gemini Multimodal Live session alongside the audio automatically handled by the `MultimodalAgent`. The basic approach is to subscribe to the video track, create a video stream, sample frames at a suitable frame rate, and push them into the RealtimeSession:
-
-```
-# Make sure you subscribe to audio and video tracks
-await ctx.connect(auto_subscribe=AutoSubscribe.SUBSCRIBE_ALL)
-
-# Create your RealtimeModel and store a reference
-model = google.beta.realtime.RealtimeModel(
-    # ...
-)
-
-# Create your MultimodalAgent as usual
-agent = MultimodalAgent(
-    model=model,
-    # ...
-)
-
-# Async method to process the video track and push frames to Gemini
-async def _process_video_track(self, track: Track):
-    video_stream = VideoStream(track)
-    last_frame_time = 0
-
-    async for event in video_stream:
-        current_time = asyncio.get_event_loop().time()
-
-        # Sample at 1 FPS
-        if current_time - last_frame_time < 1.0:
-            continue
-
-        last_frame_time = current_time
-        frame = event.frame
-
-        # Push the frame into the RealtimeSession
-        model.sessions[0].push_video(frame)
-
-    await video_stream.aclose()
-
-# Subscribe to new tracks and process them
-@ctx.room.on("track_subscribed")
-def _on_track_subscribed(track: Track, pub, participant):
-    if track.kind == TrackKind.KIND_VIDEO:
-        asyncio.create_task(self._process_video_track(track))
-```
-
-
-
livekit_plugins_google-1.0.19.dist-info/RECORD REMOVED
@@ -1,16 +0,0 @@
-livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
-livekit/plugins/google/llm.py,sha256=NaaT4Zaw6o98VcUHNrQcZZRkD7DPREd76O8fG9IOpXQ,16190
-livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
-livekit/plugins/google/models.py,sha256=SGjAumdDK97NNLwMFcqZdKR68f1NoGB2Rk1UP2-imG0,1457
-livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-livekit/plugins/google/stt.py,sha256=MADnkh0YKWY4bLRgBwFv4emu4YFO-7EVnhxO--dPTlI,23082
-livekit/plugins/google/tts.py,sha256=29R0ieV5sRPBf5Yi0SPFQk7ZZMbELF30bIL9K_j_Wcg,9100
-livekit/plugins/google/utils.py,sha256=sPZZg5VHf60kSILUIHGIZyN2CWYwnCGNYICn8Mhcv9g,9534
-livekit/plugins/google/version.py,sha256=UDC8ahmGgRkv-qMQUY3QibuuVevGMQ9Fd4yIhcQBZwA,601
-livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
-livekit/plugins/google/beta/realtime/__init__.py,sha256=_fW2NMN22F-hnQ4xAJ_g5lPbR7CvM_xXzSWlUQY-E-U,188
-livekit/plugins/google/beta/realtime/api_proto.py,sha256=Fyrejs3SG0EjOPCCFLEnWXKEUxCff47PMWk2VsKJm5E,594
-livekit/plugins/google/beta/realtime/realtime_api.py,sha256=yk202S604Eogp_ssBX2BSbAXV67uUyQzVO-bzLnScrs,31423
-livekit_plugins_google-1.0.19.dist-info/METADATA,sha256=HuRBvpT9dX3Mz7YOVhZhgQLm3-qQa2vAf2SRDQ5u1vM,3492
-livekit_plugins_google-1.0.19.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-livekit_plugins_google-1.0.19.dist-info/RECORD,,
{livekit_plugins_google-1.0.19.dist-info → livekit_plugins_google-1.0.21.dist-info}/WHEEL
File without changes