livekit-plugins-google 1.0.16__py3-none-any.whl → 1.0.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/google/beta/realtime/realtime_api.py +367 -233
- livekit/plugins/google/tts.py +4 -4
- livekit/plugins/google/utils.py +6 -4
- livekit/plugins/google/version.py +1 -1
- {livekit_plugins_google-1.0.16.dist-info → livekit_plugins_google-1.0.18.dist-info}/METADATA +3 -3
- {livekit_plugins_google-1.0.16.dist-info → livekit_plugins_google-1.0.18.dist-info}/RECORD +7 -7
- {livekit_plugins_google-1.0.16.dist-info → livekit_plugins_google-1.0.18.dist-info}/WHEEL +0 -0
@@ -1,12 +1,15 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import asyncio
|
4
|
+
import contextlib
|
4
5
|
import json
|
5
6
|
import os
|
6
7
|
import weakref
|
8
|
+
from collections.abc import Iterator
|
7
9
|
from dataclasses import dataclass
|
8
10
|
|
9
11
|
from google import genai
|
12
|
+
from google.genai.live import AsyncSession
|
10
13
|
from google.genai.types import (
|
11
14
|
AudioTranscriptionConfig,
|
12
15
|
Blob,
|
@@ -23,6 +26,7 @@ from google.genai.types import (
|
|
23
26
|
Modality,
|
24
27
|
Part,
|
25
28
|
PrebuiltVoiceConfig,
|
29
|
+
SessionResumptionConfig,
|
26
30
|
SpeechConfig,
|
27
31
|
Tool,
|
28
32
|
UsageMetadata,
|
@@ -31,15 +35,16 @@ from google.genai.types import (
|
|
31
35
|
from livekit import rtc
|
32
36
|
from livekit.agents import llm, utils
|
33
37
|
from livekit.agents.types import NOT_GIVEN, NotGivenOr
|
34
|
-
from livekit.agents.utils import images, is_given
|
38
|
+
from livekit.agents.utils import audio as audio_utils, images, is_given
|
39
|
+
from livekit.plugins.google.beta.realtime.api_proto import ClientEvents, LiveAPIModels, Voice
|
35
40
|
|
36
41
|
from ...log import logger
|
37
42
|
from ...utils import _build_gemini_fnc, get_tool_results_for_realtime, to_chat_ctx
|
38
|
-
from .api_proto import ClientEvents, LiveAPIModels, Voice
|
39
43
|
|
40
44
|
INPUT_AUDIO_SAMPLE_RATE = 16000
|
45
|
+
INPUT_AUDIO_CHANNELS = 1
|
41
46
|
OUTPUT_AUDIO_SAMPLE_RATE = 24000
|
42
|
-
|
47
|
+
OUTPUT_AUDIO_CHANNELS = 1
|
43
48
|
|
44
49
|
DEFAULT_ENCODE_OPTIONS = images.EncodeOptions(
|
45
50
|
format="JPEG",
|
@@ -59,6 +64,7 @@ class _RealtimeOptions:
|
|
59
64
|
model: LiveAPIModels | str
|
60
65
|
api_key: str | None
|
61
66
|
voice: Voice | str
|
67
|
+
language: NotGivenOr[str]
|
62
68
|
response_modalities: NotGivenOr[list[Modality]]
|
63
69
|
vertexai: bool
|
64
70
|
project: str | None
|
@@ -98,6 +104,7 @@ class RealtimeModel(llm.RealtimeModel):
|
|
98
104
|
model: LiveAPIModels | str = "gemini-2.0-flash-live-001",
|
99
105
|
api_key: NotGivenOr[str] = NOT_GIVEN,
|
100
106
|
voice: Voice | str = "Puck",
|
107
|
+
language: NotGivenOr[str] = NOT_GIVEN,
|
101
108
|
modalities: NotGivenOr[list[Modality]] = NOT_GIVEN,
|
102
109
|
vertexai: bool = False,
|
103
110
|
project: NotGivenOr[str] = NOT_GIVEN,
|
@@ -126,8 +133,9 @@ class RealtimeModel(llm.RealtimeModel):
|
|
126
133
|
instructions (str, optional): Initial system instructions for the model. Defaults to "".
|
127
134
|
api_key (str, optional): Google Gemini API key. If None, will attempt to read from the environment variable GOOGLE_API_KEY.
|
128
135
|
modalities (list[Modality], optional): Modalities to use, such as ["TEXT", "AUDIO"]. Defaults to ["AUDIO"].
|
129
|
-
model (str, optional): The name of the model to use. Defaults to "gemini-2.0-flash-
|
136
|
+
model (str, optional): The name of the model to use. Defaults to "gemini-2.0-flash-live-001".
|
130
137
|
voice (api_proto.Voice, optional): Voice setting for audio outputs. Defaults to "Puck".
|
138
|
+
language (str, optional): The language(BCP-47 Code) to use for the API. supported languages - https://ai.google.dev/gemini-api/docs/live#supported-languages
|
131
139
|
temperature (float, optional): Sampling temperature for response generation. Defaults to 0.8.
|
132
140
|
vertexai (bool, optional): Whether to use VertexAI for the API. Defaults to False.
|
133
141
|
project (str, optional): The project id to use for the API. Defaults to None. (for vertexai)
|
@@ -147,20 +155,20 @@ class RealtimeModel(llm.RealtimeModel):
|
|
147
155
|
capabilities=llm.RealtimeCapabilities(
|
148
156
|
message_truncation=False,
|
149
157
|
turn_detection=True,
|
150
|
-
user_transcription=
|
158
|
+
user_transcription=is_given(input_audio_transcription),
|
151
159
|
)
|
152
160
|
)
|
153
161
|
|
154
162
|
gemini_api_key = api_key if is_given(api_key) else os.environ.get("GOOGLE_API_KEY")
|
155
163
|
gcp_project = project if is_given(project) else os.environ.get("GOOGLE_CLOUD_PROJECT")
|
156
164
|
gcp_location = location if is_given(location) else os.environ.get("GOOGLE_CLOUD_LOCATION")
|
165
|
+
|
157
166
|
if vertexai:
|
158
167
|
if not gcp_project or not gcp_location:
|
159
168
|
raise ValueError(
|
160
169
|
"Project and location are required for VertexAI either via project and location or GOOGLE_CLOUD_PROJECT and GOOGLE_CLOUD_LOCATION environment variables" # noqa: E501
|
161
170
|
)
|
162
171
|
gemini_api_key = None # VertexAI does not require an API key
|
163
|
-
|
164
172
|
else:
|
165
173
|
gcp_project = None
|
166
174
|
gcp_location = None
|
@@ -192,6 +200,7 @@ class RealtimeModel(llm.RealtimeModel):
|
|
192
200
|
instructions=instructions,
|
193
201
|
input_audio_transcription=input_audio_transcription,
|
194
202
|
output_audio_transcription=output_audio_transcription,
|
203
|
+
language=language,
|
195
204
|
)
|
196
205
|
|
197
206
|
self._sessions = weakref.WeakSet[RealtimeSession]()
|
@@ -213,7 +222,8 @@ class RealtimeModel(llm.RealtimeModel):
|
|
213
222
|
for sess in self._sessions:
|
214
223
|
sess.update_options(voice=self._opts.voice, temperature=self._opts.temperature)
|
215
224
|
|
216
|
-
async def aclose(self) -> None:
|
225
|
+
async def aclose(self) -> None:
|
226
|
+
pass
|
217
227
|
|
218
228
|
|
219
229
|
class RealtimeSession(llm.RealtimeSession):
|
@@ -221,138 +231,164 @@ class RealtimeSession(llm.RealtimeSession):
|
|
221
231
|
super().__init__(realtime_model)
|
222
232
|
self._opts = realtime_model._opts
|
223
233
|
self._tools = llm.ToolContext.empty()
|
234
|
+
self._gemini_declarations: list[FunctionDeclaration] = []
|
224
235
|
self._chat_ctx = llm.ChatContext.empty()
|
225
236
|
self._msg_ch = utils.aio.Chan[ClientEvents]()
|
226
|
-
self.
|
237
|
+
self._input_resampler: rtc.AudioResampler | None = None
|
238
|
+
|
239
|
+
# 50ms chunks
|
240
|
+
self._bstream = audio_utils.AudioByteStream(
|
241
|
+
INPUT_AUDIO_SAMPLE_RATE,
|
242
|
+
INPUT_AUDIO_CHANNELS,
|
243
|
+
samples_per_channel=INPUT_AUDIO_SAMPLE_RATE // 20,
|
244
|
+
)
|
245
|
+
|
227
246
|
self._client = genai.Client(
|
228
247
|
api_key=self._opts.api_key,
|
229
248
|
vertexai=self._opts.vertexai,
|
230
249
|
project=self._opts.project,
|
231
250
|
location=self._opts.location,
|
232
251
|
)
|
252
|
+
|
233
253
|
self._main_atask = asyncio.create_task(self._main_task(), name="gemini-realtime-session")
|
234
254
|
|
235
255
|
self._current_generation: _ResponseGeneration | None = None
|
236
|
-
|
237
|
-
|
238
|
-
self.
|
239
|
-
self._session = None
|
240
|
-
self._update_chat_ctx_lock = asyncio.Lock()
|
241
|
-
self._update_fnc_ctx_lock = asyncio.Lock()
|
256
|
+
self._active_session: AsyncSession | None = None
|
257
|
+
# indicates if the underlying session should end
|
258
|
+
self._session_should_close = asyncio.Event()
|
242
259
|
self._response_created_futures: dict[str, asyncio.Future[llm.GenerationCreatedEvent]] = {}
|
243
|
-
self.
|
260
|
+
self._pending_generation_fut: asyncio.Future[llm.GenerationCreatedEvent] | None = None
|
244
261
|
|
245
|
-
self.
|
246
|
-
self._session_lock = asyncio.Lock()
|
247
|
-
self._gemini_close_task: asyncio.Task | None = None
|
262
|
+
self._session_resumption_handle: str | None = None
|
248
263
|
|
249
|
-
|
250
|
-
|
251
|
-
self._gemini_close_task = asyncio.create_task(self._close_gemini_session())
|
264
|
+
self._update_lock = asyncio.Lock()
|
265
|
+
self._session_lock = asyncio.Lock()
|
252
266
|
|
253
|
-
async def
|
267
|
+
async def _close_active_session(self) -> None:
|
254
268
|
async with self._session_lock:
|
255
|
-
if self.
|
269
|
+
if self._active_session:
|
256
270
|
try:
|
257
|
-
await self.
|
271
|
+
await self._active_session.close()
|
272
|
+
except Exception as e:
|
273
|
+
logger.warning(f"error closing Gemini session: {e}")
|
258
274
|
finally:
|
259
|
-
self.
|
275
|
+
self._active_session = None
|
260
276
|
|
261
|
-
def
|
277
|
+
def _mark_restart_needed(self):
|
278
|
+
if not self._session_should_close.is_set():
|
279
|
+
self._session_should_close.set()
|
280
|
+
# reset the msg_ch, do not send messages from previous session
|
281
|
+
self._msg_ch = utils.aio.Chan[ClientEvents]()
|
282
|
+
|
283
|
+
async def update_options(
|
262
284
|
self,
|
263
285
|
*,
|
264
286
|
voice: NotGivenOr[str] = NOT_GIVEN,
|
265
|
-
tool_choice: NotGivenOr[llm.ToolChoice | None] = NOT_GIVEN,
|
266
287
|
temperature: NotGivenOr[float] = NOT_GIVEN,
|
288
|
+
tool_choice: NotGivenOr[llm.ToolChoice | None] = NOT_GIVEN,
|
267
289
|
) -> None:
|
268
|
-
|
269
|
-
|
290
|
+
async with self._update_lock:
|
291
|
+
should_restart = False
|
292
|
+
if is_given(voice) and self._opts.voice != voice:
|
293
|
+
self._opts.voice = voice
|
294
|
+
should_restart = True
|
270
295
|
|
271
|
-
|
272
|
-
|
296
|
+
if is_given(temperature) and self._opts.temperature != temperature:
|
297
|
+
self._opts.temperature = temperature if is_given(temperature) else NOT_GIVEN
|
298
|
+
should_restart = True
|
273
299
|
|
274
|
-
|
275
|
-
|
276
|
-
self._reconnect_event.set()
|
277
|
-
self._schedule_gemini_session_close()
|
300
|
+
if should_restart:
|
301
|
+
self._mark_restart_needed()
|
278
302
|
|
279
303
|
async def update_instructions(self, instructions: str) -> None:
|
280
|
-
self.
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
self._schedule_gemini_session_close()
|
304
|
+
async with self._update_lock:
|
305
|
+
if not is_given(self._opts.instructions) or self._opts.instructions != instructions:
|
306
|
+
self._opts.instructions = instructions
|
307
|
+
self._mark_restart_needed()
|
285
308
|
|
286
309
|
async def update_chat_ctx(self, chat_ctx: llm.ChatContext) -> None:
|
287
|
-
async with self.
|
288
|
-
self._chat_ctx = chat_ctx
|
310
|
+
async with self._update_lock:
|
311
|
+
self._chat_ctx = chat_ctx.copy()
|
289
312
|
turns, _ = to_chat_ctx(self._chat_ctx, id(self), ignore_functions=True)
|
290
313
|
tool_results = get_tool_results_for_realtime(self._chat_ctx)
|
314
|
+
# TODO(dz): need to compute delta and then either append or recreate session
|
291
315
|
if turns:
|
292
|
-
self.
|
316
|
+
self._send_client_event(LiveClientContent(turns=turns, turn_complete=False))
|
293
317
|
if tool_results:
|
294
|
-
self.
|
318
|
+
self._send_client_event(tool_results)
|
295
319
|
|
296
320
|
async def update_tools(self, tools: list[llm.FunctionTool]) -> None:
|
297
|
-
async with self.
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
for
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
if self._session and gemini_function_declarations:
|
309
|
-
logger.warning("Updating tools; triggering Gemini session reconnect.")
|
310
|
-
self._reconnect_event.set()
|
311
|
-
self._schedule_gemini_session_close()
|
321
|
+
async with self._update_lock:
|
322
|
+
new_declarations: list[FunctionDeclaration] = [
|
323
|
+
_build_gemini_fnc(tool) for tool in tools
|
324
|
+
]
|
325
|
+
current_tool_names = {f.name for f in self._gemini_declarations}
|
326
|
+
new_tool_names = {f.name for f in new_declarations}
|
327
|
+
|
328
|
+
if current_tool_names != new_tool_names:
|
329
|
+
self._gemini_declarations = new_declarations
|
330
|
+
self._tools = llm.ToolContext(tools)
|
331
|
+
self._mark_restart_needed()
|
312
332
|
|
313
333
|
@property
|
314
334
|
def chat_ctx(self) -> llm.ChatContext:
|
315
|
-
return self._chat_ctx
|
335
|
+
return self._chat_ctx.copy()
|
316
336
|
|
317
337
|
@property
|
318
338
|
def tools(self) -> llm.ToolContext:
|
319
|
-
return self._tools
|
339
|
+
return self._tools.copy()
|
320
340
|
|
321
341
|
def push_audio(self, frame: rtc.AudioFrame) -> None:
|
322
|
-
self.
|
342
|
+
for f in self._resample_audio(frame):
|
343
|
+
for nf in self._bstream.write(f.data.tobytes()):
|
344
|
+
realtime_input = LiveClientRealtimeInput(
|
345
|
+
media_chunks=[Blob(data=nf.data.tobytes(), mime_type="audio/pcm")]
|
346
|
+
)
|
347
|
+
self._send_client_event(realtime_input)
|
323
348
|
|
324
349
|
def push_video(self, frame: rtc.VideoFrame) -> None:
|
325
350
|
encoded_data = images.encode(frame, DEFAULT_ENCODE_OPTIONS)
|
326
|
-
self.push_media(encoded_data, "image/jpeg")
|
327
|
-
|
328
|
-
def push_media(self, bytes: bytes, mime_type: str) -> None:
|
329
351
|
realtime_input = LiveClientRealtimeInput(
|
330
|
-
media_chunks=[Blob(data=
|
352
|
+
media_chunks=[Blob(data=encoded_data, mime_type="image/jpeg")]
|
331
353
|
)
|
332
|
-
self.
|
354
|
+
self._send_client_event(realtime_input)
|
355
|
+
|
356
|
+
def _send_client_event(self, event: ClientEvents) -> None:
|
357
|
+
with contextlib.suppress(utils.aio.channel.ChanClosed):
|
358
|
+
self._msg_ch.send_nowait(event)
|
333
359
|
|
334
360
|
def generate_reply(
|
335
361
|
self, *, instructions: NotGivenOr[str] = NOT_GIVEN
|
336
362
|
) -> asyncio.Future[llm.GenerationCreatedEvent]:
|
337
|
-
|
363
|
+
if self._pending_generation_fut and not self._pending_generation_fut.done():
|
364
|
+
logger.warning(
|
365
|
+
"generate_reply called while another generation is pending, cancelling previous."
|
366
|
+
)
|
367
|
+
self._pending_generation_fut.cancel("Superseded by new generate_reply call")
|
338
368
|
|
339
|
-
|
340
|
-
self.
|
341
|
-
self._pending_generation_event_id = event_id
|
369
|
+
fut = asyncio.Future()
|
370
|
+
self._pending_generation_fut = fut
|
342
371
|
|
343
|
-
|
344
|
-
|
345
|
-
|
372
|
+
# Gemini requires the last message to end with user's turn
|
373
|
+
# so we need to add a placeholder user turn in order to trigger a new generation
|
374
|
+
event = LiveClientContent(turns=[], turn_complete=True)
|
375
|
+
if is_given(instructions):
|
376
|
+
event.turns.append(Content(parts=[Part(text=instructions)], role="model"))
|
377
|
+
event.turns.append(Content(parts=[Part(text=".")], role="user"))
|
378
|
+
self._send_client_event(event)
|
346
379
|
|
347
380
|
def _on_timeout() -> None:
|
348
|
-
if
|
349
|
-
fut.set_exception(
|
350
|
-
|
351
|
-
|
352
|
-
|
381
|
+
if not fut.done():
|
382
|
+
fut.set_exception(
|
383
|
+
llm.RealtimeError(
|
384
|
+
"generate_reply timed out waiting for generation_created event."
|
385
|
+
)
|
386
|
+
)
|
387
|
+
if self._pending_generation_fut is fut:
|
388
|
+
self._pending_generation_fut = None
|
353
389
|
|
354
|
-
|
355
|
-
fut.add_done_callback(lambda _:
|
390
|
+
timeout_handle = asyncio.get_event_loop().call_later(5.0, _on_timeout)
|
391
|
+
fut.add_done_callback(lambda _: timeout_handle.cancel())
|
356
392
|
|
357
393
|
return fut
|
358
394
|
|
@@ -360,133 +396,206 @@ class RealtimeSession(llm.RealtimeSession):
|
|
360
396
|
pass
|
361
397
|
|
362
398
|
def truncate(self, *, message_id: str, audio_end_ms: int) -> None:
|
399
|
+
logger.warning("truncate is not supported by the Google Realtime API.")
|
363
400
|
pass
|
364
401
|
|
365
402
|
async def aclose(self) -> None:
|
366
403
|
self._msg_ch.close()
|
367
|
-
|
368
|
-
for fut in self._response_created_futures.values():
|
369
|
-
if not fut.done():
|
370
|
-
fut.set_exception(llm.RealtimeError("Session closed"))
|
404
|
+
self._session_should_close.set()
|
371
405
|
|
372
406
|
if self._main_atask:
|
373
407
|
await utils.aio.cancel_and_wait(self._main_atask)
|
374
408
|
|
375
|
-
|
376
|
-
|
409
|
+
await self._close_active_session()
|
410
|
+
|
411
|
+
if self._pending_generation_fut and not self._pending_generation_fut.done():
|
412
|
+
self._pending_generation_fut.cancel("Session closed")
|
413
|
+
|
414
|
+
for fut in self._response_created_futures.values():
|
415
|
+
if not fut.done():
|
416
|
+
fut.set_exception(llm.RealtimeError("Session closed before response created"))
|
417
|
+
self._response_created_futures.clear()
|
418
|
+
|
419
|
+
if self._current_generation:
|
420
|
+
self._finalize_response(closed=True)
|
377
421
|
|
378
422
|
@utils.log_exceptions(logger=logger)
|
379
423
|
async def _main_task(self):
|
380
|
-
while
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
else None,
|
405
|
-
speech_config=SpeechConfig(
|
406
|
-
voice_config=VoiceConfig(
|
407
|
-
prebuilt_voice_config=PrebuiltVoiceConfig(voice_name=self._opts.voice)
|
424
|
+
while not self._msg_ch.closed:
|
425
|
+
# previous session might not be closed yet, we'll do it here.
|
426
|
+
await self._close_active_session()
|
427
|
+
|
428
|
+
self._session_should_close.clear()
|
429
|
+
config = self._build_connect_config()
|
430
|
+
session = None
|
431
|
+
try:
|
432
|
+
logger.debug("connecting to Gemini Realtime API...")
|
433
|
+
async with self._client.aio.live.connect(
|
434
|
+
model=self._opts.model, config=config
|
435
|
+
) as session:
|
436
|
+
async with self._session_lock:
|
437
|
+
self._active_session = session
|
438
|
+
|
439
|
+
# queue up existing chat context
|
440
|
+
send_task = asyncio.create_task(
|
441
|
+
self._send_task(session), name="gemini-realtime-send"
|
442
|
+
)
|
443
|
+
recv_task = asyncio.create_task(
|
444
|
+
self._recv_task(session), name="gemini-realtime-recv"
|
445
|
+
)
|
446
|
+
restart_wait_task = asyncio.create_task(
|
447
|
+
self._session_should_close.wait(), name="gemini-restart-wait"
|
408
448
|
)
|
409
|
-
),
|
410
|
-
tools=self._gemini_tools,
|
411
|
-
input_audio_transcription=self._opts.input_audio_transcription,
|
412
|
-
output_audio_transcription=self._opts.output_audio_transcription,
|
413
|
-
)
|
414
|
-
|
415
|
-
async with self._client.aio.live.connect(
|
416
|
-
model=self._opts.model, config=config
|
417
|
-
) as session:
|
418
|
-
async with self._session_lock:
|
419
|
-
self._session = session
|
420
|
-
|
421
|
-
@utils.log_exceptions(logger=logger)
|
422
|
-
async def _send_task():
|
423
|
-
async for msg in self._msg_ch:
|
424
|
-
if isinstance(msg, LiveClientContent):
|
425
|
-
await session.send(input=msg, end_of_turn=True)
|
426
|
-
else:
|
427
|
-
await session.send(input=msg)
|
428
|
-
await session.send(input=".", end_of_turn=True)
|
429
|
-
|
430
|
-
@utils.log_exceptions(logger=logger)
|
431
|
-
async def _recv_task():
|
432
|
-
while True:
|
433
|
-
async for response in session.receive():
|
434
|
-
if self._active_response_id is None:
|
435
|
-
self._start_new_generation()
|
436
|
-
if response.setup_complete:
|
437
|
-
logger.info("connection established with gemini live api server")
|
438
|
-
if response.server_content:
|
439
|
-
self._handle_server_content(response.server_content)
|
440
|
-
if response.tool_call:
|
441
|
-
self._handle_tool_calls(response.tool_call)
|
442
|
-
if response.tool_call_cancellation:
|
443
|
-
self._handle_tool_call_cancellation(response.tool_call_cancellation)
|
444
|
-
if response.usage_metadata:
|
445
|
-
self._handle_usage_metadata(response.usage_metadata)
|
446
|
-
if response.go_away:
|
447
|
-
self._handle_go_away(response.go_away)
|
448
|
-
|
449
|
-
send_task = asyncio.create_task(_send_task(), name="gemini-realtime-send")
|
450
|
-
recv_task = asyncio.create_task(_recv_task(), name="gemini-realtime-recv")
|
451
|
-
reconnect_task = asyncio.create_task(
|
452
|
-
self._reconnect_event.wait(), name="reconnect-wait"
|
453
|
-
)
|
454
449
|
|
455
|
-
|
456
|
-
|
457
|
-
[send_task, recv_task, reconnect_task],
|
450
|
+
done, pending = await asyncio.wait(
|
451
|
+
[send_task, recv_task, restart_wait_task],
|
458
452
|
return_when=asyncio.FIRST_COMPLETED,
|
459
453
|
)
|
454
|
+
|
460
455
|
for task in done:
|
461
|
-
if task
|
462
|
-
task.
|
456
|
+
if task is not restart_wait_task and task.exception():
|
457
|
+
logger.error(f"error in task {task.get_name()}: {task.exception()}")
|
458
|
+
raise task.exception() or Exception(f"{task.get_name()} failed")
|
463
459
|
|
464
|
-
if
|
460
|
+
if restart_wait_task not in done and self._msg_ch.closed:
|
465
461
|
break
|
466
462
|
|
467
|
-
|
468
|
-
|
469
|
-
|
463
|
+
for task in pending:
|
464
|
+
await utils.aio.cancel_and_wait(task)
|
465
|
+
|
466
|
+
except asyncio.CancelledError:
|
467
|
+
break
|
468
|
+
except Exception as e:
|
469
|
+
logger.error(f"Gemini Realtime API error: {e}", exc_info=e)
|
470
|
+
if not self._msg_ch.closed:
|
471
|
+
logger.info("attempting to reconnect after 1 seconds...")
|
472
|
+
await asyncio.sleep(1)
|
473
|
+
finally:
|
474
|
+
await self._close_active_session()
|
475
|
+
|
476
|
+
async def _send_task(self, session: AsyncSession):
|
477
|
+
try:
|
478
|
+
async for msg in self._msg_ch:
|
479
|
+
async with self._session_lock:
|
480
|
+
if self._session_should_close.is_set() or (
|
481
|
+
not self._active_session or self._active_session != session
|
482
|
+
):
|
483
|
+
break
|
484
|
+
|
485
|
+
if isinstance(msg, LiveClientContent):
|
486
|
+
await session.send(input=msg)
|
487
|
+
else:
|
488
|
+
await session.send(input=msg)
|
489
|
+
except Exception as e:
|
490
|
+
if not self._session_should_close.is_set():
|
491
|
+
logger.error(f"error in send task: {e}", exc_info=e)
|
492
|
+
self._mark_restart_needed()
|
493
|
+
finally:
|
494
|
+
logger.debug("send task finished.")
|
495
|
+
|
496
|
+
async def _recv_task(self, session: AsyncSession):
|
497
|
+
try:
|
498
|
+
while True:
|
499
|
+
async with self._session_lock:
|
500
|
+
if self._session_should_close.is_set() or (
|
501
|
+
not self._active_session or self._active_session != session
|
502
|
+
):
|
503
|
+
logger.debug("receive task: Session changed or closed, stopping receive.")
|
504
|
+
break
|
505
|
+
|
506
|
+
async for response in session.receive():
|
507
|
+
if not self._current_generation and (
|
508
|
+
response.server_content or response.tool_call
|
509
|
+
):
|
510
|
+
self._start_new_generation()
|
511
|
+
|
512
|
+
if response.session_resumption_update:
|
513
|
+
if (
|
514
|
+
response.session_resumption_update.resumable
|
515
|
+
and response.session_resumption_update.new_handle
|
516
|
+
):
|
517
|
+
self._session_resumption_handle = (
|
518
|
+
response.session_resumption_update.new_handle
|
519
|
+
)
|
520
|
+
|
521
|
+
if response.server_content:
|
522
|
+
self._handle_server_content(response.server_content)
|
523
|
+
if response.tool_call:
|
524
|
+
self._handle_tool_calls(response.tool_call)
|
525
|
+
if response.tool_call_cancellation:
|
526
|
+
self._handle_tool_call_cancellation(response.tool_call_cancellation)
|
527
|
+
if response.usage_metadata:
|
528
|
+
self._handle_usage_metadata(response.usage_metadata)
|
529
|
+
if response.go_away:
|
530
|
+
self._handle_go_away(response.go_away)
|
531
|
+
|
532
|
+
# TODO(dz): a server-side turn is complete
|
533
|
+
except Exception as e:
|
534
|
+
if not self._session_should_close.is_set():
|
535
|
+
logger.error(f"error in receive task: {e}", exc_info=e)
|
536
|
+
self._mark_restart_needed()
|
537
|
+
finally:
|
538
|
+
self._finalize_response(closed=True)
|
539
|
+
|
540
|
+
def _build_connect_config(self) -> LiveConnectConfig:
|
541
|
+
temp = self._opts.temperature if is_given(self._opts.temperature) else None
|
542
|
+
|
543
|
+
return LiveConnectConfig(
|
544
|
+
response_modalities=self._opts.response_modalities
|
545
|
+
if is_given(self._opts.response_modalities)
|
546
|
+
else [Modality.AUDIO],
|
547
|
+
generation_config=GenerationConfig(
|
548
|
+
candidate_count=self._opts.candidate_count,
|
549
|
+
temperature=temp,
|
550
|
+
max_output_tokens=self._opts.max_output_tokens
|
551
|
+
if is_given(self._opts.max_output_tokens)
|
552
|
+
else None,
|
553
|
+
top_p=self._opts.top_p if is_given(self._opts.top_p) else None,
|
554
|
+
top_k=self._opts.top_k if is_given(self._opts.top_k) else None,
|
555
|
+
presence_penalty=self._opts.presence_penalty
|
556
|
+
if is_given(self._opts.presence_penalty)
|
557
|
+
else None,
|
558
|
+
frequency_penalty=self._opts.frequency_penalty
|
559
|
+
if is_given(self._opts.frequency_penalty)
|
560
|
+
else None,
|
561
|
+
),
|
562
|
+
system_instruction=Content(parts=[Part(text=self._opts.instructions)])
|
563
|
+
if is_given(self._opts.instructions)
|
564
|
+
else None,
|
565
|
+
speech_config=SpeechConfig(
|
566
|
+
voice_config=VoiceConfig(
|
567
|
+
prebuilt_voice_config=PrebuiltVoiceConfig(voice_name=self._opts.voice)
|
568
|
+
),
|
569
|
+
language_code=self._opts.language if is_given(self._opts.language) else None,
|
570
|
+
),
|
571
|
+
tools=[Tool(function_declarations=self._gemini_declarations)],
|
572
|
+
input_audio_transcription=self._opts.input_audio_transcription,
|
573
|
+
output_audio_transcription=self._opts.output_audio_transcription,
|
574
|
+
session_resumption=SessionResumptionConfig(handle=self._session_resumption_handle),
|
575
|
+
)
|
470
576
|
|
471
577
|
def _start_new_generation(self):
|
472
|
-
self.
|
473
|
-
|
578
|
+
if self._current_generation:
|
579
|
+
logger.warning("starting new generation while another is active. Finalizing previous.")
|
580
|
+
self._finalize_response(closed=True)
|
581
|
+
|
582
|
+
response_id = utils.shortuuid("gemini-turn-")
|
474
583
|
self._current_generation = _ResponseGeneration(
|
475
584
|
message_ch=utils.aio.Chan[llm.MessageGeneration](),
|
476
585
|
function_ch=utils.aio.Chan[llm.FunctionCall](),
|
477
586
|
messages={},
|
478
587
|
)
|
479
588
|
|
480
|
-
# We'll assume each chunk belongs to a single message ID self._active_response_id
|
481
589
|
item_generation = _MessageGeneration(
|
482
|
-
message_id=
|
590
|
+
message_id=response_id,
|
483
591
|
text_ch=utils.aio.Chan[str](),
|
484
592
|
audio_ch=utils.aio.Chan[rtc.AudioFrame](),
|
485
593
|
)
|
594
|
+
self._current_generation.messages[response_id] = item_generation
|
486
595
|
|
487
596
|
self._current_generation.message_ch.send_nowait(
|
488
597
|
llm.MessageGeneration(
|
489
|
-
message_id=
|
598
|
+
message_id=response_id,
|
490
599
|
text_stream=item_generation.text_ch,
|
491
600
|
audio_stream=item_generation.audio_ch,
|
492
601
|
)
|
@@ -498,84 +607,92 @@ class RealtimeSession(llm.RealtimeSession):
|
|
498
607
|
user_initiated=False,
|
499
608
|
)
|
500
609
|
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
fut.set_result(generation_event)
|
610
|
+
if self._pending_generation_fut and not self._pending_generation_fut.done():
|
611
|
+
generation_event.user_initiated = True
|
612
|
+
self._pending_generation_fut.set_result(generation_event)
|
613
|
+
self._pending_generation_fut = None
|
506
614
|
|
507
|
-
self._pending_generation_event_id = None
|
508
615
|
self.emit("generation_created", generation_event)
|
509
616
|
|
510
|
-
self._current_generation.messages[self._active_response_id] = item_generation
|
511
|
-
|
512
617
|
def _handle_server_content(self, server_content: LiveServerContent):
|
513
|
-
if not self._current_generation
|
514
|
-
logger.warning(
|
515
|
-
"gemini-realtime-session: No active response ID, skipping server content"
|
516
|
-
)
|
618
|
+
if not self._current_generation:
|
619
|
+
logger.warning("received server content but no active generation.")
|
517
620
|
return
|
518
621
|
|
519
|
-
|
622
|
+
response_id = list(self._current_generation.messages.keys())[0]
|
623
|
+
item_generation = self._current_generation.messages[response_id]
|
520
624
|
|
521
|
-
model_turn
|
522
|
-
if model_turn:
|
625
|
+
if model_turn := server_content.model_turn:
|
523
626
|
for part in model_turn.parts:
|
524
627
|
if part.text:
|
525
628
|
item_generation.text_ch.send_nowait(part.text)
|
526
629
|
if part.inline_data:
|
527
630
|
frame_data = part.inline_data.data
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
631
|
+
try:
|
632
|
+
frame = rtc.AudioFrame(
|
633
|
+
data=frame_data,
|
634
|
+
sample_rate=OUTPUT_AUDIO_SAMPLE_RATE,
|
635
|
+
num_channels=OUTPUT_AUDIO_CHANNELS,
|
636
|
+
samples_per_channel=len(frame_data) // (2 * OUTPUT_AUDIO_CHANNELS),
|
637
|
+
)
|
638
|
+
item_generation.audio_ch.send_nowait(frame)
|
639
|
+
except ValueError as e:
|
640
|
+
logger.error(f"Error creating audio frame from Gemini data: {e}")
|
641
|
+
|
642
|
+
if input_transcription := server_content.input_transcription:
|
643
|
+
if input_transcription.text:
|
644
|
+
self.emit(
|
645
|
+
"input_audio_transcription_completed",
|
646
|
+
llm.InputTranscriptionCompleted(
|
647
|
+
item_id=response_id, transcript=input_transcription.text
|
648
|
+
),
|
649
|
+
)
|
650
|
+
self._handle_input_speech_started()
|
651
|
+
|
652
|
+
if output_transcription := server_content.output_transcription:
|
653
|
+
if output_transcription.text:
|
654
|
+
item_generation.text_ch.send_nowait(output_transcription.text)
|
655
|
+
|
546
656
|
if server_content.interrupted:
|
547
|
-
self._finalize_response()
|
657
|
+
self._finalize_response(interrupted=True)
|
548
658
|
self._handle_input_speech_started()
|
549
659
|
|
550
660
|
if server_content.turn_complete:
|
551
661
|
self._finalize_response()
|
552
662
|
|
553
|
-
def _finalize_response(self) -> None:
|
663
|
+
def _finalize_response(self, interrupted: bool = False, closed: bool = False) -> None:
|
554
664
|
if not self._current_generation:
|
555
665
|
return
|
556
666
|
|
557
|
-
|
558
|
-
item_generation.text_ch.close()
|
559
|
-
item_generation.audio_ch.close()
|
560
|
-
|
561
|
-
self._current_generation.function_ch.close()
|
562
|
-
self._current_generation.message_ch.close()
|
667
|
+
gen = self._current_generation
|
563
668
|
self._current_generation = None
|
564
|
-
|
565
|
-
|
669
|
+
|
670
|
+
for item_generation in gen.messages.values():
|
671
|
+
if not item_generation.text_ch.closed:
|
672
|
+
item_generation.text_ch.close()
|
673
|
+
if not item_generation.audio_ch.closed:
|
674
|
+
item_generation.audio_ch.close()
|
675
|
+
|
676
|
+
gen.function_ch.close()
|
677
|
+
gen.message_ch.close()
|
566
678
|
|
567
679
|
def _handle_input_speech_started(self):
|
568
680
|
self.emit("input_speech_started", llm.InputSpeechStartedEvent())
|
569
681
|
|
570
682
|
def _handle_tool_calls(self, tool_call: LiveServerToolCall):
|
571
683
|
if not self._current_generation:
|
684
|
+
logger.warning("received tool call but no active generation.")
|
572
685
|
return
|
686
|
+
|
687
|
+
gen = self._current_generation
|
573
688
|
for fnc_call in tool_call.function_calls:
|
574
|
-
|
689
|
+
arguments = json.dumps(fnc_call.args)
|
690
|
+
|
691
|
+
gen.function_ch.send_nowait(
|
575
692
|
llm.FunctionCall(
|
576
|
-
call_id=fnc_call.id or "",
|
693
|
+
call_id=fnc_call.id or utils.shortuuid("fnc-call-"),
|
577
694
|
name=fnc_call.name,
|
578
|
-
arguments=
|
695
|
+
arguments=arguments,
|
579
696
|
)
|
580
697
|
)
|
581
698
|
self._finalize_response()
|
@@ -584,28 +701,45 @@ class RealtimeSession(llm.RealtimeSession):
|
|
584
701
|
self, tool_call_cancellation: LiveServerToolCallCancellation
|
585
702
|
):
|
586
703
|
logger.warning(
|
587
|
-
"
|
588
|
-
extra={
|
589
|
-
"function_call_ids": tool_call_cancellation.ids,
|
590
|
-
},
|
704
|
+
"server cancelled tool calls",
|
705
|
+
extra={"function_call_ids": tool_call_cancellation.ids},
|
591
706
|
)
|
592
|
-
self.emit("function_calls_cancelled", tool_call_cancellation.ids)
|
593
707
|
|
594
708
|
def _handle_usage_metadata(self, usage_metadata: UsageMetadata):
|
595
|
-
#
|
596
|
-
logger.
|
709
|
+
# TODO: handle metrics
|
710
|
+
logger.debug("usage metadata", extra={"usage_metadata": usage_metadata})
|
597
711
|
|
598
712
|
def _handle_go_away(self, go_away: LiveServerGoAway):
|
599
|
-
# should we reconnect?
|
600
713
|
logger.warning(
|
601
|
-
f"
|
714
|
+
f"Gemini server indicates disconnection soon. Time left: {go_away.time_left}"
|
602
715
|
)
|
716
|
+
# TODO(dz): this isn't a seamless reconnection just yet
|
717
|
+
self._session_should_close.set()
|
603
718
|
|
604
719
|
def commit_audio(self) -> None:
|
605
|
-
|
720
|
+
pass
|
606
721
|
|
607
722
|
def clear_audio(self) -> None:
|
608
|
-
|
723
|
+
self._bstream.clear()
|
609
724
|
|
610
|
-
def
|
611
|
-
|
725
|
+
def _resample_audio(self, frame: rtc.AudioFrame) -> Iterator[rtc.AudioFrame]:
|
726
|
+
if self._input_resampler:
|
727
|
+
if frame.sample_rate != self._input_resampler._input_rate:
|
728
|
+
# input audio changed to a different sample rate
|
729
|
+
self._input_resampler = None
|
730
|
+
|
731
|
+
if self._input_resampler is None and (
|
732
|
+
frame.sample_rate != INPUT_AUDIO_SAMPLE_RATE
|
733
|
+
or frame.num_channels != INPUT_AUDIO_CHANNELS
|
734
|
+
):
|
735
|
+
self._input_resampler = rtc.AudioResampler(
|
736
|
+
input_rate=frame.sample_rate,
|
737
|
+
output_rate=INPUT_AUDIO_SAMPLE_RATE,
|
738
|
+
num_channels=INPUT_AUDIO_CHANNELS,
|
739
|
+
)
|
740
|
+
|
741
|
+
if self._input_resampler:
|
742
|
+
# TODO(long): flush the resampler when the input source is changed
|
743
|
+
yield from self._input_resampler.push(frame)
|
744
|
+
else:
|
745
|
+
yield frame
|
livekit/plugins/google/tts.py
CHANGED
@@ -105,7 +105,7 @@ class TTS(tts.TTS):
|
|
105
105
|
self._opts = _TTSOptions(
|
106
106
|
voice=voice_params,
|
107
107
|
audio_config=texttospeech.AudioConfig(
|
108
|
-
audio_encoding=texttospeech.AudioEncoding.
|
108
|
+
audio_encoding=texttospeech.AudioEncoding.PCM,
|
109
109
|
sample_rate_hertz=sample_rate,
|
110
110
|
pitch=pitch,
|
111
111
|
effects_profile_id=effects_profile_id,
|
@@ -132,11 +132,11 @@ class TTS(tts.TTS):
|
|
132
132
|
""" # noqa: E501
|
133
133
|
params = {}
|
134
134
|
if is_given(language):
|
135
|
-
params["
|
135
|
+
params["language_code"] = str(language)
|
136
136
|
if is_given(gender):
|
137
|
-
params["
|
137
|
+
params["ssml_gender"] = _gender_from_str(str(gender))
|
138
138
|
if is_given(voice_name):
|
139
|
-
params["
|
139
|
+
params["name"] = voice_name
|
140
140
|
|
141
141
|
if params:
|
142
142
|
self._opts.voice = texttospeech.VoiceSelectionParams(**params)
|
livekit/plugins/google/utils.py
CHANGED
@@ -28,7 +28,7 @@ def get_tool_results_for_realtime(chat_ctx: llm.ChatContext) -> types.LiveClient
|
|
28
28
|
types.FunctionResponse(
|
29
29
|
id=msg.call_id,
|
30
30
|
name=msg.name,
|
31
|
-
response={"
|
31
|
+
response={"output": msg.output},
|
32
32
|
)
|
33
33
|
)
|
34
34
|
return (
|
@@ -99,9 +99,11 @@ def to_chat_ctx(
|
|
99
99
|
if current_role is not None and parts:
|
100
100
|
turns.append(types.Content(role=current_role, parts=parts))
|
101
101
|
|
102
|
-
|
103
|
-
|
104
|
-
|
102
|
+
# # Gemini requires the last message to end with user's turn before they can generate
|
103
|
+
# # currently not used because to_chat_ctx should not be used to force a new generation
|
104
|
+
# if current_role != "user":
|
105
|
+
# turns.append(types.Content(role="user", parts=[types.Part(text=".")]))
|
106
|
+
|
105
107
|
return turns, system_instruction
|
106
108
|
|
107
109
|
|
{livekit_plugins_google-1.0.16.dist-info → livekit_plugins_google-1.0.18.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: livekit-plugins-google
|
3
|
-
Version: 1.0.
|
3
|
+
Version: 1.0.18
|
4
4
|
Summary: Agent Framework plugin for services from Google Cloud
|
5
5
|
Project-URL: Documentation, https://docs.livekit.io
|
6
6
|
Project-URL: Website, https://livekit.io/
|
@@ -21,8 +21,8 @@ Requires-Python: >=3.9.0
|
|
21
21
|
Requires-Dist: google-auth<3,>=2
|
22
22
|
Requires-Dist: google-cloud-speech<3,>=2
|
23
23
|
Requires-Dist: google-cloud-texttospeech<3,>=2
|
24
|
-
Requires-Dist: google-genai>=1.
|
25
|
-
Requires-Dist: livekit-agents>=1.0.
|
24
|
+
Requires-Dist: google-genai>=1.12.1
|
25
|
+
Requires-Dist: livekit-agents>=1.0.18
|
26
26
|
Description-Content-Type: text/markdown
|
27
27
|
|
28
28
|
# LiveKit Plugins Google
|
@@ -4,13 +4,13 @@ livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA
|
|
4
4
|
livekit/plugins/google/models.py,sha256=SGjAumdDK97NNLwMFcqZdKR68f1NoGB2Rk1UP2-imG0,1457
|
5
5
|
livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
6
|
livekit/plugins/google/stt.py,sha256=AG_lh2fuuduJi0jFbA_QKFXLJ6NUdF1W_FfkLUJML_Q,22413
|
7
|
-
livekit/plugins/google/tts.py,sha256=
|
8
|
-
livekit/plugins/google/utils.py,sha256=
|
9
|
-
livekit/plugins/google/version.py,sha256=
|
7
|
+
livekit/plugins/google/tts.py,sha256=fmQwW9a1kPsEsrTvIo8fqw479RxWEx0SIc3oTVaj41U,9031
|
8
|
+
livekit/plugins/google/utils.py,sha256=TjjTwMbdJdxr3bZjUXxs-J_fipTTM00goW2-d9KWX6w,9582
|
9
|
+
livekit/plugins/google/version.py,sha256=cnPu9FVKZV9tFmmz7lEvftrO3B_nWJVFghi3j6UcJLs,601
|
10
10
|
livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
|
11
11
|
livekit/plugins/google/beta/realtime/__init__.py,sha256=_fW2NMN22F-hnQ4xAJ_g5lPbR7CvM_xXzSWlUQY-E-U,188
|
12
12
|
livekit/plugins/google/beta/realtime/api_proto.py,sha256=Fyrejs3SG0EjOPCCFLEnWXKEUxCff47PMWk2VsKJm5E,594
|
13
|
-
livekit/plugins/google/beta/realtime/realtime_api.py,sha256=
|
14
|
-
livekit_plugins_google-1.0.
|
15
|
-
livekit_plugins_google-1.0.
|
16
|
-
livekit_plugins_google-1.0.
|
13
|
+
livekit/plugins/google/beta/realtime/realtime_api.py,sha256=sXp2oHnTlHrAp5wFmcXj0bRtQKixBYedfbufcbjVHxk,30897
|
14
|
+
livekit_plugins_google-1.0.18.dist-info/METADATA,sha256=Vqt0FoqibcKzX_jFXlyFkn-mT7iPC16JlH61VS0fbuw,3492
|
15
|
+
livekit_plugins_google-1.0.18.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
16
|
+
livekit_plugins_google-1.0.18.dist-info/RECORD,,
|
File without changes
|