livekit-plugins-google 0.11.0__py3-none-any.whl → 1.0.0.dev4__py3-none-any.whl
This diff compares the publicly released contents of the two package versions as they appear in their public registries, and is provided for informational purposes only.
- livekit/plugins/google/beta/realtime/__init__.py +1 -5
- livekit/plugins/google/beta/realtime/api_proto.py +2 -1
- livekit/plugins/google/beta/realtime/realtime_api.py +21 -46
- livekit/plugins/google/beta/realtime/transcriber.py +11 -27
- livekit/plugins/google/llm.py +127 -197
- livekit/plugins/google/stt.py +28 -58
- livekit/plugins/google/tts.py +10 -16
- livekit/plugins/google/utils.py +213 -0
- livekit/plugins/google/version.py +1 -1
- {livekit_plugins_google-0.11.0.dist-info → livekit_plugins_google-1.0.0.dev4.dist-info}/METADATA +12 -22
- livekit_plugins_google-1.0.0.dev4.dist-info/RECORD +17 -0
- {livekit_plugins_google-0.11.0.dist-info → livekit_plugins_google-1.0.0.dev4.dist-info}/WHEEL +1 -2
- livekit/plugins/google/_utils.py +0 -199
- livekit_plugins_google-0.11.0.dist-info/RECORD +0 -18
- livekit_plugins_google-0.11.0.dist-info/top_level.txt +0 -1
livekit/plugins/google/beta/realtime/realtime_api.py

@@ -3,21 +3,17 @@ from __future__ import annotations
 import asyncio
 import json
 import os
+from collections.abc import AsyncIterable
 from dataclasses import dataclass
-from typing import AsyncIterable, Literal
-
-from livekit import rtc
-from livekit.agents import llm, utils
-from livekit.agents.llm.function_context import _create_ai_function_info
-from livekit.agents.utils import images
+from typing import Literal
 
 from google import genai
+from google.genai._api_client import HttpOptions
 from google.genai.types import (
     Blob,
     Content,
     FunctionResponse,
     GenerationConfig,
-    HttpOptions,
     LiveClientContent,
     LiveClientRealtimeInput,
     LiveClientToolResponse,
@@ -29,15 +25,13 @@ from google.genai.types import (
     Tool,
     VoiceConfig,
 )
+from livekit import rtc
+from livekit.agents import llm, utils
+from livekit.agents.llm.function_context import _create_ai_function_info
+from livekit.agents.utils import images
 
 from ...log import logger
-from .api_proto import (
-    ClientEvents,
-    LiveAPIModels,
-    Voice,
-    _build_gemini_ctx,
-    _build_tools,
-)
+from .api_proto import ClientEvents, LiveAPIModels, Voice, _build_gemini_ctx, _build_tools
 from .transcriber import ModelTranscriber, TranscriberSession, TranscriptionContent
 
 EventTypes = Literal[
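The two hunks above are import housekeeping rather than a behavior change: the `livekit` imports move below the `google` ones so the third-party group sorts alphabetically, the multi-line `.api_proto` import collapses onto one line, and `HttpOptions` is now taken from `google.genai._api_client` (a private module) instead of `google.genai.types`. A minimal sketch of the grouping convention being applied (isort/ruff-style; the exact modules shown are illustrative):

```python
# standard library first
import asyncio
from collections.abc import AsyncIterable

# third-party second, alphabetized by top-level package: "google" < "livekit"
from google import genai
from livekit import rtc

# package-relative imports come last, e.g.:
# from ...log import logger
```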
@@ -108,7 +102,7 @@ class RealtimeModel:
         model: LiveAPIModels | str = "gemini-2.0-flash-exp",
         api_key: str | None = None,
         voice: Voice | str = "Puck",
-        modalities: list[Modality] = ["AUDIO"],
+        modalities: list[Modality] = None,
         enable_user_audio_transcription: bool = True,
         enable_agent_audio_transcription: bool = True,
         vertexai: bool = False,
@@ -155,6 +149,8 @@ class RealtimeModel:
         Raises:
             ValueError: If the API key is not provided and cannot be found in environment variables.
         """
+        if modalities is None:
+            modalities = ["AUDIO"]
         super().__init__()
         self._capabilities = Capabilities(
             supports_truncate=False,
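The two `modalities` hunks above replace a list-literal default with `None` plus an in-body guard, the standard Python fix for a mutable default argument being evaluated once and shared across calls. A standalone sketch of the idiom (the function name is illustrative):

```python
def make_config(modalities: list[str] | None = None) -> list[str]:
    # A default of modalities=["AUDIO"] would be built once at function
    # definition time; every call would share (and could mutate) that list.
    if modalities is None:
        modalities = ["AUDIO"]  # fresh list for each call instead
    return modalities


assert make_config() == ["AUDIO"]
assert make_config() is not make_config()  # no shared state between calls
```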
@@ -180,9 +176,7 @@ class RealtimeModel:
                 "API key is required for Google API either via api_key or GOOGLE_API_KEY environment variable"
             )
 
-        instructions_content = (
-            Content(parts=[Part(text=instructions)]) if instructions else None
-        )
+        instructions_content = Content(parts=[Part(text=instructions)]) if instructions else None
 
         self._rt_sessions: list[GeminiRealtimeSession] = []
         self._opts = ModelOptions(
@@ -259,8 +253,6 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
         self._fnc_ctx = fnc_ctx
         self._fnc_tasks = utils.aio.TaskSet()
         self._is_interrupted = False
-        self._playout_complete = asyncio.Event()
-        self._playout_complete.set()
 
         tools = []
         if self._fnc_ctx is not None:
@@ -281,9 +273,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
             system_instruction=self._opts.instructions,
             speech_config=SpeechConfig(
                 voice_config=VoiceConfig(
-                    prebuilt_voice_config=PrebuiltVoiceConfig(
-                        voice_name=self._opts.voice
-                    )
+                    prebuilt_voice_config=PrebuiltVoiceConfig(voice_name=self._opts.voice)
                 )
             ),
             tools=tools,
@@ -295,18 +285,12 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
                 project=self._opts.project,
                 location=self._opts.location,
             )
-        self._main_atask = asyncio.create_task(
-            self._main_task(), name="gemini-realtime-session"
-        )
+        self._main_atask = asyncio.create_task(self._main_task(), name="gemini-realtime-session")
         if self._opts.enable_user_audio_transcription:
-            self._transcriber = TranscriberSession(
-                client=self._client, model=self._opts.model
-            )
+            self._transcriber = TranscriberSession(client=self._client, model=self._opts.model)
             self._transcriber.on("input_speech_done", self._on_input_speech_done)
         if self._opts.enable_agent_audio_transcription:
-            self._agent_transcriber = ModelTranscriber(
-                client=self._client, model=self._opts.model
-            )
+            self._agent_transcriber = ModelTranscriber(client=self._client, model=self._opts.model)
             self._agent_transcriber.on("input_speech_done", self._on_agent_speech_done)
         # init dummy task
         self._init_sync_task = asyncio.create_task(asyncio.sleep(0))
@@ -320,10 +304,6 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
         self._send_ch.close()
         await self._main_atask
 
-    @property
-    def playout_complete(self) -> asyncio.Event | None:
-        return self._playout_complete
-
     @property
     def fnc_ctx(self) -> llm.FunctionContext | None:
         return self._fnc_ctx
@@ -341,9 +321,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
     DEFAULT_ENCODE_OPTIONS = images.EncodeOptions(
         format="JPEG",
         quality=75,
-        resize_options=images.ResizeOptions(
-            width=1024, height=1024, strategy="scale_aspect_fit"
-        ),
+        resize_options=images.ResizeOptions(width=1024, height=1024, strategy="scale_aspect_fit"),
     )
 
     def push_video(
@@ -393,9 +371,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
 
     def create_response(
         self,
-        on_duplicate: Literal[
-            "cancel_existing", "cancel_new", "keep_both"
-        ] = "keep_both",
+        on_duplicate: Literal["cancel_existing", "cancel_new", "keep_both"] = "keep_both",
     ) -> None:
         turns, _ = _build_gemini_ctx(self._chat_ctx, id(self))
         ctx = [self._opts.instructions] + turns if self._opts.instructions else turns
@@ -481,8 +457,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
                     data=part.inline_data.data,
                     sample_rate=24000,
                     num_channels=1,
-                    samples_per_channel=len(part.inline_data.data)
-                    // 2,
+                    samples_per_channel=len(part.inline_data.data) // 2,
                 )
                 if self._opts.enable_agent_audio_transcription:
                     content.audio.append(frame)
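The `samples_per_channel` collapse above is cosmetic, but the arithmetic it preserves is worth spelling out: the Live API returns 16-bit mono PCM at 24 kHz, so each sample occupies two bytes and the sample count is the byte length halved. A standalone sketch of the same frame construction (the helper name is illustrative):

```python
from livekit import rtc


def frame_from_pcm16(data: bytes) -> rtc.AudioFrame:
    # 16-bit PCM means 2 bytes per sample; mono audio at the model's
    # 24 kHz output rate, matching the values in the hunk above.
    return rtc.AudioFrame(
        data=data,
        sample_rate=24000,
        num_channels=1,
        samples_per_channel=len(data) // 2,
    )
```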
@@ -525,12 +500,12 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
                 logger.warning(
                     "function call cancelled",
                     extra={
-                        "function_call_ids": response.tool_call_cancellation.
+                        "function_call_ids": response.tool_call_cancellation.function_call_ids,
                     },
                 )
                 self.emit(
                     "function_calls_cancelled",
-                    response.tool_call_cancellation.
+                    response.tool_call_cancellation.function_call_ids,
                 )
 
         async with self._client.aio.live.connect(
livekit/plugins/google/beta/realtime/transcriber.py

@@ -6,12 +6,12 @@ from dataclasses import dataclass
 from typing import Literal
 
 import websockets
-from livekit import rtc
-from livekit.agents import APIConnectionError, APIStatusError, utils
 
 from google import genai
 from google.genai import types
 from google.genai.errors import APIError, ClientError, ServerError
+from livekit import rtc
+from livekit.agents import APIConnectionError, APIStatusError, utils
 
 from ...log import logger
 from .api_proto import ClientEvents, LiveAPIModels
@@ -51,11 +51,9 @@ class TranscriberSession(utils.EventEmitter[EventTypes]):
         self._needed_sr = 16000
         self._closed = False
 
-        system_instructions = types.Content(
-            parts=[types.Part(text=SYSTEM_INSTRUCTIONS)]
-        )
+        system_instructions = types.Content(parts=[types.Part(text=SYSTEM_INSTRUCTIONS)])
         self._config = types.LiveConnectConfig(
-            response_modalities=[
+            response_modalities=["TEXT"],
             system_instruction=system_instructions,
             generation_config=types.GenerationConfig(temperature=0.0),
         )
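For context on the hunk above: the transcriber opens a separate Live session configured for text-only responses at temperature 0.0, so it serves purely as a deterministic speech-to-text channel. A minimal sketch of building that config with the google-genai types; the instruction string below stands in for the module's SYSTEM_INSTRUCTIONS constant, whose content the diff does not show:

```python
from google.genai import types

# Placeholder for the plugin's SYSTEM_INSTRUCTIONS constant.
SYSTEM_INSTRUCTIONS = "Transcribe the user's speech verbatim."

config = types.LiveConnectConfig(
    response_modalities=["TEXT"],  # text back only; no audio generation
    system_instruction=types.Content(parts=[types.Part(text=SYSTEM_INSTRUCTIONS)]),
    generation_config=types.GenerationConfig(temperature=0.0),  # deterministic
)
```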
@@ -81,17 +79,13 @@ class TranscriberSession(utils.EventEmitter[EventTypes]):
             for f in self._resampler.push(frame):
                 self._queue_msg(
                     types.LiveClientRealtimeInput(
-                        media_chunks=[
-                            types.Blob(data=f.data.tobytes(), mime_type="audio/pcm")
-                        ]
+                        media_chunks=[types.Blob(data=f.data.tobytes(), mime_type="audio/pcm")]
                     )
                 )
         else:
             self._queue_msg(
                 types.LiveClientRealtimeInput(
-                    media_chunks=[
-                        types.Blob(data=frame.data.tobytes(), mime_type="audio/pcm")
-                    ]
+                    media_chunks=[types.Blob(data=frame.data.tobytes(), mime_type="audio/pcm")]
                 )
             )
 
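The hunk above only collapses the `media_chunks` lists, but the surrounding branch shows the input contract: the transcriber wants 16 kHz PCM (`self._needed_sr = 16000`), so frames at other rates pass through an `rtc.AudioResampler`, whose `push()` can emit zero or more frames per input frame; that is why one branch loops while the other sends directly. A hedged sketch of that branching, assuming the AudioResampler constructor takes `input_rate`/`output_rate` as in the LiveKit Python SDK:

```python
from livekit import rtc


class Downsampler:
    """Deliver frames at 16 kHz, resampling only when needed."""

    def __init__(self) -> None:
        self._resampler: rtc.AudioResampler | None = None

    def push(self, frame: rtc.AudioFrame) -> list[rtc.AudioFrame]:
        if frame.sample_rate == 16000:
            return [frame]  # already at the target rate; pass through
        if self._resampler is None:
            self._resampler = rtc.AudioResampler(
                input_rate=frame.sample_rate, output_rate=16000
            )
        # push() buffers internally and may return 0..n resampled frames
        return list(self._resampler.push(frame))
```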
@@ -157,17 +151,11 @@ class TranscriberSession(utils.EventEmitter[EventTypes]):
                 logger.exception(f"Uncaught error in transcriber _recv_task: {e}")
                 self._closed = True
 
-        async with self._client.aio.live.connect(
-            model=self._model, config=self._config
-        ) as session:
+        async with self._client.aio.live.connect(model=self._model, config=self._config) as session:
             self._session = session
             tasks = [
-                asyncio.create_task(
-                    _send_task(), name="gemini-realtime-transcriber-send"
-                ),
-                asyncio.create_task(
-                    _recv_task(), name="gemini-realtime-transcriber-recv"
-                ),
+                asyncio.create_task(_send_task(), name="gemini-realtime-transcriber-send"),
+                asyncio.create_task(_recv_task(), name="gemini-realtime-transcriber-recv"),
             ]
 
             try:
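The hunk above collapses the paired send/receive tasks onto single lines. The underlying pattern: one coroutine feeds audio into the Live session while its sibling drains responses, and the two are awaited together so a failure in either tears the pair down. Since the diff cuts off after `try:`, the cleanup below is a generic sketch of that pattern, not the plugin's exact code:

```python
import asyncio


async def run_pair(send_loop, recv_loop) -> None:
    tasks = [
        asyncio.create_task(send_loop(), name="send"),
        asyncio.create_task(recv_loop(), name="recv"),
    ]
    try:
        # If either loop raises, gather propagates the error...
        await asyncio.gather(*tasks)
    finally:
        # ...and the survivor is cancelled so it cannot leak.
        for t in tasks:
            t.cancel()
        await asyncio.gather(*tasks, return_exceptions=True)
```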
@@ -187,9 +175,7 @@ class ModelTranscriber(utils.EventEmitter[EventTypes]):
         self._client = client
         self._model = model
         self._needed_sr = 16000
-        self._system_instructions = types.Content(
-            parts=[types.Part(text=SYSTEM_INSTRUCTIONS)]
-        )
+        self._system_instructions = types.Content(parts=[types.Part(text=SYSTEM_INSTRUCTIONS)])
         self._config = types.GenerateContentConfig(
             temperature=0.0,
             system_instruction=self._system_instructions,
@@ -198,9 +184,7 @@ class ModelTranscriber(utils.EventEmitter[EventTypes]):
         self._resampler: rtc.AudioResampler | None = None
         self._buffer: rtc.AudioFrame | None = None
         self._audio_ch = utils.aio.Chan[rtc.AudioFrame]()
-        self._main_atask = asyncio.create_task(
-            self._main_task(), name="gemini-model-transcriber"
-        )
+        self._main_atask = asyncio.create_task(self._main_task(), name="gemini-model-transcriber")
 
     async def aclose(self) -> None:
         if self._audio_ch.closed: