livekit-plugins-google 0.11.1__py3-none-any.whl → 1.0.0.dev5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/google/beta/realtime/__init__.py +1 -5
- livekit/plugins/google/beta/realtime/api_proto.py +3 -2
- livekit/plugins/google/beta/realtime/realtime_api.py +22 -51
- livekit/plugins/google/beta/realtime/transcriber.py +11 -27
- livekit/plugins/google/llm.py +127 -197
- livekit/plugins/google/stt.py +28 -58
- livekit/plugins/google/tts.py +10 -16
- livekit/plugins/google/utils.py +213 -0
- livekit/plugins/google/version.py +1 -1
- {livekit_plugins_google-0.11.1.dist-info → livekit_plugins_google-1.0.0.dev5.dist-info}/METADATA +12 -22
- livekit_plugins_google-1.0.0.dev5.dist-info/RECORD +17 -0
- {livekit_plugins_google-0.11.1.dist-info → livekit_plugins_google-1.0.0.dev5.dist-info}/WHEEL +1 -2
- livekit/plugins/google/_utils.py +0 -199
- livekit_plugins_google-0.11.1.dist-info/RECORD +0 -18
- livekit_plugins_google-0.11.1.dist-info/top_level.txt +0 -1
@@ -1,12 +1,13 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from
|
3
|
+
from collections.abc import Sequence
|
4
|
+
from typing import Literal, Union
|
4
5
|
|
5
6
|
from google.genai import types
|
6
7
|
|
7
8
|
from ..._utils import _build_gemini_ctx, _build_tools
|
8
9
|
|
9
|
-
LiveAPIModels = Literal["gemini-2.0-flash-
|
10
|
+
LiveAPIModels = Literal["gemini-2.0-flash-001",]
|
10
11
|
|
11
12
|
Voice = Literal["Puck", "Charon", "Kore", "Fenrir", "Aoede"]
|
12
13
|
|
@@ -3,21 +3,17 @@ from __future__ import annotations
|
|
3
3
|
import asyncio
|
4
4
|
import json
|
5
5
|
import os
|
6
|
+
from collections.abc import AsyncIterable
|
6
7
|
from dataclasses import dataclass
|
7
|
-
from typing import
|
8
|
-
|
9
|
-
from livekit import rtc
|
10
|
-
from livekit.agents import llm, utils
|
11
|
-
from livekit.agents.llm.function_context import _create_ai_function_info
|
12
|
-
from livekit.agents.utils import images
|
8
|
+
from typing import Literal
|
13
9
|
|
14
10
|
from google import genai
|
11
|
+
from google.genai._api_client import HttpOptions
|
15
12
|
from google.genai.types import (
|
16
13
|
Blob,
|
17
14
|
Content,
|
18
15
|
FunctionResponse,
|
19
16
|
GenerationConfig,
|
20
|
-
HttpOptions,
|
21
17
|
LiveClientContent,
|
22
18
|
LiveClientRealtimeInput,
|
23
19
|
LiveClientToolResponse,
|
@@ -29,15 +25,13 @@ from google.genai.types import (
|
|
29
25
|
Tool,
|
30
26
|
VoiceConfig,
|
31
27
|
)
|
28
|
+
from livekit import rtc
|
29
|
+
from livekit.agents import llm, utils
|
30
|
+
from livekit.agents.llm.function_context import _create_ai_function_info
|
31
|
+
from livekit.agents.utils import images
|
32
32
|
|
33
33
|
from ...log import logger
|
34
|
-
from .api_proto import (
|
35
|
-
ClientEvents,
|
36
|
-
LiveAPIModels,
|
37
|
-
Voice,
|
38
|
-
_build_gemini_ctx,
|
39
|
-
_build_tools,
|
40
|
-
)
|
34
|
+
from .api_proto import ClientEvents, LiveAPIModels, Voice, _build_gemini_ctx, _build_tools
|
41
35
|
from .transcriber import ModelTranscriber, TranscriberSession, TranscriptionContent
|
42
36
|
|
43
37
|
EventTypes = Literal[
|
@@ -83,7 +77,6 @@ class Capabilities:
|
|
83
77
|
class ModelOptions:
|
84
78
|
model: LiveAPIModels | str
|
85
79
|
api_key: str | None
|
86
|
-
api_version: str
|
87
80
|
voice: Voice | str
|
88
81
|
response_modalities: list[Modality] | None
|
89
82
|
vertexai: bool
|
@@ -108,9 +101,8 @@ class RealtimeModel:
|
|
108
101
|
instructions: str | None = None,
|
109
102
|
model: LiveAPIModels | str = "gemini-2.0-flash-exp",
|
110
103
|
api_key: str | None = None,
|
111
|
-
api_version: str = "v1alpha",
|
112
104
|
voice: Voice | str = "Puck",
|
113
|
-
modalities: list[Modality] = ["AUDIO"],
|
105
|
+
modalities: list[Modality] = None,
|
114
106
|
enable_user_audio_transcription: bool = True,
|
115
107
|
enable_agent_audio_transcription: bool = True,
|
116
108
|
vertexai: bool = False,
|
@@ -138,7 +130,6 @@ class RealtimeModel:
|
|
138
130
|
Args:
|
139
131
|
instructions (str, optional): Initial system instructions for the model. Defaults to "".
|
140
132
|
api_key (str or None, optional): Google Gemini API key. If None, will attempt to read from the environment variable GOOGLE_API_KEY.
|
141
|
-
api_version (str, optional): The version of the API to use. Defaults to "v1alpha".
|
142
133
|
modalities (list[Modality], optional): Modalities to use, such as ["TEXT", "AUDIO"]. Defaults to ["AUDIO"].
|
143
134
|
model (str or None, optional): The name of the model to use. Defaults to "gemini-2.0-flash-exp".
|
144
135
|
voice (api_proto.Voice, optional): Voice setting for audio outputs. Defaults to "Puck".
|
@@ -158,6 +149,8 @@ class RealtimeModel:
|
|
158
149
|
Raises:
|
159
150
|
ValueError: If the API key is not provided and cannot be found in environment variables.
|
160
151
|
"""
|
152
|
+
if modalities is None:
|
153
|
+
modalities = ["AUDIO"]
|
161
154
|
super().__init__()
|
162
155
|
self._capabilities = Capabilities(
|
163
156
|
supports_truncate=False,
|
@@ -183,14 +176,11 @@ class RealtimeModel:
|
|
183
176
|
"API key is required for Google API either via api_key or GOOGLE_API_KEY environment variable"
|
184
177
|
)
|
185
178
|
|
186
|
-
instructions_content = (
|
187
|
-
Content(parts=[Part(text=instructions)]) if instructions else None
|
188
|
-
)
|
179
|
+
instructions_content = Content(parts=[Part(text=instructions)]) if instructions else None
|
189
180
|
|
190
181
|
self._rt_sessions: list[GeminiRealtimeSession] = []
|
191
182
|
self._opts = ModelOptions(
|
192
183
|
model=model,
|
193
|
-
api_version=api_version,
|
194
184
|
api_key=self._api_key,
|
195
185
|
voice=voice,
|
196
186
|
enable_user_audio_transcription=enable_user_audio_transcription,
|
@@ -263,8 +253,6 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
|
|
263
253
|
self._fnc_ctx = fnc_ctx
|
264
254
|
self._fnc_tasks = utils.aio.TaskSet()
|
265
255
|
self._is_interrupted = False
|
266
|
-
self._playout_complete = asyncio.Event()
|
267
|
-
self._playout_complete.set()
|
268
256
|
|
269
257
|
tools = []
|
270
258
|
if self._fnc_ctx is not None:
|
@@ -285,32 +273,24 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
|
|
285
273
|
system_instruction=self._opts.instructions,
|
286
274
|
speech_config=SpeechConfig(
|
287
275
|
voice_config=VoiceConfig(
|
288
|
-
prebuilt_voice_config=PrebuiltVoiceConfig(
|
289
|
-
voice_name=self._opts.voice
|
290
|
-
)
|
276
|
+
prebuilt_voice_config=PrebuiltVoiceConfig(voice_name=self._opts.voice)
|
291
277
|
)
|
292
278
|
),
|
293
279
|
tools=tools,
|
294
280
|
)
|
295
281
|
self._client = genai.Client(
|
296
|
-
http_options=HttpOptions(api_version=self._opts.api_version),
|
282
|
+
http_options=HttpOptions(api_version="v1alpha"),
|
297
283
|
api_key=self._opts.api_key,
|
298
284
|
vertexai=self._opts.vertexai,
|
299
285
|
project=self._opts.project,
|
300
286
|
location=self._opts.location,
|
301
287
|
)
|
302
|
-
self._main_atask = asyncio.create_task(
|
303
|
-
self._main_task(), name="gemini-realtime-session"
|
304
|
-
)
|
288
|
+
self._main_atask = asyncio.create_task(self._main_task(), name="gemini-realtime-session")
|
305
289
|
if self._opts.enable_user_audio_transcription:
|
306
|
-
self._transcriber = TranscriberSession(
|
307
|
-
client=self._client, model=self._opts.model
|
308
|
-
)
|
290
|
+
self._transcriber = TranscriberSession(client=self._client, model=self._opts.model)
|
309
291
|
self._transcriber.on("input_speech_done", self._on_input_speech_done)
|
310
292
|
if self._opts.enable_agent_audio_transcription:
|
311
|
-
self._agent_transcriber = ModelTranscriber(
|
312
|
-
client=self._client, model=self._opts.model
|
313
|
-
)
|
293
|
+
self._agent_transcriber = ModelTranscriber(client=self._client, model=self._opts.model)
|
314
294
|
self._agent_transcriber.on("input_speech_done", self._on_agent_speech_done)
|
315
295
|
# init dummy task
|
316
296
|
self._init_sync_task = asyncio.create_task(asyncio.sleep(0))
|
@@ -324,10 +304,6 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
|
|
324
304
|
self._send_ch.close()
|
325
305
|
await self._main_atask
|
326
306
|
|
327
|
-
@property
|
328
|
-
def playout_complete(self) -> asyncio.Event | None:
|
329
|
-
return self._playout_complete
|
330
|
-
|
331
307
|
@property
|
332
308
|
def fnc_ctx(self) -> llm.FunctionContext | None:
|
333
309
|
return self._fnc_ctx
|
@@ -345,9 +321,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
|
|
345
321
|
DEFAULT_ENCODE_OPTIONS = images.EncodeOptions(
|
346
322
|
format="JPEG",
|
347
323
|
quality=75,
|
348
|
-
resize_options=images.ResizeOptions(
|
349
|
-
width=1024, height=1024, strategy="scale_aspect_fit"
|
350
|
-
),
|
324
|
+
resize_options=images.ResizeOptions(width=1024, height=1024, strategy="scale_aspect_fit"),
|
351
325
|
)
|
352
326
|
|
353
327
|
def push_video(
|
@@ -397,9 +371,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
|
|
397
371
|
|
398
372
|
def create_response(
|
399
373
|
self,
|
400
|
-
on_duplicate: Literal[
|
401
|
-
"cancel_existing", "cancel_new", "keep_both"
|
402
|
-
] = "keep_both",
|
374
|
+
on_duplicate: Literal["cancel_existing", "cancel_new", "keep_both"] = "keep_both",
|
403
375
|
) -> None:
|
404
376
|
turns, _ = _build_gemini_ctx(self._chat_ctx, id(self))
|
405
377
|
ctx = [self._opts.instructions] + turns if self._opts.instructions else turns
|
@@ -485,8 +457,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
|
|
485
457
|
data=part.inline_data.data,
|
486
458
|
sample_rate=24000,
|
487
459
|
num_channels=1,
|
488
|
-
samples_per_channel=len(part.inline_data.data)
|
489
|
-
// 2,
|
460
|
+
samples_per_channel=len(part.inline_data.data) // 2,
|
490
461
|
)
|
491
462
|
if self._opts.enable_agent_audio_transcription:
|
492
463
|
content.audio.append(frame)
|
@@ -529,12 +500,12 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
|
|
529
500
|
logger.warning(
|
530
501
|
"function call cancelled",
|
531
502
|
extra={
|
532
|
-
"function_call_ids": response.tool_call_cancellation.
|
503
|
+
"function_call_ids": response.tool_call_cancellation.function_call_ids,
|
533
504
|
},
|
534
505
|
)
|
535
506
|
self.emit(
|
536
507
|
"function_calls_cancelled",
|
537
|
-
response.tool_call_cancellation.
|
508
|
+
response.tool_call_cancellation.function_call_ids,
|
538
509
|
)
|
539
510
|
|
540
511
|
async with self._client.aio.live.connect(
|
@@ -6,12 +6,12 @@ from dataclasses import dataclass
|
|
6
6
|
from typing import Literal
|
7
7
|
|
8
8
|
import websockets
|
9
|
-
from livekit import rtc
|
10
|
-
from livekit.agents import APIConnectionError, APIStatusError, utils
|
11
9
|
|
12
10
|
from google import genai
|
13
11
|
from google.genai import types
|
14
12
|
from google.genai.errors import APIError, ClientError, ServerError
|
13
|
+
from livekit import rtc
|
14
|
+
from livekit.agents import APIConnectionError, APIStatusError, utils
|
15
15
|
|
16
16
|
from ...log import logger
|
17
17
|
from .api_proto import ClientEvents, LiveAPIModels
|
@@ -51,11 +51,9 @@ class TranscriberSession(utils.EventEmitter[EventTypes]):
|
|
51
51
|
self._needed_sr = 16000
|
52
52
|
self._closed = False
|
53
53
|
|
54
|
-
system_instructions = types.Content(
|
55
|
-
parts=[types.Part(text=SYSTEM_INSTRUCTIONS)]
|
56
|
-
)
|
54
|
+
system_instructions = types.Content(parts=[types.Part(text=SYSTEM_INSTRUCTIONS)])
|
57
55
|
self._config = types.LiveConnectConfig(
|
58
|
-
response_modalities=[
|
56
|
+
response_modalities=["TEXT"],
|
59
57
|
system_instruction=system_instructions,
|
60
58
|
generation_config=types.GenerationConfig(temperature=0.0),
|
61
59
|
)
|
@@ -81,17 +79,13 @@ class TranscriberSession(utils.EventEmitter[EventTypes]):
|
|
81
79
|
for f in self._resampler.push(frame):
|
82
80
|
self._queue_msg(
|
83
81
|
types.LiveClientRealtimeInput(
|
84
|
-
media_chunks=[
|
85
|
-
types.Blob(data=f.data.tobytes(), mime_type="audio/pcm")
|
86
|
-
]
|
82
|
+
media_chunks=[types.Blob(data=f.data.tobytes(), mime_type="audio/pcm")]
|
87
83
|
)
|
88
84
|
)
|
89
85
|
else:
|
90
86
|
self._queue_msg(
|
91
87
|
types.LiveClientRealtimeInput(
|
92
|
-
media_chunks=[
|
93
|
-
types.Blob(data=frame.data.tobytes(), mime_type="audio/pcm")
|
94
|
-
]
|
88
|
+
media_chunks=[types.Blob(data=frame.data.tobytes(), mime_type="audio/pcm")]
|
95
89
|
)
|
96
90
|
)
|
97
91
|
|
@@ -157,17 +151,11 @@ class TranscriberSession(utils.EventEmitter[EventTypes]):
|
|
157
151
|
logger.exception(f"Uncaught error in transcriber _recv_task: {e}")
|
158
152
|
self._closed = True
|
159
153
|
|
160
|
-
async with self._client.aio.live.connect(
|
161
|
-
model=self._model, config=self._config
|
162
|
-
) as session:
|
154
|
+
async with self._client.aio.live.connect(model=self._model, config=self._config) as session:
|
163
155
|
self._session = session
|
164
156
|
tasks = [
|
165
|
-
asyncio.create_task(
|
166
|
-
_send_task(), name="gemini-realtime-transcriber-send"
|
167
|
-
),
|
168
|
-
asyncio.create_task(
|
169
|
-
_recv_task(), name="gemini-realtime-transcriber-recv"
|
170
|
-
),
|
157
|
+
asyncio.create_task(_send_task(), name="gemini-realtime-transcriber-send"),
|
158
|
+
asyncio.create_task(_recv_task(), name="gemini-realtime-transcriber-recv"),
|
171
159
|
]
|
172
160
|
|
173
161
|
try:
|
@@ -187,9 +175,7 @@ class ModelTranscriber(utils.EventEmitter[EventTypes]):
|
|
187
175
|
self._client = client
|
188
176
|
self._model = model
|
189
177
|
self._needed_sr = 16000
|
190
|
-
self._system_instructions = types.Content(
|
191
|
-
parts=[types.Part(text=SYSTEM_INSTRUCTIONS)]
|
192
|
-
)
|
178
|
+
self._system_instructions = types.Content(parts=[types.Part(text=SYSTEM_INSTRUCTIONS)])
|
193
179
|
self._config = types.GenerateContentConfig(
|
194
180
|
temperature=0.0,
|
195
181
|
system_instruction=self._system_instructions,
|
@@ -198,9 +184,7 @@ class ModelTranscriber(utils.EventEmitter[EventTypes]):
|
|
198
184
|
self._resampler: rtc.AudioResampler | None = None
|
199
185
|
self._buffer: rtc.AudioFrame | None = None
|
200
186
|
self._audio_ch = utils.aio.Chan[rtc.AudioFrame]()
|
201
|
-
self._main_atask = asyncio.create_task(
|
202
|
-
self._main_task(), name="gemini-model-transcriber"
|
203
|
-
)
|
187
|
+
self._main_atask = asyncio.create_task(self._main_task(), name="gemini-model-transcriber")
|
204
188
|
|
205
189
|
async def aclose(self) -> None:
|
206
190
|
if self._audio_ch.closed:
|