livekit-plugins-google 1.0.11__py3-none-any.whl → 1.0.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/google/beta/realtime/realtime_api.py +61 -5
- livekit/plugins/google/llm.py +4 -1
- livekit/plugins/google/version.py +1 -1
- {livekit_plugins_google-1.0.11.dist-info → livekit_plugins_google-1.0.13.dist-info}/METADATA +3 -3
- {livekit_plugins_google-1.0.11.dist-info → livekit_plugins_google-1.0.13.dist-info}/RECORD +6 -6
- {livekit_plugins_google-1.0.11.dist-info → livekit_plugins_google-1.0.13.dist-info}/WHEEL +0 -0
@@ -7,8 +7,8 @@ import weakref
|
|
7
7
|
from dataclasses import dataclass
|
8
8
|
|
9
9
|
from google import genai
|
10
|
-
from google.genai._api_client import HttpOptions
|
11
10
|
from google.genai.types import (
|
11
|
+
AudioTranscriptionConfig,
|
12
12
|
Blob,
|
13
13
|
Content,
|
14
14
|
FunctionDeclaration,
|
@@ -17,6 +17,7 @@ from google.genai.types import (
|
|
17
17
|
LiveClientRealtimeInput,
|
18
18
|
LiveConnectConfig,
|
19
19
|
LiveServerContent,
|
20
|
+
LiveServerGoAway,
|
20
21
|
LiveServerToolCall,
|
21
22
|
LiveServerToolCallCancellation,
|
22
23
|
Modality,
|
@@ -24,12 +25,13 @@ from google.genai.types import (
|
|
24
25
|
PrebuiltVoiceConfig,
|
25
26
|
SpeechConfig,
|
26
27
|
Tool,
|
28
|
+
UsageMetadata,
|
27
29
|
VoiceConfig,
|
28
30
|
)
|
29
31
|
from livekit import rtc
|
30
32
|
from livekit.agents import llm, utils
|
31
33
|
from livekit.agents.types import NOT_GIVEN, NotGivenOr
|
32
|
-
from livekit.agents.utils import is_given
|
34
|
+
from livekit.agents.utils import images, is_given
|
33
35
|
|
34
36
|
from ...log import logger
|
35
37
|
from ...utils import _build_gemini_fnc, get_tool_results_for_realtime, to_chat_ctx
|
@@ -39,6 +41,12 @@ INPUT_AUDIO_SAMPLE_RATE = 16000
|
|
39
41
|
OUTPUT_AUDIO_SAMPLE_RATE = 24000
|
40
42
|
NUM_CHANNELS = 1
|
41
43
|
|
44
|
+
DEFAULT_ENCODE_OPTIONS = images.EncodeOptions(
|
45
|
+
format="JPEG",
|
46
|
+
quality=75,
|
47
|
+
resize_options=images.ResizeOptions(width=1024, height=1024, strategy="scale_aspect_fit"),
|
48
|
+
)
|
49
|
+
|
42
50
|
|
43
51
|
@dataclass
|
44
52
|
class InputTranscription:
|
@@ -63,6 +71,8 @@ class _RealtimeOptions:
|
|
63
71
|
presence_penalty: NotGivenOr[float]
|
64
72
|
frequency_penalty: NotGivenOr[float]
|
65
73
|
instructions: NotGivenOr[str]
|
74
|
+
input_audio_transcription: AudioTranscriptionConfig | None
|
75
|
+
output_audio_transcription: AudioTranscriptionConfig | None
|
66
76
|
|
67
77
|
|
68
78
|
@dataclass
|
@@ -99,6 +109,8 @@ class RealtimeModel(llm.RealtimeModel):
|
|
99
109
|
top_k: NotGivenOr[int] = NOT_GIVEN,
|
100
110
|
presence_penalty: NotGivenOr[float] = NOT_GIVEN,
|
101
111
|
frequency_penalty: NotGivenOr[float] = NOT_GIVEN,
|
112
|
+
input_audio_transcription: NotGivenOr[AudioTranscriptionConfig | None] = NOT_GIVEN,
|
113
|
+
output_audio_transcription: NotGivenOr[AudioTranscriptionConfig | None] = NOT_GIVEN,
|
102
114
|
) -> None:
|
103
115
|
"""
|
104
116
|
Initializes a RealtimeModel instance for interacting with Google's Realtime API.
|
@@ -125,6 +137,8 @@ class RealtimeModel(llm.RealtimeModel):
|
|
125
137
|
top_k (int, optional): The top-k value for response generation
|
126
138
|
presence_penalty (float, optional): The presence penalty for response generation
|
127
139
|
frequency_penalty (float, optional): The frequency penalty for response generation
|
140
|
+
input_audio_transcription (AudioTranscriptionConfig | None, optional): The configuration for input audio transcription. Defaults to None.
|
141
|
+
output_audio_transcription (AudioTranscriptionConfig | None, optional): The configuration for output audio transcription. Defaults to AudioTranscriptionConfig().
|
128
142
|
|
129
143
|
Raises:
|
130
144
|
ValueError: If the API key is required but not found.
|
@@ -155,6 +169,11 @@ class RealtimeModel(llm.RealtimeModel):
|
|
155
169
|
"API key is required for Google API either via api_key or GOOGLE_API_KEY environment variable" # noqa: E501
|
156
170
|
)
|
157
171
|
|
172
|
+
if not is_given(input_audio_transcription):
|
173
|
+
input_audio_transcription = None
|
174
|
+
if not is_given(output_audio_transcription):
|
175
|
+
output_audio_transcription = AudioTranscriptionConfig()
|
176
|
+
|
158
177
|
self._opts = _RealtimeOptions(
|
159
178
|
model=model,
|
160
179
|
api_key=gemini_api_key,
|
@@ -171,6 +190,8 @@ class RealtimeModel(llm.RealtimeModel):
|
|
171
190
|
presence_penalty=presence_penalty,
|
172
191
|
frequency_penalty=frequency_penalty,
|
173
192
|
instructions=instructions,
|
193
|
+
input_audio_transcription=input_audio_transcription,
|
194
|
+
output_audio_transcription=output_audio_transcription,
|
174
195
|
)
|
175
196
|
|
176
197
|
self._sessions = weakref.WeakSet[RealtimeSession]()
|
@@ -204,7 +225,6 @@ class RealtimeSession(llm.RealtimeSession):
|
|
204
225
|
self._msg_ch = utils.aio.Chan[ClientEvents]()
|
205
226
|
self._gemini_tools: list[Tool] = []
|
206
227
|
self._client = genai.Client(
|
207
|
-
http_options=HttpOptions(api_version="v1alpha"),
|
208
228
|
api_key=self._opts.api_key,
|
209
229
|
vertexai=self._opts.vertexai,
|
210
230
|
project=self._opts.project,
|
@@ -299,8 +319,15 @@ class RealtimeSession(llm.RealtimeSession):
|
|
299
319
|
return self._tools
|
300
320
|
|
301
321
|
def push_audio(self, frame: rtc.AudioFrame) -> None:
|
322
|
+
self.push_media(frame.data.tobytes(), "audio/pcm")
|
323
|
+
|
324
|
+
def push_video(self, frame: rtc.VideoFrame) -> None:
|
325
|
+
encoded_data = images.encode(frame, DEFAULT_ENCODE_OPTIONS)
|
326
|
+
self.push_media(encoded_data, "image/jpeg")
|
327
|
+
|
328
|
+
def push_media(self, bytes: bytes, mime_type: str) -> None:
|
302
329
|
realtime_input = LiveClientRealtimeInput(
|
303
|
-
media_chunks=[Blob(data=frame.data.tobytes(), mime_type="audio/pcm")]
|
330
|
+
media_chunks=[Blob(data=bytes, mime_type=mime_type)]
|
304
331
|
)
|
305
332
|
self._msg_ch.send_nowait(realtime_input)
|
306
333
|
|
@@ -381,6 +408,8 @@ class RealtimeSession(llm.RealtimeSession):
|
|
381
408
|
)
|
382
409
|
),
|
383
410
|
tools=self._gemini_tools,
|
411
|
+
input_audio_transcription=self._opts.input_audio_transcription,
|
412
|
+
output_audio_transcription=self._opts.output_audio_transcription,
|
384
413
|
)
|
385
414
|
|
386
415
|
async with self._client.aio.live.connect(
|
@@ -404,12 +433,18 @@ class RealtimeSession(llm.RealtimeSession):
|
|
404
433
|
async for response in session.receive():
|
405
434
|
if self._active_response_id is None:
|
406
435
|
self._start_new_generation()
|
436
|
+
if response.setup_complete:
|
437
|
+
logger.info("connection established with gemini live api server")
|
407
438
|
if response.server_content:
|
408
439
|
self._handle_server_content(response.server_content)
|
409
440
|
if response.tool_call:
|
410
441
|
self._handle_tool_calls(response.tool_call)
|
411
442
|
if response.tool_call_cancellation:
|
412
443
|
self._handle_tool_call_cancellation(response.tool_call_cancellation)
|
444
|
+
if response.usage_metadata:
|
445
|
+
self._handle_usage_metadata(response.usage_metadata)
|
446
|
+
if response.go_away:
|
447
|
+
self._handle_go_away(response.go_away)
|
413
448
|
|
414
449
|
send_task = asyncio.create_task(_send_task(), name="gemini-realtime-send")
|
415
450
|
recv_task = asyncio.create_task(_recv_task(), name="gemini-realtime-recv")
|
@@ -497,6 +532,17 @@ class RealtimeSession(llm.RealtimeSession):
|
|
497
532
|
samples_per_channel=len(frame_data) // 2,
|
498
533
|
)
|
499
534
|
item_generation.audio_ch.send_nowait(frame)
|
535
|
+
input_transcription = server_content.input_transcription
|
536
|
+
if input_transcription and input_transcription.text:
|
537
|
+
self.emit(
|
538
|
+
"input_audio_transcription_completed",
|
539
|
+
llm.InputTranscriptionCompleted(
|
540
|
+
item_id=self._active_response_id, transcript=input_transcription.text
|
541
|
+
),
|
542
|
+
)
|
543
|
+
output_transcription = server_content.output_transcription
|
544
|
+
if output_transcription and output_transcription.text:
|
545
|
+
item_generation.text_ch.send_nowait(output_transcription.text)
|
500
546
|
|
501
547
|
if server_content.interrupted or server_content.turn_complete:
|
502
548
|
self._finalize_response()
|
@@ -522,7 +568,7 @@ class RealtimeSession(llm.RealtimeSession):
|
|
522
568
|
for fnc_call in tool_call.function_calls:
|
523
569
|
self._current_generation.function_ch.send_nowait(
|
524
570
|
llm.FunctionCall(
|
525
|
-
call_id=fnc_call.id,
|
571
|
+
call_id=fnc_call.id or "",
|
526
572
|
name=fnc_call.name,
|
527
573
|
arguments=json.dumps(fnc_call.args),
|
528
574
|
)
|
@@ -540,6 +586,16 @@ class RealtimeSession(llm.RealtimeSession):
|
|
540
586
|
)
|
541
587
|
self.emit("function_calls_cancelled", tool_call_cancellation.ids)
|
542
588
|
|
589
|
+
def _handle_usage_metadata(self, usage_metadata: UsageMetadata):
|
590
|
+
# todo: handle metrics
|
591
|
+
logger.info("Usage metadata", extra={"usage_metadata": usage_metadata})
|
592
|
+
|
593
|
+
def _handle_go_away(self, go_away: LiveServerGoAway):
|
594
|
+
# should we reconnect?
|
595
|
+
logger.warning(
|
596
|
+
f"gemini live api server will soon disconnect. time left: {go_away.time_left}"
|
597
|
+
)
|
598
|
+
|
543
599
|
def commit_audio(self) -> None:
|
544
600
|
raise NotImplementedError("commit_audio_buffer is not supported yet")
|
545
601
|
|
livekit/plugins/google/llm.py
CHANGED
@@ -26,6 +26,7 @@ from google.genai import types
|
|
26
26
|
from google.genai.errors import APIError, ClientError, ServerError
|
27
27
|
from livekit.agents import APIConnectionError, APIStatusError, llm, utils
|
28
28
|
from livekit.agents.llm import FunctionTool, ToolChoice, utils as llm_utils
|
29
|
+
from livekit.agents.llm.tool_context import get_function_info
|
29
30
|
from livekit.agents.types import (
|
30
31
|
DEFAULT_API_CONNECT_OPTIONS,
|
31
32
|
NOT_GIVEN,
|
@@ -173,7 +174,9 @@ class LLM(llm.LLM):
|
|
173
174
|
gemini_tool_choice = types.ToolConfig(
|
174
175
|
function_calling_config=types.FunctionCallingConfig(
|
175
176
|
mode="ANY",
|
176
|
-
allowed_function_names=[fnc.name for fnc in tools]
|
177
|
+
allowed_function_names=[get_function_info(fnc).name for fnc in tools]
|
178
|
+
if tools
|
179
|
+
else None,
|
177
180
|
)
|
178
181
|
)
|
179
182
|
extra["tool_config"] = gemini_tool_choice
|
{livekit_plugins_google-1.0.11.dist-info → livekit_plugins_google-1.0.13.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: livekit-plugins-google
|
3
|
-
Version: 1.0.11
|
3
|
+
Version: 1.0.13
|
4
4
|
Summary: Agent Framework plugin for services from Google Cloud
|
5
5
|
Project-URL: Documentation, https://docs.livekit.io
|
6
6
|
Project-URL: Website, https://livekit.io/
|
@@ -21,8 +21,8 @@ Requires-Python: >=3.9.0
|
|
21
21
|
Requires-Dist: google-auth<3,>=2
|
22
22
|
Requires-Dist: google-cloud-speech<3,>=2
|
23
23
|
Requires-Dist: google-cloud-texttospeech<3,>=2
|
24
|
-
Requires-Dist: google-genai
|
25
|
-
Requires-Dist: livekit-agents>=1.0.11
|
24
|
+
Requires-Dist: google-genai>=1.10.0
|
25
|
+
Requires-Dist: livekit-agents>=1.0.13
|
26
26
|
Description-Content-Type: text/markdown
|
27
27
|
|
28
28
|
# LiveKit Plugins Google
|
@@ -1,16 +1,16 @@
|
|
1
1
|
livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
|
2
|
-
livekit/plugins/google/llm.py,sha256=
|
2
|
+
livekit/plugins/google/llm.py,sha256=yAm-to2ItTJ7dAHc-2mlPeI0Npz9ZxRdyuRLV8PINqg,14888
|
3
3
|
livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
|
4
4
|
livekit/plugins/google/models.py,sha256=SGjAumdDK97NNLwMFcqZdKR68f1NoGB2Rk1UP2-imG0,1457
|
5
5
|
livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
6
|
livekit/plugins/google/stt.py,sha256=AG_lh2fuuduJi0jFbA_QKFXLJ6NUdF1W_FfkLUJML_Q,22413
|
7
7
|
livekit/plugins/google/tts.py,sha256=P8Zu2s0TfmyzlrNxzDIqyn3sGiNSW0n3nB_JlO_ojiM,7985
|
8
8
|
livekit/plugins/google/utils.py,sha256=pbLSOAdQxInWhgI2Yhsrr9KvgvpFXYDdU2yx2p03pFg,9437
|
9
|
-
livekit/plugins/google/version.py,sha256=
|
9
|
+
livekit/plugins/google/version.py,sha256=i9Tq4ZlIN5uba7xHRxp31dxAE9NuzqobM8zWhdM4QgA,601
|
10
10
|
livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
|
11
11
|
livekit/plugins/google/beta/realtime/__init__.py,sha256=_fW2NMN22F-hnQ4xAJ_g5lPbR7CvM_xXzSWlUQY-E-U,188
|
12
12
|
livekit/plugins/google/beta/realtime/api_proto.py,sha256=cwpFOYjN_3v5PMY0TnzoHhJoASfZ7Qt9IO281ZhJ7Ww,565
|
13
|
-
livekit/plugins/google/beta/realtime/realtime_api.py,sha256=
|
14
|
-
livekit_plugins_google-1.0.
|
15
|
-
livekit_plugins_google-1.0.
|
16
|
-
livekit_plugins_google-1.0.
|
13
|
+
livekit/plugins/google/beta/realtime/realtime_api.py,sha256=JBEEOeTl6gv6Fe6GtYJjj9C-dqvfhWpOzNAa0tnTKgM,25002
|
14
|
+
livekit_plugins_google-1.0.13.dist-info/METADATA,sha256=u8ocRjsu24AzO_FRgqYZzDqc3gKnQGp1hprKBc3RFm4,3492
|
15
|
+
livekit_plugins_google-1.0.13.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
16
|
+
livekit_plugins_google-1.0.13.dist-info/RECORD,,
|
File without changes
|