livekit-plugins-google 1.0.0rc9__tar.gz → 1.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {livekit_plugins_google-1.0.0rc9 → livekit_plugins_google-1.0.2}/PKG-INFO +2 -2
- {livekit_plugins_google-1.0.0rc9 → livekit_plugins_google-1.0.2}/livekit/plugins/google/beta/realtime/api_proto.py +0 -3
- livekit_plugins_google-1.0.2/livekit/plugins/google/beta/realtime/realtime_api.py +550 -0
- {livekit_plugins_google-1.0.0rc9 → livekit_plugins_google-1.0.2}/livekit/plugins/google/stt.py +6 -12
- {livekit_plugins_google-1.0.0rc9 → livekit_plugins_google-1.0.2}/livekit/plugins/google/tts.py +4 -7
- {livekit_plugins_google-1.0.0rc9 → livekit_plugins_google-1.0.2}/livekit/plugins/google/utils.py +21 -3
- {livekit_plugins_google-1.0.0rc9 → livekit_plugins_google-1.0.2}/livekit/plugins/google/version.py +1 -1
- {livekit_plugins_google-1.0.0rc9 → livekit_plugins_google-1.0.2}/pyproject.toml +1 -1
- livekit_plugins_google-1.0.0rc9/livekit/plugins/google/beta/realtime/realtime_api.py +0 -569
- livekit_plugins_google-1.0.0rc9/livekit/plugins/google/beta/realtime/temp.py +0 -10
- livekit_plugins_google-1.0.0rc9/livekit/plugins/google/beta/realtime/transcriber.py +0 -254
- {livekit_plugins_google-1.0.0rc9 → livekit_plugins_google-1.0.2}/.gitignore +0 -0
- {livekit_plugins_google-1.0.0rc9 → livekit_plugins_google-1.0.2}/README.md +0 -0
- {livekit_plugins_google-1.0.0rc9 → livekit_plugins_google-1.0.2}/livekit/plugins/google/__init__.py +0 -0
- {livekit_plugins_google-1.0.0rc9 → livekit_plugins_google-1.0.2}/livekit/plugins/google/beta/__init__.py +0 -0
- {livekit_plugins_google-1.0.0rc9 → livekit_plugins_google-1.0.2}/livekit/plugins/google/beta/realtime/__init__.py +0 -0
- {livekit_plugins_google-1.0.0rc9 → livekit_plugins_google-1.0.2}/livekit/plugins/google/llm.py +0 -0
- {livekit_plugins_google-1.0.0rc9 → livekit_plugins_google-1.0.2}/livekit/plugins/google/log.py +0 -0
- {livekit_plugins_google-1.0.0rc9 → livekit_plugins_google-1.0.2}/livekit/plugins/google/models.py +0 -0
- {livekit_plugins_google-1.0.0rc9 → livekit_plugins_google-1.0.2}/livekit/plugins/google/py.typed +0 -0
{livekit_plugins_google-1.0.0rc9 → livekit_plugins_google-1.0.2}/PKG-INFO RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: livekit-plugins-google
-Version: 1.0.0rc9
+Version: 1.0.2
 Summary: Agent Framework plugin for services from Google Cloud
 Project-URL: Documentation, https://docs.livekit.io
 Project-URL: Website, https://livekit.io/
@@ -22,7 +22,7 @@ Requires-Dist: google-auth<3,>=2
 Requires-Dist: google-cloud-speech<3,>=2
 Requires-Dist: google-cloud-texttospeech<3,>=2
 Requires-Dist: google-genai==1.5.0
-Requires-Dist: livekit-agents>=1.0.0rc9
+Requires-Dist: livekit-agents>=1.0.2
 Description-Content-Type: text/markdown
 
 # LiveKit Plugins Google
{livekit_plugins_google-1.0.0rc9 → livekit_plugins_google-1.0.2}/livekit/plugins/google/beta/realtime/api_proto.py RENAMED
@@ -5,13 +5,10 @@ from typing import Literal, Union
 
 from google.genai import types
 
-# from ..._utils import _build_gemini_ctx, _build_tools
-
 LiveAPIModels = Literal["gemini-2.0-flash-exp"]
 
 Voice = Literal["Puck", "Charon", "Kore", "Fenrir", "Aoede"]
 
-# __all__ = ["_build_tools", "ClientEvents", "_build_gemini_ctx"]
 
 ClientEvents = Union[
     types.ContentListUnion,
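As a side note (illustrative snippet, not part of the diff), the `Literal` aliases kept above are what let a static type checker validate call sites; the import path below assumes the module layout shown in the file list:

# Illustrative only: Literal aliases constrain accepted values at type-check time.
from livekit.plugins.google.beta.realtime.api_proto import Voice


def pick_voice(voice: Voice) -> Voice:
    return voice


pick_voice("Puck")  # accepted: "Puck" is a member of the Voice literal
pick_voice("Alto")  # runs at runtime, but mypy/pyright reject it statically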
livekit_plugins_google-1.0.2/livekit/plugins/google/beta/realtime/realtime_api.py ADDED
@@ -0,0 +1,550 @@
+from __future__ import annotations
+
+import asyncio
+import json
+import os
+import weakref
+from dataclasses import dataclass
+
+from google import genai
+from google.genai._api_client import HttpOptions
+from google.genai.types import (
+    Blob,
+    Content,
+    FunctionDeclaration,
+    GenerationConfig,
+    LiveClientContent,
+    LiveClientRealtimeInput,
+    LiveConnectConfig,
+    LiveServerContent,
+    LiveServerToolCall,
+    LiveServerToolCallCancellation,
+    Modality,
+    Part,
+    PrebuiltVoiceConfig,
+    SpeechConfig,
+    Tool,
+    VoiceConfig,
+)
+from livekit import rtc
+from livekit.agents import llm, utils
+from livekit.agents.types import NOT_GIVEN, NotGivenOr
+from livekit.agents.utils import is_given
+
+from ...log import logger
+from ...utils import _build_gemini_fnc, get_tool_results_for_realtime, to_chat_ctx
+from .api_proto import ClientEvents, LiveAPIModels, Voice
+
+INPUT_AUDIO_SAMPLE_RATE = 16000
+OUTPUT_AUDIO_SAMPLE_RATE = 24000
+NUM_CHANNELS = 1
+
+
+@dataclass
+class InputTranscription:
+    item_id: str
+    transcript: str
+
+
+@dataclass
+class _RealtimeOptions:
+    model: LiveAPIModels | str
+    api_key: str | None
+    voice: Voice | str
+    response_modalities: NotGivenOr[list[Modality]]
+    vertexai: bool
+    project: str | None
+    location: str | None
+    candidate_count: int
+    temperature: NotGivenOr[float]
+    max_output_tokens: NotGivenOr[int]
+    top_p: NotGivenOr[float]
+    top_k: NotGivenOr[int]
+    presence_penalty: NotGivenOr[float]
+    frequency_penalty: NotGivenOr[float]
+    instructions: NotGivenOr[str]
+
+
+@dataclass
+class _MessageGeneration:
+    message_id: str
+    text_ch: utils.aio.Chan[str]
+    audio_ch: utils.aio.Chan[rtc.AudioFrame]
+
+
+@dataclass
+class _ResponseGeneration:
+    message_ch: utils.aio.Chan[llm.MessageGeneration]
+    function_ch: utils.aio.Chan[llm.FunctionCall]
+
+    messages: dict[str, _MessageGeneration]
+
+
+class RealtimeModel(llm.RealtimeModel):
+    def __init__(
+        self,
+        *,
+        instructions: NotGivenOr[str] = NOT_GIVEN,
+        model: LiveAPIModels | str = "gemini-2.0-flash-exp",
+        api_key: NotGivenOr[str] = NOT_GIVEN,
+        voice: Voice | str = "Puck",
+        modalities: NotGivenOr[list[Modality]] = NOT_GIVEN,
+        vertexai: bool = False,
+        project: NotGivenOr[str] = NOT_GIVEN,
+        location: NotGivenOr[str] = NOT_GIVEN,
+        candidate_count: int = 1,
+        temperature: NotGivenOr[float] = NOT_GIVEN,
+        max_output_tokens: NotGivenOr[int] = NOT_GIVEN,
+        top_p: NotGivenOr[float] = NOT_GIVEN,
+        top_k: NotGivenOr[int] = NOT_GIVEN,
+        presence_penalty: NotGivenOr[float] = NOT_GIVEN,
+        frequency_penalty: NotGivenOr[float] = NOT_GIVEN,
+    ) -> None:
+        """
+        Initializes a RealtimeModel instance for interacting with Google's Realtime API.
+
+        Environment Requirements:
+        - For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of the service account key file.
+        The Google Cloud project and location can be set via `project` and `location` arguments or the environment variables
+        `GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_LOCATION`. By default, the project is inferred from the service account key file,
+        and the location defaults to "us-central1".
+        - For Google Gemini API: Set the `api_key` argument or the `GOOGLE_API_KEY` environment variable.
+
+        Args:
+            instructions (str, optional): Initial system instructions for the model. Defaults to "".
+            api_key (str, optional): Google Gemini API key. If None, will attempt to read from the environment variable GOOGLE_API_KEY.
+            modalities (list[Modality], optional): Modalities to use, such as ["TEXT", "AUDIO"]. Defaults to ["AUDIO"].
+            model (str, optional): The name of the model to use. Defaults to "gemini-2.0-flash-exp".
+            voice (api_proto.Voice, optional): Voice setting for audio outputs. Defaults to "Puck".
+            temperature (float, optional): Sampling temperature for response generation. Defaults to 0.8.
+            vertexai (bool, optional): Whether to use VertexAI for the API. Defaults to False.
+            project (str, optional): The project id to use for the API. Defaults to None. (for vertexai)
+            location (str, optional): The location to use for the API. Defaults to None. (for vertexai)
+            candidate_count (int, optional): The number of candidate responses to generate. Defaults to 1.
+            top_p (float, optional): The top-p value for response generation
+            top_k (int, optional): The top-k value for response generation
+            presence_penalty (float, optional): The presence penalty for response generation
+            frequency_penalty (float, optional): The frequency penalty for response generation
+
+        Raises:
+            ValueError: If the API key is required but not found.
+        """  # noqa: E501
+        super().__init__(
+            capabilities=llm.RealtimeCapabilities(
+                message_truncation=False,
+                turn_detection=True,
+                user_transcription=False,
+            )
+        )
+
+        gemini_api_key = api_key if is_given(api_key) else os.environ.get("GOOGLE_API_KEY")
+        gcp_project = project if is_given(project) else os.environ.get("GOOGLE_CLOUD_PROJECT")
+        gcp_location = location if is_given(location) else os.environ.get("GOOGLE_CLOUD_LOCATION")
+        if vertexai:
+            if not gcp_project or not gcp_location:
+                raise ValueError(
+                    "Project and location are required for VertexAI either via project and location or GOOGLE_CLOUD_PROJECT and GOOGLE_CLOUD_LOCATION environment variables"  # noqa: E501
+                )
+            gemini_api_key = None  # VertexAI does not require an API key
+
+        else:
+            gcp_project = None
+            gcp_location = None
+            if not gemini_api_key:
+                raise ValueError(
+                    "API key is required for Google API either via api_key or GOOGLE_API_KEY environment variable"  # noqa: E501
+                )
+
+        self._opts = _RealtimeOptions(
+            model=model,
+            api_key=gemini_api_key,
+            voice=voice,
+            response_modalities=modalities,
+            vertexai=vertexai,
+            project=gcp_project,
+            location=gcp_location,
+            candidate_count=candidate_count,
+            temperature=temperature,
+            max_output_tokens=max_output_tokens,
+            top_p=top_p,
+            top_k=top_k,
+            presence_penalty=presence_penalty,
+            frequency_penalty=frequency_penalty,
+            instructions=instructions,
+        )
+
+        self._sessions = weakref.WeakSet[RealtimeSession]()
+
+    def session(self) -> RealtimeSession:
+        sess = RealtimeSession(self)
+        self._sessions.add(sess)
+        return sess
+
+    def update_options(
+        self, *, voice: NotGivenOr[str] = NOT_GIVEN, temperature: NotGivenOr[float] = NOT_GIVEN
+    ) -> None:
+        if is_given(voice):
+            self._opts.voice = voice
+
+        if is_given(temperature):
+            self._opts.temperature = temperature
+
+        for sess in self._sessions:
+            sess.update_options(voice=self._opts.voice, temperature=self._opts.temperature)
+
+    async def aclose(self) -> None: ...
+
+
+class RealtimeSession(llm.RealtimeSession):
+    def __init__(self, realtime_model: RealtimeModel) -> None:
+        super().__init__(realtime_model)
+        self._opts = realtime_model._opts
+        self._tools = llm.ToolContext.empty()
+        self._chat_ctx = llm.ChatContext.empty()
+        self._msg_ch = utils.aio.Chan[ClientEvents]()
+        self._gemini_tools: list[Tool] = []
+        self._client = genai.Client(
+            http_options=HttpOptions(api_version="v1alpha"),
+            api_key=self._opts.api_key,
+            vertexai=self._opts.vertexai,
+            project=self._opts.project,
+            location=self._opts.location,
+        )
+        self._main_atask = asyncio.create_task(self._main_task(), name="gemini-realtime-session")
+
+        self._current_generation: _ResponseGeneration | None = None
+
+        self._is_interrupted = False
+        self._active_response_id = None
+        self._session = None
+        self._update_chat_ctx_lock = asyncio.Lock()
+        self._update_fnc_ctx_lock = asyncio.Lock()
+        self._response_created_futures: dict[str, asyncio.Future[llm.GenerationCreatedEvent]] = {}
+        self._pending_generation_event_id = None
+
+        self._reconnect_event = asyncio.Event()
+        self._session_lock = asyncio.Lock()
+        self._gemini_close_task: asyncio.Task | None = None
+
+    def _schedule_gemini_session_close(self) -> None:
+        if self._session is not None:
+            self._gemini_close_task = asyncio.create_task(self._close_gemini_session())
+
+    async def _close_gemini_session(self) -> None:
+        async with self._session_lock:
+            if self._session:
+                try:
+                    await self._session.close()
+                finally:
+                    self._session = None
+
+    def update_options(
+        self,
+        *,
+        voice: NotGivenOr[str] = NOT_GIVEN,
+        tool_choice: NotGivenOr[llm.ToolChoice | None] = NOT_GIVEN,
+        temperature: NotGivenOr[float] = NOT_GIVEN,
+    ) -> None:
+        if is_given(voice):
+            self._opts.voice = voice
+
+        if is_given(temperature):
+            self._opts.temperature = temperature
+
+        if self._session:
+            logger.warning("Updating options; triggering Gemini session reconnect.")
+            self._reconnect_event.set()
+            self._schedule_gemini_session_close()
+
+    async def update_instructions(self, instructions: str) -> None:
+        self._opts.instructions = instructions
+        if self._session:
+            logger.warning("Updating instructions; triggering Gemini session reconnect.")
+            self._reconnect_event.set()
+            self._schedule_gemini_session_close()
+
+    async def update_chat_ctx(self, chat_ctx: llm.ChatContext) -> None:
+        async with self._update_chat_ctx_lock:
+            self._chat_ctx = chat_ctx
+            turns, _ = to_chat_ctx(self._chat_ctx, id(self), ignore_functions=True)
+            tool_results = get_tool_results_for_realtime(self._chat_ctx)
+            if turns:
+                self._msg_ch.send_nowait(LiveClientContent(turns=turns, turn_complete=False))
+            if tool_results:
+                self._msg_ch.send_nowait(tool_results)
+
+    async def update_tools(self, tools: list[llm.FunctionTool]) -> None:
+        async with self._update_fnc_ctx_lock:
+            retained_tools: list[llm.FunctionTool] = []
+            gemini_function_declarations: list[FunctionDeclaration] = []
+
+            for tool in tools:
+                gemini_function = _build_gemini_fnc(tool)
+                gemini_function_declarations.append(gemini_function)
+                retained_tools.append(tool)
+
+            self._tools = llm.ToolContext(retained_tools)
+            self._gemini_tools = [Tool(function_declarations=gemini_function_declarations)]
+            if self._session and gemini_function_declarations:
+                logger.warning("Updating tools; triggering Gemini session reconnect.")
+                self._reconnect_event.set()
+                self._schedule_gemini_session_close()
+
+    @property
+    def chat_ctx(self) -> llm.ChatContext:
+        return self._chat_ctx
+
+    @property
+    def tools(self) -> llm.ToolContext:
+        return self._tools
+
+    def push_audio(self, frame: rtc.AudioFrame) -> None:
+        realtime_input = LiveClientRealtimeInput(
+            media_chunks=[Blob(data=frame.data.tobytes(), mime_type="audio/pcm")],
+        )
+        self._msg_ch.send_nowait(realtime_input)
+
+    def generate_reply(
+        self, *, instructions: NotGivenOr[str] = NOT_GIVEN
+    ) -> asyncio.Future[llm.GenerationCreatedEvent]:
+        fut = asyncio.Future()
+
+        event_id = utils.shortuuid("gemini-response-")
+        self._response_created_futures[event_id] = fut
+        self._pending_generation_event_id = event_id
+
+        instructions_content = instructions if is_given(instructions) else "."
+        ctx = [Content(parts=[Part(text=instructions_content)], role="user")]
+        self._msg_ch.send_nowait(LiveClientContent(turns=ctx, turn_complete=True))
+
+        def _on_timeout() -> None:
+            if event_id in self._response_created_futures and not fut.done():
+                fut.set_exception(llm.RealtimeError("generate_reply timed out."))
+                self._response_created_futures.pop(event_id, None)
+                if self._pending_generation_event_id == event_id:
+                    self._pending_generation_event_id = None
+
+        handle = asyncio.get_event_loop().call_later(5.0, _on_timeout)
+        fut.add_done_callback(lambda _: handle.cancel())
+
+        return fut
+
+    def interrupt(self) -> None:
+        logger.warning("interrupt() - no direct cancellation in Gemini")
+
+    def truncate(self, *, message_id: str, audio_end_ms: int) -> None:
+        logger.warning(f"truncate(...) called for {message_id}, ignoring for Gemini")
+
+    async def aclose(self) -> None:
+        self._msg_ch.close()
+
+        for fut in self._response_created_futures.values():
+            if not fut.done():
+                fut.set_exception(llm.RealtimeError("Session closed"))
+
+        if self._main_atask:
+            await utils.aio.cancel_and_wait(self._main_atask)
+
+        if self._gemini_close_task:
+            await utils.aio.cancel_and_wait(self._gemini_close_task)
+
+    @utils.log_exceptions(logger=logger)
+    async def _main_task(self):
+        while True:
+            config = LiveConnectConfig(
+                response_modalities=self._opts.response_modalities
+                if is_given(self._opts.response_modalities)
+                else [Modality.AUDIO],
+                generation_config=GenerationConfig(
+                    candidate_count=self._opts.candidate_count,
+                    temperature=self._opts.temperature
+                    if is_given(self._opts.temperature)
+                    else None,
+                    max_output_tokens=self._opts.max_output_tokens
+                    if is_given(self._opts.max_output_tokens)
+                    else None,
+                    top_p=self._opts.top_p if is_given(self._opts.top_p) else None,
+                    top_k=self._opts.top_k if is_given(self._opts.top_k) else None,
+                    presence_penalty=self._opts.presence_penalty
+                    if is_given(self._opts.presence_penalty)
+                    else None,
+                    frequency_penalty=self._opts.frequency_penalty
+                    if is_given(self._opts.frequency_penalty)
+                    else None,
+                ),
+                system_instruction=Content(parts=[Part(text=self._opts.instructions)])
+                if is_given(self._opts.instructions)
+                else None,
+                speech_config=SpeechConfig(
+                    voice_config=VoiceConfig(
+                        prebuilt_voice_config=PrebuiltVoiceConfig(voice_name=self._opts.voice)
+                    )
+                ),
+                tools=self._gemini_tools,
+            )
+
+            async with self._client.aio.live.connect(
+                model=self._opts.model, config=config
+            ) as session:
+                async with self._session_lock:
+                    self._session = session
+
+                @utils.log_exceptions(logger=logger)
+                async def _send_task():
+                    async for msg in self._msg_ch:
+                        if isinstance(msg, LiveClientContent):
+                            await session.send(input=msg, end_of_turn=True)
+
+                        await session.send(input=msg)
+                    await session.send(input=".", end_of_turn=True)
+
+                @utils.log_exceptions(logger=logger)
+                async def _recv_task():
+                    while True:
+                        async for response in session.receive():
+                            if self._active_response_id is None:
+                                self._start_new_generation()
+                            if response.server_content:
+                                self._handle_server_content(response.server_content)
+                            if response.tool_call:
+                                self._handle_tool_calls(response.tool_call)
+                            if response.tool_call_cancellation:
+                                self._handle_tool_call_cancellation(response.tool_call_cancellation)
+
+                send_task = asyncio.create_task(_send_task(), name="gemini-realtime-send")
+                recv_task = asyncio.create_task(_recv_task(), name="gemini-realtime-recv")
+                reconnect_task = asyncio.create_task(
+                    self._reconnect_event.wait(), name="reconnect-wait"
+                )
+
+                try:
+                    done, _ = await asyncio.wait(
+                        [send_task, recv_task, reconnect_task],
+                        return_when=asyncio.FIRST_COMPLETED,
+                    )
+                    for task in done:
+                        if task != reconnect_task:
+                            task.result()
+
+                    if reconnect_task not in done:
+                        break
+
+                    self._reconnect_event.clear()
+                finally:
+                    await utils.aio.cancel_and_wait(send_task, recv_task, reconnect_task)
+
+    def _start_new_generation(self):
+        self._is_interrupted = False
+        self._active_response_id = utils.shortuuid("gemini-turn-")
+        self._current_generation = _ResponseGeneration(
+            message_ch=utils.aio.Chan[llm.MessageGeneration](),
+            function_ch=utils.aio.Chan[llm.FunctionCall](),
+            messages={},
+        )
+
+        # We'll assume each chunk belongs to a single message ID self._active_response_id
+        item_generation = _MessageGeneration(
+            message_id=self._active_response_id,
+            text_ch=utils.aio.Chan[str](),
+            audio_ch=utils.aio.Chan[rtc.AudioFrame](),
+        )
+
+        self._current_generation.message_ch.send_nowait(
+            llm.MessageGeneration(
+                message_id=self._active_response_id,
+                text_stream=item_generation.text_ch,
+                audio_stream=item_generation.audio_ch,
+            )
+        )
+
+        generation_event = llm.GenerationCreatedEvent(
+            message_stream=self._current_generation.message_ch,
+            function_stream=self._current_generation.function_ch,
+            user_initiated=False,
+        )
+
+        # Resolve any pending future from generate_reply()
+        if self._pending_generation_event_id and (
+            fut := self._response_created_futures.pop(self._pending_generation_event_id, None)
+        ):
+            fut.set_result(generation_event)
+
+        self._pending_generation_event_id = None
+        self.emit("generation_created", generation_event)
+
+        self._current_generation.messages[self._active_response_id] = item_generation
+
+    def _handle_server_content(self, server_content: LiveServerContent):
+        if not self._current_generation or not self._active_response_id:
+            logger.warning(
+                "gemini-realtime-session: No active response ID, skipping server content"
+            )
+            return
+
+        item_generation = self._current_generation.messages[self._active_response_id]
+
+        model_turn = server_content.model_turn
+        if model_turn:
+            for part in model_turn.parts:
+                if part.text:
+                    item_generation.text_ch.send_nowait(part.text)
+                if part.inline_data:
+                    frame_data = part.inline_data.data
+                    frame = rtc.AudioFrame(
+                        data=frame_data,
+                        sample_rate=OUTPUT_AUDIO_SAMPLE_RATE,
+                        num_channels=NUM_CHANNELS,
+                        samples_per_channel=len(frame_data) // 2,
+                    )
+                    item_generation.audio_ch.send_nowait(frame)
+
+        if server_content.interrupted or server_content.turn_complete:
+            self._finalize_response()
+
+    def _finalize_response(self) -> None:
+        if not self._current_generation:
+            return
+
+        for item_generation in self._current_generation.messages.values():
+            item_generation.text_ch.close()
+            item_generation.audio_ch.close()
+
+        self._current_generation.function_ch.close()
+        self._current_generation.message_ch.close()
+        self._current_generation = None
+        self._is_interrupted = True
+        self._active_response_id = None
+        self.emit("agent_speech_stopped")
+
+    def _handle_tool_calls(self, tool_call: LiveServerToolCall):
+        if not self._current_generation:
+            return
+        for fnc_call in tool_call.function_calls:
+            self._current_generation.function_ch.send_nowait(
+                llm.FunctionCall(
+                    call_id=fnc_call.id,
+                    name=fnc_call.name,
+                    arguments=json.dumps(fnc_call.args),
+                )
+            )
+        self._finalize_response()
+
+    def _handle_tool_call_cancellation(
+        self, tool_call_cancellation: LiveServerToolCallCancellation
+    ):
+        logger.warning(
+            "function call cancelled",
+            extra={
+                "function_call_ids": tool_call_cancellation.ids,
+            },
+        )
+        self.emit("function_calls_cancelled", tool_call_cancellation.ids)
+
+    def commit_audio(self) -> None:
+        raise NotImplementedError("commit_audio_buffer is not supported yet")
+
+    def clear_audio(self) -> None:
+        raise NotImplementedError("clear_audio is not supported yet")
+
+    def server_vad_enabled(self) -> bool:
+        return True
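For orientation, here is a minimal usage sketch of the new module. It is hypothetical: it assumes `RealtimeModel` is re-exported from `livekit.plugins.google.beta.realtime` (that package's `__init__.py` appears in the file list) and that `GOOGLE_API_KEY` is set.

# Hypothetical sketch, not part of the release: drive the new
# RealtimeModel/RealtimeSession pair added in this version.
import asyncio

from livekit.plugins.google.beta.realtime import RealtimeModel  # assumed re-export


async def main() -> None:
    model = RealtimeModel(instructions="You are a helpful assistant.", voice="Puck")
    session = model.session()  # tracked in the model's WeakSet for update_options()

    # generate_reply() returns a future that resolves once the session observes
    # the matching generation_created event (or errors after the 5 s timeout).
    generation_event = await session.generate_reply(instructions="Say hello.")
    print(generation_event)

    await session.aclose()
    await model.aclose()


asyncio.run(main())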
{livekit_plugins_google-1.0.0rc9 → livekit_plugins_google-1.0.2}/livekit/plugins/google/stt.py RENAMED
@@ -132,11 +132,11 @@ class STT(stt.STT):
         try:
             gauth_default()
         except DefaultCredentialsError:
-            raise ValueError(
+            raise ValueError(
                 "Application default credentials must be available "
                 "when using Google STT without explicitly passing "
                 "credentials through credentials_info or credentials_file."
-            )
+            ) from None
 
         if isinstance(languages, str):
             languages = [languages]
@@ -244,12 +244,9 @@ class STT(stt.STT):
 
             return _recognize_response_to_speech_event(raw)
         except DeadlineExceeded:
-            raise APITimeoutError()
+            raise APITimeoutError() from None
         except GoogleAPICallError as e:
-            raise APIStatusError(
-                e.message,
-                status_code=e.code or -1,
-            )
+            raise APIStatusError(e.message, status_code=e.code or -1) from None
         except Exception as e:
             raise APIConnectionError() from e
 
@@ -495,12 +492,9 @@ class SpeechStream(stt.SpeechStream):
             await utils.aio.gracefully_cancel(process_stream_task, wait_reconnect_task)
             should_stop.set()
         except DeadlineExceeded:
-            raise APITimeoutError()
+            raise APITimeoutError() from None
         except GoogleAPICallError as e:
-            raise APIStatusError(
-                e.message,
-                status_code=e.code or -1,
-            )
+            raise APIStatusError(e.message, status_code=e.code or -1) from None
         except Exception as e:
             raise APIConnectionError() from e
 
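The recurring change in this file is appending `from None` to re-raised exceptions. A standalone demonstration of the effect, unrelated to the plugin's API:

# `raise ... from None` suppresses implicit exception chaining: callers see only
# the domain-level error, not "During handling of the above exception, another
# exception occurred:" followed by the original traceback.
def parse_port(raw: str) -> int:
    try:
        return int(raw)
    except ValueError:
        raise RuntimeError(f"invalid port: {raw!r}") from None


parse_port("http")  # traceback shows only the RuntimeError, not the ValueError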
{livekit_plugins_google-1.0.0rc9 → livekit_plugins_google-1.0.2}/livekit/plugins/google/tts.py RENAMED
@@ -203,14 +203,11 @@ class ChunkedStream(tts.ChunkedStream):
             await decoder.aclose()
 
         except DeadlineExceeded:
-            raise APITimeoutError()
+            raise APITimeoutError() from None
         except GoogleAPICallError as e:
-            raise APIStatusError(
-                e.message,
-                status_code=e.code or -1,
-                request_id=None,
-                body=None,
-            )
+            raise APIStatusError(
+                e.message, status_code=e.code or -1, request_id=None, body=None
+            ) from None
         except Exception as e:
             raise APIConnectionError() from e
 
{livekit_plugins_google-1.0.0rc9 → livekit_plugins_google-1.0.2}/livekit/plugins/google/utils.py RENAMED
@@ -20,8 +20,26 @@ def to_fnc_ctx(fncs: list[FunctionTool]) -> list[types.FunctionDeclaration]:
     return [_build_gemini_fnc(fnc) for fnc in fncs]
 
 
+def get_tool_results_for_realtime(chat_ctx: llm.ChatContext) -> types.LiveClientToolResponse | None:
+    function_responses: list[types.FunctionResponse] = []
+    for msg in chat_ctx.items:
+        if msg.type == "function_call_output":
+            function_responses.append(
+                types.FunctionResponse(
+                    id=msg.call_id,
+                    name=msg.name,
+                    response={"text": msg.output},
+                )
+            )
+    return (
+        types.LiveClientToolResponse(function_responses=function_responses)
+        if function_responses
+        else None
+    )
+
+
 def to_chat_ctx(
-    chat_ctx: llm.ChatContext, cache_key: Any
+    chat_ctx: llm.ChatContext, cache_key: Any, ignore_functions: bool = False
 ) -> tuple[list[types.Content], types.Content | None]:
     turns: list[types.Content] = []
     system_instruction: types.Content | None = None
@@ -59,7 +77,7 @@ def to_chat_ctx(
                 parts.append(types.Part(text=json.dumps(content)))
             elif isinstance(content, llm.ImageContent):
                 parts.append(_to_image_part(content, cache_key))
-        elif msg.type == "function_call":
+        elif msg.type == "function_call" and not ignore_functions:
            parts.append(
                 types.Part(
                     function_call=types.FunctionCall(
@@ -68,7 +86,7 @@
                     )
                 )
             )
-        elif msg.type == "function_call_output":
+        elif msg.type == "function_call_output" and not ignore_functions:
             parts.append(
                 types.Part(
                     function_response=types.FunctionResponse(