livekit-plugins-google 0.8.1__py3-none-any.whl → 0.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/google/__init__.py +2 -2
- livekit/plugins/google/beta/__init__.py +3 -0
- livekit/plugins/google/beta/realtime/__init__.py +15 -0
- livekit/plugins/google/beta/realtime/api_proto.py +79 -0
- livekit/plugins/google/beta/realtime/realtime_api.py +424 -0
- livekit/plugins/google/stt.py +68 -11
- livekit/plugins/google/version.py +1 -1
- {livekit_plugins_google-0.8.1.dist-info → livekit_plugins_google-0.9.1.dist-info}/METADATA +19 -3
- livekit_plugins_google-0.9.1.dist-info/RECORD +15 -0
- {livekit_plugins_google-0.8.1.dist-info → livekit_plugins_google-0.9.1.dist-info}/WHEEL +1 -1
- livekit_plugins_google-0.8.1.dist-info/RECORD +0 -11
- {livekit_plugins_google-0.8.1.dist-info → livekit_plugins_google-0.9.1.dist-info}/top_level.txt +0 -0
@@ -12,12 +12,12 @@
|
|
12
12
|
# See the License for the specific language governing permissions and
|
13
13
|
# limitations under the License.
|
14
14
|
|
15
|
+
from . import beta
|
15
16
|
from .stt import STT, SpeechStream
|
16
17
|
from .tts import TTS
|
17
18
|
from .version import __version__
|
18
19
|
|
19
|
-
__all__ = ["STT", "TTS", "SpeechStream", "__version__"]
|
20
|
-
|
20
|
+
__all__ = ["STT", "TTS", "SpeechStream", "__version__", "beta"]
|
21
21
|
from livekit.agents import Plugin
|
22
22
|
|
23
23
|
from .log import logger
|
@@ -0,0 +1,79 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import inspect
|
4
|
+
from typing import Any, Dict, List, Literal, Sequence, Union
|
5
|
+
|
6
|
+
from google.genai import types # type: ignore
|
7
|
+
|
8
|
+
# Model name known to this module as a literal.
LiveAPIModels = Literal["gemini-2.0-flash-exp"]

# Names of the prebuilt voices that can be requested for audio output.
Voice = Literal["Puck", "Charon", "Kore", "Fenrir", "Aoede"]
# Output modality requested from the model (upper-case, as written here).
ResponseModality = Literal["AUDIO", "TEXT"]


# Union of every google.genai payload type this module treats as a
# client-to-server message.
ClientEvents = Union[
    types.ContentListUnion,
    types.ContentListUnionDict,
    types.LiveClientContentOrDict,
    types.LiveClientRealtimeInput,
    types.LiveClientRealtimeInputOrDict,
    types.LiveClientToolResponseOrDict,
    types.FunctionResponseOrDict,
    Sequence[types.FunctionResponseOrDict],
]
|
24
|
+
|
25
|
+
|
26
|
+
# Maps the Python types supported for function arguments to the type names
# used in the generated parameter schema. Types missing from this table are
# rejected by _build_parameters with a ValueError.
JSON_SCHEMA_TYPE_MAP = {
    str: "string",
    int: "integer",
    float: "number",
    bool: "boolean",
    dict: "object",
    list: "array",
}
|
34
|
+
|
35
|
+
|
36
|
+
def _build_parameters(arguments: Dict[str, Any]) -> types.SchemaDict:
    """Build an object schema describing a function's arguments.

    Args:
        arguments: Mapping of argument name to an info object exposing
            ``type``, ``description``, ``choices`` and ``default``.

    Returns:
        A schema dict of type ``"object"`` with one property per argument.
        A ``"required"`` list is included only when at least one argument
        has no default.

    Raises:
        ValueError: If an argument's type is not in ``JSON_SCHEMA_TYPE_MAP``.
    """
    schema_props: Dict[str, types.SchemaDict] = {}
    mandatory: List[str] = []

    for name, info in arguments.items():
        py_type = info.type
        json_type = JSON_SCHEMA_TYPE_MAP.get(py_type)
        if json_type is None:
            raise ValueError(f"Unsupported type: {py_type}")

        entry: types.SchemaDict = {
            "type": json_type,
            "description": info.description,
        }
        # Only emit an enum when the argument actually restricts its values.
        if info.choices:
            entry["enum"] = info.choices
        schema_props[name] = entry

        # An argument without a default must be supplied by the caller.
        if info.default is inspect.Parameter.empty:
            mandatory.append(name)

    schema: types.SchemaDict = {"type": "object", "properties": schema_props}
    if mandatory:
        schema["required"] = mandatory
    return schema
|
64
|
+
|
65
|
+
|
66
|
+
def _build_tools(fnc_ctx: Any) -> List[types.FunctionDeclarationDict]:
    """Turn every AI function in ``fnc_ctx`` into a function declaration.

    Args:
        fnc_ctx: Object exposing an ``ai_functions`` mapping whose values
            carry ``name``, ``description`` and ``arguments``.

    Returns:
        One declaration dict (name, description, parameters) per function,
        in the iteration order of ``fnc_ctx.ai_functions``.
    """

    def _declare(fn: Any) -> types.FunctionDeclarationDict:
        # Build the parameter schema first so an unsupported argument type
        # fails before anything else is read from the function info.
        schema = _build_parameters(fn.arguments)
        return {
            "name": fn.name,
            "description": fn.description,
            "parameters": schema,
        }

    return [_declare(fn) for fn in fnc_ctx.ai_functions.values()]
|
@@ -0,0 +1,424 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import asyncio
|
4
|
+
import base64
|
5
|
+
import json
|
6
|
+
import os
|
7
|
+
from dataclasses import dataclass
|
8
|
+
from typing import AsyncIterable, Literal
|
9
|
+
|
10
|
+
from livekit import rtc
|
11
|
+
from livekit.agents import llm, utils
|
12
|
+
from livekit.agents.llm.function_context import _create_ai_function_info
|
13
|
+
|
14
|
+
from google import genai # type: ignore
|
15
|
+
from google.genai.types import ( # type: ignore
|
16
|
+
FunctionResponse,
|
17
|
+
GenerationConfigDict,
|
18
|
+
LiveClientToolResponse,
|
19
|
+
LiveConnectConfigDict,
|
20
|
+
PrebuiltVoiceConfig,
|
21
|
+
SpeechConfig,
|
22
|
+
VoiceConfig,
|
23
|
+
)
|
24
|
+
|
25
|
+
from ...log import logger
|
26
|
+
from .api_proto import (
|
27
|
+
ClientEvents,
|
28
|
+
LiveAPIModels,
|
29
|
+
ResponseModality,
|
30
|
+
Voice,
|
31
|
+
_build_tools,
|
32
|
+
)
|
33
|
+
|
34
|
+
# Event names used to parameterize the session's event emitter
# (GeminiRealtimeSession derives from utils.EventEmitter[EventTypes]).
EventTypes = Literal[
    "start_session",
    "input_speech_started",
    "response_content_added",
    "response_content_done",
    "function_calls_collected",
    "function_calls_finished",
    "function_calls_cancelled",
]
|
43
|
+
|
44
|
+
|
45
|
+
@dataclass
class GeminiContent:
    """Container for one model response, passed to listeners with the response events."""

    # Identifier of the response this content belongs to.
    response_id: str
    # Identifier of the item within the response.
    item_id: str
    # Position of the item in the response output.
    output_index: int
    # Position of this content within the item.
    content_index: int
    # Text of the response.
    text: str
    # Audio frames of the response.
    audio: list[rtc.AudioFrame]
    # Async stream of text pieces.
    text_stream: AsyncIterable[str]
    # Async stream of audio frames.
    audio_stream: AsyncIterable[rtc.AudioFrame]
    # Kind of content carried by this object.
    content_type: Literal["text", "audio"]
|
56
|
+
|
57
|
+
|
58
|
+
@dataclass
class Capabilities:
    """Feature flags advertised by a realtime model."""

    # Whether the model supports truncation.
    supports_truncate: bool
|
61
|
+
|
62
|
+
|
63
|
+
@dataclass
class ModelOptions:
    """Settings bundle handed from the model object to each realtime session."""

    # Model name (a known literal or any other string).
    model: LiveAPIModels | str
    # API key, or None when none is supplied.
    api_key: str | None
    # Voice name for audio output.
    voice: Voice | str
    # Requested output modality ("AUDIO" or "TEXT").
    response_modalities: ResponseModality
    # Whether Vertex AI is used.
    vertexai: bool
    # Project and location (relevant to Vertex AI), or None.
    project: str | None
    location: str | None
    # Generation settings; None leaves the corresponding setting unspecified.
    candidate_count: int
    temperature: float | None
    max_output_tokens: int | None
    top_p: float | None
    top_k: int | None
    presence_penalty: float | None
    frequency_penalty: float | None
    # System instructions for the model.
    instructions: str
|
80
|
+
|
81
|
+
|
82
|
+
class RealtimeModel:
    """Factory for Gemini realtime sessions that share one set of options."""

    def __init__(
        self,
        *,
        instructions: str = "",
        model: LiveAPIModels | str = "gemini-2.0-flash-exp",
        api_key: str | None = None,
        voice: Voice | str = "Puck",
        modalities: ResponseModality = "AUDIO",
        vertexai: bool = False,
        project: str | None = None,
        location: str | None = None,
        candidate_count: int = 1,
        temperature: float | None = None,
        max_output_tokens: int | None = None,
        top_p: float | None = None,
        top_k: int | None = None,
        presence_penalty: float | None = None,
        frequency_penalty: float | None = None,
        loop: asyncio.AbstractEventLoop | None = None,
    ):
        """
        Initializes a RealtimeModel instance for interacting with Google's Realtime API.

        Args:
            instructions (str, optional): Initial system instructions for the model. Defaults to "".
            model (str, optional): The name of the model to use. Defaults to "gemini-2.0-flash-exp".
            api_key (str or None, optional): Google Gemini API key. If None, falls back to the
                GOOGLE_API_KEY environment variable.
            voice (api_proto.Voice, optional): Voice setting for audio outputs. Defaults to "Puck".
            modalities (ResponseModality, optional): Response modality, either "AUDIO" or "TEXT".
                Defaults to "AUDIO".
            vertexai (bool, optional): Whether to use VertexAI for the API. Defaults to False.
            project (str or None, optional): The project to use for the API (for vertexai). If None,
                falls back to the GOOGLE_PROJECT environment variable.
            location (str or None, optional): The location to use for the API (for vertexai). If None,
                falls back to the GOOGLE_LOCATION environment variable.
            candidate_count (int, optional): The number of candidate responses to generate. Defaults to 1.
            temperature (float or None, optional): Sampling temperature for response generation.
                Defaults to None.
            max_output_tokens (int or None, optional): Maximum number of output tokens. Defaults to None.
            top_p (float or None, optional): The top-p value for response generation. Defaults to None.
            top_k (int or None, optional): The top-k value for response generation. Defaults to None.
            presence_penalty (float or None, optional): The presence penalty for response generation.
                Defaults to None.
            frequency_penalty (float or None, optional): The frequency penalty for response generation.
                Defaults to None.
            loop (asyncio.AbstractEventLoop or None, optional): Event loop to use for async operations.
                If None, asyncio.get_event_loop() is used.

        Raises:
            ValueError: If vertexai is False and no API key is provided or found in the
                GOOGLE_API_KEY environment variable.
        """
        super().__init__()
        self._capabilities = Capabilities(
            supports_truncate=False,
        )
        self._model = model
        self._loop = loop or asyncio.get_event_loop()
        self._api_key = api_key or os.environ.get("GOOGLE_API_KEY")
        self._vertexai = vertexai
        self._project_id = project or os.environ.get("GOOGLE_PROJECT")
        self._location = location or os.environ.get("GOOGLE_LOCATION")
        if self._api_key is None and not self._vertexai:
            raise ValueError("GOOGLE_API_KEY is not set")

        self._rt_sessions: list[GeminiRealtimeSession] = []
        self._opts = ModelOptions(
            model=model,
            # Forward the environment-resolved values so the fallbacks computed
            # above actually reach the session. Only the fallback relevant to
            # the selected backend is applied; explicitly passed arguments are
            # handed over unchanged in either mode.
            api_key=api_key if vertexai else self._api_key,
            voice=voice,
            response_modalities=modalities,
            vertexai=vertexai,
            project=self._project_id if vertexai else project,
            location=self._location if vertexai else location,
            candidate_count=candidate_count,
            temperature=temperature,
            max_output_tokens=max_output_tokens,
            top_p=top_p,
            top_k=top_k,
            presence_penalty=presence_penalty,
            frequency_penalty=frequency_penalty,
            instructions=instructions,
        )

    @property
    def sessions(self) -> list[GeminiRealtimeSession]:
        """Sessions created through this model, in creation order."""
        return self._rt_sessions

    @property
    def capabilities(self) -> Capabilities:
        """Capabilities advertised by this model."""
        return self._capabilities

    def session(
        self,
        *,
        chat_ctx: llm.ChatContext | None = None,
        fnc_ctx: llm.FunctionContext | None = None,
    ) -> GeminiRealtimeSession:
        """Create a new realtime session and register it on this model.

        Args:
            chat_ctx: Chat context for the session; a fresh ``llm.ChatContext``
                is used when omitted (or falsy).
            fnc_ctx: Optional function context exposing callable tools.

        Returns:
            The newly created session.
        """
        session = GeminiRealtimeSession(
            opts=self._opts,
            chat_ctx=chat_ctx or llm.ChatContext(),
            fnc_ctx=fnc_ctx,
            loop=self._loop,
        )
        self._rt_sessions.append(session)

        return session

    async def aclose(self) -> None:
        """Close every session created by this model, one after another."""
        for session in self._rt_sessions:
            await session.aclose()
|
185
|
+
|
186
|
+
|
187
|
+
class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
    """A single live connection to the Gemini API that emits events for responses and tool calls."""

    def __init__(
        self,
        *,
        opts: ModelOptions,
        chat_ctx: llm.ChatContext,
        fnc_ctx: llm.FunctionContext | None,
        loop: asyncio.AbstractEventLoop,
    ):
        """
        Initializes a GeminiRealtimeSession instance for interacting with Google's Realtime API.

        The constructor builds the connection config and client, then schedules
        the main task with asyncio.create_task, so it must be called while an
        event loop is running.

        Args:
            opts (ModelOptions): The model options for the session.
            chat_ctx (llm.ChatContext): The chat context for the session.
            fnc_ctx (llm.FunctionContext or None): The function context for the session.
            loop (asyncio.AbstractEventLoop): The event loop for the session. It is stored on
                the instance; tasks in this class are created with asyncio.create_task.
        """
        super().__init__()
        self._loop = loop
        self._opts = opts
        self._chat_ctx = chat_ctx
        self._fnc_ctx = fnc_ctx
        self._fnc_tasks = utils.aio.TaskSet()

        # All function declarations are wrapped in a single tool entry; with no
        # function context the tools list stays empty.
        tools = []
        if self._fnc_ctx is not None:
            functions = _build_tools(self._fnc_ctx)
            tools.append({"function_declarations": functions})

        self._config = LiveConnectConfigDict(
            model=self._opts.model,
            response_modalities=self._opts.response_modalities,
            generation_config=GenerationConfigDict(
                candidate_count=self._opts.candidate_count,
                temperature=self._opts.temperature,
                max_output_tokens=self._opts.max_output_tokens,
                top_p=self._opts.top_p,
                top_k=self._opts.top_k,
                presence_penalty=self._opts.presence_penalty,
                frequency_penalty=self._opts.frequency_penalty,
            ),
            system_instruction=self._opts.instructions,
            speech_config=SpeechConfig(
                voice_config=VoiceConfig(
                    prebuilt_voice_config=PrebuiltVoiceConfig(
                        voice_name=self._opts.voice
                    )
                )
            ),
            tools=tools,
        )
        self._client = genai.Client(
            http_options={"api_version": "v1alpha"},
            api_key=self._opts.api_key,
            vertexai=self._opts.vertexai,
            project=self._opts.project,
            location=self._opts.location,
        )
        # The task is only scheduled here; the attributes assigned below
        # (_send_ch, _active_response_id) are set before this constructor returns.
        self._main_atask = asyncio.create_task(
            self._main_task(), name="gemini-realtime-session"
        )
        # dummy task to wait for the session to be initialized # TODO: sync chat ctx
        self._init_sync_task = asyncio.create_task(
            asyncio.sleep(0), name="gemini-realtime-session-init"
        )
        # Outgoing messages are queued on this channel and forwarded by the send task.
        self._send_ch = utils.aio.Chan[ClientEvents]()
        # Id of the response currently being received; None between responses.
        self._active_response_id = None

    async def aclose(self) -> None:
        """Close the send channel and wait for the main task to finish.

        Returns immediately if the send channel is already closed.
        """
        if self._send_ch.closed:
            return

        self._send_ch.close()
        await self._main_atask

    @property
    def fnc_ctx(self) -> llm.FunctionContext | None:
        """The function context used to resolve tool calls, if any."""
        return self._fnc_ctx

    @fnc_ctx.setter
    def fnc_ctx(self, value: llm.FunctionContext | None) -> None:
        # Only the stored context is replaced; the connection config built in
        # __init__ is not touched here.
        self._fnc_ctx = value

    def _push_audio(self, frame: rtc.AudioFrame) -> None:
        """Queue one audio frame, base64-encoded, as an "audio/pcm" message."""
        data = base64.b64encode(frame.data).decode("utf-8")
        self._queue_msg({"mime_type": "audio/pcm", "data": data})

    def _queue_msg(self, msg: dict) -> None:
        """Put a message on the send channel without waiting."""
        self._send_ch.send_nowait(msg)

    def chat_ctx_copy(self) -> llm.ChatContext:
        """Return a copy of the session's chat context."""
        return self._chat_ctx.copy()

    async def set_chat_ctx(self, ctx: llm.ChatContext) -> None:
        """Replace the stored chat context with a copy of ``ctx``.

        This method only updates the local attribute.
        """
        self._chat_ctx = ctx.copy()

    @utils.log_exceptions(logger=logger)
    async def _main_task(self):
        """Open the live connection and run the send and receive loops until they finish."""

        @utils.log_exceptions(logger=logger)
        async def _send_task():
            # Forward queued messages until the send channel is closed.
            async for msg in self._send_ch:
                await self._session.send(msg)

            # Once the channel is exhausted, send a final "." marked as end of turn.
            await self._session.send(".", end_of_turn=True)

        @utils.log_exceptions(logger=logger)
        async def _recv_task():
            while True:
                async for response in self._session.receive():
                    # Start a new content object when no response is in progress;
                    # otherwise `content` from an earlier iteration is reused.
                    if self._active_response_id is None:
                        self._active_response_id = utils.shortuuid()
                        text_stream = utils.aio.Chan[str]()
                        audio_stream = utils.aio.Chan[rtc.AudioFrame]()
                        content = GeminiContent(
                            response_id=self._active_response_id,
                            item_id=self._active_response_id,
                            output_index=0,
                            content_index=0,
                            text="",
                            audio=[],
                            text_stream=text_stream,
                            audio_stream=audio_stream,
                            content_type=self._opts.response_modalities,
                        )
                        self.emit("response_content_added", content)

                    server_content = response.server_content
                    if server_content:
                        model_turn = server_content.model_turn
                        if model_turn:
                            for part in model_turn.parts:
                                if part.text:
                                    content.text_stream.send_nowait(part.text)
                                if part.inline_data:
                                    # Sample rate and channel count are hard-coded.
                                    # len // 2 presumably reflects 2 bytes per
                                    # sample (16-bit PCM) -- TODO confirm.
                                    frame = rtc.AudioFrame(
                                        data=part.inline_data.data,
                                        sample_rate=24000,
                                        num_channels=1,
                                        samples_per_channel=len(part.inline_data.data)
                                        // 2,
                                    )
                                    content.audio_stream.send_nowait(frame)

                        # Either condition ends the current response: close its
                        # streams, emit the matching event and reset the active id.
                        if server_content.interrupted or server_content.turn_complete:
                            for stream in (content.text_stream, content.audio_stream):
                                if isinstance(stream, utils.aio.Chan):
                                    stream.close()

                            if server_content.interrupted:
                                self.emit("input_speech_started")
                            elif server_content.turn_complete:
                                self.emit("response_content_done", content)

                            self._active_response_id = None

                    if response.tool_call:
                        if self._fnc_ctx is None:
                            raise ValueError("Function context is not set")
                        fnc_calls = []
                        for fnc_call in response.tool_call.function_calls:
                            # The call arguments are passed on as a JSON string.
                            fnc_call_info = _create_ai_function_info(
                                self._fnc_ctx,
                                fnc_call.id,
                                fnc_call.name,
                                json.dumps(fnc_call.args),
                            )
                            fnc_calls.append(fnc_call_info)

                        self.emit("function_calls_collected", fnc_calls)

                        # Each call runs in its own task tracked by the task set.
                        for fnc_call_info in fnc_calls:
                            self._fnc_tasks.create_task(
                                self._run_fnc_task(fnc_call_info, content.item_id)
                            )

                    # Handle function call cancellations
                    if response.tool_call_cancellation:
                        logger.warning(
                            "function call cancelled",
                            extra={
                                "function_call_ids": response.tool_call_cancellation.function_call_ids,
                            },
                        )
                        self.emit(
                            "function_calls_cancelled",
                            response.tool_call_cancellation.function_call_ids,
                        )

        async with self._client.aio.live.connect(
            model=self._opts.model, config=self._config
        ) as session:
            self._session = session
            tasks = [
                asyncio.create_task(_send_task(), name="gemini-realtime-send"),
                asyncio.create_task(_recv_task(), name="gemini-realtime-recv"),
            ]

            try:
                await asyncio.gather(*tasks)
            finally:
                # Always cancel both loops and close the session, even on error.
                await utils.aio.gracefully_cancel(*tasks)
                await self._session.close()

    @utils.log_exceptions(logger=logger)
    async def _run_fnc_task(self, fnc_call_info: llm.FunctionCallInfo, item_id: str):
        """Execute one requested function call and report its result to the session.

        Args:
            fnc_call_info: The function call to execute.
            item_id: Id of the content item the call belongs to (not used in this method).
        """
        logger.debug(
            "executing ai function",
            extra={
                "function": fnc_call_info.function_info.name,
            },
        )

        called_fnc = fnc_call_info.execute()
        try:
            await called_fnc.task
        except Exception as e:
            # The failure is logged and not re-raised, so the code below still runs.
            logger.exception(
                "error executing ai function",
                extra={
                    "function": fnc_call_info.function_info.name,
                },
                exc_info=e,
            )
        tool_call = llm.ChatMessage.create_tool_from_called_function(called_fnc)
        # A tool response is sent only when the tool message has content.
        if tool_call.content is not None:
            tool_response = LiveClientToolResponse(
                function_responses=[
                    FunctionResponse(
                        name=tool_call.name,
                        id=tool_call.tool_call_id,
                        response={"result": tool_call.content},
                    )
                ]
            )
            await self._session.send(tool_response)

        self.emit("function_calls_finished", [called_fnc])
|
livekit/plugins/google/stt.py
CHANGED
@@ -16,6 +16,7 @@ from __future__ import annotations
|
|
16
16
|
|
17
17
|
import asyncio
|
18
18
|
import dataclasses
|
19
|
+
import time
|
19
20
|
import weakref
|
20
21
|
from dataclasses import dataclass
|
21
22
|
from typing import List, Union
|
@@ -44,6 +45,10 @@ from .models import SpeechLanguages, SpeechModels
|
|
44
45
|
LgType = Union[SpeechLanguages, str]
|
45
46
|
LanguageCode = Union[LgType, List[LgType]]
|
46
47
|
|
48
|
+
# Google STT has a timeout of 5 mins, we'll attempt to restart the session
|
49
|
+
# before that timeout is reached
|
50
|
+
_max_session_duration = 240
|
51
|
+
|
47
52
|
|
48
53
|
# This class is only used internally to encapsulate the options
|
49
54
|
@dataclass
|
@@ -229,8 +234,6 @@ class STT(stt.STT):
|
|
229
234
|
raise APIStatusError(
|
230
235
|
e.message,
|
231
236
|
status_code=e.code or -1,
|
232
|
-
request_id=None,
|
233
|
-
body=None,
|
234
237
|
)
|
235
238
|
except Exception as e:
|
236
239
|
raise APIConnectionError() from e
|
@@ -278,6 +281,13 @@ class STT(stt.STT):
|
|
278
281
|
self._config.spoken_punctuation = spoken_punctuation
|
279
282
|
if model is not None:
|
280
283
|
self._config.model = model
|
284
|
+
client = None
|
285
|
+
recognizer = None
|
286
|
+
if location is not None:
|
287
|
+
self._location = location
|
288
|
+
# if location is changed, fetch a new client and recognizer as per the new location
|
289
|
+
client = self._ensure_client()
|
290
|
+
recognizer = self._recognizer
|
281
291
|
if keywords is not None:
|
282
292
|
self._config.keywords = keywords
|
283
293
|
|
@@ -289,8 +299,9 @@ class STT(stt.STT):
|
|
289
299
|
punctuate=punctuate,
|
290
300
|
spoken_punctuation=spoken_punctuation,
|
291
301
|
model=model,
|
292
|
-
location=location,
|
293
302
|
keywords=keywords,
|
303
|
+
client=client,
|
304
|
+
recognizer=recognizer,
|
294
305
|
)
|
295
306
|
|
296
307
|
|
@@ -312,6 +323,7 @@ class SpeechStream(stt.SpeechStream):
|
|
312
323
|
self._recognizer = recognizer
|
313
324
|
self._config = config
|
314
325
|
self._reconnect_event = asyncio.Event()
|
326
|
+
self._session_connected_at: float = 0
|
315
327
|
|
316
328
|
def update_options(
|
317
329
|
self,
|
@@ -322,8 +334,9 @@ class SpeechStream(stt.SpeechStream):
|
|
322
334
|
punctuate: bool | None = None,
|
323
335
|
spoken_punctuation: bool | None = None,
|
324
336
|
model: SpeechModels | None = None,
|
325
|
-
location: str | None = None,
|
326
337
|
keywords: List[tuple[str, float]] | None = None,
|
338
|
+
client: SpeechAsyncClient | None = None,
|
339
|
+
recognizer: str | None = None,
|
327
340
|
):
|
328
341
|
if languages is not None:
|
329
342
|
if isinstance(languages, str):
|
@@ -341,13 +354,17 @@ class SpeechStream(stt.SpeechStream):
|
|
341
354
|
self._config.model = model
|
342
355
|
if keywords is not None:
|
343
356
|
self._config.keywords = keywords
|
357
|
+
if client is not None:
|
358
|
+
self._client = client
|
359
|
+
if recognizer is not None:
|
360
|
+
self._recognizer = recognizer
|
344
361
|
|
345
362
|
self._reconnect_event.set()
|
346
363
|
|
347
364
|
async def _run(self) -> None:
|
348
365
|
# Google requires an async generator when calling streaming_recognize
|
349
366
|
# this function basically converts the queue into an async generator
|
350
|
-
async def input_generator():
|
367
|
+
async def input_generator(should_stop: asyncio.Event):
|
351
368
|
try:
|
352
369
|
# first request should contain the config
|
353
370
|
yield cloud_speech.StreamingRecognizeRequest(
|
@@ -356,6 +373,12 @@ class SpeechStream(stt.SpeechStream):
|
|
356
373
|
)
|
357
374
|
|
358
375
|
async for frame in self._input_ch:
|
376
|
+
# when the stream is aborted due to reconnect, this input_generator
|
377
|
+
# needs to stop consuming frames
|
378
|
+
# when the generator stops, the previous gRPC stream will close
|
379
|
+
if should_stop.is_set():
|
380
|
+
return
|
381
|
+
|
359
382
|
if isinstance(frame, rtc.AudioFrame):
|
360
383
|
yield cloud_speech.StreamingRecognizeRequest(
|
361
384
|
audio=frame.data.tobytes()
|
@@ -367,6 +390,7 @@ class SpeechStream(stt.SpeechStream):
|
|
367
390
|
)
|
368
391
|
|
369
392
|
async def process_stream(stream):
|
393
|
+
has_started = False
|
370
394
|
async for resp in stream:
|
371
395
|
if (
|
372
396
|
resp.speech_event_type
|
@@ -375,6 +399,7 @@ class SpeechStream(stt.SpeechStream):
|
|
375
399
|
self._event_ch.send_nowait(
|
376
400
|
stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH)
|
377
401
|
)
|
402
|
+
has_started = True
|
378
403
|
|
379
404
|
if (
|
380
405
|
resp.speech_event_type
|
@@ -399,6 +424,22 @@ class SpeechStream(stt.SpeechStream):
|
|
399
424
|
alternatives=[speech_data],
|
400
425
|
)
|
401
426
|
)
|
427
|
+
if (
|
428
|
+
time.time() - self._session_connected_at
|
429
|
+
> _max_session_duration
|
430
|
+
):
|
431
|
+
logger.debug(
|
432
|
+
"Google STT maximum connection time reached. Reconnecting..."
|
433
|
+
)
|
434
|
+
if has_started:
|
435
|
+
self._event_ch.send_nowait(
|
436
|
+
stt.SpeechEvent(
|
437
|
+
type=stt.SpeechEventType.END_OF_SPEECH
|
438
|
+
)
|
439
|
+
)
|
440
|
+
has_started = False
|
441
|
+
self._reconnect_event.set()
|
442
|
+
return
|
402
443
|
|
403
444
|
if (
|
404
445
|
resp.speech_event_type
|
@@ -407,6 +448,7 @@ class SpeechStream(stt.SpeechStream):
|
|
407
448
|
self._event_ch.send_nowait(
|
408
449
|
stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH)
|
409
450
|
)
|
451
|
+
has_started = False
|
410
452
|
|
411
453
|
while True:
|
412
454
|
try:
|
@@ -431,25 +473,40 @@ class SpeechStream(stt.SpeechStream):
|
|
431
473
|
),
|
432
474
|
)
|
433
475
|
|
476
|
+
should_stop = asyncio.Event()
|
434
477
|
stream = await self._client.streaming_recognize(
|
435
|
-
requests=input_generator(),
|
478
|
+
requests=input_generator(should_stop),
|
436
479
|
)
|
480
|
+
self._session_connected_at = time.time()
|
437
481
|
|
438
482
|
process_stream_task = asyncio.create_task(process_stream(stream))
|
439
483
|
wait_reconnect_task = asyncio.create_task(self._reconnect_event.wait())
|
484
|
+
|
440
485
|
try:
|
441
|
-
await asyncio.wait(
|
486
|
+
done, _ = await asyncio.wait(
|
442
487
|
[process_stream_task, wait_reconnect_task],
|
443
488
|
return_when=asyncio.FIRST_COMPLETED,
|
444
489
|
)
|
490
|
+
for task in done:
|
491
|
+
if task != wait_reconnect_task:
|
492
|
+
task.result()
|
493
|
+
if wait_reconnect_task not in done:
|
494
|
+
break
|
495
|
+
self._reconnect_event.clear()
|
445
496
|
finally:
|
446
497
|
await utils.aio.gracefully_cancel(
|
447
498
|
process_stream_task, wait_reconnect_task
|
448
499
|
)
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
500
|
+
should_stop.set()
|
501
|
+
except DeadlineExceeded:
|
502
|
+
raise APITimeoutError()
|
503
|
+
except GoogleAPICallError as e:
|
504
|
+
raise APIStatusError(
|
505
|
+
e.message,
|
506
|
+
status_code=e.code or -1,
|
507
|
+
)
|
508
|
+
except Exception as e:
|
509
|
+
raise APIConnectionError() from e
|
453
510
|
|
454
511
|
|
455
512
|
def _recognize_response_to_speech_event(
|
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.2
|
2
2
|
Name: livekit-plugins-google
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.9.1
|
4
4
|
Summary: Agent Framework plugin for services from Google Cloud
|
5
5
|
Home-page: https://github.com/livekit/agents
|
6
6
|
License: Apache-2.0
|
@@ -22,7 +22,18 @@ Description-Content-Type: text/markdown
|
|
22
22
|
Requires-Dist: google-auth<3,>=2
|
23
23
|
Requires-Dist: google-cloud-speech<3,>=2
|
24
24
|
Requires-Dist: google-cloud-texttospeech<3,>=2
|
25
|
-
Requires-Dist:
|
25
|
+
Requires-Dist: google-genai>=0.3.0
|
26
|
+
Requires-Dist: livekit-agents>=0.12.3
|
27
|
+
Dynamic: classifier
|
28
|
+
Dynamic: description
|
29
|
+
Dynamic: description-content-type
|
30
|
+
Dynamic: home-page
|
31
|
+
Dynamic: keywords
|
32
|
+
Dynamic: license
|
33
|
+
Dynamic: project-url
|
34
|
+
Dynamic: requires-dist
|
35
|
+
Dynamic: requires-python
|
36
|
+
Dynamic: summary
|
26
37
|
|
27
38
|
# LiveKit Plugins Google
|
28
39
|
|
@@ -37,3 +48,8 @@ pip install livekit-plugins-google
|
|
37
48
|
## Pre-requisites
|
38
49
|
|
39
50
|
For credentials, you'll need a Google Cloud account and obtain the correct credentials. Credentials can be passed directly or via Application Default Credentials as specified in [How Application Default Credentials works](https://cloud.google.com/docs/authentication/application-default-credentials).
|
51
|
+
|
52
|
+
To use the STT and TTS API, you'll need to enable the respective services for your Google Cloud project.
|
53
|
+
|
54
|
+
- Cloud Speech-to-Text API
|
55
|
+
- Cloud Text-to-Speech API
|
@@ -0,0 +1,15 @@
|
|
1
|
+
livekit/plugins/google/__init__.py,sha256=TY-5FwEX4Vs7GLO1wSegIxC5W4UPkHBthlr-__yuE4w,1143
|
2
|
+
livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
|
3
|
+
livekit/plugins/google/models.py,sha256=cBXhZGY9bFaSCyL9VeSng9wsxhf3peJi3AUYBKV-8GQ,1343
|
4
|
+
livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
+
livekit/plugins/google/stt.py,sha256=E5kXPbicH4FEXBjyBzfqQWA-nPhKkojzcc-cbtWdmNs,21088
|
6
|
+
livekit/plugins/google/tts.py,sha256=95qXCigVQYWNbcN3pIKBpIah4b31U_MWtXv5Ji0AMc4,9229
|
7
|
+
livekit/plugins/google/version.py,sha256=4GcbYy7J7gvPMEA4wlPB0BJqg8CjF7HRVjQ-i1EH7M8,600
|
8
|
+
livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
|
9
|
+
livekit/plugins/google/beta/realtime/__init__.py,sha256=XnJpNIN6NRm7Y4hH2RNA8Xt-tTmkZEKCs_zzU3_koBI,251
|
10
|
+
livekit/plugins/google/beta/realtime/api_proto.py,sha256=IHYBryuzpfGQD86Twlfq6qxrBhFHptf_IvOk36Wxo1M,2156
|
11
|
+
livekit/plugins/google/beta/realtime/realtime_api.py,sha256=YUEf3iR9dIctnXRqev_qKSBM_plqcYKudodFO8nADJY,15966
|
12
|
+
livekit_plugins_google-0.9.1.dist-info/METADATA,sha256=y5d0OEdbkoGk0IPGURiDZbt6e6sWhsxOU2cioNrPu7w,2056
|
13
|
+
livekit_plugins_google-0.9.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
14
|
+
livekit_plugins_google-0.9.1.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
15
|
+
livekit_plugins_google-0.9.1.dist-info/RECORD,,
|
@@ -1,11 +0,0 @@
|
|
1
|
-
livekit/plugins/google/__init__.py,sha256=rqV6C5mFNDFlrA2IcGJrsebr2VxQwMzoDUjY1JhMBZM,1117
|
2
|
-
livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
|
3
|
-
livekit/plugins/google/models.py,sha256=cBXhZGY9bFaSCyL9VeSng9wsxhf3peJi3AUYBKV-8GQ,1343
|
4
|
-
livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
-
livekit/plugins/google/stt.py,sha256=tmjktdO6C2AuJWHSKl20ae3cfy_DqfN_oNYYcE552pQ,18566
|
6
|
-
livekit/plugins/google/tts.py,sha256=95qXCigVQYWNbcN3pIKBpIah4b31U_MWtXv5Ji0AMc4,9229
|
7
|
-
livekit/plugins/google/version.py,sha256=PoHw-_DNE2B5SpeoQ-r6HSfVmbDgYuGamg0dN2jhayQ,600
|
8
|
-
livekit_plugins_google-0.8.1.dist-info/METADATA,sha256=RHRMpfHxvaWjwWStByUPghWBLY5tIuC5Lm8r9C3hEhc,1643
|
9
|
-
livekit_plugins_google-0.8.1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
10
|
-
livekit_plugins_google-0.8.1.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
11
|
-
livekit_plugins_google-0.8.1.dist-info/RECORD,,
|
{livekit_plugins_google-0.8.1.dist-info → livekit_plugins_google-0.9.1.dist-info}/top_level.txt
RENAMED
File without changes
|