livekit-plugins-google 0.10.5__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/google/_utils.py +12 -15
- livekit/plugins/google/beta/realtime/realtime_api.py +54 -8
- livekit/plugins/google/beta/realtime/transcriber.py +1 -1
- livekit/plugins/google/llm.py +7 -6
- livekit/plugins/google/models.py +2 -2
- livekit/plugins/google/stt.py +101 -83
- livekit/plugins/google/tts.py +22 -43
- livekit/plugins/google/version.py +1 -1
- {livekit_plugins_google-0.10.5.dist-info → livekit_plugins_google-0.11.0.dist-info}/METADATA +57 -3
- livekit_plugins_google-0.11.0.dist-info/RECORD +18 -0
- {livekit_plugins_google-0.10.5.dist-info → livekit_plugins_google-0.11.0.dist-info}/WHEEL +1 -1
- livekit_plugins_google-0.10.5.dist-info/RECORD +0 -18
- {livekit_plugins_google-0.10.5.dist-info → livekit_plugins_google-0.11.0.dist-info}/top_level.txt +0 -0
livekit/plugins/google/_utils.py
CHANGED
@@ -10,14 +10,15 @@ from livekit.agents import llm, utils
 from livekit.agents.llm.function_context import _is_optional_type
 
 from google.genai import types
+from google.genai.types import Type as GenaiType
+
+JSON_SCHEMA_TYPE_MAP: dict[type, GenaiType] = {
+    str: GenaiType.STRING,
+    int: GenaiType.INTEGER,
+    float: GenaiType.NUMBER,
+    bool: GenaiType.BOOLEAN,
+    dict: GenaiType.OBJECT,
+    list: GenaiType.ARRAY,
 }
 
 __all__ = ["_build_gemini_ctx", "_build_tools"]
@@ -38,7 +39,7 @@ def _build_parameters(arguments: Dict[str, Any]) -> types.Schema | None:
             item_type = get_args(py_type)[0]
             if item_type not in JSON_SCHEMA_TYPE_MAP:
                 raise ValueError(f"Unsupported type: {item_type}")
-            prop.type =
+            prop.type = GenaiType.ARRAY
             prop.items = types.Schema(type=JSON_SCHEMA_TYPE_MAP[item_type])
 
         if arg_info.choices:
@@ -62,7 +63,7 @@ def _build_parameters(arguments: Dict[str, Any]) -> types.Schema | None:
         required.append(arg_name)
 
     if properties:
-        parameters = types.Schema(type=
+        parameters = types.Schema(type=GenaiType.OBJECT, properties=properties)
         if required:
             parameters.required = required
 
@@ -119,7 +120,6 @@ def _build_gemini_ctx(
                 parts.append(
                     types.Part(
                         function_call=types.FunctionCall(
-                            id=fnc.tool_call_id,
                             name=fnc.function_info.name,
                             args=fnc.arguments,
                         )
@@ -132,7 +132,6 @@ def _build_gemini_ctx(
                 parts.append(
                     types.Part(
                         function_response=types.FunctionResponse(
-                            id=msg.tool_call_id,
                             name=msg.name,
                             response=msg.content,
                         )
@@ -142,7 +141,6 @@ def _build_gemini_ctx(
                 parts.append(
                     types.Part(
                         function_response=types.FunctionResponse(
-                            id=msg.tool_call_id,
                            name=msg.name,
                            response={"result": msg.content},
                        )
@@ -193,8 +191,7 @@ def _build_gemini_image_part(image: llm.ChatImage, cache_key: Any) -> types.Part
                 height=image.inference_height,
                 strategy="scale_aspect_fit",
             )
-
-        image._cache[cache_key] = base64.b64encode(encoded_data).decode("utf-8")
+        image._cache[cache_key] = utils.images.encode(image.image, opts)
 
     return types.Part.from_bytes(
         data=image._cache[cache_key], mime_type="image/jpeg"
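For reference, a minimal sketch of how the enum-based schema types compose, assuming google-genai 1.3.0 as pinned in the new METADATA (the `get_weather` function and its parameters are illustrative, not plugin code):

```python
from google.genai import types
from google.genai.types import Type as GenaiType

# Build a Schema the same way _build_parameters now does, using the Type enum
# instead of raw strings, for a hypothetical get_weather(location, days) tool.
parameters = types.Schema(
    type=GenaiType.OBJECT,
    properties={
        "location": types.Schema(type=GenaiType.STRING),
        "days": types.Schema(type=GenaiType.INTEGER),
    },
    required=["location"],
)

declaration = types.FunctionDeclaration(
    name="get_weather",
    description="Look up a weather forecast",
    parameters=parameters,
)
```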
livekit/plugins/google/beta/realtime/realtime_api.py
CHANGED
@@ -9,14 +9,15 @@ from typing import AsyncIterable, Literal
 from livekit import rtc
 from livekit.agents import llm, utils
 from livekit.agents.llm.function_context import _create_ai_function_info
+from livekit.agents.utils import images
 
 from google import genai
-from google.genai._api_client import HttpOptions
 from google.genai.types import (
     Blob,
     Content,
     FunctionResponse,
     GenerationConfig,
+    HttpOptions,
     LiveClientContent,
     LiveClientRealtimeInput,
     LiveClientToolResponse,
@@ -107,7 +108,7 @@ class RealtimeModel:
         model: LiveAPIModels | str = "gemini-2.0-flash-exp",
         api_key: str | None = None,
         voice: Voice | str = "Puck",
-        modalities: list[Modality] = [
+        modalities: list[Modality] = [Modality.AUDIO],
         enable_user_audio_transcription: bool = True,
         enable_agent_audio_transcription: bool = True,
         vertexai: bool = False,
@@ -258,6 +259,8 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
         self._fnc_ctx = fnc_ctx
         self._fnc_tasks = utils.aio.TaskSet()
         self._is_interrupted = False
+        self._playout_complete = asyncio.Event()
+        self._playout_complete.set()
 
         tools = []
         if self._fnc_ctx is not None:
@@ -317,6 +320,10 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
         self._send_ch.close()
         await self._main_atask
 
+    @property
+    def playout_complete(self) -> asyncio.Event | None:
+        return self._playout_complete
+
     @property
     def fnc_ctx(self) -> llm.FunctionContext | None:
         return self._fnc_ctx
@@ -325,14 +332,53 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
     def fnc_ctx(self, value: llm.FunctionContext | None) -> None:
         self._fnc_ctx = value
 
-    def
-        if self._opts.enable_user_audio_transcription:
-            self._transcriber._push_audio(frame)
+    def _push_media_chunk(self, data: bytes, mime_type: str) -> None:
         realtime_input = LiveClientRealtimeInput(
-            media_chunks=[Blob(data=
+            media_chunks=[Blob(data=data, mime_type=mime_type)],
         )
         self._queue_msg(realtime_input)
 
+    DEFAULT_ENCODE_OPTIONS = images.EncodeOptions(
+        format="JPEG",
+        quality=75,
+        resize_options=images.ResizeOptions(
+            width=1024, height=1024, strategy="scale_aspect_fit"
+        ),
+    )
+
+    def push_video(
+        self,
+        frame: rtc.VideoFrame,
+        encode_options: images.EncodeOptions = DEFAULT_ENCODE_OPTIONS,
+    ) -> None:
+        """Push a video frame to the Gemini Multimodal Live session.
+
+        Args:
+            frame (rtc.VideoFrame): The video frame to push.
+            encode_options (images.EncodeOptions, optional): The encode options for the video frame. Defaults to 1024x1024 JPEG.
+
+        Notes:
+            - This will be sent immediately so you should use a sampling frame rate that makes sense for your application and Gemini's constraints. 1 FPS is a good starting point.
+        """
+        encoded_data = images.encode(
+            frame,
+            encode_options,
+        )
+        mime_type = (
+            "image/jpeg"
+            if encode_options.format == "JPEG"
+            else "image/png"
+            if encode_options.format == "PNG"
+            else "image/jpeg"
+        )
+        self._push_media_chunk(encoded_data, mime_type)
+
+    def _push_audio(self, frame: rtc.AudioFrame) -> None:
+        if self._opts.enable_user_audio_transcription:
+            self._transcriber._push_audio(frame)
+
+        self._push_media_chunk(frame.data.tobytes(), "audio/pcm")
+
     def _queue_msg(self, msg: ClientEvents) -> None:
         self._send_ch.send_nowait(msg)
 
@@ -479,12 +525,12 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
                 logger.warning(
                     "function call cancelled",
                     extra={
-                        "function_call_ids": response.tool_call_cancellation.
+                        "function_call_ids": response.tool_call_cancellation.ids,
                    },
                )
                self.emit(
                    "function_calls_cancelled",
-                    response.tool_call_cancellation.
+                    response.tool_call_cancellation.ids,
                )
 
        async with self._client.aio.live.connect(
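The session thus gains a public `push_video()` API (plus a `playout_complete` event exposed as a property) on top of the shared `_push_media_chunk()` helper. A usage sketch, assuming a session obtained as in the README example further down (`model.sessions[0]`); the helper name and values here are illustrative:

```python
from livekit import rtc
from livekit.agents.utils import images


def push_low_res_frame(session, frame: rtc.VideoFrame) -> None:
    # `session` is assumed to be the GeminiRealtimeSession shown above.
    # Override the default 1024x1024 JPEG options with a smaller, cheaper encode.
    session.push_video(
        frame,
        encode_options=images.EncodeOptions(
            format="JPEG",
            quality=60,  # plugin default is quality=75
            resize_options=images.ResizeOptions(
                width=512, height=512, strategy="scale_aspect_fit"
            ),
        ),
    )
```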
livekit/plugins/google/beta/realtime/transcriber.py
CHANGED
@@ -55,7 +55,7 @@ class TranscriberSession(utils.EventEmitter[EventTypes]):
             parts=[types.Part(text=SYSTEM_INSTRUCTIONS)]
         )
         self._config = types.LiveConnectConfig(
-            response_modalities=[
+            response_modalities=[types.Modality.TEXT],
             system_instruction=system_instructions,
             generation_config=types.GenerationConfig(temperature=0.0),
         )
livekit/plugins/google/llm.py
CHANGED
@@ -240,7 +240,7 @@ class LLMStream(llm.LLMStream):
                 # specific function
                 tool_config = types.ToolConfig(
                     function_calling_config=types.FunctionCallingConfig(
-                        mode=
+                        mode=types.FunctionCallingConfigMode.ANY,
                         allowed_function_names=[self._tool_choice.name],
                     )
                 )
@@ -248,7 +248,7 @@ class LLMStream(llm.LLMStream):
                 # model must call any function
                 tool_config = types.ToolConfig(
                     function_calling_config=types.FunctionCallingConfig(
-                        mode=
+                        mode=types.FunctionCallingConfigMode.ANY,
                         allowed_function_names=[
                             fnc.name
                             for fnc in self._fnc_ctx.ai_functions.values()
@@ -259,14 +259,14 @@ class LLMStream(llm.LLMStream):
                 # model can call any function
                 tool_config = types.ToolConfig(
                     function_calling_config=types.FunctionCallingConfig(
-                        mode=
+                        mode=types.FunctionCallingConfigMode.AUTO
                     )
                 )
             elif self._tool_choice == "none":
                 # model cannot call any function
                 tool_config = types.ToolConfig(
                     function_calling_config=types.FunctionCallingConfig(
-                        mode=
+                        mode=types.FunctionCallingConfigMode.NONE,
                     )
                 )
             opts["tool_config"] = tool_config
@@ -282,11 +282,12 @@ class LLMStream(llm.LLMStream):
             system_instruction=system_instruction,
             **opts,
         )
+        stream = await self._client.aio.models.generate_content_stream(
             model=self._model,
             contents=cast(types.ContentListUnion, turns),
             config=config,
-            )
+        )
+        async for response in stream:  # type: ignore
             if response.prompt_feedback:
                 raise APIStatusError(
                     response.prompt_feedback.json(),
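The formerly truncated `mode=` assignments now use the `FunctionCallingConfigMode` enum. A condensed sketch of the mapping between LiveKit's `tool_choice` values and Gemini's tool config, assuming google-genai 1.3.0; the helper is illustrative and omits the single-function `ToolChoice` branch the plugin also handles:

```python
from typing import Optional

from google.genai import types


def tool_config_for(tool_choice: str, function_names: list[str]) -> types.ToolConfig:
    if tool_choice == "required":
        # model must call one of the listed functions
        mode = types.FunctionCallingConfigMode.ANY
        allowed: Optional[list[str]] = function_names
    elif tool_choice == "none":
        # model cannot call any function
        mode = types.FunctionCallingConfigMode.NONE
        allowed = None
    else:  # "auto": the model decides
        mode = types.FunctionCallingConfigMode.AUTO
        allowed = None
    return types.ToolConfig(
        function_calling_config=types.FunctionCallingConfig(
            mode=mode,
            allowed_function_names=allowed,
        )
    )
```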
livekit/plugins/google/models.py
CHANGED
@@ -10,6 +10,8 @@ SpeechModels = Literal[
     "medical_conversation",
     "chirp",
     "chirp_2",
+    "latest_long",
+    "latest_short",
 ]
 
 SpeechLanguages = Literal[
@@ -92,8 +94,6 @@ SpeechLanguages = Literal[
 
 Gender = Literal["male", "female", "neutral"]
 
-AudioEncoding = Literal["wav", "mp3", "ogg", "mulaw", "alaw", "linear16"]
-
 ChatModels = Literal[
     "gemini-2.0-flash-001",
     "gemini-2.0-flash-lite-preview-02-05",
livekit/plugins/google/stt.py
CHANGED
@@ -19,7 +19,7 @@ import dataclasses
 import time
 import weakref
 from dataclasses import dataclass
-from typing import List, Union
+from typing import Callable, List, Union
 
 from livekit import rtc
 from livekit.agents import (
@@ -61,7 +61,7 @@ class STTOptions:
     interim_results: bool
     punctuate: bool
     spoken_punctuation: bool
-    model: SpeechModels
+    model: SpeechModels | str
     sample_rate: int
     keywords: List[tuple[str, float]] | None
 
@@ -93,7 +93,7 @@ class STT(stt.STT):
         interim_results: bool = True,
         punctuate: bool = True,
         spoken_punctuation: bool = False,
-        model: SpeechModels = "
+        model: SpeechModels | str = "latest_long",
         location: str = "us-central1",
         sample_rate: int = 16000,
         credentials_info: dict | None = None,
@@ -106,12 +106,24 @@ class STT(stt.STT):
         Credentials must be provided, either by using the ``credentials_info`` dict, or reading
         from the file specified in ``credentials_file`` or via Application Default Credentials as
         described in https://cloud.google.com/docs/authentication/application-default-credentials
+
+        args:
+            languages(LanguageCode): list of language codes to recognize (default: "en-US")
+            detect_language(bool): whether to detect the language of the audio (default: True)
+            interim_results(bool): whether to return interim results (default: True)
+            punctuate(bool): whether to punctuate the audio (default: True)
+            spoken_punctuation(bool): whether to use spoken punctuation (default: False)
+            model(SpeechModels): the model to use for recognition default: "latest_long"
+            location(str): the location to use for recognition default: "us-central1"
+            sample_rate(int): the sample rate of the audio default: 16000
+            credentials_info(dict): the credentials info to use for recognition (default: None)
+            credentials_file(str): the credentials file to use for recognition (default: None)
+            keywords(List[tuple[str, float]]): list of keywords to recognize (default: None)
         """
         super().__init__(
             capabilities=stt.STTCapabilities(streaming=True, interim_results=True)
         )
 
-        self._client: SpeechAsyncClient | None = None
         self._location = location
         self._credentials_info = credentials_info
         self._credentials_file = credentials_file
@@ -140,40 +152,44 @@ class STT(stt.STT):
             keywords=keywords,
         )
         self._streams = weakref.WeakSet[SpeechStream]()
+        self._pool = utils.ConnectionPool[SpeechAsyncClient](
+            max_session_duration=_max_session_duration,
+            connect_cb=self._create_client,
+        )
 
-    def
+    async def _create_client(self) -> SpeechAsyncClient:
         # Add support for passing a specific location that matches recognizer
         # see: https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages
         client_options = None
+        client: SpeechAsyncClient | None = None
         if self._location != "global":
             client_options = ClientOptions(
                 api_endpoint=f"{self._location}-speech.googleapis.com"
             )
         if self._credentials_info:
+            client = SpeechAsyncClient.from_service_account_info(
                 self._credentials_info,
                 client_options=client_options,
             )
         elif self._credentials_file:
+            client = SpeechAsyncClient.from_service_account_file(
                 self._credentials_file,
                 client_options=client_options,
             )
         else:
+            client = SpeechAsyncClient(
                 client_options=client_options,
             )
-        assert
-        return
+        assert client is not None
+        return client
 
-    def _recognizer(self) -> str:
+    def _get_recognizer(self, client: SpeechAsyncClient) -> str:
         # TODO(theomonnom): should we use recognizers?
         # recognizers may improve latency https://cloud.google.com/speech-to-text/v2/docs/recognizers#understand_recognizers
 
         # TODO(theomonnom): find a better way to access the project_id
         try:
-            project_id =
+            project_id = client.transport._credentials.project_id  # type: ignore
         except AttributeError:
             from google.auth import default as ga_default
 
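The STT class now routes every `SpeechAsyncClient` through `utils.ConnectionPool` from livekit-agents. A minimal sketch of that pattern, with the API shape inferred from this diff (the factory, the 240-second duration, and `recognize_once` are placeholders, not plugin code):

```python
from google.cloud.speech_v2 import SpeechAsyncClient
from livekit.agents import utils


async def make_client() -> SpeechAsyncClient:
    # Placeholder factory; the plugin's _create_client also applies a regional
    # endpoint and service-account credentials, as shown above.
    return SpeechAsyncClient()


# Clients are cached by the pool and rotated once they have been open for
# max_session_duration seconds (the plugin uses its _max_session_duration constant).
pool = utils.ConnectionPool[SpeechAsyncClient](
    max_session_duration=240,
    connect_cb=make_client,
)


async def recognize_once(request):
    # connection() hands out a pooled client and returns it afterwards;
    # pool.remove(client) retires one client, pool.invalidate() drops them all,
    # and await pool.aclose() shuts the pool down.
    async with pool.connection() as client:
        return await client.recognize(request)
```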
@@ -224,16 +240,17 @@ class STT(stt.STT):
         )
 
         try:
+            async with self._pool.connection() as client:
+                raw = await client.recognize(
+                    cloud_speech.RecognizeRequest(
+                        recognizer=self._get_recognizer(client),
+                        config=config,
+                        content=frame.data.tobytes(),
+                    ),
+                    timeout=conn_options.timeout,
+                )
 
+            return _recognize_response_to_speech_event(raw)
         except DeadlineExceeded:
             raise APITimeoutError()
         except GoogleAPICallError as e:
@@ -253,8 +270,8 @@ class STT(stt.STT):
         config = self._sanitize_options(language=language)
         stream = SpeechStream(
             stt=self,
+            pool=self._pool,
+            recognizer_cb=self._get_recognizer,
             config=config,
             conn_options=conn_options,
         )
@@ -287,13 +304,10 @@ class STT(stt.STT):
             self._config.spoken_punctuation = spoken_punctuation
         if model is not None:
             self._config.model = model
-        client = None
-        recognizer = None
         if location is not None:
             self._location = location
             # if location is changed, fetch a new client and recognizer as per the new location
-            recognizer = self._recognizer
+            self._pool.invalidate()
         if keywords is not None:
             self._config.keywords = keywords
 
@@ -306,10 +320,12 @@ class STT(stt.STT):
             spoken_punctuation=spoken_punctuation,
             model=model,
             keywords=keywords,
-            client=client,
-            recognizer=recognizer,
         )
 
+    async def aclose(self) -> None:
+        await self._pool.aclose()
+        await super().aclose()
+
 
 class SpeechStream(stt.SpeechStream):
     def __init__(
@@ -317,16 +333,16 @@ class SpeechStream(stt.SpeechStream):
         *,
         stt: STT,
         conn_options: APIConnectOptions,
+        pool: utils.ConnectionPool[SpeechAsyncClient],
+        recognizer_cb: Callable[[SpeechAsyncClient], str],
         config: STTOptions,
     ) -> None:
         super().__init__(
             stt=stt, conn_options=conn_options, sample_rate=config.sample_rate
         )
 
-        self.
-        self.
+        self._pool = pool
+        self._recognizer_cb = recognizer_cb
         self._config = config
         self._reconnect_event = asyncio.Event()
         self._session_connected_at: float = 0
@@ -341,8 +357,6 @@ class SpeechStream(stt.SpeechStream):
         spoken_punctuation: bool | None = None,
         model: SpeechModels | None = None,
         keywords: List[tuple[str, float]] | None = None,
-        client: SpeechAsyncClient | None = None,
-        recognizer: str | None = None,
     ):
         if languages is not None:
             if isinstance(languages, str):
@@ -360,21 +374,19 @@ class SpeechStream(stt.SpeechStream):
             self._config.model = model
         if keywords is not None:
             self._config.keywords = keywords
-        if client is not None:
-            self._client = client
-        if recognizer is not None:
-            self._recognizer = recognizer
 
         self._reconnect_event.set()
 
     async def _run(self) -> None:
         # google requires a async generator when calling streaming_recognize
         # this function basically convert the queue into a async generator
-        async def input_generator(
+        async def input_generator(
+            client: SpeechAsyncClient, should_stop: asyncio.Event
+        ):
             try:
                 # first request should contain the config
                 yield cloud_speech.StreamingRecognizeRequest(
-                    recognizer=self.
+                    recognizer=self._recognizer_cb(client),
                     streaming_config=self._streaming_config,
                 )
 
@@ -395,7 +407,7 @@ class SpeechStream(stt.SpeechStream):
                     "an error occurred while streaming input to google STT"
                 )
 
-        async def process_stream(stream):
+        async def process_stream(client: SpeechAsyncClient, stream):
             has_started = False
             async for resp in stream:
                 if (
@@ -437,6 +449,7 @@ class SpeechStream(stt.SpeechStream):
                     logger.debug(
                         "Google STT maximum connection time reached. Reconnecting..."
                     )
+                    self._pool.remove(client)
                     if has_started:
                         self._event_ch.send_nowait(
                             stt.SpeechEvent(
@@ -458,52 +471,57 @@ class SpeechStream(stt.SpeechStream):
 
         while True:
             try:
-                self.
-                    features=cloud_speech.RecognitionFeatures(
-                        enable_automatic_punctuation=self._config.punctuate,
-                        enable_word_time_offsets=True,
-                    ),
-                )
-                streaming_features=cloud_speech.StreamingRecognitionFeatures(
-                    enable_voice_activity_events=True,
-                    interim_results=self._config.interim_results,
-                ),
-                )
-
-                should_stop = asyncio.Event()
-                stream = await self._client.streaming_recognize(
-                    requests=input_generator(should_stop),
-                )
-                self._session_connected_at = time.time()
-
-                    [process_stream_task, wait_reconnect_task],
-                    return_when=asyncio.FIRST_COMPLETED,
-                )
-                    task.result()
-                    if wait_reconnect_task not in done:
-                        break
-                    self._reconnect_event.clear()
-                finally:
-                    await utils.aio.gracefully_cancel(
-                        process_stream_task, wait_reconnect_task
-                    )
+                async with self._pool.connection() as client:
+                    self._streaming_config = cloud_speech.StreamingRecognitionConfig(
+                        config=cloud_speech.RecognitionConfig(
+                            explicit_decoding_config=cloud_speech.ExplicitDecodingConfig(
+                                encoding=cloud_speech.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
+                                sample_rate_hertz=self._config.sample_rate,
+                                audio_channel_count=1,
+                            ),
+                            adaptation=self._config.build_adaptation(),
+                            language_codes=self._config.languages,
+                            model=self._config.model,
+                            features=cloud_speech.RecognitionFeatures(
+                                enable_automatic_punctuation=self._config.punctuate,
+                                enable_word_time_offsets=True,
+                            ),
+                        ),
+                        streaming_features=cloud_speech.StreamingRecognitionFeatures(
+                            enable_voice_activity_events=True,
+                            interim_results=self._config.interim_results,
+                        ),
+                    )
+
+                    should_stop = asyncio.Event()
+                    stream = await client.streaming_recognize(
+                        requests=input_generator(client, should_stop),
+                    )
+                    self._session_connected_at = time.time()
+
+                    process_stream_task = asyncio.create_task(
+                        process_stream(client, stream)
+                    )
+                    wait_reconnect_task = asyncio.create_task(
+                        self._reconnect_event.wait()
+                    )
+
+                    try:
+                        done, _ = await asyncio.wait(
+                            [process_stream_task, wait_reconnect_task],
+                            return_when=asyncio.FIRST_COMPLETED,
+                        )
+                        for task in done:
+                            if task != wait_reconnect_task:
+                                task.result()
+                        if wait_reconnect_task not in done:
+                            break
+                        self._reconnect_event.clear()
+                    finally:
+                        await utils.aio.gracefully_cancel(
+                            process_stream_task, wait_reconnect_task
+                        )
+                        should_stop.set()
             except DeadlineExceeded:
                 raise APITimeoutError()
             except GoogleAPICallError as e:
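Taken together, the STT changes mean clients are pooled, retired after a maximum session duration, and invalidated when the location changes, with "latest_long" as the new default model. A hedged construction example using only the arguments listed in the docstring above (values are illustrative):

```python
from livekit.plugins import google

# Argument names follow the docstring added in this release; values are illustrative.
stt_engine = google.STT(
    languages="en-US",
    detect_language=True,
    interim_results=True,
    model="latest_long",        # new default, now listed in models.py as well
    location="us-central1",
    sample_rate=16000,
    keywords=[("LiveKit", 15.0)],
)
```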
livekit/plugins/google/tts.py
CHANGED
@@ -15,10 +15,9 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
+from typing import Optional
 
-from livekit import rtc
 from livekit.agents import (
-    DEFAULT_API_CONNECT_OPTIONS,
     APIConnectionError,
     APIConnectOptions,
     APIStatusError,
@@ -31,7 +30,7 @@ from google.api_core.exceptions import DeadlineExceeded, GoogleAPICallError
 from google.cloud import texttospeech
 from google.cloud.texttospeech_v1.types import SsmlVoiceGender, SynthesizeSpeechResponse
 
-from .models import
+from .models import Gender, SpeechLanguages
 
 
 @dataclass
@@ -47,7 +46,6 @@ class TTS(tts.TTS):
         language: SpeechLanguages | str = "en-US",
         gender: Gender | str = "neutral",
         voice_name: str = "",  # Not required
-        encoding: AudioEncoding | str = "linear16",
         sample_rate: int = 24000,
         pitch: int = 0,
         effects_profile_id: str = "",
@@ -66,7 +64,6 @@ class TTS(tts.TTS):
             language (SpeechLanguages | str, optional): Language code (e.g., "en-US"). Default is "en-US".
             gender (Gender | str, optional): Voice gender ("male", "female", "neutral"). Default is "neutral".
             voice_name (str, optional): Specific voice name. Default is an empty string.
-            encoding (AudioEncoding | str, optional): Audio encoding format (e.g., "linear16"). Default is "linear16".
             sample_rate (int, optional): Audio sample rate in Hz. Default is 24000.
             pitch (float, optional): Speaking pitch, ranging from -20.0 to 20.0 semitones relative to the original pitch. Default is 0.
             effects_profile_id (str): Optional identifier for selecting audio effects profiles to apply to the synthesized speech.
@@ -93,17 +90,10 @@ class TTS(tts.TTS):
             ssml_gender=_gender_from_str(gender),
         )
 
-        if encoding == "linear16" or encoding == "wav":
-            _audio_encoding = texttospeech.AudioEncoding.LINEAR16
-        elif encoding == "mp3":
-            _audio_encoding = texttospeech.AudioEncoding.MP3
-        else:
-            raise NotImplementedError(f"audio encoding {encoding} is not supported")
-
         self._opts = _TTSOptions(
             voice=voice,
             audio_config=texttospeech.AudioConfig(
-                audio_encoding=
+                audio_encoding=texttospeech.AudioEncoding.OGG_OPUS,
                 sample_rate_hertz=sample_rate,
                 pitch=pitch,
                 effects_profile_id=effects_profile_id,
@@ -160,7 +150,7 @@ class TTS(tts.TTS):
         self,
         text: str,
         *,
-        conn_options: APIConnectOptions =
+        conn_options: Optional[APIConnectOptions] = None,
     ) -> "ChunkedStream":
         return ChunkedStream(
             tts=self,
@@ -177,9 +167,9 @@ class ChunkedStream(tts.ChunkedStream):
         *,
         tts: TTS,
         input_text: str,
-        conn_options: APIConnectOptions,
         opts: _TTSOptions,
         client: texttospeech.TextToSpeechAsyncClient,
+        conn_options: Optional[APIConnectOptions] = None,
     ) -> None:
         super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
         self._opts, self._client = opts, client
@@ -195,35 +185,24 @@ class ChunkedStream(tts.ChunkedStream):
                 timeout=self._conn_options.timeout,
             )
 
-            for frame in bstream.flush():
-                self._event_ch.send_nowait(
-                    tts.SynthesizedAudio(request_id=request_id, frame=frame)
-                )
-            else:
-                data = response.audio_content[44:]  # skip WAV header
-                self._event_ch.send_nowait(
-                    tts.SynthesizedAudio(
-                        request_id=request_id,
-                        frame=rtc.AudioFrame(
-                            data=data,
-                            sample_rate=self._opts.audio_config.sample_rate_hertz,
-                            num_channels=1,
-                            samples_per_channel=len(data) // 2,  # 16-bit
-                        ),
-                    )
-                )
+            # Create AudioStreamDecoder for OGG format
+            decoder = utils.codecs.AudioStreamDecoder(
+                sample_rate=self._opts.audio_config.sample_rate_hertz,
+                num_channels=1,
+            )
+
+            try:
+                decoder.push(response.audio_content)
+                decoder.end_input()
+                emitter = tts.SynthesizedAudioEmitter(
+                    event_ch=self._event_ch,
+                    request_id=request_id,
+                )
+                async for frame in decoder:
+                    emitter.push(frame)
+                emitter.flush()
+            finally:
+                await decoder.aclose()
 
         except DeadlineExceeded:
             raise APITimeoutError()
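The TTS path drops the `encoding` option entirely: audio is always requested as OGG_OPUS from Cloud Text-to-Speech and decoded to PCM frames with livekit-agents' `AudioStreamDecoder` instead of slicing a WAV header. A small usage sketch under that assumption (argument values are illustrative, and `synthesize` is assumed to keep the standard livekit-agents TTS signature shown in the hunk above):

```python
from livekit.plugins import google

tts_engine = google.TTS(
    language="en-US",
    gender="female",
    voice_name="",              # leave empty to let the service pick a voice
    sample_rate=24000,
    pitch=0,
    effects_profile_id="",
)

# Returns a ChunkedStream that now yields decoded audio frames.
audio_stream = tts_engine.synthesize("Hello from LiveKit")
```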
{livekit_plugins_google-0.10.5.dist-info → livekit_plugins_google-0.11.0.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: livekit-plugins-google
-Version: 0.
+Version: 0.11.0
 Summary: Agent Framework plugin for services from Google Cloud
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0
@@ -22,8 +22,8 @@ Description-Content-Type: text/markdown
 Requires-Dist: google-auth<3,>=2
 Requires-Dist: google-cloud-speech<3,>=2
 Requires-Dist: google-cloud-texttospeech<3,>=2
-Requires-Dist: google-genai==
-Requires-Dist: livekit-agents
+Requires-Dist: google-genai==1.3.0
+Requires-Dist: livekit-agents<1.0.0,>=0.12.16
 Dynamic: classifier
 Dynamic: description
 Dynamic: description-content-type
@@ -53,3 +53,57 @@ To use the STT and TTS API, you'll need to enable the respective services for yo
 
 - Cloud Speech-to-Text API
 - Cloud Text-to-Speech API
+
+## Gemini Multimodal Live
+
+Gemini Multimodal Live can be used with the `MultimodalAgent` class. See examples/multimodal_agent/gemini_agent.py for an example.
+
+### Live Video Input (experimental)
+
+You can push video frames to your Gemini Multimodal Live session alongside the audio automatically handled by the `MultimodalAgent`. The basic approach is to subscribe to the video track, create a video stream, sample frames at a suitable frame rate, and push them into the RealtimeSession:
+
+```
+# Make sure you subscribe to audio and video tracks
+await ctx.connect(auto_subscribe=AutoSubscribe.SUBSCRIBE_ALL)
+
+# Create your RealtimeModel and store a reference
+model = google.beta.realtime.RealtimeModel(
+    # ...
+)
+
+# Create your MultimodalAgent as usual
+agent = MultimodalAgent(
+    model=model,
+    # ...
+)
+
+# Async method to process the video track and push frames to Gemini
+async def _process_video_track(self, track: Track):
+    video_stream = VideoStream(track)
+    last_frame_time = 0
+
+    async for event in video_stream:
+        current_time = asyncio.get_event_loop().time()
+
+        # Sample at 1 FPS
+        if current_time - last_frame_time < 1.0:
+            continue
+
+        last_frame_time = current_time
+        frame = event.frame
+
+        # Push the frame into the RealtimeSession
+        model.sessions[0].push_video(frame)
+
+    await video_stream.aclose()
+
+# Subscribe to new tracks and process them
+@ctx.room.on("track_subscribed")
+def _on_track_subscribed(track: Track, pub, participant):
+    if track.kind == TrackKind.KIND_VIDEO:
+        asyncio.create_task(self._process_video_track(track))
+```
livekit_plugins_google-0.11.0.dist-info/RECORD
ADDED
@@ -0,0 +1,18 @@
+livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
+livekit/plugins/google/_utils.py,sha256=FG1_26nlWGcI6onPleQQcmGBMfb4QNYgis1B5BMJxWA,7131
+livekit/plugins/google/llm.py,sha256=LZaHsrkjfboRZLWm7L2G0mw62q2sXBNj4YeeV2Sk2uU,16717
+livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
+livekit/plugins/google/models.py,sha256=SGjAumdDK97NNLwMFcqZdKR68f1NoGB2Rk1UP2-imG0,1457
+livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+livekit/plugins/google/stt.py,sha256=96GJmGDAIBdCpDECArwIXpj2s1xlcA_zuvTnwsvq4xA,22854
+livekit/plugins/google/tts.py,sha256=pG9_pibO3NDGEMa4huU5S9lbeyI3daQyrS17SuTKfZI,8008
+livekit/plugins/google/version.py,sha256=BvmVdoHkxksDSQP-uWrqIiyaAUImEyxSohntkIBNZRo,601
+livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
+livekit/plugins/google/beta/realtime/__init__.py,sha256=sGTn6JFNyA30QUXBZ_BV3l2eHpGAzR35ByXxg77vWNU,205
+livekit/plugins/google/beta/realtime/api_proto.py,sha256=ralrRZqIbE71oyuLKRYaXHvm6tcHMwBJueKvSO8Xfus,658
+livekit/plugins/google/beta/realtime/realtime_api.py,sha256=vZHiWNk8PorxtrHSmA7Ya6ZvCjT37YSJN-MxK8ebdrs,22795
+livekit/plugins/google/beta/realtime/transcriber.py,sha256=rjXO0cSPr3HATxrSfv1MX7IbrjmiTvnLPF280BfRBL8,9809
+livekit_plugins_google-0.11.0.dist-info/METADATA,sha256=b8Aj_eQnGhAT3DQa77KLHZBDGAWZYdrnTBWjVODAm2k,3732
+livekit_plugins_google-0.11.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+livekit_plugins_google-0.11.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
+livekit_plugins_google-0.11.0.dist-info/RECORD,,
livekit_plugins_google-0.10.5.dist-info/RECORD
REMOVED
@@ -1,18 +0,0 @@
-livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
-livekit/plugins/google/_utils.py,sha256=mjsqblhGMgAZ2MNPisAVkNsqq4gfO6vvprEKzAGoVwE,7248
-livekit/plugins/google/llm.py,sha256=TVTerAabIf10AKVZr-Kn13eajhQ9RV7K4xaVD771yHU,16547
-livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
-livekit/plugins/google/models.py,sha256=Q47z_tIwLCufxhJyJHH7_1bo4xdBYZBSkkvMeycuItg,1493
-livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-livekit/plugins/google/stt.py,sha256=QcpKAcg8ltFlQnLGSdtRS2H12pFEPs1ZzLojKHB8bpY,21376
-livekit/plugins/google/tts.py,sha256=95qXCigVQYWNbcN3pIKBpIah4b31U_MWtXv5Ji0AMc4,9229
-livekit/plugins/google/version.py,sha256=na7fXYRLcWIgCRi4QSAbV4DZGA7YDgOWcE0O21jDlAo,601
-livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
-livekit/plugins/google/beta/realtime/__init__.py,sha256=sGTn6JFNyA30QUXBZ_BV3l2eHpGAzR35ByXxg77vWNU,205
-livekit/plugins/google/beta/realtime/api_proto.py,sha256=ralrRZqIbE71oyuLKRYaXHvm6tcHMwBJueKvSO8Xfus,658
-livekit/plugins/google/beta/realtime/realtime_api.py,sha256=RPGYAJXelYPo16YyR2qccjUjxUJCkJBU2N5rNTpKxyo,21281
-livekit/plugins/google/beta/realtime/transcriber.py,sha256=ZpKA3F8dqOtJPDlPiAgjw0AUDBIuhQiBVnvSYL4cdBg,9796
-livekit_plugins_google-0.10.5.dist-info/METADATA,sha256=AHhTVMBNVlOnqMnLPjncTO_iIqkDS-ExCm_5ubD9Mdg,2058
-livekit_plugins_google-0.10.5.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-livekit_plugins_google-0.10.5.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
-livekit_plugins_google-0.10.5.dist-info/RECORD,,
{livekit_plugins_google-0.10.5.dist-info → livekit_plugins_google-0.11.0.dist-info}/top_level.txt
RENAMED
File without changes