livekit-plugins-google 0.10.6__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/google/beta/realtime/realtime_api.py +50 -4
- livekit/plugins/google/models.py +0 -2
- livekit/plugins/google/stt.py +4 -0
- livekit/plugins/google/tts.py +19 -40
- livekit/plugins/google/version.py +1 -1
- {livekit_plugins_google-0.10.6.dist-info → livekit_plugins_google-0.11.0.dist-info}/METADATA +56 -2
- {livekit_plugins_google-0.10.6.dist-info → livekit_plugins_google-0.11.0.dist-info}/RECORD +9 -9
- {livekit_plugins_google-0.10.6.dist-info → livekit_plugins_google-0.11.0.dist-info}/WHEEL +0 -0
- {livekit_plugins_google-0.10.6.dist-info → livekit_plugins_google-0.11.0.dist-info}/top_level.txt +0 -0
livekit/plugins/google/beta/realtime/realtime_api.py CHANGED

@@ -9,6 +9,7 @@ from typing import AsyncIterable, Literal
 from livekit import rtc
 from livekit.agents import llm, utils
 from livekit.agents.llm.function_context import _create_ai_function_info
+from livekit.agents.utils import images

 from google import genai
 from google.genai.types import (
@@ -258,6 +259,8 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
         self._fnc_ctx = fnc_ctx
         self._fnc_tasks = utils.aio.TaskSet()
         self._is_interrupted = False
+        self._playout_complete = asyncio.Event()
+        self._playout_complete.set()

         tools = []
         if self._fnc_ctx is not None:
@@ -317,6 +320,10 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
         self._send_ch.close()
         await self._main_atask

+    @property
+    def playout_complete(self) -> asyncio.Event | None:
+        return self._playout_complete
+
     @property
     def fnc_ctx(self) -> llm.FunctionContext | None:
         return self._fnc_ctx
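The `playout_complete` property added above exposes a plain `asyncio.Event` that is created in a set state. A minimal sketch of how a consumer might await it, assuming an already-running `GeminiRealtimeSession` bound to a variable named `session` (the variable name and wrapper coroutine are illustrative, not part of this diff):

```python
async def wait_for_playout(session) -> None:
    # playout_complete is typed Optional in the new property, so guard
    # before awaiting; the underlying object is a plain asyncio.Event.
    event = session.playout_complete
    if event is not None:
        await event.wait()  # returns immediately while the event is set
```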
@@ -325,14 +332,53 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
     def fnc_ctx(self, value: llm.FunctionContext | None) -> None:
         self._fnc_ctx = value

-    def _push_audio(self, frame: rtc.AudioFrame) -> None:
-        if self._opts.enable_user_audio_transcription:
-            self._transcriber._push_audio(frame)
+    def _push_media_chunk(self, data: bytes, mime_type: str) -> None:
         realtime_input = LiveClientRealtimeInput(
-            media_chunks=[Blob(data=frame.data.tobytes(), mime_type="audio/pcm")],
+            media_chunks=[Blob(data=data, mime_type=mime_type)],
         )
         self._queue_msg(realtime_input)

+    DEFAULT_ENCODE_OPTIONS = images.EncodeOptions(
+        format="JPEG",
+        quality=75,
+        resize_options=images.ResizeOptions(
+            width=1024, height=1024, strategy="scale_aspect_fit"
+        ),
+    )
+
+    def push_video(
+        self,
+        frame: rtc.VideoFrame,
+        encode_options: images.EncodeOptions = DEFAULT_ENCODE_OPTIONS,
+    ) -> None:
+        """Push a video frame to the Gemini Multimodal Live session.
+
+        Args:
+            frame (rtc.VideoFrame): The video frame to push.
+            encode_options (images.EncodeOptions, optional): The encode options for the video frame. Defaults to 1024x1024 JPEG.
+
+        Notes:
+        - This will be sent immediately so you should use a sampling frame rate that makes sense for your application and Gemini's constraints. 1 FPS is a good starting point.
+        """
+        encoded_data = images.encode(
+            frame,
+            encode_options,
+        )
+        mime_type = (
+            "image/jpeg"
+            if encode_options.format == "JPEG"
+            else "image/png"
+            if encode_options.format == "PNG"
+            else "image/jpeg"
+        )
+        self._push_media_chunk(encoded_data, mime_type)
+
+    def _push_audio(self, frame: rtc.AudioFrame) -> None:
+        if self._opts.enable_user_audio_transcription:
+            self._transcriber._push_audio(frame)
+
+        self._push_media_chunk(frame.data.tobytes(), "audio/pcm")
+
     def _queue_msg(self, msg: ClientEvents) -> None:
         self._send_ch.send_nowait(msg)
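`push_video` is the only new public entry point in this hunk; both audio and video now route through the private `_push_media_chunk`. A hedged usage sketch with custom encode options, assuming a connected session object named `session` (illustrative) and reusing the `EncodeOptions`/`ResizeOptions` fields shown in the diff:

```python
from livekit import rtc
from livekit.agents.utils import images

# Cheaper frames than the 1024x1024 JPEG default; only the fields
# used by DEFAULT_ENCODE_OPTIONS above are set here.
LOW_RES_OPTIONS = images.EncodeOptions(
    format="JPEG",
    quality=50,
    resize_options=images.ResizeOptions(
        width=512, height=512, strategy="scale_aspect_fit"
    ),
)


def push_low_res(session, frame: rtc.VideoFrame) -> None:
    # session is assumed to be a connected GeminiRealtimeSession
    session.push_video(frame, encode_options=LOW_RES_OPTIONS)
```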
livekit/plugins/google/models.py CHANGED

livekit/plugins/google/stt.py CHANGED

livekit/plugins/google/tts.py CHANGED

@@ -17,7 +17,6 @@ from __future__ import annotations
 from dataclasses import dataclass
 from typing import Optional

-from livekit import rtc
 from livekit.agents import (
     APIConnectionError,
     APIConnectOptions,
@@ -31,7 +30,7 @@ from google.api_core.exceptions import DeadlineExceeded, GoogleAPICallError
 from google.cloud import texttospeech
 from google.cloud.texttospeech_v1.types import SsmlVoiceGender, SynthesizeSpeechResponse

-from .models import AudioEncoding, Gender, SpeechLanguages
+from .models import Gender, SpeechLanguages


 @dataclass
@@ -47,7 +46,6 @@ class TTS(tts.TTS):
         language: SpeechLanguages | str = "en-US",
         gender: Gender | str = "neutral",
         voice_name: str = "",  # Not required
-        encoding: AudioEncoding | str = "linear16",
         sample_rate: int = 24000,
         pitch: int = 0,
         effects_profile_id: str = "",
@@ -66,7 +64,6 @@ class TTS(tts.TTS):
             language (SpeechLanguages | str, optional): Language code (e.g., "en-US"). Default is "en-US".
             gender (Gender | str, optional): Voice gender ("male", "female", "neutral"). Default is "neutral".
             voice_name (str, optional): Specific voice name. Default is an empty string.
-            encoding (AudioEncoding | str, optional): Audio encoding format (e.g., "linear16"). Default is "linear16".
             sample_rate (int, optional): Audio sample rate in Hz. Default is 24000.
             pitch (float, optional): Speaking pitch, ranging from -20.0 to 20.0 semitones relative to the original pitch. Default is 0.
             effects_profile_id (str): Optional identifier for selecting audio effects profiles to apply to the synthesized speech.
@@ -93,17 +90,10 @@ class TTS(tts.TTS):
             ssml_gender=_gender_from_str(gender),
         )

-        if encoding == "linear16" or encoding == "wav":
-            _audio_encoding = texttospeech.AudioEncoding.LINEAR16
-        elif encoding == "mp3":
-            _audio_encoding = texttospeech.AudioEncoding.MP3
-        else:
-            raise NotImplementedError(f"audio encoding {encoding} is not supported")
-
         self._opts = _TTSOptions(
             voice=voice,
             audio_config=texttospeech.AudioConfig(
-                audio_encoding=_audio_encoding,
+                audio_encoding=texttospeech.AudioEncoding.OGG_OPUS,
                 sample_rate_hertz=sample_rate,
                 pitch=pitch,
                 effects_profile_id=effects_profile_id,
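With the `encoding` parameter removed, the constructor always requests OGG Opus from Google, and the remaining arguments are unchanged. A minimal construction sketch under that assumption (the voice name is illustrative):

```python
from livekit.plugins import google

# No `encoding` argument in 0.11.0; audio is requested as OGG Opus
# internally and decoded to PCM frames by the plugin.
tts = google.TTS(
    language="en-US",
    gender="female",
    voice_name="en-US-Standard-C",  # illustrative voice name
    sample_rate=24000,
)
```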
@@ -195,35 +185,24 @@ class ChunkedStream(tts.ChunkedStream):
                 timeout=self._conn_options.timeout,
             )

-            …
-                for frame in bstream.flush():
-                    self._event_ch.send_nowait(
-                        tts.SynthesizedAudio(request_id=request_id, frame=frame)
-                    )
-            else:
-                data = response.audio_content[44:]  # skip WAV header
-                self._event_ch.send_nowait(
-                    tts.SynthesizedAudio(
-                        request_id=request_id,
-                        frame=rtc.AudioFrame(
-                            data=data,
-                            sample_rate=self._opts.audio_config.sample_rate_hertz,
-                            num_channels=1,
-                            samples_per_channel=len(data) // 2,  # 16-bit
-                        ),
-                    )
+            # Create AudioStreamDecoder for OGG format
+            decoder = utils.codecs.AudioStreamDecoder(
+                sample_rate=self._opts.audio_config.sample_rate_hertz,
+                num_channels=1,
+            )
+
+            try:
+                decoder.push(response.audio_content)
+                decoder.end_input()
+                emitter = tts.SynthesizedAudioEmitter(
+                    event_ch=self._event_ch,
+                    request_id=request_id,
                 )
+                async for frame in decoder:
+                    emitter.push(frame)
+                emitter.flush()
+            finally:
+                await decoder.aclose()

         except DeadlineExceeded:
             raise APITimeoutError()
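The rewritten `_run` hands the OGG/Opus payload to `utils.codecs.AudioStreamDecoder` and re-emits decoded PCM frames through a `SynthesizedAudioEmitter`, so callers still receive raw frames. A hedged end-to-end sketch of consuming those frames, assuming the `synthesize()` entry point that `tts.TTS` implementations expose in livekit-agents 0.12.x:

```python
import asyncio

from livekit.plugins import google


async def main() -> None:
    gtts = google.TTS(language="en-US")
    stream = gtts.synthesize("Hello from Google Cloud TTS")
    # Iterating the ChunkedStream yields tts.SynthesizedAudio events
    # whose frames are already decoded to PCM by the new code path.
    async for audio in stream:
        print(audio.frame.sample_rate, audio.frame.samples_per_channel)
    await stream.aclose()


asyncio.run(main())
```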
{livekit_plugins_google-0.10.6.dist-info → livekit_plugins_google-0.11.0.dist-info}/METADATA RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: livekit-plugins-google
-Version: 0.10.6
+Version: 0.11.0
 Summary: Agent Framework plugin for services from Google Cloud
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0
@@ -23,7 +23,7 @@ Requires-Dist: google-auth<3,>=2
 Requires-Dist: google-cloud-speech<3,>=2
 Requires-Dist: google-cloud-texttospeech<3,>=2
 Requires-Dist: google-genai==1.3.0
-Requires-Dist: livekit-agents…
+Requires-Dist: livekit-agents<1.0.0,>=0.12.16
 Dynamic: classifier
 Dynamic: description
 Dynamic: description-content-type
@@ -53,3 +53,57 @@ To use the STT and TTS API, you'll need to enable the respective services for yo

 - Cloud Speech-to-Text API
 - Cloud Text-to-Speech API
+
+
+## Gemini Multimodal Live
+
+Gemini Multimodal Live can be used with the `MultimodalAgent` class. See examples/multimodal_agent/gemini_agent.py for an example.
+
+### Live Video Input (experimental)
+
+You can push video frames to your Gemini Multimodal Live session alongside the audio automatically handled by the `MultimodalAgent`. The basic approach is to subscribe to the video track, create a video stream, sample frames at a suitable frame rate, and push them into the RealtimeSession:
+
+```
+# Make sure you subscribe to audio and video tracks
+await ctx.connect(auto_subscribe=AutoSubscribe.SUBSCRIBE_ALL)
+
+# Create your RealtimeModel and store a reference
+model = google.beta.realtime.RealtimeModel(
+    # ...
+)
+
+# Create your MultimodalAgent as usual
+agent = MultimodalAgent(
+    model=model,
+    # ...
+)
+
+# Async method to process the video track and push frames to Gemini
+async def _process_video_track(self, track: Track):
+    video_stream = VideoStream(track)
+    last_frame_time = 0
+
+    async for event in video_stream:
+        current_time = asyncio.get_event_loop().time()
+
+        # Sample at 1 FPS
+        if current_time - last_frame_time < 1.0:
+            continue
+
+        last_frame_time = current_time
+        frame = event.frame
+
+        # Push the frame into the RealtimeSession
+        model.sessions[0].push_video(frame)
+
+    await video_stream.aclose()
+
+# Subscribe to new tracks and process them
+@ctx.room.on("track_subscribed")
+def _on_track_subscribed(track: Track, pub, participant):
+    if track.kind == TrackKind.KIND_VIDEO:
+        asyncio.create_task(self._process_video_track(track))
+```
+
{livekit_plugins_google-0.10.6.dist-info → livekit_plugins_google-0.11.0.dist-info}/RECORD RENAMED

@@ -2,17 +2,17 @@ livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWu
 livekit/plugins/google/_utils.py,sha256=FG1_26nlWGcI6onPleQQcmGBMfb4QNYgis1B5BMJxWA,7131
 livekit/plugins/google/llm.py,sha256=LZaHsrkjfboRZLWm7L2G0mw62q2sXBNj4YeeV2Sk2uU,16717
 livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
-livekit/plugins/google/models.py,sha256=…
+livekit/plugins/google/models.py,sha256=SGjAumdDK97NNLwMFcqZdKR68f1NoGB2Rk1UP2-imG0,1457
 livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-livekit/plugins/google/stt.py,sha256=…
-livekit/plugins/google/tts.py,sha256=…
-livekit/plugins/google/version.py,sha256=…
+livekit/plugins/google/stt.py,sha256=96GJmGDAIBdCpDECArwIXpj2s1xlcA_zuvTnwsvq4xA,22854
+livekit/plugins/google/tts.py,sha256=pG9_pibO3NDGEMa4huU5S9lbeyI3daQyrS17SuTKfZI,8008
+livekit/plugins/google/version.py,sha256=BvmVdoHkxksDSQP-uWrqIiyaAUImEyxSohntkIBNZRo,601
 livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
 livekit/plugins/google/beta/realtime/__init__.py,sha256=sGTn6JFNyA30QUXBZ_BV3l2eHpGAzR35ByXxg77vWNU,205
 livekit/plugins/google/beta/realtime/api_proto.py,sha256=ralrRZqIbE71oyuLKRYaXHvm6tcHMwBJueKvSO8Xfus,658
-livekit/plugins/google/beta/realtime/realtime_api.py,sha256=…
+livekit/plugins/google/beta/realtime/realtime_api.py,sha256=vZHiWNk8PorxtrHSmA7Ya6ZvCjT37YSJN-MxK8ebdrs,22795
 livekit/plugins/google/beta/realtime/transcriber.py,sha256=rjXO0cSPr3HATxrSfv1MX7IbrjmiTvnLPF280BfRBL8,9809
-livekit_plugins_google-0.10.6.dist-info/METADATA,sha256=…
-livekit_plugins_google-0.10.6.dist-info/WHEEL,sha256=…
-livekit_plugins_google-0.10.6.dist-info/top_level.txt,sha256=…
-livekit_plugins_google-0.10.6.dist-info/RECORD,,
+livekit_plugins_google-0.11.0.dist-info/METADATA,sha256=b8Aj_eQnGhAT3DQa77KLHZBDGAWZYdrnTBWjVODAm2k,3732
+livekit_plugins_google-0.11.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+livekit_plugins_google-0.11.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
+livekit_plugins_google-0.11.0.dist-info/RECORD,,
{livekit_plugins_google-0.10.6.dist-info → livekit_plugins_google-0.11.0.dist-info}/WHEEL RENAMED

File without changes

{livekit_plugins_google-0.10.6.dist-info → livekit_plugins_google-0.11.0.dist-info}/top_level.txt RENAMED

File without changes