livekit-plugins-google 0.10.6__py3-none-any.whl → 0.11.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/google/beta/realtime/api_proto.py +1 -1
- livekit/plugins/google/beta/realtime/realtime_api.py +55 -5
- livekit/plugins/google/models.py +0 -2
- livekit/plugins/google/stt.py +4 -0
- livekit/plugins/google/tts.py +19 -40
- livekit/plugins/google/version.py +1 -1
- {livekit_plugins_google-0.10.6.dist-info → livekit_plugins_google-0.11.1.dist-info}/METADATA +56 -2
- livekit_plugins_google-0.11.1.dist-info/RECORD +18 -0
- {livekit_plugins_google-0.10.6.dist-info → livekit_plugins_google-0.11.1.dist-info}/WHEEL +1 -1
- livekit_plugins_google-0.10.6.dist-info/RECORD +0 -18
- {livekit_plugins_google-0.10.6.dist-info → livekit_plugins_google-0.11.1.dist-info}/top_level.txt +0 -0
@@ -9,6 +9,7 @@ from typing import AsyncIterable, Literal
|
|
9
9
|
from livekit import rtc
|
10
10
|
from livekit.agents import llm, utils
|
11
11
|
from livekit.agents.llm.function_context import _create_ai_function_info
|
12
|
+
from livekit.agents.utils import images
|
12
13
|
|
13
14
|
from google import genai
|
14
15
|
from google.genai.types import (
|
@@ -82,6 +83,7 @@ class Capabilities:
|
|
82
83
|
class ModelOptions:
|
83
84
|
model: LiveAPIModels | str
|
84
85
|
api_key: str | None
|
86
|
+
api_version: str
|
85
87
|
voice: Voice | str
|
86
88
|
response_modalities: list[Modality] | None
|
87
89
|
vertexai: bool
|
@@ -106,6 +108,7 @@ class RealtimeModel:
|
|
106
108
|
instructions: str | None = None,
|
107
109
|
model: LiveAPIModels | str = "gemini-2.0-flash-exp",
|
108
110
|
api_key: str | None = None,
|
111
|
+
api_version: str = "v1alpha",
|
109
112
|
voice: Voice | str = "Puck",
|
110
113
|
modalities: list[Modality] = [Modality.AUDIO],
|
111
114
|
enable_user_audio_transcription: bool = True,
|
@@ -135,6 +138,7 @@ class RealtimeModel:
|
|
135
138
|
Args:
|
136
139
|
instructions (str, optional): Initial system instructions for the model. Defaults to "".
|
137
140
|
api_key (str or None, optional): Google Gemini API key. If None, will attempt to read from the environment variable GOOGLE_API_KEY.
|
141
|
+
api_version (str, optional): The version of the API to use. Defaults to "v1alpha".
|
138
142
|
modalities (list[Modality], optional): Modalities to use, such as ["TEXT", "AUDIO"]. Defaults to ["AUDIO"].
|
139
143
|
model (str or None, optional): The name of the model to use. Defaults to "gemini-2.0-flash-exp".
|
140
144
|
voice (api_proto.Voice, optional): Voice setting for audio outputs. Defaults to "Puck".
|
@@ -186,6 +190,7 @@ class RealtimeModel:
|
|
186
190
|
self._rt_sessions: list[GeminiRealtimeSession] = []
|
187
191
|
self._opts = ModelOptions(
|
188
192
|
model=model,
|
193
|
+
api_version=api_version,
|
189
194
|
api_key=self._api_key,
|
190
195
|
voice=voice,
|
191
196
|
enable_user_audio_transcription=enable_user_audio_transcription,
|
@@ -258,6 +263,8 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
|
|
258
263
|
self._fnc_ctx = fnc_ctx
|
259
264
|
self._fnc_tasks = utils.aio.TaskSet()
|
260
265
|
self._is_interrupted = False
|
266
|
+
self._playout_complete = asyncio.Event()
|
267
|
+
self._playout_complete.set()
|
261
268
|
|
262
269
|
tools = []
|
263
270
|
if self._fnc_ctx is not None:
|
@@ -286,7 +293,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
|
|
286
293
|
tools=tools,
|
287
294
|
)
|
288
295
|
self._client = genai.Client(
|
289
|
-
http_options=HttpOptions(api_version="v1alpha"),
|
296
|
+
http_options=HttpOptions(api_version=self._opts.api_version),
|
290
297
|
api_key=self._opts.api_key,
|
291
298
|
vertexai=self._opts.vertexai,
|
292
299
|
project=self._opts.project,
|
@@ -317,6 +324,10 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
|
|
317
324
|
self._send_ch.close()
|
318
325
|
await self._main_atask
|
319
326
|
|
327
|
+
@property
|
328
|
+
def playout_complete(self) -> asyncio.Event | None:
|
329
|
+
return self._playout_complete
|
330
|
+
|
320
331
|
@property
|
321
332
|
def fnc_ctx(self) -> llm.FunctionContext | None:
|
322
333
|
return self._fnc_ctx
|
@@ -325,14 +336,53 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
|
|
325
336
|
def fnc_ctx(self, value: llm.FunctionContext | None) -> None:
|
326
337
|
self._fnc_ctx = value
|
327
338
|
|
328
|
-
def _push_audio(self, frame: rtc.AudioFrame) -> None:
|
329
|
-
if self._opts.enable_user_audio_transcription:
|
330
|
-
self._transcriber._push_audio(frame)
|
339
|
+
def _push_media_chunk(self, data: bytes, mime_type: str) -> None:
|
331
340
|
realtime_input = LiveClientRealtimeInput(
|
332
|
-
media_chunks=[Blob(data=frame.data.tobytes(), mime_type="audio/pcm")],
|
341
|
+
media_chunks=[Blob(data=data, mime_type=mime_type)],
|
333
342
|
)
|
334
343
|
self._queue_msg(realtime_input)
|
335
344
|
|
345
|
+
DEFAULT_ENCODE_OPTIONS = images.EncodeOptions(
|
346
|
+
format="JPEG",
|
347
|
+
quality=75,
|
348
|
+
resize_options=images.ResizeOptions(
|
349
|
+
width=1024, height=1024, strategy="scale_aspect_fit"
|
350
|
+
),
|
351
|
+
)
|
352
|
+
|
353
|
+
def push_video(
|
354
|
+
self,
|
355
|
+
frame: rtc.VideoFrame,
|
356
|
+
encode_options: images.EncodeOptions = DEFAULT_ENCODE_OPTIONS,
|
357
|
+
) -> None:
|
358
|
+
"""Push a video frame to the Gemini Multimodal Live session.
|
359
|
+
|
360
|
+
Args:
|
361
|
+
frame (rtc.VideoFrame): The video frame to push.
|
362
|
+
encode_options (images.EncodeOptions, optional): The encode options for the video frame. Defaults to 1024x1024 JPEG.
|
363
|
+
|
364
|
+
Notes:
|
365
|
+
- This will be sent immediately so you should use a sampling frame rate that makes sense for your application and Gemini's constraints. 1 FPS is a good starting point.
|
366
|
+
"""
|
367
|
+
encoded_data = images.encode(
|
368
|
+
frame,
|
369
|
+
encode_options,
|
370
|
+
)
|
371
|
+
mime_type = (
|
372
|
+
"image/jpeg"
|
373
|
+
if encode_options.format == "JPEG"
|
374
|
+
else "image/png"
|
375
|
+
if encode_options.format == "PNG"
|
376
|
+
else "image/jpeg"
|
377
|
+
)
|
378
|
+
self._push_media_chunk(encoded_data, mime_type)
|
379
|
+
|
380
|
+
def _push_audio(self, frame: rtc.AudioFrame) -> None:
|
381
|
+
if self._opts.enable_user_audio_transcription:
|
382
|
+
self._transcriber._push_audio(frame)
|
383
|
+
|
384
|
+
self._push_media_chunk(frame.data.tobytes(), "audio/pcm")
|
385
|
+
|
336
386
|
def _queue_msg(self, msg: ClientEvents) -> None:
|
337
387
|
self._send_ch.send_nowait(msg)
|
338
388
|
|
livekit/plugins/google/models.py
CHANGED
livekit/plugins/google/stt.py
CHANGED
livekit/plugins/google/tts.py
CHANGED
@@ -17,7 +17,6 @@ from __future__ import annotations
|
|
17
17
|
from dataclasses import dataclass
|
18
18
|
from typing import Optional
|
19
19
|
|
20
|
-
from livekit import rtc
|
21
20
|
from livekit.agents import (
|
22
21
|
APIConnectionError,
|
23
22
|
APIConnectOptions,
|
@@ -31,7 +30,7 @@ from google.api_core.exceptions import DeadlineExceeded, GoogleAPICallError
|
|
31
30
|
from google.cloud import texttospeech
|
32
31
|
from google.cloud.texttospeech_v1.types import SsmlVoiceGender, SynthesizeSpeechResponse
|
33
32
|
|
34
|
-
from .models import AudioEncoding, Gender, SpeechLanguages
|
33
|
+
from .models import Gender, SpeechLanguages
|
35
34
|
|
36
35
|
|
37
36
|
@dataclass
|
@@ -47,7 +46,6 @@ class TTS(tts.TTS):
|
|
47
46
|
language: SpeechLanguages | str = "en-US",
|
48
47
|
gender: Gender | str = "neutral",
|
49
48
|
voice_name: str = "", # Not required
|
50
|
-
encoding: AudioEncoding | str = "linear16",
|
51
49
|
sample_rate: int = 24000,
|
52
50
|
pitch: int = 0,
|
53
51
|
effects_profile_id: str = "",
|
@@ -66,7 +64,6 @@ class TTS(tts.TTS):
|
|
66
64
|
language (SpeechLanguages | str, optional): Language code (e.g., "en-US"). Default is "en-US".
|
67
65
|
gender (Gender | str, optional): Voice gender ("male", "female", "neutral"). Default is "neutral".
|
68
66
|
voice_name (str, optional): Specific voice name. Default is an empty string.
|
69
|
-
encoding (AudioEncoding | str, optional): Audio encoding format (e.g., "linear16"). Default is "linear16".
|
70
67
|
sample_rate (int, optional): Audio sample rate in Hz. Default is 24000.
|
71
68
|
pitch (float, optional): Speaking pitch, ranging from -20.0 to 20.0 semitones relative to the original pitch. Default is 0.
|
72
69
|
effects_profile_id (str): Optional identifier for selecting audio effects profiles to apply to the synthesized speech.
|
@@ -93,17 +90,10 @@ class TTS(tts.TTS):
|
|
93
90
|
ssml_gender=_gender_from_str(gender),
|
94
91
|
)
|
95
92
|
|
96
|
-
if encoding == "linear16" or encoding == "wav":
|
97
|
-
_audio_encoding = texttospeech.AudioEncoding.LINEAR16
|
98
|
-
elif encoding == "mp3":
|
99
|
-
_audio_encoding = texttospeech.AudioEncoding.MP3
|
100
|
-
else:
|
101
|
-
raise NotImplementedError(f"audio encoding {encoding} is not supported")
|
102
|
-
|
103
93
|
self._opts = _TTSOptions(
|
104
94
|
voice=voice,
|
105
95
|
audio_config=texttospeech.AudioConfig(
|
106
|
-
audio_encoding=_audio_encoding,
|
96
|
+
audio_encoding=texttospeech.AudioEncoding.OGG_OPUS,
|
107
97
|
sample_rate_hertz=sample_rate,
|
108
98
|
pitch=pitch,
|
109
99
|
effects_profile_id=effects_profile_id,
|
@@ -195,35 +185,24 @@ class ChunkedStream(tts.ChunkedStream):
|
|
195
185
|
timeout=self._conn_options.timeout,
|
196
186
|
)
|
197
187
|
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
for frame in bstream.flush():
|
211
|
-
self._event_ch.send_nowait(
|
212
|
-
tts.SynthesizedAudio(request_id=request_id, frame=frame)
|
213
|
-
)
|
214
|
-
else:
|
215
|
-
data = response.audio_content[44:] # skip WAV header
|
216
|
-
self._event_ch.send_nowait(
|
217
|
-
tts.SynthesizedAudio(
|
218
|
-
request_id=request_id,
|
219
|
-
frame=rtc.AudioFrame(
|
220
|
-
data=data,
|
221
|
-
sample_rate=self._opts.audio_config.sample_rate_hertz,
|
222
|
-
num_channels=1,
|
223
|
-
samples_per_channel=len(data) // 2, # 16-bit
|
224
|
-
),
|
225
|
-
)
|
188
|
+
# Create AudioStreamDecoder for OGG format
|
189
|
+
decoder = utils.codecs.AudioStreamDecoder(
|
190
|
+
sample_rate=self._opts.audio_config.sample_rate_hertz,
|
191
|
+
num_channels=1,
|
192
|
+
)
|
193
|
+
|
194
|
+
try:
|
195
|
+
decoder.push(response.audio_content)
|
196
|
+
decoder.end_input()
|
197
|
+
emitter = tts.SynthesizedAudioEmitter(
|
198
|
+
event_ch=self._event_ch,
|
199
|
+
request_id=request_id,
|
226
200
|
)
|
201
|
+
async for frame in decoder:
|
202
|
+
emitter.push(frame)
|
203
|
+
emitter.flush()
|
204
|
+
finally:
|
205
|
+
await decoder.aclose()
|
227
206
|
|
228
207
|
except DeadlineExceeded:
|
229
208
|
raise APITimeoutError()
|
{livekit_plugins_google-0.10.6.dist-info → livekit_plugins_google-0.11.1.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: livekit-plugins-google
|
3
|
-
Version: 0.10.6
|
3
|
+
Version: 0.11.1
|
4
4
|
Summary: Agent Framework plugin for services from Google Cloud
|
5
5
|
Home-page: https://github.com/livekit/agents
|
6
6
|
License: Apache-2.0
|
@@ -23,7 +23,7 @@ Requires-Dist: google-auth<3,>=2
|
|
23
23
|
Requires-Dist: google-cloud-speech<3,>=2
|
24
24
|
Requires-Dist: google-cloud-texttospeech<3,>=2
|
25
25
|
Requires-Dist: google-genai==1.3.0
|
26
|
-
Requires-Dist: livekit-agents
|
26
|
+
Requires-Dist: livekit-agents<1.0.0,>=0.12.16
|
27
27
|
Dynamic: classifier
|
28
28
|
Dynamic: description
|
29
29
|
Dynamic: description-content-type
|
@@ -53,3 +53,57 @@ To use the STT and TTS API, you'll need to enable the respective services for yo
|
|
53
53
|
|
54
54
|
- Cloud Speech-to-Text API
|
55
55
|
- Cloud Text-to-Speech API
|
56
|
+
|
57
|
+
|
58
|
+
## Gemini Multimodal Live
|
59
|
+
|
60
|
+
Gemini Multimodal Live can be used with the `MultimodalAgent` class. See examples/multimodal_agent/gemini_agent.py for an example.
|
61
|
+
|
62
|
+
### Live Video Input (experimental)
|
63
|
+
|
64
|
+
You can push video frames to your Gemini Multimodal Live session alongside the audio automatically handled by the `MultimodalAgent`. The basic approach is to subscribe to the video track, create a video stream, sample frames at a suitable frame rate, and push them into the RealtimeSession:
|
65
|
+
|
66
|
+
```
|
67
|
+
# Make sure you subscribe to audio and video tracks
|
68
|
+
await ctx.connect(auto_subscribe=AutoSubscribe.SUBSCRIBE_ALL)
|
69
|
+
|
70
|
+
# Create your RealtimeModel and store a reference
|
71
|
+
model = google.beta.realtime.RealtimeModel(
|
72
|
+
# ...
|
73
|
+
)
|
74
|
+
|
75
|
+
# Create your MultimodalAgent as usual
|
76
|
+
agent = MultimodalAgent(
|
77
|
+
model=model,
|
78
|
+
# ...
|
79
|
+
)
|
80
|
+
|
81
|
+
# Async method to process the video track and push frames to Gemini
|
82
|
+
async def _process_video_track(self, track: Track):
|
83
|
+
video_stream = VideoStream(track)
|
84
|
+
last_frame_time = 0
|
85
|
+
|
86
|
+
async for event in video_stream:
|
87
|
+
current_time = asyncio.get_event_loop().time()
|
88
|
+
|
89
|
+
# Sample at 1 FPS
|
90
|
+
if current_time - last_frame_time < 1.0:
|
91
|
+
continue
|
92
|
+
|
93
|
+
last_frame_time = current_time
|
94
|
+
frame = event.frame
|
95
|
+
|
96
|
+
# Push the frame into the RealtimeSession
|
97
|
+
model.sessions[0].push_video(frame)
|
98
|
+
|
99
|
+
await video_stream.aclose()
|
100
|
+
|
101
|
+
# Subscribe to new tracks and process them
|
102
|
+
@ctx.room.on("track_subscribed")
|
103
|
+
def _on_track_subscribed(track: Track, pub, participant):
|
104
|
+
if track.kind == TrackKind.KIND_VIDEO:
|
105
|
+
asyncio.create_task(self._process_video_track(track))
|
106
|
+
```
|
107
|
+
|
108
|
+
|
109
|
+
|
@@ -0,0 +1,18 @@
|
|
1
|
+
livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
|
2
|
+
livekit/plugins/google/_utils.py,sha256=FG1_26nlWGcI6onPleQQcmGBMfb4QNYgis1B5BMJxWA,7131
|
3
|
+
livekit/plugins/google/llm.py,sha256=LZaHsrkjfboRZLWm7L2G0mw62q2sXBNj4YeeV2Sk2uU,16717
|
4
|
+
livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
|
5
|
+
livekit/plugins/google/models.py,sha256=SGjAumdDK97NNLwMFcqZdKR68f1NoGB2Rk1UP2-imG0,1457
|
6
|
+
livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
+
livekit/plugins/google/stt.py,sha256=96GJmGDAIBdCpDECArwIXpj2s1xlcA_zuvTnwsvq4xA,22854
|
8
|
+
livekit/plugins/google/tts.py,sha256=pG9_pibO3NDGEMa4huU5S9lbeyI3daQyrS17SuTKfZI,8008
|
9
|
+
livekit/plugins/google/version.py,sha256=LeUJJQ9jwADplJbF46ClzVjYAClwJEhZMCToNJN9lWc,601
|
10
|
+
livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
|
11
|
+
livekit/plugins/google/beta/realtime/__init__.py,sha256=sGTn6JFNyA30QUXBZ_BV3l2eHpGAzR35ByXxg77vWNU,205
|
12
|
+
livekit/plugins/google/beta/realtime/api_proto.py,sha256=9EhmwgeIgKDqdSijv5Q9pgx7UhAakK02ZDwbnUsra_o,657
|
13
|
+
livekit/plugins/google/beta/realtime/realtime_api.py,sha256=8JdWUMUheGhy1ia6JbN3_U2_cL7CNs8-1fTOAgW4I38,22999
|
14
|
+
livekit/plugins/google/beta/realtime/transcriber.py,sha256=rjXO0cSPr3HATxrSfv1MX7IbrjmiTvnLPF280BfRBL8,9809
|
15
|
+
livekit_plugins_google-0.11.1.dist-info/METADATA,sha256=m7B07abY9wTbEJVa3dmdsgfatxYwJFwDNQYhyJgIPJU,3732
|
16
|
+
livekit_plugins_google-0.11.1.dist-info/WHEEL,sha256=beeZ86-EfXScwlR_HKu4SllMC9wUEj_8Z_4FJ3egI2w,91
|
17
|
+
livekit_plugins_google-0.11.1.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
18
|
+
livekit_plugins_google-0.11.1.dist-info/RECORD,,
|
@@ -1,18 +0,0 @@
|
|
1
|
-
livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
|
2
|
-
livekit/plugins/google/_utils.py,sha256=FG1_26nlWGcI6onPleQQcmGBMfb4QNYgis1B5BMJxWA,7131
|
3
|
-
livekit/plugins/google/llm.py,sha256=LZaHsrkjfboRZLWm7L2G0mw62q2sXBNj4YeeV2Sk2uU,16717
|
4
|
-
livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
|
5
|
-
livekit/plugins/google/models.py,sha256=8Ysqkb0pOSSr_S9XHYxLz5nofDTt8RtfbsTIWoptOQU,1532
|
6
|
-
livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
-
livekit/plugins/google/stt.py,sha256=0-4mVD5IydvsWp9OzYyVmXe6pz6FDvPutRLF169y674,22752
|
8
|
-
livekit/plugins/google/tts.py,sha256=w4EMk9rPfyAzPyWFwE_5sPo9UY7DNFa2g83K56AUk9I,9228
|
9
|
-
livekit/plugins/google/version.py,sha256=B7ZiVTsE24YmkTGl3227ZHjutNpXQp27028_w5-LuRA,601
|
10
|
-
livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
|
11
|
-
livekit/plugins/google/beta/realtime/__init__.py,sha256=sGTn6JFNyA30QUXBZ_BV3l2eHpGAzR35ByXxg77vWNU,205
|
12
|
-
livekit/plugins/google/beta/realtime/api_proto.py,sha256=ralrRZqIbE71oyuLKRYaXHvm6tcHMwBJueKvSO8Xfus,658
|
13
|
-
livekit/plugins/google/beta/realtime/realtime_api.py,sha256=SU_uQvZMBwbVgexZqkAjGmJVUW80ObJ4LP53rV7xqko,21228
|
14
|
-
livekit/plugins/google/beta/realtime/transcriber.py,sha256=rjXO0cSPr3HATxrSfv1MX7IbrjmiTvnLPF280BfRBL8,9809
|
15
|
-
livekit_plugins_google-0.10.6.dist-info/METADATA,sha256=cvkHdPcsrRpbSjW8oowAgN392NWQmoUD429U6zYSeKk,2058
|
16
|
-
livekit_plugins_google-0.10.6.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
17
|
-
livekit_plugins_google-0.10.6.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
18
|
-
livekit_plugins_google-0.10.6.dist-info/RECORD,,
|
{livekit_plugins_google-0.10.6.dist-info → livekit_plugins_google-0.11.1.dist-info}/top_level.txt
RENAMED
File without changes
|