livekit-plugins-google 0.10.6__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/livekit/plugins/google/beta/realtime/realtime_api.py
+++ b/livekit/plugins/google/beta/realtime/realtime_api.py
@@ -9,6 +9,7 @@ from typing import AsyncIterable, Literal
 from livekit import rtc
 from livekit.agents import llm, utils
 from livekit.agents.llm.function_context import _create_ai_function_info
+from livekit.agents.utils import images
 
 from google import genai
 from google.genai.types import (
@@ -258,6 +259,8 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
         self._fnc_ctx = fnc_ctx
         self._fnc_tasks = utils.aio.TaskSet()
         self._is_interrupted = False
+        self._playout_complete = asyncio.Event()
+        self._playout_complete.set()
 
         tools = []
         if self._fnc_ctx is not None:
@@ -317,6 +320,10 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
         self._send_ch.close()
         await self._main_atask
 
+    @property
+    def playout_complete(self) -> asyncio.Event | None:
+        return self._playout_complete
+
     @property
     def fnc_ctx(self) -> llm.FunctionContext | None:
         return self._fnc_ctx
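The `playout_complete` event is created already set, so a caller that awaits it before any audio has played will not block. A minimal usage sketch; the helper and the `aclose()` name are assumptions, not shown in this diff:

```python
# Hypothetical shutdown helper: wait for any in-flight audio playout to
# finish before closing. `session` is an active GeminiRealtimeSession;
# `aclose()` is assumed to be its close method.
async def drain_and_close(session) -> None:
    if session.playout_complete is not None:
        await session.playout_complete.wait()  # already set when idle
    await session.aclose()
```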
@@ -325,14 +332,53 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
     def fnc_ctx(self, value: llm.FunctionContext | None) -> None:
         self._fnc_ctx = value
 
-    def _push_audio(self, frame: rtc.AudioFrame) -> None:
-        if self._opts.enable_user_audio_transcription:
-            self._transcriber._push_audio(frame)
+    def _push_media_chunk(self, data: bytes, mime_type: str) -> None:
         realtime_input = LiveClientRealtimeInput(
-            media_chunks=[Blob(data=frame.data.tobytes(), mime_type="audio/pcm")],
+            media_chunks=[Blob(data=data, mime_type=mime_type)],
         )
         self._queue_msg(realtime_input)
 
+    DEFAULT_ENCODE_OPTIONS = images.EncodeOptions(
+        format="JPEG",
+        quality=75,
+        resize_options=images.ResizeOptions(
+            width=1024, height=1024, strategy="scale_aspect_fit"
+        ),
+    )
+
+    def push_video(
+        self,
+        frame: rtc.VideoFrame,
+        encode_options: images.EncodeOptions = DEFAULT_ENCODE_OPTIONS,
+    ) -> None:
+        """Push a video frame to the Gemini Multimodal Live session.
+
+        Args:
+            frame (rtc.VideoFrame): The video frame to push.
+            encode_options (images.EncodeOptions, optional): The encode options for the video frame. Defaults to 1024x1024 JPEG.
+
+        Notes:
+            - Frames are sent immediately, so sample at a frame rate that makes sense for your application and Gemini's constraints. 1 FPS is a good starting point.
+        """
+        encoded_data = images.encode(
+            frame,
+            encode_options,
+        )
+        mime_type = (
+            "image/jpeg"
+            if encode_options.format == "JPEG"
+            else "image/png"
+            if encode_options.format == "PNG"
+            else "image/jpeg"
+        )
+        self._push_media_chunk(encoded_data, mime_type)
+
+    def _push_audio(self, frame: rtc.AudioFrame) -> None:
+        if self._opts.enable_user_audio_transcription:
+            self._transcriber._push_audio(frame)
+
+        self._push_media_chunk(frame.data.tobytes(), "audio/pcm")
+
 
     def _queue_msg(self, msg: ClientEvents) -> None:
         self._send_ch.send_nowait(msg)
--- a/livekit/plugins/google/models.py
+++ b/livekit/plugins/google/models.py
@@ -94,8 +94,6 @@ SpeechLanguages = Literal[
 
 Gender = Literal["male", "female", "neutral"]
 
-AudioEncoding = Literal["wav", "mp3", "ogg", "mulaw", "alaw", "linear16"]
-
 ChatModels = Literal[
     "gemini-2.0-flash-001",
     "gemini-2.0-flash-lite-preview-02-05",
--- a/livekit/plugins/google/stt.py
+++ b/livekit/plugins/google/stt.py
@@ -322,6 +322,10 @@ class STT(stt.STT):
             keywords=keywords,
         )
 
+    async def aclose(self) -> None:
+        await self._pool.aclose()
+        await super().aclose()
+
 
 class SpeechStream(stt.SpeechStream):
     def __init__(
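The new `STT.aclose()` lets callers release the pooled streaming connections deterministically. A minimal sketch, assuming the plugin is used standalone (the surrounding coroutine is hypothetical):

```python
from livekit.plugins import google

async def transcribe_then_cleanup() -> None:
    stt_ = google.STT()
    try:
        ...  # run recognition / streams as usual
    finally:
        # New in 0.11.0: closes the connection pool, then the base class.
        await stt_.aclose()
```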
--- a/livekit/plugins/google/tts.py
+++ b/livekit/plugins/google/tts.py
@@ -17,7 +17,6 @@ from __future__ import annotations
 from dataclasses import dataclass
 from typing import Optional
 
-from livekit import rtc
 from livekit.agents import (
     APIConnectionError,
     APIConnectOptions,
@@ -31,7 +30,7 @@ from google.api_core.exceptions import DeadlineExceeded, GoogleAPICallError
 from google.cloud import texttospeech
 from google.cloud.texttospeech_v1.types import SsmlVoiceGender, SynthesizeSpeechResponse
 
-from .models import AudioEncoding, Gender, SpeechLanguages
+from .models import Gender, SpeechLanguages
 
 
 @dataclass
@@ -47,7 +46,6 @@ class TTS(tts.TTS):
         language: SpeechLanguages | str = "en-US",
         gender: Gender | str = "neutral",
         voice_name: str = "",  # Not required
-        encoding: AudioEncoding | str = "linear16",
         sample_rate: int = 24000,
         pitch: int = 0,
         effects_profile_id: str = "",
@@ -66,7 +64,6 @@
             language (SpeechLanguages | str, optional): Language code (e.g., "en-US"). Default is "en-US".
             gender (Gender | str, optional): Voice gender ("male", "female", "neutral"). Default is "neutral".
             voice_name (str, optional): Specific voice name. Default is an empty string.
-            encoding (AudioEncoding | str, optional): Audio encoding format (e.g., "linear16"). Default is "linear16".
             sample_rate (int, optional): Audio sample rate in Hz. Default is 24000.
             pitch (float, optional): Speaking pitch, ranging from -20.0 to 20.0 semitones relative to the original pitch. Default is 0.
             effects_profile_id (str): Optional identifier for selecting audio effects profiles to apply to the synthesized speech.
@@ -93,17 +90,10 @@
             ssml_gender=_gender_from_str(gender),
         )
 
-        if encoding == "linear16" or encoding == "wav":
-            _audio_encoding = texttospeech.AudioEncoding.LINEAR16
-        elif encoding == "mp3":
-            _audio_encoding = texttospeech.AudioEncoding.MP3
-        else:
-            raise NotImplementedError(f"audio encoding {encoding} is not supported")
-
         self._opts = _TTSOptions(
             voice=voice,
             audio_config=texttospeech.AudioConfig(
-                audio_encoding=_audio_encoding,
+                audio_encoding=texttospeech.AudioEncoding.OGG_OPUS,
                 sample_rate_hertz=sample_rate,
                 pitch=pitch,
                 effects_profile_id=effects_profile_id,
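This is a breaking change for callers that passed `encoding=`: the argument is gone (along with the `AudioEncoding` alias removed from models.py above), and the plugin now always requests Ogg/Opus from Google and decodes it internally, as the ChunkedStream hunk below shows. A sketch of constructing the TTS after this change (values are illustrative):

```python
from livekit.plugins import google

# No `encoding` argument anymore: audio is always fetched as OGG_OPUS and
# decoded to PCM frames at `sample_rate` by the plugin itself.
tts_ = google.TTS(
    language="en-US",
    gender="neutral",
    sample_rate=24000,
)
```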
@@ -195,35 +185,24 @@ class ChunkedStream(tts.ChunkedStream):
                 timeout=self._conn_options.timeout,
             )
 
-            if self._opts.audio_config.audio_encoding == "mp3":
-                decoder = utils.codecs.Mp3StreamDecoder()
-                bstream = utils.audio.AudioByteStream(
-                    sample_rate=self._opts.audio_config.sample_rate_hertz,
-                    num_channels=1,
-                )
-                for frame in decoder.decode_chunk(response.audio_content):
-                    for frame in bstream.write(frame.data.tobytes()):
-                        self._event_ch.send_nowait(
-                            tts.SynthesizedAudio(request_id=request_id, frame=frame)
-                        )
-
-                for frame in bstream.flush():
-                    self._event_ch.send_nowait(
-                        tts.SynthesizedAudio(request_id=request_id, frame=frame)
-                    )
-            else:
-                data = response.audio_content[44:]  # skip WAV header
-                self._event_ch.send_nowait(
-                    tts.SynthesizedAudio(
-                        request_id=request_id,
-                        frame=rtc.AudioFrame(
-                            data=data,
-                            sample_rate=self._opts.audio_config.sample_rate_hertz,
-                            num_channels=1,
-                            samples_per_channel=len(data) // 2,  # 16-bit
-                        ),
-                    )
+            # Create AudioStreamDecoder for OGG format
+            decoder = utils.codecs.AudioStreamDecoder(
+                sample_rate=self._opts.audio_config.sample_rate_hertz,
+                num_channels=1,
+            )
+
+            try:
+                decoder.push(response.audio_content)
+                decoder.end_input()
+                emitter = tts.SynthesizedAudioEmitter(
+                    event_ch=self._event_ch,
+                    request_id=request_id,
                 )
+                async for frame in decoder:
+                    emitter.push(frame)
+                emitter.flush()
+            finally:
+                await decoder.aclose()
 
         except DeadlineExceeded:
             raise APITimeoutError()
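The replacement collapses the old MP3-versus-raw-WAV branching into one decode path: push the compressed payload, signal end of input, then iterate the decoded frames. A minimal sketch of that pattern in isolation (assumes livekit-agents >= 0.12.16, which this release pins; `decode_ogg` is a hypothetical helper):

```python
from livekit.agents import utils

async def decode_ogg(ogg_bytes: bytes) -> list:
    # Same push/end_input/iterate pattern as the hunk above.
    decoder = utils.codecs.AudioStreamDecoder(sample_rate=24000, num_channels=1)
    frames = []
    try:
        decoder.push(ogg_bytes)  # feed the complete compressed payload
        decoder.end_input()      # signal EOF so iteration terminates
        async for frame in decoder:  # yields decoded audio frames
            frames.append(frame)
    finally:
        await decoder.aclose()
    return frames
```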
--- a/livekit/plugins/google/version.py
+++ b/livekit/plugins/google/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.10.6"
+__version__ = "0.11.0"
--- a/livekit_plugins_google-0.10.6.dist-info/METADATA
+++ b/livekit_plugins_google-0.11.0.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: livekit-plugins-google
-Version: 0.10.6
+Version: 0.11.0
 Summary: Agent Framework plugin for services from Google Cloud
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0
@@ -23,7 +23,7 @@ Requires-Dist: google-auth<3,>=2
 Requires-Dist: google-cloud-speech<3,>=2
 Requires-Dist: google-cloud-texttospeech<3,>=2
 Requires-Dist: google-genai==1.3.0
-Requires-Dist: livekit-agents>=0.12.11
+Requires-Dist: livekit-agents<1.0.0,>=0.12.16
 Dynamic: classifier
 Dynamic: description
 Dynamic: description-content-type
@@ -53,3 +53,57 @@ To use the STT and TTS API, you'll need to enable the respective services for yo
 
 - Cloud Speech-to-Text API
 - Cloud Text-to-Speech API
+
+
+## Gemini Multimodal Live
+
+Gemini Multimodal Live can be used with the `MultimodalAgent` class. See examples/multimodal_agent/gemini_agent.py for an example.
+
+### Live Video Input (experimental)
+
+You can push video frames to your Gemini Multimodal Live session alongside the audio automatically handled by the `MultimodalAgent`. The basic approach is to subscribe to the video track, create a video stream, sample frames at a suitable frame rate, and push them into the RealtimeSession:
+
+```python
+# Make sure you subscribe to audio and video tracks
+await ctx.connect(auto_subscribe=AutoSubscribe.SUBSCRIBE_ALL)
+
+# Create your RealtimeModel and store a reference
+model = google.beta.realtime.RealtimeModel(
+    # ...
+)
+
+# Create your MultimodalAgent as usual
+agent = MultimodalAgent(
+    model=model,
+    # ...
+)
+
+# Async function to process the video track and push frames to Gemini
+async def _process_video_track(track: Track):
+    video_stream = VideoStream(track)
+    last_frame_time = 0
+
+    async for event in video_stream:
+        current_time = asyncio.get_event_loop().time()
+
+        # Sample at 1 FPS
+        if current_time - last_frame_time < 1.0:
+            continue
+
+        last_frame_time = current_time
+        frame = event.frame
+
+        # Push the frame into the RealtimeSession
+        model.sessions[0].push_video(frame)
+
+    await video_stream.aclose()
+
+# Subscribe to new tracks and process them
+@ctx.room.on("track_subscribed")
+def _on_track_subscribed(track: Track, pub, participant):
+    if track.kind == TrackKind.KIND_VIDEO:
+        asyncio.create_task(_process_video_track(track))
+```
+
+
+
--- a/livekit_plugins_google-0.10.6.dist-info/RECORD
+++ b/livekit_plugins_google-0.11.0.dist-info/RECORD
@@ -2,17 +2,17 @@ livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWu
 livekit/plugins/google/_utils.py,sha256=FG1_26nlWGcI6onPleQQcmGBMfb4QNYgis1B5BMJxWA,7131
 livekit/plugins/google/llm.py,sha256=LZaHsrkjfboRZLWm7L2G0mw62q2sXBNj4YeeV2Sk2uU,16717
 livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
-livekit/plugins/google/models.py,sha256=8Ysqkb0pOSSr_S9XHYxLz5nofDTt8RtfbsTIWoptOQU,1532
+livekit/plugins/google/models.py,sha256=SGjAumdDK97NNLwMFcqZdKR68f1NoGB2Rk1UP2-imG0,1457
 livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-livekit/plugins/google/stt.py,sha256=0-4mVD5IydvsWp9OzYyVmXe6pz6FDvPutRLF169y674,22752
-livekit/plugins/google/tts.py,sha256=w4EMk9rPfyAzPyWFwE_5sPo9UY7DNFa2g83K56AUk9I,9228
-livekit/plugins/google/version.py,sha256=B7ZiVTsE24YmkTGl3227ZHjutNpXQp27028_w5-LuRA,601
+livekit/plugins/google/stt.py,sha256=96GJmGDAIBdCpDECArwIXpj2s1xlcA_zuvTnwsvq4xA,22854
+livekit/plugins/google/tts.py,sha256=pG9_pibO3NDGEMa4huU5S9lbeyI3daQyrS17SuTKfZI,8008
+livekit/plugins/google/version.py,sha256=BvmVdoHkxksDSQP-uWrqIiyaAUImEyxSohntkIBNZRo,601
 livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
 livekit/plugins/google/beta/realtime/__init__.py,sha256=sGTn6JFNyA30QUXBZ_BV3l2eHpGAzR35ByXxg77vWNU,205
 livekit/plugins/google/beta/realtime/api_proto.py,sha256=ralrRZqIbE71oyuLKRYaXHvm6tcHMwBJueKvSO8Xfus,658
-livekit/plugins/google/beta/realtime/realtime_api.py,sha256=SU_uQvZMBwbVgexZqkAjGmJVUW80ObJ4LP53rV7xqko,21228
+livekit/plugins/google/beta/realtime/realtime_api.py,sha256=vZHiWNk8PorxtrHSmA7Ya6ZvCjT37YSJN-MxK8ebdrs,22795
 livekit/plugins/google/beta/realtime/transcriber.py,sha256=rjXO0cSPr3HATxrSfv1MX7IbrjmiTvnLPF280BfRBL8,9809
-livekit_plugins_google-0.10.6.dist-info/METADATA,sha256=cvkHdPcsrRpbSjW8oowAgN392NWQmoUD429U6zYSeKk,2058
-livekit_plugins_google-0.10.6.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
-livekit_plugins_google-0.10.6.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
-livekit_plugins_google-0.10.6.dist-info/RECORD,,
+livekit_plugins_google-0.11.0.dist-info/METADATA,sha256=b8Aj_eQnGhAT3DQa77KLHZBDGAWZYdrnTBWjVODAm2k,3732
+livekit_plugins_google-0.11.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+livekit_plugins_google-0.11.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
+livekit_plugins_google-0.11.0.dist-info/RECORD,,