livekit-plugins-google 0.10.6__py3-none-any.whl → 0.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,7 @@ from google.genai import types
6
6
 
7
7
  from ..._utils import _build_gemini_ctx, _build_tools
8
8
 
9
- LiveAPIModels = Literal["gemini-2.0-flash-001",]
9
+ LiveAPIModels = Literal["gemini-2.0-flash-exp"]
10
10
 
11
11
  Voice = Literal["Puck", "Charon", "Kore", "Fenrir", "Aoede"]
12
12
 
@@ -9,6 +9,7 @@ from typing import AsyncIterable, Literal
9
9
  from livekit import rtc
10
10
  from livekit.agents import llm, utils
11
11
  from livekit.agents.llm.function_context import _create_ai_function_info
12
+ from livekit.agents.utils import images
12
13
 
13
14
  from google import genai
14
15
  from google.genai.types import (
@@ -82,6 +83,7 @@ class Capabilities:
82
83
  class ModelOptions:
83
84
  model: LiveAPIModels | str
84
85
  api_key: str | None
86
+ api_version: str
85
87
  voice: Voice | str
86
88
  response_modalities: list[Modality] | None
87
89
  vertexai: bool
@@ -106,6 +108,7 @@ class RealtimeModel:
106
108
  instructions: str | None = None,
107
109
  model: LiveAPIModels | str = "gemini-2.0-flash-exp",
108
110
  api_key: str | None = None,
111
+ api_version: str = "v1alpha",
109
112
  voice: Voice | str = "Puck",
110
113
  modalities: list[Modality] = [Modality.AUDIO],
111
114
  enable_user_audio_transcription: bool = True,
@@ -135,6 +138,7 @@ class RealtimeModel:
135
138
  Args:
136
139
  instructions (str or None, optional): Initial system instructions for the model. Defaults to None.
137
140
  api_key (str or None, optional): Google Gemini API key. If None, will attempt to read from the environment variable GOOGLE_API_KEY.
141
+ api_version (str, optional): The version of the API to use. Defaults to "v1alpha".
138
142
  modalities (list[Modality], optional): Modalities to use, such as ["TEXT", "AUDIO"]. Defaults to ["AUDIO"].
139
143
  model (str or None, optional): The name of the model to use. Defaults to "gemini-2.0-flash-exp".
140
144
  voice (api_proto.Voice, optional): Voice setting for audio outputs. Defaults to "Puck".
@@ -186,6 +190,7 @@ class RealtimeModel:
186
190
  self._rt_sessions: list[GeminiRealtimeSession] = []
187
191
  self._opts = ModelOptions(
188
192
  model=model,
193
+ api_version=api_version,
189
194
  api_key=self._api_key,
190
195
  voice=voice,
191
196
  enable_user_audio_transcription=enable_user_audio_transcription,
@@ -258,6 +263,8 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
258
263
  self._fnc_ctx = fnc_ctx
259
264
  self._fnc_tasks = utils.aio.TaskSet()
260
265
  self._is_interrupted = False
266
+ self._playout_complete = asyncio.Event()
267
+ self._playout_complete.set()
261
268
 
262
269
  tools = []
263
270
  if self._fnc_ctx is not None:
@@ -286,7 +293,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
286
293
  tools=tools,
287
294
  )
288
295
  self._client = genai.Client(
289
- http_options=HttpOptions(api_version="v1alpha"),
296
+ http_options=HttpOptions(api_version=self._opts.api_version),
290
297
  api_key=self._opts.api_key,
291
298
  vertexai=self._opts.vertexai,
292
299
  project=self._opts.project,
@@ -317,6 +324,10 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
317
324
  self._send_ch.close()
318
325
  await self._main_atask
319
326
 
327
+ @property
328
+ def playout_complete(self) -> asyncio.Event | None:
329
+ return self._playout_complete
330
+
320
331
  @property
321
332
  def fnc_ctx(self) -> llm.FunctionContext | None:
322
333
  return self._fnc_ctx
@@ -325,14 +336,53 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
325
336
  def fnc_ctx(self, value: llm.FunctionContext | None) -> None:
326
337
  self._fnc_ctx = value
327
338
 
328
- def _push_audio(self, frame: rtc.AudioFrame) -> None:
329
- if self._opts.enable_user_audio_transcription:
330
- self._transcriber._push_audio(frame)
339
+ def _push_media_chunk(self, data: bytes, mime_type: str) -> None:
331
340
  realtime_input = LiveClientRealtimeInput(
332
- media_chunks=[Blob(data=frame.data.tobytes(), mime_type="audio/pcm")],
341
+ media_chunks=[Blob(data=data, mime_type=mime_type)],
333
342
  )
334
343
  self._queue_msg(realtime_input)
335
344
 
345
+ DEFAULT_ENCODE_OPTIONS = images.EncodeOptions(
346
+ format="JPEG",
347
+ quality=75,
348
+ resize_options=images.ResizeOptions(
349
+ width=1024, height=1024, strategy="scale_aspect_fit"
350
+ ),
351
+ )
352
+
353
+ def push_video(
354
+ self,
355
+ frame: rtc.VideoFrame,
356
+ encode_options: images.EncodeOptions = DEFAULT_ENCODE_OPTIONS,
357
+ ) -> None:
358
+ """Push a video frame to the Gemini Multimodal Live session.
359
+
360
+ Args:
361
+ frame (rtc.VideoFrame): The video frame to push.
362
+ encode_options (images.EncodeOptions, optional): The encode options for the video frame. Defaults to JPEG at quality 75, resized to 1024x1024 using the "scale_aspect_fit" strategy.
363
+
364
+ Notes:
365
+ - The frame is sent immediately, so use a sampling frame rate that makes sense for your application and Gemini's constraints. 1 FPS is a good starting point.
366
+ """
367
+ encoded_data = images.encode(
368
+ frame,
369
+ encode_options,
370
+ )
371
+ mime_type = (
372
+ "image/jpeg"
373
+ if encode_options.format == "JPEG"
374
+ else "image/png"
375
+ if encode_options.format == "PNG"
376
+ else "image/jpeg"
377
+ )
378
+ self._push_media_chunk(encoded_data, mime_type)
379
+
380
+ def _push_audio(self, frame: rtc.AudioFrame) -> None:
381
+ if self._opts.enable_user_audio_transcription:
382
+ self._transcriber._push_audio(frame)
383
+
384
+ self._push_media_chunk(frame.data.tobytes(), "audio/pcm")
385
+
336
386
  def _queue_msg(self, msg: ClientEvents) -> None:
337
387
  self._send_ch.send_nowait(msg)
338
388
 
@@ -94,8 +94,6 @@ SpeechLanguages = Literal[
94
94
 
95
95
  Gender = Literal["male", "female", "neutral"]
96
96
 
97
- AudioEncoding = Literal["wav", "mp3", "ogg", "mulaw", "alaw", "linear16"]
98
-
99
97
  ChatModels = Literal[
100
98
  "gemini-2.0-flash-001",
101
99
  "gemini-2.0-flash-lite-preview-02-05",
@@ -322,6 +322,10 @@ class STT(stt.STT):
322
322
  keywords=keywords,
323
323
  )
324
324
 
325
+ async def aclose(self) -> None:
326
+ await self._pool.aclose()
327
+ await super().aclose()
328
+
325
329
 
326
330
  class SpeechStream(stt.SpeechStream):
327
331
  def __init__(
@@ -17,7 +17,6 @@ from __future__ import annotations
17
17
  from dataclasses import dataclass
18
18
  from typing import Optional
19
19
 
20
- from livekit import rtc
21
20
  from livekit.agents import (
22
21
  APIConnectionError,
23
22
  APIConnectOptions,
@@ -31,7 +30,7 @@ from google.api_core.exceptions import DeadlineExceeded, GoogleAPICallError
31
30
  from google.cloud import texttospeech
32
31
  from google.cloud.texttospeech_v1.types import SsmlVoiceGender, SynthesizeSpeechResponse
33
32
 
34
- from .models import AudioEncoding, Gender, SpeechLanguages
33
+ from .models import Gender, SpeechLanguages
35
34
 
36
35
 
37
36
  @dataclass
@@ -47,7 +46,6 @@ class TTS(tts.TTS):
47
46
  language: SpeechLanguages | str = "en-US",
48
47
  gender: Gender | str = "neutral",
49
48
  voice_name: str = "", # Not required
50
- encoding: AudioEncoding | str = "linear16",
51
49
  sample_rate: int = 24000,
52
50
  pitch: int = 0,
53
51
  effects_profile_id: str = "",
@@ -66,7 +64,6 @@ class TTS(tts.TTS):
66
64
  language (SpeechLanguages | str, optional): Language code (e.g., "en-US"). Default is "en-US".
67
65
  gender (Gender | str, optional): Voice gender ("male", "female", "neutral"). Default is "neutral".
68
66
  voice_name (str, optional): Specific voice name. Default is an empty string.
69
- encoding (AudioEncoding | str, optional): Audio encoding format (e.g., "linear16"). Default is "linear16".
70
67
  sample_rate (int, optional): Audio sample rate in Hz. Default is 24000.
71
68
  pitch (float, optional): Speaking pitch, ranging from -20.0 to 20.0 semitones relative to the original pitch. Default is 0.
72
69
  effects_profile_id (str): Optional identifier for selecting audio effects profiles to apply to the synthesized speech.
@@ -93,17 +90,10 @@ class TTS(tts.TTS):
93
90
  ssml_gender=_gender_from_str(gender),
94
91
  )
95
92
 
96
- if encoding == "linear16" or encoding == "wav":
97
- _audio_encoding = texttospeech.AudioEncoding.LINEAR16
98
- elif encoding == "mp3":
99
- _audio_encoding = texttospeech.AudioEncoding.MP3
100
- else:
101
- raise NotImplementedError(f"audio encoding {encoding} is not supported")
102
-
103
93
  self._opts = _TTSOptions(
104
94
  voice=voice,
105
95
  audio_config=texttospeech.AudioConfig(
106
- audio_encoding=_audio_encoding,
96
+ audio_encoding=texttospeech.AudioEncoding.OGG_OPUS,
107
97
  sample_rate_hertz=sample_rate,
108
98
  pitch=pitch,
109
99
  effects_profile_id=effects_profile_id,
@@ -195,35 +185,24 @@ class ChunkedStream(tts.ChunkedStream):
195
185
  timeout=self._conn_options.timeout,
196
186
  )
197
187
 
198
- if self._opts.audio_config.audio_encoding == "mp3":
199
- decoder = utils.codecs.Mp3StreamDecoder()
200
- bstream = utils.audio.AudioByteStream(
201
- sample_rate=self._opts.audio_config.sample_rate_hertz,
202
- num_channels=1,
203
- )
204
- for frame in decoder.decode_chunk(response.audio_content):
205
- for frame in bstream.write(frame.data.tobytes()):
206
- self._event_ch.send_nowait(
207
- tts.SynthesizedAudio(request_id=request_id, frame=frame)
208
- )
209
-
210
- for frame in bstream.flush():
211
- self._event_ch.send_nowait(
212
- tts.SynthesizedAudio(request_id=request_id, frame=frame)
213
- )
214
- else:
215
- data = response.audio_content[44:] # skip WAV header
216
- self._event_ch.send_nowait(
217
- tts.SynthesizedAudio(
218
- request_id=request_id,
219
- frame=rtc.AudioFrame(
220
- data=data,
221
- sample_rate=self._opts.audio_config.sample_rate_hertz,
222
- num_channels=1,
223
- samples_per_channel=len(data) // 2, # 16-bit
224
- ),
225
- )
188
+ # Create AudioStreamDecoder for OGG format
189
+ decoder = utils.codecs.AudioStreamDecoder(
190
+ sample_rate=self._opts.audio_config.sample_rate_hertz,
191
+ num_channels=1,
192
+ )
193
+
194
+ try:
195
+ decoder.push(response.audio_content)
196
+ decoder.end_input()
197
+ emitter = tts.SynthesizedAudioEmitter(
198
+ event_ch=self._event_ch,
199
+ request_id=request_id,
226
200
  )
201
+ async for frame in decoder:
202
+ emitter.push(frame)
203
+ emitter.flush()
204
+ finally:
205
+ await decoder.aclose()
227
206
 
228
207
  except DeadlineExceeded:
229
208
  raise APITimeoutError()
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.10.6"
15
+ __version__ = "0.11.1"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: livekit-plugins-google
3
- Version: 0.10.6
3
+ Version: 0.11.1
4
4
  Summary: Agent Framework plugin for services from Google Cloud
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -23,7 +23,7 @@ Requires-Dist: google-auth<3,>=2
23
23
  Requires-Dist: google-cloud-speech<3,>=2
24
24
  Requires-Dist: google-cloud-texttospeech<3,>=2
25
25
  Requires-Dist: google-genai==1.3.0
26
- Requires-Dist: livekit-agents>=0.12.11
26
+ Requires-Dist: livekit-agents<1.0.0,>=0.12.16
27
27
  Dynamic: classifier
28
28
  Dynamic: description
29
29
  Dynamic: description-content-type
@@ -53,3 +53,57 @@ To use the STT and TTS API, you'll need to enable the respective services for yo
53
53
 
54
54
  - Cloud Speech-to-Text API
55
55
  - Cloud Text-to-Speech API
56
+
57
+
58
+ ## Gemini Multimodal Live
59
+
60
+ Gemini Multimodal Live can be used with the `MultimodalAgent` class. See examples/multimodal_agent/gemini_agent.py for an example.
61
+
62
+ ### Live Video Input (experimental)
63
+
64
+ You can push video frames to your Gemini Multimodal Live session alongside the audio automatically handled by the `MultimodalAgent`. The basic approach is to subscribe to the video track, create a video stream, sample frames at a suitable frame rate, and push them into the RealtimeSession:
65
+
66
+ ```
67
+ # Make sure you subscribe to audio and video tracks
68
+ await ctx.connect(auto_subscribe=AutoSubscribe.SUBSCRIBE_ALL)
69
+
70
+ # Create your RealtimeModel and store a reference
71
+ model = google.beta.realtime.RealtimeModel(
72
+ # ...
73
+ )
74
+
75
+ # Create your MultimodalAgent as usual
76
+ agent = MultimodalAgent(
77
+ model=model,
78
+ # ...
79
+ )
80
+
81
+ # Async method to process the video track and push frames to Gemini
82
+ async def _process_video_track(self, track: Track):
83
+ video_stream = VideoStream(track)
84
+ last_frame_time = 0
85
+
86
+ async for event in video_stream:
87
+ current_time = asyncio.get_event_loop().time()
88
+
89
+ # Sample at 1 FPS
90
+ if current_time - last_frame_time < 1.0:
91
+ continue
92
+
93
+ last_frame_time = current_time
94
+ frame = event.frame
95
+
96
+ # Push the frame into the RealtimeSession
97
+ model.sessions[0].push_video(frame)
98
+
99
+ await video_stream.aclose()
100
+
101
+ # Subscribe to new tracks and process them
102
+ @ctx.room.on("track_subscribed")
103
+ def _on_track_subscribed(track: Track, pub, participant):
104
+ if track.kind == TrackKind.KIND_VIDEO:
105
+ asyncio.create_task(self._process_video_track(track))
106
+ ```
107
+
108
+
109
+
@@ -0,0 +1,18 @@
1
+ livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
2
+ livekit/plugins/google/_utils.py,sha256=FG1_26nlWGcI6onPleQQcmGBMfb4QNYgis1B5BMJxWA,7131
3
+ livekit/plugins/google/llm.py,sha256=LZaHsrkjfboRZLWm7L2G0mw62q2sXBNj4YeeV2Sk2uU,16717
4
+ livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
5
+ livekit/plugins/google/models.py,sha256=SGjAumdDK97NNLwMFcqZdKR68f1NoGB2Rk1UP2-imG0,1457
6
+ livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ livekit/plugins/google/stt.py,sha256=96GJmGDAIBdCpDECArwIXpj2s1xlcA_zuvTnwsvq4xA,22854
8
+ livekit/plugins/google/tts.py,sha256=pG9_pibO3NDGEMa4huU5S9lbeyI3daQyrS17SuTKfZI,8008
9
+ livekit/plugins/google/version.py,sha256=LeUJJQ9jwADplJbF46ClzVjYAClwJEhZMCToNJN9lWc,601
10
+ livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
11
+ livekit/plugins/google/beta/realtime/__init__.py,sha256=sGTn6JFNyA30QUXBZ_BV3l2eHpGAzR35ByXxg77vWNU,205
12
+ livekit/plugins/google/beta/realtime/api_proto.py,sha256=9EhmwgeIgKDqdSijv5Q9pgx7UhAakK02ZDwbnUsra_o,657
13
+ livekit/plugins/google/beta/realtime/realtime_api.py,sha256=8JdWUMUheGhy1ia6JbN3_U2_cL7CNs8-1fTOAgW4I38,22999
14
+ livekit/plugins/google/beta/realtime/transcriber.py,sha256=rjXO0cSPr3HATxrSfv1MX7IbrjmiTvnLPF280BfRBL8,9809
15
+ livekit_plugins_google-0.11.1.dist-info/METADATA,sha256=m7B07abY9wTbEJVa3dmdsgfatxYwJFwDNQYhyJgIPJU,3732
16
+ livekit_plugins_google-0.11.1.dist-info/WHEEL,sha256=beeZ86-EfXScwlR_HKu4SllMC9wUEj_8Z_4FJ3egI2w,91
17
+ livekit_plugins_google-0.11.1.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
18
+ livekit_plugins_google-0.11.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.8.2)
2
+ Generator: setuptools (76.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,18 +0,0 @@
1
- livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
2
- livekit/plugins/google/_utils.py,sha256=FG1_26nlWGcI6onPleQQcmGBMfb4QNYgis1B5BMJxWA,7131
3
- livekit/plugins/google/llm.py,sha256=LZaHsrkjfboRZLWm7L2G0mw62q2sXBNj4YeeV2Sk2uU,16717
4
- livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
5
- livekit/plugins/google/models.py,sha256=8Ysqkb0pOSSr_S9XHYxLz5nofDTt8RtfbsTIWoptOQU,1532
6
- livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- livekit/plugins/google/stt.py,sha256=0-4mVD5IydvsWp9OzYyVmXe6pz6FDvPutRLF169y674,22752
8
- livekit/plugins/google/tts.py,sha256=w4EMk9rPfyAzPyWFwE_5sPo9UY7DNFa2g83K56AUk9I,9228
9
- livekit/plugins/google/version.py,sha256=B7ZiVTsE24YmkTGl3227ZHjutNpXQp27028_w5-LuRA,601
10
- livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
11
- livekit/plugins/google/beta/realtime/__init__.py,sha256=sGTn6JFNyA30QUXBZ_BV3l2eHpGAzR35ByXxg77vWNU,205
12
- livekit/plugins/google/beta/realtime/api_proto.py,sha256=ralrRZqIbE71oyuLKRYaXHvm6tcHMwBJueKvSO8Xfus,658
13
- livekit/plugins/google/beta/realtime/realtime_api.py,sha256=SU_uQvZMBwbVgexZqkAjGmJVUW80ObJ4LP53rV7xqko,21228
14
- livekit/plugins/google/beta/realtime/transcriber.py,sha256=rjXO0cSPr3HATxrSfv1MX7IbrjmiTvnLPF280BfRBL8,9809
15
- livekit_plugins_google-0.10.6.dist-info/METADATA,sha256=cvkHdPcsrRpbSjW8oowAgN392NWQmoUD429U6zYSeKk,2058
16
- livekit_plugins_google-0.10.6.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
17
- livekit_plugins_google-0.10.6.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
18
- livekit_plugins_google-0.10.6.dist-info/RECORD,,