livekit-plugins-google 0.10.6__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/livekit/plugins/google/beta/realtime/realtime_api.py
+++ b/livekit/plugins/google/beta/realtime/realtime_api.py
@@ -9,6 +9,7 @@ from typing import AsyncIterable, Literal
 from livekit import rtc
 from livekit.agents import llm, utils
 from livekit.agents.llm.function_context import _create_ai_function_info
+from livekit.agents.utils import images
 
 from google import genai
 from google.genai.types import (
@@ -258,6 +259,8 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
         self._fnc_ctx = fnc_ctx
         self._fnc_tasks = utils.aio.TaskSet()
         self._is_interrupted = False
+        self._playout_complete = asyncio.Event()
+        self._playout_complete.set()
 
         tools = []
         if self._fnc_ctx is not None:
@@ -317,6 +320,10 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
         self._send_ch.close()
         await self._main_atask
 
+    @property
+    def playout_complete(self) -> asyncio.Event | None:
+        return self._playout_complete
+
     @property
     def fnc_ctx(self) -> llm.FunctionContext | None:
         return self._fnc_ctx
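The `playout_complete` event is created already set, so a caller that awaits it before any audio has played will not block. A minimal usage sketch; the helper and the `aclose()` name are assumptions, not shown in this diff:

```python
# Hypothetical shutdown helper: wait for any in-flight audio playout to
# finish before closing. `session` is an active GeminiRealtimeSession;
# `aclose()` is assumed to be its close method.
async def drain_and_close(session) -> None:
    if session.playout_complete is not None:
        await session.playout_complete.wait()  # already set when idle
    await session.aclose()
```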
@@ -325,14 +332,53 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
     def fnc_ctx(self, value: llm.FunctionContext | None) -> None:
         self._fnc_ctx = value
 
-    def _push_audio(self, frame: rtc.AudioFrame) -> None:
-        if self._opts.enable_user_audio_transcription:
-            self._transcriber._push_audio(frame)
+    def _push_media_chunk(self, data: bytes, mime_type: str) -> None:
         realtime_input = LiveClientRealtimeInput(
-            media_chunks=[Blob(data=frame.data.tobytes(), mime_type="audio/pcm")],
+            media_chunks=[Blob(data=data, mime_type=mime_type)],
         )
         self._queue_msg(realtime_input)
 
+    DEFAULT_ENCODE_OPTIONS = images.EncodeOptions(
+        format="JPEG",
+        quality=75,
+        resize_options=images.ResizeOptions(
+            width=1024, height=1024, strategy="scale_aspect_fit"
+        ),
+    )
+
+    def push_video(
+        self,
+        frame: rtc.VideoFrame,
+        encode_options: images.EncodeOptions = DEFAULT_ENCODE_OPTIONS,
+    ) -> None:
+        """Push a video frame to the Gemini Multimodal Live session.
+
+        Args:
+            frame (rtc.VideoFrame): The video frame to push.
+            encode_options (images.EncodeOptions, optional): The encode options for the video frame. Defaults to 1024x1024 JPEG.
+
+        Notes:
+            - Frames are sent immediately, so sample at a frame rate that makes sense for your application and Gemini's constraints. 1 FPS is a good starting point.
+        """
+        encoded_data = images.encode(
+            frame,
+            encode_options,
+        )
+        mime_type = (
+            "image/jpeg"
+            if encode_options.format == "JPEG"
+            else "image/png"
+            if encode_options.format == "PNG"
+            else "image/jpeg"
+        )
+        self._push_media_chunk(encoded_data, mime_type)
+
+    def _push_audio(self, frame: rtc.AudioFrame) -> None:
+        if self._opts.enable_user_audio_transcription:
+            self._transcriber._push_audio(frame)
+
+        self._push_media_chunk(frame.data.tobytes(), "audio/pcm")
+
 
     def _queue_msg(self, msg: ClientEvents) -> None:
         self._send_ch.send_nowait(msg)
--- a/livekit/plugins/google/models.py
+++ b/livekit/plugins/google/models.py
@@ -94,8 +94,6 @@ SpeechLanguages = Literal[
 
 Gender = Literal["male", "female", "neutral"]
 
-AudioEncoding = Literal["wav", "mp3", "ogg", "mulaw", "alaw", "linear16"]
-
 ChatModels = Literal[
     "gemini-2.0-flash-001",
     "gemini-2.0-flash-lite-preview-02-05",
--- a/livekit/plugins/google/stt.py
+++ b/livekit/plugins/google/stt.py
@@ -322,6 +322,10 @@ class STT(stt.STT):
             keywords=keywords,
         )
 
+    async def aclose(self) -> None:
+        await self._pool.aclose()
+        await super().aclose()
+
 
 class SpeechStream(stt.SpeechStream):
     def __init__(
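The new `STT.aclose()` lets callers release the pooled streaming connections deterministically. A minimal sketch, assuming the plugin is used standalone (the surrounding coroutine is hypothetical):

```python
from livekit.plugins import google

async def transcribe_then_cleanup() -> None:
    stt_ = google.STT()
    try:
        ...  # run recognition / streams as usual
    finally:
        # New in 0.11.0: closes the connection pool, then the base class.
        await stt_.aclose()
```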
--- a/livekit/plugins/google/tts.py
+++ b/livekit/plugins/google/tts.py
@@ -17,7 +17,6 @@ from __future__ import annotations
 from dataclasses import dataclass
 from typing import Optional
 
-from livekit import rtc
 from livekit.agents import (
     APIConnectionError,
     APIConnectOptions,
@@ -31,7 +30,7 @@ from google.api_core.exceptions import DeadlineExceeded, GoogleAPICallError
 from google.cloud import texttospeech
 from google.cloud.texttospeech_v1.types import SsmlVoiceGender, SynthesizeSpeechResponse
 
-from .models import AudioEncoding, Gender, SpeechLanguages
+from .models import Gender, SpeechLanguages
 
 
 @dataclass
@@ -47,7 +46,6 @@ class TTS(tts.TTS):
         language: SpeechLanguages | str = "en-US",
         gender: Gender | str = "neutral",
         voice_name: str = "",  # Not required
-        encoding: AudioEncoding | str = "linear16",
         sample_rate: int = 24000,
         pitch: int = 0,
         effects_profile_id: str = "",
@@ -66,7 +64,6 @@
             language (SpeechLanguages | str, optional): Language code (e.g., "en-US"). Default is "en-US".
             gender (Gender | str, optional): Voice gender ("male", "female", "neutral"). Default is "neutral".
             voice_name (str, optional): Specific voice name. Default is an empty string.
-            encoding (AudioEncoding | str, optional): Audio encoding format (e.g., "linear16"). Default is "linear16".
             sample_rate (int, optional): Audio sample rate in Hz. Default is 24000.
             pitch (float, optional): Speaking pitch, ranging from -20.0 to 20.0 semitones relative to the original pitch. Default is 0.
             effects_profile_id (str): Optional identifier for selecting audio effects profiles to apply to the synthesized speech.
@@ -93,17 +90,10 @@
             ssml_gender=_gender_from_str(gender),
         )
 
-        if encoding == "linear16" or encoding == "wav":
-            _audio_encoding = texttospeech.AudioEncoding.LINEAR16
-        elif encoding == "mp3":
-            _audio_encoding = texttospeech.AudioEncoding.MP3
-        else:
-            raise NotImplementedError(f"audio encoding {encoding} is not supported")
-
         self._opts = _TTSOptions(
             voice=voice,
             audio_config=texttospeech.AudioConfig(
-                audio_encoding=_audio_encoding,
+                audio_encoding=texttospeech.AudioEncoding.OGG_OPUS,
                 sample_rate_hertz=sample_rate,
                 pitch=pitch,
                 effects_profile_id=effects_profile_id,
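This is a breaking change for callers that passed `encoding=`: the argument is gone (along with the `AudioEncoding` alias removed from models.py above), and the plugin now always requests Ogg/Opus from Google and decodes it internally, as the ChunkedStream hunk below shows. A sketch of constructing the TTS after this change (values are illustrative):

```python
from livekit.plugins import google

# No `encoding` argument anymore: audio is always fetched as OGG_OPUS and
# decoded to PCM frames at `sample_rate` by the plugin itself.
tts_ = google.TTS(
    language="en-US",
    gender="neutral",
    sample_rate=24000,
)
```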
@@ -195,35 +185,24 @@ class ChunkedStream(tts.ChunkedStream):
                 timeout=self._conn_options.timeout,
             )
 
-            if self._opts.audio_config.audio_encoding == "mp3":
-                decoder = utils.codecs.Mp3StreamDecoder()
-                bstream = utils.audio.AudioByteStream(
-                    sample_rate=self._opts.audio_config.sample_rate_hertz,
-                    num_channels=1,
-                )
-                for frame in decoder.decode_chunk(response.audio_content):
-                    for frame in bstream.write(frame.data.tobytes()):
-                        self._event_ch.send_nowait(
-                            tts.SynthesizedAudio(request_id=request_id, frame=frame)
-                        )
-
-                for frame in bstream.flush():
-                    self._event_ch.send_nowait(
-                        tts.SynthesizedAudio(request_id=request_id, frame=frame)
-                    )
-            else:
-                data = response.audio_content[44:]  # skip WAV header
-                self._event_ch.send_nowait(
-                    tts.SynthesizedAudio(
-                        request_id=request_id,
-                        frame=rtc.AudioFrame(
-                            data=data,
-                            sample_rate=self._opts.audio_config.sample_rate_hertz,
-                            num_channels=1,
-                            samples_per_channel=len(data) // 2,  # 16-bit
-                        ),
-                    )
+            # Create AudioStreamDecoder for OGG format
+            decoder = utils.codecs.AudioStreamDecoder(
+                sample_rate=self._opts.audio_config.sample_rate_hertz,
+                num_channels=1,
+            )
+
+            try:
+                decoder.push(response.audio_content)
+                decoder.end_input()
+                emitter = tts.SynthesizedAudioEmitter(
+                    event_ch=self._event_ch,
+                    request_id=request_id,
                 )
+                async for frame in decoder:
+                    emitter.push(frame)
+                emitter.flush()
+            finally:
+                await decoder.aclose()
 
         except DeadlineExceeded:
             raise APITimeoutError()
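The replacement collapses the old MP3-versus-raw-WAV branching into one decode path: push the compressed payload, signal end of input, then iterate the decoded frames. A minimal sketch of that pattern in isolation (assumes livekit-agents >= 0.12.16, which this release pins; `decode_ogg` is a hypothetical helper):

```python
from livekit.agents import utils

async def decode_ogg(ogg_bytes: bytes) -> list:
    # Same push/end_input/iterate pattern as the hunk above.
    decoder = utils.codecs.AudioStreamDecoder(sample_rate=24000, num_channels=1)
    frames = []
    try:
        decoder.push(ogg_bytes)  # feed the complete compressed payload
        decoder.end_input()      # signal EOF so iteration terminates
        async for frame in decoder:  # yields decoded audio frames
            frames.append(frame)
    finally:
        await decoder.aclose()
    return frames
```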
--- a/livekit/plugins/google/version.py
+++ b/livekit/plugins/google/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.10.6"
+__version__ = "0.11.0"
--- a/livekit_plugins_google-0.10.6.dist-info/METADATA
+++ b/livekit_plugins_google-0.11.0.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: livekit-plugins-google
-Version: 0.10.6
+Version: 0.11.0
 Summary: Agent Framework plugin for services from Google Cloud
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0
@@ -23,7 +23,7 @@ Requires-Dist: google-auth<3,>=2
 Requires-Dist: google-cloud-speech<3,>=2
 Requires-Dist: google-cloud-texttospeech<3,>=2
 Requires-Dist: google-genai==1.3.0
-Requires-Dist: livekit-agents>=0.12.11
+Requires-Dist: livekit-agents<1.0.0,>=0.12.16
 Dynamic: classifier
 Dynamic: description
 Dynamic: description-content-type
@@ -53,3 +53,57 @@ To use the STT and TTS API, you'll need to enable the respective services for yo
 
 - Cloud Speech-to-Text API
 - Cloud Text-to-Speech API
+
+
+## Gemini Multimodal Live
+
+Gemini Multimodal Live can be used with the `MultimodalAgent` class. See examples/multimodal_agent/gemini_agent.py for an example.
+
+### Live Video Input (experimental)
+
+You can push video frames to your Gemini Multimodal Live session alongside the audio automatically handled by the `MultimodalAgent`. The basic approach is to subscribe to the video track, create a video stream, sample frames at a suitable frame rate, and push them into the RealtimeSession:
+
+```python
+# Make sure you subscribe to audio and video tracks
+await ctx.connect(auto_subscribe=AutoSubscribe.SUBSCRIBE_ALL)
+
+# Create your RealtimeModel and store a reference
+model = google.beta.realtime.RealtimeModel(
+    # ...
+)
+
+# Create your MultimodalAgent as usual
+agent = MultimodalAgent(
+    model=model,
+    # ...
+)
+
+# Async function to process the video track and push frames to Gemini
+async def _process_video_track(track: Track):
+    video_stream = VideoStream(track)
+    last_frame_time = 0
+
+    async for event in video_stream:
+        current_time = asyncio.get_event_loop().time()
+
+        # Sample at 1 FPS
+        if current_time - last_frame_time < 1.0:
+            continue
+
+        last_frame_time = current_time
+        frame = event.frame
+
+        # Push the frame into the RealtimeSession
+        model.sessions[0].push_video(frame)
+
+    await video_stream.aclose()
+
+# Subscribe to new tracks and process them
+@ctx.room.on("track_subscribed")
+def _on_track_subscribed(track: Track, pub, participant):
+    if track.kind == TrackKind.KIND_VIDEO:
+        asyncio.create_task(_process_video_track(track))
+```
+
+
+
--- a/livekit_plugins_google-0.10.6.dist-info/RECORD
+++ b/livekit_plugins_google-0.11.0.dist-info/RECORD
@@ -2,17 +2,17 @@ livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWu
 livekit/plugins/google/_utils.py,sha256=FG1_26nlWGcI6onPleQQcmGBMfb4QNYgis1B5BMJxWA,7131
 livekit/plugins/google/llm.py,sha256=LZaHsrkjfboRZLWm7L2G0mw62q2sXBNj4YeeV2Sk2uU,16717
 livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
-livekit/plugins/google/models.py,sha256=8Ysqkb0pOSSr_S9XHYxLz5nofDTt8RtfbsTIWoptOQU,1532
+livekit/plugins/google/models.py,sha256=SGjAumdDK97NNLwMFcqZdKR68f1NoGB2Rk1UP2-imG0,1457
 livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-livekit/plugins/google/stt.py,sha256=0-4mVD5IydvsWp9OzYyVmXe6pz6FDvPutRLF169y674,22752
-livekit/plugins/google/tts.py,sha256=w4EMk9rPfyAzPyWFwE_5sPo9UY7DNFa2g83K56AUk9I,9228
-livekit/plugins/google/version.py,sha256=B7ZiVTsE24YmkTGl3227ZHjutNpXQp27028_w5-LuRA,601
+livekit/plugins/google/stt.py,sha256=96GJmGDAIBdCpDECArwIXpj2s1xlcA_zuvTnwsvq4xA,22854
+livekit/plugins/google/tts.py,sha256=pG9_pibO3NDGEMa4huU5S9lbeyI3daQyrS17SuTKfZI,8008
+livekit/plugins/google/version.py,sha256=BvmVdoHkxksDSQP-uWrqIiyaAUImEyxSohntkIBNZRo,601
 livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
 livekit/plugins/google/beta/realtime/__init__.py,sha256=sGTn6JFNyA30QUXBZ_BV3l2eHpGAzR35ByXxg77vWNU,205
 livekit/plugins/google/beta/realtime/api_proto.py,sha256=ralrRZqIbE71oyuLKRYaXHvm6tcHMwBJueKvSO8Xfus,658
-livekit/plugins/google/beta/realtime/realtime_api.py,sha256=SU_uQvZMBwbVgexZqkAjGmJVUW80ObJ4LP53rV7xqko,21228
+livekit/plugins/google/beta/realtime/realtime_api.py,sha256=vZHiWNk8PorxtrHSmA7Ya6ZvCjT37YSJN-MxK8ebdrs,22795
 livekit/plugins/google/beta/realtime/transcriber.py,sha256=rjXO0cSPr3HATxrSfv1MX7IbrjmiTvnLPF280BfRBL8,9809
-livekit_plugins_google-0.10.6.dist-info/METADATA,sha256=cvkHdPcsrRpbSjW8oowAgN392NWQmoUD429U6zYSeKk,2058
-livekit_plugins_google-0.10.6.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
-livekit_plugins_google-0.10.6.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
-livekit_plugins_google-0.10.6.dist-info/RECORD,,
+livekit_plugins_google-0.11.0.dist-info/METADATA,sha256=b8Aj_eQnGhAT3DQa77KLHZBDGAWZYdrnTBWjVODAm2k,3732
+livekit_plugins_google-0.11.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+livekit_plugins_google-0.11.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
+livekit_plugins_google-0.11.0.dist-info/RECORD,,