PyPI - livekit-plugins-google - Versions diffs - 0.10.6__py3-none-any.whl → 0.11.1__py3-none-any.whl - Mend

livekit-plugins-google 0.10.6__py3-none-any.whl → 0.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

livekit/plugins/google/beta/realtime/api_proto.py CHANGED Viewed

@@ -6,7 +6,7 @@ from google.genai import types
 from ..._utils import _build_gemini_ctx, _build_tools
-LiveAPIModels = Literal["gemini-2.0-flash-001",]
+LiveAPIModels = Literal["gemini-2.0-flash-exp"]
 Voice = Literal["Puck", "Charon", "Kore", "Fenrir", "Aoede"]

livekit/plugins/google/beta/realtime/realtime_api.py CHANGED Viewed

@@ -9,6 +9,7 @@ from typing import AsyncIterable, Literal
 from livekit import rtc
 from livekit.agents import llm, utils
 from livekit.agents.llm.function_context import _create_ai_function_info
+from livekit.agents.utils import images
 from google import genai
 from google.genai.types import (
@@ -82,6 +83,7 @@ class Capabilities:
 class ModelOptions:
     model: LiveAPIModels | str
     api_key: str | None
+    api_version: str
     voice: Voice | str
     response_modalities: list[Modality] | None
     vertexai: bool
@@ -106,6 +108,7 @@ class RealtimeModel:
         instructions: str | None = None,
         model: LiveAPIModels | str = "gemini-2.0-flash-exp",
         api_key: str | None = None,
+        api_version: str = "v1alpha",
         voice: Voice | str = "Puck",
         modalities: list[Modality] = [Modality.AUDIO],
         enable_user_audio_transcription: bool = True,
@@ -135,6 +138,7 @@ class RealtimeModel:
         Args:
             instructions (str, optional): Initial system instructions for the model. Defaults to "".
             api_key (str or None, optional): Google Gemini API key. If None, will attempt to read from the environment variable GOOGLE_API_KEY.
+            api_version (str, optional): The version of the API to use. Defaults to "v1alpha".
             modalities (list[Modality], optional): Modalities to use, such as ["TEXT", "AUDIO"]. Defaults to ["AUDIO"].
             model (str or None, optional): The name of the model to use. Defaults to "gemini-2.0-flash-exp".
             voice (api_proto.Voice, optional): Voice setting for audio outputs. Defaults to "Puck".
@@ -186,6 +190,7 @@ class RealtimeModel:
         self._rt_sessions: list[GeminiRealtimeSession] = []
         self._opts = ModelOptions(
             model=model,
+            api_version=api_version,
             api_key=self._api_key,
             voice=voice,
             enable_user_audio_transcription=enable_user_audio_transcription,
@@ -258,6 +263,8 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
         self._fnc_ctx = fnc_ctx
         self._fnc_tasks = utils.aio.TaskSet()
         self._is_interrupted = False
+        self._playout_complete = asyncio.Event()
+        self._playout_complete.set()
         tools = []
         if self._fnc_ctx is not None:
@@ -286,7 +293,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
             tools=tools,
         )
         self._client = genai.Client(
-            http_options=HttpOptions(api_version="v1alpha"),
+            http_options=HttpOptions(api_version=self._opts.api_version),
             api_key=self._opts.api_key,
             vertexai=self._opts.vertexai,
             project=self._opts.project,
@@ -317,6 +324,10 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
         self._send_ch.close()
         await self._main_atask
+    @property
+    def playout_complete(self) -> asyncio.Event | None:
+        return self._playout_complete
     @property
     def fnc_ctx(self) -> llm.FunctionContext | None:
         return self._fnc_ctx
@@ -325,14 +336,53 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
     def fnc_ctx(self, value: llm.FunctionContext | None) -> None:
         self._fnc_ctx = value
-    def _push_audio(self, frame: rtc.AudioFrame) -> None:
-        if self._opts.enable_user_audio_transcription:
-            self._transcriber._push_audio(frame)
+    def _push_media_chunk(self, data: bytes, mime_type: str) -> None:
         realtime_input = LiveClientRealtimeInput(
-            media_chunks=[Blob(data=frame.data.tobytes(), mime_type="audio/pcm")],
+            media_chunks=[Blob(data=data, mime_type=mime_type)],
         )
         self._queue_msg(realtime_input)
+    DEFAULT_ENCODE_OPTIONS = images.EncodeOptions(
+        format="JPEG",
+        quality=75,
+        resize_options=images.ResizeOptions(
+            width=1024, height=1024, strategy="scale_aspect_fit"
+        ),
+    )
+    def push_video(
+        self,
+        frame: rtc.VideoFrame,
+        encode_options: images.EncodeOptions = DEFAULT_ENCODE_OPTIONS,
+    ) -> None:
+        """Push a video frame to the Gemini Multimodal Live session.
+        Args:
+            frame (rtc.VideoFrame): The video frame to push.
+            encode_options (images.EncodeOptions, optional): The encode options for the video frame. Defaults to 1024x1024 JPEG.
+        Notes:
+        - This will be sent immediately so you should use a sampling frame rate that makes sense for your application and Gemini's constraints. 1 FPS is a good starting point.
+        """
+        encoded_data = images.encode(
+            frame,
+            encode_options,
+        )
+        mime_type = (
+            "image/jpeg"
+            if encode_options.format == "JPEG"
+            else "image/png"
+            if encode_options.format == "PNG"
+            else "image/jpeg"
+        )
+        self._push_media_chunk(encoded_data, mime_type)
+    def _push_audio(self, frame: rtc.AudioFrame) -> None:
+        if self._opts.enable_user_audio_transcription:
+            self._transcriber._push_audio(frame)
+        self._push_media_chunk(frame.data.tobytes(), "audio/pcm")
     def _queue_msg(self, msg: ClientEvents) -> None:
         self._send_ch.send_nowait(msg)

livekit/plugins/google/models.py CHANGED Viewed

@@ -94,8 +94,6 @@ SpeechLanguages = Literal[
 Gender = Literal["male", "female", "neutral"]
-AudioEncoding = Literal["wav", "mp3", "ogg", "mulaw", "alaw", "linear16"]
 ChatModels = Literal[
     "gemini-2.0-flash-001",
     "gemini-2.0-flash-lite-preview-02-05",

livekit/plugins/google/stt.py CHANGED Viewed

@@ -322,6 +322,10 @@ class STT(stt.STT):
                 keywords=keywords,
             )
+    async def aclose(self) -> None:
+        await self._pool.aclose()
+        await super().aclose()
 class SpeechStream(stt.SpeechStream):
     def __init__(

livekit/plugins/google/tts.py CHANGED Viewed

@@ -17,7 +17,6 @@ from __future__ import annotations
 from dataclasses import dataclass
 from typing import Optional
-from livekit import rtc
 from livekit.agents import (
     APIConnectionError,
     APIConnectOptions,
@@ -31,7 +30,7 @@ from google.api_core.exceptions import DeadlineExceeded, GoogleAPICallError
 from google.cloud import texttospeech
 from google.cloud.texttospeech_v1.types import SsmlVoiceGender, SynthesizeSpeechResponse
-from .models import AudioEncoding, Gender, SpeechLanguages
+from .models import Gender, SpeechLanguages
 @dataclass
@@ -47,7 +46,6 @@ class TTS(tts.TTS):
         language: SpeechLanguages | str = "en-US",
         gender: Gender | str = "neutral",
         voice_name: str = "",  # Not required
-        encoding: AudioEncoding | str = "linear16",
         sample_rate: int = 24000,
         pitch: int = 0,
         effects_profile_id: str = "",
@@ -66,7 +64,6 @@ class TTS(tts.TTS):
             language (SpeechLanguages | str, optional): Language code (e.g., "en-US"). Default is "en-US".
             gender (Gender | str, optional): Voice gender ("male", "female", "neutral"). Default is "neutral".
             voice_name (str, optional): Specific voice name. Default is an empty string.
-            encoding (AudioEncoding | str, optional): Audio encoding format (e.g., "linear16"). Default is "linear16".
             sample_rate (int, optional): Audio sample rate in Hz. Default is 24000.
             pitch (float, optional): Speaking pitch, ranging from -20.0 to 20.0 semitones relative to the original pitch. Default is 0.
             effects_profile_id (str): Optional identifier for selecting audio effects profiles to apply to the synthesized speech.
@@ -93,17 +90,10 @@ class TTS(tts.TTS):
             ssml_gender=_gender_from_str(gender),
         )
-        if encoding == "linear16" or encoding == "wav":
-            _audio_encoding = texttospeech.AudioEncoding.LINEAR16
-        elif encoding == "mp3":
-            _audio_encoding = texttospeech.AudioEncoding.MP3
-        else:
-            raise NotImplementedError(f"audio encoding {encoding} is not supported")
         self._opts = _TTSOptions(
             voice=voice,
             audio_config=texttospeech.AudioConfig(
-                audio_encoding=_audio_encoding,
+                audio_encoding=texttospeech.AudioEncoding.OGG_OPUS,
                 sample_rate_hertz=sample_rate,
                 pitch=pitch,
                 effects_profile_id=effects_profile_id,
@@ -195,35 +185,24 @@ class ChunkedStream(tts.ChunkedStream):
                 timeout=self._conn_options.timeout,
             )
-            if self._opts.audio_config.audio_encoding == "mp3":
-                decoder = utils.codecs.Mp3StreamDecoder()
-                bstream = utils.audio.AudioByteStream(
-                    sample_rate=self._opts.audio_config.sample_rate_hertz,
-                    num_channels=1,
-                )
-                for frame in decoder.decode_chunk(response.audio_content):
-                    for frame in bstream.write(frame.data.tobytes()):
-                        self._event_ch.send_nowait(
-                            tts.SynthesizedAudio(request_id=request_id, frame=frame)
-                        )
-                for frame in bstream.flush():
-                    self._event_ch.send_nowait(
-                        tts.SynthesizedAudio(request_id=request_id, frame=frame)
-                    )
-            else:
-                data = response.audio_content[44:]  # skip WAV header
-                self._event_ch.send_nowait(
-                    tts.SynthesizedAudio(
-                        request_id=request_id,
-                        frame=rtc.AudioFrame(
-                            data=data,
-                            sample_rate=self._opts.audio_config.sample_rate_hertz,
-                            num_channels=1,
-                            samples_per_channel=len(data) // 2,  # 16-bit
-                        ),
-                    )
+            # Create AudioStreamDecoder for OGG format
+            decoder = utils.codecs.AudioStreamDecoder(
+                sample_rate=self._opts.audio_config.sample_rate_hertz,
+                num_channels=1,
+            )
+            try:
+                decoder.push(response.audio_content)
+                decoder.end_input()
+                emitter = tts.SynthesizedAudioEmitter(
+                    event_ch=self._event_ch,
+                    request_id=request_id,
                 )
+                async for frame in decoder:
+                    emitter.push(frame)
+                emitter.flush()
+            finally:
+                await decoder.aclose()
         except DeadlineExceeded:
             raise APITimeoutError()

livekit/plugins/google/version.py CHANGED Viewed

@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-__version__ = "0.10.6"
+__version__ = "0.11.1"

{livekit_plugins_google-0.10.6.dist-info → livekit_plugins_google-0.11.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: livekit-plugins-google
-Version: 0.10.6
+Version: 0.11.1
 Summary: Agent Framework plugin for services from Google Cloud
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0
@@ -23,7 +23,7 @@ Requires-Dist: google-auth<3,>=2
 Requires-Dist: google-cloud-speech<3,>=2
 Requires-Dist: google-cloud-texttospeech<3,>=2
 Requires-Dist: google-genai==1.3.0
-Requires-Dist: livekit-agents>=0.12.11
+Requires-Dist: livekit-agents<1.0.0,>=0.12.16
 Dynamic: classifier
 Dynamic: description
 Dynamic: description-content-type
@@ -53,3 +53,57 @@ To use the STT and TTS API, you'll need to enable the respective services for yo
 - Cloud Speech-to-Text API
 - Cloud Text-to-Speech API
+## Gemini Multimodal Live
+Gemini Multimodal Live can be used with the `MultimodalAgent` class. See examples/multimodal_agent/gemini_agent.py for an example.
+### Live Video Input (experimental)
+You can push video frames to your Gemini Multimodal Live session alongside the audio automatically handled by the `MultimodalAgent`.  The basic approach is to subscribe to the video track, create a video stream, sample frames at a suitable frame rate, and push them into the RealtimeSession:
+```
+# Make sure you subscribe to audio and video tracks
+await ctx.connect(auto_subscribe=AutoSubscribe.SUBSCRIBE_ALL)
+# Create your RealtimeModel and store a reference
+model = google.beta.realtime.RealtimeModel(
+    # ...
+)
+# Create your MultimodalAgent as usual
+agent = MultimodalAgent(
+    model=model,
+    # ...
+)
+# Async method to process the video track and push frames to Gemini
+async def _process_video_track(self, track: Track):
+    video_stream = VideoStream(track)
+    last_frame_time = 0
+    async for event in video_stream:
+        current_time = asyncio.get_event_loop().time()
+        # Sample at 1 FPS
+        if current_time - last_frame_time < 1.0:
+            continue
+        last_frame_time = current_time
+        frame = event.frame
+        # Push the frame into the RealtimeSession
+        model.sessions[0].push_video(frame)
+    await video_stream.aclose()
+# Subscribe to new tracks and process them
+@ctx.room.on("track_subscribed")
+def _on_track_subscribed(track: Track, pub, participant):
+    if track.kind == TrackKind.KIND_VIDEO:
+        asyncio.create_task(self._process_video_track(track))
+```

livekit_plugins_google-0.11.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,18 @@
+livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
+livekit/plugins/google/_utils.py,sha256=FG1_26nlWGcI6onPleQQcmGBMfb4QNYgis1B5BMJxWA,7131
+livekit/plugins/google/llm.py,sha256=LZaHsrkjfboRZLWm7L2G0mw62q2sXBNj4YeeV2Sk2uU,16717
+livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
+livekit/plugins/google/models.py,sha256=SGjAumdDK97NNLwMFcqZdKR68f1NoGB2Rk1UP2-imG0,1457
+livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+livekit/plugins/google/stt.py,sha256=96GJmGDAIBdCpDECArwIXpj2s1xlcA_zuvTnwsvq4xA,22854
+livekit/plugins/google/tts.py,sha256=pG9_pibO3NDGEMa4huU5S9lbeyI3daQyrS17SuTKfZI,8008
+livekit/plugins/google/version.py,sha256=LeUJJQ9jwADplJbF46ClzVjYAClwJEhZMCToNJN9lWc,601
+livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
+livekit/plugins/google/beta/realtime/__init__.py,sha256=sGTn6JFNyA30QUXBZ_BV3l2eHpGAzR35ByXxg77vWNU,205
+livekit/plugins/google/beta/realtime/api_proto.py,sha256=9EhmwgeIgKDqdSijv5Q9pgx7UhAakK02ZDwbnUsra_o,657
+livekit/plugins/google/beta/realtime/realtime_api.py,sha256=8JdWUMUheGhy1ia6JbN3_U2_cL7CNs8-1fTOAgW4I38,22999
+livekit/plugins/google/beta/realtime/transcriber.py,sha256=rjXO0cSPr3HATxrSfv1MX7IbrjmiTvnLPF280BfRBL8,9809
+livekit_plugins_google-0.11.1.dist-info/METADATA,sha256=m7B07abY9wTbEJVa3dmdsgfatxYwJFwDNQYhyJgIPJU,3732
+livekit_plugins_google-0.11.1.dist-info/WHEEL,sha256=beeZ86-EfXScwlR_HKu4SllMC9wUEj_8Z_4FJ3egI2w,91
+livekit_plugins_google-0.11.1.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
+livekit_plugins_google-0.11.1.dist-info/RECORD,,

{livekit_plugins_google-0.10.6.dist-info → livekit_plugins_google-0.11.1.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.8.2)
+Generator: setuptools (76.1.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

livekit_plugins_google-0.10.6.dist-info/RECORD DELETED Viewed

@@ -1,18 +0,0 @@
-livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
-livekit/plugins/google/_utils.py,sha256=FG1_26nlWGcI6onPleQQcmGBMfb4QNYgis1B5BMJxWA,7131
-livekit/plugins/google/llm.py,sha256=LZaHsrkjfboRZLWm7L2G0mw62q2sXBNj4YeeV2Sk2uU,16717
-livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
-livekit/plugins/google/models.py,sha256=8Ysqkb0pOSSr_S9XHYxLz5nofDTt8RtfbsTIWoptOQU,1532
-livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-livekit/plugins/google/stt.py,sha256=0-4mVD5IydvsWp9OzYyVmXe6pz6FDvPutRLF169y674,22752
-livekit/plugins/google/tts.py,sha256=w4EMk9rPfyAzPyWFwE_5sPo9UY7DNFa2g83K56AUk9I,9228
-livekit/plugins/google/version.py,sha256=B7ZiVTsE24YmkTGl3227ZHjutNpXQp27028_w5-LuRA,601
-livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
-livekit/plugins/google/beta/realtime/__init__.py,sha256=sGTn6JFNyA30QUXBZ_BV3l2eHpGAzR35ByXxg77vWNU,205
-livekit/plugins/google/beta/realtime/api_proto.py,sha256=ralrRZqIbE71oyuLKRYaXHvm6tcHMwBJueKvSO8Xfus,658
-livekit/plugins/google/beta/realtime/realtime_api.py,sha256=SU_uQvZMBwbVgexZqkAjGmJVUW80ObJ4LP53rV7xqko,21228
-livekit/plugins/google/beta/realtime/transcriber.py,sha256=rjXO0cSPr3HATxrSfv1MX7IbrjmiTvnLPF280BfRBL8,9809
-livekit_plugins_google-0.10.6.dist-info/METADATA,sha256=cvkHdPcsrRpbSjW8oowAgN392NWQmoUD429U6zYSeKk,2058
-livekit_plugins_google-0.10.6.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
-livekit_plugins_google-0.10.6.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
-livekit_plugins_google-0.10.6.dist-info/RECORD,,

{livekit_plugins_google-0.10.6.dist-info → livekit_plugins_google-0.11.1.dist-info}/top_level.txt RENAMED Viewed

File without changes

livekit-plugins-google 0.10.6__py3-none-any.whl → 0.11.1__py3-none-any.whl

livekit-plugins-google 0.10.6__py3-none-any.whl → 0.11.1__py3-none-any.whl