livekit-plugins-google 0.10.5__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,14 +10,15 @@ from livekit.agents import llm, utils
 from livekit.agents.llm.function_context import _is_optional_type
 
 from google.genai import types
-
-JSON_SCHEMA_TYPE_MAP: dict[type, types.Type] = {
-    str: "STRING",
-    int: "INTEGER",
-    float: "NUMBER",
-    bool: "BOOLEAN",
-    dict: "OBJECT",
-    list: "ARRAY",
+from google.genai.types import Type as GenaiType
+
+JSON_SCHEMA_TYPE_MAP: dict[type, GenaiType] = {
+    str: GenaiType.STRING,
+    int: GenaiType.INTEGER,
+    float: GenaiType.NUMBER,
+    bool: GenaiType.BOOLEAN,
+    dict: GenaiType.OBJECT,
+    list: GenaiType.ARRAY,
 }
 
 __all__ = ["_build_gemini_ctx", "_build_tools"]
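
Note: google-genai 1.x expects the `Type` enum rather than bare strings when building schemas. A minimal sketch of what the new map produces for a single hypothetical parameter (the names are illustrative, not from the plugin):

```python
from google.genai import types
from google.genai.types import Type as GenaiType

# Hypothetical "city" parameter for an AI-callable function
city = types.Schema(type=GenaiType.STRING)    # 0.10.x passed type="STRING"
parameters = types.Schema(
    type=GenaiType.OBJECT,                    # 0.10.x passed type="OBJECT"
    properties={"city": city},
)
parameters.required = ["city"]
```
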
@@ -38,7 +39,7 @@ def _build_parameters(arguments: Dict[str, Any]) -> types.Schema | None:
         item_type = get_args(py_type)[0]
         if item_type not in JSON_SCHEMA_TYPE_MAP:
             raise ValueError(f"Unsupported type: {item_type}")
-        prop.type = "ARRAY"
+        prop.type = GenaiType.ARRAY
         prop.items = types.Schema(type=JSON_SCHEMA_TYPE_MAP[item_type])
 
         if arg_info.choices:
@@ -62,7 +63,7 @@ def _build_parameters(arguments: Dict[str, Any]) -> types.Schema | None:
             required.append(arg_name)
 
     if properties:
-        parameters = types.Schema(type="OBJECT", properties=properties)
+        parameters = types.Schema(type=GenaiType.OBJECT, properties=properties)
         if required:
             parameters.required = required
 
@@ -119,7 +120,6 @@ def _build_gemini_ctx(
             parts.append(
                 types.Part(
                     function_call=types.FunctionCall(
-                        id=fnc.tool_call_id,
                         name=fnc.function_info.name,
                         args=fnc.arguments,
                     )
@@ -132,7 +132,6 @@ def _build_gemini_ctx(
             parts.append(
                 types.Part(
                     function_response=types.FunctionResponse(
-                        id=msg.tool_call_id,
                         name=msg.name,
                         response=msg.content,
                     )
@@ -142,7 +141,6 @@ def _build_gemini_ctx(
             parts.append(
                 types.Part(
                     function_response=types.FunctionResponse(
-                        id=msg.tool_call_id,
                         name=msg.name,
                         response={"result": msg.content},
                     )
@@ -193,8 +191,7 @@ def _build_gemini_image_part(image: llm.ChatImage, cache_key: Any) -> types.Part
                 height=image.inference_height,
                 strategy="scale_aspect_fit",
             )
-        encoded_data = utils.images.encode(image.image, opts)
-        image._cache[cache_key] = base64.b64encode(encoded_data).decode("utf-8")
+        image._cache[cache_key] = utils.images.encode(image.image, opts)
 
     return types.Part.from_bytes(
         data=image._cache[cache_key], mime_type="image/jpeg"
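
Note: the per-image cache now stores the raw encoded JPEG bytes instead of a base64 string, since `types.Part.from_bytes` takes bytes directly. A minimal sketch of the new path, reusing the `image`, `opts`, and `cache_key` names from the function above:

```python
from google.genai import types
from livekit.agents import utils

# Sketch only: encode once, cache the raw bytes, build the Part from them
encoded = utils.images.encode(image.image, opts)   # JPEG bytes, no base64 step
image._cache[cache_key] = encoded
part = types.Part.from_bytes(data=encoded, mime_type="image/jpeg")
```
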
@@ -9,14 +9,15 @@ from typing import AsyncIterable, Literal
 from livekit import rtc
 from livekit.agents import llm, utils
 from livekit.agents.llm.function_context import _create_ai_function_info
+from livekit.agents.utils import images
 
 from google import genai
-from google.genai._api_client import HttpOptions
 from google.genai.types import (
     Blob,
     Content,
     FunctionResponse,
     GenerationConfig,
+    HttpOptions,
     LiveClientContent,
     LiveClientRealtimeInput,
     LiveClientToolResponse,
@@ -107,7 +108,7 @@ class RealtimeModel:
         model: LiveAPIModels | str = "gemini-2.0-flash-exp",
         api_key: str | None = None,
         voice: Voice | str = "Puck",
-        modalities: list[Modality] = ["AUDIO"],
+        modalities: list[Modality] = [Modality.AUDIO],
         enable_user_audio_transcription: bool = True,
         enable_agent_audio_transcription: bool = True,
         vertexai: bool = False,
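
Note: the default output modality is now the `Modality` enum member instead of the string `"AUDIO"`. A hedged construction sketch showing only the parameters visible in this hunk (the import style for the plugin is assumed):

```python
from google.genai.types import Modality
from livekit.plugins import google

model = google.beta.realtime.RealtimeModel(
    model="gemini-2.0-flash-exp",
    voice="Puck",
    modalities=[Modality.AUDIO],   # was ["AUDIO"] in 0.10.x
)
```
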
@@ -258,6 +259,8 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
         self._fnc_ctx = fnc_ctx
         self._fnc_tasks = utils.aio.TaskSet()
         self._is_interrupted = False
+        self._playout_complete = asyncio.Event()
+        self._playout_complete.set()
 
         tools = []
         if self._fnc_ctx is not None:
@@ -317,6 +320,10 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
         self._send_ch.close()
         await self._main_atask
 
+    @property
+    def playout_complete(self) -> asyncio.Event | None:
+        return self._playout_complete
+
     @property
     def fnc_ctx(self) -> llm.FunctionContext | None:
         return self._fnc_ctx
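
Note: the session now exposes `playout_complete`, an `asyncio.Event` created in the set state so nothing blocks before the first response. Assuming the agent clears it while audio is playing out and sets it again afterwards (not shown in this diff), a caller could wait on it like this:

```python
# Hypothetical consumer; `session` is a GeminiRealtimeSession
if session.playout_complete is not None:
    await session.playout_complete.wait()  # returns immediately unless playout is in progress
```
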
@@ -325,14 +332,53 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
     def fnc_ctx(self, value: llm.FunctionContext | None) -> None:
         self._fnc_ctx = value
 
-    def _push_audio(self, frame: rtc.AudioFrame) -> None:
-        if self._opts.enable_user_audio_transcription:
-            self._transcriber._push_audio(frame)
+    def _push_media_chunk(self, data: bytes, mime_type: str) -> None:
         realtime_input = LiveClientRealtimeInput(
-            media_chunks=[Blob(data=frame.data.tobytes(), mime_type="audio/pcm")],
+            media_chunks=[Blob(data=data, mime_type=mime_type)],
         )
         self._queue_msg(realtime_input)
 
+    DEFAULT_ENCODE_OPTIONS = images.EncodeOptions(
+        format="JPEG",
+        quality=75,
+        resize_options=images.ResizeOptions(
+            width=1024, height=1024, strategy="scale_aspect_fit"
+        ),
+    )
+
+    def push_video(
+        self,
+        frame: rtc.VideoFrame,
+        encode_options: images.EncodeOptions = DEFAULT_ENCODE_OPTIONS,
+    ) -> None:
+        """Push a video frame to the Gemini Multimodal Live session.
+
+        Args:
+            frame (rtc.VideoFrame): The video frame to push.
+            encode_options (images.EncodeOptions, optional): The encode options for the video frame. Defaults to 1024x1024 JPEG.
+
+        Notes:
+        - This will be sent immediately so you should use a sampling frame rate that makes sense for your application and Gemini's constraints. 1 FPS is a good starting point.
+        """
+        encoded_data = images.encode(
+            frame,
+            encode_options,
+        )
+        mime_type = (
+            "image/jpeg"
+            if encode_options.format == "JPEG"
+            else "image/png"
+            if encode_options.format == "PNG"
+            else "image/jpeg"
+        )
+        self._push_media_chunk(encoded_data, mime_type)
+
+    def _push_audio(self, frame: rtc.AudioFrame) -> None:
+        if self._opts.enable_user_audio_transcription:
+            self._transcriber._push_audio(frame)
+
+        self._push_media_chunk(frame.data.tobytes(), "audio/pcm")
+
     def _queue_msg(self, msg: ClientEvents) -> None:
         self._send_ch.send_nowait(msg)
 
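
Note: `push_video` encodes and sends each frame immediately, so the caller controls the sampling rate. A usage sketch with custom encode options (all option fields below appear in the `DEFAULT_ENCODE_OPTIONS` above; `session` and `frame` are assumed to exist):

```python
from livekit.agents.utils import images

opts = images.EncodeOptions(
    format="JPEG",
    quality=60,
    resize_options=images.ResizeOptions(
        width=512, height=512, strategy="scale_aspect_fit"
    ),
)
session.push_video(frame, encode_options=opts)  # frame is an rtc.VideoFrame
```
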
@@ -479,12 +525,12 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
                 logger.warning(
                     "function call cancelled",
                     extra={
-                        "function_call_ids": response.tool_call_cancellation.function_call_ids,
+                        "function_call_ids": response.tool_call_cancellation.ids,
                     },
                 )
                 self.emit(
                     "function_calls_cancelled",
-                    response.tool_call_cancellation.function_call_ids,
+                    response.tool_call_cancellation.ids,
                 )
 
         async with self._client.aio.live.connect(
@@ -55,7 +55,7 @@ class TranscriberSession(utils.EventEmitter[EventTypes]):
             parts=[types.Part(text=SYSTEM_INSTRUCTIONS)]
         )
         self._config = types.LiveConnectConfig(
-            response_modalities=["TEXT"],
+            response_modalities=[types.Modality.TEXT],
             system_instruction=system_instructions,
             generation_config=types.GenerationConfig(temperature=0.0),
         )
@@ -240,7 +240,7 @@ class LLMStream(llm.LLMStream):
             # specific function
             tool_config = types.ToolConfig(
                 function_calling_config=types.FunctionCallingConfig(
-                    mode="ANY",
+                    mode=types.FunctionCallingConfigMode.ANY,
                     allowed_function_names=[self._tool_choice.name],
                 )
             )
@@ -248,7 +248,7 @@ class LLMStream(llm.LLMStream):
             # model must call any function
             tool_config = types.ToolConfig(
                 function_calling_config=types.FunctionCallingConfig(
-                    mode="ANY",
+                    mode=types.FunctionCallingConfigMode.ANY,
                     allowed_function_names=[
                         fnc.name
                         for fnc in self._fnc_ctx.ai_functions.values()
@@ -259,14 +259,14 @@ class LLMStream(llm.LLMStream):
             # model can call any function
             tool_config = types.ToolConfig(
                 function_calling_config=types.FunctionCallingConfig(
-                    mode="AUTO"
+                    mode=types.FunctionCallingConfigMode.AUTO
                 )
             )
         elif self._tool_choice == "none":
             # model cannot call any function
             tool_config = types.ToolConfig(
                 function_calling_config=types.FunctionCallingConfig(
-                    mode="NONE",
+                    mode=types.FunctionCallingConfigMode.NONE,
                 )
             )
         opts["tool_config"] = tool_config
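
Note: string modes are replaced by the `FunctionCallingConfigMode` enum throughout. A hedged sketch of the equivalent config built directly against google-genai (the function name is illustrative):

```python
from google.genai import types

tool_config = types.ToolConfig(
    function_calling_config=types.FunctionCallingConfig(
        mode=types.FunctionCallingConfigMode.ANY,   # was mode="ANY" in 0.10.x
        allowed_function_names=["get_weather"],     # hypothetical function name
    )
)
```
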
@@ -282,11 +282,12 @@ class LLMStream(llm.LLMStream):
             system_instruction=system_instruction,
             **opts,
         )
-        async for response in self._client.aio.models.generate_content_stream(
+        stream = await self._client.aio.models.generate_content_stream(
             model=self._model,
             contents=cast(types.ContentListUnion, turns),
             config=config,
-        ):
+        )
+        async for response in stream:  # type: ignore
             if response.prompt_feedback:
                 raise APIStatusError(
                     response.prompt_feedback.json(),
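
Note: in google-genai 1.x, `aio.models.generate_content_stream` is a coroutine that resolves to an async iterator, so it must be awaited before iterating. A hedged standalone sketch of the same pattern (client construction and contents are illustrative):

```python
from google import genai

async def stream_reply(turns, config):
    client = genai.Client()  # assumes the API key is picked up from the environment
    stream = await client.aio.models.generate_content_stream(
        model="gemini-2.0-flash-001",
        contents=turns,
        config=config,
    )
    async for response in stream:
        yield response
```
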
@@ -10,6 +10,8 @@ SpeechModels = Literal[
     "medical_conversation",
     "chirp",
     "chirp_2",
+    "latest_long",
+    "latest_short",
 ]
 
 SpeechLanguages = Literal[
@@ -92,8 +94,6 @@ SpeechLanguages = Literal[
 
 Gender = Literal["male", "female", "neutral"]
 
-AudioEncoding = Literal["wav", "mp3", "ogg", "mulaw", "alaw", "linear16"]
-
 ChatModels = Literal[
     "gemini-2.0-flash-001",
     "gemini-2.0-flash-lite-preview-02-05",
@@ -19,7 +19,7 @@ import dataclasses
 import time
 import weakref
 from dataclasses import dataclass
-from typing import List, Union
+from typing import Callable, List, Union
 
 from livekit import rtc
 from livekit.agents import (
@@ -61,7 +61,7 @@ class STTOptions:
     interim_results: bool
     punctuate: bool
    spoken_punctuation: bool
-    model: SpeechModels
+    model: SpeechModels | str
     sample_rate: int
     keywords: List[tuple[str, float]] | None
 
@@ -93,7 +93,7 @@ class STT(stt.STT):
         interim_results: bool = True,
         punctuate: bool = True,
         spoken_punctuation: bool = False,
-        model: SpeechModels = "chirp_2",
+        model: SpeechModels | str = "latest_long",
         location: str = "us-central1",
         sample_rate: int = 16000,
         credentials_info: dict | None = None,
@@ -106,12 +106,24 @@
         Credentials must be provided, either by using the ``credentials_info`` dict, or reading
         from the file specified in ``credentials_file`` or via Application Default Credentials as
         described in https://cloud.google.com/docs/authentication/application-default-credentials
+
+        args:
+            languages(LanguageCode): list of language codes to recognize (default: "en-US")
+            detect_language(bool): whether to detect the language of the audio (default: True)
+            interim_results(bool): whether to return interim results (default: True)
+            punctuate(bool): whether to punctuate the audio (default: True)
+            spoken_punctuation(bool): whether to use spoken punctuation (default: False)
+            model(SpeechModels): the model to use for recognition default: "latest_long"
+            location(str): the location to use for recognition default: "us-central1"
+            sample_rate(int): the sample rate of the audio default: 16000
+            credentials_info(dict): the credentials info to use for recognition (default: None)
+            credentials_file(str): the credentials file to use for recognition (default: None)
+            keywords(List[tuple[str, float]]): list of keywords to recognize (default: None)
         """
         super().__init__(
             capabilities=stt.STTCapabilities(streaming=True, interim_results=True)
         )
 
-        self._client: SpeechAsyncClient | None = None
         self._location = location
         self._credentials_info = credentials_info
         self._credentials_file = credentials_file
@@ -140,40 +152,44 @@ class STT(stt.STT):
             keywords=keywords,
         )
         self._streams = weakref.WeakSet[SpeechStream]()
+        self._pool = utils.ConnectionPool[SpeechAsyncClient](
+            max_session_duration=_max_session_duration,
+            connect_cb=self._create_client,
+        )
 
-    def _ensure_client(self) -> SpeechAsyncClient:
+    async def _create_client(self) -> SpeechAsyncClient:
         # Add support for passing a specific location that matches recognizer
         # see: https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages
         client_options = None
+        client: SpeechAsyncClient | None = None
         if self._location != "global":
             client_options = ClientOptions(
                 api_endpoint=f"{self._location}-speech.googleapis.com"
             )
         if self._credentials_info:
-            self._client = SpeechAsyncClient.from_service_account_info(
+            client = SpeechAsyncClient.from_service_account_info(
                 self._credentials_info,
                 client_options=client_options,
             )
         elif self._credentials_file:
-            self._client = SpeechAsyncClient.from_service_account_file(
+            client = SpeechAsyncClient.from_service_account_file(
                 self._credentials_file,
                 client_options=client_options,
             )
         else:
-            self._client = SpeechAsyncClient(
+            client = SpeechAsyncClient(
                 client_options=client_options,
             )
-        assert self._client is not None
-        return self._client
+        assert client is not None
+        return client
 
-    @property
-    def _recognizer(self) -> str:
+    def _get_recognizer(self, client: SpeechAsyncClient) -> str:
         # TODO(theomonnom): should we use recognizers?
         # recognizers may improve latency https://cloud.google.com/speech-to-text/v2/docs/recognizers#understand_recognizers
 
         # TODO(theomonnom): find a better way to access the project_id
         try:
-            project_id = self._ensure_client().transport._credentials.project_id  # type: ignore
+            project_id = client.transport._credentials.project_id  # type: ignore
         except AttributeError:
             from google.auth import default as ga_default
 
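
Note: the STT class now manages `SpeechAsyncClient` instances through `utils.ConnectionPool` rather than caching a single client, so sessions can be recycled after a maximum duration. A hedged sketch of the pool lifecycle as used in this diff (the `SpeechAsyncClient` import path and the duration value are assumptions):

```python
from livekit.agents import utils
from google.cloud.speech_v2 import SpeechAsyncClient  # import path assumed

async def pool_example(stt) -> None:
    # connect_cb is an async factory; the pool calls it whenever a fresh client is needed
    pool = utils.ConnectionPool[SpeechAsyncClient](
        max_session_duration=300,        # illustrative value, not the plugin's constant
        connect_cb=stt._create_client,
    )
    async with pool.connection() as client:   # borrow a client for one request or stream
        print(type(client).__name__)
    pool.invalidate()                         # drop cached clients (used after a location change)
    await pool.aclose()                       # called from STT.aclose()
```
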
@@ -224,16 +240,17 @@ class STT(stt.STT):
         )
 
         try:
-            raw = await self._ensure_client().recognize(
-                cloud_speech.RecognizeRequest(
-                    recognizer=self._recognizer,
-                    config=config,
-                    content=frame.data.tobytes(),
-                ),
-                timeout=conn_options.timeout,
-            )
+            async with self._pool.connection() as client:
+                raw = await client.recognize(
+                    cloud_speech.RecognizeRequest(
+                        recognizer=self._get_recognizer(client),
+                        config=config,
+                        content=frame.data.tobytes(),
+                    ),
+                    timeout=conn_options.timeout,
+                )
 
-            return _recognize_response_to_speech_event(raw)
+                return _recognize_response_to_speech_event(raw)
         except DeadlineExceeded:
             raise APITimeoutError()
         except GoogleAPICallError as e:
@@ -253,8 +270,8 @@ class STT(stt.STT):
         config = self._sanitize_options(language=language)
         stream = SpeechStream(
             stt=self,
-            client=self._ensure_client(),
-            recognizer=self._recognizer,
+            pool=self._pool,
+            recognizer_cb=self._get_recognizer,
             config=config,
             conn_options=conn_options,
         )
@@ -287,13 +304,10 @@ class STT(stt.STT):
             self._config.spoken_punctuation = spoken_punctuation
         if model is not None:
             self._config.model = model
-        client = None
-        recognizer = None
         if location is not None:
             self._location = location
             # if location is changed, fetch a new client and recognizer as per the new location
-            client = self._ensure_client()
-            recognizer = self._recognizer
+            self._pool.invalidate()
         if keywords is not None:
             self._config.keywords = keywords
 
@@ -306,10 +320,12 @@ class STT(stt.STT):
             spoken_punctuation=spoken_punctuation,
             model=model,
             keywords=keywords,
-            client=client,
-            recognizer=recognizer,
         )
 
+    async def aclose(self) -> None:
+        await self._pool.aclose()
+        await super().aclose()
+
 
 class SpeechStream(stt.SpeechStream):
     def __init__(
@@ -317,16 +333,16 @@ class SpeechStream(stt.SpeechStream):
         *,
         stt: STT,
         conn_options: APIConnectOptions,
-        client: SpeechAsyncClient,
-        recognizer: str,
+        pool: utils.ConnectionPool[SpeechAsyncClient],
+        recognizer_cb: Callable[[SpeechAsyncClient], str],
         config: STTOptions,
     ) -> None:
         super().__init__(
             stt=stt, conn_options=conn_options, sample_rate=config.sample_rate
         )
 
-        self._client = client
-        self._recognizer = recognizer
+        self._pool = pool
+        self._recognizer_cb = recognizer_cb
         self._config = config
         self._reconnect_event = asyncio.Event()
         self._session_connected_at: float = 0
@@ -341,8 +357,6 @@ class SpeechStream(stt.SpeechStream):
         spoken_punctuation: bool | None = None,
         model: SpeechModels | None = None,
         keywords: List[tuple[str, float]] | None = None,
-        client: SpeechAsyncClient | None = None,
-        recognizer: str | None = None,
     ):
         if languages is not None:
             if isinstance(languages, str):
@@ -360,21 +374,19 @@ class SpeechStream(stt.SpeechStream):
             self._config.model = model
         if keywords is not None:
             self._config.keywords = keywords
-        if client is not None:
-            self._client = client
-        if recognizer is not None:
-            self._recognizer = recognizer
 
         self._reconnect_event.set()
 
     async def _run(self) -> None:
         # google requires a async generator when calling streaming_recognize
         # this function basically convert the queue into a async generator
-        async def input_generator(should_stop: asyncio.Event):
+        async def input_generator(
+            client: SpeechAsyncClient, should_stop: asyncio.Event
+        ):
             try:
                 # first request should contain the config
                 yield cloud_speech.StreamingRecognizeRequest(
-                    recognizer=self._recognizer,
+                    recognizer=self._recognizer_cb(client),
                     streaming_config=self._streaming_config,
                 )
 
@@ -395,7 +407,7 @@
                     "an error occurred while streaming input to google STT"
                 )
 
-        async def process_stream(stream):
+        async def process_stream(client: SpeechAsyncClient, stream):
            has_started = False
            async for resp in stream:
                if (
@@ -437,6 +449,7 @@ class SpeechStream(stt.SpeechStream):
                    logger.debug(
                        "Google STT maximum connection time reached. Reconnecting..."
                    )
+                    self._pool.remove(client)
                    if has_started:
                        self._event_ch.send_nowait(
                            stt.SpeechEvent(
@@ -458,52 +471,57 @@
 
         while True:
             try:
-                self._streaming_config = cloud_speech.StreamingRecognitionConfig(
-                    config=cloud_speech.RecognitionConfig(
-                        explicit_decoding_config=cloud_speech.ExplicitDecodingConfig(
-                            encoding=cloud_speech.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
-                            sample_rate_hertz=self._config.sample_rate,
-                            audio_channel_count=1,
+                async with self._pool.connection() as client:
+                    self._streaming_config = cloud_speech.StreamingRecognitionConfig(
+                        config=cloud_speech.RecognitionConfig(
+                            explicit_decoding_config=cloud_speech.ExplicitDecodingConfig(
+                                encoding=cloud_speech.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
+                                sample_rate_hertz=self._config.sample_rate,
+                                audio_channel_count=1,
+                            ),
+                            adaptation=self._config.build_adaptation(),
+                            language_codes=self._config.languages,
+                            model=self._config.model,
+                            features=cloud_speech.RecognitionFeatures(
+                                enable_automatic_punctuation=self._config.punctuate,
+                                enable_word_time_offsets=True,
+                            ),
                         ),
-                        adaptation=self._config.build_adaptation(),
-                        language_codes=self._config.languages,
-                        model=self._config.model,
-                        features=cloud_speech.RecognitionFeatures(
-                            enable_automatic_punctuation=self._config.punctuate,
-                            enable_word_time_offsets=True,
+                        streaming_features=cloud_speech.StreamingRecognitionFeatures(
+                            enable_voice_activity_events=True,
+                            interim_results=self._config.interim_results,
                         ),
-                    ),
-                    streaming_features=cloud_speech.StreamingRecognitionFeatures(
-                        enable_voice_activity_events=True,
-                        interim_results=self._config.interim_results,
-                    ),
-                )
-
-                should_stop = asyncio.Event()
-                stream = await self._client.streaming_recognize(
-                    requests=input_generator(should_stop),
-                )
-                self._session_connected_at = time.time()
+                    )
 
-                process_stream_task = asyncio.create_task(process_stream(stream))
-                wait_reconnect_task = asyncio.create_task(self._reconnect_event.wait())
+                    should_stop = asyncio.Event()
+                    stream = await client.streaming_recognize(
+                        requests=input_generator(client, should_stop),
+                    )
+                    self._session_connected_at = time.time()
 
-                try:
-                    done, _ = await asyncio.wait(
-                        [process_stream_task, wait_reconnect_task],
-                        return_when=asyncio.FIRST_COMPLETED,
+                    process_stream_task = asyncio.create_task(
+                        process_stream(client, stream)
                     )
-                    for task in done:
-                        if task != wait_reconnect_task:
-                            task.result()
-                    if wait_reconnect_task not in done:
-                        break
-                    self._reconnect_event.clear()
-                finally:
-                    await utils.aio.gracefully_cancel(
-                        process_stream_task, wait_reconnect_task
+                    wait_reconnect_task = asyncio.create_task(
+                        self._reconnect_event.wait()
                     )
-                    should_stop.set()
+
+                    try:
+                        done, _ = await asyncio.wait(
+                            [process_stream_task, wait_reconnect_task],
+                            return_when=asyncio.FIRST_COMPLETED,
+                        )
+                        for task in done:
+                            if task != wait_reconnect_task:
+                                task.result()
+                        if wait_reconnect_task not in done:
+                            break
+                        self._reconnect_event.clear()
+                    finally:
+                        await utils.aio.gracefully_cancel(
+                            process_stream_task, wait_reconnect_task
+                        )
+                        should_stop.set()
             except DeadlineExceeded:
                 raise APITimeoutError()
             except GoogleAPICallError as e:
@@ -15,10 +15,9 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
+from typing import Optional
 
-from livekit import rtc
 from livekit.agents import (
-    DEFAULT_API_CONNECT_OPTIONS,
     APIConnectionError,
     APIConnectOptions,
     APIStatusError,
@@ -31,7 +30,7 @@ from google.api_core.exceptions import DeadlineExceeded, GoogleAPICallError
 from google.cloud import texttospeech
 from google.cloud.texttospeech_v1.types import SsmlVoiceGender, SynthesizeSpeechResponse
 
-from .models import AudioEncoding, Gender, SpeechLanguages
+from .models import Gender, SpeechLanguages
 
 
 @dataclass
@@ -47,7 +46,6 @@ class TTS(tts.TTS):
         language: SpeechLanguages | str = "en-US",
         gender: Gender | str = "neutral",
         voice_name: str = "",  # Not required
-        encoding: AudioEncoding | str = "linear16",
         sample_rate: int = 24000,
         pitch: int = 0,
         effects_profile_id: str = "",
@@ -66,7 +64,6 @@ class TTS(tts.TTS):
             language (SpeechLanguages | str, optional): Language code (e.g., "en-US"). Default is "en-US".
             gender (Gender | str, optional): Voice gender ("male", "female", "neutral"). Default is "neutral".
             voice_name (str, optional): Specific voice name. Default is an empty string.
-            encoding (AudioEncoding | str, optional): Audio encoding format (e.g., "linear16"). Default is "linear16".
             sample_rate (int, optional): Audio sample rate in Hz. Default is 24000.
             pitch (float, optional): Speaking pitch, ranging from -20.0 to 20.0 semitones relative to the original pitch. Default is 0.
             effects_profile_id (str): Optional identifier for selecting audio effects profiles to apply to the synthesized speech.
@@ -93,17 +90,10 @@ class TTS(tts.TTS):
             ssml_gender=_gender_from_str(gender),
         )
 
-        if encoding == "linear16" or encoding == "wav":
-            _audio_encoding = texttospeech.AudioEncoding.LINEAR16
-        elif encoding == "mp3":
-            _audio_encoding = texttospeech.AudioEncoding.MP3
-        else:
-            raise NotImplementedError(f"audio encoding {encoding} is not supported")
-
         self._opts = _TTSOptions(
             voice=voice,
             audio_config=texttospeech.AudioConfig(
-                audio_encoding=_audio_encoding,
+                audio_encoding=texttospeech.AudioEncoding.OGG_OPUS,
                 sample_rate_hertz=sample_rate,
                 pitch=pitch,
                 effects_profile_id=effects_profile_id,
@@ -160,7 +150,7 @@ class TTS(tts.TTS):
         self,
         text: str,
         *,
-        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
+        conn_options: Optional[APIConnectOptions] = None,
     ) -> "ChunkedStream":
         return ChunkedStream(
             tts=self,
@@ -177,9 +167,9 @@ class ChunkedStream(tts.ChunkedStream):
         *,
         tts: TTS,
         input_text: str,
-        conn_options: APIConnectOptions,
         opts: _TTSOptions,
         client: texttospeech.TextToSpeechAsyncClient,
+        conn_options: Optional[APIConnectOptions] = None,
     ) -> None:
         super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
         self._opts, self._client = opts, client
@@ -195,35 +185,24 @@ class ChunkedStream(tts.ChunkedStream):
                 timeout=self._conn_options.timeout,
             )
 
-            if self._opts.audio_config.audio_encoding == "mp3":
-                decoder = utils.codecs.Mp3StreamDecoder()
-                bstream = utils.audio.AudioByteStream(
-                    sample_rate=self._opts.audio_config.sample_rate_hertz,
-                    num_channels=1,
-                )
-                for frame in decoder.decode_chunk(response.audio_content):
-                    for frame in bstream.write(frame.data.tobytes()):
-                        self._event_ch.send_nowait(
-                            tts.SynthesizedAudio(request_id=request_id, frame=frame)
-                        )
-
-                for frame in bstream.flush():
-                    self._event_ch.send_nowait(
-                        tts.SynthesizedAudio(request_id=request_id, frame=frame)
-                    )
-            else:
-                data = response.audio_content[44:]  # skip WAV header
-                self._event_ch.send_nowait(
-                    tts.SynthesizedAudio(
-                        request_id=request_id,
-                        frame=rtc.AudioFrame(
-                            data=data,
-                            sample_rate=self._opts.audio_config.sample_rate_hertz,
-                            num_channels=1,
-                            samples_per_channel=len(data) // 2,  # 16-bit
-                        ),
-                    )
+            # Create AudioStreamDecoder for OGG format
+            decoder = utils.codecs.AudioStreamDecoder(
+                sample_rate=self._opts.audio_config.sample_rate_hertz,
+                num_channels=1,
+            )
+
+            try:
+                decoder.push(response.audio_content)
+                decoder.end_input()
+                emitter = tts.SynthesizedAudioEmitter(
+                    event_ch=self._event_ch,
+                    request_id=request_id,
                 )
+                async for frame in decoder:
+                    emitter.push(frame)
+                emitter.flush()
+            finally:
+                await decoder.aclose()
 
         except DeadlineExceeded:
             raise APITimeoutError()
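
Note: TTS now always requests OGG/Opus from Cloud Text-to-Speech and decodes it with `utils.codecs.AudioStreamDecoder`, replacing the old MP3/WAV branches (and the removed `encoding` argument). A hedged sketch of the decode loop on its own, with `audio_bytes` standing in for `response.audio_content`:

```python
from livekit.agents import utils

async def decode_ogg(audio_bytes: bytes) -> None:
    decoder = utils.codecs.AudioStreamDecoder(sample_rate=24000, num_channels=1)
    try:
        decoder.push(audio_bytes)    # feed the whole OGG/Opus payload
        decoder.end_input()          # signal that no more data will arrive
        async for frame in decoder:  # decoded PCM frames
            ...                      # forward each frame downstream
    finally:
        await decoder.aclose()
```
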
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.10.5"
+__version__ = "0.11.0"
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: livekit-plugins-google
-Version: 0.10.5
+Version: 0.11.0
 Summary: Agent Framework plugin for services from Google Cloud
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0
@@ -22,8 +22,8 @@ Description-Content-Type: text/markdown
 Requires-Dist: google-auth<3,>=2
 Requires-Dist: google-cloud-speech<3,>=2
 Requires-Dist: google-cloud-texttospeech<3,>=2
-Requires-Dist: google-genai==0.5.0
-Requires-Dist: livekit-agents>=0.12.11
+Requires-Dist: google-genai==1.3.0
+Requires-Dist: livekit-agents<1.0.0,>=0.12.16
 Dynamic: classifier
 Dynamic: description
 Dynamic: description-content-type
@@ -53,3 +53,57 @@ To use the STT and TTS API, you'll need to enable the respective services for yo
 
 - Cloud Speech-to-Text API
 - Cloud Text-to-Speech API
+
+
+## Gemini Multimodal Live
+
+Gemini Multimodal Live can be used with the `MultimodalAgent` class. See examples/multimodal_agent/gemini_agent.py for an example.
+
+### Live Video Input (experimental)
+
+You can push video frames to your Gemini Multimodal Live session alongside the audio automatically handled by the `MultimodalAgent`. The basic approach is to subscribe to the video track, create a video stream, sample frames at a suitable frame rate, and push them into the RealtimeSession:
+
+```
+# Make sure you subscribe to audio and video tracks
+await ctx.connect(auto_subscribe=AutoSubscribe.SUBSCRIBE_ALL)
+
+# Create your RealtimeModel and store a reference
+model = google.beta.realtime.RealtimeModel(
+    # ...
+)
+
+# Create your MultimodalAgent as usual
+agent = MultimodalAgent(
+    model=model,
+    # ...
+)
+
+# Async method to process the video track and push frames to Gemini
+async def _process_video_track(self, track: Track):
+    video_stream = VideoStream(track)
+    last_frame_time = 0
+
+    async for event in video_stream:
+        current_time = asyncio.get_event_loop().time()
+
+        # Sample at 1 FPS
+        if current_time - last_frame_time < 1.0:
+            continue
+
+        last_frame_time = current_time
+        frame = event.frame
+
+        # Push the frame into the RealtimeSession
+        model.sessions[0].push_video(frame)
+
+    await video_stream.aclose()
+
+# Subscribe to new tracks and process them
+@ctx.room.on("track_subscribed")
+def _on_track_subscribed(track: Track, pub, participant):
+    if track.kind == TrackKind.KIND_VIDEO:
+        asyncio.create_task(self._process_video_track(track))
+```
+
+
+
@@ -0,0 +1,18 @@
+livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
+livekit/plugins/google/_utils.py,sha256=FG1_26nlWGcI6onPleQQcmGBMfb4QNYgis1B5BMJxWA,7131
+livekit/plugins/google/llm.py,sha256=LZaHsrkjfboRZLWm7L2G0mw62q2sXBNj4YeeV2Sk2uU,16717
+livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
+livekit/plugins/google/models.py,sha256=SGjAumdDK97NNLwMFcqZdKR68f1NoGB2Rk1UP2-imG0,1457
+livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+livekit/plugins/google/stt.py,sha256=96GJmGDAIBdCpDECArwIXpj2s1xlcA_zuvTnwsvq4xA,22854
+livekit/plugins/google/tts.py,sha256=pG9_pibO3NDGEMa4huU5S9lbeyI3daQyrS17SuTKfZI,8008
+livekit/plugins/google/version.py,sha256=BvmVdoHkxksDSQP-uWrqIiyaAUImEyxSohntkIBNZRo,601
+livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
+livekit/plugins/google/beta/realtime/__init__.py,sha256=sGTn6JFNyA30QUXBZ_BV3l2eHpGAzR35ByXxg77vWNU,205
+livekit/plugins/google/beta/realtime/api_proto.py,sha256=ralrRZqIbE71oyuLKRYaXHvm6tcHMwBJueKvSO8Xfus,658
+livekit/plugins/google/beta/realtime/realtime_api.py,sha256=vZHiWNk8PorxtrHSmA7Ya6ZvCjT37YSJN-MxK8ebdrs,22795
+livekit/plugins/google/beta/realtime/transcriber.py,sha256=rjXO0cSPr3HATxrSfv1MX7IbrjmiTvnLPF280BfRBL8,9809
+livekit_plugins_google-0.11.0.dist-info/METADATA,sha256=b8Aj_eQnGhAT3DQa77KLHZBDGAWZYdrnTBWjVODAm2k,3732
+livekit_plugins_google-0.11.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+livekit_plugins_google-0.11.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
+livekit_plugins_google-0.11.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.8.0)
+Generator: setuptools (75.8.2)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
@@ -1,18 +0,0 @@
-livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
-livekit/plugins/google/_utils.py,sha256=mjsqblhGMgAZ2MNPisAVkNsqq4gfO6vvprEKzAGoVwE,7248
-livekit/plugins/google/llm.py,sha256=TVTerAabIf10AKVZr-Kn13eajhQ9RV7K4xaVD771yHU,16547
-livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
-livekit/plugins/google/models.py,sha256=Q47z_tIwLCufxhJyJHH7_1bo4xdBYZBSkkvMeycuItg,1493
-livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-livekit/plugins/google/stt.py,sha256=QcpKAcg8ltFlQnLGSdtRS2H12pFEPs1ZzLojKHB8bpY,21376
-livekit/plugins/google/tts.py,sha256=95qXCigVQYWNbcN3pIKBpIah4b31U_MWtXv5Ji0AMc4,9229
-livekit/plugins/google/version.py,sha256=na7fXYRLcWIgCRi4QSAbV4DZGA7YDgOWcE0O21jDlAo,601
-livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
-livekit/plugins/google/beta/realtime/__init__.py,sha256=sGTn6JFNyA30QUXBZ_BV3l2eHpGAzR35ByXxg77vWNU,205
-livekit/plugins/google/beta/realtime/api_proto.py,sha256=ralrRZqIbE71oyuLKRYaXHvm6tcHMwBJueKvSO8Xfus,658
-livekit/plugins/google/beta/realtime/realtime_api.py,sha256=RPGYAJXelYPo16YyR2qccjUjxUJCkJBU2N5rNTpKxyo,21281
-livekit/plugins/google/beta/realtime/transcriber.py,sha256=ZpKA3F8dqOtJPDlPiAgjw0AUDBIuhQiBVnvSYL4cdBg,9796
-livekit_plugins_google-0.10.5.dist-info/METADATA,sha256=AHhTVMBNVlOnqMnLPjncTO_iIqkDS-ExCm_5ubD9Mdg,2058
-livekit_plugins_google-0.10.5.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-livekit_plugins_google-0.10.5.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
-livekit_plugins_google-0.10.5.dist-info/RECORD,,