livekit-plugins-google 0.9.0__tar.gz → 0.10.0__tar.gz
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- {livekit_plugins_google-0.9.0 → livekit_plugins_google-0.10.0}/PKG-INFO +13 -3
- {livekit_plugins_google-0.9.0 → livekit_plugins_google-0.10.0}/livekit/plugins/google/__init__.py +2 -1
- livekit_plugins_google-0.10.0/livekit/plugins/google/_utils.py +202 -0
- {livekit_plugins_google-0.9.0 → livekit_plugins_google-0.10.0}/livekit/plugins/google/beta/realtime/__init__.py +0 -2
- livekit_plugins_google-0.10.0/livekit/plugins/google/beta/realtime/api_proto.py +24 -0
- {livekit_plugins_google-0.9.0 → livekit_plugins_google-0.10.0}/livekit/plugins/google/beta/realtime/realtime_api.py +168 -42
- livekit_plugins_google-0.10.0/livekit/plugins/google/beta/realtime/transcriber.py +173 -0
- livekit_plugins_google-0.10.0/livekit/plugins/google/llm.py +414 -0
- {livekit_plugins_google-0.9.0 → livekit_plugins_google-0.10.0}/livekit/plugins/google/models.py +2 -0
- {livekit_plugins_google-0.9.0 → livekit_plugins_google-0.10.0}/livekit/plugins/google/stt.py +64 -10
- {livekit_plugins_google-0.9.0 → livekit_plugins_google-0.10.0}/livekit/plugins/google/version.py +1 -1
- {livekit_plugins_google-0.9.0 → livekit_plugins_google-0.10.0}/livekit_plugins_google.egg-info/PKG-INFO +13 -3
- {livekit_plugins_google-0.9.0 → livekit_plugins_google-0.10.0}/livekit_plugins_google.egg-info/SOURCES.txt +3 -0
- {livekit_plugins_google-0.9.0 → livekit_plugins_google-0.10.0}/livekit_plugins_google.egg-info/requires.txt +1 -1
- {livekit_plugins_google-0.9.0 → livekit_plugins_google-0.10.0}/setup.py +1 -1
- livekit_plugins_google-0.9.0/livekit/plugins/google/beta/realtime/api_proto.py +0 -79
- {livekit_plugins_google-0.9.0 → livekit_plugins_google-0.10.0}/README.md +0 -0
- {livekit_plugins_google-0.9.0 → livekit_plugins_google-0.10.0}/livekit/plugins/google/beta/__init__.py +0 -0
- {livekit_plugins_google-0.9.0 → livekit_plugins_google-0.10.0}/livekit/plugins/google/log.py +0 -0
- {livekit_plugins_google-0.9.0 → livekit_plugins_google-0.10.0}/livekit/plugins/google/py.typed +0 -0
- {livekit_plugins_google-0.9.0 → livekit_plugins_google-0.10.0}/livekit/plugins/google/tts.py +0 -0
- {livekit_plugins_google-0.9.0 → livekit_plugins_google-0.10.0}/livekit_plugins_google.egg-info/dependency_links.txt +0 -0
- {livekit_plugins_google-0.9.0 → livekit_plugins_google-0.10.0}/livekit_plugins_google.egg-info/top_level.txt +0 -0
- {livekit_plugins_google-0.9.0 → livekit_plugins_google-0.10.0}/pyproject.toml +0 -0
- {livekit_plugins_google-0.9.0 → livekit_plugins_google-0.10.0}/setup.cfg +0 -0
{livekit_plugins_google-0.9.0 → livekit_plugins_google-0.10.0}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: livekit-plugins-google
-Version: 0.9.0
+Version: 0.10.0
 Summary: Agent Framework plugin for services from Google Cloud
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0
@@ -22,8 +22,18 @@ Description-Content-Type: text/markdown
 Requires-Dist: google-auth<3,>=2
 Requires-Dist: google-cloud-speech<3,>=2
 Requires-Dist: google-cloud-texttospeech<3,>=2
-Requires-Dist: google-genai
+Requires-Dist: google-genai==0.5.0
 Requires-Dist: livekit-agents>=0.12.3
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license
+Dynamic: project-url
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
 
 # LiveKit Plugins Google
 
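Two metadata changes stand out: the bump to Metadata-Version 2.2 with the new Dynamic: fields (which typically indicates a newer setuptools build backend), and an exact pin on google-genai. To confirm the resolved versions at runtime, a minimal sketch using only the standard library:

    from importlib.metadata import version

    # 0.10.0 declares Requires-Dist: google-genai==0.5.0, so after a clean
    # install both of these should report the pinned versions.
    print(version("livekit-plugins-google"))  # expected: 0.10.0
    print(version("google-genai"))            # expected: 0.5.0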
{livekit_plugins_google-0.9.0 → livekit_plugins_google-0.10.0}/livekit/plugins/google/__init__.py
RENAMED
@@ -13,11 +13,12 @@
 # limitations under the License.
 
 from . import beta
+from .llm import LLM
 from .stt import STT, SpeechStream
 from .tts import TTS
 from .version import __version__
 
-__all__ = ["STT", "TTS", "SpeechStream", "__version__", "beta"]
+__all__ = ["STT", "TTS", "SpeechStream", "__version__", "beta", "LLM"]
 from livekit.agents import Plugin
 
 from .log import logger
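The new export corresponds to the chat-completions LLM added in llm.py (+414 lines, whose body is not part of this excerpt); a minimal import sketch:

    from livekit.plugins import google

    # "LLM" now sits alongside "STT", "TTS", and "SpeechStream".
    assert "LLM" in google.__all__
    chat_llm_cls = google.LLM  # constructor arguments are defined in the new llm.py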
livekit_plugins_google-0.10.0/livekit/plugins/google/_utils.py
ADDED
@@ -0,0 +1,202 @@
+from __future__ import annotations
+
+import base64
+import inspect
+import json
+from typing import Any, Dict, List, Optional, get_args, get_origin
+
+from livekit import rtc
+from livekit.agents import llm, utils
+from livekit.agents.llm.function_context import _is_optional_type
+
+from google.genai import types
+
+JSON_SCHEMA_TYPE_MAP: dict[type, types.Type] = {
+    str: "STRING",
+    int: "INTEGER",
+    float: "NUMBER",
+    bool: "BOOLEAN",
+    dict: "OBJECT",
+    list: "ARRAY",
+}
+
+__all__ = ["_build_gemini_ctx", "_build_tools"]
+
+
+def _build_parameters(arguments: Dict[str, Any]) -> types.Schema | None:
+    properties: Dict[str, types.Schema] = {}
+    required: List[str] = []
+
+    for arg_name, arg_info in arguments.items():
+        prop = types.Schema()
+        if arg_info.description:
+            prop.description = arg_info.description
+
+        _, py_type = _is_optional_type(arg_info.type)
+        origin = get_origin(py_type)
+        if origin is list:
+            item_type = get_args(py_type)[0]
+            if item_type not in JSON_SCHEMA_TYPE_MAP:
+                raise ValueError(f"Unsupported type: {item_type}")
+            prop.type = "ARRAY"
+            prop.items = types.Schema(type=JSON_SCHEMA_TYPE_MAP[item_type])
+
+            if arg_info.choices:
+                prop.items.enum = arg_info.choices
+        else:
+            if py_type not in JSON_SCHEMA_TYPE_MAP:
+                raise ValueError(f"Unsupported type: {py_type}")
+
+            prop.type = JSON_SCHEMA_TYPE_MAP[py_type]
+
+            if arg_info.choices:
+                prop.enum = arg_info.choices
+                if py_type is int:
+                    raise ValueError(
+                        f"Parameter '{arg_info.name}' uses integer choices, not supported by this model."
+                    )
+
+        properties[arg_name] = prop
+
+        if arg_info.default is inspect.Parameter.empty:
+            required.append(arg_name)
+
+    if properties:
+        parameters = types.Schema(type="OBJECT", properties=properties)
+        if required:
+            parameters.required = required
+
+        return parameters
+
+    return None
+
+
+def _build_tools(fnc_ctx: Any) -> List[types.FunctionDeclaration]:
+    function_declarations: List[types.FunctionDeclaration] = []
+    for fnc_info in fnc_ctx.ai_functions.values():
+        parameters = _build_parameters(fnc_info.arguments)
+
+        func_decl = types.FunctionDeclaration(
+            name=fnc_info.name,
+            description=fnc_info.description,
+            parameters=parameters,
+        )
+
+        function_declarations.append(func_decl)
+    return function_declarations
+
+
+def _build_gemini_ctx(
+    chat_ctx: llm.ChatContext, cache_key: Any
+) -> tuple[list[types.Content], Optional[types.Content]]:
+    turns: list[types.Content] = []
+    system_instruction: Optional[types.Content] = None
+    current_role: Optional[str] = None
+    parts: list[types.Part] = []
+
+    for msg in chat_ctx.messages:
+        if msg.role == "system":
+            if isinstance(msg.content, str):
+                system_instruction = types.Content(parts=[types.Part(text=msg.content)])
+            continue
+
+        if msg.role == "assistant":
+            role = "model"
+        elif msg.role == "tool":
+            role = "user"
+        else:
+            role = "user"
+
+        # If role changed, finalize previous parts into a turn
+        if role != current_role:
+            if current_role is not None and parts:
+                turns.append(types.Content(role=current_role, parts=parts))
+            current_role = role
+            parts = []
+
+        if msg.tool_calls:
+            for fnc in msg.tool_calls:
+                parts.append(
+                    types.Part(
+                        function_call=types.FunctionCall(
+                            id=fnc.tool_call_id,
+                            name=fnc.function_info.name,
+                            args=fnc.arguments,
+                        )
+                    )
+                )
+
+        if msg.role == "tool":
+            if msg.content:
+                if isinstance(msg.content, dict):
+                    parts.append(
+                        types.Part(
+                            function_response=types.FunctionResponse(
+                                id=msg.tool_call_id,
+                                name=msg.name,
+                                response=msg.content,
+                            )
+                        )
+                    )
+                elif isinstance(msg.content, str):
+                    parts.append(
+                        types.Part(
+                            function_response=types.FunctionResponse(
+                                id=msg.tool_call_id,
+                                name=msg.name,
+                                response={"result": msg.content},
+                            )
+                        )
+                    )
+        else:
+            if msg.content:
+                if isinstance(msg.content, str):
+                    parts.append(types.Part(text=msg.content))
+                elif isinstance(msg.content, dict):
+                    parts.append(types.Part(text=json.dumps(msg.content)))
+                elif isinstance(msg.content, list):
+                    for item in msg.content:
+                        if isinstance(item, str):
+                            parts.append(types.Part(text=item))
+                        elif isinstance(item, llm.ChatImage):
+                            parts.append(_build_gemini_image_part(item, cache_key))
+
+    # Finalize last role's parts if any remain
+    if current_role is not None and parts:
+        turns.append(types.Content(role=current_role, parts=parts))
+
+    return turns, system_instruction
+
+
+def _build_gemini_image_part(image: llm.ChatImage, cache_key: Any) -> types.Part:
+    if isinstance(image.image, str):
+        # Check if the string is a Data URL
+        if image.image.startswith("data:image/jpeg;base64,"):
+            # Extract the base64 part after the comma
+            base64_data = image.image.split(",", 1)[1]
+            try:
+                image_bytes = base64.b64decode(base64_data)
+            except Exception as e:
+                raise ValueError("Invalid base64 data in image URL") from e
+
+            return types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg")
+        else:
+            # Assume it's a regular URL
+            return types.Part.from_uri(file_uri=image.image, mime_type="image/jpeg")
+
+    elif isinstance(image.image, rtc.VideoFrame):
+        if cache_key not in image._cache:
+            opts = utils.images.EncodeOptions()
+            if image.inference_width and image.inference_height:
+                opts.resize_options = utils.images.ResizeOptions(
+                    width=image.inference_width,
+                    height=image.inference_height,
+                    strategy="scale_aspect_fit",
+                )
+            encoded_data = utils.images.encode(image.image, opts)
+            image._cache[cache_key] = base64.b64encode(encoded_data).decode("utf-8")
+
+        return types.Part.from_bytes(
+            data=image._cache[cache_key], mime_type="image/jpeg"
+        )
+    raise ValueError(f"Unsupported image type: {type(image.image)}")
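A quick illustration of what _build_gemini_ctx returns for a simple context; the helper is private, so this sketch is illustrative only and assumes a livekit-agents 0.12.x ChatContext:

    from livekit.agents import llm

    from livekit.plugins.google._utils import _build_gemini_ctx

    chat_ctx = llm.ChatContext()
    chat_ctx.append(text="You are a helpful assistant.", role="system")
    chat_ctx.append(text="What's the weather like?", role="user")

    # cache_key only matters when the context contains VideoFrame images
    # (it memoizes their encoded bytes); any hashable value works here.
    turns, system_instruction = _build_gemini_ctx(chat_ctx, cache_key=0)
    # turns -> [Content(role="user", parts=[Part(text="What's the weather like?")])]
    # system_instruction -> Content(parts=[Part(text="You are a helpful assistant.")])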
{livekit_plugins_google-0.9.0 → livekit_plugins_google-0.10.0}/livekit/plugins/google/beta/realtime/__init__.py
RENAMED
@@ -1,7 +1,6 @@
 from .api_proto import (
     ClientEvents,
     LiveAPIModels,
-    ResponseModality,
     Voice,
 )
 from .realtime_api import RealtimeModel
@@ -10,6 +9,5 @@ __all__ = [
     "RealtimeModel",
     "ClientEvents",
     "LiveAPIModels",
-    "ResponseModality",
     "Voice",
 ]
livekit_plugins_google-0.10.0/livekit/plugins/google/beta/realtime/api_proto.py
ADDED
@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+from typing import Literal, Sequence, Union
+
+from google.genai import types
+
+from ..._utils import _build_gemini_ctx, _build_tools
+
+LiveAPIModels = Literal["gemini-2.0-flash-exp"]
+
+Voice = Literal["Puck", "Charon", "Kore", "Fenrir", "Aoede"]
+
+__all__ = ["_build_tools", "ClientEvents", "_build_gemini_ctx"]
+
+ClientEvents = Union[
+    types.ContentListUnion,
+    types.ContentListUnionDict,
+    types.LiveClientContentOrDict,
+    types.LiveClientRealtimeInput,
+    types.LiveClientRealtimeInputOrDict,
+    types.LiveClientToolResponseOrDict,
+    types.FunctionResponseOrDict,
+    Sequence[types.FunctionResponseOrDict],
+]
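Anything matching this union can be queued on the session's send channel; both of the following construct valid ClientEvents values, a sketch using only types the diff itself exercises:

    from google.genai import types

    # A realtime PCM audio chunk, the shape _push_audio builds:
    audio_event = types.LiveClientRealtimeInput(
        media_chunks=[types.Blob(data=b"\x00\x00", mime_type="audio/pcm")]
    )

    # A tool result, the shape _run_fnc_task sends back:
    tool_event = types.FunctionResponse(
        id="call_1", name="get_weather", response={"result": "sunny"}
    )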
{livekit_plugins_google-0.9.0 → livekit_plugins_google-0.10.0}/livekit/plugins/google/beta/realtime/realtime_api.py
RENAMED
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 import asyncio
-import base64
 import json
 import os
 from dataclasses import dataclass
@@ -11,14 +10,22 @@ from livekit import rtc
 from livekit.agents import llm, utils
 from livekit.agents.llm.function_context import _create_ai_function_info
 
-from google import genai
-from google.genai.types import (
+from google import genai
+from google.genai._api_client import HttpOptions
+from google.genai.types import (
+    Blob,
+    Content,
     FunctionResponse,
-    GenerationConfigDict,
+    GenerationConfig,
+    LiveClientContent,
+    LiveClientRealtimeInput,
     LiveClientToolResponse,
-    LiveConnectConfigDict,
+    LiveConnectConfig,
+    Modality,
+    Part,
     PrebuiltVoiceConfig,
     SpeechConfig,
+    Tool,
     VoiceConfig,
 )
 
@@ -26,10 +33,11 @@ from ...log import logger
 from .api_proto import (
     ClientEvents,
     LiveAPIModels,
-    ResponseModality,
     Voice,
+    _build_gemini_ctx,
     _build_tools,
 )
+from .transcriber import TranscriberSession, TranscriptionContent
 
 EventTypes = Literal[
     "start_session",
@@ -39,6 +47,9 @@ EventTypes = Literal[
     "function_calls_collected",
     "function_calls_finished",
     "function_calls_cancelled",
+    "input_speech_transcription_completed",
+    "agent_speech_transcription_completed",
+    "agent_speech_stopped",
 ]
 
 
@@ -55,6 +66,12 @@ class GeminiContent:
     content_type: Literal["text", "audio"]
 
 
+@dataclass
+class InputTranscription:
+    item_id: str
+    transcript: str
+
+
 @dataclass
 class Capabilities:
     supports_truncate: bool
@@ -65,7 +82,7 @@ class ModelOptions:
     model: LiveAPIModels | str
     api_key: str | None
     voice: Voice | str
-    response_modalities: ResponseModality
+    response_modalities: list[Modality] | None
     vertexai: bool
     project: str | None
     location: str | None
@@ -76,18 +93,22 @@ class ModelOptions:
     top_k: int | None
     presence_penalty: float | None
     frequency_penalty: float | None
-    instructions: str
+    instructions: Content | None
+    enable_user_audio_transcription: bool
+    enable_agent_audio_transcription: bool
 
 
 class RealtimeModel:
     def __init__(
         self,
         *,
-        instructions: str = "",
+        instructions: str | None = None,
         model: LiveAPIModels | str = "gemini-2.0-flash-exp",
        api_key: str | None = None,
         voice: Voice | str = "Puck",
-        modalities: ResponseModality = "AUDIO",
+        modalities: list[Modality] = ["AUDIO"],
+        enable_user_audio_transcription: bool = True,
+        enable_agent_audio_transcription: bool = True,
         vertexai: bool = False,
         project: str | None = None,
         location: str | None = None,
@@ -103,15 +124,24 @@ class RealtimeModel:
         """
         Initializes a RealtimeModel instance for interacting with Google's Realtime API.
 
+        Environment Requirements:
+        - For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of the service account key file.
+          The Google Cloud project and location can be set via `project` and `location` arguments or the environment variables
+          `GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_LOCATION`. By default, the project is inferred from the service account key file,
+          and the location defaults to "us-central1".
+        - For Google Gemini API: Set the `api_key` argument or the `GOOGLE_API_KEY` environment variable.
+
         Args:
             instructions (str, optional): Initial system instructions for the model. Defaults to "".
-            api_key (str or None, optional):
-            modalities (
+            api_key (str or None, optional): Google Gemini API key. If None, will attempt to read from the environment variable GOOGLE_API_KEY.
+            modalities (list[Modality], optional): Modalities to use, such as ["TEXT", "AUDIO"]. Defaults to ["AUDIO"].
             model (str or None, optional): The name of the model to use. Defaults to "gemini-2.0-flash-exp".
             voice (api_proto.Voice, optional): Voice setting for audio outputs. Defaults to "Puck".
+            enable_user_audio_transcription (bool, optional): Whether to enable user audio transcription. Defaults to True.
+            enable_agent_audio_transcription (bool, optional): Whether to enable agent audio transcription. Defaults to True.
             temperature (float, optional): Sampling temperature for response generation. Defaults to 0.8.
             vertexai (bool, optional): Whether to use VertexAI for the API. Defaults to False.
-            project (str or None, optional): The project to use for the API. Defaults to None. (for vertexai)
+            project (str or None, optional): The project id to use for the API. Defaults to None. (for vertexai)
             location (str or None, optional): The location to use for the API. Defaults to None. (for vertexai)
             candidate_count (int, optional): The number of candidate responses to generate. Defaults to 1.
             top_p (float, optional): The top-p value for response generation
@@ -130,21 +160,38 @@ class RealtimeModel:
         self._model = model
         self._loop = loop or asyncio.get_event_loop()
         self._api_key = api_key or os.environ.get("GOOGLE_API_KEY")
-        self.
-        self.
-
-
-
+        self._project = project or os.environ.get("GOOGLE_CLOUD_PROJECT")
+        self._location = location or os.environ.get("GOOGLE_CLOUD_LOCATION")
+        if vertexai:
+            if not self._project or not self._location:
+                raise ValueError(
+                    "Project and location are required for VertexAI either via project and location or GOOGLE_CLOUD_PROJECT and GOOGLE_CLOUD_LOCATION environment variables"
+                )
+            self._api_key = None  # VertexAI does not require an API key
+
+        else:
+            self._project = None
+            self._location = None
+            if not self._api_key:
+                raise ValueError(
+                    "API key is required for Google API either via api_key or GOOGLE_API_KEY environment variable"
+                )
+
+        instructions_content = (
+            Content(parts=[Part(text=instructions)]) if instructions else None
+        )
 
         self._rt_sessions: list[GeminiRealtimeSession] = []
         self._opts = ModelOptions(
             model=model,
-            api_key=
+            api_key=self._api_key,
             voice=voice,
+            enable_user_audio_transcription=enable_user_audio_transcription,
+            enable_agent_audio_transcription=enable_agent_audio_transcription,
             response_modalities=modalities,
             vertexai=vertexai,
-            project=
-            location=
+            project=self._project,
+            location=self._location,
             candidate_count=candidate_count,
             temperature=temperature,
             max_output_tokens=max_output_tokens,
@@ -152,7 +199,7 @@ class RealtimeModel:
             top_k=top_k,
             presence_penalty=presence_penalty,
             frequency_penalty=frequency_penalty,
-            instructions=instructions,
+            instructions=instructions_content,
         )
 
     @property
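The constructor now enforces two mutually exclusive auth paths; a construction sketch for each (the project id is hypothetical):

    from livekit.plugins.google.beta.realtime import RealtimeModel

    # Gemini API path: api_key or the GOOGLE_API_KEY env var must be set.
    model = RealtimeModel(
        instructions="You are a helpful voice assistant.",
        voice="Kore",
        modalities=["AUDIO"],
    )

    # VertexAI path: project/location come from the arguments or from
    # GOOGLE_CLOUD_PROJECT / GOOGLE_CLOUD_LOCATION; no API key is used.
    vertex_model = RealtimeModel(
        vertexai=True,
        project="my-gcp-project",  # hypothetical project id
        location="us-central1",
    )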
@@ -208,16 +255,16 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
         self._chat_ctx = chat_ctx
         self._fnc_ctx = fnc_ctx
         self._fnc_tasks = utils.aio.TaskSet()
+        self._is_interrupted = False
 
         tools = []
         if self._fnc_ctx is not None:
             functions = _build_tools(self._fnc_ctx)
-            tools.append(
+            tools.append(Tool(function_declarations=functions))
 
-        self._config = LiveConnectConfigDict(
-            model=self._opts.model,
+        self._config = LiveConnectConfig(
             response_modalities=self._opts.response_modalities,
-            generation_config=GenerationConfigDict(
+            generation_config=GenerationConfig(
                 candidate_count=self._opts.candidate_count,
                 temperature=self._opts.temperature,
                 max_output_tokens=self._opts.max_output_tokens,
@@ -237,7 +284,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
             tools=tools,
         )
         self._client = genai.Client(
-            http_options=
+            http_options=HttpOptions(api_version="v1alpha"),
             api_key=self._opts.api_key,
             vertexai=self._opts.vertexai,
             project=self._opts.project,
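The switch from dict-based configuration to the pydantic models in google-genai 0.5.0 is the core of these two hunks. The same shape can be built standalone; a sketch mirroring the fields visible above plus the speech config the session presumably assembles from the voice option (those lines fall outside the shown hunks):

    from google.genai.types import (
        GenerationConfig,
        LiveConnectConfig,
        PrebuiltVoiceConfig,
        SpeechConfig,
        VoiceConfig,
    )

    config = LiveConnectConfig(
        response_modalities=["AUDIO"],
        generation_config=GenerationConfig(temperature=0.8),
        speech_config=SpeechConfig(
            voice_config=VoiceConfig(
                prebuilt_voice_config=PrebuiltVoiceConfig(voice_name="Puck")
            )
        ),
    )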
@@ -246,12 +293,22 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
         self._main_atask = asyncio.create_task(
             self._main_task(), name="gemini-realtime-session"
         )
-
-
-
-
+        if self._opts.enable_user_audio_transcription:
+            self._transcriber = TranscriberSession(
+                client=self._client, model=self._opts.model
+            )
+            self._transcriber.on("input_speech_done", self._on_input_speech_done)
+        if self._opts.enable_agent_audio_transcription:
+            self._agent_transcriber = TranscriberSession(
+                client=self._client, model=self._opts.model
+            )
+            self._agent_transcriber.on("input_speech_done", self._on_agent_speech_done)
+        # init dummy task
+        self._init_sync_task = asyncio.create_task(asyncio.sleep(0))
         self._send_ch = utils.aio.Chan[ClientEvents]()
         self._active_response_id = None
+        if chat_ctx:
+            self.generate_reply(chat_ctx)
 
     async def aclose(self) -> None:
         if self._send_ch.closed:
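The two TranscriberSession instances surface their results through the new events; a sketch of wiring handlers onto an existing GeminiRealtimeSession (function names are illustrative):

    def wire_transcription_events(session) -> None:
        # Both events carry an InputTranscription(item_id=..., transcript=...).
        def on_user_transcript(ev) -> None:
            print("user:", ev.transcript)

        def on_agent_transcript(ev) -> None:
            print("agent:", ev.transcript)

        session.on("input_speech_transcription_completed", on_user_transcript)
        session.on("agent_speech_transcription_completed", on_agent_transcript)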
@@ -269,32 +326,97 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
         self._fnc_ctx = value
 
     def _push_audio(self, frame: rtc.AudioFrame) -> None:
-
-
+        if self._opts.enable_user_audio_transcription:
+            self._transcriber._push_audio(frame)
+        realtime_input = LiveClientRealtimeInput(
+            media_chunks=[Blob(data=frame.data.tobytes(), mime_type="audio/pcm")],
+        )
+        self._queue_msg(realtime_input)
 
-    def _queue_msg(self, msg:
+    def _queue_msg(self, msg: ClientEvents) -> None:
         self._send_ch.send_nowait(msg)
 
+    def generate_reply(
+        self,
+        ctx: llm.ChatContext | llm.ChatMessage,
+        turn_complete: bool = True,
+    ) -> None:
+        if isinstance(ctx, llm.ChatMessage) and isinstance(ctx.content, str):
+            new_chat_ctx = llm.ChatContext()
+            new_chat_ctx.append(text=ctx.content, role=ctx.role)
+        elif isinstance(ctx, llm.ChatContext):
+            new_chat_ctx = ctx
+        else:
+            raise ValueError("Invalid chat context")
+        turns, _ = _build_gemini_ctx(new_chat_ctx, id(self))
+        client_content = LiveClientContent(
+            turn_complete=turn_complete,
+            turns=turns,
+        )
+        self._queue_msg(client_content)
+
     def chat_ctx_copy(self) -> llm.ChatContext:
         return self._chat_ctx.copy()
 
     async def set_chat_ctx(self, ctx: llm.ChatContext) -> None:
         self._chat_ctx = ctx.copy()
 
+    def cancel_response(self) -> None:
+        raise NotImplementedError("cancel_response is not supported yet")
+
+    def create_response(
+        self,
+        on_duplicate: Literal[
+            "cancel_existing", "cancel_new", "keep_both"
+        ] = "keep_both",
+    ) -> None:
+        raise NotImplementedError("create_response is not supported yet")
+
+    def commit_audio_buffer(self) -> None:
+        raise NotImplementedError("commit_audio_buffer is not supported yet")
+
+    def server_vad_enabled(self) -> bool:
+        return True
+
+    def _on_input_speech_done(self, content: TranscriptionContent) -> None:
+        if content.response_id and content.text:
+            self.emit(
+                "input_speech_transcription_completed",
+                InputTranscription(
+                    item_id=content.response_id,
+                    transcript=content.text,
+                ),
+            )
+
+        # self._chat_ctx.append(text=content.text, role="user")
+        # TODO: implement sync mechanism to make sure the transcribed user speech is inside the chat_ctx and always before the generated agent speech
+
+    def _on_agent_speech_done(self, content: TranscriptionContent) -> None:
+        if not self._is_interrupted and content.response_id and content.text:
+            self.emit(
+                "agent_speech_transcription_completed",
+                InputTranscription(
+                    item_id=content.response_id,
+                    transcript=content.text,
+                ),
+            )
+        # self._chat_ctx.append(text=content.text, role="assistant")
+
     @utils.log_exceptions(logger=logger)
     async def _main_task(self):
         @utils.log_exceptions(logger=logger)
         async def _send_task():
             async for msg in self._send_ch:
-                await self._session.send(msg)
+                await self._session.send(input=msg)
 
-            await self._session.send(".", end_of_turn=True)
+            await self._session.send(input=".", end_of_turn=True)
 
         @utils.log_exceptions(logger=logger)
         async def _recv_task():
             while True:
                 async for response in self._session.receive():
                     if self._active_response_id is None:
+                        self._is_interrupted = False
                         self._active_response_id = utils.shortuuid()
                         text_stream = utils.aio.Chan[str]()
                         audio_stream = utils.aio.Chan[rtc.AudioFrame]()
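The new generate_reply path also powers the constructor's initial chat_ctx handling; a usage sketch against an existing GeminiRealtimeSession:

    from livekit.agents import llm

    def greet(session) -> None:
        ctx = llm.ChatContext()
        ctx.append(text="Say hello to the user.", role="user")
        # Queues a LiveClientContent with turn_complete=True.
        session.generate_reply(ctx)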
@@ -307,7 +429,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
                             audio=[],
                             text_stream=text_stream,
                             audio_stream=audio_stream,
-                            content_type=
+                            content_type="audio",
                         )
                         self.emit("response_content_added", content)
 
@@ -326,6 +448,8 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
                                     samples_per_channel=len(part.inline_data.data)
                                     // 2,
                                 )
+                                if self._opts.enable_agent_audio_transcription:
+                                    self._agent_transcriber._push_audio(frame)
                                 content.audio_stream.send_nowait(frame)
 
                     if server_content.interrupted or server_content.turn_complete:
@@ -333,10 +457,8 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
                             if isinstance(stream, utils.aio.Chan):
                                 stream.close()
 
-
-
-                        elif server_content.turn_complete:
-                            self.emit("response_content_done", content)
+                        self.emit("agent_speech_stopped")
+                        self._is_interrupted = True
 
                         self._active_response_id = None
 
@@ -387,6 +509,10 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
         finally:
             await utils.aio.gracefully_cancel(*tasks)
             await self._session.close()
+            if self._opts.enable_user_audio_transcription:
+                await self._transcriber.aclose()
+            if self._opts.enable_agent_audio_transcription:
+                await self._agent_transcriber.aclose()
 
     @utils.log_exceptions(logger=logger)
     async def _run_fnc_task(self, fnc_call_info: llm.FunctionCallInfo, item_id: str):
@@ -419,6 +545,6 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
                 )
             ]
         )
-        await self._session.send(tool_response)
+        await self._session.send(input=tool_response)
 
         self.emit("function_calls_finished", [called_fnc])