livekit-plugins-google 1.0.23__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
@@ -20,13 +20,17 @@ import os
  from dataclasses import dataclass
  from typing import Any, cast

- from google import genai
  from google.auth._default_async import default_async
- from google.genai import types
+ from google.genai import Client, types
  from google.genai.errors import APIError, ClientError, ServerError
  from livekit.agents import APIConnectionError, APIStatusError, llm, utils
- from livekit.agents.llm import FunctionTool, ToolChoice, utils as llm_utils
- from livekit.agents.llm.tool_context import get_function_info
+ from livekit.agents.llm import FunctionTool, RawFunctionTool, ToolChoice, utils as llm_utils
+ from livekit.agents.llm.tool_context import (
+     get_function_info,
+     get_raw_function_info,
+     is_function_tool,
+     is_raw_function_tool,
+ )
  from livekit.agents.types import (
      DEFAULT_API_CONNECT_OPTIONS,
      NOT_GIVEN,
@@ -37,7 +41,8 @@ from livekit.agents.utils import is_given

  from .log import logger
  from .models import ChatModels
- from .utils import to_chat_ctx, to_fnc_ctx, to_response_format
+ from .tools import _LLMTool
+ from .utils import create_tools_config, to_fnc_ctx, to_response_format


  @dataclass
@@ -54,6 +59,7 @@ class _LLMOptions:
      presence_penalty: NotGivenOr[float]
      frequency_penalty: NotGivenOr[float]
      thinking_config: NotGivenOr[types.ThinkingConfigOrDict]
+     gemini_tools: NotGivenOr[list[_LLMTool]]


  class LLM(llm.LLM):
@@ -73,6 +79,7 @@ class LLM(llm.LLM):
          frequency_penalty: NotGivenOr[float] = NOT_GIVEN,
          tool_choice: NotGivenOr[ToolChoice] = NOT_GIVEN,
          thinking_config: NotGivenOr[types.ThinkingConfigOrDict] = NOT_GIVEN,
+         gemini_tools: NotGivenOr[list[_LLMTool]] = NOT_GIVEN,
      ) -> None:
          """
          Create a new instance of Google GenAI LLM.
@@ -98,10 +105,11 @@ class LLM(llm.LLM):
              frequency_penalty (float, optional): Penalizes the model for repeating words. Defaults to None.
              tool_choice (ToolChoice, optional): Specifies whether to use tools during response generation. Defaults to "auto".
              thinking_config (ThinkingConfigOrDict, optional): The thinking configuration for response generation. Defaults to None.
+             gemini_tools (list[LLMTool], optional): The Gemini-specific tools to use for the session.
          """ # noqa: E501
          super().__init__()
          gcp_project = project if is_given(project) else os.environ.get("GOOGLE_CLOUD_PROJECT")
-         gcp_location = (
+         gcp_location: str | None = (
              location
              if is_given(location)
              else os.environ.get("GOOGLE_CLOUD_LOCATION") or "us-central1"
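The headline addition in 1.1.0 is the `gemini_tools` option, which attaches Gemini's built-in tools (search grounding, code execution, URL context, maps) to every request in the session. A minimal constructor sketch, assuming the plugin is imported as `livekit.plugins.google`; the model name is illustrative:

    from google.genai import types

    from livekit.plugins import google

    # Session-level built-in tools: GoogleSearch turns on search grounding
    # for every chat request issued through this LLM instance.
    session_llm = google.LLM(
        model="gemini-2.0-flash-001",  # illustrative model name
        gemini_tools=[types.GoogleSearch()],
    )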
@@ -115,7 +123,7 @@ class LLM(llm.LLM):

          if use_vertexai:
              if not gcp_project:
-                 _, gcp_project = default_async(
+                 _, gcp_project = default_async(  # type: ignore
                      scopes=["https://www.googleapis.com/auth/cloud-platform"]
                  )
              gemini_api_key = None  # VertexAI does not require an API key
@@ -157,8 +165,9 @@ class LLM(llm.LLM):
              presence_penalty=presence_penalty,
              frequency_penalty=frequency_penalty,
              thinking_config=thinking_config,
+             gemini_tools=gemini_tools,
          )
-         self._client = genai.Client(
+         self._client = Client(
              api_key=gemini_api_key,
              vertexai=use_vertexai,
              project=gcp_project,
@@ -169,7 +178,7 @@ class LLM(llm.LLM):
          self,
          *,
          chat_ctx: llm.ChatContext,
-         tools: list[FunctionTool] | None = None,
+         tools: list[FunctionTool | RawFunctionTool] | None = None,
          conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
          parallel_tool_calls: NotGivenOr[bool] = NOT_GIVEN,
          tool_choice: NotGivenOr[ToolChoice] = NOT_GIVEN,
@@ -177,50 +186,58 @@ class LLM(llm.LLM):
              types.SchemaUnion | type[llm_utils.ResponseFormatT]
          ] = NOT_GIVEN,
          extra_kwargs: NotGivenOr[dict[str, Any]] = NOT_GIVEN,
+         gemini_tools: NotGivenOr[list[_LLMTool]] = NOT_GIVEN,
      ) -> LLMStream:
          extra = {}

          if is_given(extra_kwargs):
              extra.update(extra_kwargs)

-         tool_choice = tool_choice if is_given(tool_choice) else self._opts.tool_choice
+         tool_choice = (
+             cast(ToolChoice, tool_choice) if is_given(tool_choice) else self._opts.tool_choice
+         )
          if is_given(tool_choice):
              gemini_tool_choice: types.ToolConfig
              if isinstance(tool_choice, dict) and tool_choice.get("type") == "function":
                  gemini_tool_choice = types.ToolConfig(
                      function_calling_config=types.FunctionCallingConfig(
-                         mode="ANY",
+                         mode=types.FunctionCallingConfigMode.ANY,
                          allowed_function_names=[tool_choice["function"]["name"]],
                      )
                  )
                  extra["tool_config"] = gemini_tool_choice
              elif tool_choice == "required":
+                 tool_names = []
+                 for tool in tools or []:
+                     if is_function_tool(tool):
+                         tool_names.append(get_function_info(tool).name)
+                     elif is_raw_function_tool(tool):
+                         tool_names.append(get_raw_function_info(tool).name)
+
                  gemini_tool_choice = types.ToolConfig(
                      function_calling_config=types.FunctionCallingConfig(
-                         mode="ANY",
-                         allowed_function_names=[get_function_info(fnc).name for fnc in tools]
-                         if tools
-                         else None,
+                         mode=types.FunctionCallingConfigMode.ANY,
+                         allowed_function_names=tool_names or None,
                      )
                  )
                  extra["tool_config"] = gemini_tool_choice
              elif tool_choice == "auto":
                  gemini_tool_choice = types.ToolConfig(
                      function_calling_config=types.FunctionCallingConfig(
-                         mode="AUTO",
+                         mode=types.FunctionCallingConfigMode.AUTO,
                      )
                  )
                  extra["tool_config"] = gemini_tool_choice
              elif tool_choice == "none":
                  gemini_tool_choice = types.ToolConfig(
                      function_calling_config=types.FunctionCallingConfig(
-                         mode="NONE",
+                         mode=types.FunctionCallingConfigMode.NONE,
                      )
                  )
                  extra["tool_config"] = gemini_tool_choice

          if is_given(response_format):
-             extra["response_schema"] = to_response_format(response_format)
+             extra["response_schema"] = to_response_format(response_format)  # type: ignore
              extra["response_mime_type"] = "application/json"

          if is_given(self._opts.temperature):
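Every branch above now builds its `types.ToolConfig` with the `FunctionCallingConfigMode` enum instead of bare strings, and `tool_choice="required"` collects names from both typed and raw tools. What the "required" branch produces, sketched with illustrative tool names:

    from google.genai import types

    # Equivalent of tool_choice="required" when the request exposes two
    # tools named "get_weather" and "lookup_order" (names illustrative):
    tool_config = types.ToolConfig(
        function_calling_config=types.FunctionCallingConfig(
            mode=types.FunctionCallingConfigMode.ANY,  # model must call a tool
            allowed_function_names=["get_weather", "lookup_order"],
        )
    )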
@@ -240,6 +257,8 @@
          if is_given(self._opts.thinking_config):
              extra["thinking_config"] = self._opts.thinking_config

+         gemini_tools = gemini_tools if is_given(gemini_tools) else self._opts.gemini_tools
+
          return LLMStream(
              self,
              client=self._client,
@@ -247,6 +266,7 @@
              chat_ctx=chat_ctx,
              tools=tools or [],
              conn_options=conn_options,
+             gemini_tools=gemini_tools,
              extra_kwargs=extra,
          )

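`chat()` now also takes a per-call `gemini_tools` override, falling back to the session-level value when not given. A usage sketch, assuming the `ChatContext.empty()` and `add_message()` helpers from livekit-agents 1.x and the `session_llm` instance from the earlier sketch:

    from google.genai import types
    from livekit.agents import llm as agents_llm

    ctx = agents_llm.ChatContext.empty()
    ctx.add_message(role="user", content="Summarize https://example.com")

    # Per-call override: this one request gets URL-context grounding even
    # if the session was constructed without any gemini_tools.
    stream = session_llm.chat(chat_ctx=ctx, gemini_tools=[types.UrlContext()])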
@@ -256,32 +276,45 @@ class LLMStream(llm.LLMStream):
          self,
          llm: LLM,
          *,
-         client: genai.Client,
+         client: Client,
          model: str | ChatModels,
          chat_ctx: llm.ChatContext,
          conn_options: APIConnectOptions,
-         tools: list[FunctionTool],
+         tools: list[FunctionTool | RawFunctionTool],
          extra_kwargs: dict[str, Any],
+         gemini_tools: NotGivenOr[list[_LLMTool]] = NOT_GIVEN,
      ) -> None:
          super().__init__(llm, chat_ctx=chat_ctx, tools=tools, conn_options=conn_options)
          self._client = client
          self._model = model
          self._llm: LLM = llm
          self._extra_kwargs = extra_kwargs
+         self._gemini_tools = gemini_tools

      async def _run(self) -> None:
          retryable = True
          request_id = utils.shortuuid()

          try:
-             turns, system_instruction = to_chat_ctx(self._chat_ctx, id(self._llm), generate=True)
+             turns_dict, extra_data = self._chat_ctx.to_provider_format(format="google")
+             turns = [types.Content.model_validate(turn) for turn in turns_dict]
              function_declarations = to_fnc_ctx(self._tools)
-             if function_declarations:
-                 self._extra_kwargs["tools"] = [
-                     types.Tool(function_declarations=function_declarations)
-                 ]
+             tools_config = create_tools_config(
+                 function_tools=function_declarations,
+                 gemini_tools=self._gemini_tools if is_given(self._gemini_tools) else None,
+             )
+             if tools_config:
+                 self._extra_kwargs["tools"] = tools_config
+
              config = types.GenerateContentConfig(
-                 system_instruction=system_instruction,
+                 system_instruction=(
+                     [types.Part(text=content) for content in extra_data.system_messages]
+                     if extra_data.system_messages
+                     else None
+                 ),
+                 http_options=types.HttpOptions(
+                     timeout=int(self._conn_options.timeout * 1000),
+                 ),
                  **self._extra_kwargs,
              )

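Two behavioral changes land in `_run`: the chat context now serializes itself through `to_provider_format(format="google")`, returning plain dicts plus extracted system messages that are re-validated into `types.Content` pydantic models, and the connection timeout is forwarded to Gemini as `HttpOptions.timeout` in milliseconds. The validation step in isolation:

    from google.genai import types

    # to_provider_format(format="google") yields dicts of this shape;
    # model_validate turns each one back into a typed Content object.
    turn = types.Content.model_validate(
        {"role": "user", "parts": [{"text": "Hello, Gemini"}]}
    )
    assert turn.parts[0].text == "Hello, Gemini"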
@@ -371,7 +404,7 @@
                  tool_calls=[
                      llm.FunctionToolCall(
                          arguments=json.dumps(part.function_call.args),
-                         name=part.function_call.name,
+                         name=part.function_call.name,  # type: ignore
                          call_id=part.function_call.id or utils.shortuuid("function_call_"),
                      )
                  ],
@@ -18,8 +18,9 @@ import asyncio
  import dataclasses
  import time
  import weakref
+ from collections.abc import AsyncGenerator, AsyncIterable
  from dataclasses import dataclass
- from typing import Callable, Union
+ from typing import Callable, Union, cast

  from google.api_core.client_options import ClientOptions
  from google.api_core.exceptions import DeadlineExceeded, GoogleAPICallError
@@ -140,7 +141,7 @@ class STT(stt.STT):

          if not is_given(credentials_file) and not is_given(credentials_info):
              try:
-                 gauth_default()
+                 gauth_default()  # type: ignore
              except DefaultCredentialsError:
                  raise ValueError(
                      "Application default credentials must be available "
@@ -168,9 +169,10 @@
              connect_cb=self._create_client,
          )

-     async def _create_client(self) -> SpeechAsyncClient:
+     async def _create_client(self, timeout: float) -> SpeechAsyncClient:
          # Add support for passing a specific location that matches recognizer
          # see: https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages
+         # TODO(long): how to set timeout?
          client_options = None
          client: SpeechAsyncClient | None = None
          if self._location != "global":
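The connection-pool contract changed here: the `connect_cb` now receives the per-attempt timeout (threaded through from the caller's connection options, as the later `pool.connection(timeout=...)` calls show), even though the TODO admits `SpeechAsyncClient` has no obvious knob to apply it to yet. A toy sketch of the new callback shape, not the plugin's actual pool implementation:

    import asyncio

    async def open_connection() -> str:
        await asyncio.sleep(0.01)  # stands in for real client setup
        return "client"

    # The connect callback now accepts the timeout so slow connects can be
    # bounded by the caller's connection options.
    async def connect_cb(timeout: float) -> str:
        return await asyncio.wait_for(open_connection(), timeout)

    asyncio.run(connect_cb(timeout=10.0))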
@@ -198,7 +200,7 @@
              except AttributeError:
                  from google.auth import default as ga_default

-                 _, project_id = ga_default()
+                 _, project_id = ga_default()  # type: ignore
          return f"projects/{project_id}/locations/{self._location}/recognizers/_"

      def _sanitize_options(self, *, language: NotGivenOr[str] = NOT_GIVEN) -> STTOptions:
@@ -243,7 +245,7 @@
          )

          try:
-             async with self._pool.connection() as client:
+             async with self._pool.connection(timeout=conn_options.timeout) as client:
                  raw = await client.recognize(
                      cloud_speech.RecognizeRequest(
                          recognizer=self._get_recognizer(client),
@@ -289,11 +291,11 @@
          model: NotGivenOr[SpeechModels] = NOT_GIVEN,
          location: NotGivenOr[str] = NOT_GIVEN,
          keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN,
-     ):
+     ) -> None:
          if is_given(languages):
              if isinstance(languages, str):
                  languages = [languages]
-             self._config.languages = languages
+             self._config.languages = cast(list[LgType], languages)
          if is_given(detect_language):
              self._config.detect_language = detect_language
          if is_given(interim_results):
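The `cast` here is purely for the type checker: at runtime it returns its argument unchanged, so the normalized list is stored as-is. A standalone illustration, with `LgType` standing in for the plugin's language alias:

    from typing import cast

    LgType = str  # stand-in for the plugin's language type alias

    languages: str | list[str] = "en-US"
    if isinstance(languages, str):
        languages = [languages]
    # No runtime conversion: cast() only narrows the type for the checker.
    normalized = cast(list[LgType], languages)
    assert normalized == ["en-US"]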
@@ -356,11 +358,11 @@ class SpeechStream(stt.SpeechStream):
          model: NotGivenOr[SpeechModels] = NOT_GIVEN,
          min_confidence_threshold: NotGivenOr[float] = NOT_GIVEN,
          keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN,
-     ):
+     ) -> None:
          if is_given(languages):
              if isinstance(languages, str):
                  languages = [languages]
-             self._config.languages = languages
+             self._config.languages = cast(list[LgType], languages)
          if is_given(detect_language):
              self._config.detect_language = detect_language
          if is_given(interim_results):
@@ -381,7 +383,9 @@
      async def _run(self) -> None:
          # google requires a async generator when calling streaming_recognize
          # this function basically convert the queue into a async generator
-         async def input_generator(client: SpeechAsyncClient, should_stop: asyncio.Event):
+         async def input_generator(
+             client: SpeechAsyncClient, should_stop: asyncio.Event
+         ) -> AsyncGenerator[cloud_speech.StreamingRecognizeRequest, None]:
              try:
                  # first request should contain the config
                  yield cloud_speech.StreamingRecognizeRequest(
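The inner generator's new annotation uses `AsyncGenerator[YieldType, SendType]`; the send type is `None` because nothing is ever sent back into the generator. The same pattern in a standalone form:

    import asyncio
    from collections.abc import AsyncGenerator

    async def numbers(limit: int) -> AsyncGenerator[int, None]:
        # Yields ints; the second type parameter is the send type.
        for i in range(limit):
            yield i

    async def main() -> None:
        async for n in numbers(3):
            print(n)

    asyncio.run(main())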
@@ -402,7 +406,10 @@
              except Exception:
                  logger.exception("an error occurred while streaming input to google STT")

-         async def process_stream(client: SpeechAsyncClient, stream):
+         async def process_stream(
+             client: SpeechAsyncClient,
+             stream: AsyncIterable[cloud_speech.StreamingRecognizeResponse],
+         ) -> None:
              has_started = False
              async for resp in stream:
                  if (
@@ -464,7 +471,7 @@

          while True:
              try:
-                 async with self._pool.connection(timeout=self._conn_options.timeout) as client:
                      self._streaming_config = cloud_speech.StreamingRecognitionConfig(
                          config=cloud_speech.RecognitionConfig(
                              explicit_decoding_config=cloud_speech.ExplicitDecodingConfig(
@@ -0,0 +1,11 @@
+ from typing import Union
+
+ from google.genai.types import (
+     GoogleMaps,
+     GoogleSearch,
+     GoogleSearchRetrieval,
+     ToolCodeExecution,
+     UrlContext,
+ )
+
+ _LLMTool = Union[GoogleSearchRetrieval, ToolCodeExecution, GoogleSearch, UrlContext, GoogleMaps]
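This new module (imported above as `from .tools import _LLMTool`) defines the union accepted by the `gemini_tools` parameters. Downstream, `create_tools_config` presumably wraps each entry in a `types.Tool`; built by hand, those wrappers would look like this, using the google-genai `Tool` model's dedicated fields:

    from google.genai import types

    # Each built-in maps onto its own field of types.Tool:
    search_tool = types.Tool(google_search=types.GoogleSearch())
    code_tool = types.Tool(code_execution=types.ToolCodeExecution())
    url_tool = types.Tool(url_context=types.UrlContext())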