PyPI - livekit-plugins-google - Versions diffs - 0.11.1__py3-none-any.whl → 1.0.0.dev4__py3-none-any.whl - Mend

livekit-plugins-google 0.11.1__py3-none-any.whl → 1.0.0.dev4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

livekit/plugins/google/beta/realtime/__init__.py +1 -5
livekit/plugins/google/beta/realtime/api_proto.py +3 -2
livekit/plugins/google/beta/realtime/realtime_api.py +22 -51
livekit/plugins/google/beta/realtime/transcriber.py +11 -27
livekit/plugins/google/llm.py +127 -197
livekit/plugins/google/stt.py +28 -58
livekit/plugins/google/tts.py +10 -16
livekit/plugins/google/utils.py +213 -0
livekit/plugins/google/version.py +1 -1
{livekit_plugins_google-0.11.1.dist-info → livekit_plugins_google-1.0.0.dev4.dist-info}/METADATA +12 -22
livekit_plugins_google-1.0.0.dev4.dist-info/RECORD +17 -0
{livekit_plugins_google-0.11.1.dist-info → livekit_plugins_google-1.0.0.dev4.dist-info}/WHEEL +1 -2
livekit/plugins/google/_utils.py +0 -199
livekit_plugins_google-0.11.1.dist-info/RECORD +0 -18
livekit_plugins_google-0.11.1.dist-info/top_level.txt +0 -1

livekit/plugins/google/beta/realtime/__init__.py CHANGED Viewed

@@ -1,8 +1,4 @@
-from .api_proto import (
-    ClientEvents,
-    LiveAPIModels,
-    Voice,
-)
+from .api_proto import ClientEvents, LiveAPIModels, Voice
 from .realtime_api import RealtimeModel
 __all__ = [

livekit/plugins/google/beta/realtime/api_proto.py CHANGED Viewed

@@ -1,12 +1,13 @@
 from __future__ import annotations
-from typing import Literal, Sequence, Union
+from collections.abc import Sequence
+from typing import Literal, Union
 from google.genai import types
 from ..._utils import _build_gemini_ctx, _build_tools
-LiveAPIModels = Literal["gemini-2.0-flash-exp"]
+LiveAPIModels = Literal["gemini-2.0-flash-001",]
 Voice = Literal["Puck", "Charon", "Kore", "Fenrir", "Aoede"]

livekit/plugins/google/beta/realtime/realtime_api.py CHANGED Viewed

@@ -3,21 +3,17 @@ from __future__ import annotations
 import asyncio
 import json
 import os
+from collections.abc import AsyncIterable
 from dataclasses import dataclass
-from typing import AsyncIterable, Literal
-from livekit import rtc
-from livekit.agents import llm, utils
-from livekit.agents.llm.function_context import _create_ai_function_info
-from livekit.agents.utils import images
+from typing import Literal
 from google import genai
+from google.genai._api_client import HttpOptions
 from google.genai.types import (
     Blob,
     Content,
     FunctionResponse,
     GenerationConfig,
-    HttpOptions,
     LiveClientContent,
     LiveClientRealtimeInput,
     LiveClientToolResponse,
@@ -29,15 +25,13 @@ from google.genai.types import (
     Tool,
     VoiceConfig,
 )
+from livekit import rtc
+from livekit.agents import llm, utils
+from livekit.agents.llm.function_context import _create_ai_function_info
+from livekit.agents.utils import images
 from ...log import logger
-from .api_proto import (
-    ClientEvents,
-    LiveAPIModels,
-    Voice,
-    _build_gemini_ctx,
-    _build_tools,
-)
+from .api_proto import ClientEvents, LiveAPIModels, Voice, _build_gemini_ctx, _build_tools
 from .transcriber import ModelTranscriber, TranscriberSession, TranscriptionContent
 EventTypes = Literal[
@@ -83,7 +77,6 @@ class Capabilities:
 class ModelOptions:
     model: LiveAPIModels | str
     api_key: str | None
-    api_version: str
     voice: Voice | str
     response_modalities: list[Modality] | None
     vertexai: bool
@@ -108,9 +101,8 @@ class RealtimeModel:
         instructions: str | None = None,
         model: LiveAPIModels | str = "gemini-2.0-flash-exp",
         api_key: str | None = None,
-        api_version: str = "v1alpha",
         voice: Voice | str = "Puck",
-        modalities: list[Modality] = [Modality.AUDIO],
+        modalities: list[Modality] = None,
         enable_user_audio_transcription: bool = True,
         enable_agent_audio_transcription: bool = True,
         vertexai: bool = False,
@@ -138,7 +130,6 @@ class RealtimeModel:
         Args:
             instructions (str, optional): Initial system instructions for the model. Defaults to "".
             api_key (str or None, optional): Google Gemini API key. If None, will attempt to read from the environment variable GOOGLE_API_KEY.
-            api_version (str, optional): The version of the API to use. Defaults to "v1alpha".
             modalities (list[Modality], optional): Modalities to use, such as ["TEXT", "AUDIO"]. Defaults to ["AUDIO"].
             model (str or None, optional): The name of the model to use. Defaults to "gemini-2.0-flash-exp".
             voice (api_proto.Voice, optional): Voice setting for audio outputs. Defaults to "Puck".
@@ -158,6 +149,8 @@ class RealtimeModel:
         Raises:
             ValueError: If the API key is not provided and cannot be found in environment variables.
         """
+        if modalities is None:
+            modalities = ["AUDIO"]
         super().__init__()
         self._capabilities = Capabilities(
             supports_truncate=False,
@@ -183,14 +176,11 @@ class RealtimeModel:
                     "API key is required for Google API either via api_key or GOOGLE_API_KEY environment variable"
                 )
-        instructions_content = (
-            Content(parts=[Part(text=instructions)]) if instructions else None
-        )
+        instructions_content = Content(parts=[Part(text=instructions)]) if instructions else None
         self._rt_sessions: list[GeminiRealtimeSession] = []
         self._opts = ModelOptions(
             model=model,
-            api_version=api_version,
             api_key=self._api_key,
             voice=voice,
             enable_user_audio_transcription=enable_user_audio_transcription,
@@ -263,8 +253,6 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
         self._fnc_ctx = fnc_ctx
         self._fnc_tasks = utils.aio.TaskSet()
         self._is_interrupted = False
-        self._playout_complete = asyncio.Event()
-        self._playout_complete.set()
         tools = []
         if self._fnc_ctx is not None:
@@ -285,32 +273,24 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
             system_instruction=self._opts.instructions,
             speech_config=SpeechConfig(
                 voice_config=VoiceConfig(
-                    prebuilt_voice_config=PrebuiltVoiceConfig(
-                        voice_name=self._opts.voice
-                    )
+                    prebuilt_voice_config=PrebuiltVoiceConfig(voice_name=self._opts.voice)
                 )
             ),
             tools=tools,
         )
         self._client = genai.Client(
-            http_options=HttpOptions(api_version=self._opts.api_version),
+            http_options=HttpOptions(api_version="v1alpha"),
             api_key=self._opts.api_key,
             vertexai=self._opts.vertexai,
             project=self._opts.project,
             location=self._opts.location,
         )
-        self._main_atask = asyncio.create_task(
-            self._main_task(), name="gemini-realtime-session"
-        )
+        self._main_atask = asyncio.create_task(self._main_task(), name="gemini-realtime-session")
         if self._opts.enable_user_audio_transcription:
-            self._transcriber = TranscriberSession(
-                client=self._client, model=self._opts.model
-            )
+            self._transcriber = TranscriberSession(client=self._client, model=self._opts.model)
             self._transcriber.on("input_speech_done", self._on_input_speech_done)
         if self._opts.enable_agent_audio_transcription:
-            self._agent_transcriber = ModelTranscriber(
-                client=self._client, model=self._opts.model
-            )
+            self._agent_transcriber = ModelTranscriber(client=self._client, model=self._opts.model)
             self._agent_transcriber.on("input_speech_done", self._on_agent_speech_done)
         # init dummy task
         self._init_sync_task = asyncio.create_task(asyncio.sleep(0))
@@ -324,10 +304,6 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
         self._send_ch.close()
         await self._main_atask
-    @property
-    def playout_complete(self) -> asyncio.Event | None:
-        return self._playout_complete
     @property
     def fnc_ctx(self) -> llm.FunctionContext | None:
         return self._fnc_ctx
@@ -345,9 +321,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
     DEFAULT_ENCODE_OPTIONS = images.EncodeOptions(
         format="JPEG",
         quality=75,
-        resize_options=images.ResizeOptions(
-            width=1024, height=1024, strategy="scale_aspect_fit"
-        ),
+        resize_options=images.ResizeOptions(width=1024, height=1024, strategy="scale_aspect_fit"),
     )
     def push_video(
@@ -397,9 +371,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
     def create_response(
         self,
-        on_duplicate: Literal[
-            "cancel_existing", "cancel_new", "keep_both"
-        ] = "keep_both",
+        on_duplicate: Literal["cancel_existing", "cancel_new", "keep_both"] = "keep_both",
     ) -> None:
         turns, _ = _build_gemini_ctx(self._chat_ctx, id(self))
         ctx = [self._opts.instructions] + turns if self._opts.instructions else turns
@@ -485,8 +457,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
                                         data=part.inline_data.data,
                                         sample_rate=24000,
                                         num_channels=1,
-                                        samples_per_channel=len(part.inline_data.data)
-                                        // 2,
+                                        samples_per_channel=len(part.inline_data.data) // 2,
                                     )
                                     if self._opts.enable_agent_audio_transcription:
                                         content.audio.append(frame)
@@ -529,12 +500,12 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
                         logger.warning(
                             "function call cancelled",
                             extra={
-                                "function_call_ids": response.tool_call_cancellation.ids,
+                                "function_call_ids": response.tool_call_cancellation.function_call_ids,
                             },
                         )
                         self.emit(
                             "function_calls_cancelled",
-                            response.tool_call_cancellation.ids,
+                            response.tool_call_cancellation.function_call_ids,
                         )
         async with self._client.aio.live.connect(

livekit/plugins/google/beta/realtime/transcriber.py CHANGED Viewed

@@ -6,12 +6,12 @@ from dataclasses import dataclass
 from typing import Literal
 import websockets
-from livekit import rtc
-from livekit.agents import APIConnectionError, APIStatusError, utils
 from google import genai
 from google.genai import types
 from google.genai.errors import APIError, ClientError, ServerError
+from livekit import rtc
+from livekit.agents import APIConnectionError, APIStatusError, utils
 from ...log import logger
 from .api_proto import ClientEvents, LiveAPIModels
@@ -51,11 +51,9 @@ class TranscriberSession(utils.EventEmitter[EventTypes]):
         self._needed_sr = 16000
         self._closed = False
-        system_instructions = types.Content(
-            parts=[types.Part(text=SYSTEM_INSTRUCTIONS)]
-        )
+        system_instructions = types.Content(parts=[types.Part(text=SYSTEM_INSTRUCTIONS)])
         self._config = types.LiveConnectConfig(
-            response_modalities=[types.Modality.TEXT],
+            response_modalities=["TEXT"],
             system_instruction=system_instructions,
             generation_config=types.GenerationConfig(temperature=0.0),
         )
@@ -81,17 +79,13 @@ class TranscriberSession(utils.EventEmitter[EventTypes]):
             for f in self._resampler.push(frame):
                 self._queue_msg(
                     types.LiveClientRealtimeInput(
-                        media_chunks=[
-                            types.Blob(data=f.data.tobytes(), mime_type="audio/pcm")
-                        ]
+                        media_chunks=[types.Blob(data=f.data.tobytes(), mime_type="audio/pcm")]
                     )
                 )
         else:
             self._queue_msg(
                 types.LiveClientRealtimeInput(
-                    media_chunks=[
-                        types.Blob(data=frame.data.tobytes(), mime_type="audio/pcm")
-                    ]
+                    media_chunks=[types.Blob(data=frame.data.tobytes(), mime_type="audio/pcm")]
                 )
             )
@@ -157,17 +151,11 @@ class TranscriberSession(utils.EventEmitter[EventTypes]):
                 logger.exception(f"Uncaught error in transcriber _recv_task: {e}")
                 self._closed = True
-        async with self._client.aio.live.connect(
-            model=self._model, config=self._config
-        ) as session:
+        async with self._client.aio.live.connect(model=self._model, config=self._config) as session:
             self._session = session
             tasks = [
-                asyncio.create_task(
-                    _send_task(), name="gemini-realtime-transcriber-send"
-                ),
-                asyncio.create_task(
-                    _recv_task(), name="gemini-realtime-transcriber-recv"
-                ),
+                asyncio.create_task(_send_task(), name="gemini-realtime-transcriber-send"),
+                asyncio.create_task(_recv_task(), name="gemini-realtime-transcriber-recv"),
             ]
             try:
@@ -187,9 +175,7 @@ class ModelTranscriber(utils.EventEmitter[EventTypes]):
         self._client = client
         self._model = model
         self._needed_sr = 16000
-        self._system_instructions = types.Content(
-            parts=[types.Part(text=SYSTEM_INSTRUCTIONS)]
-        )
+        self._system_instructions = types.Content(parts=[types.Part(text=SYSTEM_INSTRUCTIONS)])
         self._config = types.GenerateContentConfig(
             temperature=0.0,
             system_instruction=self._system_instructions,
@@ -198,9 +184,7 @@ class ModelTranscriber(utils.EventEmitter[EventTypes]):
         self._resampler: rtc.AudioResampler | None = None
         self._buffer: rtc.AudioFrame | None = None
         self._audio_ch = utils.aio.Chan[rtc.AudioFrame]()
-        self._main_atask = asyncio.create_task(
-            self._main_task(), name="gemini-model-transcriber"
-        )
+        self._main_atask = asyncio.create_task(self._main_task(), name="gemini-model-transcriber")
     async def aclose(self) -> None:
         if self._audio_ch.closed:

livekit-plugins-google 0.11.1__py3-none-any.whl → 1.0.0.dev4__py3-none-any.whl

livekit-plugins-google 0.11.1__py3-none-any.whl → 1.0.0.dev4__py3-none-any.whl