livekit-plugins-google 1.0.19__py3-none-any.whl → 1.0.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/google/__init__.py +7 -0
- livekit/plugins/google/beta/__init__.py +9 -0
- livekit/plugins/google/beta/realtime/realtime_api.py +202 -95
- livekit/plugins/google/llm.py +21 -16
- livekit/plugins/google/models.py +2 -0
- livekit/plugins/google/stt.py +15 -3
- livekit/plugins/google/utils.py +39 -10
- livekit/plugins/google/version.py +1 -1
- livekit_plugins_google-1.0.21.dist-info/METADATA +47 -0
- livekit_plugins_google-1.0.21.dist-info/RECORD +16 -0
- livekit_plugins_google-1.0.19.dist-info/METADATA +0 -99
- livekit_plugins_google-1.0.19.dist-info/RECORD +0 -16
- {livekit_plugins_google-1.0.19.dist-info → livekit_plugins_google-1.0.21.dist-info}/WHEEL +0 -0
livekit/plugins/google/__init__.py CHANGED
@@ -12,6 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+"""Google AI plugin for LiveKit Agents
+
+Supports Gemini, Cloud Speech-to-Text, and Cloud Text-to-Speech.
+
+See https://docs.livekit.io/agents/integrations/stt/google/ for more information.
+"""
+
 from . import beta
 from .llm import LLM
 from .stt import STT, SpeechStream
livekit/plugins/google/beta/realtime/realtime_api.py CHANGED
@@ -4,14 +4,16 @@ import asyncio
 import contextlib
 import json
 import os
+import time
 import weakref
 from collections.abc import Iterator
-from dataclasses import dataclass
+from dataclasses import dataclass, field

 from google import genai
 from google.genai.live import AsyncSession
 from google.genai.types import (
     AudioTranscriptionConfig,
+    AutomaticActivityDetection,
     Blob,
     Content,
     FunctionDeclaration,
@@ -25,8 +27,10 @@ from google.genai.types import (
     LiveServerToolCall,
     LiveServerToolCallCancellation,
     Modality,
+    ModalityTokenCount,
     Part,
     PrebuiltVoiceConfig,
+    RealtimeInputConfig,
     SessionResumptionConfig,
     SpeechConfig,
     Tool,
@@ -35,19 +39,20 @@ from google.genai.types import (
 )
 from livekit import rtc
 from livekit.agents import llm, utils
+from livekit.agents.metrics import RealtimeModelMetrics
 from livekit.agents.types import NOT_GIVEN, NotGivenOr
 from livekit.agents.utils import audio as audio_utils, images, is_given
 from livekit.plugins.google.beta.realtime.api_proto import ClientEvents, LiveAPIModels, Voice

 from ...log import logger
-from ...utils import to_chat_ctx, to_fnc_ctx
+from ...utils import get_tool_results_for_realtime, to_chat_ctx, to_fnc_ctx

 INPUT_AUDIO_SAMPLE_RATE = 16000
 INPUT_AUDIO_CHANNELS = 1
 OUTPUT_AUDIO_SAMPLE_RATE = 24000
 OUTPUT_AUDIO_CHANNELS = 1

-DEFAULT_ENCODE_OPTIONS = images.EncodeOptions(
+DEFAULT_IMAGE_ENCODE_OPTIONS = images.EncodeOptions(
     format="JPEG",
     quality=75,
     resize_options=images.ResizeOptions(width=1024, height=1024, strategy="scale_aspect_fit"),
@@ -80,13 +85,7 @@ class _RealtimeOptions:
     instructions: NotGivenOr[str]
     input_audio_transcription: AudioTranscriptionConfig | None
     output_audio_transcription: AudioTranscriptionConfig | None
-
-
-@dataclass
-class _MessageGeneration:
-    message_id: str
-    text_ch: utils.aio.Chan[str]
-    audio_ch: utils.aio.Chan[rtc.AudioFrame]
+    image_encode_options: NotGivenOr[images.EncodeOptions]


 @dataclass
@@ -94,7 +93,19 @@ class _ResponseGeneration:
     message_ch: utils.aio.Chan[llm.MessageGeneration]
     function_ch: utils.aio.Chan[llm.FunctionCall]

-    messages: dict[str, _MessageGeneration]
+    response_id: str
+    text_ch: utils.aio.Chan[str]
+    audio_ch: utils.aio.Chan[rtc.AudioFrame]
+    input_transcription: str = ""
+
+    _created_timestamp: float = field(default_factory=time.time)
+    """The timestamp when the generation is created"""
+    _first_token_timestamp: float | None = None
+    """The timestamp when the first audio token is received"""
+    _completed_timestamp: float | None = None
+    """The timestamp when the generation is completed"""
+    _done: bool = False
+    """Whether the generation is done (set when the turn is complete)"""


 class RealtimeModel(llm.RealtimeModel):
@@ -102,12 +113,12 @@ class RealtimeModel(llm.RealtimeModel):
         self,
         *,
         instructions: NotGivenOr[str] = NOT_GIVEN,
-        model: LiveAPIModels | str = "gemini-2.0-flash-live-001",
+        model: NotGivenOr[LiveAPIModels | str] = NOT_GIVEN,
         api_key: NotGivenOr[str] = NOT_GIVEN,
         voice: Voice | str = "Puck",
         language: NotGivenOr[str] = NOT_GIVEN,
         modalities: NotGivenOr[list[Modality]] = NOT_GIVEN,
-        vertexai: bool = False,
+        vertexai: NotGivenOr[bool] = NOT_GIVEN,
         project: NotGivenOr[str] = NOT_GIVEN,
         location: NotGivenOr[str] = NOT_GIVEN,
         candidate_count: int = 1,
@@ -119,12 +130,13 @@ class RealtimeModel(llm.RealtimeModel):
         frequency_penalty: NotGivenOr[float] = NOT_GIVEN,
         input_audio_transcription: NotGivenOr[AudioTranscriptionConfig | None] = NOT_GIVEN,
         output_audio_transcription: NotGivenOr[AudioTranscriptionConfig | None] = NOT_GIVEN,
+        image_encode_options: NotGivenOr[images.EncodeOptions] = NOT_GIVEN,
     ) -> None:
         """
         Initializes a RealtimeModel instance for interacting with Google's Realtime API.

         Environment Requirements:
-        - For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of the service account key file.
+        - For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of the service account key file or use any of the other Google Cloud auth methods.
         The Google Cloud project and location can be set via `project` and `location` arguments or the environment variables
         `GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_LOCATION`. By default, the project is inferred from the service account key file,
         and the location defaults to "us-central1".
@@ -134,7 +146,7 @@ class RealtimeModel(llm.RealtimeModel):
             instructions (str, optional): Initial system instructions for the model. Defaults to "".
             api_key (str, optional): Google Gemini API key. If None, will attempt to read from the environment variable GOOGLE_API_KEY.
             modalities (list[Modality], optional): Modalities to use, such as ["TEXT", "AUDIO"]. Defaults to ["AUDIO"].
-            model (str, optional): The name of the model to use. Defaults to "gemini-2.0-flash-live-001".
+            model (str, optional): The name of the model to use. Defaults to "gemini-2.0-flash-live-001" or "gemini-2.0-flash-exp" (vertexai).
             voice (api_proto.Voice, optional): Voice setting for audio outputs. Defaults to "Puck".
             language (str, optional): The language(BCP-47 Code) to use for the API. supported languages - https://ai.google.dev/gemini-api/docs/live#supported-languages
             temperature (float, optional): Sampling temperature for response generation. Defaults to 0.8.
@@ -148,26 +160,48 @@ class RealtimeModel(llm.RealtimeModel):
             frequency_penalty (float, optional): The frequency penalty for response generation
             input_audio_transcription (AudioTranscriptionConfig | None, optional): The configuration for input audio transcription. Defaults to None.)
             output_audio_transcription (AudioTranscriptionConfig | None, optional): The configuration for output audio transcription. Defaults to AudioTranscriptionConfig().
+            image_encode_options (images.EncodeOptions, optional): The configuration for image encoding. Defaults to DEFAULT_ENCODE_OPTIONS.

         Raises:
             ValueError: If the API key is required but not found.
         """  # noqa: E501
+        if not is_given(input_audio_transcription):
+            input_audio_transcription = AudioTranscriptionConfig()
+        if not is_given(output_audio_transcription):
+            output_audio_transcription = AudioTranscriptionConfig()
+
         super().__init__(
             capabilities=llm.RealtimeCapabilities(
                 message_truncation=False,
                 turn_detection=True,
-                user_transcription=
+                user_transcription=input_audio_transcription is not None,
+                auto_tool_reply_generation=True,
             )
         )

+        if not is_given(model):
+            if vertexai:
+                model = "gemini-2.0-flash-exp"
+            else:
+                model = "gemini-2.0-flash-live-001"
+
         gemini_api_key = api_key if is_given(api_key) else os.environ.get("GOOGLE_API_KEY")
         gcp_project = project if is_given(project) else os.environ.get("GOOGLE_CLOUD_PROJECT")
-        gcp_location =
+        gcp_location = (
+            location
+            if is_given(location)
+            else os.environ.get("GOOGLE_CLOUD_LOCATION") or "us-central1"
+        )
+        use_vertexai = (
+            vertexai
+            if is_given(vertexai)
+            else os.environ.get("GOOGLE_GENAI_USE_VERTEXAI", "0").lower() in ["true", "1"]
+        )

-        if vertexai:
+        if use_vertexai:
             if not gcp_project or not gcp_location:
                 raise ValueError(
-                    "Project
+                    "Project is required for VertexAI via project kwarg or GOOGLE_CLOUD_PROJECT environment variable"  # noqa: E501
                 )
             gemini_api_key = None  # VertexAI does not require an API key
         else:
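Note: the constructor above now derives its defaults from the environment when arguments are omitted. A minimal, self-contained sketch of that resolution logic (a hypothetical helper written for illustration, not part of the plugin's API):

```python
import os

def resolve_realtime_defaults(
    model: str | None = None,
    vertexai: bool | None = None,
    location: str | None = None,
) -> tuple[str, bool, str]:
    # Vertex AI can be opted into via the GOOGLE_GENAI_USE_VERTEXAI env var.
    use_vertexai = (
        vertexai
        if vertexai is not None
        else os.environ.get("GOOGLE_GENAI_USE_VERTEXAI", "0").lower() in ["true", "1"]
    )
    # The Gemini API and Vertex AI expose the live model under different names.
    if model is None:
        model = "gemini-2.0-flash-exp" if use_vertexai else "gemini-2.0-flash-live-001"
    gcp_location = location or os.environ.get("GOOGLE_CLOUD_LOCATION") or "us-central1"
    return model, use_vertexai, gcp_location

# With no relevant env vars set:
print(resolve_realtime_defaults())
# ('gemini-2.0-flash-live-001', False, 'us-central1')
```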
@@ -178,17 +212,12 @@ class RealtimeModel(llm.RealtimeModel):
                     "API key is required for Google API either via api_key or GOOGLE_API_KEY environment variable"  # noqa: E501
                 )

-        if not is_given(input_audio_transcription):
-            input_audio_transcription = None
-        if not is_given(output_audio_transcription):
-            output_audio_transcription = AudioTranscriptionConfig()
-
         self._opts = _RealtimeOptions(
             model=model,
             api_key=gemini_api_key,
             voice=voice,
             response_modalities=modalities,
-            vertexai=vertexai,
+            vertexai=use_vertexai,
             project=gcp_project,
             location=gcp_location,
             candidate_count=candidate_count,
@@ -202,6 +231,7 @@ class RealtimeModel(llm.RealtimeModel):
             input_audio_transcription=input_audio_transcription,
             output_audio_transcription=output_audio_transcription,
             language=language,
+            image_encode_options=image_encode_options,
         )

         self._sessions = weakref.WeakSet[RealtimeSession]()
@@ -262,7 +292,6 @@ class RealtimeSession(llm.RealtimeSession):

         self._session_resumption_handle: str | None = None

-        self._update_lock = asyncio.Lock()
         self._session_lock = asyncio.Lock()

     async def _close_active_session(self) -> None:
@@ -281,55 +310,59 @@ class RealtimeSession(llm.RealtimeSession):
         # reset the msg_ch, do not send messages from previous session
         self._msg_ch = utils.aio.Chan[ClientEvents]()

-
+    def update_options(
         self,
         *,
         voice: NotGivenOr[str] = NOT_GIVEN,
         temperature: NotGivenOr[float] = NOT_GIVEN,
         tool_choice: NotGivenOr[llm.ToolChoice | None] = NOT_GIVEN,
     ) -> None:
-
-
-
-
-        should_restart = True
+        should_restart = False
+        if is_given(voice) and self._opts.voice != voice:
+            self._opts.voice = voice
+            should_restart = True

-
-
-
+        if is_given(temperature) and self._opts.temperature != temperature:
+            self._opts.temperature = temperature if is_given(temperature) else NOT_GIVEN
+            should_restart = True

-
-
+        if should_restart:
+            self._mark_restart_needed()

     async def update_instructions(self, instructions: str) -> None:
-
-
-
-        self._mark_restart_needed()
+        if not is_given(self._opts.instructions) or self._opts.instructions != instructions:
+            self._opts.instructions = instructions
+            self._mark_restart_needed()

     async def update_chat_ctx(self, chat_ctx: llm.ChatContext) -> None:
-
-
-
-
-
+        diff_ops = llm.utils.compute_chat_ctx_diff(self._chat_ctx, chat_ctx)
+
+        if diff_ops.to_remove:
+            logger.warning("Gemini Live does not support removing messages")
+
+        append_ctx = llm.ChatContext.empty()
+        for _, item_id in diff_ops.to_create:
+            item = chat_ctx.get_by_id(item_id)
+            if item:
+                append_ctx.items.append(item)
+
+        if append_ctx.items:
+            turns, _ = to_chat_ctx(append_ctx, id(self), ignore_functions=True)
+            tool_results = get_tool_results_for_realtime(append_ctx, vertexai=self._opts.vertexai)
         if turns:
             self._send_client_event(LiveClientContent(turns=turns, turn_complete=False))
         if tool_results:
             self._send_client_event(tool_results)

     async def update_tools(self, tools: list[llm.FunctionTool]) -> None:
-
-
-
-
-
-
-
-
-        self._gemini_declarations = new_declarations
-        self._tools = llm.ToolContext(tools)
-        self._mark_restart_needed()
+        new_declarations: list[FunctionDeclaration] = to_fnc_ctx(tools)
+        current_tool_names = {f.name for f in self._gemini_declarations}
+        new_tool_names = {f.name for f in new_declarations}
+
+        if current_tool_names != new_tool_names:
+            self._gemini_declarations = new_declarations
+            self._tools = llm.ToolContext(tools)
+            self._mark_restart_needed()

     @property
     def chat_ctx(self) -> llm.ChatContext:
@@ -348,7 +381,9 @@ class RealtimeSession(llm.RealtimeSession):
         self._send_client_event(realtime_input)

     def push_video(self, frame: rtc.VideoFrame) -> None:
-        encoded_data = images.encode(frame, DEFAULT_ENCODE_OPTIONS)
+        encoded_data = images.encode(
+            frame, self._opts.image_encode_options or DEFAULT_IMAGE_ENCODE_OPTIONS
+        )
         realtime_input = LiveClientRealtimeInput(
             media_chunks=[Blob(data=encoded_data, mime_type="image/jpeg")]
         )
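Note: `push_video` now honors a caller-supplied encoder configuration, falling back to `DEFAULT_IMAGE_ENCODE_OPTIONS`. A sketch of passing custom options (the values here are illustrative, not recommendations):

```python
from livekit.agents.utils import images
from livekit.plugins import google

# Smaller frames and lower JPEG quality reduce bandwidth for video input.
model = google.beta.realtime.RealtimeModel(
    image_encode_options=images.EncodeOptions(
        format="JPEG",
        quality=50,
        resize_options=images.ResizeOptions(width=512, height=512, strategy="scale_aspect_fit"),
    ),
)
```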
@@ -418,7 +453,7 @@ class RealtimeSession(llm.RealtimeSession):
         self._response_created_futures.clear()

         if self._current_generation:
-            self.
+            self._mark_current_generation_done()

     @utils.log_exceptions(logger=logger)
     async def _main_task(self):
@@ -512,7 +547,7 @@ class RealtimeSession(llm.RealtimeSession):
                     break

                 async for response in session.receive():
-                    if not self._current_generation and (
+                    if (not self._current_generation or self._current_generation._done) and (
                         response.server_content or response.tool_call
                     ):
                         self._start_new_generation()
@@ -543,7 +578,7 @@ class RealtimeSession(llm.RealtimeSession):
                 logger.error(f"error in receive task: {e}", exc_info=e)
                 self._mark_restart_needed()
             finally:
-                self.
+                self._mark_current_generation_done()

     def _build_connect_config(self) -> LiveConnectConfig:
         temp = self._opts.temperature if is_given(self._opts.temperature) else None
@@ -580,32 +615,31 @@ class RealtimeSession(llm.RealtimeSession):
             input_audio_transcription=self._opts.input_audio_transcription,
             output_audio_transcription=self._opts.output_audio_transcription,
             session_resumption=SessionResumptionConfig(handle=self._session_resumption_handle),
+            realtime_input_config=RealtimeInputConfig(
+                automatic_activity_detection=AutomaticActivityDetection(),
+            ),
         )

     def _start_new_generation(self):
-        if self._current_generation:
+        if self._current_generation and not self._current_generation._done:
             logger.warning("starting new generation while another is active. Finalizing previous.")
-            self.
+            self._mark_current_generation_done()

         response_id = utils.shortuuid("gemini-turn-")
         self._current_generation = _ResponseGeneration(
             message_ch=utils.aio.Chan[llm.MessageGeneration](),
             function_ch=utils.aio.Chan[llm.FunctionCall](),
-            messages={},
-        )
-
-        item_generation = _MessageGeneration(
-            message_id=response_id,
+            response_id=response_id,
             text_ch=utils.aio.Chan[str](),
             audio_ch=utils.aio.Chan[rtc.AudioFrame](),
+            _created_timestamp=time.time(),
         )
-        self._current_generation.messages[response_id] = item_generation

         self._current_generation.message_ch.send_nowait(
             llm.MessageGeneration(
                 message_id=response_id,
-                text_stream=item_generation.text_ch,
-                audio_stream=item_generation.audio_ch,
+                text_stream=self._current_generation.text_ch,
+                audio_stream=self._current_generation.audio_ch,
             )
         )
@@ -623,18 +657,18 @@ class RealtimeSession(llm.RealtimeSession):
         self.emit("generation_created", generation_event)

     def _handle_server_content(self, server_content: LiveServerContent):
-        if not self._current_generation:
+        current_gen = self._current_generation
+        if not current_gen:
             logger.warning("received server content but no active generation.")
             return

-        response_id = list(self._current_generation.messages.keys())[0]
-        item_generation = self._current_generation.messages[response_id]
-
         if model_turn := server_content.model_turn:
             for part in model_turn.parts:
                 if part.text:
-                    item_generation.text_ch.send_nowait(part.text)
+                    current_gen.text_ch.send_nowait(part.text)
                 if part.inline_data:
+                    if not current_gen._first_token_timestamp:
+                        current_gen._first_token_timestamp = time.time()
                     frame_data = part.inline_data.data
                     try:
                         frame = rtc.AudioFrame(
@@ -643,46 +677,65 @@ class RealtimeSession(llm.RealtimeSession):
                             num_channels=OUTPUT_AUDIO_CHANNELS,
                             samples_per_channel=len(frame_data) // (2 * OUTPUT_AUDIO_CHANNELS),
                         )
-                        item_generation.audio_ch.send_nowait(frame)
+                        current_gen.audio_ch.send_nowait(frame)
                     except ValueError as e:
                         logger.error(f"Error creating audio frame from Gemini data: {e}")

         if input_transcription := server_content.input_transcription:
-
+            text = input_transcription.text
+            if text:
+                if current_gen.input_transcription == "":
+                    # gemini would start with a space, which doesn't make sense
+                    # at beginning of the transcript
+                    text = text.lstrip()
+                current_gen.input_transcription += text
             self.emit(
                 "input_audio_transcription_completed",
                 llm.InputTranscriptionCompleted(
-                    item_id=response_id,
+                    item_id=current_gen.response_id,
+                    transcript=current_gen.input_transcription,
+                    is_final=False,
                 ),
             )
-            self._handle_input_speech_started()

         if output_transcription := server_content.output_transcription:
-
-
+            text = output_transcription.text
+            if text:
+                current_gen.text_ch.send_nowait(text)
+
+        if server_content.generation_complete:
+            # The only way we'd know that the transcription is complete is by when they are
+            # done with generation
+            if current_gen.input_transcription:
+                self.emit(
+                    "input_audio_transcription_completed",
+                    llm.InputTranscriptionCompleted(
+                        item_id=current_gen.response_id,
+                        transcript=current_gen.input_transcription,
+                        is_final=True,
+                    ),
+                )
+            current_gen._completed_timestamp = time.time()

         if server_content.interrupted:
-            self._finalize_response(interrupted=True)
             self._handle_input_speech_started()

         if server_content.turn_complete:
-            self.
+            self._mark_current_generation_done()

-    def
+    def _mark_current_generation_done(self) -> None:
         if not self._current_generation:
             return

         gen = self._current_generation
-
-
-
-
-        item_generation.text_ch.close()
-        if not item_generation.audio_ch.closed:
-            item_generation.audio_ch.close()
+        if not gen.text_ch.closed:
+            gen.text_ch.close()
+        if not gen.audio_ch.closed:
+            gen.audio_ch.close()

         gen.function_ch.close()
         gen.message_ch.close()
+        gen._done = True

     def _handle_input_speech_started(self):
         self.emit("input_speech_started", llm.InputSpeechStartedEvent())
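Note: the input-transcription handling above accumulates streamed chunks and strips the leading space Gemini emits on the first one. A standalone illustration:

```python
chunks = [" Hello", " there", "!"]  # as streamed by the server
transcript = ""
for text in chunks:
    if transcript == "":
        # drop the leading space on the first chunk
        text = text.lstrip()
    transcript += text
print(transcript)  # Hello there!
```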
@@ -703,7 +756,7 @@ class RealtimeSession(llm.RealtimeSession):
                 arguments=arguments,
             )
         )
-        self.
+        self._mark_current_generation_done()

     def _handle_tool_call_cancellation(
         self, tool_call_cancellation: LiveServerToolCallCancellation
@@ -714,8 +767,62 @@ class RealtimeSession(llm.RealtimeSession):
         )

     def _handle_usage_metadata(self, usage_metadata: UsageMetadata):
-
-
+        current_gen = self._current_generation
+        if not current_gen:
+            logger.warning("no active generation to report metrics for")
+            return
+
+        ttft = (
+            current_gen._first_token_timestamp - current_gen._created_timestamp
+            if current_gen._first_token_timestamp
+            else -1
+        )
+        duration = (
+            current_gen._completed_timestamp or time.time()
+        ) - current_gen._created_timestamp
+
+        def _token_details_map(
+            token_details: list[ModalityTokenCount] | None,
+        ) -> dict[Modality, int]:
+            token_details_map = {"audio_tokens": 0, "text_tokens": 0, "image_tokens": 0}
+            if not token_details:
+                return token_details_map
+
+            for token_detail in token_details:
+                if token_detail.modality == Modality.AUDIO:
+                    token_details_map["audio_tokens"] += token_detail.token_count
+                elif token_detail.modality == Modality.TEXT:
+                    token_details_map["text_tokens"] += token_detail.token_count
+                elif token_detail.modality == Modality.IMAGE:
+                    token_details_map["image_tokens"] += token_detail.token_count
+            return token_details_map
+
+        metrics = RealtimeModelMetrics(
+            label=self._realtime_model._label,
+            request_id=current_gen.response_id,
+            timestamp=current_gen._created_timestamp,
+            duration=duration,
+            ttft=ttft,
+            cancelled=False,
+            input_tokens=usage_metadata.prompt_token_count or 0,
+            output_tokens=usage_metadata.response_token_count or 0,
+            total_tokens=usage_metadata.total_token_count or 0,
+            tokens_per_second=(usage_metadata.response_token_count or 0) / duration,
+            input_token_details=RealtimeModelMetrics.InputTokenDetails(
+                **_token_details_map(usage_metadata.prompt_tokens_details),
+                cached_tokens=sum(
+                    token_detail.token_count or 0
+                    for token_detail in usage_metadata.cache_tokens_details or []
+                ),
+                cached_tokens_details=RealtimeModelMetrics.CachedTokenDetails(
+                    **_token_details_map(usage_metadata.cache_tokens_details),
+                ),
+            ),
+            output_token_details=RealtimeModelMetrics.OutputTokenDetails(
+                **_token_details_map(usage_metadata.response_tokens_details),
+            ),
+        )
+        self.emit("metrics_collected", metrics)

     def _handle_go_away(self, go_away: LiveServerGoAway):
         logger.warning(
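Note: `_handle_usage_metadata` above means each completed turn now emits a `RealtimeModelMetrics` event. A sketch of consuming it, assuming `session` is a connected `RealtimeSession` obtained from your agent wiring:

```python
from livekit.agents.metrics import RealtimeModelMetrics

def attach_metrics_logger(session) -> None:
    # Log per-turn latency and token usage as reported by the diff above.
    @session.on("metrics_collected")
    def _on_metrics(metrics: RealtimeModelMetrics) -> None:
        print(
            f"turn {metrics.request_id}: ttft={metrics.ttft:.3f}s "
            f"duration={metrics.duration:.3f}s total_tokens={metrics.total_tokens}"
        )
```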
livekit/plugins/google/llm.py CHANGED
@@ -62,7 +62,7 @@ class LLM(llm.LLM):
         *,
         model: ChatModels | str = "gemini-2.0-flash-001",
         api_key: NotGivenOr[str] = NOT_GIVEN,
-        vertexai: NotGivenOr[bool] = False,
+        vertexai: NotGivenOr[bool] = NOT_GIVEN,
         project: NotGivenOr[str] = NOT_GIVEN,
         location: NotGivenOr[str] = NOT_GIVEN,
         temperature: NotGivenOr[float] = NOT_GIVEN,
@@ -78,7 +78,7 @@ class LLM(llm.LLM):
         Create a new instance of Google GenAI LLM.

         Environment Requirements:
-        - For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of the service account key file.
+        - For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of the service account key file or use any of the other Google Cloud auth methods.
         The Google Cloud project and location can be set via `project` and `location` arguments or the environment variables
         `GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_LOCATION`. By default, the project is inferred from the service account key file,
         and the location defaults to "us-central1".
@@ -87,9 +87,9 @@ class LLM(llm.LLM):
         Args:
             model (ChatModels | str, optional): The model name to use. Defaults to "gemini-2.0-flash-001".
             api_key (str, optional): The API key for Google Gemini. If not provided, it attempts to read from the `GOOGLE_API_KEY` environment variable.
-            vertexai (bool, optional): Whether to use VertexAI. Defaults to False.
-
-
+            vertexai (bool, optional): Whether to use VertexAI. If not provided, it attempts to read from the `GOOGLE_GENAI_USE_VERTEXAI` environment variable. Defaults to False.
+            project (str, optional): The Google Cloud project to use (only for VertexAI). Defaults to None.
+            location (str, optional): The location to use for VertexAI API requests. Defaults value is "us-central1".
             temperature (float, optional): Sampling temperature for response generation. Defaults to 0.8.
             max_output_tokens (int, optional): Maximum number of tokens to generate in the output. Defaults to None.
             top_p (float, optional): The nucleus sampling probability for response generation. Defaults to None.
@@ -101,15 +101,19 @@ class LLM(llm.LLM):
         """  # noqa: E501
         super().__init__()
         gcp_project = project if is_given(project) else os.environ.get("GOOGLE_CLOUD_PROJECT")
-        gcp_location =
+        gcp_location = (
+            location
+            if is_given(location)
+            else os.environ.get("GOOGLE_CLOUD_LOCATION") or "us-central1"
+        )
+        use_vertexai = (
+            vertexai
+            if is_given(vertexai)
+            else os.environ.get("GOOGLE_GENAI_USE_VERTEXAI", "0").lower() in ["true", "1"]
+        )
         gemini_api_key = api_key if is_given(api_key) else os.environ.get("GOOGLE_API_KEY")
-        _gac = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
-        if _gac is None:
-            logger.warning(
-                "`GOOGLE_APPLICATION_CREDENTIALS` environment variable is not set. please set it to the path of the service account key file. Otherwise, use any of the other Google Cloud auth methods."  # noqa: E501
-            )

-        if vertexai:
+        if use_vertexai:
             if not gcp_project:
                 _, gcp_project = default_async(
                     scopes=["https://www.googleapis.com/auth/cloud-platform"]
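Note: like the realtime model, `LLM` now resolves Vertex AI usage from `GOOGLE_GENAI_USE_VERTEXAI` instead of warning about a missing `GOOGLE_APPLICATION_CREDENTIALS`. A sketch of the environment-driven setup (project and location values are placeholders):

```python
import os
from livekit.plugins import google

os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "true"
os.environ["GOOGLE_CLOUD_PROJECT"] = "my-gcp-project"  # placeholder
os.environ["GOOGLE_CLOUD_LOCATION"] = "us-central1"

# vertexai, project, and location are all picked up from the environment.
llm = google.LLM(model="gemini-2.0-flash-001")
```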
@@ -144,7 +148,7 @@ class LLM(llm.LLM):
             model=model,
             temperature=temperature,
             tool_choice=tool_choice,
-            vertexai=vertexai,
+            vertexai=use_vertexai,
             project=project,
             location=location,
             max_output_tokens=max_output_tokens,
@@ -156,7 +160,7 @@ class LLM(llm.LLM):
         )
         self._client = genai.Client(
             api_key=gemini_api_key,
-            vertexai=vertexai,
+            vertexai=use_vertexai,
             project=gcp_project,
             location=gcp_location,
         )
@@ -241,7 +245,7 @@ class LLM(llm.LLM):
             client=self._client,
             model=self._opts.model,
             chat_ctx=chat_ctx,
-            tools=tools,
+            tools=tools or [],
             conn_options=conn_options,
             extra_kwargs=extra,
         )
@@ -256,7 +260,7 @@ class LLMStream(llm.LLMStream):
         model: str | ChatModels,
         chat_ctx: llm.ChatContext,
         conn_options: APIConnectOptions,
-        tools: list[FunctionTool]
+        tools: list[FunctionTool],
         extra_kwargs: dict[str, Any],
     ) -> None:
         super().__init__(llm, chat_ctx=chat_ctx, tools=tools, conn_options=conn_options)
@@ -325,6 +329,7 @@ class LLMStream(llm.LLMStream):
                 usage=llm.CompletionUsage(
                     completion_tokens=usage.candidates_token_count or 0,
                     prompt_tokens=usage.prompt_token_count or 0,
+                    prompt_cached_tokens=usage.cached_content_token_count or 0,
                     total_tokens=usage.total_token_count or 0,
                 ),
             )
livekit/plugins/google/models.py CHANGED
@@ -95,6 +95,8 @@ SpeechLanguages = Literal[
 Gender = Literal["male", "female", "neutral"]

 ChatModels = Literal[
+    "gemini-2.5-pro-preview-05-06",
+    "gemini-2.5-flash-preview-04-17",
     "gemini-2.0-flash-001",
     "gemini-2.0-flash-lite-preview-02-05",
     "gemini-2.0-pro-exp-02-05",
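Note: the two new literals make the Gemini 2.5 previews valid values anywhere `ChatModels` is accepted, e.g. in the LLM constructor (API key assumed via `GOOGLE_API_KEY`):

```python
from livekit.plugins import google

llm = google.LLM(model="gemini-2.5-flash-preview-04-17")
```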
livekit/plugins/google/stt.py CHANGED
@@ -103,6 +103,7 @@ class STT(stt.STT):
         credentials_info: NotGivenOr[dict] = NOT_GIVEN,
         credentials_file: NotGivenOr[str] = NOT_GIVEN,
         keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN,
+        use_streaming: NotGivenOr[bool] = NOT_GIVEN,
     ):
         """
         Create a new instance of Google STT.
@@ -125,8 +126,13 @@ class STT(stt.STT):
             credentials_info(dict): the credentials info to use for recognition (default: None)
             credentials_file(str): the credentials file to use for recognition (default: None)
             keywords(List[tuple[str, float]]): list of keywords to recognize (default: None)
+            use_streaming(bool): whether to use streaming for recognition (default: True)
         """
-        super().__init__(capabilities=stt.STTCapabilities(streaming=True, interim_results=True))
+        if not is_given(use_streaming):
+            use_streaming = True
+        super().__init__(
+            capabilities=stt.STTCapabilities(streaming=use_streaming, interim_results=True)
+        )

         self._location = location
         self._credentials_info = credentials_info
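Note: `use_streaming=False` makes the instance advertise `streaming=False` in its STT capabilities; the agent framework can then treat it as a non-streaming recognizer (e.g. wrapping it with its stream-adapter plumbing). A sketch:

```python
from livekit.plugins import google

# Batch (non-streaming) recognition; streaming remains the default.
stt = google.STT(use_streaming=False)
```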
@@ -251,7 +257,7 @@ class STT(stt.STT):
         except DeadlineExceeded:
             raise APITimeoutError() from None
         except GoogleAPICallError as e:
-            raise APIStatusError(e.message, status_code=e.code or -1) from
+            raise APIStatusError(f"{e.message} {e.details}", status_code=e.code or -1) from e
         except Exception as e:
             raise APIConnectionError() from e
@@ -472,6 +478,7 @@ class SpeechStream(stt.SpeechStream):
                     features=cloud_speech.RecognitionFeatures(
                         enable_automatic_punctuation=self._config.punctuate,
                         enable_word_time_offsets=True,
+                        enable_spoken_punctuation=self._config.spoken_punctuation,
                     ),
                 ),
                 streaming_features=cloud_speech.StreamingRecognitionFeatures(
@@ -505,7 +512,12 @@ class SpeechStream(stt.SpeechStream):
         except DeadlineExceeded:
             raise APITimeoutError() from None
         except GoogleAPICallError as e:
-
+            if e.code == 409:
+                logger.debug("stream timed out, restarting.")
+            else:
+                raise APIStatusError(
+                    f"{e.message} {e.details}", status_code=e.code or -1
+                ) from e
         except Exception as e:
             raise APIConnectionError() from e
livekit/plugins/google/utils.py CHANGED
@@ -9,28 +9,48 @@ from pydantic import TypeAdapter

 from google.genai import types
 from livekit.agents import llm
-from livekit.agents.llm import FunctionTool, utils as llm_utils
+from livekit.agents.llm import utils as llm_utils
+from livekit.agents.llm.tool_context import (
+    FunctionTool,
+    RawFunctionTool,
+    get_raw_function_info,
+    is_function_tool,
+    is_raw_function_tool,
+)

 from .log import logger

 __all__ = ["to_chat_ctx", "to_fnc_ctx"]


-def to_fnc_ctx(fncs: list[FunctionTool]) -> list[types.FunctionDeclaration]:
-    return [_build_gemini_fnc(fnc) for fnc in fncs]
+def to_fnc_ctx(fncs: list[FunctionTool | RawFunctionTool]) -> list[types.FunctionDeclaration]:
+    tools: list[types.FunctionDeclaration] = []
+    for fnc in fncs:
+        if is_raw_function_tool(fnc):
+            info = get_raw_function_info(fnc)
+            tools.append(types.FunctionDeclaration(**info.raw_schema))

+        elif is_function_tool(fnc):
+            tools.append(_build_gemini_fnc(fnc))

-def get_tool_results_for_realtime(chat_ctx: llm.ChatContext) -> types.LiveClientToolResponse | None:
+    return tools
+
+
+def get_tool_results_for_realtime(
+    chat_ctx: llm.ChatContext, *, vertexai: bool = False
+) -> types.LiveClientToolResponse | None:
     function_responses: list[types.FunctionResponse] = []
     for msg in chat_ctx.items:
         if msg.type == "function_call_output":
-            function_responses.append(
-                types.FunctionResponse(
-                    id=msg.call_id,
-                    name=msg.name,
-                    response={"output": msg.output},
-                )
+            res = types.FunctionResponse(
+                name=msg.name,
+                response={"output": msg.output},
             )
+            if not vertexai:
+                # vertexai does not support id in FunctionResponse
+                # see: https://github.com/googleapis/python-genai/blob/85e00bc/google/genai/_live_converters.py#L1435
+                res.id = msg.call_id
+            function_responses.append(res)
     return (
         types.LiveClientToolResponse(function_responses=function_responses)
         if function_responses
@@ -175,6 +195,15 @@ class _GeminiJsonSchema:
             schema.pop("title", None)
             schema.pop("default", None)
             schema.pop("additionalProperties", None)
+            schema.pop("$schema", None)
+
+            if (const := schema.pop("const", None)) is not None:
+                # Gemini doesn't support const, but it does support enum with a single value
+                schema["enum"] = [const]
+
+            schema.pop("discriminator", None)
+            schema.pop("examples", None)
+
             if ref := schema.pop("$ref", None):
                 key = re.sub(r"^#/\$defs/", "", ref)
                 if key in refs_stack:
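Note: `_GeminiJsonSchema` now rewrites `const` into a single-value `enum`, since Gemini's schema dialect lacks `const`. A standalone illustration of the transform:

```python
schema = {
    "type": "object",
    "properties": {"kind": {"type": "string", "const": "image"}},
}

prop = schema["properties"]["kind"]
if (const := prop.pop("const", None)) is not None:
    prop["enum"] = [const]

print(schema)
# {'type': 'object', 'properties': {'kind': {'type': 'string', 'enum': ['image']}}}
```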
livekit_plugins_google-1.0.21.dist-info/METADATA ADDED
@@ -0,0 +1,47 @@
+Metadata-Version: 2.4
+Name: livekit-plugins-google
+Version: 1.0.21
+Summary: Agent Framework plugin for services from Google Cloud
+Project-URL: Documentation, https://docs.livekit.io
+Project-URL: Website, https://livekit.io/
+Project-URL: Source, https://github.com/livekit/agents
+Author: LiveKit
+License-Expression: Apache-2.0
+Keywords: audio,livekit,realtime,video,webrtc
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Topic :: Multimedia :: Sound/Audio
+Classifier: Topic :: Multimedia :: Video
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.9.0
+Requires-Dist: google-auth<3,>=2
+Requires-Dist: google-cloud-speech<3,>=2
+Requires-Dist: google-cloud-texttospeech<3,>=2.24
+Requires-Dist: google-genai>=1.14.0
+Requires-Dist: livekit-agents>=1.0.21
+Description-Content-Type: text/markdown
+
+# Google AI plugin for LiveKit Agents
+
+Support for Gemini, Gemini Live, Cloud Speech-to-Text, and Cloud Text-to-Speech.
+
+See [https://docs.livekit.io/agents/integrations/google/](https://docs.livekit.io/agents/integrations/google/) for more information.
+
+## Installation
+
+```bash
+pip install livekit-plugins-google
+```
+
+## Pre-requisites
+
+For credentials, you'll need a Google Cloud account and obtain the correct credentials. Credentials can be passed directly or via Application Default Credentials as specified in [How Application Default Credentials works](https://cloud.google.com/docs/authentication/application-default-credentials).
+
+To use the STT and TTS API, you'll need to enable the respective services for your Google Cloud project.
+
+- Cloud Speech-to-Text API
+- Cloud Text-to-Speech API
livekit_plugins_google-1.0.21.dist-info/RECORD ADDED
@@ -0,0 +1,16 @@
+livekit/plugins/google/__init__.py,sha256=xain2qUzU-YWhYWsLBkW8Q-szV-htpnzHTqymMPo-j0,1364
+livekit/plugins/google/llm.py,sha256=Kr9qeBZ5Dd0WCCBR_-gM3WWsVRZPCSteK8NpBsg2C5Y,16304
+livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
+livekit/plugins/google/models.py,sha256=maGlEM3hK4-5hMnH9UQMJewA7BZMrnStsFLBNoNVySg,1531
+livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+livekit/plugins/google/stt.py,sha256=2jk-1fHiBT8UW_n3CZsIEdMp2iBnUAlTnmefdUd8rAM,23620
+livekit/plugins/google/tts.py,sha256=29R0ieV5sRPBf5Yi0SPFQk7ZZMbELF30bIL9K_j_Wcg,9100
+livekit/plugins/google/utils.py,sha256=UBAbddYk7G8Nojg6bSC7_xN2pdl9qhs86HGhKYFuf9M,10509
+livekit/plugins/google/version.py,sha256=5lzQkS1jEPqreexacwMd18b2EOx7R5m8AQMKtQRBgC4,601
+livekit/plugins/google/beta/__init__.py,sha256=5PnoG3Ux24bjzMSzmTeSVljE9EINivGcbWUEV6egGnM,216
+livekit/plugins/google/beta/realtime/__init__.py,sha256=_fW2NMN22F-hnQ4xAJ_g5lPbR7CvM_xXzSWlUQY-E-U,188
+livekit/plugins/google/beta/realtime/api_proto.py,sha256=Fyrejs3SG0EjOPCCFLEnWXKEUxCff47PMWk2VsKJm5E,594
+livekit/plugins/google/beta/realtime/realtime_api.py,sha256=yYB5fKXl_aaMH_ZSpfUlfOTUg4eRqqRENLTZhZMfBMc,36253
+livekit_plugins_google-1.0.21.dist-info/METADATA,sha256=mQA8BfvWhAjp3V9GJA5OsZLzP_Q03UuDbRX2HbcEgtY,1908
+livekit_plugins_google-1.0.21.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+livekit_plugins_google-1.0.21.dist-info/RECORD,,
livekit_plugins_google-1.0.19.dist-info/METADATA REMOVED
@@ -1,99 +0,0 @@
-Metadata-Version: 2.4
-Name: livekit-plugins-google
-Version: 1.0.19
-Summary: Agent Framework plugin for services from Google Cloud
-Project-URL: Documentation, https://docs.livekit.io
-Project-URL: Website, https://livekit.io/
-Project-URL: Source, https://github.com/livekit/agents
-Author: LiveKit
-License-Expression: Apache-2.0
-Keywords: audio,livekit,realtime,video,webrtc
-Classifier: Intended Audience :: Developers
-Classifier: License :: OSI Approved :: Apache Software License
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Topic :: Multimedia :: Sound/Audio
-Classifier: Topic :: Multimedia :: Video
-Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
-Requires-Python: >=3.9.0
-Requires-Dist: google-auth<3,>=2
-Requires-Dist: google-cloud-speech<3,>=2
-Requires-Dist: google-cloud-texttospeech<3,>=2
-Requires-Dist: google-genai>=1.12.1
-Requires-Dist: livekit-agents>=1.0.19
-Description-Content-Type: text/markdown
-
-# LiveKit Plugins Google
-
-Agent Framework plugin for services from Google Cloud. Currently supporting Google's [Speech-to-Text](https://cloud.google.com/speech-to-text) API.
-
-## Installation
-
-```bash
-pip install livekit-plugins-google
-```
-
-## Pre-requisites
-
-For credentials, you'll need a Google Cloud account and obtain the correct credentials. Credentials can be passed directly or via Application Default Credentials as specified in [How Application Default Credentials works](https://cloud.google.com/docs/authentication/application-default-credentials).
-
-To use the STT and TTS API, you'll need to enable the respective services for your Google Cloud project.
-
-- Cloud Speech-to-Text API
-- Cloud Text-to-Speech API
-
-
-## Gemini Multimodal Live
-
-Gemini Multimodal Live can be used with the `MultimodalAgent` class. See examples/multimodal_agent/gemini_agent.py for an example.
-
-### Live Video Input (experimental)
-
-You can push video frames to your Gemini Multimodal Live session alongside the audio automatically handled by the `MultimodalAgent`. The basic approach is to subscribe to the video track, create a video stream, sample frames at a suitable frame rate, and push them into the RealtimeSession:
-
-```
-# Make sure you subscribe to audio and video tracks
-await ctx.connect(auto_subscribe=AutoSubscribe.SUBSCRIBE_ALL)
-
-# Create your RealtimeModel and store a reference
-model = google.beta.realtime.RealtimeModel(
-    # ...
-)
-
-# Create your MultimodalAgent as usual
-agent = MultimodalAgent(
-    model=model,
-    # ...
-)
-
-# Async method to process the video track and push frames to Gemini
-async def _process_video_track(self, track: Track):
-    video_stream = VideoStream(track)
-    last_frame_time = 0
-
-    async for event in video_stream:
-        current_time = asyncio.get_event_loop().time()
-
-        # Sample at 1 FPS
-        if current_time - last_frame_time < 1.0:
-            continue
-
-        last_frame_time = current_time
-        frame = event.frame
-
-        # Push the frame into the RealtimeSession
-        model.sessions[0].push_video(frame)
-
-    await video_stream.aclose()
-
-# Subscribe to new tracks and process them
-@ctx.room.on("track_subscribed")
-def _on_track_subscribed(track: Track, pub, participant):
-    if track.kind == TrackKind.KIND_VIDEO:
-        asyncio.create_task(self._process_video_track(track))
-```
-
-
-
livekit_plugins_google-1.0.19.dist-info/RECORD REMOVED
@@ -1,16 +0,0 @@
-livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
-livekit/plugins/google/llm.py,sha256=NaaT4Zaw6o98VcUHNrQcZZRkD7DPREd76O8fG9IOpXQ,16190
-livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
-livekit/plugins/google/models.py,sha256=SGjAumdDK97NNLwMFcqZdKR68f1NoGB2Rk1UP2-imG0,1457
-livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-livekit/plugins/google/stt.py,sha256=MADnkh0YKWY4bLRgBwFv4emu4YFO-7EVnhxO--dPTlI,23082
-livekit/plugins/google/tts.py,sha256=29R0ieV5sRPBf5Yi0SPFQk7ZZMbELF30bIL9K_j_Wcg,9100
-livekit/plugins/google/utils.py,sha256=sPZZg5VHf60kSILUIHGIZyN2CWYwnCGNYICn8Mhcv9g,9534
-livekit/plugins/google/version.py,sha256=UDC8ahmGgRkv-qMQUY3QibuuVevGMQ9Fd4yIhcQBZwA,601
-livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
-livekit/plugins/google/beta/realtime/__init__.py,sha256=_fW2NMN22F-hnQ4xAJ_g5lPbR7CvM_xXzSWlUQY-E-U,188
-livekit/plugins/google/beta/realtime/api_proto.py,sha256=Fyrejs3SG0EjOPCCFLEnWXKEUxCff47PMWk2VsKJm5E,594
-livekit/plugins/google/beta/realtime/realtime_api.py,sha256=yk202S604Eogp_ssBX2BSbAXV67uUyQzVO-bzLnScrs,31423
-livekit_plugins_google-1.0.19.dist-info/METADATA,sha256=HuRBvpT9dX3Mz7YOVhZhgQLm3-qQa2vAf2SRDQ5u1vM,3492
-livekit_plugins_google-1.0.19.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-livekit_plugins_google-1.0.19.dist-info/RECORD,,
{livekit_plugins_google-1.0.19.dist-info → livekit_plugins_google-1.0.21.dist-info}/WHEEL
File without changes