livekit-plugins-google 1.0.22__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- livekit/plugins/google/__init__.py +3 -2
- livekit/plugins/google/beta/realtime/api_proto.py +8 -2
- livekit/plugins/google/beta/realtime/realtime_api.py +316 -117
- livekit/plugins/google/llm.py +62 -32
- livekit/plugins/google/models.py +1 -0
- livekit/plugins/google/stt.py +19 -12
- livekit/plugins/google/tools.py +11 -0
- livekit/plugins/google/tts.py +109 -136
- livekit/plugins/google/utils.py +39 -88
- livekit/plugins/google/version.py +1 -1
- {livekit_plugins_google-1.0.22.dist-info → livekit_plugins_google-1.1.0.dist-info}/METADATA +3 -3
- livekit_plugins_google-1.1.0.dist-info/RECORD +17 -0
- livekit_plugins_google-1.0.22.dist-info/RECORD +0 -16
- {livekit_plugins_google-1.0.22.dist-info → livekit_plugins_google-1.1.0.dist-info}/WHEEL +0 -0
livekit/plugins/google/beta/realtime/realtime_api.py:

@@ -10,42 +10,23 @@ from collections.abc import Iterator
 from dataclasses import dataclass, field
 
 from google import genai
+from google.genai import types
 from google.genai.live import AsyncSession
-from google.genai.types import (
-    AudioTranscriptionConfig,
-    AutomaticActivityDetection,
-    Blob,
-    Content,
-    FunctionDeclaration,
-    GenerationConfig,
-    LiveClientContent,
-    LiveClientRealtimeInput,
-    LiveClientToolResponse,
-    LiveConnectConfig,
-    LiveServerContent,
-    LiveServerGoAway,
-    LiveServerToolCall,
-    LiveServerToolCallCancellation,
-    Modality,
-    ModalityTokenCount,
-    Part,
-    PrebuiltVoiceConfig,
-    RealtimeInputConfig,
-    SessionResumptionConfig,
-    SpeechConfig,
-    Tool,
-    UsageMetadata,
-    VoiceConfig,
-)
 from livekit import rtc
-from livekit.agents import llm, utils
+from livekit.agents import APIConnectionError, llm, utils
 from livekit.agents.metrics import RealtimeModelMetrics
-from livekit.agents.types import NOT_GIVEN, NotGivenOr
+from livekit.agents.types import (
+    DEFAULT_API_CONNECT_OPTIONS,
+    NOT_GIVEN,
+    APIConnectOptions,
+    NotGivenOr,
+)
 from livekit.agents.utils import audio as audio_utils, images, is_given
 from livekit.plugins.google.beta.realtime.api_proto import ClientEvents, LiveAPIModels, Voice
 
 from ...log import logger
-from ...utils import get_tool_results_for_realtime, to_chat_ctx, to_fnc_ctx
+from ...tools import _LLMTool
+from ...utils import create_tools_config, get_tool_results_for_realtime, to_fnc_ctx
 
 INPUT_AUDIO_SAMPLE_RATE = 16000
 INPUT_AUDIO_CHANNELS = 1
@@ -71,7 +52,7 @@ class _RealtimeOptions:
     api_key: str | None
     voice: Voice | str
     language: NotGivenOr[str]
-    response_modalities: NotGivenOr[list[Modality]]
+    response_modalities: NotGivenOr[list[types.Modality]]
     vertexai: bool
     project: str | None
     location: str | None
@@ -83,9 +64,16 @@ class _RealtimeOptions:
     presence_penalty: NotGivenOr[float]
     frequency_penalty: NotGivenOr[float]
     instructions: NotGivenOr[str]
-    input_audio_transcription: AudioTranscriptionConfig | None
-    output_audio_transcription: AudioTranscriptionConfig | None
+    input_audio_transcription: types.AudioTranscriptionConfig | None
+    output_audio_transcription: types.AudioTranscriptionConfig | None
     image_encode_options: NotGivenOr[images.EncodeOptions]
+    conn_options: APIConnectOptions
+    enable_affective_dialog: NotGivenOr[bool] = NOT_GIVEN
+    proactivity: NotGivenOr[bool] = NOT_GIVEN
+    realtime_input_config: NotGivenOr[types.RealtimeInputConfig] = NOT_GIVEN
+    context_window_compression: NotGivenOr[types.ContextWindowCompressionConfig] = NOT_GIVEN
+    api_version: NotGivenOr[str] = NOT_GIVEN
+    gemini_tools: NotGivenOr[list[_LLMTool]] = NOT_GIVEN
 
 
 @dataclass
@@ -93,10 +81,13 @@ class _ResponseGeneration:
     message_ch: utils.aio.Chan[llm.MessageGeneration]
     function_ch: utils.aio.Chan[llm.FunctionCall]
 
+    input_id: str
     response_id: str
     text_ch: utils.aio.Chan[str]
     audio_ch: utils.aio.Chan[rtc.AudioFrame]
+
     input_transcription: str = ""
+    output_text: str = ""
 
     _created_timestamp: float = field(default_factory=time.time)
     """The timestamp when the generation is created"""
@@ -107,6 +98,14 @@ class _ResponseGeneration:
     _done: bool = False
     """Whether the generation is done (set when the turn is complete)"""
 
+    def push_text(self, text: str) -> None:
+        if self.output_text:
+            self.output_text += text
+        else:
+            self.output_text = text
+
+        self.text_ch.send_nowait(text)
+
 
 class RealtimeModel(llm.RealtimeModel):
     def __init__(
@@ -117,7 +116,7 @@ class RealtimeModel(llm.RealtimeModel):
         api_key: NotGivenOr[str] = NOT_GIVEN,
         voice: Voice | str = "Puck",
         language: NotGivenOr[str] = NOT_GIVEN,
-        modalities: NotGivenOr[list[Modality]] = NOT_GIVEN,
+        modalities: NotGivenOr[list[types.Modality]] = NOT_GIVEN,
         vertexai: NotGivenOr[bool] = NOT_GIVEN,
         project: NotGivenOr[str] = NOT_GIVEN,
         location: NotGivenOr[str] = NOT_GIVEN,
@@ -128,9 +127,16 @@ class RealtimeModel(llm.RealtimeModel):
         top_k: NotGivenOr[int] = NOT_GIVEN,
         presence_penalty: NotGivenOr[float] = NOT_GIVEN,
         frequency_penalty: NotGivenOr[float] = NOT_GIVEN,
-        input_audio_transcription: NotGivenOr[AudioTranscriptionConfig | None] = NOT_GIVEN,
-        output_audio_transcription: NotGivenOr[AudioTranscriptionConfig | None] = NOT_GIVEN,
+        input_audio_transcription: NotGivenOr[types.AudioTranscriptionConfig | None] = NOT_GIVEN,
+        output_audio_transcription: NotGivenOr[types.AudioTranscriptionConfig | None] = NOT_GIVEN,
         image_encode_options: NotGivenOr[images.EncodeOptions] = NOT_GIVEN,
+        enable_affective_dialog: NotGivenOr[bool] = NOT_GIVEN,
+        proactivity: NotGivenOr[bool] = NOT_GIVEN,
+        realtime_input_config: NotGivenOr[types.RealtimeInputConfig] = NOT_GIVEN,
+        context_window_compression: NotGivenOr[types.ContextWindowCompressionConfig] = NOT_GIVEN,
+        api_version: NotGivenOr[str] = NOT_GIVEN,
+        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
+        _gemini_tools: NotGivenOr[list[_LLMTool]] = NOT_GIVEN,
     ) -> None:
         """
         Initializes a RealtimeModel instance for interacting with Google's Realtime API.
@@ -161,19 +167,33 @@ class RealtimeModel(llm.RealtimeModel):
             input_audio_transcription (AudioTranscriptionConfig | None, optional): The configuration for input audio transcription. Defaults to None.)
             output_audio_transcription (AudioTranscriptionConfig | None, optional): The configuration for output audio transcription. Defaults to AudioTranscriptionConfig().
             image_encode_options (images.EncodeOptions, optional): The configuration for image encoding. Defaults to DEFAULT_ENCODE_OPTIONS.
+            enable_affective_dialog (bool, optional): Whether to enable affective dialog. Defaults to False.
+            proactivity (bool, optional): Whether to enable proactive audio. Defaults to False.
+            realtime_input_config (RealtimeInputConfig, optional): The configuration for realtime input. Defaults to None.
+            context_window_compression (ContextWindowCompressionConfig, optional): The configuration for context window compression. Defaults to None.
+            conn_options (APIConnectOptions, optional): The configuration for the API connection. Defaults to DEFAULT_API_CONNECT_OPTIONS.
+            _gemini_tools (list[LLMTool], optional): Gemini-specific tools to use for the session. This parameter is experimental and may change.
 
         Raises:
             ValueError: If the API key is required but not found.
         """ # noqa: E501
         if not is_given(input_audio_transcription):
-            input_audio_transcription = AudioTranscriptionConfig()
+            input_audio_transcription = types.AudioTranscriptionConfig()
         if not is_given(output_audio_transcription):
-            output_audio_transcription = AudioTranscriptionConfig()
+            output_audio_transcription = types.AudioTranscriptionConfig()
+
+        server_turn_detection = True
+        if (
+            is_given(realtime_input_config)
+            and realtime_input_config.automatic_activity_detection
+            and realtime_input_config.automatic_activity_detection.disabled
+        ):
+            server_turn_detection = False
 
         super().__init__(
             capabilities=llm.RealtimeCapabilities(
                 message_truncation=False,
-                turn_detection=True,
+                turn_detection=server_turn_detection,
                 user_transcription=input_audio_transcription is not None,
                 auto_tool_reply_generation=True,
             )
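Taken together with the new `_RealtimeOptions` fields above, the constructor now exposes affective dialog, proactive audio, context-window compression, and an explicit API version. A minimal usage sketch, assuming the plugin is installed and `GOOGLE_API_KEY` is set (the voice and token budgets are illustrative values, not defaults):

```python
from google.genai import types
from livekit.plugins import google

model = google.beta.realtime.RealtimeModel(
    voice="Puck",
    proactivity=True,                 # proactive audio; selects v1alpha (see session setup below)
    enable_affective_dialog=True,
    context_window_compression=types.ContextWindowCompressionConfig(
        trigger_tokens=25_600,        # illustrative budget
        sliding_window=types.SlidingWindow(target_tokens=12_800),
    ),
)
```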
@@ -187,7 +207,7 @@ class RealtimeModel(llm.RealtimeModel):
 
         gemini_api_key = api_key if is_given(api_key) else os.environ.get("GOOGLE_API_KEY")
         gcp_project = project if is_given(project) else os.environ.get("GOOGLE_CLOUD_PROJECT")
-        gcp_location = (
+        gcp_location: str | None = (
             location
             if is_given(location)
             else os.environ.get("GOOGLE_CLOUD_LOCATION") or "us-central1"
@@ -232,6 +252,13 @@ class RealtimeModel(llm.RealtimeModel):
             output_audio_transcription=output_audio_transcription,
             language=language,
             image_encode_options=image_encode_options,
+            enable_affective_dialog=enable_affective_dialog,
+            proactivity=proactivity,
+            realtime_input_config=realtime_input_config,
+            context_window_compression=context_window_compression,
+            api_version=api_version,
+            gemini_tools=_gemini_tools,
+            conn_options=conn_options,
         )
 
         self._sessions = weakref.WeakSet[RealtimeSession]()
@@ -242,8 +269,19 @@ class RealtimeModel(llm.RealtimeModel):
         return sess
 
     def update_options(
-        self,
+        self,
+        *,
+        voice: NotGivenOr[str] = NOT_GIVEN,
+        temperature: NotGivenOr[float] = NOT_GIVEN,
     ) -> None:
+        """
+        Update the options for the RealtimeModel.
+
+        Args:
+            voice (str, optional): The voice to use for the session.
+            temperature (float, optional): The temperature to use for the session.
+            tools (list[LLMTool], optional): The tools to use for the session.
+        """
         if is_given(voice):
             self._opts.voice = voice
 
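`update_options` is now keyword-only and forwards the new values to every open session, as the next hunk shows. A usage sketch (the voice name is illustrative):

```python
# applies to the model and fans out to all active realtime sessions
model.update_options(voice="Kore", temperature=0.8)
```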
@@ -251,7 +289,10 @@ class RealtimeModel(llm.RealtimeModel):
             self._opts.temperature = temperature
 
         for sess in self._sessions:
-            sess.update_options(voice=self._opts.voice, temperature=self._opts.temperature)
+            sess.update_options(
+                voice=self._opts.voice,
+                temperature=self._opts.temperature,
+            )
 
     async def aclose(self) -> None:
         pass
@@ -262,7 +303,7 @@ class RealtimeSession(llm.RealtimeSession):
         super().__init__(realtime_model)
         self._opts = realtime_model._opts
         self._tools = llm.ToolContext.empty()
-        self._gemini_declarations: list[FunctionDeclaration] = []
+        self._gemini_declarations: list[types.FunctionDeclaration] = []
         self._chat_ctx = llm.ChatContext.empty()
         self._msg_ch = utils.aio.Chan[ClientEvents]()
         self._input_resampler: rtc.AudioResampler | None = None
@@ -274,11 +315,20 @@ class RealtimeSession(llm.RealtimeSession):
             samples_per_channel=INPUT_AUDIO_SAMPLE_RATE // 20,
         )
 
+        api_version = self._opts.api_version
+        if not api_version and (self._opts.enable_affective_dialog or self._opts.proactivity):
+            api_version = "v1alpha"
+
+        http_options = types.HttpOptions(timeout=int(self._opts.conn_options.timeout * 1000))
+        if api_version:
+            http_options.api_version = api_version
+
         self._client = genai.Client(
             api_key=self._opts.api_key,
             vertexai=self._opts.vertexai,
             project=self._opts.project,
             location=self._opts.location,
+            http_options=http_options,
         )
 
         self._main_atask = asyncio.create_task(self._main_task(), name="gemini-realtime-session")
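Session construction now builds the `google-genai` HTTP options from the connection options and quietly falls back to the `v1alpha` API version when affective dialog or proactivity is requested without an explicit `api_version`. The equivalent stand-alone client setup, as a sketch (`HttpOptions.timeout` is in milliseconds, hence the `* 1000` above; the 10-second value is illustrative):

```python
from google import genai
from google.genai import types

# affective dialog / proactive audio are only served on v1alpha
http_options = types.HttpOptions(timeout=10 * 1000, api_version="v1alpha")
client = genai.Client(api_key="YOUR_KEY", http_options=http_options)
```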
@@ -291,8 +341,9 @@ class RealtimeSession(llm.RealtimeSession):
         self._pending_generation_fut: asyncio.Future[llm.GenerationCreatedEvent] | None = None
 
         self._session_resumption_handle: str | None = None
-
+        self._in_user_activity = False
         self._session_lock = asyncio.Lock()
+        self._num_retries = 0
 
     async def _close_active_session(self) -> None:
         async with self._session_lock:
@@ -304,7 +355,7 @@ class RealtimeSession(llm.RealtimeSession):
             finally:
                 self._active_session = None
 
-    def _mark_restart_needed(self):
+    def _mark_restart_needed(self) -> None:
         if not self._session_should_close.is_set():
             self._session_should_close.set()
             # reset the msg_ch, do not send messages from previous session
|
|
335
386
|
self._mark_restart_needed()
|
336
387
|
|
337
388
|
async def update_chat_ctx(self, chat_ctx: llm.ChatContext) -> None:
|
389
|
+
async with self._session_lock:
|
390
|
+
if not self._active_session:
|
391
|
+
self._chat_ctx = chat_ctx.copy()
|
392
|
+
return
|
393
|
+
|
338
394
|
diff_ops = llm.utils.compute_chat_ctx_diff(self._chat_ctx, chat_ctx)
|
339
395
|
|
340
396
|
if diff_ops.to_remove:
|
@@ -347,15 +403,23 @@ class RealtimeSession(llm.RealtimeSession):
                 append_ctx.items.append(item)
 
         if append_ctx.items:
-            turns, _ = to_chat_ctx(append_ctx, id(self), ignore_functions=True)
+            turns_dict, _ = append_ctx.copy(
+                exclude_function_call=True,
+            ).to_provider_format(format="google", inject_dummy_user_message=False)
+            # we are not generating, and do not need to inject
+            turns = [types.Content.model_validate(turn) for turn in turns_dict]
             tool_results = get_tool_results_for_realtime(append_ctx, vertexai=self._opts.vertexai)
             if turns:
-                self._send_client_event(LiveClientContent(turns=turns, turn_complete=False))
+                self._send_client_event(types.LiveClientContent(turns=turns, turn_complete=False))
             if tool_results:
                 self._send_client_event(tool_results)
 
-    async def update_tools(self, tools: list[llm.FunctionTool | llm.RawFunctionTool]) -> None:
-        new_declarations: list[FunctionDeclaration] = to_fnc_ctx(tools)
+        # since we don't have a view of the history on the server side, we'll assume
+        # the current state is accurate. this isn't perfect because removals aren't done.
+        self._chat_ctx = chat_ctx.copy()
+
+    async def update_tools(self, tools: list[llm.FunctionTool | llm.RawFunctionTool]) -> None:
+        new_declarations: list[types.FunctionDeclaration] = to_fnc_ctx(tools)
         current_tool_names = {f.name for f in self._gemini_declarations}
         new_tool_names = {f.name for f in new_declarations}
 
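History is now serialized through `ChatContext.to_provider_format` rather than the plugin's removed `to_chat_ctx` helper (see the `utils.py` summary above, -88 lines). The conversion in isolation, using only calls that appear in this diff:

```python
from google.genai import types
from livekit.agents import llm

chat_ctx = llm.ChatContext.empty()
chat_ctx.add_message(role="user", content="Hello!")

# strip function calls; no dummy user message is injected since this is
# a history sync, not a generation request
turns_dict, _ = chat_ctx.copy(exclude_function_call=True).to_provider_format(
    format="google", inject_dummy_user_message=False
)
turns = [types.Content.model_validate(turn) for turn in turns_dict]
```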
@@ -372,11 +436,21 @@ class RealtimeSession(llm.RealtimeSession):
     def tools(self) -> llm.ToolContext:
         return self._tools.copy()
 
+    @property
+    def _manual_activity_detection(self) -> bool:
+        if (
+            is_given(self._opts.realtime_input_config)
+            and self._opts.realtime_input_config.automatic_activity_detection is not None
+            and self._opts.realtime_input_config.automatic_activity_detection.disabled
+        ):
+            return True
+        return False
+
     def push_audio(self, frame: rtc.AudioFrame) -> None:
         for f in self._resample_audio(frame):
             for nf in self._bstream.write(f.data.tobytes()):
-                realtime_input = LiveClientRealtimeInput(
-                    media_chunks=[Blob(data=nf.data.tobytes(), mime_type="audio/pcm")]
+                realtime_input = types.LiveClientRealtimeInput(
+                    media_chunks=[types.Blob(data=nf.data.tobytes(), mime_type="audio/pcm")]
                 )
                 self._send_client_event(realtime_input)
 
@@ -384,8 +458,8 @@ class RealtimeSession(llm.RealtimeSession):
         encoded_data = images.encode(
             frame, self._opts.image_encode_options or DEFAULT_IMAGE_ENCODE_OPTIONS
         )
-        realtime_input = LiveClientRealtimeInput(
-            media_chunks=[Blob(data=encoded_data, mime_type="image/jpeg")]
+        realtime_input = types.LiveClientRealtimeInput(
+            media_chunks=[types.Blob(data=encoded_data, mime_type="image/jpeg")]
         )
         self._send_client_event(realtime_input)
 
@@ -402,16 +476,24 @@ class RealtimeSession(llm.RealtimeSession):
             )
             self._pending_generation_fut.cancel("Superseded by new generate_reply call")
 
-        fut = asyncio.Future()
+        fut = asyncio.Future[llm.GenerationCreatedEvent]()
         self._pending_generation_fut = fut
 
+        if self._in_user_activity:
+            self._send_client_event(
+                types.LiveClientRealtimeInput(
+                    activity_end=types.ActivityEnd(),
+                )
+            )
+            self._in_user_activity = False
+
         # Gemini requires the last message to end with user's turn
         # so we need to add a placeholder user turn in order to trigger a new generation
-
+        turns = []
         if is_given(instructions):
-
-
-            self._send_client_event(
+            turns.append(types.Content(parts=[types.Part(text=instructions)], role="model"))
+        turns.append(types.Content(parts=[types.Part(text=".")], role="user"))
+        self._send_client_event(types.LiveClientContent(turns=turns, turn_complete=True))
 
         def _on_timeout() -> None:
             if not fut.done():
@@ -428,8 +510,28 @@ class RealtimeSession(llm.RealtimeSession):
 
         return fut
 
+    def start_user_activity(self) -> None:
+        if not self._manual_activity_detection:
+            return
+
+        if not self._in_user_activity:
+            self._in_user_activity = True
+            self._send_client_event(
+                types.LiveClientRealtimeInput(
+                    activity_start=types.ActivityStart(),
+                )
+            )
+
     def interrupt(self) -> None:
-        pass
+        # Gemini Live treats activity start as interruption, so we rely on start_user_activity
+        # notifications to handle it
+        if (
+            self._opts.realtime_input_config
+            and self._opts.realtime_input_config.activity_handling
+            == types.ActivityHandling.NO_INTERRUPTION
+        ):
+            return
+        self.start_user_activity()
 
     def truncate(self, *, message_id: str, audio_end_ms: int) -> None:
         logger.warning("truncate is not supported by the Google Realtime API.")
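`start_user_activity` only emits `ActivityStart` when automatic activity detection is disabled, i.e. when the application brackets speech itself. A sketch of opting into manual turn handling (the config types come from `google.genai.types` and are both referenced in this diff):

```python
from google.genai import types
from livekit.plugins import google

model = google.beta.realtime.RealtimeModel(
    realtime_input_config=types.RealtimeInputConfig(
        # disable server-side VAD; the client must now send
        # ActivityStart/ActivityEnd (start_user_activity above)
        automatic_activity_detection=types.AutomaticActivityDetection(disabled=True),
    ),
)
```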
@@ -456,7 +558,9 @@ class RealtimeSession(llm.RealtimeSession):
         self._mark_current_generation_done()
 
     @utils.log_exceptions(logger=logger)
-    async def _main_task(self):
+    async def _main_task(self) -> None:
+        max_retries = self._opts.conn_options.max_retry
+
         while not self._msg_ch.closed:
             # previous session might not be closed yet, we'll do it here.
             await self._close_active_session()
@@ -471,7 +575,15 @@ class RealtimeSession(llm.RealtimeSession):
                 ) as session:
                     async with self._session_lock:
                         self._active_session = session
-
+                    turns_dict, _ = self._chat_ctx.copy(
+                        exclude_function_call=True,
+                    ).to_provider_format(format="google", inject_dummy_user_message=False)
+                    if turns_dict:
+                        turns = [types.Content.model_validate(turn) for turn in turns_dict]
+                        await session.send_client_content(
+                            turns=turns,  # type: ignore
+                            turn_complete=False,
+                        )
                     # queue up existing chat context
                     send_task = asyncio.create_task(
                         self._send_task(session), name="gemini-realtime-send"
@@ -504,12 +616,30 @@ class RealtimeSession(llm.RealtimeSession):
             except Exception as e:
                 logger.error(f"Gemini Realtime API error: {e}", exc_info=e)
                 if not self._msg_ch.closed:
-
-
+                    # we shouldn't retry when it's not connected, usually this means incorrect
+                    # parameters or setup
+                    if not session or max_retries == 0:
+                        self._emit_error(e, recoverable=False)
+                        raise APIConnectionError(message="Failed to connect to Gemini Live") from e
+
+                    if self._num_retries == max_retries:
+                        self._emit_error(e, recoverable=False)
+                        raise APIConnectionError(
+                            message=f"Failed to connect to Gemini Live after {max_retries} attempts"
+                        ) from e
+
+                    retry_interval = self._opts.conn_options._interval_for_retry(self._num_retries)
+                    logger.warning(
+                        f"Gemini Realtime API connection failed, retrying in {retry_interval}s",
+                        exc_info=e,
+                        extra={"attempt": self._num_retries, "max_retries": max_retries},
+                    )
+                    await asyncio.sleep(retry_interval)
+                    self._num_retries += 1
             finally:
                 await self._close_active_session()
 
-    async def _send_task(self, session: AsyncSession):
+    async def _send_task(self, session: AsyncSession) -> None:
         try:
             async for msg in self._msg_ch:
                 async with self._session_lock:
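The reconnect loop above is a capped retry whose per-attempt delay comes from `APIConnectOptions._interval_for_retry`. A self-contained sketch of the same pattern; the jittered exponential formula here is a stand-in, not livekit's actual schedule:

```python
import asyncio
import random


async def connect_with_retries(connect, max_retries: int, base: float = 2.0):
    """Retry an async connect() with growing, jittered delays, re-raising
    once the retry budget is exhausted (mirrors the loop above)."""
    attempt = 0
    while True:
        try:
            return await connect()
        except Exception:
            if attempt >= max_retries:
                raise
            delay = base * (2**attempt) * (0.5 + random.random() / 2)
            await asyncio.sleep(delay)
            attempt += 1
```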
@@ -517,15 +647,21 @@ class RealtimeSession(llm.RealtimeSession):
                         not self._active_session or self._active_session != session
                     ):
                         break
-                if isinstance(msg, LiveClientContent):
+                if isinstance(msg, types.LiveClientContent):
                     await session.send_client_content(
-                        turns=msg.turns,
+                        turns=msg.turns,  # type: ignore
+                        turn_complete=msg.turn_complete or True,
                     )
-                elif isinstance(msg, LiveClientToolResponse):
+                elif isinstance(msg, types.LiveClientToolResponse) and msg.function_responses:
                     await session.send_tool_response(function_responses=msg.function_responses)
-                elif isinstance(msg, LiveClientRealtimeInput):
-
-
+                elif isinstance(msg, types.LiveClientRealtimeInput):
+                    if msg.media_chunks:
+                        for media_chunk in msg.media_chunks:
+                            await session.send_realtime_input(media=media_chunk)
+                    elif msg.activity_start:
+                        await session.send_realtime_input(activity_start=msg.activity_start)
+                    elif msg.activity_end:
+                        await session.send_realtime_input(activity_end=msg.activity_end)
                 else:
                     logger.warning(f"Warning: Received unhandled message type: {type(msg)}")
 
@@ -536,7 +672,7 @@ class RealtimeSession(llm.RealtimeSession):
         finally:
             logger.debug("send task finished.")
 
-    async def _recv_task(self, session: AsyncSession):
+    async def _recv_task(self, session: AsyncSession) -> None:
         try:
             while True:
                 async with self._session_lock:
@@ -572,6 +708,9 @@ class RealtimeSession(llm.RealtimeSession):
                 if response.go_away:
                     self._handle_go_away(response.go_away)
 
+                if self._num_retries > 0:
+                    self._num_retries = 0  # reset the retry counter
+
                 # TODO(dz): a server-side turn is complete
         except Exception as e:
             if not self._session_should_close.is_set():
@@ -580,14 +719,18 @@ class RealtimeSession(llm.RealtimeSession):
         finally:
             self._mark_current_generation_done()
 
-    def _build_connect_config(self) -> LiveConnectConfig:
+    def _build_connect_config(self) -> types.LiveConnectConfig:
         temp = self._opts.temperature if is_given(self._opts.temperature) else None
 
-        return LiveConnectConfig(
+        tools_config = create_tools_config(
+            function_tools=self._gemini_declarations,
+            gemini_tools=self._opts.gemini_tools if is_given(self._opts.gemini_tools) else None,
+        )
+        conf = types.LiveConnectConfig(
             response_modalities=self._opts.response_modalities
             if is_given(self._opts.response_modalities)
-            else [Modality.AUDIO],
-            generation_config=GenerationConfig(
+            else [types.Modality.AUDIO],
+            generation_config=types.GenerationConfig(
                 candidate_count=self._opts.candidate_count,
                 temperature=temp,
                 max_output_tokens=self._opts.max_output_tokens
@@ -602,34 +745,45 @@ class RealtimeSession(llm.RealtimeSession):
                 if is_given(self._opts.frequency_penalty)
                 else None,
             ),
-            system_instruction=Content(parts=[Part(text=self._opts.instructions)])
+            system_instruction=types.Content(parts=[types.Part(text=self._opts.instructions)])
             if is_given(self._opts.instructions)
             else None,
-            speech_config=SpeechConfig(
-                voice_config=VoiceConfig(
-                    prebuilt_voice_config=PrebuiltVoiceConfig(voice_name=self._opts.voice)
+            speech_config=types.SpeechConfig(
+                voice_config=types.VoiceConfig(
+                    prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=self._opts.voice)
                 ),
                 language_code=self._opts.language if is_given(self._opts.language) else None,
             ),
-            tools=[Tool(function_declarations=self._gemini_declarations)],
+            tools=tools_config,  # type: ignore
             input_audio_transcription=self._opts.input_audio_transcription,
             output_audio_transcription=self._opts.output_audio_transcription,
-            session_resumption=SessionResumptionConfig(handle=self._session_resumption_handle),
-            realtime_input_config=RealtimeInputConfig(
-                automatic_activity_detection=AutomaticActivityDetection(),
+            session_resumption=types.SessionResumptionConfig(
+                handle=self._session_resumption_handle
             ),
         )
 
-    def _start_new_generation(self):
+        if is_given(self._opts.proactivity):
+            conf.proactivity = types.ProactivityConfig(proactive_audio=self._opts.proactivity)
+        if is_given(self._opts.enable_affective_dialog):
+            conf.enable_affective_dialog = self._opts.enable_affective_dialog
+        if is_given(self._opts.realtime_input_config):
+            conf.realtime_input_config = self._opts.realtime_input_config
+        if is_given(self._opts.context_window_compression):
+            conf.context_window_compression = self._opts.context_window_compression
+
+        return conf
+
+    def _start_new_generation(self) -> None:
         if self._current_generation and not self._current_generation._done:
             logger.warning("starting new generation while another is active. Finalizing previous.")
             self._mark_current_generation_done()
 
-        response_id = utils.shortuuid("
+        response_id = utils.shortuuid("GR_")
         self._current_generation = _ResponseGeneration(
             message_ch=utils.aio.Chan[llm.MessageGeneration](),
             function_ch=utils.aio.Chan[llm.FunctionCall](),
             response_id=response_id,
+            input_id=utils.shortuuid("GI_"),
             text_ch=utils.aio.Chan[str](),
             audio_ch=utils.aio.Chan[rtc.AudioFrame](),
             _created_timestamp=time.time(),
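`_build_connect_config` now delegates tool assembly to the new `create_tools_config` helper so function declarations can coexist with Gemini-native tools (the `_gemini_tools` plumbing added earlier). The helper's output shape is an assumption here, but a plausible `tools` value built from `google.genai` types would look like:

```python
from google.genai import types

tools = [
    # declarations converted from livekit function tools via to_fnc_ctx
    types.Tool(
        function_declarations=[
            types.FunctionDeclaration(
                name="get_weather",  # hypothetical tool
                description="Look up the current weather for a city.",
                parameters=types.Schema(type=types.Type.OBJECT),
            )
        ]
    ),
    # a Gemini-native tool passed through as-is
    types.Tool(google_search=types.GoogleSearch()),
]
```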
@@ -656,21 +810,23 @@ class RealtimeSession(llm.RealtimeSession):
 
         self.emit("generation_created", generation_event)
 
-    def _handle_server_content(self, server_content: LiveServerContent):
+    def _handle_server_content(self, server_content: types.LiveServerContent) -> None:
         current_gen = self._current_generation
         if not current_gen:
             logger.warning("received server content but no active generation.")
             return
 
         if model_turn := server_content.model_turn:
-            for part in model_turn.parts:
+            for part in model_turn.parts or []:
                 if part.text:
-                    current_gen.text_ch.send_nowait(part.text)
+                    current_gen.push_text(part.text)
                 if part.inline_data:
                     if not current_gen._first_token_timestamp:
                         current_gen._first_token_timestamp = time.time()
                     frame_data = part.inline_data.data
                     try:
+                        if not isinstance(frame_data, bytes):
+                            raise ValueError("frame_data is not bytes")
                         frame = rtc.AudioFrame(
                             data=frame_data,
                             sample_rate=OUTPUT_AUDIO_SAMPLE_RATE,
@@ -692,7 +848,7 @@ class RealtimeSession(llm.RealtimeSession):
                 self.emit(
                     "input_audio_transcription_completed",
                     llm.InputTranscriptionCompleted(
-                        item_id=current_gen.response_id,
+                        item_id=current_gen.input_id,
                         transcript=current_gen.input_transcription,
                         is_final=False,
                     ),
@@ -701,20 +857,9 @@ class RealtimeSession(llm.RealtimeSession):
         if output_transcription := server_content.output_transcription:
             text = output_transcription.text
             if text:
-                current_gen.text_ch.send_nowait(text)
+                current_gen.push_text(text)
 
-        if server_content.generation_complete:
-            # The only way we'd know that the transcription is complete is by when they are
-            # done with generation
-            if current_gen.input_transcription:
-                self.emit(
-                    "input_audio_transcription_completed",
-                    llm.InputTranscriptionCompleted(
-                        item_id=current_gen.response_id,
-                        transcript=current_gen.input_transcription,
-                        is_final=True,
-                    ),
-                )
+        if server_content.generation_complete or server_content.turn_complete:
             current_gen._completed_timestamp = time.time()
 
         if server_content.interrupted:
@@ -724,10 +869,38 @@ class RealtimeSession(llm.RealtimeSession):
             self._mark_current_generation_done()
 
     def _mark_current_generation_done(self) -> None:
-        if not self._current_generation:
+        if not self._current_generation or self._current_generation._done:
            return
 
         gen = self._current_generation
+
+        # The only way we'd know that the transcription is complete is by when they are
+        # done with generation
+        if gen.input_transcription:
+            self.emit(
+                "input_audio_transcription_completed",
+                llm.InputTranscriptionCompleted(
+                    item_id=gen.input_id,
+                    transcript=gen.input_transcription,
+                    is_final=True,
+                ),
+            )
+
+        # since gemini doesn't give us a view of the chat history on the server side,
+        # we would handle it manually here
+        self._chat_ctx.add_message(
+            role="user",
+            content=gen.input_transcription,
+            id=gen.input_id,
+        )
+
+        if gen.output_text:
+            self._chat_ctx.add_message(
+                role="assistant",
+                content=gen.output_text,
+                id=gen.response_id,
+            )
+
         if not gen.text_ch.closed:
             gen.text_ch.close()
         if not gen.audio_ch.closed:
@@ -737,36 +910,37 @@ class RealtimeSession(llm.RealtimeSession):
             gen.message_ch.close()
         gen._done = True
 
-    def _handle_input_speech_started(self):
+    def _handle_input_speech_started(self) -> None:
         self.emit("input_speech_started", llm.InputSpeechStartedEvent())
 
-    def _handle_tool_calls(self, tool_call: LiveServerToolCall):
+    def _handle_tool_calls(self, tool_call: types.LiveServerToolCall) -> None:
         if not self._current_generation:
             logger.warning("received tool call but no active generation.")
             return
 
         gen = self._current_generation
-        for fnc_call in tool_call.function_calls:
+        for fnc_call in tool_call.function_calls or []:
             arguments = json.dumps(fnc_call.args)
 
             gen.function_ch.send_nowait(
                 llm.FunctionCall(
                     call_id=fnc_call.id or utils.shortuuid("fnc-call-"),
-                    name=fnc_call.name,
+                    name=fnc_call.name,  # type: ignore
                     arguments=arguments,
                 )
             )
+        self._on_final_input_audio_transcription()
         self._mark_current_generation_done()
 
     def _handle_tool_call_cancellation(
-        self, tool_call_cancellation: LiveServerToolCallCancellation
-    ):
+        self, tool_call_cancellation: types.LiveServerToolCallCancellation
+    ) -> None:
         logger.warning(
             "server cancelled tool calls",
             extra={"function_call_ids": tool_call_cancellation.ids},
         )
 
-    def _handle_usage_metadata(self, usage_metadata: UsageMetadata):
+    def _handle_usage_metadata(self, usage_metadata: types.UsageMetadata) -> None:
         current_gen = self._current_generation
         if not current_gen:
             logger.warning("no active generation to report metrics for")
@@ -782,18 +956,21 @@ class RealtimeSession(llm.RealtimeSession):
         ) - current_gen._created_timestamp
 
         def _token_details_map(
-            token_details: list[ModalityTokenCount] | None,
-        ) -> dict[
+            token_details: list[types.ModalityTokenCount] | None,
+        ) -> dict[str, int]:
             token_details_map = {"audio_tokens": 0, "text_tokens": 0, "image_tokens": 0}
             if not token_details:
                 return token_details_map
 
             for token_detail in token_details:
-                if token_detail.
+                if not token_detail.token_count:
+                    continue
+
+                if token_detail.modality == types.MediaModality.AUDIO:
                     token_details_map["audio_tokens"] += token_detail.token_count
-                elif token_detail.modality ==
+                elif token_detail.modality == types.MediaModality.TEXT:
                     token_details_map["text_tokens"] += token_detail.token_count
-                elif token_detail.modality ==
+                elif token_detail.modality == types.MediaModality.IMAGE:
                     token_details_map["image_tokens"] += token_detail.token_count
             return token_details_map
 
@@ -807,7 +984,9 @@ class RealtimeSession(llm.RealtimeSession):
             input_tokens=usage_metadata.prompt_token_count or 0,
             output_tokens=usage_metadata.response_token_count or 0,
             total_tokens=usage_metadata.total_token_count or 0,
-            tokens_per_second=(usage_metadata.response_token_count or 0) / duration
+            tokens_per_second=(usage_metadata.response_token_count or 0) / duration
+            if duration > 0
+            else 0,
             input_token_details=RealtimeModelMetrics.InputTokenDetails(
                 **_token_details_map(usage_metadata.prompt_tokens_details),
                 cached_tokens=sum(
@@ -824,18 +1003,27 @@ class RealtimeSession(llm.RealtimeSession):
         )
         self.emit("metrics_collected", metrics)
 
-    def _handle_go_away(self, go_away: LiveServerGoAway):
+    def _handle_go_away(self, go_away: types.LiveServerGoAway) -> None:
         logger.warning(
             f"Gemini server indicates disconnection soon. Time left: {go_away.time_left}"
         )
         # TODO(dz): this isn't a seamless reconnection just yet
         self._session_should_close.set()
 
+    def _on_final_input_audio_transcription(self) -> None:
+        if (gen := self._current_generation) and gen.input_transcription:
+            self.emit(
+                "input_audio_transcription_completed",
+                llm.InputTranscriptionCompleted(
+                    item_id=gen.response_id, transcript=gen.input_transcription, is_final=True
+                ),
+            )
+
     def commit_audio(self) -> None:
         pass
 
     def clear_audio(self) -> None:
-
+        pass
 
     def _resample_audio(self, frame: rtc.AudioFrame) -> Iterator[rtc.AudioFrame]:
         if self._input_resampler:
|
@@ -858,3 +1046,14 @@ class RealtimeSession(llm.RealtimeSession):
|
|
858
1046
|
yield from self._input_resampler.push(frame)
|
859
1047
|
else:
|
860
1048
|
yield frame
|
1049
|
+
|
1050
|
+
def _emit_error(self, error: Exception, recoverable: bool) -> None:
|
1051
|
+
self.emit(
|
1052
|
+
"error",
|
1053
|
+
llm.RealtimeModelError(
|
1054
|
+
timestamp=time.time(),
|
1055
|
+
label=self._realtime_model._label,
|
1056
|
+
error=error,
|
1057
|
+
recoverable=recoverable,
|
1058
|
+
),
|
1059
|
+
)
|
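With `_emit_error` in place, connection failures surface as `error` events carrying `llm.RealtimeModelError` before `APIConnectionError` is raised from the task. A sketch of observing them, assuming the usual livekit event-emitter interface:

```python
session = model.session()


def on_error(err) -> None:
    # err is the llm.RealtimeModelError built in _emit_error above
    if not err.recoverable:
        print(f"realtime session failed: {err.error}")


session.on("error", on_error)
```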