livekit-plugins-google 0.10.2__py3-none-any.whl → 0.10.3__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as published to their public registry. It is provided for informational purposes only.
--- livekit/plugins/google/beta/realtime/api_proto.py
+++ livekit/plugins/google/beta/realtime/api_proto.py
@@ -6,7 +6,7 @@ from google.genai import types
 
 from ..._utils import _build_gemini_ctx, _build_tools
 
-LiveAPIModels = Literal["gemini-2.0-flash-exp"]
+LiveAPIModels = Literal["gemini-2.0-flash-001",]
 
 Voice = Literal["Puck", "Charon", "Kore", "Fenrir", "Aoede"]
 
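The realtime model alias narrows to the stable release. Since the downstream signatures are typed `LiveAPIModels | str` (see the `RealtimeModel` hunk below), unlisted model names remain usable; a minimal, self-contained sketch of that typing pattern:

```python
# Sketch of how the LiveAPIModels alias is consumed: Literal pins the known
# model names, while `LiveAPIModels | str` keeps arbitrary names valid.
from typing import Literal

LiveAPIModels = Literal["gemini-2.0-flash-001",]  # trailing comma is valid syntax

def connect(model: LiveAPIModels | str = "gemini-2.0-flash-001") -> str:
    return f"connecting with {model}"

print(connect())                        # type-checks: literal member
print(connect("gemini-2.0-flash-exp"))  # also fine: plain str
```
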

--- livekit/plugins/google/beta/realtime/realtime_api.py
+++ livekit/plugins/google/beta/realtime/realtime_api.py
@@ -37,7 +37,7 @@ from .api_proto import (
     _build_gemini_ctx,
     _build_tools,
 )
-from .transcriber import TranscriberSession, TranscriptionContent
+from .transcriber import ModelTranscriber, TranscriberSession, TranscriptionContent
 
 EventTypes = Literal[
     "start_session",
@@ -104,7 +104,7 @@ class RealtimeModel:
         self,
         *,
         instructions: str | None = None,
-        model: LiveAPIModels | str = "gemini-2.0-flash-exp",
+        model: LiveAPIModels | str = "gemini-2.0-flash-001",
         api_key: str | None = None,
         voice: Voice | str = "Puck",
         modalities: list[Modality] = ["AUDIO"],
@@ -136,7 +136,7 @@ class RealtimeModel:
             instructions (str, optional): Initial system instructions for the model. Defaults to "".
             api_key (str or None, optional): Google Gemini API key. If None, will attempt to read from the environment variable GOOGLE_API_KEY.
             modalities (list[Modality], optional): Modalities to use, such as ["TEXT", "AUDIO"]. Defaults to ["AUDIO"].
-            model (str or None, optional): The name of the model to use. Defaults to "gemini-2.0-flash-exp".
+            model (str or None, optional): The name of the model to use. Defaults to "gemini-2.0-flash-001".
             voice (api_proto.Voice, optional): Voice setting for audio outputs. Defaults to "Puck".
             enable_user_audio_transcription (bool, optional): Whether to enable user audio transcription. Defaults to True
             enable_agent_audio_transcription (bool, optional): Whether to enable agent audio transcription. Defaults to True
@@ -301,7 +301,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
             )
             self._transcriber.on("input_speech_done", self._on_input_speech_done)
         if self._opts.enable_agent_audio_transcription:
-            self._agent_transcriber = TranscriberSession(
+            self._agent_transcriber = ModelTranscriber(
                 client=self._client, model=self._opts.model
             )
             self._agent_transcriber.on("input_speech_done", self._on_agent_speech_done)
@@ -382,7 +382,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
         # TODO: implement sync mechanism to make sure the transcribed user speech is inside the chat_ctx and always before the generated agent speech
 
     def _on_agent_speech_done(self, content: TranscriptionContent) -> None:
-        if not self._is_interrupted and content.response_id and content.text:
+        if content.response_id and content.text:
             self.emit(
                 "agent_speech_transcription_completed",
                 InputTranscription(
@@ -439,10 +439,12 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
                         // 2,
                     )
                     if self._opts.enable_agent_audio_transcription:
-                        self._agent_transcriber._push_audio(frame)
+                        content.audio.append(frame)
                     content.audio_stream.send_nowait(frame)
 
             if server_content.interrupted or server_content.turn_complete:
+                if self._opts.enable_agent_audio_transcription:
+                    self._agent_transcriber._push_audio(content.audio)
                 for stream in (content.text_stream, content.audio_stream):
                     if isinstance(stream, utils.aio.Chan):
                         stream.close()
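
This file's changes swap the agent-audio path from a persistent `TranscriberSession` to the new `ModelTranscriber`: each `rtc.AudioFrame` is buffered on `content.audio` and the whole turn is flushed when the server reports `interrupted` or `turn_complete`. A minimal sketch of the pattern, with stand-in types:

```python
# Accumulate-then-flush in miniature (stand-in types; the plugin buffers
# rtc.AudioFrame objects on content.audio rather than strings).
from typing import Callable

class TurnAudioBuffer:
    def __init__(self) -> None:
        self.audio: list[str] = []

    def on_frame(self, frame: str) -> None:
        # called per audio chunk while the agent is speaking
        self.audio.append(frame)

    def on_turn_end(self, push_audio: Callable[[list[str]], None]) -> None:
        # on interruption or turn completion, hand over the whole turn at once
        push_audio(self.audio)
        self.audio = []

buf = TurnAudioBuffer()
for chunk in ("chunk-1", "chunk-2", "chunk-3"):
    buf.on_frame(chunk)
buf.on_turn_end(lambda frames: print(f"transcribing {len(frames)} chunks"))
```

Flushing interrupted turns as well lines up with dropping the `_is_interrupted` guard in `_on_agent_speech_done`.
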

--- livekit/plugins/google/beta/realtime/transcriber.py
+++ livekit/plugins/google/beta/realtime/transcriber.py
@@ -7,24 +7,21 @@ from typing import Literal
 
 import websockets
 from livekit import rtc
-from livekit.agents import utils
+from livekit.agents import APIConnectionError, APIStatusError, utils
 
 from google import genai
 from google.genai import types
+from google.genai.errors import APIError, ClientError, ServerError
 
 from ...log import logger
 from .api_proto import ClientEvents, LiveAPIModels
 
-EventTypes = Literal[
-    "input_speech_started",
-    "input_speech_done",
-]
+EventTypes = Literal["input_speech_started", "input_speech_done"]
 
 DEFAULT_LANGUAGE = "English"
 
 SYSTEM_INSTRUCTIONS = f"""
 You are an **Audio Transcriber**. Your task is to convert audio content into accurate and precise text.
-
 - Transcribe verbatim; exclude non-speech sounds.
 - Provide only transcription; no extra text or explanations.
 - If audio is unclear, respond with: `...`
@@ -32,7 +29,6 @@ You are an **Audio Transcriber**. Your task is to convert audio content into acc
 - Use proper punctuation and formatting.
 - Do not add explanations, comments, or extra information.
 - Do not include timestamps, speaker labels, or annotations unless specified.
-
 - Audio Language: {DEFAULT_LANGUAGE}
 """
 
@@ -44,30 +40,24 @@ class TranscriptionContent:
 
 
 class TranscriberSession(utils.EventEmitter[EventTypes]):
-    def __init__(
-        self,
-        *,
-        client: genai.Client,
-        model: LiveAPIModels | str,
-    ):
-        """
-        Initializes a TranscriberSession instance for interacting with Google's Realtime API.
-        """
+    """
+    Handles live audio transcription using the realtime API.
+    """
+
+    def __init__(self, *, client: genai.Client, model: LiveAPIModels | str):
         super().__init__()
         self._client = client
         self._model = model
         self._needed_sr = 16000
         self._closed = False
+
         system_instructions = types.Content(
             parts=[types.Part(text=SYSTEM_INSTRUCTIONS)]
         )
-
         self._config = types.LiveConnectConfig(
             response_modalities=["TEXT"],
             system_instruction=system_instructions,
-            generation_config=types.GenerationConfig(
-                temperature=0.0,
-            ),
+            generation_config=types.GenerationConfig(temperature=0.0),
         )
         self._main_atask = asyncio.create_task(
             self._main_task(), name="gemini-realtime-transcriber"
@@ -187,6 +177,93 @@ class TranscriberSession(utils.EventEmitter[EventTypes]):
         await self._session.close()
 
 
+class ModelTranscriber(utils.EventEmitter[EventTypes]):
+    """
+    Transcribes agent audio using model generation.
+    """
+
+    def __init__(self, *, client: genai.Client, model: LiveAPIModels | str):
+        super().__init__()
+        self._client = client
+        self._model = model
+        self._needed_sr = 16000
+        self._system_instructions = types.Content(
+            parts=[types.Part(text=SYSTEM_INSTRUCTIONS)]
+        )
+        self._config = types.GenerateContentConfig(
+            temperature=0.0,
+            system_instruction=self._system_instructions,
+            # TODO: add response_schem
+        )
+        self._resampler: rtc.AudioResampler | None = None
+        self._buffer: rtc.AudioFrame | None = None
+        self._audio_ch = utils.aio.Chan[rtc.AudioFrame]()
+        self._main_atask = asyncio.create_task(
+            self._main_task(), name="gemini-model-transcriber"
+        )
+
+    async def aclose(self) -> None:
+        if self._audio_ch.closed:
+            return
+        self._audio_ch.close()
+        await self._main_atask
+
+    def _push_audio(self, frames: list[rtc.AudioFrame]) -> None:
+        if not frames:
+            return
+
+        buffer = utils.merge_frames(frames)
+
+        if buffer.sample_rate != self._needed_sr:
+            if self._resampler is None:
+                self._resampler = rtc.AudioResampler(
+                    input_rate=buffer.sample_rate,
+                    output_rate=self._needed_sr,
+                    quality=rtc.AudioResamplerQuality.HIGH,
+                )
+
+            buffer = utils.merge_frames(self._resampler.push(buffer))
+
+        self._audio_ch.send_nowait(buffer)
+
+    @utils.log_exceptions(logger=logger)
+    async def _main_task(self):
+        request_id = utils.shortuuid()
+        try:
+            async for buffer in self._audio_ch:
+                # TODO: stream content for better latency
+                response = await self._client.aio.models.generate_content(
+                    model=self._model,
+                    contents=[
+                        types.Content(
+                            parts=[
+                                types.Part(text=SYSTEM_INSTRUCTIONS),
+                                types.Part.from_bytes(
+                                    data=buffer.to_wav_bytes(),
+                                    mime_type="audio/wav",
+                                ),
+                            ],
+                            role="user",
+                        )
+                    ],
+                    config=self._config,
+                )
+                content = TranscriptionContent(
+                    response_id=request_id, text=clean_transcription(response.text)
+                )
+                self.emit("input_speech_done", content)
+
+        except (ClientError, ServerError, APIError) as e:
+            raise APIStatusError(
+                f"model transcriber error: {e}",
+                status_code=e.code,
+                body=e.message,
+                request_id=request_id,
+            ) from e
+        except Exception as e:
+            raise APIConnectionError("Error generating transcription") from e
+
+
 def clean_transcription(text: str) -> str:
     text = text.replace("\n", " ")
     text = re.sub(r"\s+", " ", text)
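
The new `ModelTranscriber` replaces the websocket session with one `generate_content` request per turn: buffered frames are merged, resampled to 16 kHz when needed, and sent as WAV bytes alongside the transcription instructions. A hypothetical usage sketch (`_push_audio` is internal plugin API, the sleep is a crude stand-in for real lifecycle handling, and `genai.Client()` is assumed to pick up `GOOGLE_API_KEY` from the environment):

```python
import asyncio

from google import genai
from livekit import rtc
from livekit.plugins.google.beta.realtime.transcriber import ModelTranscriber

async def main() -> None:
    client = genai.Client()  # assumes GOOGLE_API_KEY is set in the environment
    transcriber = ModelTranscriber(client=client, model="gemini-2.0-flash-001")
    transcriber.on("input_speech_done", lambda c: print(c.response_id, c.text))

    # One second of 16 kHz mono silence as a stand-in for buffered agent audio
    silence = rtc.AudioFrame(
        data=bytes(2 * 16000),  # 16-bit PCM, 2 bytes per sample
        sample_rate=16000,
        num_channels=1,
        samples_per_channel=16000,
    )
    transcriber._push_audio([silence])  # internal API, called by the session
    await asyncio.sleep(5)  # crude wait for the generate_content round trip
    await transcriber.aclose()

asyncio.run(main())
```

Note that `request_id` is generated once per `_main_task`, so every turn transcribed by one instance reports the same `response_id`.
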

--- livekit/plugins/google/llm.py
+++ livekit/plugins/google/llm.py
@@ -27,7 +27,7 @@ from livekit.agents import (
     llm,
     utils,
 )
-from livekit.agents.llm import ToolChoice, _create_ai_function_info
+from livekit.agents.llm import LLMCapabilities, ToolChoice, _create_ai_function_info
 from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS, APIConnectOptions
 
 from google import genai
@@ -60,7 +60,7 @@ class LLM(llm.LLM):
     def __init__(
         self,
         *,
-        model: ChatModels | str = "gemini-2.0-flash-exp",
+        model: ChatModels | str = "gemini-2.0-flash-001",
         api_key: str | None = None,
         vertexai: bool = False,
         project: str | None = None,
@@ -85,7 +85,7 @@ class LLM(llm.LLM):
         - For Google Gemini API: Set the `api_key` argument or the `GOOGLE_API_KEY` environment variable.
 
         Args:
-            model (ChatModels | str, optional): The model name to use. Defaults to "gemini-2.0-flash-exp".
+            model (ChatModels | str, optional): The model name to use. Defaults to "gemini-2.0-flash-001".
             api_key (str, optional): The API key for Google Gemini. If not provided, it attempts to read from the `GOOGLE_API_KEY` environment variable.
             vertexai (bool, optional): Whether to use VertexAI. Defaults to False.
             project (str, optional): The Google Cloud project to use (only for VertexAI). Defaults to None.
@@ -99,8 +99,12 @@ class LLM(llm.LLM):
             frequency_penalty (float, optional): Penalizes the model for repeating words. Defaults to None.
             tool_choice (ToolChoice or Literal["auto", "required", "none"], optional): Specifies whether to use tools during response generation. Defaults to "auto".
         """
-        super().__init__()
-        self._capabilities = llm.LLMCapabilities(supports_choices_on_int=False)
+        super().__init__(
+            capabilities=LLMCapabilities(
+                supports_choices_on_int=False,
+                requires_persistent_functions=False,
+            )
+        )
         self._project_id = project or os.environ.get("GOOGLE_CLOUD_PROJECT", None)
         self._location = location or os.environ.get(
             "GOOGLE_CLOUD_LOCATION", "us-central1"

--- livekit/plugins/google/models.py
+++ livekit/plugins/google/models.py
@@ -94,4 +94,9 @@ Gender = Literal["male", "female", "neutral"]
 
 AudioEncoding = Literal["wav", "mp3", "ogg", "mulaw", "alaw", "linear16"]
 
-ChatModels = Literal["gemini-2.0-flash-exp", "gemini-1.5-pro"]
+ChatModels = Literal[
+    "gemini-2.0-flash-001",
+    "gemini-2.0-flash-lite-preview-02-05",
+    "gemini-2.0-pro-exp-02-05",
+    "gemini-1.5-pro",
+]
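
The chat-model literal drops the retired experimental name and adds the stable flash release plus two February previews. As with the realtime alias, `LLM(model=...)` is typed `ChatModels | str`, so unlisted names still pass; a hypothetical construction sketch (assuming the plugin re-exports `LLM` at its top level and credentials are configured):

```python
# Hypothetical sketch: any of the new literal members, or a raw string,
# can be passed to the plugin's LLM wrapper. Both constructions assume
# GOOGLE_API_KEY (or Vertex credentials) are available.
from livekit.plugins.google import LLM

llm = LLM(model="gemini-2.0-pro-exp-02-05")  # new literal member
legacy = LLM(model="gemini-2.0-flash-exp")   # plain str still accepted
```
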

--- livekit/plugins/google/stt.py
+++ livekit/plugins/google/stt.py
@@ -139,23 +139,26 @@ class STT(stt.STT):
         self._streams = weakref.WeakSet[SpeechStream]()
 
     def _ensure_client(self) -> SpeechAsyncClient:
+        # Add support for passing a specific location that matches recognizer
+        # see: https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages
+        client_options = None
+        if self._location != "global":
+            client_options = ClientOptions(
+                api_endpoint=f"{self._location}-speech.googleapis.com"
+            )
         if self._credentials_info:
             self._client = SpeechAsyncClient.from_service_account_info(
-                self._credentials_info
+                self._credentials_info,
+                client_options=client_options,
             )
         elif self._credentials_file:
             self._client = SpeechAsyncClient.from_service_account_file(
-                self._credentials_file
+                self._credentials_file,
+                client_options=client_options,
             )
-        elif self._location == "global":
-            self._client = SpeechAsyncClient()
         else:
-            # Add support for passing a specific location that matches recognizer
-            # see: https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages
             self._client = SpeechAsyncClient(
-                client_options=ClientOptions(
-                    api_endpoint=f"{self._location}-speech.googleapis.com"
-                )
+                client_options=client_options,
             )
         assert self._client is not None
         return self._client
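
The regional-endpoint logic is hoisted so that every construction path, including the service-account ones that previously ignored it, honors a non-`"global"` location. A sketch of the rule this hunk centralizes:

```python
# Sketch of the endpoint rule now shared by all three construction paths:
# "global" keeps the default endpoint, anything else gets a regional host.
from google.api_core.client_options import ClientOptions

def speech_client_options(location: str) -> ClientOptions | None:
    if location == "global":
        return None  # default: speech.googleapis.com
    return ClientOptions(api_endpoint=f"{location}-speech.googleapis.com")

opts = speech_client_options("us-central1")
assert opts is not None and opts.api_endpoint == "us-central1-speech.googleapis.com"
assert speech_client_options("global") is None
```
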

--- livekit/plugins/google/version.py
+++ livekit/plugins/google/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.10.2"
+__version__ = "0.10.3"

--- livekit_plugins_google-0.10.2.dist-info/METADATA
+++ livekit_plugins_google-0.10.3.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: livekit-plugins-google
-Version: 0.10.2
+Version: 0.10.3
 Summary: Agent Framework plugin for services from Google Cloud
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0

--- /dev/null
+++ livekit_plugins_google-0.10.3.dist-info/RECORD
@@ -0,0 +1,18 @@
+livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
+livekit/plugins/google/_utils.py,sha256=mjsqblhGMgAZ2MNPisAVkNsqq4gfO6vvprEKzAGoVwE,7248
+livekit/plugins/google/llm.py,sha256=TVTerAabIf10AKVZr-Kn13eajhQ9RV7K4xaVD771yHU,16547
+livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
+livekit/plugins/google/models.py,sha256=Q47z_tIwLCufxhJyJHH7_1bo4xdBYZBSkkvMeycuItg,1493
+livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+livekit/plugins/google/stt.py,sha256=zl5B8MroarvoBbOmSK5YzC1d3GJeltkpv4Y0n2XLoVE,21203
+livekit/plugins/google/tts.py,sha256=95qXCigVQYWNbcN3pIKBpIah4b31U_MWtXv5Ji0AMc4,9229
+livekit/plugins/google/version.py,sha256=k8ij2VzlolcsqiNUU1AriNVHljCjUQz0tYetVwc1gH0,601
+livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
+livekit/plugins/google/beta/realtime/__init__.py,sha256=sGTn6JFNyA30QUXBZ_BV3l2eHpGAzR35ByXxg77vWNU,205
+livekit/plugins/google/beta/realtime/api_proto.py,sha256=ralrRZqIbE71oyuLKRYaXHvm6tcHMwBJueKvSO8Xfus,658
+livekit/plugins/google/beta/realtime/realtime_api.py,sha256=3k2yJ-avbkyDBH3MKlCuBi0xiho003LHxCiYCsCXpg4,21281
+livekit/plugins/google/beta/realtime/transcriber.py,sha256=ZpKA3F8dqOtJPDlPiAgjw0AUDBIuhQiBVnvSYL4cdBg,9796
+livekit_plugins_google-0.10.3.dist-info/METADATA,sha256=kWXttBYbuIpMxR3KwJMchDcNn7OASsguQ_Sctm0t0Lw,2058
+livekit_plugins_google-0.10.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+livekit_plugins_google-0.10.3.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
+livekit_plugins_google-0.10.3.dist-info/RECORD,,

--- livekit_plugins_google-0.10.2.dist-info/RECORD
+++ /dev/null
@@ -1,18 +0,0 @@
-livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
-livekit/plugins/google/_utils.py,sha256=mjsqblhGMgAZ2MNPisAVkNsqq4gfO6vvprEKzAGoVwE,7248
-livekit/plugins/google/llm.py,sha256=o9EJBv3rS5vKRq7m5YjSSqOxtH6pPekxRS_lra35hzk,16445
-livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
-livekit/plugins/google/models.py,sha256=w_qmOk5y86vjtszDiGpP9p0ctjQeaB8-UzqprxgpvCY,1407
-livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-livekit/plugins/google/stt.py,sha256=FA6Lpeb8QvRXLzkQ7cjsoMxHdtEGwHWkpN_TKqAdKAQ,21097
-livekit/plugins/google/tts.py,sha256=95qXCigVQYWNbcN3pIKBpIah4b31U_MWtXv5Ji0AMc4,9229
-livekit/plugins/google/version.py,sha256=jklx55q_NtxoIUiYD5AFOO11S_Jij8P491Y8nkw-VZk,601
-livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
-livekit/plugins/google/beta/realtime/__init__.py,sha256=sGTn6JFNyA30QUXBZ_BV3l2eHpGAzR35ByXxg77vWNU,205
-livekit/plugins/google/beta/realtime/api_proto.py,sha256=9EhmwgeIgKDqdSijv5Q9pgx7UhAakK02ZDwbnUsra_o,657
-livekit/plugins/google/beta/realtime/realtime_api.py,sha256=OwNoPmmomMtRkmYw-g2u7hIYpeIrSSNky7FlcHBVyFQ,21150
-livekit/plugins/google/beta/realtime/transcriber.py,sha256=JnZ75NyiOLkpvQ5N2nDniumDKcrjiq_tlryiLbuBoDM,6658
-livekit_plugins_google-0.10.2.dist-info/METADATA,sha256=dTBdAuYpGyCFVJNw0c8upUEdaFgdodWwrm1bB3a4Xp4,2058
-livekit_plugins_google-0.10.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-livekit_plugins_google-0.10.2.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
-livekit_plugins_google-0.10.2.dist-info/RECORD,,