livekit-plugins-google 0.7.3__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
livekit/plugins/google/__init__.py
@@ -12,12 +12,12 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

+ from . import beta
  from .stt import STT, SpeechStream
  from .tts import TTS
  from .version import __version__

- __all__ = ["STT", "TTS", "SpeechStream", "__version__"]
-
+ __all__ = ["STT", "TTS", "SpeechStream", "__version__", "beta"]
  from livekit.agents import Plugin

  from .log import logger

livekit/plugins/google/beta/__init__.py (new file)
@@ -0,0 +1,3 @@
+ from . import realtime
+
+ __all__ = ["realtime"]

livekit/plugins/google/beta/realtime/__init__.py (new file)
@@ -0,0 +1,15 @@
+ from .api_proto import (
+     ClientEvents,
+     LiveAPIModels,
+     ResponseModality,
+     Voice,
+ )
+ from .realtime_api import RealtimeModel
+
+ __all__ = [
+     "RealtimeModel",
+     "ClientEvents",
+     "LiveAPIModels",
+     "ResponseModality",
+     "Voice",
+ ]

livekit/plugins/google/beta/realtime/api_proto.py (new file)
@@ -0,0 +1,79 @@
+ from __future__ import annotations
+
+ import inspect
+ from typing import Any, Dict, List, Literal, Sequence, Union
+
+ from google.genai import types  # type: ignore
+
+ LiveAPIModels = Literal["gemini-2.0-flash-exp"]
+
+ Voice = Literal["Puck", "Charon", "Kore", "Fenrir", "Aoede"]
+ ResponseModality = Literal["AUDIO", "TEXT"]
+
+
+ ClientEvents = Union[
+     types.ContentListUnion,
+     types.ContentListUnionDict,
+     types.LiveClientContentOrDict,
+     types.LiveClientRealtimeInput,
+     types.LiveClientRealtimeInputOrDict,
+     types.LiveClientToolResponseOrDict,
+     types.FunctionResponseOrDict,
+     Sequence[types.FunctionResponseOrDict],
+ ]
+
+
+ JSON_SCHEMA_TYPE_MAP = {
+     str: "string",
+     int: "integer",
+     float: "number",
+     bool: "boolean",
+     dict: "object",
+     list: "array",
+ }
+
+
+ def _build_parameters(arguments: Dict[str, Any]) -> types.SchemaDict:
+     properties: Dict[str, types.SchemaDict] = {}
+     required: List[str] = []
+
+     for arg_name, arg_info in arguments.items():
+         py_type = arg_info.type
+         if py_type not in JSON_SCHEMA_TYPE_MAP:
+             raise ValueError(f"Unsupported type: {py_type}")
+
+         prop: types.SchemaDict = {
+             "type": JSON_SCHEMA_TYPE_MAP[py_type],
+             "description": arg_info.description,
+         }
+
+         if arg_info.choices:
+             prop["enum"] = arg_info.choices
+
+         properties[arg_name] = prop
+
+         if arg_info.default is inspect.Parameter.empty:
+             required.append(arg_name)
+
+     parameters: types.SchemaDict = {"type": "object", "properties": properties}
+
+     if required:
+         parameters["required"] = required
+
+     return parameters
+
+
+ def _build_tools(fnc_ctx: Any) -> List[types.FunctionDeclarationDict]:
+     function_declarations: List[types.FunctionDeclarationDict] = []
+     for fnc_info in fnc_ctx.ai_functions.values():
+         parameters = _build_parameters(fnc_info.arguments)
+
+         func_decl: types.FunctionDeclarationDict = {
+             "name": fnc_info.name,
+             "description": fnc_info.description,
+             "parameters": parameters,
+         }
+
+         function_declarations.append(func_decl)
+
+     return function_declarations
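
Note (for illustration only, not part of the package): given a hypothetical AI function registered in the livekit-agents function context that takes a single required string argument named "location", _build_parameters above would produce a schema dict of roughly this shape. The arg_info fields (type, description, choices, default) are taken from the function context shown above:

    # Hypothetical example of the schema produced for one required string
    # argument named "location" with no enum choices.
    expected_parameters = {
        "type": "object",
        "properties": {
            "location": {
                "type": "string",
                "description": "The city to look up",
            },
        },
        "required": ["location"],
    }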

livekit/plugins/google/beta/realtime/realtime_api.py (new file)
@@ -0,0 +1,424 @@
+ from __future__ import annotations
+
+ import asyncio
+ import base64
+ import json
+ import os
+ from dataclasses import dataclass
+ from typing import AsyncIterable, Literal
+
+ from livekit import rtc
+ from livekit.agents import llm, utils
+ from livekit.agents.llm.function_context import _create_ai_function_info
+
+ from google import genai  # type: ignore
+ from google.genai.types import (  # type: ignore
+     FunctionResponse,
+     GenerationConfigDict,
+     LiveClientToolResponse,
+     LiveConnectConfigDict,
+     PrebuiltVoiceConfig,
+     SpeechConfig,
+     VoiceConfig,
+ )
+
+ from ...log import logger
+ from .api_proto import (
+     ClientEvents,
+     LiveAPIModels,
+     ResponseModality,
+     Voice,
+     _build_tools,
+ )
+
+ EventTypes = Literal[
+     "start_session",
+     "input_speech_started",
+     "response_content_added",
+     "response_content_done",
+     "function_calls_collected",
+     "function_calls_finished",
+     "function_calls_cancelled",
+ ]
+
+
+ @dataclass
+ class GeminiContent:
+     response_id: str
+     item_id: str
+     output_index: int
+     content_index: int
+     text: str
+     audio: list[rtc.AudioFrame]
+     text_stream: AsyncIterable[str]
+     audio_stream: AsyncIterable[rtc.AudioFrame]
+     content_type: Literal["text", "audio"]
+
+
+ @dataclass
+ class Capabilities:
+     supports_truncate: bool
+
+
+ @dataclass
+ class ModelOptions:
+     model: LiveAPIModels | str
+     api_key: str | None
+     voice: Voice | str
+     response_modalities: ResponseModality
+     vertexai: bool
+     project: str | None
+     location: str | None
+     candidate_count: int
+     temperature: float | None
+     max_output_tokens: int | None
+     top_p: float | None
+     top_k: int | None
+     presence_penalty: float | None
+     frequency_penalty: float | None
+     instructions: str
+
+
+ class RealtimeModel:
+     def __init__(
+         self,
+         *,
+         instructions: str = "",
+         model: LiveAPIModels | str = "gemini-2.0-flash-exp",
+         api_key: str | None = None,
+         voice: Voice | str = "Puck",
+         modalities: ResponseModality = "AUDIO",
+         vertexai: bool = False,
+         project: str | None = None,
+         location: str | None = None,
+         candidate_count: int = 1,
+         temperature: float | None = None,
+         max_output_tokens: int | None = None,
+         top_p: float | None = None,
+         top_k: int | None = None,
+         presence_penalty: float | None = None,
+         frequency_penalty: float | None = None,
+         loop: asyncio.AbstractEventLoop | None = None,
+     ):
+         """
+         Initializes a RealtimeModel instance for interacting with Google's Realtime API.
+
+         Args:
+             instructions (str, optional): Initial system instructions for the model. Defaults to "".
+             api_key (str or None, optional): Google API key. If None, it will be read from the GOOGLE_API_KEY environment variable.
+             modalities (ResponseModality, optional): Response modality to use, either "AUDIO" or "TEXT". Defaults to "AUDIO".
+             model (str or None, optional): The name of the model to use. Defaults to "gemini-2.0-flash-exp".
+             voice (api_proto.Voice, optional): Voice setting for audio outputs. Defaults to "Puck".
+             temperature (float or None, optional): Sampling temperature for response generation. Defaults to None.
+             vertexai (bool, optional): Whether to use VertexAI for the API. Defaults to False.
+             project (str or None, optional): The project to use for the API. Defaults to None. (for vertexai)
+             location (str or None, optional): The location to use for the API. Defaults to None. (for vertexai)
+             candidate_count (int, optional): The number of candidate responses to generate. Defaults to 1.
+             top_p (float, optional): The top-p value for response generation
+             top_k (int, optional): The top-k value for response generation
+             presence_penalty (float, optional): The presence penalty for response generation
+             frequency_penalty (float, optional): The frequency penalty for response generation
+             loop (asyncio.AbstractEventLoop or None, optional): Event loop to use for async operations. If None, the current event loop is used.
+
+         Raises:
+             ValueError: If the API key is not provided and cannot be found in environment variables.
+         """
+         super().__init__()
+         self._capabilities = Capabilities(
+             supports_truncate=False,
+         )
+         self._model = model
+         self._loop = loop or asyncio.get_event_loop()
+         self._api_key = api_key or os.environ.get("GOOGLE_API_KEY")
+         self._vertexai = vertexai
+         self._project_id = project or os.environ.get("GOOGLE_PROJECT")
+         self._location = location or os.environ.get("GOOGLE_LOCATION")
+         if self._api_key is None and not self._vertexai:
+             raise ValueError("GOOGLE_API_KEY is not set")
+
+         self._rt_sessions: list[GeminiRealtimeSession] = []
+         self._opts = ModelOptions(
+             model=model,
+             api_key=api_key,
+             voice=voice,
+             response_modalities=modalities,
+             vertexai=vertexai,
+             project=project,
+             location=location,
+             candidate_count=candidate_count,
+             temperature=temperature,
+             max_output_tokens=max_output_tokens,
+             top_p=top_p,
+             top_k=top_k,
+             presence_penalty=presence_penalty,
+             frequency_penalty=frequency_penalty,
+             instructions=instructions,
+         )
+
+     @property
+     def sessions(self) -> list[GeminiRealtimeSession]:
+         return self._rt_sessions
+
+     @property
+     def capabilities(self) -> Capabilities:
+         return self._capabilities
+
+     def session(
+         self,
+         *,
+         chat_ctx: llm.ChatContext | None = None,
+         fnc_ctx: llm.FunctionContext | None = None,
+     ) -> GeminiRealtimeSession:
+         session = GeminiRealtimeSession(
+             opts=self._opts,
+             chat_ctx=chat_ctx or llm.ChatContext(),
+             fnc_ctx=fnc_ctx,
+             loop=self._loop,
+         )
+         self._rt_sessions.append(session)
+
+         return session
+
+     async def aclose(self) -> None:
+         for session in self._rt_sessions:
+             await session.aclose()
+
+
+ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
+     def __init__(
+         self,
+         *,
+         opts: ModelOptions,
+         chat_ctx: llm.ChatContext,
+         fnc_ctx: llm.FunctionContext | None,
+         loop: asyncio.AbstractEventLoop,
+     ):
+         """
+         Initializes a GeminiRealtimeSession instance for interacting with Google's Realtime API.
+
+         Args:
+             opts (ModelOptions): The model options for the session.
+             chat_ctx (llm.ChatContext): The chat context for the session.
+             fnc_ctx (llm.FunctionContext or None): The function context for the session.
+             loop (asyncio.AbstractEventLoop): The event loop for the session.
+         """
+         super().__init__()
+         self._loop = loop
+         self._opts = opts
+         self._chat_ctx = chat_ctx
+         self._fnc_ctx = fnc_ctx
+         self._fnc_tasks = utils.aio.TaskSet()
+
+         tools = []
+         if self._fnc_ctx is not None:
+             functions = _build_tools(self._fnc_ctx)
+             tools.append({"function_declarations": functions})
+
+         self._config = LiveConnectConfigDict(
+             model=self._opts.model,
+             response_modalities=self._opts.response_modalities,
+             generation_config=GenerationConfigDict(
+                 candidate_count=self._opts.candidate_count,
+                 temperature=self._opts.temperature,
+                 max_output_tokens=self._opts.max_output_tokens,
+                 top_p=self._opts.top_p,
+                 top_k=self._opts.top_k,
+                 presence_penalty=self._opts.presence_penalty,
+                 frequency_penalty=self._opts.frequency_penalty,
+             ),
+             system_instruction=self._opts.instructions,
+             speech_config=SpeechConfig(
+                 voice_config=VoiceConfig(
+                     prebuilt_voice_config=PrebuiltVoiceConfig(
+                         voice_name=self._opts.voice
+                     )
+                 )
+             ),
+             tools=tools,
+         )
+         self._client = genai.Client(
+             http_options={"api_version": "v1alpha"},
+             api_key=self._opts.api_key,
+             vertexai=self._opts.vertexai,
+             project=self._opts.project,
+             location=self._opts.location,
+         )
+         self._main_atask = asyncio.create_task(
+             self._main_task(), name="gemini-realtime-session"
+         )
+         # dummy task to wait for the session to be initialized # TODO: sync chat ctx
+         self._init_sync_task = asyncio.create_task(
+             asyncio.sleep(0), name="gemini-realtime-session-init"
+         )
+         self._send_ch = utils.aio.Chan[ClientEvents]()
+         self._active_response_id = None
+
+     async def aclose(self) -> None:
+         if self._send_ch.closed:
+             return
+
+         self._send_ch.close()
+         await self._main_atask
+
+     @property
+     def fnc_ctx(self) -> llm.FunctionContext | None:
+         return self._fnc_ctx
+
+     @fnc_ctx.setter
+     def fnc_ctx(self, value: llm.FunctionContext | None) -> None:
+         self._fnc_ctx = value
+
+     def _push_audio(self, frame: rtc.AudioFrame) -> None:
+         data = base64.b64encode(frame.data).decode("utf-8")
+         self._queue_msg({"mime_type": "audio/pcm", "data": data})
+
+     def _queue_msg(self, msg: dict) -> None:
+         self._send_ch.send_nowait(msg)
+
+     def chat_ctx_copy(self) -> llm.ChatContext:
+         return self._chat_ctx.copy()
+
+     async def set_chat_ctx(self, ctx: llm.ChatContext) -> None:
+         self._chat_ctx = ctx.copy()
+
+     @utils.log_exceptions(logger=logger)
+     async def _main_task(self):
+         @utils.log_exceptions(logger=logger)
+         async def _send_task():
+             async for msg in self._send_ch:
+                 await self._session.send(msg)
+
+             await self._session.send(".", end_of_turn=True)
+
+         @utils.log_exceptions(logger=logger)
+         async def _recv_task():
+             while True:
+                 async for response in self._session.receive():
+                     if self._active_response_id is None:
+                         self._active_response_id = utils.shortuuid()
+                         text_stream = utils.aio.Chan[str]()
+                         audio_stream = utils.aio.Chan[rtc.AudioFrame]()
+                         content = GeminiContent(
+                             response_id=self._active_response_id,
+                             item_id=self._active_response_id,
+                             output_index=0,
+                             content_index=0,
+                             text="",
+                             audio=[],
+                             text_stream=text_stream,
+                             audio_stream=audio_stream,
+                             content_type=self._opts.response_modalities,
+                         )
+                         self.emit("response_content_added", content)
+
+                     server_content = response.server_content
+                     if server_content:
+                         model_turn = server_content.model_turn
+                         if model_turn:
+                             for part in model_turn.parts:
+                                 if part.text:
+                                     content.text_stream.send_nowait(part.text)
+                                 if part.inline_data:
+                                     frame = rtc.AudioFrame(
+                                         data=part.inline_data.data,
+                                         sample_rate=24000,
+                                         num_channels=1,
+                                         samples_per_channel=len(part.inline_data.data)
+                                         // 2,
+                                     )
+                                     content.audio_stream.send_nowait(frame)
+
+                         if server_content.interrupted or server_content.turn_complete:
+                             for stream in (content.text_stream, content.audio_stream):
+                                 if isinstance(stream, utils.aio.Chan):
+                                     stream.close()
+
+                             if server_content.interrupted:
+                                 self.emit("input_speech_started")
+                             elif server_content.turn_complete:
+                                 self.emit("response_content_done", content)
+
+                             self._active_response_id = None
+
+                     if response.tool_call:
+                         if self._fnc_ctx is None:
+                             raise ValueError("Function context is not set")
+                         fnc_calls = []
+                         for fnc_call in response.tool_call.function_calls:
+                             fnc_call_info = _create_ai_function_info(
+                                 self._fnc_ctx,
+                                 fnc_call.id,
+                                 fnc_call.name,
+                                 json.dumps(fnc_call.args),
+                             )
+                             fnc_calls.append(fnc_call_info)
+
+                         self.emit("function_calls_collected", fnc_calls)
+
+                         for fnc_call_info in fnc_calls:
+                             self._fnc_tasks.create_task(
+                                 self._run_fnc_task(fnc_call_info, content.item_id)
+                             )
+
+                     # Handle function call cancellations
+                     if response.tool_call_cancellation:
+                         logger.warning(
+                             "function call cancelled",
+                             extra={
+                                 "function_call_ids": response.tool_call_cancellation.function_call_ids,
+                             },
+                         )
+                         self.emit(
+                             "function_calls_cancelled",
+                             response.tool_call_cancellation.function_call_ids,
+                         )
+
+         async with self._client.aio.live.connect(
+             model=self._opts.model, config=self._config
+         ) as session:
+             self._session = session
+             tasks = [
+                 asyncio.create_task(_send_task(), name="gemini-realtime-send"),
+                 asyncio.create_task(_recv_task(), name="gemini-realtime-recv"),
+             ]
+
+             try:
+                 await asyncio.gather(*tasks)
+             finally:
+                 await utils.aio.gracefully_cancel(*tasks)
+                 await self._session.close()
+
+     @utils.log_exceptions(logger=logger)
+     async def _run_fnc_task(self, fnc_call_info: llm.FunctionCallInfo, item_id: str):
+         logger.debug(
+             "executing ai function",
+             extra={
+                 "function": fnc_call_info.function_info.name,
+             },
+         )
+
+         called_fnc = fnc_call_info.execute()
+         try:
+             await called_fnc.task
+         except Exception as e:
+             logger.exception(
+                 "error executing ai function",
+                 extra={
+                     "function": fnc_call_info.function_info.name,
+                 },
+                 exc_info=e,
+             )
+         tool_call = llm.ChatMessage.create_tool_from_called_function(called_fnc)
+         if tool_call.content is not None:
+             tool_response = LiveClientToolResponse(
+                 function_responses=[
+                     FunctionResponse(
+                         name=tool_call.name,
+                         id=tool_call.tool_call_id,
+                         response={"result": tool_call.content},
+                     )
+                 ]
+             )
+             await self._session.send(tool_response)
+
+         self.emit("function_calls_finished", [called_fnc])

livekit/plugins/google/models.py
@@ -3,7 +3,13 @@ from typing import Literal
  # Speech to Text v2

  SpeechModels = Literal[
-     "long", "short", "telephony", "medical_dictation", "medical_conversation", "chirp"
+     "long",
+     "short",
+     "telephony",
+     "medical_dictation",
+     "medical_conversation",
+     "chirp",
+     "chirp_2",
  ]

  SpeechLanguages = Literal[

livekit/plugins/google/stt.py
@@ -16,19 +16,23 @@ from __future__ import annotations

  import asyncio
  import dataclasses
+ import weakref
  from dataclasses import dataclass
- from typing import AsyncIterable, List, Union
+ from typing import List, Union

- from livekit import agents, rtc
+ from livekit import rtc
  from livekit.agents import (
+     DEFAULT_API_CONNECT_OPTIONS,
      APIConnectionError,
+     APIConnectOptions,
      APIStatusError,
      APITimeoutError,
      stt,
      utils,
  )

- from google.api_core.exceptions import Aborted, DeadlineExceeded, GoogleAPICallError
+ from google.api_core.client_options import ClientOptions
+ from google.api_core.exceptions import DeadlineExceeded, GoogleAPICallError
  from google.auth import default as gauth_default
  from google.auth.exceptions import DefaultCredentialsError
  from google.cloud.speech_v2 import SpeechAsyncClient
@@ -50,6 +54,7 @@ class STTOptions:
      punctuate: bool
      spoken_punctuation: bool
      model: SpeechModels
+     sample_rate: int
      keywords: List[tuple[str, float]] | None

      def build_adaptation(self) -> cloud_speech.SpeechAdaptation | None:
@@ -81,6 +86,8 @@ class STT(stt.STT):
          punctuate: bool = True,
          spoken_punctuation: bool = True,
          model: SpeechModels = "long",
+         location: str = "global",
+         sample_rate: int = 16000,
          credentials_info: dict | None = None,
          credentials_file: str | None = None,
          keywords: List[tuple[str, float]] | None = None,
@@ -97,6 +104,7 @@ class STT(stt.STT):
          )

          self._client: SpeechAsyncClient | None = None
+         self._location = location
          self._credentials_info = credentials_info
          self._credentials_file = credentials_file

@@ -120,8 +128,10 @@ class STT(stt.STT):
              punctuate=punctuate,
              spoken_punctuation=spoken_punctuation,
              model=model,
+             sample_rate=sample_rate,
              keywords=keywords,
          )
+         self._streams = weakref.WeakSet[SpeechStream]()

      def _ensure_client(self) -> SpeechAsyncClient:
          if self._credentials_info:
@@ -132,9 +142,16 @@ class STT(stt.STT):
              self._client = SpeechAsyncClient.from_service_account_file(
                  self._credentials_file
              )
-         else:
+         elif self._location == "global":
              self._client = SpeechAsyncClient()
-
+         else:
+             # Add support for passing a specific location that matches recognizer
+             # see: https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages
+             self._client = SpeechAsyncClient(
+                 client_options=ClientOptions(
+                     api_endpoint=f"{self._location}-speech.googleapis.com"
+                 )
+             )
          assert self._client is not None
          return self._client

@@ -150,7 +167,7 @@ class STT(stt.STT):
          from google.auth import default as ga_default

          _, project_id = ga_default()
-         return f"projects/{project_id}/locations/global/recognizers/_"
+         return f"projects/{project_id}/locations/{self._location}/recognizers/_"

      def _sanitize_options(self, *, language: str | None = None) -> STTOptions:
          config = dataclasses.replace(self._config)
@@ -173,10 +190,11 @@ class STT(stt.STT):
          self,
          buffer: utils.AudioBuffer,
          *,
-         language: SpeechLanguages | str | None = None,
+         language: SpeechLanguages | str | None,
+         conn_options: APIConnectOptions,
      ) -> stt.SpeechEvent:
          config = self._sanitize_options(language=language)
-         frame = agents.utils.merge_frames(buffer)
+         frame = rtc.combine_audio_frames(buffer)

          config = cloud_speech.RecognitionConfig(
              explicit_decoding_config=cloud_speech.ExplicitDecodingConfig(
@@ -200,7 +218,8 @@ class STT(stt.STT):
                      recognizer=self._recognizer,
                      config=config,
                      content=frame.data.tobytes(),
-                 )
+                 ),
+                 timeout=conn_options.timeout,
              )

              return _recognize_response_to_speech_event(raw)
@@ -217,154 +236,223 @@ class STT(stt.STT):
              raise APIConnectionError() from e

      def stream(
-         self, *, language: SpeechLanguages | str | None = None
+         self,
+         *,
+         language: SpeechLanguages | str | None = None,
+         conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
      ) -> "SpeechStream":
          config = self._sanitize_options(language=language)
-         return SpeechStream(self, self._ensure_client(), self._recognizer, config)
+         stream = SpeechStream(
+             stt=self,
+             client=self._ensure_client(),
+             recognizer=self._recognizer,
+             config=config,
+             conn_options=conn_options,
+         )
+         self._streams.add(stream)
+         return stream
+
+     def update_options(
+         self,
+         *,
+         languages: LanguageCode | None = None,
+         detect_language: bool | None = None,
+         interim_results: bool | None = None,
+         punctuate: bool | None = None,
+         spoken_punctuation: bool | None = None,
+         model: SpeechModels | None = None,
+         location: str | None = None,
+         keywords: List[tuple[str, float]] | None = None,
+     ):
+         if languages is not None:
+             if isinstance(languages, str):
+                 languages = [languages]
+             self._config.languages = languages
+         if detect_language is not None:
+             self._config.detect_language = detect_language
+         if interim_results is not None:
+             self._config.interim_results = interim_results
+         if punctuate is not None:
+             self._config.punctuate = punctuate
+         if spoken_punctuation is not None:
+             self._config.spoken_punctuation = spoken_punctuation
+         if model is not None:
+             self._config.model = model
+         if keywords is not None:
+             self._config.keywords = keywords
+
+         for stream in self._streams:
+             stream.update_options(
+                 languages=languages,
+                 detect_language=detect_language,
+                 interim_results=interim_results,
+                 punctuate=punctuate,
+                 spoken_punctuation=spoken_punctuation,
+                 model=model,
+                 location=location,
+                 keywords=keywords,
+             )


  class SpeechStream(stt.SpeechStream):
      def __init__(
          self,
+         *,
          stt: STT,
+         conn_options: APIConnectOptions,
          client: SpeechAsyncClient,
          recognizer: str,
          config: STTOptions,
-         sample_rate: int = 48000,
-         num_channels: int = 1,
-         max_retry: int = 32,
      ) -> None:
-         super().__init__(stt)
+         super().__init__(
+             stt=stt, conn_options=conn_options, sample_rate=config.sample_rate
+         )

          self._client = client
          self._recognizer = recognizer
          self._config = config
-         self._sample_rate = sample_rate
-         self._num_channels = num_channels
-         self._max_retry = max_retry
-
-         self._streaming_config = cloud_speech.StreamingRecognitionConfig(
-             config=cloud_speech.RecognitionConfig(
-                 explicit_decoding_config=cloud_speech.ExplicitDecodingConfig(
-                     encoding=cloud_speech.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
-                     sample_rate_hertz=self._sample_rate,
-                     audio_channel_count=self._num_channels,
-                 ),
-                 adaptation=config.build_adaptation(),
-                 language_codes=self._config.languages,
-                 model=self._config.model,
-                 features=cloud_speech.RecognitionFeatures(
-                     enable_automatic_punctuation=self._config.punctuate,
-                     enable_word_time_offsets=True,
-                 ),
-             ),
-             streaming_features=cloud_speech.StreamingRecognitionFeatures(
-                 enable_voice_activity_events=True,
-                 interim_results=self._config.interim_results,
-             ),
-         )
+         self._reconnect_event = asyncio.Event()

-     @utils.log_exceptions(logger=logger)
-     async def _main_task(self) -> None:
-         await self._run(self._max_retry)
-
-     async def _run(self, max_retry: int) -> None:
-         retry_count = 0
-         while self._input_ch.qsize() or not self._input_ch.closed:
+     def update_options(
+         self,
+         *,
+         languages: LanguageCode | None = None,
+         detect_language: bool | None = None,
+         interim_results: bool | None = None,
+         punctuate: bool | None = None,
+         spoken_punctuation: bool | None = None,
+         model: SpeechModels | None = None,
+         location: str | None = None,
+         keywords: List[tuple[str, float]] | None = None,
+     ):
+         if languages is not None:
+             if isinstance(languages, str):
+                 languages = [languages]
+             self._config.languages = languages
+         if detect_language is not None:
+             self._config.detect_language = detect_language
+         if interim_results is not None:
+             self._config.interim_results = interim_results
+         if punctuate is not None:
+             self._config.punctuate = punctuate
+         if spoken_punctuation is not None:
+             self._config.spoken_punctuation = spoken_punctuation
+         if model is not None:
+             self._config.model = model
+         if keywords is not None:
+             self._config.keywords = keywords
+
+         self._reconnect_event.set()
+
+     async def _run(self) -> None:
+         # google requires a async generator when calling streaming_recognize
+         # this function basically convert the queue into a async generator
+         async def input_generator():
              try:
-                 # google requires a async generator when calling streaming_recognize
-                 # this function basically convert the queue into a async generator
-                 async def input_generator():
-                     try:
-                         # first request should contain the config
+                 # first request should contain the config
+                 yield cloud_speech.StreamingRecognizeRequest(
+                     recognizer=self._recognizer,
+                     streaming_config=self._streaming_config,
+                 )
+
+                 async for frame in self._input_ch:
+                     if isinstance(frame, rtc.AudioFrame):
                          yield cloud_speech.StreamingRecognizeRequest(
-                             recognizer=self._recognizer,
-                             streaming_config=self._streaming_config,
+                             audio=frame.data.tobytes()
                          )

-                         async for frame in self._input_ch:
-                             if isinstance(frame, rtc.AudioFrame):
-                                 frame = frame.remix_and_resample(
-                                     self._sample_rate, self._num_channels
-                                 )
-                                 yield cloud_speech.StreamingRecognizeRequest(
-                                     audio=frame.data.tobytes()
-                                 )
+             except Exception:
+                 logger.exception(
+                     "an error occurred while streaming input to google STT"
+                 )
+
+         async def process_stream(stream):
+             async for resp in stream:
+                 if (
+                     resp.speech_event_type
+                     == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
+                 ):
+                     self._event_ch.send_nowait(
+                         stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH)
+                     )

-                     except Exception:
-                         logger.exception(
-                             "an error occurred while streaming input to google STT"
+                 if (
+                     resp.speech_event_type
+                     == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_EVENT_TYPE_UNSPECIFIED
+                 ):
+                     result = resp.results[0]
+                     speech_data = _streaming_recognize_response_to_speech_data(resp)
+                     if speech_data is None:
+                         continue
+
+                     if not result.is_final:
+                         self._event_ch.send_nowait(
+                             stt.SpeechEvent(
+                                 type=stt.SpeechEventType.INTERIM_TRANSCRIPT,
+                                 alternatives=[speech_data],
+                             )
+                         )
+                     else:
+                         self._event_ch.send_nowait(
+                             stt.SpeechEvent(
+                                 type=stt.SpeechEventType.FINAL_TRANSCRIPT,
+                                 alternatives=[speech_data],
+                             )
                          )

-                 # try to connect
-                 stream = await self._client.streaming_recognize(
-                     requests=input_generator()
-                 )
-                 retry_count = 0  # connection successful, reset retry count
-
-                 await self._run_stream(stream)
-             except Aborted:
-                 logger.error("google stt connection aborted")
-                 break
-             except Exception as e:
-                 if retry_count >= max_retry:
-                     logger.error(
-                         f"failed to connect to google stt after {max_retry} tries",
-                         exc_info=e,
+                 if (
+                     resp.speech_event_type
+                     == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END
+                 ):
+                     self._event_ch.send_nowait(
+                         stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH)
                      )
-                     break

-                 retry_delay = min(retry_count * 2, 5)  # max 5s
-                 retry_count += 1
-                 logger.warning(
-                     f"google stt connection failed, retrying in {retry_delay}s",
-                     exc_info=e,
+         while True:
+             try:
+                 self._streaming_config = cloud_speech.StreamingRecognitionConfig(
+                     config=cloud_speech.RecognitionConfig(
+                         explicit_decoding_config=cloud_speech.ExplicitDecodingConfig(
+                             encoding=cloud_speech.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
+                             sample_rate_hertz=self._config.sample_rate,
+                             audio_channel_count=1,
+                         ),
+                         adaptation=self._config.build_adaptation(),
+                         language_codes=self._config.languages,
+                         model=self._config.model,
+                         features=cloud_speech.RecognitionFeatures(
+                             enable_automatic_punctuation=self._config.punctuate,
+                             enable_word_time_offsets=True,
+                         ),
+                     ),
+                     streaming_features=cloud_speech.StreamingRecognitionFeatures(
+                         enable_voice_activity_events=True,
+                         interim_results=self._config.interim_results,
+                     ),
                  )
-                 await asyncio.sleep(retry_delay)

-     async def _run_stream(
-         self, stream: AsyncIterable[cloud_speech.StreamingRecognizeResponse]
-     ):
-         async for resp in stream:
-             if (
-                 resp.speech_event_type
-                 == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
-             ):
-                 self._event_ch.send_nowait(
-                     stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH)
+                 stream = await self._client.streaming_recognize(
+                     requests=input_generator(),
                  )

-             if (
-                 resp.speech_event_type
-                 == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_EVENT_TYPE_UNSPECIFIED
-             ):
-                 result = resp.results[0]
-                 speech_data = _streaming_recognize_response_to_speech_data(resp)
-                 if speech_data is None:
-                     continue
-
-                 if not result.is_final:
-                     self._event_ch.send_nowait(
-                         stt.SpeechEvent(
-                             type=stt.SpeechEventType.INTERIM_TRANSCRIPT,
-                             alternatives=[speech_data],
-                         )
+                 process_stream_task = asyncio.create_task(process_stream(stream))
+                 wait_reconnect_task = asyncio.create_task(self._reconnect_event.wait())
+                 try:
+                     done, _ = await asyncio.wait(
+                         [process_stream_task, wait_reconnect_task],
+                         return_when=asyncio.FIRST_COMPLETED,
                      )
-                 else:
-                     self._event_ch.send_nowait(
-                         stt.SpeechEvent(
-                             type=stt.SpeechEventType.FINAL_TRANSCRIPT,
-                             alternatives=[speech_data],
-                         )
+                     for task in done:
+                         if task != wait_reconnect_task:
+                             task.result()
+                 finally:
+                     await utils.aio.gracefully_cancel(
+                         process_stream_task, wait_reconnect_task
                      )
-
-             if (
-                 resp.speech_event_type
-                 == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END
-             ):
-                 self._event_ch.send_nowait(
-                     stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH)
-                 )
+             finally:
+                 if not self._reconnect_event.is_set():
+                     break
+                 self._reconnect_event.clear()


  def _recognize_response_to_speech_event(
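
Note (a usage sketch, not part of the package): it shows the options added to the STT class in this release; the argument values are illustrative only:

    from livekit.plugins.google import STT

    stt = STT(
        model="chirp_2",          # new model literal added in models.py
        location="us-central1",   # non-"global" locations route to <location>-speech.googleapis.com
        sample_rate=16000,        # streams are now created at this sample rate
    )

    stream = stt.stream()  # conn_options defaults to DEFAULT_API_CONNECT_OPTIONS

    # Options can be changed after construction; open streams reconnect with the new settings.
    stt.update_options(languages=["en-US"], punctuate=True)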

livekit/plugins/google/tts.py
@@ -18,7 +18,9 @@ from dataclasses import dataclass

  from livekit import rtc
  from livekit.agents import (
+     DEFAULT_API_CONNECT_OPTIONS,
      APIConnectionError,
+     APIConnectOptions,
      APIStatusError,
      APITimeoutError,
      tts,
@@ -134,7 +136,7 @@ class TTS(tts.TTS):
          self._opts.audio_config.speaking_rate = speaking_rate

      def _ensure_client(self) -> texttospeech.TextToSpeechAsyncClient:
-         if not self._client:
+         if self._client is None:
              if self._credentials_info:
                  self._client = (
                      texttospeech.TextToSpeechAsyncClient.from_service_account_info(
@@ -154,22 +156,35 @@ class TTS(tts.TTS):
          assert self._client is not None
          return self._client

-     def synthesize(self, text: str) -> "ChunkedStream":
-         return ChunkedStream(self, text, self._opts, self._ensure_client())
+     def synthesize(
+         self,
+         text: str,
+         *,
+         conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
+     ) -> "ChunkedStream":
+         return ChunkedStream(
+             tts=self,
+             input_text=text,
+             conn_options=conn_options,
+             opts=self._opts,
+             client=self._ensure_client(),
+         )


  class ChunkedStream(tts.ChunkedStream):
      def __init__(
          self,
+         *,
          tts: TTS,
-         text: str,
+         input_text: str,
+         conn_options: APIConnectOptions,
          opts: _TTSOptions,
          client: texttospeech.TextToSpeechAsyncClient,
      ) -> None:
-         super().__init__(tts, text)
+         super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
          self._opts, self._client = opts, client

-     async def _main_task(self) -> None:
+     async def _run(self) -> None:
          request_id = utils.shortuuid()

          try:
@@ -177,16 +192,16 @@ class ChunkedStream(tts.ChunkedStream):
                input=texttospeech.SynthesisInput(text=self._input_text),
                voice=self._opts.voice,
                audio_config=self._opts.audio_config,
+               timeout=self._conn_options.timeout,
            )

-           data = response.audio_content
            if self._opts.audio_config.audio_encoding == "mp3":
                decoder = utils.codecs.Mp3StreamDecoder()
                bstream = utils.audio.AudioByteStream(
                    sample_rate=self._opts.audio_config.sample_rate_hertz,
                    num_channels=1,
                )
-               for frame in decoder.decode_chunk(data):
+               for frame in decoder.decode_chunk(response.audio_content):
                    for frame in bstream.write(frame.data.tobytes()):
                        self._event_ch.send_nowait(
                            tts.SynthesizedAudio(request_id=request_id, frame=frame)
@@ -197,7 +212,7 @@ class ChunkedStream(tts.ChunkedStream):
                        tts.SynthesizedAudio(request_id=request_id, frame=frame)
                    )
            else:
-               data = data[44:]  # skip WAV header
+               data = response.audio_content[44:]  # skip WAV header
                self._event_ch.send_nowait(
                    tts.SynthesizedAudio(
                        request_id=request_id,
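
Note (a minimal synthesis sketch, not part of the package): it assumes the TTS constructor keeps its default arguments from earlier releases and that credentials are available via Application Default Credentials:

    import asyncio

    from livekit.plugins.google import TTS


    async def main() -> None:
        tts = TTS()  # assumed: default voice/audio settings, credentials via ADC
        stream = tts.synthesize("Hello from LiveKit")  # conn_options defaults to DEFAULT_API_CONNECT_OPTIONS
        async for audio in stream:  # assumed: ChunkedStream yields SynthesizedAudio items
            frame = audio.frame  # mono PCM frame ready to publish to a room


    asyncio.run(main())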

livekit/plugins/google/version.py
@@ -12,4 +12,4 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- __version__ = "0.7.3"
+ __version__ = "0.9.0"

METADATA (dist-info)
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: livekit-plugins-google
- Version: 0.7.3
+ Version: 0.9.0
  Summary: Agent Framework plugin for services from Google Cloud
  Home-page: https://github.com/livekit/agents
  License: Apache-2.0
@@ -19,10 +19,11 @@ Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3 :: Only
  Requires-Python: >=3.9.0
  Description-Content-Type: text/markdown
- Requires-Dist: google-auth <3,>=2
- Requires-Dist: google-cloud-speech <3,>=2
- Requires-Dist: google-cloud-texttospeech <3,>=2
- Requires-Dist: livekit-agents >=0.11
+ Requires-Dist: google-auth<3,>=2
+ Requires-Dist: google-cloud-speech<3,>=2
+ Requires-Dist: google-cloud-texttospeech<3,>=2
+ Requires-Dist: google-genai>=0.3.0
+ Requires-Dist: livekit-agents>=0.12.3

  # LiveKit Plugins Google

@@ -37,3 +38,8 @@ pip install livekit-plugins-google
  ## Pre-requisites

  For credentials, you'll need a Google Cloud account and obtain the correct credentials. Credentials can be passed directly or via Application Default Credentials as specified in [How Application Default Credentials works](https://cloud.google.com/docs/authentication/application-default-credentials).
+
+ To use the STT and TTS APIs, you'll need to enable the respective services for your Google Cloud project.
+
+ - Cloud Speech-to-Text API
+ - Cloud Text-to-Speech API

RECORD (0.9.0 dist-info, new file)
@@ -0,0 +1,15 @@
+ livekit/plugins/google/__init__.py,sha256=TY-5FwEX4Vs7GLO1wSegIxC5W4UPkHBthlr-__yuE4w,1143
+ livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
+ livekit/plugins/google/models.py,sha256=cBXhZGY9bFaSCyL9VeSng9wsxhf3peJi3AUYBKV-8GQ,1343
+ livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ livekit/plugins/google/stt.py,sha256=SfmKgQotIVzk9-Hipo1X5cnLQG4uXLniTUoyM3IynwA,18712
+ livekit/plugins/google/tts.py,sha256=95qXCigVQYWNbcN3pIKBpIah4b31U_MWtXv5Ji0AMc4,9229
+ livekit/plugins/google/version.py,sha256=onRKrcQ35NZG4oEg_95WGeTytHh_6VVAlQKAZhwiEe4,600
+ livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
+ livekit/plugins/google/beta/realtime/__init__.py,sha256=XnJpNIN6NRm7Y4hH2RNA8Xt-tTmkZEKCs_zzU3_koBI,251
+ livekit/plugins/google/beta/realtime/api_proto.py,sha256=IHYBryuzpfGQD86Twlfq6qxrBhFHptf_IvOk36Wxo1M,2156
+ livekit/plugins/google/beta/realtime/realtime_api.py,sha256=OxrbWnUOT_oFdrMruvLPHgEoXlOr6M5oGym9b2Iqz48,15958
+ livekit_plugins_google-0.9.0.dist-info/METADATA,sha256=tB70OQMa7JtWLqRi1TMDUpv4y0TZEk0L609BN6y0x48,1841
+ livekit_plugins_google-0.9.0.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ livekit_plugins_google-0.9.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
+ livekit_plugins_google-0.9.0.dist-info/RECORD,,

WHEEL (dist-info)
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (75.5.0)
+ Generator: setuptools (75.6.0)
  Root-Is-Purelib: true
  Tag: py3-none-any


RECORD (0.7.3 dist-info, removed)
@@ -1,11 +0,0 @@
- livekit/plugins/google/__init__.py,sha256=rqV6C5mFNDFlrA2IcGJrsebr2VxQwMzoDUjY1JhMBZM,1117
- livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
- livekit/plugins/google/models.py,sha256=n8pgTJ7xyJpPCZJ_y0GzaQq6LqYknL6K6trpi07-AxM,1307
- livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- livekit/plugins/google/stt.py,sha256=WjeqYsunW8jY-WHlnNeks7gR-TiojMRR7LYdAVdCxqY,15268
- livekit/plugins/google/tts.py,sha256=hRN8ul1lDXU8LPVEfbTszgBiRYsifZXCPMwk-Pv2KeA,8793
- livekit/plugins/google/version.py,sha256=yJeG0VwiekDJAk7GHcIAe43ebagJgloe-ZsqEGZnqzE,600
- livekit_plugins_google-0.7.3.dist-info/METADATA,sha256=8UvORpoVunOTq0xKxHEk8M3sexKFnBnu66DkEJCnrRY,1647
- livekit_plugins_google-0.7.3.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
- livekit_plugins_google-0.7.3.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
- livekit_plugins_google-0.7.3.dist-info/RECORD,,