PyPI - livekit-plugins-google - Versions diffs - 0.11.3__py3-none-any.whl → 1.0.0__py3-none-any.whl - Mend

livekit-plugins-google 0.11.3__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

livekit/plugins/google/beta/realtime/__init__.py +1 -5
livekit/plugins/google/beta/realtime/api_proto.py +2 -4
livekit/plugins/google/beta/realtime/realtime_api.py +407 -449
livekit/plugins/google/llm.py +158 -220
livekit/plugins/google/stt.py +80 -115
livekit/plugins/google/tts.py +40 -56
livekit/plugins/google/utils.py +251 -0
livekit/plugins/google/version.py +1 -1
{livekit_plugins_google-0.11.3.dist-info → livekit_plugins_google-1.0.0.dist-info}/METADATA +11 -21
livekit_plugins_google-1.0.0.dist-info/RECORD +16 -0
{livekit_plugins_google-0.11.3.dist-info → livekit_plugins_google-1.0.0.dist-info}/WHEEL +1 -2
livekit/plugins/google/_utils.py +0 -199
livekit/plugins/google/beta/realtime/transcriber.py +0 -270
livekit_plugins_google-0.11.3.dist-info/RECORD +0 -18
livekit_plugins_google-0.11.3.dist-info/top_level.txt +0 -1

livekit/plugins/google/stt.py CHANGED Viewed

@@ -19,8 +19,14 @@ import dataclasses
 import time
 import weakref
 from dataclasses import dataclass
-from typing import Callable, List, Union
+from typing import Callable, Union
+from google.api_core.client_options import ClientOptions
+from google.api_core.exceptions import DeadlineExceeded, GoogleAPICallError
+from google.auth import default as gauth_default
+from google.auth.exceptions import DefaultCredentialsError
+from google.cloud.speech_v2 import SpeechAsyncClient
+from google.cloud.speech_v2.types import cloud_speech
 from livekit import rtc
 from livekit.agents import (
     DEFAULT_API_CONNECT_OPTIONS,
@@ -31,19 +37,17 @@ from livekit.agents import (
     stt,
     utils,
 )
-from google.api_core.client_options import ClientOptions
-from google.api_core.exceptions import DeadlineExceeded, GoogleAPICallError
-from google.auth import default as gauth_default
-from google.auth.exceptions import DefaultCredentialsError
-from google.cloud.speech_v2 import SpeechAsyncClient
-from google.cloud.speech_v2.types import cloud_speech
+from livekit.agents.types import (
+    NOT_GIVEN,
+    NotGivenOr,
+)
+from livekit.agents.utils import is_given
 from .log import logger
 from .models import SpeechLanguages, SpeechModels
 LgType = Union[SpeechLanguages, str]
-LanguageCode = Union[LgType, List[LgType]]
+LanguageCode = Union[LgType, list[LgType]]
 # Google STT has a timeout of 5 mins, we'll attempt to restart the session
 # before that timeout is reached
@@ -56,25 +60,23 @@ _min_confidence = 0.65
 # This class is only be used internally to encapsulate the options
 @dataclass
 class STTOptions:
-    languages: List[LgType]
+    languages: list[LgType]
     detect_language: bool
     interim_results: bool
     punctuate: bool
     spoken_punctuation: bool
     model: SpeechModels | str
     sample_rate: int
-    keywords: List[tuple[str, float]] | None
+    keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN
     def build_adaptation(self) -> cloud_speech.SpeechAdaptation | None:
-        if self.keywords:
+        if is_given(self.keywords):
             return cloud_speech.SpeechAdaptation(
                 phrase_sets=[
                     cloud_speech.SpeechAdaptation.AdaptationPhraseSet(
                         inline_phrase_set=cloud_speech.PhraseSet(
                             phrases=[
-                                cloud_speech.PhraseSet.Phrase(
-                                    value=keyword, boost=boost
-                                )
+                                cloud_speech.PhraseSet.Phrase(value=keyword, boost=boost)
                                 for keyword, boost in self.keywords
                             ]
                         )
@@ -96,9 +98,9 @@ class STT(stt.STT):
         model: SpeechModels | str = "latest_long",
         location: str = "global",
         sample_rate: int = 16000,
-        credentials_info: dict | None = None,
-        credentials_file: str | None = None,
-        keywords: List[tuple[str, float]] | None = None,
+        credentials_info: NotGivenOr[dict] = NOT_GIVEN,
+        credentials_file: NotGivenOr[str] = NOT_GIVEN,
+        keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN,
     ):
         """
         Create a new instance of Google STT.
@@ -120,15 +122,13 @@ class STT(stt.STT):
             credentials_file(str): the credentials file to use for recognition (default: None)
             keywords(List[tuple[str, float]]): list of keywords to recognize (default: None)
         """
-        super().__init__(
-            capabilities=stt.STTCapabilities(streaming=True, interim_results=True)
-        )
+        super().__init__(capabilities=stt.STTCapabilities(streaming=True, interim_results=True))
         self._location = location
         self._credentials_info = credentials_info
         self._credentials_file = credentials_file
-        if credentials_file is None and credentials_info is None:
+        if not is_given(credentials_file) and not is_given(credentials_info):
             try:
                 gauth_default()
             except DefaultCredentialsError:
@@ -136,7 +136,7 @@ class STT(stt.STT):
                     "Application default credentials must be available "
                     "when using Google STT without explicitly passing "
                     "credentials through credentials_info or credentials_file."
-                )
+                ) from None
         if isinstance(languages, str):
             languages = [languages]
@@ -163,23 +163,17 @@ class STT(stt.STT):
         client_options = None
         client: SpeechAsyncClient | None = None
         if self._location != "global":
-            client_options = ClientOptions(
-                api_endpoint=f"{self._location}-speech.googleapis.com"
-            )
-        if self._credentials_info:
+            client_options = ClientOptions(api_endpoint=f"{self._location}-speech.googleapis.com")
+        if is_given(self._credentials_info):
             client = SpeechAsyncClient.from_service_account_info(
-                self._credentials_info,
-                client_options=client_options,
+                self._credentials_info, client_options=client_options
             )
-        elif self._credentials_file:
+        elif is_given(self._credentials_file):
             client = SpeechAsyncClient.from_service_account_file(
-                self._credentials_file,
-                client_options=client_options,
+                self._credentials_file, client_options=client_options
             )
         else:
-            client = SpeechAsyncClient(
-                client_options=client_options,
-            )
+            client = SpeechAsyncClient(client_options=client_options)
         assert client is not None
         return client
@@ -196,19 +190,17 @@ class STT(stt.STT):
             _, project_id = ga_default()
         return f"projects/{project_id}/locations/{self._location}/recognizers/_"
-    def _sanitize_options(self, *, language: str | None = None) -> STTOptions:
+    def _sanitize_options(self, *, language: NotGivenOr[str] = NOT_GIVEN) -> STTOptions:
         config = dataclasses.replace(self._config)
-        if language:
+        if is_given(language):
             config.languages = [language]
         if not isinstance(config.languages, list):
             config.languages = [config.languages]
         elif not config.detect_language:
             if len(config.languages) > 1:
-                logger.warning(
-                    "multiple languages provided, but language detection is disabled"
-                )
+                logger.warning("multiple languages provided, but language detection is disabled")
             config.languages = [config.languages[0]]
         return config
@@ -217,7 +209,7 @@ class STT(stt.STT):
         self,
         buffer: utils.AudioBuffer,
         *,
-        language: SpeechLanguages | str | None,
+        language: NotGivenOr[SpeechLanguages | str] = NOT_GIVEN,
         conn_options: APIConnectOptions,
     ) -> stt.SpeechEvent:
         config = self._sanitize_options(language=language)
@@ -252,21 +244,18 @@ class STT(stt.STT):
                 return _recognize_response_to_speech_event(raw)
         except DeadlineExceeded:
-            raise APITimeoutError()
+            raise APITimeoutError() from None
         except GoogleAPICallError as e:
-            raise APIStatusError(
-                e.message,
-                status_code=e.code or -1,
-            )
+            raise APIStatusError(e.message, status_code=e.code or -1) from None
         except Exception as e:
             raise APIConnectionError() from e
     def stream(
         self,
         *,
-        language: SpeechLanguages | str | None = None,
+        language: NotGivenOr[SpeechLanguages | str] = NOT_GIVEN,
         conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
-    ) -> "SpeechStream":
+    ) -> SpeechStream:
         config = self._sanitize_options(language=language)
         stream = SpeechStream(
             stt=self,
@@ -281,34 +270,34 @@ class STT(stt.STT):
     def update_options(
         self,
         *,
-        languages: LanguageCode | None = None,
-        detect_language: bool | None = None,
-        interim_results: bool | None = None,
-        punctuate: bool | None = None,
-        spoken_punctuation: bool | None = None,
-        model: SpeechModels | None = None,
-        location: str | None = None,
-        keywords: List[tuple[str, float]] | None = None,
+        languages: NotGivenOr[LanguageCode] = NOT_GIVEN,
+        detect_language: NotGivenOr[bool] = NOT_GIVEN,
+        interim_results: NotGivenOr[bool] = NOT_GIVEN,
+        punctuate: NotGivenOr[bool] = NOT_GIVEN,
+        spoken_punctuation: NotGivenOr[bool] = NOT_GIVEN,
+        model: NotGivenOr[SpeechModels] = NOT_GIVEN,
+        location: NotGivenOr[str] = NOT_GIVEN,
+        keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN,
     ):
-        if languages is not None:
+        if is_given(languages):
             if isinstance(languages, str):
                 languages = [languages]
             self._config.languages = languages
-        if detect_language is not None:
+        if is_given(detect_language):
             self._config.detect_language = detect_language
-        if interim_results is not None:
+        if is_given(interim_results):
             self._config.interim_results = interim_results
-        if punctuate is not None:
+        if is_given(punctuate):
             self._config.punctuate = punctuate
-        if spoken_punctuation is not None:
+        if is_given(spoken_punctuation):
             self._config.spoken_punctuation = spoken_punctuation
-        if model is not None:
+        if is_given(model):
             self._config.model = model
-        if location is not None:
+        if is_given(location):
             self._location = location
             # if location is changed, fetch a new client and recognizer as per the new location
             self._pool.invalidate()
-        if keywords is not None:
+        if is_given(keywords):
             self._config.keywords = keywords
         for stream in self._streams:
@@ -337,9 +326,7 @@ class SpeechStream(stt.SpeechStream):
         recognizer_cb: Callable[[SpeechAsyncClient], str],
         config: STTOptions,
     ) -> None:
-        super().__init__(
-            stt=stt, conn_options=conn_options, sample_rate=config.sample_rate
-        )
+        super().__init__(stt=stt, conn_options=conn_options, sample_rate=config.sample_rate)
         self._pool = pool
         self._recognizer_cb = recognizer_cb
@@ -350,29 +337,29 @@ class SpeechStream(stt.SpeechStream):
     def update_options(
         self,
         *,
-        languages: LanguageCode | None = None,
-        detect_language: bool | None = None,
-        interim_results: bool | None = None,
-        punctuate: bool | None = None,
-        spoken_punctuation: bool | None = None,
-        model: SpeechModels | None = None,
-        keywords: List[tuple[str, float]] | None = None,
+        languages: NotGivenOr[LanguageCode] = NOT_GIVEN,
+        detect_language: NotGivenOr[bool] = NOT_GIVEN,
+        interim_results: NotGivenOr[bool] = NOT_GIVEN,
+        punctuate: NotGivenOr[bool] = NOT_GIVEN,
+        spoken_punctuation: NotGivenOr[bool] = NOT_GIVEN,
+        model: NotGivenOr[SpeechModels] = NOT_GIVEN,
+        keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN,
     ):
-        if languages is not None:
+        if is_given(languages):
             if isinstance(languages, str):
                 languages = [languages]
             self._config.languages = languages
-        if detect_language is not None:
+        if is_given(detect_language):
             self._config.detect_language = detect_language
-        if interim_results is not None:
+        if is_given(interim_results):
             self._config.interim_results = interim_results
-        if punctuate is not None:
+        if is_given(punctuate):
             self._config.punctuate = punctuate
-        if spoken_punctuation is not None:
+        if is_given(spoken_punctuation):
             self._config.spoken_punctuation = spoken_punctuation
-        if model is not None:
+        if is_given(model):
             self._config.model = model
-        if keywords is not None:
+        if is_given(keywords):
             self._config.keywords = keywords
         self._reconnect_event.set()
@@ -380,9 +367,7 @@ class SpeechStream(stt.SpeechStream):
     async def _run(self) -> None:
         # google requires a async generator when calling streaming_recognize
         # this function basically convert the queue into a async generator
-        async def input_generator(
-            client: SpeechAsyncClient, should_stop: asyncio.Event
-        ):
+        async def input_generator(client: SpeechAsyncClient, should_stop: asyncio.Event):
             try:
                 # first request should contain the config
                 yield cloud_speech.StreamingRecognizeRequest(
@@ -398,14 +383,10 @@ class SpeechStream(stt.SpeechStream):
                         return
                     if isinstance(frame, rtc.AudioFrame):
-                        yield cloud_speech.StreamingRecognizeRequest(
-                            audio=frame.data.tobytes()
-                        )
+                        yield cloud_speech.StreamingRecognizeRequest(audio=frame.data.tobytes())
             except Exception:
-                logger.exception(
-                    "an error occurred while streaming input to google STT"
-                )
+                logger.exception("an error occurred while streaming input to google STT")
         async def process_stream(client: SpeechAsyncClient, stream):
             has_started = False
@@ -421,7 +402,7 @@ class SpeechStream(stt.SpeechStream):
                 if (
                     resp.speech_event_type
-                    == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_EVENT_TYPE_UNSPECIFIED
+                    == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_EVENT_TYPE_UNSPECIFIED  # noqa: E501
                 ):
                     result = resp.results[0]
                     speech_data = _streaming_recognize_response_to_speech_data(resp)
@@ -442,19 +423,14 @@ class SpeechStream(stt.SpeechStream):
                                 alternatives=[speech_data],
                             )
                         )
-                        if (
-                            time.time() - self._session_connected_at
-                            > _max_session_duration
-                        ):
+                        if time.time() - self._session_connected_at > _max_session_duration:
                             logger.debug(
                                 "Google STT maximum connection time reached. Reconnecting..."
                             )
                             self._pool.remove(client)
                             if has_started:
                                 self._event_ch.send_nowait(
-                                    stt.SpeechEvent(
-                                        type=stt.SpeechEventType.END_OF_SPEECH
-                                    )
+                                    stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH)
                                 )
                                 has_started = False
                             self._reconnect_event.set()
@@ -498,12 +474,8 @@ class SpeechStream(stt.SpeechStream):
                     )
                     self._session_connected_at = time.time()
-                    process_stream_task = asyncio.create_task(
-                        process_stream(client, stream)
-                    )
-                    wait_reconnect_task = asyncio.create_task(
-                        self._reconnect_event.wait()
-                    )
+                    process_stream_task = asyncio.create_task(process_stream(client, stream))
+                    wait_reconnect_task = asyncio.create_task(self._reconnect_event.wait())
                     try:
                         done, _ = await asyncio.wait(
@@ -517,17 +489,12 @@ class SpeechStream(stt.SpeechStream):
                             break
                         self._reconnect_event.clear()
                     finally:
-                        await utils.aio.gracefully_cancel(
-                            process_stream_task, wait_reconnect_task
-                        )
+                        await utils.aio.gracefully_cancel(process_stream_task, wait_reconnect_task)
                         should_stop.set()
             except DeadlineExceeded:
-                raise APITimeoutError()
+                raise APITimeoutError() from None
             except GoogleAPICallError as e:
-                raise APIStatusError(
-                    e.message,
-                    status_code=e.code or -1,
-                )
+                raise APIStatusError(e.message, status_code=e.code or -1) from None
             except Exception as e:
                 raise APIConnectionError() from e
@@ -580,8 +547,6 @@ def _streaming_recognize_response_to_speech_data(
     if text == "":
         return None
-    data = stt.SpeechData(
-        language=lg, start_time=0, end_time=0, confidence=confidence, text=text
-    )
+    data = stt.SpeechData(language=lg, start_time=0, end_time=0, confidence=confidence, text=text)
     return data

livekit/plugins/google/tts.py CHANGED Viewed

@@ -15,8 +15,11 @@
 from __future__ import annotations
 from dataclasses import dataclass
-from typing import Optional
+from google.api_core.client_options import ClientOptions
+from google.api_core.exceptions import DeadlineExceeded, GoogleAPICallError
+from google.cloud import texttospeech
+from google.cloud.texttospeech_v1.types import SsmlVoiceGender, SynthesizeSpeechResponse
 from livekit.agents import (
     APIConnectionError,
     APIConnectOptions,
@@ -25,13 +28,12 @@ from livekit.agents import (
     tts,
     utils,
 )
-from google.api_core.client_options import ClientOptions
-from google.api_core.exceptions import DeadlineExceeded, GoogleAPICallError
-from google.cloud import texttospeech
-from google.cloud.texttospeech_v1.types import SsmlVoiceGender, SynthesizeSpeechResponse
-from .models import Gender, SpeechLanguages
+from livekit.agents.types import (
+    DEFAULT_API_CONNECT_OPTIONS,
+    NOT_GIVEN,
+    NotGivenOr,
+)
+from livekit.agents.utils import is_given
 @dataclass
@@ -44,16 +46,14 @@ class TTS(tts.TTS):
     def __init__(
         self,
         *,
-        language: SpeechLanguages | str = "en-US",
-        gender: Gender | str = "neutral",
-        voice_name: str = "",  # Not required
+        voice: NotGivenOr[texttospeech.VoiceSelectionParams] = NOT_GIVEN,
         sample_rate: int = 24000,
         pitch: int = 0,
         effects_profile_id: str = "",
         speaking_rate: float = 1.0,
         location: str = "global",
-        credentials_info: dict | None = None,
-        credentials_file: str | None = None,
+        credentials_info: NotGivenOr[dict] = NOT_GIVEN,
+        credentials_file: NotGivenOr[str] = NOT_GIVEN,
     ) -> None:
         """
         Create a new instance of Google TTS.
@@ -63,9 +63,7 @@ class TTS(tts.TTS):
         environmental variable.
         Args:
-            language (SpeechLanguages | str, optional): Language code (e.g., "en-US"). Default is "en-US".
-            gender (Gender | str, optional): Voice gender ("male", "female", "neutral"). Default is "neutral".
-            voice_name (str, optional): Specific voice name. Default is an empty string.
+            voice (texttospeech.VoiceSelectionParams, optional): Voice selection parameters.
             sample_rate (int, optional): Audio sample rate in Hz. Default is 24000.
             location (str, optional): Location for the TTS client. Default is "global".
             pitch (float, optional): Speaking pitch, ranging from -20.0 to 20.0 semitones relative to the original pitch. Default is 0.
@@ -73,7 +71,7 @@ class TTS(tts.TTS):
             speaking_rate (float, optional): Speed of speech. Default is 1.0.
             credentials_info (dict, optional): Dictionary containing Google Cloud credentials. Default is None.
             credentials_file (str, optional): Path to the Google Cloud credentials JSON file. Default is None.
-        """
+        """  # noqa: E501
         super().__init__(
             capabilities=tts.TTSCapabilities(
@@ -87,11 +85,12 @@ class TTS(tts.TTS):
         self._credentials_info = credentials_info
         self._credentials_file = credentials_file
         self._location = location
-        voice = texttospeech.VoiceSelectionParams(
-            name=voice_name,
-            language_code=language,
-            ssml_gender=_gender_from_str(gender),
-        )
+        if not is_given(voice):
+            voice = texttospeech.VoiceSelectionParams(
+                name="",
+                language_code="en-US",
+                ssml_gender=SsmlVoiceGender.NEUTRAL,
+            )
         self._opts = _TTSOptions(
             voice=voice,
@@ -107,26 +106,20 @@ class TTS(tts.TTS):
     def update_options(
         self,
         *,
-        language: SpeechLanguages | str = "en-US",
-        gender: Gender | str = "neutral",
-        voice_name: str = "",  # Not required
-        speaking_rate: float = 1.0,
+        voice: NotGivenOr[texttospeech.VoiceSelectionParams] = NOT_GIVEN,
+        speaking_rate: NotGivenOr[float] = NOT_GIVEN,
     ) -> None:
         """
         Update the TTS options.
         Args:
-            language (SpeechLanguages | str, optional): Language code (e.g., "en-US"). Default is "en-US".
-            gender (Gender | str, optional): Voice gender ("male", "female", "neutral"). Default is "neutral".
-            voice_name (str, optional): Specific voice name. Default is an empty string.
-            speaking_rate (float, optional): Speed of speech. Default is 1.0.
-        """
-        self._opts.voice = texttospeech.VoiceSelectionParams(
-            name=voice_name,
-            language_code=language,
-            ssml_gender=_gender_from_str(gender),
-        )
-        self._opts.audio_config.speaking_rate = speaking_rate
+            voice (texttospeech.VoiceSelectionParams, optional): Voice selection parameters.
+            speaking_rate (float, optional): Speed of speech.
+        """  # noqa: E501
+        if is_given(voice):
+            self._opts.voice = voice
+        if is_given(speaking_rate):
+            self._opts.audio_config.speaking_rate = speaking_rate
     def _ensure_client(self) -> texttospeech.TextToSpeechAsyncClient:
         api_endpoint = "texttospeech.googleapis.com"
@@ -135,19 +128,13 @@ class TTS(tts.TTS):
         if self._client is None:
             if self._credentials_info:
-                self._client = (
-                    texttospeech.TextToSpeechAsyncClient.from_service_account_info(
-                        self._credentials_info,
-                        client_options=ClientOptions(api_endpoint=api_endpoint),
-                    )
+                self._client = texttospeech.TextToSpeechAsyncClient.from_service_account_info(
+                    self._credentials_info, client_options=ClientOptions(api_endpoint=api_endpoint)
                 )
             elif self._credentials_file:
-                self._client = (
-                    texttospeech.TextToSpeechAsyncClient.from_service_account_file(
-                        self._credentials_file,
-                        client_options=ClientOptions(api_endpoint=api_endpoint),
-                    )
+                self._client = texttospeech.TextToSpeechAsyncClient.from_service_account_file(
+                    self._credentials_file, client_options=ClientOptions(api_endpoint=api_endpoint)
                 )
             else:
                 self._client = texttospeech.TextToSpeechAsyncClient(
@@ -161,8 +148,8 @@ class TTS(tts.TTS):
         self,
         text: str,
         *,
-        conn_options: Optional[APIConnectOptions] = None,
-    ) -> "ChunkedStream":
+        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
+    ) -> ChunkedStream:
         return ChunkedStream(
             tts=self,
             input_text=text,
@@ -180,7 +167,7 @@ class ChunkedStream(tts.ChunkedStream):
         input_text: str,
         opts: _TTSOptions,
         client: texttospeech.TextToSpeechAsyncClient,
-        conn_options: Optional[APIConnectOptions] = None,
+        conn_options: APIConnectOptions,
     ) -> None:
         super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
         self._opts, self._client = opts, client
@@ -216,14 +203,11 @@ class ChunkedStream(tts.ChunkedStream):
                 await decoder.aclose()
         except DeadlineExceeded:
-            raise APITimeoutError()
+            raise APITimeoutError() from None
         except GoogleAPICallError as e:
             raise APIStatusError(
-                e.message,
-                status_code=e.code or -1,
-                request_id=None,
-                body=None,
-            )
+                e.message, status_code=e.code or -1, request_id=None, body=None
+            ) from None
         except Exception as e:
             raise APIConnectionError() from e

livekit-plugins-google 0.11.3__py3-none-any.whl → 1.0.0__py3-none-any.whl

livekit-plugins-google 0.11.3__py3-none-any.whl → 1.0.0__py3-none-any.whl