livekit-plugins-google 0.3.0__py3-none-any.whl → 1.3.8__py3-none-any.whl
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- livekit/plugins/google/__init__.py +25 -7
- livekit/plugins/google/beta/__init__.py +13 -0
- livekit/plugins/google/beta/gemini_tts.py +258 -0
- livekit/plugins/google/llm.py +501 -0
- livekit/plugins/google/log.py +3 -0
- livekit/plugins/google/models.py +145 -31
- livekit/plugins/google/realtime/__init__.py +9 -0
- livekit/plugins/google/realtime/api_proto.py +66 -0
- livekit/plugins/google/realtime/realtime_api.py +1252 -0
- livekit/plugins/google/stt.py +518 -272
- livekit/plugins/google/tools.py +11 -0
- livekit/plugins/google/tts.py +447 -0
- livekit/plugins/google/utils.py +286 -0
- livekit/plugins/google/version.py +1 -1
- livekit_plugins_google-1.3.8.dist-info/METADATA +63 -0
- livekit_plugins_google-1.3.8.dist-info/RECORD +18 -0
- {livekit_plugins_google-0.3.0.dist-info → livekit_plugins_google-1.3.8.dist-info}/WHEEL +1 -2
- livekit_plugins_google-0.3.0.dist-info/METADATA +0 -47
- livekit_plugins_google-0.3.0.dist-info/RECORD +0 -9
- livekit_plugins_google-0.3.0.dist-info/top_level.txt +0 -1
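
Before the per-file diff: the headline change is a full rewrite of stt.py, which grows from a single-shot recognizer into a pooled, self-reconnecting client with keyword boosting and runtime option updates. A minimal sketch of constructing the 1.3.8 STT, using only argument names and defaults visible in the diff below (the surrounding agent wiring is omitted, and the import path is assumed from the package name):

    from livekit.plugins import google

    # Sketch only - names and defaults are taken from the diffed __init__ below.
    stt = google.STT(
        languages=["en-US"],
        model="latest_long",            # default per the diff
        sample_rate=16000,              # default per the diff
        min_confidence_threshold=0.65,  # _default_min_confidence in the diff
        keywords=[("LiveKit", 15.0)],   # (phrase, boost) pairs, see build_adaptation()
    )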
livekit/plugins/google/stt.py
CHANGED
@@ -15,35 +15,82 @@
 from __future__ import annotations
 
 import asyncio
-import contextlib
 import dataclasses
-import logging
+import time
+import weakref
+from collections.abc import AsyncGenerator, AsyncIterable
 from dataclasses import dataclass
-from
+from datetime import timedelta
+from typing import Callable, Union, cast
 
-from
-from
-from
-
-from google.auth import credentials  # type: ignore
+from google.api_core.client_options import ClientOptions
+from google.api_core.exceptions import DeadlineExceeded, GoogleAPICallError
+from google.auth import default as gauth_default
+from google.auth.exceptions import DefaultCredentialsError
 from google.cloud.speech_v2 import SpeechAsyncClient
 from google.cloud.speech_v2.types import cloud_speech
-
+from google.protobuf.duration_pb2 import Duration
+from livekit import rtc
+from livekit.agents import (
+    DEFAULT_API_CONNECT_OPTIONS,
+    APIConnectionError,
+    APIConnectOptions,
+    APIStatusError,
+    APITimeoutError,
+    stt,
+    utils,
+)
+from livekit.agents.types import (
+    NOT_GIVEN,
+    NotGivenOr,
+)
+from livekit.agents.utils import is_given
+
+from .log import logger
 from .models import SpeechLanguages, SpeechModels
 
-LgType = SpeechLanguages
-LanguageCode = LgType
+LgType = Union[SpeechLanguages, str]
+LanguageCode = Union[LgType, list[LgType]]
+
+# Google STT has a timeout of 5 mins, we'll attempt to restart the session
+# before that timeout is reached
+_max_session_duration = 240
+
+# Google is very sensitive to background noise, so we'll ignore results with low confidence
+_default_min_confidence = 0.65
 
 
 # This class is only be used internally to encapsulate the options
 @dataclass
 class STTOptions:
-    languages:
+    languages: list[LgType]
     detect_language: bool
     interim_results: bool
     punctuate: bool
     spoken_punctuation: bool
-
+    enable_word_time_offsets: bool
+    enable_word_confidence: bool
+    enable_voice_activity_events: bool
+    model: SpeechModels | str
+    sample_rate: int
+    min_confidence_threshold: float
+    keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN
+
+    def build_adaptation(self) -> cloud_speech.SpeechAdaptation | None:
+        if is_given(self.keywords):
+            return cloud_speech.SpeechAdaptation(
+                phrase_sets=[
+                    cloud_speech.SpeechAdaptation.AdaptationPhraseSet(
+                        inline_phrase_set=cloud_speech.PhraseSet(
+                            phrases=[
+                                cloud_speech.PhraseSet.Phrase(value=keyword, boost=boost)
+                                for keyword, boost in self.keywords
+                            ]
+                        )
+                    )
+                ]
+            )
+        return None
 
 
 class STT(stt.STT):
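
Note on the STTOptions.build_adaptation() helper added above: each (keyword, boost) pair becomes one phrase in a single inline PhraseSet. For keywords=[("LiveKit", 15.0)], the helper is equivalent to building the adaptation by hand with the same cloud_speech types (a sketch mirroring the diffed code, not a separate API):

    from google.cloud.speech_v2.types import cloud_speech

    # What build_adaptation() returns for keywords=[("LiveKit", 15.0)].
    adaptation = cloud_speech.SpeechAdaptation(
        phrase_sets=[
            cloud_speech.SpeechAdaptation.AdaptationPhraseSet(
                inline_phrase_set=cloud_speech.PhraseSet(
                    phrases=[cloud_speech.PhraseSet.Phrase(value="LiveKit", boost=15.0)]
                )
            )
        ]
    )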
@@ -54,23 +101,64 @@ class STT(stt.STT):
         detect_language: bool = True,
         interim_results: bool = True,
         punctuate: bool = True,
-        spoken_punctuation: bool =
-
-
-
+        spoken_punctuation: bool = False,
+        enable_word_time_offsets: bool = True,
+        enable_word_confidence: bool = False,
+        enable_voice_activity_events: bool = False,
+        model: SpeechModels | str = "latest_long",
+        location: str = "global",
+        sample_rate: int = 16000,
+        min_confidence_threshold: float = _default_min_confidence,
+        credentials_info: NotGivenOr[dict] = NOT_GIVEN,
+        credentials_file: NotGivenOr[str] = NOT_GIVEN,
+        keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN,
+        use_streaming: NotGivenOr[bool] = NOT_GIVEN,
     ):
         """
-
-
+        Create a new instance of Google STT.
+
+        Credentials must be provided, either by using the ``credentials_info`` dict, or reading
+        from the file specified in ``credentials_file`` or via Application Default Credentials as
+        described in https://cloud.google.com/docs/authentication/application-default-credentials
+
+        args:
+            languages(LanguageCode): list of language codes to recognize (default: "en-US")
+            detect_language(bool): whether to detect the language of the audio (default: True)
+            interim_results(bool): whether to return interim results (default: True)
+            punctuate(bool): whether to punctuate the audio (default: True)
+            spoken_punctuation(bool): whether to use spoken punctuation (default: False)
+            enable_word_time_offsets(bool): whether to enable word time offsets (default: True)
+            enable_word_confidence(bool): whether to enable word confidence (default: False)
+            enable_voice_activity_events(bool): whether to enable voice activity events (default: False)
+            model(SpeechModels): the model to use for recognition default: "latest_long"
+            location(str): the location to use for recognition default: "global"
+            sample_rate(int): the sample rate of the audio default: 16000
+            min_confidence_threshold(float): minimum confidence threshold for recognition
+                (default: 0.65)
+            credentials_info(dict): the credentials info to use for recognition (default: None)
+            credentials_file(str): the credentials file to use for recognition (default: None)
+            keywords(List[tuple[str, float]]): list of keywords to recognize (default: None)
+            use_streaming(bool): whether to use streaming for recognition (default: True)
         """
-
+        if not is_given(use_streaming):
+            use_streaming = True
+        super().__init__(
+            capabilities=stt.STTCapabilities(streaming=use_streaming, interim_results=True)
+        )
 
-
-
-
-
-
-
+        self._location = location
+        self._credentials_info = credentials_info
+        self._credentials_file = credentials_file
+
+        if not is_given(credentials_file) and not is_given(credentials_info):
+            try:
+                gauth_default()  # type: ignore
+            except DefaultCredentialsError:
+                raise ValueError(
+                    "Application default credentials must be available "
+                    "when using Google STT without explicitly passing "
+                    "credentials through credentials_info or credentials_file."
+                ) from None
 
         if isinstance(languages, str):
             languages = [languages]
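
Note on the credential handling added above: explicit credentials_info takes precedence, then credentials_file, then Application Default Credentials, and the constructor raises ValueError up front if none of the three is available. A hedged sketch of the three paths (the key-file path is illustrative):

    import json

    from livekit.plugins import google

    # 1. Inline service-account info (path is illustrative):
    with open("/path/to/key.json") as f:
        stt_a = google.STT(credentials_info=json.load(f))

    # 2. Service-account key file:
    stt_b = google.STT(credentials_file="/path/to/key.json")

    # 3. Application Default Credentials; raises ValueError if none are found:
    stt_c = google.STT()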
@@ -81,322 +169,480 @@ class STT(stt.STT):
             interim_results=interim_results,
             punctuate=punctuate,
             spoken_punctuation=spoken_punctuation,
+            enable_word_time_offsets=enable_word_time_offsets,
+            enable_word_confidence=enable_word_confidence,
+            enable_voice_activity_events=enable_voice_activity_events,
             model=model,
+            sample_rate=sample_rate,
+            min_confidence_threshold=min_confidence_threshold,
+            keywords=keywords,
         )
-        self.
+        self._streams = weakref.WeakSet[SpeechStream]()
+        self._pool = utils.ConnectionPool[SpeechAsyncClient](
+            max_session_duration=_max_session_duration,
+            connect_cb=self._create_client,
+        )
+
+    @property
+    def model(self) -> str:
+        return self._config.model
 
     @property
-    def
+    def provider(self) -> str:
+        return "Google Cloud Platform"
+
+    async def _create_client(self, timeout: float) -> SpeechAsyncClient:
+        # Add support for passing a specific location that matches recognizer
+        # see: https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages
+        # TODO(long): how to set timeout?
+        client_options = None
+        client: SpeechAsyncClient | None = None
+        if self._location != "global":
+            client_options = ClientOptions(api_endpoint=f"{self._location}-speech.googleapis.com")
+        if is_given(self._credentials_info):
+            client = SpeechAsyncClient.from_service_account_info(
+                self._credentials_info, client_options=client_options
+            )
+        elif is_given(self._credentials_file):
+            client = SpeechAsyncClient.from_service_account_file(
+                self._credentials_file, client_options=client_options
+            )
+        else:
+            client = SpeechAsyncClient(client_options=client_options)
+        assert client is not None
+        return client
+
+    def _get_recognizer(self, client: SpeechAsyncClient) -> str:
         # TODO(theomonnom): should we use recognizers?
-        #
-        return f"projects/{self._creds.project_id}/locations/global/recognizers/_"  # type: ignore
+        # recognizers may improve latency https://cloud.google.com/speech-to-text/v2/docs/recognizers#understand_recognizers
 
-
-
-
-
-
+        # TODO(theomonnom): find a better way to access the project_id
+        try:
+            project_id = client.transport._credentials.project_id  # type: ignore
+        except AttributeError:
+            from google.auth import default as ga_default
+
+            _, project_id = ga_default()  # type: ignore
+        return f"projects/{project_id}/locations/{self._location}/recognizers/_"
+
+    def _sanitize_options(self, *, language: NotGivenOr[str] = NOT_GIVEN) -> STTOptions:
         config = dataclasses.replace(self._config)
 
-        if language:
+        if is_given(language):
             config.languages = [language]
 
         if not isinstance(config.languages, list):
             config.languages = [config.languages]
         elif not config.detect_language:
             if len(config.languages) > 1:
-
-                    "multiple languages provided, but language detection is disabled"
-                )
+                logger.warning("multiple languages provided, but language detection is disabled")
                 config.languages = [config.languages[0]]
 
         return config
 
-    async def
+    async def _recognize_impl(
         self,
+        buffer: utils.AudioBuffer,
         *,
-
-
+        language: NotGivenOr[SpeechLanguages | str] = NOT_GIVEN,
+        conn_options: APIConnectOptions,
     ) -> stt.SpeechEvent:
         config = self._sanitize_options(language=language)
-
+        frame = rtc.combine_audio_frames(buffer)
 
         config = cloud_speech.RecognitionConfig(
             explicit_decoding_config=cloud_speech.ExplicitDecodingConfig(
                 encoding=cloud_speech.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
-                sample_rate_hertz=
-                audio_channel_count=
+                sample_rate_hertz=frame.sample_rate,
+                audio_channel_count=frame.num_channels,
             ),
+            adaptation=config.build_adaptation(),
             features=cloud_speech.RecognitionFeatures(
                 enable_automatic_punctuation=config.punctuate,
                 enable_spoken_punctuation=config.spoken_punctuation,
+                enable_word_time_offsets=config.enable_word_time_offsets,
+                enable_word_confidence=config.enable_word_confidence,
             ),
             model=config.model,
             language_codes=config.languages,
         )
 
-
-
-
-
-
-
+        try:
+            async with self._pool.connection(timeout=conn_options.timeout) as client:
+                raw = await client.recognize(
+                    cloud_speech.RecognizeRequest(
+                        recognizer=self._get_recognizer(client),
+                        config=config,
+                        content=frame.data.tobytes(),
+                    ),
+                    timeout=conn_options.timeout,
                 )
-
-
+
+            return _recognize_response_to_speech_event(raw)
+        except DeadlineExceeded:
+            raise APITimeoutError() from None
+        except GoogleAPICallError as e:
+            raise APIStatusError(f"{e.message} {e.details}", status_code=e.code or -1) from e
+        except Exception as e:
+            raise APIConnectionError() from e
 
     def stream(
         self,
         *,
-        language: SpeechLanguages | str
-
+        language: NotGivenOr[SpeechLanguages | str] = NOT_GIVEN,
+        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
+    ) -> SpeechStream:
         config = self._sanitize_options(language=language)
-
-        self
-        self.
-        self.
-            config,
+        stream = SpeechStream(
+            stt=self,
+            pool=self._pool,
+            recognizer_cb=self._get_recognizer,
+            config=config,
+            conn_options=conn_options,
         )
+        self._streams.add(stream)
+        return stream
+
+    def update_options(
+        self,
+        *,
+        languages: NotGivenOr[LanguageCode] = NOT_GIVEN,
+        detect_language: NotGivenOr[bool] = NOT_GIVEN,
+        interim_results: NotGivenOr[bool] = NOT_GIVEN,
+        punctuate: NotGivenOr[bool] = NOT_GIVEN,
+        spoken_punctuation: NotGivenOr[bool] = NOT_GIVEN,
+        model: NotGivenOr[SpeechModels] = NOT_GIVEN,
+        location: NotGivenOr[str] = NOT_GIVEN,
+        keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN,
+    ) -> None:
+        if is_given(languages):
+            if isinstance(languages, str):
+                languages = [languages]
+            self._config.languages = cast(list[LgType], languages)
+        if is_given(detect_language):
+            self._config.detect_language = detect_language
+        if is_given(interim_results):
+            self._config.interim_results = interim_results
+        if is_given(punctuate):
+            self._config.punctuate = punctuate
+        if is_given(spoken_punctuation):
+            self._config.spoken_punctuation = spoken_punctuation
+        if is_given(model):
+            self._config.model = model
+        if is_given(location):
+            self._location = location
+            # if location is changed, fetch a new client and recognizer as per the new location
+            self._pool.invalidate()
+        if is_given(keywords):
+            self._config.keywords = keywords
+
+        for stream in self._streams:
+            stream.update_options(
+                languages=languages,
+                detect_language=detect_language,
+                interim_results=interim_results,
+                punctuate=punctuate,
+                spoken_punctuation=spoken_punctuation,
+                model=model,
+                keywords=keywords,
+            )
+
+    async def aclose(self) -> None:
+        await self._pool.aclose()
+        await super().aclose()
 
 
 class SpeechStream(stt.SpeechStream):
     def __init__(
         self,
-
-
-
+        *,
+        stt: STT,
+        conn_options: APIConnectOptions,
+        pool: utils.ConnectionPool[SpeechAsyncClient],
+        recognizer_cb: Callable[[SpeechAsyncClient], str],
         config: STTOptions,
-        sample_rate: int = 24000,
-        num_channels: int = 1,
-        max_retry: int = 32,
     ) -> None:
-        super().__init__()
+        super().__init__(stt=stt, conn_options=conn_options, sample_rate=config.sample_rate)
 
-        self.
-        self.
-        self._recognizer = recognizer
+        self._pool = pool
+        self._recognizer_cb = recognizer_cb
         self._config = config
-        self.
-        self.
-
-        self._queue = asyncio.Queue[rtc.AudioFrame | None]()
-        self._event_queue = asyncio.Queue[stt.SpeechEvent | None]()
-        self._closed = False
-        self._main_task = asyncio.create_task(self._run(max_retry=max_retry))
-
-        self._final_events: List[stt.SpeechEvent] = []
-        self._speaking = False
-
-        self._streaming_config = cloud_speech.StreamingRecognitionConfig(
-            config=cloud_speech.RecognitionConfig(
-                explicit_decoding_config=cloud_speech.ExplicitDecodingConfig(
-                    encoding=cloud_speech.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
-                    sample_rate_hertz=self._sample_rate,
-                    audio_channel_count=self._num_channels,
-                ),
-                language_codes=self._config.languages,
-                model=self._config.model,
-                features=cloud_speech.RecognitionFeatures(
-                    enable_automatic_punctuation=self._config.punctuate,
-                ),
-            ),
-            streaming_features=cloud_speech.StreamingRecognitionFeatures(
-                enable_voice_activity_events=True,
-                interim_results=self._config.interim_results,
-            ),
-        )
+        self._reconnect_event = asyncio.Event()
+        self._session_connected_at: float = 0
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        self.
-
-
-
-
+    def update_options(
+        self,
+        *,
+        languages: NotGivenOr[LanguageCode] = NOT_GIVEN,
+        detect_language: NotGivenOr[bool] = NOT_GIVEN,
+        interim_results: NotGivenOr[bool] = NOT_GIVEN,
+        punctuate: NotGivenOr[bool] = NOT_GIVEN,
+        spoken_punctuation: NotGivenOr[bool] = NOT_GIVEN,
+        model: NotGivenOr[SpeechModels] = NOT_GIVEN,
+        min_confidence_threshold: NotGivenOr[float] = NOT_GIVEN,
+        keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN,
+    ) -> None:
+        if is_given(languages):
+            if isinstance(languages, str):
+                languages = [languages]
+            self._config.languages = cast(list[LgType], languages)
+        if is_given(detect_language):
+            self._config.detect_language = detect_language
+        if is_given(interim_results):
+            self._config.interim_results = interim_results
+        if is_given(punctuate):
+            self._config.punctuate = punctuate
+        if is_given(spoken_punctuation):
+            self._config.spoken_punctuation = spoken_punctuation
+        if is_given(model):
+            self._config.model = model
+        if is_given(min_confidence_threshold):
+            self._config.min_confidence_threshold = min_confidence_threshold
+        if is_given(keywords):
+            self._config.keywords = keywords
+
+        self._reconnect_event.set()
+
+    async def _run(self) -> None:
+        audio_pushed = False
+
+        # google requires a async generator when calling streaming_recognize
+        # this function basically convert the queue into a async generator
+        async def input_generator(
+            client: SpeechAsyncClient, should_stop: asyncio.Event
+        ) -> AsyncGenerator[cloud_speech.StreamingRecognizeRequest, None]:
+            nonlocal audio_pushed
+            try:
+                # first request should contain the config
+                yield cloud_speech.StreamingRecognizeRequest(
+                    recognizer=self._recognizer_cb(client),
+                    streaming_config=self._streaming_config,
+                )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+                async for frame in self._input_ch:
+                    # when the stream is aborted due to reconnect, this input_generator
+                    # needs to stop consuming frames
+                    # when the generator stops, the previous gRPC stream will close
+                    if should_stop.is_set():
+                        return
+
+                    if isinstance(frame, rtc.AudioFrame):
+                        yield cloud_speech.StreamingRecognizeRequest(audio=frame.data.tobytes())
+                        if not audio_pushed:
+                            audio_pushed = True
+
+            except Exception:
+                logger.exception("an error occurred while streaming input to google STT")
+
+        async def process_stream(
+            client: SpeechAsyncClient,
+            stream: AsyncIterable[cloud_speech.StreamingRecognizeResponse],
+        ) -> None:
+            has_started = False
+            async for resp in stream:
+                if (
+                    resp.speech_event_type
+                    == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
+                ):
+                    self._event_ch.send_nowait(
+                        stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH)
+                    )
+                    has_started = True
+
+                if (
+                    resp.speech_event_type
+                    == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_EVENT_TYPE_UNSPECIFIED  # noqa: E501
+                ):
+                    result = resp.results[0]
+                    speech_data = _streaming_recognize_response_to_speech_data(
+                        resp,
+                        min_confidence_threshold=self._config.min_confidence_threshold,
+                    )
+                    if speech_data is None:
+                        continue
+
+                    if not result.is_final:
+                        self._event_ch.send_nowait(
+                            stt.SpeechEvent(
+                                type=stt.SpeechEventType.INTERIM_TRANSCRIPT,
+                                alternatives=[speech_data],
                             )
-
-
-
-
-
-
-
-                            self._queue.task_done()
-                            frame = frame.remix_and_resample(
-                                self._sample_rate, self._num_channels
-                            )
-                            yield cloud_speech.StreamingRecognizeRequest(
-                                audio=frame.data.tobytes(),
-                            )
-            except Exception as e:
-                logging.error(
-                    f"an error occurred while streaming inputs: {e}"
+                        )
+                    else:
+                        self._event_ch.send_nowait(
+                            stt.SpeechEvent(
+                                type=stt.SpeechEventType.FINAL_TRANSCRIPT,
+                                alternatives=[speech_data],
                             )
-
-                # try to connect
-                stream = await self._client.streaming_recognize(
-                    requests=input_generator()
-                )
-                retry_count = 0  # connection successful, reset retry count
-
-                await self._run_stream(stream)
-            except Exception as e:
-                if retry_count >= max_retry:
-                    logging.error(
-                        f"failed to connect to google stt after {max_retry} tries",
-                        exc_info=e,
                         )
-
-
-
-
-
-
-
+                        if time.time() - self._session_connected_at > _max_session_duration:
+                            logger.debug(
+                                "Google STT maximum connection time reached. Reconnecting..."
+                            )
+                            self._pool.remove(client)
+                            if has_started:
+                                self._event_ch.send_nowait(
+                                    stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH)
+                                )
+                                has_started = False
+                            self._reconnect_event.set()
+                            return
+
+                if (
+                    resp.speech_event_type
+                    == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END
+                ):
+                    self._event_ch.send_nowait(
+                        stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH)
                     )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                type=stt.SpeechEventType.INTERIM_TRANSCRIPT,
-                alternatives=streaming_recognize_response_to_speech_data(resp),
+                    has_started = False
+
+        while True:
+            audio_pushed = False
+            try:
+                async with self._pool.connection(timeout=self._conn_options.timeout) as client:
+                    self._streaming_config = cloud_speech.StreamingRecognitionConfig(
+                        config=cloud_speech.RecognitionConfig(
+                            explicit_decoding_config=cloud_speech.ExplicitDecodingConfig(
+                                encoding=cloud_speech.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
+                                sample_rate_hertz=self._config.sample_rate,
+                                audio_channel_count=1,
+                            ),
+                            adaptation=self._config.build_adaptation(),
+                            language_codes=self._config.languages,
+                            model=self._config.model,
+                            features=cloud_speech.RecognitionFeatures(
+                                enable_automatic_punctuation=self._config.punctuate,
+                                enable_word_time_offsets=self._config.enable_word_time_offsets,
+                                enable_spoken_punctuation=self._config.spoken_punctuation,
+                            ),
+                        ),
+                        streaming_features=cloud_speech.StreamingRecognitionFeatures(
+                            interim_results=self._config.interim_results,
+                            enable_voice_activity_events=self._config.enable_voice_activity_events,
+                        ),
                     )
-            self._event_queue.put_nowait(iterim_event)
 
-
-
-
-                alternatives=streaming_recognize_response_to_speech_data(resp),
+                    should_stop = asyncio.Event()
+                    stream = await client.streaming_recognize(
+                        requests=input_generator(client, should_stop),
                     )
-            self.
-            self._event_queue.put_nowait(final_event)
-
-            if not self._speaking:
-                # With Google STT, we receive the final event after the END_OF_SPEECH event
-                sentence = ""
-                confidence = 0.0
-                for alt in self._final_events:
-                    sentence += f"{alt.alternatives[0].text.strip()} "
-                    confidence += alt.alternatives[0].confidence
-
-                sentence = sentence.rstrip()
-                confidence /= len(self._final_events)  # avg. of confidence
-
-                end_event = stt.SpeechEvent(
-                    type=stt.SpeechEventType.END_OF_SPEECH,
-                    alternatives=[
-                        stt.SpeechData(
-                            language=result.language_code,
-                            start_time=self._final_events[0]
-                            .alternatives[0]
-                            .start_time,
-                            end_time=self._final_events[-1]
-                            .alternatives[0]
-                            .end_time,
-                            confidence=confidence,
-                            text=sentence,
-                        )
-                    ],
-                )
+                    self._session_connected_at = time.time()
 
-
-
+                    process_stream_task = asyncio.create_task(process_stream(client, stream))
+                    wait_reconnect_task = asyncio.create_task(self._reconnect_event.wait())
 
-
-
-
-
-
+                    try:
+                        done, _ = await asyncio.wait(
+                            [process_stream_task, wait_reconnect_task],
+                            return_when=asyncio.FIRST_COMPLETED,
+                        )
+                        for task in done:
+                            if task != wait_reconnect_task:
+                                task.result()
+                        if wait_reconnect_task not in done:
+                            break
+                        self._reconnect_event.clear()
+                    finally:
+                        should_stop.set()
+                        if not process_stream_task.done() and not wait_reconnect_task.done():
+                            # try to gracefully stop the process_stream_task
+                            try:
+                                await asyncio.wait_for(process_stream_task, timeout=1.0)
+                            except asyncio.TimeoutError:
+                                pass
+
+                        await utils.aio.gracefully_cancel(process_stream_task, wait_reconnect_task)
+            except DeadlineExceeded:
+                raise APITimeoutError() from None
+            except GoogleAPICallError as e:
+                if e.code == 409:
+                    if audio_pushed:
+                        logger.debug("stream timed out, restarting.")
+                else:
+                    raise APIStatusError(
+                        f"{e.message} {e.details}", status_code=e.code or -1
+                    ) from e
+            except Exception as e:
+                raise APIConnectionError() from e
 
-    async def __anext__(self) -> stt.SpeechEvent:
-        evt = await self._event_queue.get()
-        if evt is None:
-            raise StopAsyncIteration
 
-
+def _duration_to_seconds(duration: Duration | timedelta) -> float:
+    # Proto Plus may auto-convert Duration to timedelta; handle both.
+    # https://proto-plus-python.readthedocs.io/en/latest/marshal.html
+    if isinstance(duration, timedelta):
+        return duration.total_seconds()
+    return duration.seconds + duration.nanos / 1e9
 
 
-def
+def _recognize_response_to_speech_event(
     resp: cloud_speech.RecognizeResponse,
 ) -> stt.SpeechEvent:
-
-
-
-
-    alternatives
+    text = ""
+    confidence = 0.0
+    for result in resp.results:
+        text += result.alternatives[0].transcript
+        confidence += result.alternatives[0].confidence
+
+    alternatives = []
+
+    # Google STT may return empty results when spoken_lang != stt_lang
+    if resp.results:
+        try:
+            start_time = _duration_to_seconds(resp.results[0].alternatives[0].words[0].start_offset)
+            end_time = _duration_to_seconds(resp.results[-1].alternatives[0].words[-1].end_offset)
+        except IndexError:
+            # When enable_word_time_offsets=False, there are no "words" to access
+            start_time = end_time = 0
+
+        confidence /= len(resp.results)
+        lg = resp.results[0].language_code
+
+        alternatives = [
             stt.SpeechData(
-                language=
-                start_time=
-                end_time=
-                confidence=
-                text=
+                language=lg,
+                start_time=start_time,
+                end_time=end_time,
+                confidence=confidence,
+                text=text,
             )
-
-
-    )
+        ]
+
+    return stt.SpeechEvent(type=stt.SpeechEventType.FINAL_TRANSCRIPT, alternatives=alternatives)
 
 
-def
+def _streaming_recognize_response_to_speech_data(
     resp: cloud_speech.StreamingRecognizeResponse,
-
-
-
-
-
-
-
-
-
-
-
-
-
+    *,
+    min_confidence_threshold: float,
+) -> stt.SpeechData | None:
+    text = ""
+    confidence = 0.0
+    final_result = None
+    for result in resp.results:
+        if len(result.alternatives) == 0:
+            continue
+        else:
+            if result.is_final:
+                final_result = result
+                break
+            else:
+                text += result.alternatives[0].transcript
+                confidence += result.alternatives[0].confidence
+
+    if final_result is not None:
+        text = final_result.alternatives[0].transcript
+        confidence = final_result.alternatives[0].confidence
+        lg = final_result.language_code
+    else:
+        confidence /= len(resp.results)
+        if confidence < min_confidence_threshold:
+            return None
+        lg = resp.results[0].language_code
+
+    if text == "":
+        return None
+
+    data = stt.SpeechData(language=lg, start_time=0, end_time=0, confidence=confidence, text=text)
+
+    return data
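
The most intricate part of the new SpeechStream._run() is the session rotation: a process_stream task is raced against a reconnect event via asyncio.wait(FIRST_COMPLETED), and the gRPC stream is rebuilt before Google's five-minute cap (_max_session_duration = 240 above). The core pattern, reduced to a self-contained sketch with illustrative names (not the plugin's API):

    import asyncio

    async def run_with_rotation(make_worker, reconnect_event: asyncio.Event) -> None:
        # Race a worker task against a reconnect signal; rebuild the worker on
        # signal, exit when it completes on its own (illustrative names).
        while True:
            worker = asyncio.create_task(make_worker())
            waiter = asyncio.create_task(reconnect_event.wait())
            try:
                done, _ = await asyncio.wait(
                    {worker, waiter}, return_when=asyncio.FIRST_COMPLETED
                )
                if waiter not in done:
                    worker.result()  # re-raise any worker exception
                    break            # worker finished: no rotation needed
                reconnect_event.clear()  # rotate: the loop builds a fresh worker
            finally:
                for task in (worker, waiter):
                    if not task.done():
                        task.cancel()
                await asyncio.gather(worker, waiter, return_exceptions=True)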