PyPI - livekit-plugins-google - Versions diffs - 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl - Mend

livekit-plugins-google 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

livekit/plugins/google/__init__.py CHANGED Viewed

@@ -22,7 +22,7 @@ from livekit.agents import Plugin
 class GooglePlugin(Plugin):
     def __init__(self):
-        super().__init__(__name__, __version__)
+        super().__init__(__name__, __version__, __package__)
     def download_files(self):
         pass

livekit/plugins/google/py.typed ADDED Viewed

File without changes

livekit/plugins/google/stt.py CHANGED Viewed

@@ -12,23 +12,27 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations
+import asyncio
 import contextlib
-from typing import Optional, Union, List
-from google.auth import credentials
-from google.cloud.speech_v2 import SpeechAsyncClient
-from google.cloud.speech_v2.types import cloud_speech
-from livekit import rtc, agents
-from livekit.agents.utils import AudioBuffer
-from livekit.agents import stt
-from .models import SpeechModels, SpeechLanguages
-from dataclasses import dataclass
 import dataclasses
-import asyncio
 import logging
+from dataclasses import dataclass
+from typing import Any, AsyncIterable, Dict, List
+from livekit import agents, rtc
+from livekit.agents import stt
+from livekit.agents.utils import AudioBuffer
+from google.auth import credentials  # type: ignore
+from google.cloud.speech_v2 import SpeechAsyncClient
+from google.cloud.speech_v2.types import cloud_speech
-LgType = Union[SpeechLanguages, str]
-LanguageCode = Union[LgType, List[LgType]]
+from .models import SpeechLanguages, SpeechModels
+LgType = SpeechLanguages | str
+LanguageCode = LgType | List[LgType]
 # This class is only be used internally to encapsulate the options
@@ -52,8 +56,8 @@ class STT(stt.STT):
         punctuate: bool = True,
         spoken_punctuation: bool = True,
         model: SpeechModels = "long",
-        credentials_info: Optional[dict] = None,
-        credentials_file: Optional[str] = None,
+        credentials_info: Dict[str, Any] | None = None,
+        credentials_file: str | None = None,
     ):
         """
         if no credentials is provided, it will use the credentials on the environment
@@ -90,7 +94,7 @@ class STT(stt.STT):
     def _sanitize_options(
         self,
         *,
-        language: Optional[str] = None,
+        language: str | None = None,
     ) -> STTOptions:
         config = dataclasses.replace(self._config)
@@ -112,7 +116,7 @@ class STT(stt.STT):
         self,
         *,
         buffer: AudioBuffer,
-        language: Optional[Union[SpeechLanguages, str]] = None,
+        language: SpeechLanguages | str | None = None,
     ) -> stt.SpeechEvent:
         config = self._sanitize_options(language=language)
         buffer = agents.utils.merge_frames(buffer)
@@ -144,7 +148,7 @@ class STT(stt.STT):
     def stream(
         self,
         *,
-        language: Optional[Union[SpeechLanguages, str]] = None,
+        language: SpeechLanguages | str | None = None,
     ) -> "SpeechStream":
         config = self._sanitize_options(language=language)
         return SpeechStream(
@@ -164,6 +168,7 @@ class SpeechStream(stt.SpeechStream):
         config: STTOptions,
         sample_rate: int = 24000,
         num_channels: int = 1,
+        max_retry: int = 32,
     ) -> None:
         super().__init__()
@@ -174,33 +179,15 @@ class SpeechStream(stt.SpeechStream):
         self._sample_rate = sample_rate
         self._num_channels = num_channels
-        self._queue = asyncio.Queue[rtc.AudioFrame]()
-        self._event_queue = asyncio.Queue[stt.SpeechEvent]()
+        self._queue = asyncio.Queue[rtc.AudioFrame | None]()
+        self._event_queue = asyncio.Queue[stt.SpeechEvent | None]()
         self._closed = False
-        self._main_task = asyncio.create_task(self._run(max_retry=32))
-        def log_exception(task: asyncio.Task) -> None:
-            if not task.cancelled() and task.exception():
-                logging.error(f"google speech task failed: {task.exception()}")
+        self._main_task = asyncio.create_task(self._run(max_retry=max_retry))
-        self._main_task.add_done_callback(log_exception)
-    def push_frame(self, frame: rtc.AudioFrame) -> None:
-        if self._closed:
-            raise ValueError("cannot push frame to closed stream")
-        self._queue.put_nowait(frame)
-    async def flush(self) -> None:
-        await self._queue.join()
-    async def aclose(self) -> None:
-        self._main_task.cancel()
-        with contextlib.suppress(asyncio.CancelledError):
-            await self._main_task
+        self._final_events: List[stt.SpeechEvent] = []
+        self._speaking = False
-    def _streaming_config(self) -> cloud_speech.StreamingRecognitionConfig:
-        return cloud_speech.StreamingRecognitionConfig(
+        self._streaming_config = cloud_speech.StreamingRecognitionConfig(
             config=cloud_speech.RecognitionConfig(
                 explicit_decoding_config=cloud_speech.ExplicitDecodingConfig(
                     encoding=cloud_speech.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
@@ -214,64 +201,168 @@ class SpeechStream(stt.SpeechStream):
                 ),
             ),
             streaming_features=cloud_speech.StreamingRecognitionFeatures(
+                enable_voice_activity_events=True,
                 interim_results=self._config.interim_results,
             ),
         )
-    async def _run(self, max_retry: int) -> None:
-        """Try to connect to Google Speech API and forward frames"""
-        retry_count = 0
-        while True:
-            try:
-                input_gen = self._input_gen(self._streaming_config())
-                stream = await self._client.streaming_recognize(requests=input_gen)
-                retry_count = 0
-                async for resp in stream:
-                    self._event_queue.put_nowait(
-                        streaming_recognize_response_to_speech_event(resp)
-                    )
+        def log_exception(task: asyncio.Task) -> None:
+            if not task.cancelled() and task.exception():
+                logging.error(f"google stt task failed: {task.exception()}")
-            except asyncio.CancelledError:
-                break
-            except Exception as e:
-                if retry_count > max_retry and max_retry > 0:
-                    logging.error(f"failed to connect to Google Speech: {e}")
-                    break
+        self._main_task.add_done_callback(log_exception)
-                retry_delay = min(retry_count * 5, 5)  # max 5s
-                retry_count += 1
-                logging.warning(
-                    f"failed to connect to Google Speech: {e} - retrying in {retry_delay}s"
-                )
-                await asyncio.sleep(retry_delay)
+    def push_frame(self, frame: rtc.AudioFrame) -> None:
+        if self._closed:
+            raise ValueError("cannot push frame to closed stream")
+        self._queue.put_nowait(frame)
+    async def aclose(self, wait: bool = True) -> None:
         self._closed = True
+        if not wait:
+            self._main_task.cancel()
-    async def _input_gen(self, config):
-        """
-        Convert our input queue to a generator (needed by the Google Speech client in Python)
-        """
+        self._queue.put_nowait(None)
+        with contextlib.suppress(asyncio.CancelledError):
+            await self._main_task
+    async def _run(self, max_retry: int) -> None:
+        retry_count = 0
         try:
-            yield cloud_speech.StreamingRecognizeRequest(
-                recognizer=self._recognizer,
-                streaming_config=config,
-            )
-            while True:
-                frame = await self._queue.get()  # wait for a new rtc.AudioFrame
-                frame = frame.remix_and_resample(self._sample_rate, self._num_channels)
-                yield cloud_speech.StreamingRecognizeRequest(
-                    audio=frame.data.tobytes(),
+            while not self._closed:
+                try:
+                    # google requires a async generator when calling streaming_recognize
+                    # this function basically convert the queue into a async generator
+                    async def input_generator():
+                        try:
+                            # first request should contain the config
+                            yield cloud_speech.StreamingRecognizeRequest(
+                                recognizer=self._recognizer,
+                                streaming_config=self._streaming_config,
+                            )
+                            while True:
+                                frame = (
+                                    await self._queue.get()
+                                )  # wait for a new rtc.AudioFrame
+                                if frame is None:
+                                    break  # None is sent inside aclose
+                                self._queue.task_done()
+                                frame = frame.remix_and_resample(
+                                    self._sample_rate, self._num_channels
+                                )
+                                yield cloud_speech.StreamingRecognizeRequest(
+                                    audio=frame.data.tobytes(),
+                                )
+                        except Exception as e:
+                            logging.error(
+                                f"an error occurred while streaming inputs: {e}"
+                            )
+                    # try to connect
+                    stream = await self._client.streaming_recognize(
+                        requests=input_generator()
+                    )
+                    retry_count = 0  # connection successful, reset retry count
+                    await self._run_stream(stream)
+                except Exception as e:
+                    if retry_count >= max_retry:
+                        logging.error(
+                            f"failed to connect to google stt after {max_retry} tries",
+                            exc_info=e,
+                        )
+                        break
+                    retry_delay = min(retry_count * 2, 10)  # max 10s
+                    retry_count += 1
+                    logging.warning(
+                        f"google stt connection failed, retrying in {retry_delay}s",
+                        exc_info=e,
+                    )
+                    await asyncio.sleep(retry_delay)
+        finally:
+            self._event_queue.put_nowait(None)
+    async def _run_stream(
+        self, stream: AsyncIterable[cloud_speech.StreamingRecognizeResponse]
+    ):
+        async for resp in stream:
+            if (
+                resp.speech_event_type
+                == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
+            ):
+                self._speaking = True
+                start_event = stt.SpeechEvent(
+                    type=stt.SpeechEventType.START_OF_SPEECH,
                 )
-                self._queue.task_done()
-        except Exception as e:
-            logging.error(f"an error occurred while streaming inputs: {e}")
+                self._event_queue.put_nowait(start_event)
+            if (
+                resp.speech_event_type
+                == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_EVENT_TYPE_UNSPECIFIED
+            ):
+                result = resp.results[0]
+                if not result.is_final:
+                    # interim results
+                    iterim_event = stt.SpeechEvent(
+                        type=stt.SpeechEventType.INTERIM_TRANSCRIPT,
+                        alternatives=streaming_recognize_response_to_speech_data(resp),
+                    )
+                    self._event_queue.put_nowait(iterim_event)
+                else:
+                    final_event = stt.SpeechEvent(
+                        type=stt.SpeechEventType.FINAL_TRANSCRIPT,
+                        alternatives=streaming_recognize_response_to_speech_data(resp),
+                    )
+                    self._final_events.append(final_event)
+                    self._event_queue.put_nowait(final_event)
+                    if not self._speaking:
+                        # With Google STT, we receive the final event after the END_OF_SPEECH event
+                        sentence = ""
+                        confidence = 0.0
+                        for alt in self._final_events:
+                            sentence += f"{alt.alternatives[0].text.strip()} "
+                            confidence += alt.alternatives[0].confidence
+                        sentence = sentence.rstrip()
+                        confidence /= len(self._final_events)  # avg. of confidence
+                        end_event = stt.SpeechEvent(
+                            type=stt.SpeechEventType.END_OF_SPEECH,
+                            alternatives=[
+                                stt.SpeechData(
+                                    language=result.language_code,
+                                    start_time=self._final_events[0]
+                                    .alternatives[0]
+                                    .start_time,
+                                    end_time=self._final_events[-1]
+                                    .alternatives[0]
+                                    .end_time,
+                                    confidence=confidence,
+                                    text=sentence,
+                                )
+                            ],
+                        )
+                        self._final_events = []
+                        self._event_queue.put_nowait(end_event)
+            if (
+                resp.speech_event_type
+                == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END
+            ):
+                self._speaking = False
     async def __anext__(self) -> stt.SpeechEvent:
-        if self._closed and self._event_queue.empty():
+        evt = await self._event_queue.get()
+        if evt is None:
             raise StopAsyncIteration
-        return await self._event_queue.get()
+        return evt
 def recognize_response_to_speech_event(
@@ -280,8 +371,7 @@ def recognize_response_to_speech_event(
     result = resp.results[0]
     gg_alts = result.alternatives
     return stt.SpeechEvent(
-        is_final=True,
-        end_of_speech=True,
+        type=stt.SpeechEventType.FINAL_TRANSCRIPT,
         alternatives=[
             stt.SpeechData(
                 language=result.language_code,
@@ -295,24 +385,18 @@ def recognize_response_to_speech_event(
     )
-def streaming_recognize_response_to_speech_event(
+def streaming_recognize_response_to_speech_data(
     resp: cloud_speech.StreamingRecognizeResponse,
-) -> stt.SpeechEvent:
+) -> List[stt.SpeechData]:
     result = resp.results[0]
     gg_alts = result.alternatives
-    return stt.SpeechEvent(
-        is_final=result.is_final,
-        # Google STT does not have a separate end_of_speech indicator
-        # so we'll use is_final
-        end_of_speech=result.is_final,
-        alternatives=[
-            stt.SpeechData(
-                language=result.language_code,
-                start_time=alt.words[0].start_offset.seconds if alt.words else 0,
-                end_time=alt.words[-1].end_offset.seconds if alt.words else 0,
-                confidence=alt.confidence,
-                text=alt.transcript,
-            )
-            for alt in gg_alts
-        ],
-    )
+    return [
+        stt.SpeechData(
+            language=result.language_code,
+            start_time=alt.words[0].start_offset.seconds if alt.words else 0,
+            end_time=alt.words[-1].end_offset.seconds if alt.words else 0,
+            confidence=alt.confidence,
+            text=alt.transcript,
+        )
+        for alt in gg_alts
+    ]

livekit/plugins/google/version.py CHANGED Viewed

@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-__version__ = "0.2.0"
+__version__ = "0.3.0"

{livekit_plugins_google-0.2.0.dist-info → livekit_plugins_google-0.3.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: livekit-plugins-google
-Version: 0.2.0
+Version: 0.3.0
 Summary: Agent Framework plugin for services from Google Cloud
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0
@@ -29,8 +29,8 @@ Requires-Dist: google-cloud-speech <3,>=2
 Requires-Dist: google-cloud-texttospeech <3,>=2
 Requires-Dist: google-cloud-translate <4,>=3
 Requires-Dist: googleapis-common-protos <2,>=1
-Requires-Dist: livekit >=0.9.0
-Requires-Dist: livekit-agents >=0.3.0
+Requires-Dist: livekit >=0.9.2
+Requires-Dist: livekit-agents ~=0.5.dev0
 # LiveKit Plugins Google

livekit_plugins_google-0.3.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+livekit/plugins/google/__init__.py,sha256=snPMHNLrurYbLWQOkV_o6qG1CEWsOCZ8ZfPMvmh5ejY,931
+livekit/plugins/google/models.py,sha256=DgiXOvGDO8D9rfCKHJL28lbyQR8mXXB2kpku-szXLRs,1185
+livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+livekit/plugins/google/stt.py,sha256=lYA8hlkxG3YSw1Q34j8hgs4us5Ij-TLBQTRwtGPN9MY,15025
+livekit/plugins/google/version.py,sha256=G5iYozum4q7UpHwW43F7QfhzUfwcncPxBZ0gmUGsd5I,600
+livekit_plugins_google-0.3.0.dist-info/METADATA,sha256=sPd3OZxViD0Aq1uF1qJpbsYeqLAlq8tB720JXk-_RKw,1945
+livekit_plugins_google-0.3.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+livekit_plugins_google-0.3.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
+livekit_plugins_google-0.3.0.dist-info/RECORD,,

{livekit_plugins_google-0.2.0.dist-info → livekit_plugins_google-0.3.0.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.42.0)
+Generator: bdist_wheel (0.43.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

livekit_plugins_google-0.2.0.dist-info/RECORD DELETED Viewed

@@ -1,8 +0,0 @@
-livekit/plugins/google/__init__.py,sha256=uDkfCsfqWmuPDrDolu-nJrZxpTD53pTCaRVWmyA8a6w,918
-livekit/plugins/google/models.py,sha256=DgiXOvGDO8D9rfCKHJL28lbyQR8mXXB2kpku-szXLRs,1185
-livekit/plugins/google/stt.py,sha256=efyE7vjxWuO99dR9-nSLF9LkmoX0khOwXpayh7-5saY,11149
-livekit/plugins/google/version.py,sha256=cLFCdnm5S21CiJ5UJBcqfRvvFkCQ8p6M5fFUJVJkEiM,600
-livekit_plugins_google-0.2.0.dist-info/METADATA,sha256=8tnZ8TW_UHy87ADQvAJSGFqm42Yi-E30bvV2x1LzzBg,1942
-livekit_plugins_google-0.2.0.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-livekit_plugins_google-0.2.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
-livekit_plugins_google-0.2.0.dist-info/RECORD,,

{livekit_plugins_google-0.2.0.dist-info → livekit_plugins_google-0.3.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

livekit-plugins-google 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

livekit-plugins-google 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl