PyPI - livekit-plugins-google - Versions diffs - 0.6.0__py3-none-any.whl → 0.6.dev0__py3-none-any.whl - Mend

livekit-plugins-google 0.6.0__py3-none-any.whl → 0.6.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

livekit/plugins/google/stt.py CHANGED Viewed

@@ -15,13 +15,15 @@
 from __future__ import annotations
 import asyncio
+import contextlib
 import dataclasses
 import os
 from dataclasses import dataclass
-from typing import AsyncIterable, List, Union
+from typing import AsyncIterable, List, Optional, Union
-from livekit import agents
-from livekit.agents import stt, utils
+from livekit import agents, rtc
+from livekit.agents import stt
+from livekit.agents.utils import AudioBuffer
 from google.cloud.speech_v2 import SpeechAsyncClient
 from google.cloud.speech_v2.types import cloud_speech
@@ -61,9 +63,7 @@ class STT(stt.STT):
         if no credentials is provided, it will use the credentials on the environment
         GOOGLE_APPLICATION_CREDENTIALS (default behavior of Google SpeechAsyncClient)
         """
-        super().__init__(
-            capabilities=stt.STTCapabilities(streaming=True, interim_results=True)
-        )
+        super().__init__(streaming_supported=True)
         self._client: SpeechAsyncClient | None = None
         self._credentials_info = credentials_info
@@ -112,7 +112,11 @@ class STT(stt.STT):
         project_id = self._ensure_client().transport._credentials.project_id  # type: ignore
         return f"projects/{project_id}/locations/global/recognizers/_"
-    def _sanitize_options(self, *, language: str | None = None) -> STTOptions:
+    def _sanitize_options(
+        self,
+        *,
+        language: str | None = None,
+    ) -> STTOptions:
         config = dataclasses.replace(self._config)
         if language:
@@ -131,8 +135,8 @@ class STT(stt.STT):
     async def recognize(
         self,
-        buffer: utils.AudioBuffer,
         *,
+        buffer: AudioBuffer,
         language: SpeechLanguages | str | None = None,
     ) -> stt.SpeechEvent:
         config = self._sanitize_options(language=language)
@@ -155,16 +159,24 @@ class STT(stt.STT):
         raw = await self._ensure_client().recognize(
             cloud_speech.RecognizeRequest(
-                recognizer=self._recognizer, config=config, content=frame.data.tobytes()
+                recognizer=self._recognizer,
+                config=config,
+                content=frame.data.tobytes(),
             )
         )
         return _recognize_response_to_speech_event(raw)
     def stream(
-        self, *, language: SpeechLanguages | str | None = None
+        self,
+        *,
+        language: SpeechLanguages | str | None = None,
     ) -> "SpeechStream":
         config = self._sanitize_options(language=language)
-        return SpeechStream(self._ensure_client(), self._recognizer, config)
+        return SpeechStream(
+            self._ensure_client(),
+            self._recognizer,
+            config,
+        )
 class SpeechStream(stt.SpeechStream):
@@ -184,7 +196,15 @@ class SpeechStream(stt.SpeechStream):
         self._config = config
         self._sample_rate = sample_rate
         self._num_channels = num_channels
-        self._max_retry = max_retry
+        self._queue = asyncio.Queue[Optional[rtc.AudioFrame]]()
+        self._event_queue = asyncio.Queue[Optional[stt.SpeechEvent]]()
+        self._closed = False
+        self._main_task = asyncio.create_task(self._run(max_retry=max_retry))
+        self._final_events: List[stt.SpeechEvent] = []
+        self._need_bos = True
+        self._need_eos = False
         self._streaming_config = cloud_speech.StreamingRecognitionConfig(
             config=cloud_speech.RecognitionConfig(
@@ -206,13 +226,30 @@ class SpeechStream(stt.SpeechStream):
             ),
         )
-    @utils.log_exceptions(logger=logger)
-    async def _main_task(self) -> None:
-        await self._run(self._max_retry)
+        def log_exception(task: asyncio.Task) -> None:
+            if not task.cancelled() and task.exception():
+                logger.error(f"google stt task failed: {task.exception()}")
+        self._main_task.add_done_callback(log_exception)
+    def push_frame(self, frame: rtc.AudioFrame) -> None:
+        if self._closed:
+            raise ValueError("cannot push frame to closed stream")
+        self._queue.put_nowait(frame)
+    async def aclose(self, *, wait: bool = True) -> None:
+        self._closed = True
+        if not wait:
+            self._main_task.cancel()
+        self._queue.put_nowait(None)
+        with contextlib.suppress(asyncio.CancelledError):
+            await self._main_task
     async def _run(self, max_retry: int) -> None:
         retry_count = 0
-        while not self._input_ch.closed:
+        while not self._closed:
             try:
                 # google requires a async generator when calling streaming_recognize
                 # this function basically convert the queue into a async generator
@@ -223,19 +260,19 @@ class SpeechStream(stt.SpeechStream):
                             recognizer=self._recognizer,
                             streaming_config=self._streaming_config,
                         )
+                        while True:
+                            frame = await self._queue.get()
+                            if frame is None:
+                                break
-                        async for frame in self._input_ch:
                             frame = frame.remix_and_resample(
                                 self._sample_rate, self._num_channels
                             )
                             yield cloud_speech.StreamingRecognizeRequest(
-                                audio=frame.data.tobytes()
+                                audio=frame.data.tobytes(),
                             )
-                    except Exception:
-                        logger.exception(
-                            "an error occurred while streaming input to google STT"
-                        )
+                    except Exception as e:
+                        logger.error(f"an error occurred while streaming inputs: {e}")
                 # try to connect
                 stream = await self._client.streaming_recognize(
@@ -260,6 +297,8 @@ class SpeechStream(stt.SpeechStream):
                 )
                 await asyncio.sleep(retry_delay)
+        self._event_queue.put_nowait(None)
     async def _run_stream(
         self, stream: AsyncIterable[cloud_speech.StreamingRecognizeResponse]
     ):
@@ -268,9 +307,11 @@ class SpeechStream(stt.SpeechStream):
                 resp.speech_event_type
                 == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
             ):
-                self._event_ch.send_nowait(
-                    stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH)
-                )
+                if self._need_eos:
+                    self._send_eos()
+            if self._need_bos:
+                self._send_bos()
             if (
                 resp.speech_event_type
@@ -278,31 +319,96 @@ class SpeechStream(stt.SpeechStream):
             ):
                 result = resp.results[0]
                 if not result.is_final:
-                    self._event_ch.send_nowait(
-                        stt.SpeechEvent(
-                            type=stt.SpeechEventType.INTERIM_TRANSCRIPT,
-                            alternatives=[
-                                _streaming_recognize_response_to_speech_data(resp)
-                            ],
-                        )
+                    iterim_event = stt.SpeechEvent(
+                        type=stt.SpeechEventType.INTERIM_TRANSCRIPT,
+                        alternatives=[
+                            _streaming_recognize_response_to_speech_data(resp)
+                        ],
                     )
+                    self._event_queue.put_nowait(iterim_event)
                 else:
-                    self._event_ch.send_nowait(
-                        stt.SpeechEvent(
-                            type=stt.SpeechEventType.FINAL_TRANSCRIPT,
-                            alternatives=[
-                                _streaming_recognize_response_to_speech_data(resp)
-                            ],
-                        )
+                    final_event = stt.SpeechEvent(
+                        type=stt.SpeechEventType.FINAL_TRANSCRIPT,
+                        alternatives=[
+                            _streaming_recognize_response_to_speech_data(resp)
+                        ],
                     )
+                    self._final_events.append(final_event)
+                    self._event_queue.put_nowait(final_event)
+            if self._need_eos:
+                self._send_eos()
             if (
                 resp.speech_event_type
                 == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END
             ):
-                self._event_ch.send_nowait(
-                    stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH)
-                )
+                self._need_eos = True
+        if not self._need_bos:
+            self._send_eos()
+    def _send_bos(self) -> None:
+        self._need_bos = False
+        start_event = stt.SpeechEvent(
+            type=stt.SpeechEventType.START_OF_SPEECH,
+        )
+        self._event_queue.put_nowait(start_event)
+    def _send_eos(self) -> None:
+        self._need_eos = False
+        self._need_bos = True
+        if self._final_events:
+            lg = self._final_events[0].alternatives[0].language
+            sentence = ""
+            confidence = 0.0
+            for alt in self._final_events:
+                sentence += f"{alt.alternatives[0].text.strip()} "
+                confidence += alt.alternatives[0].confidence
+            sentence = sentence.rstrip()
+            confidence /= len(self._final_events)  # avg. of confidence
+            end_event = stt.SpeechEvent(
+                type=stt.SpeechEventType.END_OF_SPEECH,
+                alternatives=[
+                    stt.SpeechData(
+                        language=lg,
+                        start_time=self._final_events[0].alternatives[0].start_time,
+                        end_time=self._final_events[-1].alternatives[0].end_time,
+                        confidence=confidence,
+                        text=sentence,
+                    )
+                ],
+            )
+            self._final_events = []
+            self._event_queue.put_nowait(end_event)
+        else:
+            end_event = stt.SpeechEvent(
+                type=stt.SpeechEventType.END_OF_SPEECH,
+                alternatives=[
+                    stt.SpeechData(
+                        language="",
+                        start_time=0,
+                        end_time=0,
+                        confidence=0,
+                        text="",
+                    )
+                ],
+            )
+            self._event_queue.put_nowait(end_event)
+    async def __anext__(self) -> stt.SpeechEvent:
+        evt = await self._event_queue.get()
+        if evt is None:
+            raise StopAsyncIteration
+        return evt
 def _recognize_response_to_speech_event(
@@ -347,7 +453,11 @@ def _streaming_recognize_response_to_speech_data(
     lg = resp.results[0].language_code
     data = stt.SpeechData(
-        language=lg, start_time=0, end_time=0, confidence=confidence, text=text
+        language=lg,
+        start_time=0,
+        end_time=0,
+        confidence=confidence,
+        text=text,
     )
     return data

livekit/plugins/google/tts.py CHANGED Viewed

@@ -14,14 +14,19 @@
 from __future__ import annotations
+import asyncio
+import contextlib
 from dataclasses import dataclass
-from typing import Union
+from typing import Optional, Union
 from livekit import rtc
-from livekit.agents import tts, utils
+from livekit.agents import codecs, tts
 from google.cloud import texttospeech
-from google.cloud.texttospeech_v1.types import SsmlVoiceGender, SynthesizeSpeechResponse
+from google.cloud.texttospeech_v1.types import (
+    SsmlVoiceGender,
+    SynthesizeSpeechResponse,
+)
 from .log import logger
 from .models import AudioEncoding, Gender, SpeechLanguages
@@ -55,11 +60,7 @@ class TTS(tts.TTS):
         GOOGLE_APPLICATION_CREDENTIALS (default behavior of Google TextToSpeechAsyncClient)
         """
         super().__init__(
-            capabilities=tts.TTSCapabilities(
-                streaming=True,
-            ),
-            sample_rate=sample_rate,
-            num_channels=1,
+            streaming_supported=False, sample_rate=sample_rate, num_channels=1
         )
         self._client: texttospeech.TextToSpeechAsyncClient | None = None
@@ -73,7 +74,9 @@ class TTS(tts.TTS):
             ssml_gender = SsmlVoiceGender.FEMALE
         voice = texttospeech.VoiceSelectionParams(
-            name=voice_name, language_code=language, ssml_gender=ssml_gender
+            name=voice_name,
+            language_code=language,
+            ssml_gender=ssml_gender,
         )
         if encoding == "linear16" or encoding == "wav":
@@ -113,7 +116,10 @@ class TTS(tts.TTS):
         assert self._client is not None
         return self._client
-    def synthesize(self, text: str) -> "ChunkedStream":
+    def synthesize(
+        self,
+        text: str,
+    ) -> "ChunkedStream":
         return ChunkedStream(text, self._opts, self._ensure_client())
@@ -121,38 +127,60 @@ class ChunkedStream(tts.ChunkedStream):
     def __init__(
         self, text: str, opts: _TTSOptions, client: texttospeech.TextToSpeechAsyncClient
     ) -> None:
-        super().__init__()
-        self._text, self._opts, self._client = text, opts, client
-    @utils.log_exceptions(logger=logger)
-    async def _main_task(self) -> None:
-        request_id = utils.shortuuid()
-        segment_id = utils.shortuuid()
-        response: SynthesizeSpeechResponse = await self._client.synthesize_speech(
-            input=texttospeech.SynthesisInput(text=self._text),
-            voice=self._opts.voice,
-            audio_config=self._opts.audio_config,
-        )
+        self._text = text
+        self._opts = opts
+        self._client = client
+        self._main_task: asyncio.Task | None = None
+        self._queue = asyncio.Queue[Optional[tts.SynthesizedAudio]]()
+    async def _run(self) -> None:
+        try:
+            response: SynthesizeSpeechResponse = await self._client.synthesize_speech(
+                input=texttospeech.SynthesisInput(text=self._text),
+                voice=self._opts.voice,
+                audio_config=self._opts.audio_config,
+            )
-        data = response.audio_content
-        if self._opts.audio_config.audio_encoding == "mp3":
-            decoder = utils.codecs.Mp3StreamDecoder()
-            for frame in decoder.decode_chunk(data):
-                self._event_ch.send_nowait(
+            data = response.audio_content
+            if self._opts.audio_config.audio_encoding == "mp3":
+                decoder = codecs.Mp3StreamDecoder()
+                frames = decoder.decode_chunk(data)
+                for frame in frames:
+                    self._queue.put_nowait(
+                        tts.SynthesizedAudio(text=self._text, data=frame)
+                    )
+            else:
+                self._queue.put_nowait(
                     tts.SynthesizedAudio(
-                        request_id=request_id, segment_id=segment_id, frame=frame
+                        text="",
+                        data=rtc.AudioFrame(
+                            data=data,
+                            sample_rate=self._opts.audio_config.sample_rate_hertz,
+                            num_channels=1,
+                            samples_per_channel=len(data) // 2,  # 16-bit
+                        ),
                     )
                 )
-        else:
-            self._event_ch.send_nowait(
-                tts.SynthesizedAudio(
-                    request_id=request_id,
-                    segment_id=segment_id,
-                    frame=rtc.AudioFrame(
-                        data=data,
-                        sample_rate=self._opts.audio_config.sample_rate_hertz,
-                        num_channels=1,
-                        samples_per_channel=len(data) // 2,  # 16-bit
-                    ),
-                )
-            )
+        except Exception:
+            logger.exception("failed to synthesize")
+        finally:
+            self._queue.put_nowait(None)
+    async def __anext__(self) -> tts.SynthesizedAudio:
+        if not self._main_task:
+            self._main_task = asyncio.create_task(self._run())
+        frame = await self._queue.get()
+        if frame is None:
+            raise StopAsyncIteration
+        return frame
+    async def aclose(self) -> None:
+        if not self._main_task:
+            return
+        self._main_task.cancel()
+        with contextlib.suppress(asyncio.CancelledError):
+            await self._main_task

livekit/plugins/google/version.py CHANGED Viewed

@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-__version__ = "0.6.0"
+__version__ = "0.6.dev0"

{livekit_plugins_google-0.6.0.dist-info → livekit_plugins_google-0.6.dev0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: livekit-plugins-google
-Version: 0.6.0
+Version: 0.6.dev0
 Summary: Agent Framework plugin for services from Google Cloud
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0
@@ -14,14 +14,23 @@ Classifier: Topic :: Multimedia :: Sound/Audio
 Classifier: Topic :: Multimedia :: Video
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3 :: Only
-Requires-Python: >=3.9.0
+Requires-Python: >=3.7.0
 Description-Content-Type: text/markdown
+Requires-Dist: numpy <2,>=1
+Requires-Dist: google-api-core <3,>=2
+Requires-Dist: google-auth <3,>=2
+Requires-Dist: google-cloud-core <3,>=2
 Requires-Dist: google-cloud-speech <3,>=2
 Requires-Dist: google-cloud-texttospeech <3,>=2
-Requires-Dist: livekit-agents >=0.8.0.dev0
+Requires-Dist: google-cloud-translate <4,>=3
+Requires-Dist: googleapis-common-protos <2,>=1
+Requires-Dist: livekit ~=0.11
+Requires-Dist: livekit-agents ~=0.8.dev0
 # LiveKit Plugins Google

livekit_plugins_google-0.6.dev0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,11 @@
+livekit/plugins/google/__init__.py,sha256=DlQC5cosMFyQlM8_vFvJGoZiziFkd0Sa4mutnsxXyZM,959
+livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
+livekit/plugins/google/models.py,sha256=n8pgTJ7xyJpPCZJ_y0GzaQq6LqYknL6K6trpi07-AxM,1307
+livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+livekit/plugins/google/stt.py,sha256=GfWita3mgLZG2KpS9WYMCL8jwCNN5qukicpI58zPCcY,16058
+livekit/plugins/google/tts.py,sha256=J3V5aDUz0V2_Dfs16pobDVx7XwQqU1AEM8TWXdaDn9w,6182
+livekit/plugins/google/version.py,sha256=yB6WnbnD5MFhQDT5ItJ02XWVsNanlDYiOezzwv0IdcM,603
+livekit_plugins_google-0.6.dev0.dist-info/METADATA,sha256=azeNkX6imQv83LarBM4dZedsNBmaeDG0ESFS8-Q-S0E,1947
+livekit_plugins_google-0.6.dev0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+livekit_plugins_google-0.6.dev0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
+livekit_plugins_google-0.6.dev0.dist-info/RECORD,,

{livekit_plugins_google-0.6.0.dist-info → livekit_plugins_google-0.6.dev0.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (71.1.0)
+Generator: bdist_wheel (0.43.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

livekit_plugins_google-0.6.0.dist-info/RECORD DELETED Viewed

@@ -1,11 +0,0 @@
-livekit/plugins/google/__init__.py,sha256=DlQC5cosMFyQlM8_vFvJGoZiziFkd0Sa4mutnsxXyZM,959
-livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
-livekit/plugins/google/models.py,sha256=n8pgTJ7xyJpPCZJ_y0GzaQq6LqYknL6K6trpi07-AxM,1307
-livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-livekit/plugins/google/stt.py,sha256=bqXaoi5trER7PE45axfEpHwReElmf7yl38RpK1iJsdc,12849
-livekit/plugins/google/tts.py,sha256=KUw826CK3yt5meGVj0TKkueQ8o_gaXbc1Rtvdv2yF5M,5548
-livekit/plugins/google/version.py,sha256=Z62pORgDetwUvtfZOgPeIzXJugcrpDAOzC876rjCR0o,600
-livekit_plugins_google-0.6.0.dist-info/METADATA,sha256=Gb5O82GO4CpSvNHeYs4kD2K-neRklRGXaEQwOSQ8SpM,1584
-livekit_plugins_google-0.6.0.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
-livekit_plugins_google-0.6.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
-livekit_plugins_google-0.6.0.dist-info/RECORD,,

{livekit_plugins_google-0.6.0.dist-info → livekit_plugins_google-0.6.dev0.dist-info}/top_level.txt RENAMED Viewed

File without changes

livekit-plugins-google 0.6.0__py3-none-any.whl → 0.6.dev0__py3-none-any.whl

livekit-plugins-google 0.6.0__py3-none-any.whl → 0.6.dev0__py3-none-any.whl