livekit-plugins-google 0.7.1__tar.gz → 0.7.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/PKG-INFO +2 -2
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit/plugins/google/__init__.py +9 -0
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit/plugins/google/stt.py +59 -10
- livekit_plugins_google-0.7.3/livekit/plugins/google/tts.py +233 -0
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit/plugins/google/version.py +1 -1
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit_plugins_google.egg-info/PKG-INFO +2 -2
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit_plugins_google.egg-info/requires.txt +1 -1
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/setup.py +1 -1
- livekit_plugins_google-0.7.1/livekit/plugins/google/tts.py +0 -174
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/README.md +0 -0
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit/plugins/google/log.py +0 -0
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit/plugins/google/models.py +0 -0
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit/plugins/google/py.typed +0 -0
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit_plugins_google.egg-info/SOURCES.txt +0 -0
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit_plugins_google.egg-info/dependency_links.txt +0 -0
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit_plugins_google.egg-info/top_level.txt +0 -0
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/pyproject.toml +0 -0
- {livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/setup.cfg +0 -0
{livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/PKG-INFO RENAMED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: livekit-plugins-google
-Version: 0.7.1
+Version: 0.7.3
 Summary: Agent Framework plugin for services from Google Cloud
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0
@@ -22,7 +22,7 @@ Description-Content-Type: text/markdown
 Requires-Dist: google-auth<3,>=2
 Requires-Dist: google-cloud-speech<3,>=2
 Requires-Dist: google-cloud-texttospeech<3,>=2
-Requires-Dist: livekit-agents>=0.
+Requires-Dist: livekit-agents>=0.11
 
 # LiveKit Plugins Google
 
```
{livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit/plugins/google/__init__.py RENAMED

```diff
@@ -29,3 +29,12 @@ class GooglePlugin(Plugin):
 
 
 Plugin.register_plugin(GooglePlugin())
+
+# Cleanup docs of unexported modules
+_module = dir()
+NOT_IN_ALL = [m for m in _module if m not in __all__]
+
+__pdoc__ = {}
+
+for n in NOT_IN_ALL:
+    __pdoc__[n] = False
```
{livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit/plugins/google/stt.py RENAMED

```diff
@@ -20,8 +20,15 @@ from dataclasses import dataclass
 from typing import AsyncIterable, List, Union
 
 from livekit import agents, rtc
-from livekit.agents import
-
+from livekit.agents import (
+    APIConnectionError,
+    APIStatusError,
+    APITimeoutError,
+    stt,
+    utils,
+)
+
+from google.api_core.exceptions import Aborted, DeadlineExceeded, GoogleAPICallError
 from google.auth import default as gauth_default
 from google.auth.exceptions import DefaultCredentialsError
 from google.cloud.speech_v2 import SpeechAsyncClient
@@ -43,6 +50,25 @@ class STTOptions:
     punctuate: bool
     spoken_punctuation: bool
     model: SpeechModels
+    keywords: List[tuple[str, float]] | None
+
+    def build_adaptation(self) -> cloud_speech.SpeechAdaptation | None:
+        if self.keywords:
+            return cloud_speech.SpeechAdaptation(
+                phrase_sets=[
+                    cloud_speech.SpeechAdaptation.AdaptationPhraseSet(
+                        inline_phrase_set=cloud_speech.PhraseSet(
+                            phrases=[
+                                cloud_speech.PhraseSet.Phrase(
+                                    value=keyword, boost=boost
+                                )
+                                for keyword, boost in self.keywords
+                            ]
+                        )
+                    )
+                ]
+            )
+        return None
 
 
 class STT(stt.STT):
@@ -57,6 +83,7 @@ class STT(stt.STT):
         model: SpeechModels = "long",
         credentials_info: dict | None = None,
         credentials_file: str | None = None,
+        keywords: List[tuple[str, float]] | None = None,
     ):
         """
         Create a new instance of Google STT.
@@ -93,6 +120,7 @@ class STT(stt.STT):
             punctuate=punctuate,
             spoken_punctuation=spoken_punctuation,
             model=model,
+            keywords=keywords,
         )
 
     def _ensure_client(self) -> SpeechAsyncClient:
@@ -141,7 +169,7 @@ class STT(stt.STT):
 
         return config
 
-    async def
+    async def _recognize_impl(
         self,
         buffer: utils.AudioBuffer,
         *,
@@ -156,6 +184,7 @@ class STT(stt.STT):
                 sample_rate_hertz=frame.sample_rate,
                 audio_channel_count=frame.num_channels,
             ),
+            adaptation=config.build_adaptation(),
             features=cloud_speech.RecognitionFeatures(
                 enable_automatic_punctuation=config.punctuate,
                 enable_spoken_punctuation=config.spoken_punctuation,
@@ -165,23 +194,39 @@ class STT(stt.STT):
             language_codes=config.languages,
         )
 
-
-
-
+        try:
+            raw = await self._ensure_client().recognize(
+                cloud_speech.RecognizeRequest(
+                    recognizer=self._recognizer,
+                    config=config,
+                    content=frame.data.tobytes(),
+                )
             )
-
-
+
+            return _recognize_response_to_speech_event(raw)
+        except DeadlineExceeded:
+            raise APITimeoutError()
+        except GoogleAPICallError as e:
+            raise APIStatusError(
+                e.message,
+                status_code=e.code or -1,
+                request_id=None,
+                body=None,
+            )
+        except Exception as e:
+            raise APIConnectionError() from e
 
     def stream(
         self, *, language: SpeechLanguages | str | None = None
     ) -> "SpeechStream":
         config = self._sanitize_options(language=language)
-        return SpeechStream(self._ensure_client(), self._recognizer, config)
+        return SpeechStream(self, self._ensure_client(), self._recognizer, config)
 
 
 class SpeechStream(stt.SpeechStream):
     def __init__(
         self,
+        stt: STT,
         client: SpeechAsyncClient,
         recognizer: str,
         config: STTOptions,
@@ -189,7 +234,7 @@ class SpeechStream(stt.SpeechStream):
         num_channels: int = 1,
         max_retry: int = 32,
     ) -> None:
-        super().__init__()
+        super().__init__(stt)
 
         self._client = client
         self._recognizer = recognizer
@@ -205,6 +250,7 @@ class SpeechStream(stt.SpeechStream):
                     sample_rate_hertz=self._sample_rate,
                     audio_channel_count=self._num_channels,
                 ),
+                adaptation=config.build_adaptation(),
                 language_codes=self._config.languages,
                 model=self._config.model,
                 features=cloud_speech.RecognitionFeatures(
@@ -257,6 +303,9 @@ class SpeechStream(stt.SpeechStream):
                 retry_count = 0  # connection successful, reset retry count
 
                 await self._run_stream(stream)
+            except Aborted:
+                logger.error("google stt connection aborted")
+                break
             except Exception as e:
                 if retry_count >= max_retry:
                     logger.error(
```
livekit_plugins_google-0.7.3/livekit/plugins/google/tts.py ADDED

```diff
@@ -0,0 +1,233 @@
+# Copyright 2023 LiveKit, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from livekit import rtc
+from livekit.agents import (
+    APIConnectionError,
+    APIStatusError,
+    APITimeoutError,
+    tts,
+    utils,
+)
+
+from google.api_core.exceptions import DeadlineExceeded, GoogleAPICallError
+from google.cloud import texttospeech
+from google.cloud.texttospeech_v1.types import SsmlVoiceGender, SynthesizeSpeechResponse
+
+from .models import AudioEncoding, Gender, SpeechLanguages
+
+
+@dataclass
+class _TTSOptions:
+    voice: texttospeech.VoiceSelectionParams
+    audio_config: texttospeech.AudioConfig
+
+
+class TTS(tts.TTS):
+    def __init__(
+        self,
+        *,
+        language: SpeechLanguages | str = "en-US",
+        gender: Gender | str = "neutral",
+        voice_name: str = "",  # Not required
+        encoding: AudioEncoding | str = "linear16",
+        sample_rate: int = 24000,
+        pitch: int = 0,
+        effects_profile_id: str = "",
+        speaking_rate: float = 1.0,
+        credentials_info: dict | None = None,
+        credentials_file: str | None = None,
+    ) -> None:
+        """
+        Create a new instance of Google TTS.
+
+        Credentials must be provided, either by using the ``credentials_info`` dict, or reading
+        from the file specified in ``credentials_file`` or the ``GOOGLE_APPLICATION_CREDENTIALS``
+        environmental variable.
+
+        Args:
+            language (SpeechLanguages | str, optional): Language code (e.g., "en-US"). Default is "en-US".
+            gender (Gender | str, optional): Voice gender ("male", "female", "neutral"). Default is "neutral".
+            voice_name (str, optional): Specific voice name. Default is an empty string.
+            encoding (AudioEncoding | str, optional): Audio encoding format (e.g., "linear16"). Default is "linear16".
+            sample_rate (int, optional): Audio sample rate in Hz. Default is 24000.
+            pitch (float, optional): Speaking pitch, ranging from -20.0 to 20.0 semitones relative to the original pitch. Default is 0.
+            effects_profile_id (str): Optional identifier for selecting audio effects profiles to apply to the synthesized speech.
+            speaking_rate (float, optional): Speed of speech. Default is 1.0.
+            credentials_info (dict, optional): Dictionary containing Google Cloud credentials. Default is None.
+            credentials_file (str, optional): Path to the Google Cloud credentials JSON file. Default is None.
+        """
+
+        super().__init__(
+            capabilities=tts.TTSCapabilities(
+                streaming=False,
+            ),
+            sample_rate=sample_rate,
+            num_channels=1,
+        )
+
+        self._client: texttospeech.TextToSpeechAsyncClient | None = None
+        self._credentials_info = credentials_info
+        self._credentials_file = credentials_file
+
+        voice = texttospeech.VoiceSelectionParams(
+            name=voice_name,
+            language_code=language,
+            ssml_gender=_gender_from_str(gender),
+        )
+
+        if encoding == "linear16" or encoding == "wav":
+            _audio_encoding = texttospeech.AudioEncoding.LINEAR16
+        elif encoding == "mp3":
+            _audio_encoding = texttospeech.AudioEncoding.MP3
+        else:
+            raise NotImplementedError(f"audio encoding {encoding} is not supported")
+
+        self._opts = _TTSOptions(
+            voice=voice,
+            audio_config=texttospeech.AudioConfig(
+                audio_encoding=_audio_encoding,
+                sample_rate_hertz=sample_rate,
+                pitch=pitch,
+                effects_profile_id=effects_profile_id,
+                speaking_rate=speaking_rate,
+            ),
+        )
+
+    def update_options(
+        self,
+        *,
+        language: SpeechLanguages | str = "en-US",
+        gender: Gender | str = "neutral",
+        voice_name: str = "",  # Not required
+        speaking_rate: float = 1.0,
+    ) -> None:
+        """
+        Update the TTS options.
+
+        Args:
+            language (SpeechLanguages | str, optional): Language code (e.g., "en-US"). Default is "en-US".
+            gender (Gender | str, optional): Voice gender ("male", "female", "neutral"). Default is "neutral".
+            voice_name (str, optional): Specific voice name. Default is an empty string.
+            speaking_rate (float, optional): Speed of speech. Default is 1.0.
+        """
+        self._opts.voice = texttospeech.VoiceSelectionParams(
+            name=voice_name,
+            language_code=language,
+            ssml_gender=_gender_from_str(gender),
+        )
+        self._opts.audio_config.speaking_rate = speaking_rate
+
+    def _ensure_client(self) -> texttospeech.TextToSpeechAsyncClient:
+        if not self._client:
+            if self._credentials_info:
+                self._client = (
+                    texttospeech.TextToSpeechAsyncClient.from_service_account_info(
+                        self._credentials_info
+                    )
+                )
+
+            elif self._credentials_file:
+                self._client = (
+                    texttospeech.TextToSpeechAsyncClient.from_service_account_file(
+                        self._credentials_file
+                    )
+                )
+            else:
+                self._client = texttospeech.TextToSpeechAsyncClient()
+
+        assert self._client is not None
+        return self._client
+
+    def synthesize(self, text: str) -> "ChunkedStream":
+        return ChunkedStream(self, text, self._opts, self._ensure_client())
+
+
+class ChunkedStream(tts.ChunkedStream):
+    def __init__(
+        self,
+        tts: TTS,
+        text: str,
+        opts: _TTSOptions,
+        client: texttospeech.TextToSpeechAsyncClient,
+    ) -> None:
+        super().__init__(tts, text)
+        self._opts, self._client = opts, client
+
+    async def _main_task(self) -> None:
+        request_id = utils.shortuuid()
+
+        try:
+            response: SynthesizeSpeechResponse = await self._client.synthesize_speech(
+                input=texttospeech.SynthesisInput(text=self._input_text),
+                voice=self._opts.voice,
+                audio_config=self._opts.audio_config,
+            )
+
+            data = response.audio_content
+            if self._opts.audio_config.audio_encoding == "mp3":
+                decoder = utils.codecs.Mp3StreamDecoder()
+                bstream = utils.audio.AudioByteStream(
+                    sample_rate=self._opts.audio_config.sample_rate_hertz,
+                    num_channels=1,
+                )
+                for frame in decoder.decode_chunk(data):
+                    for frame in bstream.write(frame.data.tobytes()):
+                        self._event_ch.send_nowait(
+                            tts.SynthesizedAudio(request_id=request_id, frame=frame)
+                        )
+
+                for frame in bstream.flush():
+                    self._event_ch.send_nowait(
+                        tts.SynthesizedAudio(request_id=request_id, frame=frame)
+                    )
+            else:
+                data = data[44:]  # skip WAV header
+                self._event_ch.send_nowait(
+                    tts.SynthesizedAudio(
+                        request_id=request_id,
+                        frame=rtc.AudioFrame(
+                            data=data,
+                            sample_rate=self._opts.audio_config.sample_rate_hertz,
+                            num_channels=1,
+                            samples_per_channel=len(data) // 2,  # 16-bit
+                        ),
+                    )
+                )
+
+        except DeadlineExceeded:
+            raise APITimeoutError()
+        except GoogleAPICallError as e:
+            raise APIStatusError(
+                e.message,
+                status_code=e.code or -1,
+                request_id=None,
+                body=None,
+            )
+        except Exception as e:
+            raise APIConnectionError() from e
+
+
+def _gender_from_str(gender: str) -> SsmlVoiceGender:
+    ssml_gender = SsmlVoiceGender.NEUTRAL
+    if gender == "male":
+        ssml_gender = SsmlVoiceGender.MALE
+    elif gender == "female":
+        ssml_gender = SsmlVoiceGender.FEMALE
+
+    return ssml_gender  # type: ignore
```
{livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/livekit_plugins_google.egg-info/PKG-INFO RENAMED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: livekit-plugins-google
-Version: 0.7.1
+Version: 0.7.3
 Summary: Agent Framework plugin for services from Google Cloud
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0
@@ -22,7 +22,7 @@ Description-Content-Type: text/markdown
 Requires-Dist: google-auth<3,>=2
 Requires-Dist: google-cloud-speech<3,>=2
 Requires-Dist: google-cloud-texttospeech<3,>=2
-Requires-Dist: livekit-agents>=0.
+Requires-Dist: livekit-agents>=0.11
 
 # LiveKit Plugins Google
 
```
{livekit_plugins_google-0.7.1 → livekit_plugins_google-0.7.3}/setup.py RENAMED

```diff
@@ -51,7 +51,7 @@ setuptools.setup(
         "google-auth >= 2, < 3",
         "google-cloud-speech >= 2, < 3",
         "google-cloud-texttospeech >= 2, < 3",
-        "livekit-agents>=0.
+        "livekit-agents>=0.11",
     ],
     package_data={"livekit.plugins.google": ["py.typed"]},
     project_urls={
```
livekit_plugins_google-0.7.1/livekit/plugins/google/tts.py DELETED

```diff
@@ -1,174 +0,0 @@
-# Copyright 2023 LiveKit, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-from dataclasses import dataclass
-from typing import Union
-
-from livekit import rtc
-from livekit.agents import tts, utils
-
-from google.cloud import texttospeech
-from google.cloud.texttospeech_v1.types import SsmlVoiceGender, SynthesizeSpeechResponse
-
-from .log import logger
-from .models import AudioEncoding, Gender, SpeechLanguages
-
-LgType = Union[SpeechLanguages, str]
-GenderType = Union[Gender, str]
-AudioEncodingType = Union[AudioEncoding, str]
-
-
-@dataclass
-class _TTSOptions:
-    voice: texttospeech.VoiceSelectionParams
-    audio_config: texttospeech.AudioConfig
-
-
-class TTS(tts.TTS):
-    def __init__(
-        self,
-        *,
-        language: LgType = "en-US",
-        gender: GenderType = "neutral",
-        voice_name: str = "",  # Not required
-        encoding: AudioEncodingType = "linear16",
-        sample_rate: int = 24000,
-        speaking_rate: float = 1.0,
-        credentials_info: dict | None = None,
-        credentials_file: str | None = None,
-    ) -> None:
-        """
-        Create a new instance of Google TTS.
-
-        Credentials must be provided, either by using the ``credentials_info`` dict, or reading
-        from the file specified in ``credentials_file`` or the ``GOOGLE_APPLICATION_CREDENTIALS``
-        environmental variable.
-        """
-
-        super().__init__(
-            capabilities=tts.TTSCapabilities(
-                streaming=False,
-            ),
-            sample_rate=sample_rate,
-            num_channels=1,
-        )
-
-        self._client: texttospeech.TextToSpeechAsyncClient | None = None
-        self._credentials_info = credentials_info
-        self._credentials_file = credentials_file
-
-        ssml_gender = SsmlVoiceGender.NEUTRAL
-        if gender == "male":
-            ssml_gender = SsmlVoiceGender.MALE
-        elif gender == "female":
-            ssml_gender = SsmlVoiceGender.FEMALE
-
-        voice = texttospeech.VoiceSelectionParams(
-            name=voice_name, language_code=language, ssml_gender=ssml_gender
-        )
-
-        if encoding == "linear16" or encoding == "wav":
-            _audio_encoding = texttospeech.AudioEncoding.LINEAR16
-        elif encoding == "mp3":
-            _audio_encoding = texttospeech.AudioEncoding.MP3
-        else:
-            raise NotImplementedError(f"audio encoding {encoding} is not supported")
-
-        self._opts = _TTSOptions(
-            voice=voice,
-            audio_config=texttospeech.AudioConfig(
-                audio_encoding=_audio_encoding,
-                sample_rate_hertz=sample_rate,
-                speaking_rate=speaking_rate,
-            ),
-        )
-
-    def _ensure_client(self) -> texttospeech.TextToSpeechAsyncClient:
-        if not self._client:
-            if self._credentials_info:
-                self._client = (
-                    texttospeech.TextToSpeechAsyncClient.from_service_account_info(
-                        self._credentials_info
-                    )
-                )
-
-            elif self._credentials_file:
-                self._client = (
-                    texttospeech.TextToSpeechAsyncClient.from_service_account_file(
-                        self._credentials_file
-                    )
-                )
-            else:
-                self._client = texttospeech.TextToSpeechAsyncClient()
-
-        assert self._client is not None
-        return self._client
-
-    def synthesize(self, text: str) -> "ChunkedStream":
-        return ChunkedStream(text, self._opts, self._ensure_client())
-
-
-class ChunkedStream(tts.ChunkedStream):
-    def __init__(
-        self, text: str, opts: _TTSOptions, client: texttospeech.TextToSpeechAsyncClient
-    ) -> None:
-        super().__init__()
-        self._text, self._opts, self._client = text, opts, client
-
-    @utils.log_exceptions(logger=logger)
-    async def _main_task(self) -> None:
-        request_id = utils.shortuuid()
-        segment_id = utils.shortuuid()
-        response: SynthesizeSpeechResponse = await self._client.synthesize_speech(
-            input=texttospeech.SynthesisInput(text=self._text),
-            voice=self._opts.voice,
-            audio_config=self._opts.audio_config,
-        )
-
-        data = response.audio_content
-        if self._opts.audio_config.audio_encoding == "mp3":
-            decoder = utils.codecs.Mp3StreamDecoder()
-            bstream = utils.audio.AudioByteStream(
-                sample_rate=self._opts.audio_config.sample_rate_hertz, num_channels=1
-            )
-            for frame in decoder.decode_chunk(data):
-                for frame in bstream.write(frame.data):
-                    self._event_ch.send_nowait(
-                        tts.SynthesizedAudio(
-                            request_id=request_id, segment_id=segment_id, frame=frame
-                        )
-                    )
-
-            for frame in bstream.flush():
-                self._event_ch.send_nowait(
-                    tts.SynthesizedAudio(
-                        request_id=request_id, segment_id=segment_id, frame=frame
-                    )
-                )
-        else:
-            data = data[44:]  # skip WAV header
-            self._event_ch.send_nowait(
-                tts.SynthesizedAudio(
-                    request_id=request_id,
-                    segment_id=segment_id,
-                    frame=rtc.AudioFrame(
-                        data=data,
-                        sample_rate=self._opts.audio_config.sample_rate_hertz,
-                        num_channels=1,
-                        samples_per_channel=len(data) // 2,  # 16-bit
-                    ),
-                )
-            )
```
The remaining files are unchanged apart from the version-directory rename: README.md, livekit/plugins/google/log.py, livekit/plugins/google/models.py, livekit/plugins/google/py.typed, the livekit_plugins_google.egg-info metadata (SOURCES.txt, dependency_links.txt, top_level.txt), pyproject.toml, and setup.cfg.