PyPI - livekit-plugins-google - Versions diffs - 1.1.5__py3-none-any.whl → 1.1.7__py3-none-any.whl - Mend

livekit-plugins-google 1.1.5__py3-none-any.whl → 1.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of livekit-plugins-google might be problematic. Click here for more details.

Files changed (9) hide show

livekit/plugins/google/beta/__init__.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from . import realtime
+from .gemini_tts import TTS as GeminiTTS
-__all__ = ["realtime"]
+__all__ = ["realtime", "GeminiTTS"]
 # Cleanup docs of unexported modules
 _module = dir()

livekit/plugins/google/beta/gemini_tts.py ADDED Viewed

@@ -0,0 +1,247 @@
+from __future__ import annotations
+import os
+from dataclasses import dataclass
+from typing import Literal
+from google.genai import Client, types
+from google.genai.errors import APIError, ClientError, ServerError
+from livekit.agents import APIConnectionError, APIStatusError, tts, utils
+from livekit.agents.types import (
+    DEFAULT_API_CONNECT_OPTIONS,
+    NOT_GIVEN,
+    APIConnectOptions,
+    NotGivenOr,
+)
+from livekit.agents.utils import is_given
+GEMINI_TTS_MODELS = Literal["gemini-2.5-flash-preview-tts", "gemini-2.5-pro-preview-tts"]  # known model ids; TTS(model=...) also accepts any other str
+GEMINI_VOICES = Literal[  # known prebuilt voice names; TTS(voice_name=...) also accepts any other str
+    "Zephyr",
+    "Puck",
+    "Charon",
+    "Kore",
+    "Fenrir",
+    "Leda",
+    "Orus",
+    "Aoede",
+    "Callirrhoe",
+    "Autonoe",
+    "Enceladus",
+    "Iapetus",
+    "Umbriel",
+    "Algieba",
+    "Despina",
+    "Erinome",
+    "Algenib",
+    "Rasalgethi",
+    "Laomedeia",
+    "Achernar",
+    "Alnilam",
+    "Schedar",
+    "Gacrux",
+    "Pulcherrima",
+    "Achird",
+    "Zubenelgenubi",
+    "Vindemiatrix",
+    "Sadachbia",
+    "Sadaltager",
+    "Sulafat",
+]
+DEFAULT_MODEL = "gemini-2.5-flash-preview-tts"  # default for TTS(model=...)
+DEFAULT_VOICE = "Kore"  # default for TTS(voice_name=...)
+DEFAULT_SAMPLE_RATE = 24000  # Hz; not configurable
+NUM_CHANNELS = 1  # channel count reported to the tts.TTS base class and the audio emitter
+DEFAULT_INSTRUCTIONS = "Say the text with a proper tone, don't omit or add any words"  # used when TTS(instructions=...) is not given; prepended to the input text
+@dataclass
+class _TTSOptions:  # resolved settings built by TTS.__init__ and read by ChunkedStream._run
+    model: GEMINI_TTS_MODELS | str  # model id passed to generate_content
+    voice_name: GEMINI_VOICES | str  # prebuilt voice name; the only field TTS.update_options() changes
+    vertexai: bool  # whether the client was created with vertexai=True
+    project: str | None  # GCP project; set to None when VertexAI is not used
+    location: str | None  # GCP location; set to None when VertexAI is not used
+    instructions: str | None  # style prompt prepended to the text; None sends the text unmodified
+class TTS(tts.TTS):
+    def __init__(
+        self,
+        *,
+        model: GEMINI_TTS_MODELS | str = DEFAULT_MODEL,
+        voice_name: GEMINI_VOICES | str = DEFAULT_VOICE,
+        api_key: NotGivenOr[str] = NOT_GIVEN,
+        vertexai: NotGivenOr[bool] = NOT_GIVEN,
+        project: NotGivenOr[str] = NOT_GIVEN,
+        location: NotGivenOr[str] = NOT_GIVEN,
+        instructions: NotGivenOr[str | None] = NOT_GIVEN,
+    ) -> None:
+        """
+        Create a new instance of Gemini TTS.
+        Environment Requirements:
+        - For VertexAI: Set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of the service account key file.
+        - For Google Gemini API: Set the `api_key` argument or the `GOOGLE_API_KEY` environment variable.
+        Args:
+            model (str, optional): The Gemini TTS model to use. Defaults to "gemini-2.5-flash-preview-tts".
+            voice_name (str, optional): The voice to use for synthesis. Defaults to "Kore".
+            api_key (str, optional): The API key for Google Gemini. If not provided, it attempts to read from the `GOOGLE_API_KEY` environment variable.
+            vertexai (bool, optional): Whether to use VertexAI. Defaults to False.
+            project (str, optional): The Google Cloud project to use (only for VertexAI).
+            location (str, optional): The location to use for VertexAI API requests. Defaults to "us-central1".
+            instructions (str, optional): Control the style, tone, accent, and pace using prompts. See https://ai.google.dev/gemini-api/docs/speech-generation#controllable
+        """  # noqa: E501
+        super().__init__(
+            capabilities=tts.TTSCapabilities(streaming=False),
+            sample_rate=DEFAULT_SAMPLE_RATE,
+            num_channels=NUM_CHANNELS,
+        )
+        gcp_project: str | None = (
+            project if is_given(project) else os.environ.get("GOOGLE_CLOUD_PROJECT")
+        )
+        gcp_location: str | None = (
+            location
+            if is_given(location)
+            else os.environ.get("GOOGLE_CLOUD_LOCATION") or "us-central1"
+        )
+        use_vertexai = (
+            vertexai
+            if is_given(vertexai)
+            else os.environ.get("GOOGLE_GENAI_USE_VERTEXAI", "0").lower() in ["true", "1"]
+        )
+        gemini_api_key = api_key if is_given(api_key) else os.environ.get("GOOGLE_API_KEY")
+        if use_vertexai:
+            if not gcp_project:
+                from google.auth._default_async import default_async
+                _, gcp_project = default_async(  # type: ignore
+                    scopes=["https://www.googleapis.com/auth/cloud-platform"]
+                )
+            gemini_api_key = None  # VertexAI does not require an API key
+        else:
+            gcp_project = None
+            gcp_location = None
+            if not gemini_api_key:
+                raise ValueError(
+                    "API key is required for Google API either via api_key or GOOGLE_API_KEY environment variable"  # noqa: E501
+                )
+        self._opts = _TTSOptions(
+            model=model,
+            voice_name=voice_name,
+            vertexai=use_vertexai,
+            project=gcp_project,
+            location=gcp_location,
+            instructions=instructions if is_given(instructions) else DEFAULT_INSTRUCTIONS,
+        )
+        self._client = Client(
+            api_key=gemini_api_key,
+            vertexai=use_vertexai,
+            project=gcp_project,
+            location=gcp_location,
+        )
+    def synthesize(
+        self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
+    ) -> ChunkedStream:
+        return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)
+    def update_options(
+        self,
+        *,
+        voice_name: NotGivenOr[str] = NOT_GIVEN,
+    ) -> None:
+        """
+        Update the TTS options.
+        Args:
+            voice_name (str, optional): The voice to use for synthesis.
+        """
+        if is_given(voice_name):
+            self._opts.voice_name = voice_name
+class ChunkedStream(tts.ChunkedStream):
+    def __init__(self, *, tts: TTS, input_text: str, conn_options: APIConnectOptions) -> None:
+        super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
+        self._tts: TTS = tts
+    async def _run(self, output_emitter: tts.AudioEmitter) -> None:
+        try:
+            config = types.GenerateContentConfig(
+                response_modalities=["AUDIO"],
+                speech_config=types.SpeechConfig(
+                    voice_config=types.VoiceConfig(
+                        prebuilt_voice_config=types.PrebuiltVoiceConfig(
+                            voice_name=self._tts._opts.voice_name,
+                        )
+                    )
+                ),
+            )
+            input_text = self._input_text
+            if self._tts._opts.instructions is not None:
+                input_text = f'{self._tts._opts.instructions}:\n"{input_text}"'
+            response = await self._tts._client.aio.models.generate_content(
+                model=self._tts._opts.model,
+                contents=input_text,
+                config=config,
+            )
+            output_emitter.initialize(
+                request_id=utils.shortuuid(),
+                sample_rate=self._tts.sample_rate,
+                num_channels=self._tts.num_channels,
+                mime_type="audio/pcm",
+            )
+            if (
+                not response.candidates
+                or not (content := response.candidates[0].content)
+                or not content.parts
+            ):
+                raise APIStatusError("No audio content generated")
+            for part in content.parts:
+                if (
+                    (inline_data := part.inline_data)
+                    and inline_data.data
+                    and inline_data.mime_type
+                    and inline_data.mime_type.startswith("audio/")
+                ):
+                    # mime_type: audio/L16;codec=pcm;rate=24000
+                    output_emitter.push(inline_data.data)
+        except ClientError as e:
+            raise APIStatusError(
+                "gemini tts: client error",
+                status_code=e.code,
+                body=f"{e.message} {e.status}",
+                retryable=False if e.code != 429 else True,
+            ) from e
+        except ServerError as e:
+            raise APIStatusError(
+                "gemini tts: server error",
+                status_code=e.code,
+                body=f"{e.message} {e.status}",
+                retryable=True,
+            ) from e
+        except APIError as e:
+            raise APIStatusError(
+                "gemini tts: api error",
+                status_code=e.code,
+                body=f"{e.message} {e.status}",
+                retryable=True,
+            ) from e
+        except Exception as e:
+            raise APIConnectionError(
+                f"gemini tts: error generating speech {str(e)}",
+                retryable=True,
+            ) from e

livekit/plugins/google/beta/realtime/api_proto.py CHANGED Viewed

@@ -13,7 +13,7 @@ LiveAPIModels = Literal[
     "gemini-2.5-flash-exp-native-audio-thinking-dialog",
 ]
-Voice = Literal["Puck", "Charon", "Kore", "Fenrir", "Aoede", "Leda", "Oru", "Zephyr"]
+Voice = Literal["Puck", "Charon", "Kore", "Fenrir", "Aoede", "Leda", "Orus", "Zephyr"]
 ClientEvents = Union[

livekit/plugins/google/beta/realtime/realtime_api.py CHANGED Viewed

@@ -937,7 +937,6 @@ class RealtimeSession(llm.RealtimeSession):
                     arguments=arguments,
                 )
             )
-        self._on_final_input_audio_transcription()
         self._mark_current_generation_done()
     def _handle_tool_call_cancellation(
@@ -1018,15 +1017,6 @@ class RealtimeSession(llm.RealtimeSession):
         # TODO(dz): this isn't a seamless reconnection just yet
         self._session_should_close.set()
-    def _on_final_input_audio_transcription(self) -> None:
-        if (gen := self._current_generation) and gen.input_transcription:
-            self.emit(
-                "input_audio_transcription_completed",
-                llm.InputTranscriptionCompleted(
-                    item_id=gen.response_id, transcript=gen.input_transcription, is_final=True
-                ),
-            )
     def commit_audio(self) -> None:
         pass

livekit/plugins/google/tts.py CHANGED Viewed

@@ -22,7 +22,11 @@ from dataclasses import dataclass, replace
 from google.api_core.client_options import ClientOptions
 from google.api_core.exceptions import DeadlineExceeded, GoogleAPICallError
 from google.cloud import texttospeech
-from google.cloud.texttospeech_v1.types import SsmlVoiceGender, SynthesizeSpeechResponse
+from google.cloud.texttospeech_v1.types import (
+    CustomPronunciations,
+    SsmlVoiceGender,
+    SynthesizeSpeechResponse,
+)
 from livekit.agents import APIConnectOptions, APIStatusError, APITimeoutError, tokenize, tts, utils
 from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS, NOT_GIVEN, NotGivenOr
 from livekit.agents.utils import is_given
@@ -47,6 +51,7 @@ class _TTSOptions:
     speaking_rate: float
     tokenizer: tokenize.SentenceTokenizer
     volume_gain_db: float
+    custom_pronunciations: CustomPronunciations | None
     enable_ssml: bool
@@ -67,6 +72,7 @@ class TTS(tts.TTS):
         credentials_info: NotGivenOr[dict] = NOT_GIVEN,
         credentials_file: NotGivenOr[str] = NOT_GIVEN,
         tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
+        custom_pronunciations: NotGivenOr[CustomPronunciations] = NOT_GIVEN,
         use_streaming: bool = True,
         enable_ssml: bool = False,
     ) -> None:
@@ -90,6 +96,7 @@ class TTS(tts.TTS):
             credentials_info (dict, optional): Dictionary containing Google Cloud credentials. Default is None.
             credentials_file (str, optional): Path to the Google Cloud credentials JSON file. Default is None.
             tokenizer (tokenize.SentenceTokenizer, optional): Tokenizer for the TTS. Default is a basic sentence tokenizer.
+            custom_pronunciations (CustomPronunciations, optional): Custom pronunciations for the TTS. Default is None.
             use_streaming (bool, optional): Whether to use streaming synthesis. Default is True.
             enable_ssml (bool, optional): Whether to enable SSML support. Default is False.
         """  # noqa: E501
@@ -119,6 +126,8 @@ class TTS(tts.TTS):
         if not is_given(tokenizer):
             tokenizer = tokenize.basic.SentenceTokenizer(min_sentence_len=BUFFERED_WORDS_COUNT)
+        pronunciations = None if not is_given(custom_pronunciations) else custom_pronunciations
         self._opts = _TTSOptions(
             voice=voice_params,
             encoding=audio_encoding,
@@ -128,6 +137,7 @@ class TTS(tts.TTS):
             speaking_rate=speaking_rate,
             tokenizer=tokenizer,
             volume_gain_db=volume_gain_db,
+            custom_pronunciations=pronunciations,
             enable_ssml=enable_ssml,
         )
         self._streams = weakref.WeakSet[SynthesizeStream]()
@@ -223,9 +233,15 @@ class ChunkedStream(tts.ChunkedStream):
     async def _run(self, output_emitter: tts.AudioEmitter) -> None:
         try:
             input = (
-                texttospeech.SynthesisInput(ssml=self._build_ssml())
+                texttospeech.SynthesisInput(
+                    ssml=self._build_ssml(),
+                    custom_pronunciations=self._opts.custom_pronunciations,
+                )
                 if self._opts.enable_ssml
-                else texttospeech.SynthesisInput(text=self._input_text)
+                else texttospeech.SynthesisInput(
+                    text=self._input_text,
+                    custom_pronunciations=self._opts.custom_pronunciations,
+                )
             )
             response: SynthesizeSpeechResponse = await self._tts._ensure_client().synthesize_speech(
                 input=input,
@@ -287,6 +303,7 @@ class SynthesizeStream(tts.SynthesizeStream):
                 sample_rate_hertz=self._opts.sample_rate,
                 speaking_rate=self._opts.speaking_rate,
             ),
+            custom_pronunciations=self._opts.custom_pronunciations,
         )
         async def _tokenize_input() -> None:

livekit/plugins/google/version.py CHANGED Viewed

@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-__version__ = "1.1.5"
+__version__ = "1.1.7"

{livekit_plugins_google-1.1.5.dist-info → livekit_plugins_google-1.1.7.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: livekit-plugins-google
-Version: 1.1.5
+Version: 1.1.7
 Summary: Agent Framework plugin for services from Google Cloud
 Project-URL: Documentation, https://docs.livekit.io
 Project-URL: Website, https://livekit.io/
@@ -22,7 +22,7 @@ Requires-Dist: google-auth<3,>=2
 Requires-Dist: google-cloud-speech<3,>=2
 Requires-Dist: google-cloud-texttospeech<3,>=2.27
 Requires-Dist: google-genai>=v1.23.0
-Requires-Dist: livekit-agents>=1.1.5
+Requires-Dist: livekit-agents>=1.1.7
 Description-Content-Type: text/markdown
 # Google AI plugin for LiveKit Agents

{livekit_plugins_google-1.1.5.dist-info → livekit_plugins_google-1.1.7.dist-info}/RECORD RENAMED Viewed

@@ -5,13 +5,14 @@ livekit/plugins/google/models.py,sha256=hOpfbN_qdQ1ZTpCN9m9dvG2eb6WgQ3KE3WRpIeeM
 livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 livekit/plugins/google/stt.py,sha256=ssDMH5U1vQOLA44XMlovYWIR4UqVtZSge3YFN-zZ7Iw,24696
 livekit/plugins/google/tools.py,sha256=tD5HVDHO5JfUF029Cx3axHMJec0Gxalkl7s1FDgxLzI,259
-livekit/plugins/google/tts.py,sha256=YTfce55MWNJyDH4k8U1O2giOcrtccTs8vrkiW9GuBR0,15541
+livekit/plugins/google/tts.py,sha256=QVM4xcF7WHpbQOZDAhRJrz481iMhO9ACjjqPEdTT4Lw,16277
 livekit/plugins/google/utils.py,sha256=6iihkKx76DDtLiHOoTU2ZXqzupBRY_gN3njpnwdmeqY,8829
-livekit/plugins/google/version.py,sha256=OKtayGMVDYKyoKBO2yNM4kfRbH-PODJqECIiYhUzNWg,600
-livekit/plugins/google/beta/__init__.py,sha256=5PnoG3Ux24bjzMSzmTeSVljE9EINivGcbWUEV6egGnM,216
+livekit/plugins/google/version.py,sha256=EcBB23XE8aEiF7xHMivcb9wptFeYkGB1WNGSn1bIV3A,600
+livekit/plugins/google/beta/__init__.py,sha256=RvAUdvEiRN-fe4JrgPcN0Jkw1kZR9wPerGMFVjS1Cc0,270
+livekit/plugins/google/beta/gemini_tts.py,sha256=esWjr0Xf95tl0_AB7MXiFZ_VCORWgcWjzvLvRa3t0FQ,8515
 livekit/plugins/google/beta/realtime/__init__.py,sha256=_fW2NMN22F-hnQ4xAJ_g5lPbR7CvM_xXzSWlUQY-E-U,188
-livekit/plugins/google/beta/realtime/api_proto.py,sha256=NfE7xr2N3JOu7gVfWbAmDcEhs8vuZgMRu5vpScPJzsg,776
-livekit/plugins/google/beta/realtime/realtime_api.py,sha256=tlAsTFsumqOavC9JT2SuQi_3eGYygZ3bbS-nEM7ea8Q,46293
-livekit_plugins_google-1.1.5.dist-info/METADATA,sha256=g6aRR1VIspmPtZ2C6VQ-cqZWx1gIpLtg4OFV1pbD01E,1907
-livekit_plugins_google-1.1.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-livekit_plugins_google-1.1.5.dist-info/RECORD,,
+livekit/plugins/google/beta/realtime/api_proto.py,sha256=cbKmpX32G4gPjF6cxFNzGEDfYX19SK-vWi4Myxb8Yks,777
+livekit/plugins/google/beta/realtime/realtime_api.py,sha256=nqiDiAtyHYFRd_Or1Y_95syjHyAVFjaEYTka0qPfXdE,45853
+livekit_plugins_google-1.1.7.dist-info/METADATA,sha256=yG5QbYo-vfSQQ4oyHXrima24mYz9K1sFfT8Bkx6Yh2A,1907
+livekit_plugins_google-1.1.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+livekit_plugins_google-1.1.7.dist-info/RECORD,,

{livekit_plugins_google-1.1.5.dist-info → livekit_plugins_google-1.1.7.dist-info}/WHEEL RENAMED Viewed

File without changes

livekit-plugins-google 1.1.5__py3-none-any.whl → 1.1.7__py3-none-any.whl

Potentially problematic release.

livekit-plugins-google 1.1.5__py3-none-any.whl → 1.1.7__py3-none-any.whl