livekit-plugins-google 1.2.5__tar.gz → 1.2.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of livekit-plugins-google might be problematic.
- {livekit_plugins_google-1.2.5 → livekit_plugins_google-1.2.7}/PKG-INFO +2 -2
- {livekit_plugins_google-1.2.5 → livekit_plugins_google-1.2.7}/livekit/plugins/google/beta/realtime/realtime_api.py +25 -5
- {livekit_plugins_google-1.2.5 → livekit_plugins_google-1.2.7}/livekit/plugins/google/llm.py +0 -4
- {livekit_plugins_google-1.2.5 → livekit_plugins_google-1.2.7}/livekit/plugins/google/stt.py +34 -12
- {livekit_plugins_google-1.2.5 → livekit_plugins_google-1.2.7}/livekit/plugins/google/tts.py +9 -2
- {livekit_plugins_google-1.2.5 → livekit_plugins_google-1.2.7}/livekit/plugins/google/utils.py +16 -7
- {livekit_plugins_google-1.2.5 → livekit_plugins_google-1.2.7}/livekit/plugins/google/version.py +1 -1
- {livekit_plugins_google-1.2.5 → livekit_plugins_google-1.2.7}/pyproject.toml +1 -1
- {livekit_plugins_google-1.2.5 → livekit_plugins_google-1.2.7}/.gitignore +0 -0
- {livekit_plugins_google-1.2.5 → livekit_plugins_google-1.2.7}/README.md +0 -0
- {livekit_plugins_google-1.2.5 → livekit_plugins_google-1.2.7}/livekit/plugins/google/__init__.py +0 -0
- {livekit_plugins_google-1.2.5 → livekit_plugins_google-1.2.7}/livekit/plugins/google/beta/__init__.py +0 -0
- {livekit_plugins_google-1.2.5 → livekit_plugins_google-1.2.7}/livekit/plugins/google/beta/gemini_tts.py +0 -0
- {livekit_plugins_google-1.2.5 → livekit_plugins_google-1.2.7}/livekit/plugins/google/beta/realtime/__init__.py +0 -0
- {livekit_plugins_google-1.2.5 → livekit_plugins_google-1.2.7}/livekit/plugins/google/beta/realtime/api_proto.py +0 -0
- {livekit_plugins_google-1.2.5 → livekit_plugins_google-1.2.7}/livekit/plugins/google/log.py +0 -0
- {livekit_plugins_google-1.2.5 → livekit_plugins_google-1.2.7}/livekit/plugins/google/models.py +0 -0
- {livekit_plugins_google-1.2.5 → livekit_plugins_google-1.2.7}/livekit/plugins/google/py.typed +0 -0
- {livekit_plugins_google-1.2.5 → livekit_plugins_google-1.2.7}/livekit/plugins/google/tools.py +0 -0
{livekit_plugins_google-1.2.5 → livekit_plugins_google-1.2.7}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: livekit-plugins-google
-Version: 1.2.5
+Version: 1.2.7
 Summary: Agent Framework plugin for services from Google Cloud
 Project-URL: Documentation, https://docs.livekit.io
 Project-URL: Website, https://livekit.io/
@@ -22,7 +22,7 @@ Requires-Dist: google-auth<3,>=2
 Requires-Dist: google-cloud-speech<3,>=2
 Requires-Dist: google-cloud-texttospeech<3,>=2.27
 Requires-Dist: google-genai>=v1.23.0
-Requires-Dist: livekit-agents>=1.2.5
+Requires-Dist: livekit-agents>=1.2.7
 Description-Content-Type: text/markdown

 # Google AI plugin for LiveKit Agents
{livekit_plugins_google-1.2.5 → livekit_plugins_google-1.2.7}/livekit/plugins/google/beta/realtime/realtime_api.py

@@ -428,7 +428,9 @@ class RealtimeSession(llm.RealtimeSession):
         self._chat_ctx = chat_ctx.copy()

     async def update_tools(self, tools: list[llm.FunctionTool | llm.RawFunctionTool]) -> None:
-        new_declarations: list[types.FunctionDeclaration] = to_fnc_ctx(tools)
+        new_declarations: list[types.FunctionDeclaration] = to_fnc_ctx(
+            tools, use_parameters_json_schema=False
+        )
         current_tool_names = {f.name for f in self._gemini_declarations}
         new_tool_names = {f.name for f in new_declarations}

@@ -699,10 +701,15 @@ class RealtimeSession(llm.RealtimeSession):
                    break

                async for response in session.receive():
-                    if (
-                        response.server_content
-
-
+                    if not self._current_generation or self._current_generation._done:
+                        if response.server_content and response.server_content.interrupted:
+                            # interrupt a generation already done
+                            self._handle_input_speech_started()
+                            # reset the flag and still start a new generation in case it has any other content
+                            response.server_content.interrupted = False
+
+                        if self._is_new_generation(response):
+                            self._start_new_generation()

                    if response.session_resumption_update:
                        if (
@@ -1084,3 +1091,16 @@ class RealtimeSession(llm.RealtimeSession):
                recoverable=recoverable,
            ),
        )
+
+    def _is_new_generation(self, resp: types.LiveServerMessage) -> bool:
+        if resp.tool_call:
+            return True
+
+        if (sc := resp.server_content) and (
+            sc.model_turn
+            or (sc.output_transcription and sc.output_transcription.text is not None)
+            or (sc.input_transcription and sc.input_transcription.text is not None)
+        ):
+            return True
+
+        return False
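The new _is_new_generation gate only opens a generation when a server message actually carries a tool call, model content, or transcription text; bare keep-alive or bookkeeping messages no longer start one. Below is a small self-contained sketch of that decision logic using stand-in dataclasses instead of the real google.genai types.LiveServerMessage objects (illustrative only, not part of the plugin):

from dataclasses import dataclass
from typing import Optional

# Stand-ins for the fields of the live-API message that the check inspects.
@dataclass
class FakeTranscription:
    text: Optional[str] = None

@dataclass
class FakeServerContent:
    model_turn: Optional[object] = None
    input_transcription: Optional[FakeTranscription] = None
    output_transcription: Optional[FakeTranscription] = None

@dataclass
class FakeMessage:
    tool_call: Optional[object] = None
    server_content: Optional[FakeServerContent] = None

def is_new_generation(resp: FakeMessage) -> bool:
    # Same decision logic as RealtimeSession._is_new_generation in the hunk above.
    if resp.tool_call:
        return True
    if (sc := resp.server_content) and (
        sc.model_turn
        or (sc.output_transcription and sc.output_transcription.text is not None)
        or (sc.input_transcription and sc.input_transcription.text is not None)
    ):
        return True
    return False

# A bookkeeping message (e.g. only a session resumption update) starts nothing...
assert not is_new_generation(FakeMessage())
# ...while transcription text or a tool call does.
assert is_new_generation(
    FakeMessage(server_content=FakeServerContent(output_transcription=FakeTranscription(text="Hi")))
)
assert is_new_generation(FakeMessage(tool_call=object()))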
{livekit_plugins_google-1.2.5 → livekit_plugins_google-1.2.7}/livekit/plugins/google/llm.py

@@ -157,10 +157,6 @@ class LLM(llm.LLM):
        if _thinking_budget is not None:
            if not isinstance(_thinking_budget, int):
                raise ValueError("thinking_budget inside thinking_config must be an integer")
-            if not (0 <= _thinking_budget <= 24576):
-                raise ValueError(
-                    "thinking_budget inside thinking_config must be between 0 and 24576"
-                )

        self._opts = _LLMOptions(
            model=model,
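Dropping the client-side 0-24576 bound means larger budgets (and model-specific sentinel values, where the Gemini API accepts them) now pass straight through and are validated by the API instead. A minimal sketch, assuming thinking_config accepts a google.genai types.ThinkingConfig as in the current constructor (an equivalent dict should also work):

from google.genai import types
from livekit.plugins import google

# Sketch only: a budget above the old 24576 cap is no longer rejected locally;
# the Gemini API enforces its own per-model limits.
llm = google.LLM(
    model="gemini-2.5-pro",
    thinking_config=types.ThinkingConfig(thinking_budget=32768),
)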
{livekit_plugins_google-1.2.5 → livekit_plugins_google-1.2.7}/livekit/plugins/google/stt.py

@@ -20,6 +20,7 @@ import time
 import weakref
 from collections.abc import AsyncGenerator, AsyncIterable
 from dataclasses import dataclass
+from datetime import timedelta
 from typing import Callable, Union, cast

 from google.api_core.client_options import ClientOptions
@@ -28,6 +29,7 @@ from google.auth import default as gauth_default
 from google.auth.exceptions import DefaultCredentialsError
 from google.cloud.speech_v2 import SpeechAsyncClient
 from google.cloud.speech_v2.types import cloud_speech
+from google.protobuf.duration_pb2 import Duration
 from livekit import rtc
 from livekit.agents import (
     DEFAULT_API_CONNECT_OPTIONS,
@@ -67,6 +69,7 @@ class STTOptions:
    punctuate: bool
    spoken_punctuation: bool
    enable_word_time_offsets: bool
+    enable_word_confidence: bool
    model: SpeechModels | str
    sample_rate: int
    min_confidence_threshold: float
@@ -99,6 +102,7 @@ class STT(stt.STT):
        punctuate: bool = True,
        spoken_punctuation: bool = False,
        enable_word_time_offsets: bool = True,
+        enable_word_confidence: bool = False,
        model: SpeechModels | str = "latest_long",
        location: str = "global",
        sample_rate: int = 16000,
@@ -122,6 +126,7 @@ class STT(stt.STT):
            punctuate(bool): whether to punctuate the audio (default: True)
            spoken_punctuation(bool): whether to use spoken punctuation (default: False)
            enable_word_time_offsets(bool): whether to enable word time offsets (default: True)
+            enable_word_confidence(bool): whether to enable word confidence (default: False)
            model(SpeechModels): the model to use for recognition default: "latest_long"
            location(str): the location to use for recognition default: "global"
            sample_rate(int): the sample rate of the audio default: 16000
@@ -162,6 +167,7 @@ class STT(stt.STT):
            punctuate=punctuate,
            spoken_punctuation=spoken_punctuation,
            enable_word_time_offsets=enable_word_time_offsets,
+            enable_word_confidence=enable_word_confidence,
            model=model,
            sample_rate=sample_rate,
            min_confidence_threshold=min_confidence_threshold,
@@ -243,6 +249,7 @@ class STT(stt.STT):
                enable_automatic_punctuation=config.punctuate,
                enable_spoken_punctuation=config.spoken_punctuation,
                enable_word_time_offsets=config.enable_word_time_offsets,
+                enable_word_confidence=config.enable_word_confidence,
            ),
            model=config.model,
            language_codes=config.languages,
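The enable_word_confidence flag added above is threaded from the constructor into cloud_speech.RecognitionFeatures untouched. A minimal usage sketch (values other than the new flag are taken from the surrounding context lines):

from livekit.plugins import google

# Sketch: request per-word confidence scores from Google STT v2
# alongside the existing word time offsets.
stt = google.STT(
    model="latest_long",
    enable_word_time_offsets=True,
    enable_word_confidence=True,
)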
@@ -547,6 +554,14 @@ class SpeechStream(stt.SpeechStream):
                raise APIConnectionError() from e


+def _duration_to_seconds(duration: Duration | timedelta) -> float:
+    # Proto Plus may auto-convert Duration to timedelta; handle both.
+    # https://proto-plus-python.readthedocs.io/en/latest/marshal.html
+    if isinstance(duration, timedelta):
+        return duration.total_seconds()
+    return duration.seconds + duration.nanos / 1e9
+
+
 def _recognize_response_to_speech_event(
     resp: cloud_speech.RecognizeResponse,
 ) -> stt.SpeechEvent:
@@ -556,24 +571,31 @@ def _recognize_response_to_speech_event(
        text += result.alternatives[0].transcript
        confidence += result.alternatives[0].confidence

-
-    start_offset = resp.results[0].alternatives[0].words[0].start_offset
-    end_offset = resp.results[-1].alternatives[0].words[-1].end_offset
+    alternatives = []

-
-
-
-
-
+    # Google STT may return empty results when spoken_lang != stt_lang
+    if resp.results:
+        try:
+            start_time = _duration_to_seconds(resp.results[0].alternatives[0].words[0].start_offset)
+            end_time = _duration_to_seconds(resp.results[-1].alternatives[0].words[-1].end_offset)
+        except IndexError:
+            # When enable_word_time_offsets=False, there are no "words" to access
+            start_time = end_time = 0
+
+        confidence /= len(resp.results)
+        lg = resp.results[0].language_code
+
+        alternatives = [
            stt.SpeechData(
                language=lg,
-                start_time=
-                end_time=
+                start_time=start_time,
+                end_time=end_time,
                confidence=confidence,
                text=text,
            )
-    ]
-
+        ]
+
+    return stt.SpeechEvent(type=stt.SpeechEventType.FINAL_TRANSCRIPT, alternatives=alternatives)


 def _streaming_recognize_response_to_speech_data(
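_duration_to_seconds exists because, depending on how proto-plus marshals the response, start_offset/end_offset may arrive either as a protobuf Duration or already converted to datetime.timedelta. A quick standalone check of both branches (the helper is re-declared here so the snippet runs on its own; in the plugin it is module-private to stt.py):

from datetime import timedelta
from google.protobuf.duration_pb2 import Duration

def _duration_to_seconds(duration) -> float:
    # Same logic as the helper added in the hunk above.
    if isinstance(duration, timedelta):
        return duration.total_seconds()
    return duration.seconds + duration.nanos / 1e9

# Both representations of 1.5 s collapse to the same float.
assert _duration_to_seconds(timedelta(seconds=1, microseconds=500_000)) == 1.5
assert _duration_to_seconds(Duration(seconds=1, nanos=500_000_000)) == 1.5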
{livekit_plugins_google-1.2.5 → livekit_plugins_google-1.2.7}/livekit/plugins/google/tts.py

@@ -61,6 +61,7 @@ class TTS(tts.TTS):
        language: NotGivenOr[SpeechLanguages | str] = NOT_GIVEN,
        gender: NotGivenOr[Gender | str] = NOT_GIVEN,
        voice_name: NotGivenOr[str] = NOT_GIVEN,
+        voice_cloning_key: NotGivenOr[str] = NOT_GIVEN,
        sample_rate: int = 24000,
        pitch: int = 0,
        effects_profile_id: str = "",
@@ -86,6 +87,7 @@ class TTS(tts.TTS):
            language (SpeechLanguages | str, optional): Language code (e.g., "en-US"). Default is "en-US".
            gender (Gender | str, optional): Voice gender ("male", "female", "neutral"). Default is "neutral".
            voice_name (str, optional): Specific voice name. Default is an empty string.
+            voice_cloning_key (str, optional): Voice clone key. Created via https://cloud.google.com/text-to-speech/docs/chirp3-instant-custom-voice
            sample_rate (int, optional): Audio sample rate in Hz. Default is 24000.
            location (str, optional): Location for the TTS client. Default is "global".
            pitch (float, optional): Speaking pitch, ranging from -20.0 to 20.0 semitones relative to the original pitch. Default is 0.
@@ -115,13 +117,18 @@ class TTS(tts.TTS):

        lang = language if is_given(language) else DEFAULT_LANGUAGE
        ssml_gender = _gender_from_str(DEFAULT_GENDER if not is_given(gender) else gender)
-        name = DEFAULT_VOICE_NAME if not is_given(voice_name) else voice_name

        voice_params = texttospeech.VoiceSelectionParams(
-            name=name,
            language_code=lang,
            ssml_gender=ssml_gender,
        )
+        if is_given(voice_cloning_key):
+            voice_params.voice_clone = texttospeech.VoiceCloneParams(
+                voice_cloning_key=voice_cloning_key,
+            )
+        else:
+            voice_params.name = voice_name if is_given(voice_name) else DEFAULT_VOICE_NAME
+
        if not is_given(tokenizer):
            tokenizer = tokenize.blingfire.SentenceTokenizer()

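When voice_cloning_key is given it takes precedence over voice_name and is attached to the voice selection as texttospeech.VoiceCloneParams. A minimal sketch; the key string is a placeholder for a real key produced by the Chirp 3 instant custom voice flow linked in the docstring above:

from livekit.plugins import google

# Sketch: synthesize with an instant-custom-voice clone instead of a named prebuilt voice.
tts = google.TTS(
    language="en-US",
    voice_cloning_key="<voice-cloning-key-from-chirp3>",
    sample_rate=24000,
)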
{livekit_plugins_google-1.2.5 → livekit_plugins_google-1.2.7}/livekit/plugins/google/utils.py

@@ -23,18 +23,27 @@ from .tools import _LLMTool
 __all__ = ["to_fnc_ctx"]


-def to_fnc_ctx(fncs: list[FunctionTool | RawFunctionTool]) -> list[types.FunctionDeclaration]:
+def to_fnc_ctx(
+    fncs: list[FunctionTool | RawFunctionTool], *, use_parameters_json_schema: bool = True
+) -> list[types.FunctionDeclaration]:
    tools: list[types.FunctionDeclaration] = []
    for fnc in fncs:
        if is_raw_function_tool(fnc):
            info = get_raw_function_info(fnc)
-
-
-
-
-
+            fnc_kwargs = {
+                "name": info.name,
+                "description": info.raw_schema.get("description", ""),
+            }
+            if use_parameters_json_schema:
+                fnc_kwargs["parameters_json_schema"] = info.raw_schema.get("parameters", {})
+            else:
+                # https://github.com/googleapis/python-genai/issues/1147
+                fnc_kwargs["parameters"] = types.Schema.from_json_schema(
+                    json_schema=types.JSONSchema.model_validate(
+                        info.raw_schema.get("parameters", {})
+                    )
                )
-            )
+            tools.append(types.FunctionDeclaration(**fnc_kwargs))

        elif is_function_tool(fnc):
            tools.append(_build_gemini_fnc(fnc))
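to_fnc_ctx is an internal helper, but the new keyword is easy to exercise directly. A minimal sketch, assuming livekit.agents exposes the function_tool decorator at the package root (it is also available as llm.function_tool); per the hunk above, the flag only changes how raw JSON-schema tools are converted, while typed tools still go through _build_gemini_fnc either way:

from livekit.agents import function_tool
from livekit.plugins.google.utils import to_fnc_ctx

@function_tool
async def get_weather(city: str) -> str:
    """Return a short weather summary for a city."""
    return f"It is sunny in {city}."

# Default path: unchanged for existing callers.
decls = to_fnc_ctx([get_weather])

# Realtime path: opt out of parameters_json_schema so raw-schema tools get a
# types.Schema built via Schema.from_json_schema instead
# (workaround for https://github.com/googleapis/python-genai/issues/1147).
decls_for_live = to_fnc_ctx([get_weather], use_parameters_json_schema=False)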