livekit-plugins-google: 1.2.5 (py3-none-any.whl) → 1.2.7 (py3-none-any.whl)

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of livekit-plugins-google might be problematic. (The original page linked to more details here; the link target was not preserved in this capture.)

@@ -428,7 +428,9 @@ class RealtimeSession(llm.RealtimeSession):
428
428
  self._chat_ctx = chat_ctx.copy()
429
429
 
430
430
  async def update_tools(self, tools: list[llm.FunctionTool | llm.RawFunctionTool]) -> None:
431
- new_declarations: list[types.FunctionDeclaration] = to_fnc_ctx(tools)
431
+ new_declarations: list[types.FunctionDeclaration] = to_fnc_ctx(
432
+ tools, use_parameters_json_schema=False
433
+ )
432
434
  current_tool_names = {f.name for f in self._gemini_declarations}
433
435
  new_tool_names = {f.name for f in new_declarations}
434
436
 
@@ -699,10 +701,15 @@ class RealtimeSession(llm.RealtimeSession):
699
701
  break
700
702
 
701
703
  async for response in session.receive():
702
- if (not self._current_generation or self._current_generation._done) and (
703
- response.server_content or response.tool_call
704
- ):
705
- self._start_new_generation()
704
+ if not self._current_generation or self._current_generation._done:
705
+ if response.server_content and response.server_content.interrupted:
706
+ # interrupt a generation already done
707
+ self._handle_input_speech_started()
708
+ # reset the flag and still start a new generation in case it has any other content
709
+ response.server_content.interrupted = False
710
+
711
+ if self._is_new_generation(response):
712
+ self._start_new_generation()
706
713
 
707
714
  if response.session_resumption_update:
708
715
  if (
@@ -1084,3 +1091,16 @@ class RealtimeSession(llm.RealtimeSession):
1084
1091
  recoverable=recoverable,
1085
1092
  ),
1086
1093
  )
1094
+
1095
+ def _is_new_generation(self, resp: types.LiveServerMessage) -> bool:
1096
+ if resp.tool_call:
1097
+ return True
1098
+
1099
+ if (sc := resp.server_content) and (
1100
+ sc.model_turn
1101
+ or (sc.output_transcription and sc.output_transcription.text is not None)
1102
+ or (sc.input_transcription and sc.input_transcription.text is not None)
1103
+ ):
1104
+ return True
1105
+
1106
+ return False
@@ -157,10 +157,6 @@ class LLM(llm.LLM):
157
157
  if _thinking_budget is not None:
158
158
  if not isinstance(_thinking_budget, int):
159
159
  raise ValueError("thinking_budget inside thinking_config must be an integer")
160
- if not (0 <= _thinking_budget <= 24576):
161
- raise ValueError(
162
- "thinking_budget inside thinking_config must be between 0 and 24576"
163
- )
164
160
 
165
161
  self._opts = _LLMOptions(
166
162
  model=model,
@@ -20,6 +20,7 @@ import time
20
20
  import weakref
21
21
  from collections.abc import AsyncGenerator, AsyncIterable
22
22
  from dataclasses import dataclass
23
+ from datetime import timedelta
23
24
  from typing import Callable, Union, cast
24
25
 
25
26
  from google.api_core.client_options import ClientOptions
@@ -28,6 +29,7 @@ from google.auth import default as gauth_default
28
29
  from google.auth.exceptions import DefaultCredentialsError
29
30
  from google.cloud.speech_v2 import SpeechAsyncClient
30
31
  from google.cloud.speech_v2.types import cloud_speech
32
+ from google.protobuf.duration_pb2 import Duration
31
33
  from livekit import rtc
32
34
  from livekit.agents import (
33
35
  DEFAULT_API_CONNECT_OPTIONS,
@@ -67,6 +69,7 @@ class STTOptions:
67
69
  punctuate: bool
68
70
  spoken_punctuation: bool
69
71
  enable_word_time_offsets: bool
72
+ enable_word_confidence: bool
70
73
  model: SpeechModels | str
71
74
  sample_rate: int
72
75
  min_confidence_threshold: float
@@ -99,6 +102,7 @@ class STT(stt.STT):
99
102
  punctuate: bool = True,
100
103
  spoken_punctuation: bool = False,
101
104
  enable_word_time_offsets: bool = True,
105
+ enable_word_confidence: bool = False,
102
106
  model: SpeechModels | str = "latest_long",
103
107
  location: str = "global",
104
108
  sample_rate: int = 16000,
@@ -122,6 +126,7 @@ class STT(stt.STT):
122
126
  punctuate(bool): whether to punctuate the audio (default: True)
123
127
  spoken_punctuation(bool): whether to use spoken punctuation (default: False)
124
128
  enable_word_time_offsets(bool): whether to enable word time offsets (default: True)
129
+ enable_word_confidence(bool): whether to enable word confidence (default: False)
125
130
  model(SpeechModels): the model to use for recognition default: "latest_long"
126
131
  location(str): the location to use for recognition default: "global"
127
132
  sample_rate(int): the sample rate of the audio default: 16000
@@ -162,6 +167,7 @@ class STT(stt.STT):
162
167
  punctuate=punctuate,
163
168
  spoken_punctuation=spoken_punctuation,
164
169
  enable_word_time_offsets=enable_word_time_offsets,
170
+ enable_word_confidence=enable_word_confidence,
165
171
  model=model,
166
172
  sample_rate=sample_rate,
167
173
  min_confidence_threshold=min_confidence_threshold,
@@ -243,6 +249,7 @@ class STT(stt.STT):
243
249
  enable_automatic_punctuation=config.punctuate,
244
250
  enable_spoken_punctuation=config.spoken_punctuation,
245
251
  enable_word_time_offsets=config.enable_word_time_offsets,
252
+ enable_word_confidence=config.enable_word_confidence,
246
253
  ),
247
254
  model=config.model,
248
255
  language_codes=config.languages,
@@ -547,6 +554,14 @@ class SpeechStream(stt.SpeechStream):
547
554
  raise APIConnectionError() from e
548
555
 
549
556
 
557
+ def _duration_to_seconds(duration: Duration | timedelta) -> float:
558
+ # Proto Plus may auto-convert Duration to timedelta; handle both.
559
+ # https://proto-plus-python.readthedocs.io/en/latest/marshal.html
560
+ if isinstance(duration, timedelta):
561
+ return duration.total_seconds()
562
+ return duration.seconds + duration.nanos / 1e9
563
+
564
+
550
565
  def _recognize_response_to_speech_event(
551
566
  resp: cloud_speech.RecognizeResponse,
552
567
  ) -> stt.SpeechEvent:
@@ -556,24 +571,31 @@ def _recognize_response_to_speech_event(
556
571
  text += result.alternatives[0].transcript
557
572
  confidence += result.alternatives[0].confidence
558
573
 
559
- # not sure why start_offset and end_offset returns a timedelta
560
- start_offset = resp.results[0].alternatives[0].words[0].start_offset
561
- end_offset = resp.results[-1].alternatives[0].words[-1].end_offset
574
+ alternatives = []
562
575
 
563
- confidence /= len(resp.results)
564
- lg = resp.results[0].language_code
565
- return stt.SpeechEvent(
566
- type=stt.SpeechEventType.FINAL_TRANSCRIPT,
567
- alternatives=[
576
+ # Google STT may return empty results when spoken_lang != stt_lang
577
+ if resp.results:
578
+ try:
579
+ start_time = _duration_to_seconds(resp.results[0].alternatives[0].words[0].start_offset)
580
+ end_time = _duration_to_seconds(resp.results[-1].alternatives[0].words[-1].end_offset)
581
+ except IndexError:
582
+ # When enable_word_time_offsets=False, there are no "words" to access
583
+ start_time = end_time = 0
584
+
585
+ confidence /= len(resp.results)
586
+ lg = resp.results[0].language_code
587
+
588
+ alternatives = [
568
589
  stt.SpeechData(
569
590
  language=lg,
570
- start_time=start_offset.total_seconds(), # type: ignore
571
- end_time=end_offset.total_seconds(), # type: ignore
591
+ start_time=start_time,
592
+ end_time=end_time,
572
593
  confidence=confidence,
573
594
  text=text,
574
595
  )
575
- ],
576
- )
596
+ ]
597
+
598
+ return stt.SpeechEvent(type=stt.SpeechEventType.FINAL_TRANSCRIPT, alternatives=alternatives)
577
599
 
578
600
 
579
601
  def _streaming_recognize_response_to_speech_data(
@@ -61,6 +61,7 @@ class TTS(tts.TTS):
61
61
  language: NotGivenOr[SpeechLanguages | str] = NOT_GIVEN,
62
62
  gender: NotGivenOr[Gender | str] = NOT_GIVEN,
63
63
  voice_name: NotGivenOr[str] = NOT_GIVEN,
64
+ voice_cloning_key: NotGivenOr[str] = NOT_GIVEN,
64
65
  sample_rate: int = 24000,
65
66
  pitch: int = 0,
66
67
  effects_profile_id: str = "",
@@ -86,6 +87,7 @@ class TTS(tts.TTS):
86
87
  language (SpeechLanguages | str, optional): Language code (e.g., "en-US"). Default is "en-US".
87
88
  gender (Gender | str, optional): Voice gender ("male", "female", "neutral"). Default is "neutral".
88
89
  voice_name (str, optional): Specific voice name. Default is an empty string.
90
+ voice_cloning_key (str, optional): Voice clone key. Created via https://cloud.google.com/text-to-speech/docs/chirp3-instant-custom-voice
89
91
  sample_rate (int, optional): Audio sample rate in Hz. Default is 24000.
90
92
  location (str, optional): Location for the TTS client. Default is "global".
91
93
  pitch (float, optional): Speaking pitch, ranging from -20.0 to 20.0 semitones relative to the original pitch. Default is 0.
@@ -115,13 +117,18 @@ class TTS(tts.TTS):
115
117
 
116
118
  lang = language if is_given(language) else DEFAULT_LANGUAGE
117
119
  ssml_gender = _gender_from_str(DEFAULT_GENDER if not is_given(gender) else gender)
118
- name = DEFAULT_VOICE_NAME if not is_given(voice_name) else voice_name
119
120
 
120
121
  voice_params = texttospeech.VoiceSelectionParams(
121
- name=name,
122
122
  language_code=lang,
123
123
  ssml_gender=ssml_gender,
124
124
  )
125
+ if is_given(voice_cloning_key):
126
+ voice_params.voice_clone = texttospeech.VoiceCloneParams(
127
+ voice_cloning_key=voice_cloning_key,
128
+ )
129
+ else:
130
+ voice_params.name = voice_name if is_given(voice_name) else DEFAULT_VOICE_NAME
131
+
125
132
  if not is_given(tokenizer):
126
133
  tokenizer = tokenize.blingfire.SentenceTokenizer()
127
134
 
@@ -23,18 +23,27 @@ from .tools import _LLMTool
23
23
  __all__ = ["to_fnc_ctx"]
24
24
 
25
25
 
26
- def to_fnc_ctx(fncs: list[FunctionTool | RawFunctionTool]) -> list[types.FunctionDeclaration]:
26
+ def to_fnc_ctx(
27
+ fncs: list[FunctionTool | RawFunctionTool], *, use_parameters_json_schema: bool = True
28
+ ) -> list[types.FunctionDeclaration]:
27
29
  tools: list[types.FunctionDeclaration] = []
28
30
  for fnc in fncs:
29
31
  if is_raw_function_tool(fnc):
30
32
  info = get_raw_function_info(fnc)
31
- tools.append(
32
- types.FunctionDeclaration(
33
- name=info.name,
34
- description=info.raw_schema.get("description", ""),
35
- parameters_json_schema=info.raw_schema.get("parameters", {}),
33
+ fnc_kwargs = {
34
+ "name": info.name,
35
+ "description": info.raw_schema.get("description", ""),
36
+ }
37
+ if use_parameters_json_schema:
38
+ fnc_kwargs["parameters_json_schema"] = info.raw_schema.get("parameters", {})
39
+ else:
40
+ # https://github.com/googleapis/python-genai/issues/1147
41
+ fnc_kwargs["parameters"] = types.Schema.from_json_schema(
42
+ json_schema=types.JSONSchema.model_validate(
43
+ info.raw_schema.get("parameters", {})
44
+ )
36
45
  )
37
- )
46
+ tools.append(types.FunctionDeclaration(**fnc_kwargs))
38
47
 
39
48
  elif is_function_tool(fnc):
40
49
  tools.append(_build_gemini_fnc(fnc))
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "1.2.5"
15
+ __version__ = "1.2.7"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: livekit-plugins-google
3
- Version: 1.2.5
3
+ Version: 1.2.7
4
4
  Summary: Agent Framework plugin for services from Google Cloud
5
5
  Project-URL: Documentation, https://docs.livekit.io
6
6
  Project-URL: Website, https://livekit.io/
@@ -22,7 +22,7 @@ Requires-Dist: google-auth<3,>=2
22
22
  Requires-Dist: google-cloud-speech<3,>=2
23
23
  Requires-Dist: google-cloud-texttospeech<3,>=2.27
24
24
  Requires-Dist: google-genai>=v1.23.0
25
- Requires-Dist: livekit-agents>=1.2.5
25
+ Requires-Dist: livekit-agents>=1.2.7
26
26
  Description-Content-Type: text/markdown
27
27
 
28
28
  # Google AI plugin for LiveKit Agents
@@ -1,18 +1,18 @@
1
1
  livekit/plugins/google/__init__.py,sha256=XIyZ-iFnRBpaLtOJgVwojlB-a8GjdDugVFcjBpMEww8,1412
2
- livekit/plugins/google/llm.py,sha256=VmM-OEDRplHEYEVHh9rq9rI180yP7xvu_JTI2zFolbY,19035
2
+ livekit/plugins/google/llm.py,sha256=cMlmLX1m3TsrLW0a-k2oj6WQSNWEjj3jv7ob8MUoXCI,18825
3
3
  livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
4
4
  livekit/plugins/google/models.py,sha256=poOvUBvgpqmmQV5EUQsq0RgNIRAq7nH-_IZIcIfPSBI,2801
5
5
  livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- livekit/plugins/google/stt.py,sha256=1mV8p6m5YhhyAnnmNER0H4L75vlz6VY-sEyh9c_fBdI,24997
6
+ livekit/plugins/google/stt.py,sha256=gRhVRsfg3BPNkBJGG78QOxEia0mF1jBnI_Ckq1jxqIs,25938
7
7
  livekit/plugins/google/tools.py,sha256=tD5HVDHO5JfUF029Cx3axHMJec0Gxalkl7s1FDgxLzI,259
8
- livekit/plugins/google/tts.py,sha256=SODcGwteJIpGmuFArVRwuwy49k8-uQXJAIK5wBNiMC8,16219
9
- livekit/plugins/google/utils.py,sha256=6iihkKx76DDtLiHOoTU2ZXqzupBRY_gN3njpnwdmeqY,8829
10
- livekit/plugins/google/version.py,sha256=3JgPC-1CyezRI7gGvBZSyMSrxGE-TY9iCMkxdV4uM9Y,600
8
+ livekit/plugins/google/tts.py,sha256=3TPHBKJJwIt-hSTAdbI4NUcQNerhV0eDuK_o2rprdqg,16606
9
+ livekit/plugins/google/utils.py,sha256=z0iCP6-hYix3JRm2RM5JOBEJCICehUe5N4FTl-JpXLc,9269
10
+ livekit/plugins/google/version.py,sha256=WptujSPsyLd0Bw9PUEFiZCGAoTjv83-kdmf_z2Y74qg,600
11
11
  livekit/plugins/google/beta/__init__.py,sha256=RvAUdvEiRN-fe4JrgPcN0Jkw1kZR9wPerGMFVjS1Cc0,270
12
12
  livekit/plugins/google/beta/gemini_tts.py,sha256=esWjr0Xf95tl0_AB7MXiFZ_VCORWgcWjzvLvRa3t0FQ,8515
13
13
  livekit/plugins/google/beta/realtime/__init__.py,sha256=_fW2NMN22F-hnQ4xAJ_g5lPbR7CvM_xXzSWlUQY-E-U,188
14
14
  livekit/plugins/google/beta/realtime/api_proto.py,sha256=nb_QkVQDEH7h0SKA9vdS3JaL12a6t2Z1ja4SdnxE6a8,814
15
- livekit/plugins/google/beta/realtime/realtime_api.py,sha256=okAyJnSUP0oS47Gx7eETwCjM5Bevlsd6yaP-erGpCIM,46965
16
- livekit_plugins_google-1.2.5.dist-info/METADATA,sha256=TZt39t4NAr6HKOoZI-jYQAqiy1ktu6EIlaGvxNv9SQo,1907
17
- livekit_plugins_google-1.2.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
18
- livekit_plugins_google-1.2.5.dist-info/RECORD,,
15
+ livekit/plugins/google/beta/realtime/realtime_api.py,sha256=p0vEaxQhPLUbGjHo7Za2rbBrCjD_UqPk-thd9ybIiuk,47817
16
+ livekit_plugins_google-1.2.7.dist-info/METADATA,sha256=TNVq9U_TEmP1FtAbST42WhkqFlS8g4h0Lq4qYuSbr-0,1907
17
+ livekit_plugins_google-1.2.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
18
+ livekit_plugins_google-1.2.7.dist-info/RECORD,,