livekit-plugins-google 1.0.17__py3-none-any.whl → 1.0.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,6 +9,7 @@ from collections.abc import Iterator
9
9
  from dataclasses import dataclass
10
10
 
11
11
  from google import genai
12
+ from google.genai.live import AsyncSession
12
13
  from google.genai.types import (
13
14
  AudioTranscriptionConfig,
14
15
  Blob,
@@ -17,6 +18,7 @@ from google.genai.types import (
17
18
  GenerationConfig,
18
19
  LiveClientContent,
19
20
  LiveClientRealtimeInput,
21
+ LiveClientToolResponse,
20
22
  LiveConnectConfig,
21
23
  LiveServerContent,
22
24
  LiveServerGoAway,
@@ -25,6 +27,7 @@ from google.genai.types import (
25
27
  Modality,
26
28
  Part,
27
29
  PrebuiltVoiceConfig,
30
+ SessionResumptionConfig,
28
31
  SpeechConfig,
29
32
  Tool,
30
33
  UsageMetadata,
@@ -62,6 +65,7 @@ class _RealtimeOptions:
62
65
  model: LiveAPIModels | str
63
66
  api_key: str | None
64
67
  voice: Voice | str
68
+ language: NotGivenOr[str]
65
69
  response_modalities: NotGivenOr[list[Modality]]
66
70
  vertexai: bool
67
71
  project: str | None
@@ -101,6 +105,7 @@ class RealtimeModel(llm.RealtimeModel):
101
105
  model: LiveAPIModels | str = "gemini-2.0-flash-live-001",
102
106
  api_key: NotGivenOr[str] = NOT_GIVEN,
103
107
  voice: Voice | str = "Puck",
108
+ language: NotGivenOr[str] = NOT_GIVEN,
104
109
  modalities: NotGivenOr[list[Modality]] = NOT_GIVEN,
105
110
  vertexai: bool = False,
106
111
  project: NotGivenOr[str] = NOT_GIVEN,
@@ -131,6 +136,7 @@ class RealtimeModel(llm.RealtimeModel):
131
136
  modalities (list[Modality], optional): Modalities to use, such as ["TEXT", "AUDIO"]. Defaults to ["AUDIO"].
132
137
  model (str, optional): The name of the model to use. Defaults to "gemini-2.0-flash-live-001".
133
138
  voice (api_proto.Voice, optional): Voice setting for audio outputs. Defaults to "Puck".
139
+ language (str, optional): The language (BCP-47 code) to use for the API. Supported languages: https://ai.google.dev/gemini-api/docs/live#supported-languages
134
140
  temperature (float, optional): Sampling temperature for response generation. Defaults to 0.8.
135
141
  vertexai (bool, optional): Whether to use VertexAI for the API. Defaults to False.
136
142
  project (str, optional): The project id to use for the API. Defaults to None. (for vertexai)
@@ -195,6 +201,7 @@ class RealtimeModel(llm.RealtimeModel):
195
201
  instructions=instructions,
196
202
  input_audio_transcription=input_audio_transcription,
197
203
  output_audio_transcription=output_audio_transcription,
204
+ language=language,
198
205
  )
199
206
 
200
207
  self._sessions = weakref.WeakSet[RealtimeSession]()
@@ -247,12 +254,14 @@ class RealtimeSession(llm.RealtimeSession):
247
254
  self._main_atask = asyncio.create_task(self._main_task(), name="gemini-realtime-session")
248
255
 
249
256
  self._current_generation: _ResponseGeneration | None = None
250
- self._active_session: genai.LiveSession | None = None
257
+ self._active_session: AsyncSession | None = None
251
258
  # indicates if the underlying session should end
252
259
  self._session_should_close = asyncio.Event()
253
260
  self._response_created_futures: dict[str, asyncio.Future[llm.GenerationCreatedEvent]] = {}
254
261
  self._pending_generation_fut: asyncio.Future[llm.GenerationCreatedEvent] | None = None
255
262
 
263
+ self._session_resumption_handle: str | None = None
264
+
256
265
  self._update_lock = asyncio.Lock()
257
266
  self._session_lock = asyncio.Lock()
258
267
 
@@ -465,7 +474,7 @@ class RealtimeSession(llm.RealtimeSession):
465
474
  finally:
466
475
  await self._close_active_session()
467
476
 
468
- async def _send_task(self, session: genai.LiveSession):
477
+ async def _send_task(self, session: AsyncSession):
469
478
  try:
470
479
  async for msg in self._msg_ch:
471
480
  async with self._session_lock:
@@ -473,11 +482,18 @@ class RealtimeSession(llm.RealtimeSession):
473
482
  not self._active_session or self._active_session != session
474
483
  ):
475
484
  break
476
-
477
485
  if isinstance(msg, LiveClientContent):
478
- await session.send(input=msg)
486
+ await session.send_client_content(
487
+ turns=msg.turns, turn_complete=msg.turn_complete
488
+ )
489
+ elif isinstance(msg, LiveClientToolResponse):
490
+ await session.send_tool_response(function_responses=msg.function_responses)
491
+ elif isinstance(msg, LiveClientRealtimeInput):
492
+ for media_chunk in msg.media_chunks:
493
+ await session.send_realtime_input(media=media_chunk)
479
494
  else:
480
- await session.send(input=msg)
495
+ logger.warning(f"Warning: Received unhandled message type: {type(msg)}")
496
+
481
497
  except Exception as e:
482
498
  if not self._session_should_close.is_set():
483
499
  logger.error(f"error in send task: {e}", exc_info=e)
@@ -485,7 +501,7 @@ class RealtimeSession(llm.RealtimeSession):
485
501
  finally:
486
502
  logger.debug("send task finished.")
487
503
 
488
- async def _recv_task(self, session: genai.LiveSession):
504
+ async def _recv_task(self, session: AsyncSession):
489
505
  try:
490
506
  while True:
491
507
  async with self._session_lock:
@@ -501,6 +517,15 @@ class RealtimeSession(llm.RealtimeSession):
501
517
  ):
502
518
  self._start_new_generation()
503
519
 
520
+ if response.session_resumption_update:
521
+ if (
522
+ response.session_resumption_update.resumable
523
+ and response.session_resumption_update.new_handle
524
+ ):
525
+ self._session_resumption_handle = (
526
+ response.session_resumption_update.new_handle
527
+ )
528
+
504
529
  if response.server_content:
505
530
  self._handle_server_content(response.server_content)
506
531
  if response.tool_call:
@@ -548,11 +573,13 @@ class RealtimeSession(llm.RealtimeSession):
548
573
  speech_config=SpeechConfig(
549
574
  voice_config=VoiceConfig(
550
575
  prebuilt_voice_config=PrebuiltVoiceConfig(voice_name=self._opts.voice)
551
- )
576
+ ),
577
+ language_code=self._opts.language if is_given(self._opts.language) else None,
552
578
  ),
553
579
  tools=[Tool(function_declarations=self._gemini_declarations)],
554
580
  input_audio_transcription=self._opts.input_audio_transcription,
555
581
  output_audio_transcription=self._opts.output_audio_transcription,
582
+ session_resumption=SessionResumptionConfig(handle=self._session_resumption_handle),
556
583
  )
557
584
 
558
585
  def _start_new_generation(self):
@@ -270,7 +270,7 @@ class LLMStream(llm.LLMStream):
270
270
  request_id = utils.shortuuid()
271
271
 
272
272
  try:
273
- turns, system_instruction = to_chat_ctx(self._chat_ctx, id(self._llm))
273
+ turns, system_instruction = to_chat_ctx(self._chat_ctx, id(self._llm), generate=True)
274
274
  function_declarations = to_fnc_ctx(self._tools)
275
275
  if function_declarations:
276
276
  self._extra_kwargs["tools"] = [
@@ -54,7 +54,7 @@ LanguageCode = Union[LgType, list[LgType]]
54
54
  _max_session_duration = 240
55
55
 
56
56
  # Google is very sensitive to background noise, so we'll ignore results with low confidence
57
- _min_confidence = 0.65
57
+ _default_min_confidence = 0.65
58
58
 
59
59
 
60
60
  # This class is only used internally to encapsulate the options
@@ -67,6 +67,7 @@ class STTOptions:
67
67
  spoken_punctuation: bool
68
68
  model: SpeechModels | str
69
69
  sample_rate: int
70
+ min_confidence_threshold: float
70
71
  keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN
71
72
 
72
73
  def build_adaptation(self) -> cloud_speech.SpeechAdaptation | None:
@@ -98,6 +99,7 @@ class STT(stt.STT):
98
99
  model: SpeechModels | str = "latest_long",
99
100
  location: str = "global",
100
101
  sample_rate: int = 16000,
102
+ min_confidence_threshold: float = _default_min_confidence,
101
103
  credentials_info: NotGivenOr[dict] = NOT_GIVEN,
102
104
  credentials_file: NotGivenOr[str] = NOT_GIVEN,
103
105
  keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN,
@@ -118,6 +120,8 @@ class STT(stt.STT):
118
120
  model(SpeechModels): the model to use for recognition default: "latest_long"
119
121
  location(str): the location to use for recognition default: "global"
120
122
  sample_rate(int): the sample rate of the audio default: 16000
123
+ min_confidence_threshold(float): minimum confidence threshold for recognition
124
+ (default: 0.65)
121
125
  credentials_info(dict): the credentials info to use for recognition (default: None)
122
126
  credentials_file(str): the credentials file to use for recognition (default: None)
123
127
  keywords(List[tuple[str, float]]): list of keywords to recognize (default: None)
@@ -149,6 +153,7 @@ class STT(stt.STT):
149
153
  spoken_punctuation=spoken_punctuation,
150
154
  model=model,
151
155
  sample_rate=sample_rate,
156
+ min_confidence_threshold=min_confidence_threshold,
152
157
  keywords=keywords,
153
158
  )
154
159
  self._streams = weakref.WeakSet[SpeechStream]()
@@ -343,6 +348,7 @@ class SpeechStream(stt.SpeechStream):
343
348
  punctuate: NotGivenOr[bool] = NOT_GIVEN,
344
349
  spoken_punctuation: NotGivenOr[bool] = NOT_GIVEN,
345
350
  model: NotGivenOr[SpeechModels] = NOT_GIVEN,
351
+ min_confidence_threshold: NotGivenOr[float] = NOT_GIVEN,
346
352
  keywords: NotGivenOr[list[tuple[str, float]]] = NOT_GIVEN,
347
353
  ):
348
354
  if is_given(languages):
@@ -359,6 +365,8 @@ class SpeechStream(stt.SpeechStream):
359
365
  self._config.spoken_punctuation = spoken_punctuation
360
366
  if is_given(model):
361
367
  self._config.model = model
368
+ if is_given(min_confidence_threshold):
369
+ self._config.min_confidence_threshold = min_confidence_threshold
362
370
  if is_given(keywords):
363
371
  self._config.keywords = keywords
364
372
 
@@ -405,7 +413,10 @@ class SpeechStream(stt.SpeechStream):
405
413
  == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_EVENT_TYPE_UNSPECIFIED # noqa: E501
406
414
  ):
407
415
  result = resp.results[0]
408
- speech_data = _streaming_recognize_response_to_speech_data(resp)
416
+ speech_data = _streaming_recognize_response_to_speech_data(
417
+ resp,
418
+ min_confidence_threshold=self._config.min_confidence_threshold,
419
+ )
409
420
  if speech_data is None:
410
421
  continue
411
422
 
@@ -530,6 +541,8 @@ def _recognize_response_to_speech_event(
530
541
 
531
542
  def _streaming_recognize_response_to_speech_data(
532
543
  resp: cloud_speech.StreamingRecognizeResponse,
544
+ *,
545
+ min_confidence_threshold: float,
533
546
  ) -> stt.SpeechData | None:
534
547
  text = ""
535
548
  confidence = 0.0
@@ -542,7 +555,7 @@ def _streaming_recognize_response_to_speech_data(
542
555
  confidence /= len(resp.results)
543
556
  lg = resp.results[0].language_code
544
557
 
545
- if confidence < _min_confidence:
558
+ if confidence < min_confidence_threshold:
546
559
  return None
547
560
  if text == "":
548
561
  return None
@@ -56,6 +56,7 @@ class TTS(tts.TTS):
56
56
  effects_profile_id: str = "",
57
57
  speaking_rate: float = 1.0,
58
58
  location: str = "global",
59
+ audio_encoding: texttospeech.AudioEncoding = texttospeech.AudioEncoding.PCM,
59
60
  credentials_info: NotGivenOr[dict] = NOT_GIVEN,
60
61
  credentials_file: NotGivenOr[str] = NOT_GIVEN,
61
62
  ) -> None:
@@ -105,7 +106,7 @@ class TTS(tts.TTS):
105
106
  self._opts = _TTSOptions(
106
107
  voice=voice_params,
107
108
  audio_config=texttospeech.AudioConfig(
108
- audio_encoding=texttospeech.AudioEncoding.OGG_OPUS,
109
+ audio_encoding=audio_encoding,
109
110
  sample_rate_hertz=sample_rate,
110
111
  pitch=pitch,
111
112
  effects_profile_id=effects_profile_id,
@@ -132,11 +133,11 @@ class TTS(tts.TTS):
132
133
  """ # noqa: E501
133
134
  params = {}
134
135
  if is_given(language):
135
- params["language"] = language
136
+ params["language_code"] = str(language)
136
137
  if is_given(gender):
137
- params["gender"] = gender
138
+ params["ssml_gender"] = _gender_from_str(str(gender))
138
139
  if is_given(voice_name):
139
- params["voice_name"] = voice_name
140
+ params["name"] = voice_name
140
141
 
141
142
  if params:
142
143
  self._opts.voice = texttospeech.VoiceSelectionParams(**params)
@@ -39,7 +39,10 @@ def get_tool_results_for_realtime(chat_ctx: llm.ChatContext) -> types.LiveClient
39
39
 
40
40
 
41
41
  def to_chat_ctx(
42
- chat_ctx: llm.ChatContext, cache_key: Any, ignore_functions: bool = False
42
+ chat_ctx: llm.ChatContext,
43
+ cache_key: Any,
44
+ ignore_functions: bool = False,
45
+ generate: bool = False,
43
46
  ) -> tuple[list[types.Content], types.Content | None]:
44
47
  turns: list[types.Content] = []
45
48
  system_instruction: types.Content | None = None
@@ -99,10 +102,9 @@ def to_chat_ctx(
99
102
  if current_role is not None and parts:
100
103
  turns.append(types.Content(role=current_role, parts=parts))
101
104
 
102
- # # Gemini requires the last message to end with user's turn before they can generate
103
- # # currently not used because to_chat_ctx should not be used to force a new generation
104
- # if current_role != "user":
105
- # turns.append(types.Content(role="user", parts=[types.Part(text=".")]))
105
+ # Gemini requires the conversation to end with a user turn before it can generate
106
+ if generate and current_role != "user":
107
+ turns.append(types.Content(role="user", parts=[types.Part(text=".")]))
106
108
 
107
109
  return turns, system_instruction
108
110
 
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "1.0.17"
15
+ __version__ = "1.0.19"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: livekit-plugins-google
3
- Version: 1.0.17
3
+ Version: 1.0.19
4
4
  Summary: Agent Framework plugin for services from Google Cloud
5
5
  Project-URL: Documentation, https://docs.livekit.io
6
6
  Project-URL: Website, https://livekit.io/
@@ -21,8 +21,8 @@ Requires-Python: >=3.9.0
21
21
  Requires-Dist: google-auth<3,>=2
22
22
  Requires-Dist: google-cloud-speech<3,>=2
23
23
  Requires-Dist: google-cloud-texttospeech<3,>=2
24
- Requires-Dist: google-genai>=1.11.0
25
- Requires-Dist: livekit-agents>=1.0.17
24
+ Requires-Dist: google-genai>=1.12.1
25
+ Requires-Dist: livekit-agents>=1.0.19
26
26
  Description-Content-Type: text/markdown
27
27
 
28
28
  # LiveKit Plugins Google
@@ -0,0 +1,16 @@
1
+ livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
2
+ livekit/plugins/google/llm.py,sha256=NaaT4Zaw6o98VcUHNrQcZZRkD7DPREd76O8fG9IOpXQ,16190
3
+ livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
4
+ livekit/plugins/google/models.py,sha256=SGjAumdDK97NNLwMFcqZdKR68f1NoGB2Rk1UP2-imG0,1457
5
+ livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ livekit/plugins/google/stt.py,sha256=MADnkh0YKWY4bLRgBwFv4emu4YFO-7EVnhxO--dPTlI,23082
7
+ livekit/plugins/google/tts.py,sha256=29R0ieV5sRPBf5Yi0SPFQk7ZZMbELF30bIL9K_j_Wcg,9100
8
+ livekit/plugins/google/utils.py,sha256=sPZZg5VHf60kSILUIHGIZyN2CWYwnCGNYICn8Mhcv9g,9534
9
+ livekit/plugins/google/version.py,sha256=UDC8ahmGgRkv-qMQUY3QibuuVevGMQ9Fd4yIhcQBZwA,601
10
+ livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
11
+ livekit/plugins/google/beta/realtime/__init__.py,sha256=_fW2NMN22F-hnQ4xAJ_g5lPbR7CvM_xXzSWlUQY-E-U,188
12
+ livekit/plugins/google/beta/realtime/api_proto.py,sha256=Fyrejs3SG0EjOPCCFLEnWXKEUxCff47PMWk2VsKJm5E,594
13
+ livekit/plugins/google/beta/realtime/realtime_api.py,sha256=yk202S604Eogp_ssBX2BSbAXV67uUyQzVO-bzLnScrs,31423
14
+ livekit_plugins_google-1.0.19.dist-info/METADATA,sha256=HuRBvpT9dX3Mz7YOVhZhgQLm3-qQa2vAf2SRDQ5u1vM,3492
15
+ livekit_plugins_google-1.0.19.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
16
+ livekit_plugins_google-1.0.19.dist-info/RECORD,,
@@ -1,16 +0,0 @@
1
- livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
2
- livekit/plugins/google/llm.py,sha256=SqNGg6-wlrIUo9uaismP7QW5XztkXyDivJXLVgOIZMI,16175
3
- livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
4
- livekit/plugins/google/models.py,sha256=SGjAumdDK97NNLwMFcqZdKR68f1NoGB2Rk1UP2-imG0,1457
5
- livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- livekit/plugins/google/stt.py,sha256=AG_lh2fuuduJi0jFbA_QKFXLJ6NUdF1W_FfkLUJML_Q,22413
7
- livekit/plugins/google/tts.py,sha256=xhINokqY8UutXn85N-cbzq68eptbM6TTtIXmLktE_RM,9004
8
- livekit/plugins/google/utils.py,sha256=TjjTwMbdJdxr3bZjUXxs-J_fipTTM00goW2-d9KWX6w,9582
9
- livekit/plugins/google/version.py,sha256=GOfJB-DKZur-i3hrjFbzgpC2NHE96dnWhGLziW1e0_E,601
10
- livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
11
- livekit/plugins/google/beta/realtime/__init__.py,sha256=_fW2NMN22F-hnQ4xAJ_g5lPbR7CvM_xXzSWlUQY-E-U,188
12
- livekit/plugins/google/beta/realtime/api_proto.py,sha256=Fyrejs3SG0EjOPCCFLEnWXKEUxCff47PMWk2VsKJm5E,594
13
- livekit/plugins/google/beta/realtime/realtime_api.py,sha256=2_nPBvPttVudoQswhf19ieJ6wxvHquGJgALJ09afQms,29873
14
- livekit_plugins_google-1.0.17.dist-info/METADATA,sha256=cKeNSFwiM2A-MJeNA6zNeX7ioqbvkEZO3aFfR8Run2c,3492
15
- livekit_plugins_google-1.0.17.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
16
- livekit_plugins_google-1.0.17.dist-info/RECORD,,