livekit-plugins-google 1.0.21__py3-none-any.whl → 1.0.23__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -5,9 +5,15 @@ from typing import Literal, Union
5
5
 
6
6
  from google.genai import types
7
7
 
8
- LiveAPIModels = Literal["gemini-2.0-flash-exp", "gemini-2.0-flash-live-001"]
8
+ LiveAPIModels = Literal[
9
+ "gemini-2.0-flash-exp",
10
+ # models supported on Gemini API
11
+ "gemini-2.0-flash-live-001",
12
+ "gemini-2.5-flash-preview-native-audio-dialog",
13
+ "gemini-2.5-flash-exp-native-audio-thinking-dialog",
14
+ ]
9
15
 
10
- Voice = Literal["Puck", "Charon", "Kore", "Fenrir", "Aoede"]
16
+ Voice = Literal["Puck", "Charon", "Kore", "Fenrir", "Aoede", "Leda", "Oru", "Zephyr"]
11
17
 
12
18
 
13
19
  ClientEvents = Union[
@@ -13,7 +13,6 @@ from google import genai
13
13
  from google.genai.live import AsyncSession
14
14
  from google.genai.types import (
15
15
  AudioTranscriptionConfig,
16
- AutomaticActivityDetection,
17
16
  Blob,
18
17
  Content,
19
18
  FunctionDeclaration,
@@ -86,6 +85,9 @@ class _RealtimeOptions:
86
85
  input_audio_transcription: AudioTranscriptionConfig | None
87
86
  output_audio_transcription: AudioTranscriptionConfig | None
88
87
  image_encode_options: NotGivenOr[images.EncodeOptions]
88
+ enable_affective_dialog: NotGivenOr[bool] = NOT_GIVEN
89
+ proactivity: NotGivenOr[bool] = NOT_GIVEN
90
+ realtime_input_config: NotGivenOr[RealtimeInputConfig] = NOT_GIVEN
89
91
 
90
92
 
91
93
  @dataclass
@@ -131,6 +133,9 @@ class RealtimeModel(llm.RealtimeModel):
131
133
  input_audio_transcription: NotGivenOr[AudioTranscriptionConfig | None] = NOT_GIVEN,
132
134
  output_audio_transcription: NotGivenOr[AudioTranscriptionConfig | None] = NOT_GIVEN,
133
135
  image_encode_options: NotGivenOr[images.EncodeOptions] = NOT_GIVEN,
136
+ enable_affective_dialog: NotGivenOr[bool] = NOT_GIVEN,
137
+ proactivity: NotGivenOr[bool] = NOT_GIVEN,
138
+ realtime_input_config: NotGivenOr[RealtimeInputConfig] = NOT_GIVEN,
134
139
  ) -> None:
135
140
  """
136
141
  Initializes a RealtimeModel instance for interacting with Google's Realtime API.
@@ -161,6 +166,9 @@ class RealtimeModel(llm.RealtimeModel):
161
166
  input_audio_transcription (AudioTranscriptionConfig | None, optional): The configuration for input audio transcription. Defaults to None.)
162
167
  output_audio_transcription (AudioTranscriptionConfig | None, optional): The configuration for output audio transcription. Defaults to AudioTranscriptionConfig().
163
168
  image_encode_options (images.EncodeOptions, optional): The configuration for image encoding. Defaults to DEFAULT_ENCODE_OPTIONS.
169
+ enable_affective_dialog (bool, optional): Whether to enable affective dialog. Defaults to False.
170
+ proactivity (bool, optional): Whether to enable proactive audio. Defaults to False.
171
+ realtime_input_config (RealtimeInputConfig, optional): The configuration for realtime input. Defaults to None.
164
172
 
165
173
  Raises:
166
174
  ValueError: If the API key is required but not found.
@@ -232,6 +240,9 @@ class RealtimeModel(llm.RealtimeModel):
232
240
  output_audio_transcription=output_audio_transcription,
233
241
  language=language,
234
242
  image_encode_options=image_encode_options,
243
+ enable_affective_dialog=enable_affective_dialog,
244
+ proactivity=proactivity,
245
+ realtime_input_config=realtime_input_config,
235
246
  )
236
247
 
237
248
  self._sessions = weakref.WeakSet[RealtimeSession]()
@@ -583,7 +594,7 @@ class RealtimeSession(llm.RealtimeSession):
583
594
  def _build_connect_config(self) -> LiveConnectConfig:
584
595
  temp = self._opts.temperature if is_given(self._opts.temperature) else None
585
596
 
586
- return LiveConnectConfig(
597
+ conf = LiveConnectConfig(
587
598
  response_modalities=self._opts.response_modalities
588
599
  if is_given(self._opts.response_modalities)
589
600
  else [Modality.AUDIO],
@@ -615,11 +626,18 @@ class RealtimeSession(llm.RealtimeSession):
615
626
  input_audio_transcription=self._opts.input_audio_transcription,
616
627
  output_audio_transcription=self._opts.output_audio_transcription,
617
628
  session_resumption=SessionResumptionConfig(handle=self._session_resumption_handle),
618
- realtime_input_config=RealtimeInputConfig(
619
- automatic_activity_detection=AutomaticActivityDetection(),
620
- ),
629
+ realtime_input_config=self._opts.realtime_input_config,
621
630
  )
622
631
 
632
+ if is_given(self._opts.proactivity):
633
+ conf.proactivity = {"proactive_audio": self._opts.proactivity}
634
+ if is_given(self._opts.enable_affective_dialog):
635
+ conf.enable_affective_dialog = self._opts.enable_affective_dialog
636
+ if is_given(self._opts.realtime_input_config):
637
+ conf.realtime_input_config = self._opts.realtime_input_config
638
+
639
+ return conf
640
+
623
641
  def _start_new_generation(self):
624
642
  if self._current_generation and not self._current_generation._done:
625
643
  logger.warning("starting new generation while another is active. Finalizing previous.")
@@ -789,6 +807,9 @@ class RealtimeSession(llm.RealtimeSession):
789
807
  return token_details_map
790
808
 
791
809
  for token_detail in token_details:
810
+ if not token_detail.token_count:
811
+ continue
812
+
792
813
  if token_detail.modality == Modality.AUDIO:
793
814
  token_details_map["audio_tokens"] += token_detail.token_count
794
815
  elif token_detail.modality == Modality.TEXT:
@@ -304,11 +304,8 @@ class LLMStream(llm.LLMStream):
304
304
  or not response.candidates[0].content
305
305
  or not response.candidates[0].content.parts
306
306
  ):
307
- raise APIStatusError(
308
- "No candidates in the response",
309
- retryable=True,
310
- request_id=request_id,
311
- )
307
+ logger.warning(f"no candidates in the response: {response}")
308
+ continue
312
309
 
313
310
  if len(response.candidates) > 1:
314
311
  logger.warning(
@@ -97,6 +97,7 @@ Gender = Literal["male", "female", "neutral"]
97
97
  ChatModels = Literal[
98
98
  "gemini-2.5-pro-preview-05-06",
99
99
  "gemini-2.5-flash-preview-04-17",
100
+ "gemini-2.5-flash-preview-05-20",
100
101
  "gemini-2.0-flash-001",
101
102
  "gemini-2.0-flash-lite-preview-02-05",
102
103
  "gemini-2.0-pro-exp-02-05",
@@ -14,6 +14,8 @@
14
14
 
15
15
  from __future__ import annotations
16
16
 
17
+ import asyncio
18
+ import weakref
17
19
  from dataclasses import dataclass
18
20
 
19
21
  from google.api_core.client_options import ClientOptions
@@ -25,6 +27,7 @@ from livekit.agents import (
25
27
  APIConnectOptions,
26
28
  APIStatusError,
27
29
  APITimeoutError,
30
+ tokenize,
28
31
  tts,
29
32
  utils,
30
33
  )
@@ -35,13 +38,21 @@ from livekit.agents.types import (
35
38
  )
36
39
  from livekit.agents.utils import is_given
37
40
 
41
+ from .log import logger
38
42
  from .models import Gender, SpeechLanguages
39
43
 
44
+ BUFFERED_WORDS_COUNT = 8
45
+ NUM_CHANNELS = 1
46
+ DEFAULT_VOICE_NAME = "en-US-Chirp3-HD-Charon"
47
+ DEFAULT_LANGUAGE = "en-US"
48
+ DEFAULT_GENDER = "neutral"
49
+
40
50
 
41
51
  @dataclass
42
52
  class _TTSOptions:
43
53
  voice: texttospeech.VoiceSelectionParams
44
54
  audio_config: texttospeech.AudioConfig
55
+ tokenizer: tokenize.SentenceTokenizer
45
56
 
46
57
 
47
58
  class TTS(tts.TTS):
@@ -59,6 +70,8 @@ class TTS(tts.TTS):
59
70
  audio_encoding: texttospeech.AudioEncoding = texttospeech.AudioEncoding.PCM,
60
71
  credentials_info: NotGivenOr[dict] = NOT_GIVEN,
61
72
  credentials_file: NotGivenOr[str] = NOT_GIVEN,
73
+ tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN,
74
+ use_streaming: NotGivenOr[bool] = NOT_GIVEN,
62
75
  ) -> None:
63
76
  """
64
77
  Create a new instance of Google TTS.
@@ -78,12 +91,14 @@ class TTS(tts.TTS):
78
91
  speaking_rate (float, optional): Speed of speech. Default is 1.0.
79
92
  credentials_info (dict, optional): Dictionary containing Google Cloud credentials. Default is None.
80
93
  credentials_file (str, optional): Path to the Google Cloud credentials JSON file. Default is None.
94
+ tokenizer (tokenize.SentenceTokenizer, optional): Tokenizer for the TTS. Default is a basic sentence tokenizer.
95
+ use_streaming (bool, optional): Whether to use streaming synthesis. Default is True.
81
96
  """ # noqa: E501
97
+ if not is_given(use_streaming):
98
+ use_streaming = True
82
99
 
83
100
  super().__init__(
84
- capabilities=tts.TTSCapabilities(
85
- streaming=False,
86
- ),
101
+ capabilities=tts.TTSCapabilities(streaming=use_streaming),
87
102
  sample_rate=sample_rate,
88
103
  num_channels=1,
89
104
  )
@@ -93,15 +108,17 @@ class TTS(tts.TTS):
93
108
  self._credentials_file = credentials_file
94
109
  self._location = location
95
110
 
96
- lang = language if is_given(language) else "en-US"
97
- ssml_gender = _gender_from_str("neutral" if not is_given(gender) else gender)
98
- name = "" if not is_given(voice_name) else voice_name
111
+ lang = language if is_given(language) else DEFAULT_LANGUAGE
112
+ ssml_gender = _gender_from_str(DEFAULT_GENDER if not is_given(gender) else gender)
113
+ name = DEFAULT_VOICE_NAME if not is_given(voice_name) else voice_name
99
114
 
100
115
  voice_params = texttospeech.VoiceSelectionParams(
101
116
  name=name,
102
117
  language_code=lang,
103
118
  ssml_gender=ssml_gender,
104
119
  )
120
+ if not is_given(tokenizer):
121
+ tokenizer = tokenize.basic.SentenceTokenizer(min_sentence_len=BUFFERED_WORDS_COUNT)
105
122
 
106
123
  self._opts = _TTSOptions(
107
124
  voice=voice_params,
@@ -112,7 +129,9 @@ class TTS(tts.TTS):
112
129
  effects_profile_id=effects_profile_id,
113
130
  speaking_rate=speaking_rate,
114
131
  ),
132
+ tokenizer=tokenizer,
115
133
  )
134
+ self._streams = weakref.WeakSet[SynthesizeStream]()
116
135
 
117
136
  def update_options(
118
137
  self,
@@ -168,6 +187,18 @@ class TTS(tts.TTS):
168
187
  assert self._client is not None
169
188
  return self._client
170
189
 
190
+ def stream(
191
+ self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
192
+ ) -> SynthesizeStream:
193
+ stream = SynthesizeStream(
194
+ tts=self,
195
+ opts=self._opts,
196
+ client=self._ensure_client(),
197
+ conn_options=conn_options,
198
+ )
199
+ self._streams.add(stream)
200
+ return stream
201
+
171
202
  def synthesize(
172
203
  self,
173
204
  text: str,
@@ -182,6 +213,12 @@ class TTS(tts.TTS):
182
213
  client=self._ensure_client(),
183
214
  )
184
215
 
216
+ async def aclose(self) -> None:
217
+ for stream in list(self._streams):
218
+ await stream.aclose()
219
+ self._streams.clear()
220
+ await super().aclose()
221
+
185
222
 
186
223
  class ChunkedStream(tts.ChunkedStream):
187
224
  def __init__(
@@ -230,8 +267,105 @@ class ChunkedStream(tts.ChunkedStream):
230
267
  raise APITimeoutError() from None
231
268
  except GoogleAPICallError as e:
232
269
  raise APIStatusError(
233
- e.message, status_code=e.code or -1, request_id=None, body=None
234
- ) from None
270
+ f"{e.message} {e.details}", status_code=e.code or -1, request_id=None, body=None
271
+ ) from e
272
+ except Exception as e:
273
+ raise APIConnectionError() from e
274
+
275
+
276
+ class SynthesizeStream(tts.SynthesizeStream):
277
+ def __init__(
278
+ self,
279
+ *,
280
+ tts: TTS,
281
+ opts: _TTSOptions,
282
+ client: texttospeech.TextToSpeechAsyncClient,
283
+ conn_options: APIConnectOptions,
284
+ ):
285
+ super().__init__(tts=tts, conn_options=conn_options)
286
+ self._opts, self._client = opts, client
287
+ self._segments_ch = utils.aio.Chan[tokenize.SentenceStream]()
288
+
289
+ async def _run(self) -> None:
290
+ request_id = utils.shortuuid()
291
+
292
+ @utils.log_exceptions(logger=logger)
293
+ async def _tokenize_input():
294
+ input_stream = None
295
+ async for input in self._input_ch:
296
+ if isinstance(input, str):
297
+ if input_stream is None:
298
+ input_stream = self._opts.tokenizer.stream()
299
+ self._segments_ch.send_nowait(input_stream)
300
+ input_stream.push_text(input)
301
+ elif isinstance(input, self._FlushSentinel):
302
+ if input_stream:
303
+ input_stream.end_input()
304
+ input_stream = None
305
+ self._segments_ch.close()
306
+
307
+ @utils.log_exceptions(logger=logger)
308
+ async def _run_segments():
309
+ async for input_stream in self._segments_ch:
310
+ await self._run_stream(input_stream, request_id)
311
+
312
+ tasks = [
313
+ asyncio.create_task(_tokenize_input()),
314
+ asyncio.create_task(_run_segments()),
315
+ ]
316
+ try:
317
+ await asyncio.gather(*tasks)
318
+ except Exception as e:
319
+ raise APIConnectionError() from e
320
+
321
+ async def _run_stream(self, input_stream, request_id):
322
+ streaming_config = texttospeech.StreamingSynthesizeConfig(
323
+ voice=self._opts.voice,
324
+ streaming_audio_config=texttospeech.StreamingAudioConfig(
325
+ audio_encoding=texttospeech.AudioEncoding.PCM
326
+ ),
327
+ )
328
+ emitter = tts.SynthesizedAudioEmitter(event_ch=self._event_ch, request_id=request_id)
329
+ audio_bstream = utils.audio.AudioByteStream(
330
+ sample_rate=self._opts.audio_config.sample_rate_hertz,
331
+ num_channels=NUM_CHANNELS,
332
+ )
333
+
334
+ @utils.log_exceptions(logger=logger)
335
+ async def input_generator():
336
+ try:
337
+ yield texttospeech.StreamingSynthesizeRequest(streaming_config=streaming_config)
338
+ async for input in input_stream:
339
+ self._mark_started()
340
+ yield texttospeech.StreamingSynthesizeRequest(
341
+ input=texttospeech.StreamingSynthesisInput(text=input.token)
342
+ )
343
+
344
+ except Exception:
345
+ logger.exception("an error occurred while streaming input to google TTS")
346
+
347
+ try:
348
+ stream = await self._client.streaming_synthesize(
349
+ input_generator(),
350
+ timeout=self._conn_options.timeout,
351
+ )
352
+ async for resp in stream:
353
+ for frame in audio_bstream.write(resp.audio_content):
354
+ emitter.push(frame)
355
+
356
+ for frame in audio_bstream.flush():
357
+ emitter.push(frame)
358
+ emitter.flush()
359
+ except DeadlineExceeded as e:
360
+ logger.debug(f"google tts deadline exceeded: {e}")
361
+ pass
362
+ except GoogleAPICallError as e:
363
+ raise APIStatusError(
364
+ f"{e.message} {e.details}",
365
+ status_code=e.code or -1,
366
+ request_id=request_id,
367
+ body=None,
368
+ ) from e
235
369
  except Exception as e:
236
370
  raise APIConnectionError() from e
237
371
 
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "1.0.21"
15
+ __version__ = "1.0.23"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: livekit-plugins-google
3
- Version: 1.0.21
3
+ Version: 1.0.23
4
4
  Summary: Agent Framework plugin for services from Google Cloud
5
5
  Project-URL: Documentation, https://docs.livekit.io
6
6
  Project-URL: Website, https://livekit.io/
@@ -21,8 +21,8 @@ Requires-Python: >=3.9.0
21
21
  Requires-Dist: google-auth<3,>=2
22
22
  Requires-Dist: google-cloud-speech<3,>=2
23
23
  Requires-Dist: google-cloud-texttospeech<3,>=2.24
24
- Requires-Dist: google-genai>=1.14.0
25
- Requires-Dist: livekit-agents>=1.0.21
24
+ Requires-Dist: google-genai>=v1.16.1
25
+ Requires-Dist: livekit-agents>=1.0.23
26
26
  Description-Content-Type: text/markdown
27
27
 
28
28
  # Google AI plugin for LiveKit Agents
@@ -0,0 +1,16 @@
1
+ livekit/plugins/google/__init__.py,sha256=xain2qUzU-YWhYWsLBkW8Q-szV-htpnzHTqymMPo-j0,1364
2
+ livekit/plugins/google/llm.py,sha256=E1T_7cugMVN13dyAbXHVS5sC1lxRPNUemwJdV29-CPk,16206
3
+ livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
4
+ livekit/plugins/google/models.py,sha256=hOpfbN_qdQ1ZTpCN9m9dvG2eb6WgQ3KE3WRpIeeM_T0,1569
5
+ livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ livekit/plugins/google/stt.py,sha256=2jk-1fHiBT8UW_n3CZsIEdMp2iBnUAlTnmefdUd8rAM,23620
7
+ livekit/plugins/google/tts.py,sha256=FfhNfGtW8drmYDDfLLZDjaIp2GvNiIdoovgtZq4t_l8,14211
8
+ livekit/plugins/google/utils.py,sha256=UBAbddYk7G8Nojg6bSC7_xN2pdl9qhs86HGhKYFuf9M,10509
9
+ livekit/plugins/google/version.py,sha256=BRUqwxRBnPVqEcIODJdaZHGAanu4zkwM4NsAQjNtUEM,601
10
+ livekit/plugins/google/beta/__init__.py,sha256=5PnoG3Ux24bjzMSzmTeSVljE9EINivGcbWUEV6egGnM,216
11
+ livekit/plugins/google/beta/realtime/__init__.py,sha256=_fW2NMN22F-hnQ4xAJ_g5lPbR7CvM_xXzSWlUQY-E-U,188
12
+ livekit/plugins/google/beta/realtime/api_proto.py,sha256=NfE7xr2N3JOu7gVfWbAmDcEhs8vuZgMRu5vpScPJzsg,776
13
+ livekit/plugins/google/beta/realtime/realtime_api.py,sha256=fgN2InMMCQL8JAHm-6J-SekzS5ymeH-hMRLzSW86Qkw,37477
14
+ livekit_plugins_google-1.0.23.dist-info/METADATA,sha256=69J1PJEwdaM6jWeMUXpbaU8A0quqi3UjDb5884qG9mI,1909
15
+ livekit_plugins_google-1.0.23.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
16
+ livekit_plugins_google-1.0.23.dist-info/RECORD,,
@@ -1,16 +0,0 @@
1
- livekit/plugins/google/__init__.py,sha256=xain2qUzU-YWhYWsLBkW8Q-szV-htpnzHTqymMPo-j0,1364
2
- livekit/plugins/google/llm.py,sha256=Kr9qeBZ5Dd0WCCBR_-gM3WWsVRZPCSteK8NpBsg2C5Y,16304
3
- livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
4
- livekit/plugins/google/models.py,sha256=maGlEM3hK4-5hMnH9UQMJewA7BZMrnStsFLBNoNVySg,1531
5
- livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- livekit/plugins/google/stt.py,sha256=2jk-1fHiBT8UW_n3CZsIEdMp2iBnUAlTnmefdUd8rAM,23620
7
- livekit/plugins/google/tts.py,sha256=29R0ieV5sRPBf5Yi0SPFQk7ZZMbELF30bIL9K_j_Wcg,9100
8
- livekit/plugins/google/utils.py,sha256=UBAbddYk7G8Nojg6bSC7_xN2pdl9qhs86HGhKYFuf9M,10509
9
- livekit/plugins/google/version.py,sha256=5lzQkS1jEPqreexacwMd18b2EOx7R5m8AQMKtQRBgC4,601
10
- livekit/plugins/google/beta/__init__.py,sha256=5PnoG3Ux24bjzMSzmTeSVljE9EINivGcbWUEV6egGnM,216
11
- livekit/plugins/google/beta/realtime/__init__.py,sha256=_fW2NMN22F-hnQ4xAJ_g5lPbR7CvM_xXzSWlUQY-E-U,188
12
- livekit/plugins/google/beta/realtime/api_proto.py,sha256=Fyrejs3SG0EjOPCCFLEnWXKEUxCff47PMWk2VsKJm5E,594
13
- livekit/plugins/google/beta/realtime/realtime_api.py,sha256=yYB5fKXl_aaMH_ZSpfUlfOTUg4eRqqRENLTZhZMfBMc,36253
14
- livekit_plugins_google-1.0.21.dist-info/METADATA,sha256=mQA8BfvWhAjp3V9GJA5OsZLzP_Q03UuDbRX2HbcEgtY,1908
15
- livekit_plugins_google-1.0.21.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
16
- livekit_plugins_google-1.0.21.dist-info/RECORD,,